1package Text::Soundex; 2require 5.000; 3require Exporter; 4 5@ISA = qw(Exporter); 6@EXPORT = qw(&soundex $soundex_nocode); 7 8$VERSION = '1.01'; 9 10# $Id: Soundex.pm,v 1.7 2003/12/03 03:02:41 millert Exp $ 11# 12# Implementation of soundex algorithm as described by Knuth in volume 13# 3 of The Art of Computer Programming, with ideas stolen from Ian 14# Phillipps <ian@pipex.net>. 15# 16# Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. 17# 18# Knuth's test cases are: 19# 20# Euler, Ellery -> E460 21# Gauss, Ghosh -> G200 22# Hilbert, Heilbronn -> H416 23# Knuth, Kant -> K530 24# Lloyd, Ladd -> L300 25# Lukasiewicz, Lissajous -> L222 26# 27# $Log: Soundex.pm,v $ 28# Revision 1.7 2003/12/03 03:02:41 millert 29# Resolve conflicts for perl 5.8.2, remove old files, and add OpenBSD-specific scaffolding 30# 31# Revision 1.2 1994/03/24 00:30:27 mike 32# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> 33# in the way I handles leasing characters which were different but had 34# the same soundex code. This showed up comparing it with Oracle's 35# soundex output. 36# 37# Revision 1.1 1994/03/02 13:01:30 mike 38# Initial revision 39# 40# 41############################################################################## 42 43# $soundex_nocode is used to indicate a string doesn't have a soundex 44# code, I like undef other people may want to set it to 'Z000'. 45 46$soundex_nocode = undef; 47 48sub soundex 49{ 50 local (@s, $f, $fc, $_) = @_; 51 52 push @s, '' unless @s; # handle no args as a single empty string 53 54 foreach (@s) 55 { 56 $_ = uc $_; 57 tr/A-Z//cd; 58 59 if ($_ eq '') 60 { 61 $_ = $soundex_nocode; 62 } 63 else 64 { 65 ($f) = /^(.)/; 66 tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; 67 ($fc) = /^(.)/; 68 s/^$fc+//; 69 tr///cs; 70 tr/0//d; 71 $_ = $f . $_ . '000'; 72 s/^(.{4}).*/$1/; 73 } 74 } 75 76 wantarray ? @s : shift @s; 77} 78 791; 80 81__END__ 82 83=head1 NAME 84 85Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth 86 87=head1 SYNOPSIS 88 89 use Text::Soundex; 90 91 $code = soundex $string; # get soundex code for a string 92 @codes = soundex @list; # get list of codes for list of strings 93 94 # set value to be returned for strings without soundex code 95 96 $soundex_nocode = 'Z000'; 97 98=head1 DESCRIPTION 99 100This module implements the soundex algorithm as described by Donald Knuth 101in Volume 3 of B<The Art of Computer Programming>. The algorithm is 102intended to hash words (in particular surnames) into a small space using a 103simple model which approximates the sound of the word when spoken by an English 104speaker. Each word is reduced to a four character string, the first 105character being an upper case letter and the remaining three being digits. 106 107If there is no soundex code representation for a string then the value of 108C<$soundex_nocode> is returned. This is initially set to C<undef>, but 109many people seem to prefer an I<unlikely> value like C<Z000> 110(how unlikely this is depends on the data set being dealt with.) Any value 111can be assigned to C<$soundex_nocode>. 112 113In scalar context C<soundex> returns the soundex code of its first 114argument, and in list context a list is returned in which each element is the 115soundex code for the corresponding argument passed to C<soundex> e.g. 116 117 @codes = soundex qw(Mike Stok); 118 119leaves C<@codes> containing C<('M200', 'S320')>. 120 121=head1 EXAMPLES 122 123Knuth's examples of various names and the soundex codes they map to 124are listed below: 125 126 Euler, Ellery -> E460 127 Gauss, Ghosh -> G200 128 Hilbert, Heilbronn -> H416 129 Knuth, Kant -> K530 130 Lloyd, Ladd -> L300 131 Lukasiewicz, Lissajous -> L222 132 133so: 134 135 $code = soundex 'Knuth'; # $code contains 'K530' 136 @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' 137 138=head1 LIMITATIONS 139 140As the soundex algorithm was originally used a B<long> time ago in the US 141it considers only the English alphabet and pronunciation. 142 143As it is mapping a large space (arbitrary length strings) onto a small 144space (single letter plus 3 digits) no inference can be made about the 145similarity of two strings which end up with the same soundex code. For 146example, both C<Hilbert> and C<Heilbronn> end up with a soundex code 147of C<H416>. 148 149=head1 AUTHOR 150 151This code was implemented by Mike Stok (C<stok@cybercom.net>) from the 152description given by Knuth. Ian Phillipps (C<ian@pipex.net>) and Rich Pinder 153(C<rpinder@hsc.usc.edu>) supplied ideas and spotted mistakes. 154