[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/i586-linux-thread-multi/Text/ -> Soundex.pm (source)

   1  # -*- perl -*-
   2  
   3  # (c) Copyright 1998-2007 by Mark Mielke
   4  #
   5  # Freedom to use these sources for whatever you want, as long as credit
   6  # is given where credit is due, is hereby granted. You may make modifications
   7  # where you see fit but leave this copyright somewhere visible. As well, try
   8  # to initial any changes you make so that if I like the changes I can
   9  # incorporate them into later versions.
  10  #
  11  #      - Mark Mielke <mark@mielke.cc>
  12  #
  13  
  14  package Text::Soundex;
  15  require 5.006;
  16  
  17  use Exporter ();
  18  use XSLoader ();
  19  
  20  use strict;
  21  
  22  our $VERSION   = '3.03';
  23  our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
  24                      $soundex_nocode);
  25  our @EXPORT    = qw(soundex soundex_nara $soundex_nocode);
  26  our @ISA       = qw(Exporter);
  27  
  28  our $nocode;
  29  
  30  # Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
  31  # For now, this part of the interface is exported and maintained.
  32  # In the feature, $soundex_nocode will be deprecated.
  33  *Text::Soundex::soundex_nocode = \$nocode;
  34  
  35  sub soundex_noxs
  36  {
  37      # Original Soundex algorithm
  38  
  39      my @results = map {
  40          my $code = uc($_);
  41          $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
  42  
  43      if (length($code)) {
  44              my $firstchar = substr($code, 0, 1);
  45          $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
  46                         [0000000000000000111111112222222222222222333344555566]s;
  47          ($code = substr($code, 1)) =~ tr/0//d;
  48          substr($firstchar . $code . '000', 0, 4);
  49      } else {
  50          $nocode;
  51      }
  52      } @_;
  53  
  54      wantarray ? @results : $results[0];
  55  }
  56  
  57  sub soundex_nara
  58  {
  59      # US census (NARA) algorithm.
  60  
  61      my @results = map {
  62      my $code = uc($_);
  63          $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
  64  
  65      if (length($code)) {
  66              my $firstchar = substr($code, 0, 1);
  67          $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
  68                         [0000990000009900111111112222222222222222333344555566]s;
  69              $code =~ s/(.)9\1/$1/gs;
  70          ($code = substr($code, 1)) =~ tr/09//d;
  71          substr($firstchar . $code . '000', 0, 4);
  72      } else {
  73          $nocode
  74      }
  75      } @_;
  76  
  77      wantarray ? @results : $results[0];
  78  }
  79  
  80  sub soundex_unicode
  81  {
  82      require Text::Unidecode unless defined &Text::Unidecode::unidecode;
  83      soundex(Text::Unidecode::unidecode(@_));
  84  }
  85  
  86  sub soundex_nara_unicode
  87  {
  88      require Text::Unidecode unless defined &Text::Unidecode::unidecode;
  89      soundex_nara(Text::Unidecode::unidecode(@_));
  90  }
  91  
  92  eval { XSLoader::load(__PACKAGE__, $VERSION) };
  93  
  94  if (defined(&soundex_xs)) {
  95      *soundex = \&soundex_xs;
  96  } else {
  97      *soundex = \&soundex_noxs;
  98      *soundex_xs = sub {
  99          require Carp;
 100          Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
 101                      "could not be loaded");
 102      };
 103  }
 104  
 105  1;
 106  
 107  __END__
 108  
 109  # Implementation of the soundex algorithm.
 110  #
 111  # Some of this documention was written by Mike Stok.
 112  #
 113  # Examples:
 114  #
 115  # Euler, Ellery -> E460
 116  # Gauss, Ghosh -> G200
 117  # Hilbert, Heilbronn -> H416
 118  # Knuth, Kant -> K530
 119  # Lloyd, Ladd -> L300
 120  # Lukasiewicz, Lissajous -> L222
 121  #
 122  
 123  =head1 NAME
 124  
 125  Text::Soundex - Implementation of the soundex algorithm.
 126  
 127  =head1 SYNOPSIS
 128  
 129    use Text::Soundex;
 130  
 131    # Original algorithm.
 132    $code = soundex($name);    # Get the soundex code for a name.
 133    @codes = soundex(@names);  # Get the list of codes for a list of names.
 134  
 135    # American Soundex variant (NARA) - Used for US census data.
 136    $code = soundex_nara($name);    # Get the soundex code for a name.
 137    @codes = soundex_nara(@names);  # Get the list of codes for a list of names.
 138  
 139    # Redefine the value that soundex() will return if the input string
 140    # contains no identifiable sounds within it.
 141    $Text::Soundex::nocode = 'Z000';
 142  
 143  =head1 DESCRIPTION
 144  
 145  Soundex is a phonetic algorithm for indexing names by sound, as
 146  pronounced in English. The goal is for names with the same
 147  pronunciation to be encoded to the same representation so that they
 148  can be matched despite minor differences in spelling. Soundex is the
 149  most widely known of all phonetic algorithms and is often used
 150  (incorrectly) as a synonym for "phonetic algorithm". Improvements to
 151  Soundex are the basis for many modern phonetic algorithms. (Wikipedia,
 152  2007)
 153  
 154  This module implements the original soundex algorithm developed by
 155  Robert Russell and Margaret Odell, patented in 1918 and 1922, as well
 156  as a variation called "American Soundex" used for US census data, and
 157  current maintained by the National Archives and Records Administration
 158  (NARA).
 159  
 160  The soundex algorithm may be recognized from Donald Knuth's
 161  B<The Art of Computer Programming>. The algorithm described by
 162  Knuth is the NARA algorithm.
 163  
 164  The value returned for strings which have no soundex encoding is
 165  defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
 166  however values such as C<'Z000'> are commonly used alternatives.
 167  
 168  For backward compatibility with older versions of this module the
 169  C<$Text::Soundex::nocode> is exported into the caller's namespace as
 170  C<$soundex_nocode>.
 171  
 172  In scalar context, C<soundex()> returns the soundex code of its first
 173  argument. In list context, a list is returned in which each element is the
 174  soundex code for the corresponding argument passed to C<soundex()>. For
 175  example, the following code assigns @codes the value C<('M200', 'S320')>:
 176  
 177     @codes = soundex qw(Mike Stok);
 178  
 179  To use C<Text::Soundex> to generate codes that can be used to search one
 180  of the publically available US Censuses, a variant of the soundex
 181  algorithm must be used:
 182  
 183      use Text::Soundex;
 184      $code = soundex_nara($name);
 185  
 186  An example of where these algorithm differ follows:
 187  
 188      use Text::Soundex;
 189      print soundex("Ashcraft"), "\n";       # prints: A226
 190      print soundex_nara("Ashcraft"), "\n";  # prints: A261
 191  
 192  =head1 EXAMPLES
 193  
 194  Donald Knuth's examples of names and the soundex codes they map to
 195  are listed below:
 196  
 197    Euler, Ellery -> E460
 198    Gauss, Ghosh -> G200
 199    Hilbert, Heilbronn -> H416
 200    Knuth, Kant -> K530
 201    Lloyd, Ladd -> L300
 202    Lukasiewicz, Lissajous -> L222
 203  
 204  so:
 205  
 206    $code = soundex 'Knuth';         # $code contains 'K530'
 207    @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'
 208  
 209  =head1 LIMITATIONS
 210  
 211  As the soundex algorithm was originally used a B<long> time ago in the US
 212  it considers only the English alphabet and pronunciation. In particular,
 213  non-ASCII characters will be ignored. The recommended method of dealing
 214  with characters that have accents, or other unicode characters, is to use
 215  the Text::Unidecode module available from CPAN. Either use the module
 216  explicitly:
 217  
 218      use Text::Soundex;
 219      use Text::Unidecode;
 220  
 221      print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"
 222  
 223  Or use the convenient wrapper routine:
 224  
 225      use Text::Soundex 'soundex_unicode';
 226  
 227      print soundex_unicode("Fran\xE7ais"), "\n";    # Prints "F652\n"
 228  
 229  Since the soundex algorithm maps a large space (strings of arbitrary
 230  length) onto a small space (single letter plus 3 digits) no inference
 231  can be made about the similarity of two strings which end up with the
 232  same soundex code.  For example, both C<Hilbert> and C<Heilbronn> end
 233  up with a soundex code of C<H416>.
 234  
 235  =head1 MAINTAINER
 236  
 237  This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).
 238  
 239  =head1 HISTORY
 240  
 241  Version 3 is a significant update to provide support for versions of
 242  Perl later than Perl 5.004. Specifically, the XS version of the
 243  soundex() subroutine understands strings that are encoded using UTF-8
 244  (unicode strings).
 245  
 246  Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
 247  to improve the speed of the subroutines. The XS version of the soundex()
 248  subroutine was introduced in 2.00.
 249  
 250  Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
 251  and was included into the Perl core library set.
 252  
 253  Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
 254  algorithm to be included. The NARA soundex page can be viewed at:
 255  C<http://www.nara.gov/genealogy/soundex/soundex.html>
 256  
 257  Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
 258  supplied ideas and spotted mistakes for v1.x.
 259  
 260  =cut


Generated: Tue Mar 17 22:47:18 2015 Cross-referenced by PHPXref 0.7.1