[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/I18N/ -> LangTags.pm (source)

   1  
   2  # Time-stamp: "2004-10-06 23:26:33 ADT"
   3  # Sean M. Burke <sburke@cpan.org>
   4  
   5  require 5.000;
   6  package I18N::LangTags;
   7  use strict;
   8  use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
   9  require Exporter;
  10  @ISA = qw(Exporter);
  11  @EXPORT = qw();
  12  @EXPORT_OK = qw(is_language_tag same_language_tag
  13                  extract_language_tags super_languages
  14                  similarity_language_tag is_dialect_of
  15                  locale2language_tag alternate_language_tags
  16                  encode_language_tag panic_languages
  17                  implicate_supers
  18                  implicate_supers_strictly
  19                 );
  20  %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
  21  
  22  $VERSION = "0.35";
  23  
  24  sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
  25  
  26  
  27  =head1 NAME
  28  
  29  I18N::LangTags - functions for dealing with RFC3066-style language tags
  30  
  31  =head1 SYNOPSIS
  32  
  33    use I18N::LangTags();
  34  
  35  ...or specify whichever of those functions you want to import, like so:
  36  
  37    use I18N::LangTags qw(implicate_supers similarity_language_tag);
  38  
  39  All the exportable functions are listed below -- you're free to import
  40  only some, or none at all.  By default, none are imported.  If you
  41  say:
  42  
  43      use I18N::LangTags qw(:ALL)
  44  
  45  ...then all are exported.  (This saves you from having to use
  46  something less obvious like C<use I18N::LangTags qw(/./)>.)
  47  
  48  If you don't import any of these functions, assume a C<&I18N::LangTags::>
  49  in front of all the function names in the following examples.
  50  
  51  =head1 DESCRIPTION
  52  
  53  Language tags are a formalism, described in RFC 3066 (obsoleting
  54  1766), for declaring what language form (language and possibly
  55  dialect) a given chunk of information is in.
  56  
  57  This library provides functions for common tasks involving language
  58  tags as they are needed in a variety of protocols and applications.
  59  
  60  Please see the "See Also" references for a thorough explanation
  61  of how to correctly use language tags.
  62  
  63  =over
  64  
  65  =cut
  66  
  67  ###########################################################################
  68  
  69  =item * the function is_language_tag($lang1)
  70  
  71  Returns true iff $lang1 is a formally valid language tag.
  72  
  73     is_language_tag("fr")            is TRUE
  74     is_language_tag("x-jicarilla")   is FALSE
  75         (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
  76  
  77     is_language_tag("sgn-US")    is TRUE
  78         (That's American Sign Language)
  79  
  80     is_language_tag("i-Klikitat")    is TRUE
  81         (True without regard to the fact noone has actually
  82          registered Klikitat -- it's a formally valid tag)
  83  
  84     is_language_tag("fr-patois")     is TRUE
  85         (Formally valid -- altho descriptively weak!)
  86  
  87     is_language_tag("Spanish")       is FALSE
  88     is_language_tag("french-patois") is FALSE
  89         (No good -- first subtag has to match
  90          /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
  91  
  92     is_language_tag("x-borg-prot2532") is TRUE
  93         (Yes, subtags can contain digits, as of RFC3066)
  94  
  95  =cut
  96  
  97  sub is_language_tag {
  98  
  99    ## Changes in the language tagging standards may have to be reflected here.
 100  
 101    my($tag) = lc($_[0]);
 102  
 103    return 0 if $tag eq "i" or $tag eq "x";
 104    # Bad degenerate cases that the following
 105    #  regexp would erroneously let pass
 106  
 107    return $tag =~ 
 108      /^(?:  # First subtag
 109           [xi] | [a-z]{2,3}
 110        )
 111        (?:  # Subtags thereafter
 112           -           # separator
 113           [a-z0-9]{1,8}  # subtag  
 114        )*
 115      $/xs ? 1 : 0;
 116  }
 117  
 118  ###########################################################################
 119  
 120  =item * the function extract_language_tags($whatever)
 121  
 122  Returns a list of whatever looks like formally valid language tags
 123  in $whatever.  Not very smart, so don't get too creative with
 124  what you want to feed it.
 125  
 126    extract_language_tags("fr, fr-ca, i-mingo")
 127      returns:   ('fr', 'fr-ca', 'i-mingo')
 128  
 129    extract_language_tags("It's like this: I'm in fr -- French!")
 130      returns:   ('It', 'in', 'fr')
 131    (So don't just feed it any old thing.)
 132  
 133  The output is untainted.  If you don't know what tainting is,
 134  don't worry about it.
 135  
 136  =cut
 137  
 138  sub extract_language_tags {
 139  
 140    ## Changes in the language tagging standards may have to be reflected here.
 141  
 142    my($text) =
 143      $_[0] =~ m/(.+)/  # to make for an untainted result
 144      ? $1 : ''
 145    ;
 146    
 147    return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
 148      $text =~ 
 149      m/
 150        \b
 151        (?:  # First subtag
 152           [iIxX] | [a-zA-Z]{2,3}
 153        )
 154        (?:  # Subtags thereafter
 155           -           # separator
 156           [a-zA-Z0-9]{1,8}  # subtag  
 157        )*
 158        \b
 159      /xsg
 160    );
 161  }
 162  
 163  ###########################################################################
 164  
 165  =item * the function same_language_tag($lang1, $lang2)
 166  
 167  Returns true iff $lang1 and $lang2 are acceptable variant tags
 168  representing the same language-form.
 169  
 170     same_language_tag('x-kadara', 'i-kadara')  is TRUE
 171        (The x/i- alternation doesn't matter)
 172     same_language_tag('X-KADARA', 'i-kadara')  is TRUE
 173        (...and neither does case)
 174     same_language_tag('en',       'en-US')     is FALSE
 175        (all-English is not the SAME as US English)
 176     same_language_tag('x-kadara', 'x-kadar')   is FALSE
 177        (these are totally unrelated tags)
 178     same_language_tag('no-bok',    'nb')       is TRUE
 179        (no-bok is a legacy tag for nb (Norwegian Bokmal))
 180  
 181  C<same_language_tag> works by just seeing whether
 182  C<encode_language_tag($lang1)> is the same as
 183  C<encode_language_tag($lang2)>.
 184  
 185  (Yes, I know this function is named a bit oddly.  Call it historic
 186  reasons.)
 187  
 188  =cut
 189  
 190  sub same_language_tag {
 191    my $el1 = &encode_language_tag($_[0]);
 192    return 0 unless defined $el1;
 193     # this avoids the problem of
 194     # encode_language_tag($lang1) eq and encode_language_tag($lang2)
 195     # being true if $lang1 and $lang2 are both undef
 196  
 197    return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
 198  }
 199  
 200  ###########################################################################
 201  
 202  =item * the function similarity_language_tag($lang1, $lang2)
 203  
 204  Returns an integer representing the degree of similarity between
 205  tags $lang1 and $lang2 (the order of which does not matter), where
 206  similarity is the number of common elements on the left,
 207  without regard to case and to x/i- alternation.
 208  
 209     similarity_language_tag('fr', 'fr-ca')           is 1
 210        (one element in common)
 211     similarity_language_tag('fr-ca', 'fr-FR')        is 1
 212        (one element in common)
 213  
 214     similarity_language_tag('fr-CA-joual',
 215                             'fr-CA-PEI')             is 2
 216     similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
 217        (two elements in common)
 218  
 219     similarity_language_tag('x-kadara', 'i-kadara')  is 1
 220        (x/i- doesn't matter)
 221  
 222     similarity_language_tag('en',       'x-kadar')   is 0
 223     similarity_language_tag('x-kadara', 'x-kadar')   is 0
 224        (unrelated tags -- no similarity)
 225  
 226     similarity_language_tag('i-cree-syllabic',
 227                             'i-cherokee-syllabic')   is 0
 228        (no B<leftmost> elements in common!)
 229  
 230  =cut
 231  
 232  sub similarity_language_tag {
 233    my $lang1 = &encode_language_tag($_[0]);
 234    my $lang2 = &encode_language_tag($_[1]);
 235     # And encode_language_tag takes care of the whole
 236     #  no-nyn==nn, i-hakka==zh-hakka, etc, things
 237     
 238    # NB: (i-sil-...)?  (i-sgn-...)?
 239  
 240    return undef if !defined($lang1) and !defined($lang2);
 241    return 0 if !defined($lang1) or !defined($lang2);
 242  
 243    my @l1_subtags = split('-', $lang1);
 244    my @l2_subtags = split('-', $lang2);
 245    my $similarity = 0;
 246  
 247    while(@l1_subtags and @l2_subtags) {
 248      if(shift(@l1_subtags) eq shift(@l2_subtags)) {
 249        ++$similarity;
 250      } else {
 251        last;
 252      } 
 253    }
 254    return $similarity;
 255  }
 256  
 257  ###########################################################################
 258  
 259  =item * the function is_dialect_of($lang1, $lang2)
 260  
 261  Returns true iff language tag $lang1 represents a subform of
 262  language tag $lang2.
 263  
 264  B<Get the order right!  It doesn't work the other way around!>
 265  
 266     is_dialect_of('en-US', 'en')            is TRUE
 267       (American English IS a dialect of all-English)
 268  
 269     is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
 270     is_dialect_of('fr-CA-joual', 'fr')      is TRUE
 271       (Joual is a dialect of (a dialect of) French)
 272  
 273     is_dialect_of('en', 'en-US')            is FALSE
 274       (all-English is a NOT dialect of American English)
 275  
 276     is_dialect_of('fr', 'en-CA')            is FALSE
 277  
 278     is_dialect_of('en',    'en'   )         is TRUE
 279     is_dialect_of('en-US', 'en-US')         is TRUE
 280       (B<Note:> these are degenerate cases)
 281  
 282     is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
 283       (the x/i thing doesn't matter, nor does case)
 284  
 285     is_dialect_of('nn', 'no')               is TRUE
 286       (because 'nn' (New Norse) is aliased to 'no-nyn',
 287        as a special legacy case, and 'no-nyn' is a
 288        subform of 'no' (Norwegian))
 289  
 290  =cut
 291  
 292  sub is_dialect_of {
 293  
 294    my $lang1 = &encode_language_tag($_[0]);
 295    my $lang2 = &encode_language_tag($_[1]);
 296  
 297    return undef if !defined($lang1) and !defined($lang2);
 298    return 0 if !defined($lang1) or !defined($lang2);
 299  
 300    return 1 if $lang1 eq $lang2;
 301    return 0 if length($lang1) < length($lang2);
 302  
 303    $lang1 .= '-';
 304    $lang2 .= '-';
 305    return
 306      (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
 307  }
 308  
 309  ###########################################################################
 310  
 311  =item * the function super_languages($lang1)
 312  
 313  Returns a list of language tags that are superordinate tags to $lang1
 314  -- it gets this by removing subtags from the end of $lang1 until
 315  nothing (or just "i" or "x") is left.
 316  
 317     super_languages("fr-CA-joual")  is  ("fr-CA", "fr")
 318  
 319     super_languages("en-AU")  is  ("en")
 320  
 321     super_languages("en")  is  empty-list, ()
 322  
 323     super_languages("i-cherokee")  is  empty-list, ()
 324      ...not ("i"), which would be illegal as well as pointless.
 325  
 326  If $lang1 is not a valid language tag, returns empty-list in
 327  a list context, undef in a scalar context.
 328  
 329  A notable and rather unavoidable problem with this method:
 330  "x-mingo-tom" has an "x" because the whole tag isn't an
 331  IANA-registered tag -- but super_languages('x-mingo-tom') is
 332  ('x-mingo') -- which isn't really right, since 'i-mingo' is
 333  registered.  But this module has no way of knowing that.  (But note
 334  that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
 335  
 336  More importantly, you assume I<at your peril> that superordinates of
 337  $lang1 are mutually intelligible with $lang1.  Consider this
 338  carefully.
 339  
 340  =cut 
 341  
 342  sub super_languages {
 343    my $lang1 = $_[0];
 344    return() unless defined($lang1) && &is_language_tag($lang1);
 345  
 346    # a hack for those annoying new (2001) tags:
 347    $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
 348    $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
 349    $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
 350     # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
 351  
 352    my @l1_subtags = split('-', $lang1);
 353  
 354    ## Changes in the language tagging standards may have to be reflected here.
 355  
 356    # NB: (i-sil-...)?
 357  
 358    my @supers = ();
 359    foreach my $bit (@l1_subtags) {
 360      push @supers, 
 361        scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
 362    }
 363    pop @supers if @supers;
 364    shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
 365    return reverse @supers;
 366  }
 367  
 368  ###########################################################################
 369  
 370  =item * the function locale2language_tag($locale_identifier)
 371  
 372  This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
 373  and maps it to a language tag.  If it's not mappable (as with,
 374  notably, "C" and "POSIX"), this returns empty-list in a list context,
 375  or undef in a scalar context.
 376  
 377     locale2language_tag("en") is "en"
 378  
 379     locale2language_tag("en_US") is "en-US"
 380  
 381     locale2language_tag("en_US.ISO8859-1") is "en-US"
 382  
 383     locale2language_tag("C") is undef or ()
 384  
 385     locale2language_tag("POSIX") is undef or ()
 386  
 387     locale2language_tag("POSIX") is undef or ()
 388  
 389  I'm not totally sure that locale names map satisfactorily to language
 390  tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.
 391  
 392  The output is untainted.  If you don't know what tainting is,
 393  don't worry about it.
 394  
 395  =cut 
 396  
 397  sub locale2language_tag {
 398    my $lang =
 399      $_[0] =~ m/(.+)/  # to make for an untainted result
 400      ? $1 : ''
 401    ;
 402  
 403    return $lang if &is_language_tag($lang); # like "en"
 404  
 405    $lang =~ tr<_><->;  # "en_US" -> en-US
 406    $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s;  # "en_US.ISO8859-1" -> en-US
 407     # it_IT.utf8@euro => it-IT
 408  
 409    return $lang if &is_language_tag($lang);
 410  
 411    return;
 412  }
 413  
 414  ###########################################################################
 415  
 416  =item * the function encode_language_tag($lang1)
 417  
 418  This function, if given a language tag, returns an encoding of it such
 419  that:
 420  
 421  * tags representing different languages never get the same encoding.
 422  
 423  * tags representing the same language always get the same encoding.
 424  
 425  * an encoding of a formally valid language tag always is a string
 426  value that is defined, has length, and is true if considered as a
 427  boolean.
 428  
 429  Note that the encoding itself is B<not> a formally valid language tag.
 430  Note also that you cannot, currently, go from an encoding back to a
 431  language tag that it's an encoding of.
 432  
 433  Note also that you B<must> consider the encoded value as atomic; i.e.,
 434  you should not consider it as anything but an opaque, unanalysable
 435  string value.  (The internals of the encoding method may change in
 436  future versions, as the language tagging standard changes over time.)
 437  
 438  C<encode_language_tag> returns undef if given anything other than a
 439  formally valid language tag.
 440  
 441  The reason C<encode_language_tag> exists is because different language
 442  tags may represent the same language; this is normally treatable with
 443  C<same_language_tag>, but consider this situation:
 444  
 445  You have a data file that expresses greetings in different languages.
 446  Its format is "[language tag]=[how to say 'Hello']", like:
 447  
 448            en-US=Hiho
 449            fr=Bonjour
 450            i-mingo=Hau'
 451  
 452  And suppose you write a program that reads that file and then runs as
 453  a daemon, answering client requests that specify a language tag and
 454  then expect the string that says how to greet in that language.  So an
 455  interaction looks like:
 456  
 457            greeting-client asks:    fr
 458            greeting-server answers: Bonjour
 459  
 460  So far so good.  But suppose the way you're implementing this is:
 461  
 462            my %greetings;
 463            die unless open(IN, "<in.dat");
 464            while(<IN>) {
 465              chomp;
 466              next unless /^([^=]+)=(.+)/s;
 467              my($lang, $expr) = ($1, $2);
 468              $greetings{$lang} = $expr;
 469            }
 470            close(IN);
 471  
 472  at which point %greetings has the contents:
 473  
 474            "en-US"   => "Hiho"
 475            "fr"      => "Bonjour"
 476            "i-mingo" => "Hau'"
 477  
 478  And suppose then that you answer client requests for language $wanted
 479  by just looking up $greetings{$wanted}.
 480  
 481  If the client asks for "fr", that will look up successfully in
 482  %greetings, to the value "Bonjour".  And if the client asks for
 483  "i-mingo", that will look up successfully in %greetings, to the value
 484  "Hau'".
 485  
 486  But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
 487  lookup in %greetings fails.  That's the Wrong Thing.
 488  
 489  You could instead do lookups on $wanted with:
 490  
 491            use I18N::LangTags qw(same_language_tag);
 492            my $response = '';
 493            foreach my $l2 (keys %greetings) {
 494              if(same_language_tag($wanted, $l2)) {
 495                $response = $greetings{$l2};
 496                last;
 497              }
 498            }
 499  
 500  But that's rather inefficient.  A better way to do it is to start your
 501  program with:
 502  
 503            use I18N::LangTags qw(encode_language_tag);
 504            my %greetings;
 505            die unless open(IN, "<in.dat");
 506            while(<IN>) {
 507              chomp;
 508              next unless /^([^=]+)=(.+)/s;
 509              my($lang, $expr) = ($1, $2);
 510              $greetings{
 511                          encode_language_tag($lang)
 512                        } = $expr;
 513            }
 514            close(IN);
 515  
 516  and then just answer client requests for language $wanted by just
 517  looking up
 518  
 519            $greetings{encode_language_tag($wanted)}
 520  
 521  And that does the Right Thing.
 522  
 523  =cut
 524  
 525  sub encode_language_tag {
 526    # Only similarity_language_tag() is allowed to analyse encodings!
 527  
 528    ## Changes in the language tagging standards may have to be reflected here.
 529  
 530    my($tag) = $_[0] || return undef;
 531    return undef unless &is_language_tag($tag);
 532  
 533    # For the moment, these legacy variances are few enough that
 534    #  we can just handle them here with regexps.
 535    $tag =~ s/^iw\b/he/i; # Hebrew
 536    $tag =~ s/^in\b/id/i; # Indonesian
 537    $tag =~ s/^cre\b/cr/i; # Cree
 538    $tag =~ s/^jw\b/jv/i; # Javanese
 539    $tag =~ s/^[ix]-lux\b/lb/i;  # Luxemburger
 540    $tag =~ s/^[ix]-navajo\b/nv/i;  # Navajo
 541    $tag =~ s/^ji\b/yi/i;  # Yiddish
 542    # SMB 2003 -- Hm.  There's a bunch of new XXX->YY variances now,
 543    #  but maybe they're all so obscure I can ignore them.   "Obscure"
 544    #  meaning either that the language is obscure, and/or that the
 545    #  XXX form was extant so briefly that it's unlikely it was ever
 546    #  used.  I hope.
 547    #
 548    # These go FROM the simplex to complex form, to get
 549    #  similarity-comparison right.  And that's okay, since
 550    #  similarity_language_tag is the only thing that
 551    #  analyzes our output.
 552    $tag =~ s/^[ix]-hakka\b/zh-hakka/i;  # Hakka
 553    $tag =~ s/^nb\b/no-bok/i;  # BACKWARDS for Bokmal
 554    $tag =~ s/^nn\b/no-nyn/i;  # BACKWARDS for Nynorsk
 555  
 556    $tag =~ s/^[xiXI]-//s;
 557     # Just lop off any leading "x/i-"
 558  
 559    return "~" . uc($tag);
 560  }
 561  
 562  #--------------------------------------------------------------------------
 563  
 564  =item * the function alternate_language_tags($lang1)
 565  
 566  This function, if given a language tag, returns all language tags that
 567  are alternate forms of this language tag.  (I.e., tags which refer to
 568  the same language.)  This is meant to handle legacy tags caused by
 569  the minor changes in language tag standards over the years; and
 570  the x-/i- alternation is also dealt with.
 571  
 572  Note that this function does I<not> try to equate new (and never-used,
 573  and unusable)
 574  ISO639-2 three-letter tags to old (and still in use) ISO639-1
 575  two-letter equivalents -- like "ara" -> "ar" -- because
 576  "ara" has I<never> been in use as an Internet language tag,
 577  and RFC 3066 stipulates that it never should be, since a shorter
 578  tag ("ar") exists.
 579  
 580  Examples:
 581  
 582            alternate_language_tags('no-bok')       is ('nb')
 583            alternate_language_tags('nb')           is ('no-bok')
 584            alternate_language_tags('he')           is ('iw')
 585            alternate_language_tags('iw')           is ('he')
 586            alternate_language_tags('i-hakka')      is ('zh-hakka', 'x-hakka')
 587            alternate_language_tags('zh-hakka')     is ('i-hakka', 'x-hakka')
 588            alternate_language_tags('en')           is ()
 589            alternate_language_tags('x-mingo-tom')  is ('i-mingo-tom')
 590            alternate_language_tags('x-klikitat')   is ('i-klikitat')
 591            alternate_language_tags('i-klikitat')   is ('x-klikitat')
 592  
 593  This function returns empty-list if given anything other than a formally
 594  valid language tag.
 595  
 596  =cut
 597  
 598  my %alt = qw( i x   x i   I X   X I );
 599  sub alternate_language_tags {
 600    my $tag = $_[0];
 601    return() unless &is_language_tag($tag);
 602  
 603    my @em; # push 'em real goood!
 604  
 605    # For the moment, these legacy variances are few enough that
 606    #  we can just handle them here with regexps.
 607    
 608    if(     $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
 609    } elsif($tag =~ m/^zh-hakka\b(.*)/i) {  push @em, "x-hakka$1", "i-hakka$1";
 610  
 611    } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
 612    } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
 613  
 614    } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
 615    } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
 616  
 617    } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
 618    } elsif($tag =~ m/^lb\b(.*)/i) {       push @em, "i-lux$1", "x-lux$1";
 619  
 620    } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
 621    } elsif($tag =~ m/^nv\b(.*)/i) {          push @em, "i-navajo$1", "x-navajo$1";
 622  
 623    } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
 624    } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
 625  
 626    } elsif($tag =~ m/^nb\b(.*)/i) {     push @em, "no-bok$1";
 627    } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
 628    
 629    } elsif($tag =~ m/^nn\b(.*)/i) {     push @em, "no-nyn$1";
 630    } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
 631    }
 632  
 633    push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
 634    return @em;
 635  }
 636  
 637  ###########################################################################
 638  
 639  {
 640    # Init %Panic...
 641    
 642    my @panic = (  # MUST all be lowercase!
 643     # Only large ("national") languages make it in this list.
 644     #  If you, as a user, are so bizarre that the /only/ language
 645     #  you claim to accept is Galician, then no, we won't do you
 646     #  the favor of providing Catalan as a panic-fallback for
 647     #  you.  Because if I start trying to add "little languages" in
 648     #  here, I'll just go crazy.
 649  
 650     # Scandinavian lgs.  All based on opinion and hearsay.
 651     'sv' => [qw(nb no da nn)],
 652     'da' => [qw(nb no sv nn)], # I guess
 653     [qw(no nn nb)], [qw(no nn nb sv da)],
 654     'is' => [qw(da sv no nb nn)],
 655     'fo' => [qw(da is no nb nn sv)], # I guess
 656     
 657     # I think this is about the extent of tolerable intelligibility
 658     #  among large modern Romance languages.
 659     'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
 660     'ca' => [qw(es pt it fr)],
 661     'es' => [qw(ca it fr pt)],
 662     'it' => [qw(es fr ca pt)],
 663     'fr' => [qw(es it ca pt)],
 664     
 665     # Also assume that speakers of the main Indian languages prefer
 666     #  to read/hear Hindi over English
 667     [qw(
 668       as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
 669     )] => 'hi',
 670      # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
 671      # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
 672      # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
 673     'hi' => [qw(bn pa as or)],
 674     # I welcome finer data for the other Indian languages.
 675     #  E.g., what should Oriya's list be, besides just Hindi?
 676     
 677     # And the panic languages for English is, of course, nil!
 678  
 679     # My guesses at Slavic intelligibility:
 680     ([qw(ru be uk)]) x 2,  # Russian, Belarusian, Ukranian
 681     'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
 682     'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
 683  
 684     'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
 685  
 686     'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
 687  
 688     #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
 689  
 690    );
 691    my($k,$v);
 692    while(@panic) {
 693      ($k,$v) = splice(@panic,0,2);
 694      foreach my $k (ref($k) ? @$k : $k) {
 695        foreach my $v (ref($v) ? @$v : $v) {
 696          push @{$Panic{$k} ||= []}, $v unless $k eq $v;
 697        }
 698      }
 699    }
 700  }
 701  
 702  =item * the function @langs = panic_languages(@accept_languages)
 703  
 704  This function takes a list of 0 or more language
 705  tags that constitute a given user's Accept-Language list, and
 706  returns a list of tags for I<other> (non-super)
 707  languages that are probably acceptable to the user, to be
 708  used I<if all else fails>.
 709  
 710  For example, if a user accepts only 'ca' (Catalan) and
 711  'es' (Spanish), and the documents/interfaces you have
 712  available are just in German, Italian, and Chinese, then
 713  the user will most likely want the Italian one (and not
 714  the Chinese or German one!), instead of getting
 715  nothing.  So C<panic_languages('ca', 'es')> returns
 716  a list containing 'it' (Italian).
 717  
 718  English ('en') is I<always> in the return list, but
 719  whether it's at the very end or not depends
 720  on the input languages.  This function works by consulting
 721  an internal table that stipulates what common
 722  languages are "close" to each other.
 723  
 724  A useful construct you might consider using is:
 725  
 726    @fallbacks = super_languages(@accept_languages);
 727    push @fallbacks, panic_languages(
 728      @accept_languages, @fallbacks,
 729    );
 730  
 731  =cut
 732  
 733  sub panic_languages {
 734    # When in panic or in doubt, run in circles, scream, and shout!
 735    my(@out, %seen);
 736    foreach my $t (@_) {
 737      next unless $t;
 738      next if $seen{$t}++; # so we don't return it or hit it again
 739      # push @out, super_languages($t); # nah, keep that separate
 740      push @out, @{ $Panic{lc $t} || next };
 741    }
 742    return grep !$seen{$_}++,  @out, 'en';
 743  }
 744  
 745  #---------------------------------------------------------------------------
 746  #---------------------------------------------------------------------------
 747  
 748  =item * the function implicate_supers( ...languages... )
 749  
 750  This takes a list of strings (which are presumed to be language-tags;
 751  strings that aren't, are ignored); and after each one, this function
 752  inserts super-ordinate forms that don't already appear in the list.
 753  The original list, plus these insertions, is returned.
 754  
 755  In other words, it takes this:
 756  
 757    pt-br de-DE en-US fr pt-br-janeiro
 758  
 759  and returns this:
 760  
 761    pt-br pt de-DE de en-US en fr pt-br-janeiro
 762  
 763  This function is most useful in the idiom
 764  
 765    implicate_supers( I18N::LangTags::Detect::detect() );
 766  
 767  (See L<I18N::LangTags::Detect>.)
 768  
 769  
 770  =item * the function implicate_supers_strictly( ...languages... )
 771  
 772  This works like C<implicate_supers> except that the implicated
 773  forms are added to the end of the return list.
 774  
 775  In other words, implicate_supers_strictly takes a list of strings
 776  (which are presumed to be language-tags; strings that aren't, are
 777  ignored) and after the whole given list, it inserts the super-ordinate forms 
 778  of all given tags, minus any tags that already appear in the input list.
 779  
 780  In other words, it takes this:
 781  
 782    pt-br de-DE en-US fr pt-br-janeiro
 783  
 784  and returns this:
 785  
 786    pt-br de-DE en-US fr pt-br-janeiro pt de en
 787  
 788  The reason this function has "_strictly" in its name is that when
 789  you're processing an Accept-Language list according to the RFCs, if
 790  you interpret the RFCs quite strictly, then you would use
 791  implicate_supers_strictly, but for normal use (i.e., common-sense use,
 792  as far as I'm concerned) you'd use implicate_supers.
 793  
 794  =cut
 795  
 796  sub implicate_supers {
 797    my @languages = grep is_language_tag($_), @_;
 798    my %seen_encoded;
 799    foreach my $lang (@languages) {
 800      $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
 801    }
 802  
 803    my(@output_languages);
 804    foreach my $lang (@languages) {
 805      push @output_languages, $lang;
 806      foreach my $s ( I18N::LangTags::super_languages($lang) ) {
 807        # Note that super_languages returns the longest first.
 808        last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
 809        push @output_languages, $s;
 810      }
 811    }
 812    return uniq( @output_languages );
 813  
 814  }
 815  
 816  sub implicate_supers_strictly {
 817    my @tags = grep is_language_tag($_), @_;
 818    return uniq( @_,   map super_languages($_), @_ );
 819  }
 820  
 821  
 822  
 823  ###########################################################################
 824  1;
 825  __END__
 826  
 827  =back
 828  
 829  =head1 ABOUT LOWERCASING
 830  
 831  I've considered making all the above functions that output language
 832  tags return all those tags strictly in lowercase.  Having all your
 833  language tags in lowercase does make some things easier.  But you
 834  might as well just lowercase as you like, or call
 835  C<encode_language_tag($lang1)> where appropriate.
 836  
 837  =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
 838  
 839  In some future version of I18N::LangTags, I plan to include support
 840  for RFC2482-style language tags -- which are basically just normal
 841  language tags with their ASCII characters shifted into Plane 14.
 842  
 843  =head1 SEE ALSO
 844  
 845  * L<I18N::LangTags::List|I18N::LangTags::List>
 846  
 847  * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
 848  Identification of Languages".  (Obsoletes RFC 1766)
 849  
 850  * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
 851  Character Sets and Languages".
 852  
 853  * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
 854  Value and Encoded Word Extensions: Character Sets, Languages, and
 855  Continuations".
 856  
 857  * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, 
 858  "Language Tagging in Unicode Plain Text".
 859  
 860  * Locale::Codes, in
 861  C<http://www.perl.com/CPAN/modules/by-module/Locale/>
 862  
 863  * ISO 639-2, "Codes for the representation of names of languages",
 864  including two-letter and three-letter codes,
 865  C<http://www.loc.gov/standards/iso639-2/langcodes.html>
 866  
 867  * The IANA list of registered languages (hopefully up-to-date),
 868  C<http://www.iana.org/assignments/language-tags>
 869  
 870  =head1 COPYRIGHT
 871  
 872  Copyright (c) 1998+ Sean M. Burke. All rights reserved.
 873  
 874  This library is free software; you can redistribute it and/or
 875  modify it under the same terms as Perl itself.
 876  
 877  The programs and documentation in this dist are distributed in
 878  the hope that they will be useful, but without any warranty; without
 879  even the implied warranty of merchantability or fitness for a
 880  particular purpose.
 881  
 882  =head1 AUTHOR
 883  
 884  Sean M. Burke C<sburke@cpan.org>
 885  
 886  =cut
 887  


Generated: Tue Mar 17 22:47:18 2015 Cross-referenced by PHPXref 0.7.1