| Current File : /home/mmdealscpanel/yummmdeals.com/I18N.zip |
PK �V�Z�] �� � Langinfo.pmnu �[��� package I18N::Langinfo;
use 5.006;
use strict;
use warnings;
use Carp;
require Exporter;
require XSLoader;
our @ISA = qw(Exporter);
our @EXPORT = qw(langinfo);
our @EXPORT_OK = qw(
ABDAY_1
ABDAY_2
ABDAY_3
ABDAY_4
ABDAY_5
ABDAY_6
ABDAY_7
ABMON_1
ABMON_10
ABMON_11
ABMON_12
ABMON_2
ABMON_3
ABMON_4
ABMON_5
ABMON_6
ABMON_7
ABMON_8
ABMON_9
ALT_DIGITS
AM_STR
CODESET
CRNCYSTR
DAY_1
DAY_2
DAY_3
DAY_4
DAY_5
DAY_6
DAY_7
D_FMT
D_T_FMT
ERA
ERA_D_FMT
ERA_D_T_FMT
ERA_T_FMT
MON_1
MON_10
MON_11
MON_12
MON_2
MON_3
MON_4
MON_5
MON_6
MON_7
MON_8
MON_9
NOEXPR
NOSTR
PM_STR
RADIXCHAR
THOUSEP
T_FMT
T_FMT_AMPM
YESEXPR
YESSTR
);
our $VERSION = '0.13';
XSLoader::load();
1;
__END__
=head1 NAME
I18N::Langinfo - query locale information
=head1 SYNOPSIS
use I18N::Langinfo;
=head1 DESCRIPTION
The langinfo() function queries various locale information that can be
used to localize output and user interfaces. The langinfo() requires
one numeric argument that identifies the locale constant to query:
if no argument is supplied, C<$_> is used. The numeric constants
appropriate to be used as arguments are exportable from I18N::Langinfo.
The following example will import the langinfo() function itself and
three constants to be used as arguments to langinfo(): a constant for
the abbreviated first day of the week (the numbering starts from
Sunday = 1) and two more constants for the affirmative and negative
answers for a yes/no question in the current locale.
use I18N::Langinfo qw(langinfo ABDAY_1 YESSTR NOSTR);
my ($abday_1, $yesstr, $nostr) =
map { langinfo($_) } (ABDAY_1, YESSTR, NOSTR);
print "$abday_1? [$yesstr/$nostr] ";
In other words, in the "C" (or English) locale the above will probably
print something like:
Sun? [yes/no]
but under a French locale
dim? [oui/non]
The usually available constants are
ABDAY_1 ABDAY_2 ABDAY_3 ABDAY_4 ABDAY_5 ABDAY_6 ABDAY_7
ABMON_1 ABMON_2 ABMON_3 ABMON_4 ABMON_5 ABMON_6
ABMON_7 ABMON_8 ABMON_9 ABMON_10 ABMON_11 ABMON_12
DAY_1 DAY_2 DAY_3 DAY_4 DAY_5 DAY_6 DAY_7
MON_1 MON_2 MON_3 MON_4 MON_5 MON_6
MON_7 MON_8 MON_9 MON_10 MON_11 MON_12
for abbreviated and full length days of the week and months of the year,
D_T_FMT D_FMT T_FMT
for the date-time, date, and time formats used by the strftime() function
(see L<POSIX>)
AM_STR PM_STR T_FMT_AMPM
for the locales for which it makes sense to have ante meridiem and post
meridiem time formats,
CODESET CRNCYSTR RADIXCHAR
for the character code set being used (such as "ISO8859-1", "cp850",
"koi8-r", "sjis", "utf8", etc.), for the currency string, for the
radix character used between the integer and the fractional part
of decimal numbers (yes, this is redundant with POSIX::localeconv())
YESSTR YESEXPR NOSTR NOEXPR
for the affirmative and negative responses and expressions, and
ERA ERA_D_FMT ERA_D_T_FMT ERA_T_FMT
for the Japanese Emperor eras (naturally only defined under Japanese locales).
See your L<langinfo(3)> for more information about the available
constants. (Often this means having to look directly at the
F<langinfo.h> C header file.)
Note that unfortunately none of the above constants are guaranteed
to be available on a particular platform. To be on the safe side
you can wrap the import in an eval like this:
eval {
require I18N::Langinfo;
I18N::Langinfo->import(qw(langinfo CODESET));
$codeset = langinfo(CODESET()); # note the ()
};
if ($@) { ... failed ... }
=head2 EXPORT
By default only the C<langinfo()> function is exported.
=head1 SEE ALSO
L<perllocale>, L<POSIX/localeconv>, L<POSIX/setlocale>, L<nl_langinfo(3)>.
The langinfo() is just a wrapper for the C nl_langinfo() interface.
=head1 AUTHOR
Jarkko Hietaniemi, E<lt>jhi@hut.fiE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright 2001 by Jarkko Hietaniemi
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
PK �6�Z<��c�p �p LangTags/List.pmnu �[���
require 5;
package I18N::LangTags::List;
# Time-stamp: "2004-10-06 23:26:21 ADT"
use strict;
use vars qw(%Name %Is_Disrec $Debug $VERSION);
$VERSION = '0.39';
# POD at the end.
#----------------------------------------------------------------------
{
# read the table out of our own POD!
my $seeking = 1;
my $count = 0;
my($disrec,$tag,$name);
my $last_name = '';
while(<I18N::LangTags::List::DATA>) {
if($seeking) {
$seeking = 0 if m/=for woohah/;
} elsif( ($disrec, $tag, $name) =
m/(\[?)\{([-0-9a-zA-Z]+)\}(?:\s*:)?\s*([^\[\]]+)/
) {
$name =~ s/\s*[;\.]*\s*$//g;
next unless $name;
++$count;
print "<$tag> <$name>\n" if $Debug;
$last_name = $Name{$tag} = $name;
$Is_Disrec{$tag} = 1 if $disrec;
} elsif (m/[Ff]ormerly \"([-a-z0-9]+)\"/) {
$Name{$1} = "$last_name (old tag)" if $last_name;
$Is_Disrec{$1} = 1;
}
}
die "No tags read??" unless $count;
}
#----------------------------------------------------------------------
sub name {
my $tag = lc($_[0] || return);
$tag =~ s/^\s+//s;
$tag =~ s/\s+$//s;
my $alt;
if($tag =~ m/^x-(.+)/) {
$alt = "i-$1";
} elsif($tag =~ m/^i-(.+)/) {
$alt = "x-$1";
} else {
$alt = '';
}
my $subform = '';
my $name = '';
print "Input: {$tag}\n" if $Debug;
while(length $tag) {
last if $name = $Name{$tag};
last if $name = $Name{$alt};
if($tag =~ s/(-[a-z0-9]+)$//s) {
print "Shaving off: $1 leaving $tag\n" if $Debug;
$subform = "$1$subform";
# and loop around again
$alt =~ s/(-[a-z0-9]+)$//s && $Debug && print " alt -> $alt\n";
} else {
# we're trying to pull a subform off a primary tag. TILT!
print "Aborting on: {$name}{$subform}\n" if $Debug;
last;
}
}
print "Output: {$name}{$subform}\n" if $Debug;
return unless $name; # Failure
return $name unless $subform; # Exact match
$subform =~ s/^-//s;
$subform =~ s/-$//s;
return "$name (Subform \"$subform\")";
}
#--------------------------------------------------------------------------
sub is_decent {
my $tag = lc($_[0] || return 0);
#require I18N::LangTags;
return 0 unless
$tag =~
/^(?: # First subtag
[xi] | [a-z]{2,3}
)
(?: # Subtags thereafter
- # separator
[a-z0-9]{1,8} # subtag
)*
$/xs;
my @supers = ();
foreach my $bit (split('-', $tag)) {
push @supers,
scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
}
return 0 unless @supers;
shift @supers if $supers[0] =~ m<^(i|x|sgn)$>s;
return 0 unless @supers;
foreach my $f ($tag, @supers) {
return 0 if $Is_Disrec{$f};
return 2 if $Name{$f};
# so that decent subforms of indecent tags are decent
}
return 2 if $Name{$tag}; # not only is it decent, it's known!
return 1;
}
#--------------------------------------------------------------------------
1;
__DATA__
=head1 NAME
I18N::LangTags::List -- tags and names for human languages
=head1 SYNOPSIS
use I18N::LangTags::List;
print "Parlez-vous... ", join(', ',
I18N::LangTags::List::name('elx') || 'unknown_language',
I18N::LangTags::List::name('ar-Kw') || 'unknown_language',
I18N::LangTags::List::name('en') || 'unknown_language',
I18N::LangTags::List::name('en-CA') || 'unknown_language',
), "?\n";
prints:
Parlez-vous... Elamite, Kuwait Arabic, English, Canadian English?
=head1 DESCRIPTION
This module provides a function
C<I18N::LangTags::List::name( I<langtag> ) > that takes
a language tag (see L<I18N::LangTags|I18N::LangTags>)
and returns the best attempt at an English name for it, or
undef if it can't make sense of the tag.
The function I18N::LangTags::List::name(...) is not exported.
This module also provides a function
C<I18N::LangTags::List::is_decent( I<langtag> )> that returns true iff
the language tag is syntactically valid and is for general use (like
"fr" or "fr-ca", below). That is, it returns false for tags that are
syntactically invalid and for tags, like "aus", that are listed in
brackets below. This function is not exported.
The map of tags-to-names that it uses is accessible as
%I18N::LangTags::List::Name, and it's the same as the list
that follows in this documentation, which should be useful
to you even if you don't use this module.
=head1 ABOUT LANGUAGE TAGS
Internet language tags, as defined in RFC 3066, are a formalism
for denoting human languages. The two-letter ISO 639-1 language
codes are well known (as "en" for English), as are their forms
when qualified by a country code ("en-US"). Less well-known are the
arbitrary-length non-ISO codes (like "i-mingo"), and the
recently (in 2001) introduced three-letter ISO-639-2 codes.
Remember these important facts:
=over
=item *
Language tags are not locale IDs. A locale ID is written with a "_"
instead of a "-", (almost?) always matches C<m/^\w\w_\w\w\b/>, and
I<means> something different than a language tag. A language tag
denotes a language. A locale ID denotes a language I<as used in>
a particular place, in combination with non-linguistic
location-specific information such as what currency is used
there. Locales I<also> often denote character set information,
as in "en_US.ISO8859-1".
=item *
Language tags are not for computer languages.
=item *
"Dialect" is not a useful term, since there is no objective
criterion for establishing when two language-forms are
dialects of eachother, or are separate languages.
=item *
Language tags are not case-sensitive. en-US, en-us, En-Us, etc.,
are all the same tag, and denote the same language.
=item *
Not every language tag really refers to a single language. Some
language tags refer to conditions: i-default (system-message text
in English plus maybe other languages), und (undetermined
language). Others (notably lots of the three-letter codes) are
bibliographic tags that classify whole groups of languages, as
with cus "Cushitic (Other)" (i.e., a
language that has been classed as Cushtic, but which has no more
specific code) or the even less linguistically coherent
sai for "South American Indian (Other)". Though useful in
bibliography, B<SUCH TAGS ARE NOT
FOR GENERAL USE>. For further guidance, email me.
=item *
Language tags are not country codes. In fact, they are often
distinct codes, as with language tag ja for Japanese, and
ISO 3166 country code C<.jp> for Japan.
=back
=head1 LIST OF LANGUAGES
The first part of each item is the language tag, between
{...}. It
is followed by an English name for the language or language-group.
Language tags that I judge to be not for general use, are bracketed.
This list is in alphabetical order by English name of the language.
=for reminder
The name in the =item line MUST NOT have E<...>'s in it!!
=for woohah START
=over
=item {ab} : Abkhazian
eq Abkhaz
=item {ace} : Achinese
=item {ach} : Acoli
=item {ada} : Adangme
=item {ady} : Adyghe
eq Adygei
=item {aa} : Afar
=item {afh} : Afrihili
(Artificial)
=item {af} : Afrikaans
=item [{afa} : Afro-Asiatic (Other)]
=item {ak} : Akan
(Formerly "aka".)
=item {akk} : Akkadian
(Historical)
=item {sq} : Albanian
=item {ale} : Aleut
=item [{alg} : Algonquian languages]
NOT Algonquin!
=item [{tut} : Altaic (Other)]
=item {am} : Amharic
NOT Aramaic!
=item {i-ami} : Ami
eq Amis. eq 'Amis. eq Pangca.
=item [{apa} : Apache languages]
=item {ar} : Arabic
Many forms are mutually un-intelligible in spoken media.
Notable forms:
{ar-ae} UAE Arabic;
{ar-bh} Bahrain Arabic;
{ar-dz} Algerian Arabic;
{ar-eg} Egyptian Arabic;
{ar-iq} Iraqi Arabic;
{ar-jo} Jordanian Arabic;
{ar-kw} Kuwait Arabic;
{ar-lb} Lebanese Arabic;
{ar-ly} Libyan Arabic;
{ar-ma} Moroccan Arabic;
{ar-om} Omani Arabic;
{ar-qa} Qatari Arabic;
{ar-sa} Sauda Arabic;
{ar-sy} Syrian Arabic;
{ar-tn} Tunisian Arabic;
{ar-ye} Yemen Arabic.
=item {arc} : Aramaic
NOT Amharic! NOT Samaritan Aramaic!
=item {arp} : Arapaho
=item {arn} : Araucanian
=item {arw} : Arawak
=item {hy} : Armenian
=item {an} : Aragonese
=item [{art} : Artificial (Other)]
=item {ast} : Asturian
eq Bable.
=item {as} : Assamese
=item [{ath} : Athapascan languages]
eq Athabaskan. eq Athapaskan. eq Athabascan.
=item [{aus} : Australian languages]
=item [{map} : Austronesian (Other)]
=item {av} : Avaric
(Formerly "ava".)
=item {ae} : Avestan
eq Zend
=item {awa} : Awadhi
=item {ay} : Aymara
=item {az} : Azerbaijani
eq Azeri
Notable forms:
{az-Arab} Azerbaijani in Arabic script;
{az-Cyrl} Azerbaijani in Cyrillic script;
{az-Latn} Azerbaijani in Latin script.
=item {ban} : Balinese
=item [{bat} : Baltic (Other)]
=item {bal} : Baluchi
=item {bm} : Bambara
(Formerly "bam".)
=item [{bai} : Bamileke languages]
=item {bad} : Banda
=item [{bnt} : Bantu (Other)]
=item {bas} : Basa
=item {ba} : Bashkir
=item {eu} : Basque
=item {btk} : Batak (Indonesia)
=item {bej} : Beja
=item {be} : Belarusian
eq Belarussian. eq Byelarussian.
eq Belorussian. eq Byelorussian.
eq White Russian. eq White Ruthenian.
NOT Ruthenian!
=item {bem} : Bemba
=item {bn} : Bengali
eq Bangla.
=item [{ber} : Berber (Other)]
=item {bho} : Bhojpuri
=item {bh} : Bihari
=item {bik} : Bikol
=item {bin} : Bini
=item {bi} : Bislama
eq Bichelamar.
=item {bs} : Bosnian
=item {bra} : Braj
=item {br} : Breton
=item {bug} : Buginese
=item {bg} : Bulgarian
=item {i-bnn} : Bunun
=item {bua} : Buriat
=item {my} : Burmese
=item {cad} : Caddo
=item {car} : Carib
=item {ca} : Catalan
eq CatalE<aacute>n. eq Catalonian.
=item [{cau} : Caucasian (Other)]
=item {ceb} : Cebuano
=item [{cel} : Celtic (Other)]
Notable forms:
{cel-gaulish} Gaulish (Historical)
=item [{cai} : Central American Indian (Other)]
=item {chg} : Chagatai
(Historical?)
=item [{cmc} : Chamic languages]
=item {ch} : Chamorro
=item {ce} : Chechen
=item {chr} : Cherokee
eq Tsalagi
=item {chy} : Cheyenne
=item {chb} : Chibcha
(Historical) NOT Chibchan (which is a language family).
=item {ny} : Chichewa
eq Nyanja. eq Chinyanja.
=item {zh} : Chinese
Many forms are mutually un-intelligible in spoken media.
Notable forms:
{zh-Hans} Chinese, in simplified script;
{zh-Hant} Chinese, in traditional script;
{zh-tw} Taiwan Chinese;
{zh-cn} PRC Chinese;
{zh-sg} Singapore Chinese;
{zh-mo} Macau Chinese;
{zh-hk} Hong Kong Chinese;
{zh-guoyu} Mandarin [Putonghua/Guoyu];
{zh-hakka} Hakka [formerly "i-hakka"];
{zh-min} Hokkien;
{zh-min-nan} Southern Hokkien;
{zh-wuu} Shanghaiese;
{zh-xiang} Hunanese;
{zh-gan} Gan;
{zh-yue} Cantonese.
=for etc
{i-hakka} Hakka (old tag)
=item {chn} : Chinook Jargon
eq Chinook Wawa.
=item {chp} : Chipewyan
=item {cho} : Choctaw
=item {cu} : Church Slavic
eq Old Church Slavonic.
=item {chk} : Chuukese
eq Trukese. eq Chuuk. eq Truk. eq Ruk.
=item {cv} : Chuvash
=item {cop} : Coptic
=item {kw} : Cornish
=item {co} : Corsican
eq Corse.
=item {cr} : Cree
NOT Creek! (Formerly "cre".)
=item {mus} : Creek
NOT Cree!
=item [{cpe} : English-based Creoles and pidgins (Other)]
=item [{cpf} : French-based Creoles and pidgins (Other)]
=item [{cpp} : Portuguese-based Creoles and pidgins (Other)]
=item [{crp} : Creoles and pidgins (Other)]
=item {hr} : Croatian
eq Croat.
=item [{cus} : Cushitic (Other)]
=item {cs} : Czech
=item {dak} : Dakota
eq Nakota. eq Latoka.
=item {da} : Danish
=item {dar} : Dargwa
=item {day} : Dayak
=item {i-default} : Default (Fallthru) Language
Defined in RFC 2277, this is for tagging text
(which must include English text, and might/should include text
in other appropriate languages) that is emitted in a context
where language-negotiation wasn't possible -- in SMTP mail failure
messages, for example.
=item {del} : Delaware
=item {din} : Dinka
=item {dv} : Divehi
eq Maldivian. (Formerly "div".)
=item {doi} : Dogri
NOT Dogrib!
=item {dgr} : Dogrib
NOT Dogri!
=item [{dra} : Dravidian (Other)]
=item {dua} : Duala
=item {nl} : Dutch
eq Netherlander. Notable forms:
{nl-nl} Netherlands Dutch;
{nl-be} Belgian Dutch.
=item {dum} : Middle Dutch (ca.1050-1350)
(Historical)
=item {dyu} : Dyula
=item {dz} : Dzongkha
=item {efi} : Efik
=item {egy} : Ancient Egyptian
(Historical)
=item {eka} : Ekajuk
=item {elx} : Elamite
(Historical)
=item {en} : English
Notable forms:
{en-au} Australian English;
{en-bz} Belize English;
{en-ca} Canadian English;
{en-gb} UK English;
{en-ie} Irish English;
{en-jm} Jamaican English;
{en-nz} New Zealand English;
{en-ph} Philippine English;
{en-tt} Trinidad English;
{en-us} US English;
{en-za} South African English;
{en-zw} Zimbabwe English.
=item {enm} : Old English (1100-1500)
(Historical)
=item {ang} : Old English (ca.450-1100)
eq Anglo-Saxon. (Historical)
=item {i-enochian} : Enochian (Artificial)
=item {myv} : Erzya
=item {eo} : Esperanto
(Artificial)
=item {et} : Estonian
=item {ee} : Ewe
(Formerly "ewe".)
=item {ewo} : Ewondo
=item {fan} : Fang
=item {fat} : Fanti
=item {fo} : Faroese
=item {fj} : Fijian
=item {fi} : Finnish
=item [{fiu} : Finno-Ugrian (Other)]
eq Finno-Ugric. NOT Ugaritic!
=item {fon} : Fon
=item {fr} : French
Notable forms:
{fr-fr} France French;
{fr-be} Belgian French;
{fr-ca} Canadian French;
{fr-ch} Swiss French;
{fr-lu} Luxembourg French;
{fr-mc} Monaco French.
=item {frm} : Middle French (ca.1400-1600)
(Historical)
=item {fro} : Old French (842-ca.1400)
(Historical)
=item {fy} : Frisian
=item {fur} : Friulian
=item {ff} : Fulah
(Formerly "ful".)
=item {gaa} : Ga
=item {gd} : Scots Gaelic
NOT Scots!
=item {gl} : Gallegan
eq Galician
=item {lg} : Ganda
(Formerly "lug".)
=item {gay} : Gayo
=item {gba} : Gbaya
=item {gez} : Geez
eq Ge'ez
=item {ka} : Georgian
=item {de} : German
Notable forms:
{de-at} Austrian German;
{de-be} Belgian German;
{de-ch} Swiss German;
{de-de} Germany German;
{de-li} Liechtenstein German;
{de-lu} Luxembourg German.
=item {gmh} : Middle High German (ca.1050-1500)
(Historical)
=item {goh} : Old High German (ca.750-1050)
(Historical)
=item [{gem} : Germanic (Other)]
=item {gil} : Gilbertese
=item {gon} : Gondi
=item {gor} : Gorontalo
=item {got} : Gothic
(Historical)
=item {grb} : Grebo
=item {grc} : Ancient Greek
(Historical) (Until 15th century or so.)
=item {el} : Modern Greek
(Since 15th century or so.)
=item {gn} : Guarani
GuaranE<iacute>
=item {gu} : Gujarati
=item {gwi} : Gwich'in
eq Gwichin
=item {hai} : Haida
=item {ht} : Haitian
eq Haitian Creole
=item {ha} : Hausa
=item {haw} : Hawaiian
Hawai'ian
=item {he} : Hebrew
(Formerly "iw".)
=for etc
{iw} Hebrew (old tag)
=item {hz} : Herero
=item {hil} : Hiligaynon
=item {him} : Himachali
=item {hi} : Hindi
=item {ho} : Hiri Motu
=item {hit} : Hittite
(Historical)
=item {hmn} : Hmong
=item {hu} : Hungarian
=item {hup} : Hupa
=item {iba} : Iban
=item {is} : Icelandic
=item {io} : Ido
(Artificial)
=item {ig} : Igbo
(Formerly "ibo".)
=item {ijo} : Ijo
=item {ilo} : Iloko
=item [{inc} : Indic (Other)]
=item [{ine} : Indo-European (Other)]
=item {id} : Indonesian
(Formerly "in".)
=for etc
{in} Indonesian (old tag)
=item {inh} : Ingush
=item {ia} : Interlingua (International Auxiliary Language Association)
(Artificial) NOT Interlingue!
=item {ie} : Interlingue
(Artificial) NOT Interlingua!
=item {iu} : Inuktitut
A subform of "Eskimo".
=item {ik} : Inupiaq
A subform of "Eskimo".
=item [{ira} : Iranian (Other)]
=item {ga} : Irish
=item {mga} : Middle Irish (900-1200)
(Historical)
=item {sga} : Old Irish (to 900)
(Historical)
=item [{iro} : Iroquoian languages]
=item {it} : Italian
Notable forms:
{it-it} Italy Italian;
{it-ch} Swiss Italian.
=item {ja} : Japanese
(NOT "jp"!)
=item {jv} : Javanese
(Formerly "jw" because of a typo.)
=item {jrb} : Judeo-Arabic
=item {jpr} : Judeo-Persian
=item {kbd} : Kabardian
=item {kab} : Kabyle
=item {kac} : Kachin
=item {kl} : Kalaallisut
eq Greenlandic "Eskimo"
=item {xal} : Kalmyk
=item {kam} : Kamba
=item {kn} : Kannada
eq Kanarese. NOT Canadian!
=item {kr} : Kanuri
(Formerly "kau".)
=item {krc} : Karachay-Balkar
=item {kaa} : Kara-Kalpak
=item {kar} : Karen
=item {ks} : Kashmiri
=item {csb} : Kashubian
eq Kashub
=item {kaw} : Kawi
=item {kk} : Kazakh
=item {kha} : Khasi
=item {km} : Khmer
eq Cambodian. eq Kampuchean.
=item [{khi} : Khoisan (Other)]
=item {kho} : Khotanese
=item {ki} : Kikuyu
eq Gikuyu.
=item {kmb} : Kimbundu
=item {rw} : Kinyarwanda
=item {ky} : Kirghiz
=item {i-klingon} : Klingon
=item {kv} : Komi
=item {kg} : Kongo
(Formerly "kon".)
=item {kok} : Konkani
=item {ko} : Korean
=item {kos} : Kosraean
=item {kpe} : Kpelle
=item {kro} : Kru
=item {kj} : Kuanyama
=item {kum} : Kumyk
=item {ku} : Kurdish
=item {kru} : Kurukh
=item {kut} : Kutenai
=item {lad} : Ladino
eq Judeo-Spanish. NOT Ladin (a minority language in Italy).
=item {lah} : Lahnda
NOT Lamba!
=item {lam} : Lamba
NOT Lahnda!
=item {lo} : Lao
eq Laotian.
=item {la} : Latin
(Historical) NOT Ladin! NOT Ladino!
=item {lv} : Latvian
eq Lettish.
=item {lb} : Letzeburgesch
eq Luxemburgian, eq Luxemburger. (Formerly "i-lux".)
=for etc
{i-lux} Letzeburgesch (old tag)
=item {lez} : Lezghian
=item {li} : Limburgish
eq Limburger, eq Limburgan. NOT Letzeburgesch!
=item {ln} : Lingala
=item {lt} : Lithuanian
=item {nds} : Low German
eq Low Saxon. eq Low German. eq Low Saxon.
=item {art-lojban} : Lojban (Artificial)
=item {loz} : Lozi
=item {lu} : Luba-Katanga
(Formerly "lub".)
=item {lua} : Luba-Lulua
=item {lui} : Luiseno
eq LuiseE<ntilde>o.
=item {lun} : Lunda
=item {luo} : Luo (Kenya and Tanzania)
=item {lus} : Lushai
=item {mk} : Macedonian
eq the modern Slavic language spoken in what was Yugoslavia.
NOT the form of Greek spoken in Greek Macedonia!
=item {mad} : Madurese
=item {mag} : Magahi
=item {mai} : Maithili
=item {mak} : Makasar
=item {mg} : Malagasy
=item {ms} : Malay
NOT Malayalam!
=item {ml} : Malayalam
NOT Malay!
=item {mt} : Maltese
=item {mnc} : Manchu
=item {mdr} : Mandar
NOT Mandarin!
=item {man} : Mandingo
=item {mni} : Manipuri
eq Meithei.
=item [{mno} : Manobo languages]
=item {gv} : Manx
=item {mi} : Maori
NOT Mari!
=item {mr} : Marathi
=item {chm} : Mari
NOT Maori!
=item {mh} : Marshall
eq Marshallese.
=item {mwr} : Marwari
=item {mas} : Masai
=item [{myn} : Mayan languages]
=item {men} : Mende
=item {mic} : Micmac
=item {min} : Minangkabau
=item {i-mingo} : Mingo
eq the Irquoian language West Virginia Seneca. NOT New York Seneca!
=item [{mis} : Miscellaneous languages]
Don't use this.
=item {moh} : Mohawk
=item {mdf} : Moksha
=item {mo} : Moldavian
eq Moldovan.
=item [{mkh} : Mon-Khmer (Other)]
=item {lol} : Mongo
=item {mn} : Mongolian
eq Mongol.
=item {mos} : Mossi
=item [{mul} : Multiple languages]
Not for normal use.
=item [{mun} : Munda languages]
=item {nah} : Nahuatl
=item {nap} : Neapolitan
=item {na} : Nauru
=item {nv} : Navajo
eq Navaho. (Formerly "i-navajo".)
=for etc
{i-navajo} Navajo (old tag)
=item {nd} : North Ndebele
=item {nr} : South Ndebele
=item {ng} : Ndonga
=item {ne} : Nepali
eq Nepalese. Notable forms:
{ne-np} Nepal Nepali;
{ne-in} India Nepali.
=item {new} : Newari
=item {nia} : Nias
=item [{nic} : Niger-Kordofanian (Other)]
=item [{ssa} : Nilo-Saharan (Other)]
=item {niu} : Niuean
=item {nog} : Nogai
=item {non} : Old Norse
(Historical)
=item [{nai} : North American Indian]
Do not use this.
=item {no} : Norwegian
Note the two following forms:
=item {nb} : Norwegian Bokmal
eq BokmE<aring>l, (A form of Norwegian.) (Formerly "no-bok".)
=for etc
{no-bok} Norwegian Bokmal (old tag)
=item {nn} : Norwegian Nynorsk
(A form of Norwegian.) (Formerly "no-nyn".)
=for etc
{no-nyn} Norwegian Nynorsk (old tag)
=item [{nub} : Nubian languages]
=item {nym} : Nyamwezi
=item {nyn} : Nyankole
=item {nyo} : Nyoro
=item {nzi} : Nzima
=item {oc} : Occitan (post 1500)
eq ProvenE<ccedil>al, eq Provencal
=item {oj} : Ojibwa
eq Ojibwe. (Formerly "oji".)
=item {or} : Oriya
=item {om} : Oromo
=item {osa} : Osage
=item {os} : Ossetian; Ossetic
=item [{oto} : Otomian languages]
Group of languages collectively called "OtomE<iacute>".
=item {pal} : Pahlavi
eq Pahlevi
=item {i-pwn} : Paiwan
eq Pariwan
=item {pau} : Palauan
=item {pi} : Pali
(Historical?)
=item {pam} : Pampanga
=item {pag} : Pangasinan
=item {pa} : Panjabi
eq Punjabi
=item {pap} : Papiamento
eq Papiamentu.
=item [{paa} : Papuan (Other)]
=item {fa} : Persian
eq Farsi. eq Iranian.
=item {peo} : Old Persian (ca.600-400 B.C.)
=item [{phi} : Philippine (Other)]
=item {phn} : Phoenician
(Historical)
=item {pon} : Pohnpeian
NOT Pompeiian!
=item {pl} : Polish
=item {pt} : Portuguese
eq Portugese. Notable forms:
{pt-pt} Portugal Portuguese;
{pt-br} Brazilian Portuguese.
=item [{pra} : Prakrit languages]
=item {pro} : Old Provencal (to 1500)
eq Old ProvenE<ccedil>al. (Historical.)
=item {ps} : Pushto
eq Pashto. eq Pushtu.
=item {qu} : Quechua
eq Quecha.
=item {rm} : Raeto-Romance
eq Romansh.
=item {raj} : Rajasthani
=item {rap} : Rapanui
=item {rar} : Rarotongan
=item [{qaa - qtz} : Reserved for local use.]
=item [{roa} : Romance (Other)]
NOT Romanian! NOT Romany! NOT Romansh!
=item {ro} : Romanian
eq Rumanian. NOT Romany!
=item {rom} : Romany
eq Rom. NOT Romanian!
=item {rn} : Rundi
=item {ru} : Russian
NOT White Russian! NOT Rusyn!
=item [{sal} : Salishan languages]
Large language group.
=item {sam} : Samaritan Aramaic
NOT Aramaic!
=item {se} : Northern Sami
eq Lappish. eq Lapp. eq (Northern) Saami.
=item {sma} : Southern Sami
=item {smn} : Inari Sami
=item {smj} : Lule Sami
=item {sms} : Skolt Sami
=item [{smi} : Sami languages (Other)]
=item {sm} : Samoan
=item {sad} : Sandawe
=item {sg} : Sango
=item {sa} : Sanskrit
(Historical)
=item {sat} : Santali
=item {sc} : Sardinian
eq Sard.
=item {sas} : Sasak
=item {sco} : Scots
NOT Scots Gaelic!
=item {sel} : Selkup
=item [{sem} : Semitic (Other)]
=item {sr} : Serbian
eq Serb. NOT Sorbian.
Notable forms:
{sr-Cyrl} : Serbian in Cyrillic script;
{sr-Latn} : Serbian in Latin script.
=item {srr} : Serer
=item {shn} : Shan
=item {sn} : Shona
=item {sid} : Sidamo
=item {sgn-...} : Sign Languages
Always use with a subtag. Notable forms:
{sgn-gb} British Sign Language (BSL);
{sgn-ie} Irish Sign Language (ESL);
{sgn-ni} Nicaraguan Sign Language (ISN);
{sgn-us} American Sign Language (ASL).
(And so on with other country codes as the subtag.)
=item {bla} : Siksika
eq Blackfoot. eq Pikanii.
=item {sd} : Sindhi
=item {si} : Sinhalese
eq Sinhala.
=item [{sit} : Sino-Tibetan (Other)]
=item [{sio} : Siouan languages]
=item {den} : Slave (Athapascan)
("Slavey" is a subform.)
=item [{sla} : Slavic (Other)]
=item {sk} : Slovak
eq Slovakian.
=item {sl} : Slovenian
eq Slovene.
=item {sog} : Sogdian
=item {so} : Somali
=item {son} : Songhai
=item {snk} : Soninke
=item {wen} : Sorbian languages
eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian!
=item {nso} : Northern Sotho
=item {st} : Southern Sotho
eq Sutu. eq Sesotho.
=item [{sai} : South American Indian (Other)]
=item {es} : Spanish
Notable forms:
{es-ar} Argentine Spanish;
{es-bo} Bolivian Spanish;
{es-cl} Chilean Spanish;
{es-co} Colombian Spanish;
{es-do} Dominican Spanish;
{es-ec} Ecuadorian Spanish;
{es-es} Spain Spanish;
{es-gt} Guatemalan Spanish;
{es-hn} Honduran Spanish;
{es-mx} Mexican Spanish;
{es-pa} Panamanian Spanish;
{es-pe} Peruvian Spanish;
{es-pr} Puerto Rican Spanish;
{es-py} Paraguay Spanish;
{es-sv} Salvadoran Spanish;
{es-us} US Spanish;
{es-uy} Uruguayan Spanish;
{es-ve} Venezuelan Spanish.
=item {suk} : Sukuma
=item {sux} : Sumerian
(Historical)
=item {su} : Sundanese
=item {sus} : Susu
=item {sw} : Swahili
eq Kiswahili
=item {ss} : Swati
=item {sv} : Swedish
Notable forms:
{sv-se} Sweden Swedish;
{sv-fi} Finland Swedish.
=item {syr} : Syriac
=item {tl} : Tagalog
=item {ty} : Tahitian
=item [{tai} : Tai (Other)]
NOT Thai!
=item {tg} : Tajik
=item {tmh} : Tamashek
=item {ta} : Tamil
=item {i-tao} : Tao
eq Yami.
=item {tt} : Tatar
=item {i-tay} : Tayal
eq Atayal. eq Atayan.
=item {te} : Telugu
=item {ter} : Tereno
=item {tet} : Tetum
=item {th} : Thai
NOT Tai!
=item {bo} : Tibetan
=item {tig} : Tigre
=item {ti} : Tigrinya
=item {tem} : Timne
eq Themne. eq Timene.
=item {tiv} : Tiv
=item {tli} : Tlingit
=item {tpi} : Tok Pisin
=item {tkl} : Tokelau
=item {tog} : Tonga (Nyasa)
NOT Tsonga!
=item {to} : Tonga (Tonga Islands)
(Pronounced "Tong-a", not "Tong-ga")
NOT Tsonga!
=item {tsi} : Tsimshian
eq Sm'algyax
=item {ts} : Tsonga
NOT Tonga!
=item {i-tsu} : Tsou
=item {tn} : Tswana
Same as Setswana.
=item {tum} : Tumbuka
=item [{tup} : Tupi languages]
=item {tr} : Turkish
(Typically in Roman script)
=item {ota} : Ottoman Turkish (1500-1928)
(Typically in Arabic script) (Historical)
=item {crh} : Crimean Turkish
eq Crimean Tatar
=item {tk} : Turkmen
eq Turkmeni.
=item {tvl} : Tuvalu
=item {tyv} : Tuvinian
eq Tuvan. eq Tuvin.
=item {tw} : Twi
=item {udm} : Udmurt
=item {uga} : Ugaritic
NOT Ugric!
=item {ug} : Uighur
=item {uk} : Ukrainian
=item {umb} : Umbundu
=item {und} : Undetermined
Not a tag for normal use.
=item {ur} : Urdu
=item {uz} : Uzbek
eq E<Ouml>zbek
Notable forms:
{uz-Cyrl} Uzbek in Cyrillic script;
{uz-Latn} Uzbek in Latin script.
=item {vai} : Vai
=item {ve} : Venda
NOT Wendish! NOT Wend! NOT Avestan! (Formerly "ven".)
=item {vi} : Vietnamese
eq Viet.
=item {vo} : Volapuk
eq VolapE<uuml>k. (Artificial)
=item {vot} : Votic
eq Votian. eq Vod.
=item [{wak} : Wakashan languages]
=item {wa} : Walloon
=item {wal} : Walamo
eq Wolaytta.
=item {war} : Waray
Presumably the Philippine language Waray-Waray (SamareE<ntilde>o),
not the smaller Philippine language Waray Sorsogon, nor the extinct
Australian language Waray.
=item {was} : Washo
eq Washoe
=item {cy} : Welsh
=item {wo} : Wolof
=item {x-...} : Unregistered (Semi-Private Use)
"x-" is a prefix for language tags that are not registered with ISO
or IANA. Example, x-double-dutch
=item {xh} : Xhosa
=item {sah} : Yakut
=item {yao} : Yao
(The Yao in Malawi?)
=item {yap} : Yapese
eq Yap
=item {ii} : Sichuan Yi
=item {yi} : Yiddish
Formerly "ji". Usually in Hebrew script.
Notable forms:
{yi-latn} Yiddish in Latin script
=item {yo} : Yoruba
=item [{ypk} : Yupik languages]
Several "Eskimo" languages.
=item {znd} : Zande
=item [{zap} : Zapotec]
(A group of languages.)
=item {zen} : Zenaga
NOT Zend.
=item {za} : Zhuang
=item {zu} : Zulu
=item {zun} : Zuni
eq ZuE<ntilde>i
=back
=for woohah END
=head1 SEE ALSO
L<I18N::LangTags|I18N::LangTags> and its "See Also" section.
=head1 COPYRIGHT AND DISCLAIMER
Copyright (c) 2001+ Sean M. Burke. All rights reserved.
You can redistribute and/or
modify this document under the same terms as Perl itself.
This document is provided in the hope that it will be
useful, but without any warranty;
without even the implied warranty of accuracy, authoritativeness,
completeness, merchantability, or fitness for a particular purpose.
Email any corrections or questions to me.
=head1 AUTHOR
Sean M. Burke, sburkeE<64>cpan.org
=cut
# To generate a list of just the two and three-letter codes:
#!/usr/local/bin/perl -w
require 5; # Time-stamp: "2001-03-13 21:53:39 MST"
# Sean M. Burke, sburke@cpan.org
# This program is for generating the language_codes.txt file
use strict;
use LWP::Simple;
use HTML::TreeBuilder 3.10;
my $root = HTML::TreeBuilder->new();
my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html';
$root->parse(get($url) || die "Can't get $url");
$root->eof();
my @codes;
foreach my $tr ($root->find_by_tag_name('tr')) {
my @f = map $_->as_text(), $tr->content_list();
#print map("<$_> ", @f), "\n";
next unless @f == 5;
pop @f; # nix the French name
next if $f[-1] eq 'Language Name (English)'; # it's a header line
my $xx = splice(@f, 2,1); # pull out the two-letter code
$f[-1] =~ s/^\s+//;
$f[-1] =~ s/\s+$//;
if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it
push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ];
} else { # print the three-letter codes.
if($f[0] eq $f[1]) {
push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ];
} else { # shouldn't happen
push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ];
}
}
}
print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes;
print "[ based on $url\n at ", scalar(localtime), "]\n",
"[Note: doesn't include IANA-registered codes.]\n";
exit;
__END__
PK �6�Z��W�� � LangTags/Detect.pmnu �[���
# Time-stamp: "2004-06-20 21:47:55 ADT"
require 5;
package I18N::LangTags::Detect;
use strict;
use vars qw( @ISA $VERSION $MATCH_SUPERS $USING_LANGUAGE_TAGS
$USE_LITERALS $MATCH_SUPERS_TIGHTLY);
BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } }
# define the constant 'DEBUG' at compile-time
$VERSION = "1.06";
@ISA = ();
use I18N::LangTags qw(alternate_language_tags locale2language_tag);
sub _uniq { my %seen; return grep(!($seen{$_}++), @_); }
sub _normalize {
my(@languages) =
map lc($_),
grep $_,
map {; $_, alternate_language_tags($_) } @_;
return _uniq(@languages) if wantarray;
return $languages[0];
}
#---------------------------------------------------------------------------
# The extent of our functional interface:
sub detect () { return __PACKAGE__->ambient_langprefs; }
#===========================================================================
sub ambient_langprefs { # always returns things untainted
my $base_class = $_[0];
return $base_class->http_accept_langs
if length( $ENV{'REQUEST_METHOD'} || '' ); # I'm a CGI
# it's off in its own routine because it's complicated
# Not running as a CGI: try to puzzle out from the environment
my @languages;
foreach my $envname (qw( LANGUAGE LC_ALL LC_MESSAGES LANG )) {
next unless $ENV{$envname};
DEBUG and print "Noting \$$envname: $ENV{$envname}\n";
push @languages,
map locale2language_tag($_),
# if it's a lg tag, fine, pass thru (untainted)
# if it's a locale ID, try converting to a lg tag (untainted),
# otherwise nix it.
split m/[,:]/,
$ENV{$envname}
;
last; # first one wins
}
if($ENV{'IGNORE_WIN32_LOCALE'}) {
# no-op
} elsif(&_try_use('Win32::Locale')) {
# If we have that module installed...
push @languages, Win32::Locale::get_language() || ''
if defined &Win32::Locale::get_language;
}
return _normalize @languages;
}
#---------------------------------------------------------------------------
sub http_accept_langs {
# Deal with HTTP "Accept-Language:" stuff. Hassle.
# This code is more lenient than RFC 3282, which you must read.
# Hm. Should I just move this into I18N::LangTags at some point?
no integer;
my $in = (@_ > 1) ? $_[1] : $ENV{'HTTP_ACCEPT_LANGUAGE'};
# (always ends up untainting)
return() unless defined $in and length $in;
$in =~ s/\([^\)]*\)//g; # nix just about any comment
if( $in =~ m/^\s*([a-zA-Z][-a-zA-Z]+)\s*$/s ) {
# Very common case: just one language tag
return _normalize $1;
} elsif( $in =~ m/^\s*[a-zA-Z][-a-zA-Z]+(?:\s*,\s*[a-zA-Z][-a-zA-Z]+)*\s*$/s ) {
# Common case these days: just "foo, bar, baz"
return _normalize( $in =~ m/([a-zA-Z][-a-zA-Z]+)/g );
}
# Else it's complicated...
$in =~ s/\s+//g; # Yes, we can just do without the WS!
my @in = $in =~ m/([^,]+)/g;
my %pref;
my $q;
foreach my $tag (@in) {
next unless $tag =~
m/^([a-zA-Z][-a-zA-Z]+)
(?:
;q=
(
\d* # a bit too broad of a RE, but so what.
(?:
\.\d+
)?
)
)?
$
/sx
;
$q = (defined $2 and length $2) ? $2 : 1;
#print "$1 with q=$q\n";
push @{ $pref{$q} }, lc $1;
}
return _normalize(
# Read off %pref, in descending key order...
map @{$pref{$_}},
sort {$b <=> $a}
keys %pref
);
}
#===========================================================================
my %tried = ();
# memoization of whether we've used this module, or found it unusable.
sub _try_use { # Basically a wrapper around "require Modulename"
# "Many men have tried..." "They tried and failed?" "They tried and died."
return $tried{$_[0]} if exists $tried{$_[0]}; # memoization
my $module = $_[0]; # ASSUME sane module name!
{ no strict 'refs';
no warnings 'once';
return($tried{$module} = 1)
if %{$module . "::Lexicon"} or @{$module . "::ISA"};
# weird case: we never use'd it, but there it is!
}
print " About to use $module ...\n" if DEBUG;
{
local $SIG{'__DIE__'};
local @INC = @INC;
pop @INC if $INC[-1] eq '.';
eval "require $module"; # used to be "use $module", but no point in that.
}
if($@) {
print "Error using $module \: $@\n" if DEBUG > 1;
return $tried{$module} = 0;
} else {
print " OK, $module is used\n" if DEBUG;
return $tried{$module} = 1;
}
}
#---------------------------------------------------------------------------
1;
__END__
=head1 NAME
I18N::LangTags::Detect - detect the user's language preferences
=head1 SYNOPSIS
use I18N::LangTags::Detect;
my @user_wants = I18N::LangTags::Detect::detect();
=head1 DESCRIPTION
It is a common problem to want to detect what language(s) the user would
prefer output in.
=head1 FUNCTIONS
This module defines one public function,
C<I18N::LangTags::Detect::detect()>. This function is not exported
(nor is even exportable), and it takes no parameters.
In scalar context, the function returns the most preferred language
tag (or undef if no preference was seen).
In list context (which is usually what you want),
the function returns a
(possibly empty) list of language tags representing (best first) what
languages the user apparently would accept output in. You will
probably want to pass the output of this through
C<I18N::LangTags::implicate_supers_tightly(...)>
or
C<I18N::LangTags::implicate_supers(...)>, like so:
my @languages =
I18N::LangTags::implicate_supers_tightly(
I18N::LangTags::Detect::detect()
);
=head1 ENVIRONMENT
This module looks for several environment variables, including
REQUEST_METHOD, HTTP_ACCEPT_LANGUAGE,
LANGUAGE, LC_ALL, LC_MESSAGES, and LANG.
It will also use the L<Win32::Locale> module, if it's installed.
=head1 SEE ALSO
L<I18N::LangTags>, L<Win32::Locale>, L<Locale::Maketext>.
(This module's core code started out as a routine in Locale::Maketext;
but I moved it here once I realized it was more generally useful.)
=head1 COPYRIGHT
Copyright (c) 1998-2004 Sean M. Burke. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
The programs and documentation in this dist are distributed in
the hope that they will be useful, but without any warranty; without
even the implied warranty of merchantability or fitness for a
particular purpose.
=head1 AUTHOR
Sean M. Burke C<sburke@cpan.org>
=cut
# a tip: Put a bit of chopped up pickled ginger in your salad. It's tasty!
PK �6�Z
�C�m �m LangTags.pmnu �[���
# Time-stamp: "2004-10-06 23:26:33 ADT"
# Sean M. Burke <sburke@cpan.org>
require 5.000;
package I18N::LangTags;
use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(is_language_tag same_language_tag
extract_language_tags super_languages
similarity_language_tag is_dialect_of
locale2language_tag alternate_language_tags
encode_language_tag panic_languages
implicate_supers
implicate_supers_strictly
);
%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
$VERSION = "0.42";
sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
=head1 NAME
I18N::LangTags - functions for dealing with RFC3066-style language tags
=head1 SYNOPSIS
use I18N::LangTags();
...or specify whichever of those functions you want to import, like so:
use I18N::LangTags qw(implicate_supers similarity_language_tag);
All the exportable functions are listed below -- you're free to import
only some, or none at all. By default, none are imported. If you
say:
use I18N::LangTags qw(:ALL)
...then all are exported. (This saves you from having to use
something less obvious like C<use I18N::LangTags qw(/./)>.)
If you don't import any of these functions, assume a C<&I18N::LangTags::>
in front of all the function names in the following examples.
=head1 DESCRIPTION
Language tags are a formalism, described in RFC 3066 (obsoleting
1766), for declaring what language form (language and possibly
dialect) a given chunk of information is in.
This library provides functions for common tasks involving language
tags as they are needed in a variety of protocols and applications.
Please see the "See Also" references for a thorough explanation
of how to correctly use language tags.
=over
=cut
###########################################################################
=item * the function is_language_tag($lang1)
Returns true iff $lang1 is a formally valid language tag.
is_language_tag("fr") is TRUE
is_language_tag("x-jicarilla") is FALSE
(Subtags can be 8 chars long at most -- 'jicarilla' is 9)
is_language_tag("sgn-US") is TRUE
(That's American Sign Language)
is_language_tag("i-Klikitat") is TRUE
(True without regard to the fact noone has actually
registered Klikitat -- it's a formally valid tag)
is_language_tag("fr-patois") is TRUE
(Formally valid -- altho descriptively weak!)
is_language_tag("Spanish") is FALSE
is_language_tag("french-patois") is FALSE
(No good -- first subtag has to match
/^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
is_language_tag("x-borg-prot2532") is TRUE
(Yes, subtags can contain digits, as of RFC3066)
=cut
sub is_language_tag {
## Changes in the language tagging standards may have to be reflected here.
my($tag) = lc($_[0]);
return 0 if $tag eq "i" or $tag eq "x";
# Bad degenerate cases that the following
# regexp would erroneously let pass
return $tag =~
/^(?: # First subtag
[xi] | [a-z]{2,3}
)
(?: # Subtags thereafter
- # separator
[a-z0-9]{1,8} # subtag
)*
$/xs ? 1 : 0;
}
###########################################################################
=item * the function extract_language_tags($whatever)
Returns a list of whatever looks like formally valid language tags
in $whatever. Not very smart, so don't get too creative with
what you want to feed it.
extract_language_tags("fr, fr-ca, i-mingo")
returns: ('fr', 'fr-ca', 'i-mingo')
extract_language_tags("It's like this: I'm in fr -- French!")
returns: ('It', 'in', 'fr')
(So don't just feed it any old thing.)
The output is untainted. If you don't know what tainting is,
don't worry about it.
=cut
sub extract_language_tags {
## Changes in the language tagging standards may have to be reflected here.
my($text) =
$_[0] =~ m/(.+)/ # to make for an untainted result
? $1 : ''
;
return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
$text =~
m/
\b
(?: # First subtag
[iIxX] | [a-zA-Z]{2,3}
)
(?: # Subtags thereafter
- # separator
[a-zA-Z0-9]{1,8} # subtag
)*
\b
/xsg
);
}
###########################################################################
=item * the function same_language_tag($lang1, $lang2)
Returns true iff $lang1 and $lang2 are acceptable variant tags
representing the same language-form.
same_language_tag('x-kadara', 'i-kadara') is TRUE
(The x/i- alternation doesn't matter)
same_language_tag('X-KADARA', 'i-kadara') is TRUE
(...and neither does case)
same_language_tag('en', 'en-US') is FALSE
(all-English is not the SAME as US English)
same_language_tag('x-kadara', 'x-kadar') is FALSE
(these are totally unrelated tags)
same_language_tag('no-bok', 'nb') is TRUE
(no-bok is a legacy tag for nb (Norwegian Bokmal))
C<same_language_tag> works by just seeing whether
C<encode_language_tag($lang1)> is the same as
C<encode_language_tag($lang2)>.
(Yes, I know this function is named a bit oddly. Call it historic
reasons.)
=cut
sub same_language_tag {
my $el1 = &encode_language_tag($_[0]);
return 0 unless defined $el1;
# this avoids the problem of
# encode_language_tag($lang1) eq and encode_language_tag($lang2)
# being true if $lang1 and $lang2 are both undef
return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
}
###########################################################################
=item * the function similarity_language_tag($lang1, $lang2)
Returns an integer representing the degree of similarity between
tags $lang1 and $lang2 (the order of which does not matter), where
similarity is the number of common elements on the left,
without regard to case and to x/i- alternation.
similarity_language_tag('fr', 'fr-ca') is 1
(one element in common)
similarity_language_tag('fr-ca', 'fr-FR') is 1
(one element in common)
similarity_language_tag('fr-CA-joual',
'fr-CA-PEI') is 2
similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
(two elements in common)
similarity_language_tag('x-kadara', 'i-kadara') is 1
(x/i- doesn't matter)
similarity_language_tag('en', 'x-kadar') is 0
similarity_language_tag('x-kadara', 'x-kadar') is 0
(unrelated tags -- no similarity)
similarity_language_tag('i-cree-syllabic',
'i-cherokee-syllabic') is 0
(no B<leftmost> elements in common!)
=cut
sub similarity_language_tag {
my $lang1 = &encode_language_tag($_[0]);
my $lang2 = &encode_language_tag($_[1]);
# And encode_language_tag takes care of the whole
# no-nyn==nn, i-hakka==zh-hakka, etc, things
# NB: (i-sil-...)? (i-sgn-...)?
return undef if !defined($lang1) and !defined($lang2);
return 0 if !defined($lang1) or !defined($lang2);
my @l1_subtags = split('-', $lang1);
my @l2_subtags = split('-', $lang2);
my $similarity = 0;
while(@l1_subtags and @l2_subtags) {
if(shift(@l1_subtags) eq shift(@l2_subtags)) {
++$similarity;
} else {
last;
}
}
return $similarity;
}
###########################################################################
=item * the function is_dialect_of($lang1, $lang2)
Returns true iff language tag $lang1 represents a subform of
language tag $lang2.
B<Get the order right! It doesn't work the other way around!>
is_dialect_of('en-US', 'en') is TRUE
(American English IS a dialect of all-English)
is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
is_dialect_of('fr-CA-joual', 'fr') is TRUE
(Joual is a dialect of (a dialect of) French)
is_dialect_of('en', 'en-US') is FALSE
(all-English is a NOT dialect of American English)
is_dialect_of('fr', 'en-CA') is FALSE
is_dialect_of('en', 'en' ) is TRUE
is_dialect_of('en-US', 'en-US') is TRUE
(B<Note:> these are degenerate cases)
is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
(the x/i thing doesn't matter, nor does case)
is_dialect_of('nn', 'no') is TRUE
(because 'nn' (New Norse) is aliased to 'no-nyn',
as a special legacy case, and 'no-nyn' is a
subform of 'no' (Norwegian))
=cut
sub is_dialect_of {
my $lang1 = &encode_language_tag($_[0]);
my $lang2 = &encode_language_tag($_[1]);
return undef if !defined($lang1) and !defined($lang2);
return 0 if !defined($lang1) or !defined($lang2);
return 1 if $lang1 eq $lang2;
return 0 if length($lang1) < length($lang2);
$lang1 .= '-';
$lang2 .= '-';
return
(substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
}
###########################################################################
=item * the function super_languages($lang1)
Returns a list of language tags that are superordinate tags to $lang1
-- it gets this by removing subtags from the end of $lang1 until
nothing (or just "i" or "x") is left.
super_languages("fr-CA-joual") is ("fr-CA", "fr")
super_languages("en-AU") is ("en")
super_languages("en") is empty-list, ()
super_languages("i-cherokee") is empty-list, ()
...not ("i"), which would be illegal as well as pointless.
If $lang1 is not a valid language tag, returns empty-list in
a list context, undef in a scalar context.
A notable and rather unavoidable problem with this method:
"x-mingo-tom" has an "x" because the whole tag isn't an
IANA-registered tag -- but super_languages('x-mingo-tom') is
('x-mingo') -- which isn't really right, since 'i-mingo' is
registered. But this module has no way of knowing that. (But note
that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
More importantly, you assume I<at your peril> that superordinates of
$lang1 are mutually intelligible with $lang1. Consider this
carefully.
=cut
sub super_languages {
my $lang1 = $_[0];
return() unless defined($lang1) && &is_language_tag($lang1);
# a hack for those annoying new (2001) tags:
$lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
$lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
$lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
# i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
my @l1_subtags = split('-', $lang1);
## Changes in the language tagging standards may have to be reflected here.
# NB: (i-sil-...)?
my @supers = ();
foreach my $bit (@l1_subtags) {
push @supers,
scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
}
pop @supers if @supers;
shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
return reverse @supers;
}
###########################################################################
=item * the function locale2language_tag($locale_identifier)
This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
and maps it to a language tag. If it's not mappable (as with,
notably, "C" and "POSIX"), this returns empty-list in a list context,
or undef in a scalar context.
locale2language_tag("en") is "en"
locale2language_tag("en_US") is "en-US"
locale2language_tag("en_US.ISO8859-1") is "en-US"
locale2language_tag("C") is undef or ()
locale2language_tag("POSIX") is undef or ()
locale2language_tag("POSIX") is undef or ()
I'm not totally sure that locale names map satisfactorily to language
tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
The output is untainted. If you don't know what tainting is,
don't worry about it.
=cut
sub locale2language_tag {
my $lang =
$_[0] =~ m/(.+)/ # to make for an untainted result
? $1 : ''
;
return $lang if &is_language_tag($lang); # like "en"
$lang =~ tr<_><->; # "en_US" -> en-US
$lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US
# it_IT.utf8@euro => it-IT
return $lang if &is_language_tag($lang);
return;
}
###########################################################################
=item * the function encode_language_tag($lang1)
This function, if given a language tag, returns an encoding of it such
that:
* tags representing different languages never get the same encoding.
* tags representing the same language always get the same encoding.
* an encoding of a formally valid language tag always is a string
value that is defined, has length, and is true if considered as a
boolean.
Note that the encoding itself is B<not> a formally valid language tag.
Note also that you cannot, currently, go from an encoding back to a
language tag that it's an encoding of.
Note also that you B<must> consider the encoded value as atomic; i.e.,
you should not consider it as anything but an opaque, unanalysable
string value. (The internals of the encoding method may change in
future versions, as the language tagging standard changes over time.)
C<encode_language_tag> returns undef if given anything other than a
formally valid language tag.
The reason C<encode_language_tag> exists is because different language
tags may represent the same language; this is normally treatable with
C<same_language_tag>, but consider this situation:
You have a data file that expresses greetings in different languages.
Its format is "[language tag]=[how to say 'Hello']", like:
en-US=Hiho
fr=Bonjour
i-mingo=Hau'
And suppose you write a program that reads that file and then runs as
a daemon, answering client requests that specify a language tag and
then expect the string that says how to greet in that language. So an
interaction looks like:
greeting-client asks: fr
greeting-server answers: Bonjour
So far so good. But suppose the way you're implementing this is:
my %greetings;
die unless open(IN, "<", "in.dat");
while(<IN>) {
chomp;
next unless /^([^=]+)=(.+)/s;
my($lang, $expr) = ($1, $2);
$greetings{$lang} = $expr;
}
close(IN);
at which point %greetings has the contents:
"en-US" => "Hiho"
"fr" => "Bonjour"
"i-mingo" => "Hau'"
And suppose then that you answer client requests for language $wanted
by just looking up $greetings{$wanted}.
If the client asks for "fr", that will look up successfully in
%greetings, to the value "Bonjour". And if the client asks for
"i-mingo", that will look up successfully in %greetings, to the value
"Hau'".
But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
lookup in %greetings fails. That's the Wrong Thing.
You could instead do lookups on $wanted with:
use I18N::LangTags qw(same_language_tag);
my $response = '';
foreach my $l2 (keys %greetings) {
if(same_language_tag($wanted, $l2)) {
$response = $greetings{$l2};
last;
}
}
But that's rather inefficient. A better way to do it is to start your
program with:
use I18N::LangTags qw(encode_language_tag);
my %greetings;
die unless open(IN, "<", "in.dat");
while(<IN>) {
chomp;
next unless /^([^=]+)=(.+)/s;
my($lang, $expr) = ($1, $2);
$greetings{
encode_language_tag($lang)
} = $expr;
}
close(IN);
and then just answer client requests for language $wanted by just
looking up
$greetings{encode_language_tag($wanted)}
And that does the Right Thing.
=cut
sub encode_language_tag {
# Only similarity_language_tag() is allowed to analyse encodings!
## Changes in the language tagging standards may have to be reflected here.
my($tag) = $_[0] || return undef;
return undef unless &is_language_tag($tag);
# For the moment, these legacy variances are few enough that
# we can just handle them here with regexps.
$tag =~ s/^iw\b/he/i; # Hebrew
$tag =~ s/^in\b/id/i; # Indonesian
$tag =~ s/^cre\b/cr/i; # Cree
$tag =~ s/^jw\b/jv/i; # Javanese
$tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger
$tag =~ s/^[ix]-navajo\b/nv/i; # Navajo
$tag =~ s/^ji\b/yi/i; # Yiddish
# SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now,
# but maybe they're all so obscure I can ignore them. "Obscure"
# meaning either that the language is obscure, and/or that the
# XXX form was extant so briefly that it's unlikely it was ever
# used. I hope.
#
# These go FROM the simplex to complex form, to get
# similarity-comparison right. And that's okay, since
# similarity_language_tag is the only thing that
# analyzes our output.
$tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka
$tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal
$tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk
$tag =~ s/^[xiXI]-//s;
# Just lop off any leading "x/i-"
return "~" . uc($tag);
}
#--------------------------------------------------------------------------
=item * the function alternate_language_tags($lang1)
This function, if given a language tag, returns all language tags that
are alternate forms of this language tag. (I.e., tags which refer to
the same language.) This is meant to handle legacy tags caused by
the minor changes in language tag standards over the years; and
the x-/i- alternation is also dealt with.
Note that this function does I<not> try to equate new (and never-used,
and unusable)
ISO639-2 three-letter tags to old (and still in use) ISO639-1
two-letter equivalents -- like "ara" -> "ar" -- because
"ara" has I<never> been in use as an Internet language tag,
and RFC 3066 stipulates that it never should be, since a shorter
tag ("ar") exists.
Examples:
alternate_language_tags('no-bok') is ('nb')
alternate_language_tags('nb') is ('no-bok')
alternate_language_tags('he') is ('iw')
alternate_language_tags('iw') is ('he')
alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka')
alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka')
alternate_language_tags('en') is ()
alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
alternate_language_tags('x-klikitat') is ('i-klikitat')
alternate_language_tags('i-klikitat') is ('x-klikitat')
This function returns empty-list if given anything other than a formally
valid language tag.
=cut
my %alt = qw( i x x i I X X I );
sub alternate_language_tags {
my $tag = $_[0];
return() unless &is_language_tag($tag);
my @em; # push 'em real goood!
# For the moment, these legacy variances are few enough that
# we can just handle them here with regexps.
if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
} elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1";
} elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
} elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
} elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
} elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
} elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
} elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1";
} elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
} elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1";
} elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
} elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
} elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1";
} elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
} elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1";
} elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
}
push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
return @em;
}
###########################################################################
{
# Init %Panic...
my @panic = ( # MUST all be lowercase!
# Only large ("national") languages make it in this list.
# If you, as a user, are so bizarre that the /only/ language
# you claim to accept is Galician, then no, we won't do you
# the favor of providing Catalan as a panic-fallback for
# you. Because if I start trying to add "little languages" in
# here, I'll just go crazy.
# Scandinavian lgs. All based on opinion and hearsay.
'sv' => [qw(nb no da nn)],
'da' => [qw(nb no sv nn)], # I guess
[qw(no nn nb)], [qw(no nn nb sv da)],
'is' => [qw(da sv no nb nn)],
'fo' => [qw(da is no nb nn sv)], # I guess
# I think this is about the extent of tolerable intelligibility
# among large modern Romance languages.
'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
'ca' => [qw(es pt it fr)],
'es' => [qw(ca it fr pt)],
'it' => [qw(es fr ca pt)],
'fr' => [qw(es it ca pt)],
# Also assume that speakers of the main Indian languages prefer
# to read/hear Hindi over English
[qw(
as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
)] => 'hi',
# Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
# Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
# Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
'hi' => [qw(bn pa as or)],
# I welcome finer data for the other Indian languages.
# E.g., what should Oriya's list be, besides just Hindi?
# And the panic languages for English is, of course, nil!
# My guesses at Slavic intelligibility:
([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian
([qw(sr hr bs)]) x 2, # Serbian, Croatian, Bosnian
'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
#?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
);
my($k,$v);
while(@panic) {
($k,$v) = splice(@panic,0,2);
foreach my $k (ref($k) ? @$k : $k) {
foreach my $v (ref($v) ? @$v : $v) {
push @{$Panic{$k} ||= []}, $v unless $k eq $v;
}
}
}
}
=item * the function @langs = panic_languages(@accept_languages)
This function takes a list of 0 or more language
tags that constitute a given user's Accept-Language list, and
returns a list of tags for I<other> (non-super)
languages that are probably acceptable to the user, to be
used I<if all else fails>.
For example, if a user accepts only 'ca' (Catalan) and
'es' (Spanish), and the documents/interfaces you have
available are just in German, Italian, and Chinese, then
the user will most likely want the Italian one (and not
the Chinese or German one!), instead of getting
nothing. So C<panic_languages('ca', 'es')> returns
a list containing 'it' (Italian).
English ('en') is I<always> in the return list, but
whether it's at the very end or not depends
on the input languages. This function works by consulting
an internal table that stipulates what common
languages are "close" to each other.
A useful construct you might consider using is:
@fallbacks = super_languages(@accept_languages);
push @fallbacks, panic_languages(
@accept_languages, @fallbacks,
);
=cut
sub panic_languages {
# When in panic or in doubt, run in circles, scream, and shout!
my(@out, %seen);
foreach my $t (@_) {
next unless $t;
next if $seen{$t}++; # so we don't return it or hit it again
# push @out, super_languages($t); # nah, keep that separate
push @out, @{ $Panic{lc $t} || next };
}
return grep !$seen{$_}++, @out, 'en';
}
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
=item * the function implicate_supers( ...languages... )
This takes a list of strings (which are presumed to be language-tags;
strings that aren't, are ignored); and after each one, this function
inserts super-ordinate forms that don't already appear in the list.
The original list, plus these insertions, is returned.
In other words, it takes this:
pt-br de-DE en-US fr pt-br-janeiro
and returns this:
pt-br pt de-DE de en-US en fr pt-br-janeiro
This function is most useful in the idiom
implicate_supers( I18N::LangTags::Detect::detect() );
(See L<I18N::LangTags::Detect>.)
=item * the function implicate_supers_strictly( ...languages... )
This works like C<implicate_supers> except that the implicated
forms are added to the end of the return list.
In other words, implicate_supers_strictly takes a list of strings
(which are presumed to be language-tags; strings that aren't, are
ignored) and after the whole given list, it inserts the super-ordinate forms
of all given tags, minus any tags that already appear in the input list.
In other words, it takes this:
pt-br de-DE en-US fr pt-br-janeiro
and returns this:
pt-br de-DE en-US fr pt-br-janeiro pt de en
The reason this function has "_strictly" in its name is that when
you're processing an Accept-Language list according to the RFCs, if
you interpret the RFCs quite strictly, then you would use
implicate_supers_strictly, but for normal use (i.e., common-sense use,
as far as I'm concerned) you'd use implicate_supers.
=cut
sub implicate_supers {
my @languages = grep is_language_tag($_), @_;
my %seen_encoded;
foreach my $lang (@languages) {
$seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
}
my(@output_languages);
foreach my $lang (@languages) {
push @output_languages, $lang;
foreach my $s ( I18N::LangTags::super_languages($lang) ) {
# Note that super_languages returns the longest first.
last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
push @output_languages, $s;
}
}
return uniq( @output_languages );
}
sub implicate_supers_strictly {
my @tags = grep is_language_tag($_), @_;
return uniq( @_, map super_languages($_), @_ );
}
###########################################################################
1;
__END__
=back
=head1 ABOUT LOWERCASING
I've considered making all the above functions that output language
tags return all those tags strictly in lowercase. Having all your
language tags in lowercase does make some things easier. But you
might as well just lowercase as you like, or call
C<encode_language_tag($lang1)> where appropriate.
=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
In some future version of I18N::LangTags, I plan to include support
for RFC2482-style language tags -- which are basically just normal
language tags with their ASCII characters shifted into Plane 14.
=head1 SEE ALSO
* L<I18N::LangTags::List|I18N::LangTags::List>
* RFC 3066, C<http://www.ietf.org/rfc/rfc3066.txt>, "Tags for the
Identification of Languages". (Obsoletes RFC 1766)
* RFC 2277, C<http://www.ietf.org/rfc/rfc2277.txt>, "IETF Policy on
Character Sets and Languages".
* RFC 2231, C<http://www.ietf.org/rfc/rfc2231.txt>, "MIME Parameter
Value and Encoded Word Extensions: Character Sets, Languages, and
Continuations".
* RFC 2482, C<http://www.ietf.org/rfc/rfc2482.txt>,
"Language Tagging in Unicode Plain Text".
* Locale::Codes, in
C<http://www.perl.com/CPAN/modules/by-module/Locale/>
* ISO 639-2, "Codes for the representation of names of languages",
including two-letter and three-letter codes,
C<http://www.loc.gov/standards/iso639-2/php/code_list.php>
* The IANA list of registered languages (hopefully up-to-date),
C<http://www.iana.org/assignments/language-tags>
=head1 COPYRIGHT
Copyright (c) 1998+ Sean M. Burke. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
The programs and documentation in this dist are distributed in
the hope that they will be useful, but without any warranty; without
even the implied warranty of merchantability or fitness for a
particular purpose.
=head1 AUTHOR
Sean M. Burke C<sburke@cpan.org>
=cut
PK �6�Z���
Collate.pmnu �[��� package I18N::Collate;
use strict;
our $VERSION = '1.02';
=head1 NAME
I18N::Collate - compare 8-bit scalar data according to the current locale
=head1 SYNOPSIS
use I18N::Collate;
setlocale(LC_COLLATE, 'locale-of-your-choice');
$s1 = I18N::Collate->new("scalar_data_1");
$s2 = I18N::Collate->new("scalar_data_2");
=head1 DESCRIPTION
***
WARNING: starting from the Perl version 5.003_06
the I18N::Collate interface for comparing 8-bit scalar data
according to the current locale
HAS BEEN DEPRECATED
That is, please do not use it anymore for any new applications
and please migrate the old applications away from it because its
functionality was integrated into the Perl core language in the
release 5.003_06.
See the perllocale manual page for further information.
***
This module provides you with objects that will collate
according to your national character set, provided that the
POSIX setlocale() function is supported on your system.
You can compare $s1 and $s2 above with
$s1 le $s2
to extract the data itself, you'll need a dereference: $$s1
This module uses POSIX::setlocale(). The basic collation conversion is
done by strxfrm() which terminates at NUL characters being a decent C
routine. collate_xfrm() handles embedded NUL characters gracefully.
The available locales depend on your operating system; try whether
C<locale -a> shows them or man pages for "locale" or "nlsinfo" or the
direct approach C<ls /usr/lib/nls/loc> or C<ls /usr/lib/nls> or
C<ls /usr/lib/locale>. Not all the locales that your vendor supports
are necessarily installed: please consult your operating system's
documentation and possibly your local system administration. The
locale names are probably something like C<xx_XX.(ISO)?8859-N> or
C<xx_XX.(ISO)?8859N>, for example C<fr_CH.ISO8859-1> is the Swiss (CH)
variant of French (fr), ISO Latin (8859) 1 (-1) which is the Western
European character set.
=cut
# I18N::Collate.pm
#
# Author: Jarkko Hietaniemi <F<jhi@iki.fi>>
# Helsinki University of Technology, Finland
#
# Acks: Guy Decoux <F<decoux@moulon.inra.fr>> understood
# overloading magic much deeper than I and told
# how to cut the size of this code by more than half.
# (my first version did overload all of lt gt eq le ge cmp)
#
# Purpose: compare 8-bit scalar data according to the current locale
#
# Requirements: Perl5 POSIX::setlocale() and POSIX::strxfrm()
#
# Exports: setlocale 1)
# collate_xfrm 2)
#
# Overloads: cmp # 3)
#
# Usage: use I18N::Collate;
# setlocale(LC_COLLATE, 'locale-of-your-choice'); # 4)
# $s1 = I18N::Collate->("scalar_data_1");
# $s2 = I18N::Collate->("scalar_data_2");
#
# now you can compare $s1 and $s2: $s1 le $s2
# to extract the data itself, you need to deref: $$s1
#
# Notes:
# 1) this uses POSIX::setlocale
# 2) the basic collation conversion is done by strxfrm() which
# terminates at NUL characters being a decent C routine.
# collate_xfrm handles embedded NUL characters gracefully.
# 3) due to cmp and overload magic, lt le eq ge gt work also
# 4) the available locales depend on your operating system;
# try whether "locale -a" shows them or man pages for
# "locale" or "nlsinfo" work or the more direct
# approach "ls /usr/lib/nls/loc" or "ls /usr/lib/nls".
# Not all the locales that your vendor supports
# are necessarily installed: please consult your
# operating system's documentation.
# The locale names are probably something like
# 'xx_XX.(ISO)?8859-N' or 'xx_XX.(ISO)?8859N',
# for example 'fr_CH.ISO8859-1' is the Swiss (CH)
# variant of French (fr), ISO Latin (8859) 1 (-1)
# which is the Western European character set.
#
# Updated: 19961005
#
# ---
use POSIX qw(strxfrm LC_COLLATE);
use warnings::register;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(collate_xfrm setlocale LC_COLLATE);
our @EXPORT_OK = qw();
use overload qw(
fallback 1
cmp collate_cmp
);
our($LOCALE, $C);
our $please_use_I18N_Collate_even_if_deprecated = 0;
sub new {
my $new = $_[1];
if (warnings::enabled() && $] >= 5.003_06) {
unless ($please_use_I18N_Collate_even_if_deprecated) {
warnings::warn <<___EOD___;
***
WARNING: starting from the Perl version 5.003_06
the I18N::Collate interface for comparing 8-bit scalar data
according to the current locale
HAS BEEN DEPRECATED
That is, please do not use it anymore for any new applications
and please migrate the old applications away from it because its
functionality was integrated into the Perl core language in the
release 5.003_06.
See the perllocale manual page for further information.
***
___EOD___
$please_use_I18N_Collate_even_if_deprecated++;
}
}
bless \$new;
}
sub setlocale {
my ($category, $locale) = @_[0,1];
POSIX::setlocale($category, $locale) if (defined $category);
# the current $LOCALE
$LOCALE = $locale || $ENV{'LC_COLLATE'} || $ENV{'LC_ALL'} || '';
}
sub C {
my $s = ${$_[0]};
$C->{$LOCALE}->{$s} = collate_xfrm($s)
unless (defined $C->{$LOCALE}->{$s}); # cache when met
$C->{$LOCALE}->{$s};
}
sub collate_xfrm {
my $s = $_[0];
my $x = '';
for (split(/(\000+)/, $s)) {
$x .= (/^\000/) ? $_ : strxfrm("$_\000");
}
$x;
}
sub collate_cmp {
&C($_[0]) cmp &C($_[1]);
}
# init $LOCALE
&I18N::Collate::setlocale();
1; # keep require happy
PK �($[�g]�>