package Unicode::UCD;
use strict;
use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames (); # empty import list: the module is loaded, nothing is imported
our $VERSION = '0.57';
require Exporter;
our @ISA = qw(Exporter);
# Names a caller may request explicitly in its "use Unicode::UCD ..." list;
# the import mechanism itself is inherited from Exporter through @ISA above.
our @EXPORT_OK = qw(charinfo
charblock charscript
charblocks charscripts
charinrange
general_categories bidi_types
compexcl
casefold all_casefolds casespec
namedseq
num
prop_aliases
prop_value_aliases
prop_invlist
prop_invmap
search_invlist
MAX_CP
);
use Carp; # croak() is used below to report errors
# True when the native ordinal of the character "A" is 65 (its ASCII value),
# false otherwise.
sub IS_ASCII_PLATFORM {
    return 65 == ord 'A';
}
=head1 NAME
Unicode::UCD - Unicode character database
=head1 SYNOPSIS
use Unicode::UCD 'charinfo';
my $charinfo = charinfo($codepoint);
use Unicode::UCD 'casefold';
my $casefold = casefold(0xFB00);
use Unicode::UCD 'all_casefolds';
my $all_casefolds_ref = all_casefolds();
use Unicode::UCD 'casespec';
my $casespec = casespec(0xFB00);
use Unicode::UCD 'charblock';
my $charblock = charblock($codepoint);
use Unicode::UCD 'charscript';
my $charscript = charscript($codepoint);
use Unicode::UCD 'charblocks';
my $charblocks = charblocks();
use Unicode::UCD 'charscripts';
my $charscripts = charscripts();
use Unicode::UCD qw(charscript charinrange);
my $range = charscript($script);
print "looks like $script\n" if charinrange($range, $codepoint);
use Unicode::UCD qw(general_categories bidi_types);
my $categories = general_categories();
my $types = bidi_types();
use Unicode::UCD 'prop_aliases';
my @space_names = prop_aliases("space");
use Unicode::UCD 'prop_value_aliases';
my @gc_punct_names = prop_value_aliases("Gc", "Punct");
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
use Unicode::UCD 'prop_invmap';
my ($list_ref, $map_ref, $format, $missing)
= prop_invmap("General Category");
use Unicode::UCD 'search_invlist';
my $index = search_invlist(\@invlist, $code_point);
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
use Unicode::UCD 'namedseq';
my $namedseq = namedseq($named_sequence_name);
my $unicode_version = Unicode::UCD::UnicodeVersion();
my $convert_to_numeric =
Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
=head1 DESCRIPTION
The Unicode::UCD module offers a series of functions that
provide a simple interface to the Unicode
Character Database.
=head2 code point argument
Some of the functions are called with a I<code point argument>, which is either
a decimal or a hexadecimal scalar designating a code point in the platform's
native character set (extended to Unicode), or C<U+> followed by hexadecimals
designating a Unicode code point. A leading 0 will force a hexadecimal
interpretation, as will a hexadecimal digit that isn't a decimal digit.
Examples:
223 # Decimal 223 in native character set
0223 # Hexadecimal 223, native (= 547 decimal)
0xDF # Hexadecimal DF, native (= 223 decimal)
U+DF # Hexadecimal DF, in Unicode's character set
(= LATIN SMALL LETTER SHARP S)
Note that the largest code point in Unicode is U+10FFFF.
=cut
# File-scoped state, all initially undefined.
# NOTE(review): the *FH variables are presumably filehandle slots handed by
# reference to openunicode(), which only opens a file when the slot is still
# undefined -- confirm against the call sites elsewhere in the file.
my $BLOCKSFH;
my $VERSIONFH;
my $CASEFOLDFH;
my $CASESPECFH;
my $NAMEDSEQFH;
# NOTE(review): presumably the Unicode version in use; it is assigned
# elsewhere in the file -- confirm.
my $v_unicode_version; # v-string.
# Locate a file below a "unicore" directory in @INC and open it for reading.
#
#   $rfh   reference to the scalar that holds (or will hold) the filehandle
#   @path  path components of the wanted file, relative to "unicore"
#
# When $$rfh is already defined nothing is opened and undef is returned.
# Otherwise the @INC entries are tried in order; the first file that can be
# opened is left open in $$rfh and its full path is returned.
# Croaks when the file cannot be opened under any @INC entry.
sub openunicode {
    my ($rfh, @path) = @_;
    use File::Spec;

    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            $f = File::Spec->catfile($d, "unicore", @path);

            # Three-argument open: the path is used literally, so leading or
            # trailing whitespace and mode/pipe characters in a directory or
            # file name are never interpreted by open().
            last if open($$rfh, '<', $f);
            undef $f;
        }

        unless (defined $f) {
            # A failed open() may still have autovivified a handle into the
            # caller's slot; clear it so that a later call searches again
            # instead of taking the "already open" path above.
            undef $$rfh;
            croak __PACKAGE__, ": failed to find ",
                File::Spec->catfile(@path), " in @INC";
        }
    }
    return $f;
}
# Return a deep copy of the argument.
#
# Non-reference values are returned unchanged.  References are copied with
# Storable::dclone when that function was imported at compile time;
# otherwise unblessed ARRAY and HASH references are copied recursively and
# any other reference type makes the function croak.
sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it.

    # Compile-time conditional import: Storable is loaded only when
    # DynaLoader's bootstrap function is defined.
    # NOTE(review): presumably this avoids pulling in the XS-based Storable
    # on a perl that cannot load XS modules -- confirm.
    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);

    my $arg = shift;
    my $type = ref $arg;

    # Checked before delegating, so plain scalars are handled the same way
    # whether or not Storable is present (Storable::dclone itself only
    # accepts references).
    return $arg unless $type; # No deep cloning needed for scalars

    return dclone($arg) if defined &dclone;

    # The recursive calls below use the &-form on purpose: this sub's
    # prototype is not yet in effect while its own body is being compiled,
    # and a plain call would warn "called too early to check prototype".
    if ($type eq 'ARRAY') {
        my @return;
        foreach my $element (@$arg) {
            push @return, &_dclone($element);
        }
        return \@return;
    }
    elsif ($type eq 'HASH') {
        my %return;
        foreach my $key (keys %$arg) {
            $return{$key} = &_dclone($arg->{$key});
        }
        return \%return;
    }

    croak "_dclone can't handle " . $type;
}
=head2 B<charinfo()>
use Unicode::UCD 'charinfo';
my $charinfo = charinfo(0x41);
This returns information about the input L</code point argument>
as a reference to a hash of fields as defined by the Unicode
standard. If the L</code point argument> is not assigned in the standard
(i.e., has the general category C<Cn> meaning C<Unassigned>)
or is a non-character (meaning it is guaranteed to never be assigned in
the standard),
C<undef> is returned.
Fields that aren't applicable to the particular code point argument exist in the
returned hash, and are empty.
The keys in the hash with the meanings of their values are:
=over
=item B<code>
the input native L</code point argument>
expressed in hexadecimal, with
leading zeros
added if necessary to make it contain at least four hexdigits
=item B<name>
name of I<code>, all IN UPPER CASE.
Some control-type code points do not have names.
This field will be empty for C<Surrogate> and C<Private Use> code points,
and for the others without a name,
it will contain a description enclosed in angle brackets, like
C<E<lt>controlE<gt>>.
=item B<category>
The short name of the general category of I<code>.
This will match one of the keys in the hash returned by L</general_categories()>.
The L</prop_value_aliases()> function can be used to get all the synonyms
of the category name.
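=item B<combining>
the combining class number for I<code> used in the Canonical Ordering Algorithm.
For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
available at
L<http://www.unicode.org/versions/Unicode5.1.0/>
The L</prop_value_aliases()> function can be used to get all the synonyms
of the combining class number.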
=item B<bidi>
bidirectional type of I<code>.
This will match one of the keys in the hash returned by L</bidi_types()>.
The L</prop_value_aliases()> function can be used to get all the synonyms
of the bidi type name.
=item B<decomposition>
is empty if I<code> has no decomposition; or is one or more codes
(separated by spaces) that, taken in order, represent a decomposition for
I<code>. Each has at least four hexdigits.
The codes may be preceded by a word enclosed in angle brackets, then a space,
like C<E<lt>compatE<gt> >, giving the type of decomposition.
This decomposition may be an intermediate one whose components are also
decomposable. Use L<Unicode::Normalize> to get the final decomposition.
=item B<decimal>
if I<code> is a decimal digit this is its integer numeric value
=item B<digit>
if I<code> represents some other digit-like number, this is its integer
numeric value
=item B<numeric>
if I<code> represents a whole or rational number, this is its numeric value.
Rational values are expressed as a string like C<1/4>.
=item B<mirrored>
C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
=item B<unicode10>
name of I<code> in the Unicode 1.0 standard if one
existed for this code point and is different from the current name
=item B<comment>
As of Unicode 6.0, this is always empty.
=item B<upper>
is empty if there is no single code point uppercase mapping for I<code>
(its uppercase mapping is itself);
otherwise it is that mapping expressed as at least four hexdigits.
(L</casespec()> should be used in addition to B<charinfo()>
for case mappings when the calling program can cope with multiple code point
mappings.)
=item B<lower>
is empty if there is no single code point lowercase mapping for I<code>
(its lowercase mapping is itself);
otherwise it is that mapping expressed as at least four hexdigits.
(L</casespec()> should be used in addition to B<charinfo()>
for case mappings when the calling program can cope with multiple code point
mappings.)
=item B<title>
is empty if there is no single code point titlecase mapping for I<code>
(its titlecase mapping is itself);
otherwise it is that mapping expressed as at least four hexdigits.
(L</casespec()> should be used in addition to B<charinfo()>
for case mappings when the calling program can cope with multiple code point
mappings.)
=item B<block>
the block I<code> belongs to (used in C<\p{Blk=...}>).
See L</Blocks versus Scripts>.
=item B