# Unicode::UCD -- functions providing an interface to the Unicode Character
# Database.  This preamble sets up pragmas, the module version, a debugging
# switch, and the list of names callers may import on request.
package Unicode::UCD;
use strict;
use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
# Load charnames without importing anything into this package.
use charnames ();
our $VERSION = '0.75';
# Compile-time debugging switch.  The empty prototype makes this a constant
# sub, so perl can fold DEBUG-guarded statements away while it returns 0.
sub DEBUG () { 0 }
# When debugging, enable autoflush on the currently selected output handle
# (presumably so that debug output appears promptly).
$|=1 if DEBUG;
require Exporter;
our @ISA = qw(Exporter);
# Names that may be imported on request (Exporter's @EXPORT_OK).
our @EXPORT_OK = qw(charinfo
charblock charscript
charblocks charscripts
charinrange
charprop
charprops_all
general_categories bidi_types
compexcl
casefold all_casefolds casespec
namedseq
num
prop_aliases
prop_value_aliases
prop_values
prop_invlist
prop_invmap
search_invlist
MAX_CP
);
use Carp;
# True when "A" has ordinal 65, i.e. the platform's native character set is
# ASCII-based.
sub IS_ASCII_PLATFORM { ord("A") == 65 }
=head1 NAME
Unicode::UCD - Unicode character database
=head1 SYNOPSIS
use Unicode::UCD 'charinfo';
my $charinfo = charinfo($codepoint);
use Unicode::UCD 'charprop';
my $value = charprop($codepoint, $property);
use Unicode::UCD 'charprops_all';
my $all_values_hash_ref = charprops_all($codepoint);
use Unicode::UCD 'casefold';
my $casefold = casefold($codepoint);
use Unicode::UCD 'all_casefolds';
my $all_casefolds_ref = all_casefolds();
use Unicode::UCD 'casespec';
my $casespec = casespec($codepoint);
use Unicode::UCD 'charblock';
my $charblock = charblock($codepoint);
use Unicode::UCD 'charscript';
my $charscript = charscript($codepoint);
use Unicode::UCD 'charblocks';
my $charblocks = charblocks();
use Unicode::UCD 'charscripts';
my $charscripts = charscripts();
use Unicode::UCD qw(charscript charinrange);
my $range = charscript($script);
print "looks like $script\n" if charinrange($range, $codepoint);
use Unicode::UCD qw(general_categories bidi_types);
my $categories = general_categories();
my $types = bidi_types();
use Unicode::UCD 'prop_aliases';
my @space_names = prop_aliases("space");
use Unicode::UCD 'prop_value_aliases';
my @gc_punct_names = prop_value_aliases("Gc", "Punct");
use Unicode::UCD 'prop_values';
my @all_EA_short_names = prop_values("East_Asian_Width");
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
use Unicode::UCD 'prop_invmap';
my ($list_ref, $map_ref, $format, $missing)
= prop_invmap("General Category");
use Unicode::UCD 'search_invlist';
my $index = search_invlist(\@invlist, $code_point);
# The following function should be used only internally in
# implementations of the Unicode Normalization Algorithm, and there
# are better choices than it.
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
use Unicode::UCD 'namedseq';
my $namedseq = namedseq($named_sequence_name);
my $unicode_version = Unicode::UCD::UnicodeVersion();
my $convert_to_numeric =
Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
=head1 DESCRIPTION
The Unicode::UCD module offers a series of functions that
provide a simple interface to the Unicode
Character Database.
=head2 code point argument
Some of the functions are called with a I<code point argument>, which is either
a decimal or a hexadecimal scalar designating a code point in the platform's
native character set (extended to Unicode), or a string containing C<U+>
followed by hexadecimals
designating a Unicode code point. A leading 0 will force a hexadecimal
interpretation, as will a hexadecimal digit that isn't a decimal digit.
Examples:
    223     # Decimal 223 in native character set
    0223    # Hexadecimal 223, native (= 547 decimal)
    0xDF    # Hexadecimal DF, native (= 223 decimal)
    '0xDF'  # String form of hexadecimal (= 223 decimal)
    'U+DF'  # Hexadecimal DF, in Unicode's character set
            # (= LATIN SMALL LETTER SHARP S)
Note that the largest code point in Unicode is U+10FFFF.
=cut
# Package variables, declared with 'our' so they can be named without package
# qualification under 'use strict'.  None of them is assigned in this part of
# the file.  NOTE(review): they look like lookup tables and settings that are
# filled in by code or data files loaded elsewhere -- confirm against the rest
# of the module.
our %caseless_equivalent;
our $e_precision;
our %file_to_swash_name;
our @inline_definitions;
our %loose_property_name_of;
our %loose_property_to_file_of;
our %loose_to_file_of;
our $MAX_CP;
our %nv_floating_to_rational;
our %prop_aliases;
our %stricter_to_file_of;
our %strict_property_to_file_of;
our %SwashInfo;
our %why_deprecated;
# File-scoped lexical, unlike the package variables above.
my $v_unicode_version; # v-string.
# Locate and open a file that lives under a "unicore" directory.
#
# Arguments: the path components of the wanted file, relative to "unicore".
# Each entry of @INC is tried in order, and the first "<entry>/unicore/<path>"
# that can be opened for reading is used.
# Returns: a filehandle opened for reading on that file.
# Croaks (naming the relative path and listing @INC) if no entry works.
sub openunicode {
    use File::Spec;
    my @components = @_;

    foreach my $inc_dir (@INC) {
        my $candidate = File::Spec->catfile($inc_dir, "unicore", @components);

        # A failed open just means the file isn't under this @INC entry.
        open(my $handle, '<', $candidate) or next;
        return $handle;
    }

    my $wanted = File::Spec->catfile("unicore", @components);
    croak __PACKAGE__, ": failed to find ", $wanted, " in @INC";
}
# Deep-copy a data structure: use Storable::dclone if available; otherwise
# emulate it.
#
# Storable is loaded (at compile time) only when DynaLoader::boot_DynaLoader
# is defined.  When dclone could not be imported, the fallback below is used:
# non-references are returned as-is, ARRAY and HASH references are copied
# recursively, and any other kind of reference causes a croak.
sub _dclone ($) {
    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);

    return dclone(shift) if defined &dclone;

    my ($original) = @_;
    my $kind = ref $original;

    # No deep cloning needed for scalars
    return $original if ! $kind;

    # The recursive calls use the & form, which bypasses checking against
    # this sub's ($) prototype.
    if ($kind eq 'HASH') {
        my %copy;
        $copy{$_} = &_dclone($original->{$_}) for keys %$original;
        return \%copy;
    }

    if ($kind eq 'ARRAY') {
        return [ map { &_dclone($_) } @$original ];
    }

    croak "_dclone can't handle " . $kind;
}
=head2 B<charinfo()>
use Unicode::UCD 'charinfo';
my $charinfo = charinfo(0x41);
This returns information about the input L</code point argument>
as a reference to a hash of fields as defined by the Unicode
standard. If the L</code point argument> is not assigned in the standard
(i.e., has the general category C<Cn> meaning C<Unassigned>)
or is a non-character (meaning it is guaranteed to never be assigned in
the standard),
C<undef> is returned.
Fields that aren't applicable to the particular code point argument exist in the
returned hash, and are empty.
For results that are less "raw" than this function returns, or to get the values for
any property, not just the few covered by this function, use the
L</charprop()> function.
The keys in the hash with the meanings of their values are:
=over
=item B<code>
the input native L</code point argument>
expressed in hexadecimal, with
leading zeros
added if necessary to make it contain at least four hexdigits
=item B<name>
name of I<code>, all IN UPPER CASE.
Some control-type code points do not have names.
This field will be empty for C<Surrogate> and C<Private Use> code points,
and for the others without a name,
it will contain a description enclosed in angle brackets, like
C<E<lt>controlE<gt>>.
=item B<category>
The short name of the general category of I<code>.
This will match one of the keys in the hash returned by L</general_categories()>.
The L</prop_value_aliases()> function can be used to get all the synonyms
of the category name.
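=item B<combining>
the combining class number for I<code> used in the Canonical Ordering Algorithm.
For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
available at
L<http://www.unicode.org/versions/Unicode5.1.0/>
The L</prop_value_aliases()> function can be used to get all the synonyms
of the combining class number.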
=item B<bidi>
bidirectional type of I<code>.
This will match one of the keys in the hash returned by L</bidi_types()>.
The L</prop_value_aliases()> function can be used to get all the synonyms
of the bidi type name.
=item B<decomposition>
is empty if I<code> has no decomposition; or is one or more codes
(separated by spaces) that, taken in order, represent a decomposition for
I<code>. Each has at least four hexdigits.
The codes may be preceded by a word enclosed in angle brackets, then a space,
like C<E<lt>compatE<gt> >, giving the type of decomposition.
This decomposition may be an intermediate one whose components are also
decomposable. Use L<Unicode::Normalize> to get the final decomposition in one
step.
=item B<decimal>
if I<code> represents a decimal digit this is its integer numeric value
=item B<digit>
if I<code> represents some other digit-like number, this is its integer
numeric value
=item B<numeric>
if I<code> represents a whole or rational number, this is its numeric value.
Rational values are expressed as a string like C<1/4>.
=item B<mirrored>
C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
=item B<unicode10>
name of I<code> in the Unicode 1.0 standard if one
existed for this code point and is different from the current name
=item B<comment>
As of Unicode 6.0, this is always empty.
=item B<upper>
is, if non-empty, the uppercase mapping for I<code> expressed as at least four
hexdigits. This indicates that the full uppercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple uppercase mapping is
I<code> itself; you'll need some other means (like L</charprop()> or
L</casespec()>) to get the full mapping.
=item B<lower>
is, if non-empty, the lowercase mapping for I<code> expressed as at least four
hexdigits. This indicates that the full lowercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple lowercase mapping is
I<code> itself; you'll need some other means (like L</charprop()> or
L</casespec()>) to get the full mapping.
=item B<title>
is, if non-empty, the titlecase mapping for I<code> expressed as at least four
hexdigits. This indicates that the full titlecase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple titlecase mapping is
I<code> itself; you'll need some other means (like L</charprop()> or
L</casespec()>) to get the full mapping.
=item B<block>
the block I<code> belongs to (used in C<\p{Blk=...}>).
The L</prop_value_aliases()> function can be used to get all the synonyms
of the block name.
See L</Blocks versus Scripts>.
=item B