package Unicode::UCD;
use strict;
use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
use Unicode::Normalize qw(getCombinClass NFD);
our $VERSION = '0.43';
use Storable qw(dclone);
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo
charblock charscript
charblocks charscripts
charinrange
general_categories bidi_types
compexcl
casefold casespec
namedseq
num
prop_aliases
prop_value_aliases
prop_invlist
prop_invmap
MAX_CP
);
use Carp;
=head1 NAME
Unicode::UCD - Unicode character database
=head1 SYNOPSIS
use Unicode::UCD 'charinfo';
my $charinfo = charinfo($codepoint);
use Unicode::UCD 'casefold';
my $casefold = casefold(0xFB00);
use Unicode::UCD 'casespec';
my $casespec = casespec(0xFB00);
use Unicode::UCD 'charblock';
my $charblock = charblock($codepoint);
use Unicode::UCD 'charscript';
my $charscript = charscript($codepoint);
use Unicode::UCD 'charblocks';
my $charblocks = charblocks();
use Unicode::UCD 'charscripts';
my $charscripts = charscripts();
use Unicode::UCD qw(charscript charinrange);
my $range = charscript($script);
print "looks like $script\n" if charinrange($range, $codepoint);
use Unicode::UCD qw(general_categories bidi_types);
my $categories = general_categories();
my $types = bidi_types();
use Unicode::UCD 'prop_aliases';
my @space_names = prop_aliases("space");
use Unicode::UCD 'prop_value_aliases';
my @gc_punct_names = prop_value_aliases("Gc", "Punct");
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
use Unicode::UCD 'prop_invmap';
my ($list_ref, $map_ref, $format, $missing)
= prop_invmap("General Category");
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
use Unicode::UCD 'namedseq';
my $namedseq = namedseq($named_sequence_name);
my $unicode_version = Unicode::UCD::UnicodeVersion();
my $convert_to_numeric =
Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
=head1 DESCRIPTION
The Unicode::UCD module offers a series of functions that
provide a simple interface to the Unicode
Character Database.
=head2 code point argument
Some of the functions are called with a I<code point argument>, which is either
a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
followed by hexadecimals designating a Unicode code point. In other words, if
you want a code point to be interpreted as a hexadecimal number, you must
prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
interpreted as a decimal code point. Note that the largest code point in
Unicode is U+10FFFF.
=cut
# File-scoped filehandle slots, all initially undefined.  openunicode() takes
# a reference to a scalar such as these and opens a file into it only while
# the referenced scalar is still undefined.
# NOTE(review): presumably each one is handed by reference to openunicode()
# by the function that reads the correspondingly named unicore data file; the
# callers are not visible in this part of the file -- confirm.
my $BLOCKSFH;
my $VERSIONFH;
my $CASEFOLDFH;
my $CASESPECFH;
my $NAMEDSEQFH;
# openunicode(\$fh, @path)
#
# Locate and open for reading the file "unicore/@path" in the first @INC
# directory that has it.
#
#   $rfh   reference to a scalar that receives the filehandle; when the
#          referenced scalar is already defined nothing is opened
#   @path  path components below the "unicore" directory
#
# Returns the full path of the file that was opened, or undef when the
# handle was already defined on entry (no search is performed in that case).
# Croaks if no directory in @INC contains a readable copy of the file.
sub openunicode {
    my ($rfh, @path) = @_;
    use File::Spec;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            # @INC may hold hooks (code refs, array refs, objects); they are
            # not directories, so there is nothing to look for inside them.
            next if ref $d;
            $f = File::Spec->catfile($d, "unicore", @path);
            # Three-argument open: the path is taken literally, so mode
            # characters or surrounding whitespace in an @INC entry or in
            # @path are never interpreted by open itself.
            last if open($$rfh, '<', $f);
            undef $f;    # not here; keep $f undefined unless an open succeeds
        }
        croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
            unless defined $f;
    }
    return $f;
}
=head2 B<charinfo()>
use Unicode::UCD 'charinfo';
my $charinfo = charinfo(0x41);
This returns information about the input L</code point argument>
as a reference to a hash of fields as defined by the Unicode
standard. If the L</code point argument> is not assigned in the standard
(i.e., has the general category C<Cn> meaning C<Unassigned>)
or is a non-character (meaning it is guaranteed to never be assigned in
the standard),
C<undef> is returned.
Fields that aren't applicable to the particular code point argument exist in the
returned hash, and are empty.
The keys in the hash with the meanings of their values are:
=over
=item B<code>
the input L</code point argument>
expressed in hexadecimal, with leading zeros
added if necessary to make it contain at least four hexdigits
=item B<name>
name of I<code>, all IN UPPER CASE.
Some control-type code points do not have names.
This field will be empty for C<Surrogate> and C<Private Use> code points,
and for the others without a name,
it will contain a description enclosed in angle brackets, like
C<E<lt>controlE<gt>>.
=item B<category>
The short name of the general category of I<code>.
This will match one of the keys in the hash returned by L</general_categories()>.
The L</prop_value_aliases()> function can be used to get all the synonyms
of the category name.
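=item B<combining>
the combining class number for I<code> used in the Canonical Ordering Algorithm.
For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
available at
L<http://www.unicode.org/versions/Unicode5.1.0/>
The L</prop_value_aliases()> function can be used to get all the synonyms
of the combining class number.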
=item B<bidi>
bidirectional type of I<code>.
This will match one of the keys in the hash returned by L</bidi_types()>.
The L</prop_value_aliases()> function can be used to get all the synonyms
of the bidi type name.
=item B<decomposition>
is empty if I<code> has no decomposition; or is one or more codes
(separated by spaces) that, taken in order, represent a decomposition for
I<code>. Each has at least four hexdigits.
The codes may be preceded by a word enclosed in angle brackets then a space,
like C<E<lt>compatE<gt> >, giving the type of decomposition.
This decomposition may be an intermediate one whose components are also
decomposable. Use L<Unicode::Normalize> to get the final decomposition.
=item B<decimal>
if I<code> is a decimal digit this is its integer numeric value
=item B<digit>
if I<code> represents some other digit-like number, this is its integer
numeric value
=item B<numeric>
if I<code> represents a whole or rational number, this is its numeric value.
Rational values are expressed as a string like C<1/4>.
=item B<mirrored>
C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
=item B<unicode10>
name of I<code> in the Unicode 1.0 standard if one
existed for this code point and is different from the current name
=item B<comment>
As of Unicode 6.0, this is always empty.
=item B<upper>
is empty if there is no single code point uppercase mapping for I<code>
(its uppercase mapping is itself);
otherwise it is that mapping expressed as at least four hexdigits.
(L</casespec()> should be used in addition to B<charinfo()>
for case mappings when the calling program can cope with multiple code point
mappings.)
=item B<lower>
is empty if there is no single code point lowercase mapping for I<code>
(its lowercase mapping is itself);
otherwise it is that mapping expressed as at least four hexdigits.
(L</casespec()> should be used in addition to B<charinfo()>
for case mappings when the calling program can cope with multiple code point
mappings.)
=item B<title>
is empty if there is no single code point titlecase mapping for I<code>
(its titlecase mapping is itself);
otherwise it is that mapping expressed as at least four hexdigits.
(L</casespec()> should be used in addition to B<charinfo()>
for case mappings when the calling program can cope with multiple code point
mappings.)
=item B<block>
the block I<code> belongs to (used in C<\p{Blk=...}>).
See L</Blocks versus Scripts>.
=item B