# Ordered list of field names; a name's position in this array is its
# field index (0-based).  The names appear to mirror the columns of the
# Unicode Character Database file UnicodeData.txt -- TODO confirm against
# the code that consumes this list.
#
# NOTE(review): 'IF_Numeric_Type_Decimal' is spelled with an upper-case
# "IF" while its two siblings use "If".  The spelling is preserved here
# because other code may look the field up by this exact name.
my @FIELDS = (
    'Codepoint',                    #  0
    'Name',                         #  1
    'General_Category',             #  2
    'Canonical_Combining_Class',    #  3
    'Bidi_Class',                   #  4
    'Decomposition_Type_Mapping',   #  5
    'IF_Numeric_Type_Decimal',      #  6
    'If_Numeric_Type_Digit',        #  7
    'If_Numeric_Type_Numeric',      #  8
    'Bidi_Mirrored',                #  9
    'Unicode_1_Name',               # 10
    'ISO_Comment',                  # 11
    'Simple_Uppercase_Mapping',     # 12
    'Simple_Lowercase_Mapping',     # 13
    'Simple_Titlecase_Mapping',     # 14
);
# Human-readable descriptions of Canonical_Combining_Class values, keyed
# by the numeric class.
#
# Each value is either "Long_Name, description" or -- for the classes
# listed here without a long name (199, 204, 208, 210, 212) -- just the
# description.  Code that splits a value on ", " must therefore cope
# with entries that have no name part.
#
# Class 6 (Han_Reading) is included so that a lookup for characters
# carrying that class does not come back undefined.
my %CANONICAL_COMBINING_CLASS_VALUES = (
      0 => 'Not_Reordered, Spacing and enclosing marks; also many vowel and consonant signs, even if nonspacing',
      1 => 'Overlay, Marks which overlay a base letter or symbol',
      6 => 'Han_Reading, Diacritic reading marks for CJK unified ideographs',
      7 => 'Nukta, Diacritic nukta marks in Brahmi-derived scripts',
      8 => 'Kana_Voicing, Hiragana/Katakana voicing marks',
      9 => 'Virama, Viramas',

    # Fixed position classes.
     10 => 'Ccc10, Start of fixed position classes',
    199 => 'End of fixed position classes',

    # Attached marks.
    200 => 'Attached_Below_Left, Marks attached at the bottom left',
    202 => 'Attached_Below, Marks attached directly below',
    204 => 'Marks attached at the bottom right',
    208 => 'Marks attached to the left',
    210 => 'Marks attached to the right',
    212 => 'Marks attached at the top left',
    214 => 'Attached_Above, Marks attached directly above',
    216 => 'Attached_Above_Right, Marks attached at the top right',

    # Distinct (non-attached) marks.
    218 => 'Below_Left, Distinct marks at the bottom left',
    220 => 'Below, Distinct marks directly below',
    222 => 'Below_Right, Distinct marks at the bottom right',
    224 => 'Left, Distinct marks to the left',
    226 => 'Right, Distinct marks to the right',
    228 => 'Above_Left, Distinct marks at the top left',
    230 => 'Above, Distinct marks directly above',
    232 => 'Above_Right, Distinct marks at the top right',
    233 => 'Double_Below, Distinct marks subtending two bases',
    234 => 'Double_Above, Distinct marks extending above two bases',
    240 => 'Iota_Subscript, Greek iota subscript only',
);
# Human-readable descriptions of Bidi_Class values, keyed by the short
# class abbreviation.  Every value has the form "Long_Name, description".
my %BIDI_CLASS_VALUES = (
    # Strong directional characters.
    'L'   => 'Left_To_Right, any strong left-to-right character',
    'R'   => 'Right_To_Left, any strong right-to-left (non-Arabic-type) character',
    'AL'  => 'Arabic_Letter, any strong right-to-left (Arabic-type) character',

    # Numbers, separators, marks, whitespace and other symbols.
    'EN'  => 'European_Number, any ASCII digit or Eastern Arabic-Indic digit',
    'ES'  => 'European_Separator, plus and minus signs',
    'ET'  => 'European_Terminator, a terminator in a numeric format context, includes currency signs',
    'AN'  => 'Arabic_Number, any Arabic-Indic digit',
    'CS'  => 'Common_Separator, commas, colons, and slashes',
    'NSM' => 'Nonspacing_Mark, any nonspacing mark',
    'BN'  => 'Boundary_Neutral, most format characters, control codes, or noncharacters',
    'B'   => 'Paragraph_Separator, various newline characters',
    'S'   => 'Segment_Separator, various segment-related control codes',
    'WS'  => 'White_Space, spaces',
    'ON'  => 'Other_Neutral, most other symbols and punctuation marks',

    # Embedding and override controls.
    'LRE' => 'Left_To_Right_Embedding, U+202A: the LR embedding control',
    'LRO' => 'Left_To_Right_Override, U+202D: the LR override control',
    'RLE' => 'Right_To_Left_Embedding, U+202B: the RL embedding control',
    'RLO' => 'Right_To_Left_Override, U+202E: the RL override control',
    'PDF' => 'Pop_Directional_Format, U+202C: terminates an embedding or override control',

    # Isolate controls.
    'LRI' => 'Left_To_Right_Isolate, U+2066: the LR isolate control',
    'RLI' => 'Right_To_Left_Isolate, U+2067: the RL isolate control',
    'FSI' => 'First_Strong_Isolate, U+2068: the first strong isolate control',
    'PDI' => 'Pop_Directional_Isolate, U+2069: terminates an isolate control',
);
# Descriptions of compatibility formatting tags, keyed by the bare tag
# name (no angle brackets).  Keys are quoted explicitly so that entries
# such as 'sub', which is also a Perl keyword, read unambiguously.
my %COMPATIBILITY_TAGS = (
    'font'     => 'Font variant (for example, a blackletter form)',
    'noBreak'  => 'No-break version of a space or hyphen',

    # Arabic presentation forms.
    'initial'  => 'Initial presentation form (Arabic)',
    'medial'   => 'Medial presentation form (Arabic)',
    'final'    => 'Final presentation form (Arabic)',
    'isolated' => 'Isolated presentation form (Arabic)',

    'circle'   => 'Encircled form',
    'super'    => 'Superscript form',
    'sub'      => 'Subscript form',
    'vertical' => 'Vertical layout presentation form',
    'wide'     => 'Wide (or zenkaku) compatibility character',
    'narrow'   => 'Narrow (or hankaku) compatibility character',
    'small'    => 'Small variant form (CNS compatibility)',
    'square'   => 'CJK squared font variant',
    'fraction' => 'Vulgar fraction form',

    # Catch-all tag.
    'compat'   => 'Otherwise unspecified compatibility character',
);
# Human-readable descriptions of General_Category values, keyed by the
# short abbreviation.  Every value has the form "Long_Name, description".
#
# One- and two-letter grouping keys (LC, L, M, N, P, S, Z, C) describe a
# union of other categories; their description lists the member
# abbreviations joined with " | ".
#
# The entries for L, N, P and S previously lacked the ", " between the
# long name and the member list, unlike every other entry; they now
# follow the same "Long_Name, description" shape so that a consumer can
# split all values uniformly.
my %GENERAL_CATEGORY_VALUES = (
    # Letters.
    'Lu' => 'Uppercase_Letter, an uppercase letter',
    'Ll' => 'Lowercase_Letter, a lowercase letter',
    'Lt' => 'Titlecase_Letter, a digraphic character, with first part uppercase',
    'LC' => 'Cased_Letter, Lu | Ll | Lt',
    'Lm' => 'Modifier_Letter, a modifier letter',
    'Lo' => 'Other_Letter, other letters, including syllables and ideographs',
    'L'  => 'Letter, Lu | Ll | Lt | Lm | Lo',

    # Marks.
    'Mn' => 'Nonspacing_Mark, a nonspacing combining mark (zero advance width)',
    'Mc' => 'Spacing_Mark, a spacing combining mark (positive advance width)',
    'Me' => 'Enclosing_Mark, an enclosing combining mark',
    'M'  => 'Mark, Mn | Mc | Me',

    # Numbers.
    'Nd' => 'Decimal_Number, a decimal digit',
    'Nl' => 'Letter_Number, a letterlike numeric character',
    'No' => 'Other_Number, a numeric character of other type',
    'N'  => 'Number, Nd | Nl | No',

    # Punctuation.
    'Pc' => 'Connector_Punctuation, a connecting punctuation mark, like a tie',
    'Pd' => 'Dash_Punctuation, a dash or hyphen punctuation mark',
    'Ps' => 'Open_Punctuation, an opening punctuation mark (of a pair)',
    'Pe' => 'Close_Punctuation, a closing punctuation mark (of a pair)',
    'Pi' => 'Initial_Punctuation, an initial quotation mark',
    'Pf' => 'Final_Punctuation, a final quotation mark',
    'Po' => 'Other_Punctuation, a punctuation mark of other type',
    'P'  => 'Punctuation, Pc | Pd | Ps | Pe | Pi | Pf | Po',

    # Symbols.
    'Sm' => 'Math_Symbol, a symbol of mathematical use',
    'Sc' => 'Currency_Symbol, a currency sign',
    'Sk' => 'Modifier_Symbol, a non-letterlike modifier symbol',
    'So' => 'Other_Symbol, a symbol of other type',
    'S'  => 'Symbol, Sm | Sc | Sk | So',

    # Separators.
    'Zs' => 'Space_Separator, a space character (of various non-zero widths)',
    'Zl' => 'Line_Separator, U+2028 LINE SEPARATOR only',
    'Zp' => 'Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only',
    'Z'  => 'Separator, Zs | Zl | Zp',

    # Other.
    'Cc' => 'Control, a C0 or C1 control code',
    'Cf' => 'Format, a format control character',
    'Cs' => 'Surrogate, a surrogate code point',
    'Co' => 'Private_Use, a private-use character',
    'Cn' => 'Unassigned, a reserved unassigned code point or a noncharacter',
    'C'  => 'Other, Cc | Cf | Cs | Co | Cn',
);