# Lookup tables for the Unicode Character Database file UnicodeData.txt.
# See http://www.unicode.org/reports/tr44/#UnicodeData.txt
#
# Unless noted otherwise, every hash value below is a single string of the
# form "Long_Name, human-readable description".

# Names of the columns of a UnicodeData.txt record, listed in column order.
# NOTE(review): 'IF_Numeric_Type_Decimal' is spelled with an upper-case "IF_"
# while its two siblings use "If_". The spelling is kept as-is because code
# elsewhere may key on the exact name -- confirm before normalizing.
my @FIELDS = qw(
    Codepoint
    Name
    General_Category
    Canonical_Combining_Class
    Bidi_Class
    Decomposition_Type_Mapping
    IF_Numeric_Type_Decimal
    If_Numeric_Type_Digit
    If_Numeric_Type_Numeric
    Bidi_Mirrored
    Unicode_1_Name
    ISO_Comment
    Simple_Uppercase_Mapping
    Simple_Lowercase_Mapping
    Simple_Titlecase_Mapping
);

# Canonical_Combining_Class: numeric class => description.
# Some entries (199, 204, 208, 210, 212) carry a description only, without a
# leading "Long_Name, " part.
my %CANONICAL_COMBINING_CLASS_VALUES = (
    0   => 'Not_Reordered, Spacing and enclosing marks; also many vowel and consonant signs, even if nonspacing',
    1   => 'Overlay, Marks which overlay a base letter or symbol',
    7   => 'Nukta, Diacritic nukta marks in Brahmi-derived scripts',
    8   => 'Kana_Voicing, Hiragana/Katakana voicing marks',
    9   => 'Virama, Viramas',
    10  => 'Ccc10, Start of fixed position classes',
    199 => 'End of fixed position classes',
    200 => 'Attached_Below_Left, Marks attached at the bottom left',
    202 => 'Attached_Below, Marks attached directly below',
    204 => 'Marks attached at the bottom right',
    208 => 'Marks attached to the left',
    210 => 'Marks attached to the right',
    212 => 'Marks attached at the top left',
    214 => 'Attached_Above, Marks attached directly above',
    216 => 'Attached_Above_Right, Marks attached at the top right',
    218 => 'Below_Left, Distinct marks at the bottom left',
    220 => 'Below, Distinct marks directly below',
    222 => 'Below_Right, Distinct marks at the bottom right',
    224 => 'Left, Distinct marks to the left',
    226 => 'Right, Distinct marks to the right',
    228 => 'Above_Left, Distinct marks at the top left',
    230 => 'Above, Distinct marks directly above',
    232 => 'Above_Right, Distinct marks at the top right',
    233 => 'Double_Below, Distinct marks subtending two bases',
    234 => 'Double_Above, Distinct marks extending above two bases',
    240 => 'Iota_Subscript, Greek iota subscript only'
);

# Bidi_Class: short alias => "Long_Name, description".
my %BIDI_CLASS_VALUES = (
    L   => 'Left_To_Right, any strong left-to-right character',
    R   => 'Right_To_Left, any strong right-to-left (non-Arabic-type) character',
    AL  => 'Arabic_Letter, any strong right-to-left (Arabic-type) character',
    EN  => 'European_Number, any ASCII digit or Eastern Arabic-Indic digit',
    ES  => 'European_Separator, plus and minus signs',
    ET  => 'European_Terminator, a terminator in a numeric format context, includes currency signs',
    AN  => 'Arabic_Number, any Arabic-Indic digit',
    CS  => 'Common_Separator, commas, colons, and slashes',
    NSM => 'Nonspacing_Mark, any nonspacing mark',
    BN  => 'Boundary_Neutral, most format characters, control codes, or noncharacters',
    B   => 'Paragraph_Separator, various newline characters',
    S   => 'Segment_Separator, various segment-related control codes',
    WS  => 'White_Space, spaces',
    ON  => 'Other_Neutral, most other symbols and punctuation marks',
    LRE => 'Left_To_Right_Embedding, U+202A: the LR embedding control',
    LRO => 'Left_To_Right_Override, U+202D: the LR override control',
    RLE => 'Right_To_Left_Embedding, U+202B: the RL embedding control',
    RLO => 'Right_To_Left_Override, U+202E: the RL override control',
    PDF => 'Pop_Directional_Format, U+202C: terminates an embedding or override control',
    LRI => 'Left_To_Right_Isolate, U+2066: the LR isolate control',
    RLI => 'Right_To_Left_Isolate, U+2067: the RL isolate control',
    FSI => 'First_Strong_Isolate, U+2068: the first strong isolate control',
    PDI => 'Pop_Directional_Isolate, U+2069: terminates an isolate control'
);

# Compatibility formatting tags: tag name => description (no long name part).
my %COMPATIBILITY_TAGS = (
    font     => 'Font variant (for example, a blackletter form)',
    noBreak  => 'No-break version of a space or hyphen',
    initial  => 'Initial presentation form (Arabic)',
    medial   => 'Medial presentation form (Arabic)',
    final    => 'Final presentation form (Arabic)',
    isolated => 'Isolated presentation form (Arabic)',
    circle   => 'Encircled form',
    super    => 'Superscript form',
    sub      => 'Subscript form',
    vertical => 'Vertical layout presentation form',
    wide     => 'Wide (or zenkaku) compatibility character',
    narrow   => 'Narrow (or hankaku) compatibility character',
    small    => 'Small variant form (CNS compatibility)',
    square   => 'CJK squared font variant',
    fraction => 'Vulgar fraction form',
    compat   => 'Otherwise unspecified compatibility character'
);

# General_Category: short alias => "Long_Name, description".
# The one-letter keys and LC are grouping aliases; their description is the
# "|"-separated list of the member categories.
my %GENERAL_CATEGORY_VALUES = (
    Lu => 'Uppercase_Letter, an uppercase letter',
    Ll => 'Lowercase_Letter, a lowercase letter',
    Lt => 'Titlecase_Letter, a digraphic character, with first part uppercase',
    LC => 'Cased_Letter, Lu | Ll | Lt',
    Lm => 'Modifier_Letter, a modifier letter',
    Lo => 'Other_Letter, other letters, including syllables and ideographs',
    L  => 'Letter, Lu | Ll | Lt | Lm | Lo',
    Mn => 'Nonspacing_Mark, a nonspacing combining mark (zero advance width)',
    Mc => 'Spacing_Mark, a spacing combining mark (positive advance width)',
    Me => 'Enclosing_Mark, an enclosing combining mark',
    M  => 'Mark, Mn | Mc | Me',
    Nd => 'Decimal_Number, a decimal digit',
    Nl => 'Letter_Number, a letterlike numeric character',
    No => 'Other_Number, a numeric character of other type',
    N  => 'Number, Nd | Nl | No',
    Pc => 'Connector_Punctuation, a connecting punctuation mark, like a tie',
    Pd => 'Dash_Punctuation, a dash or hyphen punctuation mark',
    Ps => 'Open_Punctuation, an opening punctuation mark (of a pair)',
    Pe => 'Close_Punctuation, a closing punctuation mark (of a pair)',
    Pi => 'Initial_Punctuation, an initial quotation mark',
    Pf => 'Final_Punctuation, a final quotation mark',
    Po => 'Other_Punctuation, a punctuation mark of other type',
    P  => 'Punctuation, Pc | Pd | Ps | Pe | Pi | Pf | Po',
    Sm => 'Math_Symbol, a symbol of mathematical use',
    Sc => 'Currency_Symbol, a currency sign',
    Sk => 'Modifier_Symbol, a non-letterlike modifier symbol',
    So => 'Other_Symbol, a symbol of other type',
    S  => 'Symbol, Sm | Sc | Sk | So',
    Zs => 'Space_Separator, a space character (of various non-zero widths)',
    Zl => 'Line_Separator, U+2028 LINE SEPARATOR only',
    Zp => 'Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only',
    Z  => 'Separator, Zs | Zl | Zp',
    Cc => 'Control, a C0 or C1 control code',
    Cf => 'Format, a format control character',
    Cs => 'Surrogate, a surrogate code point',
    Co => 'Private_Use, a private-use character',
    Cn => 'Unassigned, a reserved unassigned code point or a noncharacter',
    C  => 'Other, Cc | Cf | Cs | Co | Cn',
);