From 6758be8d15b9d0b14e06f4cbbb3a76ff645d280c Mon Sep 17 00:00:00 2001 From: Wolfram Schneider Date: Tue, 18 Nov 2008 23:32:50 +0100 Subject: [PATCH] Implement real ISO5426 character set, first version. --- src/codetables-iso5426.xml | 968 ++++++++++++++++++++++++-------------------- 1 file changed, 519 insertions(+), 449 deletions(-) diff --git a/src/codetables-iso5426.xml b/src/codetables-iso5426.xml index 6dd818c..abcae3d 100644 --- a/src/codetables-iso5426.xml +++ b/src/codetables-iso5426.xml @@ -2,9 +2,9 @@ - The first column in this table contains the MARC-8 code (in hex) for + The first column in this table contains the ISO5426 code (in hex) for the character as coming from the G0 graphic set, the second column - contains the MARC-8 code (in hex) for the character as coming from the G1 + contains the ISO5426 code (in hex) for the character as coming from the G1 graphic set, the third column contains the UCS/Unicode 16-bit code (in hex), the fourth column contains the UTF-8 code (in hex) for the UCS characters, the fifth column contains a representation of the character (where possible), @@ -14,12 +14,14 @@ in Unicode and UTF-8 are given. When that occurs the alternate Unicode and alternate UTF-8 columns follow the character name. 
+ 1D 001D @@ -612,8 +614,14 @@ BRACKET 7E SPACING TILDE / TILDE + + + See also Zeichentabelle MAB2 (ISO 5426-1983), http://www.gymel.com/charsets/MAB2.html + See also MAB2-Zeichensatz ISO 646 (IRV) + ISO 5426-1983, http://www.d-nb.de/standardisierung/pdf/mab_unic.pdf + See also Zeichenkonkordanz MAB2-Zeichensatz - MARC-8, http://www.d-nb.de/standardisierung/pdf/mab_marc.pdf + 88 0098 @@ -626,504 +634,566 @@ BRACKET C29C NON-SORT END / STRING TERMINATOR + - 8D - 200D - E2808D - JOINER / ZERO WIDTH JOINER - - - - - E8 - 0141 - C581 - UPPERCASE POLISH L / LATIN CAPITAL LETTER L WITH -STROKE + A1 + 00A1 + C2A1 + INVERTED EXCLAMATION MARK + - E9 - 00D8 - C398 - UPPERCASE SCANDINAVIAN O / LATIN CAPITAL LETTER -O WITH STROKE + A2 + 201E + E2809E + LOW DOUBLE COMMA QUOTATION MARK + - E2 - 0110 - C490 - UPPERCASE D WITH CROSSBAR / LATIN CAPITAL LETTER -D WITH STROKE + A3 + 00A3 + C2A3 + BRITISH POUND / POUND SIGN - EC - 00DE - C39E - UPPERCASE ICELANDIC THORN / LATIN CAPITAL LETTER -THORN (Icelandic) + A4 + 0024 + 24 + DOLLAR SIGN + - E1 - 00C6 - C386 - UPPERCASE DIGRAPH AE / LATIN CAPITAL LIGATURE -AE - + A5 + 00A5 + C2A5 + YEN SIGN + A6 - 0152 - C592 - UPPERCASE DIGRAPH OE / LATIN CAPITAL LIGATURE -OE - + 2020 + E280A0 + DAGGER + - EA - 02B9 - CAB9 - SOFT SIGN, PRIME / MODIFIER LETTER PRIME + A7 + 00A7 + C2A7 + SECTION SIGN - B7 - 00B7 - C2B7 - MIDDLE DOT + A8 + 2032 + E280B2 + PRIME + - AC - 266D - E299AD - MUSIC FLAT SIGN - + A9 + 2018 + E28098 + SINGLE TURNED COMMA QUOTATION MARK + - AF - 00AE - C2AE - PATENT MARK / REGISTERED SIGN - - - + 00AB + C2AB + LEFT-POINTING DOUBLE ANGLE QUOTATION MARK (LEFT POINTING GUILLEMET) + AC - 01A0 - C6A0 - UPPERCASE O-HOOK / LATIN CAPITAL LETTER O WITH -HORN - + 266D + E299AD + MUSIC FLAT SIGN (FLAT) + AD - 01AF - C6AF - UPPERCASE U-HOOK / LATIN CAPITAL LETTER U WITH -HORN + 00A9 + C2A9 + COPYRIGHT SIGN AE - 02BC - CABC - CABE - ALIF / MODIFIER LETTER APOSTROPHE + 2117 + E28497 + SOUND RECORDING COPYRIGHT + + + AF + 00AE + C2AE + 
PATENT MARK / REGISTERED SIGN + + + + B0 02BB CABB AYN / MODIFIER LETTER TURNED COMMA + B1 - 0142 - C582 - LOWERCASE POLISH L / LATIN SMALL LETTER L WITH -STROKE + 02BC + CABC + CABE + ALIF / MODIFIER LETTER APOSTROPHE + B2 - 00F8 - C3B8 - LOWERCASE SCANDINAVIAN O / LATIN SMALL LETTER O -WITH STROKE - - - B3 - 0111 - C491 - LOWERCASE D WITH CROSSBAR / LATIN SMALL LETTER -D WITH STROKE - - - B4 - 00FE - C3BE - LOWERCASE ICELANDIC THORN / LATIN SMALL LETTER -THORN (Icelandic) - - - B5 - 00E6 - C3A6 - LOWERCASE DIGRAPH AE / LATIN SMALL LIGATURE -AE + 201A + E2809A + SINGLE LOW-9 QUOTATION MARK (LOW SINGLE COMMA QUOTATION MARK) + B6 - 0153 - C593 - LOWERCASE DIGRAPH OE / LATIN SMALL LIGATURE -OE + 2021 + E280A1 + B7 - 02BA - CABA - HARD SIGN, DOUBLE PRIME / MODIFIER LETTER DOUBLE -PRIME + 00B7 + C2B7 + MIDDLE DOT B8 - 0131 - C4B1 - LOWERCASE TURKISH I / LATIN SMALL LETTER DOTLESS -I - + 2033 + E280B3 + DOUBLE PRIME + B9 - 00A3 - C2A3 - BRITISH POUND / POUND SIGN - + 2019 + E28099 + RIGHT SINGLE QUOTATION MARK (SINGLE COMMA QUOTATION MARK) + BA - 00F0 - C3B0 - LOWERCASE ETH / LATIN SMALL LETTER ETH -(Icelandic) - - - BC - 01A1 - C6A1 - LOWERCASE O-HOOK / LATIN SMALL LETTER O WITH -HORN - - - BD - 01B0 - C6B0 - LOWERCASE U-HOOK / LATIN SMALL LETTER U WITH -HORN - - - C0 - 00B0 - C2B0 - DEGREE SIGN - - - C1 - 2113 - E28493 - SCRIPT SMALL L - - - C2 - 2117 - E28497 - SOUND RECORDING COPYRIGHT - - - C3 - 00A9 - C2A9 - COPYRIGHT SIGN - - - C4 - 266F - E299AF - MUSIC SHARP SIGN - - - C5 - 00BF - C2BF - INVERTED QUESTION MARK - - - C6 - 00A1 - C2A1 - INVERTED EXCLAMATION MARK - - - C7 - 00DF - C39F - ESZETT SYMBOL - - - C8 - 20AC - E282AC - EURO SIGN - - - true - E0 - 0309 - CC89 - PSEUDO QUESTION MARK / COMBINING HOOK -ABOVE - - - true - E1 - 0300 - CC80 - GRAVE / COMBINING GRAVE ACCENT (Varia) - - - true - E2 - 0301 - CC81 - ACUTE / COMBINING ACUTE ACCENT (Oxia) - - - true - E3 - 0302 - CC82 - CIRCUMFLEX / COMBINING CIRCUMFLEX -ACCENT - - - true - E4 - 0303 - CC83 - TILDE / 
COMBINING TILDE - - - true - E5 - 0304 - CC84 - MACRON / COMBINING MACRON - - - true - E6 - 0306 - CC86 - BREVE / COMBINING BREVE (Vrachy) - - - true - E7 - 0307 - CC87 - SUPERIOR DOT / COMBINING DOT ABOVE - - - true - E8 - 0308 - CC88 - UMLAUT, DIAERESIS / COMBINING DIAERESIS -(Dialytika) - - - true - E9 - 030C - CC8C - HACEK / COMBINING CARON - - - true - EA - 030A - CC8A - CIRCLE ABOVE, ANGSTROM / COMBINING RING -ABOVE - - - true - EB - 0361 - CDA1 - FE20 - EFB8A0 - LIGATURE, FIRST HALF / COMBINING DOUBLE - INVERTED BREVE - - - true - EC - - - FE21 - EFB8A1 - LIGATURE, SECOND HALF / COMBINING LIGATURE RIGHT HALF - The Ligature that spans two characters - is constructed of two halves in MARC-8: EB - (Ligature, first half) and EC (Ligature, second - half). The preferred Unicode/UTF-8 mapping is to - the single character Ligature that spans two characters, - U+0361. The single character Ligature is encoded - following the second of the two characters to be spanned. - The two half Ligatures in Unicode, to which the - Ligature has been mapped since 1996, are indicted - in the mapping as alternatives, but their use is not - recommended. It is expected that font support for - the single character Ligature mark will be more - easily obtained than for the two halves. 
- - - true - ED - 0315 - CC95 - HIGH COMMA, OFF CENTER / COMBINING COMMA ABOVE -RIGHT - - - true - EE - 030B - CC8B - DOUBLE ACUTE / COMBINING DOUBLE ACUTE -ACCENT - - - true - EF - 0310 - CC90 - CANDRABINDU / COMBINING CANDRABINDU - - - true - F0 - 0327 - CCA7 - CEDILLA / COMBINING CEDILLA - - - true - F1 - 0328 - CCA8 - RIGHT HOOK, OGONEK / COMBINING OGONEK - - - true - F2 - 0323 - CCA3 - DOT BELOW / COMBINING DOT BELOW - - - true - F3 - 0324 - CCA4 - DOUBLE DOT BELOW / COMBINING DIAERESIS -BELOW - - - true - F4 - 0325 - CCA5 - CIRCLE BELOW / COMBINING RING BELOW - - - true - F5 - 0333 - CCB3 - DOUBLE UNDERSCORE / COMBINING DOUBLE LOW -LINE - - - true - F6 - 0332 - CCB2 - UNDERSCORE / COMBINING LOW LINE - - - true - F7 - 0326 - CCA6 - LEFT HOOK (COMMA BELOW) / COMBINING COMMA -BELOW - - - true - F8 - 031C - CC9C - RIGHT CEDILLA / COMBINING LEFT HALF RING -BELOW - - - true - F9 - 032E - CCAE - UPADHMANIYA / COMBINING BREVE BELOW - - - true - FA - 0360 - CDA0 - FE22 - EFB8A2 - DOUBLE TILDE, FIRST HALF / COMBINING DOUBLE TILDE - - - true - FB - - - FE23 - EFB8A3 - DOUBLE TILDE, SECOND HALF / COMBINING DOUBLE TILDE RIGHT HALF - The Double Tilde that spans two characters is - constructed of two halves in MARC-8: FA (Double - Tilde, first half) and FB (Double Tilde, second - half). The preferred Unicode/UTF-8 mapping - is to the single character Double Tilde that - spans two characters, U+0360. The single - character Double Tilde is encoded following - the second of the two characters to be spanned. - The two half Double Tildes in Unicode, to - which the MARC8 Double Tilde has been - mapped since 1996, are indicted in the - mapping as alternatives, but their use is not - recommended. It is expected that font support - for the single character Double Tilde mark will - be more easily obtained than for the two halves. 
- - - true - FE - 0313 - CC93 - HIGH COMMA, CENTERED / COMBINING COMMA ABOVE -(Psili) - + 201D + E2809D + RIGHT DOUBLE QUOTATION MARK (DOUBLE COMMA QUOTATION MARK) + + + BB + 00BB + C2BB + RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (RIGHT POINTING GUILLEMET) + + + BC + 266F + E299AF + MUSIC SHARP SIGN + + + BD + 02B9 + CAB9 + SOFT SIGN, PRIME / MODIFIER LETTER PRIME + + + BE + 02BA + CABA + HARD SIGN, DOUBLE PRIME / MODIFIER LETTER DOUBLE PRIME + + + BF + 00BF + C2BF + INVERTED QUESTION MARK + + + + true + C0 + 0309 + CC89 + PSEUDO QUESTION MARK / COMBINING HOOK ABOVE + + + true + C1 + 0300 + CC80 + GRAVE / COMBINING GRAVE ACCENT (Varia) + + + true + C2 + 0301 + CC81 + ACUTE / COMBINING ACUTE ACCENT (Oxia) + + + true + C3 + 0302 + CC82 + CIRCUMFLEX / COMBINING CIRCUMFLEX ACCENT + + + true + C4 + 0303 + CC83 + TILDE / COMBINING TILDE + + + true + C5 + 0304 + CC84 + MACRON / COMBINING MACRON + + + true + C6 + 0306 + CC86 + BREVE / COMBINING BREVE (Vrachy) + + + true + C7 + 0307 + CC87 + SUPERIOR DOT / COMBINING DOT ABOVE + + + Q: Unicode doesn't seem to distinguish between tréma and umlaut, + but I need to distinguish. What shall I do? 
+ http://www.unicode.org/faq/char_combmark.html#18 + true + C8 + 034F0308 + CD8FCC88 + U+034F COMBINING GRAPHEME JOINER (CGJ) / tréma + + + true + C9 + 0308 + CC88 + UMLAUT, DIAERESIS / COMBINING DIAERESIS (Dialytika) + + + true + CA + 030A + CC8A + CIRCLE ABOVE, ANGSTROM / COMBINING RING ABOVE + + + true + CB + 0315 + CC95 + HIGH COMMA, OFF CENTER / COMBINING COMMA ABOVE RIGHT + + + true + CC + 0313 + CC93 + HIGH COMMA, CENTERED / COMBINING COMMA ABOVE (Psili) + + + true + CD + 030B + CC8B + DOUBLE ACUTE / COMBINING DOUBLE ACUTE ACCENT + + + true + CE + 031B + CC9B + COMBINING HORN (NON-SPACING HORN) + + + true + CF + 030C + CC8C + HACEK / COMBINING CARON + + + + true + D0 + 0327 + CCA7 + CEDILLA / COMBINING CEDILLA + + + true + D1 + 031C + CC9C + RIGHT CEDILLA / COMBINING LEFT HALF RING BELOW + + + true + D2 + 0326 + CCA6 + LEFT HOOK (COMMA BELOW) / COMBINING COMMA BELOW + + + true + D3 + 0328 + CCA8 + RIGHT HOOK, OGONEK / COMBINING OGONEK + + + true + D4 + 0325 + CCA5 + CIRCLE BELOW / COMBINING RING BELOW + + + true + D5 + 032E + CCAE + UPADHMANIYA / COMBINING BREVE BELOW + + + true + D6 + 0323 + CCA3 + DOT BELOW / COMBINING DOT BELOW + + + true + D7 + 0324 + CCA4 + DOUBLE DOT BELOW / COMBINING DIAERESIS BELOW + + + true + D8 + 0332 + CCB2 + UNDERSCORE / COMBINING LOW LINE + + + true + D9 + 0333 + CCB3 + DOUBLE UNDERSCORE / COMBINING DOUBLE LOW LINE + + + + true + DA + 0329 + CCA9 + COMBINING VERTICAL LINE BELOW (NON-SPACING VERTICAL LINE BELOW) + + + true + DB + 032D + CCAD + COMBINING CIRCUMFLEX ACCENT BELOW (NON-SPACING CIRCUMFLEX BELOW) + + + + true + DD + 0360 + CDA0 + FE22 + EFB8A2 + DOUBLE TILDE, FIRST HALF / COMBINING DOUBLE TILDE + + + true + DE + + + FE21 + EFB8A1 + LIGATURE, SECOND HALF / COMBINING LIGATURE RIGHT HALF + The Ligature that spans two characters + is constructed of two halves in MARC-8: EB + (Ligature, first half) and EC (Ligature, second + half). 
The preferred Unicode/UTF-8 mapping is to + the single character Ligature that spans two characters, + U+0361. The single character Ligature is encoded + following the second of the two characters to be spanned. + The two half Ligatures in Unicode, to which the + Ligature has been mapped since 1996, are indicated + in the mapping as alternatives, but their use is not + recommended. It is expected that font support for + the single character Ligature mark will be more + easily obtained than for the two halves. + + + true + DF + + + FE23 + EFB8A3 + DOUBLE TILDE, SECOND HALF / COMBINING DOUBLE TILDE RIGHT HALF + The Double Tilde that spans two characters is + constructed of two halves in MARC-8: FA (Double + Tilde, first half) and FB (Double Tilde, second + half). The preferred Unicode/UTF-8 mapping + is to the single character Double Tilde that + spans two characters, U+0360. The single + character Double Tilde is encoded following + the second of the two characters to be spanned. + The two half Double Tildes in Unicode, to + which the MARC-8 Double Tilde has been + mapped since 1996, are indicated in the + mapping as alternatives, but their use is not + recommended. It is expected that font support + for the single character Double Tilde mark will + be more easily obtained than for the two halves. 
+ + + + + + E1 + 00C6 + C386 + UPPERCASE DIGRAPH AE / LATIN CAPITAL LIGATURE AE + + + E2 + 0110 + C490 + UPPERCASE D WITH CROSSBAR / LATIN CAPITAL LETTER D WITH STROKE + + + + E6 + 0132 + C4B2 + LATIN CAPITAL LIGATURE IJ (LATIN CAPITAL LETTER I J) + + + + E8 + 0141 + C581 + UPPERCASE POLISH L / LATIN CAPITAL LETTER L WITH STROKE + + + E9 + 00D8 + C398 + UPPERCASE SCANDINAVIAN O / LATIN CAPITAL LETTER O WITH STROKE + + + EA + 0152 + C592 + UPPERCASE DIGRAPH OE / LATIN CAPITAL LIGATURE OE + + + + EC + 00DE + C39E + UPPERCASE ICELANDIC THORN / LATIN CAPITAL LETTER THORN (Icelandic) + + + + F1 + 00E6 + C3A6 + LOWERCASE DIGRAPH AE / LATIN SMALL LIGATURE AE + + + F2 + 0111 + C491 + LOWERCASE D WITH CROSSBAR / LATIN SMALL LETTER D WITH STROKE + + + F3 + 00F0 + C3B0 + LOWERCASE ETH / LATIN SMALL LETTER ETH (Icelandic) + + + + F5 + 0131 + C4B1 + LOWERCASE TURKISH I / LATIN SMALL LETTER DOTLESS I + + + F6 + 0133 + C4B3 + LATIN SMALL LIGATURE IJ (LATIN SMALL LETTER I J) + + + + F8 + 0142 + C582 + LOWERCASE POLISH L / LATIN SMALL LETTER L WITH STROKE + + + F9 + 00F8 + C3B8 + LOWERCASE SCANDINAVIAN O / LATIN SMALL LETTER O WITH STROKE + + + FA + 0153 + C593 + LOWERCASE DIGRAPH OE / LATIN SMALL LIGATURE OE + + + FB + 00DF + C39F + ESZETT SYMBOL + + + FC + 00FE + C3BE + LOWERCASE ICELANDIC THORN / LATIN SMALL LETTER THORN (Icelandic) + + + + -- 1.7.10.4