| Index: source/data/translit/Zawgyi_my.txt
|
| diff --git a/source/data/translit/Zawgyi_my.txt b/source/data/translit/Zawgyi_my.txt
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..fd1f3fa6e3beb75b814bf79b69fc03197c3a3075
|
| --- /dev/null
|
| +++ b/source/data/translit/Zawgyi_my.txt
|
| @@ -0,0 +1,203 @@
|
| +# © 2016 and later: Unicode, Inc. and others.
|
| +# License & terms of use: http://www.unicode.org/copyright.html#License
|
| +#
|
| +# File: Zawgyi_my.txt
|
| +# Generated from CLDR
|
| +#
|
| +
|
| +# This transform converts Zawgyi "encoded" Burmese into proper
|
| +# Unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
|
| +# the Myanmar Unicode range but assigns different characters or
|
| +# glyphs to some codepoints. In addition to the character mapping,
|
| +# there is reordering of codepoints needed to match the expected
|
| +# Unicode order. This reordering is context-based.
|
| +#
|
| +# This transform is done in two main phases:
|
| +# (1) Map all Zawgyi codepoints to their Unicode counterparts (Stage 1).
|
| +# (2) Perform reordering (Stages 2 through 6, each a separate pass).
|
| +# Shared character sets and sequences referenced by the rules below.
|
| +$nondigits = [^\u1040-\u1049]; # Anything other than the Myanmar digits 1040..1049
|
| +$consonant = [\u1000-\u1021]; # Base letters 1000..1021
|
| +$vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031)
|
| +$umedial = [\u103B-\u103E]; # Medial codepoints in Unicode
|
| +$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F]; # Union of vowel signs and medials. NOTE(review): also contains 103F, which is not in $umedial - confirm intended
|
| +$ukinzi = \u1004\u103A\u1039; # Codepoints representing kinzi in Unicode
|
| +# ZAWGYI MYANMAR CONSONANT SIGN MEDIAL RA
|
| +# Zawgyi has several codepoints for this sign; Stage 1 maps every member of this set to 103C.
|
| +$zmedialra = [\u103B\u107E-\u1084];
|
| +####
|
| +#### STAGE (1): CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
|
| +####
|
| +# Kinzi (predefined ligatures): Zawgyi 1064 becomes the Unicode sequence $ukinzi.
|
| +# Move base character to the right: $ukinzi is emitted before the consonant.
|
| +($consonant) \u103A \u1064 → $ukinzi $1 \u103B; # Zawgyi 103A between them is emitted as 103B
|
| +($consonant) \u1064 → $ukinzi $1;
|
| +\u1064 → $ukinzi; # No preceding consonant was matched
|
| +# Special cases: 108B/108C/108D are kinzi plus 102D/102E/1036; base moves right, before that sign
|
| +($consonant) \u108b → $ukinzi $1 \u102D;
|
| +($consonant) \u108C → $ukinzi $1 \u102E;
|
| +($consonant) \u108D → $ukinzi $1 \u1036;
|
| +# Special cases moving Kinzi block to left, past the consonant and a Zawgyi 103A (emitted as 103B)
|
| +($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F; # Zawgyi 1033 is emitted as 102F, after 102D
|
| +($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
|
| +($consonant) \u103A \u108C \u1033 → $ukinzi $1 \u103B \u102E \u102F; # Zawgyi 1033 is emitted as 102F
|
| +($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
|
| +($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
|
| +($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ; # 108E case: 102D + 1036, no kinzi emitted
|
| +\u108B → $ukinzi \u102D ; # Remaining 108B/108C/108D not consumed by a consonant rule above
|
| +\u108C → $ukinzi \u102E ;
|
| +\u108D → $ukinzi \u1036 ;
|
| +# Consonants (only the ones that have to change)
|
| +\u106A ($vowelsign) \u1038 → \u1025 $1 \u1038 ; # U sound
|
| +\u106A → \u1009 ; # NYA
|
| +\u106B → \u100A ; # NNYA
|
| +\u108F → \u1014 ; # NA
|
| +\u1090 → \u101B ; # RA
|
| +\u1086 → \u103F ; # GREAT SA
|
| +# yapin: Zawgyi 103A and 107D both become Unicode medial 103B
|
| +\u103A → \u103B ;
|
| +\u107D → \u103B ;
|
| +# wasway: Zawgyi 103C becomes Unicode 103D; Zawgyi 108A becomes 103D + 103E
|
| +\u103C \u108A → \u103D \u103E; # To avoid duplicate medials (separate rules would emit 103D twice)
|
| +\u103C → \u103D ;
|
| +\u108A → \u103D \u103E ;
|
| +# hatoh: Zawgyi 103D and 1087..1089 become Unicode 103E (1088 and 1089 also carry 102F / 1030)
|
| +\u103D → \u103E ;
|
| +\u1087 → \u103E ;
|
| +\u1088 → \u103E \u102F ;
|
| +\u1089 → \u103E \u1030 ;
|
| +# Single diacritics with space - use non-breaking. NOTE: no rule in this file implements this yet.
|
| +# TODO(ccornelius): determine if this breaks transliteration
|
| +# asat: Zawgyi 1039 becomes Unicode 103A
|
| +\u1039 → \u103A ;
|
| +# Vowels: Zawgyi-only vowel codepoints mapped to Unicode vowel signs
|
| +\u1033 → \u102F ; # to VOWEL SIGN U
|
| +\u1034 → \u1030 ; # to VOWEL SIGN UU
|
| +\u105A → \u102B \u103A ; # to TALL AA + ASAT
|
| +\u108E → \u102D \u1036 ; # to VOWEL SIGN I + ANUSVARA
|
| +# lDot: Zawgyi 1094 and 1095 become Unicode DOT BELOW 1037
|
| +# Special cases to move dot to right of base consonant
|
| +\u1031 \u1094 ($consonant) \u103D → $1 \u103E \u1031 \u1037 ; # Zawgyi 103D is emitted as 103E; 1031 and the dot follow it
|
| +\u1094 → \u1037 ;
|
| +\u1095 → \u1037 ;
|
| +# Special cases for 1025 vs 1009: 1025 before a Zawgyi stacked-consonant codepoint is emitted as 1009 + 1039 + consonant
|
| +\u1025 \u1061 → \u1009 \u1039 \u1001;
|
| +\u1025 \u1062 → \u1009 \u1039 \u1002;
|
| +\u1025 \u1065 → \u1009 \u1039 \u1005;
|
| +\u1025 \u1068 → \u1009 \u1039 \u1007;
|
| +\u1025 \u1076 → \u1009 \u1039 \u1013;
|
| +\u1025 \u1078 → \u1009 \u1039 \u1015;
|
| +\u1025 \u107A → \u1009 \u1039 \u1017;
|
| +\u1025 \u1079 → \u1009 \u1039 \u1016;
|
| +($consonant) \u103A \u1039 → $1 \u103A \u103B; # Pair handled as a unit; the single-codepoint rules would yield 103B 103A
|
| +# Stacked Consonants: each Zawgyi codepoint becomes 1039 followed by the Unicode consonant
|
| +\u1060 → \u1039 \u1000 ;
|
| +\u1061 → \u1039 \u1001 ;
|
| +\u1062 → \u1039 \u1002 ;
|
| +\u1063 → \u1039 \u1003 ;
|
| +\u1065 → \u1039 \u1005 ;
|
| +\u1066 → \u1039 \u1006 ;
|
| +\u1067 → \u1039 \u1006 ; # Same target as 1066
|
| +\u1068 → \u1039 \u1007 ;
|
| +\u1069 → \u1039 \u1008 ;
|
| +\u106C → \u1039 \u100B ;
|
| +\u106D → \u1039 \u100C ;
|
| +\u1070 → \u1039 \u100F ;
|
| +\u1071 → \u1039 \u1010 ;
|
| +\u1072 → \u1039 \u1010 ; # Same target as 1071
|
| +\u1096 → \u1039 \u1010 \u103D; # Stacked 1010 followed by 103D
|
| +\u1073 → \u1039 \u1011 ;
|
| +\u1074 → \u1039 \u1011 ; # Same target as 1073
|
| +\u1075 → \u1039 \u1012 ;
|
| +\u1076 → \u1039 \u1013 ;
|
| +\u1077 → \u1039 \u1014 ;
|
| +\u1078 → \u1039 \u1015 ;
|
| +\u1079 → \u1039 \u1016 ;
|
| +\u107A → \u1039 \u1017 ;
|
| +\u107B → \u1039 \u1018 ;
|
| +\u1093 → \u1039 \u1018 ; # Same target as 107B
|
| +\u107C → \u1039 \u1019 ;
|
| +\u1085 → \u1039 \u101C ;
|
| +# Pre-defined ligatures: one Zawgyi codepoint expands to consonant + 1039 + consonant
|
| +\u106E → \u100D\u1039\u100D ;
|
| +\u106F → \u100D\u1039\u100E ;
|
| +\u1091 → \u100F\u1039\u100D ;
|
| +\u1092 → \u100B\u1039\u100C ;
|
| +\u1097 → \u100B\u1039\u100B ;
|
| +\u104E → \u104E\u1004\u103A\u1038 ; # 104E is kept and 1004 103A 1038 is appended
|
| +# yayit: every codepoint in $zmedialra becomes Unicode 103C
|
| +$zmedialra → \u103C ;
|
| +####
|
| +#### STAGE (2): POST REORDERING RULES FOR UNICODE RENDERING
|
| +#### Now every codepoint is Unicode. This starts conversion
|
| +#### from semi-visual order to logical order.
|
| +####
|
| +::Null; # Pass boundary: the rules below run over the output of the rules above
|
| +# Case of MYANMAR digit being used instead of a letter: digit 1044 before asat becomes 104E
|
| +\u1044 \u103a → | \u104E \u103A ; # The bar leaves the cursor before the replacement text
|
| +# Lone zero with diacritic mark: digit 1040 becomes letter 101D (WA)
|
| +\u1031 \u1040 ($nondigits) → \u1031 \u101D $1; # Zero after vowel sign E and before a non-digit
|
| +($nondigits) \u1040 ([\u102B-\u103F]) → $1 \u101D $2; # Zero between a non-digit and a codepoint in 102B..103F
|
| +# cwc: Simpler replacements for Zawgyi 1025
|
| +\u1025 \u103A → \u1009 \u103A; # 1025 before asat becomes 1009
|
| +\u1025 \u102E → \u1026; # 1025 + 102E becomes the single letter 1026
|
| +# Asat and dot below reordering: asat (103A) goes before dot below (1037).
|
| +\u1037\u103A → \u103A\u1037;
|
| +# Reorder some vowel signs
|
| +\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ; # 1036 moves after any medials and the vowel signs
|
| +([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1; # 102D/102E/1032 go before 102B/102C/102F/1030
|
| +# Move ra medial, but not others.
|
| +\u103C ($consonant) → $1 \u103C; # 103C moves from before the consonant to after it
|
| +####
|
| +#### Stage 3
|
| +#### Move \u1031, \u1036, and \u103C after consonants.
|
| +::Null; # Pass boundary: the rules below run over the output of the previous stage
|
| +# 1031 moved after consonant, with and without kinzi or medials
|
| +([\u1031]+) $ukinzi ($consonant) → $ukinzi $2 $1; # With kinzi: 1031 goes after kinzi + consonant
|
| +([\u1031]+) ($consonant) ($umedial+) → $2 $3 $1; # With medials: 1031 goes after consonant + medials
|
| +([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] → $2 $1; # After-context: what follows the consonant is not a medial
|
| +\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C; # 103C moves after the consonant that follows 103A 1039
|
| +\u1036 ($umedial+) → $1 \u1036; # 1036 moves after the medials
|
| +####
|
| +#### Stage 4
|
| +#### Reordering medials, dot below, contractions, E sign, and asat.
|
| +::Null; # Pass boundary: the rules below run over the output of the previous stage
|
| +# Reorder the medials toward the order 103B, 103C, 103D, 103E
|
| +([\u103C\u103D\u103E]+) \u103B → \u103B $1; # 103B goes first
|
| +([\u103D\u103E]+) \u103C → \u103C $1; # 103C goes before 103D and 103E
|
| +\u103E\u103D → \u103D\u103E ; # 103D goes before 103E
|
| +# Contractions with vowel signs: 1039 + consonant moves before 1031 and the vowel signs
|
| +([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
|
| +($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
|
| +# Move vowel sign E \u1031 after medials, but not across consonants
|
| +($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2; # Medials on both sides are gathered ahead of 1031
|
| +# Reorder dot below after medials and vowel diacritics
|
| +\u1037 ([\u102D-\u1030\u1032\u1036]) → $1 \u1037;
|
| +\u1037 ($umedial+) → $1 \u1037;
|
| +# Move vowel signs after medials
|
| +($vowelsign+) ($umedial+) → $2 $1;
|
| +# Reorder modifiers and asat: asat moves directly after the first consonant, ahead of the one sign between them
|
| +($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;
|
| +####
|
| +#### Stage 5. More reorderings
|
| +#### Vowel signs after medials, sort medials, move visarga after other signs.
|
| +####
|
| +::Null; # Pass boundary: the rules below run over the output of the previous stage
|
| +([\u1031]+) ($umedial+) → $2 $1; # 1031 moves after any medials that still follow it
|
| +# More moving vowel signs after medials (one vowel sign and one medial per match)
|
| +($vowelsign) ($umedial) → $2 $1;
|
| +# Sort the medials toward the order 103B, 103C, 103D, 103E (one adjacent pair per match)
|
| +([\u103C\u103D\u103E]) \u103B → \u103B $1;
|
| +([\u103D\u103E]) \u103C → \u103C $1;
|
| +\u103E\u103D → \u103D\u103E ;
|
| +# Move visarga (\u1038) after other signs
|
| +\u1038 ([$vowelmedial]) → $1 \u1038; # After a vowel sign or medial
|
| +\u1038 ([\u1036\u1037\u103A]) → $1 \u1038; # After 1036, 1037 or 103A
|
| +###
|
| +### Stage 6
|
| +### Finish medial sorting, fix conflicting and extra diacritics
|
| +### NOTE(review): only the 103B/103A rule below is present in this stage.
|
| +::Null; # Pass boundary: the rule below runs over the output of the previous stage
|
| +# Fix 103B/103A order for asat: after a consonant, asat (103A) goes before medial 103B.
|
| +($consonant) \u103B \u103A → $1 \u103A \u103B;
|
| +
|
|
|