source/data/translit/Zawgyi_my.txt - Issue 2440913002: Update ICU to 58.1

Unified Diff: source/data/translit/Zawgyi_my.txt

Issue 2440913002: Update ICU to 58.1

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/data/translit/Zawgyi_my.txt

diff --git a/source/data/translit/Zawgyi_my.txt b/source/data/translit/Zawgyi_my.txt

new file mode 100644

index 0000000000000000000000000000000000000000..fd1f3fa6e3beb75b814bf79b69fc03197c3a3075

--- /dev/null

+++ b/source/data/translit/Zawgyi_my.txt

@@ -0,0 +1,203 @@

+# License & terms of use: http://www.unicode.org/copyright.html#License

+# File: Zawgyi_my.txt

+# Generated from CLDR

+# This transform converts Zawgyi "encoded" Burmese into proper

+# unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses

+# the Myanmar unicode range but assigns different characters or

+# glyphs to some codepoints. In addition to the character mapping,

+# there is reordering of codepoints needed to match the expected

+# unicode order. This reordering is context-based.

+# This transform is done in two main stages:

+# (1) Map all Zawgyi codepoints to their Unicode counterpart.

+# (2) Perform reordering.

+# Modern Burmese digits & Unicode code points.

+$nondigits = [^\u1040-\u1049];

+$consonant = [\u1000-\u1021];

+$vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031)

+$umedial = [\u103B-\u103E]; # Medial codepoints in Unicode

+$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F]; # Union of vowel signs and medials

+$ukinzi = \u1004\u103A\u1039; # Codepoints representing kinzi in Unicode

+# ZAWGYI MYANMAR CONSONANT SIGN MEDIAL RA

+# This character has multiple representations in the Zawgyi font.

+$zmedialra = [\u103B\u107E-\u1084];

+####

+#### STAGE (1): CODEPOINT MAPPING FROM ZAWGYI TO UNICODE

+####

+# Kinzi (predefined ligatures)

+# Move base character to the right

+($consonant) \u103A \u1064 → $ukinzi $1 \u103B;

+($consonant) \u1064 → $ukinzi $1;

+\u1064 → $ukinzi;

+# Special cases moving base character to right before

+($consonant) \u108b → $ukinzi $1 \u102D;

+($consonant) \u108C → $ukinzi $1 \u102E;

+($consonant) \u108D → $ukinzi $1 \u1036;

+# Special cases moving Kinzi block to left

+($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;

+($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;

+($consonant) \u103A \u108C \u1033 → $ukinzi $1 \u103B \u102E \u102F;

+($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;

+($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;

+($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ;

+\u108B → $ukinzi \u102D ;

+\u108C → $ukinzi \u102E ;

+\u108D → $ukinzi \u1036 ;

+# Consonants (only the ones that have to change)

+\u106A ($vowelsign) \u1038 → \u1025 $1 \u1038 ; # U sound

+\u106A → \u1009 ; # NYA

+\u106B → \u100A ;

+\u108F → \u1014 ;

+\u1090 → \u101B ;

+\u1086 → \u103F ;

+# yapin

+\u103A → \u103B ;

+\u107D → \u103B ;

+# wasway

+\u103C \u108A → \u103D \u103E; # To avoid duplicate medials

+\u103C → \u103D ;

+\u108A → \u103D \u103E ;

+# hatoh

+\u103D → \u103E ;

+\u1087 → \u103E ;

+\u1088 → \u103E \u102F ;

+\u1089 → \u103E \u1030 ;

+# Single diacritics with space - use non-breaking

+# TODO(ccornelius): determine if this breaks transliteration

+# asat

+\u1039 → \u103A ;

+# Vowels

+\u1033 → \u102F ;

+\u1034 → \u1030 ;

+\u105A → \u102B \u103A ;

+\u108E → \u102D \u1036 ;

+# lDot

+# Special cases to move dot to right of base consonant

+\u1031 \u1094 ($consonant) \u103D → $1 \u103E \u1031 \u1037 ;

+\u1094 → \u1037 ;

+\u1095 → \u1037 ;

+# Special cases for 1025 vs 1009

+\u1025 \u1061 → \u1009 \u1039 \u1001;

+\u1025 \u1062 → \u1009 \u1039 \u1002;

+\u1025 \u1065 → \u1009 \u1039 \u1005;

+\u1025 \u1068 → \u1009 \u1039 \u1007;

+\u1025 \u1076 → \u1009 \u1039 \u1013;

+\u1025 \u1078 → \u1009 \u1039 \u1015;

+\u1025 \u107A → \u1009 \u1039 \u1017;

+\u1025 \u1079 → \u1009 \u1039 \u1016;

+($consonant) \u103A \u1039 → $1 \u103A \u103B;

+# Stacked Consonants

+\u1060 → \u1039 \u1000 ;

+\u1061 → \u1039 \u1001 ;

+\u1062 → \u1039 \u1002 ;

+\u1063 → \u1039 \u1003 ;

+\u1065 → \u1039 \u1005 ;

+\u1066 → \u1039 \u1006 ;

+\u1067 → \u1039 \u1006 ;

+\u1068 → \u1039 \u1007 ;

+\u1069 → \u1039 \u1008 ;

+\u106C → \u1039 \u100B ;

+\u106D → \u1039 \u100C ;

+\u1070 → \u1039 \u100F ;

+\u1071 → \u1039 \u1010 ;

+\u1072 → \u1039 \u1010 ;

+\u1096 → \u1039 \u1010 \u103D;

+\u1073 → \u1039 \u1011 ;

+\u1074 → \u1039 \u1011 ;

+\u1075 → \u1039 \u1012 ;

+\u1076 → \u1039 \u1013 ;

+\u1077 → \u1039 \u1014 ;

+\u1078 → \u1039 \u1015 ;

+\u1079 → \u1039 \u1016 ;

+\u107A → \u1039 \u1017 ;

+\u107B → \u1039 \u1018 ;

+\u1093 → \u1039 \u1018 ;

+\u107C → \u1039 \u1019 ;

+\u1085 → \u1039 \u101C ;

+# Pre-defined ligatures

+\u106E → \u100D\u1039\u100D ;

+\u106F → \u100D\u1039\u100E ;

+\u1091 → \u100F\u1039\u100D ;

+\u1092 → \u100B\u1039\u100C ;

+\u1097 → \u100B\u1039\u100B ;

+\u104E → \u104E\u1004\u103A\u1038 ;

+# yayit

+$zmedialra → \u103C ;

+####

+#### STAGE (2): POST REORDERING RULES FOR UNICODE RENDERING

+#### Now every codepoint is Unicode. This starts conversion

+#### from semi-visual order to logical order.

+####

+::Null;

+# Case of MYANMAR digit being used instead of a letter

+\u1044 \u103a → | \u104E \u103A ;

+# Lone zero with diacritic mark

+\u1031 \u1040 ($nondigits) → \u1031 \u101D $1;

+($nondigits) \u1040 ([\u102B-\u103F]) → $1 \u101D $2;

+# cwc: Simpler replacements for Zawgyi 1025

+\u1025 \u103A → \u1009 \u103A;

+\u1025 \u102E → \u1026;

+# Asat and dot below reordering.

+\u1037\u103A → \u103A\u1037;

+# Reorder some vowel signs

+\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ;

+([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;

+# Move ra medial, but not others.

+\u103C ($consonant) → $1 \u103C;

+####

+#### Stage 3

+#### Move \u1031, \u1036, and \u103C after consonants.

+::Null;

+# 1031 moved after consonant, with and without kinzi or medials

+([\u1031]+) $ukinzi ($consonant) → $ukinzi $2 $1;

+([\u1031]+) ($consonant) ($umedial+) → $2 $3 $1;

+([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] → $2 $1;

+\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;

+\u1036 ($umedial+) → $1 \u1036;

+####

+#### Stage 4

+#### Reordering medials, dot below, contractions, E sign, and asat.

+::Null;

+# Reorder the medials

+([\u103C\u103D\u103E]+) \u103B → \u103B $1;

+([\u103D\u103E]+) \u103C → \u103C $1;

+\u103E\u103D → \u103D\u103E ;

+# Contractions with vowel signs

+([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;

+($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;

+# Move vowel sign E \u1031 after medials, but not across consonants

+($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;

+# Reorder dot below after medials and vowel diacritics

+\u1037 ([\u102D-\u1030\u1032\u1036]) → $1 \u1037;

+\u1037 ($umedial+) → $1 \u1037;

+# Move vowel signs after medials

+($vowelsign+) ($umedial+) → $2 $1;

+# Reorder modifiers and asat

+($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;

+####

+#### Stage 5. More reorderings

+#### Vowel signs after medials, sort medials,

+####

+::Null;

+([\u1031]+) ($umedial+) → $2 $1;

+# More moving vowel signs after medials

+($vowelsign) ($umedial) → $2 $1;

+# Sort the medials

+([\u103C\u103D\u103E]) \u103B → \u103B $1;

+([\u103D\u103E]) \u103C → \u103C $1;

+\u103E\u103D → \u103D\u103E ;

+# Move visarga (\u1038) after other signs

+\u1038 ([$vowelmedial]) → $1 \u1038;

+\u1038 ([\u1036\u1037\u103A]) → $1 \u1038;

+###

+### Stage 6

+### Finish medial sorting, fix conflicting and extra diacritics

+###

+::Null;

+# Fix 103B/103A order for asat.

+($consonant) \u103B \u103A → $1 \u103A \u103B;

« no previous file with comments | « source/data/translit/Uzbek_Latin_BGN.txt ('k') | source/data/translit/am_am_FONIPA.txt » ('j') | no next file with comments »