Index: source/data/translit/ThaiLogical_Latin.txt |
diff --git a/source/data/translit/ThaiLogical_Latin.txt b/source/data/translit/ThaiLogical_Latin.txt |
index 4912be5a56a3a017ba5213a6aa0d90c7c466bebc..c063e7901b4afc45c998066107d04eebf88d37fd 100644 |
--- a/source/data/translit/ThaiLogical_Latin.txt |
+++ b/source/data/translit/ThaiLogical_Latin.txt |
@@ -1,14 +1,37 @@ |
-# *************************************************************************** |
-# * |
-# * Copyright (C) 2004-2015, International Business Machines |
-# * Corporation; Unicode, Inc.; and others. All Rights Reserved. |
-# * |
-# *************************************************************************** |
+# © 2016 and later: Unicode, Inc. and others. |
+# License & terms of use: http://www.unicode.org/copyright.html#License |
+# |
# File: ThaiLogical_Latin.txt |
-# Generated from CLDR |
+# Generated from CLDR |
+# |
+ |
+# Thai-Latin |
+# This set of rules follows ISO 11940 |
+# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf |
+# except that the standard does not mention an implicit vowel, so we use o\u0323 |
+# |
+# The transcription is fairly ugly, so we ought to also do the UNGEGN version |
+# see: http://www.eki.ee/wgrs/rom1_th.pdf |
+# and probably make that the main variant. |
# |
+# Note: this is an internal file. NFD/NFC normalization is handled externally, in the index. |
+# The insertion of spaces between words, the reversal of the vowels |
+# and the conversion of space to semicolon are done *outside* of these rules. |
+# So as far as these rules are concerned, the vowels are in logical order! |
+# insert implicit vowel (and remove it going the other way) |
+# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically |
+#$consonant = [ก-ฮ]; |
+#$vowel = [ะ-\u0E3Aเ-ไ\u0E47]; |
+#{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ; |
+#\uE000 → o\u0323 ; |
+# ← o\u0323 ; |
$notAbove = [^\p{ccc=0}\p{ccc=above}] ; |
$notBelow = [^\p{ccc=0}\p{ccc=below}] ; |
+# Consonants |
+# Warning: the 'h's need to be handled carefully! |
+# What we really want to say is the following, but we can't |
+# $notHAccent = !($notAbove* \u0304 | $notBelow* \u0323) ; |
+# Since the only accents we care about that could cause problems are free-standing accents below, we use instead: |
$freeStandingBelow = [\u0325 ]; |
$hAccent = [ \u0304 \u0323]; |
$notHAccent0 = [^$freeStandingBelow$hAccent]; |
@@ -40,8 +63,10 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; |
ธ ↔ t\u0323h ; # THAI CHARACTER THO THONG |
ท ← th } $notHAccent1 ; # THAI CHARACTER THO THAHAN |
ท ↔ th } $notHAccent0 ; # THAI CHARACTER THO THAHAN |
+# Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous, so it uses a vertical tick below (\u0329). |
ฏ ↔ t\u0329 ; # THAI CHARACTER TO PATAK |
ต ↔ t ; # THAI CHARACTER TO TAO |
+# since there is no singleton g (generated), don't worry about that. |
ง ↔ ng ; # THAI CHARACTER NGO NGU |
ณ ↔ n\u0323 ; # THAI CHARACTER NO NEN |
น ↔ n ; # THAI CHARACTER NO NU |
@@ -67,9 +92,11 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; |
ฟ ↔ f ; # THAI CHARACTER FO FAN |
อ ↔ x ; # THAI CHARACTER O ANG |
ซ ↔ s ; # THAI CHARACTER SO SO |
+# vowels |
\u0E31 ↔ a\u0323 ; # THAI CHARACTER MAI HAN-AKAT |
า → a\u0304 ; # THAI CHARACTER SARA AA |
า | $1 ← a ($notAbove*) \u0304; # backward case, account for reordering |
+# We deviate from ISO for SARA AM for disambiguation |
ำ → a \u0309; # THAI CHARACTER SARA AM |
ำ | $1 ← a ($notAbove*) \u0309 ; # backward case, account for reordering |
ะ ↔ a ; # THAI CHARACTER SARA A |
@@ -82,6 +109,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; |
\u0E39 | $1 ← u ($notAbove*) \u0304 ; # backward case, account for reordering |
\u0E38 ↔ u ; # THAI CHARACTER SARA U |
ฯ ↔ ‡ ; # THAI CHARACTER PAIYANNOI |
+# ฿ ↔ XXX ; # THAI CURRENCY SYMBOL BAHT |
เ ↔ e ; # THAI CHARACTER SARA E |
แ ↔ æ ; # THAI CHARACTER SARA AE |
โ ↔ o ; # THAI CHARACTER SARA O |
@@ -95,6 +123,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; |
\u0E4B ↔ \u030C ; # THAI CHARACTER MAI CHATTAWA |
\u0E4C ↔ \u0312 ; # THAI CHARACTER THANTHAKHAT |
\u0E4E ↔ '~' ; # THAI CHARACTER YAMAKKAN |
+# We deviate from ISO for disambiguation |
\u0E4D ↔ \u030A ; # THAI CHARACTER NIKHAHIT |
๏ ↔ '§' ; # THAI CHARACTER FONGMAN |
๐ ↔ 0 ; # THAI DIGIT ZERO |
@@ -110,11 +139,15 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; |
๚ ↔ '||' ; # THAI CHARACTER ANGKHANKHU |
๛ ↔ » ; # THAI CHARACTER KHOMUT |
ๆ ↔ « ; # THAI CHARACTER MAIYAMOK |
+# moved down to make shorter first |
+# Note: PHINTHU deviates from ISO since an underring causes canonical problems, so it uses a spacing tick below. |
\u0E3A ↔ ˌ ; # THAI CHARACTER PHINTHU |
\u0E34 ↔ i ; # THAI CHARACTER SARA I |
+# fallbacks |
| k ← g ; |
| k ← h ; |
| c ← j ; |
| k ← q ; |
| s ← z ; |
:: (lower); |
+ |