| Index: source/data/translit/ThaiLogical_Latin.txt
|
| diff --git a/source/data/translit/ThaiLogical_Latin.txt b/source/data/translit/ThaiLogical_Latin.txt
|
| index 4912be5a56a3a017ba5213a6aa0d90c7c466bebc..c063e7901b4afc45c998066107d04eebf88d37fd 100644
|
| --- a/source/data/translit/ThaiLogical_Latin.txt
|
| +++ b/source/data/translit/ThaiLogical_Latin.txt
|
| @@ -1,14 +1,37 @@
|
| -# ***************************************************************************
|
| -# *
|
| -# * Copyright (C) 2004-2015, International Business Machines
|
| -# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
|
| -# *
|
| -# ***************************************************************************
|
| +# © 2016 and later: Unicode, Inc. and others.
|
| +# License & terms of use: http://www.unicode.org/copyright.html#License
|
| +#
|
| # File: ThaiLogical_Latin.txt
|
| -# Generated from CLDR
|
| +# Generated from CLDR
|
| +#
|
| +
|
| +# Thai-Latin
|
| +# This set of rules follows ISO 11940
|
| +# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
|
| +# except that ISO 11940 does not mention an implicit vowel, so we use o\u0323
|
| +#
|
| +# The transcription is fairly ugly, so we ought to also do the UNGEGN version
|
| +# see: http://www.eki.ee/wgrs/rom1_th.pdf
|
| +# and probably make that the main variant.
|
| #
|
| +# Note: this is an internal file. NFD/NFC normalization is handled externally, in the index.
|
| +# The insertion of spaces between words, the reversal of the vowels
|
| +# and the conversion of space to semicolon are done *outside* of these rules.
|
| +# So as far as these rules are concerned, the vowels are in logical order!
|
| +# insert implicit vowel (and remove it going the other way)
|
| +# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically
|
| +#$consonant = [ก-ฮ];
|
| +#$vowel = [ะ-\u0E3Aเ-ไ\u0E47];
|
| +#{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ;
|
| +#\uE000 → o\u0323 ;
|
| +# ← o\u0323 ;
|
| $notAbove = [^\p{ccc=0}\p{ccc=above}] ;
|
| $notBelow = [^\p{ccc=0}\p{ccc=below}] ;
|
| +# Consonants
|
| +# Warning: the 'h's need to be handled carefully!
|
| +# What we really want to say is the following, but we can't
|
| +# $notHAccent = !($notAbove* \u0304 | $notBelow* \u0323) ;
|
| +# Since the only accents we care about that could cause problems are free-standing accents below, we use instead:
|
| $freeStandingBelow = [\u0325 ];
|
| $hAccent = [ \u0304 \u0323];
|
| $notHAccent0 = [^$freeStandingBelow$hAccent];
|
| @@ -40,8 +63,10 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
|
| ธ ↔ t\u0323h ; # THAI CHARACTER THO THONG
|
| ท ← th } $notHAccent1 ; # THAI CHARACTER THO THAHAN
|
| ท ↔ th } $notHAccent0 ; # THAI CHARACTER THO THAHAN
|
| +#Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses a vertical tick.
|
| ฏ ↔ t\u0329 ; # THAI CHARACTER TO PATAK
|
| ต ↔ t ; # THAI CHARACTER TO TAO
|
| +# since there is no singleton g (generated), don't worry about that.
|
| ง ↔ ng ; # THAI CHARACTER NGO NGU
|
| ณ ↔ n\u0323 ; # THAI CHARACTER NO NEN
|
| น ↔ n ; # THAI CHARACTER NO NU
|
| @@ -67,9 +92,11 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
|
| ฟ ↔ f ; # THAI CHARACTER FO FAN
|
| อ ↔ x ; # THAI CHARACTER O ANG
|
| ซ ↔ s ; # THAI CHARACTER SO SO
|
| +# vowels
|
| \u0E31 ↔ a\u0323 ; # THAI CHARACTER MAI HAN-AKAT
|
| า → a\u0304 ; # THAI CHARACTER SARA AA
|
| า | $1 ← a ($notAbove*) \u0304; # backward case, account for reordering
|
| +# We deviate from ISO for SARA AM for disambiguation
|
| ำ → a \u0309; # THAI CHARACTER SARA AM
|
| ำ | $1 ← a ($notAbove*) \u0309 ; # backward case, account for reordering
|
| ะ ↔ a ; # THAI CHARACTER SARA A
|
| @@ -82,6 +109,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
|
| \u0E39 | $1 ← u ($notAbove*) \u0304 ; # backward case, account for reordering
|
| \u0E38 ↔ u ; # THAI CHARACTER SARA U
|
| ฯ ↔ ‡ ; # THAI CHARACTER PAIYANNOI
|
| +# ฿ ↔ XXX ; # THAI CURRENCY SYMBOL BAHT
|
| เ ↔ e ; # THAI CHARACTER SARA E
|
| แ ↔ æ ; # THAI CHARACTER SARA AE
|
| โ ↔ o ; # THAI CHARACTER SARA O
|
| @@ -95,6 +123,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
|
| \u0E4B ↔ \u030C ; # THAI CHARACTER MAI CHATTAWA
|
| \u0E4C ↔ \u0312 ; # THAI CHARACTER THANTHAKHAT
|
| \u0E4E ↔ '~' ; # THAI CHARACTER YAMAKKAN
|
| +# We deviate from ISO for disambiguation
|
| \u0E4D ↔ \u030A ; # THAI CHARACTER NIKHAHIT
|
| ๏ ↔ '§' ; # THAI CHARACTER FONGMAN
|
| ๐ ↔ 0 ; # THAI DIGIT ZERO
|
| @@ -110,11 +139,15 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
|
| ๚ ↔ '||' ; # THAI CHARACTER ANGKHANKHU
|
| ๛ ↔ » ; # THAI CHARACTER KHOMUT
|
| ๆ ↔ « ; # THAI CHARACTER MAIYAMOK
|
| +# moved down to make shorter first
|
| +#Note: PHINTHU deviates from ISO since the underring causes canonical problems. So it uses a spacing tick below.
|
| \u0E3A ↔ ˌ ; # THAI CHARACTER PHINTHU
|
| \u0E34 ↔ i ; # THAI CHARACTER SARA I
|
| +# fallbacks
|
| | k ← g ;
|
| | k ← h ;
|
| | c ← j ;
|
| | k ← q ;
|
| | s ← z ;
|
| :: (lower);
|
| +
|
|
|