Index: source/data/translit/Arab_Latn.txt |
diff --git a/source/data/translit/Arabic_Latin.txt b/source/data/translit/Arab_Latn.txt |
similarity index 71% |
rename from source/data/translit/Arabic_Latin.txt |
rename to source/data/translit/Arab_Latn.txt |
index ef0484f34af17479e30d8cbcdea9123adc2b0870..ead4fcb89f874e24a593d144f4a2a1024e87879c 100644 |
--- a/source/data/translit/Arabic_Latin.txt |
+++ b/source/data/translit/Arab_Latn.txt |
@@ -1,23 +1,35 @@ |
-# *************************************************************************** |
-# * |
-# * Copyright (C) 2004-2015, International Business Machines |
-# * Corporation; Unicode, Inc.; and others. All Rights Reserved. |
-# * |
-# *************************************************************************** |
-# File: Arabic_Latin.txt |
-# Generated from CLDR |
+# © 2016 and later: Unicode, Inc. and others. |
+# License & terms of use: http://www.unicode.org/copyright.html#License |
# |
+# File: Arab_Latn.txt |
+# Generated from CLDR |
+# |
+ |
+# Generally follows UNGEGN |
+# http://www.eki.ee/wgrs/rom1_ar.pdf |
+# Occasionally deviates in the direction of ISO 233 |
+# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf |
+# a) where required for disambiguation. |
+# b) with underdot instead of cedilla for letters like SAD, |
+# since those are explicitly in Unicode for transliteration. |
+# c) with extra non-Arabic-language letters, like PEH |
+# |
+# Does *not* do assimilation of "al", nor hyphenation. |
+# While it could be done, we need to determine whether a prefix "al" could |
+# occur other than as the definite article (since no space is used). |
:: [[:Arabic:][:block=ARABIC:][ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ; |
:: NFKD (NFC); |
$disambig = \u0331 ; |
$disambig2 = \u0330 ; |
$under = \u0323 ; |
$descender = ˌ; |
-$notAbove = [[:^ccc=0:]&[:^ccc=230:]]; |
+$notAbove = [[:^ccc=0:] & [:^ccc=230:]]; |
+# non-letters |
[:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR |
[:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR |
٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR |
٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR |
+# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate |
، ↔ ',' ; # ARABIC COMMA |
؛ ↔ ';' ; # ARABIC SEMICOLON |
؟ ↔ '?' ; # ARABIC QUESTION MARK |
@@ -42,9 +54,12 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; |
٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN |
٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT |
٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE |
+# letters |
+# long vowels |
\u064Eا↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF |
\u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW |
\u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH |
+# longer items moved here to prevent masking |
ث ↔ t h $disambig ; # ARABIC LETTER THEH |
ذ ↔ d h $disambig ; # ARABIC LETTER THAL |
ش ↔ s h $disambig ; # ARABIC LETTER SHEEN |
@@ -53,13 +68,19 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; |
ط ↔ t $under ; # ARABIC LETTER TAH |
ظ ↔ z $under ; # ARABIC LETTER ZAH |
غ ↔ g h $disambig ; # ARABIC LETTER GHAIN |
+# WARNING: special case |
+# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→ |
+# so in the reverse (Latin → Arabic) direction, we have to skip over (but preserve) the half-ring below (or other marks like it) |
+# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS |
ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA |
ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA |
+# non-Arabic language |
ژ ↔ z h $disambig ; # ARABIC LETTER JEH |
ڭ ↔ n $disambig g ; # ARABIC LETTER NG |
ۋ ↔ v $disambig ; # ARABIC LETTER VE |
ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH |
ښ ↔ s $descender; |
+# Arabic language |
ء ↔ ʾ ; # ARABIC LETTER HAMZA |
ا ↔ a $under; # ARABIC LETTER ALEF |
ب ↔ b ; # ARABIC LETTER BEH |
@@ -92,13 +113,18 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; |
\u0650 ↔ i ; # ARABIC KASRA |
\u0651 ↔ \u0303 ; # ARABIC SHADDA |
\u0652 ↔ \u030A ; # ARABIC SUKUN |
+# special combining marks |
\u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE |
\u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE |
\u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW |
+# Some non-Arabic language (not in UNGEGN) |
پ ↔ p ; # ARABIC LETTER PEH |
چ ↔ c h $disambig ; # ARABIC LETTER TCHEH |
ڤ ↔ v ; # ARABIC LETTER VEH |
+# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW |
+# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW |
گ ↔ g ; # ARABIC LETTER GAF |
+# fallbacks |
| s ← c } [eiy]; |
| k ← c ; |
| i ← e ; |
@@ -108,3 +134,4 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; |
:: (lower) ; |
::NFC (NFD); |
:: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] ); |
+ |