| Index: source/data/translit/Arab_Latn.txt
|
| diff --git a/source/data/translit/Arabic_Latin.txt b/source/data/translit/Arab_Latn.txt
|
| similarity index 71%
|
| rename from source/data/translit/Arabic_Latin.txt
|
| rename to source/data/translit/Arab_Latn.txt
|
| index ef0484f34af17479e30d8cbcdea9123adc2b0870..ead4fcb89f874e24a593d144f4a2a1024e87879c 100644
|
| --- a/source/data/translit/Arabic_Latin.txt
|
| +++ b/source/data/translit/Arab_Latn.txt
|
| @@ -1,23 +1,35 @@
|
| -# ***************************************************************************
|
| -# *
|
| -# * Copyright (C) 2004-2015, International Business Machines
|
| -# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
|
| -# *
|
| -# ***************************************************************************
|
| -# File: Arabic_Latin.txt
|
| -# Generated from CLDR
|
| +# © 2016 and later: Unicode, Inc. and others.
|
| +# License & terms of use: http://www.unicode.org/copyright.html#License
|
| #
|
| +# File: Arab_Latn.txt
|
| +# Generated from CLDR
|
| +#
|
| +
|
| +# Generally follows UNGEGN
|
| +# http://www.eki.ee/wgrs/rom1_ar.pdf
|
| +# Occasionally deviates in the direction of ISO 233
|
| +# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf
|
| +# a) where required for disambiguation.
|
| +# b) with underdot instead of cedilla for letters like SAD,
|
| +# since those are explicitly in Unicode for transliteration.
|
| +# c) with extra non-Arabic-language letters, like PEH
|
| +#
|
| +# Does *not* do assimilation of "al", nor hyphenation.
|
| +# While it could be done, we need to determine whether a prefix "al" could
|
| +# occur other than as the definite article (since no space is used).
|
| :: [[:Arabic:][:block=ARABIC:][ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ;
|
| :: NFKD (NFC);
|
| $disambig = \u0331 ;
|
| $disambig2 = \u0330 ;
|
| $under = \u0323 ;
|
| $descender = ˌ;
|
| -$notAbove = [[:^ccc=0:]&[:^ccc=230:]];
|
| +$notAbove = [[:^ccc=0:] & [:^ccc=230:]];
|
| +# non-letters
|
| [:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR
|
| [:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR
|
| ٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR
|
| ٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR
|
| +# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate
|
| ، ↔ ',' ; # ARABIC COMMA
|
| ؛ ↔ ';' ; # ARABIC SEMICOLON
|
| ؟ ↔ '?' ; # ARABIC QUESTION MARK
|
| @@ -42,9 +54,12 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
|
| ٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN
|
| ٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
|
| ٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
|
| +# letters
|
| +# long vowels
|
| \u064Eا↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF
|
| \u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW
|
| \u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH
|
| +# longer items moved here to prevent masking
|
| ث ↔ t h $disambig ; # ARABIC LETTER THEH
|
| ذ ↔ d h $disambig ; # ARABIC LETTER THAL
|
| ش ↔ s h $disambig ; # ARABIC LETTER SHEEN
|
| @@ -53,13 +68,19 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
|
| ط ↔ t $under ; # ARABIC LETTER TAH
|
| ظ ↔ z $under ; # ARABIC LETTER ZAH
|
| غ ↔ g h $disambig ; # ARABIC LETTER GHAIN
|
| +# WARNING: special case
|
| +# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→
|
| +# so in the reverse (Latin → Arabic) direction, we have to skip over (but preserve) the half-ring below (or others like it)
|
| +# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
|
| ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA
|
| ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA
|
| +# non-Arabic language
|
| ژ ↔ z h $disambig ; # ARABIC LETTER JEH
|
| ڭ ↔ n $disambig g ; # ARABIC LETTER NG
|
| ۋ ↔ v $disambig ; # ARABIC LETTER VE
|
| ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH
|
| ښ ↔ s $descender;
|
| +# Arabic language
|
| ء ↔ ʾ ; # ARABIC LETTER HAMZA
|
| ا ↔ a $under; # ARABIC LETTER ALEF
|
| ب ↔ b ; # ARABIC LETTER BEH
|
| @@ -92,13 +113,18 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
|
| \u0650 ↔ i ; # ARABIC KASRA
|
| \u0651 ↔ \u0303 ; # ARABIC SHADDA
|
| \u0652 ↔ \u030A ; # ARABIC SUKUN
|
| +# special combining marks
|
| \u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE
|
| \u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE
|
| \u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW
|
| +# Some non-Arabic-language letters (not in UNGEGN)
|
| پ ↔ p ; # ARABIC LETTER PEH
|
| چ ↔ c h $disambig ; # ARABIC LETTER TCHEH
|
| ڤ ↔ v ; # ARABIC LETTER VEH
|
| +# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
|
| +# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
|
| گ ↔ g ; # ARABIC LETTER GAF
|
| +# fallbacks
|
| | s ← c } [eiy];
|
| | k ← c ;
|
| | i ← e ;
|
| @@ -108,3 +134,4 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
|
| :: (lower) ;
|
| ::NFC (NFD);
|
| :: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] );
|
| +
|
|
|