Index: source/data/translit/Grek_Latn.txt |
diff --git a/source/data/translit/Greek_Latin.txt b/source/data/translit/Grek_Latn.txt |
similarity index 75% |
rename from source/data/translit/Greek_Latin.txt |
rename to source/data/translit/Grek_Latn.txt |
index 5118c6fe6e56948a533e97bd5872cb2572017806..a682aff6712b54f27e2c5a30ef698b9b7b762834 100644 |
--- a/source/data/translit/Greek_Latin.txt |
+++ b/source/data/translit/Grek_Latn.txt |
@@ -1,18 +1,30 @@ |
-# *************************************************************************** |
-# * |
-# * Copyright (C) 2004-2015, International Business Machines |
-# * Corporation; Unicode, Inc.; and others. All Rights Reserved. |
-# * |
-# *************************************************************************** |
-# File: Greek_Latin.txt |
-# Generated from CLDR |
+# © 2016 and later: Unicode, Inc. and others. |
+# License & terms of use: http://www.unicode.org/copyright.html#License |
# |
+# File: Grek_Latn.txt |
+# Generated from CLDR |
+# |
+ |
+# Rules are predicated on running NFD first, and NFC afterwards |
+# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ; |
+# MINIMAL FILTER GENERATED FOR: Greek-Latin |
:: [;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ\u0304\u0308\u0313-\u0314\u0342-\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ; |
:: NFD (NFC) ; |
+# TEST CASES |
+# Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος |
+# ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ |
+# ᾳ ῃ ῳ ὃ ὄ |
+# ὠς ὡς ὢς ὣς |
+# Ὠς Ὡς Ὢς Ὣς |
+# ὨΣ ὩΣ ὪΣ ὫΣ |
+# Ạ, ạ, Ẹ, ẹ, Ọ, ọ |
+# Useful variables |
$lower = [[:latin:][:greek:] & [:Ll:]]; |
$glower = [[:greek:] & [:Ll:]]; |
$upper = [[:latin:][:greek:] & [:Lu:]] ; |
$accent = [:M:] ; |
+# NOTE: restrict to just the Greek & Latin accents that we care about |
+# TODO: broaden out once iteration is fixed |
$accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ; |
$macron = \u0304 ; |
$ddot = \u0308 ; |
@@ -37,18 +49,27 @@ $beforeLetter = [[:M:]\']* [:L:] ; |
$beforeLower = $accent * $lower ; |
$notLetter = [^[:L:][:M:]] ; |
$under = \u0331; |
+# Fix punctuation |
+# preserve original |
\: ↔ \: $under ; |
\? ↔ \? $under ; |
\; ↔ \? ; |
· ↔ \: ; |
+# CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve |
\u0342 ↔ \u0302 ; |
+# IOTA: convert iota subscript to iota |
+# first make previous alpha long! |
$accent_minus = [[$accent]-[$iotasub$macron]]; |
Α } $accent_minus * $iotasub → | Α $macron ; |
α } $accent_minus * $iotasub → | α $macron ; |
+# now convert to uppercase if after uppercase, otherwise to lowercase |
$upper $accent * { $iotasub → I ; |
$iotasub → i ; |
| $1 $iotasub ← ($evowel $macron $accentMinus *) i ; |
| $1 $iotasub ← ($evowel $macron $accentMinus *) I ; |
+# BREATHING |
+# Convert rough breathing to h, and move before letters. |
+# Make A ` x = → H a x |
Α ($macron?) $rough } $beforeLower → H | α $1; |
Ε $rough } $beforeLower → H | ε; |
Η $rough } $beforeLower → H | η ; |
@@ -56,6 +77,7 @@ $iotasub → i ; |
Ο $rough } $beforeLower → H | ο ; |
Υ $rough } $beforeLower → H | υ ; |
Ω ($ddot?) $rough } $beforeLower → H | ω $1; |
+# Make A x ` = → H a x |
Α ($glower $macron?) $rough → H | α $1 ; |
Ε ($glower) $rough → H | ε $1 ; |
Η ($glower) $rough → H | η $1 ; |
@@ -63,14 +85,18 @@ $iotasub → i ; |
Ο ($glower) $rough → H | ο $1 ; |
Υ ($glower) $rough → H | υ $1 ; |
Ω ($glower $ddot?) $rough → H | ω $1 ; |
+#Otherwise, make x ` into h x and X ` into H X |
($lcgvowel + $ddotmac? ) $rough → h | $1 ; |
($gvowel + $ddotmac? ) $rough → H | $1 ; |
+# Go backwards with H |
| $1 $rough ← h ($evowel $macron $ddot? $evowel2_i $macron?) ; |
| $1 $rough ← h ($evowel $ddot? $evowel2 $macron?) ; |
| $1 $rough ← h ($evowel $macron? $ddot?) ; |
| $1 $rough ← H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; |
| $1 $rough ← H ([AEIOUY] $ddot? $evowel2 $macron?) ; |
| $1 $rough ← H ([AEIOUY] $macron? $ddot?) ; |
+# titlecase, have to fix individually |
+# in the future, we should add &uppercase() to make this easier |
| A $1 $rough ← H a ($macron $ddot? $evowel2_i $macron?) ; |
| E $1 $rough ← H e ($macron $ddot? $evowel2_i $macron?) ; |
| I $1 $rough ← H i ($macron $ddot? $evowel2_i $macron?) ; |
@@ -89,10 +115,18 @@ $iotasub → i ; |
| O $1 $rough ← H o ($macron? $ddot? ) ; |
| U $1 $rough ← H u ($macron? $ddot? ) ; |
| Y $1 $rough ← H y ($macron? $ddot? ) ; |
+# Now do smooth |
+#delete smooth breathing for Latin |
$smooth → ; |
+# insert in Greek |
+# the assumption is that all Marks are on letters. |
| $1 $smooth ← $notLetter { ([rR]) } [^hH$smooth$rough] ; |
| $1 $smooth ← $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; |
| $1 $smooth ← $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ; |
+# TODO: preserve smooth/rough breathing if not |
+# on initial vowel sequence |
+# need to have these up here so the rules don't mask |
+# remove now superfluous macron when returning |
Α ← A $macron ; |
α ← a $macron ; |
η ↔ e $macron ; |
@@ -105,6 +139,7 @@ $smooth → ; |
ψ ↔ ps ; |
ω ↔ o $macron ; |
Ω ↔ O $macron; |
+# NORMAL |
α ↔ a ; |
Α ↔ A ; |
β ↔ b ; |
@@ -145,17 +180,24 @@ $smooth → ; |
Ρ $rough ↔ RH ; |
ρ ↔ r ; |
Ρ ↔ R ; |
+# insert separator before things that turn into s |
[Pp] { } [ςσΣϷϸϺϻ] → \' ; |
+# special S variants |
Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L |
ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L |
Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L |
ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L |
+# underbar means exception |
+# before a letter, initial |
ς } $beforeLetter ↔ s $underbar } $beforeLetter; |
σ } $beforeLetter ↔ s } $beforeLetter; |
+# otherwise, after a letter = final |
$afterLetter { σ ↔ $afterLetter { s $underbar; |
$afterLetter { ς ↔ $afterLetter { s ; |
+# otherwise (isolated) = initial |
ς ↔ s $underbar; |
σ ↔ s ; |
+# [Pp] { Σ ↔ \'S ; |
Σ ↔ S ; |
τ ↔ t ; |
Τ ↔ T ; |
@@ -166,6 +208,7 @@ $vowel { Υ ↔ U ; |
χ ↔ ch ; |
Χ } $beforeLower ↔ Ch ; |
Χ ↔ CH ; |
+# Completeness for ASCII |
$ignore = [[:Mark:]''] * ; |
| k ← c ; |
| ph ← f ; |
@@ -187,6 +230,7 @@ $rough } $ignore [:UppercaseLetter:] → H ; |
$ignore [:UppercaseLetter:] { $rough → H ; |
$rough ← H ; |
$rough ↔ h ; |
+# Completeness for Greek |
ϐ → | β ; |
ϑ → | θ ; |
ϒ → | Υ ; |
@@ -201,7 +245,12 @@ $rough ↔ h ; |
ϵ → | ε ; |
µ → | μ ; |
ͺ → i; |
+# delete any trailing ' marks used for roundtripping |
← [Ππ] { \' } [Ss] ; |
← [Νν] { \' } $egammaLike ; |
::NFC (NFD) ; |
+# ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ; |
+# ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ; |
+# MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD |
:: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0300-\u0337\u0339-\u0345΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ; |
+ |