Index: source/data/translit/si_si_FONIPA.txt |
diff --git a/source/data/translit/si_si_FONIPA.txt b/source/data/translit/si_si_FONIPA.txt |
new file mode 100644 |
index 0000000000000000000000000000000000000000..b1c6c8ac28fb8d159b5618cc0c4e86d4c5e455a5 |
--- /dev/null |
+++ b/source/data/translit/si_si_FONIPA.txt |
@@ -0,0 +1,163 @@ |
+# © 2016 and later: Unicode, Inc. and others. |
+# License & terms of use: http://www.unicode.org/copyright.html#License |
+# |
+# File: si_si_FONIPA.txt |
+# Generated from CLDR |
+# |
+ |
+# Sinhala pronunciation rules |
+# |
+# Output |
+# k ɡ ŋ ᵑɡ c ɟ ɲ ʈ ɖ ⁿɖ t d n ⁿd p b m ᵐb j r l w ʃ s z h f |
+# ə əː a aː æ æː i iː u uː e eː o oː |
+# |
+# References |
+# [1] Asanka Wasala, Ruvan Weerasinghe, and Kumudu Gamage: |
+# Sinhala Grapheme-to-Phoneme Conversion and Rules for Schwa Epenthesis. |
+# Proceedings of the COLING/ACL 2006 Main Conference Poster Sessions, |
+# pages 890–897. http://www.aclweb.org/anthology/P06-2114 |
+# Simplify ya + yansaya to plain ya after a consonant (the part before "{" is context only). |
+[\u0D9A-\u0DC6] \u0DCA (\u200D)? { ය\u0DCAය → ය; # context: consonant, virama, optional ZWJ |
+# Delete ZWNJ and ZWJ to simplify further processing (empty right-hand side = deletion). |
+\u200C → ; |
+\u200D → ; |
+# Insert a schwa after every consonant that is not followed by a dependent vowel |
+# or virama. "::Null;" starts a new pass, so this rule runs on the output of the rules above. |
+::Null; |
+([\u0D9A-\u0DC6]) } [^\u0DCA-\u0DDF \u0DF2\u0DF3] → $1 ə; # $1 = the consonant; the set after "}" is lookahead only |
+# Pronunciation rules proper. Rules are tried in order, so the digraph rules precede the single-letter rules. |
+::Null; |
+# fප is an alternative spelling of ෆ. |
+# This occurs e.g. in ඩේව\u0DD2ඩ\u0DCA කොපර\u0DCAfප\u0DD3ල\u0DCAඩ\u0DCA (David Copperfield) |
+# [see http://bradshawofthefuture.blogspot.com/2013/02/f.html]. |
+[Ff]ප → f; |
+# zස is seemingly the only way to unambiguously indicate a voiced /z/ sound. |
+# This occurs in e.g. ඇල\u0DCAzසය\u0DD2ම' රෝගය (Alzheimer's disease) |
+# [see https://si.wikipedia.org/wiki/ඇල\u0DCAzසය\u0DD2ම%27_රෝගය] |
+# or in zස\u0DD3බ\u0DCAරා (zebra) [see https://si.wikipedia.org/wiki/zස\u0DD3බ\u0DCAරා]. |
+[Zz]ස → z; |
+ං → ŋ; |
+o → ŋ; # common substitution for anusvaraya. NOTE(review): the left-hand side looks like Latin "o" — confirm the code point |
+ඃ ([\u0D9A-\u0DC6]) → | $1 \u0DCA $1; # rewrite as C + virama + C with the cursor ("|") in front, so the rules below convert the doubled consonant. TODO: check which consonants geminate |
+ඃ → h; # any ඃ not handled by the rule above |
+අ → a; # vowel letters: each maps directly to an IPA vowel or short sequence |
+ආ → aː; |
+ඇ → æ; |
+ඈ → æː; |
+ඉ → i; |
+ඊ → iː; |
+උ → u; |
+ඌ → uː; |
+ඍ → ri; # NOTE(review): yields "ri" while ඎ below yields "ruː" — confirm the differing vowel is intended |
+ඎ → ruː; |
+ඏ → ilu; |
+ඐ → iluː; |
+එ → e; |
+ඒ → eː; |
+ඓ → aj; |
+ඔ → o; |
+ඕ → oː; |
+ඖ → aw; # TODO: check if this is correct |
+ක → k; # consonant letters: several letters share one output phoneme |
+ඛ → k; # same output as ක |
+ග → ɡ; |
+ඝ → ɡ; # same output as ග |
+ඞ → ŋ; |
+ඟ → ᵑɡ; |
+ච → c; |
+ඡ → c; # same output as ච |
+ජ → ɟ; |
+ඣ → ɟ; # same output as ජ |
+ඤ → ɲ; |
+ඥ → kɲ; # TODO: double-check |
+ඦ → ɟ; # same output as ජ. NOTE(review): ඟ ඬ ඳ ඹ get a superscript nasal prefix; confirm this letter should not |
+ට → ʈ; |
+ඨ → ʈ; # same output as ට |
+ඩ → ɖ; |
+ඪ → ɖ; # same output as ඩ |
+ණ → n; # same output as න |
+ඬ → ⁿɖ; |
+ත → t; |
+ථ → t; # same output as ත |
+ද → d; |
+ධ → d; # same output as ද |
+න → n; |
+ඳ → ⁿd; |
+ප → p; |
+ඵ → p; # same output as ප |
+බ → b; |
+භ → b; # same output as බ |
+ම → m; |
+ඹ → ᵐb; |
+ය → j; |
+ර → r; |
+ල → l; |
+ව → w; |
+ශ → ʃ; |
+ෂ → ʃ; # same output as ශ |
+ස → s; |
+හ → h; |
+ළ → l; # same output as ල |
+ෆ → f; # same output as the fප digraph rule |
+\u0DCA → ; # delete virama |
+ා → aː; # vowel signs: each maps directly to an IPA vowel or short sequence |
+ැ → æ; |
+ෑ → æː; |
+\u0DD2 → i; |
+\u0DD3 → iː; |
+\u0DD4 → u; |
+\u0DD6 → uː; |
+ෘ → ru; # NOTE(review): the letter ඍ maps to "ri"; presumably this is its sign form — confirm "ru" is intended |
+ෙ → e; |
+ේ → eː; |
+ෛ → aj; |
+ො → o; |
+ෝ → oː; |
+ෞ → aw; # TODO: check if this is correct |
+ෟ → lu; |
+ෲ → ruː; |
+ෳ → luː; |
+# Heuristics for turning /ə/ into /a/ (some rules also turn /a/ back into /ə/). Based on [1]. |
+$c=[k ɡ ŋ {ᵑɡ} c ɟ ɲ ʈ ɖ {ⁿɖ} t d n {ⁿd} p b m {ᵐb} j r l w ʃ s z h f]; # consonant phonemes emitted by the rules above; {…} is a multi-character set member |
+$s=[:^L:]; # any non-letter; used below as a word-boundary context |
+# Rule #1: ə → a right after one or two word-initial consonants, unless an exception (ə → ə) matches first. |
+::Null; |
+$s sw { ə → ə; # exception (a); the former pattern "sv" could never match because ව is transcribed as w, never v |
+$s k { ə } r → ə; # exception (b): word-initial k + ə before r |
+$s $c { ə } $s → ə; # exception (c): consonant + ə standing alone between non-letters |
+$s $c $c { ə → a; |
+$s $c { ə → a; |
+# Rule #2: vowel after consonant + r. It is a separate pass ("::Null;"), so it runs on the output of the preceding rules. |
+::Null; |
+$c r { ə } $c → a; # clause (a) and (b): ə → a before a consonant |
+$c r { a } h → a; # clause (d), exception: a before h is kept, shielding it from the next rule |
+$c r { a } $c → ə; # clause (c): a → ə before any other consonant |
+# Rule #3: ə → a when it comes right after one of a, e, æ, o, ə followed by h. |
+# The paper is unclear about what this rule means. The interpretation here |
+# assumes that "preceded" in the paper is a typo and should be read "followed". |
+::Null; |
+[a e æ o ə] h { ə → a; # the vowel and h before "{" are context only |
+# Rules #4 through #7 (one pass; rules are tried in order, so the Rule #5 exception precedes Rule #5). |
+::Null; |
+ə } $c $c → a; # Rule #4: ə → a before two consonants |
+ə } [rbɖʈ] $s → ə; # Rule #5 exception: ə is kept before r, b, ɖ or ʈ followed by a non-letter |
+ə } $c $s → a; # Rule #5: ə → a before any other consonant followed by a non-letter |
+ə } ji $s → a; # Rule #6: ə → a before "ji" followed by a non-letter |
+k { ə } [rl] u → a; # Rule #7: ə → a between k and "ru" or "lu" |
+# Rule #8: after a non-letter, "ka" becomes "kə" in the environments listed below. |
+# Note that the paper doesn't say explicitly that this rule should be |
+# anchored at the beginning of a word, but the remarks before the rules |
+# seem to imply this. |
+::Null; |
+$s k { a } l[aeo]ːj → ə; # Typo in paper: /j/ was /y/. |
+$s k { a } le[mh][ui] → ə; # before "le" + m or h + u or i |
+$s k { alə } h[ui] → əle; # "alə" is rewritten as "əle" before h + u or i |
+$s k { a } lə → ə; # before "lə" |
+# Diphthongs: final pass that contracts vowel + glide + vowel sequences left by the rules above. |
+::Null; |
+www+ → ww; # three or more w collapse to two, e.g. in යෞව\u0DCAවන |
+[i {iː} e {eː} æ {æː} o {oː} a {aː}] { wu → w; # "wu" → w after any listed vowel (u and uː are not in the set) |
+əji → aj; |
+iji → iː; # perhaps: ij |
+[u {uː} e {eː} æ {æː} o {oː} a {aː}] { ji → j; # "ji" → j after any listed vowel (i and iː are not in the set) |
+ |