source/data/translit/Latn_Kana.txt - Issue 2440913002: Update ICU to 58.1

Unified Diff: source/data/translit/Latn_Kana.txt

Issue 2440913002: Update ICU to 58.1

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/data/translit/Latn_Kana.txt

diff --git a/source/data/translit/Latin_Katakana.txt b/source/data/translit/Latn_Kana.txt

similarity index 63%

rename from source/data/translit/Latin_Katakana.txt

rename to source/data/translit/Latn_Kana.txt

index bd0e07c1b113fa9294072a64ebc1588b90c5caf8..ea4b7dd6875eb8f3ae07ace84ae1d2e52acb238e 100644

--- a/source/data/translit/Latin_Katakana.txt

+++ b/source/data/translit/Latn_Kana.txt

@@ -1,19 +1,67 @@

-# ***************************************************************************

-# *

-# ***************************************************************************

-# File: Latin_Katakana.txt

-# Generated from CLDR

+# License & terms of use: http://www.unicode.org/copyright.html#License

+# File: Latn_Kana.txt

+# Generated from CLDR

+# note: a global filter is more efficient, but MUST include all source chars

+#:: [\u0000-\u007E 、。 \u3099-゜ァ-ー｡-ﾟ [:Latin:][:Katakana:] [:nonspacing mark:]] ;

+# MINIMAL FILTER GENERATED FOR: Latin-Katakana

+### WARNING -- must add width filter, both here and below!!! ###

:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー！-～￠-￦][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;

:: [:Latin:] fullwidth-halfwidth ();

:: NFD (NFC);

:: Lower (); # whenever transliterating from cased to uncased script, include this

+# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese

+# Uses modified Hepburn. Small changes to make unambiguous.

+# | Kunrei-shiki: Hepburn/MHepburn

+# | ------------------------------

+# | si: shi

+# | si ~ya: sha

+# | si ~yu: shu

+# | si ~yo: sho

+# | zi: ji

+# | zi ~ya: ja

+# | zi ~yu: ju

+# | zi ~yo: jo

+# | ti: chi

+# | ti ~ya: cha

+# | ti ~yu: chu

+# | ti ~yo: cho

+# | tu: tsu

+# | di: ji/dji

+# | du: zu/dzu

+# | hu: fu

+# | For foreign words:

+# | -----------------

+# | se ~i si

+# | si ~e she

+# |

+# | ze ~i zi

+# | zi ~e je

+# |

+# | te ~i ti

+# | ti ~e che

+# | te ~u tu

+# |

+# | de ~i di

+# | de ~u du

+# | de ~i di

+# |

+# | he ~u: hu

+# | hu ~a fa

+# | hu ~i fi

+# | hu ~e fe

+# | hu ~o fo

+# Most small forms are generated, but if necessary

+# explicit small forms are given with ~a, ~ya, etc.

+#------------------------------------------------------

+# Variables

$vowel = [aeiou] ;

$consonant = [bcdfghjklmnpqrstvwxyz] ;

$macron = \u0304 ;

+# Variables used for doubled-consonants with tsu

$kana = [ぁ-ゔ] ;

$voice = [\u3099゛];

$semivoice = [\u309A゜];

@@ -30,22 +78,38 @@ $r_start = [ラリルレロらりるれろ] ;

$w_start = [ワヰヱヲわゐゑを] ;

$v_start = [ワヰヱヲ]\u3099 ;

$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;

+# if ン is followed by $n_quoter, then it needs an

+# apostrophe after its romaji form to disambiguate it.

+# e.g., ンア != ナ, so represent as "n'a", not "na".

$n_quoter = [アイウエオナニヌネノヤユヨン] ;

$small_y = [ャィュェョ] ;

$iteration = ゝ ;

+#------------------------------------------------------

+# katakana rules

+# Punctuation

'.' ↔ 。;

',' ↔ 、;

+# ' ' } [a-z] → ; # delete spaces before latin

+# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; # insert spaces before katakana

+# Iteration Mark

+# Copy the previous letter and its marks

+# TODO

+# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration

+# Specials for katakana -- not shared with hiragana

va ↔ ワ\u3099 ;

vi ↔ ヰ\u3099 ;

ve ↔ ヱ\u3099 ;

vo ↔ ヲ\u3099 ;

'~ka' ↔ ヵ ;

'~ke' ↔ ヶ ;

+# ~~~ begin shared rules ~~~

+#special

ya ← '~'ャ;

yi ← '~'ィ ;

yu ← '~'ュ;

ye ← '~'ェ;

yo ← '~'ョ;

+#normal

a ↔ ア ;

b | '~' ← ヒ \u3099} $small_y ;

by } $vowel → ヒ\u3099 | '~y' ;

@@ -69,6 +133,7 @@ dje ← チ\u3099ェ ;

djo ← チ\u3099ョ ;

dji ↔ チ\u3099 ;

dj } $vowel → チ\u3099 | '~y' ;

+# TODO: QUESTION: use ĵĴżŻ instead of dj, dz

cha ← チャ ;

chi'~i' ← チィ ; # liu

chu ← チュ ;

@@ -85,6 +150,7 @@ gu ↔ ク\u3099 ;

ge ↔ ケ\u3099 ;

go ↔ コ\u3099 ;

i ↔ イ ;

+# j } $vowel → シ\u3099 | '~y' ;

ja ↔ シ\u3099ャ ;

ji'~i' ← シ\u3099ィ ; # liu

ju ↔ シ\u3099ュ ;

@@ -128,6 +194,8 @@ hi ↔ ヒ ;

hu ↔ ヘゥ ;

he ↔ ヘ ;

ho ↔ ホ ;

+# f | '~' ← フ } $small_y ;

+# f } $vowel → フ | '~' ;

fa ↔ ファ ;

fi ↔ フィ ;

fe ↔ フェ ;

@@ -163,8 +231,14 @@ tu ↔ テゥ ;

te ↔ テ ;

to ↔ ト ;

tsu ↔ ツ ;

+# v } $vowel → ウ\u3099 | '~' ;

+#'v~a' ← ウ\u3099ァ ; # liu

+#'v~i' ← ウ\u3099ィ ; # liu

+#'v~e' ← ウ\u3099ェ ; # liu

+#'v~o' ← ウ\u3099ォ ; # liu

vu ↔ ウ\u3099 ;

u ↔ ウ ;

+# w } $vowel → ウ | '~' ;

wa ↔ ワ ;

wi ↔ ヰ ;

wu → ウ ;

@@ -175,15 +249,20 @@ yi → イ ;

yu ↔ ユ ;

ye → エ ;

yo ↔ ヨ ;

+# double consonants

+#specials

s } sh → ッ ;

t } ch → ッ ;

+#voiced

j } j ↔ ッ } $j_start ;

b } b ↔ ッ } [$h_start$f_start] $voice;

d } d ↔ ッ } $t_start $voice;

g } g ↔ ッ } $k_start $voice;

p } p ↔ ッ } [$h_start$f_start] $semivoice;

+# v } v ↔ ッ } [ワヰウヱヲう] $voice ;

z } z ↔ ッ } $s_start $voice;

v } v ↔ ッ } $v_start;

+# normal

k } k ↔ ッ } $k_start ;

m } m ↔ ッ } $m_start ;

n } n ↔ ッ } $n_start ;

@@ -194,13 +273,24 @@ t } t ↔ ッ } $t_start ;

s } s ↔ ッ } $s_start ;

w } w ↔ ッ } $w_start;

y } y ↔ ッ } $y_start;

+# completeness

x } x → ッ ;

c } k → ッ ;

c } c → ッ ;

c } q → ッ ;

l } l → ッ ;

q } q → ッ ;

+# y } y → ッ ;

+# w } w → ッ ;

+# prolonged vowel mark. this indicates a doubling of

+# the preceding vowel sound

+#a ← a { ー ; # liu

+#e ← e { ー ; # liu

+#i ← i { ー ; # liu

+#o ← o { ー ; # liu

+#u ← u { ー ; # liu

$macron ↔ ー ;

+# small forms

'~a' ↔ ァ ;

'~i' ↔ ィ ;

'~u' ↔ ゥ ;

@@ -213,6 +303,8 @@ $macron ↔ ー ;

'~yu' ↔ ュ ;

'~ye' → ェ ;

'~yo' ↔ ョ ;

+# iteration marks

+# TODO: make more accurate

j $1 ← sh (y* $vowel) {ヽ$voice ;

dj $1 ← ch (y* $vowel) {ヽ$voice ;

dz $1 ← ts (y* $vowel) {ヽ$voice ;

@@ -230,7 +322,16 @@ dz $1 ← dz (y* $vowel) {ヽ$voice ;

$1 ← ($consonant y* $vowel) {ヽ$voice? ;

$1 ← (.) {ヽ $voice? ; # otherwise repeat last character

← ヽ $voice? ; # delete if no characters found

+# h- rule: lengthens vowel if not followed by a vowel.

+# At the point this is applied, latin [cons]?vowel sequences

+# have been converted to katakana in NFD form.

$voweled_basekana [\u3099 \u309A]? { h → ー ;

+# one-way latin → kana rules. these do not occur in

+# well-formed romaji representing actual japanese text.

+# their purpose is to make all romaji map to kana of

+# some sort.

+# the following are not really necessary, but produce

+# slightly more natural results.

cy → セィ ;

dy → テ\u3099ィ ;

hy → ヒ ;

@@ -238,6 +339,8 @@ sy → セィ ;

ty → ティ ;

zy → セ\u3099ィ ;

h → ヘ ;

+# isolated consonants listed here so as not to mask

+# longer rules above.

ch → チ;

sh → シ ;

dz → ツ\u3099 ;

@@ -264,12 +367,22 @@ w → ウ;

ð → | d ;

ø → | u ;

þ → | th ;

+# simple substitutions using backup

c → | k ;

l → | r ;

q → | k ;

x → | ks ;

+# ~~~ END shared rules ~~~

+#------------------------------------------------------

+# Final cleanup

'~' → ; # delete stray tildes between letters

[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters

+# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use

:: NFC (NFD) ;

:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);

+# note: a global filter is more efficient, but MUST include all source chars!!

+#:: ([\u0000-\u007E 、。 \u3099-゜ァ-ー｡-ﾟ [:Latin:][:Katakana:] [:nonspacing mark:]]);

+# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD

:: ( [[\ -~¢-£¥-¦¬\u0304₩｡-ﾾￂ-ￇￊ-ￏￒ-ￗￚ-ￜ￨-￮][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;

+# eof

« no previous file with comments | « source/data/translit/Latn_Jamo.txt ('k') | source/data/translit/Latn_Knda.txt » ('j') | no next file with comments »