Chromium Code Reviews

Unified Diff: source/data/translit/Latn_Kana.txt

Issue 2440913002: Update ICU to 58.1
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
« no previous file with comments | « source/data/translit/Latn_Jamo.txt ('k') | source/data/translit/Latn_Knda.txt » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/data/translit/Latn_Kana.txt
diff --git a/source/data/translit/Latin_Katakana.txt b/source/data/translit/Latn_Kana.txt
similarity index 63%
rename from source/data/translit/Latin_Katakana.txt
rename to source/data/translit/Latn_Kana.txt
index bd0e07c1b113fa9294072a64ebc1588b90c5caf8..ea4b7dd6875eb8f3ae07ace84ae1d2e52acb238e 100644
--- a/source/data/translit/Latin_Katakana.txt
+++ b/source/data/translit/Latn_Kana.txt
@@ -1,19 +1,67 @@
-# ***************************************************************************
-# *
-# * Copyright (C) 2004-2015, International Business Machines
-# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
-# *
-# ***************************************************************************
-# File: Latin_Katakana.txt
-# Generated from CLDR
+# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html#License
#
+# File: Latn_Kana.txt
+# Generated from CLDR
+#
+
+# note: a global filter is more efficient, but MUST include all source chars
+#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
+# MINIMAL FILTER GENERATED FOR: Latin-Katakana
+### WARNING -- must add width filter, both here and below!!! ###
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
:: [:Latin:] fullwidth-halfwidth ();
:: NFD (NFC);
:: Lower (); # whenever transliterating from cased to uncased script, include this
+# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
+# Uses modified Hepburn. Small changes to make unambiguous.
+# | Kunrei-shiki: Hepburn/MHepburn
+# | ------------------------------
+# | si: shi
+# | si ~ya: sha
+# | si ~yu: shu
+# | si ~yo: sho
+# | zi: ji
+# | zi ~ya: ja
+# | zi ~yu: ju
+# | zi ~yo: jo
+# | ti: chi
+# | ti ~ya: cha
+# | ti ~yu: chu
+# | ti ~yo: cho
+# | tu: tsu
+# | di: ji/dji
+# | du: zu/dzu
+# | hu: fu
+# | For foreign words:
+# | -----------------
+# | se ~i si
+# | si ~e she
+# |
+# | ze ~i zi
+# | zi ~e je
+# |
+# | te ~i ti
+# | ti ~e che
+# | te ~u tu
+# |
+# | de ~i di
+# | de ~u du
+# | de ~i di
+# |
+# | he ~u: hu
+# | hu ~a fa
+# | hu ~i fi
+# | hu ~e fe
+# | hu ~o fo
+# Most small forms are generated, but if necessary
+# explicit small forms are given with ~a, ~ya, etc.
+#------------------------------------------------------
+# Variables
$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron = \u0304 ;
+# Variables used for doubled-consonants with tsu
$kana = [ぁ-ゔ] ;
$voice = [\u3099゛];
$semivoice = [\u309A゜];
@@ -30,22 +78,38 @@ $r_start = [ラリルレロらりるれろ] ;
$w_start = [ワヰヱヲわゐゑを] ;
$v_start = [ワヰヱヲ]\u3099 ;
$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
+# if ン is followed by $n_quoter, then it needs an
+# apostrophe after its romaji form to disambiguate it.
+# e.g., ン ア ≠ ナ, so represent as "n'a", not "na".
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
$small_y = [ャィュェョ] ;
$iteration = ゝ ;
+#------------------------------------------------------
+# katakana rules
+# Punctuation
'.' ↔ 。;
',' ↔ 、;
+# ' ' } [a-z] → ; # delete spaces before latin
+# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; # insert spaces before katakana
+# Iteration Mark
+# Copy previous letter and marks
+# TODO
+# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
+# Specials for katakana -- not shared with hiragana
va ↔ ワ\u3099 ;
vi ↔ ヰ\u3099 ;
ve ↔ ヱ\u3099 ;
vo ↔ ヲ\u3099 ;
'~ka' ↔ ヵ ;
'~ke' ↔ ヶ ;
+# ~~~ begin shared rules ~~~
+#special
ya ← '~'ャ;
yi ← '~'ィ ;
yu ← '~'ュ;
ye ← '~'ェ;
yo ← '~'ョ;
+#normal
a ↔ ア ;
b | '~' ← ヒ \u3099} $small_y ;
by } $vowel → ヒ\u3099 | '~y' ;
@@ -69,6 +133,7 @@ dje ← チ\u3099ェ ;
djo ← チ\u3099ョ ;
dji ↔ チ\u3099 ;
dj } $vowel → チ\u3099 | '~y' ;
+# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
cha ← チャ ;
chi'~i' ← チィ ; # liu
chu ← チュ ;
@@ -85,6 +150,7 @@ gu ↔ ク\u3099 ;
ge ↔ ケ\u3099 ;
go ↔ コ\u3099 ;
i ↔ イ ;
+# j } $vowel → シ\u3099 | '~y' ;
ja ↔ シ\u3099ャ ;
ji'~i' ← シ\u3099ィ ; # liu
ju ↔ シ\u3099ュ ;
@@ -128,6 +194,8 @@ hi ↔ ヒ ;
hu ↔ ヘゥ ;
he ↔ ヘ ;
ho ↔ ホ ;
+# f | '~' ← フ } $small_y ;
+# f } $vowel → フ | '~' ;
fa ↔ ファ ;
fi ↔ フィ ;
fe ↔ フェ ;
@@ -163,8 +231,14 @@ tu ↔ テゥ ;
te ↔ テ ;
to ↔ ト ;
tsu ↔ ツ ;
+# v } $vowel → ウ\u3099 | '~' ;
+#'v~a' ← ウ\u3099ァ ; # liu
+#'v~i' ← ウ\u3099ィ ; # liu
+#'v~e' ← ウ\u3099ェ ; # liu
+#'v~o' ← ウ\u3099ォ ; # liu
vu ↔ ウ\u3099 ;
u ↔ ウ ;
+# w } $vowel → ウ | '~' ;
wa ↔ ワ ;
wi ↔ ヰ ;
wu → ウ ;
@@ -175,15 +249,20 @@ yi → イ ;
yu ↔ ユ ;
ye → エ ;
yo ↔ ヨ ;
+# double consonants
+#specials
s } sh → ッ ;
t } ch → ッ ;
+#voiced
j } j ↔ ッ } $j_start ;
b } b ↔ ッ } [$h_start$f_start] $voice;
d } d ↔ ッ } $t_start $voice;
g } g ↔ ッ } $k_start $voice;
p } p ↔ ッ } [$h_start$f_start] $semivoice;
+# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
z } z ↔ ッ } $s_start $voice;
v } v ↔ ッ } $v_start;
+# normal
k } k ↔ ッ } $k_start ;
m } m ↔ ッ } $m_start ;
n } n ↔ ッ } $n_start ;
@@ -194,13 +273,24 @@ t } t ↔ ッ } $t_start ;
s } s ↔ ッ } $s_start ;
w } w ↔ ッ } $w_start;
y } y ↔ ッ } $y_start;
+# completeness
x } x → ッ ;
c } k → ッ ;
c } c → ッ ;
c } q → ッ ;
l } l → ッ ;
q } q → ッ ;
+# y } y → ッ ;
+# w } w → ッ ;
+# prolonged vowel mark. this indicates a doubling of
+# the preceding vowel sound
+#a ← a { ー ; # liu
+#e ← e { ー ; # liu
+#i ← i { ー ; # liu
+#o ← o { ー ; # liu
+#u ← u { ー ; # liu
$macron ↔ ー ;
+# small forms
'~a' ↔ ァ ;
'~i' ↔ ィ ;
'~u' ↔ ゥ ;
@@ -213,6 +303,8 @@ $macron ↔ ー ;
'~yu' ↔ ュ ;
'~ye' → ェ ;
'~yo' ↔ ョ ;
+# iteration marks
+# TODO: make more accurate
j $1 ← sh (y* $vowel) {ヽ$voice ;
dj $1 ← ch (y* $vowel) {ヽ$voice ;
dz $1 ← ts (y* $vowel) {ヽ$voice ;
@@ -230,7 +322,16 @@ dz $1 ← dz (y* $vowel) {ヽ$voice ;
$1 ← ($consonant y* $vowel) {ヽ$voice? ;
$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
← ヽ $voice? ; # delete if no characters found
+# h- rule: lengthens vowel if not followed by a vowel.
+# At the point this is applied, latin [cons]?vowel sequences
+# have been converted to katakana in NFD form.
$voweled_basekana [\u3099 \u309A]? { h → ー ;
+# one-way latin → kana rules. these do not occur in
+# well-formed romaji representing actual japanese text.
+# their purpose is to make all romaji map to kana of
+# some sort.
+# the following are not really necessary, but produce
+# slightly more natural results.
cy → セィ ;
dy → テ\u3099ィ ;
hy → ヒ ;
@@ -238,6 +339,8 @@ sy → セィ ;
ty → ティ ;
zy → セ\u3099ィ ;
h → ヘ ;
+# isolated consonants listed here so as not to mask
+# longer rules above.
ch → チ;
sh → シ ;
dz → ツ\u3099 ;
@@ -264,12 +367,22 @@ w → ウ;
ð → | d ;
ø → | u ;
þ → | th ;
+# simple substitutions using backup
c → | k ;
l → | r ;
q → | k ;
x → | ks ;
+# ~~~ END shared rules ~~~
+#------------------------------------------------------
+# Final cleanup
'~' → ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
+# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
:: NFC (NFD) ;
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
+# note: a global filter is more efficient, but MUST include all source chars!!
+#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
+# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
+# eof
+
« no previous file with comments | « source/data/translit/Latn_Jamo.txt ('k') | source/data/translit/Latn_Knda.txt » ('j') | no next file with comments »

Powered by Google App Engine