| Index: source/data/translit/Latn_Kana.txt
|
| diff --git a/source/data/translit/Latin_Katakana.txt b/source/data/translit/Latn_Kana.txt
|
| similarity index 63%
|
| rename from source/data/translit/Latin_Katakana.txt
|
| rename to source/data/translit/Latn_Kana.txt
|
| index bd0e07c1b113fa9294072a64ebc1588b90c5caf8..ea4b7dd6875eb8f3ae07ace84ae1d2e52acb238e 100644
|
| --- a/source/data/translit/Latin_Katakana.txt
|
| +++ b/source/data/translit/Latn_Kana.txt
|
| @@ -1,19 +1,67 @@
|
| -# ***************************************************************************
|
| -# *
|
| -# * Copyright (C) 2004-2015, International Business Machines
|
| -# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
|
| -# *
|
| -# ***************************************************************************
|
| -# File: Latin_Katakana.txt
|
| -# Generated from CLDR
|
| +# © 2016 and later: Unicode, Inc. and others.
|
| +# License & terms of use: http://www.unicode.org/copyright.html#License
|
| #
|
| +# File: Latn_Kana.txt
|
| +# Generated from CLDR
|
| +#
|
| +
|
| +# note: a global filter is more efficient, but MUST include all source chars
|
| +#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
|
| +# MINIMAL FILTER GENERATED FOR: Latin-Katakana
|
| +### WARNING -- must add width filter, both here and below!!! ###
|
| :: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
|
| :: [:Latin:] fullwidth-halfwidth ();
|
| :: NFD (NFC);
|
| :: Lower (); # whenever transliterating from cased to uncased script, include this
|
| +# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
|
| +# Uses modified Hepburn. Small changes to make unambiguous.
|
| +# | Kunrei-shiki: Hepburn/MHepburn
|
| +# | ------------------------------
|
| +# | si: shi
|
| +# | si ~ya: sha
|
| +# | si ~yu: shu
|
| +# | si ~yo: sho
|
| +# | zi: ji
|
| +# | zi ~ya: ja
|
| +# | zi ~yu: ju
|
| +# | zi ~yo: jo
|
| +# | ti: chi
|
| +# | ti ~ya: cha
|
| +# | ti ~yu: chu
|
| +# | ti ~yo: cho
|
| +# | tu: tsu
|
| +# | di: ji/dji
|
| +# | du: zu/dzu
|
| +# | hu: fu
|
| +# | For foreign words:
|
| +# | -----------------
|
| +# | se ~i si
|
| +# | si ~e she
|
| +# |
|
| +# | ze ~i zi
|
| +# | zi ~e je
|
| +# |
|
| +# | te ~i ti
|
| +# | ti ~e che
|
| +# | te ~u tu
|
| +# |
|
| +# | de ~i di
|
| +# | de ~u du
|
| +# | de ~i di
|
| +# |
|
| +# | he ~u: hu
|
| +# | hu ~a fa
|
| +# | hu ~i fi
|
| +# | hu ~e fe
|
| +# | hu ~o fo
|
| +# Most small forms are generated, but if necessary
|
| +# explicit small forms are given with ~a, ~ya, etc.
|
| +#------------------------------------------------------
|
| +# Variables
|
| $vowel = [aeiou] ;
|
| $consonant = [bcdfghjklmnpqrstvwxyz] ;
|
| $macron = \u0304 ;
|
| +# Variables used for doubled-consonants with tsu
|
| $kana = [ぁ-ゔ] ;
|
| $voice = [\u3099゛];
|
| $semivoice = [\u309A゜];
|
| @@ -30,22 +78,38 @@ $r_start = [ラリルレロらりるれろ] ;
|
| $w_start = [ワヰヱヲわゐゑを] ;
|
| $v_start = [ワヰヱヲ]\u3099 ;
|
| $voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
|
| +# if ン is followed by $n_quoter, then it needs an
|
| +# apostrophe after its romaji form to disambiguate it.
|
| +# e.g., ン ア != ナ, so represent as "n'a", not "na".
|
| $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
|
| $small_y = [ャィュェョ] ;
|
| $iteration = ゝ ;
|
| +#------------------------------------------------------
|
| +# katakana rules
|
| +# Punctuation
|
| '.' ↔ 。;
|
| ',' ↔ 、;
|
| +# ' ' } [a-z] → ; # delete spaces before latin
|
| +# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; # insert spaces before katakana
|
| +# Iteration Mark
|
| +# Copy previous letter § marks
|
| +# TODO
|
| +# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
|
| +# Specials for katakana -- not shared with hiragana
|
| va ↔ ワ\u3099 ;
|
| vi ↔ ヰ\u3099 ;
|
| ve ↔ ヱ\u3099 ;
|
| vo ↔ ヲ\u3099 ;
|
| '~ka' ↔ ヵ ;
|
| '~ke' ↔ ヶ ;
|
| +# ~~~ begin shared rules ~~~
|
| +#special
|
| ya ← '~'ャ;
|
| yi ← '~'ィ ;
|
| yu ← '~'ュ;
|
| ye ← '~'ェ;
|
| yo ← '~'ョ;
|
| +#normal
|
| a ↔ ア ;
|
| b | '~' ← ヒ \u3099} $small_y ;
|
| by } $vowel → ヒ\u3099 | '~y' ;
|
| @@ -69,6 +133,7 @@ dje ← チ\u3099ェ ;
|
| djo ← チ\u3099ョ ;
|
| dji ↔ チ\u3099 ;
|
| dj } $vowel → チ\u3099 | '~y' ;
|
| +# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
|
| cha ← チャ ;
|
| chi'~i' ← チィ ; # liu
|
| chu ← チュ ;
|
| @@ -85,6 +150,7 @@ gu ↔ ク\u3099 ;
|
| ge ↔ ケ\u3099 ;
|
| go ↔ コ\u3099 ;
|
| i ↔ イ ;
|
| +# j } $vowel → シ\u3099 | '~y' ;
|
| ja ↔ シ\u3099ャ ;
|
| ji'~i' ← シ\u3099ィ ; # liu
|
| ju ↔ シ\u3099ュ ;
|
| @@ -128,6 +194,8 @@ hi ↔ ヒ ;
|
| hu ↔ ヘゥ ;
|
| he ↔ ヘ ;
|
| ho ↔ ホ ;
|
| +# f | '~' ← フ } $small_y ;
|
| +# f } $vowel → フ | '~' ;
|
| fa ↔ ファ ;
|
| fi ↔ フィ ;
|
| fe ↔ フェ ;
|
| @@ -163,8 +231,14 @@ tu ↔ テゥ ;
|
| te ↔ テ ;
|
| to ↔ ト ;
|
| tsu ↔ ツ ;
|
| +# v } $vowel → ウ\u3099 | '~' ;
|
| +#'v~a' ← ウ\u3099ァ ; # liu
|
| +#'v~i' ← ウ\u3099ィ ; # liu
|
| +#'v~e' ← ウ\u3099ェ ; # liu
|
| +#'v~o' ← ウ\u3099ォ ; # liu
|
| vu ↔ ウ\u3099 ;
|
| u ↔ ウ ;
|
| +# w } $vowel → ウ | '~' ;
|
| wa ↔ ワ ;
|
| wi ↔ ヰ ;
|
| wu → ウ ;
|
| @@ -175,15 +249,20 @@ yi → イ ;
|
| yu ↔ ユ ;
|
| ye → エ ;
|
| yo ↔ ヨ ;
|
| +# double consonants
|
| +#specials
|
| s } sh → ッ ;
|
| t } ch → ッ ;
|
| +#voiced
|
| j } j ↔ ッ } $j_start ;
|
| b } b ↔ ッ } [$h_start$f_start] $voice;
|
| d } d ↔ ッ } $t_start $voice;
|
| g } g ↔ ッ } $k_start $voice;
|
| p } p ↔ ッ } [$h_start$f_start] $semivoice;
|
| +# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
|
| z } z ↔ ッ } $s_start $voice;
|
| v } v ↔ ッ } $v_start;
|
| +# normal
|
| k } k ↔ ッ } $k_start ;
|
| m } m ↔ ッ } $m_start ;
|
| n } n ↔ ッ } $n_start ;
|
| @@ -194,13 +273,24 @@ t } t ↔ ッ } $t_start ;
|
| s } s ↔ ッ } $s_start ;
|
| w } w ↔ ッ } $w_start;
|
| y } y ↔ ッ } $y_start;
|
| +# completeness
|
| x } x → ッ ;
|
| c } k → ッ ;
|
| c } c → ッ ;
|
| c } q → ッ ;
|
| l } l → ッ ;
|
| q } q → ッ ;
|
| +# y } y → ッ ;
|
| +# w } w → ッ ;
|
| +# prolonged vowel mark. this indicates a doubling of
|
| +# the preceding vowel sound
|
| +#a ← a { ー ; # liu
|
| +#e ← e { ー ; # liu
|
| +#i ← i { ー ; # liu
|
| +#o ← o { ー ; # liu
|
| +#u ← u { ー ; # liu
|
| $macron ↔ ー ;
|
| +# small forms
|
| '~a' ↔ ァ ;
|
| '~i' ↔ ィ ;
|
| '~u' ↔ ゥ ;
|
| @@ -213,6 +303,8 @@ $macron ↔ ー ;
|
| '~yu' ↔ ュ ;
|
| '~ye' → ェ ;
|
| '~yo' ↔ ョ ;
|
| +# iteration marks
|
| +# TODO: make more accurate
|
| j $1 ← sh (y* $vowel) {ヽ$voice ;
|
| dj $1 ← ch (y* $vowel) {ヽ$voice ;
|
| dz $1 ← ts (y* $vowel) {ヽ$voice ;
|
| @@ -230,7 +322,16 @@ dz $1 ← dz (y* $vowel) {ヽ$voice ;
|
| $1 ← ($consonant y* $vowel) {ヽ$voice? ;
|
| $1 ← (.) {ヽ $voice? ; # otherwise repeat last character
|
| ← ヽ $voice? ; # delete if no characters found
|
| +# h- rule: lengthens vowel if not followed by a vowel.
|
| +# At the point this is applied, latin [cons]?vowel sequences
|
| +# have been converted to katakana in NFD form.
|
| $voweled_basekana [\u3099 \u309A]? { h → ー ;
|
| +# one-way latin- → kana rules. these do not occur in
|
| +# well-formed romaji representing actual japanese text.
|
| +# their purpose is to make all romaji map to kana of
|
| +# some sort.
|
| +# the following are not really necessary, but produce
|
| +# slightly more natural results.
|
| cy → セィ ;
|
| dy → テ\u3099ィ ;
|
| hy → ヒ ;
|
| @@ -238,6 +339,8 @@ sy → セィ ;
|
| ty → ティ ;
|
| zy → セ\u3099ィ ;
|
| h → ヘ ;
|
| +# isolated consonants listed here so as not to mask
|
| +# longer rules above.
|
| ch → チ;
|
| sh → シ ;
|
| dz → ツ\u3099 ;
|
| @@ -264,12 +367,22 @@ w → ウ;
|
| ð → | d ;
|
| ø → | u ;
|
| þ → | th ;
|
| +# simple substitutions using backup
|
| c → | k ;
|
| l → | r ;
|
| q → | k ;
|
| x → | ks ;
|
| +# ~~~ END shared rules ~~~
|
| +#------------------------------------------------------
|
| +# Final cleanup
|
| '~' → ; # delete stray tildes between letters
|
| [:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
|
| +# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
|
| :: NFC (NFD) ;
|
| :: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
|
| +# note: a global filter is more efficient, but MUST include all source chars!!
|
| +#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
|
| +# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
|
| :: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
|
| +# eof
|
| +
|
|
|