Index: source/data/translit/Han_Spacedhan.txt |
diff --git a/source/data/translit/Han_Spacedhan.txt b/source/data/translit/Han_Spacedhan.txt |
index 0126b40f23b5babd3c2a705ca17bab799b4e8635..9428d4dd9c8f1317a7500d517db6fdfa48ae5ebf 100644 |
--- a/source/data/translit/Han_Spacedhan.txt |
+++ b/source/data/translit/Han_Spacedhan.txt |
@@ -1,18 +1,24 @@ |
-# *************************************************************************** |
-# * |
-# * Copyright (C) 2004-2015, International Business Machines |
-# * Corporation; Unicode, Inc.; and others. All Rights Reserved. |
-# * |
-# *************************************************************************** |
+# © 2016 and later: Unicode, Inc. and others. |
+# License & terms of use: http://www.unicode.org/copyright.html#License |
+# |
# File: Han_Spacedhan.txt |
-# Generated from CLDR |
+# Generated from CLDR |
# |
-:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc; |
+ |
+# Only intended for internal use |
+# Make sure Han characters are normalized, including characters whose compatibility decomposition contains them. |
+# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:] |
+# where XXX is the resolved [:ideographic:][:sc=han:]. The first set needs updating with each Unicode release! |
+:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc; |
:: fullwidth-halfwidth; |
。 → '.'; |
$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]]; |
$initialPunct = [:Ps:][:Pi:]; |
+# Add a space between any ideograph or terminal punctuation and a following letter, and |
+# between a letter (plus any trailing marks) and a following ideograph or initial punctuation. |
[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ; |
[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ; |
+# Reverse direction: remove the space between ideographs and other letters. |
← [:Ideographic:] { ' ' } [:Letter:] ; |
← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ; |
+ |