Index: source/i18n/uspoof_impl.cpp |
diff --git a/source/i18n/uspoof_impl.cpp b/source/i18n/uspoof_impl.cpp |
index e9077d3ac3414027735b4cbc8880fb92a6e9a996..a062d3ee24bd533798451a415fed5712f0335606 100644 |
--- a/source/i18n/uspoof_impl.cpp |
+++ b/source/i18n/uspoof_impl.cpp |
@@ -1,6 +1,8 @@ |
+// Copyright (C) 2016 and later: Unicode, Inc. and others. |
+// License & terms of use: http://www.unicode.org/copyright.html |
/* |
********************************************************************** |
-* Copyright (C) 2008-2015, International Business Machines |
+* Copyright (C) 2008-2016, International Business Machines |
* Corporation and others. All Rights Reserved. |
********************************************************************** |
*/ |
@@ -13,11 +15,11 @@ |
#include "utrie2.h" |
#include "cmemory.h" |
#include "cstring.h" |
-#include "identifier_info.h" |
#include "scriptset.h" |
#include "umutex.h" |
#include "udataswp.h" |
#include "uassert.h" |
+#include "ucln_in.h" |
#include "uspoof_impl.h" |
#if !UCONFIG_NO_NORMALIZATION |
@@ -27,41 +29,53 @@ U_NAMESPACE_BEGIN |
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) |
-SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : |
- fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) , |
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { |
- if (U_FAILURE(status)) { |
- return; |
- } |
+// Creates a checker bound to caller-supplied confusables data. construct() first |
+// puts every field into its default state (reporting failure through status); the |
+// data pointer is then stored unconditionally, whatever status says. |
+SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) { |
+    construct(status); |
+    this->fSpoofData = data; |
+} |
+ |
+// Creates a checker that uses the shared default confusables data. Field defaults |
+// come from construct(); any failure there or in the data lookup is reported |
+// through status. |
+SpoofImpl::SpoofImpl(UErrorCode& status) { |
+    construct(status); |
+ |
+    // TODO: Fetch the default data at the point where it is first needed rather |
+    //       than here, so that data loading can be lazy. See #12696. |
+    SpoofData *defaultData = SpoofData::getDefault(status); |
+    fSpoofData = defaultData; |
+} |
+ |
+// Creates a checker that uses the shared default confusables data, for callers |
+// that have no UErrorCode to pass. Failures are recorded only in a local status |
+// that is discarded when the constructor returns. |
+SpoofImpl::SpoofImpl() { |
+    UErrorCode localStatus = U_ZERO_ERROR; |
+    construct(localStatus); |
+ |
+    // TODO: Fetch the default data at the point where it is first needed rather |
+    //       than here, so that data loading can be lazy. See #12696. |
+    SpoofData *defaultData = SpoofData::getDefault(localStatus); |
+    fSpoofData = defaultData; |
+} |
+ |
+// Field initializer shared by the SpoofImpl constructors that call it. |
+// Every member is first given its default: the magic number, all checks enabled, |
+// no spoof data, NULL allowed-chars/allowed-locales pointers and the |
+// HIGHLY_RESTRICTIVE level. If status already reports a failure nothing is |
+// allocated. Otherwise an allowed set covering 0..0x10ffff and an empty |
+// allowed-locales string are allocated; if either allocation fails status is set to |
+// U_MEMORY_ALLOCATION_ERROR. The set is frozen only after that NULL check. |
+void SpoofImpl::construct(UErrorCode& status) { |
+ fMagic = USPOOF_MAGIC; |
+ fChecks = USPOOF_ALL_CHECKS; |
+ fSpoofData = NULL; |
+ fAllowedCharsSet = NULL; |
+ fAllowedLocales = NULL; |
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; |
+ if (U_FAILURE(status)) { return; } |
+ |
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); |
- allowedCharsSet->freeze(); |
fAllowedCharsSet = allowedCharsSet; |
fAllowedLocales = uprv_strdup(""); |
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { |
status = U_MEMORY_ALLOCATION_ERROR; |
return; |
} |
- fMagic = USPOOF_MAGIC; |
-} |
- |
- |
-SpoofImpl::SpoofImpl() : |
- fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , |
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { |
- UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); |
allowedCharsSet->freeze(); |
- fAllowedCharsSet = allowedCharsSet; |
- fAllowedLocales = uprv_strdup(""); |
- fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; |
} |
// Copy Constructor, used by the user level clone() function. |
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : |
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , |
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { |
+ fAllowedLocales(NULL) { |
if (U_FAILURE(status)) { |
return; |
} |
@@ -71,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : |
fSpoofData = src.fSpoofData->addReference(); |
} |
fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); |
- if (fAllowedCharsSet == NULL) { |
+ fAllowedLocales = uprv_strdup(src.fAllowedLocales); |
+ if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { |
status = U_MEMORY_ALLOCATION_ERROR; |
} |
- fAllowedLocales = uprv_strdup(src.fAllowedLocales); |
fRestrictionLevel = src.fRestrictionLevel; |
} |
@@ -86,7 +100,11 @@ SpoofImpl::~SpoofImpl() { |
} |
delete fAllowedCharsSet; |
uprv_free((void *)fAllowedLocales); |
- delete fCachedIdentifierInfo; |
+} |
+ |
+// Exposes this object to the C API as a USpoofChecker handle. The handle is |
+// this object's own address, reinterpreted; no new object is created. |
+USpoofChecker *SpoofImpl::asUSpoofChecker() { |
+    USpoofChecker *handle = reinterpret_cast<USpoofChecker*>(this); |
+    return handle; |
} |
// |
@@ -102,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st |
return NULL; |
} |
SpoofImpl *This = (SpoofImpl *)sc; |
- if (This->fMagic != USPOOF_MAGIC || |
- This->fSpoofData == NULL) { |
+ if (This->fMagic != USPOOF_MAGIC) { |
status = U_INVALID_FORMAT_ERROR; |
return NULL; |
} |
- if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { |
+ if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) { |
return NULL; |
} |
return This; |
@@ -119,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { |
} |
- |
-//-------------------------------------------------------------------------------------- |
-// |
-// confusableLookup() This is the heart of the confusable skeleton generation |
-// implementation. |
-// |
-// Given a source character, produce the corresponding |
-// replacement character(s), appending them to the dest string. |
-// |
-//--------------------------------------------------------------------------------------- |
-int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { |
- |
- // Binary search the spoof data key table for the inChar |
- int32_t *low = fSpoofData->fCFUKeys; |
- int32_t *mid = NULL; |
- int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; |
- UChar32 midc; |
- do { |
- int32_t delta = ((int32_t)(limit-low))/2; |
- mid = low + delta; |
- midc = *mid & 0x1fffff; |
- if (inChar == midc) { |
- goto foundChar; |
- } else if (inChar < midc) { |
- limit = mid; |
- } else { |
- low = mid; |
- } |
- } while (low < limit-1); |
- mid = low; |
- midc = *mid & 0x1fffff; |
- if (inChar != midc) { |
- // Char not found. It maps to itself. |
- int i = 0; |
- dest.append(inChar); |
- return i; |
- } |
- foundChar: |
- int32_t keyFlags = *mid & 0xff000000; |
- if ((keyFlags & tableMask) == 0) { |
- // We found the right key char, but the entry doesn't pertain to the |
- // table we need. See if there is an adjacent key that does |
- if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { |
- int32_t *altMid; |
- for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { |
- keyFlags = *altMid & 0xff000000; |
- if (keyFlags & tableMask) { |
- mid = altMid; |
- goto foundKey; |
- } |
- } |
- for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { |
- keyFlags = *altMid & 0xff000000; |
- if (keyFlags & tableMask) { |
- mid = altMid; |
- goto foundKey; |
- } |
- } |
- } |
- // No key entry for this char & table. |
- // The input char maps to itself. |
- int i = 0; |
- dest.append(inChar); |
- return i; |
- } |
- |
- foundKey: |
- int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; |
- int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); |
- |
- // Value is either a UChar (for strings of length 1) or |
- // an index into the string table (for longer strings) |
- uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; |
- if (stringLen == 1) { |
- dest.append((UChar)value); |
- return 1; |
- } |
- |
- // String length of 4 from the above lookup is used for all strings of length >= 4. |
- // For these, get the real length from the string lengths table, |
- // which maps string table indexes to lengths. |
- // All strings of the same length are stored contiguously in the string table. |
- // 'value' from the lookup above is the starting index for the desired string. |
- |
- int32_t ix; |
- if (stringLen == 4) { |
- int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; |
- for (ix = 0; ix < stringLengthsLimit; ix++) { |
- if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { |
- stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; |
- break; |
- } |
- } |
- U_ASSERT(ix < stringLengthsLimit); |
- } |
- |
- U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); |
- UChar *src = &fSpoofData->fCFUStrings[value]; |
- dest.append(src, stringLen); |
- return stringLen; |
-} |
- |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// wholeScriptCheck() |
-// |
-// Input text is already normalized to NFD |
-// Return the set of scripts, each of which can represent something that is |
-// confusable with the input text. The script of the input text |
-// is included; input consisting of characters from a single script will |
-// always produce a result consisting of a set containing that script. |
-// |
-//--------------------------------------------------------------------------------------- |
-void SpoofImpl::wholeScriptCheck( |
- const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { |
- |
- UTrie2 *table = |
- (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; |
- result->setAll(); |
- int32_t length = text.length(); |
- for (int32_t inputIdx=0; inputIdx < length;) { |
- UChar32 c = text.char32At(inputIdx); |
- inputIdx += U16_LENGTH(c); |
- uint32_t index = utrie2_get32(table, c); |
- if (index == 0) { |
- // No confusables in another script for this char. |
- // TODO: we should change the data to have sets with just the single script |
- // bit for the script of this char. Gets rid of this special case. |
- // Until then, grab the script from the char and intersect it with the set. |
- UScriptCode cpScript = uscript_getScript(c, &status); |
- U_ASSERT(cpScript > USCRIPT_INHERITED); |
- result->intersect(cpScript, status); |
- } else if (index == 1) { |
- // Script == Common or Inherited. Nothing to do. |
- } else { |
- result->intersect(fSpoofData->fScriptSets[index]); |
- } |
- } |
-} |
- |
- |
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { |
UnicodeSet allowedChars; |
UnicodeSet *tmpSet = NULL; |
@@ -356,7 +231,7 @@ const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { |
void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { |
UScriptCode scripts[30]; |
- int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); |
+ int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status); |
if (U_FAILURE(status)) { |
return; |
} |
@@ -372,6 +247,137 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr |
} |
} |
+// Builds the augmented script set of one code point (UTS 39 section 5.1). |
+// result is overwritten: it starts as the code point's script extensions, gains the |
+// combined CJK writing systems implied by its members, and becomes the set of all |
+// scripts when it contains Common or Inherited. Returns early if status reports a |
+// failure once the script extensions have been requested. |
+void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { |
+    // Section 5.1 step 1, as data: a set containing `member` also receives `implied`. |
+    // No `implied` script is ever a `member`, so the rows are order-independent. |
+    static const struct { |
+        UScriptCode member; |
+        UScriptCode implied; |
+    } kImpliedScripts[] = { |
+        { USCRIPT_HAN,      USCRIPT_HAN_WITH_BOPOMOFO }, |
+        { USCRIPT_HAN,      USCRIPT_JAPANESE }, |
+        { USCRIPT_HAN,      USCRIPT_KOREAN }, |
+        { USCRIPT_HIRAGANA, USCRIPT_JAPANESE }, |
+        { USCRIPT_KATAKANA, USCRIPT_JAPANESE }, |
+        { USCRIPT_HANGUL,   USCRIPT_KOREAN }, |
+        { USCRIPT_BOPOMOFO, USCRIPT_HAN_WITH_BOPOMOFO } |
+    }; |
+ |
+    result.resetAll(); |
+    result.setScriptExtensions(codePoint, status); |
+    if (U_FAILURE(status)) { |
+        return; |
+    } |
+ |
+    for (int32_t row = 0; row < UPRV_LENGTHOF(kImpliedScripts); ++row) { |
+        if (result.test(kImpliedScripts[row].member, status)) { |
+            result.set(kImpliedScripts[row].implied, status); |
+        } |
+    } |
+ |
+    // Section 5.1 step 2: Common and Inherited characters are compatible with every script. |
+    const UBool matchesEveryScript = |
+        result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status); |
+    if (matchesEveryScript) { |
+        result.setAll(); |
+    } |
+} |
+ |
+// Resolved script set of a whole string (UTS 39 section 5.1). Delegates to |
+// getResolvedScriptSetWithout() with USCRIPT_CODE_LIMIT, the value that makes it |
+// include every character. |
+void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const { |
+    const UScriptCode excludeNothing = USCRIPT_CODE_LIMIT; |
+    getResolvedScriptSetWithout(input, excludeNothing, result, status); |
+} |
+ |
+// Resolved script set of a string, skipping every character whose augmented script |
+// set contains `script`. Passing USCRIPT_CODE_LIMIT skips nothing. result starts |
+// as the set of all scripts and is narrowed by intersection, one code point at a |
+// time; the walk stops as soon as status reports a failure. |
+void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { |
+    result.setAll(); |
+ |
+    ScriptSet augmented; |
+    const int32_t inputLength = input.length(); |
+    int32_t pos = 0; |
+    while (pos < inputLength) { |
+        const UChar32 cp = input.char32At(pos); |
+        pos += U16_LENGTH(cp); |
+ |
+        getAugmentedScriptSet(cp, augmented, status); |
+        if (U_FAILURE(status)) { |
+            return; |
+        } |
+ |
+        // Characters that carry the excluded script do not narrow the result. |
+        if (script != USCRIPT_CODE_LIMIT && augmented.test(script, status)) { |
+            continue; |
+        } |
+        result.intersect(augmented); |
+    } |
+} |
+ |
+// Collects the kinds of decimal digits used in a string (UTS 39 section 5.3). |
+// result is cleared, then receives one representative per digit system seen: the |
+// zero digit, computed as the code point minus its numeric value. The status |
+// parameter is unused. |
+void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { |
+    result.clear(); |
+ |
+    const int32_t inputLength = input.length(); |
+    int32_t pos = 0; |
+    while (pos < inputLength) { |
+        const UChar32 cp = input.char32At(pos); |
+        pos += U16_LENGTH(cp); |
+ |
+        if (u_charType(cp) != U_DECIMAL_DIGIT_NUMBER) { |
+            continue; |
+        } |
+        // Unicode guarantees that the zero of a decimal digit's system is the |
+        // digit's code point minus the digit's value. |
+        const UChar32 zeroDigit = cp - (UChar32)u_getNumericValue(cp); |
+        result.add(zeroDigit); |
+    } |
+} |
+ |
+// Classifies a string by restriction level (UTS 39 section 5.2). |
+// The steps are tried from most to least specific and the first match wins. |
+// USPOOF_UNRESTRICTIVE is also what is returned when status reports a failure |
+// while a script set is being resolved. |
+URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { |
+    // Step 1: any character outside the allowed set makes the string unrestricted. |
+    if (!fAllowedCharsSet->containsAll(input)) { |
+        return USPOOF_UNRESTRICTIVE; |
+    } |
+ |
+    // Step 2: all-ASCII input. Java uses a static UnicodeSet for this test; here a |
+    // plain scan over the code units avoids the static variable. |
+    const int32_t inputLength = input.length(); |
+    int32_t asciiPrefix = 0; |
+    while (asciiPrefix < inputLength && input.charAt(asciiPrefix) <= 0x7f) { |
+        ++asciiPrefix; |
+    } |
+    if (asciiPrefix == inputLength) { |
+        return USPOOF_ASCII; |
+    } |
+ |
+    // Step 3: resolve the script set of the whole string. |
+    ScriptSet wholeString; |
+    getResolvedScriptSet(input, wholeString, status); |
+    if (U_FAILURE(status)) { |
+        return USPOOF_UNRESTRICTIVE; |
+    } |
+ |
+    // Step 4: a non-empty resolved set means one script covers everything. |
+    if (!wholeString.isEmpty()) { |
+        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; |
+    } |
+ |
+    // Step 5: resolve again, this time leaving out the Latin characters. |
+    ScriptSet withoutLatin; |
+    getResolvedScriptSetWithout(input, USCRIPT_LATIN, withoutLatin, status); |
+    if (U_FAILURE(status)) { |
+        return USPOOF_UNRESTRICTIVE; |
+    } |
+ |
+    // Step 6: Latin combined with one of the CJK writing systems. |
+    if (withoutLatin.test(USCRIPT_HAN_WITH_BOPOMOFO, status)) { |
+        return USPOOF_HIGHLY_RESTRICTIVE; |
+    } |
+    if (withoutLatin.test(USCRIPT_JAPANESE, status)) { |
+        return USPOOF_HIGHLY_RESTRICTIVE; |
+    } |
+    if (withoutLatin.test(USCRIPT_KOREAN, status)) { |
+        return USPOOF_HIGHLY_RESTRICTIVE; |
+    } |
+ |
+    // Steps 7 and 8: Latin plus one other script is moderately restrictive unless |
+    // that script is Cyrillic, Greek or Cherokee; everything else is minimally so. |
+    const UBool rulesOutModerate = |
+        withoutLatin.isEmpty() |
+        || withoutLatin.test(USCRIPT_CYRILLIC, status) |
+        || withoutLatin.test(USCRIPT_GREEK, status) |
+        || withoutLatin.test(USCRIPT_CHEROKEE, status); |
+    if (rulesOutModerate) { |
+        return USPOOF_MINIMALLY_RESTRICTIVE; |
+    } |
+    return USPOOF_MODERATELY_RESTRICTIVE; |
+} |
+ |
+ |
// Convert a text format hex number. Utility function used by builder code. Static. |
// Input: UChar *string text. Output: a UChar32 |
@@ -404,55 +410,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC |
return (UChar32)val; |
} |
-// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. |
-// Maintain a one-element cache, which is sufficient to avoid repeatedly |
-// creating new ones unless we get multi-thread concurrency in spoof |
-// check operations, which should be statistically uncommon. |
-// These functions are used in place of new & delete of an IdentifierInfo. |
-// They will recycle the IdentifierInfo when possible. |
-// They are logically const, and used within const functions that must be thread safe. |
-IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { |
- IdentifierInfo *returnIdInfo = NULL; |
- if (U_FAILURE(status)) { |
- return returnIdInfo; |
- } |
- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); |
- { |
- Mutex m; |
- returnIdInfo = nonConstThis->fCachedIdentifierInfo; |
- nonConstThis->fCachedIdentifierInfo = NULL; |
- } |
- if (returnIdInfo == NULL) { |
- returnIdInfo = new IdentifierInfo(status); |
- if (U_SUCCESS(status) && returnIdInfo == NULL) { |
- status = U_MEMORY_ALLOCATION_ERROR; |
- } |
- if (U_FAILURE(status) && returnIdInfo != NULL) { |
- delete returnIdInfo; |
- returnIdInfo = NULL; |
- } |
- } |
- return returnIdInfo; |
+//----------------------------------------- |
+// |
+// class CheckResult Implementation |
+// |
+//----------------------------------------- |
+ |
+// Stamps the object with USPOOF_CHECK_MAGIC, the value validateThis() looks for, |
+// and puts the result fields into their cleared state. |
+CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) { |
+    this->clear(); |
} |
+// Exposes this object to the C API as a USpoofCheckResult handle. The handle is |
+// this object's own address, reinterpreted. |
+USpoofCheckResult* CheckResult::asUSpoofCheckResult() { |
+    USpoofCheckResult *handle = reinterpret_cast<USpoofCheckResult*>(this); |
+    return handle; |
+} |
-void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { |
- if (idInfo != NULL) { |
- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); |
- { |
- Mutex m; |
- if (nonConstThis->fCachedIdentifierInfo == NULL) { |
- nonConstThis->fCachedIdentifierInfo = idInfo; |
- idInfo = NULL; |
- } |
- } |
- delete idInfo; |
+// |
+//  Validates the incoming status and the result handle received from the C API. |
+//  Returns NULL when status already reports a failure, when the handle is NULL |
+//  (status becomes U_ILLEGAL_ARGUMENT_ERROR), or when the object does not carry |
+//  USPOOF_CHECK_MAGIC (status becomes U_INVALID_FORMAT_ERROR). Otherwise returns |
+//  the handle viewed as a CheckResult. |
+// |
+const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) { |
+    if (U_FAILURE(status)) { |
+        return NULL; |
+    } |
+    if (ptr == NULL) { |
+        status = U_ILLEGAL_ARGUMENT_ERROR; |
+        return NULL; |
} |
+    const CheckResult *candidate = reinterpret_cast<const CheckResult *>(ptr); |
+    if (candidate->fMagic == USPOOF_CHECK_MAGIC) { |
+        return candidate; |
+    } |
+    status = U_INVALID_FORMAT_ERROR; |
+    return NULL; |
+} |
+ |
+// Non-const overload: runs the const validation, then strips the constness it added. |
+CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) { |
+    const USpoofCheckResult *constPtr = ptr; |
+    const CheckResult *checked = CheckResult::validateThis(constPtr, status); |
+    return const_cast<CheckResult *>(checked); |
} |
+// Returns the result to its empty state: undefined restriction level, no check |
+// bits set, no numerics recorded. |
+void CheckResult::clear() { |
+    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE; |
+    fChecks = 0; |
+    fNumerics.clear(); |
+} |
+// Packs the result into one integer. The restriction level is OR-ed into the |
+// check bits only when the caller enabled USPOOF_AUX_INFO and a level was actually |
+// recorded; otherwise the check bits are returned alone. |
+int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) { |
+    const UBool auxInfoEnabled = (enabledChecks & USPOOF_AUX_INFO) != 0; |
+    if (!auxInfoEnabled || fRestrictionLevel == USPOOF_UNDEFINED_RESTRICTIVE) { |
+        return fChecks; |
+    } |
+    return fChecks | fRestrictionLevel; |
+} |
+// Nothing to release explicitly; the body is intentionally empty. |
+CheckResult::~CheckResult() |
+{ |
+} |
//---------------------------------------------------------------------------------------------- |
// |
@@ -461,12 +472,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { |
//---------------------------------------------------------------------------------------------- |
-UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { |
+UBool SpoofData::validateDataVersion(UErrorCode &status) const { |
if (U_FAILURE(status) || |
- rawData == NULL || |
- rawData->fMagic != USPOOF_MAGIC || |
- rawData->fFormatVersion[0] > 1 || |
- rawData->fFormatVersion[1] > 0) { |
+ fRawData == NULL || |
+ fRawData->fMagic != USPOOF_MAGIC || |
+ fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION || |
+ fRawData->fFormatVersion[1] != 0 || |
+ fRawData->fFormatVersion[2] != 0 || |
+ fRawData->fFormatVersion[3] != 0) { |
status = U_INVALID_FORMAT_ERROR; |
return FALSE; |
} |
@@ -485,7 +498,7 @@ spoofDataIsAcceptable(void *context, |
pInfo->dataFormat[1] == 0x66 && |
pInfo->dataFormat[2] == 0x75 && |
pInfo->dataFormat[3] == 0x20 && |
- pInfo->formatVersion[0] == 1 |
+ pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION |
) { |
UVersionInfo *version = static_cast<UVersionInfo *>(context); |
if(version != NULL) { |
@@ -497,32 +510,61 @@ spoofDataIsAcceptable(void *context, |
} |
} |
+// Methods for the loading of the default confusables data file. The confusable |
+// data is loaded only when it is needed. |
+// |
+// SpoofData::getDefault() - Return the default confusables data, and call the |
+// initOnce() if it is not available. Adds a reference |
+// to the SpoofData that the caller is responsible for |
+// decrementing when they are done with the data. |
// |
-// SpoofData::getDefault() - return a wrapper around the spoof data that is |
-// baked into the default ICU data. |
+// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData |
+// is shared by all spoof checkers using the default data. |
// |
-// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting |
-// SpoofData is shared by all spoof checkers using the default data. |
+// uspoof_cleanupDefaultData - Called during cleanup. |
// |
-SpoofData *SpoofData::getDefault(UErrorCode &status) { |
+ |
+// State for the shared default confusables data: the once-flag that guards |
+// loading, and the loaded data itself (NULL while nothing is loaded). |
+static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER; |
+static SpoofData* gDefaultSpoofData = NULL; |
+ |
+// Cleanup hook registered by uspoof_loadDefaultData(). When default data is |
+// loaded, it drops this file's reference to it and re-arms the once-flag. |
+// Always returns TRUE. |
+static UBool U_CALLCONV |
+uspoof_cleanupDefaultData(void) { |
+    if (gDefaultSpoofData == NULL) { |
+        return TRUE; |
+    } |
+    // Will delete, assuming all user-level spoof checkers were closed. |
+    gDefaultSpoofData->removeReference(); |
+    gDefaultSpoofData = NULL; |
+    gSpoofInitDefaultOnce.reset(); |
+    return TRUE; |
+} |
+ |
+// One-time loader for the shared default confusables data; SpoofData::getDefault() |
+// runs it through umtx_initOnce(). On success gDefaultSpoofData holds the loaded |
+// data and the i18n cleanup hook is registered. On every failure path status |
+// reports an error and gDefaultSpoofData is left NULL, never dangling. |
+static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) { |
UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables", |
spoofDataIsAcceptable, |
NULL, // context, would receive dataVersion if supplied. |
&status); |
+ if (U_FAILURE(status)) { return; } |
+ gDefaultSpoofData = new SpoofData(udm, status); |
if (U_FAILURE(status)) { |
- return NULL; |
- } |
- SpoofData *This = new SpoofData(udm, status); |
- if (U_FAILURE(status)) { |
- delete This; |
- return NULL; |
+ delete gDefaultSpoofData; |
+ // The global outlives this call; clear it so nothing can reach freed memory. |
+ gDefaultSpoofData = NULL; |
+ return; |
} |
- if (This == NULL) { |
+ if (gDefaultSpoofData == NULL) { |
+ // Allocation failed, so no SpoofData was constructed around udm; close it here. |
+ udata_close(udm); |
status = U_MEMORY_ALLOCATION_ERROR; |
+ return; |
} |
- return This; |
+ ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData); |
+} |
+ |
+// Returns the shared default confusables data, loading it on first use through |
+// the once-flag. A reference is added on behalf of the caller before returning. |
+// Returns NULL, with no reference added, when status reports a failure. |
+SpoofData* SpoofData::getDefault(UErrorCode& status) { |
+    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status); |
+    if (U_FAILURE(status)) { |
+        return NULL; |
+    } |
+    SpoofData *shared = gDefaultSpoofData; |
+    shared->addReference(); |
+    return shared; |
} |
+ |
+ |
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) |
{ |
reset(); |
@@ -533,7 +575,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) |
// fRawData is non-const because it may be constructed by the data builder. |
fRawData = reinterpret_cast<SpoofDataHeader *>( |
const_cast<void *>(udata_getMemory(udm))); |
- validateDataVersion(fRawData, status); |
+ validateDataVersion(status); |
initPtrs(status); |
} |
@@ -554,7 +596,7 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) |
status = U_INVALID_FORMAT_ERROR; |
return; |
} |
- validateDataVersion(fRawData, status); |
+ validateDataVersion(status); |
initPtrs(status); |
} |
@@ -582,7 +624,7 @@ SpoofData::SpoofData(UErrorCode &status) { |
uprv_memset(fRawData, 0, initialSize); |
fRawData->fMagic = USPOOF_MAGIC; |
- fRawData->fFormatVersion[0] = 1; |
+ fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION; |
fRawData->fFormatVersion[1] = 0; |
fRawData->fFormatVersion[2] = 0; |
fRawData->fFormatVersion[3] = 0; |
@@ -600,11 +642,7 @@ void SpoofData::reset() { |
fRefCount = 1; |
fCFUKeys = NULL; |
fCFUValues = NULL; |
- fCFUStringLengths = NULL; |
fCFUStrings = NULL; |
- fAnyCaseTrie = NULL; |
- fLowerCaseTrie = NULL; |
- fScriptSets = NULL; |
} |
@@ -626,7 +664,6 @@ void SpoofData::reset() { |
void SpoofData::initPtrs(UErrorCode &status) { |
fCFUKeys = NULL; |
fCFUValues = NULL; |
- fCFUStringLengths = NULL; |
fCFUStrings = NULL; |
if (U_FAILURE(status)) { |
return; |
@@ -637,33 +674,13 @@ void SpoofData::initPtrs(UErrorCode &status) { |
if (fRawData->fCFUStringIndex != 0) { |
fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); |
} |
- if (fRawData->fCFUStringLengths != 0) { |
- fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); |
- } |
if (fRawData->fCFUStringTable != 0) { |
fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); |
} |
- |
- if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { |
- fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
- (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); |
- } |
- if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { |
- fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
- (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); |
- } |
- |
- if (fRawData->fScriptSets != 0) { |
- fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); |
- } |
} |
SpoofData::~SpoofData() { |
- utrie2_close(fAnyCaseTrie); |
- fAnyCaseTrie = NULL; |
- utrie2_close(fLowerCaseTrie); |
- fLowerCaseTrie = NULL; |
if (fDataOwned) { |
uprv_free(fRawData); |
} |
@@ -708,6 +725,78 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { |
return (char *)fRawData + returnOffset; |
} |
+// Copies the raw data image into buf. Returns the image size in bytes (the |
+// header's fLength) in every case; when capacity is smaller than that, nothing is |
+// copied and status is set to U_BUFFER_OVERFLOW_ERROR. |
+int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const { |
+    const int32_t bytesNeeded = fRawData->fLength; |
+    if (capacity >= bytesNeeded) { |
+        uprv_memcpy(buf, fRawData, bytesNeeded); |
+    } else { |
+        status = U_BUFFER_OVERFLOW_ERROR; |
+    } |
+    return bytesNeeded; |
+} |
+ |
+// Size of the raw data image, as recorded in its header's fLength field. |
+int32_t SpoofData::size() const { |
+    const int32_t imageLength = fRawData->fLength; |
+    return imageLength; |
+} |
+ |
+//------------------------------- |
+// |
+// Front-end APIs for SpoofData |
+// |
+//------------------------------- |
+ |
+// Appends the confusable replacement for inChar to dest. |
+// The key table is binary-searched for inChar. When no key matches (including |
+// when the table is empty) the character maps to itself: inChar is appended and 1 |
+// is returned. Otherwise the mapped value is appended and the value's length, as |
+// reported by appendValueTo(), is returned. |
+int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const { |
+    // Search the half-open range [lo, hi); the candidate ends up in lo. |
+    int32_t lo = 0; |
+    int32_t hi = length(); |
+ |
+    // An empty key table has no entry 0 to inspect; every character maps to itself. |
+    if (hi <= 0) { |
+        dest.append(inChar); |
+        return 1; |
+    } |
+ |
+    do { |
+        const int32_t mid = lo + (hi - lo) / 2; |
+        // Decode the key once per step instead of once per comparison. |
+        const UChar32 midChar = codePointAt(mid); |
+        if (midChar > inChar) { |
+            hi = mid; |
+        } else { |
+            lo = mid; |
+            if (midChar == inChar) { |
+                // Exact hit: stop early. |
+                break; |
+            } |
+        } |
+    } while (hi - lo > 1); |
+ |
+    // No entry for this character: it maps to itself. |
+    if (codePointAt(lo) != inChar) { |
+        dest.append(inChar); |
+        return 1; |
+    } |
+ |
+    // Append the mapped value and report its length. |
+    return appendValueTo(lo, dest); |
+} |
+ |
+// Number of entries in the confusables key table (the header's fCFUKeysSize). |
+int32_t SpoofData::length() const { |
+    const int32_t keyCount = fRawData->fCFUKeysSize; |
+    return keyCount; |
+} |
+ |
+// Code point stored in the key at `index`, decoded by ConfusableDataUtils. |
+// The index is not range-checked here. |
+UChar32 SpoofData::codePointAt(int32_t index) const { |
+    const int32_t packedKey = fCFUKeys[index]; |
+    return ConfusableDataUtils::keyToCodePoint(packedKey); |
+} |
+ |
+// Appends the value mapped by the key at `index` to dest and returns its length |
+// as encoded in that key. A length-1 value is stored directly in the values |
+// table; a longer one is stored there as an offset into the string table. |
+int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const { |
+    const int32_t valueLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]); |
+    const uint16_t value = fCFUValues[index]; |
+ |
+    if (valueLength != 1) { |
+        // `value` is an offset into the string table. |
+        dest.append(fCFUStrings + value, valueLength); |
+        return valueLength; |
+    } |
+ |
+    // `value` is the single replacement code unit itself. |
+    dest.append((UChar)value); |
+    return valueLength; |
+} |
+ |
U_NAMESPACE_END |
@@ -739,7 +828,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou |
pInfo->dataFormat[1]==0x66 && |
pInfo->dataFormat[2]==0x75 && |
pInfo->dataFormat[3]==0x20 && |
- pInfo->formatVersion[0]==1 )) { |
+ pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION && |
+ pInfo->formatVersion[1]==0 && |
+ pInfo->formatVersion[2]==0 && |
+ pInfo->formatVersion[3]==0 )) { |
udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " |
"(format version %02x %02x %02x %02x) is not recognized\n", |
pInfo->dataFormat[0], pInfo->dataFormat[1], |
@@ -828,26 +920,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou |
sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; |
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
- // String Lengths Section |
- sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); |
- sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; |
- ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
- |
- // Any Case Trie |
- sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); |
- sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); |
- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
- |
- // Lower Case Trie |
- sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); |
- sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); |
- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
- |
- // Script Sets. The data is an array of int32_t |
- sectionStart = ds->readUInt32(spoofDH->fScriptSets); |
- sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); |
- ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
- |
// And, last, swap the header itself. |
// int32_t fMagic // swap this |
// uint8_t fFormatVersion[4] // Do not swap this, just copy |