| Index: source/i18n/uspoof_impl.cpp
|
| diff --git a/source/i18n/uspoof_impl.cpp b/source/i18n/uspoof_impl.cpp
|
| index e9077d3ac3414027735b4cbc8880fb92a6e9a996..a062d3ee24bd533798451a415fed5712f0335606 100644
|
| --- a/source/i18n/uspoof_impl.cpp
|
| +++ b/source/i18n/uspoof_impl.cpp
|
| @@ -1,6 +1,8 @@
|
| +// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
| +// License & terms of use: http://www.unicode.org/copyright.html
|
| /*
|
| **********************************************************************
|
| -* Copyright (C) 2008-2015, International Business Machines
|
| +* Copyright (C) 2008-2016, International Business Machines
|
| * Corporation and others. All Rights Reserved.
|
| **********************************************************************
|
| */
|
| @@ -13,11 +15,11 @@
|
| #include "utrie2.h"
|
| #include "cmemory.h"
|
| #include "cstring.h"
|
| -#include "identifier_info.h"
|
| #include "scriptset.h"
|
| #include "umutex.h"
|
| #include "udataswp.h"
|
| #include "uassert.h"
|
| +#include "ucln_in.h"
|
| #include "uspoof_impl.h"
|
|
|
| #if !UCONFIG_NO_NORMALIZATION
|
| @@ -27,41 +29,53 @@ U_NAMESPACE_BEGIN
|
|
|
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
|
|
|
| -SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
|
| - fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
|
| - fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
| - if (U_FAILURE(status)) {
|
| - return;
|
| - }
|
| +SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
|
| + construct(status);
|
| + fSpoofData = data;
|
| +}
|
| +
|
| +SpoofImpl::SpoofImpl(UErrorCode& status) {
|
| + construct(status);
|
| +
|
| + // TODO: Call this method where it is actually needed, instead of in the
|
| + // constructor, to allow for lazy data loading. See #12696.
|
| + fSpoofData = SpoofData::getDefault(status);
|
| +}
|
| +
|
| +SpoofImpl::SpoofImpl() {
|
| + UErrorCode status = U_ZERO_ERROR;
|
| + construct(status);
|
| +
|
| + // TODO: Call this method where it is actually needed, instead of in the
|
| + // constructor, to allow for lazy data loading. See #12696.
|
| + fSpoofData = SpoofData::getDefault(status);
|
| +}
|
| +
|
| +void SpoofImpl::construct(UErrorCode& status) {
|
| + fMagic = USPOOF_MAGIC;
|
| + fChecks = USPOOF_ALL_CHECKS;
|
| + fSpoofData = NULL;
|
| + fAllowedCharsSet = NULL;
|
| + fAllowedLocales = NULL;
|
| fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
|
|
|
| + if (U_FAILURE(status)) { return; }
|
| +
|
| UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
| - allowedCharsSet->freeze();
|
| fAllowedCharsSet = allowedCharsSet;
|
| fAllowedLocales = uprv_strdup("");
|
| if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
|
| status = U_MEMORY_ALLOCATION_ERROR;
|
| return;
|
| }
|
| - fMagic = USPOOF_MAGIC;
|
| -}
|
| -
|
| -
|
| -SpoofImpl::SpoofImpl() :
|
| - fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
| - fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
| - UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
| allowedCharsSet->freeze();
|
| - fAllowedCharsSet = allowedCharsSet;
|
| - fAllowedLocales = uprv_strdup("");
|
| - fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
|
| }
|
|
|
|
|
| // Copy Constructor, used by the user level clone() function.
|
| SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
|
| fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
| - fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
|
| + fAllowedLocales(NULL) {
|
| if (U_FAILURE(status)) {
|
| return;
|
| }
|
| @@ -71,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
|
| fSpoofData = src.fSpoofData->addReference();
|
| }
|
| fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
|
| - if (fAllowedCharsSet == NULL) {
|
| + fAllowedLocales = uprv_strdup(src.fAllowedLocales);
|
| + if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
|
| status = U_MEMORY_ALLOCATION_ERROR;
|
| }
|
| - fAllowedLocales = uprv_strdup(src.fAllowedLocales);
|
| fRestrictionLevel = src.fRestrictionLevel;
|
| }
|
|
|
| @@ -86,7 +100,11 @@ SpoofImpl::~SpoofImpl() {
|
| }
|
| delete fAllowedCharsSet;
|
| uprv_free((void *)fAllowedLocales);
|
| - delete fCachedIdentifierInfo;
|
| +}
|
| +
|
| +// Cast this instance as a USpoofChecker for the C API.
|
| +USpoofChecker *SpoofImpl::asUSpoofChecker() {
|
| + return reinterpret_cast<USpoofChecker*>(this);
|
| }
|
|
|
| //
|
| @@ -102,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st
|
| return NULL;
|
| }
|
| SpoofImpl *This = (SpoofImpl *)sc;
|
| - if (This->fMagic != USPOOF_MAGIC ||
|
| - This->fSpoofData == NULL) {
|
| + if (This->fMagic != USPOOF_MAGIC) {
|
| status = U_INVALID_FORMAT_ERROR;
|
| return NULL;
|
| }
|
| - if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
|
| + if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
|
| return NULL;
|
| }
|
| return This;
|
| @@ -119,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
|
| }
|
|
|
|
|
| -
|
| -//--------------------------------------------------------------------------------------
|
| -//
|
| -// confusableLookup() This is the heart of the confusable skeleton generation
|
| -// implementation.
|
| -//
|
| -// Given a source character, produce the corresponding
|
| -// replacement character(s), appending them to the dest string.
|
| -//
|
| -//---------------------------------------------------------------------------------------
|
| -int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
|
| -
|
| - // Binary search the spoof data key table for the inChar
|
| - int32_t *low = fSpoofData->fCFUKeys;
|
| - int32_t *mid = NULL;
|
| - int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
|
| - UChar32 midc;
|
| - do {
|
| - int32_t delta = ((int32_t)(limit-low))/2;
|
| - mid = low + delta;
|
| - midc = *mid & 0x1fffff;
|
| - if (inChar == midc) {
|
| - goto foundChar;
|
| - } else if (inChar < midc) {
|
| - limit = mid;
|
| - } else {
|
| - low = mid;
|
| - }
|
| - } while (low < limit-1);
|
| - mid = low;
|
| - midc = *mid & 0x1fffff;
|
| - if (inChar != midc) {
|
| - // Char not found. It maps to itself.
|
| - int i = 0;
|
| - dest.append(inChar);
|
| - return i;
|
| - }
|
| - foundChar:
|
| - int32_t keyFlags = *mid & 0xff000000;
|
| - if ((keyFlags & tableMask) == 0) {
|
| - // We found the right key char, but the entry doesn't pertain to the
|
| - // table we need. See if there is an adjacent key that does
|
| - if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
|
| - int32_t *altMid;
|
| - for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
|
| - keyFlags = *altMid & 0xff000000;
|
| - if (keyFlags & tableMask) {
|
| - mid = altMid;
|
| - goto foundKey;
|
| - }
|
| - }
|
| - for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
|
| - keyFlags = *altMid & 0xff000000;
|
| - if (keyFlags & tableMask) {
|
| - mid = altMid;
|
| - goto foundKey;
|
| - }
|
| - }
|
| - }
|
| - // No key entry for this char & table.
|
| - // The input char maps to itself.
|
| - int i = 0;
|
| - dest.append(inChar);
|
| - return i;
|
| - }
|
| -
|
| - foundKey:
|
| - int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
|
| - int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
|
| -
|
| - // Value is either a UChar (for strings of length 1) or
|
| - // an index into the string table (for longer strings)
|
| - uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
|
| - if (stringLen == 1) {
|
| - dest.append((UChar)value);
|
| - return 1;
|
| - }
|
| -
|
| - // String length of 4 from the above lookup is used for all strings of length >= 4.
|
| - // For these, get the real length from the string lengths table,
|
| - // which maps string table indexes to lengths.
|
| - // All strings of the same length are stored contiguously in the string table.
|
| - // 'value' from the lookup above is the starting index for the desired string.
|
| -
|
| - int32_t ix;
|
| - if (stringLen == 4) {
|
| - int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
|
| - for (ix = 0; ix < stringLengthsLimit; ix++) {
|
| - if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
|
| - stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
|
| - break;
|
| - }
|
| - }
|
| - U_ASSERT(ix < stringLengthsLimit);
|
| - }
|
| -
|
| - U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
|
| - UChar *src = &fSpoofData->fCFUStrings[value];
|
| - dest.append(src, stringLen);
|
| - return stringLen;
|
| -}
|
| -
|
| -
|
| -//---------------------------------------------------------------------------------------
|
| -//
|
| -// wholeScriptCheck()
|
| -//
|
| -// Input text is already normalized to NFD
|
| -// Return the set of scripts, each of which can represent something that is
|
| -// confusable with the input text. The script of the input text
|
| -// is included; input consisting of characters from a single script will
|
| -// always produce a result consisting of a set containing that script.
|
| -//
|
| -//---------------------------------------------------------------------------------------
|
| -void SpoofImpl::wholeScriptCheck(
|
| - const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
|
| -
|
| - UTrie2 *table =
|
| - (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
|
| - result->setAll();
|
| - int32_t length = text.length();
|
| - for (int32_t inputIdx=0; inputIdx < length;) {
|
| - UChar32 c = text.char32At(inputIdx);
|
| - inputIdx += U16_LENGTH(c);
|
| - uint32_t index = utrie2_get32(table, c);
|
| - if (index == 0) {
|
| - // No confusables in another script for this char.
|
| - // TODO: we should change the data to have sets with just the single script
|
| - // bit for the script of this char. Gets rid of this special case.
|
| - // Until then, grab the script from the char and intersect it with the set.
|
| - UScriptCode cpScript = uscript_getScript(c, &status);
|
| - U_ASSERT(cpScript > USCRIPT_INHERITED);
|
| - result->intersect(cpScript, status);
|
| - } else if (index == 1) {
|
| - // Script == Common or Inherited. Nothing to do.
|
| - } else {
|
| - result->intersect(fSpoofData->fScriptSets[index]);
|
| - }
|
| - }
|
| -}
|
| -
|
| -
|
| void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
|
| UnicodeSet allowedChars;
|
| UnicodeSet *tmpSet = NULL;
|
| @@ -356,7 +231,7 @@ const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
|
| void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
|
| UScriptCode scripts[30];
|
|
|
| - int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
|
| + int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
|
| if (U_FAILURE(status)) {
|
| return;
|
| }
|
| @@ -372,6 +247,137 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr
|
| }
|
| }
|
|
|
| +// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
|
| +void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
|
| + result.resetAll();
|
| + result.setScriptExtensions(codePoint, status);
|
| + if (U_FAILURE(status)) { return; }
|
| +
|
| + // Section 5.1 step 1
|
| + if (result.test(USCRIPT_HAN, status)) {
|
| + result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
|
| + result.set(USCRIPT_JAPANESE, status);
|
| + result.set(USCRIPT_KOREAN, status);
|
| + }
|
| + if (result.test(USCRIPT_HIRAGANA, status)) {
|
| + result.set(USCRIPT_JAPANESE, status);
|
| + }
|
| + if (result.test(USCRIPT_KATAKANA, status)) {
|
| + result.set(USCRIPT_JAPANESE, status);
|
| + }
|
| + if (result.test(USCRIPT_HANGUL, status)) {
|
| + result.set(USCRIPT_KOREAN, status);
|
| + }
|
| + if (result.test(USCRIPT_BOPOMOFO, status)) {
|
| + result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
|
| + }
|
| +
|
| + // Section 5.1 step 2
|
| + if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
|
| + result.setAll();
|
| + }
|
| +}
|
| +
|
| +// Computes the resolved script set for a string, according to UTS 39 section 5.1.
|
| +void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
|
| + getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
|
| +}
|
| +
|
| +// Computes the resolved script set for a string, omitting characters having the specified script.
|
| +// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
|
| +void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
|
| + result.setAll();
|
| +
|
| + ScriptSet temp;
|
| + UChar32 codePoint;
|
| + for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
|
| + codePoint = input.char32At(i);
|
| +
|
| + // Compute the augmented script set for the character
|
| + getAugmentedScriptSet(codePoint, temp, status);
|
| + if (U_FAILURE(status)) { return; }
|
| +
|
| + // Intersect the augmented script set with the resolved script set, but only if the character doesn't
|
| + // have the script specified in the function call
|
| + if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
|
| + result.intersect(temp);
|
| + }
|
| + }
|
| +}
|
| +
|
| +// Computes the set of numerics for a string, according to UTS 39 section 5.3.
|
| +void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
|
| + result.clear();
|
| +
|
| + UChar32 codePoint;
|
| + for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
|
| + codePoint = input.char32At(i);
|
| +
|
| + // Store a representative character for each kind of decimal digit
|
| + if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
|
| + // Store the zero character as a representative for comparison.
|
| + // Unicode guarantees it is codePoint - value
|
| + result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
|
| + }
|
| + }
|
| +}
|
| +
|
| +// Computes the restriction level of a string, according to UTS 39 section 5.2.
|
| +URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
|
| + // Section 5.2 step 1:
|
| + if (!fAllowedCharsSet->containsAll(input)) {
|
| + return USPOOF_UNRESTRICTIVE;
|
| + }
|
| +
|
| + // Section 5.2 step 2
|
| + // Java use a static UnicodeSet for this test. In C++, avoid the static variable
|
| + // and just do a simple for loop.
|
| + UBool allASCII = TRUE;
|
| + for (int32_t i=0, length=input.length(); i<length; i++) {
|
| + if (input.charAt(i) > 0x7f) {
|
| + allASCII = FALSE;
|
| + break;
|
| + }
|
| + }
|
| + if (allASCII) {
|
| + return USPOOF_ASCII;
|
| + }
|
| +
|
| + // Section 5.2 steps 3:
|
| + ScriptSet resolvedScriptSet;
|
| + getResolvedScriptSet(input, resolvedScriptSet, status);
|
| + if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
|
| +
|
| + // Section 5.2 step 4:
|
| + if (!resolvedScriptSet.isEmpty()) {
|
| + return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
|
| + }
|
| +
|
| + // Section 5.2 step 5:
|
| + ScriptSet resolvedNoLatn;
|
| + getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
|
| + if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
|
| +
|
| + // Section 5.2 step 6:
|
| + if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
|
| + || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
|
| + || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
|
| + return USPOOF_HIGHLY_RESTRICTIVE;
|
| + }
|
| +
|
| + // Section 5.2 step 7:
|
| + if (!resolvedNoLatn.isEmpty()
|
| + && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
|
| + && !resolvedNoLatn.test(USCRIPT_GREEK, status)
|
| + && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
|
| + return USPOOF_MODERATELY_RESTRICTIVE;
|
| + }
|
| +
|
| + // Section 5.2 step 8:
|
| + return USPOOF_MINIMALLY_RESTRICTIVE;
|
| +}
|
| +
|
| +
|
|
|
| // Convert a text format hex number. Utility function used by builder code. Static.
|
| // Input: UChar *string text. Output: a UChar32
|
| @@ -404,55 +410,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC
|
| return (UChar32)val;
|
| }
|
|
|
| -// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
|
| -// Maintain a one-element cache, which is sufficient to avoid repeatedly
|
| -// creating new ones unless we get multi-thread concurrency in spoof
|
| -// check operations, which should be statistically uncommon.
|
|
|
| -// These functions are used in place of new & delete of an IdentifierInfo.
|
| -// They will recycle the IdentifierInfo when possible.
|
| -// They are logically const, and used within const functions that must be thread safe.
|
| -IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
|
| - IdentifierInfo *returnIdInfo = NULL;
|
| - if (U_FAILURE(status)) {
|
| - return returnIdInfo;
|
| - }
|
| - SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
|
| - {
|
| - Mutex m;
|
| - returnIdInfo = nonConstThis->fCachedIdentifierInfo;
|
| - nonConstThis->fCachedIdentifierInfo = NULL;
|
| - }
|
| - if (returnIdInfo == NULL) {
|
| - returnIdInfo = new IdentifierInfo(status);
|
| - if (U_SUCCESS(status) && returnIdInfo == NULL) {
|
| - status = U_MEMORY_ALLOCATION_ERROR;
|
| - }
|
| - if (U_FAILURE(status) && returnIdInfo != NULL) {
|
| - delete returnIdInfo;
|
| - returnIdInfo = NULL;
|
| - }
|
| - }
|
| - return returnIdInfo;
|
| +//-----------------------------------------
|
| +//
|
| +// class CheckResult Implementation
|
| +//
|
| +//-----------------------------------------
|
| +
|
| +CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
|
| + clear();
|
| }
|
|
|
| +USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
|
| + return reinterpret_cast<USpoofCheckResult*>(this);
|
| +}
|
|
|
| -void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
|
| - if (idInfo != NULL) {
|
| - SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
|
| - {
|
| - Mutex m;
|
| - if (nonConstThis->fCachedIdentifierInfo == NULL) {
|
| - nonConstThis->fCachedIdentifierInfo = idInfo;
|
| - idInfo = NULL;
|
| - }
|
| - }
|
| - delete idInfo;
|
| +//
|
| +// Incoming parameter check on Status and the CheckResult object
|
| +// received from the C API.
|
| +//
|
| +const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
|
| + if (U_FAILURE(status)) { return NULL; }
|
| + if (ptr == NULL) {
|
| + status = U_ILLEGAL_ARGUMENT_ERROR;
|
| + return NULL;
|
| }
|
| + CheckResult *This = (CheckResult*) ptr;
|
| + if (This->fMagic != USPOOF_CHECK_MAGIC) {
|
| + status = U_INVALID_FORMAT_ERROR;
|
| + return NULL;
|
| + }
|
| + return This;
|
| +}
|
| +
|
| +CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
|
| + return const_cast<CheckResult *>
|
| + (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
|
| }
|
|
|
| +void CheckResult::clear() {
|
| + fChecks = 0;
|
| + fNumerics.clear();
|
| + fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
|
| +}
|
|
|
| +int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
|
| + if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
|
| + return fChecks | fRestrictionLevel;
|
| + } else {
|
| + return fChecks;
|
| + }
|
| +}
|
|
|
| +CheckResult::~CheckResult() {
|
| +}
|
|
|
| //----------------------------------------------------------------------------------------------
|
| //
|
| @@ -461,12 +472,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
|
| //----------------------------------------------------------------------------------------------
|
|
|
|
|
| -UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
|
| +UBool SpoofData::validateDataVersion(UErrorCode &status) const {
|
| if (U_FAILURE(status) ||
|
| - rawData == NULL ||
|
| - rawData->fMagic != USPOOF_MAGIC ||
|
| - rawData->fFormatVersion[0] > 1 ||
|
| - rawData->fFormatVersion[1] > 0) {
|
| + fRawData == NULL ||
|
| + fRawData->fMagic != USPOOF_MAGIC ||
|
| + fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
|
| + fRawData->fFormatVersion[1] != 0 ||
|
| + fRawData->fFormatVersion[2] != 0 ||
|
| + fRawData->fFormatVersion[3] != 0) {
|
| status = U_INVALID_FORMAT_ERROR;
|
| return FALSE;
|
| }
|
| @@ -485,7 +498,7 @@ spoofDataIsAcceptable(void *context,
|
| pInfo->dataFormat[1] == 0x66 &&
|
| pInfo->dataFormat[2] == 0x75 &&
|
| pInfo->dataFormat[3] == 0x20 &&
|
| - pInfo->formatVersion[0] == 1
|
| + pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
|
| ) {
|
| UVersionInfo *version = static_cast<UVersionInfo *>(context);
|
| if(version != NULL) {
|
| @@ -497,32 +510,61 @@ spoofDataIsAcceptable(void *context,
|
| }
|
| }
|
|
|
| +// Methods for the loading of the default confusables data file. The confusable
|
| +// data is loaded only when it is needed.
|
| +//
|
| +// SpoofData::getDefault() - Return the default confusables data, and call the
|
| +// initOnce() if it is not available. Adds a reference
|
| +// to the SpoofData that the caller is responsible for
|
| +// decrementing when they are done with the data.
|
| //
|
| -// SpoofData::getDefault() - return a wrapper around the spoof data that is
|
| -// baked into the default ICU data.
|
| +// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
|
| +// is shared by all spoof checkers using the default data.
|
| //
|
| -// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
|
| -// SpoofData is shared by all spoof checkers using the default data.
|
| +// uspoof_cleanupDefaultData - Called during cleanup.
|
| //
|
| -SpoofData *SpoofData::getDefault(UErrorCode &status) {
|
| +
|
| +static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
|
| +static SpoofData* gDefaultSpoofData;
|
| +
|
| +static UBool U_CALLCONV
|
| +uspoof_cleanupDefaultData(void) {
|
| + if (gDefaultSpoofData) {
|
| + // Will delete, assuming all user-level spoof checkers were closed.
|
| + gDefaultSpoofData->removeReference();
|
| + gDefaultSpoofData = NULL;
|
| + gSpoofInitDefaultOnce.reset();
|
| + }
|
| + return TRUE;
|
| +}
|
| +
|
| +static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
|
| UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
|
| spoofDataIsAcceptable,
|
| NULL, // context, would receive dataVersion if supplied.
|
| &status);
|
| + if (U_FAILURE(status)) { return; }
|
| + gDefaultSpoofData = new SpoofData(udm, status);
|
| if (U_FAILURE(status)) {
|
| - return NULL;
|
| - }
|
| - SpoofData *This = new SpoofData(udm, status);
|
| - if (U_FAILURE(status)) {
|
| - delete This;
|
| - return NULL;
|
| + delete gDefaultSpoofData;
|
| + return;
|
| }
|
| - if (This == NULL) {
|
| + if (gDefaultSpoofData == NULL) {
|
| status = U_MEMORY_ALLOCATION_ERROR;
|
| + return;
|
| }
|
| - return This;
|
| + ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
|
| +}
|
| +
|
| +SpoofData* SpoofData::getDefault(UErrorCode& status) {
|
| + umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
|
| + if (U_FAILURE(status)) { return NULL; }
|
| + gDefaultSpoofData->addReference();
|
| + return gDefaultSpoofData;
|
| }
|
|
|
| +
|
| +
|
| SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
|
| {
|
| reset();
|
| @@ -533,7 +575,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
|
| // fRawData is non-const because it may be constructed by the data builder.
|
| fRawData = reinterpret_cast<SpoofDataHeader *>(
|
| const_cast<void *>(udata_getMemory(udm)));
|
| - validateDataVersion(fRawData, status);
|
| + validateDataVersion(status);
|
| initPtrs(status);
|
| }
|
|
|
| @@ -554,7 +596,7 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
|
| status = U_INVALID_FORMAT_ERROR;
|
| return;
|
| }
|
| - validateDataVersion(fRawData, status);
|
| + validateDataVersion(status);
|
| initPtrs(status);
|
| }
|
|
|
| @@ -582,7 +624,7 @@ SpoofData::SpoofData(UErrorCode &status) {
|
| uprv_memset(fRawData, 0, initialSize);
|
|
|
| fRawData->fMagic = USPOOF_MAGIC;
|
| - fRawData->fFormatVersion[0] = 1;
|
| + fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
|
| fRawData->fFormatVersion[1] = 0;
|
| fRawData->fFormatVersion[2] = 0;
|
| fRawData->fFormatVersion[3] = 0;
|
| @@ -600,11 +642,7 @@ void SpoofData::reset() {
|
| fRefCount = 1;
|
| fCFUKeys = NULL;
|
| fCFUValues = NULL;
|
| - fCFUStringLengths = NULL;
|
| fCFUStrings = NULL;
|
| - fAnyCaseTrie = NULL;
|
| - fLowerCaseTrie = NULL;
|
| - fScriptSets = NULL;
|
| }
|
|
|
|
|
| @@ -626,7 +664,6 @@ void SpoofData::reset() {
|
| void SpoofData::initPtrs(UErrorCode &status) {
|
| fCFUKeys = NULL;
|
| fCFUValues = NULL;
|
| - fCFUStringLengths = NULL;
|
| fCFUStrings = NULL;
|
| if (U_FAILURE(status)) {
|
| return;
|
| @@ -637,33 +674,13 @@ void SpoofData::initPtrs(UErrorCode &status) {
|
| if (fRawData->fCFUStringIndex != 0) {
|
| fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
|
| }
|
| - if (fRawData->fCFUStringLengths != 0) {
|
| - fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
|
| - }
|
| if (fRawData->fCFUStringTable != 0) {
|
| fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
|
| }
|
| -
|
| - if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
|
| - fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
| - (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
|
| - }
|
| - if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
|
| - fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
| - (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
|
| - }
|
| -
|
| - if (fRawData->fScriptSets != 0) {
|
| - fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
|
| - }
|
| }
|
|
|
|
|
| SpoofData::~SpoofData() {
|
| - utrie2_close(fAnyCaseTrie);
|
| - fAnyCaseTrie = NULL;
|
| - utrie2_close(fLowerCaseTrie);
|
| - fLowerCaseTrie = NULL;
|
| if (fDataOwned) {
|
| uprv_free(fRawData);
|
| }
|
| @@ -708,6 +725,78 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
|
| return (char *)fRawData + returnOffset;
|
| }
|
|
|
| +int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
|
| + int32_t dataSize = fRawData->fLength;
|
| + if (capacity < dataSize) {
|
| + status = U_BUFFER_OVERFLOW_ERROR;
|
| + return dataSize;
|
| + }
|
| + uprv_memcpy(buf, fRawData, dataSize);
|
| + return dataSize;
|
| +}
|
| +
|
| +int32_t SpoofData::size() const {
|
| + return fRawData->fLength;
|
| +}
|
| +
|
| +//-------------------------------
|
| +//
|
| +// Front-end APIs for SpoofData
|
| +//
|
| +//-------------------------------
|
| +
|
| +int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
|
| + // Perform a binary search.
|
| + // [lo, hi), i.e lo is inclusive, hi is exclusive.
|
| + // The result after the loop will be in lo.
|
| + int32_t lo = 0;
|
| + int32_t hi = length();
|
| + do {
|
| + int32_t mid = (lo + hi) / 2;
|
| + if (codePointAt(mid) > inChar) {
|
| + hi = mid;
|
| + } else if (codePointAt(mid) < inChar) {
|
| + lo = mid;
|
| + } else {
|
| + // Found result. Break early.
|
| + lo = mid;
|
| + break;
|
| + }
|
| + } while (hi - lo > 1);
|
| +
|
| + // Did we find an entry? If not, the char maps to itself.
|
| + if (codePointAt(lo) != inChar) {
|
| + dest.append(inChar);
|
| + return 1;
|
| + }
|
| +
|
| + // Add the element to the string builder and return.
|
| + return appendValueTo(lo, dest);
|
| +}
|
| +
|
| +int32_t SpoofData::length() const {
|
| + return fRawData->fCFUKeysSize;
|
| +}
|
| +
|
| +UChar32 SpoofData::codePointAt(int32_t index) const {
|
| + return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
|
| +}
|
| +
|
| +int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
|
| + int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
|
| +
|
| + // Value is either a char (for strings of length 1) or
|
| + // an index into the string table (for longer strings)
|
| + uint16_t value = fCFUValues[index];
|
| + if (stringLength == 1) {
|
| + dest.append((UChar)value);
|
| + } else {
|
| + dest.append(fCFUStrings + value, stringLength);
|
| + }
|
| +
|
| + return stringLength;
|
| +}
|
| +
|
|
|
| U_NAMESPACE_END
|
|
|
| @@ -739,7 +828,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
|
| pInfo->dataFormat[1]==0x66 &&
|
| pInfo->dataFormat[2]==0x75 &&
|
| pInfo->dataFormat[3]==0x20 &&
|
| - pInfo->formatVersion[0]==1 )) {
|
| + pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
|
| + pInfo->formatVersion[1]==0 &&
|
| + pInfo->formatVersion[2]==0 &&
|
| + pInfo->formatVersion[3]==0 )) {
|
| udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
|
| "(format version %02x %02x %02x %02x) is not recognized\n",
|
| pInfo->dataFormat[0], pInfo->dataFormat[1],
|
| @@ -828,26 +920,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
|
| sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
|
| ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
|
|
| - // String Lengths Section
|
| - sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
|
| - sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
|
| - ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
| -
|
| - // Any Case Trie
|
| - sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
|
| - sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
|
| - utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
| -
|
| - // Lower Case Trie
|
| - sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
|
| - sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
|
| - utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
| -
|
| - // Script Sets. The data is an array of int32_t
|
| - sectionStart = ds->readUInt32(spoofDH->fScriptSets);
|
| - sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
|
| - ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
|
| -
|
| // And, last, swap the header itself.
|
| // int32_t fMagic // swap this
|
| // uint8_t fFormatVersion[4] // Do not swap this, just copy
|
|
|