source/i18n/uspoof_impl.cpp - Issue 2440913002: Update ICU to 58.1

Unified Diff: source/i18n/uspoof_impl.cpp

Issue 2440913002: Update ICU to 58.1

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/i18n/uspoof_impl.cpp

diff --git a/source/i18n/uspoof_impl.cpp b/source/i18n/uspoof_impl.cpp

index e9077d3ac3414027735b4cbc8880fb92a6e9a996..a062d3ee24bd533798451a415fed5712f0335606 100644

--- a/source/i18n/uspoof_impl.cpp

+++ b/source/i18n/uspoof_impl.cpp

@@ -1,6 +1,8 @@

+// License & terms of use: http://www.unicode.org/copyright.html

**********************************************************************

@@ -13,11 +15,11 @@

#include "utrie2.h"

#include "cmemory.h"

#include "cstring.h"

-#include "identifier_info.h"

#include "scriptset.h"

#include "umutex.h"

#include "udataswp.h"

#include "uassert.h"

+#include "ucln_in.h"

#include "uspoof_impl.h"

#if !UCONFIG_NO_NORMALIZATION

@@ -27,41 +29,53 @@ U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)

-SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :

- fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,

- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {

- if (U_FAILURE(status)) {

- return;

- }

+SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {

+ construct(status);

+ fSpoofData = data;

+SpoofImpl::SpoofImpl(UErrorCode& status) {

+ construct(status);

+ // TODO: Call this method where it is actually needed, instead of in the

+ // constructor, to allow for lazy data loading. See #12696.

+ fSpoofData = SpoofData::getDefault(status);

+SpoofImpl::SpoofImpl() {

+ UErrorCode status = U_ZERO_ERROR;

+ construct(status);

+ // TODO: Call this method where it is actually needed, instead of in the

+ // constructor, to allow for lazy data loading. See #12696.

+ fSpoofData = SpoofData::getDefault(status);

+void SpoofImpl::construct(UErrorCode& status) {

+ fMagic = USPOOF_MAGIC;

+ fChecks = USPOOF_ALL_CHECKS;

+ fSpoofData = NULL;

+ fAllowedCharsSet = NULL;

+ fAllowedLocales = NULL;

fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;

+ if (U_FAILURE(status)) { return; }

UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);

- allowedCharsSet->freeze();

fAllowedCharsSet = allowedCharsSet;

fAllowedLocales = uprv_strdup("");

if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {

status = U_MEMORY_ALLOCATION_ERROR;

return;

}

- fMagic = USPOOF_MAGIC;

-SpoofImpl::SpoofImpl() :

- fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,

- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {

- UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);

allowedCharsSet->freeze();

- fAllowedCharsSet = allowedCharsSet;

- fAllowedLocales = uprv_strdup("");

- fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;

}

// Copy Constructor, used by the user level clone() function.

SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :

fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,

- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {

+ fAllowedLocales(NULL) {

if (U_FAILURE(status)) {

return;

}

@@ -71,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :

fSpoofData = src.fSpoofData->addReference();

}

fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());

- if (fAllowedCharsSet == NULL) {

+ fAllowedLocales = uprv_strdup(src.fAllowedLocales);

+ if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {

status = U_MEMORY_ALLOCATION_ERROR;

}

- fAllowedLocales = uprv_strdup(src.fAllowedLocales);

fRestrictionLevel = src.fRestrictionLevel;

}

@@ -86,7 +100,11 @@ SpoofImpl::~SpoofImpl() {

}

delete fAllowedCharsSet;

uprv_free((void *)fAllowedLocales);

- delete fCachedIdentifierInfo;

+// Cast this instance as a USpoofChecker for the C API.

+USpoofChecker *SpoofImpl::asUSpoofChecker() {

+ return reinterpret_cast<USpoofChecker*>(this);

}

@@ -102,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st

return NULL;

}

SpoofImpl *This = (SpoofImpl *)sc;

- if (This->fMagic != USPOOF_MAGIC ||

- This->fSpoofData == NULL) {

+ if (This->fMagic != USPOOF_MAGIC) {

status = U_INVALID_FORMAT_ERROR;

return NULL;

}

- if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {

+ if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {

return NULL;

}

return This;

@@ -119,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {

}

-//--------------------------------------------------------------------------------------

-//

-// confusableLookup() This is the heart of the confusable skeleton generation

-// implementation.

-//

-// Given a source character, produce the corresponding

-// replacement character(s), appending them to the dest string.

-//

-//---------------------------------------------------------------------------------------

-int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {

- // Binary search the spoof data key table for the inChar

- int32_t *low = fSpoofData->fCFUKeys;

- int32_t *mid = NULL;

- int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;

- UChar32 midc;

- do {

- int32_t delta = ((int32_t)(limit-low))/2;

- mid = low + delta;

- midc = *mid & 0x1fffff;

- if (inChar == midc) {

- goto foundChar;

- } else if (inChar < midc) {

- limit = mid;

- } else {

- low = mid;

- }

- } while (low < limit-1);

- mid = low;

- midc = *mid & 0x1fffff;

- if (inChar != midc) {

- // Char not found. It maps to itself.

- int i = 0;

- dest.append(inChar);

- return i;

- }

- foundChar:

- int32_t keyFlags = *mid & 0xff000000;

- if ((keyFlags & tableMask) == 0) {

- // We found the right key char, but the entry doesn't pertain to the

- // table we need. See if there is an adjacent key that does

- if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {

- int32_t *altMid;

- for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {

- keyFlags = *altMid & 0xff000000;

- if (keyFlags & tableMask) {

- mid = altMid;

- goto foundKey;

- }

- for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {

- keyFlags = *altMid & 0xff000000;

- if (keyFlags & tableMask) {

- mid = altMid;

- goto foundKey;

- }

- // No key entry for this char & table.

- // The input char maps to itself.

- int i = 0;

- dest.append(inChar);

- return i;

- }

- foundKey:

- int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;

- int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);

- // Value is either a UChar (for strings of length 1) or

- // an index into the string table (for longer strings)

- uint16_t value = fSpoofData->fCFUValues[keyTableIndex];

- if (stringLen == 1) {

- dest.append((UChar)value);

- return 1;

- }

- // String length of 4 from the above lookup is used for all strings of length >= 4.

- // For these, get the real length from the string lengths table,

- // which maps string table indexes to lengths.

- // All strings of the same length are stored contiguously in the string table.

- // 'value' from the lookup above is the starting index for the desired string.

- int32_t ix;

- if (stringLen == 4) {

- int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;

- for (ix = 0; ix < stringLengthsLimit; ix++) {

- if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {

- stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;

- break;

- }

- U_ASSERT(ix < stringLengthsLimit);

- }

- U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);

- UChar *src = &fSpoofData->fCFUStrings[value];

- dest.append(src, stringLen);

- return stringLen;

-//---------------------------------------------------------------------------------------

-//

-// wholeScriptCheck()

-//

-// Input text is already normalized to NFD

-// Return the set of scripts, each of which can represent something that is

-// confusable with the input text. The script of the input text

-// is included; input consisting of characters from a single script will

-// always produce a result consisting of a set containing that script.

-//

-//---------------------------------------------------------------------------------------

-void SpoofImpl::wholeScriptCheck(

- const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {

- UTrie2 *table =

- (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;

- result->setAll();

- int32_t length = text.length();

- for (int32_t inputIdx=0; inputIdx < length;) {

- UChar32 c = text.char32At(inputIdx);

- inputIdx += U16_LENGTH(c);

- uint32_t index = utrie2_get32(table, c);

- if (index == 0) {

- // No confusables in another script for this char.

- // TODO: we should change the data to have sets with just the single script

- // bit for the script of this char. Gets rid of this special case.

- // Until then, grab the script from the char and intersect it with the set.

- UScriptCode cpScript = uscript_getScript(c, &status);

- U_ASSERT(cpScript > USCRIPT_INHERITED);

- result->intersect(cpScript, status);

- } else if (index == 1) {

- // Script == Common or Inherited. Nothing to do.

- } else {

- result->intersect(fSpoofData->fScriptSets[index]);

- }

void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {

UnicodeSet allowedChars;

UnicodeSet *tmpSet = NULL;

@@ -356,7 +231,7 @@ const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {

void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {

UScriptCode scripts[30];

- int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);

+ int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);

if (U_FAILURE(status)) {

return;

}

@@ -372,6 +247,137 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr

}

+// Computes the augmented script set for a code point, according to UTS 39 section 5.1.

+void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {

+ result.resetAll();

+ result.setScriptExtensions(codePoint, status);

+ if (U_FAILURE(status)) { return; }

+ // Section 5.1 step 1

+ if (result.test(USCRIPT_HAN, status)) {

+ result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);

+ result.set(USCRIPT_JAPANESE, status);

+ result.set(USCRIPT_KOREAN, status);

+ }

+ if (result.test(USCRIPT_HIRAGANA, status)) {

+ result.set(USCRIPT_JAPANESE, status);

+ }

+ if (result.test(USCRIPT_KATAKANA, status)) {

+ result.set(USCRIPT_JAPANESE, status);

+ }

+ if (result.test(USCRIPT_HANGUL, status)) {

+ result.set(USCRIPT_KOREAN, status);

+ }

+ if (result.test(USCRIPT_BOPOMOFO, status)) {

+ result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);

+ }

+ // Section 5.1 step 2

+ if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {

+ result.setAll();

+ }

+// Computes the resolved script set for a string, according to UTS 39 section 5.1.

+void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {

+ getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);

+// Computes the resolved script set for a string, omitting characters having the specified script.

+// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.

+void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {

+ result.setAll();

+ ScriptSet temp;

+ UChar32 codePoint;

+ for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {

+ codePoint = input.char32At(i);

+ // Compute the augmented script set for the character

+ getAugmentedScriptSet(codePoint, temp, status);

+ if (U_FAILURE(status)) { return; }

+ // Intersect the augmented script set with the resolved script set, but only if the character doesn't

+ // have the script specified in the function call

+ if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {

+ result.intersect(temp);

+ }

+// Computes the set of numerics for a string, according to UTS 39 section 5.3.

+void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {

+ result.clear();

+ UChar32 codePoint;

+ for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {

+ codePoint = input.char32At(i);

+ // Store a representative character for each kind of decimal digit

+ if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {

+ // Store the zero character as a representative for comparison.

+ // Unicode guarantees it is codePoint - value

+ result.add(codePoint - (UChar32)u_getNumericValue(codePoint));

+ }

+// Computes the restriction level of a string, according to UTS 39 section 5.2.

+URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {

+ // Section 5.2 step 1:

+ if (!fAllowedCharsSet->containsAll(input)) {

+ return USPOOF_UNRESTRICTIVE;

+ }

+ // Section 5.2 step 2

+ // Java use a static UnicodeSet for this test. In C++, avoid the static variable

+ // and just do a simple for loop.

+ UBool allASCII = TRUE;

+ for (int32_t i=0, length=input.length(); i<length; i++) {

+ if (input.charAt(i) > 0x7f) {

+ allASCII = FALSE;

+ break;

+ }

+ if (allASCII) {

+ return USPOOF_ASCII;

+ }

+ // Section 5.2 steps 3:

+ ScriptSet resolvedScriptSet;

+ getResolvedScriptSet(input, resolvedScriptSet, status);

+ if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }

+ // Section 5.2 step 4:

+ if (!resolvedScriptSet.isEmpty()) {

+ return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;

+ }

+ // Section 5.2 step 5:

+ ScriptSet resolvedNoLatn;

+ getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);

+ if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }

+ // Section 5.2 step 6:

+ if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)

+ || resolvedNoLatn.test(USCRIPT_JAPANESE, status)

+ || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {

+ return USPOOF_HIGHLY_RESTRICTIVE;

+ }

+ // Section 5.2 step 7:

+ if (!resolvedNoLatn.isEmpty()

+ && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)

+ && !resolvedNoLatn.test(USCRIPT_GREEK, status)

+ && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {

+ return USPOOF_MODERATELY_RESTRICTIVE;

+ }

+ // Section 5.2 step 8:

+ return USPOOF_MINIMALLY_RESTRICTIVE;

// Convert a text format hex number. Utility function used by builder code. Static.

// Input: UChar *string text. Output: a UChar32

@@ -404,55 +410,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC

return (UChar32)val;

}

-// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.

-// Maintain a one-element cache, which is sufficient to avoid repeatedly

-// creating new ones unless we get multi-thread concurrency in spoof

-// check operations, which should be statistically uncommon.

-// These functions are used in place of new & delete of an IdentifierInfo.

-// They will recycle the IdentifierInfo when possible.

-// They are logically const, and used within const functions that must be thread safe.

-IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {

- IdentifierInfo *returnIdInfo = NULL;

- if (U_FAILURE(status)) {

- return returnIdInfo;

- }

- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);

- {

- Mutex m;

- returnIdInfo = nonConstThis->fCachedIdentifierInfo;

- nonConstThis->fCachedIdentifierInfo = NULL;

- }

- if (returnIdInfo == NULL) {

- returnIdInfo = new IdentifierInfo(status);

- if (U_SUCCESS(status) && returnIdInfo == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- }

- if (U_FAILURE(status) && returnIdInfo != NULL) {

- delete returnIdInfo;

- returnIdInfo = NULL;

- }

- return returnIdInfo;

+//-----------------------------------------

+//

+// class CheckResult Implementation

+//

+//-----------------------------------------

+CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {

+ clear();

}

+USpoofCheckResult* CheckResult::asUSpoofCheckResult() {

+ return reinterpret_cast<USpoofCheckResult*>(this);

-void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {

- if (idInfo != NULL) {

- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);

- {

- Mutex m;

- if (nonConstThis->fCachedIdentifierInfo == NULL) {

- nonConstThis->fCachedIdentifierInfo = idInfo;

- idInfo = NULL;

- }

- delete idInfo;

+//

+// Incoming parameter check on Status and the CheckResult object

+// received from the C API.

+//

+const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {

+ if (U_FAILURE(status)) { return NULL; }

+ if (ptr == NULL) {

+ status = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

}

+ CheckResult *This = (CheckResult*) ptr;

+ if (This->fMagic != USPOOF_CHECK_MAGIC) {

+ status = U_INVALID_FORMAT_ERROR;

+ return NULL;

+ }

+ return This;

+CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {

+ return const_cast<CheckResult *>

+ (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));

}

+void CheckResult::clear() {

+ fChecks = 0;

+ fNumerics.clear();

+ fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;

+int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {

+ if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {

+ return fChecks | fRestrictionLevel;

+ } else {

+ return fChecks;

+ }

+CheckResult::~CheckResult() {

//----------------------------------------------------------------------------------------------

@@ -461,12 +472,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {

//----------------------------------------------------------------------------------------------

-UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {

+UBool SpoofData::validateDataVersion(UErrorCode &status) const {

if (U_FAILURE(status) ||

- rawData == NULL ||

- rawData->fMagic != USPOOF_MAGIC ||

- rawData->fFormatVersion[0] > 1 ||

- rawData->fFormatVersion[1] > 0) {

+ fRawData == NULL ||

+ fRawData->fMagic != USPOOF_MAGIC ||

+ fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||

+ fRawData->fFormatVersion[1] != 0 ||

+ fRawData->fFormatVersion[2] != 0 ||

+ fRawData->fFormatVersion[3] != 0) {

status = U_INVALID_FORMAT_ERROR;

return FALSE;

}

@@ -485,7 +498,7 @@ spoofDataIsAcceptable(void *context,

pInfo->dataFormat[1] == 0x66 &&

pInfo->dataFormat[2] == 0x75 &&

pInfo->dataFormat[3] == 0x20 &&

- pInfo->formatVersion[0] == 1

+ pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION

) {

UVersionInfo *version = static_cast<UVersionInfo *>(context);

if(version != NULL) {

@@ -497,32 +510,61 @@ spoofDataIsAcceptable(void *context,

}

+// Methods for the loading of the default confusables data file. The confusable

+// data is loaded only when it is needed.

+//

+// SpoofData::getDefault() - Return the default confusables data, and call the

+// initOnce() if it is not available. Adds a reference

+// to the SpoofData that the caller is responsible for

+// decrementing when they are done with the data.

-// SpoofData::getDefault() - return a wrapper around the spoof data that is

-// baked into the default ICU data.

+// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData

+// is shared by all spoof checkers using the default data.

-// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting

-// SpoofData is shared by all spoof checkers using the default data.

+// uspoof_cleanupDefaultData - Called during cleanup.

-SpoofData *SpoofData::getDefault(UErrorCode &status) {

+static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;

+static SpoofData* gDefaultSpoofData;

+static UBool U_CALLCONV

+uspoof_cleanupDefaultData(void) {

+ if (gDefaultSpoofData) {

+ // Will delete, assuming all user-level spoof checkers were closed.

+ gDefaultSpoofData->removeReference();

+ gDefaultSpoofData = NULL;

+ gSpoofInitDefaultOnce.reset();

+ }

+ return TRUE;

+static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {

UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",

spoofDataIsAcceptable,

NULL, // context, would receive dataVersion if supplied.

&status);

+ if (U_FAILURE(status)) { return; }

+ gDefaultSpoofData = new SpoofData(udm, status);

if (U_FAILURE(status)) {

- return NULL;

- }

- SpoofData *This = new SpoofData(udm, status);

- if (U_FAILURE(status)) {

- delete This;

- return NULL;

+ delete gDefaultSpoofData;

+ return;

}

- if (This == NULL) {

+ if (gDefaultSpoofData == NULL) {

status = U_MEMORY_ALLOCATION_ERROR;

+ return;

}

- return This;

+ ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);

+SpoofData* SpoofData::getDefault(UErrorCode& status) {

+ umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);

+ if (U_FAILURE(status)) { return NULL; }

+ gDefaultSpoofData->addReference();

+ return gDefaultSpoofData;

}

SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)

{

reset();

@@ -533,7 +575,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)

// fRawData is non-const because it may be constructed by the data builder.

fRawData = reinterpret_cast<SpoofDataHeader *>(

const_cast<void *>(udata_getMemory(udm)));

- validateDataVersion(fRawData, status);

+ validateDataVersion(status);

initPtrs(status);

}

@@ -554,7 +596,7 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)

status = U_INVALID_FORMAT_ERROR;

return;

}

- validateDataVersion(fRawData, status);

+ validateDataVersion(status);

initPtrs(status);

}

@@ -582,7 +624,7 @@ SpoofData::SpoofData(UErrorCode &status) {

uprv_memset(fRawData, 0, initialSize);

fRawData->fMagic = USPOOF_MAGIC;

- fRawData->fFormatVersion[0] = 1;

+ fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;

fRawData->fFormatVersion[1] = 0;

fRawData->fFormatVersion[2] = 0;

fRawData->fFormatVersion[3] = 0;

@@ -600,11 +642,7 @@ void SpoofData::reset() {

fRefCount = 1;

fCFUKeys = NULL;

fCFUValues = NULL;

- fCFUStringLengths = NULL;

fCFUStrings = NULL;

- fAnyCaseTrie = NULL;

- fLowerCaseTrie = NULL;

- fScriptSets = NULL;

}

@@ -626,7 +664,6 @@ void SpoofData::reset() {

void SpoofData::initPtrs(UErrorCode &status) {

fCFUKeys = NULL;

fCFUValues = NULL;

- fCFUStringLengths = NULL;

fCFUStrings = NULL;

if (U_FAILURE(status)) {

return;

@@ -637,33 +674,13 @@ void SpoofData::initPtrs(UErrorCode &status) {

if (fRawData->fCFUStringIndex != 0) {

fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);

}

- if (fRawData->fCFUStringLengths != 0) {

- fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);

- }

if (fRawData->fCFUStringTable != 0) {

fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);

}

- if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {

- fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,

- (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);

- }

- if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {

- fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,

- (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);

- }

- if (fRawData->fScriptSets != 0) {

- fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);

- }

}

SpoofData::~SpoofData() {

- utrie2_close(fAnyCaseTrie);

- fAnyCaseTrie = NULL;

- utrie2_close(fLowerCaseTrie);

- fLowerCaseTrie = NULL;

if (fDataOwned) {

uprv_free(fRawData);

}

@@ -708,6 +725,78 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {

return (char *)fRawData + returnOffset;

}

+int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {

+ int32_t dataSize = fRawData->fLength;

+ if (capacity < dataSize) {

+ status = U_BUFFER_OVERFLOW_ERROR;

+ return dataSize;

+ }

+ uprv_memcpy(buf, fRawData, dataSize);

+ return dataSize;

+int32_t SpoofData::size() const {

+ return fRawData->fLength;

+//-------------------------------

+//

+// Front-end APIs for SpoofData

+//

+//-------------------------------

+int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {

+ // Perform a binary search.

+ // [lo, hi), i.e lo is inclusive, hi is exclusive.

+ // The result after the loop will be in lo.

+ int32_t lo = 0;

+ int32_t hi = length();

+ do {

+ int32_t mid = (lo + hi) / 2;

+ if (codePointAt(mid) > inChar) {

+ hi = mid;

+ } else if (codePointAt(mid) < inChar) {

+ lo = mid;

+ } else {

+ // Found result. Break early.

+ lo = mid;

+ break;

+ }

+ } while (hi - lo > 1);

+ // Did we find an entry? If not, the char maps to itself.

+ if (codePointAt(lo) != inChar) {

+ dest.append(inChar);

+ return 1;

+ }

+ // Add the element to the string builder and return.

+ return appendValueTo(lo, dest);

+int32_t SpoofData::length() const {

+ return fRawData->fCFUKeysSize;

+UChar32 SpoofData::codePointAt(int32_t index) const {

+ return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);

+int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {

+ int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);

+ // Value is either a char (for strings of length 1) or

+ // an index into the string table (for longer strings)

+ uint16_t value = fCFUValues[index];

+ if (stringLength == 1) {

+ dest.append((UChar)value);

+ } else {

+ dest.append(fCFUStrings + value, stringLength);

+ }

+ return stringLength;

U_NAMESPACE_END

@@ -739,7 +828,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou

pInfo->dataFormat[1]==0x66 &&

pInfo->dataFormat[2]==0x75 &&

pInfo->dataFormat[3]==0x20 &&

- pInfo->formatVersion[0]==1 )) {

+ pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&

+ pInfo->formatVersion[1]==0 &&

+ pInfo->formatVersion[2]==0 &&

+ pInfo->formatVersion[3]==0 )) {

udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "

"(format version %02x %02x %02x %02x) is not recognized\n",

pInfo->dataFormat[0], pInfo->dataFormat[1],

@@ -828,26 +920,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou

sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;

ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

- // String Lengths Section

- sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);

- sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;

- ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

- // Any Case Trie

- sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);

- sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);

- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

- // Lower Case Trie

- sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);

- sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);

- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

- // Script Sets. The data is an array of int32_t

- sectionStart = ds->readUInt32(spoofDH->fScriptSets);

- sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);

- ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

// And, last, swap the header itself.

// int32_t fMagic // swap this

// uint8_t fFormatVersion[4] // Do not swap this, just copy

« no previous file with comments | « source/i18n/uspoof_impl.h ('k') | source/i18n/uspoof_wsconf.h » ('j') | no next file with comments »