source/i18n/uspoof_wsconf.cpp - Issue 2440913002: Update ICU to 58.1

Unified Diff: source/i18n/uspoof_wsconf.cpp

Issue 2440913002: Update ICU to 58.1

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/i18n/uspoof_wsconf.cpp

diff --git a/source/i18n/uspoof_wsconf.cpp b/source/i18n/uspoof_wsconf.cpp

deleted file mode 100644

index ca232834fa2a0be1daf4c481f5802c807849aa07..0000000000000000000000000000000000000000

--- a/source/i18n/uspoof_wsconf.cpp

+++ /dev/null

@@ -1,436 +0,0 @@

-/*

-******************************************************************************

-* file name: uspoof_wsconf.cpp

-* encoding: US-ASCII

-* tab size: 8 (not used)

-* indentation:4

-* created on: 2009Jan05 (refactoring earlier files)

-* created by: Andy Heninger

-* Internal functions for compiling Whole Script confusable source data

-* into its binary (runtime) form. The binary data format is described

-* in uspoof_impl.h

-*/

-#include "unicode/utypes.h"

-#include "unicode/uspoof.h"

-#if !UCONFIG_NO_NORMALIZATION

-#if !UCONFIG_NO_REGULAR_EXPRESSIONS

-#include "unicode/unorm.h"

-#include "unicode/uregex.h"

-#include "unicode/ustring.h"

-#include "cmemory.h"

-#include "scriptset.h"

-#include "uspoof_impl.h"

-#include "uhash.h"

-#include "uvector.h"

-#include "uassert.h"

-#include "uspoof_wsconf.h"

-U_NAMESPACE_USE

-// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt

-// Example Lines:

-// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O

-// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I

-// | | | |

-// | | | |---- Which table, Any Case or Lower Case (A or L)

-// | | |----------Target script. We need this.

-// | |----------------Src script. Should match the script of the source

-// | code points. Beyond checking that, we don't keep it.

-// |--------------------------------Source code points or range.

-//

-// The expression will match _all_ lines, including erroneous lines.

-// The result of the parse is returned via the contents of the (match) groups.

-static const char *parseExp =

- "(?m)" // Multi-line mode

- "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1.

- "|^(?:" // OR

- "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3.

- "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4.

- "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5.

- "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7

- "[ \\t]*(?:#.*?)?" // Trailing comment

- ")$|" // OR

- "^(.*?)$"; // An error line. Group 8.

- // Any line not matching the preceding

- // parts of the expression will match

- // this, and thus be flagged as an error

-// Extract a regular expression match group into a char * string.

-// The group must contain only invariant characters.

-// Used for script names

-//

-static void extractGroup(

- URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {

- UChar ubuf[50];

- ubuf[0] = 0;

- destBuf[0] = 0;

- int32_t len = uregex_group(e, group, ubuf, 50, &status);

- if (U_FAILURE(status) || len == -1 || len >= destCapacity) {

- return;

- }

- UnicodeString s(FALSE, ubuf, len); // Aliasing constructor

- s.extract(0, len, destBuf, destCapacity, US_INV);

-U_NAMESPACE_BEGIN

-// Build the Whole Script Confusable data

-//

-// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,

-// because everything is local to this one build function anyhow,

-// OR

-// break this function into more reasonably sized pieces, with

-// state in WSConfusableDataBuilder.

-//

-void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,

- int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)

- if (U_FAILURE(status)) {

- return;

- }

- URegularExpression *parseRegexp = NULL;

- int32_t inputLen = 0;

- UChar *input = NULL;

- int32_t lineNum = 0;

- UVector *scriptSets = NULL;

- uint32_t rtScriptSetsCount = 2;

- UTrie2 *anyCaseTrie = NULL;

- UTrie2 *lowerCaseTrie = NULL;

- anyCaseTrie = utrie2_open(0, 0, &status);

- lowerCaseTrie = utrie2_open(0, 0, &status);

- UnicodeString pattern(parseExp, -1, US_INV);

- // The scriptSets vector provides a mapping from TRIE values to the set of scripts.

- //

- // Reserved TRIE values:

- // 0: Code point has no whole script confusables.

- // 1: Code point is of script Common or Inherited.

- // These code points do not participate in whole script confusable detection.

- // (This is logically equivalent to saying that they contain confusables in

- // all scripts)

- //

- // Because Trie values are indexes into the ScriptSets vector, pre-fill

- // vector positions 0 and 1 to avoid conflicts with the reserved values.

- scriptSets = new UVector(status);

- if (scriptSets == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- goto cleanup;

- }

- scriptSets->addElement((void *)NULL, status);

- // Convert the user input data from UTF-8 to UChar (UTF-16)

- u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);

- if (status != U_BUFFER_OVERFLOW_ERROR) {

- goto cleanup;

- }

- status = U_ZERO_ERROR;

- input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));

- if (input == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- goto cleanup;

- }

- u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

- parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

- // Zap any Byte Order Mark at the start of input. Changing it to a space is benign

- // given the syntax of the input.

- if (*input == 0xfeff) {

- *input = 0x20;

- }

- // Parse the input, one line per iteration of this loop.

- uregex_setText(parseRegexp, input, inputLen, &status);

- while (uregex_findNext(parseRegexp, &status)) {

- lineNum++;

- if (uregex_start(parseRegexp, 1, &status) >= 0) {

- // this was a blank or comment line.

- continue;

- }

- if (uregex_start(parseRegexp, 8, &status) >= 0) {

- // input file syntax error.

- status = U_PARSE_ERROR;

- goto cleanup;

- }

- if (U_FAILURE(status)) {

- goto cleanup;

- }

- // Pick up the start and optional range end code points from the parsed line.

- UChar32 startCodePoint = SpoofImpl::ScanHex(

- input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);

- UChar32 endCodePoint = startCodePoint;

- if (uregex_start(parseRegexp, 3, &status) >=0) {

- endCodePoint = SpoofImpl::ScanHex(

- input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);

- }

- // Extract the two script names from the source line. We need these in an 8 bit

- // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on

- // to the ICU u_getPropertyValueEnum() function. Ugh.

- char srcScriptName[20];

- char targScriptName[20];

- extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);

- extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);

- UScriptCode srcScript =

- static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));

- UScriptCode targScript =

- static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));

- if (U_FAILURE(status)) {

- goto cleanup;

- }

- if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {

- status = U_INVALID_FORMAT_ERROR;

- goto cleanup;

- }

- // select the table - (A) any case or (L) lower case only

- UTrie2 *table = anyCaseTrie;

- if (uregex_start(parseRegexp, 7, &status) >= 0) {

- table = lowerCaseTrie;

- }

- // Build the set of scripts containing confusable characters for

- // the code point(s) specified in this input line.

- // Sanity check that the script of the source code point is the same

- // as the source script indicated in the input file. Failure of this check is

- // an error in the input file.

- // Include the source script in the set (needed for Mixed Script Confusable detection).

- //

- UChar32 cp;

- for (cp=startCodePoint; cp<=endCodePoint; cp++) {

- int32_t setIndex = utrie2_get32(table, cp);

- BuilderScriptSet *bsset = NULL;

- if (setIndex > 0) {

- U_ASSERT(setIndex < scriptSets->size());

- bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));

- } else {

- bsset = new BuilderScriptSet();

- if (bsset == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- goto cleanup;

- }

- bsset->codePoint = cp;

- bsset->trie = table;

- bsset->sset = new ScriptSet();

- setIndex = scriptSets->size();

- bsset->index = setIndex;

- bsset->rindex = 0;

- if (bsset->sset == NULL) {

- status = U_MEMORY_ALLOCATION_ERROR;

- goto cleanup;

- }

- scriptSets->addElement(bsset, status);

- utrie2_set32(table, cp, setIndex, &status);

- }

- bsset->sset->set(targScript, status);

- bsset->sset->set(srcScript, status);

- if (U_FAILURE(status)) {

- goto cleanup;

- }

- UScriptCode cpScript = uscript_getScript(cp, &status);

- if (cpScript != srcScript) {

- status = U_INVALID_FORMAT_ERROR;

- goto cleanup;

- }

- // Eliminate duplicate script sets. At this point we have a separate

- // script set for every code point that had data in the input file.

- //

- // We eliminate underlying ScriptSet objects, not the BuilderScriptSets that wrap them

- //

- // printf("Number of scriptSets: %d\n", scriptSets->size());

- {

- int32_t duplicateCount = 0;

- rtScriptSetsCount = 2;

- for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {

- BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));

- if (outerSet->index != static_cast<uint32_t>(outeri)) {

- // This set was already identified as a duplicate.

- // It will not be allocated a position in the runtime array of ScriptSets.

- continue;

- }

- outerSet->rindex = rtScriptSetsCount++;

- for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {

- BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));

- if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {

- delete innerSet->sset;

- innerSet->scriptSetOwned = FALSE;

- innerSet->sset = outerSet->sset;

- innerSet->index = outeri;

- innerSet->rindex = outerSet->rindex;

- duplicateCount++;

- }

- // But this doesn't get all. We need to fix the TRIE.

- }

- // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);

- }

- // Update the Trie values to reflect the run time script indexes (after duplicate merging).

- // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets

- // are unused, which is why the loop index starts at 2.)

- {

- for (int32_t i=2; i<scriptSets->size(); i++) {

- BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));

- if (bSet->rindex != (uint32_t)i) {

- utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);

- }

- // For code points with script==Common or script==Inherited,

- // Set the reserved value of 1 into both Tries. These characters do not participate

- // in Whole Script Confusable detection; this reserved value is the means

- // by which they are detected.

- {

- UnicodeSet ignoreSet;

- ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);

- UnicodeSet inheritedSet;

- inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);

- ignoreSet.addAll(inheritedSet);

- for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {

- UChar32 rangeStart = ignoreSet.getRangeStart(rn);

- UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);

- utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);

- utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);

- }

- // Serialize the data to the Spoof Detector

- {

- utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);

- int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);

- // printf("Any case Trie size: %d\n", size);

- if (status != U_BUFFER_OVERFLOW_ERROR) {

- goto cleanup;

- }

- status = U_ZERO_ERROR;

- spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;

- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;

- spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;

- void *where = spImpl->fSpoofData->reserveSpace(size, status);

- utrie2_serialize(anyCaseTrie, where, size, &status);

- utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);

- size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);

- // printf("Lower case Trie size: %d\n", size);

- if (status != U_BUFFER_OVERFLOW_ERROR) {

- goto cleanup;

- }

- status = U_ZERO_ERROR;

- spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;

- spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;

- spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;

- where = spImpl->fSpoofData->reserveSpace(size, status);

- utrie2_serialize(lowerCaseTrie, where, size, &status);

- spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;

- spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;

- ScriptSet *rtScriptSets = static_cast<ScriptSet *>

- (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));

- uint32_t rindex = 2;

- for (int32_t i=2; i<scriptSets->size(); i++) {

- BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));

- if (bSet->rindex < rindex) {

- // We have already copied this script set to the serialized data.

- continue;

- }

- U_ASSERT(rindex == bSet->rindex);

- rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.

- rindex++;

- }

- // Open new utrie2s from the serialized data. We don't want to keep the ones

- // we just built because we would then have two copies of the data, one internal to

- // the utries that we have already constructed, and one in the serialized data area.

- // An alternative would be to not pre-serialize the Trie data, but that makes the

- // spoof detector data different, depending on how the detector was constructed.

- // It's simpler to keep the data always the same.

- spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(

- UTRIE2_16_VALUE_BITS,

- (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,

- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,

- NULL,

- &status);

- spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(

- UTRIE2_16_VALUE_BITS,

- (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,

- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,

- NULL,

- &status);

-cleanup:

- if (U_FAILURE(status)) {

- pe->line = lineNum;

- }

- uregex_close(parseRegexp);

- uprv_free(input);

- int32_t i;

- if (scriptSets != NULL) {

- for (i=0; i<scriptSets->size(); i++) {

- BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));

- delete bsset;

- }

- delete scriptSets;

- }

- utrie2_close(anyCaseTrie);

- utrie2_close(lowerCaseTrie);

- return;

-U_NAMESPACE_END

-BuilderScriptSet::BuilderScriptSet() {

- codePoint = -1;

- trie = NULL;

- sset = NULL;

- index = 0;

- rindex = 0;

- scriptSetOwned = TRUE;

-BuilderScriptSet::~BuilderScriptSet() {

- if (scriptSetOwned) {

- delete sset;

- }

-#endif

-#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

« no previous file with comments | « source/i18n/uspoof_wsconf.h ('k') | source/i18n/usrchimp.h » ('j') | no next file with comments »