| Index: source/i18n/uspoof_wsconf.cpp
|
| diff --git a/source/i18n/uspoof_wsconf.cpp b/source/i18n/uspoof_wsconf.cpp
|
| deleted file mode 100644
|
| index ca232834fa2a0be1daf4c481f5802c807849aa07..0000000000000000000000000000000000000000
|
| --- a/source/i18n/uspoof_wsconf.cpp
|
| +++ /dev/null
|
| @@ -1,436 +0,0 @@
|
| -/*
|
| -******************************************************************************
|
| -*
|
| -* Copyright (C) 2008-2013, International Business Machines
|
| -* Corporation and others. All Rights Reserved.
|
| -*
|
| -******************************************************************************
|
| -* file name: uspoof_wsconf.cpp
|
| -* encoding: US-ASCII
|
| -* tab size: 8 (not used)
|
| -* indentation:4
|
| -*
|
| -* created on: 2009Jan05 (refactoring earlier files)
|
| -* created by: Andy Heninger
|
| -*
|
| -* Internal functions for compililing Whole Script confusable source data
|
| -* into its binary (runtime) form. The binary data format is described
|
| -* in uspoof_impl.h
|
| -*/
|
| -
|
| -#include "unicode/utypes.h"
|
| -#include "unicode/uspoof.h"
|
| -
|
| -#if !UCONFIG_NO_NORMALIZATION
|
| -
|
| -#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
| -
|
| -#include "unicode/unorm.h"
|
| -#include "unicode/uregex.h"
|
| -#include "unicode/ustring.h"
|
| -#include "cmemory.h"
|
| -#include "scriptset.h"
|
| -#include "uspoof_impl.h"
|
| -#include "uhash.h"
|
| -#include "uvector.h"
|
| -#include "uassert.h"
|
| -#include "uspoof_wsconf.h"
|
| -
|
| -U_NAMESPACE_USE
|
| -
|
| -
|
| -// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
|
| -// Example Lines:
|
| -// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
|
| -// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
|
| -// | | | |
|
| -// | | | |---- Which table, Any Case or Lower Case (A or L)
|
| -// | | |----------Target script. We need this.
|
| -// | |----------------Src script. Should match the script of the source
|
| -// | code points. Beyond checking that, we don't keep it.
|
| -// |--------------------------------Source code points or range.
|
| -//
|
| -// The expression will match _all_ lines, including erroneous lines.
|
| -// The result of the parse is returned via the contents of the (match) groups.
|
| -static const char *parseExp =
|
| - "(?m)" // Multi-line mode
|
| - "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1.
|
| - "|^(?:" // OR
|
| - "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3.
|
| - "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4.
|
| - "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5.
|
| - "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7
|
| - "[ \\t]*(?:#.*?)?" // Trailing commment
|
| - ")$|" // OR
|
| - "^(.*?)$"; // An error line. Group 8.
|
| - // Any line not matching the preceding
|
| - // parts of the expression.will match
|
| - // this, and thus be flagged as an error
|
| -
|
| -
|
| -// Extract a regular expression match group into a char * string.
|
| -// The group must contain only invariant characters.
|
| -// Used for script names
|
| -//
|
| -static void extractGroup(
|
| - URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
|
| -
|
| - UChar ubuf[50];
|
| - ubuf[0] = 0;
|
| - destBuf[0] = 0;
|
| - int32_t len = uregex_group(e, group, ubuf, 50, &status);
|
| - if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
|
| - return;
|
| - }
|
| - UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
|
| - s.extract(0, len, destBuf, destCapacity, US_INV);
|
| -}
|
| -
|
| -
|
| -
|
| -U_NAMESPACE_BEGIN
|
| -
|
| -// Build the Whole Script Confusable data
|
| -//
|
| -// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
|
| -// because everything is local to this one build function anyhow,
|
| -// OR
|
| -// break this function into more reasonably sized pieces, with
|
| -// state in WSConfusableDataBuilder.
|
| -//
|
| -void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
| - int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
|
| -{
|
| - if (U_FAILURE(status)) {
|
| - return;
|
| - }
|
| - URegularExpression *parseRegexp = NULL;
|
| - int32_t inputLen = 0;
|
| - UChar *input = NULL;
|
| - int32_t lineNum = 0;
|
| -
|
| - UVector *scriptSets = NULL;
|
| - uint32_t rtScriptSetsCount = 2;
|
| -
|
| - UTrie2 *anyCaseTrie = NULL;
|
| - UTrie2 *lowerCaseTrie = NULL;
|
| -
|
| - anyCaseTrie = utrie2_open(0, 0, &status);
|
| - lowerCaseTrie = utrie2_open(0, 0, &status);
|
| -
|
| - UnicodeString pattern(parseExp, -1, US_INV);
|
| -
|
| - // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
|
| - //
|
| - // Reserved TRIE values:
|
| - // 0: Code point has no whole script confusables.
|
| - // 1: Code point is of script Common or Inherited.
|
| - // These code points do not participate in whole script confusable detection.
|
| - // (This is logically equivalent to saying that they contain confusables in
|
| - // all scripts)
|
| - //
|
| - // Because Trie values are indexes into the ScriptSets vector, pre-fill
|
| - // vector positions 0 and 1 to avoid conflicts with the reserved values.
|
| -
|
| - scriptSets = new UVector(status);
|
| - if (scriptSets == NULL) {
|
| - status = U_MEMORY_ALLOCATION_ERROR;
|
| - goto cleanup;
|
| - }
|
| - scriptSets->addElement((void *)NULL, status);
|
| - scriptSets->addElement((void *)NULL, status);
|
| -
|
| - // Convert the user input data from UTF-8 to UChar (UTF-16)
|
| - u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
|
| - if (status != U_BUFFER_OVERFLOW_ERROR) {
|
| - goto cleanup;
|
| - }
|
| - status = U_ZERO_ERROR;
|
| - input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
|
| - if (input == NULL) {
|
| - status = U_MEMORY_ALLOCATION_ERROR;
|
| - goto cleanup;
|
| - }
|
| - u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
|
| -
|
| - parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
|
| -
|
| - // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
|
| - // given the syntax of the input.
|
| - if (*input == 0xfeff) {
|
| - *input = 0x20;
|
| - }
|
| -
|
| - // Parse the input, one line per iteration of this loop.
|
| - uregex_setText(parseRegexp, input, inputLen, &status);
|
| - while (uregex_findNext(parseRegexp, &status)) {
|
| - lineNum++;
|
| - if (uregex_start(parseRegexp, 1, &status) >= 0) {
|
| - // this was a blank or comment line.
|
| - continue;
|
| - }
|
| - if (uregex_start(parseRegexp, 8, &status) >= 0) {
|
| - // input file syntax error.
|
| - status = U_PARSE_ERROR;
|
| - goto cleanup;
|
| - }
|
| - if (U_FAILURE(status)) {
|
| - goto cleanup;
|
| - }
|
| -
|
| - // Pick up the start and optional range end code points from the parsed line.
|
| - UChar32 startCodePoint = SpoofImpl::ScanHex(
|
| - input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
|
| - UChar32 endCodePoint = startCodePoint;
|
| - if (uregex_start(parseRegexp, 3, &status) >=0) {
|
| - endCodePoint = SpoofImpl::ScanHex(
|
| - input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
|
| - }
|
| -
|
| - // Extract the two script names from the source line. We need these in an 8 bit
|
| - // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
|
| - // to the ICU u_getPropertyValueEnum() function. Ugh.
|
| - char srcScriptName[20];
|
| - char targScriptName[20];
|
| - extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
|
| - extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
|
| - UScriptCode srcScript =
|
| - static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
|
| - UScriptCode targScript =
|
| - static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
|
| - if (U_FAILURE(status)) {
|
| - goto cleanup;
|
| - }
|
| - if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
|
| - status = U_INVALID_FORMAT_ERROR;
|
| - goto cleanup;
|
| - }
|
| -
|
| - // select the table - (A) any case or (L) lower case only
|
| - UTrie2 *table = anyCaseTrie;
|
| - if (uregex_start(parseRegexp, 7, &status) >= 0) {
|
| - table = lowerCaseTrie;
|
| - }
|
| -
|
| - // Build the set of scripts containing confusable characters for
|
| - // the code point(s) specified in this input line.
|
| - // Sanity check that the script of the source code point is the same
|
| - // as the source script indicated in the input file. Failure of this check is
|
| - // an error in the input file.
|
| - // Include the source script in the set (needed for Mixed Script Confusable detection).
|
| - //
|
| - UChar32 cp;
|
| - for (cp=startCodePoint; cp<=endCodePoint; cp++) {
|
| - int32_t setIndex = utrie2_get32(table, cp);
|
| - BuilderScriptSet *bsset = NULL;
|
| - if (setIndex > 0) {
|
| - U_ASSERT(setIndex < scriptSets->size());
|
| - bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
|
| - } else {
|
| - bsset = new BuilderScriptSet();
|
| - if (bsset == NULL) {
|
| - status = U_MEMORY_ALLOCATION_ERROR;
|
| - goto cleanup;
|
| - }
|
| - bsset->codePoint = cp;
|
| - bsset->trie = table;
|
| - bsset->sset = new ScriptSet();
|
| - setIndex = scriptSets->size();
|
| - bsset->index = setIndex;
|
| - bsset->rindex = 0;
|
| - if (bsset->sset == NULL) {
|
| - status = U_MEMORY_ALLOCATION_ERROR;
|
| - goto cleanup;
|
| - }
|
| - scriptSets->addElement(bsset, status);
|
| - utrie2_set32(table, cp, setIndex, &status);
|
| - }
|
| - bsset->sset->set(targScript, status);
|
| - bsset->sset->set(srcScript, status);
|
| -
|
| - if (U_FAILURE(status)) {
|
| - goto cleanup;
|
| - }
|
| - UScriptCode cpScript = uscript_getScript(cp, &status);
|
| - if (cpScript != srcScript) {
|
| - status = U_INVALID_FORMAT_ERROR;
|
| - goto cleanup;
|
| - }
|
| - }
|
| - }
|
| -
|
| - // Eliminate duplicate script sets. At this point we have a separate
|
| - // script set for every code point that had data in the input file.
|
| - //
|
| - // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
|
| - //
|
| - // printf("Number of scriptSets: %d\n", scriptSets->size());
|
| - {
|
| - int32_t duplicateCount = 0;
|
| - rtScriptSetsCount = 2;
|
| - for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
|
| - BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
|
| - if (outerSet->index != static_cast<uint32_t>(outeri)) {
|
| - // This set was already identified as a duplicate.
|
| - // It will not be allocated a position in the runtime array of ScriptSets.
|
| - continue;
|
| - }
|
| - outerSet->rindex = rtScriptSetsCount++;
|
| - for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
|
| - BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
|
| - if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
|
| - delete innerSet->sset;
|
| - innerSet->scriptSetOwned = FALSE;
|
| - innerSet->sset = outerSet->sset;
|
| - innerSet->index = outeri;
|
| - innerSet->rindex = outerSet->rindex;
|
| - duplicateCount++;
|
| - }
|
| - // But this doesn't get all. We need to fix the TRIE.
|
| - }
|
| - }
|
| - // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
|
| - }
|
| -
|
| -
|
| -
|
| - // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
|
| - // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
|
| - // are unused, which is why the loop index starts at 2.)
|
| - {
|
| - for (int32_t i=2; i<scriptSets->size(); i++) {
|
| - BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
| - if (bSet->rindex != (uint32_t)i) {
|
| - utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
|
| - }
|
| - }
|
| - }
|
| -
|
| - // For code points with script==Common or script==Inherited,
|
| - // Set the reserved value of 1 into both Tries. These characters do not participate
|
| - // in Whole Script Confusable detection; this reserved value is the means
|
| - // by which they are detected.
|
| - {
|
| - UnicodeSet ignoreSet;
|
| - ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
|
| - UnicodeSet inheritedSet;
|
| - inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
|
| - ignoreSet.addAll(inheritedSet);
|
| - for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
|
| - UChar32 rangeStart = ignoreSet.getRangeStart(rn);
|
| - UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
|
| - utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
| - utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
| - }
|
| - }
|
| -
|
| - // Serialize the data to the Spoof Detector
|
| - {
|
| - utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
| - int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
|
| - // printf("Any case Trie size: %d\n", size);
|
| - if (status != U_BUFFER_OVERFLOW_ERROR) {
|
| - goto cleanup;
|
| - }
|
| - status = U_ZERO_ERROR;
|
| - spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
|
| - spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
|
| - spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
|
| - void *where = spImpl->fSpoofData->reserveSpace(size, status);
|
| - utrie2_serialize(anyCaseTrie, where, size, &status);
|
| -
|
| - utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
| - size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
|
| - // printf("Lower case Trie size: %d\n", size);
|
| - if (status != U_BUFFER_OVERFLOW_ERROR) {
|
| - goto cleanup;
|
| - }
|
| - status = U_ZERO_ERROR;
|
| - spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
|
| - spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
|
| - spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
|
| - where = spImpl->fSpoofData->reserveSpace(size, status);
|
| - utrie2_serialize(lowerCaseTrie, where, size, &status);
|
| -
|
| - spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
|
| - spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
|
| - ScriptSet *rtScriptSets = static_cast<ScriptSet *>
|
| - (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
|
| - uint32_t rindex = 2;
|
| - for (int32_t i=2; i<scriptSets->size(); i++) {
|
| - BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
| - if (bSet->rindex < rindex) {
|
| - // We have already copied this script set to the serialized data.
|
| - continue;
|
| - }
|
| - U_ASSERT(rindex == bSet->rindex);
|
| - rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
|
| - rindex++;
|
| - }
|
| - }
|
| -
|
| - // Open new utrie2s from the serialized data. We don't want to keep the ones
|
| - // we just built because we would then have two copies of the data, one internal to
|
| - // the utries that we have already constructed, and one in the serialized data area.
|
| - // An alternative would be to not pre-serialize the Trie data, but that makes the
|
| - // spoof detector data different, depending on how the detector was constructed.
|
| - // It's simpler to keep the data always the same.
|
| -
|
| - spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
|
| - UTRIE2_16_VALUE_BITS,
|
| - (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
|
| - spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
| - NULL,
|
| - &status);
|
| -
|
| - spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
|
| - UTRIE2_16_VALUE_BITS,
|
| - (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
|
| - spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
| - NULL,
|
| - &status);
|
| -
|
| -
|
| -
|
| -cleanup:
|
| - if (U_FAILURE(status)) {
|
| - pe->line = lineNum;
|
| - }
|
| - uregex_close(parseRegexp);
|
| - uprv_free(input);
|
| -
|
| - int32_t i;
|
| - if (scriptSets != NULL) {
|
| - for (i=0; i<scriptSets->size(); i++) {
|
| - BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
| - delete bsset;
|
| - }
|
| - delete scriptSets;
|
| - }
|
| - utrie2_close(anyCaseTrie);
|
| - utrie2_close(lowerCaseTrie);
|
| - return;
|
| -}
|
| -
|
| -U_NAMESPACE_END
|
| -
|
| -
|
| -
|
| -BuilderScriptSet::BuilderScriptSet() {
|
| - codePoint = -1;
|
| - trie = NULL;
|
| - sset = NULL;
|
| - index = 0;
|
| - rindex = 0;
|
| - scriptSetOwned = TRUE;
|
| -}
|
| -
|
| -BuilderScriptSet::~BuilderScriptSet() {
|
| - if (scriptSetOwned) {
|
| - delete sset;
|
| - }
|
| -}
|
| -
|
| -#endif
|
| -#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
| -
|
|
|