Index: source/i18n/uspoof_wsconf.cpp |
diff --git a/source/i18n/uspoof_wsconf.cpp b/source/i18n/uspoof_wsconf.cpp |
deleted file mode 100644 |
index ca232834fa2a0be1daf4c481f5802c807849aa07..0000000000000000000000000000000000000000 |
--- a/source/i18n/uspoof_wsconf.cpp |
+++ /dev/null |
@@ -1,436 +0,0 @@ |
-/* |
-****************************************************************************** |
-* |
-* Copyright (C) 2008-2013, International Business Machines |
-* Corporation and others. All Rights Reserved. |
-* |
-****************************************************************************** |
-* file name: uspoof_wsconf.cpp |
-* encoding: US-ASCII |
-* tab size: 8 (not used) |
-* indentation:4 |
-* |
-* created on: 2009Jan05 (refactoring earlier files) |
-* created by: Andy Heninger |
-* |
-* Internal functions for compililing Whole Script confusable source data |
-* into its binary (runtime) form. The binary data format is described |
-* in uspoof_impl.h |
-*/ |
- |
-#include "unicode/utypes.h" |
-#include "unicode/uspoof.h" |
- |
-#if !UCONFIG_NO_NORMALIZATION |
- |
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS |
- |
-#include "unicode/unorm.h" |
-#include "unicode/uregex.h" |
-#include "unicode/ustring.h" |
-#include "cmemory.h" |
-#include "scriptset.h" |
-#include "uspoof_impl.h" |
-#include "uhash.h" |
-#include "uvector.h" |
-#include "uassert.h" |
-#include "uspoof_wsconf.h" |
- |
-U_NAMESPACE_USE |
- |
- |
-// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt |
-// Example Lines: |
-// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O |
-// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I |
-// | | | | |
-// | | | |---- Which table, Any Case or Lower Case (A or L) |
-// | | |----------Target script. We need this. |
-// | |----------------Src script. Should match the script of the source |
-// | code points. Beyond checking that, we don't keep it. |
-// |--------------------------------Source code points or range. |
-// |
-// The expression will match _all_ lines, including erroneous lines. |
-// The result of the parse is returned via the contents of the (match) groups. |
-static const char *parseExp = |
- "(?m)" // Multi-line mode |
- "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1. |
- "|^(?:" // OR |
- "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3. |
- "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4. |
- "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5. |
- "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7 |
- "[ \\t]*(?:#.*?)?" // Trailing commment |
- ")$|" // OR |
- "^(.*?)$"; // An error line. Group 8. |
- // Any line not matching the preceding |
- // parts of the expression.will match |
- // this, and thus be flagged as an error |
- |
- |
-// Extract a regular expression match group into a char * string. |
-// The group must contain only invariant characters. |
-// Used for script names |
-// |
-static void extractGroup( |
- URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { |
- |
- UChar ubuf[50]; |
- ubuf[0] = 0; |
- destBuf[0] = 0; |
- int32_t len = uregex_group(e, group, ubuf, 50, &status); |
- if (U_FAILURE(status) || len == -1 || len >= destCapacity) { |
- return; |
- } |
- UnicodeString s(FALSE, ubuf, len); // Aliasing constructor |
- s.extract(0, len, destBuf, destCapacity, US_INV); |
-} |
- |
- |
- |
-U_NAMESPACE_BEGIN |
- |
-// Build the Whole Script Confusable data |
-// |
-// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, |
-// because everything is local to this one build function anyhow, |
-// OR |
-// break this function into more reasonably sized pieces, with |
-// state in WSConfusableDataBuilder. |
-// |
-void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, |
- int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) |
-{ |
- if (U_FAILURE(status)) { |
- return; |
- } |
- URegularExpression *parseRegexp = NULL; |
- int32_t inputLen = 0; |
- UChar *input = NULL; |
- int32_t lineNum = 0; |
- |
- UVector *scriptSets = NULL; |
- uint32_t rtScriptSetsCount = 2; |
- |
- UTrie2 *anyCaseTrie = NULL; |
- UTrie2 *lowerCaseTrie = NULL; |
- |
- anyCaseTrie = utrie2_open(0, 0, &status); |
- lowerCaseTrie = utrie2_open(0, 0, &status); |
- |
- UnicodeString pattern(parseExp, -1, US_INV); |
- |
- // The scriptSets vector provides a mapping from TRIE values to the set of scripts. |
- // |
- // Reserved TRIE values: |
- // 0: Code point has no whole script confusables. |
- // 1: Code point is of script Common or Inherited. |
- // These code points do not participate in whole script confusable detection. |
- // (This is logically equivalent to saying that they contain confusables in |
- // all scripts) |
- // |
- // Because Trie values are indexes into the ScriptSets vector, pre-fill |
- // vector positions 0 and 1 to avoid conflicts with the reserved values. |
- |
- scriptSets = new UVector(status); |
- if (scriptSets == NULL) { |
- status = U_MEMORY_ALLOCATION_ERROR; |
- goto cleanup; |
- } |
- scriptSets->addElement((void *)NULL, status); |
- scriptSets->addElement((void *)NULL, status); |
- |
- // Convert the user input data from UTF-8 to UChar (UTF-16) |
- u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); |
- if (status != U_BUFFER_OVERFLOW_ERROR) { |
- goto cleanup; |
- } |
- status = U_ZERO_ERROR; |
- input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); |
- if (input == NULL) { |
- status = U_MEMORY_ALLOCATION_ERROR; |
- goto cleanup; |
- } |
- u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); |
- |
- parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); |
- |
- // Zap any Byte Order Mark at the start of input. Changing it to a space is benign |
- // given the syntax of the input. |
- if (*input == 0xfeff) { |
- *input = 0x20; |
- } |
- |
- // Parse the input, one line per iteration of this loop. |
- uregex_setText(parseRegexp, input, inputLen, &status); |
- while (uregex_findNext(parseRegexp, &status)) { |
- lineNum++; |
- if (uregex_start(parseRegexp, 1, &status) >= 0) { |
- // this was a blank or comment line. |
- continue; |
- } |
- if (uregex_start(parseRegexp, 8, &status) >= 0) { |
- // input file syntax error. |
- status = U_PARSE_ERROR; |
- goto cleanup; |
- } |
- if (U_FAILURE(status)) { |
- goto cleanup; |
- } |
- |
- // Pick up the start and optional range end code points from the parsed line. |
- UChar32 startCodePoint = SpoofImpl::ScanHex( |
- input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); |
- UChar32 endCodePoint = startCodePoint; |
- if (uregex_start(parseRegexp, 3, &status) >=0) { |
- endCodePoint = SpoofImpl::ScanHex( |
- input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); |
- } |
- |
- // Extract the two script names from the source line. We need these in an 8 bit |
- // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on |
- // to the ICU u_getPropertyValueEnum() function. Ugh. |
- char srcScriptName[20]; |
- char targScriptName[20]; |
- extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); |
- extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); |
- UScriptCode srcScript = |
- static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); |
- UScriptCode targScript = |
- static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); |
- if (U_FAILURE(status)) { |
- goto cleanup; |
- } |
- if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { |
- status = U_INVALID_FORMAT_ERROR; |
- goto cleanup; |
- } |
- |
- // select the table - (A) any case or (L) lower case only |
- UTrie2 *table = anyCaseTrie; |
- if (uregex_start(parseRegexp, 7, &status) >= 0) { |
- table = lowerCaseTrie; |
- } |
- |
- // Build the set of scripts containing confusable characters for |
- // the code point(s) specified in this input line. |
- // Sanity check that the script of the source code point is the same |
- // as the source script indicated in the input file. Failure of this check is |
- // an error in the input file. |
- // Include the source script in the set (needed for Mixed Script Confusable detection). |
- // |
- UChar32 cp; |
- for (cp=startCodePoint; cp<=endCodePoint; cp++) { |
- int32_t setIndex = utrie2_get32(table, cp); |
- BuilderScriptSet *bsset = NULL; |
- if (setIndex > 0) { |
- U_ASSERT(setIndex < scriptSets->size()); |
- bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); |
- } else { |
- bsset = new BuilderScriptSet(); |
- if (bsset == NULL) { |
- status = U_MEMORY_ALLOCATION_ERROR; |
- goto cleanup; |
- } |
- bsset->codePoint = cp; |
- bsset->trie = table; |
- bsset->sset = new ScriptSet(); |
- setIndex = scriptSets->size(); |
- bsset->index = setIndex; |
- bsset->rindex = 0; |
- if (bsset->sset == NULL) { |
- status = U_MEMORY_ALLOCATION_ERROR; |
- goto cleanup; |
- } |
- scriptSets->addElement(bsset, status); |
- utrie2_set32(table, cp, setIndex, &status); |
- } |
- bsset->sset->set(targScript, status); |
- bsset->sset->set(srcScript, status); |
- |
- if (U_FAILURE(status)) { |
- goto cleanup; |
- } |
- UScriptCode cpScript = uscript_getScript(cp, &status); |
- if (cpScript != srcScript) { |
- status = U_INVALID_FORMAT_ERROR; |
- goto cleanup; |
- } |
- } |
- } |
- |
- // Eliminate duplicate script sets. At this point we have a separate |
- // script set for every code point that had data in the input file. |
- // |
- // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them |
- // |
- // printf("Number of scriptSets: %d\n", scriptSets->size()); |
- { |
- int32_t duplicateCount = 0; |
- rtScriptSetsCount = 2; |
- for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { |
- BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); |
- if (outerSet->index != static_cast<uint32_t>(outeri)) { |
- // This set was already identified as a duplicate. |
- // It will not be allocated a position in the runtime array of ScriptSets. |
- continue; |
- } |
- outerSet->rindex = rtScriptSetsCount++; |
- for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { |
- BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); |
- if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { |
- delete innerSet->sset; |
- innerSet->scriptSetOwned = FALSE; |
- innerSet->sset = outerSet->sset; |
- innerSet->index = outeri; |
- innerSet->rindex = outerSet->rindex; |
- duplicateCount++; |
- } |
- // But this doesn't get all. We need to fix the TRIE. |
- } |
- } |
- // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); |
- } |
- |
- |
- |
- // Update the Trie values to be reflect the run time script indexes (after duplicate merging). |
- // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets |
- // are unused, which is why the loop index starts at 2.) |
- { |
- for (int32_t i=2; i<scriptSets->size(); i++) { |
- BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
- if (bSet->rindex != (uint32_t)i) { |
- utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); |
- } |
- } |
- } |
- |
- // For code points with script==Common or script==Inherited, |
- // Set the reserved value of 1 into both Tries. These characters do not participate |
- // in Whole Script Confusable detection; this reserved value is the means |
- // by which they are detected. |
- { |
- UnicodeSet ignoreSet; |
- ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); |
- UnicodeSet inheritedSet; |
- inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); |
- ignoreSet.addAll(inheritedSet); |
- for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { |
- UChar32 rangeStart = ignoreSet.getRangeStart(rn); |
- UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); |
- utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); |
- utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); |
- } |
- } |
- |
- // Serialize the data to the Spoof Detector |
- { |
- utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
- int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); |
- // printf("Any case Trie size: %d\n", size); |
- if (status != U_BUFFER_OVERFLOW_ERROR) { |
- goto cleanup; |
- } |
- status = U_ZERO_ERROR; |
- spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; |
- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; |
- spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; |
- void *where = spImpl->fSpoofData->reserveSpace(size, status); |
- utrie2_serialize(anyCaseTrie, where, size, &status); |
- |
- utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
- size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); |
- // printf("Lower case Trie size: %d\n", size); |
- if (status != U_BUFFER_OVERFLOW_ERROR) { |
- goto cleanup; |
- } |
- status = U_ZERO_ERROR; |
- spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; |
- spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; |
- spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; |
- where = spImpl->fSpoofData->reserveSpace(size, status); |
- utrie2_serialize(lowerCaseTrie, where, size, &status); |
- |
- spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; |
- spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; |
- ScriptSet *rtScriptSets = static_cast<ScriptSet *> |
- (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); |
- uint32_t rindex = 2; |
- for (int32_t i=2; i<scriptSets->size(); i++) { |
- BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
- if (bSet->rindex < rindex) { |
- // We have already copied this script set to the serialized data. |
- continue; |
- } |
- U_ASSERT(rindex == bSet->rindex); |
- rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. |
- rindex++; |
- } |
- } |
- |
- // Open new utrie2s from the serialized data. We don't want to keep the ones |
- // we just built because we would then have two copies of the data, one internal to |
- // the utries that we have already constructed, and one in the serialized data area. |
- // An alternative would be to not pre-serialize the Trie data, but that makes the |
- // spoof detector data different, depending on how the detector was constructed. |
- // It's simpler to keep the data always the same. |
- |
- spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( |
- UTRIE2_16_VALUE_BITS, |
- (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, |
- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
- NULL, |
- &status); |
- |
- spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( |
- UTRIE2_16_VALUE_BITS, |
- (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, |
- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
- NULL, |
- &status); |
- |
- |
- |
-cleanup: |
- if (U_FAILURE(status)) { |
- pe->line = lineNum; |
- } |
- uregex_close(parseRegexp); |
- uprv_free(input); |
- |
- int32_t i; |
- if (scriptSets != NULL) { |
- for (i=0; i<scriptSets->size(); i++) { |
- BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
- delete bsset; |
- } |
- delete scriptSets; |
- } |
- utrie2_close(anyCaseTrie); |
- utrie2_close(lowerCaseTrie); |
- return; |
-} |
- |
-U_NAMESPACE_END |
- |
- |
- |
-BuilderScriptSet::BuilderScriptSet() { |
- codePoint = -1; |
- trie = NULL; |
- sset = NULL; |
- index = 0; |
- rindex = 0; |
- scriptSetOwned = TRUE; |
-} |
- |
-BuilderScriptSet::~BuilderScriptSet() { |
- if (scriptSetOwned) { |
- delete sset; |
- } |
-} |
- |
-#endif |
-#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
- |