Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Unified Diff: source/i18n/uspoof_wsconf.cpp

Issue 2440913002: Update ICU to 58.1
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/i18n/uspoof_wsconf.h ('k') | source/i18n/usrchimp.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/i18n/uspoof_wsconf.cpp
diff --git a/source/i18n/uspoof_wsconf.cpp b/source/i18n/uspoof_wsconf.cpp
deleted file mode 100644
index ca232834fa2a0be1daf4c481f5802c807849aa07..0000000000000000000000000000000000000000
--- a/source/i18n/uspoof_wsconf.cpp
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
-******************************************************************************
-*
-* Copyright (C) 2008-2013, International Business Machines
-* Corporation and others. All Rights Reserved.
-*
-******************************************************************************
-* file name: uspoof_wsconf.cpp
-* encoding: US-ASCII
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2009Jan05 (refactoring earlier files)
-* created by: Andy Heninger
-*
-* Internal functions for compiling Whole Script confusable source data
-* into its binary (runtime) form. The binary data format is described
-* in uspoof_impl.h
-*/
-
-#include "unicode/utypes.h"
-#include "unicode/uspoof.h"
-
-#if !UCONFIG_NO_NORMALIZATION
-
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-
-#include "unicode/unorm.h"
-#include "unicode/uregex.h"
-#include "unicode/ustring.h"
-#include "cmemory.h"
-#include "scriptset.h"
-#include "uspoof_impl.h"
-#include "uhash.h"
-#include "uvector.h"
-#include "uassert.h"
-#include "uspoof_wsconf.h"
-
-U_NAMESPACE_USE
-
-
-// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
-// Example Lines:
-// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
-// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
-// | | | |
-// | | | |---- Which table, Any Case or Lower Case (A or L)
-// | | |----------Target script. We need this.
-// | |----------------Src script. Should match the script of the source
-// | code points. Beyond checking that, we don't keep it.
-// |--------------------------------Source code points or range.
-//
-// The expression will match _all_ lines, including erroneous lines.
-// The result of the parse is returned via the contents of the (match) groups.
-static const char *parseExp =
- "(?m)" // Multi-line mode
- "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1.
- "|^(?:" // OR
- "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3.
- "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4.
- "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5.
- "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7
- "[ \\t]*(?:#.*?)?" // Trailing commment
- ")$|" // OR
- "^(.*?)$"; // An error line. Group 8.
- // Any line not matching the preceding
- // parts of the expression will match
- // this, and thus be flagged as an error
-
-
-// Extract a regular expression match group into a char * string.
-// The group must contain only invariant characters.
-// Used for script names
-//
-static void extractGroup(
- URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
-
- UChar ubuf[50];
- ubuf[0] = 0;
- destBuf[0] = 0;
- int32_t len = uregex_group(e, group, ubuf, 50, &status);
- if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
- return;
- }
- UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
- s.extract(0, len, destBuf, destCapacity, US_INV);
-}
-
-
-
-U_NAMESPACE_BEGIN
-
-// Build the Whole Script Confusable data
-//
-// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
-// because everything is local to this one build function anyhow,
-// OR
-// break this function into more reasonably sized pieces, with
-// state in WSConfusableDataBuilder.
-//
-void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
- int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
-{
- if (U_FAILURE(status)) {
- return;
- }
- URegularExpression *parseRegexp = NULL;
- int32_t inputLen = 0;
- UChar *input = NULL;
- int32_t lineNum = 0;
-
- UVector *scriptSets = NULL;
- uint32_t rtScriptSetsCount = 2;
-
- UTrie2 *anyCaseTrie = NULL;
- UTrie2 *lowerCaseTrie = NULL;
-
- anyCaseTrie = utrie2_open(0, 0, &status);
- lowerCaseTrie = utrie2_open(0, 0, &status);
-
- UnicodeString pattern(parseExp, -1, US_INV);
-
- // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
- //
- // Reserved TRIE values:
- // 0: Code point has no whole script confusables.
- // 1: Code point is of script Common or Inherited.
- // These code points do not participate in whole script confusable detection.
- // (This is logically equivalent to saying that they contain confusables in
- // all scripts)
- //
- // Because Trie values are indexes into the ScriptSets vector, pre-fill
- // vector positions 0 and 1 to avoid conflicts with the reserved values.
-
- scriptSets = new UVector(status);
- if (scriptSets == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
- scriptSets->addElement((void *)NULL, status);
- scriptSets->addElement((void *)NULL, status);
-
- // Convert the user input data from UTF-8 to UChar (UTF-16)
- u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
- if (status != U_BUFFER_OVERFLOW_ERROR) {
- goto cleanup;
- }
- status = U_ZERO_ERROR;
- input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
- if (input == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
- u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
-
- parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
-
- // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
- // given the syntax of the input.
- if (*input == 0xfeff) {
- *input = 0x20;
- }
-
- // Parse the input, one line per iteration of this loop.
- uregex_setText(parseRegexp, input, inputLen, &status);
- while (uregex_findNext(parseRegexp, &status)) {
- lineNum++;
- if (uregex_start(parseRegexp, 1, &status) >= 0) {
- // this was a blank or comment line.
- continue;
- }
- if (uregex_start(parseRegexp, 8, &status) >= 0) {
- // input file syntax error.
- status = U_PARSE_ERROR;
- goto cleanup;
- }
- if (U_FAILURE(status)) {
- goto cleanup;
- }
-
- // Pick up the start and optional range end code points from the parsed line.
- UChar32 startCodePoint = SpoofImpl::ScanHex(
- input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
- UChar32 endCodePoint = startCodePoint;
- if (uregex_start(parseRegexp, 3, &status) >=0) {
- endCodePoint = SpoofImpl::ScanHex(
- input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
- }
-
- // Extract the two script names from the source line. We need these in an 8 bit
- // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
- // to the ICU u_getPropertyValueEnum() function. Ugh.
- char srcScriptName[20];
- char targScriptName[20];
- extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
- extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
- UScriptCode srcScript =
- static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
- UScriptCode targScript =
- static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
- if (U_FAILURE(status)) {
- goto cleanup;
- }
- if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
- status = U_INVALID_FORMAT_ERROR;
- goto cleanup;
- }
-
- // select the table - (A) any case or (L) lower case only
- UTrie2 *table = anyCaseTrie;
- if (uregex_start(parseRegexp, 7, &status) >= 0) {
- table = lowerCaseTrie;
- }
-
- // Build the set of scripts containing confusable characters for
- // the code point(s) specified in this input line.
- // Sanity check that the script of the source code point is the same
- // as the source script indicated in the input file. Failure of this check is
- // an error in the input file.
- // Include the source script in the set (needed for Mixed Script Confusable detection).
- //
- UChar32 cp;
- for (cp=startCodePoint; cp<=endCodePoint; cp++) {
- int32_t setIndex = utrie2_get32(table, cp);
- BuilderScriptSet *bsset = NULL;
- if (setIndex > 0) {
- U_ASSERT(setIndex < scriptSets->size());
- bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
- } else {
- bsset = new BuilderScriptSet();
- if (bsset == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
- bsset->codePoint = cp;
- bsset->trie = table;
- bsset->sset = new ScriptSet();
- setIndex = scriptSets->size();
- bsset->index = setIndex;
- bsset->rindex = 0;
- if (bsset->sset == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
- scriptSets->addElement(bsset, status);
- utrie2_set32(table, cp, setIndex, &status);
- }
- bsset->sset->set(targScript, status);
- bsset->sset->set(srcScript, status);
-
- if (U_FAILURE(status)) {
- goto cleanup;
- }
- UScriptCode cpScript = uscript_getScript(cp, &status);
- if (cpScript != srcScript) {
- status = U_INVALID_FORMAT_ERROR;
- goto cleanup;
- }
- }
- }
-
- // Eliminate duplicate script sets. At this point we have a separate
- // script set for every code point that had data in the input file.
- //
- // We eliminate underlying ScriptSet objects, not the BuilderScriptSets that wrap them
- //
- // printf("Number of scriptSets: %d\n", scriptSets->size());
- {
- int32_t duplicateCount = 0;
- rtScriptSetsCount = 2;
- for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
- BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
- if (outerSet->index != static_cast<uint32_t>(outeri)) {
- // This set was already identified as a duplicate.
- // It will not be allocated a position in the runtime array of ScriptSets.
- continue;
- }
- outerSet->rindex = rtScriptSetsCount++;
- for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
- BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
- if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
- delete innerSet->sset;
- innerSet->scriptSetOwned = FALSE;
- innerSet->sset = outerSet->sset;
- innerSet->index = outeri;
- innerSet->rindex = outerSet->rindex;
- duplicateCount++;
- }
- // But this doesn't get all. We need to fix the TRIE.
- }
- }
- // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
- }
-
-
-
- // Update the Trie values to reflect the run time script indexes (after duplicate merging).
- // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
- // are unused, which is why the loop index starts at 2.)
- {
- for (int32_t i=2; i<scriptSets->size(); i++) {
- BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
- if (bSet->rindex != (uint32_t)i) {
- utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
- }
- }
- }
-
- // For code points with script==Common or script==Inherited,
- // Set the reserved value of 1 into both Tries. These characters do not participate
- // in Whole Script Confusable detection; this reserved value is the means
- // by which they are detected.
- {
- UnicodeSet ignoreSet;
- ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
- UnicodeSet inheritedSet;
- inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
- ignoreSet.addAll(inheritedSet);
- for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
- UChar32 rangeStart = ignoreSet.getRangeStart(rn);
- UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
- utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
- utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
- }
- }
-
- // Serialize the data to the Spoof Detector
- {
- utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
- int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
- // printf("Any case Trie size: %d\n", size);
- if (status != U_BUFFER_OVERFLOW_ERROR) {
- goto cleanup;
- }
- status = U_ZERO_ERROR;
- spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
- spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
- void *where = spImpl->fSpoofData->reserveSpace(size, status);
- utrie2_serialize(anyCaseTrie, where, size, &status);
-
- utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
- size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
- // printf("Lower case Trie size: %d\n", size);
- if (status != U_BUFFER_OVERFLOW_ERROR) {
- goto cleanup;
- }
- status = U_ZERO_ERROR;
- spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
- spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
- spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
- where = spImpl->fSpoofData->reserveSpace(size, status);
- utrie2_serialize(lowerCaseTrie, where, size, &status);
-
- spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
- spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
- ScriptSet *rtScriptSets = static_cast<ScriptSet *>
- (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
- uint32_t rindex = 2;
- for (int32_t i=2; i<scriptSets->size(); i++) {
- BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
- if (bSet->rindex < rindex) {
- // We have already copied this script set to the serialized data.
- continue;
- }
- U_ASSERT(rindex == bSet->rindex);
- rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
- rindex++;
- }
- }
-
- // Open new utrie2s from the serialized data. We don't want to keep the ones
- // we just built because we would then have two copies of the data, one internal to
- // the utries that we have already constructed, and one in the serialized data area.
- // An alternative would be to not pre-serialize the Trie data, but that makes the
- // spoof detector data different, depending on how the detector was constructed.
- // It's simpler to keep the data always the same.
-
- spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
- UTRIE2_16_VALUE_BITS,
- (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
- NULL,
- &status);
-
- spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
- UTRIE2_16_VALUE_BITS,
- (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
- spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
- NULL,
- &status);
-
-
-
-cleanup:
- if (U_FAILURE(status)) {
- pe->line = lineNum;
- }
- uregex_close(parseRegexp);
- uprv_free(input);
-
- int32_t i;
- if (scriptSets != NULL) {
- for (i=0; i<scriptSets->size(); i++) {
- BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
- delete bsset;
- }
- delete scriptSets;
- }
- utrie2_close(anyCaseTrie);
- utrie2_close(lowerCaseTrie);
- return;
-}
-
-U_NAMESPACE_END
-
-
-
-BuilderScriptSet::BuilderScriptSet() {
- codePoint = -1;
- trie = NULL;
- sset = NULL;
- index = 0;
- rindex = 0;
- scriptSetOwned = TRUE;
-}
-
-BuilderScriptSet::~BuilderScriptSet() {
- if (scriptSetOwned) {
- delete sset;
- }
-}
-
-#endif
-#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
-
« no previous file with comments | « source/i18n/uspoof_wsconf.h ('k') | source/i18n/usrchimp.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698