| Index: source/i18n/rematch.cpp
|
| diff --git a/source/i18n/rematch.cpp b/source/i18n/rematch.cpp
|
| index c7aeac015ff3e99e9496f3a247f0e44ccbf419da..0e795f216c23d5ed19abcb04934dda1263204ab0 100644
|
| --- a/source/i18n/rematch.cpp
|
| +++ b/source/i18n/rematch.cpp
|
| @@ -1,7 +1,9 @@
|
| +// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
| +// License & terms of use: http://www.unicode.org/copyright.html
|
| /*
|
| **************************************************************************
|
| -* Copyright (C) 2002-2015 International Business Machines Corporation *
|
| -* and others. All rights reserved. *
|
| +* Copyright (C) 2002-2016 International Business Machines Corporation
|
| +* and others. All rights reserved.
|
| **************************************************************************
|
| */
|
| //
|
| @@ -23,6 +25,7 @@
|
| #include "unicode/utf16.h"
|
| #include "uassert.h"
|
| #include "cmemory.h"
|
| +#include "cstr.h"
|
| #include "uvector.h"
|
| #include "uvectr32.h"
|
| #include "uvectr64.h"
|
| @@ -33,6 +36,7 @@
|
|
|
| // #include <malloc.h> // Needed for heapcheck testing
|
|
|
| +
|
| U_NAMESPACE_BEGIN
|
|
|
| // Default limit for the size of the back track stack, to avoid system
|
| @@ -237,7 +241,7 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
|
| return;
|
| }
|
|
|
| - if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
|
| + if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
|
| fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
|
| if (fData == NULL) {
|
| status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
| @@ -782,7 +786,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
|
| if (fMatch) {
|
| return TRUE;
|
| }
|
| - UTEXT_SETNATIVEINDEX(fInputText, pos);
|
| + UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
| }
|
| if (startPos > testStartLimit) {
|
| fMatch = FALSE;
|
| @@ -2723,6 +2727,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
|
| return (REStackFrame *)newFP;
|
| }
|
|
|
| +#if defined(REGEX_DEBUG)
|
| +namespace {
|
| +UnicodeString StringFromUText(UText *ut) {  // Debug-only helper (inside #if defined(REGEX_DEBUG)): returns the contents of ut as a UnicodeString for the trace printf calls.
|
| + UnicodeString result;
|
| + for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {  // Read code points starting at native index 0 and stop at U_SENTINEL.
|
| + result.append(c);
|
| + }
|
| + return result;
|
| +}
|
| +}
|
| +#endif // REGEX_DEBUG
|
| +
|
|
|
| //--------------------------------------------------------------------------------
|
| //
|
| @@ -2742,32 +2758,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
| int32_t opValue; // and the operand value.
|
|
|
| #ifdef REGEX_RUN_DEBUG
|
| - if (fTraceDebug)
|
| - {
|
| + if (fTraceDebug) {
|
| printf("MatchAt(startIdx=%ld)\n", startIdx);
|
| - printf("Original Pattern: ");
|
| - UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
| - while (c != U_SENTINEL) {
|
| - if (c<32 || c>256) {
|
| - c = '.';
|
| - }
|
| - printf("%c", c);
|
| -
|
| - c = UTEXT_NEXT32(fPattern->fPattern);
|
| - }
|
| - printf("\n");
|
| - printf("Input String: ");
|
| - c = utext_next32From(fInputText, 0);
|
| - while (c != U_SENTINEL) {
|
| - if (c<32 || c>256) {
|
| - c = '.';
|
| - }
|
| - printf("%c", c);
|
| -
|
| - c = UTEXT_NEXT32(fInputText);
|
| - }
|
| - printf("\n");
|
| - printf("\n");
|
| + printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
|
| + printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
|
| }
|
| #endif
|
|
|
| @@ -3936,28 +3930,38 @@ GC_Done:
|
| // of this op in the pattern.
|
| int32_t minML = (int32_t)pat[fp->fPatIdx++];
|
| int32_t maxML = (int32_t)pat[fp->fPatIdx++];
|
| + if (!UTEXT_USES_U16(fInputText)) {
|
| + // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
|
| + // The max length need not be exact; it just needs to be >= actual maximum.
|
| + maxML *= 3;
|
| + }
|
| U_ASSERT(minML <= maxML);
|
| U_ASSERT(minML >= 0);
|
|
|
| // Fetch (from data) the last input index where a match was attempted.
|
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
| - int64_t *lbStartIdx = &fData[opValue+2];
|
| - if (*lbStartIdx < 0) {
|
| + int64_t &lbStartIdx = fData[opValue+2];
|
| + if (lbStartIdx < 0) {
|
| // First time through loop.
|
| - *lbStartIdx = fp->fInputIdx - minML;
|
| + lbStartIdx = fp->fInputIdx - minML;
|
| + if (lbStartIdx > 0) {
|
| + // move index to a code point boundary, if it's not on one already.
|
| + UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
| + lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| + }
|
| } else {
|
| // 2nd through nth time through the loop.
|
| // Back up start position for match by one.
|
| - if (*lbStartIdx == 0) {
|
| - (*lbStartIdx)--;
|
| + if (lbStartIdx == 0) {
|
| + (lbStartIdx)--;
|
| } else {
|
| - UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
|
| + UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
| (void)UTEXT_PREVIOUS32(fInputText);
|
| - *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| + lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| }
|
| }
|
|
|
| - if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
| + if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
| // We have tried all potential match starting points without
|
| // getting a match. Backtrack out, and out of the
|
| // Look Behind altogether.
|
| @@ -3972,7 +3976,7 @@ GC_Done:
|
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
| // (successful match will fall off the end of the loop.)
|
| fp = StateSave(fp, fp->fPatIdx-3, status);
|
| - fp->fInputIdx = *lbStartIdx;
|
| + fp->fInputIdx = lbStartIdx;
|
| }
|
| break;
|
|
|
| @@ -4009,6 +4013,11 @@ GC_Done:
|
| // Fetch the extra parameters of this op.
|
| int32_t minML = (int32_t)pat[fp->fPatIdx++];
|
| int32_t maxML = (int32_t)pat[fp->fPatIdx++];
|
| + if (!UTEXT_USES_U16(fInputText)) {
|
| + // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
|
| + // The max length need not be exact; it just needs to be >= actual maximum.
|
| + maxML *= 3;
|
| + }
|
| int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
|
| continueLoc = URX_VAL(continueLoc);
|
| U_ASSERT(minML <= maxML);
|
| @@ -4017,23 +4026,28 @@ GC_Done:
|
|
|
| // Fetch (from data) the last input index where a match was attempted.
|
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
| - int64_t *lbStartIdx = &fData[opValue+2];
|
| - if (*lbStartIdx < 0) {
|
| + int64_t &lbStartIdx = fData[opValue+2];
|
| + if (lbStartIdx < 0) {
|
| // First time through loop.
|
| - *lbStartIdx = fp->fInputIdx - minML;
|
| + lbStartIdx = fp->fInputIdx - minML;
|
| + if (lbStartIdx > 0) {
|
| + // move index to a code point boundary, if it's not on one already.
|
| + UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
| + lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| + }
|
| } else {
|
| // 2nd through nth time through the loop.
|
| // Back up start position for match by one.
|
| - if (*lbStartIdx == 0) {
|
| - (*lbStartIdx)--;
|
| + if (lbStartIdx == 0) {
|
| + (lbStartIdx)--;
|
| } else {
|
| - UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
|
| + UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
| (void)UTEXT_PREVIOUS32(fInputText);
|
| - *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| + lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
| }
|
| }
|
|
|
| - if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
| + if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
| // We have tried all potential match starting points without
|
| // getting a match, which means that the negative lookbehind as
|
| // a whole has succeeded. Jump forward to the continue location
|
| @@ -4048,7 +4062,7 @@ GC_Done:
|
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
| // (successful match will cause a FAIL out of the loop altogether.)
|
| fp = StateSave(fp, fp->fPatIdx-4, status);
|
| - fp->fInputIdx = *lbStartIdx;
|
| + fp->fInputIdx = lbStartIdx;
|
| }
|
| break;
|
|
|
| @@ -4310,29 +4324,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
| #ifdef REGEX_RUN_DEBUG
|
| if (fTraceDebug) {
|
| printf("MatchAt(startIdx=%d)\n", startIdx);
|
| - printf("Original Pattern: ");
|
| - UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
| - while (c != U_SENTINEL) {
|
| - if (c<32 || c>256) {
|
| - c = '.';
|
| - }
|
| - printf("%c", c);
|
| -
|
| - c = UTEXT_NEXT32(fPattern->fPattern);
|
| - }
|
| - printf("\n");
|
| - printf("Input String: ");
|
| - c = utext_next32From(fInputText, 0);
|
| - while (c != U_SENTINEL) {
|
| - if (c<32 || c>256) {
|
| - c = '.';
|
| - }
|
| - printf("%c", c);
|
| -
|
| - c = UTEXT_NEXT32(fInputText);
|
| - }
|
| - printf("\n");
|
| - printf("\n");
|
| + printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
|
| + printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
|
| }
|
| #endif
|
|
|
| @@ -5232,6 +5225,12 @@ GC_Done:
|
| break;
|
| }
|
| }
|
| + if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
|
| + inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
|
| + // Capture group ended with an unpaired lead surrogate.
|
| + // Back reference is not permitted to match only the lead of a surrogate pair.
|
| + success = FALSE;
|
| + }
|
| if (success) {
|
| fp->fInputIdx = inputIndex;
|
| } else {
|
| @@ -5444,21 +5443,24 @@ GC_Done:
|
|
|
| // Fetch (from data) the last input index where a match was attempted.
|
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
| - int64_t *lbStartIdx = &fData[opValue+2];
|
| - if (*lbStartIdx < 0) {
|
| + int64_t &lbStartIdx = fData[opValue+2];
|
| + if (lbStartIdx < 0) {
|
| // First time through loop.
|
| - *lbStartIdx = fp->fInputIdx - minML;
|
| + lbStartIdx = fp->fInputIdx - minML;
|
| + if (lbStartIdx > 0) {
|
| + U16_SET_CP_START(inputBuf, 0, lbStartIdx);
|
| + }
|
| } else {
|
| // 2nd through nth time through the loop.
|
| // Back up start position for match by one.
|
| - if (*lbStartIdx == 0) {
|
| - (*lbStartIdx)--;
|
| + if (lbStartIdx == 0) {
|
| + lbStartIdx--;
|
| } else {
|
| - U16_BACK_1(inputBuf, 0, *lbStartIdx);
|
| + U16_BACK_1(inputBuf, 0, lbStartIdx);
|
| }
|
| }
|
|
|
| - if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
| + if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
| // We have tried all potential match starting points without
|
| // getting a match. Backtrack out, and out of the
|
| // Look Behind altogether.
|
| @@ -5473,7 +5475,7 @@ GC_Done:
|
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
| // (successful match will fall off the end of the loop.)
|
| fp = StateSave(fp, fp->fPatIdx-3, status);
|
| - fp->fInputIdx = *lbStartIdx;
|
| + fp->fInputIdx = lbStartIdx;
|
| }
|
| break;
|
|
|
| @@ -5518,21 +5520,24 @@ GC_Done:
|
|
|
| // Fetch (from data) the last input index where a match was attempted.
|
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
| - int64_t *lbStartIdx = &fData[opValue+2];
|
| - if (*lbStartIdx < 0) {
|
| + int64_t &lbStartIdx = fData[opValue+2];
|
| + if (lbStartIdx < 0) {
|
| // First time through loop.
|
| - *lbStartIdx = fp->fInputIdx - minML;
|
| + lbStartIdx = fp->fInputIdx - minML;
|
| + if (lbStartIdx > 0) {
|
| + U16_SET_CP_START(inputBuf, 0, lbStartIdx);
|
| + }
|
| } else {
|
| // 2nd through nth time through the loop.
|
| // Back up start position for match by one.
|
| - if (*lbStartIdx == 0) {
|
| - (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
|
| + if (lbStartIdx == 0) {
|
| + lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
|
| } else {
|
| - U16_BACK_1(inputBuf, 0, *lbStartIdx);
|
| + U16_BACK_1(inputBuf, 0, lbStartIdx);
|
| }
|
| }
|
|
|
| - if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
| + if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
| // We have tried all potential match starting points without
|
| // getting a match, which means that the negative lookbehind as
|
| // a whole has succeeded. Jump forward to the continue location
|
| @@ -5547,7 +5552,7 @@ GC_Done:
|
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
| // (successful match will cause a FAIL out of the loop altogether.)
|
| fp = StateSave(fp, fp->fPatIdx-4, status);
|
| - fp->fInputIdx = *lbStartIdx;
|
| + fp->fInputIdx = lbStartIdx;
|
| }
|
| break;
|
|
|
|
|