diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_compile.c | 180 |
1 files changed, 140 insertions, 40 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c index b197c1b54..54e23ea39 100644 --- a/ext/pcre/pcrelib/pcre_compile.c +++ b/ext/pcre/pcrelib/pcre_compile.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2010 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -51,10 +51,11 @@ supporting internal functions that are not used by other modules. */ #include "pcre_internal.h" -/* When DEBUG is defined, we need the pcre_printint() function, which is also -used by pcretest. DEBUG is not defined when building a production library. */ +/* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is +also used by pcretest. PCRE_DEBUG is not defined when building a production +library. */ -#ifdef DEBUG +#ifdef PCRE_DEBUG #include "pcre_printint.src" #endif @@ -89,6 +90,11 @@ is 4 there is plenty of room. */ #define COMPILE_WORK_SIZE (4096) +/* The overrun tests check for a slightly smaller size so that they detect the +overrun before it actually does run off the end of the data block. */ + +#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100) + /* Table for handling escaped characters in the range '0'-'z'. Positive returns are simple data values; negative values are for special things like \d and so @@ -260,7 +266,11 @@ the number of relocations needed when a shared library is loaded dynamically, it is now one long string. We cannot use a table of offsets, because the lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we simply count through to the one we want - this isn't a performance issue -because these strings are used only when there is a compilation error. */ +because these strings are used only when there is a compilation error. + +Each substring ends with \0 to insert a null character. This includes the final +substring, so that the whole string ends with \0\0, which can be detected when +counting through. */ static const char error_texts[] = "no error\0" @@ -341,8 +351,7 @@ static const char error_texts[] = "digit expected after (?+\0" "] is an invalid data character in JavaScript compatibility mode\0" /* 65 */ - "different names for subpatterns of the same number are not allowed"; - + "different names for subpatterns of the same number are not allowed\0"; /* Table to identify digits and hex digits. This is used when compiling patterns. Note that the tables in chartables are dependent on the locale, and @@ -500,7 +509,11 @@ static const char * find_error_text(int n) { const char *s = error_texts; -for (; n > 0; n--) while (*s++ != 0) {}; +for (; n > 0; n--) + { + while (*s++ != 0) {}; + if (*s == 0) return "Error text not found (please report)"; + } return s; } @@ -1440,6 +1453,7 @@ for (;;) case OP_CALLOUT: case OP_SOD: case OP_SOM: + case OP_SET_SOM: case OP_EOD: case OP_EODN: case OP_CIRC: @@ -1774,12 +1788,14 @@ Arguments: code points to start of search endcode points to where to stop utf8 TRUE if in UTF8 mode + cd contains pointers to tables etc. Returns: TRUE if what is matched could be empty */ static BOOL -could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) +could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8, + compile_data *cd) { register int c; for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); @@ -1810,6 +1826,28 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE continue; } + /* For a recursion/subroutine call, if its end has been reached, which + implies a subroutine call, we can scan it. */ + + if (c == OP_RECURSE) + { + BOOL empty_branch = FALSE; + const uschar *scode = cd->start_code + GET(code, 1); + if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ + do + { + if (could_be_empty_branch(scode, endcode, utf8, cd)) + { + empty_branch = TRUE; + break; + } + scode += GET(scode, 1); + } + while (*scode == OP_ALT); + if (!empty_branch) return FALSE; /* All branches are non-empty */ + continue; + } + /* For other groups, scan the branches. */ if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) @@ -1828,7 +1866,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE empty_branch = FALSE; do { - if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) + if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd)) empty_branch = TRUE; code += GET(code, 1); } @@ -1962,6 +2000,11 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f]; break; #endif + + /* None of the remaining opcodes are required to match a character. */ + + default: + break; } } @@ -1984,17 +2027,19 @@ Arguments: endcode points to where to stop (current RECURSE item) bcptr points to the chain of current (unclosed) branch starts utf8 TRUE if in UTF-8 mode + cd pointers to tables etc Returns: TRUE if what is matched could be empty */ static BOOL could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, - BOOL utf8) + BOOL utf8, compile_data *cd) { -while (bcptr != NULL && bcptr->current >= code) +while (bcptr != NULL && bcptr->current_branch >= code) { - if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; + if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd)) + return FALSE; bcptr = bcptr->outer; } return TRUE; @@ -2656,7 +2701,7 @@ BOOL utf8 = FALSE; uschar *utf8_char = NULL; #endif -#ifdef DEBUG +#ifdef PCRE_DEBUG if (lengthptr != NULL) DPRINTF((">> start branch\n")); #endif @@ -2715,10 +2760,10 @@ for (;; ptr++) if (lengthptr != NULL) { -#ifdef DEBUG +#ifdef PCRE_DEBUG if (code > cd->hwm) cd->hwm = code; /* High water info */ #endif - if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ + if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */ { *errorcodeptr = ERR52; goto FAILED; @@ -2767,7 +2812,7 @@ for (;; ptr++) /* In the real compile phase, just check the workspace used by the forward reference list. */ - else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) + else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK) { *errorcodeptr = ERR52; goto FAILED; @@ -4211,13 +4256,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { /* In the pre-compile phase, we don't actually do the replication. We just adjust the length as if we had. Do some paranoid checks for - potential integer overflow. */ + potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit + integer type when available, otherwise double. */ if (lengthptr != NULL) { int delta = (repeat_min - 1)*length_prevgroup; - if ((double)(repeat_min - 1)*(double)length_prevgroup > - (double)INT_MAX || + if ((INT64_OR_DOUBLE)(repeat_min - 1)* + (INT64_OR_DOUBLE)length_prevgroup > + (INT64_OR_DOUBLE)INT_MAX || OFLOW_MAX - *lengthptr < delta) { *errorcodeptr = ERR20; @@ -4263,15 +4310,16 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ just adjust the length as if we had. For each repetition we must add 1 to the length for BRAZERO and for all but the last repetition we must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some - paranoid checks to avoid integer overflow. */ + paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is + a 64-bit integer type when available, otherwise double. */ if (lengthptr != NULL && repeat_max > 0) { int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - 2 - 2*LINK_SIZE; /* Last one doesn't nest */ - if ((double)repeat_max * - (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - > (double)INT_MAX || + if ((INT64_OR_DOUBLE)repeat_max * + (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) + > (INT64_OR_DOUBLE)INT_MAX || OFLOW_MAX - *lengthptr < delta) { *errorcodeptr = ERR20; @@ -4348,7 +4396,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ uschar *scode = bracode; do { - if (could_be_empty_branch(scode, ketcode, utf8)) + if (could_be_empty_branch(scode, ketcode, utf8, cd)) { *bracode += OP_SBRA - OP_BRA; break; @@ -4423,7 +4471,12 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; + /* Because we are moving code along, we must ensure that any + pending recursive references are updated. */ + default: + *code = OP_END; + adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm); memmove(tempcode + 1+LINK_SIZE, tempcode, len); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -5142,6 +5195,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ *errorcodeptr = ERR15; goto FAILED; } + + /* Fudge the value of "called" so that when it is inserted as an + offset below, what it actually inserted is the reference number + of the group. */ + called = cd->start_code + recno; PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); } @@ -5151,7 +5209,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ recursion that could loop for ever, and diagnose that case. */ else if (GET(called, 1) == 0 && - could_be_empty(called, code, bcptr, utf8)) + could_be_empty(called, code, bcptr, utf8, cd)) { *errorcodeptr = ERR40; goto FAILED; @@ -5246,7 +5304,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { cd->external_options = newoptions; } - else + else { if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) { @@ -5583,6 +5641,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (-c >= ESC_REF) { + open_capitem *oc; recno = -c - ESC_REF; HANDLE_REFERENCE: /* Come here from named backref handling */ @@ -5592,6 +5651,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ PUT2INC(code, 0, recno); cd->backref_map |= (recno < 32)? (1 << recno) : 1; if (recno > cd->top_backref) cd->top_backref = recno; + + /* Check to see if this back reference is recursive, that it, it + is inside the group that it references. A flag is set so that the + group can be made atomic. */ + + for (oc = cd->open_caps; oc != NULL; oc = oc->next) + { + if (oc->number == recno) + { + oc->flag = TRUE; + break; + } + } } /* So are Unicode property matches, if supported. */ @@ -5781,10 +5853,11 @@ int branchfirstbyte, branchreqbyte; int length; int orig_bracount; int max_bracount; +int old_external_options = cd->external_options; branch_chain bc; bc.outer = bcptr; -bc.current = code; +bc.current_branch = code; firstbyte = reqbyte = REQ_UNSET; @@ -5803,13 +5876,15 @@ them global. It tests the value of length for (2 + 2*LINK_SIZE) in the pre-compile phase to find out whether anything has yet been compiled or not. */ /* If this is a capturing subpattern, add to the chain of open capturing items -so that we can detect them if (*ACCEPT) is encountered. */ +so that we can detect them if (*ACCEPT) is encountered. This is also used to +detect groups that contain recursive back references to themselves. */ if (*code == OP_CBRA) { capnumber = GET2(code, 1 + LINK_SIZE); capitem.number = capnumber; capitem.next = cd->open_caps; + capitem.flag = FALSE; cd->open_caps = &capitem; } @@ -5857,6 +5932,15 @@ for (;;) return FALSE; } + /* If the external options have changed during this branch, it means that we + are at the top level, and a leading option setting has been encountered. We + need to re-set the original option values to take account of this so that, + during the pre-compile phase, we know to allow for a re-set at the start of + subsequent branches. */ + + if (old_external_options != cd->external_options) + oldims = cd->external_options & PCRE_IMS; + /* Keep the highest bracket count in case (?| was used and some branch has fewer than the rest. */ @@ -5957,17 +6041,34 @@ for (;;) while (branch_length > 0); } - /* If it was a capturing subpattern, remove it from the chain. */ - - if (capnumber > 0) cd->open_caps = cd->open_caps->next; - /* Fill in the ket */ *code = OP_KET; PUT(code, 1, code - start_bracket); code += 1 + LINK_SIZE; - /* Resetting option if needed */ + /* If it was a capturing subpattern, check to see if it contained any + recursive back references. If so, we must wrap it in atomic brackets. + In any event, remove the block from the chain. */ + + if (capnumber > 0) + { + if (cd->open_caps->flag) + { + memmove(start_bracket + 1 + LINK_SIZE, start_bracket, + code - start_bracket); + *start_bracket = OP_ONCE; + code += 1 + LINK_SIZE; + PUT(start_bracket, 1, code - start_bracket); + *code = OP_KET; + PUT(code, 1, code - start_bracket); + code += 1 + LINK_SIZE; + length += 2 + 2*LINK_SIZE; + } + cd->open_caps = cd->open_caps->next; + } + + /* Reset options if needed. */ if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS) { @@ -6016,7 +6117,7 @@ for (;;) { *code = OP_ALT; PUT(code, 1, code - last_branch); - bc.current = last_branch = code; + bc.current_branch = last_branch = code; code += 1 + LINK_SIZE; } @@ -6432,7 +6533,7 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && - (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) + (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0) { errorcode = ERR44; goto PCRE_EARLY_ERROR_RETURN2; @@ -6629,7 +6730,7 @@ if debugging, leave the test till after things are printed out. */ *code++ = OP_END; -#ifndef DEBUG +#ifndef PCRE_DEBUG if (code - codestart > length) errorcode = ERR23; #endif @@ -6753,8 +6854,7 @@ if (reqbyte >= 0 && /* Print out the compiled data if debugging is enabled. This is never the case when building a production library. */ -#ifdef DEBUG - +#ifdef PCRE_DEBUG printf("Length = %d top_bracket = %d top_backref = %d\n", length, re->top_bracket, re->top_backref); @@ -6791,7 +6891,7 @@ if (code - codestart > length) if (errorcodeptr != NULL) *errorcodeptr = ERR23; return NULL; } -#endif /* DEBUG */ +#endif /* PCRE_DEBUG */ return (pcre *)re; } |