diff options
author | Mike Hommey <glandium@debian.org> | 2006-01-06 18:28:17 +0100 |
---|---|---|
committer | Mike Hommey <glandium@debian.org> | 2006-01-06 18:28:17 +0100 |
commit | 0fd83af441e251fc23fc1af7959fd6ecfa105fe1 (patch) | |
tree | a2b35749a66abce02e6f07983ef50618d93bef58 /xmlregexp.c | |
parent | 17049f05f9ef09b3dc2a9c5d1de3f21de7c03193 (diff) | |
download | libxml2-0fd83af441e251fc23fc1af7959fd6ecfa105fe1.tar.gz |
Load /tmp/tmp.U9vXwU/libxml2-2.6.23 intoupstream/2.6.23
local/libxml2/branches/upstream/current.
Diffstat (limited to 'xmlregexp.c')
-rw-r--r-- | xmlregexp.c | 517 |
1 files changed, 426 insertions, 91 deletions
diff --git a/xmlregexp.c b/xmlregexp.c index 45b917b..de581f0 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -42,6 +42,8 @@ /* #define DEBUG_PUSH */ /* #define DEBUG_COMPACTION */ +#define MAX_PUSH 10000000 + #define ERROR(str) \ ctxt->error = XML_REGEXP_COMPILE_ERROR; \ xmlRegexpErrCompile(ctxt, str); @@ -73,20 +75,20 @@ typedef enum { XML_REGEXP_EPSILON = 1, XML_REGEXP_CHARVAL, XML_REGEXP_RANGES, - XML_REGEXP_SUBREG, + XML_REGEXP_SUBREG, /* used for () sub regexps */ XML_REGEXP_STRING, XML_REGEXP_ANYCHAR, /* . */ XML_REGEXP_ANYSPACE, /* \s */ XML_REGEXP_NOTSPACE, /* \S */ XML_REGEXP_INITNAME, /* \l */ - XML_REGEXP_NOTINITNAME, /* \l */ + XML_REGEXP_NOTINITNAME, /* \L */ XML_REGEXP_NAMECHAR, /* \c */ XML_REGEXP_NOTNAMECHAR, /* \C */ XML_REGEXP_DECIMAL, /* \d */ - XML_REGEXP_NOTDECIMAL, /* \d */ + XML_REGEXP_NOTDECIMAL, /* \D */ XML_REGEXP_REALCHAR, /* \w */ - XML_REGEXP_NOTREALCHAR, /* \w */ - XML_REGEXP_LETTER, + XML_REGEXP_NOTREALCHAR, /* \W */ + XML_REGEXP_LETTER = 100, XML_REGEXP_LETTER_UPPERCASE, XML_REGEXP_LETTER_LOWERCASE, XML_REGEXP_LETTER_TITLECASE, @@ -201,6 +203,7 @@ struct _xmlRegTrans { int to; int counter; int count; + int nd; }; struct _xmlAutomataState { @@ -326,6 +329,7 @@ struct _xmlRegExecCtxt { xmlRegStatePtr errState; /* the error state */ xmlChar *errString; /* the string raising the error */ int *errCounts; /* counters at the error state */ + int nbPush; }; #define REGEXP_ALL_COUNTER 0x123456 @@ -335,6 +339,9 @@ static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); static void xmlRegFreeState(xmlRegStatePtr state); static void xmlRegFreeAtom(xmlRegAtomPtr atom); static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); +static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); +static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, + int neg, int start, int end, const xmlChar *blockName); /************************************************************************ * * @@ -417,6 +424,9 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { ret->nbCounters = ctxt->nbCounters; ret->counters = ctxt->counters; ret->determinist = ctxt->determinist; + if (ret->determinist == -1) { + xmlRegexpIsDeterminist(ret); + } if ((ret->determinist != 0) && (ret->nbCounters == 0) && @@ -569,7 +579,6 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { i, j, trans->atom->no, trans->to, atomno, targetno); printf(" previous to is %d\n", prev); #endif - ret->determinist = 0; if (transdata != NULL) xmlFree(transdata); xmlFree(transitions); @@ -1016,6 +1025,12 @@ xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { fprintf(output, "removed\n"); return; } + if (trans->nd != 0) { + if (trans->nd == 2) + fprintf(output, "last not determinist, "); + else + fprintf(output, "not determinist, "); + } if (trans->counter >= 0) { fprintf(output, "counted %d, ", trans->counter); } @@ -1235,7 +1250,7 @@ xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, static void xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, xmlRegAtomPtr atom, xmlRegStatePtr target, - int counter, int count, int nchk) { + int counter, int count) { int nrtrans; @@ -1253,19 +1268,17 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, * so, silently ignore this request. */ - if (nchk == 0) { - for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { - xmlRegTransPtr trans = &(state->trans[nrtrans]); - if ((trans->atom == atom) && - (trans->to == target->no) && - (trans->counter == counter) && - (trans->count == count)) { + for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { + xmlRegTransPtr trans = &(state->trans[nrtrans]); + if ((trans->atom == atom) && + (trans->to == target->no) && + (trans->counter == counter) && + (trans->count == count)) { #ifdef DEBUG_REGEXP_GRAPH - printf("Ignoring duplicate transition from %d to %d\n", - state->no, target->no); + printf("Ignoring duplicate transition from %d to %d\n", + state->no, target->no); #endif - return; - } + return; } } @@ -1308,6 +1321,7 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, state->trans[state->nbTrans].to = target->no; state->trans[state->nbTrans].counter = counter; state->trans[state->nbTrans].count = count; + state->trans[state->nbTrans].nd = 0; state->nbTrans++; xmlRegStateAddTransTo(ctxt, target, state->no); } @@ -1359,9 +1373,9 @@ xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, ctxt->state = to; } if (lax) - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER, 0); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); else - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER, 0); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); } /** @@ -1379,7 +1393,7 @@ xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, xmlRegStatePush(ctxt, to); ctxt->state = to; } - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1, 0); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); } /** @@ -1398,7 +1412,7 @@ xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, xmlRegStatePush(ctxt, to); ctxt->state = to; } - xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1, 0); + xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); } /** @@ -1417,7 +1431,7 @@ xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, xmlRegStatePush(ctxt, to); ctxt->state = to; } - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter, 0); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); } /** @@ -1450,6 +1464,14 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, * Generate an epsilon transition to link to the target */ xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); +#ifdef DV + } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && + (atom->quant != XML_REGEXP_QUANT_ONCE)) { + to = xmlRegNewState(ctxt); + xmlRegStatePush(ctxt, to); + ctxt->state = to; + xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); +#endif } switch (atom->quant) { case XML_REGEXP_QUANT_OPT: @@ -1504,8 +1526,8 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, } else { newstate = xmlRegNewState(ctxt); xmlRegStatePush(ctxt, newstate); - ctxt->state = newstate; } + ctxt->state = newstate; xmlFAGenerateCountedTransition(ctxt, atom->stop, newstate, counter); } @@ -1513,7 +1535,8 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, break; } return(0); - } else if ((atom->min == 0) && (atom->max == 0) && + } + if ((atom->min == 0) && (atom->max == 0) && (atom->quant == XML_REGEXP_QUANT_RANGE)) { /* * we can discard the atom and generate an epsilon transition instead @@ -1530,21 +1553,20 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, ctxt->state = to; xmlRegFreeAtom(atom); return(0); - } else { - if (to == NULL) { - to = xmlRegNewState(ctxt); - if (to != NULL) - xmlRegStatePush(ctxt, to); - else { - return(-1); - } - } - if (xmlRegAtomPush(ctxt, atom) < 0) { + } + if (to == NULL) { + to = xmlRegNewState(ctxt); + if (to != NULL) + xmlRegStatePush(ctxt, to); + else { return(-1); } - xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1, 0); - ctxt->state = to; } + if (xmlRegAtomPush(ctxt, atom) < 0) { + return(-1); + } + xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); + ctxt->state = to; switch (atom->quant) { case XML_REGEXP_QUANT_OPT: atom->quant = XML_REGEXP_QUANT_ONCE; @@ -1553,11 +1575,11 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, case XML_REGEXP_QUANT_MULT: atom->quant = XML_REGEXP_QUANT_ONCE; xmlFAGenerateEpsilonTransition(ctxt, from, to); - xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0); + xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); break; case XML_REGEXP_QUANT_PLUS: atom->quant = XML_REGEXP_QUANT_ONCE; - xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0); + xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); break; default: break; @@ -1614,7 +1636,7 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, xmlRegStateAddTrans(ctxt, from, NULL, ctxt->states[newto], - -1, to->trans[transnr].count, 0); + -1, to->trans[transnr].count); } else { #ifdef DEBUG_REGEXP_GRAPH printf("Found epsilon trans %d from %d to %d\n", @@ -1637,10 +1659,10 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, if (to->trans[transnr].counter >= 0) { xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, ctxt->states[newto], - to->trans[transnr].counter, -1, 1); + to->trans[transnr].counter, -1); } else { xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, - ctxt->states[newto], counter, -1, 1); + ctxt->states[newto], counter, -1); } } } @@ -1869,12 +1891,175 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { } +static int +xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { + int ret = 0; + + if ((range1->type == XML_REGEXP_RANGES) || + (range2->type == XML_REGEXP_RANGES) || + (range2->type == XML_REGEXP_SUBREG) || + (range1->type == XML_REGEXP_SUBREG) || + (range1->type == XML_REGEXP_STRING) || + (range2->type == XML_REGEXP_STRING)) + return(-1); + + /* put them in order */ + if (range1->type > range2->type) { + xmlRegRangePtr tmp; + + tmp = range1; + range1 = range2; + range2 = tmp; + } + if ((range1->type == XML_REGEXP_ANYCHAR) || + (range2->type == XML_REGEXP_ANYCHAR)) { + ret = 1; + } else if ((range1->type == XML_REGEXP_EPSILON) || + (range2->type == XML_REGEXP_EPSILON)) { + return(0); + } else if (range1->type == range2->type) { + if ((range1->type != XML_REGEXP_CHARVAL) || + (range1->end < range2->start) || + (range2->end < range1->start)) + ret = 1; + else + ret = 0; + } else if (range1->type == XML_REGEXP_CHARVAL) { + int codepoint; + int neg = 0; + + /* + * just check all codepoints in the range for acceptance, + * this is usually way cheaper since done only once at + * compilation than testing over and over at runtime or + * pushing too many states when evaluating. + */ + if (((range1->neg == 0) && (range2->neg != 0)) || + ((range1->neg != 0) && (range2->neg == 0))) + neg = 1; + + for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { + ret = xmlRegCheckCharacterRange(range2->type, codepoint, + 0, range2->start, range2->end, + range2->blockName); + if (ret < 0) + return(-1); + if (((neg == 1) && (ret == 0)) || + ((neg == 0) && (ret == 1))) + return(1); + } + return(0); + } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || + (range2->type == XML_REGEXP_BLOCK_NAME)) { + if (range1->type == range2->type) { + ret = xmlStrEqual(range1->blockName, range2->blockName); + } else { + /* + * comparing a block range with anything else is way + * too costly, and maintining the table is like too much + * memory too, so let's force the automata to save state + * here. + */ + return(1); + } + } else if ((range1->type < XML_REGEXP_LETTER) || + (range2->type < XML_REGEXP_LETTER)) { + if ((range1->type == XML_REGEXP_ANYSPACE) && + (range2->type == XML_REGEXP_NOTSPACE)) + ret = 0; + else if ((range1->type == XML_REGEXP_INITNAME) && + (range2->type == XML_REGEXP_NOTINITNAME)) + ret = 0; + else if ((range1->type == XML_REGEXP_NAMECHAR) && + (range2->type == XML_REGEXP_NOTNAMECHAR)) + ret = 0; + else if ((range1->type == XML_REGEXP_DECIMAL) && + (range2->type == XML_REGEXP_NOTDECIMAL)) + ret = 0; + else if ((range1->type == XML_REGEXP_REALCHAR) && + (range2->type == XML_REGEXP_NOTREALCHAR)) + ret = 0; + else { + /* same thing to limit complexity */ + return(1); + } + } else { + ret = 0; + /* range1->type < range2->type here */ + switch (range1->type) { + case XML_REGEXP_LETTER: + /* all disjoint except in the subgroups */ + if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || + (range2->type == XML_REGEXP_LETTER_LOWERCASE) || + (range2->type == XML_REGEXP_LETTER_TITLECASE) || + (range2->type == XML_REGEXP_LETTER_MODIFIER) || + (range2->type == XML_REGEXP_LETTER_OTHERS)) + ret = 1; + break; + case XML_REGEXP_MARK: + if ((range2->type == XML_REGEXP_MARK_NONSPACING) || + (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || + (range2->type == XML_REGEXP_MARK_ENCLOSING)) + ret = 1; + break; + case XML_REGEXP_NUMBER: + if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || + (range2->type == XML_REGEXP_NUMBER_LETTER) || + (range2->type == XML_REGEXP_NUMBER_OTHERS)) + ret = 1; + break; + case XML_REGEXP_PUNCT: + if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || + (range2->type == XML_REGEXP_PUNCT_DASH) || + (range2->type == XML_REGEXP_PUNCT_OPEN) || + (range2->type == XML_REGEXP_PUNCT_CLOSE) || + (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || + (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || + (range2->type == XML_REGEXP_PUNCT_OTHERS)) + ret = 1; + break; + case XML_REGEXP_SEPAR: + if ((range2->type == XML_REGEXP_SEPAR_SPACE) || + (range2->type == XML_REGEXP_SEPAR_LINE) || + (range2->type == XML_REGEXP_SEPAR_PARA)) + ret = 1; + break; + case XML_REGEXP_SYMBOL: + if ((range2->type == XML_REGEXP_SYMBOL_MATH) || + (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || + (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || + (range2->type == XML_REGEXP_SYMBOL_OTHERS)) + ret = 1; + break; + case XML_REGEXP_OTHER: + if ((range2->type == XML_REGEXP_OTHER_CONTROL) || + (range2->type == XML_REGEXP_OTHER_FORMAT) || + (range2->type == XML_REGEXP_OTHER_PRIVATE)) + ret = 1; + break; + default: + if ((range2->type >= XML_REGEXP_LETTER) && + (range2->type < XML_REGEXP_BLOCK_NAME)) + ret = 0; + else { + /* safety net ! */ + return(1); + } + } + } + if (((range1->neg == 0) && (range2->neg != 0)) || + ((range1->neg != 0) && (range2->neg == 0))) + ret = !ret; + return(1); +} + /** * xmlFACompareAtoms: * @atom1: an atom * @atom2: an atom * - * Compares two atoms to check whether they are equivalents + * Compares two atoms to check whether they intersect in some ways, + * this is used by xmlFAComputesDeterminism only * * Returns 1 if yes and 0 otherwise */ @@ -1887,28 +2072,65 @@ xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { if ((atom1 == NULL) || (atom2 == NULL)) return(0); - if (atom1->type != atom2->type) + if ((atom1->type == XML_REGEXP_RANGES) && + (atom2->type == XML_REGEXP_CHARVAL)) { + } else if ((atom1->type == XML_REGEXP_CHARVAL) && + (atom2->type == XML_REGEXP_RANGES)) { + xmlRegAtomPtr tmp; + tmp = atom1; + atom1 = atom2; + atom2 = tmp; + } else if (atom1->type != atom2->type) { return(0); + } switch (atom1->type) { case XML_REGEXP_STRING: ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, (xmlChar *)atom2->valuep); break; case XML_REGEXP_EPSILON: - return(1); + goto not_determinist; case XML_REGEXP_CHARVAL: - ret = atom1->codepoint == atom2->codepoint; + ret = (atom1->codepoint == atom2->codepoint); break; case XML_REGEXP_RANGES: - TODO; - return(0); + if (atom2->type == XML_REGEXP_CHARVAL) { + ret = xmlRegCheckCharacter(atom1, atom2->codepoint); + if (ret < 0) + return(-1); + break; + } else { + int i, j, res; + xmlRegRangePtr r1, r2; + + /* + * need to check that none of the ranges eventually matches + */ + for (i = 0;i < atom1->nbRanges;i++) { + for (j = 0;j < atom2->nbRanges;j++) { + r1 = atom1->ranges[i]; + r2 = atom2->ranges[j]; + res = xmlFACompareRanges(r1, r2); + if (res == 1) { + ret = 1; + goto done; + } + } + } + ret = 0; + } + break; default: - return(1); + goto not_determinist; } +done: if (atom1->neg != atom2->neg) { ret = !ret; } - return(ret); + if (ret == 0) + return(0); +not_determinist: + return(1); } /** @@ -1923,12 +2145,18 @@ static int xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, int to, xmlRegAtomPtr atom) { int ret = 1; - int transnr; + int res; + int transnr, nbTrans; xmlRegTransPtr t1; if (state == NULL) return(ret); - for (transnr = 0;transnr < state->nbTrans;transnr++) { + /* + * don't recurse on transitions potentially added in the course of + * the elimination. + */ + nbTrans = state->nbTrans; + for (transnr = 0;transnr < nbTrans;transnr++) { t1 = &(state->trans[transnr]); /* * check transitions conflicting with the one looked at @@ -1936,16 +2164,21 @@ xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, if (t1->atom == NULL) { if (t1->to == -1) continue; - ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], + res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], to, atom); - if (ret == 0) - return(0); + if (res == 0) { + ret = 0; + /* t1->nd = 1; */ + } continue; } if (t1->to != to) continue; - if (xmlFACompareAtoms(t1->atom, atom)) - return(0); + if (xmlFACompareAtoms(t1->atom, atom)) { + ret = 0; + /* mark the transition as non-deterministic */ + t1->nd = 1; + } } return(ret); } @@ -1962,7 +2195,7 @@ static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { int statenr, transnr; xmlRegStatePtr state; - xmlRegTransPtr t1, t2; + xmlRegTransPtr t1, t2, last; int i; int ret = 1; @@ -1974,8 +2207,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { return(ctxt->determinist); /* - * Check for all states that there aren't 2 transitions - * with the same atom and a different target. + * First cleanup the automata removing cancelled transitions */ for (statenr = 0;statenr < ctxt->nbStates;statenr++) { state = ctxt->states[statenr]; @@ -1989,8 +2221,10 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { * Determinism checks in case of counted or all transitions * will have to be handled separately */ - if (t1->atom == NULL) + if (t1->atom == NULL) { + /* t1->nd = 1; */ continue; + } if (t1->to == -1) /* eliminated */ continue; for (i = 0;i < transnr;i++) { @@ -2001,10 +2235,46 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { if (t1->to == t2->to) { if (xmlFACompareAtoms(t1->atom, t2->atom)) t2->to = -1; /* eliminated */ - } else { - /* not determinist ! */ - if (xmlFACompareAtoms(t1->atom, t2->atom)) - ret = 0; + } + } + } + } + } + + /* + * Check for all states that there aren't 2 transitions + * with the same atom and a different target. + */ + for (statenr = 0;statenr < ctxt->nbStates;statenr++) { + state = ctxt->states[statenr]; + if (state == NULL) + continue; + if (state->nbTrans < 2) + continue; + last = NULL; + for (transnr = 0;transnr < state->nbTrans;transnr++) { + t1 = &(state->trans[transnr]); + /* + * Determinism checks in case of counted or all transitions + * will have to be handled separately + */ + if (t1->atom == NULL) { + continue; + } + if (t1->to == -1) /* eliminated */ + continue; + for (i = 0;i < transnr;i++) { + t2 = &(state->trans[i]); + if (t2->to == -1) /* eliminated */ + continue; + if (t2->atom != NULL) { + /* not determinist ! */ + if (xmlFACompareAtoms(t1->atom, t2->atom)) { + ret = 0; + /* mark the transitions as non-deterministic ones */ + t1->nd = 1; + t2->nd = 1; + last = t1; } } else if (t1->to != -1) { /* @@ -2013,16 +2283,39 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { */ ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], t2->to, t2->atom); + /* don't shortcut the computation so all non deterministic + transition get marked down if (ret == 0) return(0); + */ + if (ret == 0) { + t1->nd = 1; + /* t2->nd = 1; */ + last = t1; + } } } + /* don't shortcut the computation so all non deterministic + transition get marked down if (ret == 0) - break; + break; */ + } + + /* + * mark specifically the last non-deterministic transition + * from a state since there is no need to set-up rollback + * from it + */ + if (last != NULL) { + last->nd = 2; } + + /* don't shortcut the computation so all non deterministic + transition get marked down if (ret == 0) - break; + break; */ } + ctxt->determinist = ret; return(ret); } @@ -2336,6 +2629,12 @@ xmlFARegExecSave(xmlRegExecCtxtPtr exec) { xmlFARegDebugExec(exec); exec->transno--; #endif +#ifdef MAX_PUSH + if (exec->nbPush > MAX_PUSH) { + return; + } + exec->nbPush++; +#endif if (exec->maxRollbacks == 0) { exec->maxRollbacks = 4; @@ -2422,10 +2721,11 @@ static int xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { xmlRegExecCtxt execval; xmlRegExecCtxtPtr exec = &execval; - int ret, codepoint = 0, len; + int ret, codepoint = 0, len, deter; exec->inputString = content; exec->index = 0; + exec->nbPush = 0; exec->determinist = 1; exec->maxRollbacks = 0; exec->nbRollbacks = 0; @@ -2482,6 +2782,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { continue; atom = trans->atom; ret = 0; + deter = 1; if (trans->count >= 0) { int count; xmlRegCounterPtr counter; @@ -2497,6 +2798,8 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { trans->count, count, counter->min, counter->max); #endif ret = ((count >= counter->min) && (count <= counter->max)); + if ((ret) && (counter->min != counter->max)) + deter = 0; } else if (atom == NULL) { fprintf(stderr, "epsilon transition left at runtime\n"); exec->status = -2; @@ -2509,7 +2812,15 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { /* * this is a multiple input sequence + * If there is a counter associated increment it now. + * before potentially saving and rollback */ + if (trans->counter >= 0) { +#ifdef DEBUG_REGEXP_EXEC + printf("Increasing count %d\n", trans->counter); +#endif + exec->counts[trans->counter]++; + } if (exec->state->nbTrans > exec->transno + 1) { xmlFARegExecSave(exec); } @@ -2559,6 +2870,12 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { if (ret == 0) { goto rollback; } + if (trans->counter >= 0) { +#ifdef DEBUG_REGEXP_EXEC + printf("Decreasing count %d\n", trans->counter); +#endif + exec->counts[trans->counter]--; + } } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { /* * we don't match on the codepoint, but minOccurs of 0 @@ -2576,7 +2893,17 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { ret = 1; } if (ret == 1) { - if (exec->state->nbTrans > exec->transno + 1) { + if ((trans->nd == 1) || + ((trans->count >= 0) && (deter == 0) && + (exec->state->nbTrans > exec->transno + 1))) { +#ifdef DEBUG_REGEXP_EXEC + if (trans->nd == 1) + printf("Saving on nd transition atom %d for %c at %d\n", + trans->atom->no, codepoint, exec->index); + else + printf("Saving on counted transition count %d for %c at %d\n", + trans->count, codepoint, exec->index); +#endif xmlFARegExecSave(exec); } if (trans->counter >= 0) { @@ -2613,6 +2940,10 @@ rollback: * Failed to find a way out */ exec->determinist = 0; +#ifdef DEBUG_REGEXP_EXEC + printf("rollback from state %d on %d:%c\n", exec->state->no, + codepoint,codepoint); +#endif xmlFARegExecRollBack(exec); } progress: @@ -2632,8 +2963,11 @@ progress: xmlFree(exec->counts); if (exec->status == 0) return(1); - if (exec->status == -1) + if (exec->status == -1) { + if (exec->nbPush > MAX_PUSH) + return(-1); return(0); + } return(exec->status); } @@ -2708,6 +3042,7 @@ xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { exec->inputStack = NULL; exec->errStateNo = -1; exec->errString = NULL; + exec->nbPush = 0; return(exec); } @@ -3344,7 +3679,7 @@ xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, ret = xmlRegExecPushStringInternal(exec, str, data, 1); if (str != buf) - xmlFree(buf); + xmlFree(str); return(ret); } @@ -4784,11 +5119,11 @@ xmlNewAutomata(void) { /* initialize the parser */ ctxt->end = NULL; ctxt->start = ctxt->state = xmlRegNewState(ctxt); - ctxt->start->type = XML_REGEXP_START_STATE; if (ctxt->start == NULL) { xmlFreeAutomata(ctxt); return(NULL); } + ctxt->start->type = XML_REGEXP_START_STATE; if (xmlRegStatePush(ctxt, ctxt->start) < 0) { xmlRegFreeState(ctxt->start); xmlFreeAutomata(ctxt); @@ -5081,7 +5416,7 @@ xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); + xmlRegStateAddTrans(am, from, atom, to, counter, -1); xmlRegAtomPush(am, atom); am->state = to; @@ -5147,7 +5482,7 @@ xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); + xmlRegStateAddTrans(am, from, atom, to, counter, -1); xmlRegAtomPush(am, atom); am->state = to; @@ -5236,7 +5571,7 @@ xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); + xmlRegStateAddTrans(am, from, atom, to, counter, -1); xmlRegAtomPush(am, atom); am->state = to; return(to); @@ -5298,7 +5633,7 @@ xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); + xmlRegStateAddTrans(am, from, atom, to, counter, -1); xmlRegAtomPush(am, atom); am->state = to; return(to); @@ -6134,7 +6469,7 @@ tail: * xmlExpGetLanguage: * @ctxt: the expression context * @exp: the expression - * @list: where to store the tokens + * @langList: where to store the tokens * @len: the allocated lenght of @list * * Find all the strings used in @exp and store them in @list @@ -6144,10 +6479,10 @@ tail: */ int xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, - const xmlChar**list, int len) { - if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0)) + const xmlChar**langList, int len) { + if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) return(-1); - return(xmlExpGetLanguageInt(ctxt, exp, list, len, 0)); + return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); } static int @@ -6200,7 +6535,7 @@ tail: * xmlExpGetStart: * @ctxt: the expression context * @exp: the expression - * @list: where to store the tokens + * @tokList: where to store the tokens * @len: the allocated lenght of @list * * Find all the strings that appears at the start of the languages @@ -6212,10 +6547,10 @@ tail: */ int xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, - const xmlChar**list, int len) { - if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0)) + const xmlChar**tokList, int len) { + if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) return(-1); - return(xmlExpGetStartInt(ctxt, exp, list, len, 0)); + return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); } /** @@ -6861,10 +7196,10 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); while (len < 0) { const xmlChar **temp; - temp = (const xmlChar **) xmlRealloc(tab, ctxt->tabSize * 2 * + temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * sizeof(const xmlChar *)); if (temp == NULL) { - xmlFree(tab); + xmlFree((xmlChar **) tab); return(NULL); } tab = temp; @@ -6875,14 +7210,14 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); if ((tmp == NULL) || (tmp == forbiddenExp)) { xmlExpFree(ctxt, ret); - xmlFree(tab); + xmlFree((xmlChar **) tab); return(tmp); } tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { xmlExpFree(ctxt, tmp); xmlExpFree(ctxt, ret); - xmlFree(tab); + xmlFree((xmlChar **) tab); return(tmp); } tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); @@ -6891,7 +7226,7 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { xmlExpFree(ctxt, ret); - xmlFree(tab); + xmlFree((xmlChar **) tab); return(tmp3); } @@ -6900,12 +7235,12 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { else { ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); if (ret == NULL) { - xmlFree(tab); + xmlFree((xmlChar **) tab); return(NULL); } } } - xmlFree(tab); + xmlFree((xmlChar **) tab); return(ret); } |