summaryrefslogtreecommitdiff
path: root/xmlregexp.c
diff options
context:
space:
mode:
authorMike Hommey <glandium@debian.org>2006-01-06 18:28:17 +0100
committerMike Hommey <glandium@debian.org>2006-01-06 18:28:17 +0100
commit0fd83af441e251fc23fc1af7959fd6ecfa105fe1 (patch)
treea2b35749a66abce02e6f07983ef50618d93bef58 /xmlregexp.c
parent17049f05f9ef09b3dc2a9c5d1de3f21de7c03193 (diff)
downloadlibxml2-0fd83af441e251fc23fc1af7959fd6ecfa105fe1.tar.gz
Load /tmp/tmp.U9vXwU/libxml2-2.6.23 intoupstream/2.6.23
local/libxml2/branches/upstream/current.
Diffstat (limited to 'xmlregexp.c')
-rw-r--r--xmlregexp.c517
1 files changed, 426 insertions, 91 deletions
diff --git a/xmlregexp.c b/xmlregexp.c
index 45b917b..de581f0 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -42,6 +42,8 @@
/* #define DEBUG_PUSH */
/* #define DEBUG_COMPACTION */
+#define MAX_PUSH 10000000
+
#define ERROR(str) \
ctxt->error = XML_REGEXP_COMPILE_ERROR; \
xmlRegexpErrCompile(ctxt, str);
@@ -73,20 +75,20 @@ typedef enum {
XML_REGEXP_EPSILON = 1,
XML_REGEXP_CHARVAL,
XML_REGEXP_RANGES,
- XML_REGEXP_SUBREG,
+ XML_REGEXP_SUBREG, /* used for () sub regexps */
XML_REGEXP_STRING,
XML_REGEXP_ANYCHAR, /* . */
XML_REGEXP_ANYSPACE, /* \s */
XML_REGEXP_NOTSPACE, /* \S */
XML_REGEXP_INITNAME, /* \l */
- XML_REGEXP_NOTINITNAME, /* \l */
+ XML_REGEXP_NOTINITNAME, /* \L */
XML_REGEXP_NAMECHAR, /* \c */
XML_REGEXP_NOTNAMECHAR, /* \C */
XML_REGEXP_DECIMAL, /* \d */
- XML_REGEXP_NOTDECIMAL, /* \d */
+ XML_REGEXP_NOTDECIMAL, /* \D */
XML_REGEXP_REALCHAR, /* \w */
- XML_REGEXP_NOTREALCHAR, /* \w */
- XML_REGEXP_LETTER,
+ XML_REGEXP_NOTREALCHAR, /* \W */
+ XML_REGEXP_LETTER = 100,
XML_REGEXP_LETTER_UPPERCASE,
XML_REGEXP_LETTER_LOWERCASE,
XML_REGEXP_LETTER_TITLECASE,
@@ -201,6 +203,7 @@ struct _xmlRegTrans {
int to;
int counter;
int count;
+ int nd;
};
struct _xmlAutomataState {
@@ -326,6 +329,7 @@ struct _xmlRegExecCtxt {
xmlRegStatePtr errState; /* the error state */
xmlChar *errString; /* the string raising the error */
int *errCounts; /* counters at the error state */
+ int nbPush;
};
#define REGEXP_ALL_COUNTER 0x123456
@@ -335,6 +339,9 @@ static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
static void xmlRegFreeState(xmlRegStatePtr state);
static void xmlRegFreeAtom(xmlRegAtomPtr atom);
static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
+static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
+static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
+ int neg, int start, int end, const xmlChar *blockName);
/************************************************************************
* *
@@ -417,6 +424,9 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
ret->nbCounters = ctxt->nbCounters;
ret->counters = ctxt->counters;
ret->determinist = ctxt->determinist;
+ if (ret->determinist == -1) {
+ xmlRegexpIsDeterminist(ret);
+ }
if ((ret->determinist != 0) &&
(ret->nbCounters == 0) &&
@@ -569,7 +579,6 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
i, j, trans->atom->no, trans->to, atomno, targetno);
printf(" previous to is %d\n", prev);
#endif
- ret->determinist = 0;
if (transdata != NULL)
xmlFree(transdata);
xmlFree(transitions);
@@ -1016,6 +1025,12 @@ xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
fprintf(output, "removed\n");
return;
}
+ if (trans->nd != 0) {
+ if (trans->nd == 2)
+ fprintf(output, "last not determinist, ");
+ else
+ fprintf(output, "not determinist, ");
+ }
if (trans->counter >= 0) {
fprintf(output, "counted %d, ", trans->counter);
}
@@ -1235,7 +1250,7 @@ xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
xmlRegAtomPtr atom, xmlRegStatePtr target,
- int counter, int count, int nchk) {
+ int counter, int count) {
int nrtrans;
@@ -1253,19 +1268,17 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
* so, silently ignore this request.
*/
- if (nchk == 0) {
- for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
- xmlRegTransPtr trans = &(state->trans[nrtrans]);
- if ((trans->atom == atom) &&
- (trans->to == target->no) &&
- (trans->counter == counter) &&
- (trans->count == count)) {
+ for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
+ xmlRegTransPtr trans = &(state->trans[nrtrans]);
+ if ((trans->atom == atom) &&
+ (trans->to == target->no) &&
+ (trans->counter == counter) &&
+ (trans->count == count)) {
#ifdef DEBUG_REGEXP_GRAPH
- printf("Ignoring duplicate transition from %d to %d\n",
- state->no, target->no);
+ printf("Ignoring duplicate transition from %d to %d\n",
+ state->no, target->no);
#endif
- return;
- }
+ return;
}
}
@@ -1308,6 +1321,7 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
state->trans[state->nbTrans].to = target->no;
state->trans[state->nbTrans].counter = counter;
state->trans[state->nbTrans].count = count;
+ state->trans[state->nbTrans].nd = 0;
state->nbTrans++;
xmlRegStateAddTransTo(ctxt, target, state->no);
}
@@ -1359,9 +1373,9 @@ xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
ctxt->state = to;
}
if (lax)
- xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER, 0);
+ xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
else
- xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER, 0);
+ xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
}
/**
@@ -1379,7 +1393,7 @@ xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
xmlRegStatePush(ctxt, to);
ctxt->state = to;
}
- xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1, 0);
+ xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
}
/**
@@ -1398,7 +1412,7 @@ xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
xmlRegStatePush(ctxt, to);
ctxt->state = to;
}
- xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1, 0);
+ xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
}
/**
@@ -1417,7 +1431,7 @@ xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
xmlRegStatePush(ctxt, to);
ctxt->state = to;
}
- xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter, 0);
+ xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
}
/**
@@ -1450,6 +1464,14 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
* Generate an epsilon transition to link to the target
*/
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
+#ifdef DV
+ } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
+ (atom->quant != XML_REGEXP_QUANT_ONCE)) {
+ to = xmlRegNewState(ctxt);
+ xmlRegStatePush(ctxt, to);
+ ctxt->state = to;
+ xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
+#endif
}
switch (atom->quant) {
case XML_REGEXP_QUANT_OPT:
@@ -1504,8 +1526,8 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
} else {
newstate = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, newstate);
- ctxt->state = newstate;
}
+ ctxt->state = newstate;
xmlFAGenerateCountedTransition(ctxt, atom->stop,
newstate, counter);
}
@@ -1513,7 +1535,8 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
break;
}
return(0);
- } else if ((atom->min == 0) && (atom->max == 0) &&
+ }
+ if ((atom->min == 0) && (atom->max == 0) &&
(atom->quant == XML_REGEXP_QUANT_RANGE)) {
/*
* we can discard the atom and generate an epsilon transition instead
@@ -1530,21 +1553,20 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
ctxt->state = to;
xmlRegFreeAtom(atom);
return(0);
- } else {
- if (to == NULL) {
- to = xmlRegNewState(ctxt);
- if (to != NULL)
- xmlRegStatePush(ctxt, to);
- else {
- return(-1);
- }
- }
- if (xmlRegAtomPush(ctxt, atom) < 0) {
+ }
+ if (to == NULL) {
+ to = xmlRegNewState(ctxt);
+ if (to != NULL)
+ xmlRegStatePush(ctxt, to);
+ else {
return(-1);
}
- xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1, 0);
- ctxt->state = to;
}
+ if (xmlRegAtomPush(ctxt, atom) < 0) {
+ return(-1);
+ }
+ xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
+ ctxt->state = to;
switch (atom->quant) {
case XML_REGEXP_QUANT_OPT:
atom->quant = XML_REGEXP_QUANT_ONCE;
@@ -1553,11 +1575,11 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
case XML_REGEXP_QUANT_MULT:
atom->quant = XML_REGEXP_QUANT_ONCE;
xmlFAGenerateEpsilonTransition(ctxt, from, to);
- xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0);
+ xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
break;
case XML_REGEXP_QUANT_PLUS:
atom->quant = XML_REGEXP_QUANT_ONCE;
- xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0);
+ xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
break;
default:
break;
@@ -1614,7 +1636,7 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
xmlRegStateAddTrans(ctxt, from, NULL,
ctxt->states[newto],
- -1, to->trans[transnr].count, 0);
+ -1, to->trans[transnr].count);
} else {
#ifdef DEBUG_REGEXP_GRAPH
printf("Found epsilon trans %d from %d to %d\n",
@@ -1637,10 +1659,10 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
if (to->trans[transnr].counter >= 0) {
xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
ctxt->states[newto],
- to->trans[transnr].counter, -1, 1);
+ to->trans[transnr].counter, -1);
} else {
xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
- ctxt->states[newto], counter, -1, 1);
+ ctxt->states[newto], counter, -1);
}
}
}
@@ -1869,12 +1891,175 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
}
+static int
+xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
+ int ret = 0;
+
+ if ((range1->type == XML_REGEXP_RANGES) ||
+ (range2->type == XML_REGEXP_RANGES) ||
+ (range2->type == XML_REGEXP_SUBREG) ||
+ (range1->type == XML_REGEXP_SUBREG) ||
+ (range1->type == XML_REGEXP_STRING) ||
+ (range2->type == XML_REGEXP_STRING))
+ return(-1);
+
+ /* put them in order */
+ if (range1->type > range2->type) {
+ xmlRegRangePtr tmp;
+
+ tmp = range1;
+ range1 = range2;
+ range2 = tmp;
+ }
+ if ((range1->type == XML_REGEXP_ANYCHAR) ||
+ (range2->type == XML_REGEXP_ANYCHAR)) {
+ ret = 1;
+ } else if ((range1->type == XML_REGEXP_EPSILON) ||
+ (range2->type == XML_REGEXP_EPSILON)) {
+ return(0);
+ } else if (range1->type == range2->type) {
+ if ((range1->type != XML_REGEXP_CHARVAL) ||
+ (range1->end < range2->start) ||
+ (range2->end < range1->start))
+ ret = 1;
+ else
+ ret = 0;
+ } else if (range1->type == XML_REGEXP_CHARVAL) {
+ int codepoint;
+ int neg = 0;
+
+ /*
+ * just check all codepoints in the range for acceptance,
+ * this is usually way cheaper since done only once at
+ * compilation than testing over and over at runtime or
+ * pushing too many states when evaluating.
+ */
+ if (((range1->neg == 0) && (range2->neg != 0)) ||
+ ((range1->neg != 0) && (range2->neg == 0)))
+ neg = 1;
+
+ for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
+ ret = xmlRegCheckCharacterRange(range2->type, codepoint,
+ 0, range2->start, range2->end,
+ range2->blockName);
+ if (ret < 0)
+ return(-1);
+ if (((neg == 1) && (ret == 0)) ||
+ ((neg == 0) && (ret == 1)))
+ return(1);
+ }
+ return(0);
+ } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
+ (range2->type == XML_REGEXP_BLOCK_NAME)) {
+ if (range1->type == range2->type) {
+ ret = xmlStrEqual(range1->blockName, range2->blockName);
+ } else {
+ /*
+ * comparing a block range with anything else is way
+ * too costly, and maintining the table is like too much
+ * memory too, so let's force the automata to save state
+ * here.
+ */
+ return(1);
+ }
+ } else if ((range1->type < XML_REGEXP_LETTER) ||
+ (range2->type < XML_REGEXP_LETTER)) {
+ if ((range1->type == XML_REGEXP_ANYSPACE) &&
+ (range2->type == XML_REGEXP_NOTSPACE))
+ ret = 0;
+ else if ((range1->type == XML_REGEXP_INITNAME) &&
+ (range2->type == XML_REGEXP_NOTINITNAME))
+ ret = 0;
+ else if ((range1->type == XML_REGEXP_NAMECHAR) &&
+ (range2->type == XML_REGEXP_NOTNAMECHAR))
+ ret = 0;
+ else if ((range1->type == XML_REGEXP_DECIMAL) &&
+ (range2->type == XML_REGEXP_NOTDECIMAL))
+ ret = 0;
+ else if ((range1->type == XML_REGEXP_REALCHAR) &&
+ (range2->type == XML_REGEXP_NOTREALCHAR))
+ ret = 0;
+ else {
+ /* same thing to limit complexity */
+ return(1);
+ }
+ } else {
+ ret = 0;
+ /* range1->type < range2->type here */
+ switch (range1->type) {
+ case XML_REGEXP_LETTER:
+ /* all disjoint except in the subgroups */
+ if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
+ (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
+ (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
+ (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
+ (range2->type == XML_REGEXP_LETTER_OTHERS))
+ ret = 1;
+ break;
+ case XML_REGEXP_MARK:
+ if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
+ (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
+ (range2->type == XML_REGEXP_MARK_ENCLOSING))
+ ret = 1;
+ break;
+ case XML_REGEXP_NUMBER:
+ if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
+ (range2->type == XML_REGEXP_NUMBER_LETTER) ||
+ (range2->type == XML_REGEXP_NUMBER_OTHERS))
+ ret = 1;
+ break;
+ case XML_REGEXP_PUNCT:
+ if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
+ (range2->type == XML_REGEXP_PUNCT_DASH) ||
+ (range2->type == XML_REGEXP_PUNCT_OPEN) ||
+ (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
+ (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
+ (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
+ (range2->type == XML_REGEXP_PUNCT_OTHERS))
+ ret = 1;
+ break;
+ case XML_REGEXP_SEPAR:
+ if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
+ (range2->type == XML_REGEXP_SEPAR_LINE) ||
+ (range2->type == XML_REGEXP_SEPAR_PARA))
+ ret = 1;
+ break;
+ case XML_REGEXP_SYMBOL:
+ if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
+ (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
+ (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
+ (range2->type == XML_REGEXP_SYMBOL_OTHERS))
+ ret = 1;
+ break;
+ case XML_REGEXP_OTHER:
+ if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
+ (range2->type == XML_REGEXP_OTHER_FORMAT) ||
+ (range2->type == XML_REGEXP_OTHER_PRIVATE))
+ ret = 1;
+ break;
+ default:
+ if ((range2->type >= XML_REGEXP_LETTER) &&
+ (range2->type < XML_REGEXP_BLOCK_NAME))
+ ret = 0;
+ else {
+ /* safety net ! */
+ return(1);
+ }
+ }
+ }
+ if (((range1->neg == 0) && (range2->neg != 0)) ||
+ ((range1->neg != 0) && (range2->neg == 0)))
+ ret = !ret;
+ return(1);
+}
+
/**
* xmlFACompareAtoms:
* @atom1: an atom
* @atom2: an atom
*
- * Compares two atoms to check whether they are equivalents
+ * Compares two atoms to check whether they intersect in some ways,
+ * this is used by xmlFAComputesDeterminism only
*
* Returns 1 if yes and 0 otherwise
*/
@@ -1887,28 +2072,65 @@ xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
if ((atom1 == NULL) || (atom2 == NULL))
return(0);
- if (atom1->type != atom2->type)
+ if ((atom1->type == XML_REGEXP_RANGES) &&
+ (atom2->type == XML_REGEXP_CHARVAL)) {
+ } else if ((atom1->type == XML_REGEXP_CHARVAL) &&
+ (atom2->type == XML_REGEXP_RANGES)) {
+ xmlRegAtomPtr tmp;
+ tmp = atom1;
+ atom1 = atom2;
+ atom2 = tmp;
+ } else if (atom1->type != atom2->type) {
return(0);
+ }
switch (atom1->type) {
case XML_REGEXP_STRING:
ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
(xmlChar *)atom2->valuep);
break;
case XML_REGEXP_EPSILON:
- return(1);
+ goto not_determinist;
case XML_REGEXP_CHARVAL:
- ret = atom1->codepoint == atom2->codepoint;
+ ret = (atom1->codepoint == atom2->codepoint);
break;
case XML_REGEXP_RANGES:
- TODO;
- return(0);
+ if (atom2->type == XML_REGEXP_CHARVAL) {
+ ret = xmlRegCheckCharacter(atom1, atom2->codepoint);
+ if (ret < 0)
+ return(-1);
+ break;
+ } else {
+ int i, j, res;
+ xmlRegRangePtr r1, r2;
+
+ /*
+ * need to check that none of the ranges eventually matches
+ */
+ for (i = 0;i < atom1->nbRanges;i++) {
+ for (j = 0;j < atom2->nbRanges;j++) {
+ r1 = atom1->ranges[i];
+ r2 = atom2->ranges[j];
+ res = xmlFACompareRanges(r1, r2);
+ if (res == 1) {
+ ret = 1;
+ goto done;
+ }
+ }
+ }
+ ret = 0;
+ }
+ break;
default:
- return(1);
+ goto not_determinist;
}
+done:
if (atom1->neg != atom2->neg) {
ret = !ret;
}
- return(ret);
+ if (ret == 0)
+ return(0);
+not_determinist:
+ return(1);
}
/**
@@ -1923,12 +2145,18 @@ static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
int to, xmlRegAtomPtr atom) {
int ret = 1;
- int transnr;
+ int res;
+ int transnr, nbTrans;
xmlRegTransPtr t1;
if (state == NULL)
return(ret);
- for (transnr = 0;transnr < state->nbTrans;transnr++) {
+ /*
+ * don't recurse on transitions potentially added in the course of
+ * the elimination.
+ */
+ nbTrans = state->nbTrans;
+ for (transnr = 0;transnr < nbTrans;transnr++) {
t1 = &(state->trans[transnr]);
/*
* check transitions conflicting with the one looked at
@@ -1936,16 +2164,21 @@ xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
if (t1->atom == NULL) {
if (t1->to == -1)
continue;
- ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
+ res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
to, atom);
- if (ret == 0)
- return(0);
+ if (res == 0) {
+ ret = 0;
+ /* t1->nd = 1; */
+ }
continue;
}
if (t1->to != to)
continue;
- if (xmlFACompareAtoms(t1->atom, atom))
- return(0);
+ if (xmlFACompareAtoms(t1->atom, atom)) {
+ ret = 0;
+ /* mark the transition as non-deterministic */
+ t1->nd = 1;
+ }
}
return(ret);
}
@@ -1962,7 +2195,7 @@ static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
int statenr, transnr;
xmlRegStatePtr state;
- xmlRegTransPtr t1, t2;
+ xmlRegTransPtr t1, t2, last;
int i;
int ret = 1;
@@ -1974,8 +2207,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
return(ctxt->determinist);
/*
- * Check for all states that there aren't 2 transitions
- * with the same atom and a different target.
+ * First cleanup the automata removing cancelled transitions
*/
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
@@ -1989,8 +2221,10 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
* Determinism checks in case of counted or all transitions
* will have to be handled separately
*/
- if (t1->atom == NULL)
+ if (t1->atom == NULL) {
+ /* t1->nd = 1; */
continue;
+ }
if (t1->to == -1) /* eliminated */
continue;
for (i = 0;i < transnr;i++) {
@@ -2001,10 +2235,46 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
if (t1->to == t2->to) {
if (xmlFACompareAtoms(t1->atom, t2->atom))
t2->to = -1; /* eliminated */
- } else {
- /* not determinist ! */
- if (xmlFACompareAtoms(t1->atom, t2->atom))
- ret = 0;
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ * Check for all states that there aren't 2 transitions
+ * with the same atom and a different target.
+ */
+ for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+ state = ctxt->states[statenr];
+ if (state == NULL)
+ continue;
+ if (state->nbTrans < 2)
+ continue;
+ last = NULL;
+ for (transnr = 0;transnr < state->nbTrans;transnr++) {
+ t1 = &(state->trans[transnr]);
+ /*
+ * Determinism checks in case of counted or all transitions
+ * will have to be handled separately
+ */
+ if (t1->atom == NULL) {
+ continue;
+ }
+ if (t1->to == -1) /* eliminated */
+ continue;
+ for (i = 0;i < transnr;i++) {
+ t2 = &(state->trans[i]);
+ if (t2->to == -1) /* eliminated */
+ continue;
+ if (t2->atom != NULL) {
+ /* not determinist ! */
+ if (xmlFACompareAtoms(t1->atom, t2->atom)) {
+ ret = 0;
+ /* mark the transitions as non-deterministic ones */
+ t1->nd = 1;
+ t2->nd = 1;
+ last = t1;
}
} else if (t1->to != -1) {
/*
@@ -2013,16 +2283,39 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
*/
ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
t2->to, t2->atom);
+ /* don't shortcut the computation so all non deterministic
+ transition get marked down
if (ret == 0)
return(0);
+ */
+ if (ret == 0) {
+ t1->nd = 1;
+ /* t2->nd = 1; */
+ last = t1;
+ }
}
}
+ /* don't shortcut the computation so all non deterministic
+ transition get marked down
if (ret == 0)
- break;
+ break; */
+ }
+
+ /*
+ * mark specifically the last non-deterministic transition
+ * from a state since there is no need to set-up rollback
+ * from it
+ */
+ if (last != NULL) {
+ last->nd = 2;
}
+
+ /* don't shortcut the computation so all non deterministic
+ transition get marked down
if (ret == 0)
- break;
+ break; */
}
+
ctxt->determinist = ret;
return(ret);
}
@@ -2336,6 +2629,12 @@ xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
xmlFARegDebugExec(exec);
exec->transno--;
#endif
+#ifdef MAX_PUSH
+ if (exec->nbPush > MAX_PUSH) {
+ return;
+ }
+ exec->nbPush++;
+#endif
if (exec->maxRollbacks == 0) {
exec->maxRollbacks = 4;
@@ -2422,10 +2721,11 @@ static int
xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
xmlRegExecCtxt execval;
xmlRegExecCtxtPtr exec = &execval;
- int ret, codepoint = 0, len;
+ int ret, codepoint = 0, len, deter;
exec->inputString = content;
exec->index = 0;
+ exec->nbPush = 0;
exec->determinist = 1;
exec->maxRollbacks = 0;
exec->nbRollbacks = 0;
@@ -2482,6 +2782,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
continue;
atom = trans->atom;
ret = 0;
+ deter = 1;
if (trans->count >= 0) {
int count;
xmlRegCounterPtr counter;
@@ -2497,6 +2798,8 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
trans->count, count, counter->min, counter->max);
#endif
ret = ((count >= counter->min) && (count <= counter->max));
+ if ((ret) && (counter->min != counter->max))
+ deter = 0;
} else if (atom == NULL) {
fprintf(stderr, "epsilon transition left at runtime\n");
exec->status = -2;
@@ -2509,7 +2812,15 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
/*
* this is a multiple input sequence
+ * If there is a counter associated increment it now.
+ * before potentially saving and rollback
*/
+ if (trans->counter >= 0) {
+#ifdef DEBUG_REGEXP_EXEC
+ printf("Increasing count %d\n", trans->counter);
+#endif
+ exec->counts[trans->counter]++;
+ }
if (exec->state->nbTrans > exec->transno + 1) {
xmlFARegExecSave(exec);
}
@@ -2559,6 +2870,12 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
if (ret == 0) {
goto rollback;
}
+ if (trans->counter >= 0) {
+#ifdef DEBUG_REGEXP_EXEC
+ printf("Decreasing count %d\n", trans->counter);
+#endif
+ exec->counts[trans->counter]--;
+ }
} else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
/*
* we don't match on the codepoint, but minOccurs of 0
@@ -2576,7 +2893,17 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
ret = 1;
}
if (ret == 1) {
- if (exec->state->nbTrans > exec->transno + 1) {
+ if ((trans->nd == 1) ||
+ ((trans->count >= 0) && (deter == 0) &&
+ (exec->state->nbTrans > exec->transno + 1))) {
+#ifdef DEBUG_REGEXP_EXEC
+ if (trans->nd == 1)
+ printf("Saving on nd transition atom %d for %c at %d\n",
+ trans->atom->no, codepoint, exec->index);
+ else
+ printf("Saving on counted transition count %d for %c at %d\n",
+ trans->count, codepoint, exec->index);
+#endif
xmlFARegExecSave(exec);
}
if (trans->counter >= 0) {
@@ -2613,6 +2940,10 @@ rollback:
* Failed to find a way out
*/
exec->determinist = 0;
+#ifdef DEBUG_REGEXP_EXEC
+ printf("rollback from state %d on %d:%c\n", exec->state->no,
+ codepoint,codepoint);
+#endif
xmlFARegExecRollBack(exec);
}
progress:
@@ -2632,8 +2963,11 @@ progress:
xmlFree(exec->counts);
if (exec->status == 0)
return(1);
- if (exec->status == -1)
+ if (exec->status == -1) {
+ if (exec->nbPush > MAX_PUSH)
+ return(-1);
return(0);
+ }
return(exec->status);
}
@@ -2708,6 +3042,7 @@ xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
exec->inputStack = NULL;
exec->errStateNo = -1;
exec->errString = NULL;
+ exec->nbPush = 0;
return(exec);
}
@@ -3344,7 +3679,7 @@ xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
ret = xmlRegExecPushStringInternal(exec, str, data, 1);
if (str != buf)
- xmlFree(buf);
+ xmlFree(str);
return(ret);
}
@@ -4784,11 +5119,11 @@ xmlNewAutomata(void) {
/* initialize the parser */
ctxt->end = NULL;
ctxt->start = ctxt->state = xmlRegNewState(ctxt);
- ctxt->start->type = XML_REGEXP_START_STATE;
if (ctxt->start == NULL) {
xmlFreeAutomata(ctxt);
return(NULL);
}
+ ctxt->start->type = XML_REGEXP_START_STATE;
if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
xmlRegFreeState(ctxt->start);
xmlFreeAutomata(ctxt);
@@ -5081,7 +5416,7 @@ xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
- xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0);
+ xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
@@ -5147,7 +5482,7 @@ xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
- xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0);
+ xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
@@ -5236,7 +5571,7 @@ xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
- xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0);
+ xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
return(to);
@@ -5298,7 +5633,7 @@ xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
- xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0);
+ xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
return(to);
@@ -6134,7 +6469,7 @@ tail:
* xmlExpGetLanguage:
* @ctxt: the expression context
* @exp: the expression
- * @list: where to store the tokens
+ * @langList: where to store the tokens
* @len: the allocated lenght of @list
*
* Find all the strings used in @exp and store them in @list
@@ -6144,10 +6479,10 @@ tail:
*/
int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
- const xmlChar**list, int len) {
- if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0))
+ const xmlChar**langList, int len) {
+ if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
return(-1);
- return(xmlExpGetLanguageInt(ctxt, exp, list, len, 0));
+ return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
}
static int
@@ -6200,7 +6535,7 @@ tail:
* xmlExpGetStart:
* @ctxt: the expression context
* @exp: the expression
- * @list: where to store the tokens
+ * @tokList: where to store the tokens
* @len: the allocated lenght of @list
*
* Find all the strings that appears at the start of the languages
@@ -6212,10 +6547,10 @@ tail:
*/
int
xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
- const xmlChar**list, int len) {
- if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0))
+ const xmlChar**tokList, int len) {
+ if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
return(-1);
- return(xmlExpGetStartInt(ctxt, exp, list, len, 0));
+ return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
}
/**
@@ -6861,10 +7196,10 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
while (len < 0) {
const xmlChar **temp;
- temp = (const xmlChar **) xmlRealloc(tab, ctxt->tabSize * 2 *
+ temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
sizeof(const xmlChar *));
if (temp == NULL) {
- xmlFree(tab);
+ xmlFree((xmlChar **) tab);
return(NULL);
}
tab = temp;
@@ -6875,14 +7210,14 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
if ((tmp == NULL) || (tmp == forbiddenExp)) {
xmlExpFree(ctxt, ret);
- xmlFree(tab);
+ xmlFree((xmlChar **) tab);
return(tmp);
}
tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
xmlExpFree(ctxt, tmp);
xmlExpFree(ctxt, ret);
- xmlFree(tab);
+ xmlFree((xmlChar **) tab);
return(tmp);
}
tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
@@ -6891,7 +7226,7 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
xmlExpFree(ctxt, ret);
- xmlFree(tab);
+ xmlFree((xmlChar **) tab);
return(tmp3);
}
@@ -6900,12 +7235,12 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
else {
ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
if (ret == NULL) {
- xmlFree(tab);
+ xmlFree((xmlChar **) tab);
return(NULL);
}
}
}
- xmlFree(tab);
+ xmlFree((xmlChar **) tab);
return(ret);
}