summaryrefslogtreecommitdiff
path: root/xmlregexp.c
diff options
context:
space:
mode:
authorMike Hommey <glandium@debian.org>2007-04-17 20:40:00 +0200
committerMike Hommey <glandium@debian.org>2007-04-17 20:40:00 +0200
commit789259a1b6850d30acffbb62b11456b9ed7a8f59 (patch)
tree842f2f9042a4264898ec29078aa029640078c393 /xmlregexp.c
parent968041a8b2ec86c39b5074024ce97d136ecd9a95 (diff)
downloadlibxml2-789259a1b6850d30acffbb62b11456b9ed7a8f59.tar.gz
Load /tmp/libxml2-2.6.28 intoupstream/2.6.28.dfsg
libxml2/branches/upstream/current.
Diffstat (limited to 'xmlregexp.c')
-rw-r--r--xmlregexp.c78
1 files changed, 52 insertions, 26 deletions
diff --git a/xmlregexp.c b/xmlregexp.c
index e7d519e..2a30d66 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -54,6 +54,11 @@
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
+/*
+ * Need PREV to check on a '-' within a Character Group. May only be used
+ * when it's guaranteed that cur is not at the beginning of ctxt->string!
+ */
+#define PREV (ctxt->cur[-1])
/**
* TODO:
@@ -145,7 +150,8 @@ typedef enum {
XML_REGEXP_START_STATE = 1,
XML_REGEXP_FINAL_STATE,
XML_REGEXP_TRANS_STATE,
- XML_REGEXP_SINK_STATE
+ XML_REGEXP_SINK_STATE,
+ XML_REGEXP_UNREACH_STATE
} xmlRegStateType;
typedef enum {
@@ -1595,6 +1601,11 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
atom->quant = XML_REGEXP_QUANT_ONCE;
xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
break;
+ case XML_REGEXP_QUANT_RANGE:
+ if (atom->min == 0) {
+ xmlFAGenerateEpsilonTransition(ctxt, from, to);
+ }
+ break;
default:
break;
}
@@ -1709,6 +1720,8 @@ xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
continue;
if (state->nbTrans != 1)
continue;
+ if (state->type == XML_REGEXP_UNREACH_STATE)
+ continue;
/* is the only transition out a basic transition */
if ((state->trans[0].atom == NULL) &&
(state->trans[0].to >= 0) &&
@@ -1731,35 +1744,24 @@ xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
tmp = ctxt->states[state->transTo[i]];
for (j = 0;j < tmp->nbTrans;j++) {
if (tmp->trans[j].to == statenr) {
- tmp->trans[j].to = newto;
-#ifdef DEBUG_REGEXP_GRAPH
- printf("Changed transition %d on %d to go to %d\n",
- j, tmp->no, newto);
-#endif
- xmlRegStateAddTransTo(ctxt, ctxt->states[newto],
- tmp->no);
- }
- }
- }
-#if 0
- for (i = 0;i < ctxt->nbStates;i++) {
- tmp = ctxt->states[i];
- for (j = 0;j < tmp->nbTrans;j++) {
- if (tmp->trans[j].to == statenr) {
- tmp->trans[j].to = newto;
#ifdef DEBUG_REGEXP_GRAPH
printf("Changed transition %d on %d to go to %d\n",
j, tmp->no, newto);
#endif
+ tmp->trans[j].to = -1;
+ xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
+ ctxt->states[newto],
+ tmp->trans[j].counter,
+ tmp->trans[j].count);
}
}
}
-#endif
if (state->type == XML_REGEXP_FINAL_STATE)
ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
/* eliminate the transition completely */
state->nbTrans = 0;
+ state->type = XML_REGEXP_UNREACH_STATE;
}
@@ -1779,16 +1781,33 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
if (ctxt->states == NULL) return;
+ /*
+ * Eliminate simple epsilon transition and the associated unreachable
+ * states.
+ */
xmlFAEliminateSimpleEpsilonTransitions(ctxt);
+ for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+ state = ctxt->states[statenr];
+ if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
+#ifdef DEBUG_REGEXP_GRAPH
+ printf("Removed unreachable state %d\n", statenr);
+#endif
+ xmlRegFreeState(state);
+ ctxt->states[statenr] = NULL;
+ }
+ }
has_epsilon = 0;
/*
- * build the completed transitions bypassing the epsilons
+ * Build the completed transitions bypassing the epsilons
* Use a marking algorithm to avoid loops
- * mark sink states too.
+ * Mark sink states too.
+ * Process from the latests states backward to the start when
+ * there is long cascading epsilon chains this minimize the
+ * recursions and transition compares when adding the new ones
*/
- for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+ for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
state = ctxt->states[statenr];
if (state == NULL)
continue;
@@ -1812,8 +1831,9 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
printf("Found epsilon trans %d from %d to %d\n",
transnr, statenr, newto);
#endif
- state->mark = XML_REGEXP_MARK_START;
has_epsilon = 1;
+ state->trans[transnr].to = -2;
+ state->mark = XML_REGEXP_MARK_START;
xmlFAReduceEpsilonTransitions(ctxt, statenr,
newto, state->trans[transnr].counter);
state->mark = XML_REGEXP_MARK_NORMAL;
@@ -2424,7 +2444,7 @@ xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
* check transitions conflicting with the one looked at
*/
if (t1->atom == NULL) {
- if (t1->to == -1)
+ if (t1->to < 0)
continue;
res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
to, atom);
@@ -2875,7 +2895,8 @@ xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
int i;
printf(": ");
for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
- printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
+ printf("%s ", (const char *)
+ exec->inputStack[exec->inputStackNr - (i + 1)].value);
} else {
printf(": %s", &(exec->inputString[exec->index]));
}
@@ -4842,10 +4863,15 @@ xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
ERROR("Expecting a char range");
return;
}
- NEXTL(len);
- if (start == '-') {
+ /*
+ * Since we are "inside" a range, we can assume ctxt->cur is past
+ * the start of ctxt->string, and PREV should be safe
+ */
+ if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
+ NEXTL(len);
return;
}
+ NEXTL(len);
cur = CUR;
if ((cur != '-') || (NXT(1) == ']')) {
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,