summaryrefslogtreecommitdiff
path: root/xmlregexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'xmlregexp.c')
-rw-r--r--xmlregexp.c162
1 files changed, 111 insertions, 51 deletions
diff --git a/xmlregexp.c b/xmlregexp.c
index d1e6f38..12ecd83 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -3,7 +3,7 @@
*
* Basically designed with the purpose of compiling regexps for
* the variety of validation/shemas mechanisms now available in
- * XML related specifications thise includes:
+ * XML related specifications these include:
* - XML-1.0 DTD validation
* - XML Schemas structure part 1
* - XML Schemas Datatypes part 2 especially Appendix F
@@ -267,7 +267,7 @@ struct _xmlRegExecRollback {
xmlRegStatePtr state;/* the current state */
int index; /* the index in the input stack */
int nextbranch; /* the next transition to explore in that state */
- int *counts; /* save the automate state if it has some */
+ int *counts; /* save the automata state if it has some */
};
typedef struct _xmlRegInputToken xmlRegInputToken;
@@ -280,14 +280,14 @@ struct _xmlRegInputToken {
struct _xmlRegExecCtxt {
int status; /* execution status != 0 indicate an error */
- int determinist; /* did we found an inderterministic behaviour */
+ int determinist; /* did we find an indeterministic behaviour */
xmlRegexpPtr comp; /* the compiled regexp */
xmlRegExecCallbacks callback;
void *data;
xmlRegStatePtr state;/* the current state */
int transno; /* the current transition on that state */
- int transcount; /* the number of char in char counted transitions */
+ int transcount; /* the number of chars in char counted transitions */
/*
* A stack of rollback states
@@ -327,7 +327,7 @@ static void xmlRegFreeAtom(xmlRegAtomPtr atom);
************************************************************************/
/**
* xmlRegexpErrMemory:
- * @extra: extra informations
+ * @extra: extra information
*
* Handle an out of memory condition
*/
@@ -347,9 +347,9 @@ xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
/**
* xmlRegexpErrCompile:
- * @extra: extra informations
+ * @extra: extra information
*
- * Handle an compilation failure
+ * Handle a compilation failure
*/
static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
@@ -379,7 +379,7 @@ static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
* xmlRegEpxFromParse:
* @ctxt: the parser context used to build it
*
- * Allocate a new regexp and fill it with the reult from the parser
+ * Allocate a new regexp and fill it with the result from the parser
*
* Returns the new regexp or NULL in case of error
*/
@@ -418,7 +418,7 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
/*
* Switch to a compact representation
* 1/ counting the effective number of states left
- * 2/ conting the unique number of atoms, and check that
+ * 2/ counting the unique number of atoms, and check that
* they are all of the string type
* 3/ build a table state x atom for the transitions
*/
@@ -505,7 +505,7 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
/*
* Allocate the transition table. The first entry for each
- * state correspond to the state type.
+ * state corresponds to the state type.
*/
transdata = NULL;
@@ -539,7 +539,7 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
}
targetno = stateRemap[trans->to];
/*
- * if the same atome can generate transition to 2 different
+ * if the same atom can generate transitions to 2 different
* states then it means the automata is not determinist and
* the compact form can't be used !
*/
@@ -1182,6 +1182,9 @@ static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
xmlRegAtomPtr atom, xmlRegStatePtr target,
int counter, int count) {
+
+ int nrtrans;
+
if (state == NULL) {
ERROR("add state: state is NULL");
return;
@@ -1190,6 +1193,25 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
ERROR("add state: target is NULL");
return;
}
+ /*
+ * Other routines follow the philosophy 'When in doubt, add a transition'
+ * so we check here whether such a transition is already present and, if
+ * so, silently ignore this request.
+ */
+
+ for (nrtrans=0; nrtrans<state->nbTrans; nrtrans++) {
+ if ((state->trans[nrtrans].atom == atom) &&
+ (state->trans[nrtrans].to == target->no) &&
+ (state->trans[nrtrans].counter == counter) &&
+ (state->trans[nrtrans].count == count)) {
+#ifdef DEBUG_REGEXP_GRAPH
+ printf("Ignoring duplicate transition from %d to %d\n",
+ state->no, target->no);
+#endif
+ return;
+ }
+ }
+
if (state->maxTrans == 0) {
state->maxTrans = 4;
state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
@@ -1347,7 +1369,7 @@ xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
* @to: the target state or NULL for building a new one
* @atom: the atom generating the transition
*
- * Returns 0 if succes and -1 in case of error.
+ * Returns 0 if success and -1 in case of error.
*/
static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
@@ -1359,7 +1381,7 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
if (atom->type == XML_REGEXP_SUBREG) {
/*
* this is a subexpression handling one should not need to
- * create a new node excep for XML_REGEXP_QUANT_RANGE.
+ * create a new node except for XML_REGEXP_QUANT_RANGE.
*/
if (xmlRegAtomPush(ctxt, atom) < 0) {
return(-1);
@@ -1391,13 +1413,26 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
/*
* This one is nasty:
- * 1/ register a new counter
- * 2/ register an epsilon transition associated to
+ * 1/ if range has minOccurs == 0, create a new state
+ * and create epsilon transitions from atom->start
+ * to atom->stop, as well as atom->start to the new
+ * state
+ * 2/ register a new counter
+ * 3/ register an epsilon transition associated to
* this counter going from atom->stop to atom->start
- * 3/ create a new state
- * 4/ generate a counted transition from atom->stop to
+ * 4/ create a new state
+ * 5/ generate a counted transition from atom->stop to
* that state
*/
+ if (atom->min == 0) {
+ xmlFAGenerateEpsilonTransition(ctxt, atom->start,
+ atom->stop);
+ newstate = xmlRegNewState(ctxt);
+ xmlRegStatePush(ctxt, newstate);
+ ctxt->state = newstate;
+ xmlFAGenerateEpsilonTransition(ctxt, atom->start,
+ newstate);
+ }
counter = xmlRegGetCounter(ctxt);
ctxt->counters[counter].min = atom->min - 1;
ctxt->counters[counter].max = atom->max - 1;
@@ -1460,7 +1495,7 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
* @ctxt: a regexp parser context
* @fromnr: the from state
* @tonr: the to state
- * @cpunter: should that transition be associted to a counted
+ * @counter: should that transition be associated to a counted
*
*/
static void
@@ -1616,7 +1651,7 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
xmlRegStatePtr target = NULL;
state->reached = XML_REGEXP_MARK_VISITED;
/*
- * Mark all state reachable from the current reachable state
+ * Mark all states reachable from the current reachable state
*/
for (transnr = 0;transnr < state->nbTrans;transnr++) {
if ((state->trans[transnr].to >= 0) &&
@@ -1665,7 +1700,7 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
* @atom1: an atom
* @atom2: an atom
*
- * Compares two atoms to check whether they are equivatents
+ * Compares two atoms to check whether they are equivalents
*
* Returns 1 if yes and 0 otherwise
*/
@@ -1758,7 +1793,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
return(ctxt->determinist);
/*
- * Check for all states that there isn't 2 transitions
+ * Check for all states that there aren't 2 transitions
* with the same atom and a different target.
*/
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
@@ -1782,7 +1817,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
if (t2->atom != NULL) {
if (t1->to == t2->to) {
if (xmlFACompareAtoms(t1->atom, t2->atom))
- t2->to = -1; /* eliminate */
+ t2->to = -1; /* eliminated */
} else {
/* not determinist ! */
if (xmlFACompareAtoms(t1->atom, t2->atom))
@@ -2090,7 +2125,7 @@ xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
/************************************************************************
* *
- * Saving an restoring state of an execution context *
+ * Saving and restoring state of an execution context *
* *
************************************************************************/
@@ -2196,7 +2231,7 @@ xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
/************************************************************************
* *
- * Verifyer, running an input against a compiled regexp *
+ * Verifier, running an input against a compiled regexp *
* *
************************************************************************/
@@ -2235,12 +2270,27 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
xmlRegAtomPtr atom;
/*
- * End of input on non-terminal state, rollback, however we may
+ * If end of input on non-terminal state, rollback, however we may
* still have epsilon like transition for counted transitions
- * on counters, in that case don't break too early.
+ * on counters, in that case don't break too early. Additionally,
+ * if we are working on a range like "AB{0,2}", where B is not present,
+ * we don't want to break.
*/
- if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
- goto rollback;
+ if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
+ /*
+ * if there is a transition, we must check if
+ * atom allows minOccurs of 0
+ */
+ if (exec->transno < exec->state->nbTrans) {
+ trans = &exec->state->trans[exec->transno];
+ if (trans->to >=0) {
+ atom = trans->atom;
+ if (!((atom->min == 0) && (atom->max > 0)))
+ goto rollback;
+ }
+ } else
+ goto rollback;
+ }
exec->transcount = 0;
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
@@ -2271,7 +2321,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
} else if (exec->inputString[exec->index] != 0) {
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
ret = xmlRegCheckCharacter(atom, codepoint);
- if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
+ if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
xmlRegStatePtr to = comp->states[trans->to];
/*
@@ -2326,7 +2376,21 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
if (ret == 0) {
goto rollback;
}
+ } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
+ /*
+ * we don't match on the codepoint, but minOccurs of 0
+ * says that's ok. Setting len to 0 inhibits stepping
+ * over the codepoint.
+ */
+ exec->transcount = 1;
+ len = 0;
+ ret = 1;
}
+ } else if ((atom->min == 0) && (atom->max > 0)) {
+ /* another spot to match when minOccurs is 0 */
+ exec->transcount = 1;
+ len = 0;
+ ret = 1;
}
if (ret == 1) {
if (exec->state->nbTrans > exec->transno + 1) {
@@ -2384,7 +2448,7 @@ progress:
/************************************************************************
* *
- * Progressive interface to the verifyer one atom at a time *
+ * Progressive interface to the verifier one atom at a time *
* *
************************************************************************/
@@ -2552,7 +2616,7 @@ xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
#endif
/*
- * Examine all outside transition from current state
+ * Examine all outside transitions from current state
*/
for (i = 0;i < comp->nbstrings;i++) {
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
@@ -3099,7 +3163,7 @@ progress:
#endif
/************************************************************************
* *
- * Parser for the Shemas Datatype Regular Expressions *
+ * Parser for the Schemas Datatype Regular Expressions *
* http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
* *
************************************************************************/
@@ -3896,7 +3960,7 @@ xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
/**
* xmlFAParseRegExp:
* @ctxt: a regexp parser context
- * @top: is that the top-level expressions ?
+ * @top: is this the top-level expression ?
*
* [1] regExp ::= branch ( '|' branch )*
*/
@@ -3988,7 +4052,7 @@ xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
* @regexp: a regular expression string
*
* Parses a regular expression conforming to XML Schemas Part 2 Datatype
- * Appendix F and build an automata suitable for testing strings against
+ * Appendix F and builds an automata suitable for testing strings against
* that regular expression
*
* Returns the compiled expression or NULL in case of error
@@ -4034,9 +4098,9 @@ xmlRegexpCompile(const xmlChar *regexp) {
* @comp: the compiled regular expression
* @content: the value to check against the regular expression
*
- * Check if the regular expression generate the value
+ * Check if the regular expression generates the value
*
- * Returns 1 if it matches, 0 if not and a negativa value in case of error
+ * Returns 1 if it matches, 0 if not and a negative value in case of error
*/
int
xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
@@ -4051,7 +4115,7 @@ xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
*
* Check if the regular expression is determinist
*
- * Returns 1 if it yes, 0 if not and a negativa value in case of error
+ * Returns 1 if it yes, 0 if not and a negative value in case of error
*/
int
xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
@@ -4213,7 +4277,7 @@ xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
* @token: the input string associated to that transition
* @data: data passed to the callback function if the transition is activated
*
- * If @to is NULL, this create first a new target state in the automata
+ * If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by the value of @token
*
@@ -4253,7 +4317,7 @@ xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
* @token2: the second input string associated to that transition
* @data: data passed to the callback function if the transition is activated
*
- * If @to is NULL, this create first a new target state in the automata
+ * If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by the value of @token
*
@@ -4312,7 +4376,7 @@ xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
* @max: the maximum successive occurences of token
* @data: data associated to the transition
*
- * If @to is NULL, this create first a new target state in the automata
+ * If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by a succession of input of value @token and whose number
* is between @min and @max
@@ -4378,10 +4442,10 @@ xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
* @max: the maximum successive occurences of token
* @data: data associated to the transition
*
- * If @to is NULL, this create first a new target state in the automata
+ * If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by a succession of input of value @token and whose number
- * is between @min and @max, moreover that transistion can only be crossed
+ * is between @min and @max, moreover that transition can only be crossed
* once.
*
* Returns the target state or NULL in case of error
@@ -4425,10 +4489,6 @@ xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
- if (to == NULL)
- to = am->state;
- if (to == NULL)
- return(NULL);
return(to);
}
@@ -4457,8 +4517,8 @@ xmlAutomataNewState(xmlAutomataPtr am) {
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
*
- * If @to is NULL, this create first a new target state in the automata
- * and then adds a an epsilon transition from the @from state to the
+ * If @to is NULL, this creates first a new target state in the automata
+ * and then adds an epsilon transition from the @from state to the
* target state
*
* Returns the target state or NULL in case of error
@@ -4481,7 +4541,7 @@ xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
* @to: the target point of the transition or NULL
* @lax: allow to transition if not all all transitions have been activated
*
- * If @to is NULL, this create first a new target state in the automata
+ * If @to is NULL, this creates first a new target state in the automata
* and then adds a an ALL transition from the @from state to the
* target state. That transition is an epsilon transition allowed only when
* all transitions from the @from node have been activated.
@@ -4531,7 +4591,7 @@ xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
* @to: the target point of the transition or NULL
* @counter: the counter associated to that transition
*
- * If @to is NULL, this create first a new target state in the automata
+ * If @to is NULL, this creates first a new target state in the automata
* and then adds an epsilon transition from the @from state to the target state
* which will increment the counter provided
*
@@ -4555,7 +4615,7 @@ xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
* @to: the target point of the transition or NULL
* @counter: the counter associated to that transition
*
- * If @to is NULL, this create first a new target state in the automata
+ * If @to is NULL, this creates first a new target state in the automata
* and then adds an epsilon transition from the @from state to the target state
* which will be allowed only if the counter is within the right range.
*