diff options
Diffstat (limited to 'xmlregexp.c')
-rw-r--r-- | xmlregexp.c | 2243 |
1 files changed, 2191 insertions, 52 deletions
diff --git a/xmlregexp.c b/xmlregexp.c index ee635f1..45b917b 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -38,7 +38,7 @@ #endif /* #define DEBUG_REGEXP_GRAPH */ -/* #define DEBUG_REGEXP_EXEC */ +/* #define DEBUG_REGEXP_EXEC */ /* #define DEBUG_PUSH */ /* #define DEBUG_COMPACTION */ @@ -211,6 +211,10 @@ struct _xmlAutomataState { int maxTrans; int nbTrans; xmlRegTrans *trans; + /* knowing states ponting to us can speed things up */ + int maxTransTo; + int nbTransTo; + int *transTo; }; typedef struct _xmlAutomata xmlRegParserCtxt; @@ -242,6 +246,7 @@ struct _xmlAutomata { xmlRegCounter *counters; int determinist; + int negs; }; struct _xmlRegexp { @@ -329,6 +334,7 @@ struct _xmlRegExecCtxt { static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); static void xmlRegFreeState(xmlRegStatePtr state); static void xmlRegFreeAtom(xmlRegAtomPtr atom); +static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); /************************************************************************ * * @@ -414,6 +420,7 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { if ((ret->determinist != 0) && (ret->nbCounters == 0) && + (ctxt->negs == 0) && (ret->atoms != NULL) && (ret->atoms[0] != NULL) && (ret->atoms[0]->type == XML_REGEXP_STRING)) { @@ -656,6 +663,7 @@ xmlRegNewParserCtxt(const xmlChar *string) { ret->string = xmlStrdup(string); ret->cur = ret->string; ret->neg = 0; + ret->negs = 0; ret->error = 0; ret->determinist = -1; return(ret); @@ -751,6 +759,8 @@ xmlRegFreeAtom(xmlRegAtomPtr atom) { xmlFree(atom->ranges); if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) xmlFree(atom->valuep); + if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) + xmlFree(atom->valuep2); if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) xmlFree(atom->valuep); xmlFree(atom); @@ -784,6 +794,8 @@ xmlRegFreeState(xmlRegStatePtr state) { if (state->trans != NULL) xmlFree(state->trans); + if (state->transTo != NULL) + xmlFree(state->transTo); xmlFree(state); } @@ -971,6 +983,8 @@ xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { fprintf(output, "NULL\n"); return; } + if (atom->neg) + fprintf(output, "not "); xmlRegPrintAtomType(output, atom->type); xmlRegPrintQuantType(output, atom->quant); if (atom->quant == XML_REGEXP_QUANT_RANGE) @@ -1191,9 +1205,37 @@ xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { } static void +xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, + int from) { + if (target->maxTransTo == 0) { + target->maxTransTo = 8; + target->transTo = (int *) xmlMalloc(target->maxTransTo * + sizeof(int)); + if (target->transTo == NULL) { + xmlRegexpErrMemory(ctxt, "adding transition"); + target->maxTransTo = 0; + return; + } + } else if (target->nbTransTo >= target->maxTransTo) { + int *tmp; + target->maxTransTo *= 2; + tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * + sizeof(int)); + if (tmp == NULL) { + xmlRegexpErrMemory(ctxt, "adding transition"); + target->maxTransTo /= 2; + return; + } + target->transTo = tmp; + } + target->transTo[target->nbTransTo] = from; + target->nbTransTo++; +} + +static void xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, xmlRegAtomPtr atom, xmlRegStatePtr target, - int counter, int count) { + int counter, int count, int nchk) { int nrtrans; @@ -1211,21 +1253,24 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, * so, silently ignore this request. */ - for (nrtrans=0; nrtrans<state->nbTrans; nrtrans++) { - if ((state->trans[nrtrans].atom == atom) && - (state->trans[nrtrans].to == target->no) && - (state->trans[nrtrans].counter == counter) && - (state->trans[nrtrans].count == count)) { + if (nchk == 0) { + for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { + xmlRegTransPtr trans = &(state->trans[nrtrans]); + if ((trans->atom == atom) && + (trans->to == target->no) && + (trans->counter == counter) && + (trans->count == count)) { #ifdef DEBUG_REGEXP_GRAPH - printf("Ignoring duplicate transition from %d to %d\n", - state->no, target->no); + printf("Ignoring duplicate transition from %d to %d\n", + state->no, target->no); #endif - return; - } + return; + } + } } if (state->maxTrans == 0) { - state->maxTrans = 4; + state->maxTrans = 8; state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * sizeof(xmlRegTrans)); if (state->trans == NULL) { @@ -1264,6 +1309,7 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, state->trans[state->nbTrans].counter = counter; state->trans[state->nbTrans].count = count; state->nbTrans++; + xmlRegStateAddTransTo(ctxt, target, state->no); } static int @@ -1313,9 +1359,9 @@ xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, ctxt->state = to; } if (lax) - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER, 0); else - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER, 0); } /** @@ -1333,7 +1379,7 @@ xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, xmlRegStatePush(ctxt, to); ctxt->state = to; } - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1, 0); } /** @@ -1352,7 +1398,7 @@ xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, xmlRegStatePush(ctxt, to); ctxt->state = to; } - xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); + xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1, 0); } /** @@ -1371,7 +1417,7 @@ xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, xmlRegStatePush(ctxt, to); ctxt->state = to; } - xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter, 0); } /** @@ -1467,6 +1513,23 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, break; } return(0); + } else if ((atom->min == 0) && (atom->max == 0) && + (atom->quant == XML_REGEXP_QUANT_RANGE)) { + /* + * we can discard the atom and generate an epsilon transition instead + */ + if (to == NULL) { + to = xmlRegNewState(ctxt); + if (to != NULL) + xmlRegStatePush(ctxt, to); + else { + return(-1); + } + } + xmlFAGenerateEpsilonTransition(ctxt, from, to); + ctxt->state = to; + xmlRegFreeAtom(atom); + return(0); } else { if (to == NULL) { to = xmlRegNewState(ctxt); @@ -1479,7 +1542,7 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, if (xmlRegAtomPush(ctxt, atom) < 0) { return(-1); } - xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); + xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1, 0); ctxt->state = to; } switch (atom->quant) { @@ -1490,11 +1553,11 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, case XML_REGEXP_QUANT_MULT: atom->quant = XML_REGEXP_QUANT_ONCE; xmlFAGenerateEpsilonTransition(ctxt, from, to); - xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); + xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0); break; case XML_REGEXP_QUANT_PLUS: atom->quant = XML_REGEXP_QUANT_ONCE; - xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); + xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1, 0); break; default: break; @@ -1538,6 +1601,8 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, from->type = XML_REGEXP_FINAL_STATE; } for (transnr = 0;transnr < to->nbTrans;transnr++) { + if (to->trans[transnr].to < 0) + continue; if (to->trans[transnr].atom == NULL) { /* * Don't remove counted transitions @@ -1549,7 +1614,7 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, xmlRegStateAddTrans(ctxt, from, NULL, ctxt->states[newto], - -1, to->trans[transnr].count); + -1, to->trans[transnr].count, 0); } else { #ifdef DEBUG_REGEXP_GRAPH printf("Found epsilon trans %d from %d to %d\n", @@ -1572,10 +1637,10 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, if (to->trans[transnr].counter >= 0) { xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, ctxt->states[newto], - to->trans[transnr].counter, -1); + to->trans[transnr].counter, -1, 1); } else { xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, - ctxt->states[newto], counter, -1); + ctxt->states[newto], counter, -1, 1); } } } @@ -1583,6 +1648,89 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, } /** + * xmlFAEliminateSimpleEpsilonTransitions: + * @ctxt: a regexp parser context + * + * Eliminating general epsilon transitions can get costly in the general + * algorithm due to the large amount of generated new transitions and + * associated comparisons. However for simple epsilon transition used just + * to separate building blocks when generating the automata this can be + * reduced to state elimination: + * - if there exists an epsilon from X to Y + * - if there is no other transition from X + * then X and Y are semantically equivalent and X can be eliminated + * If X is the start state then make Y the start state, else replace the + * target of all transitions to X by transitions to Y. + */ +static void +xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { + int statenr, i, j, newto; + xmlRegStatePtr state, tmp; + + for (statenr = 0;statenr < ctxt->nbStates;statenr++) { + state = ctxt->states[statenr]; + if (state == NULL) + continue; + if (state->nbTrans != 1) + continue; + /* is the only transition out a basic transition */ + if ((state->trans[0].atom == NULL) && + (state->trans[0].to >= 0) && + (state->trans[0].to != statenr) && + (state->trans[0].counter < 0) && + (state->trans[0].count < 0)) { + newto = state->trans[0].to; + + if (state->type == XML_REGEXP_START_STATE) { +#ifdef DEBUG_REGEXP_GRAPH + printf("Found simple epsilon trans from start %d to %d\n", + statenr, newto); +#endif + } else { +#ifdef DEBUG_REGEXP_GRAPH + printf("Found simple epsilon trans from %d to %d\n", + statenr, newto); +#endif + for (i = 0;i < state->nbTransTo;i++) { + tmp = ctxt->states[state->transTo[i]]; + for (j = 0;j < tmp->nbTrans;j++) { + if (tmp->trans[j].to == statenr) { + tmp->trans[j].to = newto; +#ifdef DEBUG_REGEXP_GRAPH + printf("Changed transition %d on %d to go to %d\n", + j, tmp->no, newto); +#endif + xmlRegStateAddTransTo(ctxt, ctxt->states[newto], + tmp->no); + } + } + } +#if 0 + for (i = 0;i < ctxt->nbStates;i++) { + tmp = ctxt->states[i]; + for (j = 0;j < tmp->nbTrans;j++) { + if (tmp->trans[j].to == statenr) { + tmp->trans[j].to = newto; +#ifdef DEBUG_REGEXP_GRAPH + printf("Changed transition %d on %d to go to %d\n", + j, tmp->no, newto); +#endif + } + } + } +#endif + if (state->type == XML_REGEXP_FINAL_STATE) + ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; + /* eliminate the transition completely */ + state->nbTrans = 0; + + + } + + } + } +} +/** * xmlFAEliminateEpsilonTransitions: * @ctxt: a regexp parser context * @@ -1591,9 +1739,13 @@ static void xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { int statenr, transnr; xmlRegStatePtr state; + int has_epsilon; if (ctxt->states == NULL) return; + xmlFAEliminateSimpleEpsilonTransitions(ctxt); + + has_epsilon = 0; /* * build the completed transitions bypassing the epsilons @@ -1625,6 +1777,7 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { transnr, statenr, newto); #endif state->mark = XML_REGEXP_MARK_START; + has_epsilon = 1; xmlFAReduceEpsilonTransitions(ctxt, statenr, newto, state->trans[transnr].counter); state->mark = XML_REGEXP_MARK_NORMAL; @@ -1640,15 +1793,18 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { /* * Eliminate the epsilon transitions */ - for (statenr = 0;statenr < ctxt->nbStates;statenr++) { - state = ctxt->states[statenr]; - if (state == NULL) - continue; - for (transnr = 0;transnr < state->nbTrans;transnr++) { - if ((state->trans[transnr].atom == NULL) && - (state->trans[transnr].count < 0) && - (state->trans[transnr].to >= 0)) { - state->trans[transnr].to = -1; + if (has_epsilon) { + for (statenr = 0;statenr < ctxt->nbStates;statenr++) { + state = ctxt->states[statenr]; + if (state == NULL) + continue; + for (transnr = 0;transnr < state->nbTrans;transnr++) { + xmlRegTransPtr trans = &(state->trans[transnr]); + if ((trans->atom == NULL) && + (trans->count < 0) && + (trans->to >= 0)) { + trans->to = -1; + } } } } @@ -1724,6 +1880,8 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { */ static int xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { + int ret; + if (atom1 == atom2) return(1); if ((atom1 == NULL) || (atom2 == NULL)) @@ -1733,19 +1891,24 @@ xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { return(0); switch (atom1->type) { case XML_REGEXP_STRING: - return(xmlStrEqual((xmlChar *)atom1->valuep, - (xmlChar *)atom2->valuep)); + ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, + (xmlChar *)atom2->valuep); + break; case XML_REGEXP_EPSILON: return(1); case XML_REGEXP_CHARVAL: - return(atom1->codepoint == atom2->codepoint); + ret = atom1->codepoint == atom2->codepoint; + break; case XML_REGEXP_RANGES: TODO; return(0); default: - break; + return(1); } - return(1); + if (atom1->neg != atom2->neg) { + ret = !ret; + } + return(ret); } /** @@ -1818,6 +1981,8 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { state = ctxt->states[statenr]; if (state == NULL) continue; + if (state->nbTrans < 2) + continue; for (transnr = 0;transnr < state->nbTrans;transnr++) { t1 = &(state->trans[transnr]); /* @@ -2420,6 +2585,14 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { #endif exec->counts[trans->counter]++; } + if ((trans->count >= 0) && + (trans->count < REGEXP_ALL_COUNTER)) { +#ifdef DEBUG_REGEXP_EXEC + printf("resetting count %d on transition\n", + trans->count); +#endif + exec->counts[trans->count] = 0; + } #ifdef DEBUG_REGEXP_EXEC printf("entering state %d\n", trans->to); #endif @@ -2633,6 +2806,14 @@ xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { * Eval if we have a wildcard for the current item. */ if (*expStr != *valStr) { + /* if one of them starts with a wildcard make valStr be it */ + if (*valStr == '*') { + const xmlChar *tmp; + + tmp = valStr; + valStr = expStr; + expStr = tmp; + } if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { do { if (*valStr == XML_REG_STRING_SEPARATOR) @@ -2736,19 +2917,20 @@ error: } /** - * xmlRegExecPushString: + * xmlRegExecPushStringInternal: * @exec: a regexp execution context or NULL to indicate the end * @value: a string token input * @data: data associated to the token to reuse in callbacks + * @compound: value was assembled from 2 strings * * Push one input token in the execution context * * Returns: 1 if the regexp reached a final state, 0 if non-final, and * a negative value in case of error. */ -int -xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, - void *data) { +static int +xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, + void *data, int compound) { xmlRegTransPtr trans; xmlRegAtomPtr atom; int ret; @@ -2890,6 +3072,11 @@ xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, break; } else if (value != NULL) { ret = xmlRegStrEqualWildcard(atom->valuep, value); + if (atom->neg) { + ret = !ret; + if (!compound) + ret = 0; + } if ((ret == 1) && (trans->counter >= 0)) { xmlRegCounterPtr counter; int count; @@ -2985,6 +3172,14 @@ xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, #endif exec->counts[trans->counter]++; } + if ((trans->count >= 0) && + (trans->count < REGEXP_ALL_COUNTER)) { +#ifdef DEBUG_REGEXP_EXEC + printf("resetting count %d on transition\n", + trans->count); +#endif + exec->counts[trans->count] = 0; + } #ifdef DEBUG_PUSH printf("entering state %d\n", trans->to); #endif @@ -3081,6 +3276,23 @@ progress: } /** + * xmlRegExecPushString: + * @exec: a regexp execution context or NULL to indicate the end + * @value: a string token input + * @data: data associated to the token to reuse in callbacks + * + * Push one input token in the execution context + * + * Returns: 1 if the regexp reached a final state, 0 if non-final, and + * a negative value in case of error. + */ +int +xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, + void *data) { + return(xmlRegExecPushStringInternal(exec, value, data, 0)); +} + +/** * xmlRegExecPushString2: * @exec: a regexp execution context or NULL to indicate the end * @value: the first string token input @@ -3129,7 +3341,7 @@ xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, if (exec->comp->compact != NULL) ret = xmlRegCompactPushString(exec, exec->comp, str, data); else - ret = xmlRegExecPushString(exec, str, data); + ret = xmlRegExecPushStringInternal(exec, str, data, 1); if (str != buf) xmlFree(buf); @@ -3137,7 +3349,7 @@ xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, } /** - * xmlRegExecGetalues: + * xmlRegExecGetValues: * @exec: a regexp execution context * @err: error extraction or normal one * @nbval: pointer to the number of accepted values IN/OUT @@ -3246,14 +3458,20 @@ xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, count = exec->counts[trans->counter]; counter = &exec->comp->counters[trans->counter]; if (count < counter->max) { - values[nb++] = (xmlChar *) atom->valuep; + if (atom->neg) + values[nb++] = (xmlChar *) atom->valuep2; + else + values[nb++] = (xmlChar *) atom->valuep; (*nbval)++; } } else { if ((exec->comp->states[trans->to] != NULL) && (exec->comp->states[trans->to]->type != XML_REGEXP_SINK_STATE)) { - values[nb++] = (xmlChar *) atom->valuep; + if (atom->neg) + values[nb++] = (xmlChar *) atom->valuep2; + else + values[nb++] = (xmlChar *) atom->valuep; (*nbval)++; } } @@ -3277,7 +3495,10 @@ xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, if ((exec->comp->states[trans->to] != NULL) && (exec->comp->states[trans->to]->type == XML_REGEXP_SINK_STATE)) { - values[nb++] = (xmlChar *) atom->valuep; + if (atom->neg) + values[nb++] = (xmlChar *) atom->valuep2; + else + values[nb++] = (xmlChar *) atom->valuep; (*nbneg)++; } } @@ -3815,8 +4036,21 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { (cur == 0x5E)) { if (ctxt->atom == NULL) { ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); - if (ctxt->atom != NULL) - ctxt->atom->codepoint = cur; + if (ctxt->atom != NULL) { + switch (cur) { + case 'n': + ctxt->atom->codepoint = '\n'; + break; + case 'r': + ctxt->atom->codepoint = '\r'; + break; + case 't': + ctxt->atom->codepoint = '\t'; + break; + default: + ctxt->atom->codepoint = cur; + } + } } else if (ctxt->atom->type == XML_REGEXP_RANGES) { xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, XML_REGEXP_CHARVAL, cur, cur, NULL); @@ -4550,6 +4784,7 @@ xmlNewAutomata(void) { /* initialize the parser */ ctxt->end = NULL; ctxt->start = ctxt->state = xmlRegNewState(ctxt); + ctxt->start->type = XML_REGEXP_START_STATE; if (ctxt->start == NULL) { xmlFreeAutomata(ctxt); return(NULL); @@ -4706,6 +4941,72 @@ xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, } /** + * xmlAutomataNewNegTrans: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * @token: the first input string associated to that transition + * @token2: the second input string associated to that transition + * @data: data passed to the callback function if the transition is activated + * + * If @to is NULL, this creates first a new target state in the automata + * and then adds a transition from the @from state to the target state + * activated by any value except (@token,@token2) + * Note that if @token2 is not NULL, then (X, NULL) won't match to follow + # the semantic of XSD ##other + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, + xmlAutomataStatePtr to, const xmlChar *token, + const xmlChar *token2, void *data) { + xmlRegAtomPtr atom; + xmlChar err_msg[200]; + + if ((am == NULL) || (from == NULL) || (token == NULL)) + return(NULL); + atom = xmlRegNewAtom(am, XML_REGEXP_STRING); + if (atom == NULL) + return(NULL); + atom->data = data; + atom->neg = 1; + if ((token2 == NULL) || (*token2 == 0)) { + atom->valuep = xmlStrdup(token); + } else { + int lenn, lenp; + xmlChar *str; + + lenn = strlen((char *) token2); + lenp = strlen((char *) token); + + str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); + if (str == NULL) { + xmlRegFreeAtom(atom); + return(NULL); + } + memcpy(&str[0], token, lenp); + str[lenp] = '|'; + memcpy(&str[lenp + 1], token2, lenn); + str[lenn + lenp + 1] = 0; + + atom->valuep = str; + } + snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); + err_msg[199] = 0; + atom->valuep2 = xmlStrdup(err_msg); + + if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { + xmlRegFreeAtom(atom); + return(NULL); + } + am->negs++; + if (to == NULL) + return(am->state); + return(to); +} + +/** * xmlAutomataNewCountTrans2: * @am: an automata * @from: the starting point of the transition @@ -4780,7 +5081,7 @@ xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1); + xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); xmlRegAtomPush(am, atom); am->state = to; @@ -4846,7 +5147,7 @@ xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1); + xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); xmlRegAtomPush(am, atom); am->state = to; @@ -4935,7 +5236,7 @@ xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1); + xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); xmlRegAtomPush(am, atom); am->state = to; return(to); @@ -4997,7 +5298,7 @@ xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, to = xmlRegNewState(am); xmlRegStatePush(am, to); } - xmlRegStateAddTrans(am, from, atom, to, counter, -1); + xmlRegStateAddTrans(am, from, atom, to, counter, -1, 0); xmlRegAtomPush(am, atom); am->state = to; return(to); @@ -5183,6 +5484,1844 @@ xmlAutomataIsDeterminist(xmlAutomataPtr am) { return(ret); } #endif /* LIBXML_AUTOMATA_ENABLED */ + +#ifdef LIBXML_EXPR_ENABLED +/************************************************************************ + * * + * Formal Expression handling code * + * * + ************************************************************************/ +/************************************************************************ + * * + * Expression handling context * + * * + ************************************************************************/ + +struct _xmlExpCtxt { + xmlDictPtr dict; + xmlExpNodePtr *table; + int size; + int nbElems; + int nb_nodes; + const char *expr; + const char *cur; + int nb_cons; + int tabSize; +}; + +/** + * xmlExpNewCtxt: + * @maxNodes: the maximum number of nodes + * @dict: optional dictionnary to use internally + * + * Creates a new context for manipulating expressions + * + * Returns the context or NULL in case of error + */ +xmlExpCtxtPtr +xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { + xmlExpCtxtPtr ret; + int size = 256; + + if (maxNodes <= 4096) + maxNodes = 4096; + + ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); + if (ret == NULL) + return(NULL); + memset(ret, 0, sizeof(xmlExpCtxt)); + ret->size = size; + ret->nbElems = 0; + ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); + if (ret->table == NULL) { + xmlFree(ret); + return(NULL); + } + memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); + if (dict == NULL) { + ret->dict = xmlDictCreate(); + if (ret->dict == NULL) { + xmlFree(ret->table); + xmlFree(ret); + return(NULL); + } + } else { + ret->dict = dict; + xmlDictReference(ret->dict); + } + return(ret); +} + +/** + * xmlExpFreeCtxt: + * @ctxt: an expression context + * + * Free an expression context + */ +void +xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { + if (ctxt == NULL) + return; + xmlDictFree(ctxt->dict); + if (ctxt->table != NULL) + xmlFree(ctxt->table); + xmlFree(ctxt); +} + +/************************************************************************ + * * + * Structure associated to an expression node * + * * + ************************************************************************/ +#define MAX_NODES 10000 + +/* #define DEBUG_DERIV */ + +/* + * TODO: + * - Wildcards + * - public API for creation + * + * Started + * - regression testing + * + * Done + * - split into module and test tool + * - memleaks + */ + +typedef enum { + XML_EXP_NILABLE = (1 << 0) +} xmlExpNodeInfo; + +#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) + +struct _xmlExpNode { + unsigned char type;/* xmlExpNodeType */ + unsigned char info;/* OR of xmlExpNodeInfo */ + unsigned short key; /* the hash key */ + unsigned int ref; /* The number of references */ + int c_max; /* the maximum length it can consume */ + xmlExpNodePtr exp_left; + xmlExpNodePtr next;/* the next node in the hash table or free list */ + union { + struct { + int f_min; + int f_max; + } count; + struct { + xmlExpNodePtr f_right; + } children; + const xmlChar *f_str; + } field; +}; + +#define exp_min field.count.f_min +#define exp_max field.count.f_max +/* #define exp_left field.children.f_left */ +#define exp_right field.children.f_right +#define exp_str field.f_str + +static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); +static xmlExpNode forbiddenExpNode = { + XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} +}; +xmlExpNodePtr forbiddenExp = &forbiddenExpNode; +static xmlExpNode emptyExpNode = { + XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} +}; +xmlExpNodePtr emptyExp = &emptyExpNode; + +/************************************************************************ + * * + * The custom hash table for unicity and canonicalization * + * of sub-expressions pointers * + * * + ************************************************************************/ +/* + * xmlExpHashNameComputeKey: + * Calculate the hash key for a token + */ +static unsigned short +xmlExpHashNameComputeKey(const xmlChar *name) { + unsigned short value = 0L; + char ch; + + if (name != NULL) { + value += 30 * (*name); + while ((ch = *name++) != 0) { + value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); + } + } + return (value); +} + +/* + * xmlExpHashComputeKey: + * Calculate the hash key for a compound expression + */ +static unsigned short +xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, + xmlExpNodePtr right) { + unsigned long value; + unsigned short ret; + + switch (type) { + case XML_EXP_SEQ: + value = left->key; + value += right->key; + value *= 3; + ret = (unsigned short) value; + break; + case XML_EXP_OR: + value = left->key; + value += right->key; + value *= 7; + ret = (unsigned short) value; + break; + case XML_EXP_COUNT: + value = left->key; + value += right->key; + ret = (unsigned short) value; + break; + default: + ret = 0; + } + return(ret); +} + + +static xmlExpNodePtr +xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { + xmlExpNodePtr ret; + + if (ctxt->nb_nodes >= MAX_NODES) + return(NULL); + ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); + if (ret == NULL) + return(NULL); + memset(ret, 0, sizeof(xmlExpNode)); + ret->type = type; + ret->next = NULL; + ctxt->nb_nodes++; + ctxt->nb_cons++; + return(ret); +} + +/** + * xmlExpHashGetEntry: + * @table: the hash table + * + * Get the unique entry from the hash table. The entry is created if + * needed. @left and @right are consumed, i.e. their ref count will + * be decremented by the operation. + * + * Returns the pointer or NULL in case of error + */ +static xmlExpNodePtr +xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, + xmlExpNodePtr left, xmlExpNodePtr right, + const xmlChar *name, int min, int max) { + unsigned short kbase, key; + xmlExpNodePtr entry; + xmlExpNodePtr insert; + + if (ctxt == NULL) + return(NULL); + + /* + * Check for duplicate and insertion location. + */ + if (type == XML_EXP_ATOM) { + kbase = xmlExpHashNameComputeKey(name); + } else if (type == XML_EXP_COUNT) { + /* COUNT reduction rule 1 */ + /* a{1} -> a */ + if (min == max) { + if (min == 1) { + return(left); + } + if (min == 0) { + xmlExpFree(ctxt, left); + return(emptyExp); + } + } + if (min < 0) { + xmlExpFree(ctxt, left); + return(forbiddenExp); + } + if (max == -1) + kbase = min + 79; + else + kbase = max - min; + kbase += left->key; + } else if (type == XML_EXP_OR) { + /* Forbid reduction rules */ + if (left->type == XML_EXP_FORBID) { + xmlExpFree(ctxt, left); + return(right); + } + if (right->type == XML_EXP_FORBID) { + xmlExpFree(ctxt, right); + return(left); + } + + /* OR reduction rule 1 */ + /* a | a reduced to a */ + if (left == right) { + left->ref--; + return(left); + } + /* OR canonicalization rule 1 */ + /* linearize (a | b) | c into a | (b | c) */ + if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { + xmlExpNodePtr tmp = left; + left = right; + right = tmp; + } + /* OR reduction rule 2 */ + /* a | (a | b) and b | (a | b) are reduced to a | b */ + if (right->type == XML_EXP_OR) { + if ((left == right->exp_left) || + (left == right->exp_right)) { + xmlExpFree(ctxt, left); + return(right); + } + } + /* OR canonicalization rule 2 */ + /* linearize (a | b) | c into a | (b | c) */ + if (left->type == XML_EXP_OR) { + xmlExpNodePtr tmp; + + /* OR canonicalization rule 2 */ + if ((left->exp_right->type != XML_EXP_OR) && + (left->exp_right->key < left->exp_left->key)) { + tmp = left->exp_right; + left->exp_right = left->exp_left; + left->exp_left = tmp; + } + left->exp_right->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, + NULL, 0, 0); + left->exp_left->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, + NULL, 0, 0); + + xmlExpFree(ctxt, left); + return(tmp); + } + if (right->type == XML_EXP_OR) { + /* Ordering in the tree */ + /* C | (A | B) -> A | (B | C) */ + if (left->key > right->exp_right->key) { + xmlExpNodePtr tmp; + right->exp_right->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, + left, NULL, 0, 0); + right->exp_left->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, + tmp, NULL, 0, 0); + xmlExpFree(ctxt, right); + return(tmp); + } + /* Ordering in the tree */ + /* B | (A | C) -> A | (B | C) */ + if (left->key > right->exp_left->key) { + xmlExpNodePtr tmp; + right->exp_right->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, + right->exp_right, NULL, 0, 0); + right->exp_left->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, + tmp, NULL, 0, 0); + xmlExpFree(ctxt, right); + return(tmp); + } + } + /* we know both types are != XML_EXP_OR here */ + else if (left->key > right->key) { + xmlExpNodePtr tmp = left; + left = right; + right = tmp; + } + kbase = xmlExpHashComputeKey(type, left, right); + } else if (type == XML_EXP_SEQ) { + /* Forbid reduction rules */ + if (left->type == XML_EXP_FORBID) { + xmlExpFree(ctxt, right); + return(left); + } + if (right->type == XML_EXP_FORBID) { + xmlExpFree(ctxt, left); + return(right); + } + /* Empty reduction rules */ + if (right->type == XML_EXP_EMPTY) { + return(left); + } + if (left->type == XML_EXP_EMPTY) { + return(right); + } + kbase = xmlExpHashComputeKey(type, left, right); + } else + return(NULL); + + key = kbase % ctxt->size; + if (ctxt->table[key] != NULL) { + for (insert = ctxt->table[key]; insert != NULL; + insert = insert->next) { + if ((insert->key == kbase) && + (insert->type == type)) { + if (type == XML_EXP_ATOM) { + if (name == insert->exp_str) { + insert->ref++; + return(insert); + } + } else if (type == XML_EXP_COUNT) { + if ((insert->exp_min == min) && (insert->exp_max == max) && + (insert->exp_left == left)) { + insert->ref++; + left->ref--; + return(insert); + } + } else if ((insert->exp_left == left) && + (insert->exp_right == right)) { + insert->ref++; + left->ref--; + right->ref--; + return(insert); + } + } + } + } + + entry = xmlExpNewNode(ctxt, type); + if (entry == NULL) + return(NULL); + entry->key = kbase; + if (type == XML_EXP_ATOM) { + entry->exp_str = name; + entry->c_max = 1; + } else if (type == XML_EXP_COUNT) { + entry->exp_min = min; + entry->exp_max = max; + entry->exp_left = left; + if ((min == 0) || (IS_NILLABLE(left))) + entry->info |= XML_EXP_NILABLE; + if (max < 0) + entry->c_max = -1; + else + entry->c_max = max * entry->exp_left->c_max; + } else { + entry->exp_left = left; + entry->exp_right = right; + if (type == XML_EXP_OR) { + if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) + entry->info |= XML_EXP_NILABLE; + if ((entry->exp_left->c_max == -1) || + (entry->exp_right->c_max == -1)) + entry->c_max = -1; + else if (entry->exp_left->c_max > entry->exp_right->c_max) + entry->c_max = entry->exp_left->c_max; + else + entry->c_max = entry->exp_right->c_max; + } else { + if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) + entry->info |= XML_EXP_NILABLE; + if ((entry->exp_left->c_max == -1) || + (entry->exp_right->c_max == -1)) + entry->c_max = -1; + else + entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; + } + } + entry->ref = 1; + if (ctxt->table[key] != NULL) + entry->next = ctxt->table[key]; + + ctxt->table[key] = entry; + ctxt->nbElems++; + + return(entry); +} + +/** + * xmlExpFree: + * @ctxt: the expression context + * @exp: the expression + * + * Dereference the expression + */ +void +xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { + if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) + return; + exp->ref--; + if (exp->ref == 0) { + unsigned short key; + + /* Unlink it first from the hash table */ + key = exp->key % ctxt->size; + if (ctxt->table[key] == exp) { + ctxt->table[key] = exp->next; + } else { + xmlExpNodePtr tmp; + + tmp = ctxt->table[key]; + while (tmp != NULL) { + if (tmp->next == exp) { + tmp->next = exp->next; + break; + } + tmp = tmp->next; + } + } + + if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { + xmlExpFree(ctxt, exp->exp_left); + xmlExpFree(ctxt, exp->exp_right); + } else if (exp->type == XML_EXP_COUNT) { + xmlExpFree(ctxt, exp->exp_left); + } + xmlFree(exp); + ctxt->nb_nodes--; + } +} + +/** + * xmlExpRef: + * @exp: the expression + * + * Increase the reference count of the expression + */ +void +xmlExpRef(xmlExpNodePtr exp) { + if (exp != NULL) + exp->ref++; +} + +/** + * xmlExpNewAtom: + * @ctxt: the expression context + * @name: the atom name + * @len: the atom name lenght in byte (or -1); + * + * Get the atom associated to this name from that context + * + * Returns the node or NULL in case of error + */ +xmlExpNodePtr +xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { + if ((ctxt == NULL) || (name == NULL)) + return(NULL); + name = xmlDictLookup(ctxt->dict, name, len); + if (name == NULL) + return(NULL); + return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); +} + +/** + * xmlExpNewOr: + * @ctxt: the expression context + * @left: left expression + * @right: right expression + * + * Get the atom associated to the choice @left | @right + * Note that @left and @right are consumed in the operation, to keep + * an handle on them use xmlExpRef() and use xmlExpFree() to release them, + * this is true even in case of failure (unless ctxt == NULL). + * + * Returns the node or NULL in case of error + */ +xmlExpNodePtr +xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { + if ((ctxt == NULL) || (left == NULL) || (right == NULL)) { + xmlExpFree(ctxt, left); + xmlExpFree(ctxt, right); + return(NULL); + } + return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); +} + +/** + * xmlExpNewSeq: + * @ctxt: the expression context + * @left: left expression + * @right: right expression + * + * Get the atom associated to the sequence @left , @right + * Note that @left and @right are consumed in the operation, to keep + * an handle on them use xmlExpRef() and use xmlExpFree() to release them, + * this is true even in case of failure (unless ctxt == NULL). + * + * Returns the node or NULL in case of error + */ +xmlExpNodePtr +xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { + if ((ctxt == NULL) || (left == NULL) || (right == NULL)) { + xmlExpFree(ctxt, left); + xmlExpFree(ctxt, right); + return(NULL); + } + return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); +} + +/** + * xmlExpNewRange: + * @ctxt: the expression context + * @subset: the expression to be repeated + * @min: the lower bound for the repetition + * @max: the upper bound for the repetition, -1 means infinite + * + * Get the atom associated to the range (@subset){@min, @max} + * Note that @subset is consumed in the operation, to keep + * an handle on it use xmlExpRef() and use xmlExpFree() to release it, + * this is true even in case of failure (unless ctxt == NULL). + * + * Returns the node or NULL in case of error + */ +xmlExpNodePtr +xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { + if ((ctxt == NULL) || (subset == NULL) || (min < 0) || (max < -1) || + ((max >= 0) && (min > max))) { + xmlExpFree(ctxt, subset); + return(NULL); + } + return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, + NULL, NULL, min, max)); +} + +/************************************************************************ + * * + * Public API for operations on expressions * + * * + ************************************************************************/ + +static int +xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, + const xmlChar**list, int len, int nb) { + int tmp, tmp2; +tail: + switch (exp->type) { + case XML_EXP_EMPTY: + return(0); + case XML_EXP_ATOM: + for (tmp = 0;tmp < nb;tmp++) + if (list[tmp] == exp->exp_str) + return(0); + if (nb >= len) + return(-2); + list[nb++] = exp->exp_str; + return(1); + case XML_EXP_COUNT: + exp = exp->exp_left; + goto tail; + case XML_EXP_SEQ: + case XML_EXP_OR: + tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); + if (tmp < 0) + return(tmp); + tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, + nb + tmp); + if (tmp2 < 0) + return(tmp2); + return(tmp + tmp2); + } + return(-1); +} + +/** + * xmlExpGetLanguage: + * @ctxt: the expression context + * @exp: the expression + * @list: where to store the tokens + * @len: the allocated lenght of @list + * + * Find all the strings used in @exp and store them in @list + * + * Returns the number of unique strings found, -1 in case of errors and + * -2 if there is more than @len strings + */ +int +xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, + const xmlChar**list, int len) { + if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0)) + return(-1); + return(xmlExpGetLanguageInt(ctxt, exp, list, len, 0)); +} + +static int +xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, + const xmlChar**list, int len, int nb) { + int tmp, tmp2; +tail: + switch (exp->type) { + case XML_EXP_FORBID: + return(0); + case XML_EXP_EMPTY: + return(0); + case XML_EXP_ATOM: + for (tmp = 0;tmp < nb;tmp++) + if (list[tmp] == exp->exp_str) + return(0); + if (nb >= len) + return(-2); + list[nb++] = exp->exp_str; + return(1); + case XML_EXP_COUNT: + exp = exp->exp_left; + goto tail; + case XML_EXP_SEQ: + tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); + if (tmp < 0) + return(tmp); + if (IS_NILLABLE(exp->exp_left)) { + tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, + nb + tmp); + if (tmp2 < 0) + return(tmp2); + tmp += tmp2; + } + return(tmp); + case XML_EXP_OR: + tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); + if (tmp < 0) + return(tmp); + tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, + nb + tmp); + if (tmp2 < 0) + return(tmp2); + return(tmp + tmp2); + } + return(-1); +} + +/** + * xmlExpGetStart: + * @ctxt: the expression context + * @exp: the expression + * @list: where to store the tokens + * @len: the allocated lenght of @list + * + * Find all the strings that appears at the start of the languages + * accepted by @exp and store them in @list. E.g. for (a, b) | c + * it will return the list [a, c] + * + * Returns the number of unique strings found, -1 in case of errors and + * -2 if there is more than @len strings + */ +int +xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, + const xmlChar**list, int len) { + if ((ctxt == NULL) || (exp == NULL) || (list == NULL) || (len <= 0)) + return(-1); + return(xmlExpGetStartInt(ctxt, exp, list, len, 0)); +} + +/** + * xmlExpIsNillable: + * @exp: the expression + * + * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce + * + * Returns 1 if nillable, 0 if not and -1 in case of error + */ +int +xmlExpIsNillable(xmlExpNodePtr exp) { + if (exp == NULL) + return(-1); + return(IS_NILLABLE(exp) != 0); +} + +static xmlExpNodePtr +xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) +{ + xmlExpNodePtr ret; + + switch (exp->type) { + case XML_EXP_EMPTY: + return(forbiddenExp); + case XML_EXP_FORBID: + return(forbiddenExp); + case XML_EXP_ATOM: + if (exp->exp_str == str) { +#ifdef DEBUG_DERIV + printf("deriv atom: equal => Empty\n"); +#endif + ret = emptyExp; + } else { +#ifdef DEBUG_DERIV + printf("deriv atom: mismatch => forbid\n"); +#endif + /* TODO wildcards here */ + ret = forbiddenExp; + } + return(ret); + case XML_EXP_OR: { + xmlExpNodePtr tmp; + +#ifdef DEBUG_DERIV + printf("deriv or: => or(derivs)\n"); +#endif + tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); + if (tmp == NULL) { + return(NULL); + } + ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); + if (ret == NULL) { + xmlExpFree(ctxt, tmp); + return(NULL); + } + ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, + NULL, 0, 0); + return(ret); + } + case XML_EXP_SEQ: +#ifdef DEBUG_DERIV + printf("deriv seq: starting with left\n"); +#endif + ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); + if (ret == NULL) { + return(NULL); + } else if (ret == forbiddenExp) { + if (IS_NILLABLE(exp->exp_left)) { +#ifdef DEBUG_DERIV + printf("deriv seq: left failed but nillable\n"); +#endif + ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); + } + } else { +#ifdef DEBUG_DERIV + printf("deriv seq: left match => sequence\n"); +#endif + exp->exp_right->ref++; + ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, + NULL, 0, 0); + } + return(ret); + case XML_EXP_COUNT: { + int min, max; + xmlExpNodePtr tmp; + + if (exp->exp_max == 0) + return(forbiddenExp); + ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); + if (ret == NULL) + return(NULL); + if (ret == forbiddenExp) { +#ifdef DEBUG_DERIV + printf("deriv count: pattern mismatch => forbid\n"); +#endif + return(ret); + } + if (exp->exp_max == 1) + return(ret); + if (exp->exp_max < 0) /* unbounded */ + max = -1; + else + max = exp->exp_max - 1; + if (exp->exp_min > 0) + min = exp->exp_min - 1; + else + min = 0; + exp->exp_left->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, + NULL, min, max); + if (ret == emptyExp) { +#ifdef DEBUG_DERIV + printf("deriv count: match to empty => new count\n"); +#endif + return(tmp); + } +#ifdef DEBUG_DERIV + printf("deriv count: match => sequence with new count\n"); +#endif + return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, + NULL, 0, 0)); + } + } + return(NULL); +} + +/** + * xmlExpStringDerive: + * @ctxt: the expression context + * @exp: the expression + * @str: the string + * @len: the string len in bytes if available + * + * Do one step of Brzozowski derivation of the expression @exp with + * respect to the input string + * + * Returns the resulting expression or NULL in case of internal error + */ +xmlExpNodePtr +xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, + const xmlChar *str, int len) { + const xmlChar *input; + + if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { + return(NULL); + } + /* + * check the string is in the dictionnary, if yes use an interned + * copy, otherwise we know it's not an acceptable input + */ + input = xmlDictExists(ctxt->dict, str, len); + if (input == NULL) { + return(forbiddenExp); + } + return(xmlExpStringDeriveInt(ctxt, exp, input)); +} + +static int +xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { + int ret = 1; + + if (sub->c_max == -1) { + if (exp->c_max != -1) + ret = 0; + } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { + ret = 0; + } +#if 0 + if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) + ret = 0; +#endif + return(ret); +} + +static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, + xmlExpNodePtr sub); +/** + * xmlExpDivide: + * @ctxt: the expressions context + * @exp: the englobing expression + * @sub: the subexpression + * @mult: the multiple expression + * @remain: the remain from the derivation of the multiple + * + * Check if exp is a multiple of sub, i.e. if there is a finite number n + * so that sub{n} subsume exp + * + * Returns the multiple value if successful, 0 if it is not a multiple + * and -1 in case of internel error. + */ + +static int +xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, + xmlExpNodePtr *mult, xmlExpNodePtr *remain) { + int i; + xmlExpNodePtr tmp, tmp2; + + if (mult != NULL) *mult = NULL; + if (remain != NULL) *remain = NULL; + if (exp->c_max == -1) return(0); + if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); + + for (i = 1;i <= exp->c_max;i++) { + sub->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, + sub, NULL, NULL, i, i); + if (tmp == NULL) { + return(-1); + } + if (!xmlExpCheckCard(tmp, exp)) { + xmlExpFree(ctxt, tmp); + continue; + } + tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); + if (tmp2 == NULL) { + xmlExpFree(ctxt, tmp); + return(-1); + } + if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { + if (remain != NULL) + *remain = tmp2; + else + xmlExpFree(ctxt, tmp2); + if (mult != NULL) + *mult = tmp; + else + xmlExpFree(ctxt, tmp); +#ifdef DEBUG_DERIV + printf("Divide succeeded %d\n", i); +#endif + return(i); + } + xmlExpFree(ctxt, tmp); + xmlExpFree(ctxt, tmp2); + } +#ifdef DEBUG_DERIV + printf("Divide failed\n"); +#endif + return(0); +} + +/** + * xmlExpExpDeriveInt: + * @ctxt: the expressions context + * @exp: the englobing expression + * @sub: the subexpression + * + * Try to do a step of Brzozowski derivation but at a higher level + * the input being a subexpression. + * + * Returns the resulting expression or NULL in case of internal error + */ +static xmlExpNodePtr +xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { + xmlExpNodePtr ret, tmp, tmp2, tmp3; + const xmlChar **tab; + int len, i; + + /* + * In case of equality and if the expression can only consume a finite + * amount, then the derivation is empty + */ + if ((exp == sub) && (exp->c_max >= 0)) { +#ifdef DEBUG_DERIV + printf("Equal(exp, sub) and finite -> Empty\n"); +#endif + return(emptyExp); + } + /* + * decompose sub sequence first + */ + if (sub->type == XML_EXP_EMPTY) { +#ifdef DEBUG_DERIV + printf("Empty(sub) -> Empty\n"); +#endif + exp->ref++; + return(exp); + } + if (sub->type == XML_EXP_SEQ) { +#ifdef DEBUG_DERIV + printf("Seq(sub) -> decompose\n"); +#endif + tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); + if (tmp == NULL) + return(NULL); + if (tmp == forbiddenExp) + return(tmp); + ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); + xmlExpFree(ctxt, tmp); + return(ret); + } + if (sub->type == XML_EXP_OR) { +#ifdef DEBUG_DERIV + printf("Or(sub) -> decompose\n"); +#endif + tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); + if (tmp == forbiddenExp) + return(tmp); + if (tmp == NULL) + return(NULL); + ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); + if ((ret == NULL) || (ret == forbiddenExp)) { + xmlExpFree(ctxt, tmp); + return(ret); + } + return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); + } + if (!xmlExpCheckCard(exp, sub)) { +#ifdef DEBUG_DERIV + printf("CheckCard(exp, sub) failed -> Forbid\n"); +#endif + return(forbiddenExp); + } + switch (exp->type) { + case XML_EXP_EMPTY: + if (sub == emptyExp) + return(emptyExp); +#ifdef DEBUG_DERIV + printf("Empty(exp) -> Forbid\n"); +#endif + return(forbiddenExp); + case XML_EXP_FORBID: +#ifdef DEBUG_DERIV + printf("Forbid(exp) -> Forbid\n"); +#endif + return(forbiddenExp); + case XML_EXP_ATOM: + if (sub->type == XML_EXP_ATOM) { + /* TODO: handle wildcards */ + if (exp->exp_str == sub->exp_str) { +#ifdef DEBUG_DERIV + printf("Atom match -> Empty\n"); +#endif + return(emptyExp); + } +#ifdef DEBUG_DERIV + printf("Atom mismatch -> Forbid\n"); +#endif + return(forbiddenExp); + } + if ((sub->type == XML_EXP_COUNT) && + (sub->exp_max == 1) && + (sub->exp_left->type == XML_EXP_ATOM)) { + /* TODO: handle wildcards */ + if (exp->exp_str == sub->exp_left->exp_str) { +#ifdef DEBUG_DERIV + printf("Atom match -> Empty\n"); +#endif + return(emptyExp); + } +#ifdef DEBUG_DERIV + printf("Atom mismatch -> Forbid\n"); +#endif + return(forbiddenExp); + } +#ifdef DEBUG_DERIV + printf("Compex exp vs Atom -> Forbid\n"); +#endif + return(forbiddenExp); + case XML_EXP_SEQ: + /* try to get the sequence consumed only if possible */ + if (xmlExpCheckCard(exp->exp_left, sub)) { + /* See if the sequence can be consumed directly */ +#ifdef DEBUG_DERIV + printf("Seq trying left only\n"); +#endif + ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); + if ((ret != forbiddenExp) && (ret != NULL)) { +#ifdef DEBUG_DERIV + printf("Seq trying left only worked\n"); +#endif + /* + * TODO: assumption here that we are determinist + * i.e. we won't get to a nillable exp left + * subset which could be matched by the right + * part too. + * e.g.: (a | b)+,(a | c) and 'a+,a' + */ + exp->exp_right->ref++; + return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, + exp->exp_right, NULL, 0, 0)); + } +#ifdef DEBUG_DERIV + } else { + printf("Seq: left too short\n"); +#endif + } + /* Try instead to decompose */ + if (sub->type == XML_EXP_COUNT) { + int min, max; + +#ifdef DEBUG_DERIV + printf("Seq: sub is a count\n"); +#endif + ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); + if (ret == NULL) + return(NULL); + if (ret != forbiddenExp) { +#ifdef DEBUG_DERIV + printf("Seq , Count match on left\n"); +#endif + if (sub->exp_max < 0) + max = -1; + else + max = sub->exp_max -1; + if (sub->exp_min > 0) + min = sub->exp_min -1; + else + min = 0; + exp->exp_right->ref++; + tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, + exp->exp_right, NULL, 0, 0); + if (tmp == NULL) + return(NULL); + + sub->exp_left->ref++; + tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, + sub->exp_left, NULL, NULL, min, max); + if (tmp2 == NULL) { + xmlExpFree(ctxt, tmp); + return(NULL); + } + ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); + xmlExpFree(ctxt, tmp); + xmlExpFree(ctxt, tmp2); + return(ret); + } + } + /* we made no progress on structured operations */ + break; + case XML_EXP_OR: +#ifdef DEBUG_DERIV + printf("Or , trying both side\n"); +#endif + ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); + if (ret == NULL) + return(NULL); + tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); + if (tmp == NULL) { + xmlExpFree(ctxt, ret); + return(NULL); + } + return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); + case XML_EXP_COUNT: { + int min, max; + + if (sub->type == XML_EXP_COUNT) { + /* + * Try to see if the loop is completely subsumed + */ + tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); + if (tmp == NULL) + return(NULL); + if (tmp == forbiddenExp) { + int mult; + +#ifdef DEBUG_DERIV + printf("Count, Count inner don't subsume\n"); +#endif + mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, + NULL, &tmp); + if (mult <= 0) { +#ifdef DEBUG_DERIV + printf("Count, Count not multiple => forbidden\n"); +#endif + return(forbiddenExp); + } + if (sub->exp_max == -1) { + max = -1; + if (exp->exp_max == -1) { + if (exp->exp_min <= sub->exp_min * mult) + min = 0; + else + min = exp->exp_min - sub->exp_min * mult; + } else { +#ifdef DEBUG_DERIV + printf("Count, Count finite can't subsume infinite\n"); +#endif + xmlExpFree(ctxt, tmp); + return(forbiddenExp); + } + } else { + if (exp->exp_max == -1) { +#ifdef DEBUG_DERIV + printf("Infinite loop consume mult finite loop\n"); +#endif + if (exp->exp_min > sub->exp_min * mult) { + max = -1; + min = exp->exp_min - sub->exp_min * mult; + } else { + max = -1; + min = 0; + } + } else { + if (exp->exp_max < sub->exp_max * mult) { +#ifdef DEBUG_DERIV + printf("loops max mult mismatch => forbidden\n"); +#endif + xmlExpFree(ctxt, tmp); + return(forbiddenExp); + } + if (sub->exp_max * mult > exp->exp_min) + min = 0; + else + min = exp->exp_min - sub->exp_max * mult; + max = exp->exp_max - sub->exp_max * mult; + } + } + } else if (!IS_NILLABLE(tmp)) { + /* + * TODO: loop here to try to grow if working on finite + * blocks. + */ +#ifdef DEBUG_DERIV + printf("Count, Count remain not nillable => forbidden\n"); +#endif + xmlExpFree(ctxt, tmp); + return(forbiddenExp); + } else if (sub->exp_max == -1) { + if (exp->exp_max == -1) { + if (exp->exp_min <= sub->exp_min) { +#ifdef DEBUG_DERIV + printf("Infinite loops Okay => COUNT(0,Inf)\n"); +#endif + max = -1; + min = 0; + } else { +#ifdef DEBUG_DERIV + printf("Infinite loops min => Count(X,Inf)\n"); +#endif + max = -1; + min = exp->exp_min - sub->exp_min; + } + } else if (exp->exp_min > sub->exp_min) { +#ifdef DEBUG_DERIV + printf("loops min mismatch 1 => forbidden ???\n"); +#endif + xmlExpFree(ctxt, tmp); + return(forbiddenExp); + } else { + max = -1; + min = 0; + } + } else { + if (exp->exp_max == -1) { +#ifdef DEBUG_DERIV + printf("Infinite loop consume finite loop\n"); +#endif + if (exp->exp_min > sub->exp_min) { + max = -1; + min = exp->exp_min - sub->exp_min; + } else { + max = -1; + min = 0; + } + } else { + if (exp->exp_max < sub->exp_max) { +#ifdef DEBUG_DERIV + printf("loops max mismatch => forbidden\n"); +#endif + xmlExpFree(ctxt, tmp); + return(forbiddenExp); + } + if (sub->exp_max > exp->exp_min) + min = 0; + else + min = exp->exp_min - sub->exp_max; + max = exp->exp_max - sub->exp_max; + } + } +#ifdef DEBUG_DERIV + printf("loops match => SEQ(COUNT())\n"); +#endif + exp->exp_left->ref++; + tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, + NULL, NULL, min, max); + if (tmp2 == NULL) { + return(NULL); + } + ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, + NULL, 0, 0); + return(ret); + } + tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); + if (tmp == NULL) + return(NULL); + if (tmp == forbiddenExp) { +#ifdef DEBUG_DERIV + printf("loop mismatch => forbidden\n"); +#endif + return(forbiddenExp); + } + if (exp->exp_min > 0) + min = exp->exp_min - 1; + else + min = 0; + if (exp->exp_max < 0) + max = -1; + else + max = exp->exp_max - 1; + +#ifdef DEBUG_DERIV + printf("loop match => SEQ(COUNT())\n"); +#endif + exp->exp_left->ref++; + tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, + NULL, NULL, min, max); + if (tmp2 == NULL) + return(NULL); + ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, + NULL, 0, 0); + return(ret); + } + } + +#ifdef DEBUG_DERIV + printf("Fallback to derivative\n"); +#endif + if (IS_NILLABLE(sub)) { + if (!(IS_NILLABLE(exp))) + return(forbiddenExp); + else + ret = emptyExp; + } else + ret = NULL; + /* + * here the structured derivation made no progress so + * we use the default token based derivation to force one more step + */ + if (ctxt->tabSize == 0) + ctxt->tabSize = 40; + + tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * + sizeof(const xmlChar *)); + if (tab == NULL) { + return(NULL); + } + + /* + * collect all the strings accepted by the subexpression on input + */ + len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); + while (len < 0) { + const xmlChar **temp; + temp = (const xmlChar **) xmlRealloc(tab, ctxt->tabSize * 2 * + sizeof(const xmlChar *)); + if (temp == NULL) { + xmlFree(tab); + return(NULL); + } + tab = temp; + ctxt->tabSize *= 2; + len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); + } + for (i = 0;i < len;i++) { + tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); + if ((tmp == NULL) || (tmp == forbiddenExp)) { + xmlExpFree(ctxt, ret); + xmlFree(tab); + return(tmp); + } + tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); + if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { + xmlExpFree(ctxt, tmp); + xmlExpFree(ctxt, ret); + xmlFree(tab); + return(tmp); + } + tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); + xmlExpFree(ctxt, tmp); + xmlExpFree(ctxt, tmp2); + + if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { + xmlExpFree(ctxt, ret); + xmlFree(tab); + return(tmp3); + } + + if (ret == NULL) + ret = tmp3; + else { + ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); + if (ret == NULL) { + xmlFree(tab); + return(NULL); + } + } + } + xmlFree(tab); + return(ret); +} + +/** + * xmlExpExpDerive: + * @ctxt: the expressions context + * @exp: the englobing expression + * @sub: the subexpression + * + * Evaluates the expression resulting from @exp consuming a sub expression @sub + * Based on algebraic derivation and sometimes direct Brzozowski derivation + * it usually tatkes less than linear time and can handle expressions generating + * infinite languages. + * + * Returns the resulting expression or NULL in case of internal error, the + * result must be freed + */ +xmlExpNodePtr +xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { + if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) + return(NULL); + + /* + * O(1) speedups + */ + if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { +#ifdef DEBUG_DERIV + printf("Sub nillable and not exp : can't subsume\n"); +#endif + return(forbiddenExp); + } + if (xmlExpCheckCard(exp, sub) == 0) { +#ifdef DEBUG_DERIV + printf("sub generate longuer sequances than exp : can't subsume\n"); +#endif + return(forbiddenExp); + } + return(xmlExpExpDeriveInt(ctxt, exp, sub)); +} + +/** + * xmlExpSubsume: + * @ctxt: the expressions context + * @exp: the englobing expression + * @sub: the subexpression + * + * Check whether @exp accepts all the languages accexpted by @sub + * the input being a subexpression. + * + * Returns 1 if true 0 if false and -1 in case of failure. + */ +int +xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { + xmlExpNodePtr tmp; + + if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) + return(-1); + + /* + * TODO: speedup by checking the language of sub is a subset of the + * language of exp + */ + /* + * O(1) speedups + */ + if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { +#ifdef DEBUG_DERIV + printf("Sub nillable and not exp : can't subsume\n"); +#endif + return(0); + } + if (xmlExpCheckCard(exp, sub) == 0) { +#ifdef DEBUG_DERIV + printf("sub generate longuer sequances than exp : can't subsume\n"); +#endif + return(0); + } + tmp = xmlExpExpDeriveInt(ctxt, exp, sub); +#ifdef DEBUG_DERIV + printf("Result derivation :\n"); + PRINT_EXP(tmp); +#endif + if (tmp == NULL) + return(-1); + if (tmp == forbiddenExp) + return(0); + if (tmp == emptyExp) + return(1); + if ((tmp != NULL) && (IS_NILLABLE(tmp))) { + xmlExpFree(ctxt, tmp); + return(1); + } + xmlExpFree(ctxt, tmp); + return(0); +} + +/************************************************************************ + * * + * Parsing expression * + * * + ************************************************************************/ + +static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); + +#undef CUR +#define CUR (*ctxt->cur) +#undef NEXT +#define NEXT ctxt->cur++; +#undef IS_BLANK +#define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) +#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; + +static int +xmlExpParseNumber(xmlExpCtxtPtr ctxt) { + int ret = 0; + + SKIP_BLANKS + if (CUR == '*') { + NEXT + return(-1); + } + if ((CUR < '0') || (CUR > '9')) + return(-1); + while ((CUR >= '0') && (CUR <= '9')) { + ret = ret * 10 + (CUR - '0'); + NEXT + } + return(ret); +} + +static xmlExpNodePtr +xmlExpParseOr(xmlExpCtxtPtr ctxt) { + const char *base; + xmlExpNodePtr ret; + const xmlChar *val; + + SKIP_BLANKS + base = ctxt->cur; + if (*ctxt->cur == '(') { + NEXT + ret = xmlExpParseExpr(ctxt); + SKIP_BLANKS + if (*ctxt->cur != ')') { + fprintf(stderr, "unbalanced '(' : %s\n", base); + xmlExpFree(ctxt, ret); + return(NULL); + } + NEXT; + SKIP_BLANKS + goto parse_quantifier; + } + while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && + (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && + (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) + NEXT; + val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); + if (val == NULL) + return(NULL); + ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); + if (ret == NULL) + return(NULL); + SKIP_BLANKS +parse_quantifier: + if (CUR == '{') { + int min, max; + + NEXT + min = xmlExpParseNumber(ctxt); + if (min < 0) { + xmlExpFree(ctxt, ret); + return(NULL); + } + SKIP_BLANKS + if (CUR == ',') { + NEXT + max = xmlExpParseNumber(ctxt); + SKIP_BLANKS + } else + max = min; + if (CUR != '}') { + xmlExpFree(ctxt, ret); + return(NULL); + } + NEXT + ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, + min, max); + SKIP_BLANKS + } else if (CUR == '?') { + NEXT + ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, + 0, 1); + SKIP_BLANKS + } else if (CUR == '+') { + NEXT + ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, + 1, -1); + SKIP_BLANKS + } else if (CUR == '*') { + NEXT + ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, + 0, -1); + SKIP_BLANKS + } + return(ret); +} + + +static xmlExpNodePtr +xmlExpParseSeq(xmlExpCtxtPtr ctxt) { + xmlExpNodePtr ret, right; + + ret = xmlExpParseOr(ctxt); + SKIP_BLANKS + while (CUR == '|') { + NEXT + right = xmlExpParseOr(ctxt); + if (right == NULL) { + xmlExpFree(ctxt, ret); + return(NULL); + } + ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); + if (ret == NULL) + return(NULL); + } + return(ret); +} + +static xmlExpNodePtr +xmlExpParseExpr(xmlExpCtxtPtr ctxt) { + xmlExpNodePtr ret, right; + + ret = xmlExpParseSeq(ctxt); + SKIP_BLANKS + while (CUR == ',') { + NEXT + right = xmlExpParseSeq(ctxt); + if (right == NULL) { + xmlExpFree(ctxt, ret); + return(NULL); + } + ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); + if (ret == NULL) + return(NULL); + } + return(ret); +} + +/** + * xmlExpParse: + * @ctxt: the expressions context + * @expr: the 0 terminated string + * + * Minimal parser for regexps, it understand the following constructs + * - string terminals + * - choice operator | + * - sequence operator , + * - subexpressions (...) + * - usual cardinality operators + * and ? + * - finite sequences { min, max } + * - infinite sequences { min, * } + * There is minimal checkings made especially no checking on strings values + * + * Returns a new expression or NULL in case of failure + */ +xmlExpNodePtr +xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { + xmlExpNodePtr ret; + + ctxt->expr = expr; + ctxt->cur = expr; + + ret = xmlExpParseExpr(ctxt); + SKIP_BLANKS + if (*ctxt->cur != 0) { + xmlExpFree(ctxt, ret); + return(NULL); + } + return(ret); +} + +static void +xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { + xmlExpNodePtr c; + + if (expr == NULL) return; + if (glob) xmlBufferWriteChar(buf, "("); + switch (expr->type) { + case XML_EXP_EMPTY: + xmlBufferWriteChar(buf, "empty"); + break; + case XML_EXP_FORBID: + xmlBufferWriteChar(buf, "forbidden"); + break; + case XML_EXP_ATOM: + xmlBufferWriteCHAR(buf, expr->exp_str); + break; + case XML_EXP_SEQ: + c = expr->exp_left; + if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) + xmlExpDumpInt(buf, c, 1); + else + xmlExpDumpInt(buf, c, 0); + xmlBufferWriteChar(buf, " , "); + c = expr->exp_right; + if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) + xmlExpDumpInt(buf, c, 1); + else + xmlExpDumpInt(buf, c, 0); + break; + case XML_EXP_OR: + c = expr->exp_left; + if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) + xmlExpDumpInt(buf, c, 1); + else + xmlExpDumpInt(buf, c, 0); + xmlBufferWriteChar(buf, " | "); + c = expr->exp_right; + if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) + xmlExpDumpInt(buf, c, 1); + else + xmlExpDumpInt(buf, c, 0); + break; + case XML_EXP_COUNT: { + char rep[40]; + + c = expr->exp_left; + if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) + xmlExpDumpInt(buf, c, 1); + else + xmlExpDumpInt(buf, c, 0); + if ((expr->exp_min == 0) && (expr->exp_max == 1)) { + rep[0] = '?'; + rep[1] = 0; + } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { + rep[0] = '*'; + rep[1] = 0; + } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { + rep[0] = '+'; + rep[1] = 0; + } else if (expr->exp_max == expr->exp_min) { + snprintf(rep, 39, "{%d}", expr->exp_min); + } else if (expr->exp_max < 0) { + snprintf(rep, 39, "{%d,inf}", expr->exp_min); + } else { + snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); + } + rep[39] = 0; + xmlBufferWriteChar(buf, rep); + break; + } + default: + fprintf(stderr, "Error in tree\n"); + } + if (glob) + xmlBufferWriteChar(buf, ")"); +} +/** + * xmlExpDump: + * @buf: a buffer to receive the output + * @expr: the compiled expression + * + * Serialize the expression as compiled to the buffer + */ +void +xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { + if ((buf == NULL) || (expr == NULL)) + return; + xmlExpDumpInt(buf, expr, 0); +} + +/** + * xmlExpMaxToken: + * @expr: a compiled expression + * + * Indicate the maximum number of input a expression can accept + * + * Returns the maximum length or -1 in case of error + */ +int +xmlExpMaxToken(xmlExpNodePtr expr) { + if (expr == NULL) + return(-1); + return(expr->c_max); +} + +/** + * xmlExpCtxtNbNodes: + * @ctxt: an expression context + * + * Debugging facility provides the number of allocated nodes at a that point + * + * Returns the number of nodes in use or -1 in case of error + */ +int +xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { + if (ctxt == NULL) + return(-1); + return(ctxt->nb_nodes); +} + +/** + * xmlExpCtxtNbCons: + * @ctxt: an expression context + * + * Debugging facility provides the number of allocated nodes over lifetime + * + * Returns the number of nodes ever allocated or -1 in case of error + */ +int +xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { + if (ctxt == NULL) + return(-1); + return(ctxt->nb_cons); +} + +#endif /* LIBXML_EXPR_ENABLED */ #define bottom_xmlregexp #include "elfgcchack.h" #endif /* LIBXML_REGEXP_ENABLED */ |