summaryrefslogtreecommitdiff
path: root/src/lib/libast/regex/reglib.h
diff options
context:
space:
mode:
authorIgor Pashev <pashev.igor@gmail.com>2012-06-24 22:28:35 +0000
committerIgor Pashev <pashev.igor@gmail.com>2012-06-24 22:28:35 +0000
commit3950ffe2a485479f6561c27364d3d7df5a21d124 (patch)
tree468c6e14449d1b1e279222ec32f676b0311917d2 /src/lib/libast/regex/reglib.h
downloadksh-upstream.tar.gz
Imported Upstream version 93u+upstream
Diffstat (limited to 'src/lib/libast/regex/reglib.h')
-rw-r--r--src/lib/libast/regex/reglib.h582
1 files changed, 582 insertions, 0 deletions
diff --git a/src/lib/libast/regex/reglib.h b/src/lib/libast/regex/reglib.h
new file mode 100644
index 0000000..6b84e5d
--- /dev/null
+++ b/src/lib/libast/regex/reglib.h
@@ -0,0 +1,582 @@
+/***********************************************************************
+* *
+* This software is part of the ast package *
+* Copyright (c) 1985-2011 AT&T Intellectual Property *
+* and is licensed under the *
+* Eclipse Public License, Version 1.0 *
+* by AT&T Intellectual Property *
+* *
+* A copy of the License is available at *
+* http://www.eclipse.org/org/documents/epl-v10.html *
+* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
+* *
+* Information and Software Systems Research *
+* AT&T Research *
+* Florham Park NJ *
+* *
+* Glenn Fowler <gsf@research.att.com> *
+* David Korn <dgk@research.att.com> *
+* Phong Vo <kpv@research.att.com> *
+* *
+***********************************************************************/
+#pragma prototyped
+
+/*
+ * posix regex implementation
+ *
+ * based on Doug McIlroy's C++ implementation
+ * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
+ * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
+ */
+
+#ifndef _REGLIB_H
+#define _REGLIB_H
+
+#define REG_VERSION_EXEC 20020509L
+#define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */
+
+#define re_info env
+
+#define alloc _reg_alloc
+#define classfun _reg_classfun
+#define drop _reg_drop
+#define fatal _reg_fatal
+#define state _reg_state
+
+typedef struct regsubop_s
+{
+ int op; /* REG_SUB_LOWER,REG_SUB_UPPER */
+ int off; /* re_rhs or match[] offset */
+ int len; /* re_rhs len or len==0 match[] */
+} regsubop_t;
+
+#define _REG_SUB_PRIVATE_ \
+ char* re_cur; /* re_buf cursor */ \
+ char* re_end; /* re_buf end */ \
+ regsubop_t* re_ops; /* rhs ops */ \
+ char re_rhs[1]; /* substitution rhs */
+
+#include <ast.h>
+#include <cdt.h>
+#include <stk.h>
+
+#include "regex.h"
+
+#include <ctype.h>
+#include <errno.h>
+
+#if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
+#define _AST_REGEX_DEBUG 1
+#endif
+
+#define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
+
+#undef RE_DUP_MAX /* posix puts this in limits.h! */
+#define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */
+#define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */
+#define BACK_REF_MAX 9
+
+#define REG_COMP (~REG_EXEC)
+#define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
+
+#define REX_NULL 0 /* null string (internal) */
+#define REX_ALT 1 /* a|b */
+#define REX_ALT_CATCH 2 /* REX_ALT catcher */
+#define REX_BACK 3 /* \1, \2, etc */
+#define REX_BEG 4 /* initial ^ */
+#define REX_BEG_STR 5 /* initial ^ w/ no newline */
+#define REX_BM 6 /* Boyer-Moore */
+#define REX_CAT 7 /* catenation catcher */
+#define REX_CLASS 8 /* [...] */
+#define REX_COLL_CLASS 9 /* collation order [...] */
+#define REX_CONJ 10 /* a&b */
+#define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */
+#define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */
+#define REX_DONE 13 /* completed match (internal) */
+#define REX_DOT 14 /* . */
+#define REX_END 15 /* final $ */
+#define REX_END_STR 16 /* final $ before tail newline */
+#define REX_EXEC 17 /* call re.re_exec() */
+#define REX_FIN_STR 18 /* final $ w/ no newline */
+#define REX_GROUP 19 /* \(...\) */
+#define REX_GROUP_CATCH 20 /* REX_GROUP catcher */
+#define REX_GROUP_AHEAD 21 /* 0-width lookahead */
+#define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */
+#define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */
+#define REX_GROUP_BEHIND 24 /* 0-width lookbehind */
+#define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */
+#define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */
+#define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */
+#define REX_GROUP_COND 28 /* conditional group */
+#define REX_GROUP_COND_CATCH 29 /* conditional group catcher */
+#define REX_GROUP_CUT 30 /* don't backtrack over this */
+#define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */
+#define REX_KMP 32 /* Knuth-Morris-Pratt */
+#define REX_NEG 33 /* negation */
+#define REX_NEG_CATCH 34 /* REX_NEG catcher */
+#define REX_NEST 35 /* nested match */
+#define REX_ONECHAR 36 /* a single-character literal */
+#define REX_REP 37 /* Kleene closure */
+#define REX_REP_CATCH 38 /* REX_REP catcher */
+#define REX_STRING 39 /* some chars */
+#define REX_TRIE 40 /* alternation of strings */
+#define REX_WBEG 41 /* \< */
+#define REX_WEND 42 /* \> */
+#define REX_WORD 43 /* word boundary */
+#define REX_WORD_NOT 44 /* not word boundary */
+
+#define T_META ((int)UCHAR_MAX+1)
+#define T_STAR (T_META+0)
+#define T_PLUS (T_META+1)
+#define T_QUES (T_META+2)
+#define T_BANG (T_META+3)
+#define T_AT (T_META+4)
+#define T_TILDE (T_META+5)
+#define T_PERCENT (T_META+6)
+#define T_LEFT (T_META+7)
+#define T_OPEN (T_META+8)
+#define T_CLOSE (T_OPEN+1)
+#define T_RIGHT (T_OPEN+2)
+#define T_CFLX (T_OPEN+3)
+#define T_DOT (T_OPEN+4)
+#define T_DOTSTAR (T_OPEN+5)
+#define T_END (T_OPEN+6)
+#define T_BAD (T_OPEN+7)
+#define T_DOLL (T_OPEN+8)
+#define T_BRA (T_OPEN+9)
+#define T_BAR (T_OPEN+10)
+#define T_AND (T_OPEN+11)
+#define T_LT (T_OPEN+12)
+#define T_GT (T_OPEN+13)
+#define T_SLASHPLUS (T_OPEN+14)
+#define T_GROUP (T_OPEN+15)
+#define T_WORD (T_OPEN+16)
+#define T_WORD_NOT (T_WORD+1)
+#define T_BEG_STR (T_WORD+2)
+#define T_END_STR (T_WORD+3)
+#define T_FIN_STR (T_WORD+4)
+#define T_ESCAPE (T_WORD+5)
+#define T_ALNUM (T_WORD+6)
+#define T_ALNUM_NOT (T_ALNUM+1)
+#define T_DIGIT (T_ALNUM+2)
+#define T_DIGIT_NOT (T_ALNUM+3)
+#define T_SPACE (T_ALNUM+4)
+#define T_SPACE_NOT (T_ALNUM+5)
+#define T_BACK (T_ALNUM+6)
+
+#define BRE 0
+#define ERE 3
+#define ARE 6
+#define SRE 9
+#define KRE 12
+
+#define HIT SSIZE_MAX
+
+#define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
+#define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07)))
+#define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07)))
+
+#define setadd(p,c) bitset((p)->bits,c)
+#define setclr(p,c) bitclr((p)->bits,c)
+#define settst(p,c) bittst((p)->bits,c)
+
+#if _hdr_wchar && _lib_wctype && _lib_iswctype
+
+#include <stdio.h> /* because <wchar.h> includes it and we generate it */
+#include <wchar.h>
+#if _hdr_wctype
+#include <wctype.h>
+#endif
+
+#if !defined(iswblank) && !_lib_iswblank
+#define _need_iswblank 1
+#define iswblank(x) _reg_iswblank(x)
+extern int _reg_iswblank(wint_t);
+#endif
+
+#if !defined(towupper) && !_lib_towupper
+#define towupper(x) toupper(x)
+#endif
+
+#if !defined(towlower) && !_lib_towlower
+#define towlower(x) tolower(x)
+#endif
+
+#else
+
+#undef _lib_wctype
+
+#ifndef iswalnum
+#define iswalnum(x) isalnum(x)
+#endif
+#ifndef iswalpha
+#define iswalpha(x) isalpha(x)
+#endif
+#ifndef iswcntrl
+#define iswcntrl(x) iscntrl(x)
+#endif
+#ifndef iswdigit
+#define iswdigit(x) isdigit(x)
+#endif
+#ifndef iswgraph
+#define iswgraph(x) isgraph(x)
+#endif
+#ifndef iswlower
+#define iswlower(x) islower(x)
+#endif
+#ifndef iswprint
+#define iswprint(x) isprint(x)
+#endif
+#ifndef iswpunct
+#define iswpunct(x) ispunct(x)
+#endif
+#ifndef iswspace
+#define iswspace(x) isspace(x)
+#endif
+#ifndef iswupper
+#define iswupper(x) isupper(x)
+#endif
+#ifndef iswxdigit
+#define iswxdigit(x) isxdigit(x)
+#endif
+
+#ifndef towlower
+#define towlower(x) tolower(x)
+#endif
+#ifndef towupper
+#define towupper(x) toupper(x)
+#endif
+
+#endif
+
+#ifndef iswblank
+#define iswblank(x) ((x)==' '||(x)=='\t')
+#endif
+
+#ifndef iswgraph
+#define iswgraph(x) (iswprint(x)&&!iswblank(x))
+#endif
+
+#define isword(x) (isalnum(x)||(x)=='_')
+
+/*
+ * collation element support
+ */
+
+#define COLL_KEY_MAX 32
+
+#if COLL_KEY_MAX < MB_LEN_MAX
+#undef COLL_KEY_MAX
+#define COLL_KEY_MAX MB_LEN_MAX
+#endif
+
+typedef unsigned char Ckey_t[COLL_KEY_MAX+1];
+
+#define COLL_end 0
+#define COLL_call 1
+#define COLL_char 2
+#define COLL_range 3
+#define COLL_range_lc 4
+#define COLL_range_uc 5
+
+typedef struct Celt_s
+{
+ short typ;
+ short min;
+ short max;
+ regclass_t fun;
+ Ckey_t beg;
+ Ckey_t end;
+} Celt_t;
+
+/*
+ * private stuff hanging off regex_t
+ */
+
+typedef struct Stk_pos_s
+{
+ off_t offset;
+ char* base;
+} Stk_pos_t;
+
+typedef struct Vector_s
+{
+ Stk_t* stk; /* stack pointer */
+ char* vec; /* the data */
+ int inc; /* growth increment */
+ int siz; /* element size */
+ int max; /* max index */
+ int cur; /* current index -- user domain */
+} Vector_t;
+
+/*
+ * Rex_t subtypes
+ */
+
+typedef struct Cond_s
+{
+ unsigned char* beg; /* beginning of next match */
+ struct Rex_s* next[2]; /* 0:no 1:yes next pattern */
+ struct Rex_s* cont; /* right catcher */
+ int yes; /* yes condition hit */
+} Cond_t;
+
+typedef struct Conj_left_s
+{
+ unsigned char* beg; /* beginning of left match */
+ struct Rex_s* right; /* right pattern */
+ struct Rex_s* cont; /* right catcher */
+} Conj_left_t;
+
+typedef struct Conj_right_s
+{
+ unsigned char* end; /* end of left match */
+ struct Rex_s* cont; /* ambient continuation */
+} Conj_right_t;
+
+typedef unsigned int Bm_mask_t;
+
+typedef struct Bm_s
+{
+ Bm_mask_t** mask;
+ size_t* skip;
+ size_t* fail;
+ size_t size;
+ ssize_t back;
+ ssize_t left;
+ ssize_t right;
+ size_t complete;
+} Bm_t;
+
+typedef struct String_s
+{
+ int* fail;
+ unsigned char* base;
+ size_t size;
+} String_t;
+
+typedef struct Set_s
+{
+ unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT];
+} Set_t;
+
+typedef struct Collate_s
+{
+ int invert;
+ Celt_t* elements;
+} Collate_t;
+
+typedef struct Binary_s
+{
+ struct Rex_s* left;
+ struct Rex_s* right;
+ int serial;
+} Binary_t;
+
+typedef struct Group_s
+{
+ int number; /* group number */
+ int last; /* last contained group number */
+ int size; /* lookbehind size */
+ int back; /* backreferenced */
+ regflags_t flags; /* group flags */
+ union
+ {
+ Binary_t binary;
+ struct Rex_s* rex;
+ } expr;
+} Group_t;
+
+typedef struct Exec_s
+{
+ void* data;
+ const char* text;
+ size_t size;
+} Exec_t;
+
+#define REX_NEST_open 0x01
+#define REX_NEST_close 0x02
+#define REX_NEST_escape 0x04
+#define REX_NEST_quote 0x08
+#define REX_NEST_literal 0x10
+#define REX_NEST_delimiter 0x20
+#define REX_NEST_terminator 0x40
+#define REX_NEST_separator 0x80
+
+#define REX_NEST_SHIFT 8
+
+typedef struct Nest_s
+{
+ int primary;
+ unsigned short none; /* for Nest_t.type[-1] */
+ unsigned short type[1];
+} Nest_t;
+
+/*
+ * REX_ALT catcher, solely to get control at the end of an
+ * alternative to keep records for comparing matches.
+ */
+
+typedef struct Alt_catch_s
+{
+ struct Rex_s* cont;
+} Alt_catch_t;
+
+typedef struct Group_catch_s
+{
+ struct Rex_s* cont;
+ regoff_t* eo;
+} Group_catch_t;
+
+typedef struct Behind_catch_s
+{
+ struct Rex_s* cont;
+ unsigned char* beg;
+ unsigned char* end;
+} Behind_catch_t;
+
+/*
+ * REX_NEG catcher determines what string lengths can be matched,
+ * then Neg investigates continuations of other lengths.
+ * This is inefficient. For !POSITIONS expressions, we can do better:
+ * since matches to rex will be enumerated in decreasing order,
+ * we can investigate continuations whenever a length is skipped.
+ */
+
+typedef struct Neg_catch_s
+{
+ unsigned char* beg;
+ unsigned char* index;
+} Neg_catch_t;
+
+/*
+ * REX_REP catcher. One is created on the stack for
+ * each iteration of a complex repetition.
+ */
+
+typedef struct Rep_catch_s
+{
+ struct Rex_s* cont;
+ struct Rex_s* ref;
+ unsigned char* beg;
+ int n;
+} Rep_catch_t;
+
+/*
+ * data structure for an alternation of pure strings
+ * son points to a subtree of all strings with a common
+ * prefix ending in character c. sib links alternate
+ * letters in the same position of a word. end=1 if
+ * some word ends with c. the order of strings is
+ * irrelevant, except long words must be investigated
+ * before short ones.
+ */
+
+typedef struct Trie_node_s
+{
+ unsigned char c;
+ unsigned char end;
+ struct Trie_node_s* son;
+ struct Trie_node_s* sib;
+} Trie_node_t;
+
+typedef struct Trie_s
+{
+ Trie_node_t** root;
+ int min;
+ int max;
+} Trie_t;
+
+/*
+ * Rex_t is a node in a regular expression
+ */
+
+typedef struct Rex_s
+{
+ unsigned char type; /* node type */
+ unsigned char marked; /* already marked */
+ short serial; /* subpattern number */
+ regflags_t flags; /* scoped flags */
+ int explicit; /* scoped explicit match*/
+ struct Rex_s* next; /* remaining parts */
+ int lo; /* lo dup count */
+ int hi; /* hi dup count */
+ unsigned char* map; /* fold and/or ccode map*/
+ union
+ {
+ Alt_catch_t alt_catch; /* alt catcher */
+ Bm_t bm; /* bm */
+ Behind_catch_t behind_catch; /* behind catcher */
+ Set_t* charclass; /* char class */
+ Collate_t collate; /* collation class */
+ Cond_t cond_catch; /* cond catcher */
+ Conj_left_t conj_left; /* conj left catcher */
+ Conj_right_t conj_right; /* conj right catcher */
+ void* data; /* data after Rex_t */
+ Exec_t exec; /* re.re_exec() args */
+ Group_t group; /* a|b or rep */
+ Group_catch_t group_catch; /* group catcher */
+ Neg_catch_t neg_catch; /* neg catcher */
+ Nest_t nest; /* nested match */
+ unsigned char onechar; /* single char */
+ Rep_catch_t rep_catch; /* rep catcher */
+ String_t string; /* string/kmp */
+ Trie_t trie; /* trie */
+ } re;
+} Rex_t;
+
+typedef struct reglib_s /* library private regex_t info */
+{
+ struct Rex_s* rex; /* compiled expression */
+ regdisc_t* disc; /* REG_DISCIPLINE discipline */
+ const regex_t* regex; /* from regexec */
+ unsigned char* beg; /* beginning of string */
+ unsigned char* end; /* end of string */
+ Vector_t* pos; /* posns of certain subpatterns */
+ Vector_t* bestpos; /* ditto for best match */
+ regmatch_t* match; /* subexrs in current match */
+ regmatch_t* best; /* ditto in best match yet */
+ Stk_pos_t stk; /* exec stack pos */
+ size_t min; /* minimum match length */
+ size_t nsub; /* internal re_nsub */
+ regflags_t flags; /* flags from regcomp() */
+ int error; /* last error */
+ int explicit; /* explicit match on this char */
+ int leading; /* leading match on this char */
+ int refs; /* regcomp()+regdup() references*/
+ Rex_t done; /* the last continuation */
+ regstat_t stats; /* for regstat() */
+ unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */
+ unsigned char hard; /* hard comp */
+ unsigned char once; /* if 1st parse fails, quit */
+ unsigned char separate; /* cannot combine */
+ unsigned char stack; /* hard comp or exec */
+ unsigned char sub; /* re_sub is valid */
+ unsigned char test; /* debug/test bitmask */
+} Env_t;
+
+typedef struct State_s /* shared state */
+{
+ regmatch_t nomatch;
+ struct
+ {
+ unsigned char key;
+ short val[15];
+ } escape[52];
+ short* magic[UCHAR_MAX+1];
+ regdisc_t disc;
+ int fatal;
+ int initialized;
+ Dt_t* attrs;
+ Dt_t* names;
+ Dtdisc_t dtdisc;
+} State_t;
+
+extern State_t state;
+
+extern void* alloc(regdisc_t*, void*, size_t);
+extern regclass_t classfun(int);
+extern void drop(regdisc_t*, Rex_t*);
+extern int fatal(regdisc_t*, int, const char*);
+
+#endif