diff options
Diffstat (limited to 'ipl/procs/findre.icn')
-rw-r--r-- | ipl/procs/findre.icn | 737 |
1 files changed, 737 insertions, 0 deletions
diff --git a/ipl/procs/findre.icn b/ipl/procs/findre.icn new file mode 100644 index 0000000..85abc30 --- /dev/null +++ b/ipl/procs/findre.icn @@ -0,0 +1,737 @@ +############################################################################ +# +# File: findre.icn +# +# Subject: Procedure to find regular expression +# +# Author: Richard L. Goerwitz +# +# Date: March 3, 1996 +# +############################################################################ +# +# This file is in the public domain. +# +############################################################################ +# +# Version: 1.17 +# +############################################################################ +# +# DESCRIPTION: findre() is like the Icon builtin function find(), +# except that it takes, as its first argument, a regular expression +# pretty much like the ones the Unix egrep command uses (the few +# minor differences are listed below). Its syntax is the same as +# find's (i.e. findre(s1,s2,i,j)), with the exception that a no- +# argument invocation wipes out all static structures utilized by +# findre, and then forces a garbage collection. +# +############################################################################ +# +# (For those not familiar with regular expressions and the Unix egrep +# command: findre() offers a simple and compact wildcard-based search +# system. If you do a lot of searches through text files, or write +# programs which do searches based on user input, then findre is a +# utility you might want to look over.) +# +# IMPORTANT DIFFERENCES between find and findre: As noted above, +# findre() is just a find() function that takes a regular expression +# as its first argument. One major problem with this setup is that +# it leaves the user with no easy way to tab past a matched +# substring, as with +# +# s ? write(tab(find("hello")+5)) +# +# In order to remedy this intrinsic deficiency, findre() sets the +# global variable __endpoint to the first position after any given +# match occurs. Use this variable with great care, preferably +# assigning its value to some other variable immediately after the +# match (for example, findre("hello [.?!]*",s) & tmp := __endpoint). +# Otherwise, you will certainly run into trouble. (See the example +# below for an illustration of how __endpoint is used). +# +# IMPORTANT DIFFERENCES between egrep and findre: findre utilizes +# the same basic language as egrep. The only big difference is that +# findre uses intrinsic Icon data structures and escaping conven- +# tions rather than those of any particular Unix variant. Be care- +# ful! If you put findre("\(hello\)",s) into your source file, +# findre will treat it just like findre("(hello)",s). If, however, +# you enter '\(hello\)' at run-time (via, say, findre(!&input,s)), +# what Icon receives will depend on your operating system (most +# likely, a trace will show "\\(hello\\)"). +# +############################################################################ +# +# BUGS: Space has essentially been conserved at the expense of time +# in the automata produced by findre(). The algorithm, in other +# words, will produce the equivalent of a pushdown automaton under +# certain circumstances, rather than strive (at the expense of space) +# for full determinism. I tried to make up a nfa -> dfa converter +# that would only create that portion of the dfa it needed to accept +# or reject a string, but the resulting automaton was actually quite +# slow (if anyone can think of a way to do this in Icon, and keep it +# small and fast, please let us all know about it). Note that under +# version 8 of Icon, findre takes up negligible storage space, due to +# the much improved hashing algorithm. I have not tested it under +# version 7, but I would expect it to use up quite a bit more space +# in that environment. +# +# IMPORTANT NOTE: Findre takes a shortest-possible-match approach +# to regular expressions. In other words, if you look for "a*", +# findre will not even bother looking for an "a." It will just match +# the empty string. Without this feature, findre would perform a bit +# more slowly. The problem with such an approach is that often the +# user will want to tab past the longest possible string of matched +# characters (say tab((findre("a*|b*"), __endpoint)). In circumstan- +# ces like this, please just use something like: +# +# s ? { +# tab(find("a")) & # or use Arb() from the IPL (patterns.icn) +# tab(many('a')) +# tab(many('b')) +# } +# +# or else use some combination of findre and the above. +# +############################################################################ +# +# REGULAR EXPRESSION SYNTAX: Regular expression syntax is complex, +# and yet simple. It is simple in the sense that most of its power +# is concentrated in about a dozen easy-to-learn symbols. It is +# complex in the sense that, by combining these symbols with +# characters, you can represent very intricate patterns. +# +# I make no pretense here of offering a full explanation of regular +# expressions, their usage, and the deeper nuances of their syntax. +# As noted above, this should be gleaned from a Unix manual. For +# quick reference, however, I have included a brief summary of all +# the special symbols used, accompanied by an explanation of what +# they mean, and, in some cases, of how they are used (most of this +# is taken from the comments prepended to Jerry Nowlin's Icon-grep +# command, as posted a couple of years ago): +# +# ^ - matches if the following pattern is at the beginning +# of a line (i.e. ^# matches lines beginning with "#") +# $ - matches if the preceding pattern is at the end of a line +# . - matches any single character +# + - matches from 1 to any number of occurrences of the +# previous expression (i.e. a character, or set of paren- +# thesized/bracketed characters) +# * - matches from 0 to any number of occurrences of the previous +# expression +# \ - removes the special meaning of any special characters +# recognized by this program (i.e if you want to match lines +# beginning with a "[", write ^\[, and not ^[) +# | - matches either the pattern before it, or the one after +# it (i.e. abc|cde matches either abc or cde) +# [] - matches any member of the enclosed character set, or, +# if ^ is the first character, any nonmember of the +# enclosed character set (i.e. [^ab] matches any character +# _except_ a and b). +# () - used for grouping (e.g. ^(abc|cde)$ matches lines consist- +# ing of either "abc" or "cde," while ^abc|cde$ matches +# lines either beginning with "abc" or ending in "cde") +# +############################################################################ +# +# EXAMPLE program: +# +# procedure main(a) +# while line := !&input do { +# token_list := tokenize_line(line,a[1]) +# every write(!token_list) +# } +# end +# +# procedure tokenize_line(s,sep) +# tmp_lst := [] +# s ? { +# while field := tab(findre(sep)|0) & +# mark := __endpoint +# do { +# put(tmp_lst,"" ~== field) +# if pos(0) then break +# else tab(mark) +# } +# } +# return tmp_lst +# end +# +# The above program would be compiled with findre (e.g. "icont +# test_prg.icn findre.icn") to produce a single executable which +# tokenizes each line of input based on a user-specified delimiter. +# Note how __endpoint is set soon after findre() succeeds. Note +# also how empty fields are excluded with "" ~==, etc. Finally, note +# that the temporary list, tmp_lst, is not needed. It is included +# here merely to illustrate one way in which tokens might be stored. +# +# Tokenizing is, of course, only one of many uses one might put +# findre to. It is very helpful in allowing the user to construct +# automata at run-time. If, say, you want to write a program that +# searches text files for patterns given by the user, findre would be +# a perfect utility to use. Findre in general permits more compact +# expression of patterns than one can obtain using intrinsic Icon +# scanning facilities. Its near complete compatibility with the Unix +# regexp library, moreover, makes for greater ease of porting, +# especially in cases where Icon is being used to prototype C code. +# +############################################################################ + + +global state_table, parends_present, slash_present +global biggest_nonmeta_str, __endpoint +record o_a_s(op,arg,state) + + +procedure findre(re, s, i, j) + + local p, default_val, x, nonmeta_len, tokenized_re, tmp + static FSTN_table, STRING_table + initial { + FSTN_table := table() + STRING_table := table() + } + + if /re then { + FSTN_table := table() + STRING_table := table() + collect() # do it *now* + return + } + + if /s := &subject + then default_val := &pos + else default_val := 1 + + if \i then { + if i < 1 then + i := *s + (i+1) + } + else i := default_val + + if \j then { + if j < 1 then + j := *s + (j+1) + } + else j := *s+1 + + if /FSTN_table[re] then { + # If we haven't seen this re before, then... + if \STRING_table[re] then { + # ...if it's in the STRING_table, use plain find() + every p := find(STRING_table[re],s,i,j) + do { __endpoint := p + *STRING_table[re]; suspend p } + fail + } + else { + # However, if it's not in the string table, we have to + # tokenize it and check for metacharacters. If it has + # metas, we create an FSTN, and put that into FSTN_table; + # otherwise, we just put it into the STRING_table. + tokenized_re := tokenize(re) + if 0 > !tokenized_re then { + # if at least one element is < 0, re has metas + MakeFSTN(tokenized_re) | err_out(re,2) + # both biggest_nonmeta_str and state_table are global + /FSTN_table[re] := [.biggest_nonmeta_str, copy(state_table)] + } + else { + # re has no metas; put the input string into STRING_table + # for future reference, and execute find() at once + tmp := ""; every tmp ||:= char(!tokenized_re) + insert(STRING_table,re,tmp) + every p := find(STRING_table[re],s,i,j) + do { __endpoint := p + *STRING_table[re]; suspend p } + fail + } + } + } + + + if nonmeta_len := (1 < *FSTN_table[re][1]) then { + # If the biggest non-meta string in the original re + # was more than 1, then put in a check for it... + s[1:j] ? { + tab(x := i to j - nonmeta_len) & + (find(FSTN_table[re][1]) | fail) \ 1 & + (__endpoint := apply_FSTN(&null,FSTN_table[re][2])) & + (suspend x) + } + } + else { + #...otherwise it's not worth worrying about the biggest nonmeta str + s[1:j] ? { + tab(x := i to j) & + (__endpoint := apply_FSTN(&null,FSTN_table[re][2])) & + (suspend x) + } + } + +end + + + +procedure apply_FSTN(ini,tbl) + + local biggest_pos, POS, tmp, fin + static s_tbl + + /ini := 1 & s_tbl := tbl & biggest_pos := 1 + if ini = 0 then { + return &pos + } + POS := &pos + fin := 0 + + repeat { + if tmp := !s_tbl[ini] & + tab(tmp.op(tmp.arg)) + then { + if tmp.state = fin + then return &pos + else ini := tmp.state + } + else (&pos := POS, fail) + } + +end + + + +procedure tokenize(s) + + local token_list, chr, tmp, b_loc, next_one, fixed_length_token_list, i + + token_list := list() + s ? { + tab(many('*+?|')) + while chr := move(1) do { + if chr == "\\" + # it can't be a metacharacter; remove the \ and "put" + # the integer value of the next chr into token_list + then put(token_list,ord(move(1))) | err_out(s,2,chr) + else if any('*+()|?.$^',chr) + then { + # Yuck! Egrep compatibility stuff. + case chr of { + "*" : { + tab(many('*+?')) + put(token_list,-ord("*")) + } + "+" : { + tmp := tab(many('*?+')) | &null + if upto('*?',\tmp) + then put(token_list,-ord("*")) + else put(token_list,-ord("+")) + } + "?" : { + tmp := tab(many('*?+')) | &null + if upto('*+',\tmp) + then put(token_list,-ord("*")) + else put(token_list,-ord("?")) + } + "(" : { + tab(many('*+?')) + put(token_list,-ord("(")) + } + default: { + put(token_list,-ord(chr)) + } + } + } + else { + case chr of { + # More egrep compatibility stuff. + "[" : { + b_loc := find("[") | *&subject+1 + every next_one := find("]",,,b_loc) + \next_one ~= &pos | err_out(s,2,chr) + put(token_list,-ord(chr)) + } + "]" : { + if &pos = (\next_one+1) + then put(token_list,-ord(chr)) & + next_one := &null + else put(token_list,ord(chr)) + } + default: put(token_list,ord(chr)) + } + } + } + } + + token_list := UnMetaBrackets(token_list) + + fixed_length_token_list := list(*token_list) + every i := 1 to *token_list + do fixed_length_token_list[i] := token_list[i] + return fixed_length_token_list + +end + + + +procedure UnMetaBrackets(l) + + # Since brackets delineate a cset, it doesn't make + # any sense to have metacharacters inside of them. + # UnMetaBrackets makes sure there are no metacharac- + # ters inside of the braces. + + local tmplst, i, Lb, Rb + + tmplst := list(); i := 0 + Lb := -ord("[") + Rb := -ord("]") + + while (i +:= 1) <= *l do { + if l[i] = Lb then { + put(tmplst,l[i]) + until l[i +:= 1] = Rb + do put(tmplst,abs(l[i])) + put(tmplst,l[i]) + } + else put(tmplst,l[i]) + } + return tmplst + +end + + + +procedure MakeFSTN(l,INI,FIN) + + # MakeFSTN recursively descends through the tree structure + # implied by the tokenized string, l, recording in (global) + # fstn_table a list of operations to be performed, and the + # initial and final states which apply to them. + + local i, inter, inter2, tmp, Op, Arg + static Lp, Rp, Sl, Lb, Rb, Caret_inside, Dot, Dollar, Caret_outside + # global biggest_nonmeta_str, slash_present, parends_present + initial { + Lp := -ord("("); Rp := -ord(")") + Sl := -ord("|") + Lb := -ord("["); Rb := -ord("]"); Caret_inside := ord("^") + Dot := -ord("."); Dollar := -ord("$"); Caret_outside := -ord("^") + } + + /INI := 1 & state_table := table() & + NextState("new") & biggest_nonmeta_str := "" + /FIN := 0 + + # I haven't bothered to test for empty lists everywhere. + if *l = 0 then { + /state_table[INI] := [] + put(state_table[INI],o_a_s(zSucceed,&null,FIN)) + return + } + + # HUNT DOWN THE SLASH (ALTERNATION OPERATOR) + every i := 1 to *l do { + if l[i] = Sl & tab_bal(l,Lp,Rp) = i then { + if i = 1 then err_out(l,2,char(abs(l[i]))) else { + /slash_present := "yes" + inter := NextState() + inter2:= NextState() + MakeFSTN(l[1:i],inter2,FIN) + MakeFSTN(l[i+1:0],inter,FIN) + /state_table[INI] := [] + put(state_table[INI],o_a_s(apply_FSTN,inter2,0)) + put(state_table[INI],o_a_s(apply_FSTN,inter,0)) + return + } + } + } + + # HUNT DOWN PARENTHESES + if l[1] = Lp then { + i := tab_bal(l,Lp,Rp) | err_out(l,2,"(") + inter := NextState() + if any('*+?',char(abs(0 > l[i+1]))) then { + case l[i+1] of { + -ord("*") : { + /state_table[INI] := [] + put(state_table[INI],o_a_s(apply_FSTN,inter,0)) + MakeFSTN(l[2:i],INI,INI) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + -ord("+") : { + inter2 := NextState() + /state_table[inter2] := [] + MakeFSTN(l[2:i],INI,inter2) + put(state_table[inter2],o_a_s(apply_FSTN,inter,0)) + MakeFSTN(l[2:i],inter2,inter2) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + -ord("?") : { + /state_table[INI] := [] + put(state_table[INI],o_a_s(apply_FSTN,inter,0)) + MakeFSTN(l[2:i],INI,inter) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + } + } + else { + MakeFSTN(l[2:i],INI,inter) + MakeFSTN(l[i+1:0],inter,FIN) + return + } + } + else { # I.E. l[1] NOT = Lp (left parenthesis as -ord("(")) + every i := 1 to *l do { + case l[i] of { + Lp : { + inter := NextState() + MakeFSTN(l[1:i],INI,inter) + /parends_present := "yes" + MakeFSTN(l[i:0],inter,FIN) + return + } + Rp : err_out(l,2,")") + } + } + } + + # NOW, HUNT DOWN BRACKETS + if l[1] = Lb then { + i := tab_bal(l,Lb,Rb) | err_out(l,2,"[") + inter := NextState() + tmp := ""; every tmp ||:= char(l[2 to i-1]) + if Caret_inside = l[2] + then tmp := ~cset(Expand(tmp[2:0])) + else tmp := cset(Expand(tmp)) + if any('*+?',char(abs(0 > l[i+1]))) then { + case l[i+1] of { + -ord("*") : { + /state_table[INI] := [] + put(state_table[INI],o_a_s(apply_FSTN,inter,0)) + put(state_table[INI],o_a_s(any,tmp,INI)) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + -ord("+") : { + inter2 := NextState() + /state_table[INI] := [] + put(state_table[INI],o_a_s(any,tmp,inter2)) + /state_table[inter2] := [] + put(state_table[inter2],o_a_s(apply_FSTN,inter,0)) + put(state_table[inter2],o_a_s(any,tmp,inter2)) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + -ord("?") : { + /state_table[INI] := [] + put(state_table[INI],o_a_s(apply_FSTN,inter,0)) + put(state_table[INI],o_a_s(any,tmp,inter)) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + } + } + else { + /state_table[INI] := [] + put(state_table[INI],o_a_s(any,tmp,inter)) + MakeFSTN(l[i+1:0],inter,FIN) + return + } + } + else { # I.E. l[1] not = Lb + every i := 1 to *l do { + case l[i] of { + Lb : { + inter := NextState() + MakeFSTN(l[1:i],INI,inter) + MakeFSTN(l[i:0],inter,FIN) + return + } + Rb : err_out(l,2,"]") + } + } + } + + # FIND INITIAL SEQUENCES OF POSITIVE INTEGERS, CONCATENATE THEM + if i := match_positive_ints(l) then { + inter := NextState() + tmp := Ints2String(l[1:i]) + # if a slash has been encountered already, forget optimizing + # in this way; if parends are present, too, then forget it, + # unless we are at the beginning or end of the input string + if INI = 1 | FIN = 2 | /parends_present & + /slash_present & *tmp > *biggest_nonmeta_str + then biggest_nonmeta_str := tmp + /state_table[INI] := [] + put(state_table[INI],o_a_s(match,tmp,inter)) + MakeFSTN(l[i:0],inter,FIN) + return + } + + # OKAY, CLEAN UP ALL THE JUNK THAT'S LEFT + i := 0 + while (i +:= 1) <= *l do { + case l[i] of { + Dot : { Op := any; Arg := &cset } + Dollar : { Op := pos; Arg := 0 } + Caret_outside: { Op := pos; Arg := 1 } + default : { Op := match; Arg := char(0 < l[i]) } + } | err_out(l,2,char(abs(l[i]))) + inter := NextState() + if any('*+?',char(abs(0 > l[i+1]))) then { + case l[i+1] of { + -ord("*") : { + /state_table[INI] := [] + put(state_table[INI],o_a_s(apply_FSTN,inter,0)) + put(state_table[INI],o_a_s(Op,Arg,INI)) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + -ord("+") : { + inter2 := NextState() + /state_table[INI] := [] + put(state_table[INI],o_a_s(Op,Arg,inter2)) + /state_table[inter2] := [] + put(state_table[inter2],o_a_s(apply_FSTN,inter,0)) + put(state_table[inter2],o_a_s(Op,Arg,inter2)) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + -ord("?") : { + /state_table[INI] := [] + put(state_table[INI],o_a_s(apply_FSTN,inter,0)) + put(state_table[INI],o_a_s(Op,Arg,inter)) + MakeFSTN(l[i+2:0],inter,FIN) + return + } + } + } + else { + /state_table[INI] := [] + put(state_table[INI],o_a_s(Op,Arg,inter)) + MakeFSTN(l[i+1:0],inter,FIN) + return + } + } + + # WE SHOULD NOW BE DONE INSERTING EVERYTHING INTO state_table + # IF WE GET TO HERE, WE'VE PARSED INCORRECTLY! + err_out(l,4) + +end + + + +procedure NextState(new) + static nextstate + if \new then nextstate := 1 + else nextstate +:= 1 + return nextstate +end + + + +procedure err_out(x,i,elem) + writes(&errout,"Error number ",i," parsing ",image(x)," at ") + if \elem + then write(&errout,image(elem),".") + else write(&errout,"(?).") + exit(i) +end + + + +procedure zSucceed() + return .&pos +end + + + +procedure Expand(s) + + local s2, c1, c2 + + s2 := "" + s ? { + s2 ||:= ="^" + s2 ||:= ="-" + while s2 ||:= tab(find("-")-1) do { + if (c1 := move(1), ="-", + c2 := move(1), + c1 << c2) + then every s2 ||:= char(ord(c1) to ord(c2)) + else s2 ||:= 1(move(2), not(pos(0))) | err_out(s,2,"-") + } + s2 ||:= tab(0) + } + return s2 + +end + + + +procedure tab_bal(l,i1,i2) + + local i, i1_count, i2_count + + i := 0 + i1_count := 0; i2_count := 0 + while (i +:= 1) <= *l do { + case l[i] of { + i1 : i1_count +:= 1 + i2 : i2_count +:= 1 + } + if i1_count = i2_count + then suspend i + } + +end + + +procedure match_positive_ints(l) + + # Matches the longest sequence of positive integers in l, + # beginning at l[1], which neither contains, nor is fol- + # lowed by a negative integer. Returns the first position + # after the match. Hence, given [55, 55, 55, -42, 55], + # match_positive_ints will return 3. [55, -42] will cause + # it to fail rather than return 1 (NOTE WELL!). + + local i + + every i := 1 to *l do { + if l[i] < 0 + then return (3 < i) - 1 | fail + } + return *l + 1 + +end + + +procedure Ints2String(l) + + local tmp + + tmp := "" + every tmp ||:= char(!l) + return tmp + +end + + +procedure StripChar(s,s2) + + local tmp + + if find(s2,s) then { + tmp := "" + s ? { + while tmp ||:= tab(find("s2")) + do tab(many(cset(s2))) + tmp ||:= tab(0) + } + } + return \tmp | s + +end |