diff options
Diffstat (limited to 'ipl/packs/ibpag2/itokens.icn')
-rw-r--r-- | ipl/packs/ibpag2/itokens.icn | 925 |
1 files changed, 925 insertions, 0 deletions
diff --git a/ipl/packs/ibpag2/itokens.icn b/ipl/packs/ibpag2/itokens.icn new file mode 100644 index 0000000..1bb9cd1 --- /dev/null +++ b/ipl/packs/ibpag2/itokens.icn @@ -0,0 +1,925 @@ +############################################################################ +# +# Name: itokens.icn +# +# Title: itokens (Icon source-file tokenizer) +# +# Author: Richard L. Goerwitz +# +# Version: 1.11 +# +############################################################################ +# +# This file contains itokens() - a utility for breaking Icon source +# files up into individual tokens. This is the sort of routine one +# needs to have around when implementing things like pretty printers, +# preprocessors, code obfuscators, etc. It would also be useful for +# implementing cut-down implementations of Icon written in Icon - the +# sort of thing one might use in an interactive tutorial. +# +# Itokens(f, x) takes, as its first argument, f, an open file, and +# suspends successive TOK records. TOK records contain two fields. +# The first field, sym, contains a string that represents the name of +# the next token (e.g. "CSET", "STRING", etc.). The second field, +# str, gives that token's literal value. E.g. the TOK for a literal +# semicolon is TOK("SEMICOL", ";"). For a mandatory newline, itokens +# would suspend TOK("SEMICOL", "\n"). +# +# Unlike Icon's own tokenizer, itokens() does not return an EOFX +# token on end-of-file, but rather simply fails. It also can be +# instructed to return syntactically meaningless newlines by passing +# it a nonnull second argument (e.g. itokens(infile, 1)). These +# meaningless newlines are returned as TOK records with a null sym +# field (i.e. TOK(&null, "\n")). +# +# NOTE WELL: If new reserved words or operators are added to a given +# implementation, the tables below will have to be altered. Note +# also that &keywords should be implemented on the syntactic level - +# not on the lexical one. As a result, a keyword like &features will +# be suspended as TOK("CONJUNC", "&") and TOK("IDENT", "features"). +# +############################################################################ +# +# Links: slshupto +# +# Requires: coexpressions +# +############################################################################ + +#link ximage, slshupto +link slshupto #make sure you have version 1.2 or above + +global next_c, line_number +record TOK(sym, str) + +# +# main: an Icon source code uglifier +# +# Stub main for testing; uncomment & compile. The resulting +# executable will act as an Icon file compressor, taking the +# standard input and outputting Icon code stripped of all +# unnecessary whitespace. Guaranteed to make the code a visual +# mess :-). +# +#procedure main() +# +# local separator, T +# separator := "" +# every T := itokens(&input) do { +# if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT" +# then writes(separator) +# if T.sym == "SEMICOL" then writes(";") else writes(T.str) +# if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT" +# then separator := " " else separator := "" +# } +# +#end + + +# +# itokens: file x anything -> TOK records (a generator) +# (stream, nostrip) -> Rs +# +# Where stream is an open file, anything is any object (it only +# matters whether it is null or not), and Rs are TOK records. +# Note that itokens strips out useless newlines. If the second +# argument is nonnull, itokens does not strip out superfluous +# newlines. It may be useful to keep them when the original line +# structure of the input file must be maintained. +# +procedure itokens(stream, nostrip) + + local T, last_token + + # initialize to some meaningless value + last_token := TOK() + + every T := \iparse_tokens(stream) do { + if \T.sym then { + if T.sym == "EOFX" then fail + else { + # + # If the last token was a semicolon, then interpret + # all ambiguously unary/binary sequences like "**" as + # beginners (** could be two unary stars or the [c]set + # intersection operator). + # + if \last_token.sym == "SEMICOL" + then suspend last_token := expand_fake_beginner(T) + else suspend last_token := T + } + } else { + if \nostrip + then suspend last_token := T + } + } + +end + + +# +# expand_fake_beginner: TOK record -> TOK records +# +# Some "beginner" tokens aren't really beginners. They are token +# sequences that could be either a single binary operator or a +# series of unary operators. The tokenizer's job is just to snap +# up as many characters as could logically constitute an operator. +# Here is where we decide whether to break the sequence up into +# more than one op or not. +# +procedure expand_fake_beginner(next_token) + + static exptbl + initial { + exptbl := table() + insert(exptbl, "CONCAT", [TOK("BAR", "|"), TOK("BAR", "|")]) + insert(exptbl, "DIFF", [TOK("MINUS", "-"), TOK("MINUS", "-")]) + insert(exptbl, "EQUIV", [TOK("NUMEQ", "="), TOK("NUMEQ", "="), + TOK("NUMEQ", "=")]) + insert(exptbl, "INTER", [TOK("STAR", "*"), TOK("STAR", "*")]) + insert(exptbl, "LCONCAT", [TOK("BAR", "|"), TOK("BAR", "|"), + TOK("BAR", "|")]) + insert(exptbl, "LEXEQ", [TOK("NUMEQ", "="), TOK("NUMEQ", "=")]) + insert(exptbl, "LEXNE", [TOK("TILDE", "~"), TOK("NUMEQ", "="), + TOK("NUMEQ", "=")]) + insert(exptbl, "NOTEQUIV",[TOK("TILDE", "~"), TOK("NUMEQ","="), + TOK("NUMEQ", "="), TOK("NUMEQ", "=")]) + insert(exptbl, "NUMNE", [TOK("TILDE", "~"), TOK("NUMEQ","=")]) + insert(exptbl, "UNION", [TOK("PLUS", "+"), TOK("PLUS", "+")]) + } + + if \exptbl[next_token.sym] + then suspend !exptbl[next_token.sym] + else return next_token + +end + + +# +# iparse_tokens: file -> TOK records (a generator) +# (stream) -> tokens +# +# Where file is an open input stream, and tokens are TOK records +# holding both the token type and actual token text. +# +# TOK records contain two parts, a preterminal symbol (the first +# "sym" field), and the actual text of the token ("str"). The +# parser only pays attention to the sym field, although the +# strings themselves get pushed onto the value stack. +# +# Note the following kludge: Unlike real Icon tokenizers, this +# procedure returns syntactially meaningless newlines as TOK +# records with a null sym field. Normally they would be ignored. +# I wanted to return them so they could be printed on the output +# stream, thus preserving the line structure of the original +# file, and making later diagnostic messages more usable. +# +procedure iparse_tokens(stream, getchar) + + local elem, whitespace, token, last_token, primitives, reserveds + static be_tbl, reserved_tbl, operators + initial { + + # Primitive Tokens + # + primitives := [ + ["identifier", "IDENT", "be"], + ["integer-literal", "INTLIT", "be"], + ["real-literal", "REALLIT", "be"], + ["string-literal", "STRINGLIT", "be"], + ["cset-literal", "CSETLIT", "be"], + ["end-of-file", "EOFX", "" ]] + + # Reserved Words + # + reserveds := [ + ["break", "BREAK", "be"], + ["by", "BY", "" ], + ["case", "CASE", "b" ], + ["create", "CREATE", "b" ], + ["default", "DEFAULT", "b" ], + ["do", "DO", "" ], + ["else", "ELSE", "" ], + ["end", "END", "b" ], + ["every", "EVERY", "b" ], + ["fail", "FAIL", "be"], + ["global", "GLOBAL", "" ], + ["if", "IF", "b" ], + ["initial", "INITIAL", "b" ], + ["invocable", "INVOCABLE", "" ], + ["link", "LINK", "" ], + ["local", "LOCAL", "b" ], + ["next", "NEXT", "be"], + ["not", "NOT", "b" ], + ["of", "OF", "" ], + ["procedure", "PROCEDURE", "" ], + ["record", "RECORD", "" ], + ["repeat", "REPEAT", "b" ], + ["return", "RETURN", "be"], + ["static", "STATIC", "b" ], + ["suspend", "SUSPEND", "be"], + ["then", "THEN", "" ], + ["to", "TO", "" ], + ["until", "UNTIL", "b" ], + ["while", "WHILE", "b" ]] + + # Operators + # + operators := [ + [":=", "ASSIGN", "" ], + ["@", "AT", "b" ], + ["@:=", "AUGACT", "" ], + ["&:=", "AUGAND", "" ], + ["=:=", "AUGEQ", "" ], + ["===:=", "AUGEQV", "" ], + [">=:=", "AUGGE", "" ], + [">:=", "AUGGT", "" ], + ["<=:=", "AUGLE", "" ], + ["<:=", "AUGLT", "" ], + ["~=:=", "AUGNE", "" ], + ["~===:=", "AUGNEQV", "" ], + ["==:=", "AUGSEQ", "" ], + [">>=:=", "AUGSGE", "" ], + [">>:=", "AUGSGT", "" ], + ["<<=:=", "AUGSLE", "" ], + ["<<:=", "AUGSLT", "" ], + ["~==:=", "AUGSNE", "" ], + ["\\", "BACKSLASH", "b" ], + ["!", "BANG", "b" ], + ["|", "BAR", "b" ], + ["^", "CARET", "b" ], + ["^:=", "CARETASGN", "b" ], + [":", "COLON", "" ], + [",", "COMMA", "" ], + ["||", "CONCAT", "b" ], + ["||:=", "CONCATASGN","" ], + ["&", "CONJUNC", "b" ], + [".", "DOT", "b" ], + ["--", "DIFF", "b" ], + ["--:=", "DIFFASGN", "" ], + ["===", "EQUIV", "b" ], + ["**", "INTER", "b" ], + ["**:=", "INTERASGN", "" ], + ["{", "LBRACE", "b" ], + ["[", "LBRACK", "b" ], + ["|||", "LCONCAT", "b" ], + ["|||:=", "LCONCATASGN","" ], + ["==", "LEXEQ", "b" ], + [">>=", "LEXGE", "" ], + [">>", "LEXGT", "" ], + ["<<=", "LEXLE", "" ], + ["<<", "LEXLT", "" ], + ["~==", "LEXNE", "b" ], + ["(", "LPAREN", "b" ], + ["-:", "MCOLON", "" ], + ["-", "MINUS", "b" ], + ["-:=", "MINUSASGN", "" ], + ["%", "MOD", "" ], + ["%:=", "MODASGN", "" ], + ["~===", "NOTEQUIV", "b" ], + ["=", "NUMEQ", "b" ], + [">=", "NUMGE", "" ], + [">", "NUMGT", "" ], + ["<=", "NUMLE", "" ], + ["<", "NUMLT", "" ], + ["~=", "NUMNE", "b" ], + ["+:", "PCOLON", "" ], + ["+", "PLUS", "b" ], + ["+:=", "PLUSASGN", "" ], + ["?", "QMARK", "b" ], + ["<-", "REVASSIGN", "" ], + ["<->", "REVSWAP", "" ], + ["}", "RBRACE", "e" ], + ["]", "RBRACK", "e" ], + [")", "RPAREN", "e" ], + [";", "SEMICOL", "" ], + ["?:=", "SCANASGN", "" ], + ["/", "SLASH", "b" ], + ["/:=", "SLASHASGN", "" ], + ["*", "STAR", "b" ], + ["*:=", "STARASGN", "" ], + [":=:", "SWAP", "" ], + ["~", "TILDE", "b" ], + ["++", "UNION", "b" ], + ["++:=", "UNIONASGN", "" ], + ["$(", "LBRACE", "b" ], + ["$)", "RBRACE", "e" ], + ["$<", "LBRACK", "b" ], + ["$>", "RBRACK", "e" ], + ["$", "RHSARG", "b" ], + ["%$(", "BEGGLOB", "b" ], + ["%$)", "ENDGLOB", "e" ], + ["%{", "BEGGLOB", "b" ], + ["%}", "ENDGLOB", "e" ], + ["%%", "NEWSECT", "be"]] + + # static be_tbl, reserved_tbl + reserved_tbl := table() + every elem := !reserveds do + insert(reserved_tbl, elem[1], elem[2]) + be_tbl := table() + every elem := !primitives | !reserveds | !operators do { + insert(be_tbl, elem[2], elem[3]) + } + } + + /getchar := create { + line_number := 0 + ! ( 1(!stream, line_number +:=1) || "\n" ) + } + whitespace := ' \t' + /next_c := @getchar | { + if \stream then + return TOK("EOFX") + else fail + } + + repeat { + case next_c of { + + "." : { + # Could be a real literal *or* a dot operator. Check + # following character to see if it's a digit. If so, + # it's a real literal. We can only get away with + # doing the dot here because it is not a substring of + # any longer identifier. If this gets changed, we'll + # have to move this code into do_operator(). + # + last_token := do_dot(getchar) + suspend last_token +# write(&errout, "next_c == ", image(next_c)) + next + } + + "\n" : { + # If do_newline fails, it means we're at the end of + # the input stream, and we should break out of the + # repeat loop. + # + every last_token := do_newline(getchar, last_token, be_tbl) + do suspend last_token + if next_c === &null then break + next + } + + "\#" : { + # Just a comment. Strip it by reading every character + # up to the next newline. The global var next_c + # should *always* == "\n" when this is done. + # + do_number_sign(getchar) +# write(&errout, "next_c == ", image(next_c)) + next + } + + "\"" : { + # Suspend as STRINGLIT everything from here up to the + # next non-backslashed quotation mark, inclusive + # (accounting for the _ line-continuation convention). + # + last_token := do_quotation_mark(getchar) + suspend last_token +# write(&errout, "next_c == ", image(next_c)) + next + } + + "'" : { + # Suspend as CSETLIT everything from here up to the + # next non-backslashed apostrophe, inclusive. + # + last_token := do_apostrophe(getchar) + suspend last_token +# write(&errout, "next_c == ", image(next_c)) + next + } + + &null : stop("iparse_tokens (lexer): unexpected EOF") + + default : { + # If we get to here, we have either whitespace, an + # integer or real literal, an identifier or reserved + # word (both get handled by do_identifier), or an + # operator. The question of which we have can be + # determined by checking the first character. + # + if any(whitespace, next_c) then { + # Like all of the TOK forming procedures, + # do_whitespace resets next_c. + do_whitespace(getchar, whitespace) + # don't suspend any tokens + next + } + if any(&digits, next_c) then { + last_token := do_digits(getchar) + suspend last_token + next + } + if any(&letters ++ '_', next_c) then { + last_token := do_identifier(getchar, reserved_tbl) + suspend last_token + next + } +# write(&errout, "it's an operator") + last_token := do_operator(getchar, operators) + suspend last_token + next + } + } + } + + # If stream argument is nonnull, then we are in the top-level + # iparse_tokens(). If not, then we are in a recursive call, and + # we should not emit all this end-of-file crap. + # + if \stream then { + return TOK("EOFX") + } + else fail + +end + + +# +# do_dot: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that produces the next +# character from the input stream and t is a token record whose +# sym field contains either "REALLIT" or "DOT". Essentially, +# do_dot checks the next char on the input stream to see if it's +# an integer. Since the preceding char was a dot, an integer +# tips us off that we have a real literal. Otherwise, it's just +# a dot operator. Note that do_dot resets next_c for the next +# cycle through the main case loop in the calling procedure. +# +procedure do_dot(getchar) + + local token + # global next_c + +# write(&errout, "it's a dot") + + # If dot's followed by a digit, then we have a real literal. + # + if any(&digits, next_c := @getchar) then { +# write(&errout, "dot -> it's a real literal") + token := "." || next_c + while any(&digits, next_c := @getchar) do + token ||:= next_c + if token ||:= (next_c == ("e"|"E")) then { + while (next_c := @getchar) == "0" + while any(&digits, next_c) do { + token ||:= next_c + next_c = @getchar + } + } + return TOK("REALLIT", token) + } + + # Dot not followed by an integer; so we just have a dot operator, + # and not a real literal. + # +# write(&errout, "dot -> just a plain dot") + return TOK("DOT", ".") + +end + + +# +# do_newline: coexpression x TOK record x table -> TOK records +# (getchar, last_token, be_tbl) -> Ts (a generator) +# +# Where getchar is the coexpression that returns the next +# character from the input stream, last_token is the last TOK +# record suspended by the calling procedure, be_tbl is a table of +# tokens and their "beginner/ender" status, and Ts are TOK +# records. Note that do_newline resets next_c. Do_newline is a +# mess. What it does is check the last token suspended by the +# calling procedure to see if it was a beginner or ender. It +# then gets the next token by calling iparse_tokens again. If +# the next token is a beginner and the last token is an ender, +# then we have to suspend a SEMICOL token. In either event, both +# the last and next token are suspended. +# +procedure do_newline(getchar, last_token, be_tbl) + + local next_token + # global next_c + +# write(&errout, "it's a newline") + + # Go past any additional newlines. + # + while next_c == "\n" do { + # NL can be the last char in the getchar stream; if it *is*, + # then signal that it's time to break out of the repeat loop + # in the calling procedure. + # + next_c := @getchar | { + next_c := &null + fail + } + suspend TOK(&null, next_c == "\n") + } + + # If there was a last token (i.e. if a newline wasn't the first + # character of significance in the input stream), then check to + # see if it was an ender. If so, then check to see if the next + # token is a beginner. If so, then suspend a TOK("SEMICOL") + # record before suspending the next token. + # + if find("e", be_tbl[(\last_token).sym]) then { +# write(&errout, "calling iparse_tokens via do_newline") +# &trace := -1 + # First arg to iparse_tokens can be null here. + \ (next_token := iparse_tokens(&null, getchar)).sym + if \next_token then { +# write(&errout, "call of iparse_tokens via do_newline yields ", +# ximage(next_token)) + if find("b", be_tbl[next_token.sym]) + then suspend TOK("SEMICOL", "\n") + # + # See below. If this were like the real Icon parser, + # the following line would be commented out. + # + else suspend TOK(&null, "\n") + return next_token + } + else { + # + # If this were a *real* Icon tokenizer, it would not emit + # any record here, but would simply fail. Instead, we'll + # emit a dummy record with a null sym field. + # + return TOK(&null, "\n") +# &trace := 0 +# fail + } + } + + # See above. Again, if this were like Icon's own tokenizer, we + # would just fail here, and not return any TOK record. + # +# &trace := 0 + return TOK(&null, "\n") +# fail + +end + + +# +# do_number_sign: coexpression -> &null +# getchar -> +# +# Where getchar is the coexpression that pops characters off the +# main input stream. Sets the global variable next_c. This +# procedure simply reads characters until it gets a newline, then +# returns with next_c == "\n". Since the starting character was +# a number sign, this has the effect of stripping comments. +# +procedure do_number_sign(getchar) + + # global next_c + +# write(&errout, "it's a number sign") + while next_c ~== "\n" do { + next_c := @getchar + } + + # Return to calling procedure to cycle around again with the new + # next_c already set. Next_c should always be "\n" at this point. + return + +end + + +# +# do_quotation_mark: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that yields another character +# from the input stream, and t is a TOK record with "STRINGLIT" +# as its sym field. Puts everything upto and including the next +# non-backslashed quotation mark into the str field. Handles the +# underscore continuation convention. +# +procedure do_quotation_mark(getchar) + + local token + # global next_c + + # write(&errout, "it's a string literal") + token := "\"" + next_c := @getchar + repeat { + if next_c == "\n" & token[-1] == "_" then { + token := token[1:-1] + while any('\t ', next_c := @getchar) + next + } else { + if slshupto('"', token ||:= next_c, 2) + then { + next_c := @getchar + # resume outermost (repeat) loop in calling procedure, + # with the new (here explicitly set) next_c + return TOK("STRINGLIT", token) + } + next_c := @getchar + } + } + +end + + +# +# do_apostrophe: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that yields another character +# from the input stream, and t is a TOK record with "CSETLIT" +# as its sym field. Puts everything upto and including the next +# non-backslashed apostrope into the str field. +# +procedure do_apostrophe(getchar) + + local token + # global next_c + +# write(&errout, "it's a cset literal") + token := "'" + next_c := @getchar + repeat { + if next_c == "\n" & token[-1] == "_" then { + token := token[1:-1] + while any('\t ', next_c := @getchar) + next + } else { + if slshupto("'", token ||:= next_c, 2) + then { + next_c := @getchar + # Return & resume outermost containing loop in calling + # procedure w/ new next_c. + return TOK("CSETLIT", token) + } + next_c := @getchar + } + } + +end + + +# +# do_digits: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that produces the next char +# on the input stream, and where t is a TOK record containing +# either "REALLIT" or "INTLIT" in its sym field, and the text of +# the numeric literal in its str field. +# +procedure do_digits(getchar) + + local token, tok_record, extras, digits, over + # global next_c + + # For bases > 16 + extras := "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz" + # Assume integer literal until proven otherwise.... + tok_record := TOK("INTLIT") + +# write(&errout, "it's an integer or real literal") + token := ("0" ~== next_c) | "" + while any(&digits, next_c := @getchar) do + token ||:= next_c + if token ||:= (next_c == ("R"|"r")) then { + digits := &digits + if over := ((10 < token[1:-1]) - 10) * 2 then + digits ++:= extras[1:over+1] | extras + next_c := @getchar + if next_c == "-" then { + token ||:= next_c + next_c := @getchar + } + while any(digits, next_c) do { + token ||:= next_c + next_c := @getchar + } + } else { + if token ||:= (next_c == ".") then { + while any(&digits, next_c := @getchar) do + token ||:= next_c + tok_record := TOK("REALLIT") + } + if token ||:= (next_c == ("e"|"E")) then { + next_c := @getchar + if next_c == "-" then { + token ||:= next_c + next_c := @getchar + } + while any(&digits, next_c) do { + token ||:= next_c + next_c := @getchar + } + tok_record := TOK("REALLIT") + } + } + tok_record.str := ("" ~== token) | "0" + return tok_record + +end + + +# +# do_whitespace: coexpression x cset -> &null +# getchar x whitespace -> &null +# +# Where getchar is the coexpression producing the next char on +# the input stream. Do_whitespace just repeats until it finds a +# non-whitespace character, whitespace being defined as +# membership of a given character in the whitespace argument (a +# cset). +# +procedure do_whitespace(getchar, whitespace) + +# write(&errout, "it's junk") + while any(whitespace, next_c) do + next_c := @getchar + return + +end + + +# +# do_identifier: coexpression x table -> TOK record +# (getchar, reserved_tbl) -> t +# +# Where getchar is the coexpression that pops off characters from +# the input stream, reserved_tbl is a table of reserved words +# (keys = the string values, values = the names qua symbols in +# the grammar), and t is a TOK record containing all subsequent +# letters, digits, or underscores after next_c (which must be a +# letter or underscore). Note that next_c is global and gets +# reset by do_identifier. +# +procedure do_identifier(getchar, reserved_tbl) + + local token + # global next_c + +# write(&errout, "it's an indentifier") + token := next_c + while any(&letters ++ &digits ++ '_', next_c := @getchar) + do token ||:= next_c + return TOK(\reserved_tbl[token], token) | TOK("IDENT", token) + +end + + +# +# do_operator: coexpression x list -> TOK record +# (getchar, operators) -> t +# +# Where getchar is the coexpression that produces the next +# character on the input stream, operators is the operator list, +# and where t is a TOK record describing the operator just +# scanned. Calls recognop, which creates a DFSA to recognize +# valid Icon operators. Arg2 (operators) is the list of lists +# containing valid Icon operator string values and names (see +# above). +# +procedure do_operator(getchar, operators) + + local token, elem + + token := next_c + + # Go until recognop fails. + while elem := recognop(operators, token, 1) do + token ||:= (next_c := @getchar) +# write(&errout, ximage(elem)) + if *\elem = 1 then + return TOK(elem[1][2], elem[1][1]) + else fail + +end + + +record dfstn_state(b, e, tbl) +record start_state(b, e, tbl, master_list) +# +# recognop: list x string x integer -> list +# (l, s, i) -> l2 +# +# Where l is the list of lists created by the calling procedure +# (each element contains a token string value, name, and +# beginner/ender string), where s is a string possibly +# corresponding to a token in the list, where i is the position in +# the elements of l where the operator string values are recorded, +# and where l2 is a list of elements from l that contain operators +# for which string s is an exact match. Fails if there are no +# operators that s is a prefix of, but returns an empty list if +# there just aren't any that happen to match exactly. +# +# What this does is let the calling procedure just keep adding +# characters to s until recognop fails, then check the last list +# it returned to see if it is of length 1. If it is, then it +# contains list with the vital stats for the operator last +# recognized. If it is of length 0, then string s did not +# contain any recognizable operator. +# +procedure recognop(l, s, i) + + local current_state, master_list, c, result, j + static dfstn_table + initial dfstn_table := table() + + /i := 1 + # See if we've created an automaton for l already. + /dfstn_table[l] := start_state(1, *l, &null, &null) & { + dfstn_table[l].master_list := sortf(l, i) + } + + current_state := dfstn_table[l] + # Save master_list, as current_state will change later on. + master_list := current_state.master_list + + s ? { + while c := move(1) do { + + # Null means that this part of the automaton isn't + # complete. + # + if /current_state.tbl then + create_arcs(master_list, i, current_state, &pos) + + # If the table has been clobbered, then there are no arcs + # leading out of the current state. Fail. + # + if current_state.tbl === 0 then + fail + +# write(&errout, "c = ", image(c)) +# write(&errout, "table for current state = ", +# ximage(current_state.tbl)) + + # If we get to here, the current state has arcs leading + # out of it. See if c is one of them. If so, make the + # node to which arc c is connected the current state. + # Otherwise fail. + # + current_state := \current_state.tbl[c] | fail + } + } + + # Return possible completions. + # + result := list() + every j := current_state.b to current_state.e do { + if *master_list[j][i] = *s then + put(result, master_list[j]) + } + # return empty list if nothing the right length is found + return result + +end + + +# +# create_arcs: fill out a table of arcs leading out of the current +# state, and place that table in the tbl field for +# current_state +# +procedure create_arcs(master_list, field, current_state, POS) + + local elem, i, first_char, old_first_char + + current_state.tbl := table() + old_first_char := "" + + every elem := master_list[i := current_state.b to current_state.e][field] + do { + + # Get the first character for the current position (note that + # we're one character behind the calling routine; hence + # POS-1). + # + first_char := elem[POS-1] | next + + # If we have a new first character, create a new arc out of + # the current state. + # + if first_char ~== old_first_char then { + # Store the start position for the current character. + current_state.tbl[first_char] := dfstn_state(i) + # Store the end position for the old character. + (\current_state.tbl[old_first_char]).e := i-1 + old_first_char := first_char + } + } + (\current_state.tbl[old_first_char]).e := i + + # Clobber table with 0 if no arcs were added. + current_state.tbl := (*current_state.tbl = 0) + return current_state + +end |