summary refs log tree commit diff
path: root/ipl/packs/ibpag2/itokens.icn
diff options
context:
space:
mode:
Diffstat (limited to 'ipl/packs/ibpag2/itokens.icn')
-rw-r--r--  ipl/packs/ibpag2/itokens.icn  925
1 files changed, 925 insertions, 0 deletions
diff --git a/ipl/packs/ibpag2/itokens.icn b/ipl/packs/ibpag2/itokens.icn
new file mode 100644
index 0000000..1bb9cd1
--- /dev/null
+++ b/ipl/packs/ibpag2/itokens.icn
@@ -0,0 +1,925 @@
+############################################################################
+#
+# Name: itokens.icn
+#
+# Title: itokens (Icon source-file tokenizer)
+#
+# Author: Richard L. Goerwitz
+#
+# Version: 1.11
+#
+############################################################################
+#
+# This file contains itokens() - a utility for breaking Icon source
+# files up into individual tokens. This is the sort of routine one
+# needs to have around when implementing things like pretty printers,
+# preprocessors, code obfuscators, etc. It would also be useful for
+# implementing cut-down implementations of Icon written in Icon - the
+# sort of thing one might use in an interactive tutorial.
+#
+# Itokens(f, x) takes, as its first argument, f, an open file, and
+# suspends successive TOK records. TOK records contain two fields.
+# The first field, sym, contains a string that represents the name of
+# the next token (e.g. "CSET", "STRING", etc.). The second field,
+# str, gives that token's literal value. E.g. the TOK for a literal
+# semicolon is TOK("SEMICOL", ";"). For a mandatory newline, itokens
+# would suspend TOK("SEMICOL", "\n").
+#
+# Unlike Icon's own tokenizer, itokens() does not return an EOFX
+# token on end-of-file, but rather simply fails. It also can be
+# instructed to return syntactically meaningless newlines by passing
+# it a nonnull second argument (e.g. itokens(infile, 1)). These
+# meaningless newlines are returned as TOK records with a null sym
+# field (i.e. TOK(&null, "\n")).
+#
+# NOTE WELL: If new reserved words or operators are added to a given
+# implementation, the tables below will have to be altered. Note
+# also that &keywords should be implemented on the syntactic level -
+# not on the lexical one. As a result, a keyword like &features will
+# be suspended as TOK("CONJUNC", "&") and TOK("IDENT", "features").
+#
+############################################################################
+#
+# Links: slshupto
+#
+# Requires: coexpressions
+#
+############################################################################
+
+#link ximage, slshupto
+link slshupto #make sure you have version 1.2 or above
+
+global next_c, line_number
+record TOK(sym, str)
+
+#
+# main: an Icon source code uglifier
+#
+# Stub main for testing; uncomment & compile. The resulting
+# executable will act as an Icon file compressor, taking the
+# standard input and outputting Icon code stripped of all
+# unnecessary whitespace. Guaranteed to make the code a visual
+# mess :-).
+#
+#procedure main()
+#
+# local separator, T
+# separator := ""
+# every T := itokens(&input) do {
+# if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
+# then writes(separator)
+# if T.sym == "SEMICOL" then writes(";") else writes(T.str)
+# if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
+# then separator := " " else separator := ""
+# }
+#
+#end
+
+
+#
+# itokens: file x anything -> TOK records (a generator)
+# (stream, nostrip) -> Rs
+#
+# Where stream is an open file, anything is any object (it only
+# matters whether it is null or not), and Rs are TOK records.
+# Note that itokens strips out useless newlines. If the second
+# argument is nonnull, itokens does not strip out superfluous
+# newlines. It may be useful to keep them when the original line
+# structure of the input file must be maintained.
+#
+procedure itokens(stream, nostrip)
+
+    local T, last_token
+
+    # last_token remembers the previously suspended TOK so the
+    # beginner/ender disambiguation below can look one token back;
+    # initialize to some meaningless value
+    last_token := TOK()
+
+    every T := \iparse_tokens(stream) do {
+	if \T.sym then {
+	    # A syntactically significant token.  EOFX marks
+	    # end-of-file; itokens simply fails instead of passing
+	    # it through to the caller.
+	    if T.sym == "EOFX" then fail
+	    else {
+		#
+		# If the last token was a semicolon, then interpret
+		# all ambiguously unary/binary sequences like "**" as
+		# beginners (** could be two unary stars or the [c]set
+		# intersection operator).
+		#
+		if \last_token.sym == "SEMICOL"
+		then suspend last_token := expand_fake_beginner(T)
+		else suspend last_token := T
+	    }
+	} else {
+	    # Null sym field = a syntactically meaningless newline.
+	    # Pass it along only if the caller asked for them.
+	    if \nostrip
+	    then suspend last_token := T
+	}
+    }
+
+end
+
+
+#
+# expand_fake_beginner: TOK record -> TOK records
+#
+# Some "beginner" tokens aren't really beginners. They are token
+# sequences that could be either a single binary operator or a
+# series of unary operators. The tokenizer's job is just to snap
+# up as many characters as could logically constitute an operator.
+# Here is where we decide whether to break the sequence up into
+# more than one op or not.
+#
+procedure expand_fake_beginner(next_token)
+
+    # exptbl maps the sym of each ambiguous multi-character operator
+    # to the list of single-operator TOKs it decomposes into when it
+    # must be read as a run of unary operators.
+    static exptbl
+    initial {
+	exptbl := table()
+	insert(exptbl, "CONCAT",  [TOK("BAR", "|"), TOK("BAR", "|")])
+	insert(exptbl, "DIFF",    [TOK("MINUS", "-"), TOK("MINUS", "-")])
+	insert(exptbl, "EQUIV",   [TOK("NUMEQ", "="), TOK("NUMEQ", "="),
+				   TOK("NUMEQ", "=")])
+	insert(exptbl, "INTER",   [TOK("STAR", "*"), TOK("STAR", "*")])
+	insert(exptbl, "LCONCAT", [TOK("BAR", "|"), TOK("BAR", "|"),
+				   TOK("BAR", "|")])
+	insert(exptbl, "LEXEQ",   [TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
+	insert(exptbl, "LEXNE",   [TOK("TILDE", "~"), TOK("NUMEQ", "="),
+				   TOK("NUMEQ", "=")])
+	insert(exptbl, "NOTEQUIV",[TOK("TILDE", "~"), TOK("NUMEQ","="),
+				   TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
+	insert(exptbl, "NUMNE",   [TOK("TILDE", "~"), TOK("NUMEQ","=")])
+	insert(exptbl, "UNION",   [TOK("PLUS", "+"), TOK("PLUS", "+")])
+    }
+
+    # If next_token is one of the ambiguous sequences, suspend its
+    # component unary TOKs one at a time; otherwise hand the token
+    # back unchanged.
+    if \exptbl[next_token.sym]
+    then suspend !exptbl[next_token.sym]
+    else return next_token
+
+end
+
+
+#
+# iparse_tokens: file -> TOK records (a generator)
+# (stream) -> tokens
+#
+# Where file is an open input stream, and tokens are TOK records
+# holding both the token type and actual token text.
+#
+# TOK records contain two parts, a preterminal symbol (the first
+# "sym" field), and the actual text of the token ("str"). The
+# parser only pays attention to the sym field, although the
+# strings themselves get pushed onto the value stack.
+#
+# Note the following kludge: Unlike real Icon tokenizers, this
+# procedure returns syntactically meaningless newlines as TOK
+# records with a null sym field. Normally they would be ignored.
+# I wanted to return them so they could be printed on the output
+# stream, thus preserving the line structure of the original
+# file, and making later diagnostic messages more usable.
+#
+procedure iparse_tokens(stream, getchar)
+
+    local elem, whitespace, token, last_token, primitives, reserveds
+    static be_tbl, reserved_tbl, operators
+    initial {
+
+	# Primitive Tokens
+	#
+	# The third field of each entry is its "beginner/ender"
+	# status: "b" = can begin an expression, "e" = can end one,
+	# "be" = both, "" = neither.  be_tbl (below) is keyed on the
+	# token name and holds this string; do_newline consults it to
+	# decide where to insert semicolons.
+	primitives := [
+	    ["identifier",		"IDENT",	"be"],
+	    ["integer-literal",	"INTLIT",	"be"],
+	    ["real-literal",	"REALLIT",	"be"],
+	    ["string-literal",	"STRINGLIT",	"be"],
+	    ["cset-literal",	"CSETLIT",	"be"],
+	    ["end-of-file",		"EOFX",		""  ]]
+
+	# Reserved Words
+	#
+	reserveds := [
+	    ["break",		"BREAK",	"be"],
+	    ["by",			"BY",		""  ],
+	    ["case",		"CASE",		"b" ],
+	    ["create",		"CREATE",	"b" ],
+	    ["default",		"DEFAULT",	"b" ],
+	    ["do",			"DO",		""  ],
+	    ["else",		"ELSE",		""  ],
+	    ["end",			"END",		"b" ],
+	    ["every",		"EVERY",	"b" ],
+	    ["fail",		"FAIL",		"be"],
+	    ["global",		"GLOBAL",	""  ],
+	    ["if",			"IF",		"b" ],
+	    ["initial",		"INITIAL",	"b" ],
+	    ["invocable",		"INVOCABLE",	""  ],
+	    ["link",		"LINK",		""  ],
+	    ["local",		"LOCAL",	"b" ],
+	    ["next",		"NEXT",		"be"],
+	    ["not",			"NOT",		"b" ],
+	    ["of",			"OF",		""  ],
+	    ["procedure",		"PROCEDURE",	""  ],
+	    ["record",		"RECORD",	""  ],
+	    ["repeat",		"REPEAT",	"b" ],
+	    ["return",		"RETURN",	"be"],
+	    ["static",		"STATIC",	"b" ],
+	    ["suspend",		"SUSPEND",	"be"],
+	    ["then",		"THEN",		""  ],
+	    ["to",			"TO",		""  ],
+	    ["until",		"UNTIL",	"b" ],
+	    ["while",		"WHILE",	"b" ]]
+
+	# Operators
+	#
+	operators := [
+	    [":=",		"ASSIGN",	""  ],
+	    ["@",		"AT",		"b" ],
+	    ["@:=",		"AUGACT",	""  ],
+	    ["&:=",		"AUGAND",	""  ],
+	    ["=:=",		"AUGEQ",	""  ],
+	    ["===:=",		"AUGEQV",	""  ],
+	    [">=:=",		"AUGGE",	""  ],
+	    [">:=",		"AUGGT",	""  ],
+	    ["<=:=",		"AUGLE",	""  ],
+	    ["<:=",		"AUGLT",	""  ],
+	    ["~=:=",		"AUGNE",	""  ],
+	    ["~===:=",		"AUGNEQV",	""  ],
+	    ["==:=",		"AUGSEQ",	""  ],
+	    [">>=:=",		"AUGSGE",	""  ],
+	    [">>:=",		"AUGSGT",	""  ],
+	    ["<<=:=",		"AUGSLE",	""  ],
+	    ["<<:=",		"AUGSLT",	""  ],
+	    ["~==:=",		"AUGSNE",	""  ],
+	    ["\\",		"BACKSLASH",	"b" ],
+	    ["!",		"BANG",		"b" ],
+	    ["|",		"BAR",		"b" ],
+	    ["^",		"CARET",	"b" ],
+	    ["^:=",		"CARETASGN",	"b" ],
+	    [":",		"COLON",	""  ],
+	    [",",		"COMMA",	""  ],
+	    ["||",		"CONCAT",	"b" ],
+	    ["||:=",		"CONCATASGN",""  ],
+	    ["&",		"CONJUNC",	"b" ],
+	    [".",		"DOT",		"b" ],
+	    ["--",		"DIFF",		"b" ],
+	    ["--:=",		"DIFFASGN",	""  ],
+	    ["===",		"EQUIV",	"b" ],
+	    ["**",		"INTER",	"b" ],
+	    ["**:=",		"INTERASGN",	""  ],
+	    ["{",		"LBRACE",	"b" ],
+	    ["[",		"LBRACK",	"b" ],
+	    ["|||",		"LCONCAT",	"b" ],
+	    ["|||:=",		"LCONCATASGN",""  ],
+	    ["==",		"LEXEQ",	"b" ],
+	    [">>=",		"LEXGE",	""  ],
+	    [">>",		"LEXGT",	""  ],
+	    ["<<=",		"LEXLE",	""  ],
+	    ["<<",		"LEXLT",	""  ],
+	    ["~==",		"LEXNE",	"b" ],
+	    ["(",		"LPAREN",	"b" ],
+	    ["-:",		"MCOLON",	""  ],
+	    ["-",		"MINUS",	"b" ],
+	    ["-:=",		"MINUSASGN",	""  ],
+	    ["%",		"MOD",		""  ],
+	    ["%:=",		"MODASGN",	""  ],
+	    ["~===",		"NOTEQUIV",	"b" ],
+	    ["=",		"NUMEQ",	"b" ],
+	    [">=",		"NUMGE",	""  ],
+	    [">",		"NUMGT",	""  ],
+	    ["<=",		"NUMLE",	""  ],
+	    ["<",		"NUMLT",	""  ],
+	    ["~=",		"NUMNE",	"b" ],
+	    ["+:",		"PCOLON",	""  ],
+	    ["+",		"PLUS",		"b" ],
+	    ["+:=",		"PLUSASGN",	""  ],
+	    ["?",		"QMARK",	"b" ],
+	    ["<-",		"REVASSIGN",	""  ],
+	    ["<->",		"REVSWAP",	""  ],
+	    ["}",		"RBRACE",	"e" ],
+	    ["]",		"RBRACK",	"e" ],
+	    [")",		"RPAREN",	"e" ],
+	    [";",		"SEMICOL",	""  ],
+	    ["?:=",		"SCANASGN",	""  ],
+	    ["/",		"SLASH",	"b" ],
+	    ["/:=",		"SLASHASGN",	""  ],
+	    ["*",		"STAR",		"b" ],
+	    ["*:=",		"STARASGN",	""  ],
+	    [":=:",		"SWAP",		""  ],
+	    ["~",		"TILDE",	"b" ],
+	    ["++",		"UNION",	"b" ],
+	    ["++:=",		"UNIONASGN",	""  ],
+	    ["$(",		"LBRACE",	"b" ],
+	    ["$)",		"RBRACE",	"e" ],
+	    ["$<",		"LBRACK",	"b" ],
+	    ["$>",		"RBRACK",	"e" ],
+	    ["$",		"RHSARG",	"b" ],
+	    ["%$(",		"BEGGLOB",	"b" ],
+	    ["%$)",		"ENDGLOB",	"e" ],
+	    ["%{",		"BEGGLOB",	"b" ],
+	    ["%}",		"ENDGLOB",	"e" ],
+	    ["%%",		"NEWSECT",	"be"]]
+
+	# static be_tbl, reserved_tbl
+	reserved_tbl := table()
+	every elem := !reserveds do
+	    insert(reserved_tbl, elem[1], elem[2])
+	be_tbl := table()
+	every elem := !primitives | !reserveds | !operators do {
+	    insert(be_tbl, elem[2], elem[3])
+	}
+    }
+
+    # At top level, getchar is null and we build it here: a
+    # coexpression producing the characters of stream one line at a
+    # time with a "\n" appended to each line, bumping the global
+    # line_number as a side effect.  When do_newline calls us
+    # recursively, it passes its own getchar and a null stream.
+    /getchar := create {
+	line_number := 0
+	! ( 1(!stream, line_number +:=1) || "\n" )
+    }
+    whitespace := ' \t'
+    # Prime the one-character lookahead (global next_c).  An empty
+    # stream yields EOFX immediately at top level; in a recursive
+    # call (null stream) we just fail.
+    /next_c := @getchar | {
+	if \stream then
+	    return TOK("EOFX")
+	else fail
+    }
+
+    repeat {
+	case next_c of {
+
+	    "."  :  {
+		# Could be a real literal *or* a dot operator.  Check
+		# following character to see if it's a digit.  If so,
+		# it's a real literal.  We can only get away with
+		# doing the dot here because it is not a substring of
+		# any longer identifier.  If this gets changed, we'll
+		# have to move this code into do_operator().
+		#
+		last_token := do_dot(getchar)
+		suspend last_token
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    "\n" :  {
+		# If do_newline fails, it means we're at the end of
+		# the input stream, and we should break out of the
+		# repeat loop.
+		#
+		every last_token := do_newline(getchar, last_token, be_tbl)
+		do suspend last_token
+		if next_c === &null then break
+		next
+	    }
+
+	    "\#" :  {
+		# Just a comment.  Strip it by reading every character
+		# up to the next newline.  The global var next_c
+		# should *always* == "\n" when this is done.
+		#
+		do_number_sign(getchar)
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    "\"" :  {
+		# Suspend as STRINGLIT everything from here up to the
+		# next non-backslashed quotation mark, inclusive
+		# (accounting for the _ line-continuation convention).
+		#
+		last_token := do_quotation_mark(getchar)
+		suspend last_token
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    "'"  :  {
+		# Suspend as CSETLIT everything from here up to the
+		# next non-backslashed apostrophe, inclusive.
+		#
+		last_token := do_apostrophe(getchar)
+		suspend last_token
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    &null : stop("iparse_tokens (lexer):  unexpected EOF")
+
+	    default  :  {
+		# If we get to here, we have either whitespace, an
+		# integer or real literal, an identifier or reserved
+		# word (both get handled by do_identifier), or an
+		# operator.  The question of which we have can be
+		# determined by checking the first character.
+		#
+		if any(whitespace, next_c) then {
+		    # Like all of the TOK forming procedures,
+		    # do_whitespace resets next_c.
+		    do_whitespace(getchar, whitespace)
+		    # don't suspend any tokens
+		    next
+		}
+		if any(&digits, next_c) then {
+		    last_token := do_digits(getchar)
+		    suspend last_token
+		    next
+		}
+		if any(&letters ++ '_', next_c) then {
+		    last_token := do_identifier(getchar, reserved_tbl)
+		    suspend last_token
+		    next
+		}
+#		write(&errout, "it's an operator")
+		last_token := do_operator(getchar, operators)
+		suspend last_token
+		next
+	    }
+	}
+    }
+
+    # If stream argument is nonnull, then we are in the top-level
+    # iparse_tokens().  If not, then we are in a recursive call, and
+    # we should not emit all this end-of-file crap.
+    #
+    if \stream then {
+	return TOK("EOFX")
+    }
+    else fail
+
+end
+
+
+#
+# do_dot: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that produces the next
+# character from the input stream and t is a token record whose
+# sym field contains either "REALLIT" or "DOT". Essentially,
+# do_dot checks the next char on the input stream to see if it's
+# an integer. Since the preceding char was a dot, an integer
+# tips us off that we have a real literal. Otherwise, it's just
+# a dot operator. Note that do_dot resets next_c for the next
+# cycle through the main case loop in the calling procedure.
+#
+procedure do_dot(getchar)
+
+    local token
+    # global next_c
+
+# write(&errout, "it's a dot")
+
+    # If dot's followed by a digit, then we have a real literal.
+    #
+    if any(&digits, next_c := @getchar) then {
+#	write(&errout, "dot -> it's a real literal")
+	token := "." || next_c
+	while any(&digits, next_c := @getchar) do
+	    token ||:= next_c
+	# Optional exponent, mirroring the exponent handling in
+	# do_digits: accept an optional minus sign, then gather the
+	# exponent's digits.  (The previous version used "=" -
+	# numeric comparison - instead of ":=" inside the digit
+	# loop, so next_c never advanced and the loop ran forever;
+	# it also silently dropped "0" exponent digits and never
+	# accepted a minus sign.)
+	if token ||:= (next_c == ("e"|"E")) then {
+	    next_c := @getchar
+	    if next_c == "-" then {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	    while any(&digits, next_c) do {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	}
+	return TOK("REALLIT", token)
+    }
+
+    # Dot not followed by an integer; so we just have a dot operator,
+    # and not a real literal.
+    #
+#   write(&errout, "dot -> just a plain dot")
+    return TOK("DOT", ".")
+
+end
+
+
+#
+# do_newline: coexpression x TOK record x table -> TOK records
+# (getchar, last_token, be_tbl) -> Ts (a generator)
+#
+# Where getchar is the coexpression that returns the next
+# character from the input stream, last_token is the last TOK
+# record suspended by the calling procedure, be_tbl is a table of
+# tokens and their "beginner/ender" status, and Ts are TOK
+# records. Note that do_newline resets next_c. Do_newline is a
+# mess. What it does is check the last token suspended by the
+# calling procedure to see if it was a beginner or ender. It
+# then gets the next token by calling iparse_tokens again. If
+# the next token is a beginner and the last token is an ender,
+# then we have to suspend a SEMICOL token. In either event, both
+# the last and next token are suspended.
+#
+procedure do_newline(getchar, last_token, be_tbl)
+
+    local next_token
+    # global next_c
+
+# write(&errout, "it's a newline")
+
+    # Go past any additional newlines, suspending each one as a
+    # null-sym (syntactically meaningless) TOK.
+    #
+    while next_c == "\n" do {
+	# NL can be the last char in the getchar stream; if it *is*,
+	# then signal that it's time to break out of the repeat loop
+	# in the calling procedure.
+	#
+	next_c := @getchar | {
+	    next_c := &null
+	    fail
+	}
+	# The second argument (next_c == "\n") succeeds - and hence
+	# the suspend happens - only when the newly read character is
+	# itself another newline; otherwise the suspend expression
+	# fails harmlessly and the loop exits.
+	suspend TOK(&null, next_c == "\n")
+    }
+
+    # If there was a last token (i.e. if a newline wasn't the first
+    # character of significance in the input stream), then check to
+    # see if it was an ender.  If so, then check to see if the next
+    # token is a beginner.  If so, then suspend a TOK("SEMICOL")
+    # record before suspending the next token.
+    #
+    if find("e", be_tbl[(\last_token).sym]) then {
+# write(&errout, "calling iparse_tokens via do_newline")
+# &trace := -1
+	# First arg to iparse_tokens can be null here.  (A null
+	# stream marks the call as recursive; iparse_tokens then
+	# reads via the getchar coexpression we pass along.)
+	\ (next_token := iparse_tokens(&null, getchar)).sym
+	if \next_token then {
+# write(&errout, "call of iparse_tokens via do_newline yields ",
+#	ximage(next_token))
+	    if find("b", be_tbl[next_token.sym])
+	    then suspend TOK("SEMICOL", "\n")
+	    #
+	    # See below.  If this were like the real Icon parser,
+	    # the following line would be commented out.
+	    #
+	    else suspend TOK(&null, "\n")
+	    return next_token
+	}
+	else {
+	    #
+	    # If this were a *real* Icon tokenizer, it would not emit
+	    # any record here, but would simply fail.  Instead, we'll
+	    # emit a dummy record with a null sym field.
+	    #
+	    return TOK(&null, "\n")
+# &trace := 0
+#	    fail
+	}
+    }
+
+    # See above.  Again, if this were like Icon's own tokenizer, we
+    # would just fail here, and not return any TOK record.
+    #
+# &trace := 0
+    return TOK(&null, "\n")
+# fail
+
+end
+
+
+#
+# do_number_sign: coexpression -> &null
+# getchar ->
+#
+# Where getchar is the coexpression that pops characters off the
+# main input stream. Sets the global variable next_c. This
+# procedure simply reads characters until it gets a newline, then
+# returns with next_c == "\n". Since the starting character was
+# a number sign, this has the effect of stripping comments.
+#
+procedure do_number_sign(getchar)
+
+    # global next_c
+
+#   write(&errout, "it's a number sign")
+    # Consume characters through the end of the current line; the
+    # comment text itself is simply discarded.
+    until next_c == "\n" do
+	next_c := @getchar
+
+    # Return to calling procedure to cycle around again with the new
+    # next_c already set.  Next_c should always be "\n" at this point.
+    return
+
+end
+
+
+#
+# do_quotation_mark: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that yields another character
+# from the input stream, and t is a TOK record with "STRINGLIT"
+# as its sym field. Puts everything upto and including the next
+# non-backslashed quotation mark into the str field. Handles the
+# underscore continuation convention.
+#
+procedure do_quotation_mark(getchar)
+
+    local token
+    # global next_c
+
+    # write(&errout, "it's a string literal")
+    # token accumulates the literal text, including both the opening
+    # and (eventually) the closing quotation marks.
+    token := "\""
+    next_c := @getchar
+    repeat {
+	# A trailing underscore before a newline is Icon's string
+	# continuation convention: drop the underscore and skip the
+	# next line's leading whitespace.
+	if next_c == "\n" & token[-1] == "_" then {
+	    token := token[1:-1]
+	    while any('\t ', next_c := @getchar)
+	    next
+	} else {
+	    # slshupto (from the linked IPL file; see the link
+	    # declaration above) succeeds when a non-backslashed
+	    # quotation mark appears at or beyond position 2 - i.e.
+	    # when the literal is complete.
+	    if slshupto('"', token ||:= next_c, 2)
+	    then {
+		next_c := @getchar
+		# resume outermost (repeat) loop in calling procedure,
+		# with the new (here explicitly set) next_c
+		return TOK("STRINGLIT", token)
+	    }
+	    next_c := @getchar
+	}
+    }
+
+end
+
+
+#
+# do_apostrophe: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that yields another character
+# from the input stream, and t is a TOK record with "CSETLIT"
+# as its sym field. Puts everything upto and including the next
+# non-backslashed apostrophe into the str field.
+#
+procedure do_apostrophe(getchar)
+
+    local token
+    # global next_c
+
+# write(&errout, "it's a cset literal")
+    # token accumulates the literal text, including both the opening
+    # and (eventually) the closing apostrophes.
+    token := "'"
+    next_c := @getchar
+    repeat {
+	# A trailing underscore before a newline is the line
+	# continuation convention: drop the underscore and skip the
+	# next line's leading whitespace.
+	if next_c == "\n" & token[-1] == "_" then {
+	    token := token[1:-1]
+	    while any('\t ', next_c := @getchar)
+	    next
+	} else {
+	    # slshupto (from the linked IPL file) succeeds when a
+	    # non-backslashed apostrophe appears at or beyond
+	    # position 2 - i.e. when the literal is complete.
+	    if slshupto("'", token ||:= next_c, 2)
+	    then {
+		next_c := @getchar
+		# Return & resume outermost containing loop in calling
+		# procedure w/ new next_c.
+		return TOK("CSETLIT", token)
+	    }
+	    next_c := @getchar
+	}
+    }
+
+end
+
+
+#
+# do_digits: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that produces the next char
+# on the input stream, and where t is a TOK record containing
+# either "REALLIT" or "INTLIT" in its sym field, and the text of
+# the numeric literal in its str field.
+#
+procedure do_digits(getchar)
+
+    local token, tok_record, extras, digits, over
+    # global next_c
+
+    # For bases > 16
+    extras := "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"
+    # Assume integer literal until proven otherwise....
+    tok_record := TOK("INTLIT")
+
+# write(&errout, "it's an integer or real literal")
+    # Strip one leading zero; if the literal turns out to be nothing
+    # but "0", the last line below restores it.
+    token := ("0" ~== next_c) | ""
+    while any(&digits, next_c := @getchar) do
+	token ||:= next_c
+    # Radix literal (e.g. 16rFF): the part before the "r"/"R" is the
+    # base; digits past 9 come from extras, which holds two chars
+    # (upper + lower case) per extra digit - hence the "* 2" below.
+    if token ||:= (next_c == ("R"|"r")) then {
+	digits := &digits
+	# (10 < base) succeeds - yielding the base - only for bases
+	# above 10; for a base over 36 the slice fails and the "|"
+	# falls back to all of extras.
+	if over := ((10 < token[1:-1]) - 10) * 2 then
+	    digits ++:= extras[1:over+1] | extras
+	next_c := @getchar
+	if next_c == "-" then {
+	    token ||:= next_c
+	    next_c := @getchar
+	}
+	while any(digits, next_c) do {
+	    token ||:= next_c
+	    next_c := @getchar
+	}
+    } else {
+	# A fractional part makes this a real literal.
+	if token ||:= (next_c == ".") then {
+	    while any(&digits, next_c := @getchar) do
+		token ||:= next_c
+	    tok_record := TOK("REALLIT")
+	}
+	# So does an exponent.
+	# NOTE(review): a "+" sign after the exponent marker is not
+	# accepted here (only "-") - confirm against the Icon
+	# grammar's real-literal rule.
+	if token ||:= (next_c == ("e"|"E")) then {
+	    next_c := @getchar
+	    if next_c == "-" then {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	    while any(&digits, next_c) do {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	    tok_record := TOK("REALLIT")
+	}
+    }
+    # Restore "0" for a literal that consisted only of zeros.
+    tok_record.str := ("" ~== token) | "0"
+    return tok_record
+
+end
+
+
+#
+# do_whitespace: coexpression x cset -> &null
+# getchar x whitespace -> &null
+#
+# Where getchar is the coexpression producing the next char on
+# the input stream. Do_whitespace just repeats until it finds a
+# non-whitespace character, whitespace being defined as
+# membership of a given character in the whitespace argument (a
+# cset).
+#
+procedure do_whitespace(getchar, whitespace)
+
+#   write(&errout, "it's junk")
+    # Advance next_c until it holds a character outside the
+    # whitespace cset; no token is formed.
+    repeat {
+	if any(whitespace, next_c)
+	then next_c := @getchar
+	else break
+    }
+    return
+
+end
+
+
+#
+# do_identifier: coexpression x table -> TOK record
+# (getchar, reserved_tbl) -> t
+#
+# Where getchar is the coexpression that pops off characters from
+# the input stream, reserved_tbl is a table of reserved words
+# (keys = the string values, values = the names qua symbols in
+# the grammar), and t is a TOK record containing all subsequent
+# letters, digits, or underscores after next_c (which must be a
+# letter or underscore). Note that next_c is global and gets
+# reset by do_identifier.
+#
+procedure do_identifier(getchar, reserved_tbl)
+
+    local token
+    # id_chars is loop invariant; build it once rather than redoing
+    # two cset unions for every character scanned.
+    static id_chars
+    initial id_chars := &letters ++ &digits ++ '_'
+    # global next_c
+
+#   write(&errout, "it's an identifier")
+    # Accumulate letters, digits, and underscores into token,
+    # leaving next_c holding the first character past the
+    # identifier.
+    token := next_c
+    while any(id_chars, next_c := @getchar)
+    do token ||:= next_c
+
+    # A reserved word comes back under its own grammar symbol;
+    # anything else is an IDENT.
+    return TOK(\reserved_tbl[token], token) | TOK("IDENT", token)
+
+end
+
+
+#
+# do_operator: coexpression x list -> TOK record
+# (getchar, operators) -> t
+#
+# Where getchar is the coexpression that produces the next
+# character on the input stream, operators is the operator list,
+# and where t is a TOK record describing the operator just
+# scanned. Calls recognop, which creates a DFSA to recognize
+# valid Icon operators. Arg2 (operators) is the list of lists
+# containing valid Icon operator string values and names (see
+# above).
+#
+procedure do_operator(getchar, operators)
+
+    local token, elem
+
+    token := next_c
+
+    # Go until recognop fails.
+    # Greedily extend token one character at a time for as long as
+    # it remains a prefix of some operator; elem keeps the result
+    # list from the last successful recognop call.
+    while elem := recognop(operators, token, 1) do
+	token ||:= (next_c := @getchar)
+# write(&errout, ximage(elem))
+    # A result list of length 1 means the scanned text exactly
+    # matched one operator.  elem is null (so the \ fails) when even
+    # the first character was no operator prefix; a length of 0
+    # means a prefix existed but never completed.  Fail either way.
+    if *\elem = 1 then
+	return TOK(elem[1][2], elem[1][1])
+    else fail
+
+end
+
+
+record dfstn_state(b, e, tbl)
+record start_state(b, e, tbl, master_list)
+#
+# recognop: list x string x integer -> list
+# (l, s, i) -> l2
+#
+# Where l is the list of lists created by the calling procedure
+# (each element contains a token string value, name, and
+# beginner/ender string), where s is a string possibly
+# corresponding to a token in the list, where i is the position in
+# the elements of l where the operator string values are recorded,
+# and where l2 is a list of elements from l that contain operators
+# for which string s is an exact match. Fails if there are no
+# operators that s is a prefix of, but returns an empty list if
+# there just aren't any that happen to match exactly.
+#
+# What this does is let the calling procedure just keep adding
+# characters to s until recognop fails, then check the last list
+# it returned to see if it is of length 1. If it is, then it
+# contains list with the vital stats for the operator last
+# recognized. If it is of length 0, then string s did not
+# contain any recognizable operator.
+#
+procedure recognop(l, s, i)
+
+    local current_state, master_list, c, result, j
+    # dfstn_table caches one lazily built automaton per operator
+    # list, keyed on the list value itself.
+    static dfstn_table
+    initial dfstn_table := table()
+
+    /i := 1
+    # See if we've created an automaton for l already.  On first
+    # sight of l, make a start state whose [b..e] range spans the
+    # whole of l, sorted on field i (the operator-string field).
+    /dfstn_table[l] := start_state(1, *l, &null, &null) & {
+	dfstn_table[l].master_list := sortf(l, i)
+    }
+
+    current_state := dfstn_table[l]
+    # Save master_list, as current_state will change later on.
+    master_list := current_state.master_list
+
+    s ? {
+	while c := move(1) do {
+
+	    # Null means that this part of the automaton isn't
+	    # complete.  Build this state's arcs on demand, passing
+	    # the current scanning position (&pos) along.
+	    #
+	    if /current_state.tbl then
+		create_arcs(master_list, i, current_state, &pos)
+
+	    # If the table has been clobbered, then there are no arcs
+	    # leading out of the current state.  Fail.
+	    #
+	    if current_state.tbl === 0 then
+		fail
+
+# write(&errout, "c = ", image(c))
+# write(&errout, "table for current state = ",
+#	ximage(current_state.tbl))
+
+	    # If we get to here, the current state has arcs leading
+	    # out of it.  See if c is one of them.  If so, make the
+	    # node to which arc c is connected the current state.
+	    # Otherwise fail.
+	    #
+	    current_state := \current_state.tbl[c] | fail
+	}
+    }
+
+    # Return possible completions.  Every entry in the current
+    # state's [b..e] range has s as a prefix; keep only those whose
+    # operator string is exactly s.
+    #
+    result := list()
+    every j := current_state.b to current_state.e do {
+	if *master_list[j][i] = *s then
+	    put(result, master_list[j])
+    }
+    # return empty list if nothing the right length is found
+    return result
+
+end
+
+
+#
+# create_arcs: fill out a table of arcs leading out of the current
+# state, and place that table in the tbl field for
+# current_state
+#
+procedure create_arcs(master_list, field, current_state, POS)
+
+    local elem, i, first_char, old_first_char
+
+    current_state.tbl := table()
+    old_first_char := ""
+
+    # master_list is sorted on the operator-string field, so all
+    # entries sharing a given character at POS-1 are contiguous;
+    # each distinct character gets one arc spanning its whole run.
+    every elem := master_list[i := current_state.b to current_state.e][field]
+    do {
+
+	# Get the first character for the current position (note that
+	# we're one character behind the calling routine; hence
+	# POS-1).  Entries too short to have a character there are
+	# simply skipped.
+	#
+	first_char := elem[POS-1] | next
+
+	# If we have a new first character, create a new arc out of
+	# the current state.
+	#
+	if first_char ~== old_first_char then {
+	    # Store the start position for the current character.
+	    current_state.tbl[first_char] := dfstn_state(i)
+	    # Store the end position for the old character.
+	    (\current_state.tbl[old_first_char]).e := i-1
+	    old_first_char := first_char
+	}
+    }
+    (\current_state.tbl[old_first_char]).e := i
+
+    # Clobber table with 0 if no arcs were added.
+    # (*tbl = 0 succeeds - yielding 0 - only for an empty table, so
+    # a dead-end state is marked with 0; otherwise the comparison
+    # fails and the table is left in place.)
+    current_state.tbl := (*current_state.tbl = 0)
+    return current_state
+
+end