summary refs log tree commit diff
path: root/ipl/packs/ibpag2/itokens.icn
diff options
context:
space:
mode:
Diffstat (limited to 'ipl/packs/ibpag2/itokens.icn')
-rw-r--r--  ipl/packs/ibpag2/itokens.icn  925
1 files changed, 925 insertions, 0 deletions
diff --git a/ipl/packs/ibpag2/itokens.icn b/ipl/packs/ibpag2/itokens.icn
new file mode 100644
index 0000000..1bb9cd1
--- /dev/null
+++ b/ipl/packs/ibpag2/itokens.icn
@@ -0,0 +1,925 @@
+############################################################################
+#
+# Name: itokens.icn
+#
+# Title: itokens (Icon source-file tokenizer)
+#
+# Author: Richard L. Goerwitz
+#
+# Version: 1.11
+#
+############################################################################
+#
+# This file contains itokens() - a utility for breaking Icon source
+# files up into individual tokens. This is the sort of routine one
+# needs to have around when implementing things like pretty printers,
+# preprocessors, code obfuscators, etc. It would also be useful for
+# implementing cut-down implementations of Icon written in Icon - the
+# sort of thing one might use in an interactive tutorial.
+#
+# Itokens(f, x) takes, as its first argument, f, an open file, and
+# suspends successive TOK records. TOK records contain two fields.
+# The first field, sym, contains a string that represents the name of
+# the next token (e.g. "CSET", "STRING", etc.). The second field,
+# str, gives that token's literal value. E.g. the TOK for a literal
+# semicolon is TOK("SEMICOL", ";"). For a mandatory newline, itokens
+# would suspend TOK("SEMICOL", "\n").
+#
+# Unlike Icon's own tokenizer, itokens() does not return an EOFX
+# token on end-of-file, but rather simply fails. It also can be
+# instructed to return syntactically meaningless newlines by passing
+# it a nonnull second argument (e.g. itokens(infile, 1)). These
+# meaningless newlines are returned as TOK records with a null sym
+# field (i.e. TOK(&null, "\n")).
+#
+# NOTE WELL: If new reserved words or operators are added to a given
+# implementation, the tables below will have to be altered. Note
+# also that &keywords should be implemented on the syntactic level -
+# not on the lexical one. As a result, a keyword like &features will
+# be suspended as TOK("CONJUNC", "&") and TOK("IDENT", "features").
+#
+############################################################################
+#
+# Links: slshupto
+#
+# Requires: coexpressions
+#
+############################################################################
+
+#link ximage, slshupto
+link slshupto #make sure you have version 1.2 or above
+
+global next_c, line_number
+record TOK(sym, str)
+
+#
+# main: an Icon source code uglifier
+#
+# Stub main for testing; uncomment & compile. The resulting
+# executable will act as an Icon file compressor, taking the
+# standard input and outputting Icon code stripped of all
+# unnecessary whitespace. Guaranteed to make the code a visual
+# mess :-).
+#
+#procedure main()
+#
+# local separator, T
+# separator := ""
+# every T := itokens(&input) do {
+# if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
+# then writes(separator)
+# if T.sym == "SEMICOL" then writes(";") else writes(T.str)
+# if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
+# then separator := " " else separator := ""
+# }
+#
+#end
+
+
+#
+# itokens: file x anything -> TOK records (a generator)
+# (stream, nostrip) -> Rs
+#
+# Where stream is an open file, anything is any object (it only
+# matters whether it is null or not), and Rs are TOK records.
+# Note that itokens strips out useless newlines. If the second
+# argument is nonnull, itokens does not strip out superfluous
+# newlines. It may be useful to keep them when the original line
+# structure of the input file must be maintained.
+#
+procedure itokens(stream, nostrip)
+
+    local T, last_token
+
+    # last_token remembers the previously suspended TOK so the
+    # beginner/ender disambiguation below can look one token back;
+    # initialize to some meaningless value
+    last_token := TOK()
+
+    every T := \iparse_tokens(stream) do {
+	if \T.sym then {
+	    # A syntactically significant token.  EOFX marks
+	    # end-of-file; itokens simply fails instead of passing
+	    # it through to the caller.
+	    if T.sym == "EOFX" then fail
+	    else {
+		#
+		# If the last token was a semicolon, then interpret
+		# all ambiguously unary/binary sequences like "**" as
+		# beginners (** could be two unary stars or the [c]set
+		# intersection operator).
+		#
+		if \last_token.sym == "SEMICOL"
+		then suspend last_token := expand_fake_beginner(T)
+		else suspend last_token := T
+	    }
+	} else {
+	    # Null sym field = a syntactically meaningless newline.
+	    # Pass it along only if the caller asked for them.
+	    if \nostrip
+	    then suspend last_token := T
+	}
+    }
+
+end
+
+
+#
+# expand_fake_beginner: TOK record -> TOK records
+#
+# Some "beginner" tokens aren't really beginners. They are token
+# sequences that could be either a single binary operator or a
+# series of unary operators. The tokenizer's job is just to snap
+# up as many characters as could logically constitute an operator.
+# Here is where we decide whether to break the sequence up into
+# more than one op or not.
+#
+procedure expand_fake_beginner(next_token)
+
+    # exptbl maps the sym of each ambiguous multi-character operator
+    # to the list of single-operator TOKs it decomposes into when it
+    # must be read as a run of unary operators.
+    static exptbl
+    initial {
+	exptbl := table()
+	insert(exptbl, "CONCAT",  [TOK("BAR", "|"), TOK("BAR", "|")])
+	insert(exptbl, "DIFF",    [TOK("MINUS", "-"), TOK("MINUS", "-")])
+	insert(exptbl, "EQUIV",   [TOK("NUMEQ", "="), TOK("NUMEQ", "="),
+				   TOK("NUMEQ", "=")])
+	insert(exptbl, "INTER",   [TOK("STAR", "*"), TOK("STAR", "*")])
+	insert(exptbl, "LCONCAT", [TOK("BAR", "|"), TOK("BAR", "|"),
+				   TOK("BAR", "|")])
+	insert(exptbl, "LEXEQ",   [TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
+	insert(exptbl, "LEXNE",   [TOK("TILDE", "~"), TOK("NUMEQ", "="),
+				   TOK("NUMEQ", "=")])
+	insert(exptbl, "NOTEQUIV",[TOK("TILDE", "~"), TOK("NUMEQ","="),
+				   TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
+	insert(exptbl, "NUMNE",   [TOK("TILDE", "~"), TOK("NUMEQ","=")])
+	insert(exptbl, "UNION",   [TOK("PLUS", "+"), TOK("PLUS", "+")])
+    }
+
+    # If next_token is one of the ambiguous sequences, suspend its
+    # component unary TOKs one at a time; otherwise hand the token
+    # back unchanged.
+    if \exptbl[next_token.sym]
+    then suspend !exptbl[next_token.sym]
+    else return next_token
+
+end
+
+
+#
+# iparse_tokens: file -> TOK records (a generator)
+# (stream) -> tokens
+#
+# Where file is an open input stream, and tokens are TOK records
+# holding both the token type and actual token text.
+#
+# TOK records contain two parts, a preterminal symbol (the first
+# "sym" field), and the actual text of the token ("str"). The
+# parser only pays attention to the sym field, although the
+# strings themselves get pushed onto the value stack.
+#
+# Note the following kludge: Unlike real Icon tokenizers, this
+# procedure returns syntactically meaningless newlines as TOK
+# records with a null sym field. Normally they would be ignored.
+# I wanted to return them so they could be printed on the output
+# stream, thus preserving the line structure of the original
+# file, and making later diagnostic messages more usable.
+#
+procedure iparse_tokens(stream, getchar)
+
+    local elem, whitespace, token, last_token, primitives, reserveds
+    static be_tbl, reserved_tbl, operators
+    initial {
+
+	# Primitive Tokens
+	#
+	# The third field of each entry is its "beginner/ender"
+	# status: "b" = can begin an expression, "e" = can end one,
+	# "be" = both, "" = neither.  be_tbl (below) is keyed on the
+	# token name and holds this string; do_newline consults it to
+	# decide where to insert semicolons.
+	primitives := [
+	    ["identifier",		"IDENT",	"be"],
+	    ["integer-literal",	"INTLIT",	"be"],
+	    ["real-literal",	"REALLIT",	"be"],
+	    ["string-literal",	"STRINGLIT",	"be"],
+	    ["cset-literal",	"CSETLIT",	"be"],
+	    ["end-of-file",		"EOFX",		""  ]]
+
+	# Reserved Words
+	#
+	reserveds := [
+	    ["break",		"BREAK",	"be"],
+	    ["by",			"BY",		""  ],
+	    ["case",		"CASE",		"b" ],
+	    ["create",		"CREATE",	"b" ],
+	    ["default",		"DEFAULT",	"b" ],
+	    ["do",			"DO",		""  ],
+	    ["else",		"ELSE",		""  ],
+	    ["end",			"END",		"b" ],
+	    ["every",		"EVERY",	"b" ],
+	    ["fail",		"FAIL",		"be"],
+	    ["global",		"GLOBAL",	""  ],
+	    ["if",			"IF",		"b" ],
+	    ["initial",		"INITIAL",	"b" ],
+	    ["invocable",		"INVOCABLE",	""  ],
+	    ["link",		"LINK",		""  ],
+	    ["local",		"LOCAL",	"b" ],
+	    ["next",		"NEXT",		"be"],
+	    ["not",			"NOT",		"b" ],
+	    ["of",			"OF",		""  ],
+	    ["procedure",		"PROCEDURE",	""  ],
+	    ["record",		"RECORD",	""  ],
+	    ["repeat",		"REPEAT",	"b" ],
+	    ["return",		"RETURN",	"be"],
+	    ["static",		"STATIC",	"b" ],
+	    ["suspend",		"SUSPEND",	"be"],
+	    ["then",		"THEN",		""  ],
+	    ["to",			"TO",		""  ],
+	    ["until",		"UNTIL",	"b" ],
+	    ["while",		"WHILE",	"b" ]]
+
+	# Operators
+	#
+	operators := [
+	    [":=",		"ASSIGN",	""  ],
+	    ["@",		"AT",		"b" ],
+	    ["@:=",		"AUGACT",	""  ],
+	    ["&:=",		"AUGAND",	""  ],
+	    ["=:=",		"AUGEQ",	""  ],
+	    ["===:=",		"AUGEQV",	""  ],
+	    [">=:=",		"AUGGE",	""  ],
+	    [">:=",		"AUGGT",	""  ],
+	    ["<=:=",		"AUGLE",	""  ],
+	    ["<:=",		"AUGLT",	""  ],
+	    ["~=:=",		"AUGNE",	""  ],
+	    ["~===:=",		"AUGNEQV",	""  ],
+	    ["==:=",		"AUGSEQ",	""  ],
+	    [">>=:=",		"AUGSGE",	""  ],
+	    [">>:=",		"AUGSGT",	""  ],
+	    ["<<=:=",		"AUGSLE",	""  ],
+	    ["<<:=",		"AUGSLT",	""  ],
+	    ["~==:=",		"AUGSNE",	""  ],
+	    ["\\",		"BACKSLASH",	"b" ],
+	    ["!",		"BANG",		"b" ],
+	    ["|",		"BAR",		"b" ],
+	    ["^",		"CARET",	"b" ],
+	    ["^:=",		"CARETASGN",	"b" ],
+	    [":",		"COLON",	""  ],
+	    [",",		"COMMA",	""  ],
+	    ["||",		"CONCAT",	"b" ],
+	    ["||:=",		"CONCATASGN",""  ],
+	    ["&",		"CONJUNC",	"b" ],
+	    [".",		"DOT",		"b" ],
+	    ["--",		"DIFF",		"b" ],
+	    ["--:=",		"DIFFASGN",	""  ],
+	    ["===",		"EQUIV",	"b" ],
+	    ["**",		"INTER",	"b" ],
+	    ["**:=",		"INTERASGN",	""  ],
+	    ["{",		"LBRACE",	"b" ],
+	    ["[",		"LBRACK",	"b" ],
+	    ["|||",		"LCONCAT",	"b" ],
+	    ["|||:=",		"LCONCATASGN",""  ],
+	    ["==",		"LEXEQ",	"b" ],
+	    [">>=",		"LEXGE",	""  ],
+	    [">>",		"LEXGT",	""  ],
+	    ["<<=",		"LEXLE",	""  ],
+	    ["<<",		"LEXLT",	""  ],
+	    ["~==",		"LEXNE",	"b" ],
+	    ["(",		"LPAREN",	"b" ],
+	    ["-:",		"MCOLON",	""  ],
+	    ["-",		"MINUS",	"b" ],
+	    ["-:=",		"MINUSASGN",	""  ],
+	    ["%",		"MOD",		""  ],
+	    ["%:=",		"MODASGN",	""  ],
+	    ["~===",		"NOTEQUIV",	"b" ],
+	    ["=",		"NUMEQ",	"b" ],
+	    [">=",		"NUMGE",	""  ],
+	    [">",		"NUMGT",	""  ],
+	    ["<=",		"NUMLE",	""  ],
+	    ["<",		"NUMLT",	""  ],
+	    ["~=",		"NUMNE",	"b" ],
+	    ["+:",		"PCOLON",	""  ],
+	    ["+",		"PLUS",		"b" ],
+	    ["+:=",		"PLUSASGN",	""  ],
+	    ["?",		"QMARK",	"b" ],
+	    ["<-",		"REVASSIGN",	""  ],
+	    ["<->",		"REVSWAP",	""  ],
+	    ["}",		"RBRACE",	"e" ],
+	    ["]",		"RBRACK",	"e" ],
+	    [")",		"RPAREN",	"e" ],
+	    [";",		"SEMICOL",	""  ],
+	    ["?:=",		"SCANASGN",	""  ],
+	    ["/",		"SLASH",	"b" ],
+	    ["/:=",		"SLASHASGN",	""  ],
+	    ["*",		"STAR",		"b" ],
+	    ["*:=",		"STARASGN",	""  ],
+	    [":=:",		"SWAP",		""  ],
+	    ["~",		"TILDE",	"b" ],
+	    ["++",		"UNION",	"b" ],
+	    ["++:=",		"UNIONASGN",	""  ],
+	    ["$(",		"LBRACE",	"b" ],
+	    ["$)",		"RBRACE",	"e" ],
+	    ["$<",		"LBRACK",	"b" ],
+	    ["$>",		"RBRACK",	"e" ],
+	    ["$",		"RHSARG",	"b" ],
+	    ["%$(",		"BEGGLOB",	"b" ],
+	    ["%$)",		"ENDGLOB",	"e" ],
+	    ["%{",		"BEGGLOB",	"b" ],
+	    ["%}",		"ENDGLOB",	"e" ],
+	    ["%%",		"NEWSECT",	"be"]]
+
+	# static be_tbl, reserved_tbl
+	reserved_tbl := table()
+	every elem := !reserveds do
+	    insert(reserved_tbl, elem[1], elem[2])
+	be_tbl := table()
+	every elem := !primitives | !reserveds | !operators do {
+	    insert(be_tbl, elem[2], elem[3])
+	}
+    }
+
+    # At top level, getchar is null and we build it here: a
+    # coexpression producing the characters of stream one line at a
+    # time with a "\n" appended to each line, bumping the global
+    # line_number as a side effect.  When do_newline calls us
+    # recursively, it passes its own getchar and a null stream.
+    /getchar := create {
+	line_number := 0
+	! ( 1(!stream, line_number +:=1) || "\n" )
+    }
+    whitespace := ' \t'
+    # Prime the one-character lookahead (global next_c).  An empty
+    # stream yields EOFX immediately at top level; in a recursive
+    # call (null stream) we just fail.
+    /next_c := @getchar | {
+	if \stream then
+	    return TOK("EOFX")
+	else fail
+    }
+
+    repeat {
+	case next_c of {
+
+	    "."  :  {
+		# Could be a real literal *or* a dot operator.  Check
+		# following character to see if it's a digit.  If so,
+		# it's a real literal.  We can only get away with
+		# doing the dot here because it is not a substring of
+		# any longer identifier.  If this gets changed, we'll
+		# have to move this code into do_operator().
+		#
+		last_token := do_dot(getchar)
+		suspend last_token
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    "\n" :  {
+		# If do_newline fails, it means we're at the end of
+		# the input stream, and we should break out of the
+		# repeat loop.
+		#
+		every last_token := do_newline(getchar, last_token, be_tbl)
+		do suspend last_token
+		if next_c === &null then break
+		next
+	    }
+
+	    "\#" :  {
+		# Just a comment.  Strip it by reading every character
+		# up to the next newline.  The global var next_c
+		# should *always* == "\n" when this is done.
+		#
+		do_number_sign(getchar)
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    "\"" :  {
+		# Suspend as STRINGLIT everything from here up to the
+		# next non-backslashed quotation mark, inclusive
+		# (accounting for the _ line-continuation convention).
+		#
+		last_token := do_quotation_mark(getchar)
+		suspend last_token
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    "'"  :  {
+		# Suspend as CSETLIT everything from here up to the
+		# next non-backslashed apostrophe, inclusive.
+		#
+		last_token := do_apostrophe(getchar)
+		suspend last_token
+#		write(&errout, "next_c == ", image(next_c))
+		next
+	    }
+
+	    &null : stop("iparse_tokens (lexer):  unexpected EOF")
+
+	    default  :  {
+		# If we get to here, we have either whitespace, an
+		# integer or real literal, an identifier or reserved
+		# word (both get handled by do_identifier), or an
+		# operator.  The question of which we have can be
+		# determined by checking the first character.
+		#
+		if any(whitespace, next_c) then {
+		    # Like all of the TOK forming procedures,
+		    # do_whitespace resets next_c.
+		    do_whitespace(getchar, whitespace)
+		    # don't suspend any tokens
+		    next
+		}
+		if any(&digits, next_c) then {
+		    last_token := do_digits(getchar)
+		    suspend last_token
+		    next
+		}
+		if any(&letters ++ '_', next_c) then {
+		    last_token := do_identifier(getchar, reserved_tbl)
+		    suspend last_token
+		    next
+		}
+#		write(&errout, "it's an operator")
+		last_token := do_operator(getchar, operators)
+		suspend last_token
+		next
+	    }
+	}
+    }
+
+    # If stream argument is nonnull, then we are in the top-level
+    # iparse_tokens().  If not, then we are in a recursive call, and
+    # we should not emit all this end-of-file crap.
+    #
+    if \stream then {
+	return TOK("EOFX")
+    }
+    else fail
+
+end
+
+
+#
+# do_dot: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that produces the next
+# character from the input stream and t is a token record whose
+# sym field contains either "REALLIT" or "DOT". Essentially,
+# do_dot checks the next char on the input stream to see if it's
+# an integer. Since the preceding char was a dot, an integer
+# tips us off that we have a real literal. Otherwise, it's just
+# a dot operator. Note that do_dot resets next_c for the next
+# cycle through the main case loop in the calling procedure.
+#
+procedure do_dot(getchar)
+
+    local token
+    # global next_c
+
+# write(&errout, "it's a dot")
+
+    # If dot's followed by a digit, then we have a real literal.
+    #
+    if any(&digits, next_c := @getchar) then {
+#	write(&errout, "dot -> it's a real literal")
+	token := "." || next_c
+	while any(&digits, next_c := @getchar) do
+	    token ||:= next_c
+	# Optional exponent, mirroring the exponent handling in
+	# do_digits: accept an optional minus sign, then gather the
+	# exponent's digits.  (The previous version used "=" -
+	# numeric comparison - instead of ":=" inside the digit
+	# loop, so next_c never advanced and the loop ran forever;
+	# it also silently dropped "0" exponent digits and never
+	# accepted a minus sign.)
+	if token ||:= (next_c == ("e"|"E")) then {
+	    next_c := @getchar
+	    if next_c == "-" then {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	    while any(&digits, next_c) do {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	}
+	return TOK("REALLIT", token)
+    }
+
+    # Dot not followed by an integer; so we just have a dot operator,
+    # and not a real literal.
+    #
+#   write(&errout, "dot -> just a plain dot")
+    return TOK("DOT", ".")
+
+end
+
+
+#
+# do_newline: coexpression x TOK record x table -> TOK records
+# (getchar, last_token, be_tbl) -> Ts (a generator)
+#
+# Where getchar is the coexpression that returns the next
+# character from the input stream, last_token is the last TOK
+# record suspended by the calling procedure, be_tbl is a table of
+# tokens and their "beginner/ender" status, and Ts are TOK
+# records. Note that do_newline resets next_c. Do_newline is a
+# mess. What it does is check the last token suspended by the
+# calling procedure to see if it was a beginner or ender. It
+# then gets the next token by calling iparse_tokens again. If
+# the next token is a beginner and the last token is an ender,
+# then we have to suspend a SEMICOL token. In either event, both
+# the last and next token are suspended.
+#
+procedure do_newline(getchar, last_token, be_tbl)
+
+    local next_token
+    # global next_c
+
+# write(&errout, "it's a newline")
+
+    # Go past any additional newlines, suspending each one as a
+    # null-sym (syntactically meaningless) TOK.
+    #
+    while next_c == "\n" do {
+	# NL can be the last char in the getchar stream; if it *is*,
+	# then signal that it's time to break out of the repeat loop
+	# in the calling procedure.
+	#
+	next_c := @getchar | {
+	    next_c := &null
+	    fail
+	}
+	# The second argument (next_c == "\n") succeeds - and hence
+	# the suspend happens - only when the newly read character is
+	# itself another newline; otherwise the suspend expression
+	# fails harmlessly and the loop exits.
+	suspend TOK(&null, next_c == "\n")
+    }
+
+    # If there was a last token (i.e. if a newline wasn't the first
+    # character of significance in the input stream), then check to
+    # see if it was an ender.  If so, then check to see if the next
+    # token is a beginner.  If so, then suspend a TOK("SEMICOL")
+    # record before suspending the next token.
+    #
+    if find("e", be_tbl[(\last_token).sym]) then {
+# write(&errout, "calling iparse_tokens via do_newline")
+# &trace := -1
+	# First arg to iparse_tokens can be null here.  (A null
+	# stream marks the call as recursive; iparse_tokens then
+	# reads via the getchar coexpression we pass along.)
+	\ (next_token := iparse_tokens(&null, getchar)).sym
+	if \next_token then {
+# write(&errout, "call of iparse_tokens via do_newline yields ",
+#	ximage(next_token))
+	    if find("b", be_tbl[next_token.sym])
+	    then suspend TOK("SEMICOL", "\n")
+	    #
+	    # See below.  If this were like the real Icon parser,
+	    # the following line would be commented out.
+	    #
+	    else suspend TOK(&null, "\n")
+	    return next_token
+	}
+	else {
+	    #
+	    # If this were a *real* Icon tokenizer, it would not emit
+	    # any record here, but would simply fail.  Instead, we'll
+	    # emit a dummy record with a null sym field.
+	    #
+	    return TOK(&null, "\n")
+# &trace := 0
+#	    fail
+	}
+    }
+
+    # See above.  Again, if this were like Icon's own tokenizer, we
+    # would just fail here, and not return any TOK record.
+    #
+# &trace := 0
+    return TOK(&null, "\n")
+# fail
+
+end
+
+
+#
+# do_number_sign: coexpression -> &null
+# getchar ->
+#
+# Where getchar is the coexpression that pops characters off the
+# main input stream. Sets the global variable next_c. This
+# procedure simply reads characters until it gets a newline, then
+# returns with next_c == "\n". Since the starting character was
+# a number sign, this has the effect of stripping comments.
+#
+procedure do_number_sign(getchar)
+
+    # global next_c
+
+#   write(&errout, "it's a number sign")
+    # Consume characters through the end of the current line; the
+    # comment text itself is simply discarded.
+    until next_c == "\n" do
+	next_c := @getchar
+
+    # Return to calling procedure to cycle around again with the new
+    # next_c already set.  Next_c should always be "\n" at this point.
+    return
+
+end
+
+
+#
+# do_quotation_mark: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that yields another character
+# from the input stream, and t is a TOK record with "STRINGLIT"
+# as its sym field. Puts everything upto and including the next
+# non-backslashed quotation mark into the str field. Handles the
+# underscore continuation convention.
+#
+procedure do_quotation_mark(getchar)
+
+    local token
+    # global next_c
+
+    # write(&errout, "it's a string literal")
+    # token accumulates the literal text, including both the opening
+    # and (eventually) the closing quotation marks.
+    token := "\""
+    next_c := @getchar
+    repeat {
+	# A trailing underscore before a newline is Icon's string
+	# continuation convention: drop the underscore and skip the
+	# next line's leading whitespace.
+	if next_c == "\n" & token[-1] == "_" then {
+	    token := token[1:-1]
+	    while any('\t ', next_c := @getchar)
+	    next
+	} else {
+	    # slshupto (from the linked IPL file; see the link
+	    # declaration above) succeeds when a non-backslashed
+	    # quotation mark appears at or beyond position 2 - i.e.
+	    # when the literal is complete.
+	    if slshupto('"', token ||:= next_c, 2)
+	    then {
+		next_c := @getchar
+		# resume outermost (repeat) loop in calling procedure,
+		# with the new (here explicitly set) next_c
+		return TOK("STRINGLIT", token)
+	    }
+	    next_c := @getchar
+	}
+    }
+
+end
+
+
+#
+# do_apostrophe: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that yields another character
+# from the input stream, and t is a TOK record with "CSETLIT"
+# as its sym field. Puts everything upto and including the next
+# non-backslashed apostrophe into the str field.
+#
+procedure do_apostrophe(getchar)
+
+    local token
+    # global next_c
+
+# write(&errout, "it's a cset literal")
+    # token accumulates the literal text, including both the opening
+    # and (eventually) the closing apostrophes.
+    token := "'"
+    next_c := @getchar
+    repeat {
+	# A trailing underscore before a newline is the line
+	# continuation convention: drop the underscore and skip the
+	# next line's leading whitespace.
+	if next_c == "\n" & token[-1] == "_" then {
+	    token := token[1:-1]
+	    while any('\t ', next_c := @getchar)
+	    next
+	} else {
+	    # slshupto (from the linked IPL file) succeeds when a
+	    # non-backslashed apostrophe appears at or beyond
+	    # position 2 - i.e. when the literal is complete.
+	    if slshupto("'", token ||:= next_c, 2)
+	    then {
+		next_c := @getchar
+		# Return & resume outermost containing loop in calling
+		# procedure w/ new next_c.
+		return TOK("CSETLIT", token)
+	    }
+	    next_c := @getchar
+	}
+    }
+
+end
+
+
+#
+# do_digits: coexpression -> TOK record
+# getchar -> t
+#
+# Where getchar is the coexpression that produces the next char
+# on the input stream, and where t is a TOK record containing
+# either "REALLIT" or "INTLIT" in its sym field, and the text of
+# the numeric literal in its str field.
+#
+procedure do_digits(getchar)
+
+    local token, tok_record, extras, digits, over
+    # global next_c
+
+    # For bases > 16
+    extras := "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"
+    # Assume integer literal until proven otherwise....
+    tok_record := TOK("INTLIT")
+
+# write(&errout, "it's an integer or real literal")
+    # Strip one leading zero; if the literal turns out to be nothing
+    # but "0", the last line below restores it.
+    token := ("0" ~== next_c) | ""
+    while any(&digits, next_c := @getchar) do
+	token ||:= next_c
+    # Radix literal (e.g. 16rFF): the part before the "r"/"R" is the
+    # base; digits past 9 come from extras, which holds two chars
+    # (upper + lower case) per extra digit - hence the "* 2" below.
+    if token ||:= (next_c == ("R"|"r")) then {
+	digits := &digits
+	# (10 < base) succeeds - yielding the base - only for bases
+	# above 10; for a base over 36 the slice fails and the "|"
+	# falls back to all of extras.
+	if over := ((10 < token[1:-1]) - 10) * 2 then
+	    digits ++:= extras[1:over+1] | extras
+	next_c := @getchar
+	if next_c == "-" then {
+	    token ||:= next_c
+	    next_c := @getchar
+	}
+	while any(digits, next_c) do {
+	    token ||:= next_c
+	    next_c := @getchar
+	}
+    } else {
+	# A fractional part makes this a real literal.
+	if token ||:= (next_c == ".") then {
+	    while any(&digits, next_c := @getchar) do
+		token ||:= next_c
+	    tok_record := TOK("REALLIT")
+	}
+	# So does an exponent.
+	# NOTE(review): a "+" sign after the exponent marker is not
+	# accepted here (only "-") - confirm against the Icon
+	# grammar's real-literal rule.
+	if token ||:= (next_c == ("e"|"E")) then {
+	    next_c := @getchar
+	    if next_c == "-" then {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	    while any(&digits, next_c) do {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	    tok_record := TOK("REALLIT")
+	}
+    }
+    # Restore "0" for a literal that consisted only of zeros.
+    tok_record.str := ("" ~== token) | "0"
+    return tok_record
+
+end
+
+
+#
+# do_whitespace: coexpression x cset -> &null
+# getchar x whitespace -> &null
+#
+# Where getchar is the coexpression producing the next char on
+# the input stream. Do_whitespace just repeats until it finds a
+# non-whitespace character, whitespace being defined as
+# membership of a given character in the whitespace argument (a
+# cset).
+#
+procedure do_whitespace(getchar, whitespace)
+
+#   write(&errout, "it's junk")
+    # Advance next_c until it holds a character outside the
+    # whitespace cset; no token is formed.
+    repeat {
+	if any(whitespace, next_c)
+	then next_c := @getchar
+	else break
+    }
+    return
+
+end
+
+
+#
+# do_identifier: coexpression x table -> TOK record
+# (getchar, reserved_tbl) -> t
+#
+# Where getchar is the coexpression that pops off characters from
+# the input stream, reserved_tbl is a table of reserved words
+# (keys = the string values, values = the names qua symbols in
+# the grammar), and t is a TOK record containing all subsequent
+# letters, digits, or underscores after next_c (which must be a
+# letter or underscore). Note that next_c is global and gets
+# reset by do_identifier.
+#
+procedure do_identifier(getchar, reserved_tbl)
+
+    local token
+    # id_chars is loop invariant; build it once rather than redoing
+    # two cset unions for every character scanned.
+    static id_chars
+    initial id_chars := &letters ++ &digits ++ '_'
+    # global next_c
+
+#   write(&errout, "it's an identifier")
+    # Accumulate letters, digits, and underscores into token,
+    # leaving next_c holding the first character past the
+    # identifier.
+    token := next_c
+    while any(id_chars, next_c := @getchar)
+    do token ||:= next_c
+
+    # A reserved word comes back under its own grammar symbol;
+    # anything else is an IDENT.
+    return TOK(\reserved_tbl[token], token) | TOK("IDENT", token)
+
+end
+
+
+#
+# do_operator: coexpression x list -> TOK record
+# (getchar, operators) -> t
+#
+# Where getchar is the coexpression that produces the next
+# character on the input stream, operators is the operator list,
+# and where t is a TOK record describing the operator just
+# scanned. Calls recognop, which creates a DFSA to recognize
+# valid Icon operators. Arg2 (operators) is the list of lists
+# containing valid Icon operator string values and names (see
+# above).
+#
+procedure do_operator(getchar, operators)
+
+    local token, elem
+
+    token := next_c
+
+    # Go until recognop fails.
+    # Greedily extend token one character at a time for as long as
+    # it remains a prefix of some operator; elem keeps the result
+    # list from the last successful recognop call.
+    while elem := recognop(operators, token, 1) do
+	token ||:= (next_c := @getchar)
+# write(&errout, ximage(elem))
+    # A result list of length 1 means the scanned text exactly
+    # matched one operator.  elem is null (so the \ fails) when even
+    # the first character was no operator prefix; a length of 0
+    # means a prefix existed but never completed.  Fail either way.
+    if *\elem = 1 then
+	return TOK(elem[1][2], elem[1][1])
+    else fail
+
+end
+
+
+record dfstn_state(b, e, tbl)
+record start_state(b, e, tbl, master_list)
+#
+# recognop: list x string x integer -> list
+# (l, s, i) -> l2
+#
+# Where l is the list of lists created by the calling procedure
+# (each element contains a token string value, name, and
+# beginner/ender string), where s is a string possibly
+# corresponding to a token in the list, where i is the position in
+# the elements of l where the operator string values are recorded,
+# and where l2 is a list of elements from l that contain operators
+# for which string s is an exact match. Fails if there are no
+# operators that s is a prefix of, but returns an empty list if
+# there just aren't any that happen to match exactly.
+#
+# What this does is let the calling procedure just keep adding
+# characters to s until recognop fails, then check the last list
+# it returned to see if it is of length 1. If it is, then it
+# contains list with the vital stats for the operator last
+# recognized. If it is of length 0, then string s did not
+# contain any recognizable operator.
+#
+procedure recognop(l, s, i)
+
+    local current_state, master_list, c, result, j
+    # dfstn_table caches one lazily built automaton per operator
+    # list, keyed on the list value itself.
+    static dfstn_table
+    initial dfstn_table := table()
+
+    /i := 1
+    # See if we've created an automaton for l already.  On first
+    # sight of l, make a start state whose [b..e] range spans the
+    # whole of l, sorted on field i (the operator-string field).
+    /dfstn_table[l] := start_state(1, *l, &null, &null) & {
+	dfstn_table[l].master_list := sortf(l, i)
+    }
+
+    current_state := dfstn_table[l]
+    # Save master_list, as current_state will change later on.
+    master_list := current_state.master_list
+
+    s ? {
+	while c := move(1) do {
+
+	    # Null means that this part of the automaton isn't
+	    # complete.  Build this state's arcs on demand, passing
+	    # the current scanning position (&pos) along.
+	    #
+	    if /current_state.tbl then
+		create_arcs(master_list, i, current_state, &pos)
+
+	    # If the table has been clobbered, then there are no arcs
+	    # leading out of the current state.  Fail.
+	    #
+	    if current_state.tbl === 0 then
+		fail
+
+# write(&errout, "c = ", image(c))
+# write(&errout, "table for current state = ",
+#	ximage(current_state.tbl))
+
+	    # If we get to here, the current state has arcs leading
+	    # out of it.  See if c is one of them.  If so, make the
+	    # node to which arc c is connected the current state.
+	    # Otherwise fail.
+	    #
+	    current_state := \current_state.tbl[c] | fail
+	}
+    }
+
+    # Return possible completions.  Every entry in the current
+    # state's [b..e] range has s as a prefix; keep only those whose
+    # operator string is exactly s.
+    #
+    result := list()
+    every j := current_state.b to current_state.e do {
+	if *master_list[j][i] = *s then
+	    put(result, master_list[j])
+    }
+    # return empty list if nothing the right length is found
+    return result
+
+end
+
+
+#
+# create_arcs: fill out a table of arcs leading out of the current
+# state, and place that table in the tbl field for
+# current_state
+#
+procedure create_arcs(master_list, field, current_state, POS)
+
+    local elem, i, first_char, old_first_char
+
+    current_state.tbl := table()
+    old_first_char := ""
+
+    # master_list is sorted on the operator-string field, so all
+    # entries sharing a given character at POS-1 are contiguous;
+    # each distinct character gets one arc spanning its whole run.
+    every elem := master_list[i := current_state.b to current_state.e][field]
+    do {
+
+	# Get the first character for the current position (note that
+	# we're one character behind the calling routine; hence
+	# POS-1).  Entries too short to have a character there are
+	# simply skipped.
+	#
+	first_char := elem[POS-1] | next
+
+	# If we have a new first character, create a new arc out of
+	# the current state.
+	#
+	if first_char ~== old_first_char then {
+	    # Store the start position for the current character.
+	    current_state.tbl[first_char] := dfstn_state(i)
+	    # Store the end position for the old character.
+	    (\current_state.tbl[old_first_char]).e := i-1
+	    old_first_char := first_char
+	}
+    }
+    (\current_state.tbl[old_first_char]).e := i
+
+    # Clobber table with 0 if no arcs were added.
+    # (*tbl = 0 succeeds - yielding 0 - only for an empty table, so
+    # a dead-end state is marked with 0; otherwise the comparison
+    # fails and the table is left in place.)
+    current_state.tbl := (*current_state.tbl = 0)
+    return current_state
+
+end