diff options
Diffstat (limited to 'ipl/packs/ibpag2/ibreader.icn')
-rw-r--r-- | ipl/packs/ibpag2/ibreader.icn | 515 |
1 files changed, 515 insertions, 0 deletions
############################################################################
#
#	Name:	 ibreader.icn
#
#	Title:	 reader for Ibpag2 source files
#
#	Author:	 Richard L. Goerwitz
#
#	Version: 1.29
#
############################################################################
#
#  This file contains a collection of procedures that 1) read in an
#  Ibpag2 source file, 2) output token defines, 3) emit action code,
#  and finally 4) pass a start symbol, list of productions, and token
#  table back to the calling procedure.  Described formally:
#
#      ibreader:  file x file x string x string  -> ib_grammar record
#                 (in, out, module, source_fname) -> grammar
#
#  In is the input stream; out is the output stream; module is an
#  optional string that distinguishes this grammar from others that
#  might also be running simultaneously; source_fname is the
#  (optional) name of the grammar file, used in the $line directives
#  written to out.  Grammar is an ib_grammar record containing the
#  start symbol in its first field and the production list in its
#  second.  Its third field contains a table used to map integers to
#  actual token names or character literals, i.e. its keys are things
#  like -1, 0, etc. and its values are things like "error," "EOF,"
#  etc.
#
#  Note that if a module argument is supplied to ibreader(), one must
#  also be supplied to ibwriter().  See ibwriter.icn.
#
#  The format of the input file is highly reminiscent of YACC.  It
#  consists of three basic sections, the first two of which are
#  followed by %%.  See the main documentation to Ibpag2 for
#  specifics.  Major differences between Ibpag2 and YACC input format
#  include:
#
#      1) "$$ = x" constructs are replaced by "return x" (e.g. "$$ =
#         $1 + $3" -> "return $1 + $3")
#
#      2) all variables within a given action are, by default, local
#         to that action; i.e. they cannot be accessed by other
#         actions unless you declare them global elsewhere (e.g. in
#         the pass-through part of the declarations section %{ ... %})
#
#      3) the %union declaration is not needed by Ibpag2
#
#      4) tokens and symbols are separated from each other by a comma
#         (e.g. %token '+', '-' and S : NP, VP)
#
#      5) epsilon is indicated by the keyword "epsilon" (e.g. REL :
#         epsilon)
#
#      6) both epsilon and error *may* be declared as %tokens for
#         reasons of precedence, although they retain hard-coded
#         internal values (-2 and -1, respectively)
#
#      7) all actions must follow the last RHS symbol of the rule they
#         apply to (preceded by an optional %prec directive); to
#         achieve S : NP { action1 }, VP { action2 }, insert a dummy
#         rule: S : NP, dummy, VP { action2 }; dummy : epsilon {
#         action1 } ;
#
#      8) YYERROR, YYACCEPT, yyclearin, and yyerrok are the same,
#         except they are written IIERROR, IIACCEPT, iiclearin, and
#         iierrok (i.e. "ii" replaces "yy")
#
#      9) Ibpag2's input files are tokenized like modified Icon files,
#         and, as a consequence, Icon's reserved words must not be
#         used as symbols (e.g. "if : if, then" is no go)
#
############################################################################
#
#  Links: itokens, escape
#
#  See also: ibwriter
#
############################################################################

#link itokens, escape
link escape

record ib_grammar(start, rules, tbl)
record tokstats(str, no, prec, assoc)

# Declared in itokens.icn:
# global line_number

#
# ibreader:  file x file x string x string  -> ib_grammar record
#            (in, out, module, source_fname) -> grammar
#
#     Where in is an input stream, out is an output stream, module is
#     some string uniquely identifying this module (optional), and
#     where grammar is an ib_grammar record containing the start
#     symbol in its first field and a list of production records in
#     its second.  Source_fname is the string name of Ibpag2's input
#     grammar file.  Defaults to "source file."
#
#     Errors are reported through iohno() (defined elsewhere); the
#     numeric codes used below are interpreted there.
#
procedure ibreader(in, out, module, source_fname)

    local tmp, grammar, toktbl, next_token, next_token_no_nl,
        token, LHS, t

    /source_fname := "source file"
    grammar := ib_grammar(&null, list(), table())
    toktbl := table()
    next_token := create itokens(in, 1)
    # Same token stream as next_token, but silently discarding every
    # token whose sym field is null.
    next_token_no_nl := create 1(tmp := |@next_token, \tmp.sym)
    token := @next_token_no_nl | iohno(4)

    # Do the %{ %} and %token stuff, i.e. everything up to %%
    # (NEWSECT).
    #
    until token.sym == "NEWSECT" do {
        case token.sym of {
            default   : {
                iohno(48, "token "||image(token.str) ||"; line "|| line_number)
            }
            "SEMICOL" : {
                # Skip semicolon.  Get another token while we're at it.
                token := @next_token_no_nl | iohno(47, "line "||line_number)
            }
            "BEGGLOB" : {
                write(out, "\n$line ", line_number, " ", image(source_fname))
                # Copy token values to out until we reach "%}" (ENDGLOB).
                # The failing comparison keeps resuming the generator;
                # it is abandoned as soon as ENDGLOB is suspended, so
                # the ENDGLOB token itself is never written.
                (token := copy_icon_stuff(next_token, out)).sym == "ENDGLOB"
                token := @next_token_no_nl
            }
            "MOD"     : {
                (token := @next_token_no_nl).sym == "IDENT" |
                    iohno(30, "line " || line_number)
                #
                # Read in token declarations, set associativity and
                # precedences, and enter the tokens into toktbl.
                #
                token := {
                    case token.str of {
                        default   : iohno(30, "line " || line_number)
                        "token"   : read_decl(next_token_no_nl, toktbl, &null)
                        "right"   : read_decl(next_token_no_nl, toktbl, "r")
                        "left"    : read_decl(next_token_no_nl, toktbl, "l")
                        "nonassoc": read_decl(next_token_no_nl, toktbl, "n")
                        "union"   : iohno(45, "line "|| line_number)
                        "start"   : {
                            (token := @next_token_no_nl).sym == "IDENT" |
                                iohno(31, "line " || line_number)
                            # A second %start is an error (32).
                            /grammar.start := token.str |
                                iohno(32, "line " || line_number)
                            @next_token_no_nl | iohno(4)
                        }
                    }
                }
            }
        }
    }
    # Skip past %% (NEWSECT) and semicolon (if present).
    token := @next_token_no_nl | iohno(47, "line "|| line_number)
    # If token is a SEMICOL the comparison fails and backtracking
    # assigns the following token instead.
    (token := token | @next_token_no_nl | iohno(4)).sym ~== "SEMICOL"
    token.sym == "NEWSECT" & iohno(47, "line "|| line_number)

    #
    # Fetch start symbol if it wasn't defined above via %start; by
    # default the start symbol is the LHS of rule 1.
    #
    /grammar.start := token.str

    # Having reached the end of the declarations section, we can now
    # copy out a define for each token number, not counting character
    # literals (which are stored as integers).  While we're at it,
    # create a table that maps token numbers back to character
    # literals and strings (for use in later verbose and debugging
    # displays).
    #
    write(out, "\n")
    every t := !toktbl do {
        if type(t.str) == "integer" then
            insert(grammar.tbl, t.no, image(char(t.str)))
        else {
            insert(grammar.tbl, t.no, t.str)
            write(out, "$define ", t.str, "\t", t.no)
        }
    }

    # Now, finally, read in rules up until we reach EOF or %% (i.e.
    # NEWSECT).  EOF is signaled below by failure of read_RHS().
    #
    until token.sym == "NEWSECT" do {
        token.sym == "IDENT" | iohno(33, token.str ||" line "|| line_number)
        LHS := token.str
        token := @next_token_no_nl | iohno(4)
        token.sym == "COLON" | iohno(34, token.str ||" line "|| line_number)
        #
        # Read in RHS, then the action (if any) then the prec (if
        # any).  If we see a BAR, then repeat, re-using the same
        # left-hand side symbol.
        #
        while token :=
            read_RHS(next_token, next_token_no_nl, out, toktbl, LHS,
                     grammar, module, source_fname) |
            # if read_RHS fails, we're at EOF
            break break
        do token.sym == "BAR" | break
    }

    # Copy the remainder of the file to out as Icon code.
    write(out, "\n$line ", line_number, " ", image(source_fname))
    every copy_icon_stuff(next_token, out, "EOFX")

    # Do final setup on the reverse token table.  This table will be
    # used later to map integers to their original names in verbose or
    # debugging displays.
    #
    insert(grammar.tbl, 0, "$")

    return grammar

end


#
# copy_icon_stuff:  coexpression x file x string  -> ib_TOK records
#                   (next_token, out, except)     -> token records
#
#     Copy Icon code to output stream, also suspending as we go.
#     Insert separators between tokens where needed.  Do not output
#     any token whose sym field matches except.  The point in
#     suspending tokens as we go is to enable the calling procedure to
#     look for signal tokens that indicate insertion or termination
#     points.
#
#     Each token is suspended *before* it is written, so a caller that
#     stops resuming this generator on a given token prevents that
#     token from reaching out.  Only tokens with a non-null sym field
#     are suspended.  Running out of tokens is an error (iohno(4))
#     unless except is "EOFX".
#
procedure copy_icon_stuff(next_token, out, except)

    local separator, T

    separator := ""
    while T := @next_token do {
        if \T.sym then suspend T
        if \T.sym == \except then next
        # Emit the pending separator only if this token starts with a
        # digit, letter, underscore or dot (and is not a DOT token).
        if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
        then writes(out, separator)
        writes(out, T.str)
        # A token ending in such a character may need a blank between
        # it and whatever follows.
        if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
        then separator := " " else separator := ""
    }

    # unexpected EOF error
    (except === "EOFX") | iohno(4)

end


#
# read_decl:  coexpression x table x string  -> ib_TOK
#             (next_token_no_nl, toktbl, assoc) -> token
#
#     Read in token declarations, assigning them the correct
#     precedence and associativity.  Number the tokens for later
#     $define preprocessor directives.  When done, return the last
#     token processed.  Toktbl is the table that holds the stats for
#     each declared token.
#
#     The token counter and the precedence level are static, i.e.
#     they persist across calls: every call opens a new (higher)
#     precedence level, and named tokens are numbered from 257 up.
#
procedure read_decl(next_token_no_nl, toktbl, assoc)

    local token, c
    static token_no, prec
    initial {
        token_no := 256
        prec := 0
    }

    # All tokens in this list have the same prec and assoc.
    # Precedence is determined by order.  Associativity is determined
    # by keyword in the calling procedure, and is passed as arg 3.
    #
    prec +:= 1
    assoc === ("n"|"r"|"l"|&null) | iohno(5, image(assoc))

    # As long as we find commas and token names, keep on adding tokens
    # to the token table.  Return the unused token when done.  If we
    # reach EOF, there's been an error.
    #
    repeat {
        token := @next_token_no_nl | iohno(4)
        case token.sym of {
            default  : iohno(31, token.str ||" line "|| line_number)
            "CSETLIT" | "STRING": {
                # Enter character literals as integers.
                *escape(token.str[2:-1]) = 1 | iohno(49, token.str)
                c := ord(escape(token.str[2:-1]))
                toktbl[c] := tokstats(c, c, prec, assoc)
            }
            "IDENT"  : {
                case token.str of {
                    "error"  :
                        toktbl[token.str] := tokstats("error", -1, prec, assoc)
                    "epsilon":
                        toktbl[token.str] := tokstats("epsilon",-2,prec, assoc)
                    default  : {
                        # Enter TOKENs as string-keyed records in toktbl.
                        token_no +:= 1
                        toktbl[token.str] :=
                            tokstats(token.str, token_no, prec, assoc)
                    }
                }
            }
        }
        # As long as we're seeing commas, go back for more tokens.
        token := @next_token_no_nl | iohno(4)
        token.sym == "COMMA" | break
    }

    # Skip past semicolon, if present (as set up now, it shouldn't be).
    (token := token | @next_token_no_nl | iohno(4)).sym ~== "SEMICOL"
    return token

end


#
# read_RHS:  coexpression x coexpression x file x table x
#            string x ib_grammar record x string x string -> token
#
#     Read_RHS goes through the RHS of rule definitions, inserting the
#     resulting productions into a master rule list.  At the same
#     time, it outputs the actions corresponding to those productions
#     as procedures that are given names corresponding to the numbers
#     of the productions.  I.e. production 1, if endowed with an {
#     action }, will correspond to procedure _1_.  Prec and assoc are
#     automatically set to that of the last RHS terminal (i.e. the
#     last declared token or literal), but this may be changed
#     explicitly by the %prec keyword, as in YACC.  Source_fname is
#     the name of the source grammar file we're processing (caller
#     will give us some reasonable default if we're reading &input).
#
#     The rule counter is static, so productions are numbered
#     consecutively across calls.
#
#     Fails on EOF.
#
procedure read_RHS(next_token, next_token_no_nl, out, toktbl, LHS,
                   grammar, module, source_fname)

    local token, rule, c
    static rule_no
    initial rule_no := 0

    rule_no +:= 1
    #                 LHS  RHS     POS    LOOK   no       prec   assoc
    rule := production(LHS, list(), &null, &null, rule_no, &null, &null)
    put(grammar.rules, rule)

    # Read in RHS symbols.
    #
    repeat {
        token := @next_token_no_nl | iohno(4)
        case token.sym of {
            default  :
                iohno(35, "token "|| image(token.str)||"; line "|| line_number)
            "CSETLIT" | "STRING": {
                *escape(token.str[2:-1]) = 1 | iohno(49, token.str)
                c := ord(escape(token.str[2:-1]))
                if \toktbl[c] then {
                    rule.prec := toktbl[c].prec
                    rule.assoc := toktbl[c].assoc
                }
                # literals not declared earlier will get caught here
                else insert(grammar.tbl, c, image(char(c)))
                put(rule.RHS, c)
            }
            "IDENT"  : {
                # If it's a terminal (i.e. a declared token), assign
                # this rule its precedence and associativity.  If it's
                # not in toktbl, then it's not a declared token....
                if \toktbl[token.str] then {
                    rule.prec := toktbl[token.str].prec
                    rule.assoc := toktbl[token.str].assoc
                    put(rule.RHS, toktbl[token.str].no)
                    if toktbl[token.str].no = -2 then {
                        # epsilon must be the only RHS symbol
                        *rule.RHS > 1 & iohno(44, "line " || line_number)
                        rule.POS := 2
                    }
                }
                # ...undeclared stuff.  Could be a nonterminal.  If
                # error and/or epsilon weren't declared as tokens,
                # they will get caught here, too.
                else {
                    case token.str of {
                        &null     : stop("What is going on here?")
                        default   : put(rule.RHS, token.str)
                        "error"   : {
                            put(rule.RHS, -1)
                            insert(grammar.tbl, -1, "error")
                        }
                        "epsilon" : {
                            if *put(rule.RHS, -2) > 1
                            then iohno(44, "line " || line_number)
                            else rule.POS := 2
                            insert(grammar.tbl, -2, "epsilon")
                        }
                    }
                }
            }
        }
        # Comma means:  Go back for another RHS symbol.
        token := @next_token_no_nl | fail
        token.sym == "COMMA" | break
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"

    # Read and set (optional) precedence.
    #
    if token.sym == "MOD" then {
        token := @next_token_no_nl | iohno(4)
        (token.sym == "IDENT" & token.str == "prec") |
            iohno(43, token.str || " line " || line_number)
        token := @next_token_no_nl | iohno(4)
        case token.sym of {
            "CSETLIT" | "STRING" : {
                *escape(token.str[2:-1]) = 1 | iohno(49, token.str)
                c := ord(escape(token.str[2:-1]))
                # The literal must have been declared; otherwise
                # toktbl[c] is null and the field references below
                # would be a run-time error rather than error 43.
                \toktbl[c] |
                    iohno(43, token.str || " line " || line_number)
                rule.prec := toktbl[c].prec &
                    rule.assoc := toktbl[c].assoc
            }
            "IDENT"  : {
                \toktbl[token.str] |
                    iohno(43, token.str || " line " || line_number)
                rule.prec := toktbl[token.str].prec &
                    rule.assoc := toktbl[token.str].assoc
            }
            default  : 1 = 4	# deliberate failure
        } | iohno(43, "line " || line_number)
        token := @next_token_no_nl | fail
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"

    # Read in (optional) action.
    #
    if token.sym == "LBRACE" then {
        write_action_as_procedure(next_token, out, rule,
                                  module, source_fname)
        token := @next_token_no_nl | fail
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"
    return token

end


#
# write_action_as_procedure:  coexpression x file x production record
#                             x string x string -> token
#                (next_token, out, rule, module, source_fname)
#
#     Called just after the opening brace of an action has been read.
#     Writes the action to out as an Icon procedure named
#     _<rule number>_<module>, taking one parameter (arg1, arg2, ...)
#     per RHS symbol of rule (none for an epsilon production).  The
#     body is copied from next_token up to the matching right brace;
#     $n references (n >= 1) are rewritten as argn, while $0, $-1
#     etc. are rewritten as subscripts into the parser's value stack.
#     Returns the token that closed the action.  Module defaults to
#     the empty string.
#
procedure write_action_as_procedure(next_token, out, rule,
                                    module, source_fname)

    local argstr, bracelevel, token, i, neg

    /module := ""
    argstr := ""
    #
    # Decide the number of arguments based on the length of the RHS of
    # rule.  Exception:  Epsilon productions are empty, and pop nothing
    # off the stack, so take zero args.
    #
    if rule.RHS[1] ~=== -2 then {
        every argstr ||:= "arg" || (1 to *rule.RHS) || ","
        argstr := trim(argstr, ',')
    }
    write(out, "procedure _", rule.no, "_", module, "(", argstr, ")")
    write(out, "\n$line ", line_number, " ", image(source_fname))

    bracelevel := 1
    until bracelevel = 0 do {
        # "RHSARG" is passed as the except argument so that the "$"
        # itself is not copied to out.
        every token := copy_icon_stuff(next_token, out, "RHSARG") do {
            case token.sym of {
                default  : next
                "LBRACE" : bracelevel +:= 1
                "RBRACE" : bracelevel -:= 1
                "RHSARG" : {
                    # Pass through tokens with a null sym until the
                    # next one with a non-null sym.  Running out of
                    # tokens here is an unexpected EOF; without the
                    # explicit check the until loop would never end.
                    until \ (token := @next_token | iohno(4)).sym do
                        writes(out, token.str)
                    if neg := (token.sym == "MINUS") then
                        until \ (token := @next_token | iohno(4)).sym do
                            writes(out, token.str)
                    else neg := &null
                    token.sym == "INTLIT" | iohno(37, "$"||token.str)
                    if /neg & token.str ~== "0" then {
                        token.str <= *rule.RHS | iohno(38, "$"||token.str)
                        writes(out, " arg", token.str, " ")
                    } else {
                        # Code for $0, $-1, etc.
                        #
                        # Warning!  If the name of the stack is changed
                        # in iiparse.lib, it has to be changed here, too.
                        #
                        i := abs(token.str)+1
                        writes(out, " value_stack", module, "[", i, "] ")
                    }
                }
            }
            if bracelevel = 0 then {
                write(out, "\nend\n")
                return token
            }
        }
    }

    iohno(39, "line "|| line_number)

end