diff options
author | Igor Pashev <pashev.igor@gmail.com> | 2013-01-27 23:51:56 +0000 |
---|---|---|
committer | Igor Pashev <pashev.igor@gmail.com> | 2013-01-27 23:51:56 +0000 |
commit | 6ab0c0f5bf14ed9c15370407b9ee7e0b4b089ae1 (patch) | |
tree | 926065cf45450116098db664e3c61dced9e1f21a /ipl/packs/ibpag2 | |
download | icon-6ab0c0f5bf14ed9c15370407b9ee7e0b4b089ae1.tar.gz |
Initial upstream version 9.4.3upstream/9.4.3
Diffstat (limited to 'ipl/packs/ibpag2')
-rw-r--r-- | ipl/packs/ibpag2/Makefile | 107 | ||||
-rw-r--r-- | ipl/packs/ibpag2/README | 1093 | ||||
-rw-r--r-- | ipl/packs/ibpag2/beta2ref.ibp | 117 | ||||
-rw-r--r-- | ipl/packs/ibpag2/follow.icn | 332 | ||||
-rw-r--r-- | ipl/packs/ibpag2/iacc.ibp | 495 | ||||
-rw-r--r-- | ipl/packs/ibpag2/ibpag2.icn | 303 | ||||
-rw-r--r-- | ipl/packs/ibpag2/ibreader.icn | 515 | ||||
-rw-r--r-- | ipl/packs/ibpag2/ibutil.icn | 296 | ||||
-rw-r--r-- | ipl/packs/ibpag2/ibwriter.icn | 110 | ||||
-rw-r--r-- | ipl/packs/ibpag2/iiglrpar.lib | 946 | ||||
-rw-r--r-- | ipl/packs/ibpag2/iiparse.lib | 419 | ||||
-rw-r--r-- | ipl/packs/ibpag2/iohno.icn | 95 | ||||
-rw-r--r-- | ipl/packs/ibpag2/itokens.icn | 925 | ||||
-rw-r--r-- | ipl/packs/ibpag2/outbits.icn | 100 | ||||
-rw-r--r-- | ipl/packs/ibpag2/rewrap.icn | 144 | ||||
-rw-r--r-- | ipl/packs/ibpag2/sample.ibp | 111 | ||||
-rw-r--r-- | ipl/packs/ibpag2/shrnktbl.icn | 131 | ||||
-rw-r--r-- | ipl/packs/ibpag2/slritems.icn | 244 | ||||
-rw-r--r-- | ipl/packs/ibpag2/slrtbls.icn | 370 | ||||
-rw-r--r-- | ipl/packs/ibpag2/slshupto.icn | 79 | ||||
-rw-r--r-- | ipl/packs/ibpag2/sortff.icn | 82 | ||||
-rw-r--r-- | ipl/packs/ibpag2/version.icn | 19 |
22 files changed, 7033 insertions, 0 deletions
diff --git a/ipl/packs/ibpag2/Makefile b/ipl/packs/ibpag2/Makefile new file mode 100644 index 0000000..56d917e --- /dev/null +++ b/ipl/packs/ibpag2/Makefile @@ -0,0 +1,107 @@ +########################################################################## +# + PROGNAME = ibpag2 +# +########################################################################## +# +# User-modifiable section. Read carefully! You will almost +# certainly have to change some settings here. +# + +# +# Destination directory for binaries files. Owner and group for +# public executables. Leave the trailing slash off of directory +# names. +# +OWNER = richard # root +GROUP = group # root +DESTDIR = /usr/local/bin +# Put this path into your LPATH variable (on which, see the Icon +# documentation). Make sure that the directory exists. +LIBDIR = /usr/local/lib/icon/data + +# +# Name of your icon compiler and compiler flags. +# +ICONC = icont +IFLAGS = -u -s #-Sc 400 -Sg 400 -Si 2000 -Sn 4000 -SF 40 + +SHAR = /usr/local/bin/shar +COMPRESS = /usr/bin/compress +# COMPRESS = /usr/local/bin/gzip + +########################################################################### +# +# Don't change anything below this line unless you're really sure of +# what you're doing. +# + +AUX = slshupto.icn rewrap.icn outbits.icn sortff.icn itokens.icn +SRC = $(PROGNAME).icn $(AUX) slrtbls.icn slritems.icn follow.icn \ + ibutil.icn iohno.icn ibreader.icn ibwriter.icn shrnktbl.icn \ + version.icn +PARSER = iiparse.lib +GLRPARSER = iiglrpar.lib +SHARFILES = $(SRC) $(PARSER) $(GLRPARSER) sample.ibp beta2ref.ibp \ + iacc.ibp Makefile.dist README + +all: $(PROGNAME) + +$(PROGNAME): $(SRC) + $(ICONC) $(IFLAGS) -o $(PROGNAME) $(SRC) + + +########################################################################## +# +# Pseudo-target names (shar, install, clean, clobber) +# + +# +# Assumes you have a shar program like mine. +# +shar: $(SHARFILES) + @echo "" + @echo "Removing any old shars in this directory." 
+ @echo "" + -rm -f $(PROGNAME).[0-9][0-9].Z + @echo "" + $(SHAR) -fVc -o$(PROGNAME) -L32 $(SHARFILES) + $(COMPRESS) -f $(PROGNAME).[0-9][0-9] + @echo "" + @echo "Shell archive finished." + @echo "" + +# Pessimistic assumptions regarding the environment (in particular, +# I don't assume you have the BSD "install" shell script). +install: all + @echo "" + -test -d $(DESTDIR) || mkdir $(DESTDIR) && chmod 755 $(DESTDIR) + cp $(PROGNAME) $(DESTDIR)/$(PROGNAME) + -chgrp $(GROUP) $(DESTDIR)/$(PROGNAME) + -chown $(OWNER) $(DESTDIR)/$(PROGNAME) + -chmod 755 $(DESTDIR)/$(PROGNAME) + -test -d $(LIBDIR) || mkdir $(LIBDIR) && chmod 755 $(LIBDIR) + cp $(PARSER) $(LIBDIR)/$(PARSER) + cp $(GLRPARSER) $(LIBDIR)/$(GLRPARSER) + -chgrp $(GROUP) $(LIBDIR)/$(PARSER) + -chown $(OWNER) $(LIBDIR)/$(PARSER) + -chgrp $(GROUP) $(LIBDIR)/$(GLRPARSER) + -chown $(OWNER) $(LIBDIR)/$(GLRPARSER) + -chmod 644 $(LIBDIR)/$(PARSER) + -chmod 644 $(LIBDIR)/$(GLRPARSER) + @echo "" + @echo "Done installing." + @echo "" + +# Build executable and copy to ../../iexe. +# Nothing done in this case because the executable doesn't stand alone. +Iexe: + + +# +# Cleanup +# +clean: + -rm -f *~ #*# core *.u[12] $(PROGNAME).output +Clean clobber: clean + -rm -f $(PROGNAME) diff --git a/ipl/packs/ibpag2/README b/ipl/packs/ibpag2/README new file mode 100644 index 0000000..c2f5d82 --- /dev/null +++ b/ipl/packs/ibpag2/README @@ -0,0 +1,1093 @@ + + + + + + + A User's Manual for Ibpag2 + (Icon-Based Parser Generation System 2) + Version 1.2 + + - or - + + How to Use an LR-based Parser Generator + + + Richard L. Goerwitz, III + University of Chicago + + + + + + +1.__What_is_Ibpag2? + + Ibpag2 is a so-called "parser generator," i.e. a tool for +automating the process of generating a recognizer and/or parser from +abstract structural descriptions of an input language. 
Put in more +practical terms, Ibpag2 is a piece of software that a) reads a source +file containing a grammar that defines an input language, and then b) +outputs an automaton that recognizes that language. The user may, at +his or her option, specify actions this automaton should take when it +sees various substructures within its input language. By default, +however, the parser simply recognizes a given sequence as belonging, +or not, to that language. + + Ibpag2 utilizes so-called "LR" table generation and parsing +algorithms. These algorithms facilitate construction of reasonably +fast deterministic pushdown automata that are powerful enough to +handle most commonly used programming language constructs. LR-based +systems come in three main flavors: SLR(1), LALR(1), and LR(1). The +LR(1) flavor is fairly easy to implement, but uses too many resources +to be practical. LALR(1) algorithms are harder to implement, but much +faster, and the parse tables they construct use considerably less +memory than do those of their LR(1) counterparts. SLR(1) algorithms +are the easiest to implement, compile the fastest, and use about as +much memory as LALR(1)s. SLR(1) is the least powerful of the three, +though, so there is a tradeoff. Ibpag2 is an "enhanced" SLR(1) parser +generator. It is enhanced in the sense that it can operate both in +its native SLR(1) mode, and in a more powerful "quasi-GLR" mode (on +which, see section 5 below). + + As its full title ("Icon-Based Parser Generator 2") implies, +Ibpag2 is written in Icon [2,3], as are the automata it creates. +Ibpag2 has been tested with Icon version 8.10. So far I have only run +it on an i386 box running Xenix 2.3.3, and on a Sun 4 running some +version of SunOS. I have many reports, though, of it running under +other UNIX variants. It will probably also run under other operating +systems, though modifications will in some instances be required. 
+Using Ibpag2 under MS-DOS may not be possible, on account of the way +it manages memory. + + The Ibpag2 distribution adheres to de facto UNIX installation +standards: Just set the appropriate variables in the makefile, and +then "make install." For those who are using a non-UNIX system, or +who have not installed such a package before, there is a section at +the end entitled "Installing Ibpag2" that details the installation +procedure (section 6). + + Aside from the above-mentioned installation section (6), the +remainder of this document aims to provide the reader a) with a +simple, practical explanation of what LR-family parser generators are +and how they work (section 2), and b) with a set of directions +specifically on how to use Ibpag2 (section 3). There is also an +advanced section on debugging (4), and one on using Ibpag2 with non-LR +and/or ambiguous languages (5). The discussion is geared for those +that have little or no experience in parsing or automaton theory. For +very advanced reading, consult the bibliography. For a brief summary +of Ibpag's command-line options, see the main Ibpag2 source file, +ibpag2.icn, or invoke ibpag2 with the -h (help) option. + + In general, be warned that Ibpag2 works best with small or +medium-sized grammars. Its parse tables have to be reconstructed at +run-time, and the code for doing this can become a bit cumbersome for +grammars with more than 100 rules and fifty or so terminal symbols. I +myself have processed grammars with as many as 300 terminals and 400 +rules. Although the resulting automata run well enough, the output +files are over 300k, and Ibpag2 takes a long time to create them. If +you must use Ibpag2 with a very large grammar symbols, try the -c +command-line option (which produces compressed parse tables). This +option is discussed below, in section 4. Compiling (rather than +interpreting) Ibpag2 may result in much faster processing, as will +resetting your BLOCKSIZE and STRSIZE environment variables. 
See the +installation section (6) below on using the Icon compiler to create +the Ibpag2 executable. Good starting values for BLOCKSIZE and STRSIZE +are triple their default values (i.e. 3 x 65000). These variables are +discussed in the Icon manual page. + + My ultimate aim in writing this document has been to make +accessible to the non-CS portion of the Icon community what for them +might seem an inaccessible branch of applied parsing and automaton +theory. I am a philologist myself, and feel that there is a great +deal that can and ought to be done to make advanced tools accessible +to people with other interests than twiddling bits or pondering the +true meaning of epsilon closures :-). + + Any comments on the Ibpag2 system itself or its documentation +will be gratefully received. Write to me at the address appended to +the final section (6). + + +2.__What_is_an_LR_Parser_Generator? + + Back in the late 50s and 60s, linguists, mathematicians, and +software engineers all became intensely interested in the formal +properties of languages: Can they be described as a series of logical +structures and relations? Can computers recognize and manipulate +these structures efficiently? Linguists, in particular, quickly +realized that the amount of structural complexity, ambiguity, and pure +noise in natural language would render it computationally intractable, +especially given the limited memory/throughput of then available CPUs. +Mathematicians and engineers, however, found that many of the +formalized notations they dealt with could, in fact, be (re)designed +in such a way that efficient computer processing could - at least in +principle - be achieved. + + Principle, in this case, did not squarely meet reality until +viable parser generation tools came into being. Parser generation +tools map an abstract structural description of a formal notation or +"language" to working computer code. 
Ideally, the designer simply +makes assertions like: + + an expression is composed of either + 1) a term (e.g. 10), or + 2) an expression, a "+" or "-", and another expression + +Parser generator systems translate these assertions (the "grammar") +into a machine, i.e. automaton, that can recognize and/or manipulate +input streams that conform to the "language" so described. + + Let me dwell, for a moment, on the toy expression grammar +offered above. Note that it describes a set of simple mathematical +constructs like: + + 9 + 9 + 3 + 9 + 3 - 8 + +According to the specifications given above, the nine, three, and +eight alone constitute terms - which are also expressions (via rule +1). Because these terms are also expressions, "9 + 3" can be reduced +to a larger expression by rule 2. The same is true for "9 + 3 - 8," +except that there rule 2 must apply twice - once for "9 + 3," and then +again for that and the remainder of the line - in effect grouping the +expressions as ( ( (9) + (3) ) - (8) ). It is also possible to group +the expression ( (9) + ( (3) - (8) ) ), although for the discussion +that immediately follows this second grouping will be ignored (see +below on the terms "precedence" and "associativity"). + + If we add actions to the above grammar specification, we can +create a calculator-like automaton. Traditionally, LR-family automata +(like the ones Ibpag2 creates) contain a parser, one or more stacks, +and a set of action tables. The parser reads from an input stream +segmented into "tokens" (e.g. TERM, '+', '-'), and then manipulates +its stacks according to directives contained in so-called "action" and +"goto" tables. As it reads the input stream, the parser matches rules +with action code specified by the programmer, e.g. rule 2 above might +be matched with code that added/subtracted the expressions on either +side of the '+'/'-' operator, and produced (in calculator style) the +result. 
Alternatively, it might be matched with code that generated +an equivalent construct in another language. + + In the case of our toy expression grammar above, the +corresponding LR automaton operates as follows. Omitting and/or +simplifying some of the inner details, it first looks at the input +stream to see what the next token is. If the next token is an +operator or end-of-input, it checks the top of its stack. If the top +of the stack has a term on it, that term is popped off, and pushed +back on, this time renamed as an expression (rule 1 above). The input +token is then shifted from the input stream onto the stack, unless it +is the end-of-input token, in which case the parser returns with a +result. If the top of the stack has an expression on it (rather than +a term), the parser pops the top three elements off of the stack, and +then either subtracts the third element from the first or adds the two +together, depending on whether the second element down was the +addition or subtraction operator, and the result is pushed onto the +stack as yet another expression. + + Even in this much-simplified form, the automaton's structure +is complex. Let us look briefly, therefore, at a practical example of +its actual workings. 
If we were to feed it "9 + 3 + 8," our +calculator would take the following actions: + + 1) read the 9, and push it onto the stack as a term + 2) see a plus sign on the input stream + 3) pop the term (9) off of the stack and push it back on again + (this time calling it an expression) + 4) push the plus sign onto the stack + 5) read the 3, and push it onto the stack as a term + 6) see a minus sign on the input stream + 7) pop the 3 off of the stack and push it back on again (this + time calling it an expression) + 8) see a minus sign still waiting on the input stream + 9) pop 9, +, and 3 off of the stack, apply the plus operator + to 9 and 3, then push the result onto the stack again a + single expression (the stack now has 12 on top) + 10) read the minus sign, and push it onto the stack + 11) read the 8, and push it onto the stack as a term + 12) see the end of input coming up on the input stream + 13) pop the 8 off of the stack and push it back on again as an + expression + 14) see the end-of-input token still sitting on the input + stream + 15) pop 12, -, and 8 off of the stack, apply the minus operator + to 12 and 8, then push the result onto the stack again (the + stack now has 4 on top) + 16) return the "answer" (i.e. 4) + + This series of actions is hard to describe, and even more so +to model as part of a hand-written computer program. And, even if +such a program were written by hand, this program would have to be +modified, at times radically, every time the grammar it assumes was +augmented or changed. What I am leading up to is that, with a parser +generator, the hand compilation stage can be eliminated by allowing +the programmer simply to declare his/her tokens and language specs, +then have the appropriate automaton constructed with little, or no, +human intervention. This is why parser generation tools were critical +to the development of not just theoretically feasible, but truly +*practical*, LR-based computer language design systems. 
+ + +3.__Using_Ibpag2 + + To recode the above toy expression grammar in +Ibpag2-compatible format is relatively simple, especially if we omit +the actions initially, and concentrate on simple recognition. We need +only a set of token declarations and three rules. Certain +modifications will have to be made to the token declarations later on. +For general illustration's sake, however, the following will suffice: + + %token TERM, '+', '-' + %% + expression : TERM + expression : expression, '+', expression + expression : expression, '-', expression + +TERM, and the addition and subtraction operators, are the tokens (i.e. +the terminals symbols out of which the grammar is constructed - the +things that the input stream is segmented into). Note the %token +keyword used to declare them. The colon means "is composed of." The +double percent sign separates token declarations from the grammar +proper. + + Adding in our actions - which above were keyed to a complex +set of decisions based on input tokens and stack conditions - requires +just a few extra lines of Ibpag2 action code, set off in curly braces: + + %token TERM, '+', '-' + %% + expression : TERM { return arg1 } + expression : expression, '+', expression { return arg1 + arg3 } + expression : expression, '-', expression { return arg1 - arg3 } + +Using a "|" shorthand for repeated left-hand sides of rules, we may +reformat this as: + + %token TERM, '+', '-' + %% + expression : TERM { return arg1 } + | expression, '+', expression { return arg1 + arg3 } + | expression, '-', expression { return arg1 - arg3 } + + ArgX above refers to the Xth element of the right-hand side of +the preceding rule. So, for example, arg1 in "{ return arg1 }" above +refers to TERM - the only right-hand side element of the first rule. +The action "{ return arg1 }" means, "once you find a TERM and have +renamed it as an expression, use the value of TERM as the value for +that expression." 
By way of contrast, the action "{ return arg1 + +arg3 }" means, in conjunction with the rule it follows: "When you find +an expression consisting of a sub-expression, a plus operator, and +another sub-expression, use the value of sub-expression 1 + the value +of sub-expression 2 as the value for the expression as a whole." +Technically, the action "{ return arg1 }" for expression : TERM is not +necessary, since the Ibpag2 parser, by default, pushes the value of +the last RHS arg onto the stack. For epsilon productions (to be +discussed below), it pushes &null. + + One serious problem with this set of specifications is that +the operators '-' and '+' are left associative. We humans take this +for granted, because correct algebraic grouping is something our +high-school math teachers burned into us. The computer, though, has +to be told, pedantically, how to group addition and subtraction +expressions. It has to be explicitly instructed, in other words, to +group expressions like "9 + 32 - 4" as (9 + 32) - 4. Without +instructions of this kind, the parser does not know, after it has read +"9 + 32" and is looking at a minus sign, whether to shift the minus +sign onto the stack, and eventually try to group as 9 + (32 - 4), or +to reduce "9 + 32" to an expression and group as (9 + 32) - 4. +Although in this case the grouping may not seem to matter, it +sometimes does. Some operators group right to left. The unary minus +sign, for example, is one such operator (--4 groups as (- (- 4))). To +include the unary minus sign in our grammar, we might append yet +another rule: + + %token TERM + %left '+', '-' + %right '-' + %% + expression : TERM { return arg1 } + | expression, '+', expression { return arg1 + arg3 } + | expression, '-', expression { return arg1 - arg3 } + | '-', expression { return - arg2 } + +The trouble with this arrangement is that the minus sign was already +declared as left associative. 
To get around the conflict we use a +"dummy" token declaration, and a %prec declaration in the applicable +rule: + + %token TERM + %left '+', '-' + %right UMINUS + %% + expression : TERM { return arg1 } + | expression, '+', expression { return arg1 + arg3 } + | expression, '-', expression { return arg1 - arg3 } + | '-', expression %prec UMINUS { return - arg2 } + +The %prec declaration simply tells the parser that, even though the +rule contains a '-' operator, the rule should be handled as if the +operator were UMINUS. UMINUS is not actually used as a symbol in the +right-hand side of any rule (hence the designation "dummy"). It is +there simply to make the last rule behave as if the minus sign in the +last rule were different than in the second-to-last rule. + + Let us now add in multiplication and division operators to our +calculator specifications, and see what happens. Let me reiterate +here that the action "{ return arg1 }" for rule 1 (expression : TERM) +is not strictly necessary, since the default is to push the last RHS +arg onto the value stack: + + %token TERM + %left '+', '-' + %left '*', '/' + %right UMINUS + %% + expression : TERM { return arg1 } + | expression, '+', expression { return arg1 + arg3 } + | expression, '-', expression { return arg1 - arg3 } + | expression, '*', expression { return arg1 * arg3 } + | expression, '/', expression { return arg1 / arg3 } + | '-', expression %prec UMINUS { return - arg2 } + +Note that the multiplication and division operators were defined +*after* the addition and subtraction operators. The reason for this +is that, technically speaking, the grammar itself is ambiguous. If we +treat all operators identically, the parser will not be able to tell +whether "9 + 1 * 3" should be parsed as (9 + 1) * 3 or as 9 + (1 * 3). +As we all know from our high-school algebra, multiplication has a +higher precedence than addition. You do the multiplications before +the additions, in other words, no matter where they occur. 
To tell +the parser to behave in this same manner, we declare '*' after '+'. +Note that, despite their higher priority, the '*' and '/' operators +are still left associative. Hence, given "3 / 4 * 7," the parser will +group its input as (3 / 4) * 7. As a brain teaser, try to figure out +how the parser might group the input "9 + 3 / 4 * 7." Remember that +higher-precedence rules get done first, but that same-precedence rules +get done according to associativity. + + The only fundamental problem remaining with the above grammar +is that it assumes that the end of the input coincides with the end of +the line. Is it possible to redefine the language described as +consisting of arbitrary many lines? The answer to this question is +"yes." One can simply add another set of productions to the grammar +that state, essentially, that the input language consists of lines +made up of an expression and a carriage return or of nothing. Nothing +is indicated by the keyword epsilon. Note that only the first rule +has an action field: + + lines : lines, expression, '\n' { write(arg2) } + | lines, '\n' + | epsilon + +This rule-series may seem rather abstruse, but it becomes a bit +clearer when you think about what happens on actual input. If there +is no input (epsilon), nothing gets printed, because lines : epsilon +has no action field. If the parser sees an expression and a newline, +the parser takes this as an instance of epsilon, plus an expression, +plus a newline. This, then, becomes the first component of rule 1 if +another expression + newline follows, or of rule two if just a newline +occurs. Every time an instance of rule 1 occurs, the action "{ +write(arg2) }" is executed, i.e. the value of the expression gets +printed. If this still seems hard to fathom, try walking through +step-by-step. Even experienced hands may find these sorts of rules +difficult to construct and debug. + + Note that "lines" is now the so-called "start symbol" of our +grammar. 
It is, in other words, the goal of every parse. By default +the left-hand side symbol of the first rule is the start symbol. This +may be overridden with a %start declaration in the tokens section (on +which, see the sample Ibpag2 input file below). + + With our new, multi-line start symbol in place, the only piece +that needs to be added, in order to make our calculator specification +a full working input to Ibpag2, is a tokenizer. A tokenizer is a +routine that reads input from a file or from some other stream (e.g. +the user's console), and then segments this input into tokens that its +parser can understand. In some cases, the tokens must be accompanied +by a literal value. For example, if we encounter a TERM, we return +TERM, just as it is listed in the %token declaration. But what is the +literal value of a TERM token? It could be, for example, 9, or 5, or +700. The tokenizer returns the symbol TERM, in this case, but then +records that TERM's actual value by setting some global variable. In +Ibpag2's parser, this variable is assumed to be "iilval." In the +tokenizer, therefore, one might write + + iilval := (literal value) + suspend TERM + +For literal operators like '+' and '*', there is no need to set +iilval, since their literal value is irrelevant. One simply returns +these as integers (usually via "suspend ord(c)"). + + The tokenizer routine is normally appended to the grammar +after another double percent sign. Everything after this second +double percent sign is copied literally to the output file. +Alternatively, the tokenizer can be $included via Icon's preprocessor. +Ibpag2 demands that the tokenizer be called iilex, and that it take a +single file argument, that it be a generator, and that it fail when it +reaches end-of-input. 
Combined with our "lines" productions, the +addition of an iilex routine to our calculator grammar yields the +following Ibpag2 input file: + + %token TERM + %left '+', '-' + %left '*', '/' + %right UMINUS + + %start lines + + %% + + expression : TERM { return arg1 } + | expression, '+', expression { return arg1 + arg3 } + | expression, '-', expression { return arg1 - arg3 } + | expression, '*', expression { return arg1 * arg3 } + | expression, '/', expression { return arg1 / arg3 } + | '-', expression %prec UMINUS { return - arg2 } + + lines : lines, expression, '\n' { write(arg2) } + | lines, '\n' + | epsilon + + %% + + procedure iilex(infile) + + local nextchar, c, num + + nextchar := create !(!infile || "\n" || "\n") + c := @nextchar | fail + + repeat { + if any(&digits, c) then { + if not (\num ||:= c) then + num := c + } else { + if iilval := \num then { + suspend TERM + num := &null + } + if any('+-*/()\n', c) then { + iilval := c + suspend ord(c) + } else { + if not any(' \t', c) then { + # deliberate error - will be handled later + suspend &null + } + } + } + c := @nextchar | break + } + if iilval := \num then { + return TERM + num := &null + } + + end + + procedure main() + return iiparse(&input, 1) + end + +As noted above, the tokenizer (iilex) must be a generator. It must +suspend integers either directly (e.g. ord(c)), or else via symbolic +defines like TERM, created by Ibpag2 on the basis of %token, %right, +%left, and %nonassoc declarations. The tokenizer must fail on end of +input. + + If you like, cut the above code out, place it in a temporary +file, tmp.ibp, and then feed this file to Ibpag2 by typing "ibpag2 -f +tmp.ibp -o tmp.icn." If your system supports input and output +redirection, type: "ibpag2 < tmp.ibp > tmp.icn." Ibpag2 will turn +your grammar specifications and actions into a routine called iiparse. +If you look above, you will see that I appended a main procedure that, +in fact, calls iiparse(). 
Iiparse() takes two arguments: 1) an input +stream, and 2) a switch that, if nonnull, tells the parser to fail +rather than abort on unrecoverable errors. When Ibpag2 is finished +creating its output file (tmp.icn above), compile that file the way +you would compile any other Icon program (e.g. "icont tmp"). Finally, +run the executable. You should be able to type in various simple +arithmetic expressions and have the program spit back answers each +time you hit a return. The only problem you might encounter is that +the parser aborts on erroneous input. + + The issue of erroneous input brings up yet another point of +general Ibpag2 usage. Normally, if one is processing input, one does +not want to abort on errors, but rather just emit an error message, +and to continue processing - if this is at all possible. To do this, +Ibpag2 provides a simple but fairly effective mechanism: A reserved +"error" token. + + When Ibpag2 encounters an error, it will remove symbols from +its stack until it has backtracked to a point where the error token is +legal. It then shifts the error token onto the stack, and tries to +re-start the token stream at the point where it left off, discarding +tokens if necessary in order to get itself resynchronized. The parser +considers itself resynchronized when it has successfully read and +shifted three tokens after shifting the error token. Until then it +remains in an error state, and will not output additional error +messages as it discards tokens. + + This explanation may sound a bit abstruse, but in practice it +is turns out to be quite simple. 
To implement error handling for our +calculator, we really have to add only one production to the end of +the "lines" section: + + lines : lines, expression, '\n' { write(arg2) } + | lines, '\n' + | epsilon + | error, '\n' { + write("syntax error; try again:") + iierrok + } + +Given the above grammar, the parser will handle errors as follows: If +an error occurs (say it has an expression then an operator on its +stack and sees a newline on the input stream) the parser will throw +out the operator, then check if the error token would be OK in this +state (which it would not). Then it would throw out the expression. +At this point, the stack is in the ready-to-read-a-lines state - the +state it was in before it read the last expression. Since "lines" may +consist of error and '\n,' the error token is legal here, and so the +parser pushes error onto the stack, then looks back at the input +stream (where a newline is still waiting). Since the newline now +completes the rule lines : error, '\n', the parser pushes the newline +onto its stack, then executes the action associated with this +production, i.e. it writes "syntax error; try again:" to the console, +prompting the user for additional input. + + The keyword "iierrok" in the above error production's action +field is there for a subtle, but important, reason: It tells the +parser to consider itself resynchronized, even if three tokens have +not yet been shifted. If iierrok were not in the action code for this +rule, and the user were to supply more bad input after the prompt, +then the parser would simply discard those tokens, without emitting +another error message. Why? Because, as you will recall, the parser +discards tokens after an error, in efforts to resynchronize itself. +Until it reads and shifts three tokens successfully, it considers +itself in an error state, and will not emit additional error messages. 
+The three-token resync rule is there to prevent a cascade of +irrelevant error messages touched off by a single error. In our +calculator's case above, though, we are smarter than the parser. We +know that it is resynchronized as soon as it reduces error, '\n' to +lines. So if a syntax error occurs on the next token, it should be +reported. Adding "iierrok" to the action insures that the parser will +do just this. + + In addition to iierrok, there are several other directives +Ibpag2 accepts as part of the action code segments. These are as +follows: + + iiclearin clear the current input token + IIERROR perform error recovery + IIACCEPT simulate an accept action + +There are several other directives (all implemented as macros) that +Ibpag2 accepts in GLR mode. For a discussion of GLR mode, see below, +section 5. IIERROR in particular, and error recovery in general, work +a bit differently in that mode than they do in Ibpag2's normal (i.e. +LR) mode. + + There are admittedly many other topics that might be covered +here. This treatment, however, is intended as a general nontechnical +introduction, and not as a complete textbook on parser generation use. +If you want to learn more about this topic, consult the bibliography. +Also, check the UNIX manual pages on the YACC utility (Yet Another +Compiler Compiler). Ibpag's input format is fairly close (too close, +perhaps) to YACC's. In fact, most of what is said about YACC in UNIX +documentation can be carried directly over to Ibpag2. Several salient +differences, though, should be kept in mind: + + 1) YACC's "$$ = x" constructs are replaced by "return x" (e.g. + "$$ = $1 + $3" -> "return $1 + $3" [$1 is a synonym for + "arg1", $3 for "arg3", etc.]) + + 2) all variables within a given action are, by default, local + to that action; i.e. they cannot be accessed by other + actions unless you declare them global elsewhere (e.g. in + the pass-through part of the declarations section %{ ... 
+ %}) + + 3) the %union and %type declarations/tags are not needed by + Ibpag2 (both for better and for worse) + + 4) tokens and symbols are separated from each other by a comma + in Ibpag2 files (e.g. %token '+', '-' and S : NP, VP) + + 5) epsilon is indicated by the keyword "epsilon" (e.g. REL : + epsilon), and not by an empty RHS + + 6) both epsilon and error *may* be declared as %tokens for + reasons of precedence, although they retain hard-coded + internal values (-2 and -1, respectively) + + 7) all actions must follow the last RHS symbol of the rule + they apply to (preceded by an optional %prec directive); to + achieve S : NP { action1 }, VP { action2 }, insert a dummy + rule: S : NP, dummy, VP { action2 }; dummy : epsilon { + action1 } ; + + 8) YYERROR, YYACCEPT, yyclearin, and yyerrok are the same, + except they are written IIERROR, IIACCEPT, iiclearin, and + iierrok (i.e. "ii" replaces "yy") + + 9) Ibpag2's input files are tokenized as modified Icon files, + and, as a consequence, Icon's reserved words must not be + used as symbols (e.g. "if : if, then" is no go) + +I myself find YACC to be ugly. As a result, Ibpag2 is not an exact +YACC clone. I would like to underscore the fact that I have no +intention to move in this direction, either. It's as YACC-like as +it's going to get! + + Both YACC and non-YACC users should note number 9 in the above +list. Don't use things like "while," "every," "do," etc. as symbols +in your grammar! Just use the same rules for Ibpag2 nonterminals as +for Icon variables, and you'll be OK. + + For those that just can't bear using anything but a strictly +YACC-conformant system, I've included a preprocessor with the Ibpag2 +distribution called (at one user's recommendation) "iacc." Iacc reads +&input - assumed to be a YACCish grammar - and sends to &output an +Ibpag2-conformant file. 
I have not tested this file extensively, and +there are likely to be bugs in the way I've handled the necessary 2 +token lookaheads and value stack references. Give it a whirl, though, +if you are feeling adventurous. The only reason I personally use Iacc +is that some YACCs (e.g. BSD YACC) have particularly nice debugging +messages and help. If my grammar is particularly complex, I just run +it through YACC without action code first, then use Iacc to convert it +to Ibpag2 format. Iacc's output, as I noted, is not meant to be +pretty, so I invariably end up doing a little editing - usually just +respacing a few rules, and re-inserting any comments that I might have +put in the original YACC file. + + In general, Ibpag2 (like YACC) handles epsilon moves and +indirect cycles. LR-mode shift-reduce conflicts are also handled in +the normal way (i.e. pick the rule with the highest priority, and, in +cases where the priority is the same, check the associativities). In +contrast to YACC, Ibpag2 flags reduce/reduce conflicts as errors +(since these often conceal deeper precedence problems and just plain +kludges). Reduce/reduce conflict errors are easily enough remedied, +if need be, via (dummy) precedences. One can convert these errors to +warnings by specifying -y on the command line. With the -y option, +reduce/reduce conflicts are resolved in favor of the rule that occurs +first in the grammar. The -y switch also prevents Ibpag2 from +aborting on shift/reduce conflicts, telling it instead to resolve in +favor of shift. Basically, -y is a partial YACC compatibility switch. +Normally (i.e. in SLR mode) Ibpag2 is much more finicky than YACC +about conflicts in its grammars. + + Also in contrast to YACC, Ibpag2 supports multiple +simultaneous parsers. Ibpag2 normally names its main parser routine +iiparse(). By using the -m command-line option, however, you can +override this default behavior, and force Ibpag2 to augment this name +in some uniquely identifiable fashion. 
For example, "ibpag2 -m _1 < +tmp.ibp > tmp.icn" will force Ibpag2 to write a parser called +"iiparse_1" to tmp.icn. Note that, instead of calling iilex, this +iiparse_1() routine will now call iilex_1, and all necessary global +variables will have _1 appended to them (e.g. errors will become +errors_1). I don't expect that many people will have occasion to use +this feature. It is there, though, for those that want it. + + +4.__Debugging + + Constructing and debugging LR(1) family parsers can sometimes +be hair raising, even with a parser generator. Several precautions +can be taken, however, to minimize the agony. The first is to declare +all tokens initially as part of a single %token declaration, i.e. with +no precedences, and with the same associativities. Also, leave out +action code until the grammar seems to be working. In this stage, you +can even run the grammar through (BSD)YACC or GNU Bison. All you +would need to do is remove the commas between tokens and symbols, and +place a semicolon at the end of every rule. During this and all +debugging stages, supply Ibpag2 with a -v command-line switch. This +will cause Ibpag2 to write a summary of rules, tokens, and its two +state tables to "ibpag2.output" (a bit like GNU Bison, but with a +hard-coded name). If you get messages about conflicts in your parse +tables (e.g. "unresolvable reduce/reduce conflict, state 5, token +257, rules 4,5"). This file will tell you what rules these are, and +what token number 257 is. Use precedences and associativities to +clear these problems up as they arise. If you are comfortable having +reduce/reduce errors resolved by the order in which the conflicting +rules occur, then use the -y command-line switch. With -y on the +command line, Ibpag2 will always resolve in favor of the earlier rule. +This option will also cause it to resolve all shift/reduce conflicts +in favor of shift. 
+ + There are certain languages that are not ambiguous that SLR(1) +parsers like Ibpag2 will fail to produce an unambiguous parse table +for. The classic example is + + expr : lval, '=', rval | rval + lval : '*', rval | ID + rval : lval + +C programmers will recognize this as a toy expression grammar with +code for identifiers, assignments, and pointers. The problem is that +if we feed this grammar to Ibpag2, it will claim that there is a +conflict on lookahead '='. In truth, there is no ambiguity. The SLR +parser simply doesn't remember the pathway the parser used to get to +the state it is in when it sees '=' on the input stream. Whether the +parser gets into this state by seeing '*' plus and ID, or by seeing +just an ID, it knows to turn the ID into an lval. Then it knows to +turn lval into rval. At this point, though, it doesn't know whether +to shift the = sign via rule 1, or to turn rval and the preceding '*' +into an lval. The parser has "forgotten" that the '*' is there +waiting on level down on the stack! + + The solution to this problem is actually quite simple (at +least in concept). Just provide a unique pathway in the grammar for +the conflicting rules. In this case, they are rules 1 and 5 (the +first and last): + + expr : lval, '=', rval | rval + lval : '*', pval | ID + pval : lval + rval : lval + +Now when the parser sees '*,' it can only have a pval after it. Never +mind that pval is composed of precisely the same things as rval. The +point is that the parser generator follows a different route after +seeing '*' than if it starts with ID and no preceding '*'. Hence it +"remembers" that that the '*' is back on the stack, waiting for the +"lval : '*', pval" rule to apply. There is no more conflict. + + Go ahead and run these grammars through Ibpag2 if you aren't +sure what is going on. Remember to declare ID as a token, and to +place "%%" in the appropriate spot! 
+ + If you get your parser up and running, but find that it is not +functioning quite the way you expect, add the following line somewhere +near the start of Ibpag2's output file: + + $define IIDEBUG + +If you like, you can add it to the beginning of your Ibpag2 input +file. Place it in the declarations section (before the first double +percent sign), and surround it by %{ and %}, e.g.: + + %{ + $define IIDEBUG + %} + +This tells Ibpag2 to send $define IIDEBUG straight through to the +output file. + + What defining IIDEBUG does is tell iiparse, once compiled, to +emit profuse debugging messages about the parser's actions, and about +the state of its stacks. This display will not make a whole lot of +sense to anyone who doesn't understand LR-family parsers, so those who +want to access this feature should perhaps go through a standard +reference like Aho, Sethi, and Ullman [1]. + + If, after you are finished debugging your grammar, you find +that Ibpag2's output files are rather large, you may try saving space +by compressing the action and goto tables. This is accomplished by +invoking Ibpag2 with the -c (compress) option. Using this option +makes debugging difficult, and makes the parser run a bit more slowly. +It also only works for rather large grammars with long nonterminal +symbol names. Don't even consider it until the grammar is thoroughly +debugged and you have determined that the output file's size is just +too great for practical use. Even then, compression may or may not +help, depending on how long your nonterminal names are. In general, +Ibpag2 is best as a teaching tool, or as a production system for +medium or small grammars. + + +5.__Using_Ibpag2_with_Non-LR_Grammars + + There may be times when you *want* to parse languages that no +LR-based algorithm can handle. There may be times, that is, when the +grammar you want to use contains conflicts or ambiguities that are +there by design, and not by oversight. 
For example, you may want to +parse a natural language. Full-blown natural languages involve many +highly ambiguous constructs, and are not LR-parsable. By invoking it +with the -a option, Ibpag2 can parse or recognize certain natural +languages, or, more practically speaking, certain NL subsets. The +letter "a" in -a is supposed to stand for "ambiguous," although what +this option really does is put Ibpag2 into a quasi-GLR mode - i.e. +into a kind of "generalized" LR mode in which it can accept non-LR +grammars [4,5]. + + User-visible changes to Ibpag2's operation in quasi-GLR mode +(i.e. with the -a option) are as follows: + + 1) iiparse() is now a generator + 2) action code can use suspend as well as return + 3) IIERROR places the current thread in an error state (i.e. + it doesn't *necessarily* trigger error recovery; see below) + 4) there are two new action-code directives (iiprune and + iiisolate) and a general define (AUTO_PRUNE) + 5) conflicts due to ambiguities in the grammar no longer + result in aborted processing (so, e.g., if you do not + specify the -y option on a grammar with reduce/reduce + conflicts, Ibpag2 will simply generate a parser capable of + producing multiple parses for the same input) + + In quasi-GLR mode, iiparse() should be invoked in a way that +will render multiple results usable, if they are available (e.g. +"every result := iiparse(&input) do...". Action code is also allowed +to produce more than one value (i.e. to use suspend). When it does +so, iiparse() creates separate parse threads for each value. So, for +instance, if your action code for some production suspends both of the +following lists, + + ["noun", "will", "gloss: desire"] + ["noun", "will", "gloss: legal document mandating how _ + one's possessions are to be disposed _ + of after one's death"], + +iiparse() would create two separate parse threads - one for each +result. Note that in this case, the syntactic structure of each +thread is the same. 
It is their semantics (i.e. the stuff on the +value stack) that differs. + + If you use the iierrok and iiclearin macros in your action +code before suspending any result, their affect persists through all +subseqent suspensions and resulting parse threads. If you use these +macros after suspending one or more times, however, they are valid +only for the parse thread generated by the next suspension. By way of +contrast, the IIERROR macro *always* flags only the next parse thread +as erroneous. Likewise, IIACCEPT always simulates an accept action on +the next suspension only. IIERROR and IIACCEPT, in other words, never +have any effect on subsequent suspensions and parse threads other than +the one that immediately follows them. This is true of iierrok and +iiclearin only when used after the first suspension. + + In quasi-GLR mode, IIERROR (number three in the difference +list above) becomes a mechanism for placing the current parse thread +in error mode. This is similar to, but not quite identical to, how +IIERROR functions in straight LR mode. In quasi-GLR mode, if other +threads can carry on the parse without error the erroneous parse +thread is quietly clobbered. Full-blown error recovery only occurs if +all of the other parsers halt as well. This makes sense if you think +about it. Why keep erroneous threads around when there are threads +still continuing a valid parse? For some large interactive systems, +it might be necessary to keep bogus threads around longer, and weed +them out only after a lengthy grading process. If you are +constructing a system such as this, you'll have to modify Ibpag2's +iiglrpar.lib file. In particular, you'll need to change the segment +in iiparse() that takes out the trash, so to speak, in such a way that +it does so only if the error count in a given parser either rises +above a specific threshhold or else exceeds the number of errors in +the "most correct" parser by a certain amount. This is not that hard +to do. 
I just don't expect that most parsers people generate with +Ibpag2 will use IIERROR or error recovery in general in so involved a +fashion. + + Iiprune and iiisolate (number 4 above) are used to control the +growth of the parallel parser array. In order to give straightforward +(read "implementationally trivial") support for action code, Ibpag2 +cannot create a parse "forest" in the sense that a standard GLR parser +does. Instead, it simply duplicates the current parser environment +whenever it encounters a conflict in its action table. Even if the +conflict turns out to reflect only a local ambiguity, the parsers, by +default, remain separate. Put differently, Ibpag2's quasi-GLR parser, +by default, makes no direct effort to reduce the size of its parser +arrays or to alter the essentially linear structure of their value and +state stacks. Size reduction, where necessary and/or desirable, is up +to the programmer. What the iiprune macro is there to do is to give +the programmer a way of pruning a given thread out of the active +parser list. Iiisolate allows him or her to prune out every thread +*but* the current one. AUTO_PRUNE makes the parser behave more like a +standard GLR parser, instructing it to prune parse threads that are +essentially duplicating another parse thread's efforts. The parser, +though, does not build a parse tree per se, the way most GLR parsers +typically do, but rather manipulates its value stack like a +traditional LR-family parser. + + Iiprune is useful when, for example, the semantics (i.e. your +"action" code segments) determine that a given parse thread is no +longer viable, and you want to signal the syntactic analyzer not to +continue pursuing it. The difference between iiprune and IIERROR is +that iiprune clobbers the current parser immediately. IIERROR only +puts it into an error state. If all active parsers end up in an error +state, and none can shift additional input symbols, then the IIERROR +macro induces error recovery. 
Iiprune does not. NB: iiprune, if used +in action code that suspends multiple results, cancels the current and +remaining results (i.e. it does not clobber parsers already spun off +by previous suspensions by invocation of that same code; it merely +cuts the result sequence). Iiprune essentially stands in for "fail" +in this situation. Fail itself can be used in the code, but be warned +that iiparse() will still push *at least one* value onto its value +stack, even if a given action code segment fails. This keeps the +value stack in sync with the syntax. To avoid confusion, I recommend +not using "fail" in any action code. + + Iiisolate is useful if, during error recovery, you prompt the +user interactively, or do something else that cannot be elegantly done +in parallel for two or more distinct parse threads. Iiisolate allows +you to preserve only the the current parse thread, and to clobber the +rest. Iiisolate can also be useful as a way of making sure that only +one thread carries on the parse in non-error situations. Suppose that +we have a series of productions: + + sentences : sentences sentence '\n' + { print_parse(arg2) } + | sentences '\n' + | error '\n' + | epsilon + +If we get a sentence with more than one parse, all of the underlying +threads that produced these parses will be active for the next +sentence as well. In many situations this will not be what we want. +If our desire it to have only one active parse thread at the start of +each sentence, we simply tell our lexical analyzer to suspend two +newlines every time it sees a newline on the input stream. This +insures that the second rule will always apply right after the first. 
+We then insert iiisolate directives for both it and the one error +production: + + sentences : sentences sentence '\n' + { print_parse(arg2) } + | sentences '\n' + { iiisolate } + | error '\n' + { iiisolate; iierrok } + | epsilon + +The effect here is to allow multiple parsers to be generated only +while parsing "sentence". The iiisolate directive, in other words, +sees to it that no sentence parse will ever begin with multiple active +parsers. As with LR mode, iierrok clears the error flag for the +(current) parser. + + Note that if you use iiisolate in action code that suspends +multiple results, iiisolate will clobber all parsers but the one +generated by the next suspension. + + If there is no need for close control over the details of the +parser array, and you wish only to clobber parsers that end up doing +the same thing as some other parser (and hence returning identical +values), then just make sure you add "$define AUTO_PRUNE" to the +pass-through code section at the top of the file. Put differently, +defining AUTO_PRUNE instructs the quasi-GLR parser to weed out parsers +that are in the same state, and which have identical value stacks. +AUTO_PRUNE can often be used in place of iiisolate in situations like +the one discussed just above. Its only drawback is that it slows +the parser a bit. + + Other than these deviations (action code and iiparse becoming +generators, IIERROR's altered behavior, and the addition of iiprune, +iiisolate, and AUTO_PRUNE), Ibpag2's quasi-GLR mode - at least on the +surface - works pretty much like its straight LR mode. In fact, if +you take one of your SLR(1) grammars, and run it through Ibpag2 using +the -a option, you probably won't notice any difference in the +resulting automaton unless you do some debugging or perform some +timing tests (the GLR parser is slower, though for straight SLR(1) +grammars not by much). 
Even with non-SLR(1) grammars, the quasi-GLR +parser will clip along merrily, using all the same sorts of rules, +action code, and macros that you would typically use in LR mode! + + +6.__Installing_Ibpag + + If you are a UNIX user, or have a generic "make" utility, you +are in luck. Just edit Makefile.dist according to the directions +given in that file, rename it as "makefile," then execute "make." +Ibpag2 should be created automatically. If everything goes smoothly, +then "make install" (su-ing root, if both possible and necessary for +correct installation of the iiparse.icn file). Check with your system +administrator if you are on a public system, and aren't sure what to +do. + + Please be sure to read the directions in the makefile +carefully, and set DESTDIR and LIBDIR to the directory where you want +the executable and parser file to reside. Also, make sure the paths +you specify are correct for your Icon executables. Although Ibpag2 +will apparently compile using iconc, I would recommend using the +interpreter, icont, first, unless you are planning on working with a +large grammar. + + If you are using some other system - one that lacks "make" - +then shame on your manufacturer :-). You'll be a bit inconvenienced. +Try typing: + + icont -o ibpag2 follow.icn ibpag2.icn ibreader.icn \ + ibtokens.icn ibutil.icn ibwriter.icn iohno.icn \ + outbits.icn slritems.icn slrtbls.icn shrnktbl.icn \ + version.icn slshupto.icn + +The backslashes merely indicate that the next line is a continuation. +The whole thing should, in other words, be on a single line. As noted +above, you may compile rather than interpret - if your OS supports the +Icon compiler. Just replace "icont" above with "iconc." The +resulting executable will run considerably faster than with "icont," +although the time required to compile it may be large, and the (still +somewhat experimental) compiler may not work smoothly in all +environments. 
+ + If your operating system support environment variables, and +you have set up your LPATH according to the specifications in the Icon +distribution (see below), then you may copy iiparse.lib and +iiglrpar.lib to some file in your LPATH. If you do not do this, or if +your OS does not support environment variables, then you must be in +the directory where you keep your Ibpag2 files when you use it, or +else invoke Ibpag2 with the -p dirname option (where dirname is the +directory that holds the iiparse.lib and iiglrpar.lib files that come +with the Ibpag2 distribution). The .lib files contain template +parsers that are critical to Ibpag2's operation. Ibpag2 will abort if +it cannot find them. + + If your operating system permits the creation of macros or +batch files, it might be useful to create one that changes +automatically to the Ibpag2 source directory, and runs the executable. +This has the side-benefit of making it easier for Ibapg2 to find the +parser library files, iiparse.lib and iiglrpar.lib. Under DOS, for +instance, one might create a batch file that says: + + c: + cd c:\ibpag2 + iconx ibpag2 %1 %2 %3 %4 %5 %6 %7 %8 %9 + +DOS, it turns out, has to execute Icon files indirectly through iconx, +so this technique has yet another advantage in that it hides the +second level of indirection - although it prevents you from using +input and output redirection. Naturally, the above example assumes +that Ibpag2 is in c:\ibpag2. + + Ibpag2 assumes the existence on your system, not only of an +Icon interpreter or compiler, but also of an up-to-date Icon Program +Library. There are several routines included in the IPL that Bibleref +uses. Make sure you (or the local system administrators) have put the +IPL online, and have translated the appropriate object modules. Set +your IPATH environment variable to point to the place where the object +modules reside. Set LPATH to point to the modules' source files. 
+Both IPATH and LPATH are documented in doc directory of the Icon +source tree (ipd224.doc). If your system does not support environment +variables, copy ximage.icn, options.icn, ebcdic.icn, and escape.icn +from the IPL into the Ibpag2 source directory, and compile them in +with the rest of the Ibpag2 source files, either by adding them to the +SRC variable in the makefile, or by adding them manually to the "icont +-o ..." command line given above. + + If you have any problems installing or using Ibpag2, please +feel free to drop me, Richard Goerwitz, an e-mail message at +goer@midway.uchicago.edu, or (via the post) at: + + 5410 S. Ridgewood Ct., 2E + Chicago, IL 60615 + + +6.__Bibliography + +1. Aho, Alfred V., Sethi, Ravi, and Ullman, Jeffrey D. Compilers. + Addison-Wesley: Reading, Massachusetts, second printing, 1988. + +2. Griswold, Ralph E. and Griswold, Madge T. The Icon Programming + Language. Prentice-Hall, Inc.: Englewood Cliffs, New Jersey, USA, + second edition, 1990. + +3. Griswold, Ralph E., Jeffery, Clinton L., and Townsend, Gregg M. + Version 8.10 of the Icon Programming Language. Univ. of Arizona + Icon Project Document 212, 1993. (obtain via anonymous FTP from + cs.arizona.edu ~ftp/icon/docs/ipd212.doc) + +4. Tomita, Masaru. Efficient Parsing for Natural Language. Boston: + Kluwer Academic Publishers, c. 1985. + +5. Tomita, Masaru editor. Generalized LR Parsing. Boston: Kluwer + Academic Publishers, 1991. diff --git a/ipl/packs/ibpag2/beta2ref.ibp b/ipl/packs/ibpag2/beta2ref.ibp new file mode 100644 index 0000000..62fa62b --- /dev/null +++ b/ipl/packs/ibpag2/beta2ref.ibp @@ -0,0 +1,117 @@ +# +# Ibpag2 source file for OT betacode-to-English converter. +# +# "Betacode" is the name used for the markers that the Thesaurus +# Linguae Graecae uses to segment texts into works, books, chapters, +# verses, etc. The Michigan-Claremont scan of the Hebrew OT (BHS) +# uses a subset of the betacode "language." 
This file contains a +# parser for that language that converts it into human readable form. +# +# Reads the standard input. Sends the original text, with betacode +# markers converted to human-readable form, to the standard output. +# + +%{ + +# These need to be global, because all of the actions modify them. +# Remember that the default scope for a variable used in an action is +# that action. +# +global betavals, blev + +%} + +%token INTVAL, STRVAL, LINE + +%% + +betalines : betalines, betaline + | epsilon + ; + +betaline : '~', cvalue, xvalue, yvalue, '\n' + { if integer(betavals[2]) then { + write(betavals[1], " ", + betavals[2], ":", + betavals[3]) + } + blev := 4 # global + } + | LINE, '\n' { write($1) } + ; + +cvalue : 'a', value, 'b', value, 'c', value + { betavals[blev := 1] := $6 } + | 'c', value { betavals[blev := 1] := $2 } + | epsilon + ; + +xvalue : 'x', value { betavals[blev := 2] := $2 } + | 'x' { if integer(betavals[2]) + then betavals[blev := 2] +:= 1 + else betavals[blev := 2] := 1 + } + | epsilon { if blev < 2 then + betavals[2] := 1 + } + ; + +yvalue : 'y', value { betavals[blev := 3] := $2 } + | 'y' { betavals[blev := 3] +:= 1 } + | epsilon { if blev < 3 then + betavals[3] := 1 + } + ; + +value : INTVAL { return $1 } + | STRVAL { return $1 } + ; + + +%% + + +procedure iilex(infile) + + local line + # betavals is global + initial betavals := ["", 0, 0] + + while line := read(infile) do { + line ? 
{ + if ="~" then { + suspend ord("~") + until pos(0) do { + case move(1) of { + "a" : suspend ord("a") + "b" : suspend ord("b") + "c" : suspend ord("c") + "x" : suspend ord("x") + "y" : suspend ord("y") + default : stop("betacode error: ", line) + } + if ="\"" then { + iilval := tab(find("\"")) + suspend STRVAL + move(1) + } else { + if iilval := integer(tab(many(&digits))) + then suspend INTVAL + } + } + suspend ord("\n") + } + else { + iilval := line + suspend LINE + suspend ord("\n") + } + } + } + +end + + +procedure main() + return iiparse(&input) +end diff --git a/ipl/packs/ibpag2/follow.icn b/ipl/packs/ibpag2/follow.icn new file mode 100644 index 0000000..fa3c8c6 --- /dev/null +++ b/ipl/packs/ibpag2/follow.icn @@ -0,0 +1,332 @@ +############################################################################ +# +# Name: follow.icn +# +# Title: compute follow sets for grammar +# +# Author: Richard L. Goerwitz +# +# Version: 1.15 +# +############################################################################ +# +# This file contains FIRST(st, symbol...) and FOLLOW(start_symbol, +# st, symbol). For FIRST(), arg1 is a list of productions. Arg 2 is +# a string (nonterminal) or an integer (terminal). FIRST may take +# more than one symbol argument. FOLLOW takes a string as its first +# argument, a list of productions as its second, and a symbol as its +# third. There is never any need to call FOLLOW with any more than +# one symbol. The return values for FIRST() and FOLLOW() may be +# described as follows: +# +# FIRST returns the set of all terminal symbols that begin valid +# prefixes of the first symbol argument, or, if this contains +# epsilon, of the first symbol -- <epsilon> ++ the set of terminals +# beginning valid prefixes of the second symbol, etc.... The first +# argument, st, contains the production list over which FIRST is to +# be computed. 
+# +# FOLLOW is similar, except that it accepts only one symbol argument, +# and returns the set of nonterminals that begin valid prefixes of +# symbols that may follow symbol in the grammar defined by the +# productions in st. +# +# Both FIRST() and FOLLOW() are optimized. When called for the first +# time with a specific production list (st), both FIRST() and +# FOLLOW() create the necessary data structures to calculate their +# respective return values. Once created, these data structures are +# saved, and re-used for subsequent calls with the same st argument. +# The implications for the user are two: 1) The first call to FOLLOW +# or FIRST for a given production list will take a while to return, +# but 2) subsequent calls will return much faster. Naturally, you +# can call both FIRST() and FOLLOW() with various st arguments +# throughout the life of a given program. +# +############################################################################ +# +# Links: none +# +############################################################################ + + +# +# FIRST: list|set x string|integer... -> set +# (st, symbols...) -> FIRST_set +# +# Where symbols are strings or integers (nonterminal or terminal +# symbols in a production in the list or set of productions, st), +# and where FIRST_set is a set of integers corresponding to +# terminal symbols that begin valid prefixes of symbols[1], or if +# that derives epsilon, of symbols[1] -- epsilon ++ symbols[2], +# unless that derives epsilon, etc... +# +procedure FIRST(st, symbols[]) + + local i, result, FIRST_tbl + static FIRST_tbl_tbl + initial FIRST_tbl_tbl := table() + + /FIRST_tbl_tbl[st] := make_FIRST_sets(st) + FIRST_tbl := FIRST_tbl_tbl[st] + + result := set() + i := 0 + while *symbols >= (i +:= 1) do { + /FIRST_tbl[symbols[i]] & iohno(90, image(symbols[i])) + if not member(FIRST_tbl[symbols[i]], -2) then { + # We're done if no epsilons. 
+ result ++:= FIRST_tbl[symbols[i]] + break + } else { + # Remove the epsilon & try the next symbol in p.RHS. + result ++:= FIRST_tbl[symbols[i]] -- FIRST_tbl[-2] + } + } + # If we get to here without finding a symbol that doesn't derive + # epsilon, then give up and insert <epsilon> into result. + if i > *symbols then + result ++:= FIRST_tbl[-2] + + return result + +end + + +# +# FOLLOW: list|set x string|integer -> set +# (st, symbol) -> FOLLOW_set +# +procedure FOLLOW(start_symbol, st, symbol) + + static FOLLOW_tbl_tbl + initial FOLLOW_tbl_tbl := table() + + /FOLLOW_tbl_tbl[st] := make_slr_FOLLOW_sets(start_symbol, st) + return FOLLOW_tbl_tbl[st][symbol] + +end + + +# +# Below is the procedure make_slr_FOLLOW_sets(start_symbol, st), +# which accepts a string, a set, and a table as its arguments and +# returns another table. The first argument must contain the start +# symbol for the set (or list) of productions contained in the second +# argument. Returns a table of FOLLOW sets, where keys = symbols and +# values = follow sets for those symbols. +# +# The algorithm - somewhat inefficiently implemented here - works out +# as follows: +# +# 1. Place $ (internal 0) in FOLLOW_tbl[start_symbol]. +# 2. Initialize FOLLOW_tbl[symbol] to { } for every other symbol. +# 3. For each production A -> aBb do FOLLOW_tbl[B] ++:= FIRST(b) -- +# FIRST(<epsilon>). +# 4. For each production A -> aBb where FIRST(b) contains +# <epsilon> and for each production A -> aB, do FOLLOW_tbl[B] ++:= +# FOLLOW_tbl[A]. +# +# Repeat steps 3 and 4 until no FOLLOW set can be expanded, at which +# point return the FOLLOW table. +# +# Note that <epsilon> is represented internally by -2. +# + + +# +# make_slr_FOLLOW_sets: string x set/list -> table +# (start_symbol, st) -> FOLLOW_tbl +# +# Where start_symbol is the start symbol for the grammar defined +# by the set/list of productions in st, and where FOLLOW_tbl is a +# table of follow sets (keys = symbols, values = follow sets for +# the symbols). 
+# +procedure make_slr_FOLLOW_sets(start_symbol, st) + + local FOLLOW_tbl, k, size, old_size, p, i, j + + FOLLOW_tbl := table() + # step 1 above; note that 0 = EOF + FOLLOW_tbl[start_symbol] := set([0]) + + # step 2 + every k := (!st).LHS do + /FOLLOW_tbl[k] := set() + + # steps 3 and 4 + size := 0 + # + # When the old size of the FOLLOW sets equals the new size, we are + # done because nothing was added to the FOLLOW sets on the last + # pass. + # + while old_size ~===:= size do { + size := 0 + every p := !st do { + every i := 1 to *p.RHS-1 do { + type(p.RHS[i]) == "string" | next + /FOLLOW_tbl[p.RHS[i]] & iohno(90, image(p.RHS[i])) + # Go through every RHS symbol until we get a FIRST set + # without an epsilon move. + every j := i+1 to *p.RHS do { + if member(FIRST(st, p.RHS[j]), -2) then { + FOLLOW_tbl[p.RHS[i]] ++:= + FIRST(st, p.RHS[j]) -- FIRST(st, -2) + } else { + FOLLOW_tbl[p.RHS[i]] ++:= FIRST(st, p.RHS[j]) + size +:= *FOLLOW_tbl[p.RHS[i]] + break next + } + } + # If we get past "break next" then b in A -> aBb =>* + # <epsilon>; add FOLLOW_tbl[A] to FOLLOW_tbl[B]. + FOLLOW_tbl[p.RHS[i]] ++:= FOLLOW_tbl[p.LHS] + size +:= *FOLLOW_tbl[p.RHS[i]] + } + # Add FOLLOW_tbl[A] to FOLLOW_tbl[B] for the last symbol in the + # RHS of every rule. + type(p.RHS[*p.RHS]) == "string" | next + /FOLLOW_tbl[p.RHS[*p.RHS]] & iohno(90, image(p.RHS[*p.RHS])) + FOLLOW_tbl[p.RHS[*p.RHS]] ++:= FOLLOW_tbl[p.LHS] + size +:= *FOLLOW_tbl[p.RHS[*p.RHS]] + } + } + + # Print human-readable version of FOLLOW_tbl if instructed to do so. 
+ if \DEBUG then + print_follow_sets(FOLLOW_tbl) + + # check for useless nonterminal symbols + every k := (!st).LHS do + *FOLLOW_tbl[k] = 0 & iohno(91, k) + + return FOLLOW_tbl + +end + + +# +# Below is the routine make_FIRST_sets(st), which accepts as its one +# argument a list or set of production records, and which returns a +# table t, where t's keys are symbols from the grammar defined by the +# productions in st, and where the values assocated with each of +# these keys is the FIRST set for that key. +# +# Production records are structures where the first two fields, LHS +# and RHS, contain the left-hand and right-hand side of each rule in +# a given grammar. The right-hand side is a linked list of integers +# (used for terminals) and strings (used for nonterminals). LHS must +# contain a string. Terminals below 1 are reserved. Currently three +# are actually used: +# +# 0 EOF +# -1 error +# -2 epsilon +# +# For a description of the FIRST() construction algorithm, see Alfred +# Aho, Ravi Sethi, and Jeffrey D. Ullman _Compilers_ (Reading, +# Massachusetts: Addison & Wesley, 1986), section 4.4, page 189. +# Their algorithm is not strictly suitable, as is, for use here. I +# thank Dave Schaumann of the University of Arizona at Tuscon for +# explaining to me the iterative construction algorithm that in fact +# *is* suitable. +# +# FIRST is computed on an iterative basis as follows: +# +# 1. For every terminal symbol a, FIRST(a) = { a } +# 2. For every non-terminal symbol A, initialize FIRST(A) = { } +# 3. For every production A -> <epsilon>, add <epsilon> to FIRST(A) +# 4. For each production of the grammar having the form X -> Y1 +# Y2 ... 
Yn, perform the following procedure: +# i := 1 +# while i <= number-of-RHS-symbols do { +# if <epsilon> is not in FIRST(Y[i]) then { +# FIRST(X) ++:= FIRST(Y[i]) +# break +# } else { +# FIRST(X) ++:= FIRST(Y[i]) -- FIRST[<epsilon>] +# i +:= 1 +# } +# } +# if i > number-of-RHS-symbols then +# # <epsilon> is in FIRST(Y[i]) +# FIRST(X) ++:= FIRST[epsilon] +# 5. Repeat step 3 until no new symbols or <epsilon> can be added +# to any FIRST set +# + + +# +# make_FIRST_sets: set/list -> table +# st -> t +# +# Where st is a set or list of production records, and t is a +# table of FIRST sets, where the keys = terminal or nonterminal +# symbols and the values = sets of terminal symbols. +# +# Epsilon move is -2; terminals are positive integers; +# nonterminals are strings. Error is -1; EOF is 0. +# +procedure make_FIRST_sets(st) + + local FIRST_tbl, symbol, p, old_size, size, i + + FIRST_tbl := table() + FIRST_tbl[0] := set([0]) + + # steps 1, 2, and 3 above + every p := !st do { + # check for empty RHS (an error) + *p.RHS = 0 & iohno(11, production_2_string(p)) + # step 1 + every symbol := !p.RHS do { + if type(symbol) == "integer" + then FIRST_tbl[symbol] := set([symbol]) + } + # step 2 + /FIRST_tbl[p.LHS] := set() & + # step 3 + if *p.RHS = 1 then { + if p.RHS[1] === -2 # -2 is epsilon + then insert(FIRST_tbl[p.LHS], -2) + } + } + + # steps 4 and 5 above + size := 0 + # + # When the old size of the FIRST sets equals the new size, we are + # done. As long as they're unequal, set old_size to size and try + # to add to the FIRST sets. + # + while old_size ~===:= size do { + size := 0 + every p := !st do { + every i := 1 to *p.RHS do { + \FIRST_tbl[p.RHS[i]] | iohno(90, image(p.RHS[i])) + if not member(FIRST_tbl[p.RHS[i]], -2) then { + # We're done with this pass if no epsilons. + FIRST_tbl[p.LHS] ++:= FIRST_tbl[p.RHS[i]] + size +:= *FIRST_tbl[p.LHS] + break next + } else { + # Remove the epsilon & try the next symbol in p.RHS. 
+ FIRST_tbl[p.LHS] ++:= FIRST_tbl[p.RHS[i]] -- FIRST_tbl[-2] + } + } + # If we get past the every...do structure without + # break+next-ing, then we are still finding epsilons. In + # this case, add epsilon to FIRST_tbl[p.LHS]. + FIRST_tbl[p.LHS] ++:= FIRST_tbl[-2] + size +:= *FIRST_tbl[p.LHS] + } + } + + # Print human-readable version of FIRST_tbl if instructed to do so. + if \DEBUG then + print_first_sets(FIRST_tbl) + + return FIRST_tbl + +end diff --git a/ipl/packs/ibpag2/iacc.ibp b/ipl/packs/ibpag2/iacc.ibp new file mode 100644 index 0000000..a169db8 --- /dev/null +++ b/ipl/packs/ibpag2/iacc.ibp @@ -0,0 +1,495 @@ +############################################################################ +# +# Name: iacc.ibp +# +# Title: YACC-like front-end for Ibpag2 (experimental) +# +# Author: Richard L. Goerwitz +# +# Version: 1.6 +# +############################################################################ +# +# Summary: +# +# Iacc is a YACC-like Ibpag2 preprocessor (very experimental). +# Iacc simply reads &input (assumed to be a YACC file, but with Icon +# code in the action fields), and writes an Ibpag2 file to &output. +# +############################################################################ +# +# Installation: +# +# This file is not an Icon file, but rather an Ibpag2 file. You +# must have Ibpag2 installed in order to run it. To create the iacc +# executable, first create iacc.icn by typing "ibpag2 -f iacc.ibp -o +# iacc.icn," then compile iacc.icn as you would any other Icon file +# to create iacc (or on systems without direct execution, iacc.icx). +# Put more simply, iacc.ibp not only outputs Ibpag2 files, but is +# itself generated using Ibpag2 + icon{t,c}. +# +############################################################################ +# +# Implementation notes: +# +# Iacc uses an YACC grammar that is actually LR(2), and not +# LR(1), as Ipbag2 would normally require in standard mode. 
Iacc +# obtains the additional token lookahead via the lexical analyzer. +# The place it uses that lookahead is when it sees an identifier. If +# the next token is a colon, then it is the LHS of a rule (C_IDENT +# below); otherwise it's an IDENT in the RHS of some rule. Crafting +# the lexical analyzer in this fashion makes semicolons totally +# superfluous (good riddance!), but it makes it necessary for the +# lexical analyzer to suspend some dummy tokens whose only purpose is +# to make sure that it doesn't eat up C or Icon action code while +# trying to satisfy the grammar's two-token lookahead requirements +# (see how RCURL and '}' are used below in the cdef and act +# productions). +# +# Iacc does its work by making six basic changes to the input +# stream: 1) puts commas between tokens and symbols in rules, 2) +# removes superfluous union and type declarations/tags, 3) inserts +# "epsilon" into the RHS of empty rules, 4) turns "$$ = x" into +# "return x", 5) rewrites rules so that all actions appear at the end +# of a production, and 6) strips all comments. +# +# Although Iacc is really meant for grammars with Icon action +# code, Iacc can, in fact, accept straight YACC files, with C action +# code. There isn't much point to using it this way, though, since +# its output is not meant to be human readable. Rather, it is to be +# passed directly to Ibpag2 for processing. Iacc is simply a YACCish +# front end. Its output can be piped directly to Ibpag2 in most +# cases: iacc < infile.iac | ibpag2 > infile.icn. +# +############################################################################ +# +# Links: longstr, strings +# See also: ibpag2 +# +############################################################################ + +%{ + +link strings, longstr +global newrules, lval, symbol_no + +%} + +# basic entities +%token C_IDENT, IDENT # identifiers and literals +%token NUMBER # [0-9]+ + +# reserved words: %type -> TYPE, %left -> LEFT, etc. 
+%token LEFT, RIGHT, NONASSOC, TOKEN, PREC, TYPE, START, UNION + +# miscellaneous +%token MARK # %% +%token LCURL # %{ +%token RCURL # dummy token used to start processing of C code + +%start yaccf + +%% + +yaccf : front, back +front : defs, MARK { write(arg2) } +back : rules, tail { + every write(!\newrules) + if write(\arg2) then + every write(!&input) + } +tail : epsilon { return &null } + | MARK { return arg1 } + +defs : epsilon + | defs, def { write(\arg2) } + | defs, cdef { write(\arg2) } + +def : START, IDENT { return arg1 || " " || arg2 } + | rword, tag, nlist { + if arg1 == "%type" + then return &null + else return arg1 || " " || arg3 + } +cdef : stuff, RCURL, RCURL { return arg1 } +stuff : UNION { get_icon_code("%}"); return &null } + | LCURL { return "%{ " || get_icon_code("%}") } + +rword : TOKEN | LEFT | RIGHT | NONASSOC | TYPE + +tag : epsilon { return &null } + | '<', IDENT, '>' { return "<" || arg2 || ">" } + +nlist : nmno { return arg1 } + | nlist, nmno { return arg1 || ", " || arg2 } + | nlist, ',', nmno { return arg1 || ", " || arg3 } + +nmno : IDENT { return arg1 } + | IDENT, NUMBER { return arg1 } + +rules : LHS, ':', RHS { write(arg1, "\t: ", arg3) } + | rules, rule { write(arg2) } + +RHS : rbody, prec { return arg1 || " " || arg2 } + +rule : LHS, '|', RHS { return "\t| " || arg3 } + | LHS, ':', RHS { return arg1 || "\t: " || arg3 } + +LHS : C_IDENT { symbol_no := 0 ; return arg1 } + | epsilon { symbol_no := 0 } + +rbody : IDENT { symbol_no +:= 1; return arg1 } + | act { return "epsilon " || arg1 } + | middle, IDENT { return arg1 || ", " || arg2 } + | middle, act { return arg1 || " " || arg2 } + | middle, ',', IDENT { return arg1 || ", " || arg3 } + | epsilon { return "epsilon" } + +middle : IDENT { symbol_no +:= 1; return arg1 } + | act { symbol_no +:= 1; return arg1 } + | middle, IDENT { symbol_no +:= 1; return arg1 || ", "||arg2 } + | middle, ',', IDENT { symbol_no +:= 1; return arg1 || ", "||arg3 } + | middle, act { + local i, l1, l2 + 
static actno + initial { actno := 0; newrules := [] } + actno +:= 1 + l1 := []; l2 := [] + every i := 1 to symbol_no do { + every put(l1, ("arg"|"$") || i) + if symbol_no-i = 0 then i := "0" + else i := "-" || symbol_no - i + every put(l2, ("$"|"$") || i) + } + put(newrules, "ACT_"|| actno || + "\t: epsilon "|| mapargs(arg2, l1, l2)) + symbol_no +:= 1 + return arg1 || ", " || "ACT_" || actno + } + +act : '{', cstuff, '}', '}' { return "{" || arg2 } +cstuff : epsilon { return get_icon_code("}") } + +prec : epsilon { return "" } + | PREC, IDENT { return arg1 || arg2 } + | PREC, IDENT, act { return arg1 || arg2 || arg3 } + + +%% + + +procedure iilex() + + local t + static last_token, last_lval, colon + initial colon := ord(":") + + every t := next_token() do { + iilval := last_lval + if \last_token then { + if t = colon then { + if last_token = IDENT + then suspend C_IDENT + else suspend last_token + } else + suspend last_token + } + last_token := t + last_lval := lval + } + iilval := last_lval + suspend \last_token + +end + + +procedure next_token() + + local reserveds, UNreserveds, c, idchars, marks + + reserveds := ["break","by","case","create","default","do", + "else","end","every","fail","global","if", + "initial","invocable","link","local","next", + "not","of","procedure","record","repeat", + "return","static","suspend","then","to","until", + "while"] + + UNreserveds := ["break_","by_","case_","create_","default_","do_", + "else_","end_","every_","fail_","global_","if_", + "initial_","invocable_","link_","local_","next_", + "not_","of_","procedure_","record_","repeat_", + "return_","static_","suspend_","then_","to_", + "until_","while_"] + + idchars := &letters ++ '._' + marks := 0 + + c := reads() + repeat { + lval := &null + case c of { + "#" : { do_icon_comment(); c := reads() | break } + "<" : { suspend ord(c); c := reads() | break } + ">" : { suspend ord(c); c := reads() | break } + ":" : { suspend ord(c); c := reads() | break } + "|" : { suspend ord(c); c 
:= reads() | break } + "," : { suspend ord(c); c := reads() | break } + "{" : { suspend ord(c | "}" | "}"); c := reads() } + "/" : { + reads() == "*" | stop("unknown YACC operator, \"/\"") + do_c_comment() + c := reads() | break + } + "'" : { + lval := "'" + while lval ||:= (c := reads()) do { + if c == "\\" + then lval ||:= reads() + else if c == "'" then { + suspend IDENT + break + } + } + c := reads() | break + } + "%" : { + lval := "%" + while any(&letters, c := reads()) do + lval ||:= c + if *lval = 1 then { + if c == "%" then { + lval := "%%" + suspend MARK + if (marks +:= 1) > 1 then + fail + } else { + if c == "{" then { + lval := "%{" + suspend LCURL | RCURL | RCURL + } + else stop("malformed %declaration") + } + c := reads() | break + } else { + case lval of { + "%prec" : suspend PREC + "%left" : suspend LEFT + "%token" : suspend TOKEN + "%right" : suspend RIGHT + "%type" : suspend TYPE + "%start" : suspend START + "%union" : suspend UNION | RCURL | RCURL + "%nonassoc" : suspend NONASSOC + default : stop("unknown % code in def section") + } + } + } + default : { + if any(&digits, c) then { + lval := c + while any(&digits, c := reads()) do + lval ||:= c + suspend NUMBER + } + else { + if any(idchars, c) then { + lval := c + while any(&digits ++ idchars, c := reads()) do + lval ||:= c + lval := mapargs(lval, reserveds, UNreserveds) + suspend IDENT + } + else { + # whitespace + c := reads() | break + } + } + } + } + } + + +end + + +procedure get_icon_code(endmark, comment) + + local yaccwords, ibpagwords, count, c, c2, s + + yaccwords := ["YYDEBUG", "YYACCEPT", "YYERROR", "yyclearin", "yyerrok"] + ibpagwords := ["IIDEBUG", "IIACCEPT", "IIERROR", "iiclearin", "iierrok"] + + s := "" + count := 1 + c := reads() + + repeat { + case c of { + "\"" : s ||:= c || do_string() + "'" : s ||:= c || do_charlit() + "$" : { + c2 := reads() | break + if c2 == "$" then { + until (c := reads()) == "=" + s ||:= "return " + } else { + s ||:= c + c := c2 + next + } + } + "#" : { 
+ if s[-1] == "\n" + then s[-1] := "" + do_icon_comment() + } + "/" : { + c := reads() | break + if c == "*" then + do_c_comment() + else { + s ||:= c + next + } + } + "{" : { + s ||:= c + if endmark == "}" then + count +:= 1 + } + "}" : { + s ||:= c + if endmark == "}" then { + count -:= 1 + count = 0 & (return mapargs(s, yaccwords, ibpagwords)) + } + } + "%" : { + s ||:= c + if endmark == "%}" then { + if (c := reads()) == "}" + then return mapargs(s || c, yaccwords, ibpagwords) + else next + } + } + default : s ||:= c + } + c := reads() | break + } + + # if there is no endmark, just go to EOF + if \endmark + then stop("input file has mis-braced { code }") + else return mapargs(s, yaccwords, ibpagwords) + +end + + +procedure do_string() + + local c, s + + s := "" + while c := reads() do { + case c of { + "\\" : s ||:= c || reads() + "\"" : return s || c || reads() + default : s ||:= c + } + } + + stop("malformed string literal") + +end + + +procedure do_charlit() + + local c, s + + s := "" + while c := reads() do { + case c of { + "\\" : s ||:= c || reads() + "'" : return s || c || reads() + default : s ||:= c + } + } + + stop("malformed character literal") + +end + + +procedure do_c_comment() + + local c, s + + s := c := reads() | + stop("malformed C-style /* comment */") + + repeat { + if c == "*" then { + s ||:= (c := reads() | break) + if c == "/" then + return s + } + else s ||:= (c := reads() | break) + } + + return s # EOF okay + +end + + +procedure do_icon_comment() + + local c, s + + s := "" + while c := reads() do { + case c of { + "\\" : s ||:= c || (reads() | break) + "\n" : return s + default : s ||:= c + } + } + + return s # EOF okay + +end + + +procedure mapargs(s, l1, l2) + + local i, s2 + static cs, tbl, last_l1, last_l2 + + if /l1 | *l1 = 0 then return s + + if not (last_l1 === l1, last_l2 === l2) then { + cs := '' + every cs ++:= (!l1)[1] + tbl := table() + every i := 1 to *l1 do + insert(tbl, l1[i], (\l2)[i] | "") + } + + s2 := "" + s ? 
{ + while s2 ||:= tab(upto(cs)) do { + (s2 <- (s2 || tbl[tab(longstr(l1))]), + not any(&letters++&digits++'_')) | + (s2 ||:= move(1)) + } + s2 ||:= tab(0) + } + + return s2 + +end + + +procedure main() + iiparse() +end diff --git a/ipl/packs/ibpag2/ibpag2.icn b/ipl/packs/ibpag2/ibpag2.icn new file mode 100644 index 0000000..994cff6 --- /dev/null +++ b/ipl/packs/ibpag2/ibpag2.icn @@ -0,0 +1,303 @@ +############################################################################ +# +# Name: ibpag2.icn +# +# Title: Icon-based parser generator (version 2) +# +# Author: Richard L. Goerwitz +# +# Version: 1.22 +# +############################################################################ +# +# The Basics +# +# Ibpag2 is a simple tool for generating parsers from grammar +# specifications. This may sound pretty arcane to those who have +# never used a parser generator. In fact, though, this kind of tool +# forms the basis of most programming language implementations. +# Parser generators are also used in preprocessors, transducers, +# compilers, interpreters, calculators and in fact for just about any +# situation where some form of structured input needs to be read into +# an internal data structure and/or converted into some form of +# structured output. This might include something as mundane as +# reading in recepts or mailing addresses from a file, or turning +# dates of one type (e.g. "September 3, 1993") into another +# ("9/3/93"). For more information on how to use it, see the README +# file included with the Ibpag2 distribution. +# +############################################################################ +# +# Running Ibpag2: +# +# Invoking Ibpag2 is very, very simple. 
There are quite a few +# command-line switches, but all are optional: +# +# ibpag2 [-f infile] [-m module] [-o outfile] [-p iiparse.lib dir] +# [-a] [-c] [-v] [-y] +# +# Where infile is the Ibpag2 source file (default &input), outfile is +# the output file (default &output), module is an optional string +# appended to all global variables and all procedure calls (to allow +# multiple running parsers), and where -v instructs Ibpag2 to write a +# summary of its operations to ibpag2.output. Normally all of these +# arguments can be ignored. Ibpag2 can usually be run using simple +# shell redirection symbols (if your OS supports them). See the next +# paragraph for an explanation of the -p option. The -c option is +# for compressed tables, and -a is for non-LR or ambiguous grammars. +# See the advanced sections of README file. -y directs Ibpag2 to +# resolve reduce/reduce conflicts by their order of occurrence in the +# grammar, and to resolve shift/reduce conflicts in favor of shift - +# just like YACC. Invoking Ibpag with -h causes it to abort with a +# brief help message. +# +# Make sure that the iiparse.lib and iiglrpar.lib files are in +# some path listed in your LPATH directory, or else in a data +# directory adjacent to some IPL "procs" directory in your LPATH. +# Basically, LPATH is just a space-separated list of places where +# .icn library source files reside. If your system does not support +# environment variables, then there are two ways to tell Ibpag2 where +# the .lib files are without using LPATH. The first is to move into +# the directory that contains these files. The second is to supply +# the files' location using Ibpag's -p option (e.g. ibpag2 -p +# /usr/local/lib/icon/data). 
+# +############################################################################ +# +# More Technical Details +# +# Technically speaking, Ibpag2 is a preprocessor that accepts a +# YACC-like source file containing grammar productions and actions, +# then 1) converts these into parse tables and associated code, 2) +# adds to them an LR parser, and a few debugging tools, and 3) writes +# the combination to the standard output, along with the necessary +# action and goto table construction code. The user must $include, +# or hard-code into the Ibpag2 source file, a lexical analyzer that +# returns integers via symbolic $defines generated by %token, %right, +# etc. declarations in the Ibpag2 source file. +# +# Cycles and epsilon moves are handled correctly (to my +# knowledge). Shift-reduce conflicts are handled in the normal way +# (i.e. pick the rule with the highest priority, and, in cases where +# the priority is the same, check the associativities) I decided to +# flag reduce/reduce conflicts as errors by default, since these +# often conceal deeper precedence problems. They are easily enough +# handled, if need be, via dummy precedences. The -y command-line +# switch turns off this behavior, causing Ibpag2 to resolve +# reduce/reduce conflicts in a YACCish manner (i.e. favoring the rule +# that occurs first in the grammar). Ibpag2 normally aborts on +# shift/reduce conflicts. The -y switch makes Ibpag resolve these in +# favor of shift, and to keep on processing - again, just like YACC. +# +# For more information, see the README file. 
+# +############################################################################ +# +# Links: ibreader, ibwriter, slrtbls, ibutil, version, options +# +############################################################################ + +# link ibreader, ibwriter, slrtbls, ibutil, version, options +link options + +global DEBUG + +procedure main(a) + + local infile, outfile, verbosefile, atbl, gtbl, grammar, opttbl, + module, abort_on_conflict, paths, path, parser_name, + iiparse_file + + # Get command-line options. + opttbl := options(a, "f:o:vdm:p:hcay", bad_arg) + + # Abort with help message if -h is supplied. + if \opttbl["h"] then { + write(&errout, ib_version()) + return ib_help_() + } + + # If an input file was specified, open it. Otherwise use stdin. + # + if \opttbl["f"] then + infile := open(opttbl["f"], "r") | + bad_arg("can't open " || opttbl["f"]) + else infile := &input + + # If an output file was specified, use it. Otherwise use stdout. + # + if \opttbl["o"] then + outfile := open(opttbl["o"], "w") | + bad_arg("can't open " || opttbl["o"]) + else outfile := &output + + # If a module name was specified (-m), then use it. + # + module := opttbl["m"] | "" + + # If the debug option was specified, set all verbose output to go + # to errout. + # + if \opttbl["d"] then { + verbosefile := &errout + DEBUG := 1 + } + + # If the verbose option was specified, send all verbose output to + # "ibpag2.output" (a bit like YACC's yacc.output file). + # + else if \opttbl["v"] then + verbosefile := open("ibpag2.output", "w") | + bad_arg("can't open " || opttbl["v"]) + + # Output defines for YACC-like macros. Output iiisolate and + # iiprune if -a option is specified. Sorry for the ugly code. + # + write_defines(opttbl, outfile, module) + + # Whew! Now fetch the grammar from the input file. + # + # Emit line directives keyed to actual line numbers in the + # original file. Pass its name as arg4. 
If obttbl["f"] is + # null (and the input file is &input), ibreader will default + # to something else. + # + grammar := ibreader(infile, outfile, module, opttbl["f"]) + if \verbosefile then + # grammar contains start symbol, rules, and terminal token table + print_grammar(grammar, verbosefile) + + # Fill in parse tables, atbl and gtbl. Abort if there is a + # conflict caused by an ambiguity in the grammar or by some + # precedence/associativity problem, unless the -a option is + # supplied (telling Ibpag2 that ambiguous tables are okay). + # + if /opttbl["a"] then + abort_on_conflict := "yes" + atbl := table(); gtbl := table() + make_slr_tables(grammar, atbl, gtbl, abort_on_conflict, opttbl["y"]) + if \verbosefile then + # grammar.tbl maps integer terminal symbols to human-readable strings + print_action_goto_tables(atbl, gtbl, grammar.tbl, verbosefile) + + # If -c was specified on the command line, compress the action and + # goto tables. + # + if \opttbl["c"] then { + write(outfile, "\n$define COMPRESSED_TABLES\n") + if \verbosefile then + write(verbosefile, "\nNote: parse tables are compressed") + shrink_tables(grammar, atbl, gtbl) + } + + # Try to find the .lib file using LPATH. + # + parser_name := { + if \opttbl["a"] then "iiglrpar.lib" + else "iiparse.lib" + } + + paths := [] + put(paths, trim(\opttbl["p"], '/') || "/") + put(paths, "") + (\getenv)("LPATH") ? { + while path := trim(tab(find(" ") | 0), '/') || "/" do { + tab(many(' ')) + if find("procs", path) then + put(paths, ibreplace(path, "procs", "data")) + put(paths, path) + pos(0) & break + } + } + iiparse_file := open(!paths || parser_name, "r") | iohno(2) + + # Write .lib file (contains the iiparse() parser routine), along + # with the start symbol, action table, goto table, and a list of + # productions. 
+ # + # grammar contains start symbol, rules, and terminal token table + # + ibwriter(iiparse_file, outfile, grammar, atbl, gtbl, module) + + return exit(0) + +end + + +# +# write_defines +# +procedure write_defines(opttbl, outfile, module) + + # Output defines for YACC-like macros. Output iiisolate and + # iiprune if -a option is specified. Sorry for the ugly code. + # + if \opttbl["a"] then { + write(outfile, + "$define iiisolate (iidirective", module, " ||:= \"isolate\")") + write(outfile, + "$define iiprune (iidirective", module, " ||:= \"prune\")") + write(outfile, + "$define iierrok (iidirective", module, " ||:= \"errok\")") + } else { + write(outfile, + "$define iierrok (recover_shifts", module, " := &null &", + " discards", module, " := 0)") + } + write(outfile, + "$define iiclearin (iidirective", module, " ||:= \"clearin\")") + write(outfile, + "$define IIERROR (iidirective", module, " ||:= \"error\")") + write(outfile, + "$define IIACCEPT (iidirective", module, " ||:= \"accept\")") +end + + +# +# bad_arg +# +# Simple routine called if command-line arguments are bad. 
+# +procedure bad_arg(s) + + write(&errout, "ibpag2: ",s) + write(&errout, + "usage: ibpag2 [-f inf] [-m str ] [-o outf] _ + [-p dir] [-a] [-c] [-v] [-y]") + write(&errout, " for help, type \"ibpag2 -h\"") + stop() + +end + + +# +# ib_help_ +# +procedure ib_help_() + + write(&errout, "") + write(&errout, + "usage: ibpag2 [-f inf] [-m str] [-o outf] [-p dir] _ + [-a] [-c] [-v] [-y]") + write(&errout, "") + write(&errout, " -f inf........where inf = Ibpag2's input file (default") + write(&errout, " &input)") + write(&errout, " -m str........where str = a string to be appended to") + write(&errout, " global identifiers and procedures") + write(&errout, " -o outf.......where outf = Ibpag2's output file (default") + write(&errout, " &output)") + write(&errout, " -p dir........where dir = directory in which the") + write(&errout, " iiparse.lib file resides (mainly for") + write(&errout, " systems lacking LPATH support)") + write(&errout, " -a............permits ambiguous grammars and multiple") + write(&errout, " parses (makes iiparse() a generator).") + write(&errout, " -c............compresses action/goto tables (obstructs") + write(&errout, " debugging somewhat).") + write(&errout, " -v............sends debugging info to ibpag2.output") + write(&errout, " -y............tells Ibpag2 to resolve reduce/reduce") + write(&errout, " conflicts by order of occurrence in") + write(&errout, " the grammar, and to resolve shift/") + write(&errout, " reduce conflicts in favor of shift") + stop("") + +end diff --git a/ipl/packs/ibpag2/ibreader.icn b/ipl/packs/ibpag2/ibreader.icn new file mode 100644 index 0000000..8401159 --- /dev/null +++ b/ipl/packs/ibpag2/ibreader.icn @@ -0,0 +1,515 @@ +############################################################################ +# +# Name: ibreader.icn +# +# Title: reader for Ibpag2 source files +# +# Author: Richard L. 
Goerwitz +# +# Version: 1.29 +# +############################################################################ +# +# This file contains a collection of procedures that 1) read in an +# Ibpag2 source file, 2) output token defines, 3) emit action code, +# and finally 4) pass a start symbol, list of productions, and token +# table back to the calling procedure. Described formally: +# +# ibreader: file x file x string -> ib_grammar record +# (in, out, module) -> grammar +# +# In is the input stream; out is the output stream; module is an +# optional string that distinguishes this grammar from others that +# might also be running simultaneously. Grammar is an ib_grammar +# record containing the start symbol in its first field and the +# production list in its second. Its third field contains a table +# used to map integers to actual token names or character literals, +# i.e. its keys are things like -1, 0, etc. and its values are things +# like "error," "EOF," etc. +# +# Note that if a module argument is supplied to ibreader(), one must +# also be supplied to ibwriter(). See ibwriter.icn. +# +# The format of the input file is highly reminiscent of YACC. It +# consists of three basic sections, the first two of which are +# followed by %%. See the main documentation to Ibpag2 for +# specifics. Major differences between Ibpag2 and YACC input format +# include: +# +# 1) "$$ = x" constructs are replaced by "return x" (e.g. "$$ = +# $1 + $3" -> "return $1 + $3") +# +# 2) all variables within a given action are, by default, local +# to that action; i.e. they cannot be accessed by other +# actions unless you declare them global elsewhere (e.g. in +# the pass-through part of the declarations section %{ ... %}) +# +# 3) the %union declaration is not needed by Ibpag +# +# 4) tokens and symbols are separated from each other by a comma +# (e.g. %token '+', '-' and S : NP, VP) +# +# 5) epsilon is indicated by the keyword "epsilon" (e.g. 
REL : +# epsilon) +# +# 6) both epsilon and error *may* be declared as %tokens for +# reasons of precedence, although they retain hard-coded +# internal values (-2 and -1, respectively) +# +# 7) all actions must follow the last RHS symbol of the rule they +# apply to (preceded by an optional %prec directive); to +# achieve S : NP { action1 }, VP { action2 }, insert a dummy +# rule: S : NP, dummy, VP { action2 }; dummy : epsilon { +# action1 } ; +# +# 8) YYERROR, YYACCEPT, yyclearin, and yyerrok are the same, +# except they are written IIERROR, IIACCEPT, iiclearin, and +# iierrok (i.e. "ii" replaces "yy") +# +# 9) Ibpag2's input files are tokenized like modified Icon files, +# and, as a consequence, Icon's reserved words must not be +# used as symbols (e.g. "if : if, then" is no go) +# +############################################################################ +# +# Links: itokens, escape +# +# See also: ibwriter +# +############################################################################ + +#link itokens, escape +link escape + +record ib_grammar(start, rules, tbl) +record tokstats(str, no, prec, assoc) + +# Declared in itokens.icn: +# global line_number + +# +# ibreader: file x file x string x string -> ib_grammar record +# (in, out, module, source_fname) -> grammar +# +# Where in is an input stream, out is an output stream, module is +# some string uniquely identifying this module (optional), and +# where grammar is an ib_grammar record containing the start +# symbol in its first field and a list of production records in +# its second. Source_fname is the string name of Ibpag2's input +# grammar file. Defaults to "source file." 
#
procedure ibreader(in, out, module, source_fname)

    local tmp, grammar, toktbl, next_token, next_token_no_nl,
	token, LHS, t

    /source_fname := "source file"
    grammar := ib_grammar(&null, list(), table())
    toktbl := table()
    # next_token yields every token (newlines have a null sym field);
    # next_token_no_nl filters those null-sym "tokens" out.
    next_token := create itokens(in, 1)
    next_token_no_nl := create 1(tmp := |@next_token, \tmp.sym)
    token := @next_token_no_nl | iohno(4)

    # Do the %{ $} and %token stuff, i.e. everything up to %%
    # (NEWSECT).
    #
    until token.sym == "NEWSECT" do {
	case token.sym of {
	    default   : {
		iohno(48, "token "||image(token.str) ||"; line "|| line_number)
	    }
	    "SEMICOL" : {
		# Skip semicolon.  Get another token while we're at it.
		token := @next_token_no_nl | iohno(47, "line "||line_number)
	    }
	    "BEGGLOB" : {
		write(out, "\n$line ", line_number, " ", image(source_fname))
		# Copy token values to out until we reach "%}" (ENDGLOB).
		# copy_icon_stuff suspends each token it copies; the
		# comparison below drives it until ENDGLOB turns up.
		(token := copy_icon_stuff(next_token, out)).sym == "ENDGLOB"
		token := @next_token_no_nl
	    }
	    "MOD" : {
		(token := @next_token_no_nl).sym == "IDENT" |
		    iohno(30, "line " || line_number)
		#
		# Read in token declarations, set associativity and
		# precedences, and enter the tokens into toktbl.
		# read_decl() returns the first unconsumed token.
		#
		token := {
		    case token.str of {
			default   : iohno(30, "line " || line_number)
			"token"   : read_decl(next_token_no_nl, toktbl, &null)
			"right"   : read_decl(next_token_no_nl, toktbl, "r")
			"left"    : read_decl(next_token_no_nl, toktbl, "l")
			"nonassoc": read_decl(next_token_no_nl, toktbl, "n")
			"union"   : iohno(45, "line "|| line_number)
			"start"   : {
			    (token := @next_token_no_nl).sym == "IDENT" |
				iohno(31, "line " || line_number)
			    # /grammar.start succeeds only if no start
			    # symbol has been set yet (duplicate %start
			    # is error 32).
			    /grammar.start := token.str |
				iohno(32, "line " || line_number)
			    @next_token_no_nl | iohno(4)
			}
		    }
		}
	    }
	}
    }
    # Skip past %% (NEWSECT) and semicolon (if present).  The second
    # line below is a goal-directed idiom: if the current token is a
    # SEMICOL, the comparison fails and backtracking fetches the next
    # token; otherwise the current token is kept.
    token := @next_token_no_nl | iohno(47, "line "|| line_number)
    (token := token | @next_token_no_nl | iohno(4)).sym ~== "SEMICOL"
    token.sym == "NEWSECT" & iohno(47, "line "|| line_number)

    #
    # Fetch start symbol if it wasn't defined above via %start; by
    # default the start symbol is the LHS of rule 1.
    #
    /grammar.start := token.str

    # Having reached the end of the declarations section, we can now
    # copy out a define for each token number, not counting character
    # literals (which are stored as integers).  While we're at it,
    # create a table that maps token numbers back to character
    # literals and strings (for use in later verbose and debugging
    # displays).
    #
    write(out, "\n")
    every t := !toktbl do {
	if type(t.str) == "integer" then
	    insert(grammar.tbl, t.no, image(char(t.str)))
	else {
	    insert(grammar.tbl, t.no, t.str)
	    write(out, "$define ", t.str, "\t", t.no)
	}
    }

    # Now, finally, read in rules up until we reach EOF or %% (i.e.
    # NEWSECT).  EOF is signaled below by failure of read_RHS().
    #
    until token.sym == "NEWSECT" do {
	token.sym == "IDENT" | iohno(33, token.str ||" line "|| line_number)
	LHS := token.str
	token := @next_token_no_nl | iohno(4)
	token.sym == "COLON" | iohno(34, token.str ||" line "|| line_number)
	#
	# Read in RHS, then the action (if any) then the prec (if
	# any).  If we see a BAR, then repeat, re-using the same
	# left-hand side symbol.
	#
	while token :=
	    read_RHS(next_token, next_token_no_nl, out, toktbl, LHS,
		     grammar, module, source_fname) |
	    # if read_RHS fails, we're at EOF
	    break break
	do token.sym == "BAR" | break
    }

    # Copy the remainder of the file to out as Icon code.
    write(out, "\n$line ", line_number, " ", image(source_fname))
    every copy_icon_stuff(next_token, out, "EOFX")

    # Do final setup on the reverse token table.  This table will be
    # used later to map integers to their original names in verbose or
    # debugging displays.
    #
    insert(grammar.tbl, 0, "$")

    return grammar

end


#
# copy_icon_stuff:  coexpression x file x string -> ib_TOK records
#                   (next_token, out, except) -> token records
#
#     Copy Icon code to output stream, also suspending as we go.
#     Insert separators between tokens where needed.  Do not output
#     any token whose sym field matches except.  The point in
#     suspending tokens as we go is to enable the calling procedure to
#     look for signal tokens that indicate insertion or termination
#     points.
#
procedure copy_icon_stuff(next_token, out, except)

    local separator, T

    separator := ""
    while T := @next_token do {
	if \T.sym then suspend T
	if \T.sym == \except then next
	# Emit a separating space between two adjacent tokens when
	# both could be taken as parts of one identifier/number.
	if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
	then writes(out, separator)
	writes(out, T.str)
	if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
	then separator := " " else separator := ""
    }

    # unexpected EOF error, unless we were copying through to EOF
    # (signaled by except === "EOFX")
    (except === "EOFX") | iohno(4)

end


#
# read_decl:  coexpression x table x string -> ib_TOK
#             (next_token_no_nl, toktbl, assoc) -> token
#
#     Read in token declarations, assigning them the correct
#     precedence and associativity.  Number the tokens for later
#     $define preprocessor directives.  When done, return the last
#     token processed.  Toktbl is the table that holds the stats for
#     each declared token.
#
procedure read_decl(next_token_no_nl, toktbl, assoc)

    local token, c
    static token_no, prec
    initial {
	# 0-255 are reserved for character literals; named tokens get
	# numbers 257 and up (see token_no +:= 1 below).
	token_no := 256
	prec := 0
    }

    # All tokens in this list have the same prec and assoc.
    # Precedence is determined by order.  Associativity is determined
    # by keyword in the calling procedure, and is passed as arg 3.
    #
    prec +:= 1
    assoc === ("n"|"r"|"l"|&null) | iohno(5, image(assoc))

    # As long as we find commas and token names, keep on adding tokens
    # to the token table.  Return the unused token when done.  If we
    # reach EOF, there's been an error.
    #
    repeat {
	token := @next_token_no_nl | iohno(4)
	case token.sym of {
	    default  : iohno(31, token.str ||" line "|| line_number)
	    "CSETLIT" | "STRING": {
		# Enter character literals as integers.  NOTE: the str
		# field holds the integer code here; ibreader() relies
		# on that (type(t.str) == "integer") to tell literals
		# from named tokens.
		*escape(token.str[2:-1]) = 1 | iohno(49, token.str)
		c := ord(escape(token.str[2:-1]))
		toktbl[c] := tokstats(c, c, prec, assoc)
	    }
	    "IDENT"  : {
		case token.str of {
		    # error and epsilon keep their hard-coded internal
		    # numbers (-1 and -2), but may be declared for
		    # precedence/associativity purposes.
		    "error"  :
			toktbl[token.str] := tokstats("error", -1, prec, assoc)
		    "epsilon":
			toktbl[token.str] := tokstats("epsilon",-2,prec, assoc)
		    default  : {
			# Enter TOKENs as string-keyed records in toktbl.
			token_no +:= 1
			toktbl[token.str] :=
			    tokstats(token.str, token_no, prec, assoc)
		    }
		}
	    }
	}
	# As long as we're seeing commas, go back for more tokens.
	token := @next_token_no_nl | iohno(4)
	token.sym == "COMMA" | break
    }

    # Skip past semicolon, if present (as set up now, it shouldn't be).
    (token := token | @next_token_no_nl | iohno(4)).sym ~== "SEMICOL"
    return token

end


#
# read_RHS:  coexpression x coexpression x file x table x
#            string x ib_grammar record x string x string -> token
#
#     Read_RHS goes through the RHS of rule definitions, inserting the
#     resulting productions into a master rule list.  At the same
#     time, it outputs the actions corresponding to those productions
#     as procedures that are given names corresponding to the numbers
#     of the productions.  I.e. production 1, if endowed with an {
#     action }, will correspond to procedure _1_.  Prec and assoc are
#     automatically set to that of the last RHS nonterminal, but this
#     may be changed explicitly by the %prec keyword, as in YACC.
#     Source_fname is the name of the source grammar file we're pro-
#     cessing (caller will give us some reasonable default if we're
#     reading &input).
#
#     Fails on EOF.
#
procedure read_RHS(next_token, next_token_no_nl, out, toktbl, LHS,
		   grammar, module, source_fname)

    local token, rule, c
    static rule_no
    initial rule_no := 0

    # One call handles one production; rule numbers are assigned
    # sequentially across calls.
    rule_no +:= 1
    #                  LHS  RHS     POS    LOOK   no       prec   assoc
    rule := production(LHS, list(), &null, &null, rule_no, &null, &null)
    put(grammar.rules, rule)

    # Read in RHS symbols.
    #
    repeat {
	token := @next_token_no_nl | iohno(4)
	case token.sym of {
	    default :
		iohno(35, "token "|| image(token.str)||"; line "|| line_number)
	    "CSETLIT" | "STRING": {
		# Character literals go onto the RHS as integers.
		*escape(token.str[2:-1]) = 1 | iohno(49, token.str)
		c := ord(escape(token.str[2:-1]))
		if \toktbl[c] then {
		    rule.prec := toktbl[c].prec
		    rule.assoc := toktbl[c].assoc
		}
		# literals not declared earlier will get caught here
		else insert(grammar.tbl, c, image(char(c)))
		put(rule.RHS, c)
	    }
	    "IDENT" : {
		# If it's a terminal (i.e. a declared token), assign
		# this rule its precedence and associativity.  If it's
		# not in toktbl, then it's not a declared token....
		if \toktbl[token.str] then {
		    rule.prec := toktbl[token.str].prec
		    rule.assoc := toktbl[token.str].assoc
		    put(rule.RHS, toktbl[token.str].no)
		    if toktbl[token.str].no = -2 then {
			# epsilon must be the sole RHS symbol.
			# BUG FIX: was iohno(44, "line ", line_number),
			# which passed line_number as a discarded extra
			# argument, losing it from the error message.
			*rule.RHS > 1 & iohno(44, "line "|| line_number)
			rule.POS := 2
		    }
		}
		# ...undeclared stuff.  Could be a nonterminal.  If
		# error and/or epsilon weren't declared as tokens,
		# they will get caught here, too.
		else {
		    case token.str of {
			&null     : stop("What is going on here?")
			default   : put(rule.RHS, token.str)
			"error"   : {
			    put(rule.RHS, -1)
			    insert(grammar.tbl, -1, "error")
			}
			"epsilon" : {
			    if *put(rule.RHS, -2) > 1
			    then iohno(44, "line "|| line_number)
			    else rule.POS := 2
			    insert(grammar.tbl, -2, "epsilon")
			}
		    }
		}
	    }
	}
	# Comma means:  Go back for another RHS symbol.
	token := @next_token_no_nl | fail
	token.sym == "COMMA" | break
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"

    # Read and set (optional) precedence.
    #
    if token.sym == "MOD" then {
	token := @next_token_no_nl | iohno(4)
	(token.sym == "IDENT" & token.str == "prec") |
	    iohno(43, token.str || " line " || line_number)
	token := @next_token_no_nl | iohno(4)
	case token.sym of {
	    "CSETLIT" | "STRING" : {
		*escape(token.str[2:-1]) = 1 | iohno(49, token.str)
		# ROBUSTNESS FIX: guard against a literal that was
		# never declared (\toktbl[c] fails -> error 43 below),
		# matching the check in the IDENT branch; previously
		# toktbl[c].prec crashed on a null table entry.
		c := ord(escape(token.str[2:-1])) &
		    rule.prec := (\toktbl[c]).prec &
		    rule.assoc := toktbl[c].assoc
	    }
	    "IDENT" : {
		\toktbl[token.str] |
		    iohno(43, token.str || " line " || line_number)
		rule.prec := toktbl[token.str].prec &
		rule.assoc := toktbl[token.str].assoc
	    }
	    default : 1 = 4	# deliberate failure
	} | iohno(43, "line "|| line_number)
	token := @next_token_no_nl | fail
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"

    # Read in (optional) action.
    #
    if token.sym == "LBRACE" then {
	write_action_as_procedure(next_token, out, rule,
				  module, source_fname)
	token := @next_token_no_nl | fail
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"
    return token

end


#
# write_action_as_procedure
#
#     Emit the { action } code for rule as an Icon procedure named
#     _<ruleno>_<module>, taking one argN parameter per RHS symbol
#     (none for epsilon productions).  $N references are rewritten as
#     argN; $0 and $-N are rewritten as direct value-stack accesses.
#     Returns the RBRACE token that closed the action; aborts via
#     iohno(39) if EOF is hit with braces still open.
#
procedure write_action_as_procedure(next_token, out, rule,
				    module, source_fname)

    local argstr, bracelevel, token, i, neg

    /module := ""
    argstr := ""
    #
    # Decide the number of arguments based on the length of the RHS of
    # rule.  Exception:  Epsilon productions are empty, and pop nothing
    # off the stack, so take zero args.
    #
    if rule.RHS[1] ~=== -2 then {
	every argstr ||:= "arg" || (1 to *rule.RHS) || ","
	argstr := trim(argstr, ',')
    }
    write(out, "procedure _", rule.no, "_", module, "(", argstr, ")")
    write(out, "\n$line ", line_number, " ", image(source_fname))

    bracelevel := 1
    until bracelevel = 0 do {
	every token := copy_icon_stuff(next_token, out, "RHSARG") do {
	    case token.sym of {
		default  : next
		"LBRACE" : bracelevel +:= 1
		"RBRACE" : bracelevel -:= 1
		"RHSARG" : {
		    # A "$" was seen; copy any intervening null-sym
		    # tokens (newlines), then read the $N reference,
		    # which may carry a leading minus sign.
		    until \ (token := @next_token).sym do
			writes(out, token.str)
		    if neg := (token.sym == "MINUS") then
			until \ (token := @next_token).sym do
			    writes(out, token.str)
		    else neg := &null
		    token.sym == "INTLIT" | iohno(37, "$"||token.str)
		    if /neg & token.str ~== "0" then {
			token.str <= *rule.RHS | iohno(38, "$"||token.str)
			writes(out, " arg", token.str, " ")
		    } else {
			# Code for $0, $-1, etc.
			#
			# Warning!  If the name of the stack is changed
			# in iiparse.lib, it has to be changed here, too.
			#
			i := abs(token.str)+1
			writes(out, " value_stack", module, "[", i, "] ")
		    }
		}
	    }
	    if bracelevel = 0 then {
		write(out, "\nend\n")
		return token
	    }
	}
    }

    # unbalanced braces at EOF
    iohno(39, "line "|| line_number)

end

diff --git a/ipl/packs/ibpag2/ibutil.icn b/ipl/packs/ibpag2/ibutil.icn
new file mode 100644
index 0000000..d16e511
--- /dev/null
+++ b/ipl/packs/ibpag2/ibutil.icn
@@ -0,0 +1,296 @@
+############################################################################
+#
+# Name: ibutil.icn
+#
+# Title: utilities for Ibpag2
+#
+# Author: Richard L.
Goerwitz +# +# Version: 1.21 +# +############################################################################ +# +# Contains: +# +# production_2_string(p) makes production or item p human- +# readable +# +# print_item_list(C, i) returns human-readable version of +# item list C +# +# print_grammar(grammar, f) sends to file f (default &output) +# a human-readable printout of a grammar, +# as recorded in an ib_grammar structure +# +# print_action_goto_tables(atbl, gtbl, ibtoktbl, f) +# sends to file f (default (&output) +# a human-readable printout of action +# table atbl and goto table gtbl +# +# print_follow_sets(FOLLOW_table) +# returns a human-readable version +# of a FOLLOW table (table of sets) +# +# print_first_sets(FIRST_table) +# returns a human-readable version +# of a FIRST table (a table of sets) +# +# ibreplace(s1, s2, s3) replaces s2 with s3 in s1 +# +# equivalent_items(i1, i2) succeeds if item i1 is structurally +# identical to item i2 +# +# equivalent_item_lists(l1,l2) same as equivalent_items, but for +# lists of items, not individual items +# +############################################################################ +# +# Links: none +# +############################################################################ + + +record production(LHS, RHS, POS, LOOK, no, prec, assoc) + +# +# production_2_string: production record -> string +# p -> s +# +# Stringizes an image of the LHS and RHS of production p in +# human-readable form. 
#
procedure production_2_string(p, ibtoktbl)

    local s, m, t

    s := image(p.LHS) || " -> "
    every m := !p.RHS do {
	# Use the human-readable token name when ibtoktbl supplies
	# one; otherwise fall back on image().
	if t := \ (\ibtoktbl)[m]
	then s ||:= t || " "
	else s ||:= image(m) || " "
    }
    # if the POS field is nonnull, print it
    s ||:= "(POS = " || image(\p.POS) || ") "
    # if the LOOK field is nonnull, print it, too
    s ||:= "lookahead = " || image(\p.LOOK)

    return trim(s)

end


#
# print_item_list:  makes item list human readable
#
#     Writes the productions for item list C[i] to &errout.
#
procedure print_item_list(C, i)

    write(&errout, "Productions for item list ", i, ":")
    every write(&errout, "\t", production_2_string(!C[i]))
    write(&errout)
    return

end


#
# print_grammar:  makes entire grammar human readable
#
#     Writes the start symbol, numbered rules, and token map of an
#     ib_grammar record to file f (default &errout).
#
procedure print_grammar(grammar, f)

    local p, i, sl

    /f := &errout

    write(f, "Start symbol:")
    write(f, "\t", grammar.start)
    write(f)
    write(f, "Rules:")
    every p := !grammar.rules do {
	writes(f, "\tRule ", right(p.no, 3, " "), " ")
	write(f, production_2_string(p, grammar.tbl))
    }
    write(f)
    write(f, "Tokens:")
    # sort(t, 3) flattens the table into [key1, val1, key2, val2, ...].
    sl := sort(grammar.tbl, 3)
    every i := 1 to *sl-1 by 2 do
	write(f, "\t", left(sl[i], 5, "."), right(sl[i+1], 20, "."))
    write(f)
    return

end


#
# print_action_goto_tables
#
# Makes action & goto tables human readable.  If a table mapping
# integer (i.e. char) literals to token names is supplied, the
# token names themselves are printed.
#
procedure print_action_goto_tables(atbl, gtbl, ibtoktbl, f)

    local TAB, tbl, key_set, size, i, column, k

    /f := &errout
    TAB := "\t"

    every tbl := atbl|gtbl do {

	# Header row: one column per table key (token or symbol).
	key_set := set(); every insert(key_set, key(tbl))
	writes(f, TAB)
	every k := !key_set do
	    writes(f, \(\ibtoktbl)[k] | k, TAB)
	write(f)

	# The number of states is the largest state index present.
	size := 0; every size <:= key(!tbl)
	every i := 1 to size do {
	    writes(f, i, TAB)
	    every column := tbl[!key_set] do {
		# action lists may have more than one element
		if /column[i] then
		    writes(f, " ", TAB) & next
		# Scan the action string, printing each concatenated
		# s/r/a action in abbreviated form.
		\column[i] ? {
		    if any('asr') then {
			while any('asr') do {
			    writes(f, ="a") & next
			    writes(f, tab(upto('.<')))
			    if ="<" then tab(find(">")+1) else ="."
			    tab(many(&digits))
			}
			writes(f, TAB)
		    }
		    else writes(f, tab(many(&digits)), TAB)
		}
	    }
	    write(f)
	}
	write(f)
    }

    return

end


#
# print_follow_sets:  make FOLLOW table human readable
#
procedure print_follow_sets(FOLLOW_table)

    local FOLLOW_sets, i

    FOLLOW_sets := sort(FOLLOW_table, 3)
    write(&errout, "FOLLOW sets are as follows:")
    every i := 1 to *FOLLOW_sets-1 by 2 do {
	writes(&errout, "\tFOLLOW(", image(FOLLOW_sets[i]), ") = ")
	every writes(&errout, image(! FOLLOW_sets[i+1]), " ")
	write(&errout)
    }
    write(&errout)
    return

end


#
# print_first_sets:  make FIRST table human readable
#
procedure print_first_sets(FIRST_table)

    local FIRST_sets, i

    FIRST_sets := sort(FIRST_table, 3)
    write(&errout, "FIRST sets are as follows:")
    every i := 1 to *FIRST_sets-1 by 2 do {
	writes(&errout, "\tFIRST(", image(FIRST_sets[i]), ") = ")
	every writes(&errout, image(! FIRST_sets[i+1]), " ")
	write(&errout)
    }
    write(&errout)
    return

end


#
# ibreplace: string x string x string -> string
#            (s1, s2, s3) -> s4
#
#     Where s4 is s1, with every instance of s2 stripped out and
#     replaced by s3.  E.g. replace("hello there; hello", "hello",
#     "hi") yields "hi there; hi".  Taken straight from the IPL.
#
procedure ibreplace(s1,s2,s3)

    local result, i

    result := ""
    i := *s2

    s1 ? {
	while result ||:= tab(find(s2)) do {
	    result ||:= s3
	    move(i)
	}
	return result || tab(0)
    }

end


#
# equivalent_items: record x record -> record or failure
#                   (item1, item2) -> item1 or failure
#
#     Where item1 and item2 are records having LHS, RHS, POS, & LOOK
#     fields (and possibly others, though they aren't used).  Returns
#     item1 if item1 and item2 are structurally identical as far as
#     their LHS, RHS, LOOK, and POS fields are concerned.  For SLR
#     table generators, LOOK will always be null.
#
procedure equivalent_items(item1, item2)

    local i

    # Fast path: the very same record.
    item1 === item2 & (return item1)

    if item1.LHS == item2.LHS &
	item1.POS = item2.POS &
	#
	# This comparison doesn't have to be recursive, since I take
	# care never to alter RHS structures.  Identical RHSs should
	# always be *the same underlying structure*.
	#
	item1.RHS === item2.RHS &
	item1.LOOK === item2.LOOK
    then
	return item1

end


#
# equivalent_item_lists: list x list -> list or fail
#                        (il1, il2) -> il1
#
#     Where il1 is one sorted list-of-items (as returned by goto() or
#     by closure()), where il2 is another such list.  Returns the
#     first list if the LHS, RHS, and POS fields of the constituent
#     items are all structurally identical, i.e. if the two lists
#     contain the structurally identical items.
#
procedure equivalent_item_lists(il1, il2)

    local i

    # Fast path: the very same list.
    il1 === il2 & (return il1)
    if *il1 = *il2
    then {
	every i := 1 to *il1 do
	    equivalent_items(il1[i], il2[i]) | fail
    }
    else fail

    return il1

end

diff --git a/ipl/packs/ibpag2/ibwriter.icn b/ipl/packs/ibpag2/ibwriter.icn
new file mode 100644
index 0000000..8bf0263
--- /dev/null
+++ b/ipl/packs/ibpag2/ibwriter.icn
@@ -0,0 +1,110 @@
+############################################################################
+#
+# Name: ibwriter.icn
+#
+# Title: Ibpag2 parser/library writer
+#
+# Author: Richard L. Goerwitz
+#
+# Version: 1.7
+#
+############################################################################
+#
+# Given a grammar, an action table, a goto table, an open output
+# file, an open iiparser file, and a module name, sends to the output
+# file a fully loaded LR parser with run-time constructible action
+# and goto tables.  The iiparser file contains the base LR parser
+# that the output file uses.
#
############################################################################
#
# Links: itokens, ximage
#
# See also: iiparse.icn
#
############################################################################

#link itokens, ximage
link ximage

# defined in itokens.icn
# record ib_TOK(sym, str)

#
# ibwriter:  copy the template parser (iiparse_file) to outfile,
# splicing in the module name, the action/goto/token tables, the rule
# list, and the start symbol at the $-marked insertion points.
#
procedure ibwriter(iiparse_file, outfile, grammar, atbl, gtbl, module)

    local token, next_token, start_symbol, rule_list, ttbl

    /module := ""
    start_symbol := grammar.start
    rule_list := grammar.rules
    ttbl := grammar.tbl
    next_token := create itokens(iiparse_file, 1)

    #
    # Copy tokens in iiparse_file to outfile.  Whenever we find a $
    # (RHSARG), process:  If we find $$, output $; If we find $module,
    # output image(module); and other such stuff.  Note that
    # copy_iiparse_tokens suspends tokens before writing them.  It
    # also blocks writing of any token whose sym field matches the
    # string given as arg 3.
    #
    every token := copy_iiparse_tokens(next_token, outfile, "RHSARG")
    do {
	if token.sym == "RHSARG" then {
	    # "$$" (two RHSARGs in a row) outputs a literal "$".
	    if (token := @next_token).sym == "RHSARG" then {
		writes(outfile, token.str)
		next
	    }
	    token.sym == "IDENT" | iohno(60, "line "|| line_number)
	    writes(outfile, " ")
	    case token.str of {
		# copy $module name over as a literal
		"module"  : writes(outfile, image(module))
		# use ximage to copy over action, goto, and token tables,
		# as well as the production list (used only for debugging)
		"atbl_insertion_point": writes(outfile, ximage(atbl))
		"gtbl_insertion_point": writes(outfile, ximage(gtbl))
		"ttbl_insertion_point": writes(outfile, ximage(ttbl))
		"rule_list_insertion_point" :
		    writes(outfile, ximage(rule_list))
		# use image to copy the start symbol into the output file
		"start_symbol_insertion_point" :
		    writes(outfile, image(start_symbol))
		# add the module name to anything else beginning with $
		default : writes(outfile, token.str, module, " ")
	    }
	}
    }

    return

end


#
# copy_iiparse_tokens:  coexpression x file x string -> ib_TOK records
#                       (next_token, out, except) -> token records
#
#     Copy Icon code to output stream, also suspending as we go.
#     Insert separators between tokens where needed.  Do not output
#     any token whose sym field matches except.  The point in
#     suspending tokens as we go is to enable the calling procedure to
#     look for signal tokens that indicate insertion or termination
#     points.  Fail on EOF.
#
procedure copy_iiparse_tokens(next_token, out, except)

    local separator, T

    separator := ""
    while T := @next_token do {
	if \T.sym then suspend T
	if \T.sym == \except then next
	# Emit a separating space between two adjacent tokens when
	# both could be taken as parts of one identifier/number.
	if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
	then writes(out, separator)
	writes(out, T.str)
	if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
	then separator := " " else separator := ""
    }

end

diff --git a/ipl/packs/ibpag2/iiglrpar.lib b/ipl/packs/ibpag2/iiglrpar.lib
new file mode 100644
index 0000000..059b0bf
--- /dev/null
+++ b/ipl/packs/ibpag2/iiglrpar.lib
@@ -0,0 +1,946 @@
+############################################################################
+#
+# Name: iiglrpar.lib
+#
+# Title: Quasi-GLR parser code
+#
+# Author: Richard L. Goerwitz
+#
+# Version: 1.20
+#
+############################################################################
+#
+# This file contains quasi-GLR parser code for use by Ibpag2's
+# output.  See below on what I mean by "quasi-GLR."  Entry point is
+# iiparse(infile, fail_on_error).  Infile is the stream from which
+# input is to be taken.  Infile is passed as argument 1 to the
+# user-supplied lexical analyzer, iilex_module() (where _module is
+# the string supplied with the -m option to Ibpag2).  If
+# fail_on_error is nonnull, the parser, iiparse, will fail on errors,
+# rather than abort.  Iiparse() returns the top element on its value
+# stack on a successful parse (which can be handy).
+# +# Iilex_module() must suspend integers for tokens and may also set +# iilval_module to the actual string values. Tokens -2, -1, and 0 +# are reserved. -2 is epsilon, and -1 is error. 0 is EOF, and is +# automatically appended to the token stream when iilex_module, the +# tokenizer, fails. These values should not normally be returned by +# the analyzer. In general, it is a good idea to $include +# iilex_module from your Ibpag2 source files, so that it can use the +# symbolic %token names declared in the original Ibpag2 source file. +# As implied above ("suspend"), iilex_module must be a generator, +# failing on EOF. +# +# If desired, you may include your own error-handling routine. It +# must be called iiparse_module (where _module is once again the +# module name supplied to ibpag2 via the -m option). The global +# variable line_number_module is automatically defined below, so a +# typical arrangement would be for the lexical analyzer to initialize +# line_number_module to 0, and increment by 1 for each line read. +# The error handler, iierror_module() can then display this variable. +# Note that the error handler should accept a single string argument +# (set by iiparse to describe the token on the input stream when the +# error was encountered). +# +# I label this parser "GLR" because it does support multiple parallel +# parsers (like GLR parsers are supposed to). I use the qualifier +# "quasi," though, because it does not use a graph-structured stack. +# Instead it copies both value and state stacks (in fact, the whole +# parser environment) when creating new automata to handle +# alternative parse paths. Slower, yes. But it enables the user to +# use almost precisely the action and input format that is used for +# the standard parser. +# +# Note that iiparse(), as implemented here, may suspend multiple +# results. So be sure to call it in some context where multiple +# results can be used (e.g. every parse := iiparse(&input, 1), or the +# like). 
Note also that when new parser "edges" get created, a +# rather cumbersome recursive copy routine is used. Sorry, but it's +# necessary to prevent unintended side-effects. +# +############################################################################ +# +# The algorithm: +# +# A = list of active parsers needing action lookup +# S = list of parsers to be shifted +# R = list of parsers to be reduced +# B = list of parsers that "choked" +# +# for every token on the input stream +# begin +# until length of R = 0 and length of A = 0 +# begin +# - pop successive parsers off of A, and placing them in S, +# R, or B, depending on parse table directives; suspend a +# result for each parser that has reached an accepting +# state +# - pop successive parsers off of R, reducing them, and +# placing them back in A; perform the action code +# associated with each reduction +# end +# - pop successive parsers off of S, shifting them, and placing +# them back in A; mark recovering parsers as recovered when +# they have successfully shifted three tokens +# if length of A = 0 and token not = EOF +# then +# - initiate error recovery on the parsers in B, i.e. for +# each parser in B that is not already recovering, pop its +# stack until error (-1) can legally be shifted, then shift +# error, mark the parser as recovering from an error, and +# place it back in A; if the parser is already recovering, +# discard the current token +# else +# - clobber the parsers in B +# end +# end +# +# Note that when a given active parser in A is being classified +# as needing a reduction, shift, suspension, or entry into the error +# list (B), more than one action may apply due to ambiguity in the +# grammar. At such points, the parser environment is duplicated, +# once for each alternative pathway, and each of the new parsers is +# then entered into the appropriate list (R or S; if accept is an +# alternative, the classification routine suspends). 
+# +# Note also that when performing the action code associated with +# reductions, parsers may be reclassified as erroneous, accepting, +# etc. via "semantic" directives like IIERROR and IIACCEPT. See the +# README file. Multiple-result action code will cause new parser +# threads to be created, just as ambiguities in the grammar do within +# the classification routine above. +# +############################################################################# +# +# See also: ibpag2.icn, iiparse.icn +# +############################################################################ + +$$line 119 "iiglrpar.lib" + +$$ifndef IIDEBUG + $$define $iidebug 1 + $$define show_new_forest 1 +$$endif # not IIDEBUG + +# These defines are output by Ibpag2 ahead of time (with the module +# name appended, if need be): +# +# IIERROR +# IIACCEPT +# iiprune - GLR mode only +# iiisolate - GLR mode only +# iierrok +# iiclearin + +# Parser environment + lookahead and pending action field. +# +record $ib_pe(state_stack, value_stack, action, errors, + recover_shifts, discards, clearin) + +# Warning! If you change the name of the value stack, change it also +# in ibreader.icn, procedure write_action_as_procedure(). +# +global $iilval, $line_number, $state_stack, $value_stack, + $iidirective, $ttbl, $errors, $discard_token + +# +# iiparse: file x anything -> ?s (a generator) +# (stream, fail_on_error) -> ? +# +# Where stream is an open file, where fail_on_error is a switch +# that (if nonnull) tells the iiparse to fail, rather than abort, +# on error, and where ?s represent the user-defined results of a +# completed parse of file, from the current location up to the +# point where the parser executes an "accept" action. Note that +# iiparse, as implemented here, is a generator. 
#
procedure $iiparse(stream, fail_on_error)

    local token, next_token, actives, reducers, shifters, barfers
    #global ttbl, errors
    static atbl
    initial {
	$iidirective := ""
	atbl := $atbl_insertion_point
	$ttbl := $ttbl_insertion_point
$$line 166 "iiglrpar.lib"
	\$iilex | stop("no iilex tokenizer defined")
    }

    # Start with a single parser environment: state stack [1], empty
    # value stack, no pending action, zero errors.
    actives := [ $ib_pe([1], [], &null, 0) ]
    $state_stack := actives[1].state_stack
    $value_stack := actives[1].value_stack
    $errors := actives[1].errors
    reducers := list()
    shifters := list()
    # I get tired of bland error code.  We'll call the list of
    # parsers in an error state "barfers" :-).
    barfers := list()

    # Append EOF (token 0) when the user's tokenizer is exhausted.
    next_token := create $iilex(stream, fail_on_error) | 0

    token := @next_token
    #
    # After this ^, new tokens are read in near the end of the repeat
    # loop.  None is read in on an error, since then we will try again
    # on the token that caused the error.
    #
    repeat {
	until *actives = *reducers = 0
	do {

	    # Prune out parsers that are doing the same thing as some
	    # other parser.
	    #
$$ifdef AUTO_PRUNE
	    auto_prune(actives)
$$endif

	    # Suspends $value_stack[1] on accept actions.  Otherwise,
	    # puts parsers that need shifting into the shifters list,
	    # parsers that need reducing into the reducers list, and
	    # error-state parsers into the barfers list.  Creates new
	    # parser environments as needed.
	    #
	    suspend $ib_action(atbl, token, actives, shifters,
			       reducers, barfers)

	    # Perform reductions.  If instructed via the iiaccept
	    # macro, simulate an accept action, and suspend with a
	    # result.
	    #
	    suspend $perform_reductions(token, actives, shifters,
					reducers, barfers)
	}

	# Shift token for every parser in the shifters list.  This
	# will create a bunch of new active parsers.
	#
	$perform_shifts(token, actives, shifters)
	#
	# If we get to here and have no actives, and we're not at the
	# end of the input stream, then we are at an error impasse.
	# Do formal error recovery.
	#
	if *actives = 0 & token ~=== 0 then {
	    suspend $perform_barfs(atbl, token, actives, barfers,fail_on_error)
	    #
	    # Perform_barfs sets discard_token if recovery was
	    # unsuccessful on the last token, and it needs discarding.
	    #
	    if \$discard_token := &null then
		token := @next_token | break
	    #
	    # If there *still* aren't any active parsers, we've
	    # reached an impasse (or there are no error productions).
	    # Abort.
	    #
	    if *actives = 0 then {
		if \fail_on_error then fail
		else stop()
	    }
	}
	else {
	    #
	    # Parsers in an error state should be weeded out, since if
	    # we get to here, we have some valid parsers still going.
	    # I.e. only use them if there are *no* actives (see above).
	    #
$$ifdef IIDEBUG
	    # NOTE(review): "parser" is not in this procedure's local
	    # list; under IIDEBUG it is used here as an implicit
	    # global -- confirm against icont -u builds.
	    write(&errout, "+++ pruning ", *barfers, " erroneous parsers")
	    while parser := pop(barfers)
	    do $iidebug("p", token, &null, parser)
$$else
	    while pop(barfers)
$$endif	#IIDEBUG
	    #
	    # Get the next token.  Only do this if we have active
	    # parsers not recovering from an error, i.e., if we're here.
	    #
	    token := @next_token | break
	}
    }

end


#
# ib_action
#
#     Classify the pending action for every parser in actives,
#     distributing each into shifters, reducers, or barfers; suspend
#     the top of the value stack for accept actions.  Ambiguous table
#     entries (several concatenated actions) clone the parser
#     environment, one clone per alternative.
#
procedure $ib_action(atbl, token, actives, shifters, reducers,
		     barfers)

    local a, act, num, parser, new_parser

    # While there is an active parser, take it off the actives list,
    # and...
    while parser := pop(actives) do {

	# ...check for a valid action (if none, then there is an
	# error; put it into the barfers list).
	#
	if a := \ (\atbl[token])[parser.state_stack[1]]
	then {
	    a ? {
		# Keep track of how many actions we've seen.
		num := 0

		# Snip off successive actions.  If there's no
		# ambiguity, there will be only one action, & no
		# additional parser environments will be created.
		#
		while {
$$ifdef COMPRESSED_TABLES
		    # "\x80" is the accept action; uncompress_action
		    # does its own move()ing
		    act := $uncompress_action()
$$else
		    act := ="a" | {
			tab(any('sr')) || tab(upto('.<')) ||
			    ((="<" || tab(find(">")+1)) | =".") ||
			    tab(many(&digits))
		    }
$$endif	#COMPRESSED TABLES
		}
		do {
		    # New parser environment only needed for num > 1.
		    #
		    if (num +:= 1) > 1 then {
			new_parser := $fullcopy(parser)
			show_new_forest("=== table conflict; new parser",
			    actives, shifters, reducers, barfers, new_parser)
		    }
		    else new_parser := parser
		    new_parser.action := act

		    # Classify the action as s, r, or a, and place in
		    # the appropriate list (or suspend a result if a).
		    #
		    case act[1] of {
			"s"   : put(shifters, new_parser)
			"r"   : put(reducers, new_parser)
			"a"   : {
			    # NOTE(review): "ruleno" is not declared
			    # local here and is presumably &null at
			    # this point (accept actions carry no rule
			    # number) -- confirm intent.
			    $iidebug("a", token, ruleno, parser)
			    suspend parser.value_stack[1]
			}
		    }
		}
	    }
	}
	else {
	    #
	    # Error.  Parser will get garbage collected before another
	    # token is read from iilex, unless the parsers all fail -
	    # in which case, error recovery will be tried.
	    #
	    $iidebug("e", token, &null, parser)
	    put(barfers, parser)
	}
    }

end


#
# perform_reductions
#
#     Pop each parser off reducers, perform its pending reduce action
#     (running any user action procedure _<ruleno>_<module>), and put
#     it back into actives.
#
procedure $perform_reductions(token, actives, shifters, reducers, barfers)

    local parser, ruleno, newsym, rhsize, arglist, result, num,
	new_parser, tmp, p
    static gtbl
    initial {
	gtbl := $gtbl_insertion_point
$$line 336 "iiglrpar.lib"
    }

    while parser := get(reducers)
    do {

	# Set up global state and value stacks, so that the action
	# code can access them.
	#
	$state_stack := parser.state_stack
	$value_stack := parser.value_stack
	$errors := parser.errors

	# Finally, perform the given action:
	#
	parser.action ? {
	    #
	    # Reduce action format, e.g. r1<S>2 = reduce by rule 1
	    # (LHS = S, RHS length = 2).
+ # + move(1) + ruleno := integer(1(tab(find("<")), move(1))) + newsym := 1(tab(find(">")), move(1)) + rhsize := integer(tab(many(&digits))) + arglist := [] + every 1 to rhsize do { + pop($state_stack) + push(arglist, pop($value_stack)) + } + # Gtbl is "backwards," i.e. token first, state second. + # The value produced is the "goto" state. + # + push($state_stack, gtbl[newsym][$state_stack[1]]) + # + # The actions are in procedures having the same name as + # the number of their rule, bracketed by underscores, & + # followed by the current module name. If there is such a + # procedure associated with the current reduce action, + # call it. + # + if func := proc("_" || ruleno || "_" || $module) + then { + num := 0 + # + # For every valid result from the action code for the + # current reduction, create a new parser if need be + # (i.e. if num > 1), and check iidirective. Push the + # result onto the stack of the new parser & put the + # new parser into the actives list. + # + every result := func!arglist do { + # For all but the first result, create a new parser. + if (num +:= 1) > 1 then { + new_parser := $fullcopy(parser) + pop(new_parser.value_stack) # take off pushed result + show_new_forest("=== multi-result action; new parser", + actives, shifters, reducers, barfers, new_parser) + } + else new_parser := parser + # + # IIERROR, IIACCEPT, iierrok, iiisolate, and iiprune + # are all implemented using a search through a global + # iidirective variable; see the $defines described + # above. + # + tmp := $iidirective + $iidirective := "" + if *tmp > 0 then { + if find("clearin", tmp) then { + # see perform_shifts() below + new_parser.clearin := 1 + } + if find("error", tmp) then { + $iidebug("e", token, ruleno, new_parser) + put(barfers, new_parser) + next + } + if find("errok", tmp) then { + new_parser.recover_shifts := &null + new_parser.discards := 0 + } + if find("prune", tmp) then { + # Garden path. 
+ $iidebug("p", token, ruleno, new_parser) + break next + } + if find("isolate", tmp) then { + # Prune all but the current parser. + $$ifdef IIDEBUG + write(&errout, "+++ isolating by pruning") + while p := pop(actives) do + $iidebug("p", token, ruleno, p) + while p := pop(reducers) do + $iidebug("p", token, ruleno, p) + while p := pop(shifters) do + $iidebug("p", token, ruleno, p) + while p := pop(barfers) do + $iidebug("p", token, ruleno, p) + $$else + while pop(actives) + while pop(reducers) + while pop(shifters) + while pop(barfers) + $$endif #IIDEBUG + push(new_parser.value_stack, result) + $iidebug("r", token, ruleno, new_parser) + put(actives, new_parser) + break next + } + if find("accept", tmp) then { + $iidebug("a", token, ruleno, new_parser) + suspend result + next + } + } + # + # Push result onto the new parser thread's value + # stack. + # + push(new_parser.value_stack, result) + $iidebug("r", token, ruleno, new_parser) + put(actives, new_parser) + # + # Action code must have the stack in its original + # form. So restore the stack's old form before + # going back to the action code. + # + if num = 1 then + $value_stack := parser.value_stack[2:0] + } + # + # If the action code for this rule failed, push &null. + # But first check $iidirective. + # + if num = 0 then { + # + # Same $iidirective code as above repeated + # (inelegantly) because it accesses too many + # variables to be easily isolated. + # + tmp := $iidirective + $iidirective := "" + if *tmp > 0 then { + if find("clearin", tmp) then { + # see perform_shifts() below + parser.clearin := 1 + } + if find("error", tmp) then { + $iidebug("e", token, ruleno, parser) + put(barfers, parser) + next + } + if find("errok", tmp) then { + parser.recover_shifts := &null + parser.discards := 0 + } + if find("prune", tmp) then { + # Garden path. + $iidebug("p", token, ruleno, parser) + next # go back to enclosing while pop... + } + if find("isolate", tmp) then { + # Prune all but the current parser. 
+ $$ifdef IIDEBUG + write(&errout, "+++ isolating by pruning") + while p := pop(actives) do + $iidebug("p", token, ruleno, p) + while p := pop(reducers) do + $iidebug("p", token, ruleno, p) + while p := pop(shifters) do + $iidebug("p", token, ruleno, p) + while p := pop(barfers) do + $iidebug("p", token, ruleno, p) + $$else + while pop(actives) + while pop(reducers) + while pop(shifters) + while pop(barfers) + $$endif #IIDEBUG + } + if find("accept", tmp) then { + $iidebug("a", token, ruleno, parser) + suspend arglist[-1] | &null + next + } + } + # Finally, push the result! + result := arglist[-1] | &null + push(parser.value_stack, result) + $iidebug("r", token, ruleno, parser) + put(actives, parser) + } + } + # If there is no action code for this rule... + else { + # ...push the value of the last RHS arg. + # For 0-length e-productions, push &null. + result := arglist[-1] | &null + push(parser.value_stack, result) + $iidebug("r", token, ruleno, parser) + put(actives, parser) + } + } + } + +end + + +# +# perform_shifts +# +procedure $perform_shifts(token, actives, shifters) + + local parser, ruleno + + *shifters = 0 & fail + + while parser := pop(shifters) do { + # + # One of the iidirectives is iiclearin, i.e. clear the input + # token and try again on the next token. + # + \parser.clearin := &null & { + put(actives, parser) + next + } + parser.action ? { + # + # Shift action format, e.g. s2.1 = shift and go to state 2 + # by rule 1. + # + move(1) + push(parser.state_stack, integer(tab(find(".")))) + push(parser.value_stack, $iilval) + ="."; ruleno := integer(tab(many(&digits))) + pos(0) | stop("malformed action: ", act) + # + # If, while recovering, we can manage to shift 3 tokens, + # then we consider ourselves resynchronized. Don't count + # the error token (-1). 
+ # + if token ~= -1 then { + if \parser.recover_shifts +:= 1 then { + # 3 shifts make a successful recovery + if parser.recover_shifts > 4 then { + parser.recover_shifts := &null + parser.discards := 0 + } + } + } + $iidebug("s", token, ruleno, parser) + } + put(actives, parser) + } + + return + +end + + +# +# perform_barfs +# +procedure $perform_barfs(atbl, token, actives, barfers, fail_on_error) + + # + # Note how this procedure has its own local reducers and shifters + # list. These are *not* passed from the parent environment! + # + local parser, count, reducers, shifters, recoverers + + # To hold the list of parsers that need to shift error (-1). + recoverers := list() + + count := 0 + while parser := pop(barfers) do { + count +:= 1 + if \parser.recover_shifts := 0 then { + # + # If we're already in an error state, discard the + # current token, and increment the number of discards + # we have made. 500 is too many; abort. + # + if (parser.discards +:= 1) > 500 then { + if proc($iierror) + then $iierror("fatal error: can't resynchronize") + else write(&errout, "fatal error: can't resynchronize") + if \fail_on_error then fail + else stop() + } + # try again on this one with the next token + put(actives, parser) + } else { + parser.errors +:= 1 # error count for this parser + parser.discards := parser.recover_shifts := 0 + # If this is our first erroneous parser, print a message. + if count = 1 then { + if proc($iierror) + then $iierror(image(\$ttbl[token]) | image(token)) + else write(&errout, "parse error") + } + # + # If error appears in a RHS, pop states until we get to a + # spot where error (-1) is a valid lookahead token: + # + if \$ttbl[-1] then { + until *parser.state_stack = 0 do { + if \atbl[-1][parser.state_stack[1]] then { + put(recoverers, parser) + break next + } else pop(parser.state_stack) & pop(parser.value_stack) + } + } + # If we get past here, the stack is now empty or there + # are no error productions. Abandon this parser. 
+ $iidebug("p", token, &null, parser) + } + } + + # Parsers still recovering are in the actives list; those that + # need to shift error (-1) are in the recoverers list. The + # following turns recoverers into actives: + # + if *recoverers > 0 then { + reducers := list() # a scratch list + shifters := list() # ditto + until *recoverers = *reducers = 0 do { + $$ifdef AUTO_PRUNE + auto_prune(actives) + $$endif + suspend $ib_action(atbl, -1, recoverers, shifters, + reducers, barfers) + suspend $perform_reductions(-1, recoverers, shifters, + reducers, barfers) + } + $perform_shifts(-1, recoverers, shifters) + every put(actives, !recoverers) + } + # + # If there were no recoverers, we've already shifted the error + # token, and are discarding tokens from the input stream. Note + # that if one parser was recovering, they *all* should be + # recovering, since if one was not recovering, it the erroneous + # parsers should all have been discarded by the calling proc. + # + else + $discard_token := 1 + +end + + +$$ifdef IIDEBUG + +record production(LHS, RHS, POS, LOOK, no, prec, assoc) +# +# iidebug +# +procedure $iidebug(action, token, ruleno, parser) + + local p, t, state + static rule_list + initial { + rule_list := $rule_list_insertion_point + $$line 693 "iiglrpar.lib" + } + + write(&errout, "--- In parser ", image(parser), ":") + case action of { + "a" : writes(&errout, "accepting ") & + state := parser.state_stack[1] + "e" : writes(&errout, "***ERROR***\n") & + write(&errout, "recover shifts = ", + parser.recover_shifts) & + write(&errout, "discarded tokens = ", + parser.discards) & + writes(&errout, "error action ") & + state := parser.state_stack[1] + "p" : writes(&errout, "***PRUNING***\n") & + writes(&errout, "prune action ") & + state := parser.state_stack[1] + "r" : writes(&errout, "reducing ") & + state := parser.state_stack[2] + "s" : writes(&errout, "shifting ") & + state := parser.state_stack[2] + default : stop("malformed action argument to iidebug") + } + + 
t := image(token) || (" (" || (\$ttbl[token] | "unknown") || ")") + writes(&errout, "on lookahead ", t, ", in state ", state) + if \ruleno then { + (p := !rule_list).no === ruleno & + write(&errout, "; rule ", $production_2_string(p, $ttbl)) + } + # for errors, ruleno is null + else write(&errout) + + write(&errout, " state stack now: ") + every write(&errout, "\t", image(!parser.state_stack)) + write(&errout, " value stack now: ") + if *parser.value_stack > 0 + then every write(&errout, "\t", image(!parser.value_stack)) + else write(&errout, "\t(empty)") + + return + +end + + +# +# production_2_string: production record -> string +# p -> s +# +# Stringizes an image of the LHS and RHS of production p in +# human-readable form. +# +procedure $production_2_string(p, ibtoktbl) + + local s, m, t + + s := image(p.LHS) || " -> " + every m := !p.RHS do { + if t := \ (\ibtoktbl)[m] + then s ||:= t || " " + else s ||:= image(m) || " " + } + # if the POS field is nonnull, print it + s ||:= "(POS = " || image(\p.POS) || ") " + # if the LOOK field is nonnull, print it, too + s ||:= "lookahead = " || image(\p.LOOK) + + return trim(s) + +end + +# +# show_new_forest +# +procedure show_new_forest(msg, actives, shifters, reducers, barfers, parser) + write(&errout, msg) + write(&errout, " List of active parsers:") + every write(&errout, "\t", image(!actives)) + every write(&errout, "\t", image(!shifters)) + every write(&errout, "\t", image(!reducers)) + every write(&errout, "\t", image(!barfers), " (error)") + write(&errout, "\tnew -> ", image(parser)) +end +$$endif # IIDEBUG + + +$$ifdef COMPRESSED_TABLES + +# +# uncompress_action +# +procedure $uncompress_action() + + local next_chunk, full_action + + next_chunk := create ord(!&subject[&pos:0]) + case $in_ib_bits(next_chunk, 2) of { + 0: { + full_action := "s" + full_action ||:= $in_ib_bits(next_chunk, 11) + full_action ||:= "." 
+ full_action ||:= $in_ib_bits(next_chunk, 11) + move(3) + } + 1: { + full_action := "r" + full_action ||:= $in_ib_bits(next_chunk, 11) + full_action ||:= "<" + full_action ||:= $in_ib_bits(next_chunk, 11) + full_action ||:= ">" + full_action ||:= $in_ib_bits(next_chunk, 8) + move(4) + } + 2: { + full_action := "a" + move(1) + } + } | fail + + return full_action + +end + + +# +# in_ib_bits: like inbits (IPL), but with coexpression for file +# +procedure $in_ib_bits(next_chunk, len) + + local i, byte, old_byte_mask + static old_byte, old_len, byte_length + initial { + old_byte := old_len := 0 + byte_length := 8 + } + + old_byte_mask := (0 < 2^old_len - 1) | 0 + old_byte := iand(old_byte, old_byte_mask) + i := ishift(old_byte, len-old_len) + + len -:= (len > old_len) | { + old_len -:= len + return i + } + + while byte := @next_chunk do { + i := ior(i, ishift(byte, len-byte_length)) + len -:= (len > byte_length) | { + old_len := byte_length-len + old_byte := byte + return i + } + } + +end + +$$endif # COMPRESSED_TABLES + +# +# fullcopy: make full recursive copy of object obj +# +procedure $fullcopy(obj) + + local retval, i, k + + case type(obj) of { + "co-expression" : return obj + "cset" : return obj + "file" : return obj + "integer" : return obj + "list" : { + retval := list(*obj) + every i := 1 to *obj do + retval[i] := $fullcopy(obj[i]) + return retval + } + "null" : return &null + "procedure" : return obj + "real" : return obj + "set" : { + retval := set() + every insert(retval, $fullcopy(!obj)) + return retval + } + "string" : return obj + "table" : { + retval := table(obj[[]]) + every k := key(obj) do + insert(retval, $fullcopy(k), $fullcopy(obj[k])) + return retval + } + # probably a record; if not, we're dealing with a new + # version of Icon or a nonstandard implementation, and + # we're screwed + default : { + retval := copy(obj) + every i := 1 to *obj do + retval[i] := $fullcopy(obj[i]) + return retval + } + } + +end + + +$$ifdef AUTO_PRUNE +procedure 
auto_prune(actives) + + new_actives := [] + while parser1 := pop(actives) do { + every parser2 := actives[j := 1 to *actives] do { + parser1.state_stack[1] = parser2.state_stack[1] | next + *parser1.value_stack = *parser2.value_stack | next + every i := 1 to *parser1.value_stack do { + parser1.value_stack[i] === parser2.value_stack[i] | + break next + } + if parser1.errors < parser2.errors then + actives[j] := parser1 + break next + } + put(new_actives, parser1) + } + + every put(actives, !new_actives) + return &null + +end +$$endif # AUTO_PRUNE diff --git a/ipl/packs/ibpag2/iiparse.lib b/ipl/packs/ibpag2/iiparse.lib new file mode 100644 index 0000000..7367735 --- /dev/null +++ b/ipl/packs/ibpag2/iiparse.lib @@ -0,0 +1,419 @@ +############################################################################ +# +# Name: iiparse.lib +# +# Title: LR parser code +# +# Author: Richard L. Goerwitz +# +# Version: 1.31 +# +############################################################################ +# +# LR parser code for use by Ibpag2-generated files. Entry point is +# iiparse(infile, fail_on_error). Infile is the stream from which +# input is to be taken. Infile is passed as argument 1 to the +# user-supplied lexical analyzer, iilex_module() (where _module is +# the string supplied with the -m option to Ibpag2). If +# fail_on_error is nonnull, the parser, iiparse, will fail on errors, +# rather than abort. Iiparse() returns the top element on its value +# stack on a successful parse (which can be handy). +# +# Iilex_module() must suspend integers for tokens and may also set +# iilval_module to the actual string values. Tokens -2, -1, and 0 +# are reserved. -2 is epsilon, and -1 is error. 0 is EOF, and is +# automatically appended to the token stream when iilex_module, the +# tokenizer, fails. These values should not normally be returned by +# the analyzer. 
In general, it is a good idea to $include
+# iilex_module from your Ibpag2 source files, so that it can use the
+# symbolic %token names declared in the original Ibpag2 source file.
+# As implied above ("suspend"), iilex_module must be a generator,
+# failing on EOF.
+#
+# If desired, the user may include his or her own error-handling
+# routine. It must be called iierror_module (where _module is once
+# again the module name supplied to ibpag2 via the -m option). The
+# global variable line_number_module is automatically defined below,
+# so a typical arrangement would be for the lexical analyzer to
+# initialize line_number_module to 0, and increment by 1 for each
+# line read. The error handler, iierror_module() can then display
+# this variable. Note that the error handler should accept a single
+# string argument (set by iiparse to describe the error just
+# encountered).
+#
+############################################################################
+#
+# See also: ibpag2.icn
+#
+############################################################################
+
+$$line 50 "iiparse.lib"
+
+# These defines are output by Ibpag2 ahead of time (with the module
+# name appended, if need be):
+#
+# $define iierrok recover_shifts := &null;
+# $define IIERROR iidirective ||:= "error";
+# $define IIACCEPT iidirective ||:= "accept";
+# $define iiclearin iidirective ||:= "clearin";
+
+# Warning! If you change the name of the value stack, change it also
+# in ibreader.icn, procedure write_action_as_procedure().
+#
+global $iilval, $errors, $line_number, $state_stack, $value_stack,
+ $iidirective, $recover_shifts, $discards
+
+#
+# iiparse: file x anything -> ?
+# (stream, fail_on_error) -> ?
+#
+# Where stream is an open file, where fail_on_error is a switch
+# that (if nonnull) tells the iiparse to fail, rather than abort,
+# on error, and where ? 
represents the user-defined result of a +# completed parse of file, from the current location up to the +# point where the parser executes an "accept" action. +# +procedure $iiparse(stream, fail_on_error) + + local token, next_token, act, ruleno, newsym, rhsize, arglist, + result, tmp, func + static atbl, gtbl, ttbl + + initial { + $iidirective := "" + atbl := $atbl_insertion_point + gtbl := $gtbl_insertion_point + ttbl := $ttbl_insertion_point + $$line 86 "iiparse.lib" + \$iilex | stop("no iilex tokenizer defined") + } + +$$ifndef IIDEBUG + $iidebug := 1 +$$endif # not IIDEBUG + + $state_stack := [1] + $value_stack := [] + + $errors := 0 # errors is global + next_token := create $iilex(stream, fail_on_error) | 0 + + token := @next_token + repeat { + # + # Begin cycle by checking whether there is a valid action + # for state $state_stack[1] and lookahead token. Atbl and + # gtbl here have a "backwards" structure: t[token][state] + # (usually they go t[state][token]). + # + if act := \ (\atbl[token])[$state_stack[1]] then { + $$ifdef COMPRESSED_TABLES + act := $uncompress_action(act) + $$endif #COMPRESSED TABLES + act ? { + # There's a valid action: Perform it. + case move(1) of { + "s": { + # + # Shift action format, e.g. s2.1 = shift and + # go to state 2 by rule 1. + # + push($state_stack, integer(tab(find(".")))) + push($value_stack, $iilval) + ="."; ruleno := integer(tab(many(&digits))) + pos(0) | stop("malformed action: ", act) + # + # If, while recovering, we can manage to + # shift 3 tokens, then we consider ourselves + # resynchronized. Don't count error (-1). + # + if token ~= -1 then { + if \$recover_shifts +:= 1 then { + # 3 shifts = successful recovery + if $recover_shifts > 4 then { + $recover_shifts := &null + $discards := 0 + } + } + } + $iidebug("s", ttbl, token, ruleno) + token := @next_token | break + } + "r": { + # + # Reduce action format, e.g. r1<S>2 = reduce + # by rule 1 (LHS = S, RHS length = 2). 
+ # + ruleno := integer(1(tab(find("<")), move(1))) + newsym := 1(tab(find(">")), move(1)) + rhsize := integer(tab(many(&digits))) + arglist := [] + every 1 to rhsize do { + pop($state_stack) + push(arglist, pop($value_stack)) + } + # on the structure of gtbl, see above on atbl + push($state_stack, gtbl[newsym][$state_stack[1]]) + # + # The actions are in procedures having the same + # name as the number of their rule, bracketed + # by underscores followed by the current module. + # + if func := proc("_" || ruleno || "_" || $module) + then { + result := func!arglist | arglist[-1] | &null + tmp := $iidirective + $iidirective := "" + # + # IIERROR, IIACCEPT, iierrok, and iiclearin + # are implemented using a search through a global + # iidirective variable; see the $defines + # above + # + if *tmp > 0 then { + if find("clearin", tmp) then + token := @next_token + if find("error", tmp) then { + # restore stacks & fake an error + pop($state_stack) + every 1 to rhsize do + push($value_stack, !arglist) + $errors +:= 1 + next_token := create (token | + (|@next_token)) + token := -1 + next + } + if find("accept", tmp) then { + $iidebug("a", ttbl, token, ruleno) + return result + } + } + } + # If there is no action code for this rule... + else { + # ...push the value of the last RHS arg. + # For 0-length e-productions, push &null. + result := arglist[-1] | &null + } + push($value_stack, result) + $iidebug("r", ttbl, token, ruleno) + } + # We're done. Return the last-generated value. + "a": { + $iidebug("a", ttbl, token, ruleno) + return $value_stack[1] + } + } + } + } + # + # ...but if there is *no* action for atbl[token][$state_stack[1]], + # then we have an error. + # + else { + if \$recover_shifts := 0 then { + # + # If we're already in an error state, discard the + # current token, and increment the number of discards + # we have made. 500 is too many; abort. 
+ # + if ($discards +:= 1) > 500 then { + if \$iierror + then $iierror("fatal error: can't resynchronize") + else write(&errout, "fatal error: can't resynchronize") + if \fail_on_error then fail + else stop() + } + $iidebug("e", ttbl, token) + # + # We were in the process of recovering, and the late + # token didn't help; discard it and try again. + # + token := @next_token | break + } else { + $errors +:= 1 # global error count + $discards := $recover_shifts := 0 + if \$iierror + then $iierror(image(\ttbl[token]) | image(token)) + else write(&errout, "parse error") + # + # If error appears in a RHS, pop states until we get to + # a spot where error (-1) is a valid lookahead token: + # + if \ttbl[-1] then { + until *$state_stack = 0 do { + if \atbl[-1][$state_stack[1]] then { + $iidebug("e", ttbl, token) + next_token := create (token | (|@next_token)) + token := -1 + break next + } else pop($state_stack) & pop($value_stack) + } + # If we get past here, the stack is now empty. Abort. + } + if \fail_on_error then fail + else stop() + } + } + } + + # + # If we get to here without hitting a final state, then we aren't + # going to get a valid parse. Abort. 
+ # + if \$iierror + then $iierror("unexpected EOF") + else write(&errout, "unexpected EOF") + + if \fail_on_error then fail + else stop() + +end + + +$$ifdef IIDEBUG + +record production(LHS, RHS, POS, LOOK, no, prec, assoc) +# +# iidebug +# +procedure $iidebug(action, ttbl, token, ruleno) + + local p, t, state + static rule_list + initial { + rule_list := $rule_list_insertion_point + $$line 279 "iiparse.lib" + } + + case action of { + "a" : writes(&errout, "accepting ") & state := $state_stack[1] + "e" : writes(&errout, "***ERROR***\n") & + writes(&errout, "recovery shifts = ", $recover_shifts,"\n") & + writes(&errout, "discarded tokens = ", $discards, "\n") & + writes(&errout, "total error count = ", $errors, "\n") & + writes(&errout, "error action ") & state := $state_stack[1] + "r" : writes(&errout, "reducing ") & state := $state_stack[2] + "s" : writes(&errout, "shifting ") & state := $state_stack[2] + default : stop("malformed action argument to iidebug") + } + + t := image(token) || (" (" || (\ttbl[token] | "unknown") || ")") + writes(&errout, "on lookahead ", t, ", in state ", state) + if \ruleno then { + (p := !rule_list).no = ruleno | + stop("no rule number ", tbl[symbol][state]) + write(&errout, "; rule ", $production_2_string(p, ttbl)) + } + # for errors, ruleno is null + else write(&errout) + + write(&errout, " state stack now: ") + every write(&errout, "\t", image(!$state_stack)) + write(&errout, " value stack now: ") + if *$value_stack > 0 + then every write(&errout, "\t", image(!$value_stack)) + else write(&errout, "\t(empty)") + + return + +end + + +# +# production_2_string: production record -> string +# p -> s +# +# Stringizes an image of the LHS and RHS of production p in +# human-readable form. 
+# +procedure $production_2_string(p, ibtoktbl) + + local s, m, t + + s := image(p.LHS) || " -> " + every m := !p.RHS do { + if t := \ (\ibtoktbl)[m] + then s ||:= t || " " + else s ||:= image(m) || " " + } + # if the POS field is nonnull, print it + s ||:= "(POS = " || image(\p.POS) || ") " + # if the LOOK field is nonnull, print it, too + s ||:= "lookahead = " || image(\p.LOOK) + + return trim(s) + +end +$$endif # IIDEBUG + + +$$ifdef COMPRESSED_TABLES + +# +# uncompress_action +# +procedure $uncompress_action(action) + + local next_chunk, full_action + + next_chunk := create ord(!action) + case $in_ib_bits(next_chunk, 2) of { + 0: { + full_action := "s" + full_action ||:= $in_ib_bits(next_chunk, 11) + full_action ||:= "." + full_action ||:= $in_ib_bits(next_chunk, 11) + } + 1: { + full_action := "r" + full_action ||:= $in_ib_bits(next_chunk, 11) + full_action ||:= "<" + full_action ||:= $in_ib_bits(next_chunk, 11) + full_action ||:= ">" + full_action ||:= $in_ib_bits(next_chunk, 8) + } + 2: { + full_action := "a" + } + } + + return full_action + +end + + +# +# in_ib_bits: like inbits (IPL), but with coexpression for file +# +procedure $in_ib_bits(next_chunk, len) + + local i, byte, old_byte_mask + static old_byte, old_len, byte_length + initial { + old_byte := old_len := 0 + byte_length := 8 + } + + old_byte_mask := (0 < 2^old_len - 1) | 0 + old_byte := iand(old_byte, old_byte_mask) + i := ishift(old_byte, len-old_len) + + len -:= (len > old_len) | { + old_len -:= len + return i + } + + while byte := @next_chunk do { + i := ior(i, ishift(byte, len-byte_length)) + len -:= (len > byte_length) | { + old_len := byte_length-len + old_byte := byte + return i + } + } + +end + +$$endif # COMPRESSED_TABLES diff --git a/ipl/packs/ibpag2/iohno.icn b/ipl/packs/ibpag2/iohno.icn new file mode 100644 index 0000000..dcf54d0 --- /dev/null +++ b/ipl/packs/ibpag2/iohno.icn @@ -0,0 +1,95 @@ +############################################################################ +# +# Name: 
iohno.icn +# +# Title: iohno (error handler, with hard-coded messages) +# +# Author: Richard L. Goerwitz +# +# Version: 1.20 +# +############################################################################ +# +# This file contains iohno(n, s) - an error handler taking two +# arguments: 1) an integer and 2) a string. The string (2) is an +# optional error message. The integer (1) is one of several +# hard-coded error numbers (see below). +# +############################################################################ +# +# Links: rewrap +# +############################################################################ + +# +# iohno: print error message s to stderr; abort with exit status n +# +procedure iohno(n, s) + + local i, msg + static errlist + initial { + errlist := [[100, "unspecified failure"], + + [2, "can't find iiparse.lib file"], + + [4, "unexpected EOF"], + [5, "unknown associativity value"], + + [11, "malformed right-hand side"], + [12, "unexpected RHS symbol type"], + + [21, "malformed left-hand side"], + + [30, "unknown or unimplemented % declaration"], + [31, "malformed token declaration"], + [32, "start symbol redefined"], + [33, "LHS symbol expected"], + [34, "colon missing"], + [35, "malformed RHS in rule declaration"], + [36, "undeclared character literal"], + [37, "illegal $integer reference"], + [38, "out-of-range $reference"], + [39, "unterminated brace { in action"], + [43, "bogus precedence"], + [44, "superfluous epsilon"], + [45, "superfluous %union declaration"], + [47, "empty or missing rules section"], + [48, "garbled declarations section"], + [49, "multiple characters within quotes"], + + [40, "same prec, different (or perhaps lacking) assoc"], + [41, "conflict between nonassociative rules"], + [42, "reduce -- reduce conflict"], + [46, "unresolvable shift/reduce conflict"], + + [50, "illegal conflict for nonassociative rules"], + [51, "reduce/reduce conflict"], + [52, "nonterminal useless and/or declared as a terminal"], + + [60, 
"malformed $insertion point in iiparse file"], + + [70, "bad action format"], + [71, "nonexistent rule number specified in old action"], + [72, "nonexistent rule number specified in new action"], + + [80, "conflict in goto table"], + + [90, "RHS nonterminal appears in no LHS"], + [91, "useless nonterminal"] + ] + } + + /n := 0 + every i := 1 to *errlist do + if errlist[i][1] = n then msg := errlist[i][2] + writes(&errout, "error ", n, " (", msg, ")") + if \s then { + write(&errout, ": ") + every write(&errout, "\t", rewrap(s) | rewrap()) + } + else write(&errout) + + exit(n) + +end diff --git a/ipl/packs/ibpag2/itokens.icn b/ipl/packs/ibpag2/itokens.icn new file mode 100644 index 0000000..1bb9cd1 --- /dev/null +++ b/ipl/packs/ibpag2/itokens.icn @@ -0,0 +1,925 @@ +############################################################################ +# +# Name: itokens.icn +# +# Title: itokens (Icon source-file tokenizer) +# +# Author: Richard L. Goerwitz +# +# Version: 1.11 +# +############################################################################ +# +# This file contains itokens() - a utility for breaking Icon source +# files up into individual tokens. This is the sort of routine one +# needs to have around when implementing things like pretty printers, +# preprocessors, code obfuscators, etc. It would also be useful for +# implementing cut-down implementations of Icon written in Icon - the +# sort of thing one might use in an interactive tutorial. +# +# Itokens(f, x) takes, as its first argument, f, an open file, and +# suspends successive TOK records. TOK records contain two fields. +# The first field, sym, contains a string that represents the name of +# the next token (e.g. "CSET", "STRING", etc.). The second field, +# str, gives that token's literal value. E.g. the TOK for a literal +# semicolon is TOK("SEMICOL", ";"). For a mandatory newline, itokens +# would suspend TOK("SEMICOL", "\n"). 
+# +# Unlike Icon's own tokenizer, itokens() does not return an EOFX +# token on end-of-file, but rather simply fails. It also can be +# instructed to return syntactically meaningless newlines by passing +# it a nonnull second argument (e.g. itokens(infile, 1)). These +# meaningless newlines are returned as TOK records with a null sym +# field (i.e. TOK(&null, "\n")). +# +# NOTE WELL: If new reserved words or operators are added to a given +# implementation, the tables below will have to be altered. Note +# also that &keywords should be implemented on the syntactic level - +# not on the lexical one. As a result, a keyword like &features will +# be suspended as TOK("CONJUNC", "&") and TOK("IDENT", "features"). +# +############################################################################ +# +# Links: slshupto +# +# Requires: coexpressions +# +############################################################################ + +#link ximage, slshupto +link slshupto #make sure you have version 1.2 or above + +global next_c, line_number +record TOK(sym, str) + +# +# main: an Icon source code uglifier +# +# Stub main for testing; uncomment & compile. The resulting +# executable will act as an Icon file compressor, taking the +# standard input and outputting Icon code stripped of all +# unnecessary whitespace. Guaranteed to make the code a visual +# mess :-). +# +#procedure main() +# +# local separator, T +# separator := "" +# every T := itokens(&input) do { +# if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT" +# then writes(separator) +# if T.sym == "SEMICOL" then writes(";") else writes(T.str) +# if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT" +# then separator := " " else separator := "" +# } +# +#end + + +# +# itokens: file x anything -> TOK records (a generator) +# (stream, nostrip) -> Rs +# +# Where stream is an open file, anything is any object (it only +# matters whether it is null or not), and Rs are TOK records. 
+# Note that itokens strips out useless newlines. If the second +# argument is nonnull, itokens does not strip out superfluous +# newlines. It may be useful to keep them when the original line +# structure of the input file must be maintained. +# +procedure itokens(stream, nostrip) + + local T, last_token + + # initialize to some meaningless value + last_token := TOK() + + every T := \iparse_tokens(stream) do { + if \T.sym then { + if T.sym == "EOFX" then fail + else { + # + # If the last token was a semicolon, then interpret + # all ambiguously unary/binary sequences like "**" as + # beginners (** could be two unary stars or the [c]set + # intersection operator). + # + if \last_token.sym == "SEMICOL" + then suspend last_token := expand_fake_beginner(T) + else suspend last_token := T + } + } else { + if \nostrip + then suspend last_token := T + } + } + +end + + +# +# expand_fake_beginner: TOK record -> TOK records +# +# Some "beginner" tokens aren't really beginners. They are token +# sequences that could be either a single binary operator or a +# series of unary operators. The tokenizer's job is just to snap +# up as many characters as could logically constitute an operator. +# Here is where we decide whether to break the sequence up into +# more than one op or not. 
+#
+procedure expand_fake_beginner(next_token)
+
+    static exptbl
+    initial {
+        exptbl := table()
+        insert(exptbl, "CONCAT",  [TOK("BAR", "|"), TOK("BAR", "|")])
+        insert(exptbl, "DIFF",    [TOK("MINUS", "-"), TOK("MINUS", "-")])
+        insert(exptbl, "EQUIV",   [TOK("NUMEQ", "="), TOK("NUMEQ", "="),
+                                   TOK("NUMEQ", "=")])
+        insert(exptbl, "INTER",   [TOK("STAR", "*"), TOK("STAR", "*")])
+        insert(exptbl, "LCONCAT", [TOK("BAR", "|"), TOK("BAR", "|"),
+                                   TOK("BAR", "|")])
+        insert(exptbl, "LEXEQ",   [TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
+        insert(exptbl, "LEXNE",   [TOK("TILDE", "~"), TOK("NUMEQ", "="),
+                                   TOK("NUMEQ", "=")])
+        insert(exptbl, "NOTEQUIV",[TOK("TILDE", "~"), TOK("NUMEQ","="),
+                                   TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
+        insert(exptbl, "NUMNE",   [TOK("TILDE", "~"), TOK("NUMEQ","=")])
+        insert(exptbl, "UNION",   [TOK("PLUS", "+"), TOK("PLUS", "+")])
+    }
+
+    # If the token is one of the ambiguous sequences tabled above,
+    # suspend its unary components one at a time; otherwise hand the
+    # token back untouched.
+    if \exptbl[next_token.sym]
+    then suspend !exptbl[next_token.sym]
+    else return next_token
+
+end
+
+
+#
+# iparse_tokens:  file     -> TOK records (a generator)
+#                 (stream) -> tokens
+#
+#     Where file is an open input stream, and tokens are TOK records
+#     holding both the token type and actual token text.
+#
+#     TOK records contain two parts, a preterminal symbol (the first
+#     "sym" field), and the actual text of the token ("str").  The
+#     parser only pays attention to the sym field, although the
+#     strings themselves get pushed onto the value stack.
+#
+#     Note the following kludge:  Unlike real Icon tokenizers, this
+#     procedure returns syntactically meaningless newlines as TOK
+#     records with a null sym field.  Normally they would be ignored.
+#     I wanted to return them so they could be printed on the output
+#     stream, thus preserving the line structure of the original
+#     file, and making later diagnostic messages more usable.
+# +procedure iparse_tokens(stream, getchar) + + local elem, whitespace, token, last_token, primitives, reserveds + static be_tbl, reserved_tbl, operators + initial { + + # Primitive Tokens + # + primitives := [ + ["identifier", "IDENT", "be"], + ["integer-literal", "INTLIT", "be"], + ["real-literal", "REALLIT", "be"], + ["string-literal", "STRINGLIT", "be"], + ["cset-literal", "CSETLIT", "be"], + ["end-of-file", "EOFX", "" ]] + + # Reserved Words + # + reserveds := [ + ["break", "BREAK", "be"], + ["by", "BY", "" ], + ["case", "CASE", "b" ], + ["create", "CREATE", "b" ], + ["default", "DEFAULT", "b" ], + ["do", "DO", "" ], + ["else", "ELSE", "" ], + ["end", "END", "b" ], + ["every", "EVERY", "b" ], + ["fail", "FAIL", "be"], + ["global", "GLOBAL", "" ], + ["if", "IF", "b" ], + ["initial", "INITIAL", "b" ], + ["invocable", "INVOCABLE", "" ], + ["link", "LINK", "" ], + ["local", "LOCAL", "b" ], + ["next", "NEXT", "be"], + ["not", "NOT", "b" ], + ["of", "OF", "" ], + ["procedure", "PROCEDURE", "" ], + ["record", "RECORD", "" ], + ["repeat", "REPEAT", "b" ], + ["return", "RETURN", "be"], + ["static", "STATIC", "b" ], + ["suspend", "SUSPEND", "be"], + ["then", "THEN", "" ], + ["to", "TO", "" ], + ["until", "UNTIL", "b" ], + ["while", "WHILE", "b" ]] + + # Operators + # + operators := [ + [":=", "ASSIGN", "" ], + ["@", "AT", "b" ], + ["@:=", "AUGACT", "" ], + ["&:=", "AUGAND", "" ], + ["=:=", "AUGEQ", "" ], + ["===:=", "AUGEQV", "" ], + [">=:=", "AUGGE", "" ], + [">:=", "AUGGT", "" ], + ["<=:=", "AUGLE", "" ], + ["<:=", "AUGLT", "" ], + ["~=:=", "AUGNE", "" ], + ["~===:=", "AUGNEQV", "" ], + ["==:=", "AUGSEQ", "" ], + [">>=:=", "AUGSGE", "" ], + [">>:=", "AUGSGT", "" ], + ["<<=:=", "AUGSLE", "" ], + ["<<:=", "AUGSLT", "" ], + ["~==:=", "AUGSNE", "" ], + ["\\", "BACKSLASH", "b" ], + ["!", "BANG", "b" ], + ["|", "BAR", "b" ], + ["^", "CARET", "b" ], + ["^:=", "CARETASGN", "b" ], + [":", "COLON", "" ], + [",", "COMMA", "" ], + ["||", "CONCAT", "b" ], + ["||:=", 
"CONCATASGN","" ], + ["&", "CONJUNC", "b" ], + [".", "DOT", "b" ], + ["--", "DIFF", "b" ], + ["--:=", "DIFFASGN", "" ], + ["===", "EQUIV", "b" ], + ["**", "INTER", "b" ], + ["**:=", "INTERASGN", "" ], + ["{", "LBRACE", "b" ], + ["[", "LBRACK", "b" ], + ["|||", "LCONCAT", "b" ], + ["|||:=", "LCONCATASGN","" ], + ["==", "LEXEQ", "b" ], + [">>=", "LEXGE", "" ], + [">>", "LEXGT", "" ], + ["<<=", "LEXLE", "" ], + ["<<", "LEXLT", "" ], + ["~==", "LEXNE", "b" ], + ["(", "LPAREN", "b" ], + ["-:", "MCOLON", "" ], + ["-", "MINUS", "b" ], + ["-:=", "MINUSASGN", "" ], + ["%", "MOD", "" ], + ["%:=", "MODASGN", "" ], + ["~===", "NOTEQUIV", "b" ], + ["=", "NUMEQ", "b" ], + [">=", "NUMGE", "" ], + [">", "NUMGT", "" ], + ["<=", "NUMLE", "" ], + ["<", "NUMLT", "" ], + ["~=", "NUMNE", "b" ], + ["+:", "PCOLON", "" ], + ["+", "PLUS", "b" ], + ["+:=", "PLUSASGN", "" ], + ["?", "QMARK", "b" ], + ["<-", "REVASSIGN", "" ], + ["<->", "REVSWAP", "" ], + ["}", "RBRACE", "e" ], + ["]", "RBRACK", "e" ], + [")", "RPAREN", "e" ], + [";", "SEMICOL", "" ], + ["?:=", "SCANASGN", "" ], + ["/", "SLASH", "b" ], + ["/:=", "SLASHASGN", "" ], + ["*", "STAR", "b" ], + ["*:=", "STARASGN", "" ], + [":=:", "SWAP", "" ], + ["~", "TILDE", "b" ], + ["++", "UNION", "b" ], + ["++:=", "UNIONASGN", "" ], + ["$(", "LBRACE", "b" ], + ["$)", "RBRACE", "e" ], + ["$<", "LBRACK", "b" ], + ["$>", "RBRACK", "e" ], + ["$", "RHSARG", "b" ], + ["%$(", "BEGGLOB", "b" ], + ["%$)", "ENDGLOB", "e" ], + ["%{", "BEGGLOB", "b" ], + ["%}", "ENDGLOB", "e" ], + ["%%", "NEWSECT", "be"]] + + # static be_tbl, reserved_tbl + reserved_tbl := table() + every elem := !reserveds do + insert(reserved_tbl, elem[1], elem[2]) + be_tbl := table() + every elem := !primitives | !reserveds | !operators do { + insert(be_tbl, elem[2], elem[3]) + } + } + + /getchar := create { + line_number := 0 + ! 
( 1(!stream, line_number +:=1) || "\n" ) + } + whitespace := ' \t' + /next_c := @getchar | { + if \stream then + return TOK("EOFX") + else fail + } + + repeat { + case next_c of { + + "." : { + # Could be a real literal *or* a dot operator. Check + # following character to see if it's a digit. If so, + # it's a real literal. We can only get away with + # doing the dot here because it is not a substring of + # any longer identifier. If this gets changed, we'll + # have to move this code into do_operator(). + # + last_token := do_dot(getchar) + suspend last_token +# write(&errout, "next_c == ", image(next_c)) + next + } + + "\n" : { + # If do_newline fails, it means we're at the end of + # the input stream, and we should break out of the + # repeat loop. + # + every last_token := do_newline(getchar, last_token, be_tbl) + do suspend last_token + if next_c === &null then break + next + } + + "\#" : { + # Just a comment. Strip it by reading every character + # up to the next newline. The global var next_c + # should *always* == "\n" when this is done. + # + do_number_sign(getchar) +# write(&errout, "next_c == ", image(next_c)) + next + } + + "\"" : { + # Suspend as STRINGLIT everything from here up to the + # next non-backslashed quotation mark, inclusive + # (accounting for the _ line-continuation convention). + # + last_token := do_quotation_mark(getchar) + suspend last_token +# write(&errout, "next_c == ", image(next_c)) + next + } + + "'" : { + # Suspend as CSETLIT everything from here up to the + # next non-backslashed apostrophe, inclusive. + # + last_token := do_apostrophe(getchar) + suspend last_token +# write(&errout, "next_c == ", image(next_c)) + next + } + + &null : stop("iparse_tokens (lexer): unexpected EOF") + + default : { + # If we get to here, we have either whitespace, an + # integer or real literal, an identifier or reserved + # word (both get handled by do_identifier), or an + # operator. 
The question of which we have can be + # determined by checking the first character. + # + if any(whitespace, next_c) then { + # Like all of the TOK forming procedures, + # do_whitespace resets next_c. + do_whitespace(getchar, whitespace) + # don't suspend any tokens + next + } + if any(&digits, next_c) then { + last_token := do_digits(getchar) + suspend last_token + next + } + if any(&letters ++ '_', next_c) then { + last_token := do_identifier(getchar, reserved_tbl) + suspend last_token + next + } +# write(&errout, "it's an operator") + last_token := do_operator(getchar, operators) + suspend last_token + next + } + } + } + + # If stream argument is nonnull, then we are in the top-level + # iparse_tokens(). If not, then we are in a recursive call, and + # we should not emit all this end-of-file crap. + # + if \stream then { + return TOK("EOFX") + } + else fail + +end + + +# +# do_dot: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that produces the next +# character from the input stream and t is a token record whose +# sym field contains either "REALLIT" or "DOT". Essentially, +# do_dot checks the next char on the input stream to see if it's +# an integer. Since the preceding char was a dot, an integer +# tips us off that we have a real literal. Otherwise, it's just +# a dot operator. Note that do_dot resets next_c for the next +# cycle through the main case loop in the calling procedure. +# +procedure do_dot(getchar) + + local token + # global next_c + +# write(&errout, "it's a dot") + + # If dot's followed by a digit, then we have a real literal. + # + if any(&digits, next_c := @getchar) then { +# write(&errout, "dot -> it's a real literal") + token := "." 
|| next_c
+	while any(&digits, next_c := @getchar) do
+	    token ||:= next_c
+	if token ||:= (next_c == ("e"|"E")) then {
+	    next_c := @getchar
+	    # An exponent may carry a minus sign, as in ".5e-3"; handle
+	    # it the same way the parallel exponent code in do_digits()
+	    # does.  (Previously the sign was silently dropped here.)
+	    if next_c == "-" then {
+		token ||:= next_c
+		next_c := @getchar
+	    }
+	    # Skip (and drop) any leading zeros in the exponent.
+	    while next_c == "0" do
+		next_c := @getchar
+	    while any(&digits, next_c) do {
+		token ||:= next_c
+		# Fix:  this line read "next_c = @getchar" - a numeric
+		# comparison, not an assignment - so next_c was never
+		# updated and the loop ran forever on reals such as
+		# "3.1e25".  It must be ":=".
+		next_c := @getchar
+	    }
+	}
+	return TOK("REALLIT", token)
+    }
+
+    # Dot not followed by an integer; so we just have a dot operator,
+    # and not a real literal.
+    #
+#    write(&errout, "dot -> just a plain dot")
+    return TOK("DOT", ".")
+
+end
+
+
+#
+# do_newline:  coexpression x TOK record x table  -> TOK records
+#              (getchar, last_token, be_tbl)      -> Ts (a generator)
+#
+#     Where getchar is the coexpression that returns the next
+#     character from the input stream, last_token is the last TOK
+#     record suspended by the calling procedure, be_tbl is a table of
+#     tokens and their "beginner/ender" status, and Ts are TOK
+#     records.  Note that do_newline resets next_c.  Do_newline is a
+#     mess.  What it does is check the last token suspended by the
+#     calling procedure to see if it was a beginner or ender.  It
+#     then gets the next token by calling iparse_tokens again.  If
+#     the next token is a beginner and the last token is an ender,
+#     then we have to suspend a SEMICOL token.  In either event, both
+#     the last and next token are suspended.
+#
+procedure do_newline(getchar, last_token, be_tbl)
+
+    local next_token
+    # global next_c
+
+#    write(&errout, "it's a newline")
+
+    # Go past any additional newlines.
+    #
+    while next_c == "\n" do {
+	# NL can be the last char in the getchar stream; if it *is*,
+	# then signal that it's time to break out of the repeat loop
+	# in the calling procedure.
+	#
+	next_c := @getchar | {
+	    next_c := &null
+	    fail
+	}
+	suspend TOK(&null, next_c == "\n")
+    }
+
+    # If there was a last token (i.e. if a newline wasn't the first
+    # character of significance in the input stream), then check to
+    # see if it was an ender.  If so, then check to see if the next
+    # token is a beginner.  If so, then suspend a TOK("SEMICOL")
+    # record before suspending the next token.
+    #
+    if find("e", be_tbl[(\last_token).sym]) then {
+#	write(&errout, "calling iparse_tokens via do_newline")
+#	&trace := -1
+	# First arg to iparse_tokens can be null here.
+	\ (next_token := iparse_tokens(&null, getchar)).sym
+	if \next_token then {
+#	    write(&errout, "call of iparse_tokens via do_newline yields ",
+#		  ximage(next_token))
+	    if find("b", be_tbl[next_token.sym])
+	    then suspend TOK("SEMICOL", "\n")
+	    #
+	    # See below.  If this were like the real Icon parser,
+	    # the following line would be commented out.
+	    #
+	    else suspend TOK(&null, "\n")
+	    return next_token
+	}
+	else {
+	    #
+	    # If this were a *real* Icon tokenizer, it would not emit
+	    # any record here, but would simply fail.  Instead, we'll
+	    # emit a dummy record with a null sym field.
+	    #
+	    return TOK(&null, "\n")
+#	    &trace := 0
+#	    fail
+	}
+    }
+
+    # See above.  Again, if this were like Icon's own tokenizer, we
+    # would just fail here, and not return any TOK record.
+    #
+#    &trace := 0
+    return TOK(&null, "\n")
+#    fail
+
+end
+
+
+#
+# do_number_sign:  coexpression -> &null
+#	getchar ->
+#
+#     Where getchar is the coexpression that pops characters off the
+#     main input stream.  Sets the global variable next_c.  This
+#     procedure simply reads characters until it gets a newline, then
+#     returns with next_c == "\n".  Since the starting character was
+#     a number sign, this has the effect of stripping comments.
+#
+procedure do_number_sign(getchar)
+
+    # global next_c
+
+#    write(&errout, "it's a number sign")
+    while next_c ~== "\n" do {
+	next_c := @getchar
+    }
+
+    # Return to calling procedure to cycle around again with the new
+    # next_c already set.  Next_c should always be "\n" at this point.
+ return + +end + + +# +# do_quotation_mark: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that yields another character +# from the input stream, and t is a TOK record with "STRINGLIT" +# as its sym field. Puts everything upto and including the next +# non-backslashed quotation mark into the str field. Handles the +# underscore continuation convention. +# +procedure do_quotation_mark(getchar) + + local token + # global next_c + + # write(&errout, "it's a string literal") + token := "\"" + next_c := @getchar + repeat { + if next_c == "\n" & token[-1] == "_" then { + token := token[1:-1] + while any('\t ', next_c := @getchar) + next + } else { + if slshupto('"', token ||:= next_c, 2) + then { + next_c := @getchar + # resume outermost (repeat) loop in calling procedure, + # with the new (here explicitly set) next_c + return TOK("STRINGLIT", token) + } + next_c := @getchar + } + } + +end + + +# +# do_apostrophe: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that yields another character +# from the input stream, and t is a TOK record with "CSETLIT" +# as its sym field. Puts everything upto and including the next +# non-backslashed apostrope into the str field. +# +procedure do_apostrophe(getchar) + + local token + # global next_c + +# write(&errout, "it's a cset literal") + token := "'" + next_c := @getchar + repeat { + if next_c == "\n" & token[-1] == "_" then { + token := token[1:-1] + while any('\t ', next_c := @getchar) + next + } else { + if slshupto("'", token ||:= next_c, 2) + then { + next_c := @getchar + # Return & resume outermost containing loop in calling + # procedure w/ new next_c. 
+ return TOK("CSETLIT", token) + } + next_c := @getchar + } + } + +end + + +# +# do_digits: coexpression -> TOK record +# getchar -> t +# +# Where getchar is the coexpression that produces the next char +# on the input stream, and where t is a TOK record containing +# either "REALLIT" or "INTLIT" in its sym field, and the text of +# the numeric literal in its str field. +# +procedure do_digits(getchar) + + local token, tok_record, extras, digits, over + # global next_c + + # For bases > 16 + extras := "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz" + # Assume integer literal until proven otherwise.... + tok_record := TOK("INTLIT") + +# write(&errout, "it's an integer or real literal") + token := ("0" ~== next_c) | "" + while any(&digits, next_c := @getchar) do + token ||:= next_c + if token ||:= (next_c == ("R"|"r")) then { + digits := &digits + if over := ((10 < token[1:-1]) - 10) * 2 then + digits ++:= extras[1:over+1] | extras + next_c := @getchar + if next_c == "-" then { + token ||:= next_c + next_c := @getchar + } + while any(digits, next_c) do { + token ||:= next_c + next_c := @getchar + } + } else { + if token ||:= (next_c == ".") then { + while any(&digits, next_c := @getchar) do + token ||:= next_c + tok_record := TOK("REALLIT") + } + if token ||:= (next_c == ("e"|"E")) then { + next_c := @getchar + if next_c == "-" then { + token ||:= next_c + next_c := @getchar + } + while any(&digits, next_c) do { + token ||:= next_c + next_c := @getchar + } + tok_record := TOK("REALLIT") + } + } + tok_record.str := ("" ~== token) | "0" + return tok_record + +end + + +# +# do_whitespace: coexpression x cset -> &null +# getchar x whitespace -> &null +# +# Where getchar is the coexpression producing the next char on +# the input stream. Do_whitespace just repeats until it finds a +# non-whitespace character, whitespace being defined as +# membership of a given character in the whitespace argument (a +# cset). 
+#
+procedure do_whitespace(getchar, whitespace)
+
+#    write(&errout, "it's junk")
+    # Eat characters until one turns up that isn't in the whitespace
+    # cset; leave it in the global next_c for the caller.
+    while any(whitespace, next_c) do
+	next_c := @getchar
+    return
+
+end
+
+
+#
+# do_identifier:  coexpression x table -> TOK record
+#	(getchar, reserved_tbl) -> t
+#
+#     Where getchar is the coexpression that pops off characters from
+#     the input stream, reserved_tbl is a table of reserved words
+#     (keys = the string values, values = the names qua symbols in
+#     the grammar), and t is a TOK record containing all subsequent
+#     letters, digits, or underscores after next_c (which must be a
+#     letter or underscore).  Note that next_c is global and gets
+#     reset by do_identifier.
+#
+procedure do_identifier(getchar, reserved_tbl)
+
+    local token
+    # global next_c
+
+#    write(&errout, "it's an identifier")
+    token := next_c
+    while any(&letters ++ &digits ++ '_', next_c := @getchar)
+    do token ||:= next_c
+    # Reserved words map to their grammar symbol; anything else is a
+    # plain IDENT.
+    return TOK(\reserved_tbl[token], token) | TOK("IDENT", token)
+
+end
+
+
+#
+# do_operator:  coexpression x list  -> TOK record
+#	(getchar, operators) -> t
+#
+#     Where getchar is the coexpression that produces the next
+#     character on the input stream, operators is the operator list,
+#     and where t is a TOK record describing the operator just
+#     scanned.  Calls recognop, which creates a DFSA to recognize
+#     valid Icon operators.  Arg2 (operators) is the list of lists
+#     containing valid Icon operator string values and names (see
+#     above).
+#
+procedure do_operator(getchar, operators)
+
+    local token, elem
+
+    token := next_c
+
+    # Go until recognop fails.
+ while elem := recognop(operators, token, 1) do + token ||:= (next_c := @getchar) +# write(&errout, ximage(elem)) + if *\elem = 1 then + return TOK(elem[1][2], elem[1][1]) + else fail + +end + + +record dfstn_state(b, e, tbl) +record start_state(b, e, tbl, master_list) +# +# recognop: list x string x integer -> list +# (l, s, i) -> l2 +# +# Where l is the list of lists created by the calling procedure +# (each element contains a token string value, name, and +# beginner/ender string), where s is a string possibly +# corresponding to a token in the list, where i is the position in +# the elements of l where the operator string values are recorded, +# and where l2 is a list of elements from l that contain operators +# for which string s is an exact match. Fails if there are no +# operators that s is a prefix of, but returns an empty list if +# there just aren't any that happen to match exactly. +# +# What this does is let the calling procedure just keep adding +# characters to s until recognop fails, then check the last list +# it returned to see if it is of length 1. If it is, then it +# contains list with the vital stats for the operator last +# recognized. If it is of length 0, then string s did not +# contain any recognizable operator. +# +procedure recognop(l, s, i) + + local current_state, master_list, c, result, j + static dfstn_table + initial dfstn_table := table() + + /i := 1 + # See if we've created an automaton for l already. + /dfstn_table[l] := start_state(1, *l, &null, &null) & { + dfstn_table[l].master_list := sortf(l, i) + } + + current_state := dfstn_table[l] + # Save master_list, as current_state will change later on. + master_list := current_state.master_list + + s ? { + while c := move(1) do { + + # Null means that this part of the automaton isn't + # complete. + # + if /current_state.tbl then + create_arcs(master_list, i, current_state, &pos) + + # If the table has been clobbered, then there are no arcs + # leading out of the current state. 
Fail. + # + if current_state.tbl === 0 then + fail + +# write(&errout, "c = ", image(c)) +# write(&errout, "table for current state = ", +# ximage(current_state.tbl)) + + # If we get to here, the current state has arcs leading + # out of it. See if c is one of them. If so, make the + # node to which arc c is connected the current state. + # Otherwise fail. + # + current_state := \current_state.tbl[c] | fail + } + } + + # Return possible completions. + # + result := list() + every j := current_state.b to current_state.e do { + if *master_list[j][i] = *s then + put(result, master_list[j]) + } + # return empty list if nothing the right length is found + return result + +end + + +# +# create_arcs: fill out a table of arcs leading out of the current +# state, and place that table in the tbl field for +# current_state +# +procedure create_arcs(master_list, field, current_state, POS) + + local elem, i, first_char, old_first_char + + current_state.tbl := table() + old_first_char := "" + + every elem := master_list[i := current_state.b to current_state.e][field] + do { + + # Get the first character for the current position (note that + # we're one character behind the calling routine; hence + # POS-1). + # + first_char := elem[POS-1] | next + + # If we have a new first character, create a new arc out of + # the current state. + # + if first_char ~== old_first_char then { + # Store the start position for the current character. + current_state.tbl[first_char] := dfstn_state(i) + # Store the end position for the old character. + (\current_state.tbl[old_first_char]).e := i-1 + old_first_char := first_char + } + } + (\current_state.tbl[old_first_char]).e := i + + # Clobber table with 0 if no arcs were added. 
+ current_state.tbl := (*current_state.tbl = 0) + return current_state + +end diff --git a/ipl/packs/ibpag2/outbits.icn b/ipl/packs/ibpag2/outbits.icn new file mode 100644 index 0000000..cf3f597 --- /dev/null +++ b/ipl/packs/ibpag2/outbits.icn @@ -0,0 +1,100 @@ +############################################################################ +# +# Name: outbits.icn +# +# Title: output variable-length characters in byte-size chunks +# +# Author: Richard L. Goerwitz +# +# Version: 1.5 +# +############################################################################ +# +# In any number of instances (e.g. when outputting variable-length +# characters or fixed-length encoded strings), the programmer must +# fit variable and/or non-byte-sized blocks into standard 8-bit +# bytes. Outbits() performs this task. +# +# Pass to outbits(i, len) an integer i, and a length parameter (len), +# and outbits will suspend byte-sized chunks of i converted to +# characters (most significant bits first) until there is not enough +# left of i to fill up an 8-bit character. The remaining portion is +# stored in a buffer until outbits() is called again, at which point +# the buffer is combined with the new i and then output in the same +# manner as before. The buffer is flushed by calling outbits() with +# a null i argument. Note that len gives the number of bits there +# are in i (or at least the number of bits you want preserved; those +# that are discarded are the most significant ones). +# +# A trivial example of how outbits() might be used: +# +# outtext := open("some.file.name","w") +# l := [1,2,3,4] +# every writes(outtext, outbits(!l,3)) +# writes(outtext, outbits(&null,3)) # flush buffer +# +# List l may be reconstructed with inbits() (see inbits.icn): +# +# intext := open("some.file.name") +# l := [] +# while put(l, inbits(intext, 3)) +# +# Note that outbits() is a generator, while inbits() is not. 
+# +############################################################################ +# +# Links: none +# See also: inbits.icn +# +############################################################################ + + +procedure outbits(i, len) + + local old_part, new_part, window, old_byte_mask + static old_i, old_len, byte_length, byte_mask + initial { + old_i := old_len := 0 + byte_length := 8 + byte_mask := (2^byte_length)-1 + } + + old_byte_mask := (0 < 2^old_len - 1) | 0 + window := byte_length - old_len + old_part := ishift(iand(old_i, old_byte_mask), window) + + # If we have a no-arg invocation, then flush buffer (old_i). + if /i then { + if old_len > 0 then { + old_i := old_len := 0 + return char(old_part) + } else { + old_i := old_len := 0 + fail + } + } else { + new_part := ishift(i, window-len) + len -:= (len >= window) | { + old_len +:= len + old_i := ior(ishift(old_part, len-window), i) + fail + } +# For debugging purposes. +# write("old_byte_mask = ", old_byte_mask) +# write("window = ", image(window)) +# write("old_part = ", image(old_part)) +# write("new_part = ", image(new_part)) +# write("outputting ", image(ior(old_part, new_part))) + suspend char(ior(old_part, new_part)) + } + + until len < byte_length do { + suspend char(iand(ishift(i, byte_length-len), byte_mask)) + len -:= byte_length + } + + old_len := len + old_i := i + fail + +end diff --git a/ipl/packs/ibpag2/rewrap.icn b/ipl/packs/ibpag2/rewrap.icn new file mode 100644 index 0000000..9ceff0c --- /dev/null +++ b/ipl/packs/ibpag2/rewrap.icn @@ -0,0 +1,144 @@ +############################################################################ +# +# Name: rewrap.icn +# +# Title: advanced line rewrap utility +# +# Author: Richard L. Goerwitz +# +# Version: 1.4 +# +############################################################################ +# +# The procedure rewrap(s,i), included in this file, reformats text +# fed to it into strings < i in length. 
Rewrap utilizes a static +# buffer, so it can be called repeatedly with different s arguments, +# and still produce homogenous output. This buffer is flushed by +# calling rewrap with a null first argument. The default for +# argument 2 (i) is 70. +# +# Here's a simple example of how rewrap could be used. The following +# program reads the standard input, producing fully rewrapped output. +# +# procedure main() +# every write(rewrap(!&input)) +# write(rewrap()) +# end +# +# Naturally, in practice you would want to do things like check for in- +# dentation or blank lines in order to wrap only on a paragraph-by para- +# graph basis, as in +# +# procedure main() +# while line := read(&input) do { +# if line == "" then { +# write("" ~== rewrap()) +# write(line) +# } else { +# if match("\t", line) then { +# write(rewrap()) +# write(rewrap(line)) +# } else { +# write(rewrap(line)) +# } +# } +# } +# end +# +# Fill-prefixes can be implemented simply by prepending them to the +# output of rewrap: +# +# i := 70; fill_prefix := " > " +# while line := read(input_file) do { +# line ?:= (f_bit := tab(many('> ')) | "", tab(0)) +# write(fill_prefix || f_bit || rewrap(line, i - *fill_prefix)) +# etc. +# +# Obviously, these examples are fairly simplistic. Putting them to +# actual use would certainly require a few environment-specific +# modifications and/or extensions. Still, I hope they offer some +# indication of the kinds of applications rewrap might be used in. +# +# Note: If you want leading and trailing tabs removed, map them to +# spaces first. Rewrap only fools with spaces, leaving tabs intact. +# This can be changed easily enough, by running its input through the +# Icon detab() function. 
+# +############################################################################ +# +# See also: wrap.icn +# +############################################################################ + + +procedure rewrap(s,i) + + local extra_bit, line + static old_line + initial old_line := "" + + # Default column to wrap on is 70. + /i := 70 + # Flush buffer on null first argument. + if /s then { + extra_bit := old_line + old_line := "" + return "" ~== extra_bit + } + + # Prepend to s anything that is in the buffer (leftovers from the last s). + s ?:= { tab(many(' ')); old_line || trim(tab(0)) } + + # If the line isn't long enough, just add everything to old_line. + if *s < i then old_line := s || " " & fail + + s ? { + + # While it is possible to find places to break s, do so. + while any(' -',line := EndToFront(i),-1) do { + # Clean up and suspend the last piece of s tabbed over. + line ?:= (tab(many(' ')), trim(tab(0))) + if *&subject - &pos + *line > i + then suspend line + else { + old_line := "" + return line || tab(0) + } + } + + # Keep the extra section of s in a buffer. + old_line := tab(0) + + # If the reason the remaining section of s was unrewrapable was + # that it was too long, and couldn't be broken up, then just return + # the thing as-is. + if *old_line > i then { + old_line ? { + if extra_bit := tab(upto(' -')+1) || (tab(many(' ')) | "") + then old_line := tab(0) + else extra_bit := old_line & old_line := "" + return trim(extra_bit) + } + } + # Otherwise, clean up the buffer for prepending to the next s. + else { + # If old_line is blank, then don't mess with it. Otherwise, + # add whatever is needed in order to link it with the next s. + if old_line ~== "" then { + # If old_line ends in a dash, then there's no need to add a + # space to it. 
+ if old_line[-1] ~== "-" + then old_line ||:= " " + } + } + } + +end + + + +procedure EndToFront(i) + # Goes with rewrap(s,i) + *&subject+1 - &pos >= i | fail + suspend &subject[.&pos:&pos <- &pos+i to &pos by -1] +end diff --git a/ipl/packs/ibpag2/sample.ibp b/ipl/packs/ibpag2/sample.ibp new file mode 100644 index 0000000..ab8358f --- /dev/null +++ b/ipl/packs/ibpag2/sample.ibp @@ -0,0 +1,111 @@ +# +# Sample Ibpag2 grammar file. +# + +# +# The code between %{ and %} gets copied directly. Note the Iconish +# comment syntax. +# +%{ + +# Note: If IIDEBUG is defined in the output file, debugging messages +# about the stacks and actions get displayed. +# +$define IIDEBUG 1 + +%} + +# +# Here we declare the tokens returned by the lexical analyzer. +# Precedences increase as we go on. Note how (unlike YACC), tokens +# are separated by commas. Note also how UMINUS is used only for its +# %prec later. +# +%token NUMBER +%left '+', '-' +%left '*', '/' +%right UMINUS + +%% + +# +# After this point, and up to the next %%, we have the grammar itself. +# By default, the start symbol is the left-hand side of the first +# rule. +# + +lines : lines, expr, '\n' { write($2) } + | lines, '\n' + | epsilon # Note use of epsilon/error tokens. + | error, '\n' { + write("syntax error; try again:") + # like YACC's yyerrok macro + iierrok + } + ; + +expr : expr, '+', expr { return $1 + $3 } + | expr, '-', expr { return $1 - $3 } + | expr, '*', expr { return $1 * $3 } + | expr, '/', expr { return $1 / $3 } + | '(', expr, ')' { return $2 } + | '-', expr %prec UMINUS { return -$2 } + | NUMBER { return $1 } + ; + +%% + +# +# From here on, code gets copied directly to the output file. We are +# no longer in the grammar proper. +# + +# +# The lexical analyzer must be called iilex, with the module name +# appended (if there is one). It must take one argument, infile (an +# input stream). It must be a generator, and fail on EOF (not return +# something <= 0, as is the case for YACC + Lex). 
Iilval holds the +# literal string value of the token just suspended by iilex(). +# +procedure iilex(infile) + + local nextchar, c, num + initial { + # Here's where you'd initialize any %{ globals %} declared + # above. + } + + nextchar := create !(!infile || "\n" || "\n") + + c := @nextchar | fail + repeat { + if any(&digits, c) then { + if not (\num ||:= c) then + num := c + } else { + if iilval := \num then { + suspend NUMBER + num := &null + } + if any('+-*/()\n', c) then { + iilval := c + suspend ord(c) + } else { + if not any(' \t', c) then { + # deliberate error - will be handled later + suspend &null + } + } + } + c := @nextchar | break + } + if iilval := \num then { + return NUMBER + num := &null + } + +end + +procedure main() + return iiparse(&input, 1) +end diff --git a/ipl/packs/ibpag2/shrnktbl.icn b/ipl/packs/ibpag2/shrnktbl.icn new file mode 100644 index 0000000..a91ca3d --- /dev/null +++ b/ipl/packs/ibpag2/shrnktbl.icn @@ -0,0 +1,131 @@ +############################################################################ +# +# Name: shrnktbl.icn +# +# Title: table shrinker +# +# Author: Richard L. Goerwitz +# +# Version: 1.4 (later modified 4-Aug-2000/gmt) +# +############################################################################ +# +# Action/goto table shrinking routine. +# +# Entry point: shrink_tables(start_symbol, st, atbl, gtbl), where +# start_symbol is the start symbol for the grammar whose productions +# are contained in the list/set st, and where atbl and gtbl are the +# action and goto tables, respectively. Returns &null, for lack of +# anything better. +# +# Basically, this routine merges duplicate structures in atbl and +# gtbl (if there are any), replaces the nonterminal symbols in the +# action table with integers (including the start symbol), then +# resets the goto table so that its keys point to these integers, +# instead of to the original nonterminal symbol strings. 
+# +############################################################################ +# +# Links: equiv, lists, sets, tables, outbits +# +############################################################################ +# +# See also: ibpag2, slrtbls +# +############################################################################ + +# structs has equiv; outbits is for outputting variable-width integers +# as 8-bit characters +# +link equiv +link lists +link sets +link tables +link outbits + +# +# shrink_tables +# +procedure shrink_tables(grammar, atbl, gtbl) + + local t, k, seen, nontermtbl, r, a, action, state, by_rule, + rule_len, LHS, keys + + # Create a table mapping nonterminal symbols to integers. + nontermtbl := table() + every r := !grammar.rules do + # r is a production; production records have LHS, RHS,...no + # fields, where the no field contains the rule number; we can + # use this as an arbitrary representation for that rule's LHS + # nonterminal + insert(nontermtbl, r.LHS, r.no) + + # Replace old start symbol. + grammar.start := nontermtbl[grammar.start] + + # Re-form the goto table to use the new integer values for + # nonterminals. + keys := set() + every insert(keys, key(gtbl)) + every k := !keys do { + # first create a column for the new integer-valued nonterminal + insert(gtbl, string(nontermtbl[k]), gtbl[k]) + # then clobber the old column with a string-valued nonterminal + gtbl[k] := &null + } + + # Rewrite actions using a fixed field-width format. + every t := !atbl do { + every k := key(t) do { + a := "" + t[k] ? 
{ + while action := tab(any('sra')) do { + case action of { + "s": { + outbits(0, 2) + state := integer(tab(find("."))) + every a ||:= outbits(state, 11) + move(1) + by_rule := integer(tab(many(&digits))) + every a ||:= outbits(by_rule, 11) + outbits() + } + "r": { + outbits(1, 2) + state := integer(tab(find("<"))) + every a ||:= outbits(state, 11) + move(1) + LHS := nontermtbl[tab(find(">"))] + every a ||:= outbits(LHS, 11) + move(1) + rule_len := integer(tab(many(&digits))) + every a ||:= outbits(rule_len, 8) + outbits() + } + "a": { + outbits(2, 2) + a ||:= outbits() + } + } + } + } + t[k] := a + } + } + + # + # Turn pointers to identical structures into pointers to the same + # structure. + # + seen := set() + every t := atbl | gtbl do { + every k := key(t) do { + if t[k] := equiv(t[k], !seen) + then next else insert(seen, t[k]) + } + } + + # signal success + return &null + +end diff --git a/ipl/packs/ibpag2/slritems.icn b/ipl/packs/ibpag2/slritems.icn new file mode 100644 index 0000000..2a87f2c --- /dev/null +++ b/ipl/packs/ibpag2/slritems.icn @@ -0,0 +1,244 @@ +############################################################################ +# +# Name: slritems.icn +# +# Title: compute item sets for a grammar +# +# Author: Richard L. Goerwitz +# +# Version: 1.10 +# +############################################################################ +# +# Contains make_slr_item_sets(start_symbol, st), slr_goto(l, symbol, +# st), slr_closure(l, st). The user need only worry about +# make_slr_item_sets() initially. The slr_goto() routine may be +# useful later when constructing action and goto tables. +# +# Slr_closure(l, st) accepts a list of items as its first argument, a +# list or set of the productions in the grammar as its second, and +# returns the closure of item list l, in the form of another item +# list. +# +# Note also that the production record structure (LHS, RHS, POS, +# LOOK...) has a POS field, and therefore can serve also as an item. 
+# In fact, any structure can be used, as long as its first three +# fields are LHS, RHS, and POS. +# +# See the "Dragon Book" (cited in first.icn) p. 222 ff. +# +# Slr_goto(l, symbol, st) accepts a list as its first argument, a +# string or integer as its second (string = nonterminal, integer = +# terminal), and a list or set for its third, returning another list. +# Arg 1 must be an item list, as generated either by another call to +# slr_goto() or by closure of the start production of the augmented +# grammar. Arg 2, symbol, is some terminal or nonterminal symbol. +# Arg 3 is the list or set of all productions in the current grammar. +# The return value is the closure of the set of all items [A -> aX.b] +# such that [A -> a.Xb] is in l (arg 1). +# +# make_slr_item_sets(start_sym, st) takes a string, start_sym, as its +# first argument, and a list or set of productions as its second. +# Returns a list of canonical LR(0) item sets or states. It returns, +# in other words, a list of lists of items. Items can be any record +# type that has LHS, RHS, and POS as its first three fields. +# +# See the "Dragon Book," example 4.35 (p. 224). +# +############################################################################ +# +# Links: ibutil +# +############################################################################ + +# link ibutil + +# +# slr_closure: list x list/set -> list +# (l2, st) -> l2 +# +# Where l is a list of items, where st is a list/set of all +# productions in the grammar from which l was derived, and where +# l(2) is the SLR closure of l, as constructed using the standard +# SLR closure operation. +# +# Ignore the third to fifth arguments, len to added. They are +# used internally by recursive calls to slr_closure(). 
+# +procedure slr_closure(l, st, len, LHS_tbl, added) + + local p, i, new_p, symbol + static LHS_tbl_tbl + initial LHS_tbl_tbl := table() + + if /LHS_tbl then { + if /LHS_tbl_tbl[st] := table() then { + # makes looking up all rules with a given LHS easier + every p := !st do { + /LHS_tbl_tbl[st][p.LHS] := list() + put(LHS_tbl_tbl[st][p.LHS], p) + } + } + LHS_tbl := LHS_tbl_tbl[st] + } + + /len := 0 + /added := set() + + # Len tells us where the elements in l start that we haven't yet + # tried to generate more items from. These elements are basically + # the items added on the last recursive call (or the "core," if + # there has not yet been a recursive call). + # + every i := len+1 to *l do { + /l[i].POS := 1 + # Fails if dot (i.e. l[i].POS) is at the end of the RHS; + # also fails if the current symbol (i.e. l[i].RHS[l[i].POS]) + # is a nonterminal. + symbol := l[i].RHS[l[i].POS] + # No need to add productions having symbol as their LHS if + # we've already done so for this particular l. + member(added, symbol) & next + every p := !\LHS_tbl[symbol] do { + # Make a copy of p, but with dot set to position 1. + new_p := copy(p) + # Set POS to 1 for non-epsilon productions; otherwise to 2. + if *new_p.RHS = 1 & new_p.RHS[1] === -2 then + new_p.POS := 2 + else new_p.POS := 1 + # if new_p isn't in l, add it to the end of l + if not equivalent_items(new_p, !l) then + put(l, new_p) + } + insert(added, symbol) + } + return { + # If nothing new has been added, sort the result and return... + if *l = i then sortff(l, 1, 2, 3) + # ...otherwise, try to add more items to l. 
+ else slr_closure(l, st, i, LHS_tbl, added) + } + +end + + +# +# slr_goto: list x string|integer x list|set -> list +# (l, symbol, st) -> l2 +# +# Where l is an item set previously returned by slr_goto or (for +# the start symbol of the augmented grammar) by slr_closure(), +# where symbol is a string (nonterminal) or integer (terminal), +# where st is a list or set of all productions in the current +# grammar, and where l2 is the SLR closure of the set of all items +# [A -> aX.b] such that [A -> a.Xb] is in l. +# +# The idea is just to move the dots for all productions where the +# dots precede "symbol," creating a new item list for the "moved" +# items, and then performing a slr_closure() on that new item list. +# Note that items can be represented by any structure where fields +# 1, 2, and 3 are LHS, RHS, and POS. +# +# Note that slr_goto(l, symbol, st) may yield a result that's +# structurally equivalent to one already in the sets of items thus +# far generated. This won't normally happen, because slr_goto() +# saves old results, never re-calcing for the same l x symbol +# combination. Still, a duplicate result could theoretically +# happen. +# +procedure slr_goto(l, symbol, st) + + local item, item2, l2, iteml_symbol_table + static iteml_symbol_table_table + initial iteml_symbol_table_table := table() + + # Keep old results for this grammar (st) in a table of tables of + # tables! + # + /iteml_symbol_table_table[st] := table() + iteml_symbol_table := iteml_symbol_table_table[st] + + # See if we've already performed this same calculation. + # + if l2 := \(\iteml_symbol_table[l])[symbol] + then return l2 + + l2 := list() + every item := !l do { + # Subscripting operation fails if the dot's at end. + if item.RHS[item.POS] === symbol + then { + item2 := copy(item) # copy is nonrecursive + item2.POS +:= 1 + put(l2, item2) + } + } + if *l2 = 0 then fail + else l2 := slr_closure(l2, st) + # + # Keep track of item lists and symbols we've already seen. 
+ # + /iteml_symbol_table[l] := table() + /iteml_symbol_table[l][symbol] := l2 + + if *l2 > 0 then + return l2 + else fail + +end + + +# +# make_slr_item_sets: string x list|set -> list +# (start_sym, st) -> l +# +# Where start_sym is the start symbol for the grammar defined by +# the productions contained in st, and where l is the list of item +# lists generated by the standard LR(0) set-of-items construction +# algorithm. +# +# Ignore the third and fourth arguments. They are used internally +# by recursive calls. +# +procedure make_slr_item_sets(start_sym, st, C, len) + + local i, next_items, item_list, new_list, item, symbol + + # + # First extend the old start symbol and use the result as the new + # start symbol for the augmented grammar to which the set-of-items + # construction will be applied. + # + # &trace := -1 + /C := [slr_closure( + [production("`_" || start_sym || "_'", [start_sym], 1)],st)] + /len := 0 + + # Iterate through C (the list of item-lists), doing gotos, and adding + # new states, until no more states can be added to C. + # + every item_list := C[i := len+1 to *C] do { + if \DEBUG then + print_item_list(C, i) + # collect all symbols after the dot for the the items in C[i]... + next_items := set() + every item := !item_list do + insert(next_items, item.RHS[item.POS]) + # ...now, try to do a slr_goto() for every collected symbol. + every symbol := !next_items do { + new_list := slr_goto(item_list, symbol, st) | next + if not equivalent_item_lists(new_list, !C) + then put(C, new_list) + } + } + # If nothing has been inserted, return C and quit; otherwise, call + # recursively and try again. 
+ # + return { + if i = *C then C + else make_slr_item_sets(&null, st, C, i) + } + +end + + diff --git a/ipl/packs/ibpag2/slrtbls.icn b/ipl/packs/ibpag2/slrtbls.icn new file mode 100644 index 0000000..8d00f12 --- /dev/null +++ b/ipl/packs/ibpag2/slrtbls.icn @@ -0,0 +1,370 @@ +############################################################################ +# +# Name: slrtbls.icn +# +# Title: slr table generation routines +# +# Author: Richard L. Goerwitz +# +# Version: 1.20 +# +############################################################################ +# +# Contains make_slr_tables(grammar, atbl, gtbl, noconflict, +# like_yacc), where grammar is an ib_grammar record (as returned by +# ibreader), where atbl and gtbl are initialized (default &null) hash +# tables, and where noconflict is a switch that, if nonnull, directs +# the resolver to abort on unresolvable conflicts. Returns &null if +# successful in filling out atbl and gtbl. If likeyacc is nonnull, +# make_slr_tables will resolve reduce/reduce conflicts by order of +# occurrence in the grammar, just like YACC. Shift/reduce conflicts +# will be resolved in favor of shift. +# +# The reason for the noconflict switch is that there are parsers that +# can accept tables with multiple action entries, i.e. parsers that +# can use tables generated by ambiguous grammars. +# +# In this routine's case, success is identified with creating a +# standard SLR action and goto table. Note that both tables end up +# as tables of tables, with symbols being the primary or first key, +# and state numbers being the second. This is the reverse of the +# usual arrangement, but turns out to save a lot of space. Atbl +# values are of the form "s2.3", "r4<A>10", "a", etc. The string +# "s2.3" means "shift the current lookahead token, and enter state 2 +# via rule 3." By way of contrast, "r4<A>10" means "reduce by rule +# number 4, which has A as its LHS symbol and 10 RHS symbols." A +# single "a" means "accept." 
+ +# Atbl entries may contain more than one action. The actions are +# simply concatenated: "s2.3r4<A>10a". Conflicts may be resolved +# later by associativity or precedence, if available. Unresolvable +# conflicts only cause error termination if the 5th and final +# argument is nonnull (see above on "noconflict"). +# +# Gtbl entries are simpler than atble entries, consisting of a single +# integer. +# +############################################################################ +# +# Links: follow, slritems, iohno +# +############################################################################ + +# declared in ibreader.icn +# record ib_grammar(start, rules, tbl) + +#link follow, slritems, iohno#, ximage + +# +# make_slr_tables +# +procedure make_slr_tables(grammar, atbl, gtbl, noconflict, like_yacc) + + local start_symbol, st, C, i, augmented_start_symbol, item, + symbol, new_item_list, j, action + + # Initialize start symbol and rule list/set (either is okay). + start_symbol := grammar.start + st := grammar.rules + + # Number the rules, and then construct the canonical LR(0) item sets. + every i := 1 to *st do st[i].no := i + C := make_slr_item_sets(start_symbol, st) + + # Now, go through each item in each item set in C filling out the + # action (atbl) and goto table (gtbl) as we go. + # + augmented_start_symbol := "`_" || start_symbol || "_'" + every i := 1 to *C do { + every item := !C[i] do { + # if the dot's *not* at the end of the production... + if symbol := item.RHS[item.POS] then { + # if were looking at a terminal, enter a shift action + if type(symbol) == "integer" then { + if symbol = -2 then next # Never shift epsilon! + new_item_list := slr_goto(C[i], symbol, st) + every j := 1 to *C do { + if equivalent_item_lists(new_item_list, C[j]) then { + action := "s" || j || "." 
|| item.no + resolve(st, atbl, symbol, i, action, + noconflict, like_yacc) + break next + } + } + # if we're looking at a nonterminal, add action to gtbl + } else { + new_item_list := slr_goto(C[i], symbol, st) + every j := 1 to *C do { + if equivalent_item_lists(new_item_list, C[j]) then { + /gtbl[symbol] := table() + /gtbl[symbol][i] := j | + gtbl[symbol][i] =:= j | + iohno(80, image(symbol), ".", image(i), ":", j) + break next + } + } + } + # ...else if the dot *is* at the end of the production + } else { + if item.LHS == augmented_start_symbol then { + action := "a" + # 0 = EOF + resolve(st, atbl, 0, i, action, noconflict, like_yacc) + } else { + # add a reduce for every symbol in FOLLOW(item.LHS) + every symbol := !FOLLOW(start_symbol, st, item.LHS) do { + # RHS size is 0 for epsilon. + if item.RHS[1] === -2 then { + action := "r" || item.no || "<" || item.LHS || + ">0" + } else + action := "r" || item.no || "<" || item.LHS || + ">" || *item.RHS + resolve(st, atbl, symbol, i, action, + noconflict, like_yacc) + } + } + } + } + } + + return + +end + + +# +# resolve: list|set x table x string|integer, integer, anything, anything +# -> string +# (st, tbl, symbol, state, action, noconflict, like_yacc) +# -> new_action_list +# +# Add action to action table, resolving conflicts by precedence +# and associativity, if need be. If noconflict is nonnull, abort +# on unresolvable conflicts. Fails on shift/shift "conflicts," or +# if an identical action is already present in the table entry to +# be modified. If like_yacc is nonnull, resolve reduce/reduce +# conflicts by their order of occurrence in the grammar; resolve +# shift/reduce conflicts in favor of shift. +# +procedure resolve(st, tbl, symbol, state, action, noconflict, like_yacc) + + local actions, chr, a, ruleno, p, newp + + /tbl[symbol] := table() + /tbl[symbol][state] := "" + + # If this action is already present, then don't re-enter it. Just + # fail. + # + tbl[symbol][state] ? 
{ + while a := tab(any('sra')) do { + a ||:= tab(upto('.<')) + a ||:= { (="<" || tab(find(">")+1)) | ="." } + a ||:= tab(many(&digits)) + if a == action then fail + } + } + + # Get rule number for the new action specified as arg 5, and + # fetch its source production. + action ? { + case move(1) of { + "s": ruleno := (tab(find(".")+1), tab(many(&digits))) + "r": ruleno := 1(tab(find("<")), move(1)) + "a": return tbl[symbol][state] := action || tbl[symbol][state] + } | iohno(70, tbl[symbol][state]) + (newp := !st).no = ruleno | + iohno(72, tbl[symbol][state]) + } + + # Resolve any conflicts that might be present. + # + actions := "" + tbl[symbol][state] ? { + while a := tab(any('sra')) do { + # Snip out the old action, and put it into a. + a ||:= tab(upto('.<')) + a ||:= { (="<" || tab(find(">")+1)) | ="." } + a ||:= tab(many(&digits)) + # + # Get the old action's rule number, and use it to fetch + # the full production that it is keyed to. + # + a ? { + case move(1) of { + "s": ruleno := (tab(find(".")+1), tab(many(&digits))) + "r": ruleno := 1(tab(find("<")), move(1)) + "a": return tbl[symbol][state] := a || actions || action + } | iohno(70, tbl[symbol][state]) + # Go through rule list; find the one whose number is ruleno. + (p := !st).no = ruleno | + iohno(71, tbl[symbol][state]) + } + + # Check precedences to see if we can resolve the conflict + # this way. + # + if \newp.prec > \p.prec then + # discard the old action, a + return tbl[symbol][state] := actions || action || tab(0) + else if \newp.prec < \p.prec then + # discard the new action, action + return tbl[symbol][state] := actions || a || tab(0) + else { + # + # If, however, both precedences are the same (i.e. + # newp.prec === p.prec), then we must check the + # associativities. Right implies shift; left, reduce. + # If there is no associativity, then we have a + # conflict. Nonassociative ("n") implies error. 
+ # + case action[1] of { + default: iohno(70, tbl[symbol][state]) + # case "a" is handled above; look for "s" & "r" + "s" : { + if a[1] == "s" then fail # no shift/shift "conflict" + else if a[1] == "r" then { + newp.assoc === p.assoc | { + iohno(40, "state " || state || "; token " || + symbol || "; rules " || newp.no || + "," || p.no) + } + case newp.assoc of { + "n" : iohno(41, production_2_string(newp)) + &null: { # no associativity given + if \noconflict & /like_yacc then + iohno(46, "state " || state || + "; token " || symbol || + "; rules " || newp.no || + "," || p.no) + else { + write(&errout, "warning: shift/reduce", + " conflict in state " || state || + "; token " || symbol || + "; rules " || newp.no || + "," || p.no) + if \like_yacc then { + write(&errout, "resolving in _ + favor of shift.") + return tbl[symbol][state] := + actions || action || tab(0) + } else { + write(&errout, "creating multi-_ + action table entry") + return tbl[symbol][state] := + actions || action || a || tab(0) + } + } + } + "l" : { # left associative + # discard new action, action + return tbl[symbol][state] := + actions || a || tab(0) + } + "r" : { # right associative + # remove old action, a + return tbl[symbol][state] := + actions || action || tab(0) + } + } + } + } + "r" : { + if a[1] == "r" then { + # + # If conflicts in general, and reduce-reduce + # conflicts in specific are not okay... + # + if \noconflict & /like_yacc then { + # ...abort, otherwise... + iohno(42, "state " || state || "; token " || + symbol || "; " || "; rules " || + newp.no || "," || p.no) + } else { + # + # ...flag reduce-reduce conficts, and + # then resolve them by their order of + # occurrence in the grammar. 
+ # + write(&errout, "warning: reduce/reduce", + " conflict in state ", state, + "; token ", symbol, "; rules ", + newp.no, ",", p.no) + if \like_yacc then { + write(&errout, "resolving by order of _ + occurrence in the grammar") + if newp.no > p.no + # discard later production (newp) + then return return tbl[symbol][state] := + actions || a || tab(0) + # discard later production (old p) + else return tbl[symbol][state] := + actions || action || tab(0) + } else { + # + # If conflicts ok, but we aren't supposed + # to resolve reduce-reduce conflicts by + # order of rule occurrence: + # + write(&errout, "creating multi-action _ + table entry") + return tbl[symbol][state] := + actions || action || a || tab(0) + } + } + } else { + # associativities must be the same for both rules: + newp.assoc === p.assoc | { + iohno(40, "state " || state || "; token " || + symbol || "; rules " || newp.no || + "," || p.no) + } + case newp.assoc of { + "n" : iohno(41, production_2_string(newp)) + &null: { + if \noconflict & /like_yacc then + iohno(46, "state " || state || + "; token " || symbol || + "; rules " || newp.no || + "," || p.no) + else { + write(&errout, "warning: shift/reduce", + " conflict in state " || state || + "; token " || symbol || + "; rules " || newp.no || + "," || p.no) + if \like_yacc then { + write(&errout, "resolving in _ + favor of shift.") + return tbl[symbol][state] := + actions || a || tab(0) + } else { + write(&errout, "creating multi-_ + action table entry") + return tbl[symbol][state] := + actions || action || a || tab(0) + } + } + } + "r" : { + # discard new action, action + return tbl[symbol][state] := + actions || a || tab(0) + } + "l" : { + # remove old action, a + return tbl[symbol][state] := + actions || action || tab(0) + } + } + } + } + } + } + } + } + + return tbl[symbol][state] ||:= action + +end diff --git a/ipl/packs/ibpag2/slshupto.icn b/ipl/packs/ibpag2/slshupto.icn new file mode 100644 index 0000000..07cbece --- /dev/null +++ 
b/ipl/packs/ibpag2/slshupto.icn @@ -0,0 +1,79 @@ +############################################################################ +# +# Name: slshupto.icn +# +# Title: slshupto (upto with backslash escaping) +# +# Author: Richard L. Goerwitz +# +# Version: 1.4 +# +############################################################################ +# +# Slshupto works just like upto, except that it ignores backslash +# escaped characters. I can't even begin to express how often I've +# run into problems applying Icon's string scanning facilities to +# to input that uses backslash escaping. Normally, I tokenize first, +# and then work with lists. With slshupto() I can now postpone or +# even eliminate the traditional tokenizing step, and let Icon's +# string scanning facilities to more of the work. +# +# If you're confused: +# +# Typically UNIX utilities (and probably others) use backslashes to +# "escape" (i.e. remove the special meaning of) metacharacters. For +# instance, UNIX shells normally accept "*" as a shorthand for "any +# series of zero or more characters. You can make the "*" a literal +# "*," with no special meaning, by prepending a backslash. The rou- +# tine slshupto() understands these backslashing conventions. You +# can use it to find the "*" and other special characters because it +# will ignore "escaped" characters. 
+# +############################################################################ +# +# Links: none +# +# See also: slashbal.icn +# +############################################################################ + +# for compatibility with the original name +# +procedure slashupto(c, s, i, j) + suspend slshupto(c, s, i, j) +end + +# +# slshupto: cset x string x integer x integer -> integers +# (c, s, i, j) -> Is (a generator) +# where Is are the integer positions in s[i:j] before characters +# in c that is not preceded by a backslash escape +# +procedure slshupto(c, s, i, j) + + local c2 + + if /s := &subject + then /i := &pos + else /i := 1 + /j := *s + 1 + + /c := &cset + c2 := '\\' ++ c + s[1:j] ? { + tab(i) + while tab(upto(c2)) do { + if ="\\" then { + move(1) | { + if find("\\", c) + then return &pos - 1 + } + next + } + suspend .&pos + move(1) + } + } + +end + diff --git a/ipl/packs/ibpag2/sortff.icn b/ipl/packs/ibpag2/sortff.icn new file mode 100644 index 0000000..c198c55 --- /dev/null +++ b/ipl/packs/ibpag2/sortff.icn @@ -0,0 +1,82 @@ +############################################################################ +# +# Name: sortff.icn +# +# Title: sortf with multiple field arguments +# +# Author: Bob Alexander and Richard L. Goerwitz +# +# Date: July 14, 1993 +# +############################################################################ +# +# Sortff is like sortf(), except takes an unlimited number of field +# arguments. E.g. if you want to sort a list of structures on field +# 5, and (for those objects that have the same field 5) do a sub-sort +# on field 2, you would use "sortff(list_of_objects, 5, 2)." +# +############################################################################ + +# +# sortff: structure [x integer [x integer...]] -> structure +# (L, [fields ...]) -> new_L +# +# Where L is any subscriptable structure, and fields are any +# number of integer subscripts in any desired order. 
Returns +# a copy of structure L with its elements sorted on field 1, +# and, for those elements having an identical field 1, sub- +# sorted on field 2, etc. +# +procedure sortff(L, fields[]) + *L <= 1 & { return copy(L) } + return sortff_1(L, fields, 1, []) +end + +procedure sortff_1(L, fields, k, uniqueObject) + + local sortField, cachedKeyValue, i, startOfRun, thisKey + + sortField := fields[k] + L := sortf(L, sortField) # initial sort using fields[k] + # + # If more than one sort field is given, use each field successively + # as the current key, and, where members in L have the same value for + # this key, do a subsort using fields[k+1]. + # + if fields[k +:= 1] then { + # + # Set the equal-key-run pointer to the start of the list and + # save the value of the first key in the run. + # + startOfRun := 1 + cachedKeyValue := L[startOfRun][sortField] | uniqueObject + every i := 2 to *L do { + thisKey := L[i][sortField] | uniqueObject + if not (thisKey === cachedKeyValue) then { + # + # We have an element with a sort key different from the + # previous. If there's a run of more than one equal keys, + # sort the sublist. + # + if i - startOfRun > 1 then { + L := L[1:startOfRun] ||| + sortff_1(L[startOfRun:i], fields, k, uniqueObject) ||| + L[i:0] + } + # Reset the equal-key-run pointer to this key and cache. + startOfRun := i + cachedKeyValue := L[startOfRun][sortField] | uniqueObject + } + } + # + # Sort a final run if it exists. + # + if i - startOfRun > 1 then { + L := L[1:startOfRun] ||| + sortff_1(L[startOfRun:0], fields, k, uniqueObject) + } + } + + return L + +end diff --git a/ipl/packs/ibpag2/version.icn b/ipl/packs/ibpag2/version.icn new file mode 100644 index 0000000..597a4f4 --- /dev/null +++ b/ipl/packs/ibpag2/version.icn @@ -0,0 +1,19 @@ +############################################################################ +# +# Name: version.icn +# +# Title: return Ibpag2 version number +# +# Author: Richard L. 
Goerwitz +# +# Version: 1.13 +# +############################################################################ +# +# See also: ibpag2.icn +# +############################################################################ + +procedure ib_version() + return "Ibpag2, version 1.3.7" +end |