diff options
Diffstat (limited to 'ipl/procs/ichartp.icn')
-rw-r--r-- | ipl/procs/ichartp.icn | 611 |
1 file changed, 611 insertions, 0 deletions
diff --git a/ipl/procs/ichartp.icn b/ipl/procs/ichartp.icn new file mode 100644 index 0000000..b5968bd --- /dev/null +++ b/ipl/procs/ichartp.icn @@ -0,0 +1,611 @@ +############################################################################ +# +# File: ichartp.icn +# +# Subject: Procedures for a simple chart parser +# +# Author: Richard L. Goerwitz +# +# Date: August 3, 2000 +# +############################################################################ +# +# This file is in the public domain. +# +############################################################################ +# +# Version: 1.11 +# +############################################################################ +# +# General: +# +# Ichartp implements a simple chart parser - a slow but +# easy-to-implement strategy for parsing context free grammars (it +# has a cubic worst-case time factor). Chart parsers are flexible +# enough to handle a lot of natural language constructs. They also +# lack many of the troubles associated with empty and left-recursive +# derivations. To obtain a parse, just create a BNF file, obtain a +# line of input, and then invoke parse_sentence(sentence, +# bnf_filename, start-symbol). Parse_sentence suspends successive +# edge structures corresponding to possible parses of the input +# sentence. There is a routine called edge_2_tree() that converts +# these edges to a more standard form. See the stub main() procedure +# for an example of how to make use of all these facilities. +# +############################################################################ +# +# Implementation details: +# +# The parser itself operates in bottom-up fashion, but it might +# just as well have been coded top-down, or for that matter as a +# combination bottom-up/top-down parser (chart parsers don't care). +# The parser operates in breadth-first fashion, rather than walking +# through each alternative until it is exhausted. 
As a result, there +# tends to be a pregnant pause before any results appear, but when +# they appear they come out in rapid succession. To use a depth-first +# strategy, just change the "put" in "put(ch.active, new_e)" to read +# "push." I haven't tried to do this, but it should be that simple +# to implement. +# BNFs are specified using the same notation used in Griswold & +# Griswold, and as described in the IPL program "pargen.icn," with +# the following difference: All metacharacters (space, tab, vertical +# slash, right/left parends, brackets and angle brackets) are +# converted to literals by prepending a backslash. Comments can be +# include along with BNFs using the same notation as for Icon code +# (i.e. #-sign). +# +############################################################################ +# +# Gotchas: +# +# Pitfalls to be aware of include things like <L> ::= <L> | ha | +# () (a weak attempt at a laugh recognizer). This grammar will +# accept "ha," "ha ha," etc. but will suspend an infinite number of +# possible parses. The right way to do this sort of thing is <L> ::= +# ha <S> | ha, or if you really insist on having the empty string as +# a possibility, try things like: +# +# <S> ::= () | <LAUGHS> +# <LAUGHS> ::= ha <LAUGHS> | ha +# +# Of course, the whole problem of infinite parses can be avoided by +# simply invoking the parser in a context where it is not going to +# be resumed, or else one in which it will be resumed a finite number +# of times. +# +############################################################################ +# +# Motivation: +# +# I was reading Byte Magazine (vol. 17:2 [February, 1992]), and +# ran into an article entitled "A Natural Solution" (pages 237-244) +# in which a standard chart parser was described in terms of its C++ +# implementation. The author remarked at how his optimizations made +# it possible to parse a 14-word sentence in only 32 seconds (versus +# 146 for a straight Gazdar-Mellish LISP chart parser). 
32 seconds +# struck me as hardly anything to write home about, so I coded up a +# quick system in Icon to see how it compared. This library is the +# result. +# I'm quite sure that this code could be very much improved upon. +# As it stands, its performance seems as good as the C++ parser in +# BYTE, if not better. It's hard to tell, though, seeing as I have +# no idea what hardware the guy was using. I'd guess a 386 running +# DOS. On a 386 running Xenix the Icon version beats the BYTE times +# by a factor of about four. The Icon compiler creates an executable +# that (in the above environment) parses 14-15 word sentences in +# anywhere from 6 to 8 seconds. Once the BNF file is read, it does +# short sentences in a second or two. If I get around to writing it, +# I'll probably use the code here as the basic parsing engine for an +# adventure game my son wants me to write. +# +############################################################################ +# +# Links: trees, rewrap, scan, strip, stripcom, strings +# +############################################################################ +# +# Requires: co-expressions +# +############################################################################ +# +# Here's a sample BNF file (taken, modified, from the BYTE +# Magazine article mentioned above). Note again the conventions a) +# that nonterminals be enclosed in angle brackets & b) that overlong +# lines be continued by terminating the preceding line with a +# backslash. Although not illustrated below, the metacharacters <, +# >, (, ), and | can all be escaped (i.e. can all have their special +# meaning neutralized) with a backslash (e.g. \<). Comments can also +# be included using the Icon #-notation. Empty symbols are illegal, +# so if you want to specify a zero-derivation, use "()." There is an +# example of this usage below. 
+# +# <S> ::= <NP> <VP> | <S> <CONJ> <S> +# <VP> ::= <VP> <CONJ> <VP> | <IV> ( () | <PP> ) | \ +# <TV> ( <NP> | <NP> <PP> | <NP> <VP> | <REL> <S> ) +# <NP> ::= <DET> ( <NP> | <ADJ> <NP> | <ADJ> <NP> <PP> | <NP> <PP> ) | \ +# <ADJ> <NP> | <N> | <N> <CONJ> <N> | \ +# <NP> <CONJ> <NP> +# <PP> ::= <P> ( <NP> | <ADJ> <NP> ) | <PP> <CONJ> <PP> +# <ADJ> ::= <ADJ> <CONJ> <ADJ> +# <CONJ> ::= and +# <DET> ::= the | a | his | her +# <NP> ::= her | he | they +# <N> ::= nurse | nurses | book | books | travel | arrow | arrows | \ +# fortune | fortunes | report +# <ADJ> ::= outrageous | silly | blue | green | heavy | white | red | \ +# black | yellow +# <IV> ::= travel | travels | report | see | suffer +# <TV> ::= hear | see | suffer +# <P> ::= on | of +# <REL> ::= that +# +############################################################################ +# +# Addendum: +# +# Sometimes, when writing BNFs, one finds oneself repeatedly +# writing the same things. In efforts to help eliminate the need for +# doing this, I've written a simple macro facility. It involves one +# reserved word: "define." Just make sure it begins a line. It +# takes two arguments. The first is the macro. The second is its +# expansion. The first argument must not contain any spaces. The +# second, however, may. Here's an example: +# +# define <silluq-clause> ( <silluq-phrase> | \ +# <tifcha-silluq-clause> | \ +# <zaqef-silluq-clause> \ +# ) +# +############################################################################ + +link trees +link scan +link rewrap +link strip +link stripcom +link strings + +record stats(edge_list, lhs_table, term_set) +record chart(inactive, active) # inactive - set; active - list +record retval(no, item) + +record edge(LHS, RHS, LEN, DONE, BEG, END, SEEN) +record short_edge(LHS, RHS) + +# +# For debugging only. 
+# +procedure main(a) + + local res, filename, line + # &trace := -1 + filename := \a[1] | "bnfs.byte" + while line := read(&input) do { + res := &null + every res := parse_sentence(line, filename, "S") do { + if res.no = 0 then + write(stree(edge2tree(res.item))) +# write(ximage(res.item)) + else if res.no = 1 then { + write("hmmm") + write(stree(edge2tree(res.item))) + } + } + /res & write("can't parse ",line) + } + +end + + +# +# parse_sentence: string x string -> edge records +# (s, filename) -> Es +# where s is a chunk of text presumed to constitute a sentence +# where filename is the name of a grammar file containing BNFs +# where Es are edge records containing possible parses of s +# +procedure parse_sentence(s, filename, start_symbol) + + local file, e, i, elist, ltbl, tset, ch, tokens, st, + memb, new_e, token_set, none_found, active_modified + static master, old_filename + initial master := table() + + # + # Initialize and store stats for filename (if not already stored). + # + if not (filename == \old_filename) then { + file := open(filename, "r") | p_err(filename, 7) + # + # Read BNFs from file; turn them into edge structs, and + # store them all in a list; insert terminal symbols into a set. + # + elist := list(); ltbl := table(); tset := set() + every e := bnf_file_2_edges(file) do { + put(elist, e) # main edge list (active) + (/ltbl[e.LHS] := set([e])) | insert(ltbl[e.LHS], e) # index LHSs + every i := 1 to e.LEN do # LEN holds length of e.RHS + if /e.RHS[i].RHS then # RHS for terminals is null + insert(tset, e.RHS[i].LHS) + } + insert(master, filename, stats(elist, ltbl, tset)) + old_filename := filename + close(file) + } + elist := fullcopy(master[filename].edge_list) + ltbl := fullcopy(master[filename].lhs_table) + tset := master[filename].term_set + + # + # Make edge list into the active section of chart; tokenize the + # sentence s & check for unrecognized terminals. 
+ # + ch := chart(set(), elist) + tokens := tokenize(s) + + # + # Begin parse by entering all tokens in s into the inactive set + # in the chart as edges with no RHS (a NULL RHS is characteristic + # of all terminals). + # + token_set := set(tokens) + every i := 1 to *tokens do { + # Flag words not in the grammar as errors. + if not member(tset, tokens[i]) then + suspend retval(1, tokens[i]) + # Now, give us an inactive edge corresponding to word i. + insert(ch.inactive, e := edge(tokens[i], &null, 1, 1, i, i+1)) + # Insert word i into the LHS table. + (/ltbl[tokens[i]] := set([e])) | insert(ltbl[tokens[i]], e) + # Watch out for those empty RHSs. + insert(ch.inactive, e := edge("", &null, 1, 1, i, i)) + (/ltbl[""] := set([e])) | insert(ltbl[""], e) + } + *tokens = 0 & i := 0 + insert(ch.inactive, e := edge("", &null, 1, 1, i+1, i+1)) + (/ltbl[""] := set([e])) | insert(ltbl[""], e) + + # + # Until no new active edges can be built, keep ploughing through + # the active edge list, trying to match unconfirmed members of their + # RHSs up with inactive edges. + # + until \none_found do { +# write(ximage(ch)) + none_found := 1 + every e := !ch.active do { + active_modified := &null + # keep track of inactive edges we've already tried + /e.SEEN := set() + # + # e.RHS[e.DONE+1] is the first unconfirmed category in the + # RHS of e; ltbl[e.RHS[e.DONE+1].LHS] are all edges having + # as their LHS the LHS of the first unconfirmed category in + # e's RHS; we simply intersect this set with the inactives, + # and then subtract out those we've seen before in connec- + # tion with this edge - + # + if *(st := \ltbl[e.RHS[e.DONE+1].LHS] ** ch.inactive -- e.SEEN) > 0 + then { + # record all the inactive edges being looked at as seen + e.SEEN ++:= st + every memb := !st do { + # make sure this inactive edge starts where the + # last confirmed edge in e.RHS ends! 
+ if memb.BEG ~= \e.RHS[e.DONE].END then next + # set none_found to indicate we've created a new edge + else none_found := &null + # create a new edge, having the LHS of e, the RHS of e, + # the start point of e, the end point of st, and one more + # confirmed RHS members than e + new_e := edge(e.LHS, fullcopy(e.RHS), + e.LEN, e.DONE+1, e.BEG, memb.END) + new_e.RHS[new_e.DONE] := memb + /new_e.BEG := memb.BEG + if new_e.LEN = new_e.DONE then { # it's inactive + insert(ch.inactive, new_e) + insert(ltbl[e.LHS], new_e) + if new_e.BEG = 1 & new_e.END = (*tokens+1) then { + if new_e.LHS == start_symbol # complete parse + then suspend retval(0, new_e) + } + } else { + put(ch.active, new_e) # it's active + active_modified := 1 + } + } + } + # restart if the ch.active list has been modified + if \active_modified then break next + } + } + +end + + +# +# tokenize: break up a sentence into constituent words, using spaces, +# tabs, and other punctuation as separators (we'll need to +# change this a bit later on to cover apostrophed words) +# +procedure tokenize(s) + + local l, word + + l := list() + s ? { + while tab(upto(&letters)) do + put(l, map(tab(many(&letters)))) + } + return l + +end + + +# +# edge2tree: edge -> tree +# e -> t +# +# where e is an edge structure (active or inactive; both are okay) +# where t is a tree like what's described in Ralph Griswold's +# structs library (IPL); I don't know about the 2nd ed. of +# Griswold & Griswold, but the structure is described in the 1st +# ed. in section 16.1 +# +# fails if, for some reason, the conversion can't be made (e.g. the +# edge structure has been screwed around with in some way) +# +procedure edge2tree(e) + + local memb, t + + t := [e.LHS] + \e.RHS | (return t) # a terminal + type(e) == "edge" | (return put(t, [])) # An incomplete edge + every memb := !e.RHS do # has daughters. 
+ put(t, edge2tree(memb)) + return t + +end + + +# +# bnf_file_2_edges: concatenate backslash-final lines & parse +# +procedure bnf_file_2_edges(f) + + local getline, line, macro_list, old, new, i + + macro_list := list() + getline := create stripcom(!f) + while line := @getline do { + while line ?:= 1(tab(-2) || tab(slshupto('\\')), pos(-1)) || @getline + line ? { + if ="define" then { + tab(many('\t ')) + old := tab(slshupto('\t ')) | + stop("bnf_file_2_edges", 7, tab(0)) + tab(many('\t ')) + new := tab(0) + (!macro_list)[1] == old & + stop("bnf_file_2_edges", 8, old) + put(macro_list, [old, new]) + next # go back to main loop + } + else { + every i := 1 to *macro_list do + # Replace is in the IPL (strings.icn). + line := replace(line, macro_list[i][1], macro_list[i][2]) + suspend bnf_2_edges(line) + } + } + } + +end + + +# +# bnf_2_edges: string -> edge records +# s -> Es (a generator) +# where s is a CFPSG rule in BNF form +# where Es are edges +# +procedure bnf_2_edges(s) + + local tmp, RHS, LHS + # + # Break BNF-style CFPSG rule into LHS and RHS. If there is more + # than one RHS (a la the | alternation op), suspend multiple re- + # sults. + # + s ? { + # tab upto the ::= sign + tmp := (tab(slshupto(':')) || ="::=") | p_err(s, 1) + # strip non-backslashed spaces, and extract LHS symbol + stripspaces(tmp) ? { + LHS := 1(tab(slashbal(':', '<', '>')), ="::=") | p_err(s, 1) + LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 2) + LHS == "" & p_err(s, 6) + } + every RHS := do_slash(tab(0) \ 1) do { + RHS := string_2_list(RHS) + suspend edge(LHS, RHS, *RHS, 0, &null, &null) + } + } + +end + + +# +# string_2_list: string -> list +# s -> L +# where L is a list of partially constructed (short) edges, having +# only LHS and RHS; in the case of nonterminals, the RHS is set +# to 1, while for terminals the RHS is null (and remains that way +# throughout the parse) +# +procedure string_2_list(s) + + local tmp, RHS_list, LHS + + (s || "\x00") ? 
{ + tab(many(' \t')) + pos(-1) & (return [short_edge("", &null)]) + RHS_list := list() + repeat { + tab(many(' \t')) + pos(-1) & break + if match("<") then { + tmp := ("" ~== tab(slashbal(&cset, '<', '>'))) | p_err(s, 4) + LHS := stripspaces(tmp) + LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 4) + LHS == "" & p_err(s, 10) + put(RHS_list, short_edge(LHS, 1)) + } else { + LHS := stripspaces(tab(slshupto(' <') | -1)) + slshupto('>', LHS) & p_err(s, 5) + put(RHS_list, short_edge(strip(LHS, '\\'), &null)) + } + } + } + return RHS_list + +end + + +# +# fullcopy: make full recursive copy of object +# +procedure fullcopy(obj) + + local retval, i, k + + case type(obj) of { + "co-expression" : return obj + "cset" : return obj + "file" : return obj + "integer" : return obj + "list" : { + retval := list(*obj) + every i := 1 to *obj do + retval[i] := fullcopy(obj[i]) + return retval + } + "null" : return &null + "procedure" : return obj + "real" : return obj + "set" : { + retval := set() + every insert(retval, fullcopy(!obj)) + return retval + } + "string" : return obj + "table" : { + retval := table(obj[[]]) + every k := key(obj) do + insert(retval, fullcopy(k), fullcopy(obj[k])) + return retval + } + # probably a record; if not, we're dealing with a new + # version of Icon or a nonstandard implementation, and + # we're screwed + default : { + retval := copy(obj) + every i := 1 to *obj do + retval[i] := fullcopy(obj[i]) + return retval + } + } + +end + + +# +# do_slash: string -> string(s) +# Given a|b suspend a then b. Used in conjunction with do_parends(). +# +procedure do_slash(s) + + local chunk + s ? { + while chunk := tab(slashbal('|', '(', ')')) do { + suspend do_parends(chunk) + move(1) + } + suspend do_parends(tab(0)) + } + +end + + +# +# do_parends: string -> string(s) +# Given a(b)c suspend abc; given a(b|c)d suspend abd and acd, etc. +# Used in conjuction with do_slash(). +# +procedure do_parends(s) + + local chunk, i, j + s ? 
{ + if not (i := slshupto('(')) then { + chunk := tab(0) + slshupto(')') & p_err(s, 8) + suspend chunk + } else { + j := i + slashbal(')', '(', ')', s[i+1:0]) | p_err(s, 9) + suspend tab(i) || + (move(1), do_slash(tab(j))) || + (move(1), do_parends(tab(0))) + } + } + +end + + +# +# p_err: print error message to stderr & abort +# +procedure p_err(s, n) + + local i, msg + static errlist + initial { + errlist := [[1, "malformed LHS"], + [2, "nonterminal lacks proper <> enclosure"], + [3, "missing left angle bracket"], + [4, "unmatched left angle bracket"], + [5, "unmatched right angle bracket"], + [6, "empty symbol in LHS"], + [7, "unable to open file"], + [8, "unmatched right parenthesis"], + [9, "unmatched left parenthesis"], + [10, "empty symbol in RHS"] + ] + } + every i := 1 to *errlist do + if errlist[i][1] = n then msg := errlist[i][2] + writes(&errout, "error ", n, " (", msg, ") in \n") + every write("\t", rewrap(s) | rewrap()) + exit(n) + +end + + +# +# Remove non-backslashed spaces and tabs. +# +procedure stripspaces(s) + + local s2 + + s2 := "" + s ? { + while s2 ||:= tab(slshupto(' \t')) do + tab(many(' \t')) + s2 ||:= tab(0) + } + + return s2 + +end |