summaryrefslogtreecommitdiff
path: root/ipl/procs/ichartp.icn
diff options
context:
space:
mode:
Diffstat (limited to 'ipl/procs/ichartp.icn')
-rw-r--r--ipl/procs/ichartp.icn611
1 files changed, 611 insertions, 0 deletions
diff --git a/ipl/procs/ichartp.icn b/ipl/procs/ichartp.icn
new file mode 100644
index 0000000..b5968bd
--- /dev/null
+++ b/ipl/procs/ichartp.icn
@@ -0,0 +1,611 @@
+############################################################################
+#
+# File: ichartp.icn
+#
+# Subject: Procedures for a simple chart parser
+#
+# Author: Richard L. Goerwitz
+#
+# Date: August 3, 2000
+#
+############################################################################
+#
+# This file is in the public domain.
+#
+############################################################################
+#
+# Version: 1.11
+#
+############################################################################
+#
+# General:
+#
+# Ichartp implements a simple chart parser - a slow but
+# easy-to-implement strategy for parsing context free grammars (it
+# has a cubic worst-case time factor). Chart parsers are flexible
+# enough to handle a lot of natural language constructs. They also
+# lack many of the troubles associated with empty and left-recursive
+# derivations. To obtain a parse, just create a BNF file, obtain a
+# line of input, and then invoke parse_sentence(sentence,
+# bnf_filename, start-symbol). Parse_sentence suspends successive
+# edge structures corresponding to possible parses of the input
+# sentence. There is a routine called edge_2_tree() that converts
+# these edges to a more standard form. See the stub main() procedure
+# for an example of how to make use of all these facilities.
+#
+############################################################################
+#
+# Implementation details:
+#
+# The parser itself operates in bottom-up fashion, but it might
+# just as well have been coded top-down, or for that matter as a
+# combination bottom-up/top-down parser (chart parsers don't care).
+# The parser operates in breadth-first fashion, rather than walking
+# through each alternative until it is exhausted. As a result, there
+# tends to be a pregnant pause before any results appear, but when
+# they appear they come out in rapid succession. To use a depth-first
+# strategy, just change the "put" in "put(ch.active, new_e)" to read
+# "push." I haven't tried to do this, but it should be that simple
+# to implement.
+# BNFs are specified using the same notation used in Griswold &
+# Griswold, and as described in the IPL program "pargen.icn," with
+# the following difference: All metacharacters (space, tab, vertical
+# slash, right/left parends, brackets and angle brackets) are
+# converted to literals by prepending a backslash. Comments can be
+# include along with BNFs using the same notation as for Icon code
+# (i.e. #-sign).
+#
+############################################################################
+#
+# Gotchas:
+#
+# Pitfalls to be aware of include things like <L> ::= <L> | ha |
+# () (a weak attempt at a laugh recognizer). This grammar will
+# accept "ha," "ha ha," etc. but will suspend an infinite number of
+# possible parses. The right way to do this sort of thing is <L> ::=
+# ha <S> | ha, or if you really insist on having the empty string as
+# a possibility, try things like:
+#
+# <S> ::= () | <LAUGHS>
+# <LAUGHS> ::= ha <LAUGHS> | ha
+#
+# Of course, the whole problem of infinite parses can be avoided by
+# simply invoking the parser in a context where it is not going to
+# be resumed, or else one in which it will be resumed a finite number
+# of times.
+#
+############################################################################
+#
+# Motivation:
+#
+# I was reading Byte Magazine (vol. 17:2 [February, 1992]), and
+# ran into an article entitled "A Natural Solution" (pages 237-244)
+# in which a standard chart parser was described in terms of its C++
+# implementation. The author remarked at how his optimizations made
+# it possible to parse a 14-word sentence in only 32 seconds (versus
+# 146 for a straight Gazdar-Mellish LISP chart parser). 32 seconds
+# struck me as hardly anything to write home about, so I coded up a
+# quick system in Icon to see how it compared. This library is the
+# result.
+# I'm quite sure that this code could be very much improved upon.
+# As it stands, its performance seems as good as the C++ parser in
+# BYTE, if not better. It's hard to tell, though, seeing as I have
+# no idea what hardware the guy was using. I'd guess a 386 running
+# DOS. On a 386 running Xenix the Icon version beats the BYTE times
+# by a factor of about four. The Icon compiler creates an executable
+# that (in the above environment) parses 14-15 word sentences in
+# anywhere from 6 to 8 seconds. Once the BNF file is read, it does
+# short sentences in a second or two. If I get around to writing it,
+# I'll probably use the code here as the basic parsing engine for an
+# adventure game my son wants me to write.
+#
+############################################################################
+#
+# Links: trees, rewrap, scan, strip, stripcom, strings
+#
+############################################################################
+#
+# Requires: co-expressions
+#
+############################################################################
+#
+# Here's a sample BNF file (taken, modified, from the BYTE
+# Magazine article mentioned above). Note again the conventions a)
+# that nonterminals be enclosed in angle brackets & b) that overlong
+# lines be continued by terminating the preceding line with a
+# backslash. Although not illustrated below, the metacharacters <,
+# >, (, ), and | can all be escaped (i.e. can all have their special
+# meaning neutralized) with a backslash (e.g. \<). Comments can also
+# be included using the Icon #-notation. Empty symbols are illegal,
+# so if you want to specify a zero-derivation, use "()." There is an
+# example of this usage below.
+#
+# <S> ::= <NP> <VP> | <S> <CONJ> <S>
+# <VP> ::= <VP> <CONJ> <VP> | <IV> ( () | <PP> ) | \
+# <TV> ( <NP> | <NP> <PP> | <NP> <VP> | <REL> <S> )
+# <NP> ::= <DET> ( <NP> | <ADJ> <NP> | <ADJ> <NP> <PP> | <NP> <PP> ) | \
+# <ADJ> <NP> | <N> | <N> <CONJ> <N> | \
+# <NP> <CONJ> <NP>
+# <PP> ::= <P> ( <NP> | <ADJ> <NP> ) | <PP> <CONJ> <PP>
+# <ADJ> ::= <ADJ> <CONJ> <ADJ>
+# <CONJ> ::= and
+# <DET> ::= the | a | his | her
+# <NP> ::= her | he | they
+# <N> ::= nurse | nurses | book | books | travel | arrow | arrows | \
+# fortune | fortunes | report
+# <ADJ> ::= outrageous | silly | blue | green | heavy | white | red | \
+# black | yellow
+# <IV> ::= travel | travels | report | see | suffer
+# <TV> ::= hear | see | suffer
+# <P> ::= on | of
+# <REL> ::= that
+#
+############################################################################
+#
+# Addendum:
+#
+# Sometimes, when writing BNFs, one finds oneself repeatedly
+# writing the same things. In efforts to help eliminate the need for
+# doing this, I've written a simple macro facility. It involves one
+# reserved word: "define." Just make sure it begins a line. It
+# takes two arguments. The first is the macro. The second is its
+# expansion. The first argument must not contain any spaces. The
+# second, however, may. Here's an example:
+#
+# define <silluq-clause> ( <silluq-phrase> | \
+# <tifcha-silluq-clause> | \
+# <zaqef-silluq-clause> \
+# )
+#
+############################################################################
+
+link trees
+link scan
+link rewrap
+link strip
+link stripcom
+link strings
+
+record stats(edge_list, lhs_table, term_set)
+record chart(inactive, active) # inactive - set; active - list
+record retval(no, item)
+
+record edge(LHS, RHS, LEN, DONE, BEG, END, SEEN)
+record short_edge(LHS, RHS)
+
+#
+# For debugging only.
+#
+procedure main(a)
+
+ local res, filename, line
+ # &trace := -1
+ filename := \a[1] | "bnfs.byte"
+ while line := read(&input) do {
+ res := &null
+ every res := parse_sentence(line, filename, "S") do {
+ if res.no = 0 then
+ write(stree(edge2tree(res.item)))
+# write(ximage(res.item))
+ else if res.no = 1 then {
+ write("hmmm")
+ write(stree(edge2tree(res.item)))
+ }
+ }
+ /res & write("can't parse ",line)
+ }
+
+end
+
+
+#
+# parse_sentence: string x string -> edge records
+# (s, filename) -> Es
+# where s is a chunk of text presumed to constitute a sentence
+# where filename is the name of a grammar file containing BNFs
+# where Es are edge records containing possible parses of s
+#
+procedure parse_sentence(s, filename, start_symbol)
+
+ local file, e, i, elist, ltbl, tset, ch, tokens, st,
+ memb, new_e, token_set, none_found, active_modified
+ static master, old_filename
+ initial master := table()
+
+ #
+ # Initialize and store stats for filename (if not already stored).
+ #
+ if not (filename == \old_filename) then {
+ file := open(filename, "r") | p_err(filename, 7)
+ #
+ # Read BNFs from file; turn them into edge structs, and
+ # store them all in a list; insert terminal symbols into a set.
+ #
+ elist := list(); ltbl := table(); tset := set()
+ every e := bnf_file_2_edges(file) do {
+ put(elist, e) # main edge list (active)
+ (/ltbl[e.LHS] := set([e])) | insert(ltbl[e.LHS], e) # index LHSs
+ every i := 1 to e.LEN do # LEN holds length of e.RHS
+ if /e.RHS[i].RHS then # RHS for terminals is null
+ insert(tset, e.RHS[i].LHS)
+ }
+ insert(master, filename, stats(elist, ltbl, tset))
+ old_filename := filename
+ close(file)
+ }
+ elist := fullcopy(master[filename].edge_list)
+ ltbl := fullcopy(master[filename].lhs_table)
+ tset := master[filename].term_set
+
+ #
+ # Make edge list into the active section of chart; tokenize the
+ # sentence s & check for unrecognized terminals.
+ #
+ ch := chart(set(), elist)
+ tokens := tokenize(s)
+
+ #
+ # Begin parse by entering all tokens in s into the inactive set
+ # in the chart as edges with no RHS (a NULL RHS is characteristic
+ # of all terminals).
+ #
+ token_set := set(tokens)
+ every i := 1 to *tokens do {
+ # Flag words not in the grammar as errors.
+ if not member(tset, tokens[i]) then
+ suspend retval(1, tokens[i])
+ # Now, give us an inactive edge corresponding to word i.
+ insert(ch.inactive, e := edge(tokens[i], &null, 1, 1, i, i+1))
+ # Insert word i into the LHS table.
+ (/ltbl[tokens[i]] := set([e])) | insert(ltbl[tokens[i]], e)
+ # Watch out for those empty RHSs.
+ insert(ch.inactive, e := edge("", &null, 1, 1, i, i))
+ (/ltbl[""] := set([e])) | insert(ltbl[""], e)
+ }
+ *tokens = 0 & i := 0
+ insert(ch.inactive, e := edge("", &null, 1, 1, i+1, i+1))
+ (/ltbl[""] := set([e])) | insert(ltbl[""], e)
+
+ #
+ # Until no new active edges can be built, keep ploughing through
+ # the active edge list, trying to match unconfirmed members of their
+ # RHSs up with inactive edges.
+ #
+ until \none_found do {
+# write(ximage(ch))
+ none_found := 1
+ every e := !ch.active do {
+ active_modified := &null
+ # keep track of inactive edges we've already tried
+ /e.SEEN := set()
+ #
+ # e.RHS[e.DONE+1] is the first unconfirmed category in the
+ # RHS of e; ltbl[e.RHS[e.DONE+1].LHS] are all edges having
+ # as their LHS the LHS of the first unconfirmed category in
+ # e's RHS; we simply intersect this set with the inactives,
+ # and then subtract out those we've seen before in connec-
+ # tion with this edge -
+ #
+ if *(st := \ltbl[e.RHS[e.DONE+1].LHS] ** ch.inactive -- e.SEEN) > 0
+ then {
+ # record all the inactive edges being looked at as seen
+ e.SEEN ++:= st
+ every memb := !st do {
+ # make sure this inactive edge starts where the
+ # last confirmed edge in e.RHS ends!
+ if memb.BEG ~= \e.RHS[e.DONE].END then next
+ # set none_found to indicate we've created a new edge
+ else none_found := &null
+ # create a new edge, having the LHS of e, the RHS of e,
+ # the start point of e, the end point of st, and one more
+ # confirmed RHS members than e
+ new_e := edge(e.LHS, fullcopy(e.RHS),
+ e.LEN, e.DONE+1, e.BEG, memb.END)
+ new_e.RHS[new_e.DONE] := memb
+ /new_e.BEG := memb.BEG
+ if new_e.LEN = new_e.DONE then { # it's inactive
+ insert(ch.inactive, new_e)
+ insert(ltbl[e.LHS], new_e)
+ if new_e.BEG = 1 & new_e.END = (*tokens+1) then {
+ if new_e.LHS == start_symbol # complete parse
+ then suspend retval(0, new_e)
+ }
+ } else {
+ put(ch.active, new_e) # it's active
+ active_modified := 1
+ }
+ }
+ }
+ # restart if the ch.active list has been modified
+ if \active_modified then break next
+ }
+ }
+
+end
+
+
+#
+# tokenize: break up a sentence into constituent words, using spaces,
+# tabs, and other punctuation as separators (we'll need to
+# change this a bit later on to cover apostrophed words)
+#
+procedure tokenize(s)
+
+ local l, word
+
+ l := list()
+ s ? {
+ while tab(upto(&letters)) do
+ put(l, map(tab(many(&letters))))
+ }
+ return l
+
+end
+
+
+#
+# edge2tree: edge -> tree
+# e -> t
+#
+# where e is an edge structure (active or inactive; both are okay)
+# where t is a tree like what's described in Ralph Griswold's
+# structs library (IPL); I don't know about the 2nd ed. of
+# Griswold & Griswold, but the structure is described in the 1st
+# ed. in section 16.1
+#
+# fails if, for some reason, the conversion can't be made (e.g. the
+# edge structure has been screwed around with in some way)
+#
+procedure edge2tree(e)
+
+ local memb, t
+
+ t := [e.LHS]
+ \e.RHS | (return t) # a terminal
+ type(e) == "edge" | (return put(t, [])) # An incomplete edge
+ every memb := !e.RHS do # has daughters.
+ put(t, edge2tree(memb))
+ return t
+
+end
+
+
+#
+# bnf_file_2_edges: concatenate backslash-final lines & parse
+#
+procedure bnf_file_2_edges(f)
+
+ local getline, line, macro_list, old, new, i
+
+ macro_list := list()
+ getline := create stripcom(!f)
+ while line := @getline do {
+ while line ?:= 1(tab(-2) || tab(slshupto('\\')), pos(-1)) || @getline
+ line ? {
+ if ="define" then {
+ tab(many('\t '))
+ old := tab(slshupto('\t ')) |
+ stop("bnf_file_2_edges", 7, tab(0))
+ tab(many('\t '))
+ new := tab(0)
+ (!macro_list)[1] == old &
+ stop("bnf_file_2_edges", 8, old)
+ put(macro_list, [old, new])
+ next # go back to main loop
+ }
+ else {
+ every i := 1 to *macro_list do
+ # Replace is in the IPL (strings.icn).
+ line := replace(line, macro_list[i][1], macro_list[i][2])
+ suspend bnf_2_edges(line)
+ }
+ }
+ }
+
+end
+
+
+#
+# bnf_2_edges: string -> edge records
+# s -> Es (a generator)
+# where s is a CFPSG rule in BNF form
+# where Es are edges
+#
+procedure bnf_2_edges(s)
+
+ local tmp, RHS, LHS
+ #
+ # Break BNF-style CFPSG rule into LHS and RHS. If there is more
+ # than one RHS (a la the | alternation op), suspend multiple re-
+ # sults.
+ #
+ s ? {
+ # tab upto the ::= sign
+ tmp := (tab(slshupto(':')) || ="::=") | p_err(s, 1)
+ # strip non-backslashed spaces, and extract LHS symbol
+ stripspaces(tmp) ? {
+ LHS := 1(tab(slashbal(':', '<', '>')), ="::=") | p_err(s, 1)
+ LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 2)
+ LHS == "" & p_err(s, 6)
+ }
+ every RHS := do_slash(tab(0) \ 1) do {
+ RHS := string_2_list(RHS)
+ suspend edge(LHS, RHS, *RHS, 0, &null, &null)
+ }
+ }
+
+end
+
+
+#
+# string_2_list: string -> list
+# s -> L
+# where L is a list of partially constructed (short) edges, having
+# only LHS and RHS; in the case of nonterminals, the RHS is set
+# to 1, while for terminals the RHS is null (and remains that way
+# throughout the parse)
+#
+procedure string_2_list(s)
+
+ local tmp, RHS_list, LHS
+
+ (s || "\x00") ? {
+ tab(many(' \t'))
+ pos(-1) & (return [short_edge("", &null)])
+ RHS_list := list()
+ repeat {
+ tab(many(' \t'))
+ pos(-1) & break
+ if match("<") then {
+ tmp := ("" ~== tab(slashbal(&cset, '<', '>'))) | p_err(s, 4)
+ LHS := stripspaces(tmp)
+ LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 4)
+ LHS == "" & p_err(s, 10)
+ put(RHS_list, short_edge(LHS, 1))
+ } else {
+ LHS := stripspaces(tab(slshupto(' <') | -1))
+ slshupto('>', LHS) & p_err(s, 5)
+ put(RHS_list, short_edge(strip(LHS, '\\'), &null))
+ }
+ }
+ }
+ return RHS_list
+
+end
+
+
+#
+# fullcopy: make full recursive copy of object
+#
+procedure fullcopy(obj)
+
+ local retval, i, k
+
+ case type(obj) of {
+ "co-expression" : return obj
+ "cset" : return obj
+ "file" : return obj
+ "integer" : return obj
+ "list" : {
+ retval := list(*obj)
+ every i := 1 to *obj do
+ retval[i] := fullcopy(obj[i])
+ return retval
+ }
+ "null" : return &null
+ "procedure" : return obj
+ "real" : return obj
+ "set" : {
+ retval := set()
+ every insert(retval, fullcopy(!obj))
+ return retval
+ }
+ "string" : return obj
+ "table" : {
+ retval := table(obj[[]])
+ every k := key(obj) do
+ insert(retval, fullcopy(k), fullcopy(obj[k]))
+ return retval
+ }
+ # probably a record; if not, we're dealing with a new
+ # version of Icon or a nonstandard implementation, and
+ # we're screwed
+ default : {
+ retval := copy(obj)
+ every i := 1 to *obj do
+ retval[i] := fullcopy(obj[i])
+ return retval
+ }
+ }
+
+end
+
+
+#
+# do_slash: string -> string(s)
+# Given a|b suspend a then b. Used in conjunction with do_parends().
+#
+procedure do_slash(s)
+
+ local chunk
+ s ? {
+ while chunk := tab(slashbal('|', '(', ')')) do {
+ suspend do_parends(chunk)
+ move(1)
+ }
+ suspend do_parends(tab(0))
+ }
+
+end
+
+
+#
+# do_parends: string -> string(s)
+# Given a(b)c suspend abc; given a(b|c)d suspend abd and acd, etc.
+# Used in conjuction with do_slash().
+#
+procedure do_parends(s)
+
+ local chunk, i, j
+ s ? {
+ if not (i := slshupto('(')) then {
+ chunk := tab(0)
+ slshupto(')') & p_err(s, 8)
+ suspend chunk
+ } else {
+ j := i + slashbal(')', '(', ')', s[i+1:0]) | p_err(s, 9)
+ suspend tab(i) ||
+ (move(1), do_slash(tab(j))) ||
+ (move(1), do_parends(tab(0)))
+ }
+ }
+
+end
+
+
+#
+# p_err: print error message to stderr & abort
+#
+procedure p_err(s, n)
+
+ local i, msg
+ static errlist
+ initial {
+ errlist := [[1, "malformed LHS"],
+ [2, "nonterminal lacks proper <> enclosure"],
+ [3, "missing left angle bracket"],
+ [4, "unmatched left angle bracket"],
+ [5, "unmatched right angle bracket"],
+ [6, "empty symbol in LHS"],
+ [7, "unable to open file"],
+ [8, "unmatched right parenthesis"],
+ [9, "unmatched left parenthesis"],
+ [10, "empty symbol in RHS"]
+ ]
+ }
+ every i := 1 to *errlist do
+ if errlist[i][1] = n then msg := errlist[i][2]
+ writes(&errout, "error ", n, " (", msg, ") in \n")
+ every write("\t", rewrap(s) | rewrap())
+ exit(n)
+
+end
+
+
+#
+# Remove non-backslashed spaces and tabs.
+#
+procedure stripspaces(s)
+
+ local s2
+
+ s2 := ""
+ s ? {
+ while s2 ||:= tab(slshupto(' \t')) do
+ tab(many(' \t'))
+ s2 ||:= tab(0)
+ }
+
+ return s2
+
+end