############################################################################
#
#	File:     ichartp.icn
#
#	Subject:  Procedures for a simple chart parser
#
#	Author:   Richard L. Goerwitz
#
#	Date:	  August 3, 2000
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#	Version:  1.11
#
############################################################################
#
#  General:
#
#      Ichartp implements a simple chart parser - a slow but
#  easy-to-implement strategy for parsing context-free grammars (it
#  has cubic worst-case time complexity).  Chart parsers are flexible
#  enough to handle a lot of natural language constructs.  They also
#  lack many of the troubles associated with empty and left-recursive
#  derivations.  To obtain a parse, just create a BNF file, obtain a
#  line of input, and then invoke parse_sentence(sentence,
#  bnf_filename, start_symbol).  Parse_sentence suspends successive
#  retval records; a record whose .no field is 0 holds, in its .item
#  field, an edge structure corresponding to a possible parse of the
#  input sentence, while a .no of 1 flags a word that does not occur
#  in the grammar.  There is a routine called edge2tree() that converts
#  these edges to a more standard form.  See the stub main() procedure
#  for an example of how to make use of all these facilities.
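#
#      A minimal sketch (the grammar file name "bnfs.byte" and the
#  start symbol "S" are the defaults used by the stub main() below):
#
#      every r := parse_sentence("they travel", "bnfs.byte", "S") do
#          if r.no = 0 then write(stree(edge2tree(r.item)))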
#
############################################################################
#
#  Implementation details:
#
#      The parser itself operates in bottom-up fashion, but it might
#  just as well have been coded top-down, or for that matter as a
#  combination bottom-up/top-down parser (chart parsers don't care).
#  The parser operates in breadth-first fashion, rather than walking
#  through each alternative until it is exhausted.  As a result, there
#  tends to be a pregnant pause before any results appear, but when
#  they appear they come out in rapid succession.  To use a depth-first
#  strategy, just change the "put" in "put(ch.active, new_e)" to read
#  "push."  I haven't tried to do this, but it should be that simple
#  to implement.
#      BNFs are specified using the same notation used in Griswold &
#  Griswold, and as described in the IPL program "pargen.icn," with
#  the following difference:  All metacharacters (space, tab, the
#  vertical bar, left/right parentheses, and angle brackets) are
#  converted to literals by prepending a backslash.  Comments can be
#  included along with BNFs using the same notation as for Icon code
#  (i.e. the #-sign).
#
############################################################################
#
#  Gotchas:
#
#      Pitfalls to be aware of include things like <L> ::= <L> | ha |
#  () (a weak attempt at a laugh recognizer).  This grammar will
#  accept "ha," "ha ha," etc. but will suspend an infinite number of
#  possible parses.  The right way to do this sort of thing is <L> ::=
#  ha <S> | ha, or if you really insist on having the empty string as
#  a possibility, try things like:
#
#          <S>      ::= () | <LAUGHS>
#          <LAUGHS> ::= ha <LAUGHS> | ha
#
#  Of course, the whole problem of infinite parses can be avoided by
#  simply invoking the parser in a context where it is not going to
#  be resumed, or else one in which it will be resumed a finite number
#  of times.
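#
#      For instance, Icon's limitation control structure provides such
#  a context (a sketch; "line" and the grammar file "bnfs.byte" are as
#  in the stub main() below):
#
#      every r := parse_sentence(line, "bnfs.byte", "S") \ 5 do
#          if r.no = 0 then write(stree(edge2tree(r.item)))
#
#  Here the parser is resumed at most five times.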
#
############################################################################
#
#  Motivation:
#
#      I was reading Byte Magazine (vol. 17:2 [February, 1992]), and
#  ran into an article entitled "A Natural Solution" (pages 237-244)
#  in which a standard chart parser was described in terms of its C++
#  implementation.  The author remarked on how his optimizations made
#  it possible to parse a 14-word sentence in only 32 seconds (versus
#  146 for a straight Gazdar-Mellish LISP chart parser).  32 seconds
#  struck me as hardly anything to write home about, so I coded up a
#  quick system in Icon to see how it compared.  This library is the
#  result.
#      I'm quite sure that this code could be very much improved upon.
#  As it stands, its performance seems as good as the C++ parser in
#  BYTE, if not better.  It's hard to tell, though, seeing as I have
#  no idea what hardware the guy was using.  I'd guess a 386 running
#  DOS.  On a 386 running Xenix the Icon version beats the BYTE times
#  by a factor of about four.  The Icon compiler creates an executable
#  that (in the above environment) parses 14-15 word sentences in
#  anywhere from 6 to 8 seconds.  Once the BNF file is read, it does
#  short sentences in a second or two.  If I get around to writing it,
#  I'll probably use the code here as the basic parsing engine for an
#  adventure game my son wants me to write.
#
############################################################################
#
#  Links: trees, rewrap, scan, strip, stripcom, strings
#
############################################################################
#
#  Requires:  co-expressions
#
############################################################################
#
#       Here's a sample BNF file (taken, modified, from the BYTE
#  Magazine article mentioned above).  Note again the conventions a)
#  that nonterminals be enclosed in angle brackets & b) that overlong
#  lines be continued by terminating the preceding line with a
#  backslash.  Although not illustrated below, the metacharacters <,
#  >, (, ), and | can all be escaped (i.e. can all have their special
#  meaning neutralized) with a backslash (e.g. \<).  Comments can also
#  be included using the Icon #-notation.  Empty symbols are illegal,
#  so if you want to specify a zero-derivation, use "()."  There is an
#  example of this usage below.
#
#  <S>    ::= <NP> <VP> | <S> <CONJ> <S>
#  <VP>   ::= <VP> <CONJ> <VP> | <IV> ( () | <PP> ) | \
#  	   <TV> ( <NP> | <NP> <PP> | <NP> <VP> | <REL> <S> )
#  <NP>   ::= <DET> ( <NP> | <ADJ> <NP> | <ADJ> <NP> <PP> | <NP> <PP> ) | \
#  	   <ADJ> <NP> | <N> | <N> <CONJ> <N> | \
#  	   <NP> <CONJ> <NP>
#  <PP>   ::= <P> ( <NP> | <ADJ> <NP> ) | <PP> <CONJ> <PP>
#  <ADJ>  ::= <ADJ> <CONJ> <ADJ>
#  <CONJ> ::= and
#  <DET>  ::= the | a | his | her
#  <NP>   ::= her | he | they
#  <N>    ::= nurse | nurses | book | books | travel | arrow | arrows | \
#  	  fortune | fortunes | report
#  <ADJ>  ::= outrageous | silly | blue | green | heavy | white | red | \
#  	  black | yellow
#  <IV>   ::= travel | travels | report | see | suffer
#  <TV>   ::= hear | see | suffer
#  <P>    ::= on | of
#  <REL>  ::= that
#
############################################################################
#
#  Addendum:
#
#      Sometimes, when writing BNFs, one finds oneself repeatedly
#  writing the same things.  In an effort to eliminate the need for
#  doing this, I've written a simple macro facility.  It involves one
#  reserved word:  "define."  Just make sure it begins a line.  It
#  takes two arguments.  The first is the macro.  The second is its
#  expansion.  The first argument must not contain any spaces.  The
#  second, however, may.  Here's an example:
#
#      define <silluq-clause>    (   <silluq-phrase> | \
#                                    <tifcha-silluq-clause> | \
#                                    <zaqef-silluq-clause> \
#                                )
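#
#  Thereafter, every occurrence of the string "<silluq-clause>" in a
#  BNF line is replaced by the parenthesized group above before the
#  line is parsed, so that a (hypothetical) rule such as
#
#      <verse> ::= <silluq-clause>
#
#  is read as though the full alternation had been written out in
#  place.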
#
############################################################################

link trees
link scan
link rewrap
link strip
link stripcom
link strings

record stats(edge_list, lhs_table, term_set)
record chart(inactive, active)               # inactive - set; active - list
record retval(no, item)

record edge(LHS, RHS, LEN, DONE, BEG, END, SEEN)
record short_edge(LHS, RHS)

#
# For debugging only.
#
procedure main(a)

    local res, filename, line
    # &trace := -1
    filename := \a[1] | "bnfs.byte"
    while line := read(&input) do {
	res := &null
        every res := parse_sentence(line, filename, "S") do {
            if res.no = 0 then
	        write(stree(edge2tree(res.item)))
#	        write(ximage(res.item))
	    else if res.no = 1 then {
		write("hmmm")
		write(stree(edge2tree(res.item)))
	    }
        }
	/res & write("can't parse ",line)
    }

end


#
# parse_sentence:  string x string x string -> retval records
#                  (s, filename, start_symbol) -> Rs
#     where s is a chunk of text presumed to constitute a sentence
#     where filename is the name of a grammar file containing BNFs
#     where start_symbol is the nonterminal to parse s as (e.g. "S")
#     where Rs are retval records; a .no field of 0 means .item holds
#     an edge record containing a possible parse of s, while a .no of
#     1 means .item is a word in s that does not occur in the grammar
#
procedure parse_sentence(s, filename, start_symbol)

    local file, e, i, elist, ltbl, tset, ch, tokens, st, 
        memb, new_e, token_set, none_found, active_modified
    static master, old_filename
    initial master := table()

    #
    # Initialize and store stats for filename (if not already stored).
    #
    if not (filename == \old_filename) then {
        file := open(filename, "r") | p_err(filename, 7)
        #
        # Read BNFs from file; turn them into edge structs, and
        # store them all in a list; insert terminal symbols into a set.
        #
        elist := list(); ltbl := table(); tset := set()
        every e := bnf_file_2_edges(file) do {
            put(elist, e)                      # main edge list (active)
            (/ltbl[e.LHS] := set([e])) | insert(ltbl[e.LHS], e) # index LHSs
            every i := 1 to e.LEN do           # LEN holds length of e.RHS
                if /e.RHS[i].RHS then          # RHS for terminals is null
                    insert(tset, e.RHS[i].LHS)
        }
        insert(master, filename, stats(elist, ltbl, tset))
        old_filename := filename
        close(file)
    }
    elist := fullcopy(master[filename].edge_list)
    ltbl  := fullcopy(master[filename].lhs_table)
    tset  := master[filename].term_set
    
    #
    # Make edge list into the active section of chart; tokenize the
    # sentence s & check for unrecognized terminals.
    #
    ch := chart(set(), elist)
    tokens := tokenize(s)

    #
    # Begin parse by entering all tokens in s into the inactive set
    # in the chart as edges with no RHS (a NULL RHS is characteristic
    # of all terminals).
    #
    token_set := set(tokens)
    every i := 1 to *tokens do {
        # Flag words not in the grammar as errors.
        if not member(tset, tokens[i]) then
            suspend retval(1, tokens[i])
        # Now, give us an inactive edge corresponding to word i.
        insert(ch.inactive, e := edge(tokens[i], &null, 1, 1, i, i+1))
        # Insert word i into the LHS table.
        (/ltbl[tokens[i]] := set([e])) | insert(ltbl[tokens[i]], e)
	# Watch out for those empty RHSs.
	insert(ch.inactive, e := edge("", &null, 1, 1, i, i))
        (/ltbl[""] := set([e])) | insert(ltbl[""], e)
    }
    *tokens = 0 & i := 0
    insert(ch.inactive, e := edge("", &null, 1, 1, i+1, i+1))
    (/ltbl[""] := set([e])) | insert(ltbl[""], e)

    #
    # Until no new active edges can be built, keep ploughing through
    # the active edge list, trying to match unconfirmed members of their
    # RHSs up with inactive edges.
    #
    until \none_found do {
#	write(ximage(ch))
        none_found := 1
        every e := !ch.active do {
            active_modified := &null
            # keep track of inactive edges we've already tried
            /e.SEEN := set()
            #
            # e.RHS[e.DONE+1] is the first unconfirmed category in the
            # RHS of e; ltbl[e.RHS[e.DONE+1].LHS] are all edges having
            # as their LHS the LHS of the first unconfirmed category in
            # e's RHS; we simply intersect this set with the inactives,
            # and then subtract out those we've seen before in connec-
            # tion with this edge -
            #
            if *(st := \ltbl[e.RHS[e.DONE+1].LHS] ** ch.inactive -- e.SEEN) > 0
            then {
                # record all the inactive edges being looked at as seen
                e.SEEN ++:= st
                every memb := !st do {
		    # make sure this inactive edge starts where the
		    # last confirmed edge in e.RHS ends!
		    if memb.BEG ~= \e.RHS[e.DONE].END then next
		    # set none_found to indicate we've created a new edge
		    else none_found := &null
                    # create a new edge, having the LHS of e, the RHS of e,
                    # the start point of e, the end point of st, and one more
                    # confirmed RHS members than e
                    new_e := edge(e.LHS, fullcopy(e.RHS),
				  e.LEN, e.DONE+1, e.BEG, memb.END)
                    new_e.RHS[new_e.DONE] := memb
                    /new_e.BEG := memb.BEG
                    if new_e.LEN = new_e.DONE then {      # it's inactive
                        insert(ch.inactive, new_e)
                        insert(ltbl[e.LHS], new_e)
                        if new_e.BEG = 1 & new_e.END = (*tokens+1) then {
                            if new_e.LHS == start_symbol  # complete parse
                            then suspend retval(0, new_e)
                        }
                    } else {
                        put(ch.active, new_e)            # it's active
                        active_modified := 1
                    }
                }
            }
            # restart if the ch.active list has been modified
            if \active_modified then break next
        }
    }

end


#
# tokenize:  break up a sentence into constituent words, mapping them
#            to lower case and treating all non-letter characters as
#            separators (we'll need to change this a bit later on to
#            cover apostrophed words)
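#
#            e.g. (illustrative only) tokenize("The nurse, she suffers!")
#            returns the list ["the", "nurse", "she", "suffers"]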
#
procedure tokenize(s)

    local l, word

    l := list()
    s ? {
        while tab(upto(&letters)) do
            put(l, map(tab(many(&letters))))
    }
    return l

end


#
# edge2tree:  edge -> tree
#             e -> t
#
#    where e is an edge structure (active or inactive; both are okay)
#    where t is a tree like what's described in Ralph Griswold's
#    structs library (IPL); I don't know about the 2nd ed. of
#    Griswold & Griswold, but the structure is described in the 1st
#    ed. in section 16.1
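#
#    Concretely, t is a list whose first element is the node's label
#    (the edge's LHS) and whose remaining elements are the subtrees
#    for the confirmed RHS members.  E.g. one parse of "they travel"
#    under the sample grammar above comes out roughly as
#
#        ["S", ["NP", ["they"]], ["VP", ["IV", ["travel"]]]]
#
#    which stree() (from the trees library) renders as a string.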
#
#    fails if, for some reason, the conversion can't be made (e.g. the
#    edge structure has been screwed around with in some way)
#
procedure edge2tree(e)

    local memb, t

    t := [e.LHS]
    \e.RHS | (return t)                                 # a terminal
    type(e) == "edge" | (return put(t, []))             # An incomplete edge
    every memb := !e.RHS do                             # has daughters.
	put(t, edge2tree(memb))
    return t

end


#
# bnf_file_2_edges: concatenate backslash-final lines & parse
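#    (each assembled line has any "define" macros from the Addendum
#    above expanded before it is handed to bnf_2_edges)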
#
procedure bnf_file_2_edges(f)

    local getline, line, macro_list, old, new, i

    macro_list := list()
    getline := create stripcom(!f)
    while line := @getline do {
        while line ?:= 1(tab(-2) || tab(slshupto('\\')), pos(-1)) || @getline
	line ? {
	    if ="define" then {
		tab(many('\t '))
		old := tab(slshupto('\t ')) |
		    stop("bnf_file_2_edges", 7, tab(0))
		tab(many('\t '))
		new := tab(0)
		(!macro_list)[1] == old &
		    stop("bnf_file_2_edges", 8, old)
		put(macro_list, [old, new])
		next		# go back to main loop
	    }
	    else {
		every i := 1 to *macro_list do
		    # Replace is in the IPL (strings.icn).
		    line := replace(line, macro_list[i][1], macro_list[i][2])
		suspend bnf_2_edges(line)
	    }
	}
    }

end


#
# bnf_2_edges: string -> edge records
#              s -> Es (a generator)
#    where s is a CFPSG rule in BNF form
#    where Es are edges
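#
#    e.g. bnf_2_edges("<DET> ::= the | a") suspends two edges,
#    edge("DET", [short_edge("the", &null)], 1, 0, &null, &null) and
#    the corresponding edge for "a" (the BEG and END fields are only
#    filled in as the parse proceeds)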
#
procedure bnf_2_edges(s)
    
    local tmp, RHS, LHS
    #
    # Break BNF-style CFPSG rule into LHS and RHS.  If there is more
    # than one RHS (a la the | alternation op), suspend multiple re-
    # sults.
    #
    s ? {
	# tab upto the ::= sign
	tmp := (tab(slshupto(':')) || ="::=") | p_err(s, 1)
	# strip non-backslashed spaces, and extract LHS symbol
	stripspaces(tmp) ? {
	    LHS := 1(tab(slashbal(':', '<', '>')), ="::=") | p_err(s, 1)
	    LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 2)
	    LHS == "" & p_err(s, 6)
	}
        every RHS := do_slash(tab(0) \ 1) do {
            RHS := string_2_list(RHS)
            suspend edge(LHS, RHS, *RHS, 0, &null, &null)
        }
    }

end


#
# string_2_list:  string -> list
#                 s -> L
#    where L is a list of partially constructed (short) edges, having
#    only LHS and RHS; in the case of nonterminals, the RHS is set
#    to 1, while for terminals the RHS is null (and remains that way
#    throughout the parse)
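#
#    e.g. string_2_list("ha <S>") returns the list
#    [short_edge("ha", &null), short_edge("S", 1)]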
#
procedure string_2_list(s)

    local tmp, RHS_list, LHS

    (s || "\x00") ? {
	tab(many(' \t'))
        pos(-1) & (return [short_edge("", &null)])
        RHS_list := list()
        repeat {
	    tab(many(' \t'))
	    pos(-1) & break
            if match("<") then {
                tmp := ("" ~== tab(slashbal(&cset, '<', '>'))) | p_err(s, 4)
		LHS := stripspaces(tmp)
                LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 4)
		LHS == "" & p_err(s, 10)
                put(RHS_list, short_edge(LHS, 1))
            } else {
                LHS := stripspaces(tab(slshupto(' <') | -1))
                slshupto('>', LHS) & p_err(s, 5)
                put(RHS_list, short_edge(strip(LHS, '\\'), &null))
            }
        }
    }
    return RHS_list

end


#
# fullcopy:  make full recursive copy of object
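#    (unlike the built-in copy(), which copies a structure only one
#    level deep, this descends into nested lists, sets, tables, and
#    record fields; non-structure values are returned unchanged)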
#
procedure fullcopy(obj)

    local retval, i, k

    case type(obj) of {
        "co-expression"  : return obj
        "cset"           : return obj
        "file"           : return obj
        "integer"        : return obj
        "list"           : {
            retval := list(*obj)
            every i := 1 to *obj do
                retval[i] := fullcopy(obj[i])
            return retval
        }
        "null"           :  return &null
        "procedure"      :  return obj
        "real"           :  return obj
        "set"            :  {
            retval := set()
            every insert(retval, fullcopy(!obj))
            return retval
        }
        "string"         :  return obj
        "table"          :  {
            retval := table(obj[[]])
            every k := key(obj) do
                insert(retval, fullcopy(k), fullcopy(obj[k]))
            return retval
        }
        # probably a record; if not, we're dealing with a new
        # version of Icon or a nonstandard implementation, and
	# we're screwed
        default          :  {
            retval := copy(obj)
            every i := 1 to *obj do
                retval[i] := fullcopy(obj[i])
            return retval
        }
    }

end


#
# do_slash:  string -> string(s)
#     Given a|b suspend a then b.  Used in conjunction with do_parends().
#
procedure do_slash(s)

    local chunk
    s ? {
	while chunk := tab(slashbal('|', '(', ')')) do {
	    suspend do_parends(chunk)
	    move(1)
	}
	suspend do_parends(tab(0))
    }

end


#
# do_parends:  string -> string(s)
#    Given a(b)c suspend abc; given a(b|c)d suspend abd and acd, etc.
#    Used in conjunction with do_slash().
#
procedure do_parends(s)

    local chunk, i, j
    s ? {
	if not (i := slshupto('(')) then {
	    chunk := tab(0)
	    slshupto(')') & p_err(s, 8)
	    suspend chunk
	} else {
	    j := i + slashbal(')', '(', ')', s[i+1:0]) | p_err(s, 9)
	    suspend tab(i) ||
		(move(1), do_slash(tab(j))) ||
		(move(1), do_parends(tab(0)))
	}
    }

end


#
# p_err:  print error message to stderr & abort
#
procedure p_err(s, n)

    local i, msg
    static errlist
    initial {
        errlist := [[1,  "malformed LHS"],
                    [2,  "nonterminal lacks proper <> enclosure"],
                    [3,  "missing left angle bracket"],
                    [4,  "unmatched left angle bracket"],
                    [5,  "unmatched right angle bracket"],
		    [6,  "empty symbol in LHS"],
                    [7,  "unable to open file"],
                    [8,  "unmatched right parenthesis"],
                    [9,  "unmatched left parenthesis"],
                    [10, "empty symbol in RHS"]
                   ]
    }
    every i := 1 to *errlist do
        if errlist[i][1] = n then msg := errlist[i][2]
    writes(&errout, "error ", n, " (", msg, ") in \n")
    every write("\t", rewrap(s) | rewrap())
    exit(n)

end


#
# Remove non-backslashed spaces and tabs.
#
procedure stripspaces(s)

    local s2

    s2 := ""
    s ? {
        while s2 ||:= tab(slshupto(' \t')) do
            tab(many(' \t'))
        s2 ||:= tab(0)
    }

    return s2

end