# ipl/procs/sentence.icn

############################################################################
#
#	File:     sentence.icn
#
#	Subject:  Procedure to generate sentences in file
#
#	Author:   Richard L. Goerwitz
#
#	Date:     August 14, 1996
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#	Version:  1.2
#
############################################################################
#  
#  sentence(f)   - suspends sentences from file f
#
#  A lot of grammatical and stylistic analysis programs are predicated
#  on the notion of a sentence.  For instance, some programs count the
#  number of words in each sentence.  Others count the number and length
#  of clauses.  Still others pedantically check for sentence-final par-
#  ticles and prepositions.
#
#  This procedure, sentence(), is supposed to be used as a filter for
#  ASCII text files, suspending everything that looks remotely like a
#  sentence in them.
#
############################################################################
#
#  BUGS:  Cannot correctly parse sentences with constructs like "R. L.
#  Goerwitz" in them.  The algorithm can be much improved simply by
#  checking to see if the word after the period is in /usr/dict/words
#  or whatever your system dictionary file is.  If it isn't, then it's
#  likely not to be the beginning of a sentence (this also is not in-
#  fallible, naturally).
#
############################################################################
#
#  Requires:  co-expressions
#
############################################################################


#
#  sentence(intext) - generate the sentences found in text file intext.
#
#  Lines are obtained from read_line(intext) through a co-expression, so
#  that extra lines can be pulled in on demand while a candidate sentence
#  ending is being examined.  read_line() hands back each line stripped
#  of surrounding white space and normally terminated by one blank; it
#  hands back "" to signal a change of indentation, which is treated here
#  as a sentence break.
#
#  Results:
#    * every stretch of text that ends in '.', '!' or '?' (plus any
#      closing quotes/brackets) and is followed by two blanks, or by one
#      blank and a word starting with a capital letter or digit;
#    * the text accumulated so far whenever read_line() signals a break;
#    * whatever text is left over at end of input (returned, not
#      suspended, so the procedure cannot be resumed after it).
#  Empty strings are never produced: a break signal with nothing
#  accumulated is ignored, and the procedure fails at end of input if no
#  text is pending.
#
procedure sentence(intext)

    # "pending" holds text already read that has not yet been closed off
    # by a sentence ending.  (It deliberately does not reuse the name of
    # the procedure.)
    local pending, get_line, line, tmp_s, end_part, whole_thing
    static inits, punct
    initial {
	# Characters that may start the word following a sentence break.
	inits := &ucase ++ &digits
	# Characters that may appear in a sentence-final punctuation run.
	# Kept as a cset so upto()/many() need not convert it on each call.
	punct := '."\'!?)]'
    }
    pending := ""
    get_line := create read_line(intext)

    while line := @get_line do {

	# An empty line is read_line's signal that the indentation level
	# changed.  Treat it as a sentence break (it may only be a quote
	# or a section header, but those are sentence-like divisions
	# too).  Nothing is emitted if no text has accumulated.
	if line == "" then {
	    if *pending > 0 then suspend pending
	    pending := ""
	    next
	}

	# Keep splitting sentences off the front of line until no more
	# sentence endings can be found in it, then go get another line.
	repeat {

	    # Scan for a sentence break somewhere in line.
	    line ? {

		# tab(upto(punct)) is resumed by backtracking, so every
		# punctuation position in line is tried in turn:
		#   1. the punctuation run there must contain '.', '!'
		#      or '?';
		#   2. if only one character (the trailing blank) is
		#      left, there is not enough context: append the
		#      next input line and rescan from the top.  When no
		#      more input exists the append fails and the test
		#      simply continues;
		#   3. the run must be followed by a blank and then either
		#      a second blank, or optional opening quotes/parens
		#      and a capital letter or digit.
		# NOTE(review): if the line fetched in step 2 is the ""
		# break signal, it is appended as nothing and the signal
		# is lost; the break is then only detected if the text
		# after it passes step 3.
		if tmp_s := tab(upto(punct)) &
		    upto('!?.', end_part := tab(many(punct))) &
		    not (pos(-1), line ||:= @get_line, next) &
		    =" " & (=" " | (tab(many('\'"('))|&null,any(inits)))
		# IF YOU WANT TO ADD A DICTIONARY CHECK, then read in
		# a dictionary like /usr/dict/words, and then change
		# any(inits) above to something like (any(inits),
		# longstr(list_of_usrdictwords,map(&subject),&pos), =" ")
		# where longstr() matches each string in list_of_usr-
		# dictwords.
		then {

		    # Drop tiny hunks: anything of three characters or
		    # fewer that does not even contain a blank.
		    whole_thing := pending || tmp_s || end_part
		    if *whole_thing > 3 | find(" ",whole_thing)
		    then suspend whole_thing

		    # Continue with whatever follows the sentence.
		    tab(many(' '))
		    line := tab(0)
		    pending := ""
		    next
		}
		else break
	    }
	}

	# No (further) sentence ending in line: carry it over.
	pending ||:= line
    }

    # End of input.  Hand back any unterminated remainder; otherwise
    # fall off the end and fail.
    if *pending > 0 then return pending

end


#
#  read_line(intext) - generate the non-blank lines of intext, normalized
#  for sentence().
#
#  intext is anything the element-generation operator ! can walk line by
#  line: an open file, or a list of strings.
#
#  Each non-blank line is produced with leading and trailing blanks/tabs
#  removed and one blank appended, unless the line ends in "-", in which
#  case it is produced as is (dash kept, no blank) so the next line joins
#  directly onto it.
#
#  An empty string is produced, immediately before the line concerned, as
#  a break signal when
#    * the line is indented more deeply than the previous non-blank line
#      (the level before the first line counts as 0), or
#    * the line is indented less deeply than the previous non-blank line
#      and at least one blank line lies between the two.
#  Blank input lines themselves are never produced.
#
procedure read_line(intext)

    # All state is local, so several read_line generators can be active
    # at once (e.g. one per file) without disturbing one another.
    local new_line, ilevel, line, last_ilevel, blank_flag

    last_ilevel := 0

    # "every" drives the generator !intext to exhaustion, which reads a
    # file line by line and equally walks the elements of a list.
    every line := trim(!intext,'\t ') do {

	# Remember that a blank line was seen, but produce nothing for it.
	if line == "" then
	    { blank_flag := 1; next }

	# Determine current indentation level, counted in columns after
	# tabs have been expanded.
	detab(line) ? {
	    ilevel := *tab(many(' ')) | 0
	}

	# Signal the consumer if the indentation level changed.  This is
	# done outside any scanning expression so that resumption never
	# relies on scanning state surviving the suspension.
	if (ilevel > last_ilevel) | (ilevel < last_ilevel, \blank_flag)
	then suspend ""
	last_ilevel := ilevel

	# Make sure the flag that indicates blank lines is unset.
	blank_flag := &null

	line ? {

	    # Skip the indentation.
	    tab(many('\t '))

	    # Copy up to the last character, then either keep a final
	    # dash as is or keep the last character and add a blank.
	    new_line := tab(-1) || (="-" | (move(1) || " "))
	}

	# Produce the reformatted, trimmed, (usually) blank-terminated line.
	suspend new_line
    }

end