ipl/procs/senten1.icn


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236

############################################################################
#
#	File:     senten1.icn
#
#	Subject:  Procedure to generate sentences
#
#	Author:   Peter A. Bigot
#
#	Date:     August 14, 1996
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
# sentence(f) generates the English sentences encountered in a file.
#
############################################################################
#
# The following rules describe what a 'sentence' is.
# 
# * A sentence begins with a capital letter.
# 
# * A sentence ends with one or more of '.!?', subject to other
#   constraints.
# 
# * If a period is immediately followed by:
#   - a digit
#   - a letter
#   - one of ',;:'
#   it is not a sentence end.
# 
# * If a period is followed (with intervening space) by a lower case
#   letter, it is not a sentence end (assume it's part of an abbreviation).
#
# * The sequence '...' does not end a sentence.  The sequence '....' does.
# 
# * If a sentence end character appears after more opening parens than
#   closing parens in a given sequence, it is not the end of that
#   particular sentence. (I.e., full sentences in a parenthetical remark
#   in an enclosing sentence are considered part of the enclosing
#   sentence.  Their grammaticality is in question, anyway.) (It also
#   helps with attributions and abbreviations that would fail outside
#   the parens.)
#
# * No attempt is made to ensure balancing of double-quoted (") material.
# 
# * When scanning for a sentence start, material which does not conform is
#   discarded.
# 
# * Corollary: Quotes or parentheses which enclose a sentence are not
#   considered part of it.
# 
# * An end-of-line on input is replaced by a space unless the last
#   character of the line is 'a-' (where 'a' is any letter), in which case
#   the hyphen is deleted.
#
# * Leading and trailing space (tab, space, newline) chars are removed
#   from each line of the input.
#
# * If a blank line is encountered on input while scanning a sentence,
#   the scan is aborted and search for a new sentence begins (rationale:
#   ignore section and chapter headers separated from text by newlines).
#
# * Most titles before names would fail the above constraints.  They are
#   special-cased.
#
# * This does NOT handle a person's middle initial (e.g. 'John Q. Public'):
#   the period after the initial is taken as a sentence end.  Handling it
#   would rule out sentences such as 'It was I.'  Six of one, half-dozen
#   of the other--I made my choice.
#
# * Note that ':' does not end a sentence.  This is a stylistic choice,
#   and can be modified by simply adding ':' to sentend below.
#
############################################################################

procedure sentence (infile)
   local
      line,                     # Line read from input, beginning could be sent.
      sentence,                 # A possible sentence
      lstend,                   # Position in line of last checked sentence end
      possentp,                 # Boolean: non-null if line mod context =  sent.
      spaceskip,                # Spaces betwen EOSent and next char (context)
      nextch,                   # Next char after EOSent
      cnt,                       # Balanced count of parens in possible sent.
      t,
      newline
   static
      sentend,                  # Cset for sentence end chars
      wspace,                   # White space characters
      noperend,                 # Chars which, after period, don't end sentence
      titles                    # Titles that can appear before names.
   initial {
      sentend := '.?!'          # Initial value for sentend
      wspace := ' \t\n'         # Space chars
      noperend := &digits ++ &letters ++ ',:;' # No-end after period chars
      titles := ["Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Pres."]
      }

   line := ""
   # Repeat scanning for and suspending sentences until input fails.
   repeat {
      # Try to find the start of a sentence in the current input string.
      # If there are none, read more from file; fail if file exhausted.
      # Trim trailing space from line (leading skipped by sentence start)
      while not (line ?:= (tab (upto (&ucase)) & tab (0))) do {
         line := trim (read (infile), wspace) | fail
         }

      # Find the sentence end.  If there's no viable candidate, read more
      # from input.  Set the last end position to the first char in the
      # sentence.
      lstend := 1
      possentp := &null
      repeat {
         line ? {
            # Skip up to new stuff (scanned in previous lines).
            sentence := tab (lstend)
            while sentence ||:= tab (upto (sentend)) do {
               sentence ||:= tab (many (sentend))
               
               # Verify end-of-sentence.  Assume it doesn't pass.
               possentp := &null
               
               # Check for sentence end conformance.  See what follows it: put
               # that in nextch, and the intervening space before it in
               # spaceskip.
               # Note hack to scan in remainder of line w/o changing &pos.
               nextch := &null
               every tab (0) ? {
                  spaceskip := tab (many (wspace)) | ""
                  nextch := move (1)
                  }
                  
               if /nextch then {
                  # Don't have enough context to ensure a proper sentence end.
                  # Read more, but let readers know that this could be a
                  # sentence end (e.g., in case of EOF on input).
                  possentp := 1
                  break
                  }
               
               # Save position of last checked sentence end, so we don't try to
               # recheck this one.
               lstend := &pos
               
               # .<noperend> doesn't end a sentence.
               if (sentence [-1] == '.' &
                   spaceskip == "" &
                   any (noperend, nextch)) then {
                  next
                  }
               
               # .<spc><lcase> doesn't end sentence
               if (sentence [-1] == '.' &
                   any (&lcase, nextch)) then {
                  next
                  }

               # ... doesn't end sentence. .... does.
               if (sentence [-3:0] == "..." &
                   sentence [-4] ~== ".") then {
                  next
                  }

               # Number of ')' must be >= number '(' in sentence.
               sentence ? {
                  cnt := 0
                  while tab (upto ('()')) do {
                     if ="(" then {
                        cnt +:= 1
                        }
                     else {
                        =")"
                        cnt -:= 1
                        }
                     }
                  }
               if (cnt > 0) then {
                  next
                  }

               # Special case titles that appear before names (otherwise look
               # like sentence ends).
               every t := ! titles do {
                  if (t == sentence [- *t:0]) then {
                     # Break every, next in sentence-end search repeat
                     break next
                     }
                  }

               # This is a sentence.  Replace the line with what follows the
               # sentence, and break out of the sentence-end-search loop.
               line := tab (0)
               break break
               }
            }
         # There is no valid sentence end so far.  Remove a trailing hyphen
         # from the current line, or add a word-separating space.
         if line [-1] == '-' & any (&letters, line [-2]) then {
            line := line [1:-1]
            }
         else {
            line ||:= " "
            }

         # Read another line.  If can't, then fail--but suspend sentence first
         # if it _could_ be a sentence end.  Trim leading and trailing spaces
         # from the new line--if it's empty, toss the line so far and restart;
         # otherwise, tack it onto the end of the current line.
         if not (newline := read (infile)) then {
            if \possentp then {
               suspend (sentence)
               }
            fail
            }
         if any (wspace, newline) then {
            newline ?:= (tab (many (wspace)), tab (0))
            }
         newline := trim (newline, wspace)
         if (*newline = 0) then {
            if \possentp then {
               suspend (sentence)
               }
            line := ""
            # Break EOS check, next beginning-of-sent scan
            break next
            }
         line ||:= newline
         }

      # Suspend the sentence, then loop back for more.
      suspend sentence
      }
   end # procedure sentence