1 files changed, 160 insertions, 0 deletions
diff --git a/ipl/procs/sentence.icn b/ipl/procs/sentence.icn
new file mode 100644
index 0000000..f80def3
--- /dev/null
+++ b/ipl/procs/sentence.icn
@@ -0,0 +1,160 @@
+############################################################################
+#
+#	File:     sentence.icn
+#
+#	Subject:  Procedure to generate sentences in file
+#
+#	Author:   Richard L. Goerwitz
+#
+#	Date:     August 14, 1996
+#
+############################################################################
+#
+#   This file is in the public domain.
+#
+############################################################################
+#
+#	Version:  1.2
+#
+############################################################################
+#  
+#  sentence(f)   - suspends sentences from file f
+#
+#  A lot of grammatical and stylistic analysis programs are predicated
+#  on the notion of a sentence.  For instance, some programs count the
+#  number of words in each sentence.  Other count the number and length
+#  of clauses.  Still others pedantically check for sentence-final par-
+#  ticles and prepositions.
+#
+#  This procedure, sentence(), is supposed to be used as a filter for
+#  ASCII text files, suspending everything that looks remotely like a
+#  sentence in them.
+#
+############################################################################
+#
+#  BUGS:  Cannot correctly parse sentences with constructs like "R. L.
+#  Goerwitz" in them.  The algorithm can be much improved simply by
+#  checking to see if the word after the period is in /usr/dict/words
+#  or whatever your system dictionary file is.  If it isn't, then it's
+#  likely not to be the beginning of a sentence (this also is not in-
+#  fallible, naturally).
+#
+############################################################################
+#
+#  Requires:  co-expressions
+#
+############################################################################
+
+
+procedure sentence(intext)
+
+    local sentence, get_line, line, tmp_s, end_part, whole_thing
+    static inits, punct
+    initial {
+	inits := &ucase ++ &digits
+	punct := ".\"'!?)]"
+    }
+    sentence := ""
+    get_line := create read_line(intext)
+
+    while line := @get_line do {
+
+	# If we hit a blank line, it's a signal from read_line that we
+	# have encountered a change in the indentation level, and
+	# should call it a sentence break (though it could just be
+	# indentation for a quote, a section header, etc., it seems
+	# these all indicate major, sentence-like divisions in the
+	# text).
+	if line == "" then {
+	    suspend sentence
+	    sentence := ""
+	    next
+	}
+
+	# Go on until you can't find any more sentence-endings in line,
+	# then break and get another line.
+	repeat {
+
+	    # Scan for a sentence break somewhere in line.
+	    line ? {
+
+		# Ugly, but it works.  Look for sequences containing
+		# things like periods and question marks, followed by
+		# a space and another space or a word beginning with
+		# a capital letter.  If we don't have enough context,
+		# append the next line from intext to line & scan again.
+		if tmp_s := tab(upto(punct)) &
+		    upto('!?.', end_part := tab(many(punct))) &
+		    not (pos(-1), line ||:= @get_line, next) &
+		    =" " & (=" " | (tab(many('\'"('))|&null,any(inits)))
+		# IF YOU WANT TO ADD A DICTIONARY CHECK, then read in
+		# a dictionary like /usr/dict/words, and then change
+		# any(inits) above to something like (any(inits),
+		# longstr(list_of_usrdictwords,map(&subject),&pos), =" ")
+		# where longstr() matches each string in list_of_usr-
+		# dictwords.
+		then {
+
+		    # Don't bother with little two-letter hunks.
+		    whole_thing := sentence || tmp_s || end_part
+		    if *whole_thing > 3 | find(" ",whole_thing)
+		    then suspend whole_thing
+
+		    tab(many(' '))
+		    line := tab(0)
+		    sentence := ""
+		    next
+		}
+		else break
+	    }
+	}
+
+	# Otherwise just tack line onto sentence & try again.
+	sentence ||:= line
+    }
+
+    return sentence
+
+end
+
+
+
+
+procedure read_line(intext)
+
+    local new_line, ilevel, junk_count, space_count, line
+    static last_ilevel, blank_flag
+    last_ilevel := 0
+
+    while line := trim(!intext,'\t ') do {
+
+	# Check to see if line is blank; if so, set blank_flag.
+	if line == "" then
+	    { blank_flag := 1; next }
+
+	# Determine current indentation level.
+	detab(line) ? {
+	    ilevel := *tab(many(' ')) | 0
+	}
+
+	line ? {
+
+	    tab(many('\t '))
+
+	    # Signal the calling procedure if there is a change in the
+	    # indentation level by suspending a blank line.
+	    if (ilevel > last_ilevel) | (ilevel < last_ilevel, \blank_flag)
+	    then suspend ""
+	    last_ilevel := ilevel
+
+	    # Put a space on the end of line, unless it ends in a dash.
+	    new_line := tab(-1) || (="-" | (move(1) || " "))
+	    # Make sure the flag that indicates blank lines is unset.
+	    blank_flag := &null
+	}
+
+	# Suspend the newly reformatted, trimmed, space-terminated line.
+	suspend new_line
+    }
+
+end