summaryrefslogtreecommitdiff
path: root/ipl/procs/ngrams.icn
blob: 6de13c3ce1998842b888be5df6db2cfd9aed0dc9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
############################################################################
#
#	File:     ngrams.icn
#
#	Subject:  Procedures to produce n-grams
#
#	Author:   Ralph E. Griswold
#
#	Date:     March 20, 1998
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#     The procedure ngrams(s, i, c, t) generates a tabulation of the i-grams
#  in the specified string.  If c is non-null, it is used as the set of
#  characters from which n-grams are taken (other characters break n-grams).
#  The default for c is the upper- and lowercase letters.  If t is null,
#  the tabulation is given in decreasing order of frequency; otherwise it
#  is given in alphabetical order of n-grams.
#
#     For backward compatibility, the first argument may be a file, in
#  which case, it is read to provide the string.
#
############################################################################

procedure ngrams(s, i, c, t)		#: n-grams with count
   #
   #  Generates one string per distinct i-gram found in s: the i-gram
   #  followed by its occurrence count right-justified in 8 columns.
   #
   #    s  string to scan, or a file whose entire contents are scanned
   #    i  n-gram length; must convert to an integer greater than zero
   #    c  characters that may appear in an n-gram (default: letters);
   #       any other character ends the current run of characters
   #    t  if null, results are produced in decreasing order of count;
   #       otherwise in alphabetical order of n-gram
   #
   #  Stops the program with a message if i or c is invalid.
   #
   local line, grams, a, count, gram

   if not (integer(i) > 0) then stop("*** invalid ngrams specification")

   /c := &lcase || &ucase
   if not (c := cset(c)) then stop("*** invalid cset specification")

   grams := table(0)			# n-gram -> occurrence count

   if type(s) == "file" then {
      line := ""
      # Read from the file that was passed in.  (A null first argument
      # to reads() would silently default to &input instead.)
      while line ||:= reads(s, 1000)
      }
   else line := s

   # Outer scan: skip to the next run of characters in c.
   # Inner scan: slide a window of width i across that run; move(i)
   # takes one n-gram and move(-i + 1) backs up so the next window
   # starts one character later.  move(i) fails when fewer than i
   # characters remain, which ends the inner loop.
   line ? while tab(upto(c)) do
      (tab(many(c)) \ 1) ? while grams[move(i)] +:= 1 do
         move(-i + 1)

   if /t then {
      # sort(grams, 4) yields [key, value, key, value, ...] ordered by
      # value; pulling from the end gives the largest counts first.
      a := sort(grams, 4)
      while count := pull(a) do
         suspend pull(a) || right(count, 8)
      }
   else {
      # sort(grams, 3) yields the same flat list ordered by key.
      a := sort(grams, 3)
      while gram := get(a) do
         suspend gram || right(get(a), 8)
      }
end

procedure ngramset(s, i, c)		#: n-grams set
   #
   #  Returns the set of distinct i-grams found in s.
   #
   #    s  string to scan, or a file whose entire contents are scanned
   #    i  n-gram length; must convert to an integer greater than zero
   #    c  characters that may appear in an n-gram (default: letters);
   #       any other character ends the current run of characters
   #
   #  Stops the program with a message if i or c is invalid.
   #
   local line, grams

   if not (integer(i) > 0) then stop("*** invalid ngrams specification")

   /c := &lcase || &ucase
   if not (c := cset(c)) then stop("*** invalid cset specification")

   grams := set()

   if type(s) == "file" then {
      line := ""
      # Read from the file that was passed in.  (A null first argument
      # to reads() would silently default to &input instead.)
      while line ||:= reads(s, 1000)
      }
   else line := s

   # Outer scan: skip to the next run of characters in c.
   # Inner scan: slide a window of width i across that run, one
   # character at a time, recording each window in the set.
   line ? while tab(upto(c)) do
      (tab(many(c)) \ 1) ? while insert(grams, move(i)) do
         move(-i + 1)

   return grams

end