summaryrefslogtreecommitdiff
path: root/ipl/procs/scan.icn
blob: 2b8b5c6180a3e1bda702de871217b34ee0f15d62 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
############################################################################
#
#	File:     scan.icn
#
#	Subject:  Procedures related to scanning
#
#	Author:   Richard L. Goerwitz, David A. Gamey, and Ralph E. Griswold
#
#	Date:     May 2, 2001
#
############################################################################
#
#  This file is in the public domain.
#
############################################################################
#
#	Contributors:  Randal L. Schwartz and Cheyenne Wills
#
############################################################################
#
#  This module contains procedures related to string scanning:
#
#	balq(c1, c2, c3, c4, c5, s, i1, i2)
#		like bal() with quoting from characters in c5.
#
#       balqc(c1, c2, c3, c4, c5, s1, s2, s3, i1, i2)
#		like balq() with the addition that balanced characters within 
#       	"comments", as delimited by the strings s1 and s2, are also 
#        	excluded from balancing.  In addition, if s1 is given and s2 
#
#       limatch(L, c)
#		matches items in list L delimited by characters in c
#
#	slashbal(c1, c2, c3, s, i, j)
#		like bal() with escape processing
#
#	slashupto(c, s, i, j)
#		like upto() with escape processing
#
#	slshupto()
#		synonym for slashupto()
#
#	snapshot(title, len)
#		snapshot of string scanning with optional title and
#		maximum length.
#
#  More extensive documentation proceeds each procedure.
#
############################################################################
#
#  Richard L. Goerwitz:
#
#  I am often frustrated at bal()'s inability to deal elegantly with
#  the common \backslash escaping convention (a way of telling Unix
#  Bourne and C shells, for instance, not to interpret a given
#  character as a "metacharacter").  I recognize that bal()'s generic
#  behavior is a must, and so I wrote slashbal() to fill the gap.
#
#  Slashbal behaves like bal, except that it ignores, for purposes of
#  balancing, any c2/c3 char which is preceded by a backslash.  Note
#  that we are talking about internally represented backslashes, and
#  not necessarily the backslashes used in Icon string literals.  If
#  you have "\(" in your source code, the string produced will have no
#  backslash.  To get this effect, you would need to write "\\(."
#
#  BUGS:  Note that, like bal() (v8), slashbal() cannot correctly
#  handle cases where c2 and c3 intersect.  Note also that older ver-
#  sions of this routine counted from the beginning of the string,
#  instead of from i.  This feature came to be regarded as a bug when
#  put into actual use (especially when I realized that bal() doesn't
#  work this way).
#
############################################################################

procedure slashbal(c1, c2, c3, s, i, j)	#: bal() with escapes

    local twocs, allcs, default_val, POS, chr, chr2, count

    /c1 := &cset
    /c2 := '('
    /c3 := ')'
    twocs := c2 ++ c3
    allcs := c1 ++ c2 ++ c3 ++ '\\'

    if /s := &subject
    then default_val := &pos
    else default_val := 1

    if \i then {
	if i < 1 then
	    i := *s + (i+1)
    }
    else i := default_val
    if \j then {
	if j < 1 then
	    j := *s + (j+1)
    }
    else j := *s + 1

    count := 0; POS := i - 1
    s[i:j] ? {
	while tab(upto(allcs)) do {
	    chr := move(1)
	    if chr == "\\" & any(twocs) then {
		chr2 := move(1)
		if any(c1, chr) & count = 0 then
		    suspend POS + .&pos - 2
		if any(c1, chr2) & count = 0 then
		    suspend POS + .&pos - 1
	    }
	    else {
		if any(c1, chr) & count = 0 then
		    suspend POS + .&pos - 1
		if any(c2, chr) then
		    count +:= 1
		else if any(c3, chr) & count > 0 then
		    count -:= 1
	    }
	}
    }

end

############################################################################
#
#  Richard L. Goerwitz:
#
#  Slshupto works just like upto, except that it ignores backslash
#  escaped characters.  I can't even begin to express how often I've
#  run into problems applying Icon's string scanning facilities to
#  to input that uses backslash escaping.  Normally, I tokenize first,
#  and then work with lists.  With slshupto() I can now postpone or
#  even eliminate the traditional tokenizing step, and let Icon's
#  string scanning facilities to more of the work.
#
#  If you're confused:
#
#  Typically UNIX utilities (and probably others) use backslashes to
#  "escape" (i.e. remove the special meaning of) metacharacters.  For
#  instance, UNIX shells normally accept "*" as a shorthand for "any
#  series of zero or more characters.  You can make the "*" a literal
#  "*," with no special meaning, by prepending a backslash.  The rou-
#  tine slshupto() understands these backslashing conventions.  You
#  can use it to find the "*" and other special characters because it
#  will ignore "escaped" characters.
#
############################################################################

# for compatibility with the original name
#
procedure slashupto(c, s, i, j)		#: upto() with escapes
    suspend slshupto(c, s, i, j)
end

#
# slshupto:  cset x string x integer x integer -> integers
#             (c, s, i, j) -> Is (a generator)
#    where Is are the integer positions in s[i:j] before characters
#    in c that is not preceded by a backslash escape
#
procedure slshupto(c, s, i, j)		#: upto() with escapes

    local c2

    if /s := &subject
    then /i := &pos
    else /i := 1
    /j := *s + 1
    
    /c := &cset
    c2 := '\\' ++ c
    s[1:j] ? {
        tab(i)
        while tab(upto(c2)) do {
            if ="\\" then {
		move(1) | {
		    if find("\\", c)
		    then return &pos - 1
		}
		next
	    }
            suspend .&pos
            move(1)
        }
    }

end

############################################################################
#
#     The procedure snapshot(title,len) writes a snapshot of the state
#  of string scanning, showing the value of &subject and &pos, an
#  optional title, and (again optionally) wrapping the display
#  for len widht.
#
#  For example,
#
#     "((a+b)-delta)/(c*d))" ? {
#	 tab(bal('+-/*'))
#	 snapshot("example")
#	 }
#
#  produces
#
#	---example---------------------------
#	|				    |
#	|				    |
#	| &subject = "((a+b)-delta)/(c*d))" |
#	|			   |	    |
#	|			    	    |
#	-------------------------------------
#
#     Note that the bar showing the &pos is positioned under the &posth
#  character (actual positions are between characters).  If &pos is
#  at the end of &subject, the bar is positioned under the quotation
#  mark delimiting the subject. For example,
#
#     "abcdefgh" ? (tab(0) & snapshot())
#
#  produces
#
#	-------------------------
#	|			|
#	|			|
#	| &subject = "abcdefgh" |
#	|		      | |
#	|			|
#	-------------------------
#
#     Escape sequences are handled properly. For example,
#
#     "abc\tdef\nghi" ? (tab(upto('\n')) & snapshot())
#
#  produces
#
#	------------------------------
#	|			     |
#	|			     |
#	| &subject = "abc\tdef\nghi" |
#	|		      |      |
#	|			     |
#	------------------------------
#
#  The title argument places a title into the top bar, as in
#
#	"abc\tdef\nghi" ? (tab(upto('\n')) & snapshot("upto('\n')")
#
#  which produces
#
#      --upto('\n')-------------------
#      |                             |
#      |                             |
#      | &subject = "abc\tdef\nghi"  |
#      |                     |       |
#      |                             |
#      -------------------------------
#
#  The len argument rewraps the display for a screen of len width.
#
############################################################################


procedure snapshot(title,len)		#: snapshot of string scanning

   local bar1, bar2, bar3, is, is0, prefix, titlel, placement, POS

   /title := ""			# no meaningful default
   \len <:= 20			# any less is really not useful
   prefix := "&subject = "
   is := image(&subject)
   is0 := *image(&subject[1:&pos]) | fail

   #
   # Set up top and bottom bars (not exceeding len width, if
   # len is nonnull).  Fit title into top bar (bar1).
   #
   bar1 := bar3 := repl("-", *is + *prefix + 4)[1:\len-4|0]
   # in *is + *prefix + 4, the 4 is for two vbars/two spaces
   titlel := (*title > *bar3-4) | *title[1:\len-4|0]
   bar1 ?:= move(3) || (tab(4+titlel), title) || tab(0)

   #
   # Write bar1, then spacers (bar2).  Then write out len-size chunks
   # of &subject, with the | pointer-line, where appropriate. Finally,
   # write out bar3 (like bar1, but with no title).
   #
   write(bar1)
   bar2 := "|" || repl(" ", *bar3 - 2) || "|"
   write(bar2, "\n", bar2)
   placement := *prefix + is0
   (prefix || is) ? {
       until pos(0) do {
	   POS := &pos - 1
	   write("| ", move(*bar3-4) | left(tab(0), *bar3-4), " |")
	   if POS < placement < &pos then {
	       writes("| ")
	       writes(left(repl(" ", placement - POS - 1) || "|", *bar3-4))
	       write(" |\n", bar2)
	   }
	   else write(bar2, "\n", bar2)
       }
   }
   write(bar3)
   return			# nothing useful to return

end

############################################################################
#
#  David A. Gamey:
#
#     balq( c1, c2, c3, c4, c5, s, i1, i2 ) : i3
#
#        generates the sequence of integer positions in s preceding a 
#        character of c1 in s[i1:i2] that is (a) balanced with respect to
#        characters in c2 and c3 and (b) not "quoted" by characters in c4 
#        with "escape" sequences as defined in c5, but 
#        fails if there is no such position.
#
#        defaults:   same as for bal,
#                    c4  the single and double quote characters ' and "
#                    c5  the backwards slash \
#        errors:     same as for bal,
#                    c4 & c5 not csets
#
#     balqc( c1, c2, c3, c4, c5, s1, s2, s3, i1, i2 ) : i3
#
#        like balq with the addition that balanced characters within 
#        "comments", as delimited by the strings s1 and s2, are also 
#        excluded from balancing.  In addition, if s1 is given and s2 
#        is null then the comment terminates at the end of string.
#
#        defaults:   same as for balq,
#                    s3 is the subject string
#                    s1 "/*" 
#                    s2 "*/" if s1 defaults, null otherwise
#        errors:     same as for balq,
#                    s1 is not a string
#                    s2 is not a string (if s1 defaults or is specified) 
#
#############################################################################

procedure balq(				#: bal() with quote escaping.
   cstop, copen, cclose, cquote, cescape, s, i1, i2)

local quote, pcount, spos
local ca, c, sp

if /s := &subject then /i1 := &pos
/i1 := 1
/i2 := 0
/cstop   := &cset                                     # stopping characters
/copen   := '('                                       # open characters
/cclose  := ')'                                       # close characters
/cquote  := '\'\"'                                    # quote characters
/cescape := '\\'                                      # escape characters


pcount := 0                                           # "parenthesis" counter
spos   := i1                                          # scanning position

ca := cstop ++ copen ++ cclose ++ cquote ++ cescape   # characters to check

while sp := s[ spos := upto( ca, s, spos, i2 ) ] do { 
      
      if /quote & ( pcount = 0 ) & any( cstop, sp) then suspend spos
      
      if any( c := ( copen | cclose | cquote | cescape ), sp ) then
         
         case c of {
            
            copen  : if /quote then 
                        pcount +:= 1
            
            cclose : if /quote then 
                        if ( pcount -:= 1 ) < 0 then
                           fail
            
            cquote : if /quote then 
                        quote := sp 
                     else 
                        if quote == sp then quote := &null
            
            cescape: if \quote then
                        spos +:= 1
            }
      
      spos +:= 1
      
      }

end

procedure balqc(			#: balq() with comment escaping
   cstop, copen, cclose, cquote, cescape, scm, ecm, s, i1, i2)

local quote, pcount, spos
local ca, c, sp
local ccom, comnt

if /s := &subject then /i1 := &pos
/i1 := 1
/i2 := 0
/cstop   := &cset                                     # stopping characters
/copen   := '('                                       # open characters
/cclose  := ')'                                       # close characters
/cquote  := '\'\"'                                    # quote characters
/cescape := '\\'                                      # escape characters

if /scm & /ecm then {
   scm := "/*"                                        # start of comment
   ecm := "*/"                                        # end of comment
   }
else 
   if \scm & /ecm then 
      ecm := &null                                    # icon style comment

ccom := ''
ccom ++:= cset(\scm[1])
ccom ++:= cset(\ecm[1])

pcount := 0                                           # "parenthesis" counter
spos   := i1                                          # scanning position

ca := cstop ++ copen ++ cclose ++ cquote ++ cescape ++ ccom # chars to check

while sp := s[ spos := upto( ca, s, spos, i2 ) ] do { 
      
      if /quote & ( pcount = 0 ) & /comnt & any( cstop, sp) then 
         suspend spos
      
      if any( c := ( copen | cclose | cquote | cescape | ccom ), sp ) then
         
         case c of {
            
            copen  : if /quote & /comnt then 
                        pcount +:= 1
            
            cclose : if /quote & /comnt then 
                        if ( pcount -:= 1 ) < 0 then
                           fail
            
            cquote : if /comnt then
                        if /quote then 
                           quote := sp 
                        else 
                           if quote == sp then quote := &null
            
            cescape: if \quote then 
                        spos +:= 1

            ccom   : if /quote then 
                        if /comnt then {
                           if comnt := ( s[ spos +: *scm ] == scm ) then
                              spos +:= *scm - 1
                           }
                        else
                           if \ecm == s[ spos +: *ecm ] then {
                              spos +:= *ecm - 1
                              comnt := &null
                              }
                              
            }
      
      spos +:= 1
      
      }

end

#############################################################################
#
#  This matching function illustrates how every can be 
#  used in string scanning.   
#
#                 1. Each element of the list argument is matched in 
#                    succession. 
#                 2. Leading characters in the subject are skipped over
#                    to match the first element.
#                 3. The strings listed may be seperated by other characters
#                    provided they are specified in a cset of characters to 
#                    be ignored.
#
#                 It could be used to find things in text that have varying
#                 representations, for example: "i.e.", "e.g.", "P.O.", etc.
#
#  limatch(l,i)
#  
#        l  list of strings to be found
#        i  cset containing characters to be ignored between each string
#
#        returns the last cursor position scanned to, or fails
#
#############################################################################

procedure limatch(l,i)			#: matching items in list

local s, f, p

p := &pos
every ( s := !l ) | ( return p ) do
{
   if /f := 1 then tab(find(s))        #  startup - position at first string
   tab(match(s)) | fail                #  fail if not matched
   tab(many(i) | &pos)                 #  skip ignore chars. if any
   p := &pos                           #  remember last position
}
end