summaryrefslogtreecommitdiff
path: root/ipl/procs/regexp.icn
blob: 6b881f5b7dcc3ad5b0f5fd8c3d01dbe6787f79c3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
############################################################################
#
#	File:     regexp.icn
#
#	Subject:  Procedure for regular-expression pattern matching
#
#	Author:   Robert J. Alexander
#
#	Date:     May 19, 1996
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#  This is a kit of procedures to deal with UNIX-like regular expression
#  patterns.
#
#  These procedures are interesting partly because of the "recursive
#  suspension" (or "suspensive recursion" :-) technique used to simulate
#  conjunction of an arbitrary number of computed expressions (see
#  notes, below).
#
#  The public procedures are:
#
#  ReMatch(pattern,s,i1,i2) : i3,i4,...,iN
#  ReFind(pattern,s,i1,i2) : i3,i4,...,iN
#  RePat(s) : pattern list
#
############################################################################
#
#  ReMatch() produces the sequence of positions in "s" past a substring
#  starting at "i1" that matches "pattern", but fails if there is no
#  such position.  Similar to match(), but is capable of generating
#  multiple positions.
#
#  ReFind() produces the sequence of positions in "s" where substrings
#  begin that match "pattern", but fails if there is no such position.
#  Similar to find().  Each position is produced only once, even if
#  several possible matches are possible at that position.
#
#  "pattern" can be either a string or a pattern list -- see RePat(),
#  below.
#
#  Default values of s, i1, and i2 are handled as for Icon's built-in
#  string scanning procedures such as match().
#
############################################################################
#
#  RePat(s) : L
#
#  Creates a pattern element list from pattern string "s", but fails if
#  the pattern string is not syntactically correct.  ReMatch() and
#  ReFind() will automatically convert a pattern string to a pattern
#  list, but it is faster to do the conversion explicitly if multiple
#  operations are done using the same pattern.  An additional advantage
#  to compiling the pattern separately is avoiding ambiguity of failure
#  caused by an incorrect pattern and failure to match a correct pattern.
#
############################################################################
#
#  ReCaseIndependent() : n
#  ReCaseDependent() : n
#
#  Set mode for case-independent or case-dependent matching.  The initial
#  mode is case-dependent.
#
############################################################################
#
#  Accessible Global Variables
#
#  After a match, the strings matched by parenthesized regular
#  expressions are left in list "Re_ParenGroups", and can be accessed by
#  subscripting in using the same number as the \N construct.
#
#  If it is desired that regular expression format be similar to UNIX
#  filename generation patterns but still retain the power of full
#  regular expressions, make the following assignments prior to
#  compiling the pattern string:
#
#       Re_ArbString := "*"     # Defaults to ".*"
#
#  The sets of characters (csets) that define a word, digits, and white
#  space can be modified.  The following assignments can be made before
#  compiling the pattern string.  The character sets are captured when
#  the pattern is compiled, so changing them after pattern compilation
#  will not alter the behavior of matches unless the pattern string is
#  recompiled.
#
#       Re_WordChars := 'whatever you like'
#                       # Defaults to &letters ++ &digits ++ "_"
#       Re_Digits := &digits ++ 'ABCDEFabcdef'
#                       # Defaults to &digits
#       Re_Space := 'whatever you like'
#                       # Defaults to ' \t\v\n\r\f'
#
#  These globals are normally not initialized until the first call to
#  RePat(), and then only if they are null.  They can be explicitly
#  initialized to their defaults (if they are null) by calling
#  Re_Default().
#
############################################################################
#
#  Characters compiled into patterns can be passed through a
#  user-supplied filter procedure, provided in global variable
#  Re_Filter.  The filtering is done before the characters are bound
#  into the pattern.  The filter proc is passed one argument, the string
#  to filter, and it must return the filtered string as its result.  If
#  the filter proc fails, the string will be used unfiltered.  The
#  filter proc is called with an argument of either type string (for
#  characters in the pattern) or cset (for character classes [...]).
#
#  Filtering is done only as the pattern is compiled.  Any filtering of
#  strings to be matched must be explicitly done.
#
############################################################################
#
#  By default, individual pattern elements are matched in a "leftmost-
#  longest-first" sequence, which is the order observed by perl, egrep,
#  and most other regular expression matchers.  If the order of matching
#  is not important a performance improvement might be seen if pattern
#  elements are matched in "shortest-first" order.  The following global
#  variable setting causes the matcher to operate in leftmost-shortest-
#  first order.
#
#   Re_LeftmostShortest := 1
#  
############################################################################
#
#  In the case of patterns containing alternation, ReFind() will
#  generally not produce positions in increasing order, but will produce
#  all positions from the first term of the alternation (in increasing
#  order) followed by all positions from the second (in increasing
#  order).  If it is necessary that the positions be generated in
#  strictly increasing order, with no duplicates, assign any non-null
#  value to Re_Ordered:
#
#       Re_Ordered := 1
#
#  If the Re_Ordered option is chosen, there is a *small* penalty in
#  efficiency in some cases, and the co-expression facility is required
#  in your Icon implementation.
#  
############################################################################
#
#  Regular Expression Characters and Features Supported
#
#  The regular expression format supported by procedures in this file
#  model very closely those supported by the UNIX "egrep" program, with
#  modifications as described in the Perl programming language
#  definition.  Following is a brief description of the special
#  characters used in regular expressions.  In the description, the
#  abbreviation RE means regular expression.
#
#  c            An ordinary character (not one of the special characters
#               discussed below) is a one-character RE that matches that
#               character.
#
#  \c           A backslash followed by any special character is a one-
#               character RE that matches the special character itself.
#
#               Note that backslash escape sequences representing
#               non-graphic characters are not supported directly
#               by these procedures.  Of course, strings coded in an
#               Icon program will have such escapes handled by the
#               Icon translator.  If such escapes must be supported
#               in strings read from the run-time environment (e.g.
#               files), they will have to be converted by other means,
#               such as the Icon Program Library procedure "escape()".
#
#  .            A period is a one-character RE that matches any
#               character.
#
#  [string]     A non-empty string enclosed in square brackets is a one-
#               character RE that matches any *one* character of that
#               string.  If, the first character is "^" (circumflex),
#               the RE matches any character not in the remaining
#               characters of the string.  The "-" (minus), when between
#               two other characters, may be used to indicate a range of
#               consecutive ASCII characters (e.g. [0-9] is equivalent to
#               [0123456789]).  Other special characters stand for
#               themselves in a bracketed string.
#
#  *            Matches zero or more occurrences of the RE to its left.
#
#  +            Matches one or more occurrences of the RE to its left.
#
#  ?            Matches zero or one occurrences of the RE to its left.
#
#  {N}          Matches exactly N occurrences of the RE to its left.
#
#  {N,}         Matches at least N occurrences of the RE to its left.
#
#  {N,M}        Matches at least N occurrences but at most M occurrences
#               of the RE to its left.
#
#  ^            A caret at the beginning of an entire RE constrains
#               that RE to match an initial substring of the subject
#               string.
#
#  $            A currency symbol at the end of an entire RE constrains
#               that RE to match a final substring of the subject string.
#
#  |            Alternation: two REs separated by "|" match either a
#               match for the first or a match for the second.
#
#  ()           A RE enclosed in parentheses matches a match for the
#               regular expression (parenthesized groups are used
#               for grouping, and for accessing the matched string
#               subsequently in the match using the \N expression).
#
#  \N           Where N is a digit in the range 1-9, matches the same
#               string of characters as was matched by a parenthesized
#               RE to the left in the same RE.  The sub-expression
#               specified is that beginning with the Nth occurrence
#               of "(" counting from the left.  E.g., ^(.*)\1$ matches
#               a string consisting of two consecutive occurrences of
#               the same string.
#
############################################################################
#
#  Extensions beyond UNIX egrep
#
#  The following extensions to UNIX REs, as specified in the Perl
#  programming language, are supported.
#
#  \w           Matches any alphanumeric (including "_").
#  \W           Matches any non-alphanumeric.
#
#  \b           Matches only at a word-boundary (word defined as a string
#               of alphanumerics as in \w).
#  \B           Matches only non-word-boundaries.
#
#  \s           Matches any white-space character.
#  \S           Matches any non-white-space character.
#
#  \d           Matches any digit [0-9].
#  \D           Matches any non-digit.
#
#  \w, \W, \s, \S, \d, \D can be used within [string] REs.
#
############################################################################
#
#  Notes on computed conjunction expressions by "suspensive recursion"
#
#  A conjunction expression of an arbitrary number of terms can be
#  computed in a looping fashion by the following recursive technique:
#
#       procedure Conjunct(v)
#          if <there is another term to be appended to the conjunction> then
#             suspend Conjunct(<the next term expression>)
#          else
#             suspend v
#       end
#
#  The argument "v" is needed for producing the value of the last term
#  as the value of the conjunction expression, accurately modeling Icon
#  conjunction.  If the value of the conjunction is not needed, the
#  technique can be slightly simplified by eliminating "v":
#
#       procedure ConjunctAndProduceNull()
#          if <there is another term to be appended to the conjunction> then
#             suspend ConjunctAndProduceNull(<the next term expression>)
#          else
#             suspend
#       end
#
#  Note that <the next term expression> must still remain in the suspend
#  expression to test for failure of the term, although its value is not
#  passed to the recursive invocation.  This could have been coded as
#
#             suspend <the next term expression> & ConjunctAndProduceNull()
#
#  but wouldn't have been as provocative.
#
#  Since the computed conjunctions in this program are evaluated only for
#  their side effects, the second technique is used in two situations:
#
#       (1)     To compute the conjunction of all of the elements in the
#               regular expression pattern list (Re_match1()).
#
#       (2)     To evaluate the "exactly N times" and "N to M times"
#               control operations (Re_NTimes()).
#
############################################################################

record Re_Tok(proc,args)

global Re_ParenGroups,Re_Filter,Re_Ordered
global Re_WordChars,Re_NonWordChars
global Re_Space,Re_NonSpace
global Re_Digits,Re_NonDigits
global Re_ArbString,Re_AnyString
global Re_LeftmostShortest

invocable "=":1

###################  Pattern Translation Procedures  ###################


procedure RePat(s)	#: regular expression pattern list
#
#  Produce pattern list representing pattern string s.
#
   #
   #  Create a list of pattern elements.  Pattern strings are parsed
   #  and converted into list elements as shown in the following table.
   #  Since some list elements reference other pattern lists, the
   #  structure is really a tree.
   #
   # Token      Generates                       Matches...
   # -----      ---------                       ----------
   #  ^         Re_Tok(pos,[1])                 Start of string or line
   #  $         Re_Tok(pos,[0])                 End of string or line
   #  .         Re_Tok(move,[1])                Any single character
   #  +         Re_Tok(Re_OneOrMore,[tok])      At least one occurrence of
   #                                            previous token
   #  *         Re_Tok(Re_ArbNo,[tok])          Zero or more occurrences of
   #                                            previous token
   #  |         Re_Tok(Re_Alt,[pattern,pattern]) Either of prior expression
   #                                            or next expression
   #  [...]     Re_Tok(Re_TabAny,[cset])        Any single character in
   #                                            specified set (see below)
   #  (...)     Re_Tok(Re_MatchReg,[pattern])   Parenthesized pattern as
   #                                            single token
   #  <string of non-special characters>        The string of no-special
   #            Re_Tok(Re__tabmatch,string)       characters
   #  \b        Re_Tok(Re_WordBoundary,[Re_WordChars,Re_NonWordChars])
   #                                            A word-boundary
   #                                              (word default: [A-Za-z0-9_]+)
   #  \B        Re_Tok(Re_NonWordBoundary,[Re_WordChars,Re_NonWordChars])
   #                                            A non-word-boundary
   #  \w        Re_Tok(Re_TabAny,[Re_WordChars])A word-character
   #  \W        Re_Tok(Re_TabAny,[Re_NonWordChars]) A non-word-character
   #  \s        Re_Tok(Re_TabAny,[Re_Space])    A space-character
   #  \S        Re_Tok(Re_TabAny,[Re_NonSpace]) A non-space-character
   #  \d        Re_Tok(Re_TabAny,[Re_Digits])   A digit
   #  \D        Re_Tok(Re_TabAny,[Re_NonDigits]) A non-digit
   #  {n,m}     Re_Tok(Re_NToMTimes,[tok,n,m])  n to m occurrences of
   #                                            previous token
   #  {n,}      Re_Tok(Re_NOrMoreTimes,[tok,n]) n or more occurrences of
   #                                            previous token
   #  {n}       Re_Tok(Re_NTimes,[tok,n])       exactly n occurrences of
   #                                            previous token
   #  ?         Re_Tok(Re_ZeroOrOneTimes,[tok]) one or zero occurrences of
   #                                            previous token
   #  \<digit>  Re_Tok(Re_MatchParenGroup,[n])  The string matched by
   #                                            parenthesis group <digit>
   #
   local plist
   static lastString,lastPList
   #
   #  Initialize.
   #
   initial {
      Re_Default()
      lastString := ""
      lastPList := []
      }

   if s === lastString then return lastPList

   Re_WordChars := cset(Re_WordChars)
   Re_NonWordChars := ~Re_WordChars
   Re_Space := cset(Re_Space)
   Re_NonSpace := ~Re_Space
   Re_Digits := cset(Re_Digits)
   Re_NonDigits := ~Re_Digits


   s ? (plist := Re_pat1(0)) | fail
   lastString := s
   lastPList := plist
   return plist
end


procedure Re_pat1(level) # L
#
#  Recursive portion of RePat()
#
   local plist,n,m,c,comma
   static parenNbr
   initial {
      if /Re__match then ReCaseDependent()
      }
   if level = 0 then parenNbr := 0
   plist := []
   #
   #  Loop to put pattern elements on list.
   #
   until pos(0) do {
      (="|",plist := [Re_Tok(Re_Alt,[plist,Re_pat1(level + 1) | fail])]) |
      put(plist,
         (="^",pos(2) | &subject[-2] == ("|" | "("),Re_Tok(pos,[1])) |
         (="$",pos(0) | match("|" | ")"),Re_Tok(pos,[0])) |
         (match(")"),level > 0,break) |
         (=Re_ArbString,Re_Tok(Re_Arb)) |
         (=Re_AnyString,Re_Tok(move,[1])) |
         (="+",Re_Tok(Re_OneOrMore,[Re_prevTok(plist) | fail])) |
         (="*",Re_Tok(Re_ArbNo,[Re_prevTok(plist) | fail])) |
         1(Re_Tok(Re_TabAny,[c := Re_cset()]),\c | fail) |
         3(="(",n := parenNbr +:= 1,
               Re_Tok(Re_MatchReg,[Re_pat1(level + 1) | fail,n]),
               move(1) | fail) |
         (="\\b",Re_Tok(Re_WordBoundary,[Re_WordChars,Re_NonWordChars])) |
         (="\\B",Re_Tok(Re_NonWordBoundary,[Re_WordChars,Re_NonWordChars])) |
         (="\\w",Re_Tok(Re_TabAny,[Re_WordChars])) |
         (="\\W",Re_Tok(Re_TabAny,[Re_NonWordChars])) |
         (="\\s",Re_Tok(Re_TabAny,[Re_Space])) |
         (="\\S",Re_Tok(Re_TabAny,[Re_NonSpace])) |
         (="\\d",Re_Tok(Re_TabAny,[Re_Digits])) |
         (="\\D",Re_Tok(Re_TabAny,[Re_NonDigits])) |
         (="{",(n := tab(many(&digits)),comma := =(",") | &null,
            m := tab(many(&digits)) | &null,="}") | fail,
            if \m then Re_Tok(Re_NToMTimes,
                  [Re_prevTok(plist),integer(n),integer(m)])
            else if \comma then Re_Tok(Re_NOrMoreTimes,
                  [Re_prevTok(plist),integer(n)])
            else Re_Tok(Re_NTimes,[Re_prevTok(plist),integer(n)])) |
         (="?",Re_Tok(Re_ZeroOrOneTimes,[Re_prevTok(plist) | fail])) |
         Re_Tok(Re__tabmatch,[Re_string(level)]) |
         (="\\",n := tab(any(&digits)),Re_Tok(Re_MatchParenGroup,[integer(n)]))
         ) |
      fail
      }
   return plist
end


procedure Re_prevTok(plist)
#
#  Pull previous token from the pattern list.  This procedure must take
#  into account the fact that successive character tokens have been
#  optimized into a single string token.
#
   local lastTok,s,r
   lastTok := pull(plist) | fail
   if lastTok.proc === Re__tabmatch then {
      s := lastTok.args[1]
      r := Re_Tok(Re__tabmatch,[s[-1]])
      s[-1] := ""
      if *s > 0 then {
         put(plist,lastTok)
         lastTok.args[1] := s
         }
      return r
      }
   return lastTok
end


procedure Re_Default()
#
#  Assign default values to regular expression translation globals, but
#  only to variables whose values are null.
#
   /Re_WordChars := &letters ++ &digits ++ "_"
   /Re_Space := ' \t\v\n\r\f'
   /Re_Digits := &digits
   /Re_ArbString := ".*"
   /Re_AnyString := "."
   return
end


procedure Re_cset()
#
#  Matches a [...] construct and returns a cset.
#
   local complement,c,e,ch,chars
   ="[" | fail
   (complement := ="^" | &null,c := move(1) || tab(find("]")),move(1)) |
         return &null
   c ? {
      e := (="-" | "")
      while chars := tab(upto('-\\')) do {
         e ++:= case move(1) of {
            "-": chars[1:-1] ++
                  &cset[ord(chars[-1]) + 1:ord(move(1)) + 2] | return &null
            "\\": case ch := move(1) of {
               "w": Re_WordChars
               "W": Re_NonWordChars
               "s": Re_Space
               "S": Re_NonSpace
               "d": Re_Digits
               "D": Re_NonDigits
               default: ch
               }
            }
         }
      e ++:= tab(0)
      if \complement then e := ~e
      }
   e := (\Re_Filter)(e)
   return cset(e)
end


procedure Re_string(level)
#
#  Matches a string of non-special characters, returning a string.
#
   local special,s,p
   static nondigits
   initial nondigits := ~&digits
   special := if level = 0 then '\\.+*|[({?' else  '\\.+*|[({?)'
   s := tab(upto(special) | 0)
   while ="\\" do {
      p := &pos
      if tab(any('wWbBsSdD')) |
            (tab(any('123456789')) & (pos(0) | any(nondigits))) then {
         tab(p - 1)
         break
         }
      s ||:= move(1) || tab(upto(special) | 0)
      }
   if pos(0) & s[-1] == "$" then {
      move(-1)
      s[-1] := ""
      }
   s := string((\Re_Filter)(s))
   return "" ~== s
end


#####################  Matching Engine Procedures  ########################


procedure ReMatch(plist,s,i1,i2) #: position past regular expression matched
#
#  Produce the sequence of positions in s past a string starting at i1
#  that matches the pattern plist, but fails if there is no such
#  position.  Similar to match(), but is capable of generating multiple
#  positions.
#
   local i
   if type(plist) ~== "list" then plist := RePat(plist) | fail
   if /s := &subject then /i1 := &pos else /i1 := 1 ; /i2 := 0
   i := match("",s,i1,i2) - 1 | fail
   Re_ParenGroups := []
   suspend s[i1:i2] ? (Re_match1(plist,1),i + &pos)
end


procedure Re_match1(plist,i) # s1,s2,...,sN
#
#  Used privately by ReMatch() to simulate a computed conjunction
#  expression via recursive generation.
#
   local tok
   suspend if tok := plist[i] then
      Re_tok_match(tok,plist,i) & Re_match1(plist,i + 1) else &null
end


procedure ReFind(plist,s,i1,i2) #: position where regular expression matched
#
#  Produce the sequence of positions in s where strings begin that match
#  the pattern plist, but fails if there is no such position.  Similar
#  to find().
#
   local i,p
   if type(plist) ~== "list" then plist := RePat(plist) | fail
   if /s := &subject then /i1 := &pos else /i1 := 1 ; /i2 := 0
   i := match("",s,i1,i2) - 1 | fail
   Re_ParenGroups := []
   s[i1:i2] ? suspend (
         tab(Re_skip(plist)) &
         p := &pos &
         Re_match1(plist,1)\1 &
         i + p)
end


procedure Re_tok_match(tok,plist,i)
#
#  Match a single token.  Can be recursively called by the token
#  procedure.
#
   local prc,results,result
   prc := tok.proc
   if \Re_LeftmostShortest then
         suspend if prc === Re_Arb then Re_Arb(plist,i) else prc!tok.args
   else {
      results := []
      every (if prc === Re_Arb then Re_Arb(plist,i) else prc!tok.args) do
        push(results,[&pos,copy(Re_ParenGroups)])
      every result := !results do {
         Re_ParenGroups := result[2]
     suspend tab(result[1])
         }
      }
end


##########  Heuristic Code for Matching Arbitrary Characters  ##########


procedure Re_skip(plist,i) # s1,s2,...,sN
#
#  Used privately -- match a sequence of strings in s past which a match
#  of the first pattern element in plist is likely to succeed.  This
#  procedure is used for heuristic performance improvement by ReMatch()
#  for the ".*" pattern element, and by ReFind().
#
   local x,s,p,prc,args
   /i := 1
   x := if type(plist) == "list" then plist[i] else plist
   if /x then suspend find("")
   else {
      args := x.args
      suspend case prc := x.proc of {
     Re__tabmatch: Re__find!args
     Re_TabAny: Re__upto!args
     pos: args[1]
     Re_WordBoundary |
     Re_NonWordBoundary:
           p := &pos & tab(Re_skip(plist,i + 1)) & prc!args & untab(p)
     Re_MatchParenGroup: if s := \(\Re_ParenGroups)[args[1]] then
           find(s) else find("")
     Re_NToMTimes |
     Re_NOrMoreTimes |
     Re_NTimes:
           if args[2] > 0 then Re_skip(args[1]) else find("")
     Re_OneOrMore |
     Re_MatchReg: Re_skip(args[1])
     Re_Alt:
           if \Re_Ordered then
             Re_result_merge{Re_skip(args[1]),Re_skip(args[2])}
           else
             Re_skip(args[1 | 2])
     default: find("")
     }
      }
end


procedure Re_result_merge(L)
#
#  Programmer-defined control operation to merge the result sequences of
#  two integer-producing generators.  Both generators must produce their
#  result sequences in numerically increasing order with no duplicates,
#  and the output sequence will be in increasing order with no
#  duplicates.
#
   local e1,e2,r1,r2
   e1 := L[1] ; e2 := L[2]
   r1 := @e1 ; r2 := @e2
   while \(r1 | r2) do
         if /r2 | \r1 < r2 then
               suspend r1 do r1 := @e1 | &null
         else if /r1 | r1 > r2 then
               suspend r2 do r2 := @e2 | &null
         else
               r2 := @e2 | &null
end


procedure untab(origPos)
#
#  Converts a string scanning expression that moves the cursor to one
#  that produces a cursor position and doesn't move the cursor (converts
#  something like tab(find(x)) to find(x).  The template for using this
#  procedure is
#
#       origPos := &pos ; tab(x) & ... & untab(origPos)
#
   local newPos
   newPos := &pos
   tab(origPos)
   suspend newPos
   tab(newPos)
end


#######################  Matching Procedures #######################


procedure Re_Arb(plist,i)
#
#  Match arbitrary characters (.*)
#
   suspend tab(if plist[i + 1] then Re_skip(plist,i + 1) else Re__find(""))
end


procedure Re_TabAny(C)
#
#  Match a character of a character set ([...],\w,\W,\s,\S,\d,\D)
#
   suspend tab(Re__any(C))
end


procedure Re_MatchReg(tokList,groupNbr)
#
#  Match parenthesized group and assign matched string to list Re_ParenGroup
#
   local p,s
   p := &pos
   /Re_ParenGroups := []
   every Re_match1(tokList,1) do {
      while *Re_ParenGroups < groupNbr do put(Re_ParenGroups)
      s := &subject[p:&pos]
      Re_ParenGroups[groupNbr] := s
      suspend s
      }
   Re_ParenGroups[groupNbr] := &null
end


procedure Re_WordBoundary(wd,nonwd)
#
#  Match word-boundary (\b)
#
   suspend ((pos(1),any(wd)) | (pos(0),move(-1),tab(any(wd))) | (move(-1),
         (tab(any(wd)),any(nonwd)) | (tab(any(nonwd)),any(wd))),"")
end


procedure Re_NonWordBoundary(wd,nonwd)
#
#  Match non-word-boundary (\B)
#
   suspend ((pos(1),any(nonwd)) | (pos(0),move(-1),tab(any(nonwd))) | (move(-1),
         (tab(any(wd)),any(wd)) | (tab(any(nonwd)),any(nonwd)),""))
end


procedure Re_MatchParenGroup(n)
#
#  Match same string matched by previous parenthesized group (\N)
#
   local s
   suspend if s := \Re_ParenGroups[n] then =s else ""
end


###################  Control Operation Procedures  ###################


procedure Re_ArbNo(tok)
#
#  Match any number of times (*)
#
   suspend "" | (Re_tok_match(tok) & Re_ArbNo(tok))
end


procedure Re_OneOrMore(tok)
#
#  Match one or more times (+)
#
   suspend Re_tok_match(tok) & Re_ArbNo(tok)
end


procedure Re_NToMTimes(tok,n,m)
#
#  Match n to m times ({n,m}
#
   suspend Re_NTimes(tok,n) & Re_ArbNo(tok)\(m - n + 1)
end


procedure Re_NOrMoreTimes(tok,n)
#
#  Match n or more times ({n,})
#
   suspend Re_NTimes(tok,n) & Re_ArbNo(tok)
end


procedure Re_NTimes(tok,n)
#
#  Match exactly n times ({n})
#
   if n > 0 then
      suspend Re_tok_match(tok) & Re_NTimes(tok,n - 1)
   else suspend
end


procedure Re_ZeroOrOneTimes(tok)
#
#  Match zero or one times (?)
#
   suspend "" | Re_tok_match(tok)
end


procedure Re_Alt(tokList1,tokList2)
#
#  Alternation (|)
#
   suspend Re_match1(tokList1 | tokList2,1)
end


###################  Case Independence Procedures  ###################


link noncase

global Re__find,Re__match,Re__any,Re__many,Re__upto,Re__tabmatch

procedure ReCaseIndependent()
  Re__find := c_find
  Re__match := c_match
  Re__any := c_any
  Re__many := c_many
  Re__upto := c_upto
  Re__tabmatch := Re_c_tabmatch
  return
end

procedure ReCaseDependent()
  Re__find := find
  Re__match := match
  Re__any := any
  Re__many := many
  Re__upto := upto
  Re__tabmatch := proc("=",1)
  return
end

procedure Re_c_tabmatch(s)
  suspend tab(c_match(s))
end