1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
package syntax
import (
"bytes"
"strconv"
)
// Compiled program.
// May not belong in this package, but convenient for now.
// A Prog is a compiled regular expression program.
type Prog struct {
Inst []Inst
Start int // index of start instruction
NumCap int // number of InstCapture insts in re
}
// An InstOp is an instruction opcode.
type InstOp uint8
const (
InstAlt InstOp = iota
InstAltMatch
InstCapture
InstEmptyWidth
InstMatch
InstFail
InstNop
InstRune
)
// An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8
const (
EmptyBeginLine EmptyOp = 1 << iota
EmptyEndLine
EmptyBeginText
EmptyEndText
EmptyWordBoundary
EmptyNoWordBoundary
)
// An Inst is a single instruction in a regular expression program.
type Inst struct {
Op InstOp
Out uint32 // all but InstMatch, InstFail
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
Rune []int
}
func (p *Prog) String() string {
var b bytes.Buffer
dumpProg(&b, p)
return b.String()
}
// skipNop follows any no-op or capturing instructions
// and returns the resulting pc.
func (p *Prog) skipNop(pc uint32) *Inst {
i := &p.Inst[pc]
for i.Op == InstNop || i.Op == InstCapture {
pc = i.Out
i = &p.Inst[pc]
}
return i
}
// Prefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool) {
i := p.skipNop(uint32(p.Start))
// Avoid allocation of buffer if prefix is empty.
if i.Op != InstRune || len(i.Rune) != 1 {
return "", i.Op == InstMatch
}
// Have prefix; gather characters.
var buf bytes.Buffer
for i.Op == InstRune && len(i.Rune) == 1 {
buf.WriteRune(i.Rune[0])
i = p.skipNop(i.Out)
}
return buf.String(), i.Op == InstMatch
}
// StartCond returns the leading empty-width conditions that must
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp {
var flag EmptyOp
pc := uint32(p.Start)
i := &p.Inst[pc]
Loop:
for {
switch i.Op {
case InstEmptyWidth:
flag |= EmptyOp(i.Arg)
case InstFail:
return ^EmptyOp(0)
case InstCapture, InstNop:
// skip
default:
break Loop
}
pc = i.Out
i = &p.Inst[pc]
}
return flag
}
// MatchRune returns true if the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
func (i *Inst) MatchRune(r int) bool {
rune := i.Rune
// Special case: single-rune slice is from literal string, not char class.
// TODO: Case folding.
if len(rune) == 1 {
return r == rune[0]
}
// Peek at the first few pairs.
// Should handle ASCII well.
for j := 0; j < len(rune) && j <= 8; j += 2 {
if r < rune[j] {
return false
}
if r <= rune[j+1] {
return true
}
}
// Otherwise binary search.
lo := 0
hi := len(rune) / 2
for lo < hi {
m := lo + (hi-lo)/2
if c := rune[2*m]; c <= r {
if r <= rune[2*m+1] {
return true
}
lo = m + 1
} else {
hi = m
}
}
return false
}
// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
// Since we act on runes, it would be easy to support Unicode here.
func wordRune(rune int) bool {
return rune == '_' ||
('A' <= rune && rune <= 'Z') ||
('a' <= rune && rune <= 'z') ||
('0' <= rune && rune <= '9')
}
// MatchEmptyWidth returns true if the instruction matches
// an empty string between the runes before and after.
// It should only be called when i.Op == InstEmptyWidth.
func (i *Inst) MatchEmptyWidth(before int, after int) bool {
switch EmptyOp(i.Arg) {
case EmptyBeginLine:
return before == '\n' || before == -1
case EmptyEndLine:
return after == '\n' || after == -1
case EmptyBeginText:
return before == -1
case EmptyEndText:
return after == -1
case EmptyWordBoundary:
return wordRune(before) != wordRune(after)
case EmptyNoWordBoundary:
return wordRune(before) == wordRune(after)
}
panic("unknown empty width arg")
}
func (i *Inst) String() string {
var b bytes.Buffer
dumpInst(&b, i)
return b.String()
}
func bw(b *bytes.Buffer, args ...string) {
for _, s := range args {
b.WriteString(s)
}
}
func dumpProg(b *bytes.Buffer, p *Prog) {
for j := range p.Inst {
i := &p.Inst[j]
pc := strconv.Itoa(j)
if len(pc) < 3 {
b.WriteString(" "[len(pc):])
}
if j == p.Start {
pc += "*"
}
bw(b, pc, "\t")
dumpInst(b, i)
bw(b, "\n")
}
}
func u32(i uint32) string {
return strconv.Uitoa64(uint64(i))
}
func dumpInst(b *bytes.Buffer, i *Inst) {
switch i.Op {
case InstAlt:
bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
case InstAltMatch:
bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
case InstCapture:
bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
case InstEmptyWidth:
bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
case InstMatch:
bw(b, "match")
case InstFail:
bw(b, "fail")
case InstNop:
bw(b, "nop -> ", u32(i.Out))
case InstRune:
if i.Rune == nil {
// shouldn't happen
bw(b, "rune <nil>")
}
bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
}
}
|