1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
############################################################################
#
# File: fuzz.icn
#
# Subject: Program to perform fuzzy pattern matching
#
# Author: Alex Cecil
#
# Date: November 10, 1993
#
############################################################################
#
# This file is in the public domain.
#
############################################################################
#
# This program illustrates "fuzzy" string pattern matching. The result
# of matching s and t is a number between 0 and 1 which is based on
# counting matching pairs of characters in increasingly long substrings
# of s and t. Characters may be weighted differently, and the reverse
# tally may be given a negative bias.
#
############################################################################
global bias, rank_list_max, weight1, weight2, weight_set, which_fuzz_value
procedure main()
local alphanum, in_id, in_name, in_record, rank_list,
start_time, word_requested
bias := -2 # Reduce importance of reverse match
rank_list_max := 15 # Number of best matches to write
weight1 := 6 # Weight of chars not in weight_set
weight2 := 2 # Weight of chars in weight_set
weight_set := 'aehiouwy' # Soundex ignore list
write("The ",rank_list_max,
" best matches for the first word in each line will be written.")
writes("\nName of input file: "); in_name := read()
in_id := (open(in_name,"r")) | (stop("Can't open file ",in_name))
writes("\nWord to search for: ")
word_requested := map(read())
writes("\nWhich function: Simple, Optimized, Weighted (1,2,3): ")
which_fuzz_value := case read() of {
"1" : fuzz_value_1 # Simple, "obvious" implementation
"2" : fuzz_value_2 # Simple, linearized for speed
default : fuzz_value_3 # Weights and bias included
}
write("\nSearching for \"",word_requested,"\" in file ",in_name)
start_time := &time
alphanum := &letters ++ &digits
rank_list := [] # [[fuzz-value,in-record],...]
while in_record := read(in_id) do {
in_record ? {
tab(upto(alphanum))
rank(word_requested,map(tab(many(alphanum))),in_record,
rank_list,rank_list_max)
}
}
write("\nFuzz Value of first word\n | Input Record...")
every rank := !rank_list do {
write(left(string(rank[1]),5)," ",left(rank[2],72))
}
write("\nElapsed time in milliseconds: ",&time - start_time)
end
procedure rank(s,t,r,rl,rm)
# Maintain a sorted list (rl) of the rm best Fuzz values with records (r).
# Special cases to save time: strings are the same; or s and t have fewer
# than about 50% characters in common.
local i, v
if s == t then v := 1.0
else if *(s ** t) * 4 <= (*s + *t) then v := 0.0
else v := which_fuzz_value(s,t,weight1,weight2,weight_set,bias)
# 3rd-last args needed by fuzz_value_3
if *rl = 0 then put(rl,[v,r]) # First entry in list
else if v >= rl[*rl][1] then { # If value greater than least in list...
put(rl,[v,r]) # add to list, sort, and trim
every i := *rl to 2 by -1 do {
if rl[i][1] > rl[i-1][1] then rl[i] :=: rl[i-1]
}
if *rl > rm then pull(rl)
}
end
procedure fuzz_value_1(s,t)
# Calculate Fuzz Value of s and t with weight=1 and bias=0
# Simple, non-optomized algorithm.
if *s > *t then s :=: t
return 2.0 * (fuzz_match_1(s,t) + fuzz_match_1(reverse(s),reverse(t)))/
((*s * (*s+1)) + (*t * (*t+1)))
end
procedure fuzz_match_1(s,ti)
# Calculate the Fuzz Matches between s and t. Simple algorithm.
# ASCII NUL is used to mark matched pairs, so can't be used in strings
local i, imax, jmax, m, t, tsdif
tsdif := *ti - *s
m := 0
every imax := 1 to *s do {
t := ti
jmax := imax + tsdif + 1
every i := 1 to imax do
if t[find(s[i],t,1,jmax)] := "\0" then m +:= 1
}
return m
end
procedure fuzz_value_2(s,t)
# Calculate Fuzz Value with weight=1 and bias=0
# Optomized version.
if *s > *t then s :=: t
return 2.0 * (fuzz_match_2(s,t) + fuzz_match_2(reverse(s),reverse(t)))/
((*s * (*s+1)) + (*t * (*t+1)))
end
procedure fuzz_match_2(s,t)
# Calculate the Fuzz Matches between s and t.
# Replace column loop by imperical calculation.
# ASCII NUL is used to mark matched pairs, so can't be used in s or t.
# s(ip) is ith char from right, similarly for t(jp)
local ip, j, jmp, jp, m, si
ip := *s
jmp := *t + 1
m := 0
every si := !s do {
if t[j := find(si,t)] := "\0" then {
jp := jmp - j
m +:= (ip <= jp | ip) - abs(ip - jp) # max column minus column offset
}
ip -:= 1
}
return m
end
procedure fuzz_value_3(s,t,w1,w2,w2c,b,c)
# Calculate Fuzz Value with weight w2 if in cset w2c, else weight w1; bias b.
if *s > *t then s :=: t
return 2.0 * (fuzz_match_3(s,t,w1,w2,w2c) +
fuzz_match_3(reverse(s),reverse(t),w1+b,w2+b,w2c)) /
(fuzz_self_3(s,w1+w1+b,w2+w2+b,w2c) + fuzz_self_3(t,w1+w1+b,w2+w2+b,w2c))
end
procedure fuzz_match_3(s,t,w1,w2,w2c)
# Calculate the Fuzz Matches between s and t.
# Replace column loop by imperical calculation.
# ASCII NUL is used to mark matched pairs, so can't be used in s or t.
# s(ip) is ith char from right, similarly for t(jp)
local ip, j, jmp, jp, m, mo, si
ip := *s
jmp := *t + 1
m := 0
every si := !s do {
if t[j := find(si,t)] := "\0" then {
jp := jmp - j
mo := (ip <= jp | ip) - abs(ip - jp) # max column minus column offset
m +:= (any(w2c,si) & (w2 * mo)) | (w1 * mo)
}
ip -:= 1
}
return m
end
procedure fuzz_self_3(s,w1fr,w2fr,w2c)
# fuzz matches of s with s
# w1fr, w2fr: forward plus reverse weights.
local ip, m, si
ip := *s
m := 0
every si := !s do {
m +:= (any(w2c,si) & (w2fr * ip)) | (w1fr * ip)
ip -:= 1
}
return m
end
|