summaryrefslogtreecommitdiff
path: root/ipl/progs/fuzz.icn
diff options
context:
space:
mode:
Diffstat (limited to 'ipl/progs/fuzz.icn')
-rw-r--r--ipl/progs/fuzz.icn179
1 files changed, 179 insertions, 0 deletions
diff --git a/ipl/progs/fuzz.icn b/ipl/progs/fuzz.icn
new file mode 100644
index 0000000..de81814
--- /dev/null
+++ b/ipl/progs/fuzz.icn
@@ -0,0 +1,179 @@
+############################################################################
+#
+# File: fuzz.icn
+#
+# Subject: Program to perform fuzzy pattern matching
+#
+# Author: Alex Cecil
+#
+# Date: November 10, 1993
+#
+############################################################################
+#
+# This file is in the public domain.
+#
+############################################################################
+#
+# This program illustrates "fuzzy" string pattern matching. The result
+# of matching s and t is a number between 0 and 1 which is based on
+# counting matching pairs of characters in increasingly long substrings
+# of s and t. Characters may be weighted differently, and the reverse
+# tally may be given a negative bias.
+#
+############################################################################
+
+
+global bias, rank_list_max, weight1, weight2, weight_set, which_fuzz_value
+
+procedure main()
+ local alphanum, in_id, in_name, in_record, rank_list,
+ start_time, word_requested
+
+ bias := -2 # Reduce importance of reverse match
+ rank_list_max := 15 # Number of best matches to write
+ weight1 := 6 # Weight of chars not in weight_set
+ weight2 := 2 # Weight of chars in weight_set
+ weight_set := 'aehiouwy' # Soundex ignore list
+
+ write("The ",rank_list_max,
+ " best matches for the first word in each line will be written.")
+ writes("\nName of input file: "); in_name := read()
+ in_id := (open(in_name,"r")) | (stop("Can't open file ",in_name))
+
+ writes("\nWord to search for: ")
+ word_requested := map(read())
+
+ writes("\nWhich function: Simple, Optimized, Weighted (1,2,3): ")
+ which_fuzz_value := case read() of {
+ "1" : fuzz_value_1 # Simple, "obvious" implementation
+ "2" : fuzz_value_2 # Simple, linearized for speed
+ default : fuzz_value_3 # Weights and bias included
+ }
+
+ write("\nSearching for \"",word_requested,"\" in file ",in_name)
+ start_time := &time
+ alphanum := &letters ++ &digits
+ rank_list := [] # [[fuzz-value,in-record],...]
+ while in_record := read(in_id) do {
+ in_record ? {
+ tab(upto(alphanum))
+ rank(word_requested,map(tab(many(alphanum))),in_record,
+ rank_list,rank_list_max)
+ }
+ }
+ write("\nFuzz Value of first word\n | Input Record...")
+ every rank := !rank_list do {
+ write(left(string(rank[1]),5)," ",left(rank[2],72))
+ }
+ write("\nElapsed time in milliseconds: ",&time - start_time)
+end
+
+procedure rank(s,t,r,rl,rm)
+# Maintain a sorted list (rl) of the rm best Fuzz values with records (r).
+# Special cases to save time: strings are the same; or s and t have fewer
+# than about 50% characters in common.
+ local i, v
+ if s == t then v := 1.0
+ else if *(s ** t) * 4 <= (*s + *t) then v := 0.0
+ else v := which_fuzz_value(s,t,weight1,weight2,weight_set,bias)
+ # 3rd-last args needed by fuzz_value_3
+ if *rl = 0 then put(rl,[v,r]) # First entry in list
+ else if v >= rl[*rl][1] then { # If value greater than least in list...
+ put(rl,[v,r]) # add to list, sort, and trim
+ every i := *rl to 2 by -1 do {
+ if rl[i][1] > rl[i-1][1] then rl[i] :=: rl[i-1]
+ }
+ if *rl > rm then pull(rl)
+ }
+end
+
+procedure fuzz_value_1(s,t)
+# Calculate Fuzz Value of s and t with weight=1 and bias=0
+# Simple, non-optomized algorithm.
+ if *s > *t then s :=: t
+ return 2.0 * (fuzz_match_1(s,t) + fuzz_match_1(reverse(s),reverse(t)))/
+ ((*s * (*s+1)) + (*t * (*t+1)))
+end
+
+procedure fuzz_match_1(s,ti)
+# Calculate the Fuzz Matches between s and t. Simple algorithm.
+# ASCII NUL is used to mark matched pairs, so can't be used in strings
+ local i, imax, jmax, m, t, tsdif
+ tsdif := *ti - *s
+ m := 0
+ every imax := 1 to *s do {
+ t := ti
+ jmax := imax + tsdif + 1
+ every i := 1 to imax do
+ if t[find(s[i],t,1,jmax)] := "\0" then m +:= 1
+ }
+ return m
+end
+
+procedure fuzz_value_2(s,t)
+# Calculate Fuzz Value with weight=1 and bias=0
+# Optomized version.
+ if *s > *t then s :=: t
+ return 2.0 * (fuzz_match_2(s,t) + fuzz_match_2(reverse(s),reverse(t)))/
+ ((*s * (*s+1)) + (*t * (*t+1)))
+end
+
+procedure fuzz_match_2(s,t)
+# Calculate the Fuzz Matches between s and t.
+# Replace column loop by imperical calculation.
+# ASCII NUL is used to mark matched pairs, so can't be used in s or t.
+# s(ip) is ith char from right, similarly for t(jp)
+ local ip, j, jmp, jp, m, si
+ ip := *s
+ jmp := *t + 1
+ m := 0
+ every si := !s do {
+ if t[j := find(si,t)] := "\0" then {
+ jp := jmp - j
+ m +:= (ip <= jp | ip) - abs(ip - jp) # max column minus column offset
+ }
+ ip -:= 1
+ }
+ return m
+end
+
+procedure fuzz_value_3(s,t,w1,w2,w2c,b,c)
+# Calculate Fuzz Value with weight w2 if in cset w2c, else weight w1; bias b.
+ if *s > *t then s :=: t
+ return 2.0 * (fuzz_match_3(s,t,w1,w2,w2c) +
+ fuzz_match_3(reverse(s),reverse(t),w1+b,w2+b,w2c)) /
+ (fuzz_self_3(s,w1+w1+b,w2+w2+b,w2c) + fuzz_self_3(t,w1+w1+b,w2+w2+b,w2c))
+end
+
+procedure fuzz_match_3(s,t,w1,w2,w2c)
+# Calculate the Fuzz Matches between s and t.
+# Replace column loop by imperical calculation.
+# ASCII NUL is used to mark matched pairs, so can't be used in s or t.
+# s(ip) is ith char from right, similarly for t(jp)
+ local ip, j, jmp, jp, m, mo, si
+ ip := *s
+ jmp := *t + 1
+ m := 0
+ every si := !s do {
+ if t[j := find(si,t)] := "\0" then {
+ jp := jmp - j
+ mo := (ip <= jp | ip) - abs(ip - jp) # max column minus column offset
+ m +:= (any(w2c,si) & (w2 * mo)) | (w1 * mo)
+ }
+ ip -:= 1
+ }
+ return m
+end
+
+procedure fuzz_self_3(s,w1fr,w2fr,w2c)
+# fuzz matches of s with s
+# w1fr, w2fr: forward plus reverse weights.
+ local ip, m, si
+ ip := *s
+ m := 0
+ every si := !s do {
+ m +:= (any(w2c,si) & (w2fr * ip)) | (w1fr * ip)
+ ip -:= 1
+ }
+ return m
+end