############################################################################
#
#	File:     weblinks.icn
#
#	Subject:  Program to check links in HTML files
#
#	Author:   Gregg M. Townsend
#
#	Date:     March 29, 2006
#
############################################################################
#
#	This file is in the public domain.
#
############################################################################
#
#	Weblinks is a program for checking links in a collection of HTML
#	files.  It is designed for use directly on the file structure
#	containing the HTML files.
#
#	Given one or more starting points, weblinks parses each file and
#	validates the HTTP: and FILE: links it finds.  Errors are reported
#	on standard output.  FILE: links, including relative links, can be
#	followed recursively.
#
############################################################################
#
#	By design, only local files are scanned.  Only an existence check is
#	performed for HTTP: links.  Validation of HTTP: links is aided by
#	caching and subject to speed limits; see "vhttp.icn" for details.
#
#	Remote links are checked by sending an HTTP "HEAD" request. 
#	Unfortunately, some sites respond with "Server Error" or even with
#	snide remarks like "Because I felt like it".  These are reported
#	as errors and must be inspected manually.
#
#	NOTE:  if the environment variable USER is set, as it usually is,
#	then "From: $USER@hostname" is sent as part of each remote inquiry
#	in order to identify the source.  This is standard etiquette for
#	automated checkers.
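#
#	A remote inquiry might look like the following sketch (the exact
#	request is composed by vhttp.icn; the address shown is hypothetical):
#
#		HEAD /icon/index.htm HTTP/1.0
#		From: gmt@host.example.edu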
#
#	Limitations:
#	   url(...) links within embedded stylesheets are not recognized.
#	   FTP:, MAILTO:, and other link types are not validated.
#	   Files are checked recursively only if named *.htm*.
#	   Proper file permission (for web export) is not checked.
#
#	The common error of failing to put a trailing slash on a directory
#	specification results in a "453 Is A Directory" error message for a
#	local file or, typically, a "301 Moved Permanently" message for a
#	remote file.
#
############################################################################
#
#	usage:   weblinks [options] file...
#
#	-R	follow file links recursively
#		(http links are never followed recursively)
#
#	-t	trace files as visited
#
#	-s	report successes as well as problems
#
#	-v	report tracing and successes, if selected, more verbosely
#
#	-i	invert output (sort by referencing page, not by status)
#
#	-r root
#		specify starting point for file names beginning with "/"
#		(e.g. -r /cs/www).  This is needed if such references are
#		to be followed or checked.  If a root is specified, it
#		affects all file specifications including those on the
#		command line.
#
#	-h home
#		specify starting point for file names beginning with "/~".
#
#	-p prefix[,prefix...]
#		prune (don't check) files beginning with given prefix
#
#	-b prefix
#		specify bounds for files scanned:  do not scan files
#		that do not begin with prefix.  The default bounds are
#		the directory of the last file named.  For example,
#			weblinks /foo/bar /foo/baz
#		implies "-b /foo/".
#
#	If the environment variable WEBLINKS_INIT is set, its whitespace-
#	separated words are prepended to the explicit command argument list.
#
############################################################################
#
#	Examples (all assuming a web area rooted at /cs/www)
#
#		To check one new page:
#		weblinks -r /cs/www  /icon/books.htm
#
#		To check a personal hierarchy, with tracing:
#		setenv WEBLINKS_INIT "-r /cs/www -h /cs/www/people"
#		weblinks -R -t /~gmt/
#
#		To check with pruning:
#		weblinks -R -t -r /cs/www -p /icon/library /icon/index.htm
#
############################################################################
#
#  Links:  options, strings, html, vhttp
#
############################################################################
#
#  Requires:  Unix, dynamic loading
#
############################################################################


#  to do:
#	add -u option (report unchecked URLs); -s should imply -u
#	provide way to ask for warnings about (e.g.) /http/html paths
#	provide way to specify translation from http://www... into file: /...
#	provide way to specify translation from ftp:... into file: /...
#	provide depth limit control
#	allow longer history persistence
#	history is clumsy -- hard to recheck a connection that failed
#	  add option to retry failed entries (but believe cached successes)


$define URLCOLS 56	# number of columns allotted for tracing URLs
$define STATCOLS 22	# number of columns allotted for status messages

link options
link strings
link html
link vhttp


global root
global home
global prune
global bounds

global invert
global recurse
global trace
global verbose
global successes

global todo, done, nscanned
global refto, reffrom


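#  main(args) -- process options and starting files, then check queued URLs
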
procedure main(args)
   local opts, url, tmp

   # initialize data structures

   prune := list()
   todo := list()
   done := table()
   refto := table()
   reffrom := table()
   nscanned := 0

   # add arguments from the environment to the command line

   tmp := list()
   every put(tmp, words(getenv("WEBLINKS_INIT")))
   while push(args, pull(tmp))

   # process command line

   opts := options(args, "b:p:r:h:iRstv")
   invert := opts["i"]
   recurse := opts["R"]
   successes := opts["s"]
   trace := opts["t"]
   verbose := opts["v"]

   if *args = 0 then
      stop("usage: ", &progname, " [options] file ...")

   setroot(\opts["r"] | "/")
   sethome(\opts["h"] | "/usr/")
   setbounds(\opts["b"] | urlmerge(args[-1], ""))
   every setprune(words(\opts["p"], ' ,'))
   setfrom()

   register("initial:")
   register("implicit:")
   every addref("initial:", urlmerge("file:", !args))

   wheader()

   while url := get(todo) do
      try(url)

   if \trace then
      write()

   report()
end

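#  setroot(s) -- set the root directory for "/" paths, ensuring a trailing slash
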
procedure setroot(s)
   if s[-1] ~== "/" then
      s ||:= "/"
   root := s
   return
end

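#  sethome(s) -- set the home directory for "/~" paths, ensuring a trailing slash
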
procedure sethome(s)
   if s[-1] ~== "/" then
      s ||:= "/"
   home := s
   return
end

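#  setprune(s) -- add s to the list of prefixes that are pruned (not checked)
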
procedure setprune(s)
   put(prune, s)
   return
end

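#  setbounds(s) -- set the prefix that bounds which files are scanned
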
procedure setbounds(s)
   bounds := s
   return
end

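#  setfrom() -- set vhttp_from to "user@host", if $USER and the host name are available
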
procedure setfrom()
   local user, host, f

   user := getenv("USER")	| fail
   *user > 0			| fail
   f := open("uname -n", "rp")	| fail
   host := read(f)
   close(f)
   *\host > 0			| fail
   vhttp_from := user || "@" || host
   return
end


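#  wheader() -- write the report header, echoing the configuration
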
procedure wheader()
   write("From:\t", \vhttp_from | "[none]")
   write("root:\t", root)
   write("home:\t", home)
   write("bounds:\t", bounds)
   every write("start:\t", (!todo)[6:0])
   every write("prune:\t", !prune)
   write()
   return
end

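#  try(url) -- check url, unless already done, recording and tracing the result
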
procedure try(url)
   local result

   (/done[url] := "[processing]") | return	# return if already checked

   if \trace then {
      writes(pad(url, URLCOLS))
      flush(&output)
      }

   result := check(url)
   done[url] := result

   if \trace then
      write("  ", result)
   return
end


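#  check(url) -- validate one URL, scanning local HTML files for further
#  references; returns a status string such as "252 File Scanned"
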
procedure check(url)
   local protocol, fspec, fname, f, s, ref, base

   url ? {
      protocol := map(tab(upto(':'))) | ""
      =":"
      fspec := tab(0)
   }

   if protocol == "http" then
      return vhttp(url) | "451 Illegal URL"

   if protocol ~== "file" then
      return "152 Not Checked"

   fspec ? {
      if ="/~" then
         fname := home || tab(0)
      else if ="/" then
         fname := root || tab(0)
      else if pos(0) then
         fname := "./"
      else
         fname := fspec
      }

   if fname[-1] == "/" then {
      if (close(open(fname || "index.html"))) then {
         addref("implicit:", url || "index.html")
         return "154 Found index.html"
         }
      if (close(open(fname || "index.htm"))) then {
         addref("implicit:", url || "index.htm")
         return "155 Found index.htm"
         }
      if (close(open(fname || "."))) then
         return "153 Found Directory"
      }

   if not (f := open(fname)) then
      return "452 Cannot Open"

   if (/recurse & not member(reffrom["initial:"], url)) |
         (fspec ? (not match(bounds)) | match(!prune)) |
         (not find(".htm", map(url))) then {
      close(f)
      if close(open(fname || "/.")) then
         return "453 Is A Directory"
      else
         return "251 File Exists"
      }

   base := url
   every s := htrefs(f) do s ? {
      if ="BASE HREF " then {
         base := tab(0)
         }
      else {
         tab(upto(' ') + 1)
         tab(upto(' ') + 1)
         ref := urlmerge(base, tab(0))
         addref(url, ref)
         }
      if \verbose then
         writes("\n   references: ", ref)
      }
   if \verbose then
      writes("\n", repl(" ", URLCOLS))

   close(f)
   nscanned +:= 1
   return "252 File Scanned"
end

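#  report() -- write the final report, sorted by status, or with -i by
#  referencing page
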
procedure report()
   local l, url, stat, s, t, u

   l := sort(done, 4)
   t := table()
   while (url := get(l)) & (stat := get(l)) do {
      if \successes | (any('3456789', stat) & stat ~== "302 Found") then {
         s := pad(stat || ":", STATCOLS) || "  " || url
         if \invert then
            every u := !refto[url] do
               put(\t[u] | (t[u] := []), s)
         else {
            write(s)
            if \verbose | any('3456789', stat) then
               every write("   referenced by:\t", !sort(refto[url]))
            }
         }
      }

   if \invert then {
      l := sort(t, 3)
      while (url := get(l)) & (stat := get(l)) do {
         write(url)
         every write("   ", !stat)
         }
      }

   write()

   if nscanned = 1 then
      write("1 file scanned")
   else
      write(nscanned, " files scanned")

   if *done = 1 then
      write("1 reference checked")
   else
      write(*done, " references checked")

   return
end

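#  addref(src, dst) -- record a reference from src to dst (any #fragment
#  stripped), queuing dst if not yet checked
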
procedure addref(src, dst)
   dst := (dst ? tab(upto('#') | 0))
   register(dst)
   insert(refto[dst], src)
   insert(reffrom[src], dst)
   if /done[dst] then
      put(todo, dst)
   return
end

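#  register(url) -- ensure reference sets exist for url
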
procedure register(url)
   /refto[url] := set()
   /reffrom[url] := set()
   return
end



#  pad(s, n) -- pad string to length n, never truncating

procedure pad(s, n)
   if *s < n then
      return left(s, n)
   else
      return s
end