ipl/procs/html.icn


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334

############################################################################
#
#	File:     html.icn
#
#	Subject:  Procedures for parsing HTML
#
#	Author:   Gregg M. Townsend
#
#	Date:     April 26, 2005
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#	These procedures parse HTML files:
#
#	htchunks(f)	generates the basic chunks -- tags and text --
#			that compose an HTML file.
#
#	htrefs(f)	generates the tagname/keyword/value combinations
#			that reference other files.
#
#	These procedures process strings from HTML files:
#
#	httag(s)	extracts the name of a tag.
#
#	htvals(s)	generates the keyword/value pairs from a tag.
#
#	urlmerge(base,new) interprets a new URL in the context of a base.
#
#       canpath(s)	puts a path in canonical form
#
############################################################################
#
#   	htchunks(f) generates the HTML chunks from file f.
#	It returns strings beginning with
#
#		<!--	for unclosed comments (legal comments are deleted)
#		<	for tags (will end with ">" unless unclosed at EOF)
#	anything else	for text
#
#	At this level entities such as &amp are left unprocessed and all
#	whitespace is preserved, including newlines.
#
############################################################################
#
#	htrefs(f) extracts file/url references from within an HTML file
#	and generates a string of the form
#		tagname keyword value
#   	for each reference.
#
#	A single space character separates the three fields, but if no
#	value is supplied for the keyword, no space follows the keyword.
#	Tag and keyword names are always returned in upper case.
#
#	Quotation marks are stripped from the value, but note that the
#	value can contain spaces or other special characters (although
#	by strict HTML rules it probably shouldn't).
#
#       A table in the code determines which fields are references to
#	other files.  For example, with <IMG>, SRC= is a reference but
#	WIDTH= is not.  The table is based on the HTML 4.0 standard:
#		http://www.w3.org/TR/REC-html40/
#
############################################################################
#
#	httag(s) extracts and returns the tag name from within an HTML
#	tag string of the form "<tagname...>".   The tag name is returned
#	in upper case.
#
############################################################################
#
#	htvals(s) generates the tag values contained within an HTML tag
#	string of the form "<tagname kw=val kw=val ...>".   For each
#	keyword=value pair beyond the tagname, a string of the form
#
#		keyword value
#
#	is generated.  One space follows the keyword, which is returned
#	in upper case, and quotation marks are stripped from the value.
#	The value itself can be an empty string.
#
#	For each keyword given without a value, the keyword is generated
#	in upper case with no following space.
#
#	Parsing is somewhat tolerant of errors.
#
############################################################################
#
#	urlmerge(base,new) interprets a full or partial new URL in the
#	context of a base URL, returning the combined URL.
#
#	Here are some examples of applying urlmerge() with a base value
#	of "http://www.vcu.edu/misc/sched.html" and a new value as given:
#
#	new		result
#	-------------	-------------------
#	#tuesday	http://www.vcu.edu/misc/sched.html#tuesday
#	bulletin.html	http://www.vcu.edu/misc/bulletin.html
#	./results.html	http://www.vcu.edu/misc/results.html
#	images/rs.gif	http://www.vcu.edu/misc/images/rs.gif
#	../		http://www.vcu.edu/
#	/greet.html	http://www.vcu.edu/greet.html
#	file:a.html	file:a.html
#
############################################################################
#
#	canpath(s) returns the canonical form of a file path by squeezing
#	out components such as "./" and "dir/../".
#
############################################################################


#   htchunks(f) -- generate HTML chunks from file f

procedure htchunks(f)			#: generate chunks of HTML file
   local prev, line, s

   "" ? repeat {

      if pos(0) then
         &subject := (read(f) || "\n") | fail

      if ="<!--" then
         suspend htc_comment(f)		# fails if comment is legal
      else if ="<" then
         suspend htc_tag(f)		# generate tag
      else
         suspend htc_text(f)		# generate text chunk

      }
end

procedure htc_tag(f)
   local s

   s := "<"
   repeat {
      if s ||:= tab(upto('>') + 1) then
         return s			# completed tag
      s ||:= tab(0)
      &subject := (read(f) || "\n") | break
      }
   return s				# unclosed tag
end

procedure htc_comment(f)
   local s

   s := ""
   repeat {
      if s ||:= tab(find("-->") + 3) then
         fail				# normal case: discard comment
      s ||:= tab(0)
      &subject := (read(f) || "\n") | break
      }

   &subject := s			# rescan unclosed comment
   return "<!--"			# return error indicator
end

procedure htc_text(f)
   local s

   s := ""
   repeat {
      if s ||:= tab(upto('<')) then
         return s
      s ||:= tab(0)
      &subject := (read(f) || "\n") | return s
      }
end


##  htrefs(f) -- generate references from HTML file f

procedure htrefs(f)			#: generate references from HTML file
   local tag, tagname, kwset, s
   static ttable
   initial {
      ttable := table()
      ttable["A"]	:= set(["HREF"])
      ttable["APPLET"]	:= set(["CODEBASE"])
      ttable["AREA"]	:= set(["HREF"])
      ttable["BASE"]	:= set(["HREF"])
      ttable["BLOCKQUOTE"] := set(["CITE"])
      ttable["BODY"]	:= set(["BACKGROUND"])
      ttable["DEL"]	:= set(["CITE"])
      ttable["FORM"]	:= set(["ACTION"])
      ttable["FRAME"]	:= set(["SRC", "LONGDESC"])
      ttable["HEAD"]	:= set(["PROFILE"])
      ttable["IFRAME"]	:= set(["SRC", "LONGDESC"])
      ttable["IMG"]	:= set(["SRC", "LONGDESC", "USEMAP"])
      ttable["INPUT"]	:= set(["SRC", "USEMAP"])
      ttable["INS"]	:= set(["CITE"])
      ttable["LINK"]	:= set(["HREF"])
      ttable["OBJECT"]	:= set(["CLASSID","CODEBASE","DATA","ARCHIVE","USEMAP"])
      ttable["Q"]	:= set(["CITE"])
      ttable["SCRIPT"]	:= set(["SRC", "FOR"])
      }

   every tag := htchunks(f) do {
      tagname := httag(tag) | next
      kwset := \ttable[tagname] | next
      every s := htvals(tag) do
         if member(kwset, s ? tab(upto(' '))) then
            suspend tagname || " " || s
      }
end


##  httag(s) -- return the name of the HTML tag s

procedure httag(s)			#: extract name of HTML tag
   static idset, wset, lcase, ucase
   initial {
      idset := &letters ++ &digits ++ '.-/'
      wset := ' \t\r\n\v\f'
      lcase := string(&lcase)
      ucase := string(&ucase)
   }

   s ? {
      ="<" | fail
      tab(many(wset))
      return map(tab(many(idset)), lcase, ucase)
   }
end


##  htvals(s) -- generate tag values of HTML tag s

procedure htvals(s)			#: generate values in HTML tag
   local kw
   static idset, wset, qset, lcase, ucase
   initial {
      idset := &letters ++ &digits ++ '.-/'
      wset := ' \t\r\n\v\f'
      qset := wset ++ '>'
      lcase := string(&lcase)
      ucase := string(&ucase)
   }

   s ? {
      ="<" | fail
      tab(many(wset))
      tab(many(idset)) | fail		# no name
      repeat {
         tab(upto(idset)) | fail
         kw := map(tab(many(idset)), lcase, ucase)
         tab(many(wset))
         if ="=" then {
            tab(many(wset))
            kw ||:= " "
            if ="\"" then {
               kw ||:= tab(upto('"') | 0)
               tab(any('"'))
               }
            else if ="'" then {
               kw ||:= tab(upto('\'') | 0)
               tab(any('\''))
               }
            else
               kw ||:= tab(upto(qset) | 0)
            }
         suspend kw
      }
   }
end


#  urlmerge(base,new) -- merge URLs

procedure urlmerge(base, new)		#: merge URLs
   local protocol, host, path
   static notslash
   initial notslash := ~'/'

   if new ? (tab(many(&letters)) & =":") then
      return new			# new is fully specified

   base ? {
      protocol := (tab(many(&letters)) || =":") | ""
      host := (="//" || tab(upto('/') | 0)) | ""
      path := tab(upto('#') | 0)
      }

   new ? {
      if ="#" then
         return protocol || host || path || new
      if ="/" then
         return protocol || host || new
   }

   path := trim(path, notslash) || new

   return protocol || host || canpath(path)
end


#  canpath(path) -- return canonical version of path
#
#  This is similar to step 6 of section 4 of RFC 1808.

procedure canpath(path)			#: put path in canonical form
   static notslash
   initial notslash := ~'/'

   # change "/./" to "/"
   while path ?:= 1(tab(find("/./")), move(2)) || tab(0)

   # change "//" to "/"
   while path ?:= tab(find("//") + 1) || (tab(many('/')) & tab(0))

   # remove "dir/../"
   while path ?:=
      (tab(1 | (upto('/') + 1))) || 
         ((tab(many(notslash)) ~== "..") & ="/../" & tab(0))
   
   # remove leading "./"
   while path ?:= (="./" & tab(0))

   # remove trailing "."
   path ?:= if tab(-2) & ="/." then path[1:-1]
   path ?:= if ="." & pos(0) then ""

   return path
end