1 files changed, 334 insertions, 0 deletions
diff --git a/ipl/procs/html.icn b/ipl/procs/html.icn
new file mode 100644
index 0000000..50f7086
--- /dev/null
+++ b/ipl/procs/html.icn
@@ -0,0 +1,334 @@
+############################################################################
+#
+#	File:     html.icn
+#
+#	Subject:  Procedures for parsing HTML
+#
+#	Author:   Gregg M. Townsend
+#
+#	Date:     April 26, 2005
+#
+############################################################################
+#
+#   This file is in the public domain.
+#
+############################################################################
+#
+#	These procedures parse HTML files:
+#
+#	htchunks(f)	generates the basic chunks -- tags and text --
+#			that compose an HTML file.
+#
+#	htrefs(f)	generates the tagname/keyword/value combinations
+#			that reference other files.
+#
+#	These procedures process strings from HTML files:
+#
+#	httag(s)	extracts the name of a tag.
+#
+#	htvals(s)	generates the keyword/value pairs from a tag.
+#
+#	urlmerge(base,new) interprets a new URL in the context of a base.
+#
+#       canpath(s)	puts a path in canonical form
+#
+############################################################################
+#
+#   	htchunks(f) generates the HTML chunks from file f.
+#	It returns strings beginning with
+#
+#		<!--	for unclosed comments (legal comments are deleted)
+#		<	for tags (will end with ">" unless unclosed at EOF)
+#	anything else	for text
+#
+#	At this level entities such as &amp are left unprocessed and all
+#	whitespace is preserved, including newlines.
+#
+############################################################################
+#
+#	htrefs(f) extracts file/url references from within an HTML file
+#	and generates a string of the form
+#		tagname keyword value
+#   	for each reference.
+#
+#	A single space character separates the three fields, but if no
+#	value is supplied for the keyword, no space follows the keyword.
+#	Tag and keyword names are always returned in upper case.
+#
+#	Quotation marks are stripped from the value, but note that the
+#	value can contain spaces or other special characters (although
+#	by strict HTML rules it probably shouldn't).
+#
+#       A table in the code determines which fields are references to
+#	other files.  For example, with <IMG>, SRC= is a reference but
+#	WIDTH= is not.  The table is based on the HTML 4.0 standard:
+#		http://www.w3.org/TR/REC-html40/
+#
+############################################################################
+#
+#	httag(s) extracts and returns the tag name from within an HTML
+#	tag string of the form "<tagname...>".   The tag name is returned
+#	in upper case.
+#
+############################################################################
+#
+#	htvals(s) generates the tag values contained within an HTML tag
+#	string of the form "<tagname kw=val kw=val ...>".   For each
+#	keyword=value pair beyond the tagname, a string of the form
+#
+#		keyword value
+#
+#	is generated.  One space follows the keyword, which is returned
+#	in upper case, and quotation marks are stripped from the value.
+#	The value itself can be an empty string.
+#
+#	For each keyword given without a value, the keyword is generated
+#	in upper case with no following space.
+#
+#	Parsing is somewhat tolerant of errors.
+#
+############################################################################
+#
+#	urlmerge(base,new) interprets a full or partial new URL in the
+#	context of a base URL, returning the combined URL.
+#
+#	Here are some examples of applying urlmerge() with a base value
+#	of "http://www.vcu.edu/misc/sched.html" and a new value as given:
+#
+#	new		result
+#	-------------	-------------------
+#	#tuesday	http://www.vcu.edu/misc/sched.html#tuesday
+#	bulletin.html	http://www.vcu.edu/misc/bulletin.html
+#	./results.html	http://www.vcu.edu/misc/results.html
+#	images/rs.gif	http://www.vcu.edu/misc/images/rs.gif
+#	../		http://www.vcu.edu/
+#	/greet.html	http://www.vcu.edu/greet.html
+#	file:a.html	file:a.html
+#
+############################################################################
+#
+#	canpath(s) returns the canonical form of a file path by squeezing
+#	out components such as "./" and "dir/../".
+#
+############################################################################
+
+
+#   htchunks(f) -- generate HTML chunks from file f
+
+procedure htchunks(f)			#: generate chunks of HTML file
+   local prev, line, s
+
+   "" ? repeat {
+
+      if pos(0) then
+         &subject := (read(f) || "\n") | fail
+
+      if ="<!--" then
+         suspend htc_comment(f)		# fails if comment is legal
+      else if ="<" then
+         suspend htc_tag(f)		# generate tag
+      else
+         suspend htc_text(f)		# generate text chunk
+
+      }
+end
+
+procedure htc_tag(f)
+   local s
+
+   s := "<"
+   repeat {
+      if s ||:= tab(upto('>') + 1) then
+         return s			# completed tag
+      s ||:= tab(0)
+      &subject := (read(f) || "\n") | break
+      }
+   return s				# unclosed tag
+end
+
+procedure htc_comment(f)
+   local s
+
+   s := ""
+   repeat {
+      if s ||:= tab(find("-->") + 3) then
+         fail				# normal case: discard comment
+      s ||:= tab(0)
+      &subject := (read(f) || "\n") | break
+      }
+
+   &subject := s			# rescan unclosed comment
+   return "<!--"			# return error indicator
+end
+
+procedure htc_text(f)
+   local s
+
+   s := ""
+   repeat {
+      if s ||:= tab(upto('<')) then
+         return s
+      s ||:= tab(0)
+      &subject := (read(f) || "\n") | return s
+      }
+end
+
+
+##  htrefs(f) -- generate references from HTML file f
+
+procedure htrefs(f)			#: generate references from HTML file
+   local tag, tagname, kwset, s
+   static ttable
+   initial {
+      ttable := table()
+      ttable["A"]	:= set(["HREF"])
+      ttable["APPLET"]	:= set(["CODEBASE"])
+      ttable["AREA"]	:= set(["HREF"])
+      ttable["BASE"]	:= set(["HREF"])
+      ttable["BLOCKQUOTE"] := set(["CITE"])
+      ttable["BODY"]	:= set(["BACKGROUND"])
+      ttable["DEL"]	:= set(["CITE"])
+      ttable["FORM"]	:= set(["ACTION"])
+      ttable["FRAME"]	:= set(["SRC", "LONGDESC"])
+      ttable["HEAD"]	:= set(["PROFILE"])
+      ttable["IFRAME"]	:= set(["SRC", "LONGDESC"])
+      ttable["IMG"]	:= set(["SRC", "LONGDESC", "USEMAP"])
+      ttable["INPUT"]	:= set(["SRC", "USEMAP"])
+      ttable["INS"]	:= set(["CITE"])
+      ttable["LINK"]	:= set(["HREF"])
+      ttable["OBJECT"]	:= set(["CLASSID","CODEBASE","DATA","ARCHIVE","USEMAP"])
+      ttable["Q"]	:= set(["CITE"])
+      ttable["SCRIPT"]	:= set(["SRC", "FOR"])
+      }
+
+   every tag := htchunks(f) do {
+      tagname := httag(tag) | next
+      kwset := \ttable[tagname] | next
+      every s := htvals(tag) do
+         if member(kwset, s ? tab(upto(' '))) then
+            suspend tagname || " " || s
+      }
+end
+
+
+
+##  httag(s) -- return the name of the HTML tag s
+
+procedure httag(s)			#: extract name of HTML tag
+   static idset, wset, lcase, ucase
+   initial {
+      idset := &letters ++ &digits ++ '.-/'
+      wset := ' \t\r\n\v\f'
+      lcase := string(&lcase)
+      ucase := string(&ucase)
+   }
+
+   s ? {
+      ="<" | fail
+      tab(many(wset))
+      return map(tab(many(idset)), lcase, ucase)
+   }
+end
+
+
+
+##  htvals(s) -- generate tag values of HTML tag s
+
+procedure htvals(s)			#: generate values in HTML tag
+   local kw
+   static idset, wset, qset, lcase, ucase
+   initial {
+      idset := &letters ++ &digits ++ '.-/'
+      wset := ' \t\r\n\v\f'
+      qset := wset ++ '>'
+      lcase := string(&lcase)
+      ucase := string(&ucase)
+   }
+
+   s ? {
+      ="<" | fail
+      tab(many(wset))
+      tab(many(idset)) | fail		# no name
+      repeat {
+         tab(upto(idset)) | fail
+         kw := map(tab(many(idset)), lcase, ucase)
+         tab(many(wset))
+         if ="=" then {
+            tab(many(wset))
+            kw ||:= " "
+            if ="\"" then {
+               kw ||:= tab(upto('"') | 0)
+               tab(any('"'))
+               }
+            else if ="'" then {
+               kw ||:= tab(upto('\'') | 0)
+               tab(any('\''))
+               }
+            else
+               kw ||:= tab(upto(qset) | 0)
+            }
+         suspend kw
+      }
+   }
+end
+
+
+
+#  urlmerge(base,new) -- merge URLs
+
+procedure urlmerge(base, new)		#: merge URLs
+   local protocol, host, path
+   static notslash
+   initial notslash := ~'/'
+
+   if new ? (tab(many(&letters)) & =":") then
+      return new			# new is fully specified
+
+   base ? {
+      protocol := (tab(many(&letters)) || =":") | ""
+      host := (="//" || tab(upto('/') | 0)) | ""
+      path := tab(upto('#') | 0)
+      }
+
+   new ? {
+      if ="#" then
+         return protocol || host || path || new
+      if ="/" then
+         return protocol || host || new
+   }
+
+   path := trim(path, notslash) || new
+
+   return protocol || host || canpath(path)
+end
+
+
+
+#  canpath(path) -- return canonical version of path
+#
+#  This is similar to step 6 of section 4 of RFC 1808.
+
+procedure canpath(path)			#: put path in canonical form
+   static notslash
+   initial notslash := ~'/'
+
+   # change "/./" to "/"
+   while path ?:= 1(tab(find("/./")), move(2)) || tab(0)
+
+   # change "//" to "/"
+   while path ?:= tab(find("//") + 1) || (tab(many('/')) & tab(0))
+
+   # remove "dir/../"
+   while path ?:=
+      (tab(1 | (upto('/') + 1))) || 
+         ((tab(many(notslash)) ~== "..") & ="/../" & tab(0))
+   
+   # remove leading "./"
+   while path ?:= (="./" & tab(0))
+
+   # remove trailing "."
+   path ?:= if tab(-2) & ="/." then path[1:-1]
+   path ?:= if ="." & pos(0) then ""
+
+   return path
+end