diff options
Diffstat (limited to 'ipl/procs/html.icn')
-rw-r--r-- | ipl/procs/html.icn | 334 |
1 files changed, 334 insertions, 0 deletions
diff --git a/ipl/procs/html.icn b/ipl/procs/html.icn new file mode 100644 index 0000000..50f7086 --- /dev/null +++ b/ipl/procs/html.icn @@ -0,0 +1,334 @@ +############################################################################ +# +# File: html.icn +# +# Subject: Procedures for parsing HTML +# +# Author: Gregg M. Townsend +# +# Date: April 26, 2005 +# +############################################################################ +# +# This file is in the public domain. +# +############################################################################ +# +# These procedures parse HTML files: +# +# htchunks(f) generates the basic chunks -- tags and text -- +# that compose an HTML file. +# +# htrefs(f) generates the tagname/keyword/value combinations +# that reference other files. +# +# These procedures process strings from HTML files: +# +# httag(s) extracts the name of a tag. +# +# htvals(s) generates the keyword/value pairs from a tag. +# +# urlmerge(base,new) interprets a new URL in the context of a base. +# +# canpath(s) puts a path in canonical form +# +############################################################################ +# +# htchunks(f) generates the HTML chunks from file f. +# It returns strings beginning with +# +# <!-- for unclosed comments (legal comments are deleted) +# < for tags (will end with ">" unless unclosed at EOF) +# anything else for text +# +# At this level entities such as & are left unprocessed and all +# whitespace is preserved, including newlines. +# +############################################################################ +# +# htrefs(f) extracts file/url references from within an HTML file +# and generates a string of the form +# tagname keyword value +# for each reference. +# +# A single space character separates the three fields, but if no +# value is supplied for the keyword, no space follows the keyword. +# Tag and keyword names are always returned in upper case. +# +# Quotation marks are stripped from the value, but note that the +# value can contain spaces or other special characters (although +# by strict HTML rules it probably shouldn't). +# +# A table in the code determines which fields are references to +# other files. For example, with <IMG>, SRC= is a reference but +# WIDTH= is not. The table is based on the HTML 4.0 standard: +# http://www.w3.org/TR/REC-html40/ +# +############################################################################ +# +# httag(s) extracts and returns the tag name from within an HTML +# tag string of the form "<tagname...>". The tag name is returned +# in upper case. +# +############################################################################ +# +# htvals(s) generates the tag values contained within an HTML tag +# string of the form "<tagname kw=val kw=val ...>". For each +# keyword=value pair beyond the tagname, a string of the form +# +# keyword value +# +# is generated. One space follows the keyword, which is returned +# in upper case, and quotation marks are stripped from the value. +# The value itself can be an empty string. +# +# For each keyword given without a value, the keyword is generated +# in upper case with no following space. +# +# Parsing is somewhat tolerant of errors. +# +############################################################################ +# +# urlmerge(base,new) interprets a full or partial new URL in the +# context of a base URL, returning the combined URL. +# +# Here are some examples of applying urlmerge() with a base value +# of "http://www.vcu.edu/misc/sched.html" and a new value as given: +# +# new result +# ------------- ------------------- +# #tuesday http://www.vcu.edu/misc/sched.html#tuesday +# bulletin.html http://www.vcu.edu/misc/bulletin.html +# ./results.html http://www.vcu.edu/misc/results.html +# images/rs.gif http://www.vcu.edu/misc/images/rs.gif +# ../ http://www.vcu.edu/ +# /greet.html http://www.vcu.edu/greet.html +# file:a.html file:a.html +# +############################################################################ +# +# canpath(s) returns the canonical form of a file path by squeezing +# out components such as "./" and "dir/../". +# +############################################################################ + + +# htchunks(f) -- generate HTML chunks from file f + +procedure htchunks(f) #: generate chunks of HTML file + local prev, line, s + + "" ? repeat { + + if pos(0) then + &subject := (read(f) || "\n") | fail + + if ="<!--" then + suspend htc_comment(f) # fails if comment is legal + else if ="<" then + suspend htc_tag(f) # generate tag + else + suspend htc_text(f) # generate text chunk + + } +end + +procedure htc_tag(f) + local s + + s := "<" + repeat { + if s ||:= tab(upto('>') + 1) then + return s # completed tag + s ||:= tab(0) + &subject := (read(f) || "\n") | break + } + return s # unclosed tag +end + +procedure htc_comment(f) + local s + + s := "" + repeat { + if s ||:= tab(find("-->") + 3) then + fail # normal case: discard comment + s ||:= tab(0) + &subject := (read(f) || "\n") | break + } + + &subject := s # rescan unclosed comment + return "<!--" # return error indicator +end + +procedure htc_text(f) + local s + + s := "" + repeat { + if s ||:= tab(upto('<')) then + return s + s ||:= tab(0) + &subject := (read(f) || "\n") | return s + } +end + + +## htrefs(f) -- generate references from HTML file f + +procedure htrefs(f) #: generate references from HTML file + local tag, tagname, kwset, s + static ttable + initial { + ttable := table() + ttable["A"] := set(["HREF"]) + ttable["APPLET"] := set(["CODEBASE"]) + ttable["AREA"] := set(["HREF"]) + ttable["BASE"] := set(["HREF"]) + ttable["BLOCKQUOTE"] := set(["CITE"]) + ttable["BODY"] := set(["BACKGROUND"]) + ttable["DEL"] := set(["CITE"]) + ttable["FORM"] := set(["ACTION"]) + ttable["FRAME"] := set(["SRC", "LONGDESC"]) + ttable["HEAD"] := set(["PROFILE"]) + ttable["IFRAME"] := set(["SRC", "LONGDESC"]) + ttable["IMG"] := set(["SRC", "LONGDESC", "USEMAP"]) + ttable["INPUT"] := set(["SRC", "USEMAP"]) + ttable["INS"] := set(["CITE"]) + ttable["LINK"] := set(["HREF"]) + ttable["OBJECT"] := set(["CLASSID","CODEBASE","DATA","ARCHIVE","USEMAP"]) + ttable["Q"] := set(["CITE"]) + ttable["SCRIPT"] := set(["SRC", "FOR"]) + } + + every tag := htchunks(f) do { + tagname := httag(tag) | next + kwset := \ttable[tagname] | next + every s := htvals(tag) do + if member(kwset, s ? tab(upto(' '))) then + suspend tagname || " " || s + } +end + + + +## httag(s) -- return the name of the HTML tag s + +procedure httag(s) #: extract name of HTML tag + static idset, wset, lcase, ucase + initial { + idset := &letters ++ &digits ++ '.-/' + wset := ' \t\r\n\v\f' + lcase := string(&lcase) + ucase := string(&ucase) + } + + s ? { + ="<" | fail + tab(many(wset)) + return map(tab(many(idset)), lcase, ucase) + } +end + + + +## htvals(s) -- generate tag values of HTML tag s + +procedure htvals(s) #: generate values in HTML tag + local kw + static idset, wset, qset, lcase, ucase + initial { + idset := &letters ++ &digits ++ '.-/' + wset := ' \t\r\n\v\f' + qset := wset ++ '>' + lcase := string(&lcase) + ucase := string(&ucase) + } + + s ? { + ="<" | fail + tab(many(wset)) + tab(many(idset)) | fail # no name + repeat { + tab(upto(idset)) | fail + kw := map(tab(many(idset)), lcase, ucase) + tab(many(wset)) + if ="=" then { + tab(many(wset)) + kw ||:= " " + if ="\"" then { + kw ||:= tab(upto('"') | 0) + tab(any('"')) + } + else if ="'" then { + kw ||:= tab(upto('\'') | 0) + tab(any('\'')) + } + else + kw ||:= tab(upto(qset) | 0) + } + suspend kw + } + } +end + + + +# urlmerge(base,new) -- merge URLs + +procedure urlmerge(base, new) #: merge URLs + local protocol, host, path + static notslash + initial notslash := ~'/' + + if new ? (tab(many(&letters)) & =":") then + return new # new is fully specified + + base ? { + protocol := (tab(many(&letters)) || =":") | "" + host := (="//" || tab(upto('/') | 0)) | "" + path := tab(upto('#') | 0) + } + + new ? { + if ="#" then + return protocol || host || path || new + if ="/" then + return protocol || host || new + } + + path := trim(path, notslash) || new + + return protocol || host || canpath(path) +end + + + +# canpath(path) -- return canonical version of path +# +# This is similar to step 6 of section 4 of RFC 1808. + +procedure canpath(path) #: put path in canonical form + static notslash + initial notslash := ~'/' + + # change "/./" to "/" + while path ?:= 1(tab(find("/./")), move(2)) || tab(0) + + # change "//" to "/" + while path ?:= tab(find("//") + 1) || (tab(many('/')) & tab(0)) + + # remove "dir/../" + while path ?:= + (tab(1 | (upto('/') + 1))) || + ((tab(many(notslash)) ~== "..") & ="/../" & tab(0)) + + # remove leading "./" + while path ?:= (="./" & tab(0)) + + # remove trailing "." + path ?:= if tab(-2) & ="/." then path[1:-1] + path ?:= if ="." & pos(0) then "" + + return path +end |