introduce non-strict mode in xml parser,

good enough to parse some html. in reader, add "comment" tag to collect comment text. do not allocate during Unmarshal unless pointer is nil. R=r DELTA=441 (416 added, 1 deleted, 24 changed) OCL=35586 CL=35594
author: Russ Cox <rsc@golang.org> 2009-10-11 23:51:46 -0700
committer: Russ Cox <rsc@golang.org> 2009-10-11 23:51:46 -0700
commit: 1ec7f0285bddd3bc1e8534fea6077a8b6f841f15 (patch)
tree: 566631aff92d3ed507a84fd04025e3fd62c38b33 /src/pkg/xml/xml.go
parent: 12b5a55f4a94341ea7e1ad00ffa6f295cb4268d5 (diff)
download: golang-1ec7f0285bddd3bc1e8534fea6077a8b6f841f15.tar.gz
1 files changed, 404 insertions, 10 deletions
diff --git a/src/pkg/xml/xml.go b/src/pkg/xml/xml.go
index 5baaafbb3..d73ebdc2a 100644
--- a/src/pkg/xml/xml.go
+++ b/src/pkg/xml/xml.go
@@ -111,12 +111,49 @@ type readByter interface {
 // A Parser represents an XML parser reading a particular input stream.
 // The parser assumes that its input is encoded in UTF-8.
 type Parser struct {
+	// Strict defaults to true, enforcing the requirements
+	// of the XML specification.
+	// If set to false, the parser allows input containing common
+	// mistakes:
+	//	* If an element is missing an end tag, the parser invents
+	//	  end tags as necessary to keep the return values from Token
+	//	  properly balanced.
+	//	* In attribute values and character data, unknown or malformed
+	//	  character entities (sequences beginning with &) are left alone.
+	//
+	// Setting:
+	//
+	//	p.Strict = false;
+	//	p.AutoClose = HTMLAutoClose;
+	//	p.Entity = HTMLEntity
+	//
+	// creates a parser that can handle typical HTML.
+	Strict	bool;
+
+	// When Strict == false, AutoClose indicates a set of elements to
+	// consider closed immediately after they are opened, regardless
+	// of whether an end element is present.
+	AutoClose	[]string;
+
+	// Entity can be used to map non-standard entity names to string replacements.
+	// The parser behaves as if these standard mappings are present in the map,
+	// regardless of the actual map content:
+	//
+	//	"lt": "<",
+	//	"gt": ">",
+	//	"amp": "&",
+	//	"pos": "'",
+	//	"quot": `"`,
+	//
+	Entity	map[string]string;
+
 	r		readByter;
 	buf		bytes.Buffer;
 	stk		*stack;
 	free		*stack;
 	needClose	bool;
 	toClose		Name;
+	nextToken	Token;
 	nextByte	int;
 	ns		map[string]string;
 	err		os.Error;
@@ -130,6 +167,7 @@ func NewParser(r io.Reader) *Parser {
 		ns: make(map[string]string),
 		nextByte: -1,
 		line: 1,
+		Strict: true,
 	};
 
 	// Get efficient byte at a time reader.
@@ -169,9 +207,19 @@ func NewParser(r io.Reader) *Parser {
 // it uses the prefix as the Space rather than report an error.
 //
 func (p *Parser) Token() (t Token, err os.Error) {
-	if t, err = p.RawToken(); err != nil {
+	if p.nextToken != nil {
+		t = p.nextToken;
+		p.nextToken = nil;
+	} else if t, err = p.RawToken(); err != nil {
 		return;
 	}
+
+	if !p.Strict {
+		if t1, ok := p.autoClose(t); ok {
+			p.nextToken = t;
+			t = t1;
+		}
+	}
 	switch t1 := t.(type) {
 	case StartElement:
 		// In XML name spaces, the translations listed in the
@@ -201,7 +249,7 @@ func (p *Parser) Token() (t Token, err os.Error) {
 
 	case EndElement:
 		p.translate(&t1.Name, true);
-		if !p.popElement(t1.Name) {
+		if !p.popElement(&t1) {
 			return nil, p.err;
 		}
 		t = t1;
@@ -286,13 +334,20 @@ func (p *Parser) pushNs(local string, url string, ok bool) {
 // After popping the element, apply any undo records from
 // the stack to restore the name translations that existed
 // before we saw this element.
-func (p *Parser) popElement(name Name) bool {
+func (p *Parser) popElement(t *EndElement) bool {
 	s := p.pop();
+	name := t.Name;
 	switch {
 	case s == nil || s.kind != stkStart:
 		p.err = SyntaxError("unexpected end element </" + name.Local + ">");
 		return false;
 	case s.name.Local != name.Local:
+		if !p.Strict {
+			p.needClose = true;
+			p.toClose = t.Name;
+			t.Name = s.name;
+			return true;
+		}
 		p.err = SyntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">");
 		return false;
 	case s.name.Space != name.Space:
@@ -311,6 +366,27 @@ func (p *Parser) popElement(name Name) bool {
 	return true;
 }
 
+// If the top element on the stack is autoclosing and
+// t is not the end tag, invent the end tag.
+func (p *Parser) autoClose(t Token) (Token, bool) {
+	if p.stk == nil || p.stk.kind != stkStart {
+		return nil, false;
+	}
+	name := strings.ToLower(p.stk.name.Local);
+	for _, s := range p.AutoClose {
+		if strings.ToLower(s) == name {
+			// This one should be auto closed if t doesn't close it.
+			et, ok := t.(EndElement);
+			if !ok || et.Name.Local != name {
+				return EndElement{p.stk.name}, true;
+			}
+			break;
+		}
+	}
+	return nil, false;
+}
+
+
 // RawToken is like Token but does not verify that
 // start and end elements match and does not translate
 // name space prefixes to their corresponding URLs.
@@ -645,21 +721,38 @@ Input:
 			// Parsers are required to recognize lt, gt, amp, apos, and quot
 			// even if they have not been declared.  That's all we allow.
 			var i int;
+		CharLoop:
 			for i = 0; i < len(p.tmp); i++ {
 				p.tmp[i], p.err = p.r.ReadByte();
 				if p.err != nil {
 					return nil;
 				}
-				if p.tmp[i] == ';' {
+				c := p.tmp[i];
+				if c == ';' {
 					break;
 				}
+				if 'a' <= c && c <= 'z' ||
+					'A' <= c && c <= 'Z' ||
+					'0' <= c && c <= '9' ||
+					c == '_' || c == '#' {
+					continue;
+				}
+				p.ungetc(c);
+				break;
 			}
 			s := string(p.tmp[0:i]);
 			if i >= len(p.tmp) {
+				if !p.Strict {
+					b0, b1 = 0, 0;
+					p.buf.WriteByte('&');
+					p.buf.Write(p.tmp[0:i]);
+					continue Input;
+				}
 				p.err = SyntaxError("character entity expression &" + s + "... too long");
 				return nil;
 			}
-			rune := -1;
+			var haveText bool;
+			var text string;
 			if i >= 2 && s[0] == '#' {
 				var n uint64;
 				var err os.Error;
@@ -669,19 +762,28 @@ Input:
 					n, err = strconv.Btoui64(s[1:len(s)], 10);
 				}
 				if err == nil && n <= unicode.MaxRune {
-					rune = int(n);
+					text = string(n);
+					haveText = true;
 				}
 			} else {
 				if r, ok := entity[s]; ok {
-					rune = r;
+					text = string(r);
+					haveText = true;
+				} else {
+					text, haveText = p.Entity[s];
 				}
 			}
-			if rune < 0 {
+			if !haveText {
+				if !p.Strict {
+					b0, b1 = 0, 0;
+					p.buf.WriteByte('&');
+					p.buf.Write(p.tmp[0:i]);
+					continue Input;
+				}
 				p.err = SyntaxError("invalid character entity &" + s + ";");
 				return nil;
 			}
-			i = utf8.EncodeRune(rune, &p.tmp);
-			p.buf.Write(p.tmp[0:i]);
+			p.buf.Write(strings.Bytes(text));
 			b0, b1 = 0, 0;
 			continue Input;
 		}
@@ -764,6 +866,7 @@ func (p *Parser) name() (s string, ok bool) {
 func isNameByte(c byte) bool {
 	return 'A' <= c && c <= 'Z' ||
 		'a' <= c && c <= 'z' ||
+		'0' <= c && c <= '9' ||
 		c == '_' || c == ':' || c == '.' || c == '-';
 }
 
@@ -1079,3 +1182,294 @@ var second = []unicode.Range{
 	unicode.Range{0x309D, 0x309E, 1},
 	unicode.Range{0x30FC, 0x30FE, 1},
 }
+
+// HTMLEntity is an entity map containing translations for the
+// standard HTML entity characters.
+var HTMLEntity = htmlEntity
+
+var htmlEntity = map[string]string {
+/*
+	hget http://www.w3.org/TR/html4/sgml/entities.html |
+	ssam '
+		,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
+		,x v/^\&lt;!ENTITY/d
+		,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/	"\1": "\\u\2",/g
+	'
+*/
+	"nbsp": "\u00A0",
+	"iexcl": "\u00A1",
+	"cent": "\u00A2",
+	"pound": "\u00A3",
+	"curren": "\u00A4",
+	"yen": "\u00A5",
+	"brvbar": "\u00A6",
+	"sect": "\u00A7",
+	"uml": "\u00A8",
+	"copy": "\u00A9",
+	"ordf": "\u00AA",
+	"laquo": "\u00AB",
+	"not": "\u00AC",
+	"shy": "\u00AD",
+	"reg": "\u00AE",
+	"macr": "\u00AF",
+	"deg": "\u00B0",
+	"plusmn": "\u00B1",
+	"sup2": "\u00B2",
+	"sup3": "\u00B3",
+	"acute": "\u00B4",
+	"micro": "\u00B5",
+	"para": "\u00B6",
+	"middot": "\u00B7",
+	"cedil": "\u00B8",
+	"sup1": "\u00B9",
+	"ordm": "\u00BA",
+	"raquo": "\u00BB",
+	"frac14": "\u00BC",
+	"frac12": "\u00BD",
+	"frac34": "\u00BE",
+	"iquest": "\u00BF",
+	"Agrave": "\u00C0",
+	"Aacute": "\u00C1",
+	"Acirc": "\u00C2",
+	"Atilde": "\u00C3",
+	"Auml": "\u00C4",
+	"Aring": "\u00C5",
+	"AElig": "\u00C6",
+	"Ccedil": "\u00C7",
+	"Egrave": "\u00C8",
+	"Eacute": "\u00C9",
+	"Ecirc": "\u00CA",
+	"Euml": "\u00CB",
+	"Igrave": "\u00CC",
+	"Iacute": "\u00CD",
+	"Icirc": "\u00CE",
+	"Iuml": "\u00CF",
+	"ETH": "\u00D0",
+	"Ntilde": "\u00D1",
+	"Ograve": "\u00D2",
+	"Oacute": "\u00D3",
+	"Ocirc": "\u00D4",
+	"Otilde": "\u00D5",
+	"Ouml": "\u00D6",
+	"times": "\u00D7",
+	"Oslash": "\u00D8",
+	"Ugrave": "\u00D9",
+	"Uacute": "\u00DA",
+	"Ucirc": "\u00DB",
+	"Uuml": "\u00DC",
+	"Yacute": "\u00DD",
+	"THORN": "\u00DE",
+	"szlig": "\u00DF",
+	"agrave": "\u00E0",
+	"aacute": "\u00E1",
+	"acirc": "\u00E2",
+	"atilde": "\u00E3",
+	"auml": "\u00E4",
+	"aring": "\u00E5",
+	"aelig": "\u00E6",
+	"ccedil": "\u00E7",
+	"egrave": "\u00E8",
+	"eacute": "\u00E9",
+	"ecirc": "\u00EA",
+	"euml": "\u00EB",
+	"igrave": "\u00EC",
+	"iacute": "\u00ED",
+	"icirc": "\u00EE",
+	"iuml": "\u00EF",
+	"eth": "\u00F0",
+	"ntilde": "\u00F1",
+	"ograve": "\u00F2",
+	"oacute": "\u00F3",
+	"ocirc": "\u00F4",
+	"otilde": "\u00F5",
+	"ouml": "\u00F6",
+	"divide": "\u00F7",
+	"oslash": "\u00F8",
+	"ugrave": "\u00F9",
+	"uacute": "\u00FA",
+	"ucirc": "\u00FB",
+	"uuml": "\u00FC",
+	"yacute": "\u00FD",
+	"thorn": "\u00FE",
+	"yuml": "\u00FF",
+	"fnof": "\u0192",
+	"Alpha": "\u0391",
+	"Beta": "\u0392",
+	"Gamma": "\u0393",
+	"Delta": "\u0394",
+	"Epsilon": "\u0395",
+	"Zeta": "\u0396",
+	"Eta": "\u0397",
+	"Theta": "\u0398",
+	"Iota": "\u0399",
+	"Kappa": "\u039A",
+	"Lambda": "\u039B",
+	"Mu": "\u039C",
+	"Nu": "\u039D",
+	"Xi": "\u039E",
+	"Omicron": "\u039F",
+	"Pi": "\u03A0",
+	"Rho": "\u03A1",
+	"Sigma": "\u03A3",
+	"Tau": "\u03A4",
+	"Upsilon": "\u03A5",
+	"Phi": "\u03A6",
+	"Chi": "\u03A7",
+	"Psi": "\u03A8",
+	"Omega": "\u03A9",
+	"alpha": "\u03B1",
+	"beta": "\u03B2",
+	"gamma": "\u03B3",
+	"delta": "\u03B4",
+	"epsilon": "\u03B5",
+	"zeta": "\u03B6",
+	"eta": "\u03B7",
+	"theta": "\u03B8",
+	"iota": "\u03B9",
+	"kappa": "\u03BA",
+	"lambda": "\u03BB",
+	"mu": "\u03BC",
+	"nu": "\u03BD",
+	"xi": "\u03BE",
+	"omicron": "\u03BF",
+	"pi": "\u03C0",
+	"rho": "\u03C1",
+	"sigmaf": "\u03C2",
+	"sigma": "\u03C3",
+	"tau": "\u03C4",
+	"upsilon": "\u03C5",
+	"phi": "\u03C6",
+	"chi": "\u03C7",
+	"psi": "\u03C8",
+	"omega": "\u03C9",
+	"thetasym": "\u03D1",
+	"upsih": "\u03D2",
+	"piv": "\u03D6",
+	"bull": "\u2022",
+	"hellip": "\u2026",
+	"prime": "\u2032",
+	"Prime": "\u2033",
+	"oline": "\u203E",
+	"frasl": "\u2044",
+	"weierp": "\u2118",
+	"image": "\u2111",
+	"real": "\u211C",
+	"trade": "\u2122",
+	"alefsym": "\u2135",
+	"larr": "\u2190",
+	"uarr": "\u2191",
+	"rarr": "\u2192",
+	"darr": "\u2193",
+	"harr": "\u2194",
+	"crarr": "\u21B5",
+	"lArr": "\u21D0",
+	"uArr": "\u21D1",
+	"rArr": "\u21D2",
+	"dArr": "\u21D3",
+	"hArr": "\u21D4",
+	"forall": "\u2200",
+	"part": "\u2202",
+	"exist": "\u2203",
+	"empty": "\u2205",
+	"nabla": "\u2207",
+	"isin": "\u2208",
+	"notin": "\u2209",
+	"ni": "\u220B",
+	"prod": "\u220F",
+	"sum": "\u2211",
+	"minus": "\u2212",
+	"lowast": "\u2217",
+	"radic": "\u221A",
+	"prop": "\u221D",
+	"infin": "\u221E",
+	"ang": "\u2220",
+	"and": "\u2227",
+	"or": "\u2228",
+	"cap": "\u2229",
+	"cup": "\u222A",
+	"int": "\u222B",
+	"there4": "\u2234",
+	"sim": "\u223C",
+	"cong": "\u2245",
+	"asymp": "\u2248",
+	"ne": "\u2260",
+	"equiv": "\u2261",
+	"le": "\u2264",
+	"ge": "\u2265",
+	"sub": "\u2282",
+	"sup": "\u2283",
+	"nsub": "\u2284",
+	"sube": "\u2286",
+	"supe": "\u2287",
+	"oplus": "\u2295",
+	"otimes": "\u2297",
+	"perp": "\u22A5",
+	"sdot": "\u22C5",
+	"lceil": "\u2308",
+	"rceil": "\u2309",
+	"lfloor": "\u230A",
+	"rfloor": "\u230B",
+	"lang": "\u2329",
+	"rang": "\u232A",
+	"loz": "\u25CA",
+	"spades": "\u2660",
+	"clubs": "\u2663",
+	"hearts": "\u2665",
+	"diams": "\u2666",
+	"quot": "\u0022",
+	"amp": "\u0026",
+	"lt": "\u003C",
+	"gt": "\u003E",
+	"OElig": "\u0152",
+	"oelig": "\u0153",
+	"Scaron": "\u0160",
+	"scaron": "\u0161",
+	"Yuml": "\u0178",
+	"circ": "\u02C6",
+	"tilde": "\u02DC",
+	"ensp": "\u2002",
+	"emsp": "\u2003",
+	"thinsp": "\u2009",
+	"zwnj": "\u200C",
+	"zwj": "\u200D",
+	"lrm": "\u200E",
+	"rlm": "\u200F",
+	"ndash": "\u2013",
+	"mdash": "\u2014",
+	"lsquo": "\u2018",
+	"rsquo": "\u2019",
+	"sbquo": "\u201A",
+	"ldquo": "\u201C",
+	"rdquo": "\u201D",
+	"bdquo": "\u201E",
+	"dagger": "\u2020",
+	"Dagger": "\u2021",
+	"permil": "\u2030",
+	"lsaquo": "\u2039",
+	"rsaquo": "\u203A",
+	"euro": "\u20AC",
+}
+
+// HTMLAutoClose is the set of HTML elements that
+// should be considered to close automatically.
+var HTMLAutoClose = htmlAutoClose
+
+var htmlAutoClose = []string {
+/*
+	hget http://www.w3.org/TR/html4/loose.dtd |
+	9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/	"\1",/p' | tr A-Z a-z
+*/
+	"basefont",
+	"br",
+	"area",
+	"link",
+	"img",
+	"param",
+	"hr",
+	"input",
+	"col     ",
+	"frame",
+	"isindex",
+	"base",
+	"meta",
+}
author	Russ Cox <rsc@golang.org>	2009-10-11 23:51:46 -0700
committer	Russ Cox <rsc@golang.org>	2009-10-11 23:51:46 -0700
commit	1ec7f0285bddd3bc1e8534fea6077a8b6f841f15 (patch)
tree	566631aff92d3ed507a84fd04025e3fd62c38b33 /src/pkg/xml/xml.go
parent	12b5a55f4a94341ea7e1ad00ffa6f295cb4268d5 (diff)
download	golang-1ec7f0285bddd3bc1e8534fea6077a8b6f841f15.tar.gz