From d39f5aa373a4422f7a5f3ee764fb0f6b0b719d61 Mon Sep 17 00:00:00 2001
From: Ondřej Surý
Date: Thu, 30 Jun 2011 15:34:22 +0200
Subject: Imported Upstream version 58

---
 src/pkg/html/doc.go        |  1 +
 src/pkg/html/token.go      | 66 +++++++++++++++++++++++++++++++++++++---------
 src/pkg/html/token_test.go | 50 ++++++++++++++++++++++++++++++++++-
 3 files changed, 104 insertions(+), 13 deletions(-)

diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go
index 55135c3d0..5bc063086 100644
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -4,6 +4,7 @@
 
 /*
 Package html implements an HTML5-compliant tokenizer and parser.
+INCOMPLETE.
 
 Tokenization is done by creating a Tokenizer for an io.Reader r.
 It is the caller's responsibility to ensure that r provides UTF-8 encoded HTML.
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index ad03241ed..23c95ece6 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -331,10 +331,10 @@ func (z *Tokenizer) trim(i int) int {
 	return k
 }
 
-// lower finds the largest alphabetic [0-9A-Za-z]* word at the start of z.buf[i:]
-// and returns that word lower-cased, as well as the trimmed cursor location
-// after that word.
-func (z *Tokenizer) lower(i int) ([]byte, int) {
+// word finds the largest alphabetic [0-9A-Za-z]* word at the start
+// of z.buf[i:] and returns that word (optionally lower-cased), as
+// well as the trimmed cursor location after that word.
+func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
 	i0 := i
 loop:
 	for ; i < z.p1; i++ {
@@ -343,7 +343,9 @@ loop:
 		case '0' <= c && c <= '9':
 			// No-op.
 		case 'A' <= c && c <= 'Z':
-			z.buf[i] = c + 'a' - 'A'
+			if lower {
+				z.buf[i] = c + 'a' - 'A'
+			}
 		case 'a' <= c && c <= 'z':
 			// No-op.
 		default:
@@ -353,6 +355,33 @@ loop:
 	return z.buf[i0:i], z.trim(i)
 }
 
+// attrName finds the largest attribute name at the start
+// of z.buf[i:] and returns it lower-cased, as well
+// as the trimmed cursor location after that word.
+//
+// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
+// TODO: unicode characters
+func (z *Tokenizer) attrName(i int) ([]byte, int) {
+	i0 := i
+loop:
+	for ; i < z.p1; i++ {
+		c := z.buf[i]
+		switch c {
+		case '<', '>', '"', '\'', '/', '=':
+			break loop
+		}
+		switch {
+		case 'A' <= c && c <= 'Z':
+			z.buf[i] = c + 'a' - 'A'
+		case c > ' ' && c < 0x7f:
+			// No-op.
+		default:
+			break loop
+		}
+	}
+	return z.buf[i0:i], z.trim(i)
+}
+
 // Text returns the unescaped text of a TextToken or a CommentToken.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
@@ -388,7 +417,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 	if z.buf[i] == '/' {
 		i++
 	}
-	name, z.p0 = z.lower(i)
+	name, z.p0 = z.word(i, true)
 	hasAttr = z.p0 != z.p1
 	return
 }
@@ -397,23 +426,36 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 // attribute for the current tag token and whether there are more attributes.
 // The contents of the returned slices may change on the next call to Next.
 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
-	key, i := z.lower(z.p0)
-	// Get past the "=\"".
-	if i == z.p1 || z.buf[i] != '=' {
+	key, i := z.attrName(z.p0)
+	// Check for an empty attribute value.
+	if i == z.p1 {
+		z.p0 = i
+		return
+	}
+	// Get past the equals and quote characters.
+	if z.buf[i] != '=' {
+		z.p0, moreAttr = i, true
 		return
 	}
 	i = z.trim(i + 1)
-	if i == z.p1 || z.buf[i] != '"' {
+	if i == z.p1 {
+		z.p0 = i
+		return
+	}
+	closeQuote := z.buf[i]
+	if closeQuote != '\'' && closeQuote != '"' {
+		val, z.p0 = z.word(i, false)
+		moreAttr = z.p0 != z.p1
 		return
 	}
 	i = z.trim(i + 1)
-	// Copy and unescape everything up to the closing '"'.
+	// Copy and unescape everything up to the closing quote.
 	dst, src := i, i
 loop:
 	for src < z.p1 {
 		c := z.buf[src]
 		switch c {
-		case '"':
+		case closeQuote:
 			src++
 			break loop
 		case '&':
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index 5cf1f6dac..c17b436aa 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -100,13 +100,61 @@ var tokenTests = []tokenTest{
 		"<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
 		`<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
-	// A non-existant entity. Tokenizing and converting back to a string should
+	// A nonexistent entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
 	{
 		"noSuchEntity",
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
 		`<a b="c&noSuchEntity;d">$&lt;&alsoDoesntExist;&amp;`,
 	},
+
+	// Attribute tests:
+	// http://dev.w3.org/html5/spec/Overview.html#attributes-0
+	{
+		"Empty attribute",
+		`<input disabled FORM>`,
+		`<input disabled="" form="">`,
+	},
+	{
+		"Empty attribute, whitespace",
+		`<input disabled FORM >`,
+		`<input disabled="" form="">`,
+	},
+	{
+		"Unquoted attribute value",
+		`<input value=yes FORM=QUERY>`,
+		`<input value="yes" form="QUERY">`,
+	},
+	{
+		"Unquoted attribute value, spaces",
+		`<input value = yes FORM = QUERY>`,
+		`<input value="yes" form="QUERY">`,
+	},
+	{
+		"Unquoted attribute value, trailing space",
+		`<input value=yes FORM=QUERY >`,
+		`<input value="yes" form="QUERY">`,
+	},
+	{
+		"Single-quoted attribute value",
+		`<input value='yes' FORM='QUERY'>`,
+		`<input value="yes" form="QUERY">`,
+	},
+	{
+		"Single-quoted attribute value, trailing space",
+		`<input value='yes' FORM='QUERY' >`,
+		`<input value="yes" form="QUERY">`,
+	},
+	{
+		"Double-quoted attribute value",
+		`<input value="I'm an attribute" FORM="QUERY">`,
+		`<input value="I&apos;m an attribute" form="QUERY">`,
+	},
+	{
+		"Attribute name characters",
+		`<meta http-equiv="content-type">`,
+		`<meta http-equiv="content-type">`,
+	},
 }
 
 func TestTokenizer(t *testing.T) {
-- 
cgit v1.2.3
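
Editor's note, not part of the upstream patch: the hunks above target the
pre-Go-1 src/pkg/html package, whose tokenizer lives on today as
golang.org/x/net/html with the same Next/TagName/TagAttr API. The sketch below
is a hypothetical, self-contained illustration of the attribute forms the
patched TagAttr handles (empty, unquoted, single-quoted and double-quoted
values); the input string and the x/net import path are assumptions, not
anything this patch ships.

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	// One tag exercising each attribute form covered by the new tests,
	// per http://dev.w3.org/html5/spec/Overview.html#attributes-0.
	const src = `<input disabled value=yes alt='single' title="a&quot;b">`

	z := html.NewTokenizer(strings.NewReader(src))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			return // io.EOF: the input is fully consumed
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		name, hasAttr := z.TagName()
		fmt.Printf("<%s>\n", name)
		for hasAttr {
			var key, val []byte
			key, val, hasAttr = z.TagAttr()
			// Keys come back lower-cased; values come back
			// unescaped, whichever quoting style was used.
			fmt.Printf("  %s = %q\n", key, val)
		}
	}
}

With the patched tokenizer, the empty attribute should report an empty value,
the unquoted and single-quoted values are accepted via the new closeQuote and
word(i, false) paths, and the double-quoted value is unescaped, so the loop
should print disabled="", value="yes", alt="single" and title="a\"b".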