diff options
Diffstat (limited to 'src/pkg/html')
-rw-r--r-- | src/pkg/html/doc.go | 1 | ||||
-rw-r--r-- | src/pkg/html/token.go | 66 | ||||
-rw-r--r-- | src/pkg/html/token_test.go | 50 |
3 files changed, 104 insertions, 13 deletions
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go index 55135c3d0..5bc063086 100644 --- a/src/pkg/html/doc.go +++ b/src/pkg/html/doc.go @@ -4,6 +4,7 @@ /* Package html implements an HTML5-compliant tokenizer and parser. +INCOMPLETE. Tokenization is done by creating a Tokenizer for an io.Reader r. It is the caller's responsibility to ensure that r provides UTF-8 encoded HTML. diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index ad03241ed..23c95ece6 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -331,10 +331,10 @@ func (z *Tokenizer) trim(i int) int { return k } -// lower finds the largest alphabetic [0-9A-Za-z]* word at the start of z.buf[i:] -// and returns that word lower-cased, as well as the trimmed cursor location -// after that word. -func (z *Tokenizer) lower(i int) ([]byte, int) { +// word finds the largest alphabetic [0-9A-Za-z]* word at the start +// of z.buf[i:] and returns that word (optionally lower-cased), as +// well as the trimmed cursor location after that word. +func (z *Tokenizer) word(i int, lower bool) ([]byte, int) { i0 := i loop: for ; i < z.p1; i++ { @@ -343,7 +343,9 @@ loop: case '0' <= c && c <= '9': // No-op. case 'A' <= c && c <= 'Z': - z.buf[i] = c + 'a' - 'A' + if lower { + z.buf[i] = c + 'a' - 'A' + } case 'a' <= c && c <= 'z': // No-op. default: @@ -353,6 +355,33 @@ loop: return z.buf[i0:i], z.trim(i) } +// attrName finds the largest attribute name at the start +// of z.buf[i:] and returns it lower-cased, as well +// as the trimmed cursor location after that word. +// +// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name +// TODO: unicode characters +func (z *Tokenizer) attrName(i int) ([]byte, int) { + i0 := i +loop: + for ; i < z.p1; i++ { + c := z.buf[i] + switch c { + case '<', '>', '"', '\'', '/', '=': + break loop + } + switch { + case 'A' <= c && c <= 'Z': + z.buf[i] = c + 'a' - 'A' + case c > ' ' && c < 0x7f: + // No-op. + default: + break loop + } + } + return z.buf[i0:i], z.trim(i) +} + // Text returns the unescaped text of a TextToken or a CommentToken. // The contents of the returned slice may change on the next call to Next. func (z *Tokenizer) Text() []byte { @@ -388,7 +417,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { if z.buf[i] == '/' { i++ } - name, z.p0 = z.lower(i) + name, z.p0 = z.word(i, true) hasAttr = z.p0 != z.p1 return } @@ -397,23 +426,36 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { // attribute for the current tag token and whether there are more attributes. // The contents of the returned slices may change on the next call to Next. func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { - key, i := z.lower(z.p0) - // Get past the "=\"". - if i == z.p1 || z.buf[i] != '=' { + key, i := z.attrName(z.p0) + // Check for an empty attribute value. + if i == z.p1 { + z.p0 = i + return + } + // Get past the equals and quote characters. + if z.buf[i] != '=' { + z.p0, moreAttr = i, true return } i = z.trim(i + 1) - if i == z.p1 || z.buf[i] != '"' { + if i == z.p1 { + z.p0 = i + return + } + closeQuote := z.buf[i] + if closeQuote != '\'' && closeQuote != '"' { + val, z.p0 = z.word(i, false) + moreAttr = z.p0 != z.p1 return } i = z.trim(i + 1) - // Copy and unescape everything up to the closing '"'. + // Copy and unescape everything up to the closing quote. dst, src := i, i loop: for src < z.p1 { c := z.buf[src] switch c { - case '"': + case closeQuote: src++ break loop case '&': diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index 5cf1f6dac..c17b436aa 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -100,13 +100,61 @@ var tokenTests = []tokenTest{ "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, }, - // A non-existant entity. Tokenizing and converting back to a string should + // A nonexistent entity. Tokenizing and converting back to a string should // escape the "&" to become "&". { "noSuchEntity", `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, }, + + // Attribute tests: + // http://dev.w3.org/html5/spec/Overview.html#attributes-0 + { + "Empty attribute", + `<input disabled FOO>`, + `<input disabled="" foo="">`, + }, + { + "Empty attribute, whitespace", + `<input disabled FOO >`, + `<input disabled="" foo="">`, + }, + { + "Unquoted attribute value", + `<input value=yes FOO=BAR>`, + `<input value="yes" foo="BAR">`, + }, + { + "Unquoted attribute value, spaces", + `<input value = yes FOO = BAR>`, + `<input value="yes" foo="BAR">`, + }, + { + "Unquoted attribute value, trailing space", + `<input value=yes FOO=BAR >`, + `<input value="yes" foo="BAR">`, + }, + { + "Single-quoted attribute value", + `<input value='yes' FOO='BAR'>`, + `<input value="yes" foo="BAR">`, + }, + { + "Single-quoted attribute value, trailing space", + `<input value='yes' FOO='BAR' >`, + `<input value="yes" foo="BAR">`, + }, + { + "Double-quoted attribute value", + `<input value="I'm an attribute" FOO="BAR">`, + `<input value="I'm an attribute" foo="BAR">`, + }, + { + "Attribute name characters", + `<meta http-equiv="content-type">`, + `<meta http-equiv="content-type">`, + }, } func TestTokenizer(t *testing.T) { |