summaryrefslogtreecommitdiff
path: root/src/pkg/html/token_test.go
diff options
context:
space:
mode:
authorOndřej Surý <ondrej@sury.org>2011-09-13 13:13:40 +0200
committerOndřej Surý <ondrej@sury.org>2011-09-13 13:13:40 +0200
commit5ff4c17907d5b19510a62e08fd8d3b11e62b431d (patch)
treec0650497e988f47be9c6f2324fa692a52dea82e1 /src/pkg/html/token_test.go
parent80f18fc933cf3f3e829c5455a1023d69f7b86e52 (diff)
downloadgolang-5ff4c17907d5b19510a62e08fd8d3b11e62b431d.tar.gz
Imported Upstream version 60upstream/60
Diffstat (limited to 'src/pkg/html/token_test.go')
-rw-r--r--src/pkg/html/token_test.go340
1 files changed, 340 insertions, 0 deletions
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
new file mode 100644
index 000000000..0a0beb201
--- /dev/null
+++ b/src/pkg/html/token_test.go
@@ -0,0 +1,340 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "bytes"
+ "os"
+ "strings"
+ "testing"
+)
+
+type tokenTest struct {
+ // A short description of the test case.
+ desc string
+ // The HTML to parse.
+ html string
+ // The string representations of the expected tokens, joined by '$'.
+ golden string
+}
+
+var tokenTests = []tokenTest{
+ // A single text node. The tokenizer should not break text nodes on whitespace,
+ // nor should it normalize whitespace within a text node.
+ {
+ "text",
+ "foo bar",
+ "foo bar",
+ },
+ // An entity.
+ {
+ "entity",
+ "one &lt; two",
+ "one &lt; two",
+ },
+ // A start, self-closing and end tag. The tokenizer does not care if the start
+ // and end tokens don't match; that is the job of the parser.
+ {
+ "tags",
+ "<a>b<c/>d</e>",
+ "<a>$b$<c/>$d$</e>",
+ },
+ // Some malformed tags that are missing a '>'.
+ {
+ "malformed tag #0",
+ `<p</p>`,
+ `<p< p="">`,
+ },
+ {
+ "malformed tag #1",
+ `<p </p>`,
+ `<p <="" p="">`,
+ },
+ {
+ "malformed tag #2",
+ `<p id=0</p>`,
+ `<p id="0&lt;/p">`,
+ },
+ {
+ "malformed tag #3",
+ `<p id="0</p>`,
+ `<p id="0&lt;/p&gt;">`,
+ },
+ {
+ "malformed tag #4",
+ `<p id="0"</p>`,
+ `<p id="0" <="" p="">`,
+ },
+ // Comments.
+ {
+ "comment0",
+ "abc<b><!-- skipme --></b>def",
+ "abc$<b>$</b>$def",
+ },
+ {
+ "comment1",
+ "a<!-->z",
+ "a$z",
+ },
+ {
+ "comment2",
+ "a<!--->z",
+ "a$z",
+ },
+ {
+ "comment3",
+ "a<!--x>-->z",
+ "a$z",
+ },
+ {
+ "comment4",
+ "a<!--x->-->z",
+ "a$z",
+ },
+ {
+ "comment5",
+ "a<!>z",
+ "a$&lt;!&gt;z",
+ },
+ {
+ "comment6",
+ "a<!->z",
+ "a$&lt;!-&gt;z",
+ },
+ {
+ "comment7",
+ "a<!---<>z",
+ "a$&lt;!---&lt;&gt;z",
+ },
+ {
+ "comment8",
+ "a<!--z",
+ "a$&lt;!--z",
+ },
+ // An attribute with a backslash.
+ {
+ "backslash",
+ `<p id="a\"b">`,
+ `<p id="a&quot;b">`,
+ },
+ // Entities, tag name and attribute key lower-casing, and whitespace
+ // normalization within a tag.
+ {
+ "tricky",
+ "<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
+ `<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
+ },
+ // A nonexistent entity. Tokenizing and converting back to a string should
+ // escape the "&" to become "&amp;".
+ {
+ "noSuchEntity",
+ `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
+ `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
+ },
+ {
+ "entity without semicolon",
+ `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
+ `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
+ },
+ {
+ "entity with digits",
+ "&frac12;",
+ "½",
+ },
+ // Attribute tests:
+ // http://dev.w3.org/html5/spec/Overview.html#attributes-0
+ {
+ "Empty attribute",
+ `<input disabled FOO>`,
+ `<input disabled="" foo="">`,
+ },
+ {
+ "Empty attribute, whitespace",
+ `<input disabled FOO >`,
+ `<input disabled="" foo="">`,
+ },
+ {
+ "Unquoted attribute value",
+ `<input value=yes FOO=BAR>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Unquoted attribute value, spaces",
+ `<input value = yes FOO = BAR>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Unquoted attribute value, trailing space",
+ `<input value=yes FOO=BAR >`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Single-quoted attribute value",
+ `<input value='yes' FOO='BAR'>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Single-quoted attribute value, trailing space",
+ `<input value='yes' FOO='BAR' >`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Double-quoted attribute value",
+ `<input value="I'm an attribute" FOO="BAR">`,
+ `<input value="I&apos;m an attribute" foo="BAR">`,
+ },
+ {
+ "Attribute name characters",
+ `<meta http-equiv="content-type">`,
+ `<meta http-equiv="content-type">`,
+ },
+}
+
+func TestTokenizer(t *testing.T) {
+loop:
+ for _, tt := range tokenTests {
+ z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
+ for i, s := range strings.Split(tt.golden, "$") {
+ if z.Next() == ErrorToken {
+ t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+ continue loop
+ }
+ actual := z.Token().String()
+ if s != actual {
+ t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+ continue loop
+ }
+ }
+ z.Next()
+ if z.Error() != os.EOF {
+ t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String())
+ }
+ }
+}
+
+type unescapeTest struct {
+ // A short description of the test case.
+ desc string
+ // The HTML text.
+ html string
+ // The unescaped text.
+ unescaped string
+}
+
+var unescapeTests = []unescapeTest{
+ // Handle no entities.
+ {
+ "copy",
+ "A\ttext\nstring",
+ "A\ttext\nstring",
+ },
+ // Handle simple named entities.
+ {
+ "simple",
+ "&amp; &gt; &lt;",
+ "& > <",
+ },
+ // Handle hitting the end of the string.
+ {
+ "stringEnd",
+ "&amp &amp",
+ "& &",
+ },
+ // Handle entities with two codepoints.
+ {
+ "multiCodepoint",
+ "text &gesl; blah",
+ "text \u22db\ufe00 blah",
+ },
+ // Handle decimal numeric entities.
+ {
+ "decimalEntity",
+ "Delta = &#916; ",
+ "Delta = Δ ",
+ },
+ // Handle hexadecimal numeric entities.
+ {
+ "hexadecimalEntity",
+ "Lambda = &#x3bb; = &#X3Bb ",
+ "Lambda = λ = λ ",
+ },
+ // Handle numeric early termination.
+ {
+ "numericEnds",
+ "&# &#x &#128;43 &copy = &#169f = &#xa9",
+ "&# &#x €43 © = ©f = ©",
+ },
+ // Handle numeric ISO-8859-1 entity replacements.
+ {
+ "numericReplacements",
+ "Footnote&#x87;",
+ "Footnote‡",
+ },
+}
+
+func TestUnescape(t *testing.T) {
+ for _, tt := range unescapeTests {
+ unescaped := UnescapeString(tt.html)
+ if unescaped != tt.unescaped {
+ t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
+ }
+ }
+}
+
+func TestUnescapeEscape(t *testing.T) {
+ ss := []string{
+ ``,
+ `abc def`,
+ `a & b`,
+ `a&amp;b`,
+ `a &amp b`,
+ `&quot;`,
+ `"`,
+ `"<&>"`,
+ `&quot;&lt;&amp;&gt;&quot;`,
+ `3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
+ }
+ for _, s := range ss {
+ if s != UnescapeString(EscapeString(s)) {
+ t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
+ }
+ }
+}
+
+func TestBufAPI(t *testing.T) {
+ s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
+ z := NewTokenizer(bytes.NewBuffer([]byte(s)))
+ result := bytes.NewBuffer(nil)
+ depth := 0
+loop:
+ for {
+ tt := z.Next()
+ switch tt {
+ case ErrorToken:
+ if z.Error() != os.EOF {
+ t.Error(z.Error())
+ }
+ break loop
+ case TextToken:
+ if depth > 0 {
+ result.Write(z.Text())
+ }
+ case StartTagToken, EndTagToken:
+ tn, _ := z.TagName()
+ if len(tn) == 1 && tn[0] == 'a' {
+ if tt == StartTagToken {
+ depth++
+ } else {
+ depth--
+ }
+ }
+ }
+ }
+ u := "14567"
+ v := string(result.Bytes())
+ if u != v {
+ t.Errorf("TestBufAPI: want %q got %q", u, v)
+ }
+}