diff options
author | Ondřej Surý <ondrej@sury.org> | 2011-01-17 12:40:45 +0100 |
---|---|---|
committer | Ondřej Surý <ondrej@sury.org> | 2011-01-17 12:40:45 +0100 |
commit | 3e45412327a2654a77944249962b3652e6142299 (patch) | |
tree | bc3bf69452afa055423cbe0c5cfa8ca357df6ccf /src/pkg/html/token_test.go | |
parent | c533680039762cacbc37db8dc7eed074c3e497be (diff) | |
download | golang-upstream/2011.01.12.tar.gz |
Imported Upstream version 2011.01.12 (upstream/2011.01.12)
Diffstat (limited to 'src/pkg/html/token_test.go')
-rw-r--r-- | src/pkg/html/token_test.go | 231 |
1 files changed, 231 insertions, 0 deletions
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go new file mode 100644 index 000000000..e07999ca5 --- /dev/null +++ b/src/pkg/html/token_test.go @@ -0,0 +1,231 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bytes" + "os" + "testing" +) + +type tokenTest struct { + // A short description of the test case. + desc string + // The HTML to parse. + html string + // The string representations of the expected tokens. + tokens []string +} + +var tokenTests = []tokenTest{ + // A single text node. The tokenizer should not break text nodes on whitespace, + // nor should it normalize whitespace within a text node. + { + "text", + "foo bar", + []string{ + "foo bar", + }, + }, + // An entity. + { + "entity", + "one < two", + []string{ + "one < two", + }, + }, + // A start, self-closing and end tag. The tokenizer does not care if the start + // and end tokens don't match; that is the job of the parser. + { + "tags", + "<a>b<c/>d</e>", + []string{ + "<a>", + "b", + "<c/>", + "d", + "</e>", + }, + }, + // An attribute with a backslash. + { + "backslash", + `<p id="a\"b">`, + []string{ + `<p id="a"b">`, + }, + }, + // Entities, tag name and attribute key lower-casing, and whitespace + // normalization within a tag. + { + "tricky", + "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", + []string{ + `<p id="a"B" foo="bar">`, + "<em>", + "te<&;xt", + "</em>", + "</p>", + }, + }, + // A non-existant entity. Tokenizing and converting back to a string should + // escape the "&" to become "&". 
+ { + "noSuchEntity", + `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, + []string{ + `<a b="c&noSuchEntity;d">`, + "<&alsoDoesntExist;&", + }, + }, +} + +func TestTokenizer(t *testing.T) { +loop: + for _, tt := range tokenTests { + z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) + for i, s := range tt.tokens { + if z.Next() == ErrorToken { + t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) + continue loop + } + actual := z.Token().String() + if s != actual { + t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) + continue loop + } + } + z.Next() + if z.Error() != os.EOF { + t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String()) + } + } +} + +type unescapeTest struct { + // A short description of the test case. + desc string + // The HTML text. + html string + // The unescaped text. + unescaped string +} + +var unescapeTests = []unescapeTest{ + // Handle no entities. + { + "copy", + "A\ttext\nstring", + "A\ttext\nstring", + }, + // Handle simple named entities. + { + "simple", + "& > <", + "& > <", + }, + // Handle hitting the end of the string. + { + "stringEnd", + "& &", + "& &", + }, + // Handle entities with two codepoints. + { + "multiCodepoint", + "text ⋛︀ blah", + "text \u22db\ufe00 blah", + }, + // Handle decimal numeric entities. + { + "decimalEntity", + "Delta = Δ ", + "Delta = Δ ", + }, + // Handle hexadecimal numeric entities. + { + "hexadecimalEntity", + "Lambda = λ = λ ", + "Lambda = λ = λ ", + }, + // Handle numeric early termination. + { + "numericEnds", + "&# &#x €43 © = ©f = ©", + "&# &#x €43 © = ©f = ©", + }, + // Handle numeric ISO-8859-1 entity replacements. 
+ { + "numericReplacements", + "Footnote‡", + "Footnote‡", + }, +} + +func TestUnescape(t *testing.T) { + for _, tt := range unescapeTests { + unescaped := UnescapeString(tt.html) + if unescaped != tt.unescaped { + t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped) + } + } +} + +func TestUnescapeEscape(t *testing.T) { + ss := []string{ + ``, + `abc def`, + `a & b`, + `a&b`, + `a & b`, + `"`, + `"`, + `"<&>"`, + `"<&>"`, + `3&5==1 && 0<1, "0<1", a+acute=á`, + } + for _, s := range ss { + if s != UnescapeString(EscapeString(s)) { + t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s) + } + } +} + +func TestBufAPI(t *testing.T) { + s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9" + z := NewTokenizer(bytes.NewBuffer([]byte(s))) + result := bytes.NewBuffer(nil) + depth := 0 +loop: + for { + tt := z.Next() + switch tt { + case ErrorToken: + if z.Error() != os.EOF { + t.Error(z.Error()) + } + break loop + case TextToken: + if depth > 0 { + result.Write(z.Text()) + } + case StartTagToken, EndTagToken: + tn, _ := z.TagName() + if len(tn) == 1 && tn[0] == 'a' { + if tt == StartTagToken { + depth++ + } else { + depth-- + } + } + } + } + u := "14567" + v := string(result.Bytes()) + if u != v { + t.Errorf("TestBufAPI: want %q got %q", u, v) + } +} |