diff options
Diffstat (limited to 'src/pkg/exp/html/parse_test.go')
-rw-r--r-- | src/pkg/exp/html/parse_test.go | 276 |
1 files changed, 276 insertions, 0 deletions
diff --git a/src/pkg/exp/html/parse_test.go b/src/pkg/exp/html/parse_test.go new file mode 100644 index 000000000..1528dffaa --- /dev/null +++ b/src/pkg/exp/html/parse_test.go @@ -0,0 +1,276 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "os" + "strings" + "testing" +) + +// readParseTest reads a single test case from r. +func readParseTest(r *bufio.Reader) (text, want, context string, err error) { + line, err := r.ReadSlice('\n') + if err != nil { + return "", "", "", err + } + var b []byte + + // Read the HTML. + if string(line) != "#data\n" { + return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line) + } + for { + line, err = r.ReadSlice('\n') + if err != nil { + return "", "", "", err + } + if line[0] == '#' { + break + } + b = append(b, line...) + } + text = strings.TrimRight(string(b), "\n") + b = b[:0] + + // Skip the error list. + if string(line) != "#errors\n" { + return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line) + } + for { + line, err = r.ReadSlice('\n') + if err != nil { + return "", "", "", err + } + if line[0] == '#' { + break + } + } + + if string(line) == "#document-fragment\n" { + line, err = r.ReadSlice('\n') + if err != nil { + return "", "", "", err + } + context = strings.TrimSpace(string(line)) + line, err = r.ReadSlice('\n') + if err != nil { + return "", "", "", err + } + } + + // Read the dump of what the parse tree should be. + if string(line) != "#document\n" { + return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line) + } + for { + line, err = r.ReadSlice('\n') + if err != nil && err != io.EOF { + return "", "", "", err + } + if len(line) == 0 || len(line) == 1 && line[0] == '\n' { + break + } + b = append(b, line...) + } + return text, string(b), context, nil +} + +func dumpIndent(w io.Writer, level int) { + io.WriteString(w, "| ") + for i := 0; i < level; i++ { + io.WriteString(w, " ") + } +} + +func dumpLevel(w io.Writer, n *Node, level int) error { + dumpIndent(w, level) + switch n.Type { + case ErrorNode: + return errors.New("unexpected ErrorNode") + case DocumentNode: + return errors.New("unexpected DocumentNode") + case ElementNode: + if n.Namespace != "" { + fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data) + } else { + fmt.Fprintf(w, "<%s>", n.Data) + } + attr := n.Attr + if len(attr) == 2 && attr[0].Namespace == "xml" && attr[1].Namespace == "xlink" { + // Some of the test cases in tests10.dat change the order of adjusted + // foreign attributes, but that behavior is not in the spec, and could + // simply be an implementation detail of html5lib's python map ordering. + attr[0], attr[1] = attr[1], attr[0] + } + for _, a := range attr { + io.WriteString(w, "\n") + dumpIndent(w, level+1) + if a.Namespace != "" { + fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val) + } else { + fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val) + } + } + case TextNode: + fmt.Fprintf(w, `"%s"`, n.Data) + case CommentNode: + fmt.Fprintf(w, "<!-- %s -->", n.Data) + case DoctypeNode: + fmt.Fprintf(w, "<!DOCTYPE %s", n.Data) + if n.Attr != nil { + var p, s string + for _, a := range n.Attr { + switch a.Key { + case "public": + p = a.Val + case "system": + s = a.Val + } + } + if p != "" || s != "" { + fmt.Fprintf(w, ` "%s"`, p) + fmt.Fprintf(w, ` "%s"`, s) + } + } + io.WriteString(w, ">") + case scopeMarkerNode: + return errors.New("unexpected scopeMarkerNode") + default: + return errors.New("unknown node type") + } + io.WriteString(w, "\n") + for _, c := range n.Child { + if err := dumpLevel(w, c, level+1); err != nil { + return err + } + } + return nil +} + +func dump(n *Node) (string, error) { + if n == nil || len(n.Child) == 0 { + return "", nil + } + b := bytes.NewBuffer(nil) + for _, child := range n.Child { + if err := dumpLevel(b, child, 0); err != nil { + return "", err + } + } + return b.String(), nil +} + +func TestParser(t *testing.T) { + testFiles := []struct { + filename string + // n is the number of test cases to run from that file. + // -1 means all test cases. + n int + }{ + // TODO(nigeltao): Process all the test cases from all the .dat files. + {"adoption01.dat", -1}, + {"doctype01.dat", -1}, + {"tests1.dat", -1}, + {"tests2.dat", -1}, + {"tests3.dat", -1}, + {"tests4.dat", -1}, + {"tests5.dat", -1}, + {"tests6.dat", -1}, + {"tests10.dat", 35}, + } + for _, tf := range testFiles { + f, err := os.Open("testdata/webkit/" + tf.filename) + if err != nil { + t.Fatal(err) + } + defer f.Close() + r := bufio.NewReader(f) + for i := 0; i != tf.n; i++ { + text, want, context, err := readParseTest(r) + if err == io.EOF && tf.n == -1 { + break + } + if err != nil { + t.Fatal(err) + } + + var doc *Node + if context == "" { + doc, err = Parse(strings.NewReader(text)) + if err != nil { + t.Fatal(err) + } + } else { + contextNode := &Node{ + Type: ElementNode, + Data: context, + } + nodes, err := ParseFragment(strings.NewReader(text), contextNode) + if err != nil { + t.Fatal(err) + } + doc = &Node{ + Type: DocumentNode, + } + for _, n := range nodes { + doc.Add(n) + } + } + + got, err := dump(doc) + if err != nil { + t.Fatal(err) + } + // Compare the parsed tree to the #document section. + if got != want { + t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want) + continue + } + if renderTestBlacklist[text] || context != "" { + continue + } + // Check that rendering and re-parsing results in an identical tree. + pr, pw := io.Pipe() + go func() { + pw.CloseWithError(Render(pw, doc)) + }() + doc1, err := Parse(pr) + if err != nil { + t.Fatal(err) + } + got1, err := dump(doc1) + if err != nil { + t.Fatal(err) + } + if got != got1 { + t.Errorf("%s test #%d %q, got vs got1:\n----\n%s----\n%s----", tf.filename, i, text, got, got1) + continue + } + } + } +} + +// Some test input result in parse trees are not 'well-formed' despite +// following the HTML5 recovery algorithms. Rendering and re-parsing such a +// tree will not result in an exact clone of that tree. We blacklist such +// inputs from the render test. +var renderTestBlacklist = map[string]bool{ + // The second <a> will be reparented to the first <table>'s parent. This + // results in an <a> whose parent is an <a>, which is not 'well-formed'. + `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true, + // More cases of <a> being reparented: + `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true, + `<a><table><a></table><p><a><div><a>`: true, + `<a><table><td><a><table></table><a></tr><a></table><a>`: true, + // A <plaintext> element is reparented, putting it before a table. + // A <plaintext> element can't have anything after it in HTML. + `<table><plaintext><td>`: true, +} |