// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package html import ( "io" "strings" ) // A parser implements the HTML5 parsing algorithm: // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction type parser struct { // tokenizer provides the tokens for the parser. tokenizer *Tokenizer // tok is the most recently read token. tok Token // Self-closing tags like
block. d := p.tok.Data if d != "" && d[0] == '\r' { d = d[1:] } if d != "" && d[0] == '\n' { d = d[1:] } if d == "" { return true } p.tok.Data = d } } p.reconstructActiveFormattingElements() p.addText(p.tok.Data) p.framesetOK = false case StartTagToken: switch p.tok.Data { case "html": copyAttributes(p.oe[0], p.tok) case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul": p.popUntil(buttonScope, "p") p.addElement(p.tok.Data, p.tok.Attr) case "h1", "h2", "h3", "h4", "h5", "h6": p.popUntil(buttonScope, "p") switch n := p.top(); n.Data { case "h1", "h2", "h3", "h4", "h5", "h6": p.oe.pop() } p.addElement(p.tok.Data, p.tok.Attr) case "a": for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { if n := p.afe[i]; n.Type == ElementNode && n.Data == "a" { p.inBodyEndTagFormatting("a") p.oe.remove(n) p.afe.remove(n) break } } p.reconstructActiveFormattingElements() p.addFormattingElement(p.tok.Data, p.tok.Attr) case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u": p.reconstructActiveFormattingElements() p.addFormattingElement(p.tok.Data, p.tok.Attr) case "nobr": p.reconstructActiveFormattingElements() if p.elementInScope(defaultScope, "nobr") { p.inBodyEndTagFormatting("nobr") p.reconstructActiveFormattingElements() } p.addFormattingElement(p.tok.Data, p.tok.Attr) case "applet", "marquee", "object": p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) p.afe = append(p.afe, &scopeMarker) p.framesetOK = false case "area", "br", "embed", "img", "input", "keygen", "wbr": p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) p.oe.pop() p.acknowledgeSelfClosingTag() p.framesetOK = false case "table": if !p.quirks { p.popUntil(buttonScope, "p") } p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false p.im = inTableIM return true 
case "hr": p.popUntil(buttonScope, "p") p.addElement(p.tok.Data, p.tok.Attr) p.oe.pop() p.acknowledgeSelfClosingTag() p.framesetOK = false case "select": p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false p.im = inSelectIM return true case "form": if p.form == nil { p.popUntil(buttonScope, "p") p.addElement(p.tok.Data, p.tok.Attr) p.form = p.top() } case "li": p.framesetOK = false for i := len(p.oe) - 1; i >= 0; i-- { node := p.oe[i] switch node.Data { case "li": p.popUntil(listItemScope, "li") case "address", "div", "p": continue default: if !isSpecialElement(node) { continue } } break } p.popUntil(buttonScope, "p") p.addElement(p.tok.Data, p.tok.Attr) case "dd", "dt": p.framesetOK = false for i := len(p.oe) - 1; i >= 0; i-- { node := p.oe[i] switch node.Data { case "dd", "dt": p.oe = p.oe[:i] case "address", "div", "p": continue default: if !isSpecialElement(node) { continue } } break } p.popUntil(buttonScope, "p") p.addElement(p.tok.Data, p.tok.Attr) case "plaintext": p.popUntil(buttonScope, "p") p.addElement(p.tok.Data, p.tok.Attr) case "button": p.popUntil(defaultScope, "button") p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false case "optgroup", "option": if p.top().Data == "option" { p.oe.pop() } p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) case "body": if len(p.oe) >= 2 { body := p.oe[1] if body.Type == ElementNode && body.Data == "body" { p.framesetOK = false copyAttributes(body, p.tok) } } case "frameset": if !p.framesetOK || len(p.oe) < 2 || p.oe[1].Data != "body" { // Ignore the token. 
return true } body := p.oe[1] if body.Parent != nil { body.Parent.Remove(body) } p.oe = p.oe[:1] p.addElement(p.tok.Data, p.tok.Attr) p.im = inFramesetIM return true case "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title": return inHeadIM(p) case "image": p.tok.Data = "img" return false case "isindex": if p.form != nil { // Ignore the token. return true } action := "" prompt := "This is a searchable index. Enter search keywords: " attr := []Attribute{{Key: "name", Val: "isindex"}} for _, a := range p.tok.Attr { switch a.Key { case "action": action = a.Val case "name": // Ignore the attribute. case "prompt": prompt = a.Val default: attr = append(attr, a) } } p.acknowledgeSelfClosingTag() p.popUntil(buttonScope, "p") p.addElement("form", nil) p.form = p.top() if action != "" { p.form.Attr = []Attribute{{Key: "action", Val: action}} } p.addElement("hr", nil) p.oe.pop() p.addElement("label", nil) p.addText(prompt) p.addElement("input", attr) p.oe.pop() p.oe.pop() p.addElement("hr", nil) p.oe.pop() p.oe.pop() p.form = nil case "xmp": p.popUntil(buttonScope, "p") p.reconstructActiveFormattingElements() p.framesetOK = false p.addElement(p.tok.Data, p.tok.Attr) case "math", "svg": p.reconstructActiveFormattingElements() if p.tok.Data == "math" { // TODO: adjust MathML attributes. } else { // TODO: adjust SVG attributes. } adjustForeignAttributes(p.tok.Attr) p.addElement(p.tok.Data, p.tok.Attr) p.top().Namespace = p.tok.Data return true case "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr": // Ignore the token. default: // TODO. p.addElement(p.tok.Data, p.tok.Attr) } case EndTagToken: switch p.tok.Data { case "body": // TODO: autoclose the stack of open elements. 
p.im = afterBodyIM
			return true
		case "p":
			if !p.elementInScope(buttonScope, "p") {
				p.addElement("p", nil)
			}
			p.popUntil(buttonScope, "p")
		case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
			p.inBodyEndTagFormatting(p.tok.Data)
		case "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul":
			p.popUntil(defaultScope, p.tok.Data)
		case "applet", "marquee", "object":
			if p.popUntil(defaultScope, p.tok.Data) {
				p.clearActiveFormattingElements()
			}
		case "br":
			p.tok.Type = StartTagToken
			return false
		default:
			p.inBodyEndTagOther(p.tok.Data)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	}

	return true
}

// inBodyEndTagFormatting handles an end tag named tag for a formatting
// element. inBodyIM calls it for end tags such as "a", "b", "i" and "nobr",
// and also when a new <a> or <nobr> start tag has to close one that is still
// open.
//
// It may remove entries from, and insert clones into, both the stack of open
// elements (p.oe) and the list of active formatting elements (p.afe), and it
// may move nodes to a different parent in the tree.
func (p *parser) inBodyEndTagFormatting(tag string) {
	// This is the "adoption agency" algorithm, described at
	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-3. The outer loop. It runs at most 8 times; every early exit
	// below returns from the function.
	for i := 0; i < 8; i++ {
		// Step 4. Find the formatting element: the most recent entry in p.afe
		// named tag, searching backwards and stopping at a scope marker.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].Data == tag {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			// No such formatting element: treat the tag like any other end tag.
			p.inBodyEndTagOther(tag)
			return
		}
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			// The element is no longer on the stack of open elements, so it
			// only needs to be dropped from p.afe.
			p.afe.remove(formattingElement)
			return
		}
		if !p.elementInScope(defaultScope, tag) {
			// Ignore the tag.
			return
		}

		// Steps 5-6. Find the furthest block: the first special element found
		// when scanning p.oe from the formatting element towards the top.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No furthest block: pop the stack up to and including the
			// formatting element, then drop it from p.afe.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 7-8. Find the common ancestor and bookmark node.
		// The common ancestor is the stack entry just below the formatting
		// element; the bookmark is an index into p.afe.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 9. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Steps 9.1-9.3.
		for j := 0; j < 3; j++ {
			// Step 9.4. Move one entry down the stack of open elements.
			x--
			node = p.oe[x]
			// Step 9.5. Nodes that are not active formatting elements are
			// removed from the stack.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 9.6.
			if node == formattingElement {
				break
			}
			// Step 9.7. Replace node by a clone in both p.afe and p.oe.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 9.8. On the first reparenting, move the bookmark to just
			// after the new node in p.afe.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 9.9. Detach lastNode and make it a child of node.
			if lastNode.Parent != nil {
				lastNode.Parent.Remove(lastNode)
			}
			node.Add(lastNode)
			// Step 9.10.
			lastNode = node
		}

		// Step 10. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.Remove(lastNode)
		}
		switch commonAncestor.Data {
		case "table", "tbody", "tfoot", "thead", "tr":
			p.fosterParent(lastNode)
		default:
			commonAncestor.Add(lastNode)
		}

		// Steps 11-13. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.Add(clone)

		// Step 14. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 15. Fix up the stack of open elements: the clone goes
		// immediately above the furthest block.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}

// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
func (p *parser) inBodyEndTagOther(tag string) { for i := len(p.oe) - 1; i >= 0; i-- { if p.oe[i].Data == tag { p.oe = p.oe[:i] break } if isSpecialElement(p.oe[i]) { break } } } // Section 12.2.5.4.8. func textIM(p *parser) bool { switch p.tok.Type { case ErrorToken: p.oe.pop() case TextToken: p.addText(p.tok.Data) return true case EndTagToken: p.oe.pop() } p.im = p.originalIM p.originalIM = nil return p.tok.Type == EndTagToken } // Section 12.2.5.4.9. func inTableIM(p *parser) bool { switch p.tok.Type { case ErrorToken: // Stop parsing. return true case TextToken: // TODO. case StartTagToken: switch p.tok.Data { case "caption": p.clearStackToContext(tableScope) p.afe = append(p.afe, &scopeMarker) p.addElement(p.tok.Data, p.tok.Attr) p.im = inCaptionIM return true case "tbody", "tfoot", "thead": p.clearStackToContext(tableScope) p.addElement(p.tok.Data, p.tok.Attr) p.im = inTableBodyIM return true case "td", "th", "tr": p.clearStackToContext(tableScope) p.addElement("tbody", nil) p.im = inTableBodyIM return false case "table": if p.popUntil(tableScope, "table") { p.resetInsertionMode() return false } // Ignore the token. return true case "colgroup": p.clearStackToContext(tableScope) p.addElement(p.tok.Data, p.tok.Attr) p.im = inColumnGroupIM return true case "col": p.clearStackToContext(tableScope) p.addElement("colgroup", p.tok.Attr) p.im = inColumnGroupIM return false case "select": p.reconstructActiveFormattingElements() switch p.top().Data { case "table", "tbody", "tfoot", "thead", "tr": p.fosterParenting = true } p.addElement(p.tok.Data, p.tok.Attr) p.fosterParenting = false p.framesetOK = false p.im = inSelectInTableIM return true default: // TODO. } case EndTagToken: switch p.tok.Data { case "table": if p.popUntil(tableScope, "table") { p.resetInsertionMode() return true } // Ignore the token. return true case "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr": // Ignore the token. 
return true } case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) return true } switch p.top().Data { case "table", "tbody", "tfoot", "thead", "tr": p.fosterParenting = true defer func() { p.fosterParenting = false }() } return inBodyIM(p) } // Section 12.2.5.4.11. func inCaptionIM(p *parser) bool { switch p.tok.Type { case StartTagToken: switch p.tok.Data { case "caption", "col", "colgroup", "tbody", "td", "tfoot", "thead", "tr": if p.popUntil(tableScope, "caption") { p.clearActiveFormattingElements() p.im = inTableIM return false } else { // Ignore the token. return true } case "select": p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false p.im = inSelectInTableIM return true } case EndTagToken: switch p.tok.Data { case "caption": if p.popUntil(tableScope, "caption") { p.clearActiveFormattingElements() p.im = inTableIM } return true case "table": if p.popUntil(tableScope, "caption") { p.clearActiveFormattingElements() p.im = inTableIM return false } else { // Ignore the token. return true } case "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr": // Ignore the token. return true } } return inBodyIM(p) } // Section 12.2.5.4.12. func inColumnGroupIM(p *parser) bool { switch p.tok.Type { case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) return true case DoctypeToken: // Ignore the token. return true case StartTagToken: switch p.tok.Data { case "html": return inBodyIM(p) case "col": p.addElement(p.tok.Data, p.tok.Attr) p.oe.pop() p.acknowledgeSelfClosingTag() return true } case EndTagToken: switch p.tok.Data { case "colgroup": if p.oe.top().Data != "html" { p.oe.pop() p.im = inTableIM } return true case "col": // Ignore the token. return true } } if p.oe.top().Data != "html" { p.oe.pop() p.im = inTableIM return false } return true } // Section 12.2.5.4.13. 
func inTableBodyIM(p *parser) bool { var ( add bool data string attr []Attribute consumed bool ) switch p.tok.Type { case ErrorToken: // TODO. case TextToken: // TODO. case StartTagToken: switch p.tok.Data { case "tr": add = true data = p.tok.Data attr = p.tok.Attr consumed = true case "td", "th": add = true data = "tr" consumed = false case "caption", "col", "colgroup", "tbody", "tfoot", "thead": if !p.popUntil(tableScope, "tbody", "thead", "tfoot") { // Ignore the token. return true } p.im = inTableIM return false default: // TODO. } case EndTagToken: switch p.tok.Data { case "table": if p.popUntil(tableScope, "tbody", "thead", "tfoot") { p.im = inTableIM return false } // Ignore the token. return true case "body", "caption", "col", "colgroup", "html", "td", "th", "tr": // Ignore the token. return true } case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) return true } if add { // TODO: clear the stack back to a table body context. p.addElement(data, attr) p.im = inRowIM return consumed } return inTableIM(p) } // Section 12.2.5.4.14. func inRowIM(p *parser) bool { switch p.tok.Type { case ErrorToken: // TODO. case TextToken: // TODO. case StartTagToken: switch p.tok.Data { case "td", "th": p.clearStackToContext(tableRowScope) p.addElement(p.tok.Data, p.tok.Attr) p.afe = append(p.afe, &scopeMarker) p.im = inCellIM return true case "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr": if p.popUntil(tableScope, "tr") { p.im = inTableBodyIM return false } // Ignore the token. return true default: // TODO. } case EndTagToken: switch p.tok.Data { case "tr": if p.popUntil(tableScope, "tr") { p.im = inTableBodyIM return true } // Ignore the token. return true case "table": if p.popUntil(tableScope, "tr") { p.im = inTableBodyIM return false } // Ignore the token. return true case "tbody", "tfoot", "thead": // TODO. case "body", "caption", "col", "colgroup", "html", "td", "th": // Ignore the token. return true default: // TODO. 
} case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) return true } return inTableIM(p) } // Section 12.2.5.4.15. func inCellIM(p *parser) bool { var ( closeTheCellAndReprocess bool ) switch p.tok.Type { case StartTagToken: switch p.tok.Data { case "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr": // TODO: check for "td" or "th" in table scope. closeTheCellAndReprocess = true case "select": p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false p.im = inSelectInTableIM return true } case EndTagToken: switch p.tok.Data { case "td", "th": if !p.popUntil(tableScope, p.tok.Data) { // Ignore the token. return true } p.clearActiveFormattingElements() p.im = inRowIM return true case "body", "caption", "col", "colgroup", "html": // TODO. case "table", "tbody", "tfoot", "thead", "tr": // TODO: check for matching element in table scope. closeTheCellAndReprocess = true } case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) return true } if closeTheCellAndReprocess { if p.popUntil(tableScope, "td") || p.popUntil(tableScope, "th") { p.clearActiveFormattingElements() p.im = inRowIM return false } } return inBodyIM(p) } // Section 12.2.5.4.16. func inSelectIM(p *parser) bool { endSelect := false switch p.tok.Type { case ErrorToken: // TODO. case TextToken: p.addText(p.tok.Data) case StartTagToken: switch p.tok.Data { case "html": // TODO. case "option": if p.top().Data == "option" { p.oe.pop() } p.addElement(p.tok.Data, p.tok.Attr) case "optgroup": if p.top().Data == "option" { p.oe.pop() } if p.top().Data == "optgroup" { p.oe.pop() } p.addElement(p.tok.Data, p.tok.Attr) case "select": endSelect = true case "input", "keygen", "textarea": // TODO. case "script": // TODO. default: // Ignore the token. 
} case EndTagToken: switch p.tok.Data { case "option": if p.top().Data == "option" { p.oe.pop() } case "optgroup": i := len(p.oe) - 1 if p.oe[i].Data == "option" { i-- } if p.oe[i].Data == "optgroup" { p.oe = p.oe[:i] } case "select": endSelect = true default: // Ignore the token. } case CommentToken: p.doc.Add(&Node{ Type: CommentNode, Data: p.tok.Data, }) } if endSelect { p.endSelect() } return true } // Section 12.2.5.4.17. func inSelectInTableIM(p *parser) bool { switch p.tok.Type { case StartTagToken, EndTagToken: switch p.tok.Data { case "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th": if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.Data) { p.endSelect() return false } else { // Ignore the token. return true } } } return inSelectIM(p) } func (p *parser) endSelect() { for i := len(p.oe) - 1; i >= 0; i-- { switch p.oe[i].Data { case "option", "optgroup": continue case "select": p.oe = p.oe[:i] p.resetInsertionMode() } return } } // Section 12.2.5.4.18. func afterBodyIM(p *parser) bool { switch p.tok.Type { case ErrorToken: // Stop parsing. return true case StartTagToken: if p.tok.Data == "html" { return inBodyIM(p) } case EndTagToken: if p.tok.Data == "html" { p.im = afterAfterBodyIM return true } case CommentToken: // The comment is attached to the element. if len(p.oe) < 1 || p.oe[0].Data != "html" { panic("html: bad parser state: element not found, in the after-body insertion mode") } p.oe[0].Add(&Node{ Type: CommentNode, Data: p.tok.Data, }) return true } p.im = inBodyIM return false } // Section 12.2.5.4.19. func inFramesetIM(p *parser) bool { switch p.tok.Type { case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) case TextToken: // Ignore all text but whitespace. 
s := strings.Map(func(c rune) rune { switch c { case ' ', '\t', '\n', '\f', '\r': return c } return -1 }, p.tok.Data) if s != "" { p.addText(s) } case StartTagToken: switch p.tok.Data { case "html": return inBodyIM(p) case "frameset": p.addElement(p.tok.Data, p.tok.Attr) case "frame": p.addElement(p.tok.Data, p.tok.Attr) p.oe.pop() p.acknowledgeSelfClosingTag() case "noframes": return inHeadIM(p) } case EndTagToken: switch p.tok.Data { case "frameset": if p.oe.top().Data != "html" { p.oe.pop() if p.oe.top().Data != "frameset" { p.im = afterFramesetIM return true } } } default: // Ignore the token. } return true } // Section 12.2.5.4.20. func afterFramesetIM(p *parser) bool { switch p.tok.Type { case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) case TextToken: // Ignore all text but whitespace. s := strings.Map(func(c rune) rune { switch c { case ' ', '\t', '\n', '\f', '\r': return c } return -1 }, p.tok.Data) if s != "" { p.addText(s) } case StartTagToken: switch p.tok.Data { case "html": return inBodyIM(p) case "noframes": return inHeadIM(p) } case EndTagToken: switch p.tok.Data { case "html": p.im = afterAfterFramesetIM return true } default: // Ignore the token. } return true } // Section 12.2.5.4.21. func afterAfterBodyIM(p *parser) bool { switch p.tok.Type { case ErrorToken: // Stop parsing. return true case TextToken: // TODO. case StartTagToken: if p.tok.Data == "html" { return inBodyIM(p) } case CommentToken: p.doc.Add(&Node{ Type: CommentNode, Data: p.tok.Data, }) return true } p.im = inBodyIM return false } // Section 12.2.5.4.22. func afterAfterFramesetIM(p *parser) bool { switch p.tok.Type { case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) case TextToken: // Ignore all text but whitespace. 
s := strings.Map(func(c rune) rune { switch c { case ' ', '\t', '\n', '\f', '\r': return c } return -1 }, p.tok.Data) if s != "" { p.reconstructActiveFormattingElements() p.addText(s) } case StartTagToken: switch p.tok.Data { case "html": return inBodyIM(p) case "noframes": return inHeadIM(p) } default: // Ignore the token. } return true } // Section 12.2.5.5. func parseForeignContent(p *parser) bool { switch p.tok.Type { case TextToken: // TODO: HTML integration points. if p.top().Namespace == "" { inBodyIM(p) p.resetInsertionMode() return true } if p.framesetOK { p.framesetOK = strings.TrimLeft(p.tok.Data, whitespace) == "" } p.addText(p.tok.Data) case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) case StartTagToken: if htmlIntegrationPoint(p.top()) { inBodyIM(p) p.resetInsertionMode() return true } if breakout[p.tok.Data] { for i := len(p.oe) - 1; i >= 0; i-- { // TODO: MathML integration points. if p.oe[i].Namespace == "" || htmlIntegrationPoint(p.oe[i]) { p.oe = p.oe[:i+1] break } } return false } switch p.top().Namespace { case "math": // TODO: adjust MathML attributes. case "svg": // Adjust SVG tag names. The tokenizer lower-cases tag names, but // SVG wants e.g. "foreignObject" with a capital second "O". if x := svgTagNameAdjustments[p.tok.Data]; x != "" { p.tok.Data = x } // TODO: adjust SVG attributes. default: panic("html: bad parser state: unexpected namespace") } adjustForeignAttributes(p.tok.Attr) namespace := p.top().Namespace p.addElement(p.tok.Data, p.tok.Attr) p.top().Namespace = namespace case EndTagToken: for i := len(p.oe) - 1; i >= 0; i-- { if p.oe[i].Namespace == "" { return p.im(p) } if strings.EqualFold(p.oe[i].Data, p.tok.Data) { p.oe = p.oe[:i] break } } return true default: // Ignore the token. } return true } // Section 12.2.5. func (p *parser) inForeignContent() bool { if len(p.oe) == 0 { return false } n := p.oe[len(p.oe)-1] if n.Namespace == "" { return false } // TODO: MathML, HTML integration points. 
// TODO: MathML's annotation-xml combining with SVG's svg. return true } func (p *parser) parse() error { // Iterate until EOF. Any other error will cause an early return. consumed := true for { if consumed { if err := p.read(); err != nil { if err == io.EOF { break } return err } } if p.inForeignContent() { consumed = parseForeignContent(p) } else { consumed = p.im(p) } } // Loop until the final token (the ErrorToken signifying EOF) is consumed. for { if consumed = p.im(p); consumed { break } } return nil } // Parse returns the parse tree for the HTML from the given Reader. // The input is assumed to be UTF-8 encoded. func Parse(r io.Reader) (*Node, error) { p := &parser{ tokenizer: NewTokenizer(r), doc: &Node{ Type: DocumentNode, }, scripting: true, framesetOK: true, im: initialIM, } err := p.parse() if err != nil { return nil, err } return p.doc, nil } // ParseFragment parses a fragment of HTML and returns the nodes that were // found. If the fragment is the InnerHTML for an existing element, pass that // element in context. func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { p := &parser{ tokenizer: NewTokenizer(r), doc: &Node{ Type: DocumentNode, }, scripting: true, context: context, } if context != nil { switch context.Data { case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp": p.tokenizer.rawTag = context.Data } } root := &Node{ Type: ElementNode, Data: "html", } p.doc.Add(root) p.oe = nodeStack{root} p.resetInsertionMode() for n := context; n != nil; n = n.Parent { if n.Type == ElementNode && n.Data == "form" { p.form = n break } } err := p.parse() if err != nil { return nil, err } parent := p.doc if context != nil { parent = root } result := parent.Child parent.Child = nil for _, n := range result { n.Parent = nil } return result, nil }