summary refs log tree commit diff
path: root/src/pkg/encoding/xml/xml.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/pkg/encoding/xml/xml.go')
-rw-r--r-- src/pkg/encoding/xml/xml.go 290
1 files changed, 192 insertions, 98 deletions
diff --git a/src/pkg/encoding/xml/xml.go b/src/pkg/encoding/xml/xml.go
index 5066f5c01..143fec554 100644
--- a/src/pkg/encoding/xml/xml.go
+++ b/src/pkg/encoding/xml/xml.go
@@ -181,7 +181,6 @@ type Decoder struct {
ns map[string]string
err error
line int
- tmp [32]byte
}
// NewDecoder creates a new XML parser reading from r.
@@ -584,6 +583,7 @@ func (d *Decoder) RawToken() (Token, error) {
if inquote == 0 && b == '>' && depth == 0 {
break
}
+ HandleB:
d.buf.WriteByte(b)
switch {
case b == inquote:
@@ -599,7 +599,35 @@ func (d *Decoder) RawToken() (Token, error) {
depth--
case b == '<' && inquote == 0:
- depth++
+ // Look for <!-- to begin comment.
+ s := "!--"
+ for i := 0; i < len(s); i++ {
+ if b, ok = d.mustgetc(); !ok {
+ return nil, d.err
+ }
+ if b != s[i] {
+ for j := 0; j < i; j++ {
+ d.buf.WriteByte(s[j])
+ }
+ depth++
+ goto HandleB
+ }
+ }
+
+ // Remove < that was written above.
+ d.buf.Truncate(d.buf.Len() - 1)
+
+ // Look for terminator.
+ var b0, b1 byte
+ for {
+ if b, ok = d.mustgetc(); !ok {
+ return nil, d.err
+ }
+ if b0 == '-' && b1 == '-' && b == '>' {
+ break
+ }
+ b0, b1 = b1, b
+ }
}
}
return Directive(d.buf.Bytes()), nil
@@ -848,78 +876,103 @@ Input:
// XML in all its glory allows a document to define and use
// its own character names with <!ENTITY ...> directives.
// Parsers are required to recognize lt, gt, amp, apos, and quot
- // even if they have not been declared. That's all we allow.
- var i int
- for i = 0; i < len(d.tmp); i++ {
- var ok bool
- d.tmp[i], ok = d.getc()
- if !ok {
- if d.err == io.EOF {
- d.err = d.syntaxError("unexpected EOF")
- }
+ // even if they have not been declared.
+ before := d.buf.Len()
+ d.buf.WriteByte('&')
+ var ok bool
+ var text string
+ var haveText bool
+ if b, ok = d.mustgetc(); !ok {
+ return nil
+ }
+ if b == '#' {
+ d.buf.WriteByte(b)
+ if b, ok = d.mustgetc(); !ok {
return nil
}
- c := d.tmp[i]
- if c == ';' {
- break
- }
- if 'a' <= c && c <= 'z' ||
- 'A' <= c && c <= 'Z' ||
- '0' <= c && c <= '9' ||
- c == '_' || c == '#' {
- continue
+ base := 10
+ if b == 'x' {
+ base = 16
+ d.buf.WriteByte(b)
+ if b, ok = d.mustgetc(); !ok {
+ return nil
+ }
}
- d.ungetc(c)
- break
- }
- s := string(d.tmp[0:i])
- if i >= len(d.tmp) {
- if !d.Strict {
- b0, b1 = 0, 0
- d.buf.WriteByte('&')
- d.buf.Write(d.tmp[0:i])
- continue Input
+ start := d.buf.Len()
+ for '0' <= b && b <= '9' ||
+ base == 16 && 'a' <= b && b <= 'f' ||
+ base == 16 && 'A' <= b && b <= 'F' {
+ d.buf.WriteByte(b)
+ if b, ok = d.mustgetc(); !ok {
+ return nil
+ }
}
- d.err = d.syntaxError("character entity expression &" + s + "... too long")
- return nil
- }
- var haveText bool
- var text string
- if i >= 2 && s[0] == '#' {
- var n uint64
- var err error
- if i >= 3 && s[1] == 'x' {
- n, err = strconv.ParseUint(s[2:], 16, 64)
+ if b != ';' {
+ d.ungetc(b)
} else {
- n, err = strconv.ParseUint(s[1:], 10, 64)
- }
- if err == nil && n <= unicode.MaxRune {
- text = string(n)
- haveText = true
+ s := string(d.buf.Bytes()[start:])
+ d.buf.WriteByte(';')
+ n, err := strconv.ParseUint(s, base, 64)
+ if err == nil && n <= unicode.MaxRune {
+ text = string(n)
+ haveText = true
+ }
}
} else {
- if r, ok := entity[s]; ok {
- text = string(r)
- haveText = true
- } else if d.Entity != nil {
- text, haveText = d.Entity[s]
+ d.ungetc(b)
+ if !d.readName() {
+ if d.err != nil {
+ return nil
+ }
+ ok = false
}
- }
- if !haveText {
- if !d.Strict {
- b0, b1 = 0, 0
- d.buf.WriteByte('&')
- d.buf.Write(d.tmp[0:i])
- continue Input
+ if b, ok = d.mustgetc(); !ok {
+ return nil
}
- d.err = d.syntaxError("invalid character entity &" + s + ";")
- return nil
+ if b != ';' {
+ d.ungetc(b)
+ } else {
+ name := d.buf.Bytes()[before+1:]
+ d.buf.WriteByte(';')
+ if isName(name) {
+ s := string(name)
+ if r, ok := entity[s]; ok {
+ text = string(r)
+ haveText = true
+ } else if d.Entity != nil {
+ text, haveText = d.Entity[s]
+ }
+ }
+ }
+ }
+
+ if haveText {
+ d.buf.Truncate(before)
+ d.buf.Write([]byte(text))
+ b0, b1 = 0, 0
+ continue Input
}
- d.buf.Write([]byte(text))
- b0, b1 = 0, 0
- continue Input
+ if !d.Strict {
+ b0, b1 = 0, 0
+ continue Input
+ }
+ ent := string(d.buf.Bytes()[before:]) // slice from '&' to the end: the whole entity text, not just its first byte
+ if ent[len(ent)-1] != ';' {
+ ent += " (no semicolon)"
+ }
+ d.err = d.syntaxError("invalid character entity " + ent)
+ return nil
}
- d.buf.WriteByte(b)
+
+ // We must rewrite unescaped \r and \r\n into \n.
+ if b == '\r' {
+ d.buf.WriteByte('\n')
+ } else if b1 == '\r' && b == '\n' {
+ // Skip \r\n--we already wrote \n.
+ } else {
+ d.buf.WriteByte(b)
+ }
+
b0, b1 = b1, b
}
data := d.buf.Bytes()
@@ -940,20 +993,7 @@ Input:
}
}
- // Must rewrite \r and \r\n into \n.
- w := 0
- for r := 0; r < len(data); r++ {
- b := data[r]
- if b == '\r' {
- if r+1 < len(data) && data[r+1] == '\n' {
- continue
- }
- b = '\n'
- }
- data[w] = b
- w++
- }
- return data[0:w]
+ return data
}
// Decide whether the given rune is in the XML Character Range, per
@@ -989,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) {
// Do not set d.err if the name is missing (unless unexpected EOF is received):
// let the caller provide better context.
func (d *Decoder) name() (s string, ok bool) {
+ d.buf.Reset() // readName appends to d.buf, so start from an empty buffer
+ if !d.readName() {
+ return "", false
+ }
+
+ // Now we check the characters.
+ b := d.buf.Bytes() // validate the buffered bytes in place; convert to string only once
+ if !isName(b) {
+ d.err = d.syntaxError("invalid XML name: " + string(b))
+ return "", false
+ }
+ return string(b), true
+}
+
+// Read a name and append its bytes to d.buf.
+// The name is delimited by any single-byte character not valid in names.
+// All multi-byte characters are accepted; the caller must check their validity.
+func (d *Decoder) readName() (ok bool) {
var b byte
if b, ok = d.mustgetc(); !ok {
return
}
-
- // As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*
if b < utf8.RuneSelf && !isNameByte(b) {
d.ungetc(b)
- return "", false
+ return false
}
- d.buf.Reset()
d.buf.WriteByte(b)
+
for {
if b, ok = d.mustgetc(); !ok {
return
@@ -1011,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) {
}
d.buf.WriteByte(b)
}
-
- // Then we check the characters.
- s = d.buf.String()
- for i, c := range s {
- if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) {
- d.err = d.syntaxError("invalid XML name: " + s)
- return "", false
- }
- }
- return s, true
+ return true
}
func isNameByte(c byte) bool {
@@ -1030,6 +1077,30 @@ func isNameByte(c byte) bool {
c == '_' || c == ':' || c == '.' || c == '-'
}
+func isName(s []byte) bool { // reports whether s is a well-formed XML name
+ if len(s) == 0 {
+ return false // an empty byte slice is never a name
+ }
+ c, n := utf8.DecodeRune(s)
+ if c == utf8.RuneError && n == 1 { // RuneError with width 1 marks invalid UTF-8
+ return false
+ }
+ if !unicode.Is(first, c) { // the leading rune must be in the "first" table
+ return false
+ }
+ for n < len(s) {
+ s = s[n:] // drop the rune that was just checked
+ c, n = utf8.DecodeRune(s)
+ if c == utf8.RuneError && n == 1 {
+ return false
+ }
+ if !unicode.Is(first, c) && !unicode.Is(second, c) { // later runes may come from either table
+ return false
+ }
+ }
+ return true
+}
+
// These tables were generated by cut and paste from Appendix B of
// the XML spec at http://www.xml.com/axml/testaxml.htm
// and then reformatting. First corresponds to (Letter | '_' | ':')
@@ -1621,7 +1692,7 @@ var HTMLAutoClose = htmlAutoClose
var htmlAutoClose = []string{
/*
hget http://www.w3.org/TR/html4/loose.dtd |
- 9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/ "\1",/p' | tr A-Z a-z
+ 9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/ "\1",/p' | tr A-Z a-z
*/
"basefont",
"br",
@@ -1631,7 +1702,7 @@ var htmlAutoClose = []string{
"param",
"hr",
"input",
- "col ",
+ "col",
"frame",
"isindex",
"base",
@@ -1644,11 +1715,14 @@ var (
esc_amp = []byte("&amp;")
esc_lt = []byte("&lt;")
esc_gt = []byte("&gt;")
+ esc_tab = []byte("&#x9;")
+ esc_nl = []byte("&#xA;")
+ esc_cr = []byte("&#xD;")
)
-// Escape writes to w the properly escaped XML equivalent
+// EscapeText writes to w the properly escaped XML equivalent
// of the plain text data s.
-func Escape(w io.Writer, s []byte) {
+func EscapeText(w io.Writer, s []byte) error {
var esc []byte
last := 0
for i, c := range s {
@@ -1663,14 +1737,34 @@ func Escape(w io.Writer, s []byte) {
esc = esc_lt
case '>':
esc = esc_gt
+ case '\t':
+ esc = esc_tab
+ case '\n':
+ esc = esc_nl
+ case '\r':
+ esc = esc_cr
default:
continue
}
- w.Write(s[last:i])
- w.Write(esc)
+ if _, err := w.Write(s[last:i]); err != nil {
+ return err
+ }
+ if _, err := w.Write(esc); err != nil {
+ return err
+ }
last = i + 1
}
- w.Write(s[last:])
+ if _, err := w.Write(s[last:]); err != nil {
+ return err
+ }
+ return nil
+}
+
+// Escape is like EscapeText but omits the error return value.
+// It is provided for backwards compatibility with Go 1.0.
+// Code targeting Go 1.1 or later should use EscapeText.
func Escape(w io.Writer, s []byte) {
+ EscapeText(w, s) // any write error is discarded here: Escape has no result to report it through
}
// procInstEncoding parses the `encoding="..."` or `encoding='...'`