diff options
Diffstat (limited to 'src/pkg/encoding/xml/xml.go')
| -rw-r--r-- | src/pkg/encoding/xml/xml.go | 290 | 
1 files changed, 192 insertions, 98 deletions
| diff --git a/src/pkg/encoding/xml/xml.go b/src/pkg/encoding/xml/xml.go index 5066f5c01..143fec554 100644 --- a/src/pkg/encoding/xml/xml.go +++ b/src/pkg/encoding/xml/xml.go @@ -181,7 +181,6 @@ type Decoder struct {  	ns        map[string]string  	err       error  	line      int -	tmp       [32]byte  }  // NewDecoder creates a new XML parser reading from r. @@ -584,6 +583,7 @@ func (d *Decoder) RawToken() (Token, error) {  			if inquote == 0 && b == '>' && depth == 0 {  				break  			} +		HandleB:  			d.buf.WriteByte(b)  			switch {  			case b == inquote: @@ -599,7 +599,35 @@ func (d *Decoder) RawToken() (Token, error) {  				depth--  			case b == '<' && inquote == 0: -				depth++ +				// Look for <!-- to begin comment. +				s := "!--" +				for i := 0; i < len(s); i++ { +					if b, ok = d.mustgetc(); !ok { +						return nil, d.err +					} +					if b != s[i] { +						for j := 0; j < i; j++ { +							d.buf.WriteByte(s[j]) +						} +						depth++ +						goto HandleB +					} +				} + +				// Remove < that was written above. +				d.buf.Truncate(d.buf.Len() - 1) + +				// Look for terminator. +				var b0, b1 byte +				for { +					if b, ok = d.mustgetc(); !ok { +						return nil, d.err +					} +					if b0 == '-' && b1 == '-' && b == '>' { +						break +					} +					b0, b1 = b1, b +				}  			}  		}  		return Directive(d.buf.Bytes()), nil @@ -848,78 +876,103 @@ Input:  			// XML in all its glory allows a document to define and use  			// its own character names with <!ENTITY ...> directives.  			// Parsers are required to recognize lt, gt, amp, apos, and quot -			// even if they have not been declared.  That's all we allow. -			var i int -			for i = 0; i < len(d.tmp); i++ { -				var ok bool -				d.tmp[i], ok = d.getc() -				if !ok { -					if d.err == io.EOF { -						d.err = d.syntaxError("unexpected EOF") -					} +			// even if they have not been declared. +			before := d.buf.Len() +			d.buf.WriteByte('&') +			var ok bool +			var text string +			var haveText bool +			if b, ok = d.mustgetc(); !ok { +				return nil +			} +			if b == '#' { +				d.buf.WriteByte(b) +				if b, ok = d.mustgetc(); !ok {  					return nil  				} -				c := d.tmp[i] -				if c == ';' { -					break -				} -				if 'a' <= c && c <= 'z' || -					'A' <= c && c <= 'Z' || -					'0' <= c && c <= '9' || -					c == '_' || c == '#' { -					continue +				base := 10 +				if b == 'x' { +					base = 16 +					d.buf.WriteByte(b) +					if b, ok = d.mustgetc(); !ok { +						return nil +					}  				} -				d.ungetc(c) -				break -			} -			s := string(d.tmp[0:i]) -			if i >= len(d.tmp) { -				if !d.Strict { -					b0, b1 = 0, 0 -					d.buf.WriteByte('&') -					d.buf.Write(d.tmp[0:i]) -					continue Input +				start := d.buf.Len() +				for '0' <= b && b <= '9' || +					base == 16 && 'a' <= b && b <= 'f' || +					base == 16 && 'A' <= b && b <= 'F' { +					d.buf.WriteByte(b) +					if b, ok = d.mustgetc(); !ok { +						return nil +					}  				} -				d.err = d.syntaxError("character entity expression &" + s + "... too long") -				return nil -			} -			var haveText bool -			var text string -			if i >= 2 && s[0] == '#' { -				var n uint64 -				var err error -				if i >= 3 && s[1] == 'x' { -					n, err = strconv.ParseUint(s[2:], 16, 64) +				if b != ';' { +					d.ungetc(b)  				} else { -					n, err = strconv.ParseUint(s[1:], 10, 64) -				} -				if err == nil && n <= unicode.MaxRune { -					text = string(n) -					haveText = true +					s := string(d.buf.Bytes()[start:]) +					d.buf.WriteByte(';') +					n, err := strconv.ParseUint(s, base, 64) +					if err == nil && n <= unicode.MaxRune { +						text = string(n) +						haveText = true +					}  				}  			} else { -				if r, ok := entity[s]; ok { -					text = string(r) -					haveText = true -				} else if d.Entity != nil { -					text, haveText = d.Entity[s] +				d.ungetc(b) +				if !d.readName() { +					if d.err != nil { +						return nil +					} +					ok = false  				} -			} -			if !haveText { -				if !d.Strict { -					b0, b1 = 0, 0 -					d.buf.WriteByte('&') -					d.buf.Write(d.tmp[0:i]) -					continue Input +				if b, ok = d.mustgetc(); !ok { +					return nil  				} -				d.err = d.syntaxError("invalid character entity &" + s + ";") -				return nil +				if b != ';' { +					d.ungetc(b) +				} else { +					name := d.buf.Bytes()[before+1:] +					d.buf.WriteByte(';') +					if isName(name) { +						s := string(name) +						if r, ok := entity[s]; ok { +							text = string(r) +							haveText = true +						} else if d.Entity != nil { +							text, haveText = d.Entity[s] +						} +					} +				} +			} + +			if haveText { +				d.buf.Truncate(before) +				d.buf.Write([]byte(text)) +				b0, b1 = 0, 0 +				continue Input  			} -			d.buf.Write([]byte(text)) -			b0, b1 = 0, 0 -			continue Input +			if !d.Strict { +				b0, b1 = 0, 0 +				continue Input +			} +			ent := string(d.buf.Bytes()[before]) +			if ent[len(ent)-1] != ';' { +				ent += " (no semicolon)" +			} +			d.err = d.syntaxError("invalid character entity " + ent) +			return nil  		} -		d.buf.WriteByte(b) + +		// We must rewrite unescaped \r and \r\n into \n. +		if b == '\r' { +			d.buf.WriteByte('\n') +		} else if b1 == '\r' && b == '\n' { +			// Skip \r\n--we already wrote \n. +		} else { +			d.buf.WriteByte(b) +		} +  		b0, b1 = b1, b  	}  	data := d.buf.Bytes() @@ -940,20 +993,7 @@ Input:  		}  	} -	// Must rewrite \r and \r\n into \n. -	w := 0 -	for r := 0; r < len(data); r++ { -		b := data[r] -		if b == '\r' { -			if r+1 < len(data) && data[r+1] == '\n' { -				continue -			} -			b = '\n' -		} -		data[w] = b -		w++ -	} -	return data[0:w] +	return data  }  // Decide whether the given rune is in the XML Character Range, per @@ -989,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) {  // Do not set d.err if the name is missing (unless unexpected EOF is received):  // let the caller provide better context.  func (d *Decoder) name() (s string, ok bool) { +	d.buf.Reset() +	if !d.readName() { +		return "", false +	} + +	// Now we check the characters. +	s = d.buf.String() +	if !isName([]byte(s)) { +		d.err = d.syntaxError("invalid XML name: " + s) +		return "", false +	} +	return s, true +} + +// Read a name and append its bytes to d.buf. +// The name is delimited by any single-byte character not valid in names. +// All multi-byte characters are accepted; the caller must check their validity. +func (d *Decoder) readName() (ok bool) {  	var b byte  	if b, ok = d.mustgetc(); !ok {  		return  	} - -	// As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*  	if b < utf8.RuneSelf && !isNameByte(b) {  		d.ungetc(b) -		return "", false +		return false  	} -	d.buf.Reset()  	d.buf.WriteByte(b) +  	for {  		if b, ok = d.mustgetc(); !ok {  			return @@ -1011,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) {  		}  		d.buf.WriteByte(b)  	} - -	// Then we check the characters. -	s = d.buf.String() -	for i, c := range s { -		if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) { -			d.err = d.syntaxError("invalid XML name: " + s) -			return "", false -		} -	} -	return s, true +	return true  }  func isNameByte(c byte) bool { @@ -1030,6 +1077,30 @@ func isNameByte(c byte) bool {  		c == '_' || c == ':' || c == '.' || c == '-'  } +func isName(s []byte) bool { +	if len(s) == 0 { +		return false +	} +	c, n := utf8.DecodeRune(s) +	if c == utf8.RuneError && n == 1 { +		return false +	} +	if !unicode.Is(first, c) { +		return false +	} +	for n < len(s) { +		s = s[n:] +		c, n = utf8.DecodeRune(s) +		if c == utf8.RuneError && n == 1 { +			return false +		} +		if !unicode.Is(first, c) && !unicode.Is(second, c) { +			return false +		} +	} +	return true +} +  // These tables were generated by cut and paste from Appendix B of  // the XML spec at http://www.xml.com/axml/testaxml.htm  // and then reformatting.  First corresponds to (Letter | '_' | ':') @@ -1621,7 +1692,7 @@ var HTMLAutoClose = htmlAutoClose  var htmlAutoClose = []string{  	/*  		hget http://www.w3.org/TR/html4/loose.dtd | -		9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/	"\1",/p' | tr A-Z a-z +		9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/	"\1",/p' | tr A-Z a-z  	*/  	"basefont",  	"br", @@ -1631,7 +1702,7 @@ var htmlAutoClose = []string{  	"param",  	"hr",  	"input", -	"col     ", +	"col",  	"frame",  	"isindex",  	"base", @@ -1644,11 +1715,14 @@ var (  	esc_amp  = []byte("&")  	esc_lt   = []byte("<")  	esc_gt   = []byte(">") +	esc_tab  = []byte("	") +	esc_nl   = []byte("
") +	esc_cr   = []byte("
")  ) -// Escape writes to w the properly escaped XML equivalent +// EscapeText writes to w the properly escaped XML equivalent  // of the plain text data s. -func Escape(w io.Writer, s []byte) { +func EscapeText(w io.Writer, s []byte) error {  	var esc []byte  	last := 0  	for i, c := range s { @@ -1663,14 +1737,34 @@ func Escape(w io.Writer, s []byte) {  			esc = esc_lt  		case '>':  			esc = esc_gt +		case '\t': +			esc = esc_tab +		case '\n': +			esc = esc_nl +		case '\r': +			esc = esc_cr  		default:  			continue  		} -		w.Write(s[last:i]) -		w.Write(esc) +		if _, err := w.Write(s[last:i]); err != nil { +			return err +		} +		if _, err := w.Write(esc); err != nil { +			return err +		}  		last = i + 1  	} -	w.Write(s[last:]) +	if _, err := w.Write(s[last:]); err != nil { +		return err +	} +	return nil +} + +// Escape is like EscapeText but omits the error return value. +// It is provided for backwards compatibility with Go 1.0. +// Code targeting Go 1.1 or later should use EscapeText. +func Escape(w io.Writer, s []byte) { +	EscapeText(w, s)  }  // procInstEncoding parses the `encoding="..."` or `encoding='...'` | 
