Imported Upstream version 2011.04.27 upstream/2011.04.27

author: Ondřej Surý <ondrej@sury.org> 2011-04-28 10:35:15 +0200
committer: Ondřej Surý <ondrej@sury.org> 2011-04-28 10:35:15 +0200
commit: c1ba1a0fec4aed430709030f98a3bdb90bfeea16 (patch)
tree: 3df18657e50a0313ed6defcda30e4474cb28a467 /src/pkg/xml/xml.go
parent: 7b15ed9ef455b6b66c6b376898a88aef5d6a9970 (diff)
download: golang-c1ba1a0fec4aed430709030f98a3bdb90bfeea16.tar.gz
1 files changed, 62 insertions, 11 deletions
diff --git a/src/pkg/xml/xml.go b/src/pkg/xml/xml.go
index f92abe825..42d8b986e 100644
--- a/src/pkg/xml/xml.go
+++ b/src/pkg/xml/xml.go
@@ -163,6 +163,13 @@ type Parser struct {
 	//	"quot": `"`,
 	Entity map[string]string
 
+	// CharsetReader, if non-nil, defines a function to generate
+	// charset-conversion readers, converting from the provided
+	// non-UTF-8 charset into UTF-8. If CharsetReader is nil or
+	// returns an error, parsing stops with an error. One of the
+	// CharsetReader's result values must be non-nil.
+	CharsetReader func(charset string, input io.Reader) (io.Reader, os.Error)
+
 	r         io.ByteReader
 	buf       bytes.Buffer
 	saved     *bytes.Buffer
@@ -186,17 +193,7 @@ func NewParser(r io.Reader) *Parser {
 		line:     1,
 		Strict:   true,
 	}
-
-	// Get efficient byte at a time reader.
-	// Assume that if reader has its own
-	// ReadByte, it's efficient enough.
-	// Otherwise, use bufio.
-	if rb, ok := r.(io.ByteReader); ok {
-		p.r = rb
-	} else {
-		p.r = bufio.NewReader(r)
-	}
-
+	p.switchToReader(r)
 	return p
 }
 
@@ -290,6 +287,18 @@ func (p *Parser) translate(n *Name, isElementName bool) {
 	}
 }
 
+func (p *Parser) switchToReader(r io.Reader) {
+	// Make r the parser's byte-at-a-time input (p.r).
+	// If r already implements io.ByteReader, use it directly,
+	// assuming its own ReadByte is efficient enough;
+	// otherwise wrap it in a bufio.Reader.
+	if rb, ok := r.(io.ByteReader); ok {
+		p.r = rb
+	} else {
+		p.r = bufio.NewReader(r)
+	}
+}
+
 // Parsing state - stack holds old name space translations
 // and the current set of open elements.  The translations to pop when
 // ending a given tag are *below* it on the stack, which is
@@ -487,6 +496,25 @@ func (p *Parser) RawToken() (Token, os.Error) {
 		}
 		data := p.buf.Bytes()
 		data = data[0 : len(data)-2] // chop ?>
+
+		if target == "xml" {
+			enc := procInstEncoding(string(data))
+			if enc != "" && enc != "utf-8" && enc != "UTF-8" {
+				if p.CharsetReader == nil {
+					p.err = fmt.Errorf("xml: encoding %q declared but Parser.CharsetReader is nil", enc)
+					return nil, p.err
+				}
+				newr, err := p.CharsetReader(enc, p.r.(io.Reader))
+				if err != nil {
+					p.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
+					return nil, p.err
+				}
+				if newr == nil {
+					panic("CharsetReader returned a nil Reader for charset " + enc)
+				}
+				p.switchToReader(newr)
+			}
+		}
 		return ProcInst{target, data}, nil
 
 	case '!':
@@ -1633,3 +1661,26 @@ func Escape(w io.Writer, s []byte) {
 	}
 	w.Write(s[last:])
 }
+
+// procInstEncoding returns the quoted value ('...' or "...") that follows
+// the first "encoding=" in s, or "" if no such quoted value is found.
+func procInstEncoding(s string) string {
+	// TODO: this is a plain substring search, not an exact parse of the
+	// declaration. It works for all actual cases, though.
+	idx := strings.Index(s, "encoding=")
+	if idx == -1 {
+		return ""
+	}
+	v := s[idx+len("encoding="):] // everything after "encoding="
+	if v == "" {
+		return ""
+	}
+	if v[0] != '\'' && v[0] != '"' { // value must open with a quote
+		return ""
+	}
+	idx = strings.IndexRune(v[1:], int(v[0])) // matching closing quote
+	if idx == -1 {
+		return ""
+	}
+	return v[1 : idx+1] // idx is relative to v[1:], hence the +1
+}
author	Ondřej Surý <ondrej@sury.org>	2011-04-28 10:35:15 +0200
committer	Ondřej Surý <ondrej@sury.org>	2011-04-28 10:35:15 +0200
commit	c1ba1a0fec4aed430709030f98a3bdb90bfeea16 (patch)
tree	3df18657e50a0313ed6defcda30e4474cb28a467 /src/pkg/xml/xml.go
parent	7b15ed9ef455b6b66c6b376898a88aef5d6a9970 (diff)
download	golang-c1ba1a0fec4aed430709030f98a3bdb90bfeea16.tar.gz