diff options
author | Ondřej Surý <ondrej@sury.org> | 2011-04-28 10:35:15 +0200 |
---|---|---|
committer | Ondřej Surý <ondrej@sury.org> | 2011-04-28 10:35:15 +0200 |
commit | c1ba1a0fec4aed430709030f98a3bdb90bfeea16 (patch) | |
tree | 3df18657e50a0313ed6defcda30e4474cb28a467 /src/pkg/xml/xml.go | |
parent | 7b15ed9ef455b6b66c6b376898a88aef5d6a9970 (diff) | |
download | golang-c1ba1a0fec4aed430709030f98a3bdb90bfeea16.tar.gz |
Imported Upstream version 2011.04.27upstream/2011.04.27
Diffstat (limited to 'src/pkg/xml/xml.go')
-rw-r--r-- | src/pkg/xml/xml.go | 73 |
1 files changed, 62 insertions, 11 deletions
diff --git a/src/pkg/xml/xml.go b/src/pkg/xml/xml.go index f92abe825..42d8b986e 100644 --- a/src/pkg/xml/xml.go +++ b/src/pkg/xml/xml.go @@ -163,6 +163,13 @@ type Parser struct { // "quot": `"`, Entity map[string]string + // CharsetReader, if non-nil, defines a function to generate + // charset-conversion readers, converting from the provided + // non-UTF-8 charset into UTF-8. If CharsetReader is nil or + // returns an error, parsing stops with an error. One of the + // the CharsetReader's result values must be non-nil. + CharsetReader func(charset string, input io.Reader) (io.Reader, os.Error) + r io.ByteReader buf bytes.Buffer saved *bytes.Buffer @@ -186,17 +193,7 @@ func NewParser(r io.Reader) *Parser { line: 1, Strict: true, } - - // Get efficient byte at a time reader. - // Assume that if reader has its own - // ReadByte, it's efficient enough. - // Otherwise, use bufio. - if rb, ok := r.(io.ByteReader); ok { - p.r = rb - } else { - p.r = bufio.NewReader(r) - } - + p.switchToReader(r) return p } @@ -290,6 +287,18 @@ func (p *Parser) translate(n *Name, isElementName bool) { } } +func (p *Parser) switchToReader(r io.Reader) { + // Get efficient byte at a time reader. + // Assume that if reader has its own + // ReadByte, it's efficient enough. + // Otherwise, use bufio. + if rb, ok := r.(io.ByteReader); ok { + p.r = rb + } else { + p.r = bufio.NewReader(r) + } +} + // Parsing state - stack holds old name space translations // and the current set of open elements. The translations to pop when // ending a given tag are *below* it on the stack, which is @@ -487,6 +496,25 @@ func (p *Parser) RawToken() (Token, os.Error) { } data := p.buf.Bytes() data = data[0 : len(data)-2] // chop ?> + + if target == "xml" { + enc := procInstEncoding(string(data)) + if enc != "" && enc != "utf-8" && enc != "UTF-8" { + if p.CharsetReader == nil { + p.err = fmt.Errorf("xml: encoding %q declared but Parser.CharsetReader is nil", enc) + return nil, p.err + } + newr, err := p.CharsetReader(enc, p.r.(io.Reader)) + if err != nil { + p.err = fmt.Errorf("xml: opening charset %q: %v", enc, err) + return nil, p.err + } + if newr == nil { + panic("CharsetReader returned a nil Reader for charset " + enc) + } + p.switchToReader(newr) + } + } return ProcInst{target, data}, nil case '!': @@ -1633,3 +1661,26 @@ func Escape(w io.Writer, s []byte) { } w.Write(s[last:]) } + +// procInstEncoding parses the `encoding="..."` or `encoding='...'` +// value out of the provided string, returning "" if not found. +func procInstEncoding(s string) string { + // TODO: this parsing is somewhat lame and not exact. + // It works for all actual cases, though. + idx := strings.Index(s, "encoding=") + if idx == -1 { + return "" + } + v := s[idx+len("encoding="):] + if v == "" { + return "" + } + if v[0] != '\'' && v[0] != '"' { + return "" + } + idx = strings.IndexRune(v[1:], int(v[0])) + if idx == -1 { + return "" + } + return v[1 : idx+1] +} |