diff options
| author | Russ Cox <rsc@golang.org> | 2009-10-05 15:10:00 -0700 |
|---|---|---|
| committer | Russ Cox <rsc@golang.org> | 2009-10-05 15:10:00 -0700 |
| commit | caec21dcf8031743dc8c026b32fd8910b4f26eb5 (patch) | |
| tree | 9aa7a18694ca5e8e809b28c83b911fa4784a7533 /src | |
| parent | 555f87a06c92e3055f738219ab50829d9afa431b (diff) | |
| download | golang-caec21dcf8031743dc8c026b32fd8910b4f26eb5.tar.gz | |
XML parser
R=r
DELTA=546 (545 added, 0 deleted, 1 changed)
OCL=35318
CL=35341
Diffstat (limited to 'src')
| -rw-r--r-- | src/pkg/Make.deps | 3 | ||||
| -rw-r--r-- | src/pkg/Makefile | 1 | ||||
| -rw-r--r-- | src/pkg/xml/Makefile | 13 | ||||
| -rw-r--r-- | src/pkg/xml/read.go | 320 | ||||
| -rw-r--r-- | src/pkg/xml/read_test.go | 210 |
5 files changed, 546 insertions, 1 deletions
diff --git a/src/pkg/Make.deps b/src/pkg/Make.deps index 1f85b2c39..9f95da540 100644 --- a/src/pkg/Make.deps +++ b/src/pkg/Make.deps @@ -30,7 +30,7 @@ expvar.install: bytes.install fmt.install http.install log.install strconv.insta flag.install: fmt.install os.install strconv.install fmt.install: io.install os.install reflect.install strconv.install utf8.install go/ast.install: go/token.install unicode.install utf8.install -go/doc.install: container/vector.install go/ast.install go/token.install io.install once.install regexp.install sort.install strings.install template.install +go/doc.install: container/vector.install go/ast.install go/token.install io.install regexp.install sort.install strings.install template.install go/parser.install: bytes.install container/vector.install fmt.install go/ast.install go/scanner.install go/token.install io.install os.install path.install strings.install go/printer.install: bytes.install container/vector.install fmt.install go/ast.install go/token.install io.install os.install reflect.install runtime.install strings.install tabwriter.install go/scanner.install: bytes.install container/vector.install fmt.install go/token.install io.install os.install sort.install strconv.install unicode.install utf8.install @@ -68,3 +68,4 @@ testing/iotest.install: bytes.install io.install log.install os.install time.install: io.install once.install os.install syscall.install unicode.install: utf8.install: unicode.install +xml.install: bufio.install bytes.install io.install os.install reflect.install strconv.install strings.install unicode.install utf8.install diff --git a/src/pkg/Makefile b/src/pkg/Makefile index 6dd11f93e..13899671d 100644 --- a/src/pkg/Makefile +++ b/src/pkg/Makefile @@ -82,6 +82,7 @@ DIRS=\ time\ unicode\ utf8\ + xml\ NOTEST=\ debug/proc\ diff --git a/src/pkg/xml/Makefile b/src/pkg/xml/Makefile new file mode 100644 index 000000000..3c005c605 --- /dev/null +++ b/src/pkg/xml/Makefile @@ -0,0 +1,13 @@ +# Copyright 2009 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include $(GOROOT)/src/Make.$(GOARCH) + +TARG=xml + +GOFILES=\ + read.go\ + xml.go\ + +include $(GOROOT)/src/Make.pkg diff --git a/src/pkg/xml/read.go b/src/pkg/xml/read.go new file mode 100644 index 000000000..c6f81ee5c --- /dev/null +++ b/src/pkg/xml/read.go @@ -0,0 +1,320 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xml + +import ( + "bytes"; + "io"; + "os"; + "reflect"; + "strings"; +) + +// BUG(rsc): Mapping between XML elements and data structures is inherently flawed: +// an XML element is an order-dependent collection of anonymous +// values, while a data structure is an order-independent collection +// of named values. +// See package json for a textual representation more suitable +// to data structures. + +// Unmarshal parses an XML element from r and uses the +// reflect library to fill in an arbitrary struct, slice, or string +// pointed at by val. Well-formed data that does not fit +// into val is discarded. +// +// For example, given these definitions: +// +// type Email struct { +// Where string "attr"; +// Addr string; +// } +// +// type Result struct { +// XMLName xml.Name "result"; +// Name string; +// Phone string; +// Email []Email; +// } +// +// var result = Result{ "name", "phone", nil } +// +// unmarshalling the XML input +// +// <result> +// <email where="home"> +// <addr>gre@example.com</addr> +// </email> +// <email where='work'> +// <addr>gre@work.com</addr> +// </email> +// <name>Grace R. Emlin</name> +// <address>123 Main Street</address> +// </result> +// +// via Unmarshal(r, &result) is equivalent to assigning +// +// r = Result{ +// xml.Name{"", "result"}, +// "Grace R. Emlin", // name +// "phone", // no phone given +// []Email{ +// Email{ "home", "gre@example.com" }, +// Email{ "work", "gre@work.com" } +// } +// } +// +// Note that the field r.Phone has not been modified and +// that the XML <address> element was discarded. +// +// Because Unmarshal uses the reflect package, it can only +// assign to upper case fields. Unmarshal uses a case-insensitive +// comparison to match XML element names to struct field names. +// +// Unmarshal maps an XML element to a struct using the following rules: +// +// * If the struct has a field named XMLName of type xml.Name, +// Unmarshal records the element name in that field. +// +// * If the XMLName field has an associated tag string of the form +// "tag" or "namespace-URL tag", the XML element must have +// the given tag (and, optionally, name space) or else Unmarshal +// returns an error. +// +// * If the XML element has an attribute whose name matches a +// struct field of type string with tag "attr", Unmarshal records +// the attribute value in that field. +// +// * If the XML element contains character data, that data is +// accumulated in the first struct field that has tag "chardata". +// The struct field may have type []byte or string. +// If there is no such field, the character data is discarded. +// +// * If the XML element contains a sub-element whose name +// matches a struct field whose tag is neither "attr" nor "chardata", +// Unmarshal maps the sub-element to that struct field. +// +// Unmarshal maps an XML element to a string or []byte by saving the +// concatenation of that elements character data in the string or []byte. +// +// Unmarshal maps an XML element to a slice by extending the length +// of the slice and mapping the element to the newly created value. +// +func Unmarshal(r io.Reader, val interface{}) os.Error { + v, ok := reflect.NewValue(val).(*reflect.PtrValue); + if !ok { + return os.NewError("non-pointer passed to Unmarshal"); + } + p := NewParser(r); + elem := v.Elem(); + for { + err := p.unmarshal(elem, nil); + if err != nil { + if err == os.EOF { + break; + } + return err; + } + } + return nil; +} + +// An UnmarshalError represents an error in the unmarshalling process. +type UnmarshalError string +func (e UnmarshalError) String() string { + return string(e); +} + +// Unmarshal a single XML element into val. +func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error { + // Find start element if we need it. + if start == nil { + for { + tok, err := p.Token(); + if err != nil { + return err; + } + if t, ok := tok.(StartElement); ok { + start = &t; + break; + } + } + } + + var ( + data []byte; + saveData reflect.Value; + sv *reflect.StructValue; + styp *reflect.StructType; + ) + switch v := val.(type) { + case *reflect.SliceValue: + typ := v.Type().(*reflect.SliceType); + if _, ok := typ.Elem().(*reflect.Uint8Type); ok { + // []byte + saveData = v; + break; + } + + // Slice of element values. + // Grow slice. + n := v.Len(); + if n >= v.Cap() { + ncap := 2*n; + if ncap < 4 { + ncap = 4; + } + new := reflect.MakeSlice(typ, n, ncap); + reflect.ArrayCopy(new, v); + v.Set(new); + } + v.SetLen(n+1); + + // Recur to read element into slice. + if err := p.unmarshal(v.Elem(n), start); err != nil { + v.SetLen(n); + return err; + } + return nil; + + case *reflect.StringValue: + saveData = v; + + case *reflect.StructValue: + sv = v; + typ := sv.Type().(*reflect.StructType); + styp = typ; + // Assign name. + if f, ok := typ.FieldByName("XMLName"); ok { + // Validate element name. + if f.Tag != "" { + tag := f.Tag; + ns := ""; + i := strings.LastIndex(tag, " "); + if i >= 0 { + ns, tag = tag[0:i], tag[i+1:len(tag)]; + } + if tag != start.Name.Local { + return UnmarshalError("expected element type <" + tag + "> but have <" + start.Name.Local + ">"); + } + if ns != "" && ns != start.Name.Space { + e := "expected element <" + tag + "> in name space " + ns + " but have "; + if start.Name.Space == "" { + e += "no name space"; + } else { + e += start.Name.Space; + } + return UnmarshalError(e); + } + } + + // Save + v := sv.FieldByIndex(f.Index); + if _, ok := v.Interface().(Name); !ok { + return UnmarshalError(sv.Type().String() + " field XMLName does not have type xml.Name"); + } + v.(*reflect.StructValue).Set(reflect.NewValue(start.Name).(*reflect.StructValue)); + } + + // Assign attributes. + // Also, do we need to save character data? + for i, n := 0, typ.NumField(); i < n; i++ { + f := typ.Field(i); + switch f.Tag { + case "attr": + strv, ok := sv.FieldByIndex(f.Index).(*reflect.StringValue); + if !ok { + return UnmarshalError(sv.Type().String() + " field " + f.Name + " has attr tag but is not type string"); + } + // Look for attribute. + val := ""; + k := strings.ToLower(f.Name); + for _, a := range start.Attr { + if strings.ToLower(a.Name.Local) == k { + val = a.Value; + break; + } + } + strv.Set(val); + + case "chardata": + if saveData == nil { + saveData = sv.FieldByIndex(f.Index); + } + } + } + } + + // Find end element. + // Process sub-elements along the way. +Loop: + for { + tok, err := p.Token(); + if err != nil { + return err; + } + switch t := tok.(type) { + case StartElement: + // Sub-element. + if sv != nil { + k := strings.ToLower(t.Name.Local); + for i, n := 0, styp.NumField(); i < n; i++ { + f := styp.Field(i); + if strings.ToLower(f.Name) == k { + if err := p.unmarshal(sv.FieldByIndex(f.Index), &t); err != nil { + return err; + } + continue Loop; + } + } + } + // Not saving sub-element but still have to skip over it. + if err := p.skip(); err != nil { + return err; + } + + case EndElement: + break Loop; + + case CharData: + if saveData != nil { + data = bytes.Add(data, t); + } + } + } + + // Save accumulated character data + if saveData != nil { + switch t := saveData.(type) { + case *reflect.StringValue: + t.Set(string(data)); + case *reflect.SliceValue: + t.Set(reflect.NewValue(data).(*reflect.SliceValue)); + } + } + + return nil; +} + +// Have already read a start element. +// Read tokens until we find the end element. +// Token is taking care of making sure the +// end element matches the start element we saw. +func (p *Parser) skip() os.Error { + for { + tok, err := p.Token(); + if err != nil { + return err; + } + switch t := tok.(type) { + case StartElement: + if err := p.skip(); err != nil { + return err; + } + case EndElement: + return nil; + } + } + panic("unreachable"); +} diff --git a/src/pkg/xml/read_test.go b/src/pkg/xml/read_test.go new file mode 100644 index 000000000..760d28b92 --- /dev/null +++ b/src/pkg/xml/read_test.go @@ -0,0 +1,210 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xml + +import ( + "reflect"; + "testing"; +) + +// Stripped down Atom feed data structures. + +func TestUnmarshalFeed(t *testing.T) { + var f Feed; + if err := Unmarshal(StringReader(rssFeedString), &f); err != nil { + t.Fatalf("Unmarshal: %s", err); + } + if !reflect.DeepEqual(f, rssFeed) { + t.Fatalf("have %#v\nwant %#v\n\n%#v", f); + } +} + +// hget http://codereview.appspot.com/rss/mine/rsc +const rssFeedString = ` +<?xml version="1.0" encoding="utf-8"?> +<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-us"><title>Code Review - My issues</title><link href="http://codereview.appspot.com/" rel="alternate"></link><link href="http://codereview.appspot.com/rss/mine/rsc" rel="self"></link><id>http://codereview.appspot.com/</id><updated>2009-10-04T01:35:58+00:00</updated><author><name>rietveld</name></author><entry><title>rietveld: an attempt at pubsubhubbub +</title><link href="http://codereview.appspot.com/126085" rel="alternate"></link><updated>2009-10-04T01:35:58+00:00</updated><author><name>email-address-removed</name></author><id>urn:md5:134d9179c41f806be79b3a5f7877d19a</id><summary type="html"> + An attempt at adding pubsubhubbub support to Rietveld. +http://code.google.com/p/pubsubhubbub +http://code.google.com/p/rietveld/issues/detail?id=155 + +The server side of the protocol is trivial: + 1. add a &lt;link rel=&quot;hub&quot; href=&quot;hub-server&quot;&gt; tag to all + feeds that will be pubsubhubbubbed. + 2. every time one of those feeds changes, tell the hub + with a simple POST request. + +I have tested this by adding debug prints to a local hub +server and checking that the server got the right publish +requests. + +I can&#39;t quite get the server to work, but I think the bug +is not in my code. I think that the server expects to be +able to grab the feed and see the feed&#39;s actual URL in +the link rel=&quot;self&quot;, but the default value for that drops +the :port from the URL, and I cannot for the life of me +figure out how to get the Atom generator deep inside +django not to do that, or even where it is doing that, +or even what code is running to generate the Atom feed. +(I thought I knew but I added some assert False statements +and it kept running!) + +Ignoring that particular problem, I would appreciate +feedback on the right way to get the two values at +the top of feeds.py marked NOTE(rsc). + + +</summary></entry><entry><title>rietveld: correct tab handling +</title><link href="http://codereview.appspot.com/124106" rel="alternate"></link><updated>2009-10-03T23:02:17+00:00</updated><author><name>email-address-removed</name></author><id>urn:md5:0a2a4f19bb815101f0ba2904aed7c35a</id><summary type="html"> + This fixes the buggy tab rendering that can be seen at +http://codereview.appspot.com/116075/diff/1/2 + +The fundamental problem was that the tab code was +not being told what column the text began in, so it +didn&#39;t know where to put the tab stops. Another problem +was that some of the code assumed that string byte +offsets were the same as column offsets, which is only +true if there are no tabs. + +In the process of fixing this, I cleaned up the arguments +to Fold and ExpandTabs and renamed them Break and +_ExpandTabs so that I could be sure that I found all the +call sites. I also wanted to verify that ExpandTabs was +not being used from outside intra_region_diff.py. + + +</summary></entry></feed>` + +type Feed struct { + XMLName Name "http://www.w3.org/2005/Atom feed"; + Title string; + Id string; + Link []Link; + Updated Time; + Author Person; + Entry []Entry; +} + +type Entry struct { + Title string; + Id string; + Link []Link; + Updated Time; + Author Person; + Summary Text; +} + +type Link struct { + Rel string "attr"; + Href string "attr"; +} + +type Person struct { + Name string; + URI string; + Email string; +} + +type Text struct { + Type string "attr"; + Body string "chardata"; +} + +type Time string + +var rssFeed = Feed{ + XMLName: Name{"http://www.w3.org/2005/Atom", "feed"}, + Title: "Code Review - My issues", + Link: []Link{ + Link{Rel: "alternate", Href: "http://codereview.appspot.com/"}, + Link{Rel: "self", Href: "http://codereview.appspot.com/rss/mine/rsc"}, + }, + Id: "http://codereview.appspot.com/", + Updated: "2009-10-04T01:35:58+00:00", + Author: Person{ + Name: "rietveld" + }, + Entry: []Entry{ + Entry{ + Title: "rietveld: an attempt at pubsubhubbub\n", + Link: []Link{ + Link{Rel: "alternate", Href: "http://codereview.appspot.com/126085"}, + }, + Updated: "2009-10-04T01:35:58+00:00", + Author: Person{ + Name: "email-address-removed" + }, + Id: "urn:md5:134d9179c41f806be79b3a5f7877d19a", + Summary: Text{ + Type: "html", + Body: ` + An attempt at adding pubsubhubbub support to Rietveld. +http://code.google.com/p/pubsubhubbub +http://code.google.com/p/rietveld/issues/detail?id=155 + +The server side of the protocol is trivial: + 1. add a <link rel="hub" href="hub-server"> tag to all + feeds that will be pubsubhubbubbed. + 2. every time one of those feeds changes, tell the hub + with a simple POST request. + +I have tested this by adding debug prints to a local hub +server and checking that the server got the right publish +requests. + +I can't quite get the server to work, but I think the bug +is not in my code. I think that the server expects to be +able to grab the feed and see the feed's actual URL in +the link rel="self", but the default value for that drops +the :port from the URL, and I cannot for the life of me +figure out how to get the Atom generator deep inside +django not to do that, or even where it is doing that, +or even what code is running to generate the Atom feed. +(I thought I knew but I added some assert False statements +and it kept running!) + +Ignoring that particular problem, I would appreciate +feedback on the right way to get the two values at +the top of feeds.py marked NOTE(rsc). + + +` + }, + }, + Entry{ + Title: "rietveld: correct tab handling\n", + Link: []Link{ + Link{Rel: "alternate", Href: "http://codereview.appspot.com/124106"}, + }, + Updated: "2009-10-03T23:02:17+00:00", + Author: Person{ + Name: "email-address-removed" + }, + Id: "urn:md5:0a2a4f19bb815101f0ba2904aed7c35a", + Summary: Text{ + Type: "html", + Body: ` + This fixes the buggy tab rendering that can be seen at +http://codereview.appspot.com/116075/diff/1/2 + +The fundamental problem was that the tab code was +not being told what column the text began in, so it +didn't know where to put the tab stops. Another problem +was that some of the code assumed that string byte +offsets were the same as column offsets, which is only +true if there are no tabs. + +In the process of fixing this, I cleaned up the arguments +to Fold and ExpandTabs and renamed them Break and +_ExpandTabs so that I could be sure that I found all the +call sites. I also wanted to verify that ExpandTabs was +not being used from outside intra_region_diff.py. + + +` + } + }, + } +} |
