// Extracted from path root/src/pkg/xml/xml.go,
// blob bd944337e98bf460372500e60c24820dfe77b336.

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// NOTE(rsc): Actually, this package is just a description
// of an implementation that hasn't been written yet.

// This package implements an XML parser but relies on
// clients to implement the parsing actions.

// An XML document is a single XML element.
//
// An XML element is either a start tag and an end tag,
// like <tag>...</tag>, or a combined start/end tag <tag/>.
// The latter is identical in semantics to <tag></tag>,
// and this parser does not distinguish them.
//
// The start (or combined start/end) tag can have
// name="value" attributes inside the angle brackets after
// the tag name, as in <img src="http://google.com/icon.png" alt="Google">.
// Names are drawn from a fixed set of alphabetic letters;
// Values are strings quoted with single or double quotes.
//
// An element made up of distinct start and end tags can
// contain free-form text and other elements inside it,
// as in <a href="http://www.google.com">Google</a>
// or <b><a href="http://www.google.com">Google</a></b>.
// The former is an <a> element with the text "Google" inside it.
// The latter is a <b> element with that <a> element inside it.
// In general, an element can contain a sequence of elements
// and text inside it.  In XML, white space inside an element is
// always counted as text--it is never discarded by the parser.
// XML parsers do translate \r and \r\n into \n in text.
//
// This parser reads an XML document and calls methods on a
// Builder interface object in response to the text.
// It calls the builder's StartElement, Text, and EndElement
// methods, mimicking the structure of the text.
// For example, the simple XML document:
//
//	<a href="http://www.google.com">
//		<img src="http://www.google.com/icon.png" alt="Google" />
//	<br/></a>
//
// results in the following sequence of builder calls:
//
//	StartElement("a", []Attr(Attr("href", "http://www.google.com")));
//	Text("\n\t");
//	StartElement("img", []Attr(Attr("src", "http://www.google.com/icon.png"),
//	                           Attr("alt", "Google")));
//	EndElement("img");
//	Text("\n");
//	StartElement("br", []Attr());
//	EndElement("br");
//	EndElement("a");
//
// There are, of course, a few more details, but the story so far
// should be enough for the majority of uses.  The details are:
//
// * XML documents typically begin with an XML declaration line like
// <?xml version="1.0" encoding="UTF-8"?>.
// This line is strongly recommended, but not strictly required.
// It introduces the XML version and text encoding for the rest
// of the file.  XML parsers are required to recognize UTF-8 and
// UTF-16.  This parser only recognizes UTF-8 (for now?).
//
// * After the XML declaration comes an optional doctype declaration like
// <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
//   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
// The parser should pass this information on to the client in some
// form, but does not.  It discards such lines.
//
// * The XML declaration line is an instance of a more general tag
// called a processing instruction, XML's #pragma.  The general form is
// <?target text?>, where target is a name (like "xml") specifying
// the intended recipient of the instruction, and text is the
// instruction itself.  This XML parser keeps the <?xml ...?> declaration
// to itself but passes along other processing instructions using
// the ProcInst method.  Processing instructions can appear anywhere
// in an XML document.  Most clients will simply ignore them.
//
// * An XML comment can appear anywhere in an XML document.
// Comments have the form <!--text-->.  The XML parser passes
// them along by calling the Comment method.  Again, most clients
// will simply ignore them.
//
// * Text inside an XML element must be escaped to avoid looking like
// a start/end tag.  Specifically, the characters < and & must be
// written as &lt; and &amp;.  An alternate quoting mechanism is to
// use the construct <![CDATA[...]]>.  The quoted text ... can contain
// < and & characters as is, but not the sequence ]]>.  No escapes are
// recognized inside it.  For some reason, the existence of the CDATA quoting mechanism
// infects the processing of ordinary unquoted text, which is not allowed
// to contain the literal sequence ]]>.  Instead, it would be written
// escaped, as in ]]&gt;.  The parser hides all these considerations
// from the library client -- it reports all text, regardless of original
// form and already unescaped, using the Text method.
//
// * A revision to XML 1.0 introduced the concept of name spaces
// for attribute and tag names.  A start tag with an attribute
// xmlns:prefix="URL" introduces `prefix' as a shorthand
// for the name space whose identifier is URL.  Inside the element
// with that start tag, an element name or attribute prefix:foo
// (as in <prefix:foo prefix:bar="baz">) is understood to refer
// to name `foo' in the name space denoted by `URL'.  Although
// this is a shorthand, there is no canonical expansion.  Thus:
//
//	<tag xmlns:foo="http://google.com/foo" xmlns:bar="http://google.com/bar">
//		<foo:red bar:attr="value">text1</foo:red>
//		<bar:red>text2</bar:red>
//	</tag>
//
// and
//
//	<tag xmlns:bar="http://google.com/foo" xmlns:foo="http://google.com/bar">
//		<bar:red foo:attr="value">text1</bar:red>
//		<foo:red>text2</foo:red>
//	</tag>
//
// are equivalent XML documents, and there is no canonical form.
//
// The special attribute xmlns="URL" sets the default name space
// for unprefixed tags (but not attribute names) to URL.
// Thus:
//
//	<tag xmlns="http://google.com/foo" xmlns:bar="http://google.com/bar">
//		<red bar:attr="value">text1</red>
//		<bar:red>text2</bar:red>
//	</tag>
//
// is another XML document equivalent to the first two (except that the
// default name space also applies to the outer <tag> element itself), and
//
//	<tag xmlns:bar="http://google.com/foo" xmlns="http://google.com/bar">
//		<bar:red attr="value">text1</bar:red>
//		<red>text2</red>
//	</tag>
//
// would be equivalent, except that `attr' in attr="value" has no
// associated name space, in contrast to the previous three where it
// is in the http://google.com/bar name space.
//
// The XML parser hides these details from the client by passing
// a Name struct (ns + name pair) for tag and attribute names.
// Tags and attributes without a name space have ns == "".
//
// References:
//	Annotated XML spec: http://www.xml.com/axml/testaxml.htm
//	XML name spaces: http://www.w3.org/TR/REC-xml-names/

package xml

import (
	"io";
	"os";
)

// Name is an XML tag or attribute name annotated with the URL of the
// name space it belongs to.
// NOTE(review): both fields are unexported, so Builder implementations
// outside this package cannot read them -- confirm that is intended.
type Name struct {
	// ns is the name space URL; "" means the name has no name space.
	ns string;
	// name is the name itself, without any name space prefix.
	name string;
}

// Attr is a single XML attribute (name=value) from a start tag or a
// combined start/end tag.
type Attr struct {
	// name is the attribute name together with its name space.
	name Name;
	// value is the attribute's value string.
	value string;
}

// Builder is the set of methods a client provides to the parser.
// The parser calls these methods as it reads and parses XML.
// If a builder method returns an error, the parse stops.
type Builder interface {
	// StartElement is called when an element starts.
	// attr is the list of attributes given in the tag:
	//	<name attr.name=attr.value attr1.name=attr1.value ...>
	//	<name attr.name=attr.value attr1.name=attr1.value ... />
	// xmlns and xmlns:foo attributes are handled internally
	// and not passed through to StartElement.
	StartElement(name Name, attr []Attr) os.Error;

	// EndElement is called when an element ends, either at its end
	// tag or at the close of a combined start/end tag:
	//	</name>
	//	<name ... />
	EndElement(name Name) os.Error;

	// Text is called for a non-empty character data string inside
	// an element.  It can be called multiple times between elements.
	// Both plain and CDATA-quoted text are reported this way:
	//	text
	//	<![CDATA[text]]>
	Text(text []byte) os.Error;

	// Comment is called when a comment is found in the XML:
	//	<!-- text -->
	Comment(text []byte) os.Error;

	// ProcInst is called for a processing instruction:
	//	<?target text?>
	ProcInst(target string, text []byte) os.Error;
}

// BaseBuilder is the default builder: every Builder method is
// implemented as a no-op that returns nil.
// Embed it in your own Builders to handle the calls
// you don't care about (e.g., Comment, ProcInst).
type BaseBuilder struct{}

// StartElement ignores the element start and reports no error.
func (bb *BaseBuilder) StartElement(name Name, attr []Attr) os.Error {
	return nil;
}

// EndElement ignores the element end and reports no error.
func (bb *BaseBuilder) EndElement(name Name) os.Error {
	return nil;
}

// Text ignores the character data and reports no error.
func (bb *BaseBuilder) Text(text []byte) os.Error {
	return nil;
}

// Comment ignores the comment and reports no error.
func (bb *BaseBuilder) Comment(text []byte) os.Error {
	return nil;
}

// ProcInst ignores the processing instruction and reports no error.
func (bb *BaseBuilder) ProcInst(target string, text []byte) os.Error {
	return nil;
}

// Parse is the XML parser entry point: it is meant to read XML from r
// and call the methods of b as it parses.
// It is not implemented yet.  Every call returns an "unimplemented"
// error without reading from r or calling b.
func Parse(r io.Read, b Builder) os.Error {
	err := os.NewError("unimplemented");
	return err;
}

// Channel interface to XML parser: create a new channel,
// go ParseToChan(r, c), and then read from the channel
// until TokenEnd.  This variant has the benefit that
// the process reading the channel can be a recursive
// function instead of a set of callbacks, but it has the
// drawback that the channel interface cannot signal an
// error to cause the parser to stop early.

// Kinds of XML parsing token, stored in Token.Kind.
// Numbering starts at 1, so the zero Kind of an unset Token
// matches none of them.
const (
	TokenStartElement = iota + 1;	// start of an element
	TokenEndElement;		// end of an element
	TokenText;			// character data
	TokenComment;			// comment
	TokenProcInst;			// processing instruction
	TokenEnd;			// end of parse
)

// Token is one parsing event in the channel interface to the parser.
// Kind says which event it is and therefore which other fields are set.
type Token struct {
	Kind int;		// TokenStartElement, TokenEndElement, etc.
	Name Name;		// name (TokenStartElement, TokenEndElement)
	Attr []Attr;		// attributes (TokenStartElement)
	Target string;		// target (TokenProcInst)
	Text []byte;		// text (TokenText, TokenComment, TokenProcInst)
	Err os.Error;		// error (TokenEnd)
}

// ChanBuilder is a Builder on top of a Token channel: each method
// packs its arguments into a Token and sends it on the channel.
// The methods always return nil, so a ChanBuilder never stops a parse.
type ChanBuilder chan Token;

// StartElement sends a TokenStartElement token carrying name and attr.
func (ch ChanBuilder) StartElement(name Name, attr []Attr) os.Error {
	var tok Token;
	tok.Attr = attr;
	tok.Name = name;
	tok.Kind = TokenStartElement;
	ch <- tok;
	return nil;
}

// EndElement sends a TokenEndElement token carrying name.
func (ch ChanBuilder) EndElement(name Name) os.Error {
	var tok Token;
	tok.Name = name;
	tok.Kind = TokenEndElement;
	ch <- tok;
	return nil;
}

// Text sends a TokenText token carrying the character data.
func (ch ChanBuilder) Text(text []byte) os.Error {
	var tok Token;
	tok.Text = text;
	tok.Kind = TokenText;
	ch <- tok;
	return nil;
}

// Comment sends a TokenComment token carrying the comment text.
func (ch ChanBuilder) Comment(text []byte) os.Error {
	var tok Token;
	tok.Text = text;
	tok.Kind = TokenComment;
	ch <- tok;
	return nil;
}

// ProcInst sends a TokenProcInst token carrying target and text.
func (ch ChanBuilder) ProcInst(target string, text []byte) os.Error {
	var tok Token;
	tok.Text = text;
	tok.Target = target;
	tok.Kind = TokenProcInst;
	ch <- tok;
	return nil;
}

// ParseToChan runs Parse on r with c wrapped as a ChanBuilder, then
// sends one last token of kind TokenEnd whose Err field holds whatever
// Parse returned.
func ParseToChan(r io.Read, c chan Token) {
	// Parse runs to completion before the end token is sent.
	err := Parse(r, ChanBuilder(c));
	var end Token;
	end.Kind = TokenEnd;
	end.Err = err;
	c <- end;
}


// scribbled notes based on XML spec.

// document is
//	xml decl?
// 	doctype decl?
//	element
//
// if xml decl is present, must be first.  after that,
// can have comments and procinsts scattered throughout,
// even after the element is done.
//
// xml decl is:
//
// <\?xml version='[a-zA-Z0-9_.:\-]+'( encoding='[A-Za-z][A-Za-z0-9._\-]*')?
//	( standalone='(yes|no)')? ?\?>
//
// spaces denote [ \r\t\n]+.
// written with '' above but can use "" too.
//
// doctype decl might as well be <!DOCTYPE[^>]*>
//
// procinst is <\?name( .*?)\?>.  name cannot be [Xx][Mm][Ll].
//
// comment is <!--(.*?)-->.
//
// tags are:
//	<name( attrib)* ?>	start tag
//	<name( attrib)* ?/>	combined start/end tag
//	</name ?>		end tag
// (the " ?" is an optional space, not a literal question mark.)
//
// plain text is [^<&]* except cannot contain "]]>".
// can also have escaped characters:
//	&#[0-9]+;
//	&#x[0-9A-Fa-f]+;
//	&name;
//
// can use <![CDATA[.*?]]> to avoid escaping < and & characters.
//
// must rewrite \r and \r\n into \n in text.
//
// names are Unicode.  valid chars listed below.
//
// attrib is name="value" or name='value'.
// can have spaces around =.
// attribute value text is [^<&"]* for appropriate ".
// can also use the &...; escape sequences above.
// cannot use <![CDATA[...]]>.
//
// xmlns attributes are name=value where name has form xmlns:name
// (i.e., xmlns:123 is not okay, because 123 is not a name; xmlns:a123 is ok).
// sub-name must not start with : either.
//
// name is first(second)*.
//
// first is
//
// 003A        04D0-04EB   0A59-0A5C   0C35-0C39   0F49-0F69   1E00-1E9B
// 0041-005A   04EE-04F5   0A5E        0C60-0C61   10A0-10C5   1EA0-1EF9
// 005F        04F8-04F9   0A72-0A74   0C85-0C8C   10D0-10F6   1F00-1F15
// 0061-007A   0531-0556   0A85-0A8B   0C8E-0C90   1100        1F18-1F1D
// 00C0-00D6   0559        0A8D        0C92-0CA8   1102-1103   1F20-1F45
// 00D8-00F6   0561-0586   0A8F-0A91   0CAA-0CB3   1105-1107   1F48-1F4D
// 00F8-00FF   05D0-05EA   0A93-0AA8   0CB5-0CB9   1109        1F50-1F57
// 0100-0131   05F0-05F2   0AAA-0AB0   0CDE        110B-110C   1F59
// 0134-013E   0621-063A   0AB2-0AB3   0CE0-0CE1   110E-1112   1F5B
// 0141-0148   0641-064A   0AB5-0AB9   0D05-0D0C   113C        1F5D
// 014A-017E   0671-06B7   0ABD        0D0E-0D10   113E        1F5F-1F7D
// 0180-01C3   06BA-06BE   0AE0        0D12-0D28   1140        1F80-1FB4
// 01CD-01F0   06C0-06CE   0B05-0B0C   0D2A-0D39   114C        1FB6-1FBC
// 01F4-01F5   06D0-06D3   0B0F-0B10   0D60-0D61   114E        1FBE
// 01FA-0217   06D5        0B13-0B28   0E01-0E2E   1150        1FC2-1FC4
// 0250-02A8   06E5-06E6   0B2A-0B30   0E30        1154-1155   1FC6-1FCC
// 02BB-02C1   0905-0939   0B32-0B33   0E32-0E33   1159        1FD0-1FD3
// 0386        093D        0B36-0B39   0E40-0E45   115F-1161   1FD6-1FDB
// 0388-038A   0958-0961   0B3D        0E81-0E82   1163        1FE0-1FEC
// 038C        0985-098C   0B5C-0B5D   0E84        1165        1FF2-1FF4
// 038E-03A1   098F-0990   0B5F-0B61   0E87-0E88   1167        1FF6-1FFC
// 03A3-03CE   0993-09A8   0B85-0B8A   0E8A        1169        2126
// 03D0-03D6   09AA-09B0   0B8E-0B90   0E8D        116D-116E   212A-212B
// 03DA        09B2        0B92-0B95   0E94-0E97   1172-1173   212E
// 03DC        09B6-09B9   0B99-0B9A   0E99-0E9F   1175        2180-2182
// 03DE        09DC-09DD   0B9C        0EA1-0EA3   119E        3007
// 03E0        09DF-09E1   0B9E-0B9F   0EA5        11A8        3021-3029
// 03E2-03F3   09F0-09F1   0BA3-0BA4   0EA7        11AB        3041-3094
// 0401-040C   0A05-0A0A   0BA8-0BAA   0EAA-0EAB   11AE-11AF   30A1-30FA
// 040E-044F   0A0F-0A10   0BAE-0BB5   0EAD-0EAE   11B7-11B8   3105-312C
// 0451-045C   0A13-0A28   0BB7-0BB9   0EB0        11BA        4E00-9FA5
// 045E-0481   0A2A-0A30   0C05-0C0C   0EB2-0EB3   11BC-11C2   AC00-D7A3
// 0490-04C4   0A32-0A33   0C0E-0C10   0EBD        11EB
// 04C7-04C8   0A35-0A36   0C12-0C28   0EC0-0EC4   11F0
// 04CB-04CC   0A38-0A39   0C2A-0C33   0F40-0F47   11F9
//
// second is first plus
//
// 002D        06DD-06DF   09E6-09EF   0B56-0B57   0D3E-0D43   0F3E
// 002E        06E0-06E4   0A02        0B66-0B6F   0D46-0D48   0F3F
// 0030-0039   06E7-06E8   0A3C        0B82-0B83   0D4A-0D4D   0F71-0F84
// 00B7        06EA-06ED   0A3E        0BBE-0BC2   0D57        0F86-0F8B
// 02D0        06F0-06F9   0A3F        0BC6-0BC8   0D66-0D6F   0F90-0F95
// 02D1        0901-0903   0A40-0A42   0BCA-0BCD   0E31        0F97
// 0300-0345   093C        0A47-0A48   0BD7        0E34-0E3A   0F99-0FAD
// 0360-0361   093E-094C   0A4B-0A4D   0BE7-0BEF   0E46        0FB1-0FB7
// 0387        094D        0A66-0A6F   0C01-0C03   0E47-0E4E   0FB9
// 0483-0486   0951-0954   0A70-0A71   0C3E-0C44   0E50-0E59   20D0-20DC
// 0591-05A1   0962-0963   0A81-0A83   0C46-0C48   0EB1        20E1
// 05A3-05B9   0966-096F   0ABC        0C4A-0C4D   0EB4-0EB9   3005
// 05BB-05BD   0981-0983   0ABE-0AC5   0C55-0C56   0EBB-0EBC   302A-302F
// 05BF        09BC        0AC7-0AC9   0C66-0C6F   0EC6        3031-3035
// 05C1-05C2   09BE        0ACB-0ACD   0C82-0C83   0EC8-0ECD   3099
// 05C4        09BF        0AE6-0AEF   0CBE-0CC4   0ED0-0ED9   309A
// 0640        09C0-09C4   0B01-0B03   0CC6-0CC8   0F18-0F19   309D-309E
// 064B-0652   09C7-09C8   0B3C        0CCA-0CCD   0F20-0F29   30FC-30FE
// 0660-0669   09CB-09CD   0B3E-0B43   0CD5-0CD6   0F35
// 0670        09D7        0B47-0B48   0CE6-0CEF   0F37
// 06D6-06DC   09E2-09E3   0B4B-0B4D   0D02-0D03   0F39