summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
authorSteve Newman <devnull@localhost>2009-06-02 12:48:18 -0700
committerSteve Newman <devnull@localhost>2009-06-02 12:48:18 -0700
commitcc1ec4adad8c2671b71dba4cd7eba47d0d62d84d (patch)
treed611e8cc380e26e374b6771a58202ee157cceb2b /src/lib
parent55ae14e3b0197115c261879b722b7710ebded49e (diff)
downloadgolang-cc1ec4adad8c2671b71dba4cd7eba47d0d62d84d.tar.gz
Fixes to URL functionality:
- Extend http.URLUnescape to convert '+' to space - Add http.URLEscape - Rename URL.Query to EncodedQuery (and stop decoding it, as decoding this field before separating key/value pairs loses important information) - Report a clean error on incomplete hex escapes - Update existing tests, add new ones APPROVED=rsc DELTA=293 (256 added, 3 deleted, 34 changed) OCL=29685 CL=29759
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/http/url.go117
-rw-r--r--src/lib/http/url_test.go200
2 files changed, 285 insertions, 32 deletions
diff --git a/src/lib/http/url.go b/src/lib/http/url.go
index db51d99aa..0325b04ee 100644
--- a/src/lib/http/url.go
+++ b/src/lib/http/url.go
@@ -41,37 +41,60 @@ func unhex(c byte) byte {
return 0
}
+// Return true if the specified character should be escaped when appearing in a
+// URL string.
+//
+// TODO: for now, this is a hack; it only flags a few common characters that have
+// special meaning in URLs. That will get the job done in the common cases.
+func shouldEscape(c byte) bool {
+ switch c {
+ case ' ', '?', '&', '=', '#', '+', '%':
+ return true;
+ }
+ return false;
+}
+
// URLUnescape unescapes a URL-encoded string,
-// converting %AB into the byte 0xAB.
+// converting %AB into the byte 0xAB and '+' into ' ' (space).
// It returns a BadURL error if any % is not followed
// by two hexadecimal digits.
func URLUnescape(s string) (string, os.Error) {
// Count %, check that they're well-formed.
n := 0;
+ anyPlusses := false;
for i := 0; i < len(s); {
- if s[i] == '%' {
+ switch s[i] {
+ case '%':
n++;
- if !ishex(s[i+1]) || !ishex(s[i+2]) {
+ if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
return "", BadURL{"invalid hexadecimal escape"}
}
- i += 3
- } else {
+ i += 3;
+ case '+':
+ anyPlusses = true;
+ i++;
+ default:
i++
}
}
- if n == 0 {
+ if n == 0 && !anyPlusses {
return s, nil
}
t := make([]byte, len(s)-2*n);
j := 0;
for i := 0; i < len(s); {
- if s[i] == '%' {
+ switch s[i] {
+ case '%':
t[j] = unhex(s[i+1]) << 4 | unhex(s[i+2]);
j++;
i += 3;
- } else {
+ case '+':
+ t[j] = ' ';
+ j++;
+ i++;
+ default:
t[j] = s[i];
j++;
i++;
@@ -80,9 +103,53 @@ func URLUnescape(s string) (string, os.Error) {
return string(t), nil;
}
+// URLEscape converts a string into URL-encoded form.
+func URLEscape(s string) string {
+ spaceCount, hexCount := 0, 0;
+ for i := 0; i < len(s); i++ {
+ c := s[i];
+ if (shouldEscape(c)) {
+ if (c == ' ') {
+ spaceCount++;
+ } else {
+ hexCount++;
+ }
+ }
+ }
+
+ if spaceCount == 0 && hexCount == 0 {
+ return s;
+ }
+
+ t := make([]byte, len(s)+2*hexCount);
+ j := 0;
+ for i := 0; i < len(s); i++ {
+ c := s[i];
+ if !shouldEscape(c) {
+ t[j] = s[i];
+ j++;
+ } else if (c == ' ') {
+ t[j] = '+';
+ j++;
+ } else {
+ t[j] = '%';
+ t[j+1] = "0123456789abcdef"[c>>4];
+ t[j+2] = "0123456789abcdef"[c&15];
+ j += 3;
+ }
+ }
+ return string(t);
+}
+
// A URL represents a parsed URL (technically, a URI reference).
// The general form represented is:
// scheme://[userinfo@]host/path[?query][#fragment]
+// The Raw, RawPath, and RawQuery fields are in "wire format" (special
+// characters must be hex-escaped if not meant to have special meaning).
+// All other fields are logical values; '+' or '%' represent themselves.
+//
+// Note, the reason for using wire format for the query is that it needs
+// to be split into key/value pairs before decoding.
type URL struct {
Raw string; // the original string
Scheme string; // scheme
@@ -91,7 +158,7 @@ type URL struct {
Userinfo string; // userinfo
Host string; // host
Path string; // /path
- Query string; // query
+ RawQuery string; // query
Fragment string; // fragment
}
@@ -156,10 +223,7 @@ func ParseURL(rawurl string) (url *URL, err os.Error) {
// RFC 2396: a relative URI (no scheme) has a ?query,
// but absolute URIs only have query if path begins with /
if url.Scheme == "" || len(path) > 0 && path[0] == '/' {
- path, url.Query = split(path, '?', true);
- if url.Query, err = URLUnescape(url.Query); err != nil {
- return nil, err
- }
+ path, url.RawQuery = split(path, '?', true);
}
// Maybe path is //authority/path
@@ -180,6 +244,21 @@ func ParseURL(rawurl string) (url *URL, err os.Error) {
return nil, err
}
+ // Remove escapes from the Authority and Userinfo fields, and verify
+ // that Scheme and Host contain no escapes (that would be illegal).
+ if url.Authority, err = URLUnescape(url.Authority); err != nil {
+ return nil, err
+ }
+ if url.Userinfo, err = URLUnescape(url.Userinfo); err != nil {
+ return nil, err
+ }
+ if (strings.Index(url.Scheme, "%") >= 0) {
+ return nil, BadURL{"hexadecimal escape in scheme"}
+ }
+ if (strings.Index(url.Host, "%") >= 0) {
+ return nil, BadURL{"hexadecimal escape in host"}
+ }
+
return url, nil
}
@@ -200,7 +279,7 @@ func ParseURLReference(rawurlref string) (url *URL, err os.Error) {
//
// There are redundant fields stored in the URL structure:
// the String method consults Scheme, Path, Host, Userinfo,
-// Query, and Fragment, but not RawPath or Authority.
+// RawQuery, and Fragment, but not Raw, RawPath or Authority.
func (url *URL) String() string {
result := "";
if url.Scheme != "" {
@@ -209,16 +288,16 @@ func (url *URL) String() string {
if url.Host != "" || url.Userinfo != "" {
result += "//";
if url.Userinfo != "" {
- result += url.Userinfo + "@";
+ result += URLEscape(url.Userinfo) + "@";
}
result += url.Host;
}
- result += url.Path;
- if url.Query != "" {
- result += "?" + url.Query;
+ result += URLEscape(url.Path);
+ if url.RawQuery != "" {
+ result += "?" + url.RawQuery;
}
if url.Fragment != "" {
- result += "#" + url.Fragment;
+ result += "#" + URLEscape(url.Fragment);
}
return result;
}
diff --git a/src/lib/http/url_test.go b/src/lib/http/url_test.go
index f5a7069ae..8d8fabad5 100644
--- a/src/lib/http/url_test.go
+++ b/src/lib/http/url_test.go
@@ -20,6 +20,7 @@ import (
type URLTest struct {
in string;
out *URL;
+ roundtrip string; // expected result of reserializing the URL; empty means same as "in".
}
var urltests = []URLTest {
@@ -31,7 +32,8 @@ var urltests = []URLTest {
"http", "//www.google.com",
"www.google.com", "", "www.google.com",
"", "", ""
- }
+ },
+ ""
},
// path
URLTest{
@@ -41,7 +43,19 @@ var urltests = []URLTest {
"http", "//www.google.com/",
"www.google.com", "", "www.google.com",
"/", "", ""
- }
+ },
+ ""
+ },
+ // path with hex escaping... note that space roundtrips to +
+ URLTest{
+ "http://www.google.com/file%20one%26two",
+ &URL{
+ "http://www.google.com/file%20one%26two",
+ "http", "//www.google.com/file%20one%26two",
+ "www.google.com", "", "www.google.com",
+ "/file one&two", "", ""
+ },
+ "http://www.google.com/file+one%26two"
},
// user
URLTest{
@@ -51,7 +65,19 @@ var urltests = []URLTest {
"ftp", "//webmaster@www.google.com/",
"webmaster@www.google.com", "webmaster", "www.google.com",
"/", "", ""
- }
+ },
+ ""
+ },
+ // escape sequence in username
+ URLTest{
+ "ftp://john%20doe@www.google.com/",
+ &URL{
+ "ftp://john%20doe@www.google.com/",
+ "ftp", "//john%20doe@www.google.com/",
+ "john doe@www.google.com", "john doe", "www.google.com",
+ "/", "", ""
+ },
+ "ftp://john+doe@www.google.com/"
},
// query
URLTest{
@@ -61,7 +87,19 @@ var urltests = []URLTest {
"http", "//www.google.com/?q=go+language",
"www.google.com", "", "www.google.com",
"/", "q=go+language", ""
- }
+ },
+ ""
+ },
+ // query with hex escaping: NOT parsed
+ URLTest{
+ "http://www.google.com/?q=go%20language",
+ &URL{
+ "http://www.google.com/?q=go%20language",
+ "http", "//www.google.com/?q=go%20language",
+ "www.google.com", "", "www.google.com",
+ "/", "q=go%20language", ""
+ },
+ ""
},
// path without /, so no query parsing
URLTest{
@@ -70,8 +108,9 @@ var urltests = []URLTest {
"http:www.google.com/?q=go+language",
"http", "www.google.com/?q=go+language",
"", "", "",
- "www.google.com/?q=go+language", "", ""
- }
+ "www.google.com/?q=go language", "", ""
+ },
+ "http:www.google.com/%3fq%3dgo+language"
},
// non-authority
URLTest{
@@ -81,7 +120,8 @@ var urltests = []URLTest {
"mailto", "/webmaster@golang.org",
"", "", "",
"/webmaster@golang.org", "", ""
- }
+ },
+ ""
},
// non-authority
URLTest{
@@ -91,7 +131,8 @@ var urltests = []URLTest {
"mailto", "webmaster@golang.org",
"", "", "",
"webmaster@golang.org", "", ""
- }
+ },
+ ""
},
}
@@ -103,7 +144,8 @@ var urlnofragtests = []URLTest {
"http", "//www.google.com/?q=go+language#foo",
"www.google.com", "", "www.google.com",
"/", "q=go+language#foo", ""
- }
+ },
+ ""
},
}
@@ -115,7 +157,18 @@ var urlfragtests = []URLTest {
"http", "//www.google.com/?q=go+language",
"www.google.com", "", "www.google.com",
"/", "q=go+language", "foo"
- }
+ },
+ ""
+ },
+ URLTest{
+ "http://www.google.com/?q=go+language#foo%26bar",
+ &URL{
+ "http://www.google.com/?q=go+language",
+ "http", "//www.google.com/?q=go+language",
+ "www.google.com", "", "www.google.com",
+ "/", "q=go+language", "foo&bar"
+ },
+ ""
},
}
@@ -123,7 +176,7 @@ var urlfragtests = []URLTest {
func ufmt(u *URL) string {
return fmt.Sprintf("%q, %q, %q, %q, %q, %q, %q, %q, %q",
u.Raw, u.Scheme, u.RawPath, u.Authority, u.Userinfo,
- u.Host, u.Path, u.Query, u.Fragment);
+ u.Host, u.Path, u.RawQuery, u.Fragment);
}
func DoTest(t *testing.T, parse func(string) (*URL, os.Error), name string, tests []URLTest) {
@@ -158,8 +211,12 @@ func DoTestString(t *testing.T, parse func(string) (*URL, os.Error), name string
continue;
}
s := u.String();
- if s != tt.in {
- t.Errorf("%s(%q).String() == %q", tt.in, s);
+ expected := tt.in;
+ if len(tt.roundtrip) > 0 {
+ expected = tt.roundtrip;
+ }
+ if s != expected {
+ t.Errorf("%s(%q).String() == %q (expected %q)", name, tt.in, s, expected);
}
}
}
@@ -172,3 +229,120 @@ func TestURLString(t *testing.T) {
DoTestString(t, ParseURLReference, "ParseURLReference", urlfragtests);
DoTestString(t, ParseURLReference, "ParseURLReference", urlnofragtests);
}
+
+type URLEscapeTest struct {
+ in string;
+ out string;
+ err os.Error;
+}
+
+var unescapeTests = []URLEscapeTest {
+ URLEscapeTest{
+ "",
+ "",
+ nil
+ },
+ URLEscapeTest{
+ "abc",
+ "abc",
+ nil
+ },
+ URLEscapeTest{
+ "1%41",
+ "1A",
+ nil
+ },
+ URLEscapeTest{
+ "1%41%42%43",
+ "1ABC",
+ nil
+ },
+ URLEscapeTest{
+ "%4a",
+ "J",
+ nil
+ },
+ URLEscapeTest{
+ "%6F",
+ "o",
+ nil
+ },
+ URLEscapeTest{
+ "%", // not enough characters after %
+ "",
+ BadURL{"invalid hexadecimal escape"}
+ },
+ URLEscapeTest{
+ "%a", // not enough characters after %
+ "",
+ BadURL{"invalid hexadecimal escape"}
+ },
+ URLEscapeTest{
+ "%1", // not enough characters after %
+ "",
+ BadURL{"invalid hexadecimal escape"}
+ },
+ URLEscapeTest{
+ "123%45%6", // not enough characters after %
+ "",
+ BadURL{"invalid hexadecimal escape"}
+ },
+ URLEscapeTest{
+ "%zz", // invalid hex digits
+ "",
+ BadURL{"invalid hexadecimal escape"}
+ },
+}
+
+func TestURLUnescape(t *testing.T) {
+ for i, tt := range unescapeTests {
+ actual, err := URLUnescape(tt.in);
+ if actual != tt.out || (err != nil) != (tt.err != nil) {
+ t.Errorf("URLUnescape(%q) = %q, %s; want %q, %s", tt.in, actual, err, tt.out, tt.err);
+ }
+ }
+}
+
+var escapeTests = []URLEscapeTest {
+ URLEscapeTest{
+ "",
+ "",
+ nil
+ },
+ URLEscapeTest{
+ "abc",
+ "abc",
+ nil
+ },
+ URLEscapeTest{
+ "one two",
+ "one+two",
+ nil
+ },
+ URLEscapeTest{
+ "10%",
+ "10%25",
+ nil
+ },
+ URLEscapeTest{
+ " ?&=#+%!",
+ "+%3f%26%3d%23%2b%25!",
+ nil
+ },
+}
+
+func TestURLEscape(t *testing.T) {
+ for i, tt := range escapeTests {
+ actual := URLEscape(tt.in);
+ if tt.out != actual {
+ t.Errorf("URLEscape(%q) = %q, want %q", tt.in, actual, tt.out);
+ }
+
+ // for bonus points, verify that escape:unescape is an identity.
+ roundtrip, err := URLUnescape(actual);
+ if roundtrip != tt.in || err != nil {
+ t.Errorf("URLUnescape(%q) = %q, %s; want %q, %s", actual, roundtrip, err, tt.in, "[no error]");
+ }
+ }
+}
+