summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Russ Cox <rsc@golang.org> 2010-04-21 16:27:18 -0700
committer Russ Cox <rsc@golang.org> 2010-04-21 16:27:18 -0700
commit 7c298c744f5477f352fdc1d5acf01710f79e34f2 (patch)
tree b99804fdfeca6a295655df9293cb10a1503ac091 /src
parent e5468688f01afa3f4f9707051c756ae7bf7c1e93 (diff)
download golang-7c298c744f5477f352fdc1d5acf01710f79e34f2.tar.gz
utf16: add DecodeRune, EncodeRune
R=r CC=golang-dev http://codereview.appspot.com/970041
Diffstat (limited to 'src')
-rw-r--r-- src/pkg/utf16/utf16.go 35
-rw-r--r-- src/pkg/utf16/utf16_test.go 36
2 files changed, 67 insertions, 4 deletions
diff --git a/src/pkg/utf16/utf16.go b/src/pkg/utf16/utf16.go
index 303162452..372e38a71 100644
--- a/src/pkg/utf16/utf16.go
+++ b/src/pkg/utf16/utf16.go
@@ -18,6 +18,33 @@ const (
surrSelf = 0x10000
)
+// IsSurrogate reports whether the given Unicode code point can appear
+// in a surrogate pair, that is, whether it lies in the half-open
+// range [surr1, surr3).
+func IsSurrogate(rune int) bool {
+	return rune >= surr1 && rune < surr3
+}
+
+// DecodeRune combines the surrogate pair r1, r2 into the code point it
+// encodes. r1 must lie in [surr1, surr2) and r2 in [surr2, surr3);
+// for any other input DecodeRune returns the Unicode replacement
+// code point U+FFFD.
+func DecodeRune(r1, r2 int) int {
+	if r1 < surr1 || r1 >= surr2 || r2 < surr2 || r2 >= surr3 {
+		return unicode.ReplacementChar
+	}
+	// r1's offset from surr1 supplies the high bits and r2's offset
+	// from surr2 the low bits; the result is then rebased by 0x10000.
+	hi := (r1 - surr1) << 10
+	lo := r2 - surr2
+	return (hi | lo) + 0x10000
+}
+
+// EncodeRune splits the given rune into its UTF-16 surrogate pair r1, r2.
+// A rune below surrSelf (one that needs no surrogate encoding), a rune
+// above unicode.MaxRune, or a surrogate code point yields
+// U+FFFD, U+FFFD instead.
+func EncodeRune(rune int) (r1, r2 int) {
+	switch {
+	case rune < surrSelf, rune > unicode.MaxRune, IsSurrogate(rune):
+		return unicode.ReplacementChar, unicode.ReplacementChar
+	}
+	// Rebase onto surrSelf, then split into two 10-bit halves.
+	offset := rune - surrSelf
+	hi := (offset >> 10) & 0x3ff
+	lo := offset & 0x3ff
+	return surr1 + hi, surr2 + lo
+}
+
// Encode returns the UTF-16 encoding of the Unicode code point sequence s.
func Encode(s []int) []uint16 {
n := len(s)
@@ -38,9 +65,9 @@ func Encode(s []int) []uint16 {
a[n] = uint16(v)
n++
default:
- v -= surrSelf
- a[n] = uint16(surr1 + (v>>10)&0x3ff)
- a[n+1] = uint16(surr2 + v&0x3ff)
+ r1, r2 := EncodeRune(v)
+ a[n] = uint16(r1)
+ a[n+1] = uint16(r2)
n += 2
}
}
@@ -57,7 +84,7 @@ func Decode(s []uint16) []int {
case surr1 <= r && r < surr2 && i+1 < len(s) &&
surr2 <= s[i+1] && s[i+1] < surr3:
// valid surrogate sequence
- a[n] = (int(r)-surr1)<<10 | (int(s[i+1]) - surr2) + 0x10000
+ a[n] = DecodeRune(int(r), int(s[i+1]))
i++
n++
case surr1 <= r && r < surr3:
diff --git a/src/pkg/utf16/utf16_test.go b/src/pkg/utf16/utf16_test.go
index c6e269aad..c0848aa38 100644
--- a/src/pkg/utf16/utf16_test.go
+++ b/src/pkg/utf16/utf16_test.go
@@ -8,6 +8,7 @@ import (
"fmt"
"reflect"
"testing"
+ "unicode"
)
type encodeTest struct {
@@ -32,6 +33,41 @@ func TestEncode(t *testing.T) {
}
}
+// TestEncodeRune runs EncodeRune over every input rune in encodeTests
+// and compares the result against the expected UTF-16 output in tt.out.
+// Runes in [0x10000, unicode.MaxRune] must produce the next two
+// elements of tt.out and round-trip through DecodeRune; every other
+// rune must produce U+FFFD, U+FFFD and consumes one element of tt.out.
+func TestEncodeRune(t *testing.T) {
+	for i, tt := range encodeTests {
+		pos := 0 // index of the next unconsumed element of tt.out
+		for _, r := range tt.in {
+			hi, lo := EncodeRune(r)
+
+			if r >= 0x10000 && r <= unicode.MaxRune {
+				// Surrogate-pair case: two output elements are expected.
+				if pos+1 >= len(tt.out) {
+					t.Errorf("#%d: ran out of tt.out", i)
+					break
+				}
+				if hi != int(tt.out[pos]) || lo != int(tt.out[pos+1]) {
+					t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, hi, lo, tt.out[pos], tt.out[pos+1])
+				}
+				pos += 2
+				// The pair just produced must decode back to r.
+				if got := DecodeRune(hi, lo); got != r {
+					t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", hi, lo, got, r)
+				}
+				continue
+			}
+
+			// Replacement case: the rune is below 0x10000 or above MaxRune.
+			if pos >= len(tt.out) {
+				t.Errorf("#%d: ran out of tt.out", i)
+				break
+			}
+			if hi != unicode.ReplacementChar || lo != unicode.ReplacementChar {
+				t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, hi, lo)
+			}
+			pos++
+		}
+		if pos != len(tt.out) {
+			t.Errorf("#%d: EncodeRune didn't generate enough output", i)
+		}
+	}
+}
+
type decodeTest struct {
in []uint16
out []int