diff options
| author | Russ Cox <rsc@golang.org> | 2010-04-21 16:27:18 -0700 | 
|---|---|---|
| committer | Russ Cox <rsc@golang.org> | 2010-04-21 16:27:18 -0700 | 
| commit | 7c298c744f5477f352fdc1d5acf01710f79e34f2 (patch) | |
| tree | b99804fdfeca6a295655df9293cb10a1503ac091 | |
| parent | e5468688f01afa3f4f9707051c756ae7bf7c1e93 (diff) | |
| download | golang-7c298c744f5477f352fdc1d5acf01710f79e34f2.tar.gz | |
utf16: add DecodeRune, EncodeRune
R=r
CC=golang-dev
http://codereview.appspot.com/970041
| -rw-r--r-- | src/pkg/utf16/utf16.go | 35 | ||||
| -rw-r--r-- | src/pkg/utf16/utf16_test.go | 36 | 
2 files changed, 67 insertions, 4 deletions
| diff --git a/src/pkg/utf16/utf16.go b/src/pkg/utf16/utf16.go index 303162452..372e38a71 100644 --- a/src/pkg/utf16/utf16.go +++ b/src/pkg/utf16/utf16.go @@ -18,6 +18,33 @@ const (  	surrSelf = 0x10000  ) +// IsSurrogate returns true if the specified Unicode code point +// can appear in a surrogate pair. +func IsSurrogate(rune int) bool { +	return surr1 <= rune && rune < surr3 +} + +// DecodeRune returns the UTF-16 decoding of a surrogate pair. +// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns +// the Unicode replacement code point U+FFFD. +func DecodeRune(r1, r2 int) int { +	if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { +		return (int(r1)-surr1)<<10 | (int(r2) - surr2) + 0x10000 +	} +	return unicode.ReplacementChar +} + +// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. +// If the rune is not a valid Unicode code point or does not need encoding, +// EncodeRune returns U+FFFD, U+FFFD. +func EncodeRune(rune int) (r1, r2 int) { +	if rune < surrSelf || rune > unicode.MaxRune || IsSurrogate(rune) { +		return unicode.ReplacementChar, unicode.ReplacementChar +	} +	rune -= surrSelf +	return surr1 + (rune>>10)&0x3ff, surr2 + rune&0x3ff +} +  // Encode returns the UTF-16 encoding of the Unicode code point sequence s.  func Encode(s []int) []uint16 {  	n := len(s) @@ -38,9 +65,9 @@ func Encode(s []int) []uint16 {  			a[n] = uint16(v)  			n++  		default: -			v -= surrSelf -			a[n] = uint16(surr1 + (v>>10)&0x3ff) -			a[n+1] = uint16(surr2 + v&0x3ff) +			r1, r2 := EncodeRune(v) +			a[n] = uint16(r1) +			a[n+1] = uint16(r2)  			n += 2  		}  	} @@ -57,7 +84,7 @@ func Decode(s []uint16) []int {  		case surr1 <= r && r < surr2 && i+1 < len(s) &&  			surr2 <= s[i+1] && s[i+1] < surr3:  			// valid surrogate sequence -			a[n] = (int(r)-surr1)<<10 | (int(s[i+1]) - surr2) + 0x10000 +			a[n] = DecodeRune(int(r), int(s[i+1]))  			i++  			n++  		case surr1 <= r && r < surr3: diff --git a/src/pkg/utf16/utf16_test.go b/src/pkg/utf16/utf16_test.go index c6e269aad..c0848aa38 100644 --- a/src/pkg/utf16/utf16_test.go +++ b/src/pkg/utf16/utf16_test.go @@ -8,6 +8,7 @@ import (  	"fmt"  	"reflect"  	"testing" +	"unicode"  )  type encodeTest struct { @@ -32,6 +33,41 @@ func TestEncode(t *testing.T) {  	}  } +func TestEncodeRune(t *testing.T) { +	for i, tt := range encodeTests { +		j := 0 +		for _, r := range tt.in { +			r1, r2 := EncodeRune(r) +			if r < 0x10000 || r > unicode.MaxRune { +				if j >= len(tt.out) { +					t.Errorf("#%d: ran out of tt.out", i) +					break +				} +				if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar { +					t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, r1, r2) +				} +				j++ +			} else { +				if j+1 >= len(tt.out) { +					t.Errorf("#%d: ran out of tt.out", i) +					break +				} +				if r1 != int(tt.out[j]) || r2 != int(tt.out[j+1]) { +					t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, r1, r2, tt.out[j], tt.out[j+1]) +				} +				j += 2 +				dec := DecodeRune(r1, r2) +				if dec != r { +					t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", r1, r2, dec, r) +				} +			} +		} +		if j != len(tt.out) { +			t.Errorf("#%d: EncodeRune didn't generate enough output", i) +		} +	} +} +  type decodeTest struct {  	in  []uint16  	out []int | 
