diff options
author | Rob Pike <r@golang.org> | 2009-05-05 17:05:39 -0700 |
---|---|---|
committer | Rob Pike <r@golang.org> | 2009-05-05 17:05:39 -0700 |
commit | b6708b2dad4af458e69fde7fc596ecdce03280d0 (patch) | |
tree | 11cfc1b980ecce9cb0eec06f49cc53ce05faf350 /src/lib/utf8 | |
parent | 658b10477a16151de7e4d51657993f95445f11ee (diff) | |
download | golang-b6708b2dad4af458e69fde7fc596ecdce03280d0.tar.gz |
directory-per-package step 1: move files from lib/X.go to lib/X/X.go
no substantive changes except:
- new Makefiles, all auto-generated
- go/src/lib/Makefile has been extensively edited
R=rsc
OCL=28310
CL=28310
Diffstat (limited to 'src/lib/utf8')
-rw-r--r-- | src/lib/utf8/Makefile | 68 | ||||
-rw-r--r-- | src/lib/utf8/utf8.go | 290 | ||||
-rw-r--r-- | src/lib/utf8/utf8_test.go | 179 |
3 files changed, 537 insertions, 0 deletions
diff --git a/src/lib/utf8/Makefile b/src/lib/utf8/Makefile new file mode 100644 index 000000000..2919ddb4e --- /dev/null +++ b/src/lib/utf8/Makefile @@ -0,0 +1,68 @@ +# Copyright 2009 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# DO NOT EDIT. Automatically generated by gobuild. +# gobuild -m >Makefile + +D= + +O_arm=5 +O_amd64=6 +O_386=8 +OS=568vq + +O=$(O_$(GOARCH)) +GC=$(O)g -I_obj +CC=$(O)c -FVw +AS=$(O)a +AR=6ar + +default: packages + +clean: + rm -rf *.[$(OS)] *.a [$(OS)].out _obj + +test: packages + gotest + +coverage: packages + gotest + 6cov -g `pwd` | grep -v '_test\.go:' + +%.$O: %.go + $(GC) $*.go + +%.$O: %.c + $(CC) $*.c + +%.$O: %.s + $(AS) $*.s + +O1=\ + utf8.$O\ + + +phases: a1 +_obj$D/utf8.a: phases + +a1: $(O1) + $(AR) grc _obj$D/utf8.a utf8.$O + rm -f $(O1) + + +newpkg: clean + mkdir -p _obj$D + $(AR) grc _obj$D/utf8.a + +$(O1): newpkg +$(O2): a1 + +nuke: clean + rm -f $(GOROOT)/pkg$D/utf8.a + +packages: _obj$D/utf8.a + +install: packages + test -d $(GOROOT)/pkg && mkdir -p $(GOROOT)/pkg$D + cp _obj$D/utf8.a $(GOROOT)/pkg$D/utf8.a diff --git a/src/lib/utf8/utf8.go b/src/lib/utf8/utf8.go new file mode 100644 index 000000000..5ce59894b --- /dev/null +++ b/src/lib/utf8/utf8.go @@ -0,0 +1,290 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions and constants to support text encoded in UTF-8. +// This package calls a Unicode character a rune for brevity. +package utf8 + +// Numbers fundamental to the encoding. +const ( + RuneError = 0xFFFD; // the "error" Rune or "replacement character". + RuneSelf = 0x80; // characters below Runeself are represented as themselves in a single byte. + RuneMax = 0x10FFFF; // maximum Unicode code point. + UTFMax = 4; // maximum number of bytes of a UTF-8 encoded Unicode character. +) + +const ( + _T1 = 0x00; // 0000 0000 + _Tx = 0x80; // 1000 0000 + _T2 = 0xC0; // 1100 0000 + _T3 = 0xE0; // 1110 0000 + _T4 = 0xF0; // 1111 0000 + _T5 = 0xF8; // 1111 1000 + + _Maskx = 0x3F; // 0011 1111 + _Mask2 = 0x1F; // 0001 1111 + _Mask3 = 0x0F; // 0000 1111 + _Mask4 = 0x07; // 0000 0111 + + _Rune1Max = 1<<7 - 1; + _Rune2Max = 1<<11 - 1; + _Rune3Max = 1<<16 - 1; + _Rune4Max = 1<<21 - 1; +) + +func decodeRuneInternal(p []byte) (rune, size int, short bool) { + n := len(p); + if n < 1 { + return RuneError, 0, true; + } + c0 := p[0]; + + // 1-byte, 7-bit sequence? + if c0 < _Tx { + return int(c0), 1, false + } + + // unexpected continuation byte? + if c0 < _T2 { + return RuneError, 1, false + } + + // need first continuation byte + if n < 2 { + return RuneError, 1, true + } + c1 := p[1]; + if c1 < _Tx || _T2 <= c1 { + return RuneError, 1, false + } + + // 2-byte, 11-bit sequence? + if c0 < _T3 { + rune = int(c0&_Mask2)<<6 | int(c1&_Maskx); + if rune <= _Rune1Max { + return RuneError, 1, false + } + return rune, 2, false + } + + // need second continuation byte + if n < 3 { + return RuneError, 1, true + } + c2 := p[2]; + if c2 < _Tx || _T2 <= c2 { + return RuneError, 1, false + } + + // 3-byte, 16-bit sequence? + if c0 < _T4 { + rune = int(c0&_Mask3)<<12 | int(c1&_Maskx)<<6 | int(c2&_Maskx); + if rune <= _Rune2Max { + return RuneError, 1, false + } + return rune, 3, false + } + + // need third continuation byte + if n < 4 { + return RuneError, 1, true + } + c3 := p[3]; + if c3 < _Tx || _T2 <= c3 { + return RuneError, 1, false + } + + // 4-byte, 21-bit sequence? + if c0 < _T5 { + rune = int(c0&_Mask4)<<18 | int(c1&_Maskx)<<12 | int(c2&_Maskx)<<6 | int(c3&_Maskx); + if rune <= _Rune3Max { + return RuneError, 1, false + } + return rune, 4, false + } + + // error + return RuneError, 1, false +} + +func decodeRuneInStringInternal(s string, i int, n int) (rune, size int, short bool) { + if n < 1 { + return RuneError, 0, true; + } + c0 := s[i]; + + // 1-byte, 7-bit sequence? + if c0 < _Tx { + return int(c0), 1, false + } + + // unexpected continuation byte? + if c0 < _T2 { + return RuneError, 1, false + } + + // need first continuation byte + if n < 2 { + return RuneError, 1, true + } + c1 := s[i+1]; + if c1 < _Tx || _T2 <= c1 { + return RuneError, 1, false + } + + // 2-byte, 11-bit sequence? + if c0 < _T3 { + rune = int(c0&_Mask2)<<6 | int(c1&_Maskx); + if rune <= _Rune1Max { + return RuneError, 1, false + } + return rune, 2, false + } + + // need second continuation byte + if n < 3 { + return RuneError, 1, true + } + c2 := s[i+2]; + if c2 < _Tx || _T2 <= c2 { + return RuneError, 1, false + } + + // 3-byte, 16-bit sequence? + if c0 < _T4 { + rune = int(c0&_Mask3)<<12 | int(c1&_Maskx)<<6 | int(c2&_Maskx); + if rune <= _Rune2Max { + return RuneError, 1, false + } + return rune, 3, false + } + + // need third continuation byte + if n < 4 { + return RuneError, 1, true + } + c3 := s[i+3]; + if c3 < _Tx || _T2 <= c3 { + return RuneError, 1, false + } + + // 4-byte, 21-bit sequence? + if c0 < _T5 { + rune = int(c0&_Mask4)<<18 | int(c1&_Maskx)<<12 | int(c2&_Maskx)<<6 | int(c3&_Maskx); + if rune <= _Rune3Max { + return RuneError, 1, false + } + return rune, 4, false + } + + // error + return RuneError, 1, false +} + +// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. +// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. +func FullRune(p []byte) bool { + rune, size, short := decodeRuneInternal(p); + return !short +} + +// FullRuneInString is like FullRune but its input is a string. +func FullRuneInString(s string, i int) bool { + rune, size, short := decodeRuneInStringInternal(s, i, len(s) - i); + return !short +} + +// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes. +func DecodeRune(p []byte) (rune, size int) { + var short bool; + rune, size, short = decodeRuneInternal(p); + return; +} + +// DecodeRuneInString is like DecodeRune but its input is a string. +func DecodeRuneInString(s string, i int) (rune, size int) { + var short bool; + rune, size, short = decodeRuneInStringInternal(s, i, len(s) - i); + return; +} + +// RuneLen returns the number of bytes required to encode the rune. +func RuneLen(rune int) int { + switch { + case rune <= _Rune1Max: + return 1; + case rune <= _Rune2Max: + return 2; + case rune <= _Rune3Max: + return 3; + case rune <= _Rune4Max: + return 4; + } + return -1; +} + +// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. +// It returns the number of bytes written. +func EncodeRune(rune int, p []byte) int { + if rune <= _Rune1Max { + p[0] = byte(rune); + return 1; + } + + if rune <= _Rune2Max { + p[0] = _T2 | byte(rune>>6); + p[1] = _Tx | byte(rune)&_Maskx; + return 2; + } + + if rune > RuneMax { + rune = RuneError + } + + if rune <= _Rune3Max { + p[0] = _T3 | byte(rune>>12); + p[1] = _Tx | byte(rune>>6)&_Maskx; + p[2] = _Tx | byte(rune)&_Maskx; + return 3; + } + + p[0] = _T4 | byte(rune>>18); + p[1] = _Tx | byte(rune>>12)&_Maskx; + p[2] = _Tx | byte(rune>>6)&_Maskx; + p[3] = _Tx | byte(rune)&_Maskx; + return 4; +} + +// RuneCount returns the number of runes in p. Erroneous and short +// encodings are treated as single runes of width 1 byte. +func RuneCount(p []byte) int { + i := 0; + var n int; + for n = 0; i < len(p); n++ { + if p[i] < RuneSelf { + i++; + } else { + rune, size := DecodeRune(p[i:len(p)]); + i += size; + } + } + return n; +} + +// RuneCountInString is like RuneCount but its input is a string. +func RuneCountInString(s string) int { + ei := len(s); + i := 0; + n := 0; + for n = 0; i < ei; n++ { + if s[i] < RuneSelf { + i++; + } else { + rune, size, short := decodeRuneInStringInternal(s, i, ei - i); + i += size; + } + } + return n; +} + diff --git a/src/lib/utf8/utf8_test.go b/src/lib/utf8/utf8_test.go new file mode 100644 index 000000000..3ba5ee2b8 --- /dev/null +++ b/src/lib/utf8/utf8_test.go @@ -0,0 +1,179 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package utf8 + +import ( + "fmt"; + "io"; + "testing"; + "utf8"; +) + +type Utf8Map struct { + rune int; + str string; +} + +var utf8map = []Utf8Map { + Utf8Map{ 0x0000, "\x00" }, + Utf8Map{ 0x0001, "\x01" }, + Utf8Map{ 0x007e, "\x7e" }, + Utf8Map{ 0x007f, "\x7f" }, + Utf8Map{ 0x0080, "\xc2\x80" }, + Utf8Map{ 0x0081, "\xc2\x81" }, + Utf8Map{ 0x00bf, "\xc2\xbf" }, + Utf8Map{ 0x00c0, "\xc3\x80" }, + Utf8Map{ 0x00c1, "\xc3\x81" }, + Utf8Map{ 0x00c8, "\xc3\x88" }, + Utf8Map{ 0x00d0, "\xc3\x90" }, + Utf8Map{ 0x00e0, "\xc3\xa0" }, + Utf8Map{ 0x00f0, "\xc3\xb0" }, + Utf8Map{ 0x00f8, "\xc3\xb8" }, + Utf8Map{ 0x00ff, "\xc3\xbf" }, + Utf8Map{ 0x0100, "\xc4\x80" }, + Utf8Map{ 0x07ff, "\xdf\xbf" }, + Utf8Map{ 0x0800, "\xe0\xa0\x80" }, + Utf8Map{ 0x0801, "\xe0\xa0\x81" }, + Utf8Map{ 0xfffe, "\xef\xbf\xbe" }, + Utf8Map{ 0xffff, "\xef\xbf\xbf" }, + Utf8Map{ 0x10000, "\xf0\x90\x80\x80" }, + Utf8Map{ 0x10001, "\xf0\x90\x80\x81" }, + Utf8Map{ 0x10fffe, "\xf4\x8f\xbf\xbe" }, + Utf8Map{ 0x10ffff, "\xf4\x8f\xbf\xbf" }, +} + +// io.StringBytes with one extra byte at end +func bytes(s string) []byte { + s += "\x00"; + b := io.StringBytes(s); + return b[0:len(s)-1]; +} + +func TestFullRune(t *testing.T) { + for i := 0; i < len(utf8map); i++ { + m := utf8map[i]; + b := bytes(m.str); + if !utf8.FullRune(b) { + t.Errorf("FullRune(%q) (rune %04x) = false, want true", b, m.rune); + } + s := "xx"+m.str; + if !utf8.FullRuneInString(s, 2) { + t.Errorf("FullRuneInString(%q, 2) (rune %04x) = false, want true", s, m.rune); + } + b1 := b[0:len(b)-1]; + if utf8.FullRune(b1) { + t.Errorf("FullRune(%q) = true, want false", b1); + } + s1 := "xxx"+string(b1); + if utf8.FullRuneInString(s1, 3) { + t.Errorf("FullRune(%q, 3) = true, want false", s1); + } + } +} + +func equalBytes(a, b []byte) bool { + if len(a) != len(b) { + return false; + } + for i := 0; i < len(a); i++ { + if a[i] != b[i] { + return false; + } + } + return true; +} + +func TestEncodeRune(t *testing.T) { + for i := 0; i < len(utf8map); i++ { + m := utf8map[i]; + b := bytes(m.str); + var buf [10]byte; + n := utf8.EncodeRune(m.rune, &buf); + b1 := buf[0:n]; + if !equalBytes(b, b1) { + t.Errorf("EncodeRune(0x%04x) = %q want %q", m.rune, b1, b); + } + } +} + +func TestDecodeRune(t *testing.T) { + for i := 0; i < len(utf8map); i++ { + m := utf8map[i]; + b := bytes(m.str); + rune, size := utf8.DecodeRune(b); + if rune != m.rune || size != len(b) { + t.Errorf("DecodeRune(%q) = 0x%04x, %d want 0x%04x, %d", b, rune, size, m.rune, len(b)); + } + s := "xx"+m.str; + rune, size = utf8.DecodeRuneInString(s, 2); + if rune != m.rune || size != len(b) { + t.Errorf("DecodeRune(%q, 2) = 0x%04x, %d want 0x%04x, %d", s, rune, size, m.rune, len(b)); + } + + // there's an extra byte that bytes left behind - make sure trailing byte works + rune, size = utf8.DecodeRune(b[0:cap(b)]); + if rune != m.rune || size != len(b) { + t.Errorf("DecodeRune(%q) = 0x%04x, %d want 0x%04x, %d", b, rune, size, m.rune, len(b)); + } + s = "x"+m.str+"\x00"; + rune, size = utf8.DecodeRuneInString(s, 1); + if rune != m.rune || size != len(b) { + t.Errorf("DecodeRuneInString(%q, 1) = 0x%04x, %d want 0x%04x, %d", s, rune, size, m.rune, len(b)); + } + + // make sure missing bytes fail + wantsize := 1; + if wantsize >= len(b) { + wantsize = 0; + } + rune, size = utf8.DecodeRune(b[0:len(b)-1]); + if rune != RuneError || size != wantsize { + t.Errorf("DecodeRune(%q) = 0x%04x, %d want 0x%04x, %d", b[0:len(b)-1], rune, size, RuneError, wantsize); + } + s = "xxx"+m.str[0:len(m.str)-1]; + rune, size = utf8.DecodeRuneInString(s, 3); + if rune != RuneError || size != wantsize { + t.Errorf("DecodeRuneInString(%q, 3) = 0x%04x, %d want 0x%04x, %d", s, rune, size, RuneError, wantsize); + } + + // make sure bad sequences fail + if len(b) == 1 { + b[0] = 0x80; + } else { + b[len(b)-1] = 0x7F; + } + rune, size = utf8.DecodeRune(b); + if rune != RuneError || size != 1 { + t.Errorf("DecodeRune(%q) = 0x%04x, %d want 0x%04x, %d", b, rune, size, RuneError, 1); + } + s = "xxxx"+string(b); + rune, size = utf8.DecodeRune(b); + if rune != RuneError || size != 1 { + t.Errorf("DecodeRuneInString(%q, 4) = 0x%04x, %d want 0x%04x, %d", s, rune, size, RuneError, 1); + } + } +} + +type RuneCountTest struct { + in string; + out int; +} +var runecounttests = []RuneCountTest { + RuneCountTest{ "abcd", 4 }, + RuneCountTest{ "☺☻☹", 3 }, + RuneCountTest{ "1,2,3,4", 7 }, + RuneCountTest{ "\xe2\x00", 2 }, +} +func TestRuneCount(t *testing.T) { + for i := 0; i < len(runecounttests); i++ { + tt := runecounttests[i]; + if out := utf8.RuneCountInString(tt.in); out != tt.out { + t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out); + } + if out := utf8.RuneCount(bytes(tt.in)); out != tt.out { + t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out); + } + } +} |