blob: 320b23a85e7241474ce54177fc5a499be1420eb6 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
// UTF8Reader
// - by John Hodge (thePowersGang)
//
//
// Reads a stream of UTF-8 encoded codepoints from a "Reader"
use std::io::IoResult;
/// Unicode replacement character (U+FFFD), substituted for input that cannot be decoded
static BADCHAR: char = '\u{FFFD}';
/// UTF8 reader structure
///
/// Owns a byte source of type `T` and decodes it one codepoint at a time
/// through `getc`.
pub struct UTF8Reader<T: Reader>
{
// Underlying byte source; `getc` pulls from it with `read_byte`
stream: T,
}
/// Turn a raw codepoint value into a `char`.
///
/// Any value that `char::from_u32` refuses to convert is replaced by `BADCHAR`.
fn tochar(codepoint: u32) -> char
{
    ::std::char::from_u32(codepoint).unwrap_or(BADCHAR)
}
impl<T:Reader> UTF8Reader<T>
{
    /// Create a decoder that takes ownership of `reader` and reads its bytes.
    pub fn new(reader: T) -> UTF8Reader<T>
    {
        UTF8Reader {
            stream: reader,
        }
    }

    /// Read a single codepoint from the stream.
    ///
    /// On an encoding error, it returns U+FFFD (the unicode replacement character).
    /// The following are treated as encoding errors:
    ///
    /// - a continuation byte (`10xxxxxx`) where a lead byte is expected,
    /// - a lead byte announcing a sequence longer than four bytes,
    /// - a non-continuation byte inside a multi-byte sequence (that byte is
    ///   consumed, decoding resumes with the byte after it),
    /// - an overlong encoding (a value stored in more bytes than it needs),
    /// - a decoded value that is not a valid `char` (checked by `tochar`).
    ///
    /// Errors from the underlying reader, including one raised in the middle of
    /// a multi-byte sequence, are returned unchanged.
    pub fn getc(&mut self) -> IoResult<char>
    {
        let lead = try!(self.stream.read_byte()) as u32;

        // Classify the lead byte into:
        //   (payload bits it carries,
        //    number of continuation bytes that follow,
        //    smallest codepoint that genuinely needs a sequence of this length)
        let (payload, extra, min) =
            if lead & 0x80 == 0x00 {
                // Single-byte
                return Ok( tochar(lead) )
            }
            else if lead & 0xE0 == 0xC0 {
                // Two-byte sequence
                (lead & 0x1F, 1u32, 0x80u32)
            }
            else if lead & 0xF0 == 0xE0 {
                // Three-byte sequence
                (lead & 0x0F, 2u32, 0x800u32)
            }
            else if lead & 0xF8 == 0xF0 {
                // Four-byte sequence
                (lead & 0x07, 3u32, 0x10000u32)
            }
            else {
                // Stray continuation byte, or a lead for more than four bytes
                return Ok( BADCHAR )
            };

        // Fold in six payload bits per continuation byte. The mask is applied
        // BEFORE the shift: `<<` binds tighter than `&`, so `a & m << n` would
        // shift the mask instead of the masked value.
        let mut codepoint = payload;
        let mut remaining = extra;
        while remaining > 0 {
            let next = try!(self.stream.read_byte()) as u32;
            if next & 0xC0 != 0x80 {
                return Ok( BADCHAR );
            }
            codepoint = (codepoint << 6) | (next & 0x3F);
            remaining -= 1;
        }

        // Reject overlong forms such as C0 80 for U+0000
        if codepoint < min {
            return Ok( BADCHAR );
        }
        Ok( tochar(codepoint) )
    }
}
/// Implementation of the same interface as 'Chars' provides.
///
/// Every item is the outcome of one `getc` call. Iteration ends (`None`) when
/// the error from `getc` matches the end-of-file pattern below; any other error
/// is yielded as `Some(Err(..))`.
impl<T:Reader> Iterator for UTF8Reader<T>
{
    type Item = IoResult<char>;

    fn next(&mut self) -> Option<IoResult<char>>
    {
        // Decode one character; a success goes straight back to the caller
        let err = match self.getc()
            {
            Ok(c) => return Some( Ok(c) ),
            Err(e) => e,
            };

        // Only failures reach this point: end of stream terminates iteration,
        // everything else is reported to the caller.
        // NOTE(review): `kind` is presumably an `IoErrorKind` value - confirm that
        // this path actually names its end-of-file variant on the target toolchain.
        match err.kind
        {
            ::std::io::IoError::EndOfFile => None,
            _ => Some( Err( err ) ),
        }
    }
}
// Placeholder test: the body is empty, so it passes without checking anything.
#[test]
fn it_works() {
}
// vim: ft=rust
|