summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author John Hodge <tpg@mutabah.net> 2016-02-23 10:31:41 +1100
committer John Hodge <tpg@mutabah.net> 2016-02-23 10:31:41 +1100
commit 1592a53e831b041b8e3392a06e12179379394eda (patch)
tree f221fa96837cd8e809928158ee1cc5fcf150930f /src
parent ea3291cbe5192fa20f1a5d4dc45d776e48c21b7d (diff)
download mrust-1592a53e831b041b8e3392a06e12179379394eda.tar.gz
UTF-8 lex
Diffstat (limited to 'src')
-rw-r--r-- src/parse/lex.cpp 48
1 files changed, 47 insertions, 1 deletions
diff --git a/src/parse/lex.cpp b/src/parse/lex.cpp
index dc206ad8..7ec28ef0 100644
--- a/src/parse/lex.cpp
+++ b/src/parse/lex.cpp
@@ -741,8 +741,54 @@ Codepoint Lexer::getc_codepoint()
if( v1 < 128 ) {
return {v1};
}
+ else if( (v1 & 0xC0) == 0x80 ) {
+ // Invalid (continuation)
+ return {0xFFFE};
+ }
+ else if( (v1 & 0xE0) == 0xC0 ) {
+ // Two bytes
+ uint8_t e1 = this->getc();
+ if( (e1 & 0xC0) != 0x80 ) return {0xFFFE};
+
+ uint32_t outval
+ = ((v1 & 0x1F) << 6)
+ | ((e1 & 0x3F) <<0)
+ ;
+ return {outval};
+ }
+ else if( (v1 & 0xF0) == 0xE0 ) {
+ // Three bytes
+ uint8_t e1 = this->getc();
+ if( (e1 & 0xC0) != 0x80 ) return {0xFFFE};
+ uint8_t e2 = this->getc();
+ if( (e2 & 0xC0) != 0x80 ) return {0xFFFE};
+
+ uint32_t outval
+ = ((v1 & 0x0F) << 12)
+ | ((e1 & 0x3F) << 6)
+ | ((e2 & 0x3F) << 0)
+ ;
+ return {outval};
+ }
+ else if( (v1 & 0xF8) == 0xF0 ) {
+ // Four bytes
+ uint8_t e1 = this->getc();
+ if( (e1 & 0xC0) != 0x80 ) return {0xFFFE};
+ uint8_t e2 = this->getc();
+ if( (e2 & 0xC0) != 0x80 ) return {0xFFFE};
+ uint8_t e3 = this->getc();
+ if( (e3 & 0xC0) != 0x80 ) return {0xFFFE};
+
+ uint32_t outval
+ = ((v1 & 0x07) << 18)
+ | ((e1 & 0x3F) << 12)
+ | ((e2 & 0x3F) << 6)
+ | ((e3 & 0x3F) << 0)
+ ;
+ return {outval};
+ }
else {
- throw ParseError::Todo("getc_codepoint");
+ throw ParseError::Generic("Invalid UTF-8 (too long)");
}
}