diff options
author | John Hodge <tpg@mutabah.net> | 2016-02-23 10:31:41 +1100 |
---|---|---|
committer | John Hodge <tpg@mutabah.net> | 2016-02-23 10:31:41 +1100 |
commit | 1592a53e831b041b8e3392a06e12179379394eda (patch) | |
tree | f221fa96837cd8e809928158ee1cc5fcf150930f | |
parent | ea3291cbe5192fa20f1a5d4dc45d776e48c21b7d (diff) | |
download | mrust-1592a53e831b041b8e3392a06e12179379394eda.tar.gz |
UTF-8 lex
-rw-r--r-- | src/parse/lex.cpp | 48 |
1 file changed, 47 insertions, 1 deletion
diff --git a/src/parse/lex.cpp b/src/parse/lex.cpp index dc206ad8..7ec28ef0 100644 --- a/src/parse/lex.cpp +++ b/src/parse/lex.cpp @@ -741,8 +741,54 @@ Codepoint Lexer::getc_codepoint() if( v1 < 128 ) { return {v1}; } + else if( (v1 & 0xC0) == 0x80 ) { + // Invalid (continuation) + return {0xFFFE}; + } + else if( (v1 & 0xE0) == 0xC0 ) { + // Two bytes + uint8_t e1 = this->getc(); + if( (e1 & 0xC0) != 0x80 ) return {0xFFFE}; + + uint32_t outval + = ((v1 & 0x1F) << 6) + | ((e1 & 0x3F) <<0) + ; + return {outval}; + } + else if( (v1 & 0xF0) == 0xE0 ) { + // Three bytes + uint8_t e1 = this->getc(); + if( (e1 & 0xC0) != 0x80 ) return {0xFFFE}; + uint8_t e2 = this->getc(); + if( (e2 & 0xC0) != 0x80 ) return {0xFFFE}; + + uint32_t outval + = ((v1 & 0x0F) << 12) + | ((e1 & 0x3F) << 6) + | ((e2 & 0x3F) << 0) + ; + return {outval}; + } + else if( (v1 & 0xF8) == 0xF0 ) { + // Four bytes + uint8_t e1 = this->getc(); + if( (e1 & 0xC0) != 0x80 ) return {0xFFFE}; + uint8_t e2 = this->getc(); + if( (e2 & 0xC0) != 0x80 ) return {0xFFFE}; + uint8_t e3 = this->getc(); + if( (e3 & 0xC0) != 0x80 ) return {0xFFFE}; + + uint32_t outval + = ((v1 & 0x07) << 18) + | ((e1 & 0x3F) << 12) + | ((e2 & 0x3F) << 6) + | ((e3 & 0x3F) << 0) + ; + return {outval}; + } else { - throw ParseError::Todo("getc_codepoint"); + throw ParseError::Generic("Invalid UTF-8 (too long)"); } } |