diff options
author | John Hodge <tpg@mutabah.net> | 2016-02-27 08:21:47 +0800 |
---|---|---|
committer | John Hodge <tpg@mutabah.net> | 2016-02-27 08:21:47 +0800 |
commit | 11dfb163601c8cc71f7b0ee9f9103b5b9cf72706 (patch) | |
tree | 6d437ba716dc405f8b7bde3f4092631dc71ec0b7 /src | |
parent | ba563a6e9a5d698990e1bddaeeecc512fd670bcc (diff) | |
download | mrust-11dfb163601c8cc71f7b0ee9f9103b5b9cf72706.tar.gz |
Parse/lex - Fix handling of escape codes
Diffstat (limited to 'src')
-rw-r--r-- | src/parse/lex.cpp | 40 | ||||
-rw-r--r-- | src/parse/lex.hpp | 10 |
2 files changed, 39 insertions, 11 deletions
diff --git a/src/parse/lex.cpp b/src/parse/lex.cpp
index b6463b21..77d12f7f 100644
--- a/src/parse/lex.cpp
+++ b/src/parse/lex.cpp
@@ -557,8 +557,17 @@ Token Lexer::getTokenInt()
             while( (ch = this->getc()) != '"' )
             {
                 if( ch == '\\' )
-                    ch = this->parseEscape('"');
-                str.push_back(ch);
+                {
+                    auto v = this->parseEscape('"');
+                    if( v != ~0u )
+                    {
+                        str += Codepoint(v);
+                    }
+                }
+                else
+                {
+                    str.push_back(ch);
+                }
             }
             return Token(TOK_STRING, str);
         }
@@ -1231,3 +1240,30 @@ SERIALISE_TYPE_A(TokenTree::, "TokenTree", {
     s.item(m_subtrees);
 })
 
+::std::string& operator+=(::std::string& s, const Codepoint& cp)
+{
+    if( cp.v < 0x80 ) {  // One byte: 0xxxxxxx (U+0000 to U+007F)
+        s += (char)cp.v;
+    }
+    else if( cp.v < (0x1F+1)<<(1*6) ) {
+        // Two bytes: 110xxxxx 10xxxxxx (U+0080 to U+07FF)
+        s += (char)(0xC0 | ((cp.v >> 6) & 0x1F));
+        s += (char)(0x80 | ((cp.v >> 0) & 0x3F));
+    }
+    else if( cp.v < (0x0F+1)<<(2*6) ) {  // Three bytes: U+0800 to U+FFFF (bound is exclusive: U+10000 needs four)
+        s += (char)(0xE0 | ((cp.v >> 12) & 0x0F));
+        s += (char)(0x80 | ((cp.v >> 6) & 0x3F));
+        s += (char)(0x80 | ((cp.v >> 0) & 0x3F));
+    }
+    else if( cp.v <= 0x10FFFF ) {  // Four bytes: U+10000 to U+10FFFF, the largest codepoint UTF-8 may encode
+        s += (char)(0xF0 | ((cp.v >> 18) & 0x07));
+        s += (char)(0x80 | ((cp.v >> 12) & 0x3F));
+        s += (char)(0x80 | ((cp.v >> 6) & 0x3F));
+        s += (char)(0x80 | ((cp.v >> 0) & 0x3F));
+    }
+    else {  // Above U+10FFFF: not representable, treated as an internal error
+        throw ::std::runtime_error("BUGCHECK: Bad unicode codepoint encountered");
+    }
+    return s;
+}
+
diff --git a/src/parse/lex.hpp b/src/parse/lex.hpp
index 3081e255..937be516 100644
--- a/src/parse/lex.hpp
+++ b/src/parse/lex.hpp
@@ -175,16 +175,8 @@ public:
 struct Codepoint {
     uint32_t    v;
     Codepoint(uint32_t v): v(v) { }
-    friend ::std::string& operator+=(::std::string& s, const Codepoint& cp) {
-        if( cp.v < 128 ) {
-            s += (char)cp.v;
-        }
-        else {
-            throw ::std::runtime_error("TODO: Encode UTF-8 codepoint");
-        }
-        return s;
-    }
 };
+extern ::std::string& operator+=(::std::string& s, const Codepoint& cp);
 
 class Lexer:
     public TokenStream