diff options
author | John Hodge <tpg@mutabah.net> | 2016-08-18 10:43:40 +0800 |
---|---|---|
committer | John Hodge <tpg@mutabah.net> | 2016-08-18 10:43:40 +0800 |
commit | c48d39448d06c1ac97838f4cf7f2ed7526adb2fb (patch) | |
tree | 7119d10e59c5268dc17c52b320dc6698a1811cb9 /src/parse | |
parent | 5511349eb94867c2314569089128a1f6088cf818 (diff) | |
download | mrust-c48d39448d06c1ac97838f4cf7f2ed7526adb2fb.tar.gz |
Parse - Rework lexer to be fully unicode aware
Diffstat (limited to 'src/parse')
-rw-r--r-- | src/parse/lex.cpp | 210 | ||||
-rw-r--r-- | src/parse/lex.hpp | 21 |
2 files changed, 145 insertions, 86 deletions
diff --git a/src/parse/lex.cpp b/src/parse/lex.cpp index c3bfa5e5..62e25942 100644 --- a/src/parse/lex.cpp +++ b/src/parse/lex.cpp @@ -189,7 +189,7 @@ static const struct { signed int Lexer::getSymbol() { - char ch = this->getc(); + Codepoint ch = this->getc(); // 1. lsearch for character // 2. Consume as many characters as currently match // 3. IF: a smaller character or, EOS is hit - Return current best @@ -200,12 +200,12 @@ signed int Lexer::getSymbol() const char* const chars = TOKENMAP[i].chars; const size_t len = TOKENMAP[i].len; - if( ofs >= len || chars[ofs] > ch ) { + if( ofs >= len || static_cast<uint32_t>(chars[ofs]) > ch.v ) { this->ungetc(); return best; } - while( chars[ofs] && chars[ofs] == ch ) + while( chars[ofs] && ch == chars[ofs] ) { try { ch = this->getc(); @@ -225,13 +225,13 @@ signed int Lexer::getSymbol() return best; } -bool issym(int ch) +bool issym(Codepoint ch) { - if( ::std::isalnum(ch) ) + if( ::std::isalnum(ch.v) ) return true; if( ch == '_' ) return true; - if( ch >= 128 || ch < 0 ) + if( ch.v >= 128 ) return true; return false; } @@ -275,13 +275,13 @@ Token Lexer::getTokenInt() } try { - char ch = this->getc(); + Codepoint ch = this->getc(); if( ch == '#' && m_line == 1 && m_line_ofs == 1 ) { - switch(ch = this->getc()) + switch( (ch = this->getc()).v ) { case '!': - switch(ch = this->getc()) + switch( (ch = this->getc()).v ) { case '/': // SHEBANG! @@ -291,22 +291,22 @@ Token Lexer::getTokenInt() case '[': return Token(TOK_CATTR_OPEN); default: - throw ParseError::BadChar(*this, ch); + throw ParseError::BadChar(*this, ch.v); } case '[': return Token(TOK_ATTR_OPEN); default: this->ungetc(); //return Token(TOK_HASH); - throw ParseError::BadChar(*this, ch); + throw ParseError::BadChar(*this, ch.v); } } if( ch == '\n' ) return Token(TOK_NEWLINE); - if( isspace(ch) ) + if( ch.isspace() ) { - while( isspace(ch = this->getc()) && ch != '\n' ) + while( (ch = this->getc()).isspace() && ch != '\n' ) ; this->ungetc(); return Token(TOK_WHITESPACE); @@ -317,8 +317,8 @@ Token Lexer::getTokenInt() if( sym == 0 ) { // No match at all, check for symbol - char ch = this->getc(); - if( isdigit(ch) ) + auto ch = this->getc(); + if( ch.isdigit() ) { enum eCoreType num_type = CORETYPE_ANY; enum { @@ -336,25 +336,25 @@ Token Lexer::getTokenInt() ch = this->getc_num(); if( ch == 'x' ) { num_mode = HEX; - while( isxdigit(ch = this->getc_num()) ) + while( (ch = this->getc_num()).isxdigit() ) { val *= 16; - if(ch <= '9') - val += ch - '0'; - else if( ch <= 'F' ) - val += ch - 'A' + 10; - else if( ch <= 'f' ) - val += ch - 'a' + 10; + if(ch.v <= '9') + val += ch.v - '0'; + else if( ch.v <= 'F' ) + val += ch.v - 'A' + 10; + else if( ch.v <= 'f' ) + val += ch.v - 'a' + 10; } } else if( ch == 'b' ) { num_mode = BIN; - while( isdigit(ch = this->getc_num()) ) + while( (ch = this->getc_num()).isdigit() ) { val *= 2; - if(ch == '0') + if(ch.v == '0') val += 0; - else if( ch == '1' ) + else if( ch.v == '1' ) val += 1; else throw ParseError::Generic("Invalid digit in binary literal"); @@ -362,27 +362,27 @@ Token Lexer::getTokenInt() } else if( ch == 'o' ) { num_mode = OCT; - while( isdigit(ch = this->getc_num()) ) { + while( (ch = this->getc_num()).isdigit() ) { val *= 8; - if('0' <= ch && ch <= '7') - val += ch - '0'; + if('0' <= ch.v && ch.v <= '7') + val += ch.v - '0'; else throw ParseError::Generic("Invalid digit in octal literal"); } } else { num_mode = DEC; - while( isdigit(ch) ) { + while( ch.isdigit() ) { val *= 10; - val += ch - '0'; + val += ch.v - '0'; ch = this->getc_num(); } } } else { - while( isdigit(ch) ) { + while( ch.isdigit() ) { val *= 10; - val += ch - '0'; + val += ch.v - '0'; ch = this->getc_num(); } } @@ -407,7 +407,7 @@ Token Lexer::getTokenInt() return Token(val, CORETYPE_ANY); } // Single dot - Still a float. (TODO) - else if( !isdigit(ch) ) + else if( !ch.isdigit() ) { this->ungetc(); if( issym(ch) ) @@ -434,7 +434,7 @@ Token Lexer::getTokenInt() ::std::string suffix; while( issym(ch) ) { - suffix.push_back(ch); + suffix += ch; ch = this->getc(); } this->ungetc(); @@ -457,7 +457,7 @@ Token Lexer::getTokenInt() ::std::string suffix; while( issym(ch) ) { - suffix.push_back(ch); + suffix += ch; ch = this->getc(); } this->ungetc(); @@ -513,7 +513,7 @@ Token Lexer::getTokenInt() } } else { - str.push_back(ch); + str += ch; } } return Token(TOK_BYTESTRING, str); @@ -531,7 +531,7 @@ Token Lexer::getTokenInt() else { if( this->getc() != '\'' ) throw ParseError::Generic(*this, "Multi-byte character literal"); - return Token((uint64_t)ch, CORETYPE_U8); + return Token((uint64_t)ch.v, CORETYPE_U8); } } else { @@ -548,7 +548,7 @@ Token Lexer::getTokenInt() } else { - throw ParseError::BadChar(*this, ch); + throw ParseError::BadChar(*this, ch.v); } } else if( sym > 0 ) @@ -562,10 +562,10 @@ Token Lexer::getTokenInt() case LINECOMMENT: { // Line comment ::std::string str; - char ch = this->getc(); + auto ch = this->getc(); while(ch != '\n' && ch != '\r') { - str.push_back(ch); + str += ch; ch = this->getc(); } this->ungetc(); @@ -578,12 +578,12 @@ Token Lexer::getTokenInt() ch = this->getc(); if( ch == '/' ) { - str.push_back(ch); + str += ch; ch = this->getc(); if( ch == '*' ) { level ++; } - str.push_back(ch); + str += ch; } else { if( ch == '*' ) { @@ -597,17 +597,17 @@ Token Lexer::getTokenInt() } else { str.push_back('*'); - str.push_back(ch); + str += ch; } } else { - str.push_back(ch); + str += ch; } } } return Token(TOK_COMMENT, str); } case SINGLEQUOTE: { - auto firstchar = this->getc_codepoint(); + auto firstchar = this->getc(); if( firstchar.v == '\\' ) { // Character constant with an escape code uint32_t val = this->parseEscape('\''); @@ -628,7 +628,7 @@ Token Lexer::getTokenInt() str += firstchar; while( issym(ch) ) { - str.push_back(ch); + str += ch; ch = this->getc(); } this->ungetc(); @@ -653,7 +653,7 @@ Token Lexer::getTokenInt() } else { - str.push_back(ch); + str += ch; } } return Token(TOK_STRING, str); @@ -673,7 +673,7 @@ Token Lexer::getTokenInt() Token Lexer::getTokenInt_RawString(bool is_byte) { // Raw string (possibly byte) - char ch = this->getc(); + Codepoint ch = this->getc(); unsigned int hashes = 0; while(ch == '#') { @@ -687,7 +687,7 @@ Token Lexer::getTokenInt_RawString(bool is_byte) else return this->getTokenInt_Identifier('r'); } - char terminator = ch; + auto terminator = ch; ::std::string val; DEBUG("terminator = '" << terminator << "', hashes = " << hashes); @@ -736,15 +736,15 @@ Token Lexer::getTokenInt_RawString(bool is_byte) } return Token(is_byte ? TOK_BYTESTRING : TOK_STRING, val); } -Token Lexer::getTokenInt_Identifier(char leader, char leader2) +Token Lexer::getTokenInt_Identifier(Codepoint leader, Codepoint leader2) { ::std::string str; if( leader2 != '\0' ) str += leader; - char ch = leader2 == '\0' ? leader : leader2; + auto ch = leader2 == '\0' ? leader : leader2; while( issym(ch) ) { - str.push_back(ch); + str += ch; ch = this->getc(); } @@ -772,9 +772,9 @@ double Lexer::parseFloat(uint64_t whole) char buf[MAX_LEN+1]; int ofs = snprintf(buf, MAX_LEN+1, "%llu.", (unsigned long long)whole); - char ch = this->getc_num(); - #define PUTC(ch) do { if( ofs < MAX_SIG ) { buf[ofs] = ch; ofs ++; } else { throw ParseError::Generic("Oversized float"); } } while(0) - while( isdigit(ch) ) + auto ch = this->getc_num(); + #define PUTC(ch) do { if( ofs < MAX_SIG ) { assert(ch.v < 127); buf[ofs] = ch.v; ofs ++; } else { throw ParseError::Generic("Oversized float"); } } while(0) + while( ch.isdigit() ) { PUTC(ch); ch = this->getc_num(); @@ -787,12 +787,12 @@ double Lexer::parseFloat(uint64_t whole) PUTC(ch); ch = this->getc_num(); } - if( !isdigit(ch) ) + if( !ch.isdigit() ) throw ParseError::Generic( FMT("Non-numeric '"<<ch<<"' in float exponent") ); do { PUTC(ch); ch = this->getc_num(); - } while( isdigit(ch) ); + } while( ch.isdigit() ); } this->ungetc(); buf[ofs] = 0; @@ -804,8 +804,8 @@ double Lexer::parseFloat(uint64_t whole) uint32_t Lexer::parseEscape(char enclosing) { - char ch = this->getc(); - switch(ch) + auto ch = this->getc(); + switch(ch.v) { case 'x': case 'u': { @@ -817,17 +817,17 @@ uint32_t Lexer::parseEscape(char enclosing) req_close_brace = true; ch = this->getc(); } - if( !isxdigit(ch) ) - throw ParseError::Generic(*this, FMT("Found invalid character '\\x" << ::std::hex << (int)ch << "' in \\u sequence" ) ); - while( isxdigit(ch) ) + if( !ch.isxdigit() ) + throw ParseError::Generic(*this, FMT("Found invalid character '\\x" << ::std::hex << ch.v << "' in \\u sequence" ) ); + while( ch.isxdigit() ) { - char tmp[2] = {ch, 0}; + char tmp[2] = {static_cast<char>(ch.v), 0}; val *= 16; val += ::std::strtol(tmp, NULL, 16); ch = this->getc(); } if( !req_close_brace ) - this->ungetc(); + this->ungetc(); else if( ch != '}' ) throw ParseError::Generic(*this, "Expected terminating } in \\u sequence"); else { @@ -850,19 +850,26 @@ uint32_t Lexer::parseEscape(char enclosing) case '\r': case '\n': m_line ++; - while( isspace(ch) ) + while( ch.isspace() ) ch = this->getc(); this->ungetc(); if( ch == enclosing ) return ~0; else - return ch; + return ch.v; default: throw ParseError::Todo( FMT("Unknown escape sequence \\" << ch) ); } } -char Lexer::getc() +char Lexer::getc_byte() +{ + char rv = m_istream.get(); + if( m_istream.eof() ) + throw Lexer::EndOfFile(); + return rv; +} +Codepoint Lexer::getc() { if( m_last_char_valid ) { @@ -870,26 +877,24 @@ char Lexer::getc() } else { - m_last_char = m_istream.get(); + m_last_char = this->getc_cp(); m_line_ofs += 1; - if( m_istream.eof() ) - throw Lexer::EndOfFile(); } //::std::cout << "getc(): '" << m_last_char << "'" << ::std::endl; return m_last_char; } -char Lexer::getc_num() +Codepoint Lexer::getc_num() { - char ch; + Codepoint ch; do { ch = this->getc(); } while( ch == '_' ); return ch; } -Codepoint Lexer::getc_codepoint() +Codepoint Lexer::getc_cp() { - uint8_t v1 = this->getc(); + uint8_t v1 = this->getc_byte(); if( v1 < 128 ) { return {v1}; } @@ -899,7 +904,7 @@ Codepoint Lexer::getc_codepoint() } else if( (v1 & 0xE0) == 0xC0 ) { // Two bytes - uint8_t e1 = this->getc(); + uint8_t e1 = this->getc_byte(); if( (e1 & 0xC0) != 0x80 ) return {0xFFFE}; uint32_t outval @@ -910,9 +915,9 @@ Codepoint Lexer::getc_codepoint() } else if( (v1 & 0xF0) == 0xE0 ) { // Three bytes - uint8_t e1 = this->getc(); + uint8_t e1 = this->getc_byte(); if( (e1 & 0xC0) != 0x80 ) return {0xFFFE}; - uint8_t e2 = this->getc(); + uint8_t e2 = this->getc_byte(); if( (e2 & 0xC0) != 0x80 ) return {0xFFFE}; uint32_t outval @@ -924,11 +929,11 @@ Codepoint Lexer::getc_codepoint() } else if( (v1 & 0xF8) == 0xF0 ) { // Four bytes - uint8_t e1 = this->getc(); + uint8_t e1 = this->getc_byte(); if( (e1 & 0xC0) != 0x80 ) return {0xFFFE}; - uint8_t e2 = this->getc(); + uint8_t e2 = this->getc_byte(); if( (e2 & 0xC0) != 0x80 ) return {0xFFFE}; - uint8_t e3 = this->getc(); + uint8_t e3 = this->getc_byte(); if( (e3 & 0xC0) != 0x80 ) return {0xFFFE}; uint32_t outval @@ -1158,6 +1163,25 @@ SERIALISE_TYPE_A(TokenTree::, "TokenTree", { s.item(m_subtrees); }) +bool Codepoint::isspace() const { + switch(this->v) + { + case '\t': + case '\r': + case '\n': + case ' ': + return true; + default: + return false; + } +} +bool Codepoint::isdigit() const { + return this->v < 128 && std::isdigit(static_cast<int>(this->v)); +} +bool Codepoint::isxdigit() const { + return this->v < 128 && std::isxdigit(static_cast<int>(this->v)); +} + ::std::string& operator+=(::std::string& s, const Codepoint& cp) { if( cp.v < 0x80 ) { @@ -1184,4 +1208,28 @@ SERIALISE_TYPE_A(TokenTree::, "TokenTree", { } return s; } - +::std::ostream& operator<<(::std::ostream& os, const Codepoint& cp) +{ + if( cp.v < 0x80 ) { + os << (char)cp.v; + } + else if( cp.v < (0x1F+1)<<(1*6) ) { + os << (char)(0xC0 | ((cp.v >> 6) & 0x1F)); + os << (char)(0x80 | ((cp.v >> 0) & 0x3F)); + } + else if( cp.v <= (0x0F+1)<<(2*6) ) { + os << (char)(0xE0 | ((cp.v >> 12) & 0x0F)); + os << (char)(0x80 | ((cp.v >> 6) & 0x3F)); + os << (char)(0x80 | ((cp.v >> 0) & 0x3F)); + } + else if( cp.v <= (0x07+1)<<(2*6) ) { + os << (char)(0xF0 | ((cp.v >> 18) & 0x07)); + os << (char)(0x80 | ((cp.v >> 12) & 0x3F)); + os << (char)(0x80 | ((cp.v >> 6) & 0x3F)); + os << (char)(0x80 | ((cp.v >> 0) & 0x3F)); + } + else { + throw ::std::runtime_error("BUGCHECK: Bad unicode codepoint encountered"); + } + return os; +} diff --git a/src/parse/lex.hpp b/src/parse/lex.hpp index a626a374..77151bea 100644 --- a/src/parse/lex.hpp +++ b/src/parse/lex.hpp @@ -100,9 +100,19 @@ public: struct Codepoint { uint32_t v; + Codepoint(): v(0) { } Codepoint(uint32_t v): v(v) { } + bool isspace() const; + bool isdigit() const; + bool isxdigit() const; + bool operator==(char x) { return v == static_cast<uint32_t>(x); } + bool operator!=(char x) { return v != static_cast<uint32_t>(x); } + bool operator==(Codepoint x) { return v == x.v; } + bool operator!=(Codepoint x) { return v != x.v; } }; extern ::std::string& operator+=(::std::string& s, const Codepoint& cp); +extern ::std::ostream& operator<<(::std::ostream& s, const Codepoint& cp); +typedef Codepoint uchar; class Lexer: public TokenStream @@ -113,7 +123,7 @@ class Lexer: ::std::ifstream m_istream; bool m_last_char_valid; - char m_last_char; + Codepoint m_last_char; Token m_next_token; // Used when lexing generated two tokens public: Lexer(const ::std::string& filename); @@ -126,14 +136,15 @@ private: signed int getSymbol(); Token getTokenInt_RawString(bool is_byte); - Token getTokenInt_Identifier(char ch, char ch2='\0'); + Token getTokenInt_Identifier(Codepoint ch, Codepoint ch2='\0'); double parseFloat(uint64_t whole); uint32_t parseEscape(char enclosing); - char getc(); - char getc_num(); - Codepoint getc_codepoint(); void ungetc(); + Codepoint getc_num(); + Codepoint getc(); + Codepoint getc_cp(); + char getc_byte(); class EndOfFile {}; }; |