Parse - Rework lexer to be fully unicode aware

author: John Hodge <tpg@mutabah.net> 2016-08-18 10:43:40 +0800
committer: John Hodge <tpg@mutabah.net> 2016-08-18 10:43:40 +0800
commit: c48d39448d06c1ac97838f4cf7f2ed7526adb2fb (patch)
tree: 7119d10e59c5268dc17c52b320dc6698a1811cb9 /src/parse
parent: 5511349eb94867c2314569089128a1f6088cf818 (diff)
download: mrust-c48d39448d06c1ac97838f4cf7f2ed7526adb2fb.tar.gz
2 files changed, 145 insertions, 86 deletions
diff --git a/src/parse/lex.cpp b/src/parse/lex.cpp
index c3bfa5e5..62e25942 100644
--- a/src/parse/lex.cpp
+++ b/src/parse/lex.cpp
@@ -189,7 +189,7 @@ static const struct {
 
 signed int Lexer::getSymbol()
 {
-    char ch = this->getc();
+    Codepoint ch = this->getc();
     // 1. lsearch for character
     // 2. Consume as many characters as currently match
     // 3. IF: a smaller character or, EOS is hit - Return current best
@@ -200,12 +200,12 @@ signed int Lexer::getSymbol()
         const char* const chars = TOKENMAP[i].chars;
         const size_t len = TOKENMAP[i].len;
 
-        if( ofs >= len || chars[ofs] > ch ) {
+        if( ofs >= len || static_cast<uint32_t>(chars[ofs]) > ch.v ) {
             this->ungetc();
             return best;
         }
 
-        while( chars[ofs] && chars[ofs] == ch )
+        while( chars[ofs] && ch == chars[ofs] )
         {
             try {
                 ch = this->getc();
@@ -225,13 +225,13 @@ signed int Lexer::getSymbol()
     return best;
 }
 
-bool issym(int ch)
+bool issym(Codepoint ch)   // True if `ch` may be part of an identifier/suffix: ASCII alphanumeric, '_', or any codepoint >= 128
 {
-    if( ::std::isalnum(ch) )
+    if( ch.v < 128 && ::std::isalnum(static_cast<int>(ch.v)) )  // <cctype> calls are undefined outside unsigned char/EOF, so only ASCII is passed through
         return true;
     if( ch == '_' )
         return true;
-    if( ch >= 128 || ch < 0 )
+    if( ch.v >= 128 )  // every non-ASCII codepoint is accepted as a symbol character
         return true;
     return false;
 }
@@ -275,13 +275,13 @@ Token Lexer::getTokenInt()
     }
     try
     {
-        char ch = this->getc();
+        Codepoint ch = this->getc();
         
         if( ch == '#' && m_line == 1 && m_line_ofs == 1 ) {
-            switch(ch = this->getc())
+            switch( (ch = this->getc()).v )
             {
             case '!':
-                switch(ch = this->getc())
+                switch( (ch = this->getc()).v )
                 {
                 case '/':
                     // SHEBANG!
@@ -291,22 +291,22 @@ Token Lexer::getTokenInt()
                 case '[':
                     return Token(TOK_CATTR_OPEN);
                 default:
-                    throw ParseError::BadChar(*this, ch);
+                    throw ParseError::BadChar(*this, ch.v);
                 }
             case '[':
                 return Token(TOK_ATTR_OPEN);
             default:
                 this->ungetc();
                 //return Token(TOK_HASH);
-                throw ParseError::BadChar(*this, ch);
+                throw ParseError::BadChar(*this, ch.v);
             }
         }
 
         if( ch == '\n' )
             return Token(TOK_NEWLINE);
-        if( isspace(ch) )
+        if( ch.isspace() )
         {
-            while( isspace(ch = this->getc()) && ch != '\n' )
+            while( (ch = this->getc()).isspace() && ch != '\n' )
                 ;
             this->ungetc();
             return Token(TOK_WHITESPACE);
@@ -317,8 +317,8 @@ Token Lexer::getTokenInt()
         if( sym == 0 )
         {
             // No match at all, check for symbol
-            char ch = this->getc();
-            if( isdigit(ch) )
+            auto ch = this->getc();
+            if( ch.isdigit() )
             {
                 enum eCoreType  num_type = CORETYPE_ANY;
                 enum {
@@ -336,25 +336,25 @@ Token Lexer::getTokenInt()
                     ch = this->getc_num();
                     if( ch == 'x' ) {
                         num_mode = HEX;
-                        while( isxdigit(ch = this->getc_num()) )
+                        while( (ch = this->getc_num()).isxdigit() )
                         {
                             val *= 16;
-                            if(ch <= '9')
-                                val += ch - '0';
-                            else if( ch <= 'F' )
-                                val += ch - 'A' + 10;
-                            else if( ch <= 'f' )
-                                val += ch - 'a' + 10;
+                            if(ch.v <= '9')
+                                val += ch.v - '0';
+                            else if( ch.v <= 'F' )
+                                val += ch.v - 'A' + 10;
+                            else if( ch.v <= 'f' )
+                                val += ch.v - 'a' + 10;
                         }
                     }
                     else if( ch == 'b' ) {
                         num_mode = BIN;
-                        while( isdigit(ch = this->getc_num()) )
+                        while( (ch = this->getc_num()).isdigit() )
                         {
                             val *= 2;
-                            if(ch == '0')
+                            if(ch.v == '0')
                                 val += 0;
-                            else if( ch == '1' )
+                            else if( ch.v == '1' )
                                 val += 1;
                             else
                                 throw ParseError::Generic("Invalid digit in binary literal");
@@ -362,27 +362,27 @@ Token Lexer::getTokenInt()
                     }
                     else if( ch == 'o' ) {
                         num_mode = OCT;
-                        while( isdigit(ch = this->getc_num()) ) {
+                        while( (ch = this->getc_num()).isdigit() ) {
                             val *= 8;
-                            if('0' <= ch && ch <= '7')
-                                val += ch - '0';
+                            if('0' <= ch.v && ch.v <= '7')
+                                val += ch.v - '0';
                             else
                                 throw ParseError::Generic("Invalid digit in octal literal");
                         }
                     }
                     else {
                         num_mode = DEC;
-                        while( isdigit(ch) ) {
+                        while( ch.isdigit() ) {
                             val *= 10;
-                            val += ch - '0';
+                            val += ch.v - '0';
                             ch = this->getc_num();
                         }
                     }
                 }
                 else {
-                    while( isdigit(ch) ) {
+                    while( ch.isdigit() ) {
                         val *= 10;
-                        val += ch - '0';
+                        val += ch.v - '0';
                         ch = this->getc_num();
                     }
                 }
@@ -407,7 +407,7 @@ Token Lexer::getTokenInt()
                             return Token(val, CORETYPE_ANY);
                         }
                         // Single dot - Still a float. (TODO)
-                        else if( !isdigit(ch) )
+                        else if( !ch.isdigit() )
                         {
                             this->ungetc();
                             if( issym(ch) )
@@ -434,7 +434,7 @@ Token Lexer::getTokenInt()
                         ::std::string   suffix;
                         while( issym(ch) )
                         {
-                            suffix.push_back(ch);
+                            suffix += ch;
                             ch = this->getc();
                         }
                         this->ungetc();
@@ -457,7 +457,7 @@ Token Lexer::getTokenInt()
                     ::std::string   suffix;
                     while( issym(ch) )
                     {
-                        suffix.push_back(ch);
+                        suffix += ch;
                         ch = this->getc();
                     }
                     this->ungetc();
@@ -513,7 +513,7 @@ Token Lexer::getTokenInt()
                                 }
                             }
                             else {
-                                str.push_back(ch);
+                                str += ch;
                             }
                         }
                         return Token(TOK_BYTESTRING, str);
@@ -531,7 +531,7 @@ Token Lexer::getTokenInt()
                         else {
                             if( this->getc() != '\'' )
                                 throw ParseError::Generic(*this, "Multi-byte character literal");
-                            return Token((uint64_t)ch, CORETYPE_U8);
+                            return Token((uint64_t)ch.v, CORETYPE_U8);
                         }
                     }
                     else {
@@ -548,7 +548,7 @@ Token Lexer::getTokenInt()
             }
             else
             {
-                throw ParseError::BadChar(*this, ch);
+                throw ParseError::BadChar(*this, ch.v);
             }
         }
         else if( sym > 0 )
@@ -562,10 +562,10 @@ Token Lexer::getTokenInt()
             case LINECOMMENT: {
                 // Line comment
                 ::std::string   str;
-                char ch = this->getc();
+                auto ch = this->getc();
                 while(ch != '\n' && ch != '\r')
                 {
-                    str.push_back(ch);
+                    str += ch;
                     ch = this->getc();
                 }
                 this->ungetc();
@@ -578,12 +578,12 @@ Token Lexer::getTokenInt()
                     ch = this->getc();
                     
                     if( ch == '/' ) {
-                        str.push_back(ch);
+                        str += ch;
                         ch = this->getc();
                         if( ch == '*' ) {
                             level ++;
                         }
-                        str.push_back(ch);
+                        str += ch;
                     }
                     else {
                         if( ch == '*' ) {
@@ -597,17 +597,17 @@ Token Lexer::getTokenInt()
                             }
                             else {
                                 str.push_back('*');
-                                str.push_back(ch);
+                                str += ch;
                             }
                         }
                         else {
-                            str.push_back(ch);
+                            str += ch;
                         }
                     }
                 }
                 return Token(TOK_COMMENT, str); }
             case SINGLEQUOTE: {
-                auto firstchar = this->getc_codepoint();
+                auto firstchar = this->getc();
                 if( firstchar.v == '\\' ) {
                     // Character constant with an escape code
                     uint32_t val = this->parseEscape('\'');
@@ -628,7 +628,7 @@ Token Lexer::getTokenInt()
                         str += firstchar;
                         while( issym(ch) )
                         {
-                            str.push_back(ch);
+                            str += ch;
                             ch = this->getc();
                         }
                         this->ungetc();
@@ -653,7 +653,7 @@ Token Lexer::getTokenInt()
                     }
                     else
                     {
-                        str.push_back(ch);
+                        str += ch;
                     }
                 }
                 return Token(TOK_STRING, str);
@@ -673,7 +673,7 @@ Token Lexer::getTokenInt()
 Token Lexer::getTokenInt_RawString(bool is_byte)
 {
     // Raw string (possibly byte)
-    char ch = this->getc();
+    Codepoint ch = this->getc();
     unsigned int hashes = 0;
     while(ch == '#')
     {
@@ -687,7 +687,7 @@ Token Lexer::getTokenInt_RawString(bool is_byte)
         else
             return this->getTokenInt_Identifier('r');
     }
-    char terminator = ch;
+    auto terminator = ch;
     ::std::string   val;
     DEBUG("terminator = '" << terminator << "', hashes = " << hashes);
 
@@ -736,15 +736,15 @@ Token Lexer::getTokenInt_RawString(bool is_byte)
     }
     return Token(is_byte ? TOK_BYTESTRING : TOK_STRING, val);
 }
-Token Lexer::getTokenInt_Identifier(char leader, char leader2)
+Token Lexer::getTokenInt_Identifier(Codepoint leader, Codepoint leader2)
 {
     ::std::string   str;
     if( leader2 != '\0' )
         str += leader;
-    char ch = leader2 == '\0' ? leader : leader2;
+    auto ch = leader2 == '\0' ? leader : leader2;
     while( issym(ch) )
     {
-        str.push_back(ch);
+        str += ch;
         ch = this->getc();
     }
 
@@ -772,9 +772,9 @@ double Lexer::parseFloat(uint64_t whole)
     char buf[MAX_LEN+1];
     int ofs = snprintf(buf, MAX_LEN+1, "%llu.", (unsigned long long)whole);
 
-    char ch = this->getc_num();
-    #define PUTC(ch)    do { if( ofs < MAX_SIG ) { buf[ofs] = ch; ofs ++; } else { throw ParseError::Generic("Oversized float"); } } while(0)
-    while( isdigit(ch) )
+    auto ch = this->getc_num();
+    #define PUTC(ch)    do { if( ofs < MAX_SIG ) { assert(ch.v < 127); buf[ofs] = ch.v; ofs ++; } else { throw ParseError::Generic("Oversized float"); } } while(0)
+    while( ch.isdigit() )
     {
         PUTC(ch);
         ch = this->getc_num();
@@ -787,12 +787,12 @@ double Lexer::parseFloat(uint64_t whole)
             PUTC(ch);
             ch = this->getc_num();
         }
-        if( !isdigit(ch) )
+        if( !ch.isdigit() )
             throw ParseError::Generic( FMT("Non-numeric '"<<ch<<"' in float exponent") );
         do {
             PUTC(ch);
             ch = this->getc_num();
-        } while( isdigit(ch) );
+        } while( ch.isdigit() );
     }
     this->ungetc();
     buf[ofs] = 0;
@@ -804,8 +804,8 @@ double Lexer::parseFloat(uint64_t whole)
 
 uint32_t Lexer::parseEscape(char enclosing)
 {
-    char ch = this->getc();
-    switch(ch)
+    auto ch = this->getc();
+    switch(ch.v)
     {
     case 'x':
     case 'u': {
@@ -817,17 +817,17 @@ uint32_t Lexer::parseEscape(char enclosing)
             req_close_brace = true;
             ch = this->getc();
         }
-        if( !isxdigit(ch) )
-            throw ParseError::Generic(*this, FMT("Found invalid character '\\x" << ::std::hex << (int)ch << "' in \\u sequence" ) );
-        while( isxdigit(ch) )
+        if( !ch.isxdigit() )
+            throw ParseError::Generic(*this, FMT("Found invalid character '\\x" << ::std::hex << ch.v << "' in \\u sequence" ) );
+        while( ch.isxdigit() )
         {
-            char    tmp[2] = {ch, 0};
+            char    tmp[2] = {static_cast<char>(ch.v), 0};
             val *= 16;
             val += ::std::strtol(tmp, NULL, 16);
             ch = this->getc();
         }
         if( !req_close_brace )
-                this->ungetc();
+            this->ungetc();
         else if( ch != '}' )
             throw ParseError::Generic(*this, "Expected terminating } in \\u sequence");
         else {
@@ -850,19 +850,26 @@ uint32_t Lexer::parseEscape(char enclosing)
     case '\r':
     case '\n':
         m_line ++;
-        while( isspace(ch) )
+        while( ch.isspace() )
             ch = this->getc();
         this->ungetc();
         if( ch == enclosing )
             return ~0;
         else
-            return ch;
+            return ch.v;
     default:
         throw ParseError::Todo( FMT("Unknown escape sequence \\" << ch) );
     }
 }
 
-char Lexer::getc()
+char Lexer::getc_byte()    // Reads one raw byte from m_istream; throws Lexer::EndOfFile when the stream reports EOF
+{
+    char rv = m_istream.get();
+    if( m_istream.eof() )  // EOF is tested after the read attempt, before the byte is handed back
+        throw Lexer::EndOfFile();
+    return rv;
+}
+Codepoint Lexer::getc()
 {
     if( m_last_char_valid )
     {
@@ -870,26 +877,24 @@ char Lexer::getc()
     }
     else
     {
-        m_last_char = m_istream.get();
+        m_last_char = this->getc_cp();
         m_line_ofs += 1;
-        if( m_istream.eof() )
-            throw Lexer::EndOfFile();
     }
     //::std::cout << "getc(): '" << m_last_char << "'" << ::std::endl;
     return m_last_char;
 }
 
-char Lexer::getc_num()
+Codepoint Lexer::getc_num()    // Returns the next codepoint from getc(), skipping over any '_' characters
 {
-    char ch;
+    Codepoint ch;  // default-constructed (v == 0) and overwritten by the first getc() below
     do {
         ch = this->getc();
     } while( ch == '_' );
     return ch;
 }
-Codepoint Lexer::getc_codepoint()
+Codepoint Lexer::getc_cp()
 {
-    uint8_t v1 = this->getc();
+    uint8_t v1 = this->getc_byte();
     if( v1 < 128 ) {
         return {v1};
     }
@@ -899,7 +904,7 @@ Codepoint Lexer::getc_codepoint()
     }
     else if( (v1 & 0xE0) == 0xC0 ) {
         // Two bytes
-        uint8_t e1 = this->getc();
+        uint8_t e1 = this->getc_byte();
         if( (e1 & 0xC0) != 0x80 )  return {0xFFFE};
         
         uint32_t outval
@@ -910,9 +915,9 @@ Codepoint Lexer::getc_codepoint()
     }
     else if( (v1 & 0xF0) == 0xE0 ) {
         // Three bytes
-        uint8_t e1 = this->getc();
+        uint8_t e1 = this->getc_byte();
         if( (e1 & 0xC0) != 0x80 )  return {0xFFFE};
-        uint8_t e2 = this->getc();
+        uint8_t e2 = this->getc_byte();
         if( (e2 & 0xC0) != 0x80 )  return {0xFFFE};
         
         uint32_t outval
@@ -924,11 +929,11 @@ Codepoint Lexer::getc_codepoint()
     }
     else if( (v1 & 0xF8) == 0xF0 ) {
         // Four bytes
-        uint8_t e1 = this->getc();
+        uint8_t e1 = this->getc_byte();
         if( (e1 & 0xC0) != 0x80 )  return {0xFFFE};
-        uint8_t e2 = this->getc();
+        uint8_t e2 = this->getc_byte();
         if( (e2 & 0xC0) != 0x80 )  return {0xFFFE};
-        uint8_t e3 = this->getc();
+        uint8_t e3 = this->getc_byte();
         if( (e3 & 0xC0) != 0x80 )  return {0xFFFE};
         
         uint32_t outval
@@ -1158,6 +1163,25 @@ SERIALISE_TYPE_A(TokenTree::, "TokenTree", {
     s.item(m_subtrees);
 })
 
+bool Codepoint::isspace() const {  // True only for tab, carriage return, line feed and space; every other codepoint is non-whitespace
+    switch(this->v)
+    {
+    case '\t':
+    case '\r':
+    case '\n':
+    case ' ':
+        return true;
+    default:
+        return false;
+    }
+}
+bool Codepoint::isdigit() const {  // True if this codepoint is below 128 and std::isdigit accepts it
+    return this->v < 128 && std::isdigit(static_cast<int>(this->v));  // the < 128 guard keeps non-ASCII values out of std::isdigit
+}
+bool Codepoint::isxdigit() const {  // True if this codepoint is below 128 and std::isxdigit accepts it
+    return this->v < 128 && std::isxdigit(static_cast<int>(this->v));  // the < 128 guard keeps non-ASCII values out of std::isxdigit
+}
+
 ::std::string& operator+=(::std::string& s, const Codepoint& cp)
 {
     if( cp.v < 0x80 ) {
@@ -1184,4 +1208,28 @@ SERIALISE_TYPE_A(TokenTree::, "TokenTree", {
     }
     return s;
 }
-
+::std::ostream& operator<<(::std::ostream& os, const Codepoint& cp)  // Writes `cp` to `os` as a 1-4 byte UTF-8 sequence and returns `os`
+{
+    if( cp.v < 0x80 ) {  // below 0x80: single byte, value written unchanged
+        os << (char)cp.v;
+    }
+    else if( cp.v < (0x1F+1)<<(1*6) ) {  // below 0x800: two bytes (5 + 6 payload bits)
+        os << (char)(0xC0 | ((cp.v >> 6) & 0x1F));
+        os << (char)(0x80 | ((cp.v >> 0) & 0x3F));
+    }
+    else if( cp.v < (0x0F+1)<<(2*6) ) {  // below 0x10000: three bytes; strict '<' so 0x10000 itself is not truncated to 16 bits
+        os << (char)(0xE0 | ((cp.v >> 12) & 0x0F));
+        os << (char)(0x80 | ((cp.v >>  6) & 0x3F));
+        os << (char)(0x80 | ((cp.v >>  0) & 0x3F));
+    }
+    else if( cp.v < (0x07+1)<<(3*6) ) {  // below 0x200000: four bytes (3 + 18 payload bits); bound is 3*6 so this branch is reachable
+        os << (char)(0xF0 | ((cp.v >> 18) & 0x07));
+        os << (char)(0x80 | ((cp.v >> 12) & 0x3F));
+        os << (char)(0x80 | ((cp.v >>  6) & 0x3F));
+        os << (char)(0x80 | ((cp.v >>  0) & 0x3F));
+    }
+    else {  // 0x200000 and above does not fit in a four-byte sequence
+        throw ::std::runtime_error("BUGCHECK: Bad unicode codepoint encountered");
+    }
+    return os;
+}
diff --git a/src/parse/lex.hpp b/src/parse/lex.hpp
index a626a374..77151bea 100644
--- a/src/parse/lex.hpp
+++ b/src/parse/lex.hpp
@@ -100,9 +100,19 @@ public:
 
 struct Codepoint {
     uint32_t    v;
+    Codepoint(): v(0) { }  // default-constructs to codepoint 0
     Codepoint(uint32_t v): v(v) { }
+    bool isspace() const;   // character-class predicates, defined out of line
+    bool isdigit() const;
+    bool isxdigit() const;
+    bool operator==(char x) const { return v == static_cast<uint32_t>(x); }  // comparisons are const so a `const Codepoint&` can be compared too
+    bool operator!=(char x) const { return v != static_cast<uint32_t>(x); }
+    bool operator==(Codepoint x) const { return v == x.v; }
+    bool operator!=(Codepoint x) const { return v != x.v; }
 };
 extern ::std::string& operator+=(::std::string& s, const Codepoint& cp);
+extern ::std::ostream& operator<<(::std::ostream& s, const Codepoint& cp);
+typedef Codepoint   uchar;
 
 class Lexer:
     public TokenStream
@@ -113,7 +123,7 @@ class Lexer:
 
     ::std::ifstream m_istream;
     bool    m_last_char_valid;
-    char    m_last_char;
+    Codepoint   m_last_char;
     Token   m_next_token;   // Used when lexing generated two tokens
 public:
     Lexer(const ::std::string& filename);
@@ -126,14 +136,15 @@ private:
     
     signed int getSymbol();
     Token getTokenInt_RawString(bool is_byte);
-    Token getTokenInt_Identifier(char ch, char ch2='\0');
+    Token getTokenInt_Identifier(Codepoint ch, Codepoint ch2='\0');
     double parseFloat(uint64_t whole);
     uint32_t parseEscape(char enclosing);
 
-    char getc();
-    char getc_num();
-    Codepoint getc_codepoint();
     void ungetc();
+    Codepoint getc_num();
+    Codepoint getc();
+    Codepoint getc_cp();
+    char getc_byte();
 
     class EndOfFile {};
 };
author	John Hodge <tpg@mutabah.net>	2016-08-18 10:43:40 +0800
committer	John Hodge <tpg@mutabah.net>	2016-08-18 10:43:40 +0800
commit	c48d39448d06c1ac97838f4cf7f2ed7526adb2fb (patch)
tree	7119d10e59c5268dc17c52b320dc6698a1811cb9 /src/parse
parent	5511349eb94867c2314569089128a1f6088cf818 (diff)
download	mrust-c48d39448d06c1ac97838f4cf7f2ed7526adb2fb.tar.gz