summaryrefslogtreecommitdiff
path: root/src/parse/lex.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse/lex.cpp')
-rw-r--r--src/parse/lex.cpp205
1 files changed, 144 insertions, 61 deletions
diff --git a/src/parse/lex.cpp b/src/parse/lex.cpp
index 2338adcd..dc206ad8 100644
--- a/src/parse/lex.cpp
+++ b/src/parse/lex.cpp
@@ -200,12 +200,14 @@ signed int Lexer::getSymbol()
return best;
}
-bool issym(char ch)
+bool issym(int ch)
{
if( ::std::isalnum(ch) )
return true;
if( ch == '_' )
return true;
+ if( ch >= 128 || ch < 0 )
+ return true;
return false;
}
@@ -279,7 +281,7 @@ Token Lexer::getTokenInt()
uint64_t val = 0;
if( ch == '0' ) {
// Octal/hex handling
- ch = this->getc();
+ ch = this->getc_num();
if( ch == 'x' ) {
num_mode = HEX;
while( isxdigit(ch = this->getc_num()) )
@@ -323,7 +325,7 @@ Token Lexer::getTokenInt()
}
}
- if(ch == 'u' || ch == 'i') {
+ if(issym(ch)) {
// Unsigned
::std::string suffix;
while( issym(ch) )
@@ -344,6 +346,8 @@ Token Lexer::getTokenInt()
else if(suffix == "u32") num_type = CORETYPE_U32;
else if(suffix == "u64") num_type = CORETYPE_U64;
else if(suffix == "usize") num_type = CORETYPE_UINT;
+ else if(suffix == "f32") num_type = CORETYPE_F32;
+ else if(suffix == "f64") num_type = CORETYPE_F64;
else
throw ParseError::Generic(*this, FMT("Unknown integer suffix '" << suffix << "'"));
return Token(val, num_type);
@@ -375,7 +379,7 @@ Token Lexer::getTokenInt()
this->ungetc();
double fval = this->parseFloat(val);
- if( (ch = this->getc()) == 'f' )
+ if( issym(ch = this->getc()) )
{
::std::string suffix;
while( issym(ch) )
@@ -403,62 +407,60 @@ Token Lexer::getTokenInt()
return Token(val, num_type);
}
}
- // Symbols
- else if( issym(ch) )
+ // Byte/Raw strings
+ else if( ch == 'b' || ch == 'r' )
{
- ::std::string str;
- while( issym(ch) )
- {
- str.push_back(ch);
+ bool is_byte = false;
+ if(ch == 'b') {
+ is_byte = true;
ch = this->getc();
}
-
- if( ch == '!' )
- {
- return Token(TOK_MACRO, str);
+
+ if(ch == 'r') {
+ return this->getTokenInt_RawString(is_byte);
}
- else
- {
- if( str == "b" )
- {
- if( ch == '\'' ) {
- // Byte constant
- ch = this->getc();
- if( ch == '\\' ) {
- uint32_t val = this->parseEscape('\'');
- if( this->getc() != '\'' )
- throw ParseError::Generic(*this, "Multi-byte character literal");
- return Token((uint64_t)val, CORETYPE_U8);
- }
- else {
- if( this->getc() != '\'' )
- throw ParseError::Generic(*this, "Multi-byte character literal");
- return Token((uint64_t)ch, CORETYPE_U8);
- }
+ else {
+ assert(is_byte);
+
+ // Byte string
+ if( ch == '"' ) {
+ ::std::string str;
+ while( (ch = this->getc()) != '"' )
+ {
+ if( ch == '\\' )
+ ch = this->parseEscape('"');
+ str.push_back(ch);
}
- else if( ch == '"') {
- ::std::string str;
- while( (ch = this->getc()) != '"' )
- {
- if( ch == '\\' )
- ch = this->parseEscape('"');
- str.push_back(ch);
- }
- return Token(TOK_BYTESTRING, str);
+ return Token(TOK_BYTESTRING, str);
+ }
+ // Byte constant
+ else if( ch == '\'' ) {
+ // Byte constant
+ ch = this->getc();
+ if( ch == '\\' ) {
+ uint32_t val = this->parseEscape('\'');
+ if( this->getc() != '\'' )
+ throw ParseError::Generic(*this, "Multi-byte character literal");
+ return Token((uint64_t)val, CORETYPE_U8);
}
else {
+ if( this->getc() != '\'' )
+ throw ParseError::Generic(*this, "Multi-byte character literal");
+ return Token((uint64_t)ch, CORETYPE_U8);
}
}
-
- this->ungetc();
- for( unsigned int i = 0; i < LEN(RWORDS); i ++ )
- {
- if( str < RWORDS[i].chars ) break;
- if( str == RWORDS[i].chars ) return Token((enum eTokenType)RWORDS[i].type);
+ else {
+ assert(is_byte);
+ this->ungetc();
+ return this->getTokenInt_Identifier('b');
}
- return Token(TOK_IDENT, str);
}
}
+ // Symbols
+ else if( issym(ch) )
+ {
+ return this->getTokenInt_Identifier(ch);
+ }
else
{
throw ParseError::BadChar(ch);
@@ -497,17 +499,25 @@ Token Lexer::getTokenInt()
}
return Token(TOK_COMMENT, str); }
case SINGLEQUOTE: {
- char firstchar = this->getc();
- if( firstchar != '\\' ) {
+ auto firstchar = this->getc_codepoint();
+ if( firstchar.v == '\\' ) {
+ // Character constant with an escape code
+ uint32_t val = this->parseEscape('\'');
+ if(this->getc() != '\'') {
+ throw ParseError::Todo("Proper error for lex failures");
+ }
+ return Token((uint64_t)val, CORETYPE_CHAR);
+ }
+ else {
ch = this->getc();
if( ch == '\'' ) {
// Character constant
- return Token((uint64_t)firstchar, CORETYPE_CHAR);
+ return Token((uint64_t)firstchar.v, CORETYPE_CHAR);
}
- else {
+ else if( issym(firstchar.v) ) {
// Lifetime name
::std::string str;
- str.push_back(firstchar);
+ str += firstchar;
while( issym(ch) )
{
str.push_back(ch);
@@ -516,14 +526,9 @@ Token Lexer::getTokenInt()
this->ungetc();
return Token(TOK_LIFETIME, str);
}
- }
- else {
- // Character constant with an escape code
- uint32_t val = this->parseEscape('\'');
- if(this->getc() != '\'') {
- throw ParseError::Todo("Proper error for lex failures");
+ else {
+ throw ParseError::Todo("Lex Fail - Expected ' after character constant");
}
- return Token((uint64_t)val, CORETYPE_CHAR);
}
break; }
case DOUBLEQUOTE: {
@@ -548,6 +553,72 @@ Token Lexer::getTokenInt()
//assert(!"bugcheck");
}
+Token Lexer::getTokenInt_RawString(bool is_byte)
+{
+ // Raw string (possibly byte)
+ char ch = this->getc();
+ unsigned int hashes = 0;
+ while(ch == '#')
+ {
+ hashes ++;
+ ch = this->getc();
+ }
+ if( hashes == 0 && ch != '"' ) {
+ this->ungetc();
+ return this->getTokenInt_Identifier('r');
+ }
+ char terminator = ch;
+ ::std::string val;
+
+ for(;;)
+ {
+ ch = this->getc();
+ if( ch == terminator ) {
+ for(unsigned i = 0; i < hashes; i ++)
+ {
+ ch = this->getc();
+ if( ch != '#' ) {
+ val += terminator;
+ while( i -- )
+ val += '#';
+ break ;
+ }
+ }
+ if( hashes == 0 || ch == '#' ) {
+ return Token(is_byte ? TOK_BYTESTRING : TOK_STRING, val);
+ }
+ }
+ else {
+ val += ch;
+ }
+ }
+}
+Token Lexer::getTokenInt_Identifier(char leader)
+{
+ char ch = leader;
+ ::std::string str;
+ while( issym(ch) )
+ {
+ str.push_back(ch);
+ ch = this->getc();
+ }
+
+ if( ch == '!' )
+ {
+ return Token(TOK_MACRO, str);
+ }
+ else
+ {
+ this->ungetc();
+ for( unsigned int i = 0; i < LEN(RWORDS); i ++ )
+ {
+ if( str < RWORDS[i].chars ) break;
+ if( str == RWORDS[i].chars ) return Token((enum eTokenType)RWORDS[i].type);
+ }
+ return Token(TOK_IDENT, str);
+ }
+}
+
// Takes the VERY lazy way of reading the float into a string then passing to strtod
double Lexer::parseFloat(uint64_t whole)
{
@@ -615,6 +686,8 @@ uint32_t Lexer::parseEscape(char enclosing)
else
;
return val; }
+ case '0':
+ return '\0';
case '\\':
return '\\';
case '\'':
@@ -628,7 +701,7 @@ uint32_t Lexer::parseEscape(char enclosing)
case 't':
return '\t';
case '\n':
- m_line ++;
+ m_line ++;
while( isspace(ch) )
ch = this->getc();
return ch;
@@ -662,6 +735,16 @@ char Lexer::getc_num()
} while( ch == '_' );
return ch;
}
+Codepoint Lexer::getc_codepoint()
+{
+ uint8_t v1 = this->getc();
+ if( v1 < 128 ) {
+ return {v1};
+ }
+ else {
+ throw ParseError::Todo("getc_codepoint");
+ }
+}
void Lexer::ungetc()
{