-
Notifications
You must be signed in to change notification settings - Fork 0
Lexer! #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Lexer! #14
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,26 @@ | ||
| #include <iostream> | ||
|
|
||
| #include "lib/ui/ui_functions.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| int main(int32_t argc, char** argv) { | ||
| std::vector<std::string> args = std::vector<std::string>(argv, argv + argc); | ||
| return StartConsoleUI(args, std::cout); | ||
| // Demo driver: runs the Lexer over an embedded sample program and prints one token per line | ||
| // to std::cout. Returns 0 on success, or 1 after reporting any std::exception thrown while | ||
| // tokenizing to std::cerr. | ||
| int main() { | ||
| const std::string sample = R"ovum( | ||
| // demo | ||
| fun Main(args: StringArray): Int { | ||
| val count: Int = args.Length() | ||
| sys::Print("Args count: " + count.ToString()) | ||
| return 0 | ||
| } | ||
| )ovum"; | ||
|
|
||
| // The Lexer only holds a view of `sample`; `sample` outlives `lx` here. Second argument is keep_comments. | ||
| Lexer lx(sample, false); | ||
| try { | ||
| auto toks = lx.tokenize(); | ||
| for (auto &t : toks) { | ||
| std::cout << t->to_string() << "\n"; | ||
| } | ||
| } catch (const std::exception &e) { | ||
| std::cerr << "Lexer error: " << e.what() << "\n"; | ||
| return 1; | ||
| } | ||
| return 0; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,131 @@ | ||
| #include "Lexer.hpp" | ||
| #include "utils.hpp" | ||
|
|
||
| Lexer::Lexer(std::string_view src, bool keep_comments) : | ||
| src_(src), keep_comments_(keep_comments), start_(0), current_(0), line_(1), col_(1), token_col_(1) { | ||
| register_defaults(); | ||
| } | ||
|
|
||
| // True once the cursor has moved past the last character of the source. | ||
| bool Lexer::is_at_end() const noexcept { | ||
|   const size_t length = src_.size(); | ||
|   return !(current_ < length); | ||
| } | ||
|
|
||
| // Looks `offset` characters ahead of the cursor without consuming anything. | ||
| // Yields '\0' when that position lies beyond the end of the source. | ||
| char Lexer::peek(size_t offset) const noexcept { | ||
|   const size_t pos = current_ + offset; | ||
|   return pos < src_.size() ? src_[pos] : '\0'; | ||
| } | ||
|
|
||
| // Returns the most recently consumed character, or '\0' if nothing has been consumed yet. | ||
| char Lexer::current_char() const noexcept { | ||
|   return current_ != 0 ? src_[current_ - 1] : '\0'; | ||
| } | ||
|
|
||
| // Consumes one character and returns it, keeping line_/col_ in step with the cursor. | ||
| // Returns '\0' without moving when the input is exhausted. | ||
| char Lexer::advance() { | ||
|   if (is_at_end()) { | ||
|     return '\0'; | ||
|   } | ||
|   const char consumed = src_[current_]; | ||
|   ++current_; | ||
|   if (consumed != '\n') { | ||
|     ++col_; | ||
|     return consumed; | ||
|   } | ||
|   // A newline moves the cursor to column 1 of the following line. | ||
|   ++line_; | ||
|   col_ = 1; | ||
|   return consumed; | ||
| } | ||
|
|
||
| // Steps the cursor back by one character and restores line_/col_ to the values they had | ||
| // before that character was consumed by advance(). Does nothing at offset 0. | ||
| void Lexer::retreat_one() { | ||
|   if (current_ == 0) { | ||
|     return; | ||
|   } | ||
|   --current_; | ||
|   // advance() incremented line_ when it consumed a newline, so un-consuming one undoes that. | ||
|   // This replaces a full rescan of the source prefix on every call. | ||
|   if (src_[current_] == '\n') { | ||
|     --line_; | ||
|   } | ||
|   // Columns are 1-based: 1 + number of characters between the start of the line and the | ||
|   // cursor. The earlier code produced current_ instead of current_ + 1 on the first line. | ||
|   size_t line_start = 0; | ||
|   if (current_ > 0) { | ||
|     const size_t prev_newline = src_.rfind('\n', current_ - 1); | ||
|     if (prev_newline != std::string_view::npos) { | ||
|       line_start = prev_newline + 1; | ||
|     } | ||
|   } | ||
|   col_ = static_cast<int>(current_ - line_start) + 1; | ||
| } | ||
|
|
||
| // Appends consumed characters to `out` for as long as input remains and `pred` accepts | ||
| // the next (not yet consumed) character. | ||
| void Lexer::consume_while(std::string &out, const std::function<bool(char)> &pred) { | ||
|   for (;;) { | ||
|     if (is_at_end() || !pred(peek())) { | ||
|       break; | ||
|     } | ||
|     out.push_back(advance()); | ||
|   } | ||
| } | ||
|
|
||
| // Copies the source text from start_ up to (not including) the cursor. | ||
| // Returns an empty string if the cursor is positioned before start_. | ||
| std::string Lexer::raw_lexeme() const { | ||
|   if (current_ < start_) { | ||
|     return {}; | ||
|   } | ||
|   const size_t length = current_ - start_; | ||
|   return std::string(src_.substr(start_, length)); | ||
| } | ||
|
|
||
| // Reports whether `s` is a member of keyword_set(). | ||
| bool Lexer::is_keyword(std::string_view s) const { | ||
|   const std::string candidate(s); | ||
|   return keyword_set().contains(candidate); | ||
| } | ||
|
|
||
| // Reports whether `s` is a member of multi_ops_set(). | ||
| bool Lexer::is_multiop(std::string_view s) const { | ||
|   const std::string candidate(s); | ||
|   return multi_ops_set().contains(candidate); | ||
| } | ||
|
|
||
| // Lexes the whole source and returns the produced tokens, always terminated by an EOF token. | ||
| // Each iteration consumes one character and hands control to the handler registered for it, | ||
| // falling back to the default handler when none is registered. | ||
| std::vector<TokenPtr> Lexer::tokenize() { | ||
|   std::vector<TokenPtr> result; | ||
|   while (!is_at_end()) { | ||
|     // Record where this token begins before its first character is consumed. | ||
|     start_ = current_; | ||
|     token_col_ = col_; | ||
|     const auto first = static_cast<unsigned char>(advance()); | ||
|     Handler *handler = handlers_[first].get(); | ||
|     if (handler == nullptr) { | ||
|       handler = default_handler_.get(); | ||
|     } | ||
|     OptToken scanned = handler->scan(*this); | ||
|     // A result that is an empty optional or holds a null pointer contributes no token. | ||
|     if (!scanned.has_value() || scanned.value() == nullptr) { | ||
|       continue; | ||
|     } | ||
|     result.push_back(std::move(scanned.value())); | ||
|   } | ||
|   result.push_back(TokenFactory::make_eof(line_, col_)); | ||
|   return result; | ||
| } | ||
|
|
||
| // Clears all handlers, then installs the built-in handler for every character the lexer | ||
| // recognises plus a fallback handler for everything else. | ||
| // | ||
| // Registration order matters because set_handler() overwrites: the generic operator and | ||
| // punctuation handlers are registered first so that the specialised handlers for '.' | ||
| // (NumberHandler) and '/' (SlashHandler) are no longer clobbered by the operator loop. | ||
| // NOTE(review): NumberHandler and SlashHandler now actually receive '.' and '/'; confirm | ||
| // they emit an operator token when the character does not start a number / comment. | ||
| void Lexer::register_defaults() { | ||
|   for (auto &slot : handlers_) { | ||
|     slot.reset(); | ||
|   } | ||
|   default_handler_.reset(); | ||
|
|
||
|   // Generic single-character dispatch. ':' is in both lists; punctuation is registered | ||
|   // second and therefore wins, which matches the previous effective behaviour. | ||
|   const std::string opchars = "+-*/%<>=!&|^~?:."; | ||
|   for (unsigned char c : opchars) { | ||
|     set_handler(c, std::make_unique<OperatorHandler>()); | ||
|   } | ||
|   const std::string puncts = ",;:(){}[]"; | ||
|   for (unsigned char c : puncts) { | ||
|     set_handler(c, std::make_unique<PunctHandler>()); | ||
|   } | ||
|
|
||
|   set_handler(' ', std::make_unique<WhitespaceHandler>()); | ||
|   set_handler('\t', std::make_unique<WhitespaceHandler>()); | ||
|   set_handler('\r', std::make_unique<WhitespaceHandler>()); | ||
|   set_handler('\n', std::make_unique<NewlineHandler>()); | ||
|
|
||
|   for (unsigned char c = 'a'; c <= 'z'; ++c) { | ||
|     set_handler(c, std::make_unique<IdentifierHandler>()); | ||
|   } | ||
|   for (unsigned char c = 'A'; c <= 'Z'; ++c) { | ||
|     set_handler(c, std::make_unique<IdentifierHandler>()); | ||
|   } | ||
|   set_handler(static_cast<unsigned char>('_'), std::make_unique<IdentifierHandler>()); | ||
|
|
||
|   // Specialised handlers go last so they take precedence over the generic operator handler. | ||
|   for (unsigned char d = '0'; d <= '9'; ++d) { | ||
|     set_handler(d, std::make_unique<NumberHandler>()); | ||
|   } | ||
|   set_handler(static_cast<unsigned char>('.'), std::make_unique<NumberHandler>()); | ||
|   set_handler(static_cast<unsigned char>('"'), std::make_unique<StringHandler>()); | ||
|   set_handler(static_cast<unsigned char>('\''), std::make_unique<CharHandler>()); | ||
|   set_handler(static_cast<unsigned char>('/'), std::make_unique<SlashHandler>()); | ||
|
|
||
|   set_default_handler(std::make_unique<DefaultHandler>()); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| #ifndef LEXER_HPP_ | ||
| #define LEXER_HPP_ | ||
|
|
||
| #include <cctype> | ||
| #include <functional> | ||
| #include <stdexcept> | ||
| #include <string> | ||
| #include <string_view> | ||
| #include <unordered_set> | ||
| #include <vector> | ||
|
|
||
| #include "handlers/CharHandler.hpp" | ||
| #include "handlers/DefaultHandler.hpp" | ||
| #include "handlers/Handler.hpp" | ||
| #include "handlers/IdentifierHandler.hpp" | ||
| #include "handlers/NewlineHandler.hpp" | ||
| #include "handlers/NumberHandler.hpp" | ||
| #include "handlers/OperatorHandler.hpp" | ||
| #include "handlers/PunctHandler.hpp" | ||
| #include "handlers/SlashHandler.hpp" | ||
| #include "handlers/StringHandler.hpp" | ||
| #include "handlers/WhitespaceHandler.hpp" | ||
|
|
||
| #include "tokens/TokenFactory.hpp" | ||
|
|
||
| // Character-dispatch lexer: tokenize() consumes one character at a time and hands control | ||
| // to the Handler registered for that character (or to the default handler). | ||
| // The source is stored as a std::string_view, so the Lexer does not own the text; the | ||
| // caller must keep the underlying buffer alive while the Lexer is in use. | ||
| class Lexer { | ||
| public: | ||
| // Builds a lexer over `src` and installs the default handlers. | ||
| explicit Lexer(std::string_view src, bool keep_comments = false); | ||
|
|
||
| // Lexes the whole source; the returned vector ends with an EOF token. | ||
| std::vector<TokenPtr> tokenize(); | ||
|
|
||
| // Cursor primitives, public so that handlers can drive the scan. | ||
| // peek() and current_char() return '\0' when there is no such character. | ||
| bool is_at_end() const noexcept; | ||
| char peek(size_t offset = 0) const noexcept; | ||
| char current_char() const noexcept; | ||
| char advance(); | ||
| void retreat_one(); | ||
| void consume_while(std::string &out, const std::function<bool(char)> &pred); | ||
| std::string raw_lexeme() const; | ||
|
|
||
| // Line of the cursor (starts at 1, incremented each time advance() consumes '\n'). | ||
| int line() const noexcept { | ||
| return line_; | ||
| } | ||
| // Column recorded by tokenize() just before the current token's first character was consumed. | ||
| int token_col() const noexcept { | ||
| return token_col_; | ||
| } | ||
| // Value of the keep_comments flag passed to the constructor. | ||
| bool keep_comments() const noexcept { | ||
| return keep_comments_; | ||
| } | ||
| bool is_keyword(std::string_view s) const; | ||
| bool is_multiop(std::string_view s) const; | ||
|
|
||
| // Installs `handler` for byte value `c`, replacing (and destroying) any previous one. | ||
| void set_handler(unsigned char c, std::unique_ptr<Handler> handler) { | ||
| handlers_[c] = std::move(handler); | ||
| } | ||
| // Installs the handler used for characters that have no dedicated handler. | ||
| void set_default_handler(std::unique_ptr<Handler> handler) { | ||
| default_handler_ = std::move(handler); | ||
| } | ||
|
|
||
| private: | ||
| void register_defaults(); | ||
|
|
||
| // Non-owning view of the text being lexed. | ||
| std::string_view src_; | ||
| bool keep_comments_; | ||
|
|
||
| // start_: offset where the current token began; current_: offset of the next unread character. | ||
| size_t start_{0}; | ||
| size_t current_{0}; | ||
| int line_{1}; | ||
| int col_{1}; | ||
| int token_col_{1}; | ||
|
|
||
| // One handler slot per possible unsigned char value; empty slots fall back to default_handler_. | ||
| std::array<std::unique_ptr<Handler>, 256> handlers_; | ||
| std::unique_ptr<Handler> default_handler_; | ||
|
Comment on lines
+4
to
+72
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The header declares Useful? React with 👍 / 👎. |
||
| }; | ||
|
|
||
| #endif // LEXER_HPP_ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef LEXERERROR_HPP_ | ||
| #define LEXERERROR_HPP_ | ||
|
|
||
| #include <stdexcept> | ||
|
|
||
| // Exception type thrown by the lexer's handlers when the input cannot be tokenized. | ||
| // Inherits all std::runtime_error constructors, so it is constructed from a message string. | ||
| class LexerError : public std::runtime_error { | ||
| public: | ||
| using std::runtime_error::runtime_error; | ||
| }; | ||
|
|
||
| #endif // LEXERERROR_HPP_ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| #include "CharHandler.hpp" | ||
|
|
||
| // Scans a character literal; the opening quote has already been consumed by the lexer. | ||
| // Recognises the escapes \n, \t, \\ and \'; any other escaped character stands for itself. | ||
| // Throws LexerError("Unterminated char literal") when the closing quote does not follow. | ||
| OptToken CharHandler::scan(Lexer &lx) { | ||
|   std::string text(1, '\''); | ||
|   const char first = lx.advance(); | ||
|   text.push_back(first); | ||
|   char value = first; | ||
|   if (first == '\\') { | ||
|     const char escaped = lx.advance(); | ||
|     text.push_back(escaped); | ||
|     if (escaped == 'n') { | ||
|       value = '\n'; | ||
|     } else if (escaped == 't') { | ||
|       value = '\t'; | ||
|     } else { | ||
|       // Covers \\ and \' as well as unrecognised escapes: the character denotes itself. | ||
|       value = escaped; | ||
|     } | ||
|   } | ||
|   if (lx.peek() != '\'') { | ||
|     throw LexerError("Unterminated char literal"); | ||
|   } | ||
|   lx.advance(); | ||
|   text.push_back('\''); | ||
|   return std::make_optional(TokenFactory::make_char_literal(std::move(text), value, lx.line(), lx.token_col())); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| #ifndef CHARHANDLER_HPP_ | ||
| #define CHARHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
| #include "lib/lexer/LexerError.hpp" | ||
|
|
||
| // Handler for character literals; registered by Lexer::register_defaults for the '\'' character. | ||
| struct CharHandler : Handler { | ||
| OptToken scan(Lexer &lx) override; | ||
| }; | ||
|
|
||
| #endif // CHARHANDLER_HPP_ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| #include "DefaultHandler.hpp" | ||
|
|
||
| // Fallback for characters with no dedicated handler: never produces a token, always throws | ||
| // LexerError naming the character the lexer consumed last. | ||
| OptToken DefaultHandler::scan(Lexer &lx) { | ||
| char c = lx.current_char(); | ||
| throw LexerError(std::string("Unexpected character: ") + c); | ||
|
Comment on lines
+1
to
+5
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This translation unit throws Useful? React with 👍 / 👎. |
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef DEFAULTHANDLER_HPP_ | ||
| #define DEFAULTHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| // Fallback handler installed via Lexer::set_default_handler for characters without a dedicated handler. | ||
| struct DefaultHandler : Handler { | ||
| OptToken scan(Lexer &lx) override; | ||
| }; | ||
|
|
||
| #endif // DEFAULTHANDLER_HPP_ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| #ifndef HANDLER_HPP_ | ||
| #define HANDLER_HPP_ | ||
|
|
||
| #include <memory> | ||
| #include <optional> | ||
|
|
||
| class Token; | ||
| class Lexer; | ||
|
|
||
| using TokenPtr = std::unique_ptr<Token>; | ||
| using OptToken = std::optional<TokenPtr>; | ||
|
|
||
| // Interface for per-character scanners used by Lexer. | ||
| // scan() is called after the lexer has consumed the dispatching character; it may consume | ||
| // more input through `lx` and returns the resulting token, or an empty optional for no token. | ||
| struct Handler { | ||
| virtual ~Handler() = default; | ||
| virtual OptToken scan(Lexer &lx) = 0; | ||
| }; | ||
|
|
||
| #endif // HANDLER_HPP_ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| #include "IdentifierHandler.hpp" | ||
|
|
||
| // Scans a word whose first character has already been consumed, then classifies it as a | ||
| // bool literal (true/false), a keyword, the word operator "xor", or a plain identifier. | ||
| OptToken IdentifierHandler::scan(Lexer &lx) { | ||
|   std::string word(1, lx.current_char()); | ||
|   const auto is_word_char = [](char ch) { | ||
|     return ch == '_' || std::isalnum(static_cast<unsigned char>(ch)); | ||
|   }; | ||
|   lx.consume_while(word, is_word_char); | ||
|
|
||
|   if (!lx.is_keyword(word)) { | ||
|     if (word == "xor") { | ||
|       return std::make_optional(TokenFactory::make_operator(std::move(word), lx.line(), lx.token_col())); | ||
|     } | ||
|     return std::make_optional(TokenFactory::make_ident(std::move(word), lx.line(), lx.token_col())); | ||
|   } | ||
|   // Among keywords, true/false become bool literals carrying their value. | ||
|   const bool is_true = word == "true"; | ||
|   if (is_true || word == "false") { | ||
|     return std::make_optional(TokenFactory::make_bool_literal(word, is_true, lx.line(), lx.token_col())); | ||
|   } | ||
|   return std::make_optional(TokenFactory::make_keyword(std::move(word), lx.line(), lx.token_col())); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef IDENTIFIERHANDLER_HPP_ | ||
| #define IDENTIFIERHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| // Handler for identifiers and keywords; registered by Lexer::register_defaults for letters and '_'. | ||
| struct IdentifierHandler : Handler { | ||
| OptToken scan(Lexer &lx) override; | ||
| }; | ||
|
|
||
| #endif // IDENTIFIERHANDLER_HPP_ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| #include "NewlineHandler.hpp" | ||
|
|
||
| // Emits a newline token positioned at the newline character itself. | ||
| // By the time scan() runs, the lexer has already consumed the '\n', which advanced its line | ||
| // counter to the following line, while token_col() is still the newline's own column. | ||
| // Step the line back by one so that both coordinates refer to the same character. | ||
| OptToken NewlineHandler::scan(Lexer &lx) { | ||
|   const bool consumed_newline = lx.current_char() == '\n'; | ||
|   const int line = consumed_newline ? lx.line() - 1 : lx.line(); | ||
|   return std::make_optional(TokenFactory::make_newline(line, lx.token_col())); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef NEWLINEHANDLER_HPP_ | ||
| #define NEWLINEHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| // Handler that turns a '\n' into a newline token; registered by Lexer::register_defaults for '\n'. | ||
| struct NewlineHandler : Handler { | ||
| OptToken scan(Lexer &lx) override; | ||
| }; | ||
|
|
||
| #endif // NEWLINEHANDLER_HPP_ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In `register_defaults` you assign `NumberHandler` to `'.'` (line 115) but immediately afterwards iterate over `opchars`, which also contains `'.'` (lines 122‑124). The latter assignment replaces the numeric handler, so literals like `.5` are tokenized as an operator `'.'` followed by `5` rather than a float. Either remove `'.'` from `opchars` or re‑apply the numeric handler after operator registration.
Useful? React with 👍 / 👎.