generated from bialger/cpp_tests
-
Notifications
You must be signed in to change notification settings - Fork 0
Lexer! #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Lexer! #14
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,26 @@ | ||
| #include <iostream> | ||
|
|
||
| #include "lib/ui/ui_functions.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| int main(int32_t argc, char** argv) { | ||
| std::vector<std::string> args = std::vector<std::string>(argv, argv + argc); | ||
| return StartConsoleUI(args, std::cout); | ||
| // Demo driver: runs the lexer over an embedded sample program and writes one | |
| // token per line to stdout. Returns 0 on success, or 1 (after reporting on | |
| // stderr) when tokenization throws a std::exception. | |
| int main() { | |
| const std::string sample = R"ovum( | |
| // demo | |
| fun Main(args: StringArray): Int { | |
| val count: Int = args.Length() | |
| sys::Print("Args count: " + count.ToString()) | |
| return 0 | |
| } | |
| )ovum"; | |
| // The lexer only holds a view of `sample`, which outlives it in this scope. | |
| Lexer lexer(sample, false); | |
| int exit_code = 0; | |
| try { | |
| const auto tokens = lexer.tokenize(); | |
| for (const auto &token : tokens) { | |
| std::cout << token->to_string() << "\n"; | |
| } | |
| } catch (const std::exception &error) { | |
| std::cerr << "Lexer error: " << error.what() << "\n"; | |
| exit_code = 1; | |
| } | |
| return exit_code; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,131 @@ | ||
| #include "Lexer.hpp" | ||
| #include "utils.hpp" | ||
|
|
||
| Lexer::Lexer(std::string_view src, bool keep_comments) : | ||
| src_(src), keep_comments_(keep_comments), start_(0), current_(0), line_(1), col_(1), token_col_(1) { | ||
| register_defaults(); | ||
| } | ||
|
|
||
| // True once the read cursor has reached or passed the end of the source. | |
| bool Lexer::is_at_end() const noexcept { | |
| return src_.size() <= current_; | |
| } | |
|
|
||
| // Looks `offset` characters ahead of the cursor without consuming anything. | |
| // Returns '\0' when that position lies at or beyond the end of the source. | |
| char Lexer::peek(size_t offset) const noexcept { | |
| const size_t target = current_ + offset; | |
| return target < src_.size() ? src_[target] : '\0'; | |
| } | |
|
|
||
| // Returns the character just before the cursor (the one most recently | |
| // consumed), or '\0' when the cursor is still at the start of the source. | |
| char Lexer::current_char() const noexcept { | |
| return current_ > 0 ? src_[current_ - 1] : '\0'; | |
| } | |
|
|
||
| // Consumes one character and returns it, keeping line/column in step: | |
| // a '\n' moves to column 1 of the next line, anything else bumps the column. | |
| // Returns '\0' without moving when the source is exhausted. | |
| char Lexer::advance() { | |
| if (is_at_end()) { | |
| return '\0'; | |
| } | |
| const char consumed = src_[current_]; | |
| ++current_; | |
| if (consumed != '\n') { | |
| ++col_; | |
| return consumed; | |
| } | |
| ++line_; | |
| col_ = 1; | |
| return consumed; | |
| } | |
|
|
||
| // Steps the cursor back by one character and recomputes line_/col_ from the | |
| // start of the source so they match what advance() would have produced on | |
| // reaching this position. No-op when the cursor is already at the start. | |
| // | |
| // Fix: on the first line (no preceding '\n') the column used to be set to | |
| // current_, one less than the 1-based column advance() maintains | |
| // (col_ == current_ + 1). Line and column now come from a single pass. | |
| void Lexer::retreat_one() { | |
| if (current_ == 0) { | |
| return; | |
| } | |
| --current_; | |
| int line = 1; | |
| size_t line_start = 0;  // index of the first character of the cursor's line | |
| for (size_t i = 0; i < current_; ++i) { | |
| if (src_[i] == '\n') { | |
| ++line; | |
| line_start = i + 1; | |
| } | |
| } | |
| line_ = line; | |
| // Columns are 1-based: the first character of a line is column 1. | |
| col_ = static_cast<int>(current_ - line_start) + 1; | |
| } | |
|
|
||
| // Appends characters to `out` for as long as the next unread character | |
| // satisfies `pred`; stops at the first rejected character or at end of input. | |
| void Lexer::consume_while(std::string &out, const std::function<bool(char)> &pred) { | |
| for (;;) { | |
| if (is_at_end()) { | |
| break; | |
| } | |
| if (!pred(peek())) { | |
| break; | |
| } | |
| out.push_back(advance()); | |
| } | |
| } | |
|
|
||
| // Copies the source text between the current token start and the cursor. | |
| // Yields an empty string if the cursor sits before the token start. | |
| std::string Lexer::raw_lexeme() const { | |
| if (current_ < start_) { | |
| return {}; | |
| } | |
| const size_t length = current_ - start_; | |
| return std::string(src_.substr(start_, length)); | |
| } | |
|
|
||
| // Reports whether `s` is a member of the set returned by keyword_set(). | |
| bool Lexer::is_keyword(std::string_view s) const { | |
| const std::string candidate(s); | |
| return keyword_set().contains(candidate); | |
| } | |
|
|
||
| // Reports whether `s` is a member of the set returned by multi_ops_set(). | |
| bool Lexer::is_multiop(std::string_view s) const { | |
| const std::string candidate(s); | |
| return multi_ops_set().contains(candidate); | |
| } | |
|
|
||
| // Scans the remaining source into tokens. For each token the first character | |
| // is consumed and used to pick a handler (falling back to the default handler | |
| // when none is registered); the handler's result is kept only if it holds a | |
| // non-null token. An EOF token is always appended last. | |
| // Exceptions thrown by handlers propagate to the caller. | |
| std::vector<TokenPtr> Lexer::tokenize() { | |
| std::vector<TokenPtr> result; | |
| for (; !is_at_end();) { | |
| // Remember where this token begins before consuming its first character. | |
| start_ = current_; | |
| token_col_ = col_; | |
| const auto lead = static_cast<unsigned char>(advance()); | |
| Handler *handler = handlers_[lead].get(); | |
| if (handler == nullptr) { | |
| handler = default_handler_.get(); | |
| } | |
| OptToken produced = handler->scan(*this); | |
| if (produced.has_value() && *produced != nullptr) { | |
| result.push_back(std::move(*produced)); | |
| } | |
| } | |
| result.push_back(TokenFactory::make_eof(line_, col_)); | |
| return result; | |
| } | |
|
|
||
| // Clears the dispatch table and installs the built-in handlers, keyed by the | |
| // first character of a token. Every character is registered exactly once so | |
| // the outcome does not depend on registration order. | |
| // | |
| // Fix: '/' used to be registered with SlashHandler and then immediately | |
| // overwritten by the operator loop (its character list contained '/'), so | |
| // SlashHandler was never dispatched. '/' is now owned by SlashHandler only. | |
| // | |
| // Two other registrations were likewise overwritten; their effective handler | |
| // is preserved here and the dead registration removed: | |
| //  - ':' ends up with PunctHandler (it was also listed among operators). | |
| //  - '.' ends up with OperatorHandler (it was also given to NumberHandler). | |
| // NOTE(review): confirm whether a leading '.' was meant to start a number. | |
| void Lexer::register_defaults() { | |
| for (auto &slot : handlers_) { | |
| slot.reset(); | |
| } | |
| default_handler_.reset(); | |
| // Whitespace and line breaks. | |
| set_handler(static_cast<unsigned char>(' '), std::make_unique<WhitespaceHandler>()); | |
| set_handler(static_cast<unsigned char>('\t'), std::make_unique<WhitespaceHandler>()); | |
| set_handler(static_cast<unsigned char>('\r'), std::make_unique<WhitespaceHandler>()); | |
| set_handler(static_cast<unsigned char>('\n'), std::make_unique<NewlineHandler>()); | |
| // Identifiers and keywords start with an ASCII letter or underscore. | |
| for (unsigned char c = 'a'; c <= 'z'; ++c) { | |
| set_handler(c, std::make_unique<IdentifierHandler>()); | |
| } | |
| for (unsigned char c = 'A'; c <= 'Z'; ++c) { | |
| set_handler(c, std::make_unique<IdentifierHandler>()); | |
| } | |
| set_handler(static_cast<unsigned char>('_'), std::make_unique<IdentifierHandler>()); | |
| // Numeric literals start with a decimal digit. | |
| for (unsigned char d = '0'; d <= '9'; ++d) { | |
| set_handler(d, std::make_unique<NumberHandler>()); | |
| } | |
| // String and character literals. | |
| set_handler(static_cast<unsigned char>('"'), std::make_unique<StringHandler>()); | |
| set_handler(static_cast<unsigned char>('\''), std::make_unique<CharHandler>()); | |
| // '/' is dispatched to its dedicated handler rather than the generic one. | |
| set_handler(static_cast<unsigned char>('/'), std::make_unique<SlashHandler>()); | |
| // Generic operator characters ('/' and ':' are handled elsewhere). | |
| const std::string opchars = "+-*%<>=!&|^~?."; | |
| for (const char c : opchars) { | |
| set_handler(static_cast<unsigned char>(c), std::make_unique<OperatorHandler>()); | |
| } | |
| // Punctuation, including ':'. | |
| const std::string puncts = ",;:(){}[]"; | |
| for (const char c : puncts) { | |
| set_handler(static_cast<unsigned char>(c), std::make_unique<PunctHandler>()); | |
| } | |
| // Anything without a dedicated handler goes to the default handler. | |
| set_default_handler(std::make_unique<DefaultHandler>()); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| #ifndef LEXER_HPP_ | ||
| #define LEXER_HPP_ | ||
|
|
||
| #include <array> | |
| #include <cctype> | |
| #include <functional> | |
| #include <memory> | |
| #include <stdexcept> | |
| #include <string> | |
| #include <string_view> | |
| #include <unordered_set> | |
| #include <vector> | |
|
|
||
| #include "handlers/CharHandler.hpp" | ||
| #include "handlers/DefaultHandler.hpp" | ||
| #include "handlers/Handler.hpp" | ||
| #include "handlers/IdentifierHandler.hpp" | ||
| #include "handlers/NewlineHandler.hpp" | ||
| #include "handlers/NumberHandler.hpp" | ||
| #include "handlers/OperatorHandler.hpp" | ||
| #include "handlers/PunctHandler.hpp" | ||
| #include "handlers/SlashHandler.hpp" | ||
| #include "handlers/StringHandler.hpp" | ||
| #include "handlers/WhitespaceHandler.hpp" | ||
|
|
||
| #include "tokens/TokenFactory.hpp" | ||
|
|
||
| // Character-dispatch lexer. tokenize() consumes the first character of each | |
| // token and hands control to the Handler registered for that character; the | |
| // handler uses the cursor primitives below to scan the rest of the token. | |
| // The source is held as a non-owning std::string_view, so the caller must | |
| // keep the underlying buffer alive while the lexer is in use. | |
| class Lexer { | |
| public: | |
| // Creates a lexer over `src` with the default handlers installed. | |
| explicit Lexer(std::string_view src, bool keep_comments = false); | |
| // Scans the source and returns the produced tokens followed by an EOF token. | |
| std::vector<TokenPtr> tokenize(); | |
| // --- Cursor primitives used by handlers --- | |
| // True when no unread characters remain. | |
| bool is_at_end() const noexcept; | |
| // Unread character `offset` positions ahead, or '\0' past the end. | |
| char peek(size_t offset = 0) const noexcept; | |
| // Most recently consumed character, or '\0' if nothing was consumed yet. | |
| char current_char() const noexcept; | |
| // Consumes and returns one character ('\0' at end), updating line/column. | |
| char advance(); | |
| // Moves the cursor back one character and recomputes line/column. | |
| void retreat_one(); | |
| // Consumes characters into `out` while `pred` accepts the next one. | |
| void consume_while(std::string &out, const std::function<bool(char)> &pred); | |
| // Source text from the start of the current token up to the cursor. | |
| std::string raw_lexeme() const; | |
| // --- Position and configuration accessors --- | |
| // Current line of the cursor. | |
| int line() const noexcept { return line_; } | |
| // Column recorded when the current token began. | |
| int token_col() const noexcept { return token_col_; } | |
| // Flag passed at construction. | |
| bool keep_comments() const noexcept { return keep_comments_; } | |
| bool is_keyword(std::string_view s) const; | |
| bool is_multiop(std::string_view s) const; | |
| // --- Handler table customization --- | |
| // Installs (or replaces) the handler dispatched for first character `c`. | |
| void set_handler(unsigned char c, std::unique_ptr<Handler> handler) { handlers_[c] = std::move(handler); } | |
| // Installs the fallback used when no handler is registered for a character. | |
| void set_default_handler(std::unique_ptr<Handler> handler) { default_handler_ = std::move(handler); } | |
| private: | |
| // Resets the table and registers the built-in handlers. | |
| void register_defaults(); | |
| std::string_view src_;  // non-owning view of the input | |
| bool keep_comments_; | |
| size_t start_{0};    // index where the current token begins | |
| size_t current_{0};  // index of the next unread character | |
| int line_{1}; | |
| int col_{1}; | |
| int token_col_{1}; | |
| std::array<std::unique_ptr<Handler>, 256> handlers_;  // indexed by first byte | |
| std::unique_ptr<Handler> default_handler_; | |
| }; | |
|
|
||
| #endif // LEXER_HPP_ | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef LEXERERROR_HPP_ | ||
| #define LEXERERROR_HPP_ | ||
|
|
||
| #include <stdexcept> | ||
|
|
||
| // Exception type thrown by the lexer's handlers. Behaves exactly like | |
| // std::runtime_error and inherits all of its constructors. | |
| class LexerError : public std::runtime_error { | |
| public: | |
| using runtime_error::runtime_error; | |
| }; | |
|
|
||
| #endif // LEXERERROR_HPP_ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| #include "CharHandler.hpp" | ||
|
|
||
| // Scans a character literal whose opening quote has already been consumed. | |
| // Accepts either a single character or a backslash escape: \n and \t map to | |
| // newline and tab, any other escaped character stands for itself. | |
| // Throws LexerError if the closing quote does not follow immediately. | |
| OptToken CharHandler::scan(Lexer &lx) { | |
| std::string raw(1, '\''); | |
| const bool escaped = lx.peek() == '\\'; | |
| if (escaped) { | |
| lx.advance(); | |
| raw.push_back('\\'); | |
| } | |
| const char body = lx.advance(); | |
| raw.push_back(body); | |
| // Only \n and \t translate; every other character (escaped or not) is | |
| // taken literally, which covers \\ and \' as well. | |
| char val = body; | |
| if (escaped && body == 'n') { | |
| val = '\n'; | |
| } else if (escaped && body == 't') { | |
| val = '\t'; | |
| } | |
| if (lx.peek() != '\'') { | |
| throw LexerError("Unterminated char literal"); | |
| } | |
| lx.advance(); | |
| raw.push_back('\''); | |
| return std::make_optional(TokenFactory::make_char_literal(std::move(raw), val, lx.line(), lx.token_col())); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| #ifndef CHARHANDLER_HPP_ | ||
| #define CHARHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
| #include "lib/lexer/LexerError.hpp" | ||
|
|
||
| // Handler for tokens that begin with a single quote (character literals). | |
| // scan() is defined out of line. | |
| struct CharHandler : public Handler { | |
| OptToken scan(Lexer &lx) override; | |
| }; | |
|
|
||
| #endif // CHARHANDLER_HPP_ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| #include "DefaultHandler.hpp" | ||
|
|
||
| // Fallback for characters with no registered handler: never yields a token, | |
| // always throws LexerError naming the character that was just consumed. | |
| OptToken DefaultHandler::scan(Lexer &lx) { | |
| std::string message = "Unexpected character: "; | |
| message.push_back(lx.current_char()); | |
| throw LexerError(message); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef DEFAULTHANDLER_HPP_ | ||
| #define DEFAULTHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| // Fallback handler used when no handler is registered for a character. | |
| // scan() is defined out of line. | |
| struct DefaultHandler : public Handler { | |
| OptToken scan(Lexer &lx) override; | |
| }; | |
|
|
||
| #endif // DEFAULTHANDLER_HPP_ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| #ifndef HANDLER_HPP_ | ||
| #define HANDLER_HPP_ | ||
|
|
||
| #include <memory> | ||
| #include <optional> | ||
|
|
||
| class Token; | ||
| class Lexer; | ||
|
|
||
| using TokenPtr = std::unique_ptr<Token>; | ||
| using OptToken = std::optional<TokenPtr>; | ||
|
|
||
| // Polymorphic interface for per-character token scanners. Concrete handlers | |
| // are owned through std::unique_ptr<Handler>, hence the virtual destructor. | |
| struct Handler { | |
| virtual ~Handler() = default; | |
| // Scans one token using `lx`. The result may be empty or hold a null | |
| // pointer, in which case the caller emits no token. | |
| virtual OptToken scan(Lexer &lx) = 0; | |
| }; | |
|
|
||
| #endif // HANDLER_HPP_ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| #include "IdentifierHandler.hpp" | ||
|
|
||
| // Scans an identifier-like word: the already-consumed first character plus | |
| // any following alphanumerics or underscores. Classification order: | |
| // keyword ("true"/"false" become bool literals), then the word operator | |
| // "xor", otherwise a plain identifier. | |
| OptToken IdentifierHandler::scan(Lexer &lx) { | |
| std::string word(1, lx.current_char()); | |
| const auto is_word_char = [](char ch) { | |
| // Cast first: std::isalnum requires a value representable as unsigned char. | |
| return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_'; | |
| }; | |
| lx.consume_while(word, is_word_char); | |
| const int line = lx.line(); | |
| const int col = lx.token_col(); | |
| if (!lx.is_keyword(word)) { | |
| if (word == "xor") { | |
| return std::make_optional(TokenFactory::make_operator(std::move(word), line, col)); | |
| } | |
| return std::make_optional(TokenFactory::make_ident(std::move(word), line, col)); | |
| } | |
| const bool is_true = (word == "true"); | |
| if (is_true || word == "false") { | |
| return std::make_optional(TokenFactory::make_bool_literal(word, is_true, line, col)); | |
| } | |
| return std::make_optional(TokenFactory::make_keyword(std::move(word), line, col)); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef IDENTIFIERHANDLER_HPP_ | ||
| #define IDENTIFIERHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| // Handler for identifier-like words (identifiers, keywords, bool literals). | |
| // scan() is defined out of line. | |
| struct IdentifierHandler : public Handler { | |
| OptToken scan(Lexer &lx) override; | |
| }; | |
|
|
||
| #endif // IDENTIFIERHANDLER_HPP_ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| #include "NewlineHandler.hpp" | ||
|
|
||
| // Emits a newline token positioned at the line break itself. | |
| // | |
| // Fix: by the time scan() runs, Lexer::advance() has already consumed the | |
| // '\n' and incremented the line counter, so lx.line() names the *next* line | |
| // while lx.token_col() is still the column on the line being ended. The | |
| // token now reports the line the break actually sits on. | |
| OptToken NewlineHandler::scan(Lexer &lx) { | |
| // Only compensate when the consumed character really was '\n', in case this | |
| // handler gets registered for some other character via set_handler(). | |
| const int line = lx.current_char() == '\n' ? lx.line() - 1 : lx.line(); | |
| return std::make_optional(TokenFactory::make_newline(line, lx.token_col())); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| #ifndef NEWLINEHANDLER_HPP_ | ||
| #define NEWLINEHANDLER_HPP_ | ||
|
|
||
| #include "Handler.hpp" | ||
| #include "lib/lexer/Lexer.hpp" | ||
|
|
||
| // Handler for line breaks; produces a newline token. | |
| // scan() is defined out of line. | |
| struct NewlineHandler : public Handler { | |
| OptToken scan(Lexer &lx) override; | |
| }; | |
|
|
||
| #endif // NEWLINEHANDLER_HPP_ |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The header declares `std::array<std::unique_ptr<Handler>, 256> handlers_;` but the file only includes `<vector>` and other headers, not `<array>`. Any TU including this header will fail to compile because `std::array` is undefined. Add `#include <array>` above the class definition.
Useful? React with 👍 / 👎.