Skip to content

Commit ca1cb41

Browse files
committed
feat: add lexer implementation
1 parent ab16402 commit ca1cb41

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+1303
-4
lines changed

bin/main.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,26 @@
11
#include <iostream>
22

3-
#include "lib/ui/ui_functions.hpp"
3+
#include "lib/lexer/Lexer.hpp"
44

5-
int main(int32_t argc, char** argv) {
6-
std::vector<std::string> args = std::vector<std::string>(argv, argv + argc);
7-
return StartConsoleUI(args, std::cout);
5+
// Demo driver: lexes a hard-coded sample program and prints one token per line.
// Returns 0 on success, or 1 if tokenizing throws a std::exception.
int main() {
6+
// Sample program text handed to the lexer (raw string literal with the "ovum" delimiter).
const std::string sample = R"ovum(
7+
// demo
8+
fun Main(args: StringArray): Int {
9+
val count: Int = args.Length()
10+
sys::Print("Args count: " + count.ToString())
11+
return 0
12+
}
13+
)ovum";
14+
15+
// The second argument is keep_comments. The lexer keeps a std::string_view of
// `sample`, so `sample` has to stay alive for as long as `lx` is used.
Lexer lx(sample, false);
16+
try {
17+
// tokenize() returns every scanned token followed by a trailing EOF token.
auto toks = lx.tokenize();
18+
for (auto &t : toks) {
19+
std::cout << t->to_string() << "\n";
20+
}
21+
} catch (const std::exception &e) {
22+
// Handlers report problems by throwing LexerError, which derives from std::runtime_error.
std::cerr << "Lexer error: " << e.what() << "\n";
23+
return 1;
24+
}
25+
return 0;
826
}

lib/lexer/Lexer.cpp

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#include "Lexer.hpp"
2+
#include "utils.hpp"
3+
4+
Lexer::Lexer(std::string_view src, bool keep_comments) :
5+
src_(src), keep_comments_(keep_comments), start_(0), current_(0), line_(1), col_(1), token_col_(1) {
6+
register_defaults();
7+
}
8+
9+
bool Lexer::is_at_end() const noexcept {
10+
return current_ >= src_.size();
11+
}
12+
13+
char Lexer::peek(size_t offset) const noexcept {
14+
size_t idx = current_ + offset;
15+
if (idx >= src_.size())
16+
return '\0';
17+
return src_[idx];
18+
}
19+
20+
char Lexer::current_char() const noexcept {
21+
if (current_ == 0)
22+
return '\0';
23+
return src_[current_ - 1];
24+
}
25+
26+
char Lexer::advance() {
27+
if (is_at_end())
28+
return '\0';
29+
char c = src_[current_++];
30+
if (c == '\n') {
31+
++line_;
32+
col_ = 1;
33+
} else
34+
++col_;
35+
return c;
36+
}
37+
38+
void Lexer::retreat_one() {
39+
if (current_ == 0)
40+
return;
41+
--current_;
42+
int l = 1;
43+
for (size_t i = 0; i < current_; ++i)
44+
if (src_[i] == '\n')
45+
++l;
46+
line_ = l;
47+
int col = 1;
48+
for (size_t i = current_; i > 0; --i) {
49+
if (src_[i - 1] == '\n') {
50+
col = static_cast<int>(current_ - i + 1);
51+
break;
52+
}
53+
if (i == 1)
54+
col = static_cast<int>(current_);
55+
}
56+
col_ = col;
57+
}
58+
59+
void Lexer::consume_while(std::string &out, const std::function<bool(char)> &pred) {
60+
while (!is_at_end() && pred(peek())) {
61+
out.push_back(advance());
62+
}
63+
}
64+
65+
std::string Lexer::raw_lexeme() const {
66+
if (current_ >= start_)
67+
return std::string(src_.substr(start_, current_ - start_));
68+
return {};
69+
}
70+
71+
bool Lexer::is_keyword(std::string_view s) const {
72+
return keyword_set().contains(std::string(s));
73+
}
74+
75+
bool Lexer::is_multiop(std::string_view s) const {
76+
return multi_ops_set().contains(std::string(s));
77+
}
78+
79+
std::vector<TokenPtr> Lexer::tokenize() {
80+
std::vector<TokenPtr> tokens;
81+
while (!is_at_end()) {
82+
start_ = current_;
83+
token_col_ = col_;
84+
char c = advance();
85+
Handler *h = handlers_[static_cast<unsigned char>(c)].get();
86+
if (!h)
87+
h = default_handler_.get();
88+
OptToken maybe = h->scan(*this);
89+
if (maybe && *maybe)
90+
tokens.push_back(std::move(*maybe));
91+
}
92+
tokens.push_back(TokenFactory::make_eof(line_, col_));
93+
return tokens;
94+
}
95+
96+
void Lexer::register_defaults() {
97+
for (auto &p : handlers_)
98+
p.reset();
99+
default_handler_.reset();
100+
101+
set_handler(' ', std::make_unique<WhitespaceHandler>());
102+
set_handler('\t', std::make_unique<WhitespaceHandler>());
103+
set_handler('\r', std::make_unique<WhitespaceHandler>());
104+
105+
set_handler('\n', std::make_unique<NewlineHandler>());
106+
107+
for (unsigned char c = 'a'; c <= 'z'; ++c)
108+
set_handler(c, std::make_unique<IdentifierHandler>());
109+
for (unsigned char c = 'A'; c <= 'Z'; ++c)
110+
set_handler(c, std::make_unique<IdentifierHandler>());
111+
set_handler((unsigned char) '_', std::make_unique<IdentifierHandler>());
112+
113+
for (unsigned char d = '0'; d <= '9'; ++d)
114+
set_handler(d, std::make_unique<NumberHandler>());
115+
set_handler((unsigned char) '.', std::make_unique<NumberHandler>());
116+
117+
set_handler((unsigned char) '"', std::make_unique<StringHandler>());
118+
set_handler((unsigned char) '\'', std::make_unique<CharHandler>());
119+
120+
set_handler((unsigned char) '/', std::make_unique<SlashHandler>());
121+
122+
const std::string opchars = "+-*/%<>=!&|^~?:.";
123+
for (unsigned char c : opchars)
124+
set_handler(c, std::make_unique<OperatorHandler>());
125+
126+
const std::string puncts = ",;:(){}[]";
127+
for (unsigned char c : puncts)
128+
set_handler(c, std::make_unique<PunctHandler>());
129+
130+
set_default_handler(std::make_unique<DefaultHandler>());
131+
}

lib/lexer/Lexer.hpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#ifndef LEXER_HPP_
2+
#define LEXER_HPP_
3+
4+
#include <array>
#include <cctype>
#include <functional>
#include <memory>
#include <stdexcept>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>
11+
12+
#include "handlers/CharHandler.hpp"
13+
#include "handlers/DefaultHandler.hpp"
14+
#include "handlers/Handler.hpp"
15+
#include "handlers/IdentifierHandler.hpp"
16+
#include "handlers/NewlineHandler.hpp"
17+
#include "handlers/NumberHandler.hpp"
18+
#include "handlers/OperatorHandler.hpp"
19+
#include "handlers/PunctHandler.hpp"
20+
#include "handlers/SlashHandler.hpp"
21+
#include "handlers/StringHandler.hpp"
22+
#include "handlers/WhitespaceHandler.hpp"
23+
24+
#include "tokens/TokenFactory.hpp"
25+
26+
class Lexer {
27+
public:
28+
explicit Lexer(std::string_view src, bool keep_comments = false);
29+
30+
std::vector<TokenPtr> tokenize();
31+
32+
bool is_at_end() const noexcept;
33+
char peek(size_t offset = 0) const noexcept;
34+
char current_char() const noexcept;
35+
char advance();
36+
void retreat_one();
37+
void consume_while(std::string &out, const std::function<bool(char)> &pred);
38+
std::string raw_lexeme() const;
39+
40+
int line() const noexcept {
41+
return line_;
42+
}
43+
int token_col() const noexcept {
44+
return token_col_;
45+
}
46+
bool keep_comments() const noexcept {
47+
return keep_comments_;
48+
}
49+
bool is_keyword(std::string_view s) const;
50+
bool is_multiop(std::string_view s) const;
51+
52+
void set_handler(unsigned char c, std::unique_ptr<Handler> handler) {
53+
handlers_[c] = std::move(handler);
54+
}
55+
void set_default_handler(std::unique_ptr<Handler> handler) {
56+
default_handler_ = std::move(handler);
57+
}
58+
59+
private:
60+
void register_defaults();
61+
62+
std::string_view src_;
63+
bool keep_comments_;
64+
65+
size_t start_{0};
66+
size_t current_{0};
67+
int line_{1};
68+
int col_{1};
69+
int token_col_{1};
70+
71+
std::array<std::unique_ptr<Handler>, 256> handlers_;
72+
std::unique_ptr<Handler> default_handler_;
73+
};
74+
75+
#endif // LEXER_HPP_

lib/lexer/LexerError.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#ifndef LEXERERROR_HPP_
2+
#define LEXERERROR_HPP_
3+
4+
#include <stdexcept>
5+
6+
/// Exception type thrown by the lexer's handlers when the input cannot be
/// tokenized. It adds nothing to std::runtime_error beyond a distinct type and
/// reuses the base-class constructors (message as std::string or const char*).
class LexerError : public std::runtime_error {
public:
  using runtime_error::runtime_error;
};
10+
11+
#endif // LEXERERROR_HPP_

lib/lexer/handlers/CharHandler.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#include "CharHandler.hpp"
2+
3+
/// Scans a character literal. The opening quote has already been consumed by the
/// lexer's dispatch loop.
///
/// Accepts either a single character or a backslash escape, followed by a
/// closing quote. Recognised escapes are \n, \t, \r, \0, \\ and \'; any other
/// escaped character stands for itself.
///
/// @return a char-literal token holding the raw text (quotes included) and the
///         decoded value.
/// @throws LexerError if the literal is empty ('') or the closing quote is missing.
OptToken CharHandler::scan(Lexer &lx) {
  std::string raw(1, '\'');
  char val = '\0';

  if (lx.is_at_end()) {
    throw LexerError("Unterminated char literal");
  }

  if (lx.peek() == '\\') {
    lx.advance();
    raw.push_back('\\');
    const char e = lx.advance();
    raw.push_back(e);
    switch (e) {
      case 'n':
        val = '\n';
        break;
      case 't':
        val = '\t';
        break;
      case 'r':
        // Without this case "\r" would decode to the letter 'r'.
        val = '\r';
        break;
      case '0':
        // Without this case "\0" would decode to the digit '0'.
        val = '\0';
        break;
      case '\\':
        val = '\\';
        break;
      case '\'':
        val = '\'';
        break;
      default:
        // Unknown escapes keep the escaped character as the value.
        val = e;
        break;
    }
  } else {
    const char c = lx.advance();
    if (c == '\'') {
      // '' has no content; treating the closing quote as the value would make
      // ''' lex as a valid literal.
      throw LexerError("Empty char literal");
    }
    raw.push_back(c);
    val = c;
  }

  if (lx.peek() != '\'') {
    throw LexerError("Unterminated char literal");
  }
  lx.advance();
  raw.push_back('\'');

  return std::make_optional(TokenFactory::make_char_literal(std::move(raw), val, lx.line(), lx.token_col()));
}

lib/lexer/handlers/CharHandler.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#ifndef CHARHANDLER_HPP_
2+
#define CHARHANDLER_HPP_
3+
4+
#include "Handler.hpp"
5+
#include "lib/lexer/Lexer.hpp"
6+
#include "lib/lexer/LexerError.hpp"
7+
8+
struct CharHandler : Handler {
9+
OptToken scan(Lexer &lx) override;
10+
};
11+
12+
#endif // CHARHANDLER_HPP_
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#include "DefaultHandler.hpp"
2+
3+
/// Fallback for characters with no dedicated handler: always throws a LexerError
/// naming the character that was just consumed. Never returns a token.
OptToken DefaultHandler::scan(Lexer &lx) {
  std::string message = "Unexpected character: ";
  message.push_back(lx.current_char());
  throw LexerError(message);
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#ifndef DEFAULTHANDLER_HPP_
2+
#define DEFAULTHANDLER_HPP_
3+
4+
#include "Handler.hpp"
5+
#include "lib/lexer/Lexer.hpp"
6+
7+
struct DefaultHandler : Handler {
8+
OptToken scan(Lexer &lx) override;
9+
};
10+
11+
#endif // DEFAULTHANDLER_HPP_

lib/lexer/handlers/Handler.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#ifndef HANDLER_HPP_
2+
#define HANDLER_HPP_
3+
4+
#include <memory>
5+
#include <optional>
6+
7+
class Token;
8+
class Lexer;
9+
10+
using TokenPtr = std::unique_ptr<Token>;
11+
using OptToken = std::optional<TokenPtr>;
12+
13+
struct Handler {
14+
virtual ~Handler() = default;
15+
virtual OptToken scan(Lexer &lx) = 0;
16+
};
17+
18+
#endif // HANDLER_HPP_
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#include "IdentifierHandler.hpp"
2+
3+
/// Scans an identifier-like word. The first character has already been consumed
/// by the lexer; the rest is every following alphanumeric character or '_'.
///
/// Classification of the word, in order:
///   - keyword "true"/"false" -> bool literal token
///   - any other keyword      -> keyword token
///   - "xor"                  -> operator token
///   - everything else        -> identifier token
OptToken IdentifierHandler::scan(Lexer &lx) {
  std::string word(1, lx.current_char());
  const auto is_ident_char = [](char ch) {
    // Cast first: std::isalnum is undefined for negative char values.
    return ch == '_' || std::isalnum(static_cast<unsigned char>(ch)) != 0;
  };
  lx.consume_while(word, is_ident_char);

  const int line = lx.line();
  const int col = lx.token_col();

  if (!lx.is_keyword(word)) {
    if (word == "xor") {
      return std::make_optional(TokenFactory::make_operator(std::move(word), line, col));
    }
    return std::make_optional(TokenFactory::make_ident(std::move(word), line, col));
  }

  if (word == "true" || word == "false") {
    const bool value = (word == "true");
    return std::make_optional(TokenFactory::make_bool_literal(word, value, line, col));
  }
  return std::make_optional(TokenFactory::make_keyword(std::move(word), line, col));
}

0 commit comments

Comments
 (0)