From 7979c2749a19e53008624bc49659adeb96ce39d5 Mon Sep 17 00:00:00 2001 From: jobala Date: Fri, 13 Mar 2026 20:00:32 +0300 Subject: [PATCH 1/4] add tokenization helpers --- CMakeLists.txt | 1 + lib/sql/CMakeLists.txt | 2 ++ lib/sql/token.h | 45 ++++++++++++++++++++++++++++++++++++++++++ lib/sql/tokenizer.cpp | 0 lib/sql/tokenizer.h | 20 +++++++++++++++++++ 5 files changed, 68 insertions(+) create mode 100644 lib/sql/CMakeLists.txt create mode 100644 lib/sql/token.h create mode 100644 lib/sql/tokenizer.cpp create mode 100644 lib/sql/tokenizer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6970eb4..3021bd4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ find_package(Arrow REQUIRED) add_subdirectory(lib/types) add_subdirectory(lib/datasource) add_subdirectory(lib/logical-plan) +add_subdirectory(lib/sql) add_subdirectory(test) diff --git a/lib/sql/CMakeLists.txt b/lib/sql/CMakeLists.txt new file mode 100644 index 0000000..98353d5 --- /dev/null +++ b/lib/sql/CMakeLists.txt @@ -0,0 +1,2 @@ +add_library(sql tokenizer.cpp) +target_include_directories(sql PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/lib/sql/token.h b/lib/sql/token.h new file mode 100644 index 0000000..e112085 --- /dev/null +++ b/lib/sql/token.h @@ -0,0 +1,45 @@ +#include +#include +#include + +enum class TokenType : std::uint8_t { + // keywords + SELECT, + FROM, + + // literals + LONG, + DOUBLE, + STRING, + IDENTIFIER +}; + +TokenType from_string(const std::string &token) +{ + static const std::unordered_map keywords = {{"SELECT", TokenType::SELECT}, + {"FROM", TokenType::FROM}}; + + auto iter = keywords.find(token); + return iter != keywords.end() ? iter->second : TokenType::IDENTIFIER; +} + +struct Literal +{ + static bool is_number_start(unsigned char letter) { return std::isdigit(letter) != 0 || letter == '.'; } + static bool is_identifier_start(char letter) { return std::isalpha(letter) != 0 || letter == '`'; } + static bool is_char_start(char letter) { return letter == '\'' || '"' == letter; } + + static bool is_identifier_part(char letter) + { + return std::isdigit(letter) != 0 || std::isalpha(letter) != 0 || letter == '_'; + } +}; + +struct Token +{ + std::string text_; + TokenType type_; + int end_offset_; + + Token(std::string &text, TokenType type, int end_offset) : text_(text), type_(type), end_offset_(end_offset) {} +}; diff --git a/lib/sql/tokenizer.cpp b/lib/sql/tokenizer.cpp new file mode 100644 index 0000000..e69de29 diff --git a/lib/sql/tokenizer.h b/lib/sql/tokenizer.h new file mode 100644 index 0000000..2183ef6 --- /dev/null +++ b/lib/sql/tokenizer.h @@ -0,0 +1,20 @@ +#include +#include +#include + +#include "token.h" + +class SqlTokenizer +{ + std::string sql_; + int offset; + + int skip_whitespace(int start_offset); + int get_offset_until_terminated_char(char terminated, int start_offset); + Token scan_identifier(int start_offset); + +public: + SqlTokenizer(std::string sql); + std::vector tokenize(); + std::optional next_token(); +}; From a5734993798de111c42b031f9727b877ec3d1ea2 Mon Sep 17 00:00:00 2001 From: jobala Date: Sat, 14 Mar 2026 10:52:39 +0300 Subject: [PATCH 2/4] add tokenizer implementation --- lib/sql/token.h | 13 ++++++- lib/sql/tokenizer.cpp | 90 +++++++++++++++++++++++++++++++++++++++++++ lib/sql/tokenizer.h | 8 ++-- test/CMakeLists.txt | 3 ++ 4 files changed, 110 insertions(+), 4 deletions(-) diff --git a/lib/sql/token.h b/lib/sql/token.h index e112085..7a3e201 100644 --- a/lib/sql/token.h +++ b/lib/sql/token.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -14,8 +16,16 @@ enum class TokenType : std::uint8_t { IDENTIFIER }; -TokenType from_string(const std::string &token) +namespace Type +{ +inline TokenType from_string(std::string &token) { + // uppercase token + for (auto &tkn : token) + { + toupper(tkn); + } + static const std::unordered_map keywords = {{"SELECT", TokenType::SELECT}, {"FROM", TokenType::FROM}}; @@ -23,6 +33,7 @@ TokenType from_string(const std::string &token) return iter != keywords.end() ? iter->second : TokenType::IDENTIFIER; } +} // namespace Type struct Literal { static bool is_number_start(unsigned char letter) { return std::isdigit(letter) != 0 || letter == '.'; } diff --git a/lib/sql/tokenizer.cpp b/lib/sql/tokenizer.cpp index e69de29..b443013 100644 --- a/lib/sql/tokenizer.cpp +++ b/lib/sql/tokenizer.cpp @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include + +#include "tokenizer.h" + +SqlTokenizer::SqlTokenizer(const std::string &sql) : sql_(sql) {} + +auto SqlTokenizer::tokenize() -> std::vector +{ + std::vector res; + + auto token = next_token(); + while (token.has_value()) + { + res.push_back(token.value()); + token = next_token(); + } + + return res; +} + +auto SqlTokenizer::next_token() -> std::optional +{ + auto offset = skip_whitespace(offset_); + if (offset > (int)sql_.length()) + { + return std::nullopt; + } + + if (Literal::is_identifier_start(sql_[offset])) + { + auto token = scan_identifier(offset); + offset = token.end_offset_; + return token; + } + + if (Literal::is_number_start(sql_[offset])) + { + throw std::runtime_error("Not Implemented"); + } + + if (Literal::is_char_start(sql_[offset])) + { + throw std::runtime_error("Not Implemented"); + } + + return std::nullopt; +} +auto SqlTokenizer::skip_whitespace(int start_offset) -> int +{ + auto end_offset = start_offset; + while (end_offset < (int)sql_.size() && sql_[end_offset] == ' ') + { + end_offset += 1; + } + return end_offset; +} + +auto SqlTokenizer::scan_identifier(int start_offset) -> Token +{ + if (offset_ < (int)sql_.size() && '`' == sql_[offset_]) + { + auto end_offset = get_offset_until_terminated_char('`', start_offset); + auto text = sql_.substr(start_offset, end_offset); + return {text, TokenType::IDENTIFIER, end_offset + 1}; + } + + auto end_offset = start_offset; + while (end_offset < (int)sql_.size() && Literal::is_identifier_part(sql_[end_offset])) + { + end_offset += 1; + } + + auto text = sql_.substr(start_offset, end_offset); + auto token_type = Type::from_string(text); + return {text, token_type, end_offset + 1}; +} + +auto SqlTokenizer::get_offset_until_terminated_char(unsigned char terminated, int start_offset) -> int +{ + auto end_offset = start_offset; + while (end_offset < (int)sql_.size() && static_cast(sql_[end_offset]) != terminated) + { + end_offset += 1; + } + return end_offset; +} diff --git a/lib/sql/tokenizer.h b/lib/sql/tokenizer.h index 2183ef6..b48042a 100644 --- a/lib/sql/tokenizer.h +++ b/lib/sql/tokenizer.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -7,14 +9,14 @@ class SqlTokenizer { std::string sql_; - int offset; + int offset_; int skip_whitespace(int start_offset); - int get_offset_until_terminated_char(char terminated, int start_offset); + int get_offset_until_terminated_char(unsigned char terminated, int start_offset); Token scan_identifier(int start_offset); public: - SqlTokenizer(std::string sql); + SqlTokenizer(const std::string &sql); std::vector tokenize(); std::optional next_token(); }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c32adc2..fb9d2bb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,10 +2,13 @@ include(AddGoogleTest) add_executable(datasource_test datasource_test.cpp) add_executable(logical_plan_test logical_plan_test.cpp) +add_executable(sql_test sql_test.cpp) target_link_libraries(datasource_test PRIVATE gtest GTest::gtest_main datasource) target_link_libraries(logical_plan_test PRIVATE gtest GTest::gtest_main logical_plan datasource) +target_link_libraries(sql_test PRIVATE gtest GTest::gtest_main sql) include(GoogleTest) gtest_discover_tests(datasource_test) gtest_discover_tests(logical_plan_test) +gtest_discover_tests(sql_test) From 0e975e40adf9ae67c9851de69e9b955ce73381b2 Mon Sep 17 00:00:00 2001 From: jobala Date: Sat, 14 Mar 2026 12:32:32 +0300 Subject: [PATCH 3/4] test tokenizer --- lib/sql/token.h | 29 ++++++++++++++++++++--------- lib/sql/tokenizer.cpp | 33 +++++++++++++++++++++++++++++---- lib/sql/tokenizer.h | 1 + test/sql_test.cpp | 24 ++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 13 deletions(-) create mode 100644 test/sql_test.cpp diff --git a/lib/sql/token.h b/lib/sql/token.h index 7a3e201..3309ec2 100644 --- a/lib/sql/token.h +++ b/lib/sql/token.h @@ -1,6 +1,8 @@ #pragma once +#include #include +#include #include #include @@ -13,33 +15,42 @@ enum class TokenType : std::uint8_t { LONG, DOUBLE, STRING, - IDENTIFIER + IDENTIFIER, + + // Symbol + STAR }; namespace Type { -inline TokenType from_string(std::string &token) + +inline TokenType from_string(std::string token) { - // uppercase token - for (auto &tkn : token) - { - toupper(tkn); - } - static const std::unordered_map keywords = {{"SELECT", TokenType::SELECT}, - {"FROM", TokenType::FROM}}; + std::ranges::transform(token, token.begin(), [](unsigned char letter) { return std::toupper(letter); }); + + static const std::unordered_map keywords = { + {"SELECT", TokenType::SELECT}, {"FROM", TokenType::FROM}, {"*", TokenType::STAR}}; auto iter = keywords.find(token); return iter != keywords.end() ? iter->second : TokenType::IDENTIFIER; } } // namespace Type + struct Literal { static bool is_number_start(unsigned char letter) { return std::isdigit(letter) != 0 || letter == '.'; } static bool is_identifier_start(char letter) { return std::isalpha(letter) != 0 || letter == '`'; } static bool is_char_start(char letter) { return letter == '\'' || '"' == letter; } + static bool is_symbol_start(char letter) { return is_symbol(letter); } + static bool is_symbol(char letter) + { + std::set symbols{'*'}; + return symbols.contains(letter); + } + static bool is_identifier_part(char letter) { return std::isdigit(letter) != 0 || std::isalpha(letter) != 0 || letter == '_'; diff --git a/lib/sql/tokenizer.cpp b/lib/sql/tokenizer.cpp index b443013..1efdf0c 100644 --- a/lib/sql/tokenizer.cpp +++ b/lib/sql/tokenizer.cpp @@ -4,9 +4,10 @@ #include #include +#include "token.h" #include "tokenizer.h" -SqlTokenizer::SqlTokenizer(const std::string &sql) : sql_(sql) {} +SqlTokenizer::SqlTokenizer(const std::string &sql) : sql_(sql) { offset_ = 0; } auto SqlTokenizer::tokenize() -> std::vector { @@ -33,7 +34,14 @@ auto SqlTokenizer::next_token() -> std::optional if (Literal::is_identifier_start(sql_[offset])) { auto token = scan_identifier(offset); - offset = token.end_offset_; + offset_ = token.end_offset_; + return token; + } + + if (Literal::is_symbol_start(sql_[offset])) + { + auto token = scan_symbol(offset); + offset_ = token.end_offset_; return token; } @@ -49,9 +57,13 @@ auto SqlTokenizer::next_token() -> std::optional return std::nullopt; } + auto SqlTokenizer::skip_whitespace(int start_offset) -> int { auto end_offset = start_offset; + auto curr = sql_[end_offset]; + std::cout << curr; + while (end_offset < (int)sql_.size() && sql_[end_offset] == ' ') { end_offset += 1; @@ -64,7 +76,7 @@ auto SqlTokenizer::scan_identifier(int start_offset) -> Token if (offset_ < (int)sql_.size() && '`' == sql_[offset_]) { auto end_offset = get_offset_until_terminated_char('`', start_offset); - auto text = sql_.substr(start_offset, end_offset); + auto text = sql_.substr(start_offset, end_offset - start_offset); return {text, TokenType::IDENTIFIER, end_offset + 1}; } @@ -74,7 +86,20 @@ auto SqlTokenizer::scan_identifier(int start_offset) -> Token end_offset += 1; } - auto text = sql_.substr(start_offset, end_offset); + auto text = sql_.substr(start_offset, end_offset - start_offset); + auto token_type = Type::from_string(text); + return {text, token_type, end_offset + 1}; +} + +auto SqlTokenizer::scan_symbol(int start_offset) -> Token +{ + auto end_offset = start_offset; + while (end_offset < (int)sql_.size() && Literal::is_symbol(sql_[end_offset])) + { + end_offset += 1; + } + + auto text = sql_.substr(start_offset, end_offset - start_offset); auto token_type = Type::from_string(text); return {text, token_type, end_offset + 1}; } diff --git a/lib/sql/tokenizer.h b/lib/sql/tokenizer.h index b48042a..f54f232 100644 --- a/lib/sql/tokenizer.h +++ b/lib/sql/tokenizer.h @@ -14,6 +14,7 @@ class SqlTokenizer int skip_whitespace(int start_offset); int get_offset_until_terminated_char(unsigned char terminated, int start_offset); Token scan_identifier(int start_offset); + Token scan_symbol(int start_offset); public: SqlTokenizer(const std::string &sql); diff --git a/test/sql_test.cpp b/test/sql_test.cpp new file mode 100644 index 0000000..4552e34 --- /dev/null +++ b/test/sql_test.cpp @@ -0,0 +1,24 @@ +#include +#include +#include + +#include "token.h" +#include "tokenizer.h" + +TEST(Tokenizer, tokenize_sql_string) +{ + std::string query = "select * from users"; + + SqlTokenizer tokenizer(query); + auto tokens = tokenizer.tokenize(); + + std::vector res{"select", "*", "from", "users"}; + std::vector token_types{TokenType::SELECT, TokenType::STAR, TokenType::FROM, TokenType::IDENTIFIER}; + + ASSERT_EQ(4, tokens.size()); + for (int i = 0; i < (int)tokens.size(); i++) + { + ASSERT_EQ(res[i], tokens[i].text_); + ASSERT_EQ(token_types[i], tokens[i].type_); + } +} From 8a490b7c014fdbd678bedc3875f713477ad69ed4 Mon Sep 17 00:00:00 2001 From: jobala Date: Sat, 14 Mar 2026 13:06:11 +0300 Subject: [PATCH 4/4] handle projection in sql string --- lib/sql/token.h | 7 ++++--- lib/sql/tokenizer.cpp | 4 ++-- test/sql_test.cpp | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/lib/sql/token.h b/lib/sql/token.h index 3309ec2..1fd4516 100644 --- a/lib/sql/token.h +++ b/lib/sql/token.h @@ -18,7 +18,8 @@ enum class TokenType : std::uint8_t { IDENTIFIER, // Symbol - STAR + STAR, + COMMA, }; namespace Type @@ -30,7 +31,7 @@ inline TokenType from_string(std::string token) std::ranges::transform(token, token.begin(), [](unsigned char letter) { return std::toupper(letter); }); static const std::unordered_map keywords = { - {"SELECT", TokenType::SELECT}, {"FROM", TokenType::FROM}, {"*", TokenType::STAR}}; + {"SELECT", TokenType::SELECT}, {"FROM", TokenType::FROM}, {"*", TokenType::STAR}, {",", TokenType::COMMA}}; auto iter = keywords.find(token); return iter != keywords.end() ? iter->second : TokenType::IDENTIFIER; @@ -47,7 +48,7 @@ struct Literal static bool is_symbol_start(char letter) { return is_symbol(letter); } static bool is_symbol(char letter) { - std::set symbols{'*'}; + std::set symbols{'*', ','}; return symbols.contains(letter); } diff --git a/lib/sql/tokenizer.cpp b/lib/sql/tokenizer.cpp index 1efdf0c..3632489 100644 --- a/lib/sql/tokenizer.cpp +++ b/lib/sql/tokenizer.cpp @@ -88,7 +88,7 @@ auto SqlTokenizer::scan_identifier(int start_offset) -> Token auto text = sql_.substr(start_offset, end_offset - start_offset); auto token_type = Type::from_string(text); - return {text, token_type, end_offset + 1}; + return {text, token_type, end_offset}; } auto SqlTokenizer::scan_symbol(int start_offset) -> Token @@ -101,7 +101,7 @@ auto SqlTokenizer::scan_symbol(int start_offset) -> Token auto text = sql_.substr(start_offset, end_offset - start_offset); auto token_type = Type::from_string(text); - return {text, token_type, end_offset + 1}; + return {text, token_type, end_offset}; } auto SqlTokenizer::get_offset_until_terminated_char(unsigned char terminated, int start_offset) -> int diff --git a/test/sql_test.cpp b/test/sql_test.cpp index 4552e34..111b9ac 100644 --- a/test/sql_test.cpp +++ b/test/sql_test.cpp @@ -22,3 +22,23 @@ TEST(Tokenizer, tokenize_sql_string) ASSERT_EQ(token_types[i], tokens[i].type_); } } + +TEST(Tokenizer, tokenize_projected_sql_string) +{ + std::string query = "select name, age from users"; + + SqlTokenizer tokenizer(query); + auto tokens = tokenizer.tokenize(); + + std::vector res{"select", "name", ",", "age", "from", "users"}; + std::vector token_types{TokenType::SELECT, TokenType::IDENTIFIER, TokenType::COMMA, + TokenType::IDENTIFIER, TokenType::FROM, TokenType::IDENTIFIER}; + + ASSERT_EQ(6, tokens.size()); + for (int i = 0; i < (int)tokens.size(); i++) + { + std::cout << tokens[i].text_ << "\n"; + ASSERT_EQ(res[i], tokens[i].text_); + ASSERT_EQ(token_types[i], tokens[i].type_); + } +}