diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6970eb4..3021bd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ find_package(Arrow REQUIRED)
 
 add_subdirectory(lib/types)
 add_subdirectory(lib/datasource)
 add_subdirectory(lib/logical-plan)
+add_subdirectory(lib/sql)
 
 add_subdirectory(test)
diff --git a/lib/sql/CMakeLists.txt b/lib/sql/CMakeLists.txt
new file mode 100644
index 0000000..98353d5
--- /dev/null
+++ b/lib/sql/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(sql tokenizer.cpp)
+target_include_directories(sql PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/lib/sql/token.h b/lib/sql/token.h
new file mode 100644
index 0000000..1fd4516
--- /dev/null
+++ b/lib/sql/token.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <set>
+#include <string>
+#include <unordered_map>
+
+// Token categories produced by the SQL tokenizer.
+enum class TokenType : std::uint8_t {
+    // keywords
+    SELECT,
+    FROM,
+
+    // literals
+    LONG,
+    DOUBLE,
+    STRING,
+    IDENTIFIER,
+
+    // symbols
+    STAR,
+    COMMA,
+};
+
+namespace Type
+{
+
+// Maps a token string (case-insensitive) to its TokenType; words that are
+// not keywords fall back to IDENTIFIER.
+inline TokenType from_string(std::string token)
+{
+    std::ranges::transform(token, token.begin(), [](unsigned char letter) { return std::toupper(letter); });
+
+    static const std::unordered_map<std::string, TokenType> keywords = {
+        {"SELECT", TokenType::SELECT}, {"FROM", TokenType::FROM}, {"*", TokenType::STAR}, {",", TokenType::COMMA}};
+
+    auto iter = keywords.find(token);
+    return iter != keywords.end() ? iter->second : TokenType::IDENTIFIER;
+}
+
+} // namespace Type
+
+// Character classes used by the tokenizer to pick a scanning routine.
+struct Literal
+{
+    static bool is_number_start(unsigned char letter) { return std::isdigit(letter) != 0 || letter == '.'; }
+    // Cast to unsigned char before std::isalpha: passing a negative char is UB.
+    static bool is_identifier_start(char letter)
+    {
+        return std::isalpha(static_cast<unsigned char>(letter)) != 0 || letter == '`';
+    }
+    static bool is_char_start(char letter) { return letter == '\'' || letter == '"'; }
+
+    static bool is_symbol_start(char letter) { return is_symbol(letter); }
+    static bool is_symbol(char letter)
+    {
+        // static: building a std::set on every call is wasted work.
+        static const std::set<char> symbols{'*', ','};
+        return symbols.contains(letter);
+    }
+
+    static bool is_identifier_part(char letter)
+    {
+        const auto uchar = static_cast<unsigned char>(letter);
+        return std::isdigit(uchar) != 0 || std::isalpha(uchar) != 0 || letter == '_';
+    }
+};
+
+// One lexed token: its text, category, and the offset one past its final
+// character in the input (used by the tokenizer to resume scanning).
+struct Token
+{
+    std::string text_;
+    TokenType type_;
+    int end_offset_;
+
+    // Takes text by value (movable); the original non-const reference
+    // rejected temporaries.
+    Token(std::string text, TokenType type, int end_offset)
+        : text_(std::move(text)), type_(type), end_offset_(end_offset)
+    {
+    }
+};
diff --git a/lib/sql/tokenizer.cpp b/lib/sql/tokenizer.cpp
new file mode 100644
index 0000000..3632489
--- /dev/null
+++ b/lib/sql/tokenizer.cpp
@@ -0,0 +1,120 @@
+#include <cctype>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "token.h"
+#include "tokenizer.h"
+
+SqlTokenizer::SqlTokenizer(const std::string &sql) : sql_(sql), offset_(0) {}
+
+// Collects every token until next_token() is exhausted.
+auto SqlTokenizer::tokenize() -> std::vector<Token>
+{
+    std::vector<Token> res;
+
+    auto token = next_token();
+    while (token.has_value())
+    {
+        res.push_back(std::move(token.value()));
+        token = next_token();
+    }
+
+    return res;
+}
+
+// Returns the next token, or std::nullopt at end of input.
+// Number and quoted-string literals are not implemented yet.
+auto SqlTokenizer::next_token() -> std::optional<Token>
+{
+    auto offset = skip_whitespace(offset_);
+    // ">=": at offset == length there is nothing left to scan; the original
+    // "> length" comparison indexed sql_[length] in the checks below.
+    if (offset >= (int)sql_.length())
+    {
+        return std::nullopt;
+    }
+
+    if (Literal::is_identifier_start(sql_[offset]))
+    {
+        auto token = scan_identifier(offset);
+        offset_ = token.end_offset_;
+        return token;
+    }
+
+    if (Literal::is_symbol_start(sql_[offset]))
+    {
+        auto token = scan_symbol(offset);
+        offset_ = token.end_offset_;
+        return token;
+    }
+
+    if (Literal::is_number_start(sql_[offset]))
+    {
+        throw std::runtime_error("Not Implemented");
+    }
+
+    if (Literal::is_char_start(sql_[offset]))
+    {
+        throw std::runtime_error("Not Implemented");
+    }
+
+    return std::nullopt;
+}
+
+// Advances past whitespace (spaces, tabs, newlines); may return sql_.size().
+auto SqlTokenizer::skip_whitespace(int start_offset) -> int
+{
+    auto end_offset = start_offset;
+    while (end_offset < (int)sql_.size() && std::isspace(static_cast<unsigned char>(sql_[end_offset])) != 0)
+    {
+        end_offset += 1;
+    }
+    return end_offset;
+}
+
+// Scans a plain or backtick-quoted identifier; keywords are classified
+// via Type::from_string.
+auto SqlTokenizer::scan_identifier(int start_offset) -> Token
+{
+    if ('`' == sql_[start_offset])
+    {
+        // Find the closing backtick *after* the opening one; the token text
+        // excludes both quotes. (Searching from start_offset matched the
+        // opening quote immediately and produced an empty token.)
+        auto end_offset = get_offset_until_terminated_char('`', start_offset + 1);
+        auto text = sql_.substr(start_offset + 1, end_offset - start_offset - 1);
+        return {text, TokenType::IDENTIFIER, end_offset + 1};
+    }
+
+    auto end_offset = start_offset;
+    while (end_offset < (int)sql_.size() && Literal::is_identifier_part(sql_[end_offset]))
+    {
+        end_offset += 1;
+    }
+
+    auto text = sql_.substr(start_offset, end_offset - start_offset);
+    auto token_type = Type::from_string(text);
+    return {text, token_type, end_offset};
+}
+
+// Symbols are single-character tokens; scanning a run would merge adjacent
+// symbols such as ",," into one token from_string cannot classify.
+auto SqlTokenizer::scan_symbol(int start_offset) -> Token
+{
+    auto text = sql_.substr(start_offset, 1);
+    return {text, Type::from_string(text), start_offset + 1};
+}
+
+// Returns the offset of the first `terminated` at or after start_offset,
+// or sql_.size() if it never appears.
+auto SqlTokenizer::get_offset_until_terminated_char(unsigned char terminated, int start_offset) -> int
+{
+    auto end_offset = start_offset;
+    while (end_offset < (int)sql_.size() && static_cast<unsigned char>(sql_[end_offset]) != terminated)
+    {
+        end_offset += 1;
+    }
+    return end_offset;
+}
diff --git a/lib/sql/tokenizer.h b/lib/sql/tokenizer.h
new file mode 100644
index 0000000..f54f232
--- /dev/null
+++ b/lib/sql/tokenizer.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "token.h"
+
+// Streaming tokenizer over a SQL string: call next_token() repeatedly, or
+// tokenize() to drain the whole input at once.
+class SqlTokenizer
+{
+    std::string sql_;
+    int offset_;
+
+    int skip_whitespace(int start_offset);
+    int get_offset_until_terminated_char(unsigned char terminated, int start_offset);
+    Token scan_identifier(int start_offset);
+    Token scan_symbol(int start_offset);
+
+public:
+    explicit SqlTokenizer(const std::string &sql);
+    std::vector<Token> tokenize();
+    std::optional<Token> next_token();
+};
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c32adc2..fb9d2bb 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,10 +2,13 @@ include(AddGoogleTest)
 
 add_executable(datasource_test datasource_test.cpp)
 add_executable(logical_plan_test logical_plan_test.cpp)
+add_executable(sql_test sql_test.cpp)
 
 target_link_libraries(datasource_test PRIVATE gtest GTest::gtest_main datasource)
 target_link_libraries(logical_plan_test PRIVATE gtest GTest::gtest_main logical_plan datasource)
+target_link_libraries(sql_test PRIVATE gtest GTest::gtest_main sql)
 
 include(GoogleTest)
 gtest_discover_tests(datasource_test)
 gtest_discover_tests(logical_plan_test)
+gtest_discover_tests(sql_test)
diff --git a/test/sql_test.cpp b/test/sql_test.cpp
new file mode 100644
index 0000000..111b9ac
--- /dev/null
+++ b/test/sql_test.cpp
@@ -0,0 +1,44 @@
+#include <gtest/gtest.h>
+
+#include <string>
+#include <vector>
+
+#include "token.h"
+#include "tokenizer.h"
+
+TEST(Tokenizer, tokenize_sql_string)
+{
+    std::string query = "select * from users";
+
+    SqlTokenizer tokenizer(query);
+    auto tokens = tokenizer.tokenize();
+
+    std::vector<std::string> res{"select", "*", "from", "users"};
+    std::vector<TokenType> token_types{TokenType::SELECT, TokenType::STAR, TokenType::FROM, TokenType::IDENTIFIER};
+
+    ASSERT_EQ(4, tokens.size());
+    for (int i = 0; i < (int)tokens.size(); i++)
+    {
+        ASSERT_EQ(res[i], tokens[i].text_);
+        ASSERT_EQ(token_types[i], tokens[i].type_);
+    }
+}
+
+TEST(Tokenizer, tokenize_projected_sql_string)
+{
+    std::string query = "select name, age from users";
+
+    SqlTokenizer tokenizer(query);
+    auto tokens = tokenizer.tokenize();
+
+    std::vector<std::string> res{"select", "name", ",", "age", "from", "users"};
+    std::vector<TokenType> token_types{TokenType::SELECT, TokenType::IDENTIFIER, TokenType::COMMA,
+                                       TokenType::IDENTIFIER, TokenType::FROM, TokenType::IDENTIFIER};
+
+    ASSERT_EQ(6, tokens.size());
+    for (int i = 0; i < (int)tokens.size(); i++)
+    {
+        ASSERT_EQ(res[i], tokens[i].text_);
+        ASSERT_EQ(token_types[i], tokens[i].type_);
+    }
+}