Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ find_package(Arrow REQUIRED)
add_subdirectory(lib/types)
add_subdirectory(lib/datasource)
add_subdirectory(lib/logical-plan)
add_subdirectory(lib/sql)
add_subdirectory(test)


Expand Down
2 changes: 2 additions & 0 deletions lib/sql/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# sql: the SQL tokenizer library. PUBLIC include dir so dependents (e.g. the
# tests) can #include "tokenizer.h" directly.
add_library(sql tokenizer.cpp)
target_include_directories(sql PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
68 changes: 68 additions & 0 deletions lib/sql/token.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#pragma once

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
enum class TokenType : std::uint8_t {
// keywords
SELECT,
FROM,

// literals
LONG,
DOUBLE,
STRING,
IDENTIFIER,

// Symbol
STAR,
COMMA,
};

namespace Type
{

inline TokenType from_string(std::string token)
{

std::ranges::transform(token, token.begin(), [](unsigned char letter) { return std::toupper(letter); });
Comment on lines +3 to +31

static const std::unordered_map<std::string, TokenType> keywords = {
{"SELECT", TokenType::SELECT}, {"FROM", TokenType::FROM}, {"*", TokenType::STAR}, {",", TokenType::COMMA}};

auto iter = keywords.find(token);
return iter != keywords.end() ? iter->second : TokenType::IDENTIFIER;
}

} // namespace Type

// Character-classification helpers the tokenizer uses to pick a scanner.
// Every <cctype> call goes through unsigned char first: passing a plain
// (possibly negative) char to isalpha/isdigit is undefined behavior.
struct Literal
{
    // A number starts with a digit or a leading decimal point (".5").
    static bool is_number_start(unsigned char letter) { return std::isdigit(letter) != 0 || letter == '.'; }

    // An identifier starts with a letter, or a backtick for quoted identifiers.
    static bool is_identifier_start(char letter)
    {
        return std::isalpha(static_cast<unsigned char>(letter)) != 0 || letter == '`';
    }

    // String literals may open with a single or double quote.
    static bool is_char_start(char letter) { return letter == '\'' || '"' == letter; }

    static bool is_symbol_start(char letter) { return is_symbol(letter); }

    // Direct comparison: the original built a std::set on every call and used
    // C++20 set::contains for what is a two-character membership test.
    static bool is_symbol(char letter) { return letter == '*' || letter == ','; }

    // Identifier tails may contain letters, digits and underscores.
    static bool is_identifier_part(char letter)
    {
        const auto uletter = static_cast<unsigned char>(letter);
        return std::isdigit(uletter) != 0 || std::isalpha(uletter) != 0 || letter == '_';
    }
};

// A single lexical token: its text, its classification, and the offset one
// past its last character in the input (where scanning resumes).
struct Token
{
    std::string text_;
    TokenType type_;
    int end_offset_;

    // Sink parameter by value + move: binds lvalues AND temporaries. The
    // original non-const lvalue reference rejected temporaries and signalled
    // mutation that never happened.
    Token(std::string text, TokenType type, int end_offset)
        : text_(std::move(text)), type_(type), end_offset_(end_offset)
    {
    }
};
115 changes: 115 additions & 0 deletions lib/sql/tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

#include "token.h"
#include "tokenizer.h"

SqlTokenizer::SqlTokenizer(const std::string &sql) : sql_(sql) { offset_ = 0; }

// Drains the tokenizer: pulls tokens one at a time until the input is
// exhausted and returns them in order.
auto SqlTokenizer::tokenize() -> std::vector<Token>
{
    std::vector<Token> tokens;
    for (auto tok = next_token(); tok.has_value(); tok = next_token())
    {
        tokens.push_back(tok.value());
    }
    return tokens;
}

// Scans the next token starting at offset_, skipping leading whitespace.
// Returns std::nullopt at end of input or on an unrecognized character.
// Numeric and quoted-string literals are not implemented yet and throw
// std::runtime_error.
auto SqlTokenizer::next_token() -> std::optional<Token>
{
    auto offset = skip_whitespace(offset_);
    // >= rather than >: offset == length already means the input is
    // exhausted; the original '>' only worked by reading sql_[length].
    if (offset >= (int)sql_.length())
    {
        return std::nullopt;
    }

    if (Literal::is_identifier_start(sql_[offset]))
    {
        auto token = scan_identifier(offset);
        offset_ = token.end_offset_;
        return token;
    }

    if (Literal::is_symbol_start(sql_[offset]))
    {
        auto token = scan_symbol(offset);
        offset_ = token.end_offset_;
        return token;
    }

    if (Literal::is_number_start(sql_[offset]))
    {
        throw std::runtime_error("Not Implemented");
    }

    if (Literal::is_char_start(sql_[offset]))
    {
        throw std::runtime_error("Not Implemented");
    }

    // Unknown character: treated as end of input.
    return std::nullopt;
}

// Returns the offset of the first non-whitespace character at or after
// start_offset (sql_.size() if there is none).
auto SqlTokenizer::skip_whitespace(int start_offset) -> int
{
    // Removed: a leftover debug `std::cout << sql_[end_offset]` that printed
    // to stdout and read sql_ before the bounds check.
    auto end_offset = start_offset;
    // isspace (via unsigned char, to avoid UB on negative chars) also skips
    // tabs and newlines; the original ' '-only check made any tab/newline
    // silently terminate tokenization.
    while (end_offset < (int)sql_.size() && std::isspace(static_cast<unsigned char>(sql_[end_offset])) != 0)
    {
        end_offset += 1;
    }
    return end_offset;
}

// Scans an identifier starting at start_offset. Backtick-quoted identifiers
// ("`my col`") yield their inner text (quotes stripped); plain identifiers
// are matched against the keyword table, so "select" comes back as SELECT.
auto SqlTokenizer::scan_identifier(int start_offset) -> Token
{
    // Fix: the original tested sql_[offset_], but offset_ still points at the
    // whitespace BEFORE this token, so quoted identifiers preceded by a space
    // were never detected.
    if (start_offset < (int)sql_.size() && '`' == sql_[start_offset])
    {
        // Fix: search for the closing backtick AFTER the opening one. The
        // original searched from start_offset, matched the opening backtick
        // immediately, and produced an empty token.
        auto end_offset = get_offset_until_terminated_char('`', start_offset + 1);
        auto text = sql_.substr(start_offset + 1, end_offset - start_offset - 1);
        // end_offset + 1 skips the closing backtick when scanning resumes.
        return {text, TokenType::IDENTIFIER, end_offset + 1};
    }

    auto end_offset = start_offset;
    while (end_offset < (int)sql_.size() && Literal::is_identifier_part(sql_[end_offset]))
    {
        end_offset += 1;
    }

    auto text = sql_.substr(start_offset, end_offset - start_offset);
    auto token_type = Type::from_string(text);
    return {text, token_type, end_offset};
}

// Scans a single symbol token ('*' or ',') at start_offset.
auto SqlTokenizer::scan_symbol(int start_offset) -> Token
{
    // Symbols are one-character tokens, so consume exactly one char. The
    // original greedily consumed a run of symbol characters, fusing adjacent
    // symbols (e.g. "*,") into one string that from_string could only
    // misclassify as IDENTIFIER.
    auto text = sql_.substr(start_offset, 1);
    auto token_type = Type::from_string(text);
    return {text, token_type, start_offset + 1};
}

// Returns the offset of the first occurrence of `terminated` at or after
// start_offset, or sql_.size() if the character never appears.
auto SqlTokenizer::get_offset_until_terminated_char(unsigned char terminated, int start_offset) -> int
{
    const auto pos = sql_.find(static_cast<char>(terminated), start_offset);
    return pos == std::string::npos ? (int)sql_.size() : (int)pos;
}
23 changes: 23 additions & 0 deletions lib/sql/tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#pragma once

#include <optional>
#include <string>
#include <vector>

#include "token.h"

/// Splits a SQL string into Tokens. Construct with the query text, then call
/// tokenize() for all tokens at once, or next_token() to pull them lazily.
class SqlTokenizer
{
    std::string sql_;
    // Current scan position. Default member initializer guarantees the field
    // is never read uninitialized, whatever the constructor does.
    int offset_ = 0;

    // Offset of the first non-whitespace char at/after start_offset.
    int skip_whitespace(int start_offset);
    // Offset of `terminated` at/after start_offset, or sql_.size() if absent.
    int get_offset_until_terminated_char(unsigned char terminated, int start_offset);
    Token scan_identifier(int start_offset);
    Token scan_symbol(int start_offset);

public:
    // explicit: a std::string must not silently convert into a tokenizer.
    explicit SqlTokenizer(const std::string &sql);
    // Consumes the remaining input and returns every token in order.
    std::vector<Token> tokenize();
    // Next token, or std::nullopt once the input is exhausted.
    std::optional<Token> next_token();
};
3 changes: 3 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@ include(AddGoogleTest)

# One test binary per library under test; each links gtest plus the library
# it exercises, and gtest_discover_tests registers every TEST with CTest.
add_executable(datasource_test datasource_test.cpp)
add_executable(logical_plan_test logical_plan_test.cpp)
add_executable(sql_test sql_test.cpp)

target_link_libraries(datasource_test PRIVATE gtest GTest::gtest_main datasource)
target_link_libraries(logical_plan_test PRIVATE gtest GTest::gtest_main logical_plan datasource)
target_link_libraries(sql_test PRIVATE gtest GTest::gtest_main sql)

include(GoogleTest)
gtest_discover_tests(datasource_test)
gtest_discover_tests(logical_plan_test)
gtest_discover_tests(sql_test)
44 changes: 44 additions & 0 deletions test/sql_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include <gtest/gtest.h>
#include <string>
#include <vector>

#include "token.h"
#include "tokenizer.h"

// Smoke test: a minimal SELECT * query is split into the expected token
// texts and token types, in order.
TEST(Tokenizer, tokenize_sql_string)
{
    std::string query = "select * from users";

    SqlTokenizer tokenizer(query);
    auto tokens = tokenizer.tokenize();

    std::vector<std::string> res{"select", "*", "from", "users"};
    std::vector<TokenType> token_types{TokenType::SELECT, TokenType::STAR, TokenType::FROM, TokenType::IDENTIFIER};

    ASSERT_EQ(4U, tokens.size());
    // size_t index: avoids the signed/unsigned C-style cast of the original.
    for (std::size_t i = 0; i < tokens.size(); i++)
    {
        ASSERT_EQ(res[i], tokens[i].text_);
        ASSERT_EQ(token_types[i], tokens[i].type_);
    }
}

// Verifies a comma-separated projection list yields alternating IDENTIFIER
// and COMMA tokens. The leftover debug std::cout is removed: it cluttered
// test output and relied on <iostream>, which this file never includes.
TEST(Tokenizer, tokenize_projected_sql_string)
{
    std::string query = "select name, age from users";

    SqlTokenizer tokenizer(query);
    auto tokens = tokenizer.tokenize();

    std::vector<std::string> res{"select", "name", ",", "age", "from", "users"};
    std::vector<TokenType> token_types{TokenType::SELECT, TokenType::IDENTIFIER, TokenType::COMMA,
                                       TokenType::IDENTIFIER, TokenType::FROM, TokenType::IDENTIFIER};

    ASSERT_EQ(6U, tokens.size());
    for (std::size_t i = 0; i < tokens.size(); i++)
    {
        ASSERT_EQ(res[i], tokens[i].text_);
        ASSERT_EQ(token_types[i], tokens[i].type_);
    }
}
Loading