From 21ac3ff2932174152265e8ce3888c92e9763af80 Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 24 Jun 2025 19:24:23 +0100 Subject: [PATCH 1/3] Add tokenizer using logos --- Cargo.toml | 4 + docs/parser-plan.md | 5 +- src/lib.rs | 2 + src/tokenizer.rs | 237 ++++++++++++++++++++++++++++++++++++++++++++ tests/tokenizer.rs | 30 ++++++ 5 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 src/tokenizer.rs create mode 100644 tests/tokenizer.rs diff --git a/Cargo.toml b/Cargo.toml index 98929746..f63f20ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,10 @@ edition = "2024" rowan = { version = "0.15", default-features = false } num-derive = { version = "0.4", default-features = false } num-traits = { version = "0.2", default-features = false, features = ["std"] } +logos = { version = "0.13", default-features = false, features = ["export_derive"] } + +[dev-dependencies] +rstest = "0.18" [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/docs/parser-plan.md b/docs/parser-plan.md index 77f695f9..f64bd403 100644 --- a/docs/parser-plan.md +++ b/docs/parser-plan.md @@ -33,7 +33,10 @@ transparently. Use `chumsky`'s text utilities (or integrate a `logos` lexer if preferred) to convert the source text into a stream of `(SyntaxKind, Span)` pairs. Each span records byte offsets so that the resulting CST can precisely mirror the input. -Whitespace and comments should produce tokens so they can be preserved. +Whitespace and comments should produce tokens so they can be preserved. The +current implementation opts for a small `logos` lexer because it keeps the token +definitions declarative while still interoperating smoothly with `chumsky` +parsers. ## 4. 
Construct the Parser with `chumsky` diff --git a/src/lib.rs b/src/lib.rs index a02f3654..935320b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,5 +5,7 @@ #![forbid(unsafe_code)] pub mod language; +pub mod tokenizer; pub use language::{DdlogLanguage, SyntaxKind}; +pub use tokenizer::{Span, tokenize}; diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 00000000..c8d36503 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,237 @@ +//! Lexical analysis for `DDlog` source. +//! +//! This module exposes a `tokenize` function which converts raw source text into +//! a sequence of `(SyntaxKind, Span)` pairs. It uses the `logos` crate to +//! recognise tokens so that the CST can mirror the input exactly. + +use logos::Logos; + +use crate::SyntaxKind; + +/// Byte range for a token within the source. +pub type Span = std::ops::Range; + +#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] +enum Token { + #[regex(r"[ \t\r\n]+")] + Whitespace, + #[regex(r"/\*([^*]|\*[^/])*\*/", priority = 2)] + #[regex(r"//[^\n]*")] + Comment, + #[regex(r"[A-Za-z_][A-Za-z0-9_]*")] + Ident, + #[regex(r"[0-9]+")] + Number, + #[regex(r#""([^"\\]|\\.)*""#)] + String, + #[token("(")] + LParen, + #[token(")")] + RParen, + #[token("{")] + LBrace, + #[token("}")] + RBrace, + #[token("[")] + LBracket, + #[token("]")] + RBracket, + #[token(";")] + Semi, + #[token(",")] + Comma, + #[token(".")] + Dot, + #[token("::")] + ColonColon, + #[token(":")] + Colon, + #[token("|")] + Pipe, + #[token("&")] + Amp, + #[token("==")] + EqEq, + #[token("=")] + Eq, + #[token(":-")] + Implies, + #[token("%")] + Percent, + #[token("*")] + Star, + #[token("/")] + Slash, + #[token("+")] + Plus, + #[token("-")] + Minus, + #[token("->")] + Arrow, + #[token("=>")] + FatArrow, + #[token("<=")] + Lte, + #[token("<=>")] + Spaceship, + #[token(">=")] + Gte, + #[token("<")] + Lt, + #[token(">")] + Gt, + #[token("!=")] + Neq, + #[token(">>")] + Shr, + #[token("<<")] + Shl, + #[token("~")] + Tilde, + #[token("@")] + 
At, + #[token("#")] + Hash, + #[token("'")] + Apostrophe, +} + +fn keyword_kind(ident: &str) -> Option { + Some(match ident { + "abstract" => SyntaxKind::K_ABSTRACT, + "Aggregate" => SyntaxKind::K_AGGREGATE, + "and" => SyntaxKind::K_AND, + "apply" => SyntaxKind::K_APPLY, + "as" => SyntaxKind::K_AS, + "async" => SyntaxKind::K_ASYNC, + "await" => SyntaxKind::K_AWAIT, + "become" => SyntaxKind::K_BECOME, + "bigint" => SyntaxKind::K_BIGINT, + "bit" => SyntaxKind::K_BIT, + "bool" => SyntaxKind::K_BOOL, + "box" => SyntaxKind::K_BOX, + "break" => SyntaxKind::K_BREAK, + "const" => SyntaxKind::K_CONST, + "continue" => SyntaxKind::K_CONTINUE, + "crate" => SyntaxKind::K_CRATE, + "do" => SyntaxKind::K_DO, + "double" => SyntaxKind::K_DOUBLE, + "dyn" => SyntaxKind::K_DYN, + "else" => SyntaxKind::K_ELSE, + "extern" => SyntaxKind::K_EXTERN, + "false" => SyntaxKind::K_FALSE, + "final" => SyntaxKind::K_FINAL, + "fn" => SyntaxKind::K_FN, + "FlatMap" => SyntaxKind::K_FLATMAP, + "float" => SyntaxKind::K_FLOAT, + "for" => SyntaxKind::K_FOR, + "function" => SyntaxKind::K_FUNCTION, + "if" => SyntaxKind::K_IF, + "impl" => SyntaxKind::K_IMPL, + "import" => SyntaxKind::K_IMPORT, + "in" => SyntaxKind::K_IN, + "input" => SyntaxKind::K_INPUT, + "Inspect" => SyntaxKind::K_INSPECT, + "let" => SyntaxKind::K_LET, + "loop" => SyntaxKind::K_LOOP, + "macro" => SyntaxKind::K_MACRO, + "match" => SyntaxKind::K_MATCH, + "mod" => SyntaxKind::K_MOD, + "move" => SyntaxKind::K_MOVE, + "multiset" => SyntaxKind::K_MULTISET, + "mut" => SyntaxKind::K_MUT, + "not" => SyntaxKind::K_NOT, + "or" => SyntaxKind::K_OR, + "override" => SyntaxKind::K_OVERRIDE, + "output" => SyntaxKind::K_OUTPUT, + "priv" => SyntaxKind::K_PRIV, + "pub" => SyntaxKind::K_PUB, + "ref" => SyntaxKind::K_REF, + "relation" => SyntaxKind::K_RELATION, + "return" => SyntaxKind::K_RETURN, + "self" => SyntaxKind::K_SELF, + "Self" => SyntaxKind::K_SELF_TYPE, + "signed" => SyntaxKind::K_SIGNED, + "skip" => SyntaxKind::K_SKIP, + "static" => 
SyntaxKind::K_STATIC, + "stream" => SyntaxKind::K_STREAM, + "struct" => SyntaxKind::K_STRUCT, + "super" => SyntaxKind::K_SUPER, + "trait" => SyntaxKind::K_TRAIT, + "transformer" => SyntaxKind::K_TRANSFORMER, + "try" => SyntaxKind::K_TRY, + "true" => SyntaxKind::K_TRUE, + "type" => SyntaxKind::K_TYPE, + "typedef" => SyntaxKind::K_TYPEDEF, + "typeof" => SyntaxKind::K_TYPEOF, + "_" => SyntaxKind::K_UNDERSCORE, + "unsafe" => SyntaxKind::K_UNSAFE, + "unsized" => SyntaxKind::K_UNSIZED, + "use" => SyntaxKind::K_USE, + "var" => SyntaxKind::K_VAR, + "virtual" => SyntaxKind::K_VIRTUAL, + "where" => SyntaxKind::K_WHERE, + "while" => SyntaxKind::K_WHILE, + "yield" => SyntaxKind::K_YIELD, + _ => return None, + }) +} + +/// Tokenise the provided `DDlog` source. +#[must_use] +pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> { + let mut lexer = Token::lexer(src); + let mut out = Vec::new(); + while let Some(result) = lexer.next() { + let span = lexer.span(); + let text = src.get(span.clone()).unwrap_or(""); + let Ok(token) = result else { + out.push((SyntaxKind::N_ERROR, span)); + continue; + }; + let kind = match token { + Token::Whitespace => SyntaxKind::T_WHITESPACE, + Token::Comment => SyntaxKind::T_COMMENT, + Token::Ident => keyword_kind(text).unwrap_or(SyntaxKind::T_IDENT), + Token::Number => SyntaxKind::T_NUMBER, + Token::String => SyntaxKind::T_STRING, + Token::LParen => SyntaxKind::T_LPAREN, + Token::RParen => SyntaxKind::T_RPAREN, + Token::LBrace => SyntaxKind::T_LBRACE, + Token::RBrace => SyntaxKind::T_RBRACE, + Token::LBracket => SyntaxKind::T_LBRACKET, + Token::RBracket => SyntaxKind::T_RBRACKET, + Token::Semi => SyntaxKind::T_SEMI, + Token::Comma => SyntaxKind::T_COMMA, + Token::Dot => SyntaxKind::T_DOT, + Token::ColonColon => SyntaxKind::T_COLON_COLON, + Token::Colon => SyntaxKind::T_COLON, + Token::Pipe => SyntaxKind::T_PIPE, + Token::Amp => SyntaxKind::T_AMP, + Token::EqEq => SyntaxKind::T_EQEQ, + Token::Eq => SyntaxKind::T_EQ, + Token::Implies => 
SyntaxKind::T_IMPLIES, + Token::Percent => SyntaxKind::T_PERCENT, + Token::Star => SyntaxKind::T_STAR, + Token::Slash => SyntaxKind::T_SLASH, + Token::Plus => SyntaxKind::T_PLUS, + Token::Minus => SyntaxKind::T_MINUS, + Token::Arrow => SyntaxKind::T_ARROW, + Token::FatArrow => SyntaxKind::T_FAT_ARROW, + Token::Lte => SyntaxKind::T_LTE, + Token::Spaceship => SyntaxKind::T_SPACESHIP, + Token::Gte => SyntaxKind::T_GTE, + Token::Lt => SyntaxKind::T_LT, + Token::Gt => SyntaxKind::T_GT, + Token::Neq => SyntaxKind::T_NEQ, + Token::Shr => SyntaxKind::T_SHR, + Token::Shl => SyntaxKind::T_SHL, + Token::Tilde => SyntaxKind::T_TILDE, + Token::At => SyntaxKind::T_AT, + Token::Hash => SyntaxKind::T_HASH, + Token::Apostrophe => SyntaxKind::T_APOSTROPHE, + }; + out.push((kind, span)); + } + out +} diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs new file mode 100644 index 00000000..265b72a7 --- /dev/null +++ b/tests/tokenizer.rs @@ -0,0 +1,30 @@ +use ddlint::{SyntaxKind, tokenize}; +use rstest::{fixture, rstest}; + +#[fixture] +fn simple_input() -> &'static str { + "input relation R(x: u32)" +} + +#[rstest] +#[case("input", vec![SyntaxKind::K_INPUT])] +#[case("relation", vec![SyntaxKind::K_RELATION])] +#[case("R", vec![SyntaxKind::T_IDENT])] +fn single_tokens(#[case] source: &str, #[case] expected: Vec) { + let tokens = tokenize(source); + let kinds: Vec = tokens.iter().map(|(k, _)| *k).collect(); + assert_eq!(kinds, expected); +} + +#[rstest] +fn token_spans(simple_input: &str) { + let tokens = tokenize(simple_input); + for (kind, span) in tokens { + let text = simple_input.get(span.clone()).unwrap_or(""); + if let SyntaxKind::K_INPUT = kind { + assert_eq!(text, "input"); + } else if let SyntaxKind::K_RELATION = kind { + assert_eq!(text, "relation"); + } + } +} From cf35d9fe9bfd0cbb16ddc721b63c48093c4ff4bb Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 24 Jun 2025 19:46:38 +0100 Subject: [PATCH 2/3] Add tokenizer tests --- tests/tokenizer.rs | 41 
+++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 265b72a7..427fa36d 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -28,3 +28,44 @@ fn token_spans(simple_input: &str) { } } } + +#[rstest] +#[case("123", SyntaxKind::T_NUMBER)] +#[case("\"foo\"", SyntaxKind::T_STRING)] +fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("no token")); + assert_eq!(first.0, expected); +} + +#[rstest] +#[case(" ", SyntaxKind::T_WHITESPACE)] +#[case("\n", SyntaxKind::T_WHITESPACE)] +#[case("/* c */", SyntaxKind::T_COMMENT)] +#[case("// line", SyntaxKind::T_COMMENT)] +fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("no token")); + assert_eq!(first.0, expected); +} + +#[rstest] +#[case("?")] +#[case("$")] +fn unknown_character_produces_error(#[case] source: &str) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("no token")); + assert_eq!(first.0, SyntaxKind::N_ERROR); +} From a6b8654ff34a6dcce5b75c025508551186dae486 Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Tue, 24 Jun 2025 21:18:19 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`cod?= =?UTF-8?q?ex/build-differential-datalog-tokenizer`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings generation was requested by @leynos. 
* https://github.com/leynos/ddlint/pull/7#issuecomment-3001471343 The following files were modified: * `src/tokenizer.rs` * `tests/tokenizer.rs` --- src/tokenizer.rs | 28 ++++++++++++++++++- tests/tokenizer.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c8d36503..9aa84c38 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -96,6 +96,17 @@ enum Token { Apostrophe, } +/// Returns the `SyntaxKind` for a given identifier if it matches a recognised keyword. +/// +/// If the input string corresponds to a `DDlog` or Rust keyword, returns the associated +/// `SyntaxKind` variant; otherwise, returns `None`. +/// +/// # Examples +/// +/// ```ignore +/// assert_eq!(keyword_kind("fn"), Some(SyntaxKind::K_FN)); +/// assert_eq!(keyword_kind("foobar"), None); +/// ``` fn keyword_kind(ident: &str) -> Option { Some(match ident { "abstract" => SyntaxKind::K_ABSTRACT, @@ -177,7 +188,22 @@ fn keyword_kind(ident: &str) -> Option { }) } -/// Tokenise the provided `DDlog` source. +/// Converts `DDlog` source text into a sequence of syntax tokens with their corresponding byte spans. +/// +/// Each token is represented as a `(SyntaxKind, Span)` pair, where `Span` is the byte range of the token in the input. +/// Whitespace and comments are preserved in the output. Identifiers that match keywords are assigned the appropriate +/// keyword kind; otherwise, they are treated as generic identifiers. Unrecognised or invalid tokens are marked with +/// `SyntaxKind::N_ERROR`. 
+/// +/// # Examples +/// +/// ```no_run +/// use ddlint::{SyntaxKind, tokenize}; +/// +/// let src = "relation R(x: bit<32>) // comment"; +/// let tokens = tokenize(src); +/// assert!(tokens.iter().any(|(kind, _)| *kind == SyntaxKind::T_IDENT)); +/// ``` #[must_use] pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> { let mut lexer = Token::lexer(src); diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 427fa36d..9238a329 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -1,11 +1,28 @@ use ddlint::{SyntaxKind, tokenize}; use rstest::{fixture, rstest}; +/// Provides a static example input string for use in tokenizer tests. +/// +/// # Examples +/// +/// ```no_run +/// let input = simple_input(); +/// assert_eq!(input, "input relation R(x: u32)"); +/// ``` #[fixture] fn simple_input() -> &'static str { "input relation R(x: u32)" } +/// Tests that tokenising a single keyword or identifier produces the expected sequence of token kinds. +/// +/// # Examples +/// +/// ```no_run +/// single_tokens("input", vec![SyntaxKind::K_INPUT]); +/// single_tokens("relation", vec![SyntaxKind::K_RELATION]); +/// single_tokens("R", vec![SyntaxKind::T_IDENT]); +/// ``` #[rstest] #[case("input", vec![SyntaxKind::K_INPUT])] #[case("relation", vec![SyntaxKind::K_RELATION])] @@ -16,6 +33,17 @@ fn single_tokens(#[case] source: &str, #[case] expected: Vec) { assert_eq!(kinds, expected); } +/// Verifies that the spans of keyword tokens correspond to the correct substrings in the input. +/// +/// This test checks that the `K_INPUT` and `K_RELATION` tokens produced by the tokenizer +/// map to the expected text slices within the provided input string. +/// +/// # Examples +/// +/// ```no_run +/// let input = "input relation R(x: u32)"; +/// token_spans(input); // Asserts that "input" and "relation" tokens have correct spans. 
+/// ``` #[rstest] fn token_spans(simple_input: &str) { let tokens = tokenize(simple_input); @@ -29,6 +57,20 @@ fn token_spans(simple_input: &str) { } } +/// Tests that a single literal input is tokenised as the expected literal token kind. +/// +/// Asserts that tokenising the input string produces exactly one token of the expected +/// `SyntaxKind` for literals such as numbers or strings. +/// +/// # Examples +/// +/// ```no_run +/// // Number literal +/// literal_tokens("123", SyntaxKind::T_NUMBER); +/// +/// // String literal +/// literal_tokens("\"foo\"", SyntaxKind::T_STRING); +/// ``` #[rstest] #[case("123", SyntaxKind::T_NUMBER)] #[case("\"foo\"", SyntaxKind::T_STRING)] @@ -42,6 +84,21 @@ fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { assert_eq!(first.0, expected); } +/// Verifies that whitespace and comment inputs are tokenised as the expected trivia token kind. +/// +/// Asserts that tokenising the given source string produces exactly one token of the expected +/// `SyntaxKind` for trivia (whitespace or comment). +/// +/// # Examples +/// +/// ```no_run +/// use ddlint::tokenize; +/// use ddlint::SyntaxKind; +/// +/// let tokens = tokenize(" "); +/// assert_eq!(tokens.len(), 1); +/// assert_eq!(tokens[0].0, SyntaxKind::T_WHITESPACE); +/// ``` #[rstest] #[case(" ", SyntaxKind::T_WHITESPACE)] #[case("\n", SyntaxKind::T_WHITESPACE)] @@ -57,6 +114,17 @@ fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { assert_eq!(first.0, expected); } +/// Verifies that tokenising an unknown character produces a single error token. +/// +/// This test ensures that when the tokenizer encounters an unrecognised character, +/// it emits exactly one token of kind `N_ERROR`. +/// +/// # Examples +/// +/// ```no_run +/// // The test will pass if the tokenizer returns a single error token for '?' +/// unknown_character_produces_error("?"); +/// ``` #[rstest] #[case("?")] #[case("$")]