From 21ac3ff2932174152265e8ce3888c92e9763af80 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Tue, 24 Jun 2025 19:24:23 +0100
Subject: [PATCH 1/3] Add tokenizer using logos

---
 Cargo.toml          |   4 +
 docs/parser-plan.md |   5 +-
 src/lib.rs          |   2 +
 src/tokenizer.rs    | 237 ++++++++++++++++++++++++++++++++++++++++++++
 tests/tokenizer.rs  |  30 ++++++
 5 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100644 src/tokenizer.rs
 create mode 100644 tests/tokenizer.rs
diff --git a/Cargo.toml b/Cargo.toml
index 98929746..f63f20ef 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,10 @@ edition = "2024"
 rowan = { version = "0.15", default-features = false }
 num-derive = { version = "0.4", default-features = false }
 num-traits = { version = "0.2", default-features = false, features = ["std"] }
+logos = { version = "0.13", default-features = false, features = ["export_derive"] }
+
+[dev-dependencies]
+rstest = "0.18"
 
 [lints.clippy]
 pedantic = { level = "warn", priority = -1 }
diff --git a/docs/parser-plan.md b/docs/parser-plan.md
index 77f695f9..f64bd403 100644
--- a/docs/parser-plan.md
+++ b/docs/parser-plan.md
@@ -33,7 +33,10 @@ transparently.
 Use `chumsky`'s text utilities (or integrate a `logos` lexer if preferred) to
 convert the source text into a stream of `(SyntaxKind, Span)` pairs. Each span
 records byte offsets so that the resulting CST can precisely mirror the input.
-Whitespace and comments should produce tokens so they can be preserved.
+Whitespace and comments should produce tokens so they can be preserved. The
+current implementation opts for a small `logos` lexer because it keeps the token
+definitions declarative while still interoperating smoothly with `chumsky`
+parsers.
 
 ## 4. Construct the Parser with `chumsky`
 
diff --git a/src/lib.rs b/src/lib.rs
index a02f3654..935320b5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,5 +5,7 @@
 #![forbid(unsafe_code)]
 
 pub mod language;
+pub mod tokenizer;
 
 pub use language::{DdlogLanguage, SyntaxKind};
+pub use tokenizer::{Span, tokenize};
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 00000000..c8d36503
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,237 @@
+//! Lexical analysis for `DDlog` source.
+//!
+//! This module exposes a `tokenize` function which converts raw source text into
+//! a sequence of `(SyntaxKind, Span)` pairs. It uses the `logos` crate to
+//! recognise tokens so that the CST can mirror the input exactly.
+
+use logos::Logos;
+
+use crate::SyntaxKind;
+
+/// Byte range for a token within the source.
+pub type Span = std::ops::Range<usize>;
+
+#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
+enum Token {
+    #[regex(r"[ \t\r\n]+")]
+    Whitespace, // emitted as a token rather than skipped
+    #[regex(r"/\*[^*]*\*+([^/*][^*]*\*+)*/", priority = 2)]
+    #[regex(r"//[^\n]*")]
+    Comment, // block form tolerates `*` runs before the closing `/` (e.g. `/***/`)
+    #[regex(r"[A-Za-z_][A-Za-z0-9_]*")]
+    Ident, // includes keywords and a lone `_`; `keyword_kind` reclassifies those
+    #[regex(r"[0-9]+")]
+    Number, // ASCII decimal digits only
+    #[regex(r#""([^"\\]|\\.)*""#)]
+    String, // double-quoted; a backslash escapes the next character
+    #[token("(")]
+    LParen,
+    #[token(")")]
+    RParen,
+    #[token("{")]
+    LBrace,
+    #[token("}")]
+    RBrace,
+    #[token("[")]
+    LBracket,
+    #[token("]")]
+    RBracket,
+    #[token(";")]
+    Semi,
+    #[token(",")]
+    Comma,
+    #[token(".")]
+    Dot,
+    #[token("::")]
+    ColonColon,
+    #[token(":")]
+    Colon,
+    #[token("|")]
+    Pipe,
+    #[token("&")]
+    Amp,
+    #[token("==")]
+    EqEq,
+    #[token("=")]
+    Eq,
+    #[token(":-")]
+    Implies,
+    #[token("%")]
+    Percent,
+    #[token("*")]
+    Star,
+    #[token("/")]
+    Slash,
+    #[token("+")]
+    Plus,
+    #[token("-")]
+    Minus,
+    #[token("->")]
+    Arrow,
+    #[token("=>")]
+    FatArrow,
+    #[token("<=")]
+    Lte,
+    #[token("<=>")]
+    Spaceship,
+    #[token(">=")]
+    Gte,
+    #[token("<")]
+    Lt,
+    #[token(">")]
+    Gt,
+    #[token("!=")]
+    Neq,
+    #[token(">>")]
+    Shr,
+    #[token("<<")]
+    Shl,
+    #[token("~")]
+    Tilde,
+    #[token("@")]
+    At,
+    #[token("#")]
+    Hash,
+    #[token("'")]
+    Apostrophe,
+}
+
+fn keyword_kind(ident: &str) -> Option<SyntaxKind> {
+    Some(match ident {
+        "abstract" => SyntaxKind::K_ABSTRACT,
+        "Aggregate" => SyntaxKind::K_AGGREGATE, // exact-case match: `aggregate` is not a keyword
+        "and" => SyntaxKind::K_AND,
+        "apply" => SyntaxKind::K_APPLY,
+        "as" => SyntaxKind::K_AS,
+        "async" => SyntaxKind::K_ASYNC,
+        "await" => SyntaxKind::K_AWAIT,
+        "become" => SyntaxKind::K_BECOME,
+        "bigint" => SyntaxKind::K_BIGINT,
+        "bit" => SyntaxKind::K_BIT,
+        "bool" => SyntaxKind::K_BOOL,
+        "box" => SyntaxKind::K_BOX,
+        "break" => SyntaxKind::K_BREAK,
+        "const" => SyntaxKind::K_CONST,
+        "continue" => SyntaxKind::K_CONTINUE,
+        "crate" => SyntaxKind::K_CRATE,
+        "do" => SyntaxKind::K_DO,
+        "double" => SyntaxKind::K_DOUBLE,
+        "dyn" => SyntaxKind::K_DYN,
+        "else" => SyntaxKind::K_ELSE,
+        "extern" => SyntaxKind::K_EXTERN,
+        "false" => SyntaxKind::K_FALSE,
+        "final" => SyntaxKind::K_FINAL,
+        "fn" => SyntaxKind::K_FN,
+        "FlatMap" => SyntaxKind::K_FLATMAP,
+        "float" => SyntaxKind::K_FLOAT,
+        "for" => SyntaxKind::K_FOR,
+        "function" => SyntaxKind::K_FUNCTION,
+        "if" => SyntaxKind::K_IF,
+        "impl" => SyntaxKind::K_IMPL,
+        "import" => SyntaxKind::K_IMPORT,
+        "in" => SyntaxKind::K_IN,
+        "input" => SyntaxKind::K_INPUT,
+        "Inspect" => SyntaxKind::K_INSPECT,
+        "let" => SyntaxKind::K_LET,
+        "loop" => SyntaxKind::K_LOOP,
+        "macro" => SyntaxKind::K_MACRO,
+        "match" => SyntaxKind::K_MATCH,
+        "mod" => SyntaxKind::K_MOD,
+        "move" => SyntaxKind::K_MOVE,
+        "multiset" => SyntaxKind::K_MULTISET,
+        "mut" => SyntaxKind::K_MUT,
+        "not" => SyntaxKind::K_NOT,
+        "or" => SyntaxKind::K_OR,
+        "override" => SyntaxKind::K_OVERRIDE,
+        "output" => SyntaxKind::K_OUTPUT,
+        "priv" => SyntaxKind::K_PRIV,
+        "pub" => SyntaxKind::K_PUB,
+        "ref" => SyntaxKind::K_REF,
+        "relation" => SyntaxKind::K_RELATION,
+        "return" => SyntaxKind::K_RETURN,
+        "self" => SyntaxKind::K_SELF,
+        "Self" => SyntaxKind::K_SELF_TYPE, // distinct from the lowercase `self` arm above
+        "signed" => SyntaxKind::K_SIGNED,
+        "skip" => SyntaxKind::K_SKIP,
+        "static" => SyntaxKind::K_STATIC,
+        "stream" => SyntaxKind::K_STREAM,
+        "struct" => SyntaxKind::K_STRUCT,
+        "super" => SyntaxKind::K_SUPER,
+        "trait" => SyntaxKind::K_TRAIT,
+        "transformer" => SyntaxKind::K_TRANSFORMER,
+        "try" => SyntaxKind::K_TRY,
+        "true" => SyntaxKind::K_TRUE,
+        "type" => SyntaxKind::K_TYPE,
+        "typedef" => SyntaxKind::K_TYPEDEF,
+        "typeof" => SyntaxKind::K_TYPEOF,
+        "_" => SyntaxKind::K_UNDERSCORE, // a lone `_` arrives here because it lexes as `Ident`
+        "unsafe" => SyntaxKind::K_UNSAFE,
+        "unsized" => SyntaxKind::K_UNSIZED,
+        "use" => SyntaxKind::K_USE,
+        "var" => SyntaxKind::K_VAR,
+        "virtual" => SyntaxKind::K_VIRTUAL,
+        "where" => SyntaxKind::K_WHERE,
+        "while" => SyntaxKind::K_WHILE,
+        "yield" => SyntaxKind::K_YIELD,
+        _ => return None, // not a keyword; `tokenize` falls back to `T_IDENT`
+    })
+}
+
+/// Tokenise the provided `DDlog` source.
+#[must_use]
+pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> {
+    let mut lexer = Token::lexer(src);
+    let mut out = Vec::new();
+    while let Some(result) = lexer.next() {
+        let span = lexer.span();
+        // Unlexable input is kept as an `N_ERROR` token rather than dropped.
+        let Ok(token) = result else {
+            out.push((SyntaxKind::N_ERROR, span));
+            continue;
+        };
+        let kind = match token {
+            Token::Whitespace => SyntaxKind::T_WHITESPACE,
+            Token::Comment => SyntaxKind::T_COMMENT,
+            Token::Ident => keyword_kind(lexer.slice()).unwrap_or(SyntaxKind::T_IDENT),
+            Token::Number => SyntaxKind::T_NUMBER,
+            Token::String => SyntaxKind::T_STRING,
+            Token::LParen => SyntaxKind::T_LPAREN,
+            Token::RParen => SyntaxKind::T_RPAREN,
+            Token::LBrace => SyntaxKind::T_LBRACE,
+            Token::RBrace => SyntaxKind::T_RBRACE,
+            Token::LBracket => SyntaxKind::T_LBRACKET,
+            Token::RBracket => SyntaxKind::T_RBRACKET,
+            Token::Semi => SyntaxKind::T_SEMI,
+            Token::Comma => SyntaxKind::T_COMMA,
+            Token::Dot => SyntaxKind::T_DOT,
+            Token::ColonColon => SyntaxKind::T_COLON_COLON,
+            Token::Colon => SyntaxKind::T_COLON,
+            Token::Pipe => SyntaxKind::T_PIPE,
+            Token::Amp => SyntaxKind::T_AMP,
+            Token::EqEq => SyntaxKind::T_EQEQ,
+            Token::Eq => SyntaxKind::T_EQ,
+            Token::Implies => SyntaxKind::T_IMPLIES,
+            Token::Percent => SyntaxKind::T_PERCENT,
+            Token::Star => SyntaxKind::T_STAR,
+            Token::Slash => SyntaxKind::T_SLASH,
+            Token::Plus => SyntaxKind::T_PLUS,
+            Token::Minus => SyntaxKind::T_MINUS,
+            Token::Arrow => SyntaxKind::T_ARROW,
+            Token::FatArrow => SyntaxKind::T_FAT_ARROW,
+            Token::Lte => SyntaxKind::T_LTE,
+            Token::Spaceship => SyntaxKind::T_SPACESHIP,
+            Token::Gte => SyntaxKind::T_GTE,
+            Token::Lt => SyntaxKind::T_LT,
+            Token::Gt => SyntaxKind::T_GT,
+            Token::Neq => SyntaxKind::T_NEQ,
+            Token::Shr => SyntaxKind::T_SHR,
+            Token::Shl => SyntaxKind::T_SHL,
+            Token::Tilde => SyntaxKind::T_TILDE,
+            Token::At => SyntaxKind::T_AT,
+            Token::Hash => SyntaxKind::T_HASH,
+            Token::Apostrophe => SyntaxKind::T_APOSTROPHE,
+        };
+        out.push((kind, span));
+    }
+    out
+}
diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs
new file mode 100644
index 00000000..265b72a7
--- /dev/null
+++ b/tests/tokenizer.rs
@@ -0,0 +1,30 @@
+use ddlint::{SyntaxKind, tokenize};
+use rstest::{fixture, rstest};
+
+#[fixture]
+fn simple_input() -> &'static str {
+    "input relation R(x: u32)"
+}
+
+#[rstest]
+#[case("input", vec![SyntaxKind::K_INPUT])]
+#[case("relation", vec![SyntaxKind::K_RELATION])]
+#[case("R", vec![SyntaxKind::T_IDENT])]
+fn single_tokens(#[case] source: &str, #[case] expected: Vec<SyntaxKind>) {
+    let tokens = tokenize(source);
+    let kinds: Vec<SyntaxKind> = tokens.iter().map(|(k, _)| *k).collect();
+    assert_eq!(kinds, expected);
+}
+
+#[rstest]
+fn token_spans(simple_input: &str) {
+    let tokens = tokenize(simple_input);
+    for (kind, span) in tokens {
+        let text = simple_input.get(span.clone()).unwrap_or("");
+        if let SyntaxKind::K_INPUT = kind {
+            assert_eq!(text, "input");
+        } else if let SyntaxKind::K_RELATION = kind {
+            assert_eq!(text, "relation");
+        }
+    }
+}

From cf35d9fe9bfd0cbb16ddc721b63c48093c4ff4bb Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Tue, 24 Jun 2025 19:46:38 +0100
Subject: [PATCH 2/3] Add tokenizer tests

---
 tests/tokenizer.rs | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs
index 265b72a7..427fa36d 100644
--- a/tests/tokenizer.rs
+++ b/tests/tokenizer.rs
@@ -28,3 +28,44 @@ fn token_spans(simple_input: &str) {
         }
     }
 }
+
+#[rstest]
+#[case("123", SyntaxKind::T_NUMBER)]
+#[case("\"foo\"", SyntaxKind::T_STRING)]
+fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    let tokens = tokenize(source);
+    assert_eq!(tokens.len(), 1);
+    let first = tokens
+        .first()
+        .cloned()
+        .unwrap_or_else(|| panic!("no token"));
+    assert_eq!(first.0, expected);
+}
+
+#[rstest]
+#[case(" ", SyntaxKind::T_WHITESPACE)]
+#[case("\n", SyntaxKind::T_WHITESPACE)]
+#[case("/* c */", SyntaxKind::T_COMMENT)]
+#[case("// line", SyntaxKind::T_COMMENT)]
+fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    let tokens = tokenize(source);
+    assert_eq!(tokens.len(), 1);
+    let first = tokens
+        .first()
+        .cloned()
+        .unwrap_or_else(|| panic!("no token"));
+    assert_eq!(first.0, expected);
+}
+
+#[rstest]
+#[case("?")]
+#[case("$")]
+fn unknown_character_produces_error(#[case] source: &str) {
+    let tokens = tokenize(source);
+    assert_eq!(tokens.len(), 1);
+    let first = tokens
+        .first()
+        .cloned()
+        .unwrap_or_else(|| panic!("no token"));
+    assert_eq!(first.0, SyntaxKind::N_ERROR);
+}

From a6b8654ff34a6dcce5b75c025508551186dae486 Mon Sep 17 00:00:00 2001
From: "coderabbitai[bot]"
 <136622811+coderabbitai[bot]@users.noreply.github.com>
Date: Tue, 24 Jun 2025 21:18:19 +0000
Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`cod?=
 =?UTF-8?q?ex/build-differential-datalog-tokenizer`?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Docstrings generation was requested by @leynos.

* https://github.com/leynos/ddlint/pull/7#issuecomment-3001471343

The following files were modified:

* `src/tokenizer.rs`
* `tests/tokenizer.rs`
---
 src/tokenizer.rs   | 28 ++++++++++++++++++-
 tests/tokenizer.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index c8d36503..9aa84c38 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -96,6 +96,17 @@ enum Token {
     Apostrophe,
 }
 
+/// Returns the `SyntaxKind` for a given identifier if it matches a recognised keyword.
+///
+/// If the input string corresponds to a `DDlog` or Rust keyword, returns the associated
+/// `SyntaxKind` variant; otherwise, returns `None`.
+///
+/// # Examples
+///
+/// ```ignore
+/// assert_eq!(keyword_kind("fn"), Some(SyntaxKind::K_FN));
+/// assert_eq!(keyword_kind("foobar"), None);
+/// ```
 fn keyword_kind(ident: &str) -> Option<SyntaxKind> {
     Some(match ident {
         "abstract" => SyntaxKind::K_ABSTRACT,
@@ -177,7 +188,22 @@ fn keyword_kind(ident: &str) -> Option<SyntaxKind> {
     })
 }
 
-/// Tokenise the provided `DDlog` source.
+/// Converts `DDlog` source text into a sequence of syntax tokens paired with their byte spans.
+///
+/// Each token is represented as a `(SyntaxKind, Span)` pair, where `Span` is the byte range of the token in the input.
+/// Whitespace and comments are preserved in the output. Identifiers that match keywords are assigned the appropriate
+/// keyword kind; otherwise, they are treated as generic identifiers. Unrecognised or invalid tokens are marked with
+/// `SyntaxKind::N_ERROR`.
+///
+/// # Examples
+///
+/// ```no_run
+/// use ddlint::{tokenize, SyntaxKind};
+///
+/// let src = "relation R(x: bit<32>) // comment";
+/// let tokens = tokenize(src);
+/// assert!(tokens.iter().any(|(kind, _)| *kind == SyntaxKind::T_IDENT));
+/// ```
 #[must_use]
 pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> {
     let mut lexer = Token::lexer(src);
diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs
index 427fa36d..9238a329 100644
--- a/tests/tokenizer.rs
+++ b/tests/tokenizer.rs
@@ -1,11 +1,28 @@
 use ddlint::{SyntaxKind, tokenize};
 use rstest::{fixture, rstest};
 
+/// Provides a static example input string for use in tokenizer tests.
+///
+/// # Examples
+///
+/// ```no_run
+/// let input = simple_input();
+/// assert_eq!(input, "input relation R(x: u32)");
+/// ```
 #[fixture]
 fn simple_input() -> &'static str {
     "input relation R(x: u32)"
 }
 
+/// Tests that tokenising a single keyword or identifier produces the expected sequence of token kinds.
+///
+/// # Examples
+///
+/// ```no_run
+/// single_tokens("input", vec![SyntaxKind::K_INPUT]);
+/// single_tokens("relation", vec![SyntaxKind::K_RELATION]);
+/// single_tokens("R", vec![SyntaxKind::T_IDENT]);
+/// ```
 #[rstest]
 #[case("input", vec![SyntaxKind::K_INPUT])]
 #[case("relation", vec![SyntaxKind::K_RELATION])]
@@ -16,6 +33,17 @@ fn single_tokens(#[case] source: &str, #[case] expected: Vec<SyntaxKind>) {
     assert_eq!(kinds, expected);
 }
 
+/// Verifies that the spans of keyword tokens correspond to the correct substrings in the input.
+///
+/// This test checks that the `K_INPUT` and `K_RELATION` tokens produced by the tokenizer
+/// map to the expected text slices within the provided input string.
+///
+/// # Examples
+///
+/// ```no_run
+/// let input = "input relation R(x: u32)";
+/// token_spans(input); // Asserts that "input" and "relation" tokens have correct spans.
+/// ```
 #[rstest]
 fn token_spans(simple_input: &str) {
     let tokens = tokenize(simple_input);
@@ -29,6 +57,20 @@ fn token_spans(simple_input: &str) {
     }
 }
 
+/// Tests that a single literal input is tokenised as the expected literal token kind.
+///
+/// Asserts that tokenising the input string produces exactly one token of the expected
+/// `SyntaxKind` for literals such as numbers or strings.
+///
+/// # Examples
+///
+/// ```no_run
+/// // Number literal
+/// literal_tokens("123", SyntaxKind::T_NUMBER);
+///
+/// // String literal
+/// literal_tokens("\"foo\"", SyntaxKind::T_STRING);
+/// ```
 #[rstest]
 #[case("123", SyntaxKind::T_NUMBER)]
 #[case("\"foo\"", SyntaxKind::T_STRING)]
@@ -42,6 +84,21 @@ fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
     assert_eq!(first.0, expected);
 }
 
+/// Verifies that whitespace and comment inputs are tokenised as the expected trivia token kind.
+///
+/// Asserts that tokenising the given source string produces exactly one token of the expected
+/// `SyntaxKind` for trivia (whitespace or comment).
+///
+/// # Examples
+///
+/// ```no_run
+/// use ddlint::tokenize;
+/// use ddlint::SyntaxKind;
+///
+/// let tokens = tokenize(" ");
+/// assert_eq!(tokens.len(), 1);
+/// assert_eq!(tokens[0].0, SyntaxKind::T_WHITESPACE);
+/// ```
 #[rstest]
 #[case(" ", SyntaxKind::T_WHITESPACE)]
 #[case("\n", SyntaxKind::T_WHITESPACE)]
@@ -57,6 +114,17 @@ fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
     assert_eq!(first.0, expected);
 }
 
+/// Verifies that tokenising an unknown character produces a single error token.
+///
+/// This test ensures that when the tokenizer encounters an unrecognised character,
+/// it emits exactly one token of kind `N_ERROR`.
+///
+/// # Examples
+///
+/// ```no_run
+/// // The test will pass if the tokenizer returns a single error token for '?'
+/// unknown_character_produces_error("?");
+/// ```
 #[rstest]
 #[case("?")]
 #[case("$")]