diff --git a/Cargo.toml b/Cargo.toml index 98929746..fe8d77f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,11 @@ edition = "2024" rowan = { version = "0.15", default-features = false } num-derive = { version = "0.4", default-features = false } num-traits = { version = "0.2", default-features = false, features = ["std"] } +logos = { version = ">=0.13.0, <0.14.0", default-features = false, features = ["export_derive"] } +phf = { version = ">=0.11.0, <0.12.0", default-features = false, features = ["macros"] } + +[dev-dependencies] +rstest = ">=0.18.0, <0.19.0" [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/docs/ddlint-design-and-road-map.md b/docs/ddlint-design-and-road-map.md index 2a9d32c4..b1aa5ed7 100644 --- a/docs/ddlint-design-and-road-map.md +++ b/docs/ddlint-design-and-road-map.md @@ -212,6 +212,7 @@ enum SyntaxKind { T_IDENT, // An identifier, like a relation or variable name T_STRING_LIT, // A string literal T_NUMBER, // A numeric literal + // decimal, floating-point, hex, binary, or octal T_LPAREN, // '(' T_RPAREN, // ')' T_LBRACE, // '{' diff --git a/docs/parser-plan.md b/docs/parser-plan.md index 77f695f9..182eb6fe 100644 --- a/docs/parser-plan.md +++ b/docs/parser-plan.md @@ -33,7 +33,26 @@ transparently. Use `chumsky`'s text utilities (or integrate a `logos` lexer if preferred) to convert the source text into a stream of `(SyntaxKind, Span)` pairs. Each span records byte offsets so that the resulting CST can precisely mirror the input. -Whitespace and comments should produce tokens so they can be preserved. +Whitespace and comments should produce tokens so they can be preserved. The +current implementation opts for a small `logos` lexer because it keeps the token +definitions declarative while still interoperating smoothly with `chumsky` +parsers. Keyword lookups use a `phf::Map` for zero-cost perfect hashing. 
+ +```mermaid +sequenceDiagram + participant Client + participant Tokenizer + participant LogosLexer + participant SyntaxKind + Client->>Tokenizer: tokenize(src: &str) + Tokenizer->>LogosLexer: Token::lexer(src) + loop for each token + LogosLexer-->>Tokenizer: next() -> Token + span + Tokenizer->>SyntaxKind: map Token to SyntaxKind + Tokenizer-->>Tokenizer: collect (SyntaxKind, Span) + end + Tokenizer-->>Client: Vec<(SyntaxKind, Span)> +``` ## 4. Construct the Parser with `chumsky` diff --git a/src/lib.rs b/src/lib.rs index a02f3654..143fe57e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,11 @@ //! Library crate for ddlint. //! -//! Currently exposes only the parser language definitions. +//! Exposes parser language definitions and lexical analysis functionality. #![forbid(unsafe_code)] pub mod language; +pub mod tokenizer; pub use language::{DdlogLanguage, SyntaxKind}; +pub use tokenizer::{Span, tokenize}; diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 00000000..efb79be0 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,245 @@ +//! Lexical analysis for `DDlog` source. +//! +//! This module exposes a `tokenize` function which converts raw source text into +//! a sequence of `(SyntaxKind, Span)` pairs. It uses the `logos` crate to +//! recognise tokens so that the CST can mirror the input exactly. + +use logos::Logos; +use phf::phf_map; + +use crate::SyntaxKind; + +/// Byte range for a token within the source. 
+pub type Span = std::ops::Range<usize>;
+
+/// Raw lexical tokens recognised by the `logos` lexer.
+///
+/// Keywords are not distinguished at this level: they lex as `Ident` and are
+/// reclassified by `tokenize` through the `KEYWORDS` table.
+#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
+enum Token {
+    #[regex(r"[ \t\r\n]+")]
+    Whitespace,
+    // Block comments: `\*+[^*/]` allows runs of `*` inside the body and
+    // `\*+/` accepts closers preceded by extra stars, so `/***/` and
+    // `/* a **/` are each a single comment.
+    #[regex(r"/\*([^*]|\*+[^*/])*\*+/", priority = 2)]
+    #[regex(r"//[^\n]*")]
+    Comment,
+    #[regex(r"[A-Za-z_][A-Za-z0-9_]*")]
+    Ident,
+    #[regex(r"0[xX][0-9a-fA-F]+|0[bB][01]+|0[oO][0-7]+|[0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")]
+    Number,
+    #[regex(r#""([^"\\]|\\.)*""#)]
+    String,
+    #[token("(")]
+    LParen,
+    #[token(")")]
+    RParen,
+    #[token("{")]
+    LBrace,
+    #[token("}")]
+    RBrace,
+    #[token("[")]
+    LBracket,
+    #[token("]")]
+    RBracket,
+    #[token(";")]
+    Semi,
+    #[token(",")]
+    Comma,
+    #[token(".")]
+    Dot,
+    #[token("::")]
+    ColonColon,
+    #[token(":")]
+    Colon,
+    #[token("|")]
+    Pipe,
+    #[token("&")]
+    Amp,
+    #[token("==")]
+    EqEq,
+    #[token("=")]
+    Eq,
+    #[token(":-")]
+    Implies,
+    #[token("%")]
+    Percent,
+    #[token("*")]
+    Star,
+    #[token("/")]
+    Slash,
+    #[token("+")]
+    Plus,
+    #[token("-")]
+    Minus,
+    #[token("->")]
+    Arrow,
+    #[token("=>")]
+    FatArrow,
+    #[token("<=")]
+    Lte,
+    #[token("<=>")]
+    Spaceship,
+    #[token(">=")]
+    Gte,
+    #[token("<")]
+    Lt,
+    #[token(">")]
+    Gt,
+    #[token("!=")]
+    Neq,
+    #[token(">>")]
+    Shr,
+    #[token("<<")]
+    Shl,
+    #[token("~")]
+    Tilde,
+    #[token("@")]
+    At,
+    #[token("#")]
+    Hash,
+    #[token("'")]
+    Apostrophe,
+}
+
+/// Keyword spellings mapped to their `SyntaxKind`.
+///
+/// Keys are case-sensitive (`self` and `Self` are distinct entries). A static
+/// `phf` map avoids a long match statement; the table is built at compile
+/// time.
+static KEYWORDS: phf::Map<&'static str, SyntaxKind> = phf_map! {
+    "abstract" => SyntaxKind::K_ABSTRACT,
+    "Aggregate" => SyntaxKind::K_AGGREGATE,
+    "and" => SyntaxKind::K_AND,
+    "apply" => SyntaxKind::K_APPLY,
+    "as" => SyntaxKind::K_AS,
+    "async" => SyntaxKind::K_ASYNC,
+    "await" => SyntaxKind::K_AWAIT,
+    "become" => SyntaxKind::K_BECOME,
+    "bigint" => SyntaxKind::K_BIGINT,
+    "bit" => SyntaxKind::K_BIT,
+    "bool" => SyntaxKind::K_BOOL,
+    "box" => SyntaxKind::K_BOX,
+    "break" => SyntaxKind::K_BREAK,
+    "const" => SyntaxKind::K_CONST,
+    "continue" => SyntaxKind::K_CONTINUE,
+    "crate" => SyntaxKind::K_CRATE,
+    "do" => SyntaxKind::K_DO,
+    "double" => SyntaxKind::K_DOUBLE,
+    "dyn" => SyntaxKind::K_DYN,
+    "else" => SyntaxKind::K_ELSE,
+    "extern" => SyntaxKind::K_EXTERN,
+    "false" => SyntaxKind::K_FALSE,
+    "final" => SyntaxKind::K_FINAL,
+    "fn" => SyntaxKind::K_FN,
+    "FlatMap" => SyntaxKind::K_FLATMAP,
+    "float" => SyntaxKind::K_FLOAT,
+    "for" => SyntaxKind::K_FOR,
+    "function" => SyntaxKind::K_FUNCTION,
+    "if" => SyntaxKind::K_IF,
+    "impl" => SyntaxKind::K_IMPL,
+    "import" => SyntaxKind::K_IMPORT,
+    "in" => SyntaxKind::K_IN,
+    "input" => SyntaxKind::K_INPUT,
+    "Inspect" => SyntaxKind::K_INSPECT,
+    "let" => SyntaxKind::K_LET,
+    "loop" => SyntaxKind::K_LOOP,
+    "macro" => SyntaxKind::K_MACRO,
+    "match" => SyntaxKind::K_MATCH,
+    "mod" => SyntaxKind::K_MOD,
+    "move" => SyntaxKind::K_MOVE,
+    "multiset" => SyntaxKind::K_MULTISET,
+    "mut" => SyntaxKind::K_MUT,
+    "not" => SyntaxKind::K_NOT,
+    "or" => SyntaxKind::K_OR,
+    "override" => SyntaxKind::K_OVERRIDE,
+    "output" => SyntaxKind::K_OUTPUT,
+    "priv" => SyntaxKind::K_PRIV,
+    "pub" => SyntaxKind::K_PUB,
+    "ref" => SyntaxKind::K_REF,
+    "relation" => SyntaxKind::K_RELATION,
+    "return" => SyntaxKind::K_RETURN,
+    "self" => SyntaxKind::K_SELF,
+    "Self" => SyntaxKind::K_SELF_TYPE,
+    "signed" => SyntaxKind::K_SIGNED,
+    "skip" => SyntaxKind::K_SKIP,
+    "static" => SyntaxKind::K_STATIC,
+    "stream" => SyntaxKind::K_STREAM,
+    "struct" => SyntaxKind::K_STRUCT,
+    "super" => SyntaxKind::K_SUPER,
+    "trait" => SyntaxKind::K_TRAIT,
+    "transformer" => SyntaxKind::K_TRANSFORMER,
+    "try" => SyntaxKind::K_TRY,
+    "true" => SyntaxKind::K_TRUE,
+    "type" => SyntaxKind::K_TYPE,
+    "typedef" => SyntaxKind::K_TYPEDEF,
+    "typeof" => SyntaxKind::K_TYPEOF,
+    "_" => SyntaxKind::K_UNDERSCORE,
+    "unsafe" => SyntaxKind::K_UNSAFE,
+    "unsized" => SyntaxKind::K_UNSIZED,
+    "use" => SyntaxKind::K_USE,
+    "var" => SyntaxKind::K_VAR,
+    "virtual" => SyntaxKind::K_VIRTUAL,
+    "where" => SyntaxKind::K_WHERE,
+    "while" => SyntaxKind::K_WHILE,
+    "yield" => SyntaxKind::K_YIELD,
+};
+
+/// Look up the keyword kind for an identifier.
+///
+/// Returns `Some(kind)` if `ident` is a recognised `DDlog` keyword, or `None`
+/// otherwise.
+fn keyword_kind(ident: &str) -> Option<SyntaxKind> {
+    KEYWORDS.get(ident).copied()
+}
+
+/// Translate a lexer token into the `SyntaxKind` stored in the CST.
+///
+/// `text` is the source slice of the token; it is consulted only for
+/// identifiers, which become a keyword kind when they appear in `KEYWORDS`.
+fn syntax_kind(token: Token, text: &str) -> SyntaxKind {
+    match token {
+        Token::Whitespace => SyntaxKind::T_WHITESPACE,
+        Token::Comment => SyntaxKind::T_COMMENT,
+        Token::Ident => keyword_kind(text).unwrap_or(SyntaxKind::T_IDENT),
+        Token::Number => SyntaxKind::T_NUMBER,
+        Token::String => SyntaxKind::T_STRING,
+        Token::LParen => SyntaxKind::T_LPAREN,
+        Token::RParen => SyntaxKind::T_RPAREN,
+        Token::LBrace => SyntaxKind::T_LBRACE,
+        Token::RBrace => SyntaxKind::T_RBRACE,
+        Token::LBracket => SyntaxKind::T_LBRACKET,
+        Token::RBracket => SyntaxKind::T_RBRACKET,
+        Token::Semi => SyntaxKind::T_SEMI,
+        Token::Comma => SyntaxKind::T_COMMA,
+        Token::Dot => SyntaxKind::T_DOT,
+        Token::ColonColon => SyntaxKind::T_COLON_COLON,
+        Token::Colon => SyntaxKind::T_COLON,
+        Token::Pipe => SyntaxKind::T_PIPE,
+        Token::Amp => SyntaxKind::T_AMP,
+        Token::EqEq => SyntaxKind::T_EQEQ,
+        Token::Eq => SyntaxKind::T_EQ,
+        Token::Implies => SyntaxKind::T_IMPLIES,
+        Token::Percent => SyntaxKind::T_PERCENT,
+        Token::Star => SyntaxKind::T_STAR,
+        Token::Slash => SyntaxKind::T_SLASH,
+        Token::Plus => SyntaxKind::T_PLUS,
+        Token::Minus => SyntaxKind::T_MINUS,
+        Token::Arrow => SyntaxKind::T_ARROW,
+        Token::FatArrow => SyntaxKind::T_FAT_ARROW,
+        Token::Lte => SyntaxKind::T_LTE,
+        Token::Spaceship => SyntaxKind::T_SPACESHIP,
+        Token::Gte => SyntaxKind::T_GTE,
+        Token::Lt => SyntaxKind::T_LT,
+        Token::Gt => SyntaxKind::T_GT,
+        Token::Neq => SyntaxKind::T_NEQ,
+        Token::Shr => SyntaxKind::T_SHR,
+        Token::Shl => SyntaxKind::T_SHL,
+        Token::Tilde => SyntaxKind::T_TILDE,
+        Token::At => SyntaxKind::T_AT,
+        Token::Hash => SyntaxKind::T_HASH,
+        Token::Apostrophe => SyntaxKind::T_APOSTROPHE,
+    }
+}
+
+/// Tokenise the provided `DDlog` source.
+///
+/// Returns one `(SyntaxKind, Span)` pair per lexeme, in source order.
+/// Whitespace and comments are emitted as `T_WHITESPACE` and `T_COMMENT`
+/// rather than skipped. Input the lexer rejects is recorded as `N_ERROR`
+/// and lexing continues with the remaining text.
+#[must_use]
+pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> {
+    let mut lexer = Token::lexer(src);
+    // Heuristic: roughly four bytes per token, to limit reallocation.
+    let mut out = Vec::with_capacity(src.len() / 4);
+    while let Some(result) = lexer.next() {
+        let kind = match result {
+            Ok(token) => syntax_kind(token, lexer.slice()),
+            Err(_) => SyntaxKind::N_ERROR,
+        };
+        out.push((kind, lexer.span()));
+    }
+    out
+}
diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs
new file mode 100644
index 00000000..5add1dda
--- /dev/null
+++ b/tests/tokenizer.rs
@@ -0,0 +1,224 @@
+//! Integration tests for the tokenizer module.
+//!
+//! Tests verify that the logos-based lexer correctly tokenises `DDlog` source
+//! code into `(SyntaxKind, Span)` pairs, covering keywords, literals, trivia,
+//! and error cases.
+
+#![expect(clippy::expect_used, reason = "tests assert exact behaviour")]
+
+use ddlint::{SyntaxKind, tokenize};
+use rstest::{fixture, rstest};
+
+/// Sample declaration used by the span test.
+#[fixture]
+fn simple_input() -> &'static str {
+    "input relation R(x: u32)"
+}
+
+/// Tokenise `source` and keep only the token kinds, in order.
+fn kinds_of(source: &str) -> Vec<SyntaxKind> {
+    tokenize(source).into_iter().map(|(kind, _)| kind).collect()
+}
+
+/// Assert that `source` lexes to exactly one token whose kind is `expected`.
+#[track_caller]
+fn assert_single_token(source: &str, expected: SyntaxKind) {
+    let tokens = tokenize(source);
+    assert_eq!(tokens.len(), 1);
+    let (kind, _) = tokens
+        .first()
+        .expect("tokenizer should produce at least one token");
+    assert_eq!(*kind, expected);
+}
+
+#[rstest]
+#[case("input", vec![SyntaxKind::K_INPUT])]
+#[case("relation", vec![SyntaxKind::K_RELATION])]
+#[case("R", vec![SyntaxKind::T_IDENT])]
+fn single_tokens(#[case] source: &str, #[case] expected: Vec<SyntaxKind>) {
+    assert_eq!(kinds_of(source), expected);
+}
+
+#[rstest]
+fn token_spans(simple_input: &str) {
+    for (kind, span) in tokenize(simple_input) {
+        let text = simple_input
+            .get(span)
+            .expect("span should be valid for input");
+        // Only the two keywords have their text checked; every span must
+        // still slice the input successfully.
+        match kind {
+            SyntaxKind::K_INPUT => assert_eq!(text, "input"),
+            SyntaxKind::K_RELATION => assert_eq!(text, "relation"),
+            _ => {}
+        }
+    }
+}
+
+#[rstest]
+#[case("123", SyntaxKind::T_NUMBER)]
+#[case("\"foo\"", SyntaxKind::T_STRING)]
+fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    assert_single_token(source, expected);
+}
+
+#[rstest]
+#[case("0xFF", SyntaxKind::T_NUMBER)]
+#[case("0b1010", SyntaxKind::T_NUMBER)]
+#[case("0o77", SyntaxKind::T_NUMBER)]
+#[case("1e10", SyntaxKind::T_NUMBER)]
+fn extended_number_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    assert_single_token(source, expected);
+}
+
+#[rstest]
+#[case(" ", SyntaxKind::T_WHITESPACE)]
+#[case("\n", SyntaxKind::T_WHITESPACE)]
+#[case("\t", SyntaxKind::T_WHITESPACE)]
+#[case("/* c */", SyntaxKind::T_COMMENT)]
+#[case("// line", SyntaxKind::T_COMMENT)]
+fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    assert_single_token(source, expected);
+}
+
+#[rstest]
+#[case("?")]
+#[case("$")]
+fn unknown_character_produces_error(#[case] source: &str) {
+    assert_single_token(source, SyntaxKind::N_ERROR);
+}
+
+#[test]
+fn unterminated_string_is_error() {
+    assert_single_token("\"foo", SyntaxKind::N_ERROR);
+}
+
+#[rstest]
+#[case("(", SyntaxKind::T_LPAREN)]
+#[case(")", SyntaxKind::T_RPAREN)]
+#[case(":", SyntaxKind::T_COLON)]
+#[case("::", SyntaxKind::T_COLON_COLON)]
+fn punctuation_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    assert_single_token(source, expected);
+}
+
+#[rstest]
+#[case("{", SyntaxKind::T_LBRACE)]
+#[case("}", SyntaxKind::T_RBRACE)]
+#[case("[", SyntaxKind::T_LBRACKET)]
+#[case("]", SyntaxKind::T_RBRACKET)]
+#[case(";", SyntaxKind::T_SEMI)]
+#[case(",", SyntaxKind::T_COMMA)]
+#[case(".", SyntaxKind::T_DOT)]
+fn delimiter_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    assert_single_token(source, expected);
+}
+
+#[rstest]
+#[case("|", SyntaxKind::T_PIPE)]
+#[case("&", SyntaxKind::T_AMP)]
+#[case("=", SyntaxKind::T_EQ)]
+#[case("==", SyntaxKind::T_EQEQ)]
+#[case(":-", SyntaxKind::T_IMPLIES)]
+#[case("%", SyntaxKind::T_PERCENT)]
+#[case("*", SyntaxKind::T_STAR)]
+#[case("/", SyntaxKind::T_SLASH)]
+#[case("+", SyntaxKind::T_PLUS)]
+#[case("-", SyntaxKind::T_MINUS)]
+#[case("->", SyntaxKind::T_ARROW)]
+#[case("=>", SyntaxKind::T_FAT_ARROW)]
+#[case("<=", SyntaxKind::T_LTE)]
+#[case("<=>", SyntaxKind::T_SPACESHIP)]
+#[case(">=", SyntaxKind::T_GTE)]
+#[case("<", SyntaxKind::T_LT)]
+#[case(">", SyntaxKind::T_GT)]
+#[case("!=", SyntaxKind::T_NEQ)]
+#[case(">>", SyntaxKind::T_SHR)]
+#[case("<<", SyntaxKind::T_SHL)]
+#[case("~", SyntaxKind::T_TILDE)]
+#[case("@", SyntaxKind::T_AT)]
+#[case("#", SyntaxKind::T_HASH)]
+#[case("'", SyntaxKind::T_APOSTROPHE)]
+fn operator_tokens(#[case] source: &str, #[case] expected: SyntaxKind) {
+    assert_single_token(source, expected);
+}
+
+#[test]
+fn negative_number_tokens() {
+    // The sign is a separate token; the number regex has no leading `-`.
+    assert_eq!(
+        kinds_of("-1"),
+        vec![SyntaxKind::T_MINUS, SyntaxKind::T_NUMBER]
+    );
+}
+
+#[test]
+fn escaped_string_token() {
+    assert_single_token("\"a\\\"b\"", SyntaxKind::T_STRING);
+}
+
+#[test]
+fn unterminated_comment_is_error() {
+    assert_single_token("/* comment", SyntaxKind::N_ERROR);
+}
+
+#[test]
+fn empty_input_produces_no_tokens() {
+    assert!(tokenize("").is_empty());
+}
+
+#[test]
+fn complex_expression() {
+    let kinds = kinds_of("R(a, b) :- Q(a) && S(b).");
+    // ensure we tokenise without errors and capture punctuation
+    assert!(!kinds.contains(&SyntaxKind::N_ERROR));
+    assert!(kinds.contains(&SyntaxKind::T_IMPLIES));
+    assert!(kinds.contains(&SyntaxKind::T_DOT));
+}