From 21ac3ff2932174152265e8ce3888c92e9763af80 Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 24 Jun 2025 19:24:23 +0100 Subject: [PATCH 1/8] Add tokenizer using logos --- Cargo.toml | 4 + docs/parser-plan.md | 5 +- src/lib.rs | 2 + src/tokenizer.rs | 237 ++++++++++++++++++++++++++++++++++++++++++++ tests/tokenizer.rs | 30 ++++++ 5 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 src/tokenizer.rs create mode 100644 tests/tokenizer.rs diff --git a/Cargo.toml b/Cargo.toml index 98929746..f63f20ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,10 @@ edition = "2024" rowan = { version = "0.15", default-features = false } num-derive = { version = "0.4", default-features = false } num-traits = { version = "0.2", default-features = false, features = ["std"] } +logos = { version = "0.13", default-features = false, features = ["export_derive"] } + +[dev-dependencies] +rstest = "0.18" [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/docs/parser-plan.md b/docs/parser-plan.md index 77f695f9..f64bd403 100644 --- a/docs/parser-plan.md +++ b/docs/parser-plan.md @@ -33,7 +33,10 @@ transparently. Use `chumsky`'s text utilities (or integrate a `logos` lexer if preferred) to convert the source text into a stream of `(SyntaxKind, Span)` pairs. Each span records byte offsets so that the resulting CST can precisely mirror the input. -Whitespace and comments should produce tokens so they can be preserved. +Whitespace and comments should produce tokens so they can be preserved. The +current implementation opts for a small `logos` lexer because it keeps the token +definitions declarative while still interoperating smoothly with `chumsky` +parsers. ## 4. 
Construct the Parser with `chumsky` diff --git a/src/lib.rs b/src/lib.rs index a02f3654..935320b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,5 +5,7 @@ #![forbid(unsafe_code)] pub mod language; +pub mod tokenizer; pub use language::{DdlogLanguage, SyntaxKind}; +pub use tokenizer::{Span, tokenize}; diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 00000000..c8d36503 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,237 @@ +//! Lexical analysis for `DDlog` source. +//! +//! This module exposes a `tokenize` function which converts raw source text into +//! a sequence of `(SyntaxKind, Span)` pairs. It uses the `logos` crate to +//! recognise tokens so that the CST can mirror the input exactly. + +use logos::Logos; + +use crate::SyntaxKind; + +/// Byte range for a token within the source. +pub type Span = std::ops::Range<usize>; + +#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] +enum Token { + #[regex(r"[ \t\r\n]+")] + Whitespace, + #[regex(r"/\*([^*]|\*+[^*/])*\*+/", priority = 2)] + #[regex(r"//[^\n]*")] + Comment, + #[regex(r"[A-Za-z_][A-Za-z0-9_]*")] + Ident, + #[regex(r"[0-9]+")] + Number, + #[regex(r#""([^"\\]|\\.)*""#)] + String, + #[token("(")] + LParen, + #[token(")")] + RParen, + #[token("{")] + LBrace, + #[token("}")] + RBrace, + #[token("[")] + LBracket, + #[token("]")] + RBracket, + #[token(";")] + Semi, + #[token(",")] + Comma, + #[token(".")] + Dot, + #[token("::")] + ColonColon, + #[token(":")] + Colon, + #[token("|")] + Pipe, + #[token("&")] + Amp, + #[token("==")] + EqEq, + #[token("=")] + Eq, + #[token(":-")] + Implies, + #[token("%")] + Percent, + #[token("*")] + Star, + #[token("/")] + Slash, + #[token("+")] + Plus, + #[token("-")] + Minus, + #[token("->")] + Arrow, + #[token("=>")] + FatArrow, + #[token("<=")] + Lte, + #[token("<=>")] + Spaceship, + #[token(">=")] + Gte, + #[token("<")] + Lt, + #[token(">")] + Gt, + #[token("!=")] + Neq, + #[token(">>")] + Shr, + #[token("<<")] + Shl, + #[token("~")] + Tilde, + #[token("@")] + 
At, + #[token("#")] + Hash, + #[token("'")] + Apostrophe, +} + +fn keyword_kind(ident: &str) -> Option { + Some(match ident { + "abstract" => SyntaxKind::K_ABSTRACT, + "Aggregate" => SyntaxKind::K_AGGREGATE, + "and" => SyntaxKind::K_AND, + "apply" => SyntaxKind::K_APPLY, + "as" => SyntaxKind::K_AS, + "async" => SyntaxKind::K_ASYNC, + "await" => SyntaxKind::K_AWAIT, + "become" => SyntaxKind::K_BECOME, + "bigint" => SyntaxKind::K_BIGINT, + "bit" => SyntaxKind::K_BIT, + "bool" => SyntaxKind::K_BOOL, + "box" => SyntaxKind::K_BOX, + "break" => SyntaxKind::K_BREAK, + "const" => SyntaxKind::K_CONST, + "continue" => SyntaxKind::K_CONTINUE, + "crate" => SyntaxKind::K_CRATE, + "do" => SyntaxKind::K_DO, + "double" => SyntaxKind::K_DOUBLE, + "dyn" => SyntaxKind::K_DYN, + "else" => SyntaxKind::K_ELSE, + "extern" => SyntaxKind::K_EXTERN, + "false" => SyntaxKind::K_FALSE, + "final" => SyntaxKind::K_FINAL, + "fn" => SyntaxKind::K_FN, + "FlatMap" => SyntaxKind::K_FLATMAP, + "float" => SyntaxKind::K_FLOAT, + "for" => SyntaxKind::K_FOR, + "function" => SyntaxKind::K_FUNCTION, + "if" => SyntaxKind::K_IF, + "impl" => SyntaxKind::K_IMPL, + "import" => SyntaxKind::K_IMPORT, + "in" => SyntaxKind::K_IN, + "input" => SyntaxKind::K_INPUT, + "Inspect" => SyntaxKind::K_INSPECT, + "let" => SyntaxKind::K_LET, + "loop" => SyntaxKind::K_LOOP, + "macro" => SyntaxKind::K_MACRO, + "match" => SyntaxKind::K_MATCH, + "mod" => SyntaxKind::K_MOD, + "move" => SyntaxKind::K_MOVE, + "multiset" => SyntaxKind::K_MULTISET, + "mut" => SyntaxKind::K_MUT, + "not" => SyntaxKind::K_NOT, + "or" => SyntaxKind::K_OR, + "override" => SyntaxKind::K_OVERRIDE, + "output" => SyntaxKind::K_OUTPUT, + "priv" => SyntaxKind::K_PRIV, + "pub" => SyntaxKind::K_PUB, + "ref" => SyntaxKind::K_REF, + "relation" => SyntaxKind::K_RELATION, + "return" => SyntaxKind::K_RETURN, + "self" => SyntaxKind::K_SELF, + "Self" => SyntaxKind::K_SELF_TYPE, + "signed" => SyntaxKind::K_SIGNED, + "skip" => SyntaxKind::K_SKIP, + "static" => 
SyntaxKind::K_STATIC, + "stream" => SyntaxKind::K_STREAM, + "struct" => SyntaxKind::K_STRUCT, + "super" => SyntaxKind::K_SUPER, + "trait" => SyntaxKind::K_TRAIT, + "transformer" => SyntaxKind::K_TRANSFORMER, + "try" => SyntaxKind::K_TRY, + "true" => SyntaxKind::K_TRUE, + "type" => SyntaxKind::K_TYPE, + "typedef" => SyntaxKind::K_TYPEDEF, + "typeof" => SyntaxKind::K_TYPEOF, + "_" => SyntaxKind::K_UNDERSCORE, + "unsafe" => SyntaxKind::K_UNSAFE, + "unsized" => SyntaxKind::K_UNSIZED, + "use" => SyntaxKind::K_USE, + "var" => SyntaxKind::K_VAR, + "virtual" => SyntaxKind::K_VIRTUAL, + "where" => SyntaxKind::K_WHERE, + "while" => SyntaxKind::K_WHILE, + "yield" => SyntaxKind::K_YIELD, + _ => return None, + }) +} + +/// Tokenise the provided `DDlog` source. +#[must_use] +pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> { + let mut lexer = Token::lexer(src); + let mut out = Vec::new(); + while let Some(result) = lexer.next() { + let span = lexer.span(); + let text = src.get(span.clone()).unwrap_or(""); + let Ok(token) = result else { + out.push((SyntaxKind::N_ERROR, span)); + continue; + }; + let kind = match token { + Token::Whitespace => SyntaxKind::T_WHITESPACE, + Token::Comment => SyntaxKind::T_COMMENT, + Token::Ident => keyword_kind(text).unwrap_or(SyntaxKind::T_IDENT), + Token::Number => SyntaxKind::T_NUMBER, + Token::String => SyntaxKind::T_STRING, + Token::LParen => SyntaxKind::T_LPAREN, + Token::RParen => SyntaxKind::T_RPAREN, + Token::LBrace => SyntaxKind::T_LBRACE, + Token::RBrace => SyntaxKind::T_RBRACE, + Token::LBracket => SyntaxKind::T_LBRACKET, + Token::RBracket => SyntaxKind::T_RBRACKET, + Token::Semi => SyntaxKind::T_SEMI, + Token::Comma => SyntaxKind::T_COMMA, + Token::Dot => SyntaxKind::T_DOT, + Token::ColonColon => SyntaxKind::T_COLON_COLON, + Token::Colon => SyntaxKind::T_COLON, + Token::Pipe => SyntaxKind::T_PIPE, + Token::Amp => SyntaxKind::T_AMP, + Token::EqEq => SyntaxKind::T_EQEQ, + Token::Eq => SyntaxKind::T_EQ, + Token::Implies => 
SyntaxKind::T_IMPLIES, + Token::Percent => SyntaxKind::T_PERCENT, + Token::Star => SyntaxKind::T_STAR, + Token::Slash => SyntaxKind::T_SLASH, + Token::Plus => SyntaxKind::T_PLUS, + Token::Minus => SyntaxKind::T_MINUS, + Token::Arrow => SyntaxKind::T_ARROW, + Token::FatArrow => SyntaxKind::T_FAT_ARROW, + Token::Lte => SyntaxKind::T_LTE, + Token::Spaceship => SyntaxKind::T_SPACESHIP, + Token::Gte => SyntaxKind::T_GTE, + Token::Lt => SyntaxKind::T_LT, + Token::Gt => SyntaxKind::T_GT, + Token::Neq => SyntaxKind::T_NEQ, + Token::Shr => SyntaxKind::T_SHR, + Token::Shl => SyntaxKind::T_SHL, + Token::Tilde => SyntaxKind::T_TILDE, + Token::At => SyntaxKind::T_AT, + Token::Hash => SyntaxKind::T_HASH, + Token::Apostrophe => SyntaxKind::T_APOSTROPHE, + }; + out.push((kind, span)); + } + out +} diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs new file mode 100644 index 00000000..265b72a7 --- /dev/null +++ b/tests/tokenizer.rs @@ -0,0 +1,30 @@ +use ddlint::{SyntaxKind, tokenize}; +use rstest::{fixture, rstest}; + +#[fixture] +fn simple_input() -> &'static str { + "input relation R(x: u32)" +} + +#[rstest] +#[case("input", vec![SyntaxKind::K_INPUT])] +#[case("relation", vec![SyntaxKind::K_RELATION])] +#[case("R", vec![SyntaxKind::T_IDENT])] +fn single_tokens(#[case] source: &str, #[case] expected: Vec) { + let tokens = tokenize(source); + let kinds: Vec = tokens.iter().map(|(k, _)| *k).collect(); + assert_eq!(kinds, expected); +} + +#[rstest] +fn token_spans(simple_input: &str) { + let tokens = tokenize(simple_input); + for (kind, span) in tokens { + let text = simple_input.get(span.clone()).unwrap_or(""); + if let SyntaxKind::K_INPUT = kind { + assert_eq!(text, "input"); + } else if let SyntaxKind::K_RELATION = kind { + assert_eq!(text, "relation"); + } + } +} From cf35d9fe9bfd0cbb16ddc721b63c48093c4ff4bb Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 24 Jun 2025 19:46:38 +0100 Subject: [PATCH 2/8] Add tokenizer tests --- tests/tokenizer.rs | 41 
+++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 265b72a7..427fa36d 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -28,3 +28,44 @@ fn token_spans(simple_input: &str) { } } } + +#[rstest] +#[case("123", SyntaxKind::T_NUMBER)] +#[case("\"foo\"", SyntaxKind::T_STRING)] +fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("no token")); + assert_eq!(first.0, expected); +} + +#[rstest] +#[case(" ", SyntaxKind::T_WHITESPACE)] +#[case("\n", SyntaxKind::T_WHITESPACE)] +#[case("/* c */", SyntaxKind::T_COMMENT)] +#[case("// line", SyntaxKind::T_COMMENT)] +fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("no token")); + assert_eq!(first.0, expected); +} + +#[rstest] +#[case("?")] +#[case("$")] +fn unknown_character_produces_error(#[case] source: &str) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("no token")); + assert_eq!(first.0, SyntaxKind::N_ERROR); +} From 19f9ed12a08d32a61b5c7d69140addc93d32d430 Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 24 Jun 2025 22:38:46 +0100 Subject: [PATCH 3/8] Add tokenizer mermaid diagram --- docs/parser-plan.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/parser-plan.md b/docs/parser-plan.md index f64bd403..b5472188 100644 --- a/docs/parser-plan.md +++ b/docs/parser-plan.md @@ -38,6 +38,22 @@ current implementation opts for a small `logos` lexer because it keeps the token definitions declarative while still interoperating smoothly with `chumsky` parsers. 
+```mermaid +sequenceDiagram + participant Client + participant Tokenizer + participant LogosLexer + participant SyntaxKind + Client->>Tokenizer: tokenize(src: &str) + Tokenizer->>LogosLexer: Token::lexer(src) + loop for each token + LogosLexer-->>Tokenizer: next() -> Token + span + Tokenizer->>SyntaxKind: map Token to SyntaxKind + Tokenizer-->>Tokenizer: collect (SyntaxKind, Span) + end + Tokenizer-->>Client: Vec<(SyntaxKind, Span)> +``` + ## 4. Construct the Parser with `chumsky` 1. Express each grammar rule using `chumsky` combinators. The parser should From e2db157cc6534b8b48b99031082da770c6d7c3a0 Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 24 Jun 2025 22:51:41 +0100 Subject: [PATCH 4/8] Add module docs and expose tokenizer --- src/lib.rs | 2 +- tests/tokenizer.rs | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 935320b5..143fe57e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ //! Library crate for ddlint. //! -//! Currently exposes only the parser language definitions. +//! Exposes parser language definitions and lexical analysis functionality. #![forbid(unsafe_code)] diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 427fa36d..a99eb898 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -1,3 +1,9 @@ +//! Integration tests for the tokenizer module. +//! +//! Tests verify that the logos-based lexer correctly tokenises `DDlog` source +//! code into `(SyntaxKind, Span)` pairs, covering keywords, literals, trivia, +//! and error cases. 
+ use ddlint::{SyntaxKind, tokenize}; use rstest::{fixture, rstest}; From 9625b8cb4a11fb00a663e072c529acc311e0e11b Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 24 Jun 2025 23:00:11 +0100 Subject: [PATCH 5/8] Refine tokenizer tests and dependencies --- Cargo.toml | 4 ++-- tests/tokenizer.rs | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f63f20ef..3da02b8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,10 +7,10 @@ edition = "2024" rowan = { version = "0.15", default-features = false } num-derive = { version = "0.4", default-features = false } num-traits = { version = "0.2", default-features = false, features = ["std"] } -logos = { version = "0.13", default-features = false, features = ["export_derive"] } +logos = { version = ">=0.13.0, <0.14.0", default-features = false, features = ["export_derive"] } [dev-dependencies] -rstest = "0.18" +rstest = ">=0.18.0, <0.19.0" [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index a99eb898..9192bdd9 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -75,3 +75,37 @@ fn unknown_character_produces_error(#[case] source: &str) { .unwrap_or_else(|| panic!("no token")); assert_eq!(first.0, SyntaxKind::N_ERROR); } + +#[rstest] +#[case("(", SyntaxKind::T_LPAREN)] +#[case(")", SyntaxKind::T_RPAREN)] +#[case("==", SyntaxKind::T_EQEQ)] +#[case(":", SyntaxKind::T_COLON)] +#[case("::", SyntaxKind::T_COLON_COLON)] +#[case("->", SyntaxKind::T_ARROW)] +#[case("=>", SyntaxKind::T_FAT_ARROW)] +fn punctuation_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("no token")); + assert_eq!(first.0, expected); +} + +#[test] +fn empty_input_produces_no_tokens() { + let tokens = tokenize(""); + assert!(tokens.is_empty()); +} + +#[test] +fn 
complex_expression() { + let src = "R(a, b) :- Q(a) && S(b)."; + let tokens = tokenize(src); + // ensure we tokenise without errors and capture punctuation + assert!(tokens.iter().all(|(k, _)| *k != SyntaxKind::N_ERROR)); + assert!(tokens.iter().any(|(k, _)| *k == SyntaxKind::T_IMPLIES)); + assert!(tokens.iter().any(|(k, _)| *k == SyntaxKind::T_DOT)); +} From 23a520798dae6bdb0e73e9f86cd4dd484fcc864c Mon Sep 17 00:00:00 2001 From: Leynos Date: Wed, 25 Jun 2025 00:33:22 +0100 Subject: [PATCH 6/8] Improve tokenizer and tests --- docs/ddlint-design-and-road-map.md | 1 + src/tokenizer.rs | 171 +++++++++++++++-------------- tests/tokenizer.rs | 37 ++++++- 3 files changed, 124 insertions(+), 85 deletions(-) diff --git a/docs/ddlint-design-and-road-map.md b/docs/ddlint-design-and-road-map.md index 2a9d32c4..b1aa5ed7 100644 --- a/docs/ddlint-design-and-road-map.md +++ b/docs/ddlint-design-and-road-map.md @@ -212,6 +212,7 @@ enum SyntaxKind { T_IDENT, // An identifier, like a relation or variable name T_STRING_LIT, // A string literal T_NUMBER, // A numeric literal + // decimal, floating-point, hex, binary, or octal T_LPAREN, // '(' T_RPAREN, // ')' T_LBRACE, // '{' diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c8d36503..ff71fbd4 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -5,6 +5,8 @@ //! recognise tokens so that the CST can mirror the input exactly. use logos::Logos; +use std::collections::HashMap; +use std::sync::LazyLock; use crate::SyntaxKind; @@ -20,7 +22,7 @@ enum Token { Comment, #[regex(r"[A-Za-z_][A-Za-z0-9_]*")] Ident, - #[regex(r"[0-9]+")] + #[regex(r"0[xX][0-9a-fA-F]+|0[bB][01]+|0[oO][0-7]+|[0-9]+(?:\\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")] Number, #[regex(r#""([^"\\]|\\.)*""#)] String, @@ -96,92 +98,101 @@ enum Token { Apostrophe, } +/// Maps identifier strings to their keyword `SyntaxKind`. +/// +/// Returns `Some(kind)` if `ident` is a recognised `DDlog` keyword, or `None` +/// otherwise. 
A static map avoids a long match statement and allows O(1) +/// lookups. +static KEYWORDS: LazyLock> = LazyLock::new(|| { + HashMap::from([ + ("abstract", SyntaxKind::K_ABSTRACT), + ("Aggregate", SyntaxKind::K_AGGREGATE), + ("and", SyntaxKind::K_AND), + ("apply", SyntaxKind::K_APPLY), + ("as", SyntaxKind::K_AS), + ("async", SyntaxKind::K_ASYNC), + ("await", SyntaxKind::K_AWAIT), + ("become", SyntaxKind::K_BECOME), + ("bigint", SyntaxKind::K_BIGINT), + ("bit", SyntaxKind::K_BIT), + ("bool", SyntaxKind::K_BOOL), + ("box", SyntaxKind::K_BOX), + ("break", SyntaxKind::K_BREAK), + ("const", SyntaxKind::K_CONST), + ("continue", SyntaxKind::K_CONTINUE), + ("crate", SyntaxKind::K_CRATE), + ("do", SyntaxKind::K_DO), + ("double", SyntaxKind::K_DOUBLE), + ("dyn", SyntaxKind::K_DYN), + ("else", SyntaxKind::K_ELSE), + ("extern", SyntaxKind::K_EXTERN), + ("false", SyntaxKind::K_FALSE), + ("final", SyntaxKind::K_FINAL), + ("fn", SyntaxKind::K_FN), + ("FlatMap", SyntaxKind::K_FLATMAP), + ("float", SyntaxKind::K_FLOAT), + ("for", SyntaxKind::K_FOR), + ("function", SyntaxKind::K_FUNCTION), + ("if", SyntaxKind::K_IF), + ("impl", SyntaxKind::K_IMPL), + ("import", SyntaxKind::K_IMPORT), + ("in", SyntaxKind::K_IN), + ("input", SyntaxKind::K_INPUT), + ("Inspect", SyntaxKind::K_INSPECT), + ("let", SyntaxKind::K_LET), + ("loop", SyntaxKind::K_LOOP), + ("macro", SyntaxKind::K_MACRO), + ("match", SyntaxKind::K_MATCH), + ("mod", SyntaxKind::K_MOD), + ("move", SyntaxKind::K_MOVE), + ("multiset", SyntaxKind::K_MULTISET), + ("mut", SyntaxKind::K_MUT), + ("not", SyntaxKind::K_NOT), + ("or", SyntaxKind::K_OR), + ("override", SyntaxKind::K_OVERRIDE), + ("output", SyntaxKind::K_OUTPUT), + ("priv", SyntaxKind::K_PRIV), + ("pub", SyntaxKind::K_PUB), + ("ref", SyntaxKind::K_REF), + ("relation", SyntaxKind::K_RELATION), + ("return", SyntaxKind::K_RETURN), + ("self", SyntaxKind::K_SELF), + ("Self", SyntaxKind::K_SELF_TYPE), + ("signed", SyntaxKind::K_SIGNED), + ("skip", SyntaxKind::K_SKIP), + ("static", 
SyntaxKind::K_STATIC), + ("stream", SyntaxKind::K_STREAM), + ("struct", SyntaxKind::K_STRUCT), + ("super", SyntaxKind::K_SUPER), + ("trait", SyntaxKind::K_TRAIT), + ("transformer", SyntaxKind::K_TRANSFORMER), + ("try", SyntaxKind::K_TRY), + ("true", SyntaxKind::K_TRUE), + ("type", SyntaxKind::K_TYPE), + ("typedef", SyntaxKind::K_TYPEDEF), + ("typeof", SyntaxKind::K_TYPEOF), + ("_", SyntaxKind::K_UNDERSCORE), + ("unsafe", SyntaxKind::K_UNSAFE), + ("unsized", SyntaxKind::K_UNSIZED), + ("use", SyntaxKind::K_USE), + ("var", SyntaxKind::K_VAR), + ("virtual", SyntaxKind::K_VIRTUAL), + ("where", SyntaxKind::K_WHERE), + ("while", SyntaxKind::K_WHILE), + ("yield", SyntaxKind::K_YIELD), + ]) +}); + fn keyword_kind(ident: &str) -> Option { - Some(match ident { - "abstract" => SyntaxKind::K_ABSTRACT, - "Aggregate" => SyntaxKind::K_AGGREGATE, - "and" => SyntaxKind::K_AND, - "apply" => SyntaxKind::K_APPLY, - "as" => SyntaxKind::K_AS, - "async" => SyntaxKind::K_ASYNC, - "await" => SyntaxKind::K_AWAIT, - "become" => SyntaxKind::K_BECOME, - "bigint" => SyntaxKind::K_BIGINT, - "bit" => SyntaxKind::K_BIT, - "bool" => SyntaxKind::K_BOOL, - "box" => SyntaxKind::K_BOX, - "break" => SyntaxKind::K_BREAK, - "const" => SyntaxKind::K_CONST, - "continue" => SyntaxKind::K_CONTINUE, - "crate" => SyntaxKind::K_CRATE, - "do" => SyntaxKind::K_DO, - "double" => SyntaxKind::K_DOUBLE, - "dyn" => SyntaxKind::K_DYN, - "else" => SyntaxKind::K_ELSE, - "extern" => SyntaxKind::K_EXTERN, - "false" => SyntaxKind::K_FALSE, - "final" => SyntaxKind::K_FINAL, - "fn" => SyntaxKind::K_FN, - "FlatMap" => SyntaxKind::K_FLATMAP, - "float" => SyntaxKind::K_FLOAT, - "for" => SyntaxKind::K_FOR, - "function" => SyntaxKind::K_FUNCTION, - "if" => SyntaxKind::K_IF, - "impl" => SyntaxKind::K_IMPL, - "import" => SyntaxKind::K_IMPORT, - "in" => SyntaxKind::K_IN, - "input" => SyntaxKind::K_INPUT, - "Inspect" => SyntaxKind::K_INSPECT, - "let" => SyntaxKind::K_LET, - "loop" => SyntaxKind::K_LOOP, - "macro" => SyntaxKind::K_MACRO, 
- "match" => SyntaxKind::K_MATCH, - "mod" => SyntaxKind::K_MOD, - "move" => SyntaxKind::K_MOVE, - "multiset" => SyntaxKind::K_MULTISET, - "mut" => SyntaxKind::K_MUT, - "not" => SyntaxKind::K_NOT, - "or" => SyntaxKind::K_OR, - "override" => SyntaxKind::K_OVERRIDE, - "output" => SyntaxKind::K_OUTPUT, - "priv" => SyntaxKind::K_PRIV, - "pub" => SyntaxKind::K_PUB, - "ref" => SyntaxKind::K_REF, - "relation" => SyntaxKind::K_RELATION, - "return" => SyntaxKind::K_RETURN, - "self" => SyntaxKind::K_SELF, - "Self" => SyntaxKind::K_SELF_TYPE, - "signed" => SyntaxKind::K_SIGNED, - "skip" => SyntaxKind::K_SKIP, - "static" => SyntaxKind::K_STATIC, - "stream" => SyntaxKind::K_STREAM, - "struct" => SyntaxKind::K_STRUCT, - "super" => SyntaxKind::K_SUPER, - "trait" => SyntaxKind::K_TRAIT, - "transformer" => SyntaxKind::K_TRANSFORMER, - "try" => SyntaxKind::K_TRY, - "true" => SyntaxKind::K_TRUE, - "type" => SyntaxKind::K_TYPE, - "typedef" => SyntaxKind::K_TYPEDEF, - "typeof" => SyntaxKind::K_TYPEOF, - "_" => SyntaxKind::K_UNDERSCORE, - "unsafe" => SyntaxKind::K_UNSAFE, - "unsized" => SyntaxKind::K_UNSIZED, - "use" => SyntaxKind::K_USE, - "var" => SyntaxKind::K_VAR, - "virtual" => SyntaxKind::K_VIRTUAL, - "where" => SyntaxKind::K_WHERE, - "while" => SyntaxKind::K_WHILE, - "yield" => SyntaxKind::K_YIELD, - _ => return None, - }) + KEYWORDS.get(ident).copied() } /// Tokenise the provided `DDlog` source. 
#[must_use] pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> { let mut lexer = Token::lexer(src); - let mut out = Vec::new(); + let estimated_tokens = src.len() >> 2; // roughly four chars per token + let mut out = Vec::with_capacity(estimated_tokens); while let Some(result) = lexer.next() { let span = lexer.span(); let text = src.get(span.clone()).unwrap_or(""); diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 9192bdd9..12718010 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -26,7 +26,9 @@ fn single_tokens(#[case] source: &str, #[case] expected: Vec) { fn token_spans(simple_input: &str) { let tokens = tokenize(simple_input); for (kind, span) in tokens { - let text = simple_input.get(span.clone()).unwrap_or(""); + let text = simple_input + .get(span.clone()) + .unwrap_or_else(|| panic!("span should be valid for input")); if let SyntaxKind::K_INPUT = kind { assert_eq!(text, "input"); } else if let SyntaxKind::K_RELATION = kind { @@ -44,7 +46,22 @@ fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("no token")); + .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); + assert_eq!(first.0, expected); +} + +#[rstest] +#[case("0xFF", SyntaxKind::T_NUMBER)] +#[case("0b1010", SyntaxKind::T_NUMBER)] +#[case("0o77", SyntaxKind::T_NUMBER)] +#[case("1e10", SyntaxKind::T_NUMBER)] +fn extended_number_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); assert_eq!(first.0, expected); } @@ -59,7 +76,7 @@ fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("no token")); + .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); assert_eq!(first.0, 
expected); } @@ -72,10 +89,20 @@ fn unknown_character_produces_error(#[case] source: &str) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("no token")); + .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); assert_eq!(first.0, SyntaxKind::N_ERROR); } +#[test] +fn unterminated_string_is_error() { + let tokens = tokenize("\"foo"); + assert_eq!(tokens.len(), 1); + match tokens.first() { + Some(t) => assert_eq!(t.0, SyntaxKind::N_ERROR), + None => panic!("tokenizer should produce at least one token"), + } +} + #[rstest] #[case("(", SyntaxKind::T_LPAREN)] #[case(")", SyntaxKind::T_RPAREN)] @@ -90,7 +117,7 @@ fn punctuation_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("no token")); + .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); assert_eq!(first.0, expected); } From 373ae27401c4cae133f15e640789d79d003c998a Mon Sep 17 00:00:00 2001 From: Leynos Date: Wed, 25 Jun 2025 02:05:41 +0100 Subject: [PATCH 7/8] Add comprehensive tokenizer tests --- tests/tokenizer.rs | 97 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 10 deletions(-) diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index 12718010..b3051a72 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -4,6 +4,8 @@ //! code into `(SyntaxKind, Span)` pairs, covering keywords, literals, trivia, //! and error cases. 
+#![expect(clippy::expect_used, reason = "tests assert exact behaviour")] + use ddlint::{SyntaxKind, tokenize}; use rstest::{fixture, rstest}; @@ -28,7 +30,7 @@ fn token_spans(simple_input: &str) { for (kind, span) in tokens { let text = simple_input .get(span.clone()) - .unwrap_or_else(|| panic!("span should be valid for input")); + .expect("span should be valid for input"); if let SyntaxKind::K_INPUT = kind { assert_eq!(text, "input"); } else if let SyntaxKind::K_RELATION = kind { @@ -46,7 +48,7 @@ fn literal_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); + .expect("tokenizer should produce at least one token"); assert_eq!(first.0, expected); } @@ -61,13 +63,14 @@ fn extended_number_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); + .expect("tokenizer should produce at least one token"); assert_eq!(first.0, expected); } #[rstest] #[case(" ", SyntaxKind::T_WHITESPACE)] #[case("\n", SyntaxKind::T_WHITESPACE)] +#[case("\t", SyntaxKind::T_WHITESPACE)] #[case("/* c */", SyntaxKind::T_COMMENT)] #[case("// line", SyntaxKind::T_COMMENT)] fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { @@ -76,7 +79,7 @@ fn trivia_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); + .expect("tokenizer should produce at least one token"); assert_eq!(first.0, expected); } @@ -89,7 +92,7 @@ fn unknown_character_produces_error(#[case] source: &str) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); + .expect("tokenizer should produce at least one token"); assert_eq!(first.0, SyntaxKind::N_ERROR); } @@ -97,10 +100,10 @@ fn 
unknown_character_produces_error(#[case] source: &str) { fn unterminated_string_is_error() { let tokens = tokenize("\"foo"); assert_eq!(tokens.len(), 1); - match tokens.first() { - Some(t) => assert_eq!(t.0, SyntaxKind::N_ERROR), - None => panic!("tokenizer should produce at least one token"), - } + let first = tokens + .first() + .expect("tokenizer should produce at least one token"); + assert_eq!(first.0, SyntaxKind::N_ERROR); } #[rstest] @@ -117,10 +120,84 @@ fn punctuation_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let first = tokens .first() .cloned() - .unwrap_or_else(|| panic!("tokenizer should produce at least one token")); + .expect("tokenizer should produce at least one token"); + assert_eq!(first.0, expected); +} + +#[rstest] +#[case("{", SyntaxKind::T_LBRACE)] +#[case("}", SyntaxKind::T_RBRACE)] +#[case("[", SyntaxKind::T_LBRACKET)] +#[case("]", SyntaxKind::T_RBRACKET)] +#[case(";", SyntaxKind::T_SEMI)] +#[case(",", SyntaxKind::T_COMMA)] +#[case(".", SyntaxKind::T_DOT)] +fn delimiter_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .expect("tokenizer should produce at least one token"); + assert_eq!(first.0, expected); +} + +#[rstest] +#[case("|", SyntaxKind::T_PIPE)] +#[case("&", SyntaxKind::T_AMP)] +#[case("=", SyntaxKind::T_EQ)] +#[case("==", SyntaxKind::T_EQEQ)] +#[case(":-", SyntaxKind::T_IMPLIES)] +#[case("%", SyntaxKind::T_PERCENT)] +#[case("*", SyntaxKind::T_STAR)] +#[case("/", SyntaxKind::T_SLASH)] +#[case("+", SyntaxKind::T_PLUS)] +#[case("-", SyntaxKind::T_MINUS)] +#[case("->", SyntaxKind::T_ARROW)] +#[case("=>", SyntaxKind::T_FAT_ARROW)] +#[case("<=", SyntaxKind::T_LTE)] +#[case("<=>", SyntaxKind::T_SPACESHIP)] +#[case(">=", SyntaxKind::T_GTE)] +#[case("<", SyntaxKind::T_LT)] +#[case(">", SyntaxKind::T_GT)] +#[case("!=", SyntaxKind::T_NEQ)] +#[case(">>", SyntaxKind::T_SHR)] +#[case("<<", 
SyntaxKind::T_SHL)] +#[case("~", SyntaxKind::T_TILDE)] +#[case("@", SyntaxKind::T_AT)] +#[case("#", SyntaxKind::T_HASH)] +#[case("'", SyntaxKind::T_APOSTROPHE)] +fn operator_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { + let tokens = tokenize(source); + assert_eq!(tokens.len(), 1); + let first = tokens + .first() + .cloned() + .expect("tokenizer should produce at least one token"); assert_eq!(first.0, expected); } +#[test] +fn negative_number_tokens() { + let tokens = tokenize("-1"); + let kinds: Vec = tokens.iter().map(|(k, _)| *k).collect(); + assert_eq!(kinds, vec![SyntaxKind::T_MINUS, SyntaxKind::T_NUMBER]); +} + +#[test] +fn escaped_string_token() { + let tokens = tokenize("\"a\\\"b\""); + assert_eq!(tokens.len(), 1); + assert_eq!(tokens.first().expect("token").0, SyntaxKind::T_STRING); +} + +#[test] +fn unterminated_comment_is_error() { + let tokens = tokenize("/* comment"); + assert_eq!(tokens.len(), 1); + assert_eq!(tokens.first().expect("token").0, SyntaxKind::N_ERROR); +} + #[test] fn empty_input_produces_no_tokens() { let tokens = tokenize(""); From 6844bf617ce5f0877b63e3db4b081f8192835324 Mon Sep 17 00:00:00 2001 From: Leynos Date: Wed, 25 Jun 2025 03:03:13 +0100 Subject: [PATCH 8/8] Optimise keyword lookup and refine tests --- Cargo.toml | 1 + docs/parser-plan.md | 2 +- src/tokenizer.rs | 161 ++++++++++++++++++++++---------------------- tests/tokenizer.rs | 19 ++++-- 4 files changed, 95 insertions(+), 88 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3da02b8b..fe8d77f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ rowan = { version = "0.15", default-features = false } num-derive = { version = "0.4", default-features = false } num-traits = { version = "0.2", default-features = false, features = ["std"] } logos = { version = ">=0.13.0, <0.14.0", default-features = false, features = ["export_derive"] } +phf = { version = ">=0.11.0, <0.12.0", default-features = false, features = ["macros"] } [dev-dependencies] rstest = 
">=0.18.0, <0.19.0" diff --git a/docs/parser-plan.md b/docs/parser-plan.md index b5472188..182eb6fe 100644 --- a/docs/parser-plan.md +++ b/docs/parser-plan.md @@ -36,7 +36,7 @@ records byte offsets so that the resulting CST can precisely mirror the input. Whitespace and comments should produce tokens so they can be preserved. The current implementation opts for a small `logos` lexer because it keeps the token definitions declarative while still interoperating smoothly with `chumsky` -parsers. +parsers. Keyword lookups use a `phf::Map` for zero-cost perfect hashing. ```mermaid sequenceDiagram diff --git a/src/tokenizer.rs b/src/tokenizer.rs index ff71fbd4..efb79be0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -5,8 +5,7 @@ //! recognise tokens so that the CST can mirror the input exactly. use logos::Logos; -use std::collections::HashMap; -use std::sync::LazyLock; +use phf::phf_map; use crate::SyntaxKind; @@ -22,7 +21,7 @@ enum Token { Comment, #[regex(r"[A-Za-z_][A-Za-z0-9_]*")] Ident, - #[regex(r"0[xX][0-9a-fA-F]+|0[bB][01]+|0[oO][0-7]+|[0-9]+(?:\\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")] + #[regex(r"0[xX][0-9a-fA-F]+|0[bB][01]+|0[oO][0-7]+|[0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")] Number, #[regex(r#""([^"\\]|\\.)*""#)] String, @@ -103,85 +102,83 @@ enum Token { /// Returns `Some(kind)` if `ident` is a recognised `DDlog` keyword, or `None` /// otherwise. A static map avoids a long match statement and allows O(1) /// lookups. 
-static KEYWORDS: LazyLock> = LazyLock::new(|| { - HashMap::from([ - ("abstract", SyntaxKind::K_ABSTRACT), - ("Aggregate", SyntaxKind::K_AGGREGATE), - ("and", SyntaxKind::K_AND), - ("apply", SyntaxKind::K_APPLY), - ("as", SyntaxKind::K_AS), - ("async", SyntaxKind::K_ASYNC), - ("await", SyntaxKind::K_AWAIT), - ("become", SyntaxKind::K_BECOME), - ("bigint", SyntaxKind::K_BIGINT), - ("bit", SyntaxKind::K_BIT), - ("bool", SyntaxKind::K_BOOL), - ("box", SyntaxKind::K_BOX), - ("break", SyntaxKind::K_BREAK), - ("const", SyntaxKind::K_CONST), - ("continue", SyntaxKind::K_CONTINUE), - ("crate", SyntaxKind::K_CRATE), - ("do", SyntaxKind::K_DO), - ("double", SyntaxKind::K_DOUBLE), - ("dyn", SyntaxKind::K_DYN), - ("else", SyntaxKind::K_ELSE), - ("extern", SyntaxKind::K_EXTERN), - ("false", SyntaxKind::K_FALSE), - ("final", SyntaxKind::K_FINAL), - ("fn", SyntaxKind::K_FN), - ("FlatMap", SyntaxKind::K_FLATMAP), - ("float", SyntaxKind::K_FLOAT), - ("for", SyntaxKind::K_FOR), - ("function", SyntaxKind::K_FUNCTION), - ("if", SyntaxKind::K_IF), - ("impl", SyntaxKind::K_IMPL), - ("import", SyntaxKind::K_IMPORT), - ("in", SyntaxKind::K_IN), - ("input", SyntaxKind::K_INPUT), - ("Inspect", SyntaxKind::K_INSPECT), - ("let", SyntaxKind::K_LET), - ("loop", SyntaxKind::K_LOOP), - ("macro", SyntaxKind::K_MACRO), - ("match", SyntaxKind::K_MATCH), - ("mod", SyntaxKind::K_MOD), - ("move", SyntaxKind::K_MOVE), - ("multiset", SyntaxKind::K_MULTISET), - ("mut", SyntaxKind::K_MUT), - ("not", SyntaxKind::K_NOT), - ("or", SyntaxKind::K_OR), - ("override", SyntaxKind::K_OVERRIDE), - ("output", SyntaxKind::K_OUTPUT), - ("priv", SyntaxKind::K_PRIV), - ("pub", SyntaxKind::K_PUB), - ("ref", SyntaxKind::K_REF), - ("relation", SyntaxKind::K_RELATION), - ("return", SyntaxKind::K_RETURN), - ("self", SyntaxKind::K_SELF), - ("Self", SyntaxKind::K_SELF_TYPE), - ("signed", SyntaxKind::K_SIGNED), - ("skip", SyntaxKind::K_SKIP), - ("static", SyntaxKind::K_STATIC), - ("stream", SyntaxKind::K_STREAM), - ("struct", 
SyntaxKind::K_STRUCT), - ("super", SyntaxKind::K_SUPER), - ("trait", SyntaxKind::K_TRAIT), - ("transformer", SyntaxKind::K_TRANSFORMER), - ("try", SyntaxKind::K_TRY), - ("true", SyntaxKind::K_TRUE), - ("type", SyntaxKind::K_TYPE), - ("typedef", SyntaxKind::K_TYPEDEF), - ("typeof", SyntaxKind::K_TYPEOF), - ("_", SyntaxKind::K_UNDERSCORE), - ("unsafe", SyntaxKind::K_UNSAFE), - ("unsized", SyntaxKind::K_UNSIZED), - ("use", SyntaxKind::K_USE), - ("var", SyntaxKind::K_VAR), - ("virtual", SyntaxKind::K_VIRTUAL), - ("where", SyntaxKind::K_WHERE), - ("while", SyntaxKind::K_WHILE), - ("yield", SyntaxKind::K_YIELD), - ]) -}); +static KEYWORDS: phf::Map<&'static str, SyntaxKind> = phf_map! { + "abstract" => SyntaxKind::K_ABSTRACT, + "Aggregate" => SyntaxKind::K_AGGREGATE, + "and" => SyntaxKind::K_AND, + "apply" => SyntaxKind::K_APPLY, + "as" => SyntaxKind::K_AS, + "async" => SyntaxKind::K_ASYNC, + "await" => SyntaxKind::K_AWAIT, + "become" => SyntaxKind::K_BECOME, + "bigint" => SyntaxKind::K_BIGINT, + "bit" => SyntaxKind::K_BIT, + "bool" => SyntaxKind::K_BOOL, + "box" => SyntaxKind::K_BOX, + "break" => SyntaxKind::K_BREAK, + "const" => SyntaxKind::K_CONST, + "continue" => SyntaxKind::K_CONTINUE, + "crate" => SyntaxKind::K_CRATE, + "do" => SyntaxKind::K_DO, + "double" => SyntaxKind::K_DOUBLE, + "dyn" => SyntaxKind::K_DYN, + "else" => SyntaxKind::K_ELSE, + "extern" => SyntaxKind::K_EXTERN, + "false" => SyntaxKind::K_FALSE, + "final" => SyntaxKind::K_FINAL, + "fn" => SyntaxKind::K_FN, + "FlatMap" => SyntaxKind::K_FLATMAP, + "float" => SyntaxKind::K_FLOAT, + "for" => SyntaxKind::K_FOR, + "function" => SyntaxKind::K_FUNCTION, + "if" => SyntaxKind::K_IF, + "impl" => SyntaxKind::K_IMPL, + "import" => SyntaxKind::K_IMPORT, + "in" => SyntaxKind::K_IN, + "input" => SyntaxKind::K_INPUT, + "Inspect" => SyntaxKind::K_INSPECT, + "let" => SyntaxKind::K_LET, + "loop" => SyntaxKind::K_LOOP, + "macro" => SyntaxKind::K_MACRO, + "match" => SyntaxKind::K_MATCH, + "mod" => SyntaxKind::K_MOD, + 
"move" => SyntaxKind::K_MOVE, + "multiset" => SyntaxKind::K_MULTISET, + "mut" => SyntaxKind::K_MUT, + "not" => SyntaxKind::K_NOT, + "or" => SyntaxKind::K_OR, + "override" => SyntaxKind::K_OVERRIDE, + "output" => SyntaxKind::K_OUTPUT, + "priv" => SyntaxKind::K_PRIV, + "pub" => SyntaxKind::K_PUB, + "ref" => SyntaxKind::K_REF, + "relation" => SyntaxKind::K_RELATION, + "return" => SyntaxKind::K_RETURN, + "self" => SyntaxKind::K_SELF, + "Self" => SyntaxKind::K_SELF_TYPE, + "signed" => SyntaxKind::K_SIGNED, + "skip" => SyntaxKind::K_SKIP, + "static" => SyntaxKind::K_STATIC, + "stream" => SyntaxKind::K_STREAM, + "struct" => SyntaxKind::K_STRUCT, + "super" => SyntaxKind::K_SUPER, + "trait" => SyntaxKind::K_TRAIT, + "transformer" => SyntaxKind::K_TRANSFORMER, + "try" => SyntaxKind::K_TRY, + "true" => SyntaxKind::K_TRUE, + "type" => SyntaxKind::K_TYPE, + "typedef" => SyntaxKind::K_TYPEDEF, + "typeof" => SyntaxKind::K_TYPEOF, + "_" => SyntaxKind::K_UNDERSCORE, + "unsafe" => SyntaxKind::K_UNSAFE, + "unsized" => SyntaxKind::K_UNSIZED, + "use" => SyntaxKind::K_USE, + "var" => SyntaxKind::K_VAR, + "virtual" => SyntaxKind::K_VIRTUAL, + "where" => SyntaxKind::K_WHERE, + "while" => SyntaxKind::K_WHILE, + "yield" => SyntaxKind::K_YIELD, +}; fn keyword_kind(ident: &str) -> Option { KEYWORDS.get(ident).copied() diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index b3051a72..5add1dda 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -109,11 +109,8 @@ fn unterminated_string_is_error() { #[rstest] #[case("(", SyntaxKind::T_LPAREN)] #[case(")", SyntaxKind::T_RPAREN)] -#[case("==", SyntaxKind::T_EQEQ)] #[case(":", SyntaxKind::T_COLON)] #[case("::", SyntaxKind::T_COLON_COLON)] -#[case("->", SyntaxKind::T_ARROW)] -#[case("=>", SyntaxKind::T_FAT_ARROW)] fn punctuation_tokens(#[case] source: &str, #[case] expected: SyntaxKind) { let tokens = tokenize(source); assert_eq!(tokens.len(), 1); @@ -188,14 +185,26 @@ fn negative_number_tokens() { fn escaped_string_token() { let tokens = 
tokenize("\"a\\\"b\""); assert_eq!(tokens.len(), 1); - assert_eq!(tokens.first().expect("token").0, SyntaxKind::T_STRING); + assert_eq!( + tokens + .first() + .expect("tokenizer should produce at least one token") + .0, + SyntaxKind::T_STRING + ); } #[test] fn unterminated_comment_is_error() { let tokens = tokenize("/* comment"); assert_eq!(tokens.len(), 1); - assert_eq!(tokens.first().expect("token").0, SyntaxKind::N_ERROR); + assert_eq!( + tokens + .first() + .expect("tokenizer should produce at least one token") + .0, + SyntaxKind::N_ERROR + ); } #[test]