Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ edition = "2024"
rowan = { version = "0.15", default-features = false }
num-derive = { version = "0.4", default-features = false }
num-traits = { version = "0.2", default-features = false, features = ["std"] }
logos = { version = ">=0.13.0, <0.14.0", default-features = false, features = ["export_derive"] }
phf = { version = ">=0.11.0, <0.12.0", default-features = false, features = ["macros"] }

[dev-dependencies]
rstest = ">=0.18.0, <0.19.0"

[lints.clippy]
pedantic = { level = "warn", priority = -1 }
Expand Down
1 change: 1 addition & 0 deletions docs/ddlint-design-and-road-map.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ enum SyntaxKind {
T_IDENT, // An identifier, like a relation or variable name
T_STRING_LIT, // A string literal
T_NUMBER, // A numeric literal
// decimal, floating-point, hex, binary, or octal
T_LPAREN, // '('
T_RPAREN, // ')'
T_LBRACE, // '{'
Expand Down
21 changes: 20 additions & 1 deletion docs/parser-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,26 @@ transparently.
Use `chumsky`'s text utilities (or integrate a `logos` lexer if preferred) to
convert the source text into a stream of `(SyntaxKind, Span)` pairs. Each span
records byte offsets so that the resulting CST can precisely mirror the input.
Whitespace and comments should produce tokens so they can be preserved.
Whitespace and comments should produce tokens so they can be preserved. The
current implementation opts for a small `logos` lexer because it keeps the token
definitions declarative while still interoperating smoothly with `chumsky`
parsers. Keyword lookups use a `phf::Map` for zero-cost perfect hashing.

```mermaid
sequenceDiagram
participant Client
participant Tokenizer
participant LogosLexer
participant SyntaxKind
Client->>Tokenizer: tokenize(src: &str)
Tokenizer->>LogosLexer: Token::lexer(src)
loop for each token
LogosLexer-->>Tokenizer: next() -> Token + span
Tokenizer->>SyntaxKind: map Token to SyntaxKind
Tokenizer-->>Tokenizer: collect (SyntaxKind, Span)
end
Tokenizer-->>Client: Vec<(SyntaxKind, Span)>
```

## 4. Construct the Parser with `chumsky`

Expand Down
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
//! Library crate for ddlint.
//!
//! Currently exposes only the parser language definitions.
//! Exposes parser language definitions and lexical analysis functionality.

#![forbid(unsafe_code)]

pub mod language;
pub mod tokenizer;

pub use language::{DdlogLanguage, SyntaxKind};
pub use tokenizer::{Span, tokenize};
245 changes: 245 additions & 0 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
//! Lexical analysis for `DDlog` source.
//!
//! This module exposes a `tokenize` function which converts raw source text into
//! a sequence of `(SyntaxKind, Span)` pairs. It uses the `logos` crate to
//! recognise tokens so that the CST can mirror the input exactly.

use logos::Logos;
use phf::phf_map;

use crate::SyntaxKind;

/// Byte range for a token within the source.
pub type Span = std::ops::Range<usize>;

#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
enum Token {
#[regex(r"[ \t\r\n]+")]
Whitespace,
#[regex(r"/\*([^*]|\*[^/])*\*/", priority = 2)]
#[regex(r"//[^\n]*")]
Comment,
#[regex(r"[A-Za-z_][A-Za-z0-9_]*")]
Ident,
#[regex(r"0[xX][0-9a-fA-F]+|0[bB][01]+|0[oO][0-7]+|[0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")]
Number,
#[regex(r#""([^"\\]|\\.)*""#)]
String,
#[token("(")]
LParen,
#[token(")")]
RParen,
#[token("{")]
LBrace,
#[token("}")]
RBrace,
#[token("[")]
LBracket,
#[token("]")]
RBracket,
#[token(";")]
Semi,
#[token(",")]
Comma,
#[token(".")]
Dot,
#[token("::")]
ColonColon,
#[token(":")]
Colon,
#[token("|")]
Pipe,
#[token("&")]
Amp,
#[token("==")]
EqEq,
#[token("=")]
Eq,
#[token(":-")]
Implies,
#[token("%")]
Percent,
#[token("*")]
Star,
#[token("/")]
Slash,
#[token("+")]
Plus,
#[token("-")]
Minus,
#[token("->")]
Arrow,
#[token("=>")]
FatArrow,
#[token("<=")]
Lte,
#[token("<=>")]
Spaceship,
#[token(">=")]
Gte,
#[token("<")]
Lt,
#[token(">")]
Gt,
#[token("!=")]
Neq,
#[token(">>")]
Shr,
#[token("<<")]
Shl,
#[token("~")]
Tilde,
#[token("@")]
At,
#[token("#")]
Hash,
#[token("'")]
Apostrophe,
}

/// Maps identifier strings to their keyword `SyntaxKind`.
///
/// Returns `Some(kind)` if `ident` is a recognised `DDlog` keyword, or `None`
/// otherwise. A static map avoids a long match statement and allows O(1)
/// lookups.
static KEYWORDS: phf::Map<&'static str, SyntaxKind> = phf_map! {
"abstract" => SyntaxKind::K_ABSTRACT,
"Aggregate" => SyntaxKind::K_AGGREGATE,
"and" => SyntaxKind::K_AND,
"apply" => SyntaxKind::K_APPLY,
"as" => SyntaxKind::K_AS,
"async" => SyntaxKind::K_ASYNC,
"await" => SyntaxKind::K_AWAIT,
"become" => SyntaxKind::K_BECOME,
"bigint" => SyntaxKind::K_BIGINT,
"bit" => SyntaxKind::K_BIT,
"bool" => SyntaxKind::K_BOOL,
"box" => SyntaxKind::K_BOX,
"break" => SyntaxKind::K_BREAK,
"const" => SyntaxKind::K_CONST,
"continue" => SyntaxKind::K_CONTINUE,
"crate" => SyntaxKind::K_CRATE,
"do" => SyntaxKind::K_DO,
"double" => SyntaxKind::K_DOUBLE,
"dyn" => SyntaxKind::K_DYN,
"else" => SyntaxKind::K_ELSE,
"extern" => SyntaxKind::K_EXTERN,
"false" => SyntaxKind::K_FALSE,
"final" => SyntaxKind::K_FINAL,
"fn" => SyntaxKind::K_FN,
"FlatMap" => SyntaxKind::K_FLATMAP,
"float" => SyntaxKind::K_FLOAT,
"for" => SyntaxKind::K_FOR,
"function" => SyntaxKind::K_FUNCTION,
"if" => SyntaxKind::K_IF,
"impl" => SyntaxKind::K_IMPL,
"import" => SyntaxKind::K_IMPORT,
"in" => SyntaxKind::K_IN,
"input" => SyntaxKind::K_INPUT,
"Inspect" => SyntaxKind::K_INSPECT,
"let" => SyntaxKind::K_LET,
"loop" => SyntaxKind::K_LOOP,
"macro" => SyntaxKind::K_MACRO,
"match" => SyntaxKind::K_MATCH,
"mod" => SyntaxKind::K_MOD,
"move" => SyntaxKind::K_MOVE,
"multiset" => SyntaxKind::K_MULTISET,
"mut" => SyntaxKind::K_MUT,
"not" => SyntaxKind::K_NOT,
"or" => SyntaxKind::K_OR,
"override" => SyntaxKind::K_OVERRIDE,
"output" => SyntaxKind::K_OUTPUT,
"priv" => SyntaxKind::K_PRIV,
"pub" => SyntaxKind::K_PUB,
"ref" => SyntaxKind::K_REF,
"relation" => SyntaxKind::K_RELATION,
"return" => SyntaxKind::K_RETURN,
"self" => SyntaxKind::K_SELF,
"Self" => SyntaxKind::K_SELF_TYPE,
"signed" => SyntaxKind::K_SIGNED,
"skip" => SyntaxKind::K_SKIP,
"static" => SyntaxKind::K_STATIC,
"stream" => SyntaxKind::K_STREAM,
"struct" => SyntaxKind::K_STRUCT,
"super" => SyntaxKind::K_SUPER,
"trait" => SyntaxKind::K_TRAIT,
"transformer" => SyntaxKind::K_TRANSFORMER,
"try" => SyntaxKind::K_TRY,
"true" => SyntaxKind::K_TRUE,
"type" => SyntaxKind::K_TYPE,
"typedef" => SyntaxKind::K_TYPEDEF,
"typeof" => SyntaxKind::K_TYPEOF,
"_" => SyntaxKind::K_UNDERSCORE,
"unsafe" => SyntaxKind::K_UNSAFE,
"unsized" => SyntaxKind::K_UNSIZED,
"use" => SyntaxKind::K_USE,
"var" => SyntaxKind::K_VAR,
"virtual" => SyntaxKind::K_VIRTUAL,
"where" => SyntaxKind::K_WHERE,
"while" => SyntaxKind::K_WHILE,
"yield" => SyntaxKind::K_YIELD,
};

fn keyword_kind(ident: &str) -> Option<SyntaxKind> {
KEYWORDS.get(ident).copied()
}

/// Tokenise the provided `DDlog` source.
#[must_use]
pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> {
let mut lexer = Token::lexer(src);
let estimated_tokens = src.len() >> 2; // roughly four chars per token
let mut out = Vec::with_capacity(estimated_tokens);
while let Some(result) = lexer.next() {
let span = lexer.span();
let text = src.get(span.clone()).unwrap_or("");
let Ok(token) = result else {
out.push((SyntaxKind::N_ERROR, span));
continue;
};
let kind = match token {
Token::Whitespace => SyntaxKind::T_WHITESPACE,
Token::Comment => SyntaxKind::T_COMMENT,
Token::Ident => keyword_kind(text).unwrap_or(SyntaxKind::T_IDENT),
Token::Number => SyntaxKind::T_NUMBER,
Token::String => SyntaxKind::T_STRING,
Token::LParen => SyntaxKind::T_LPAREN,
Token::RParen => SyntaxKind::T_RPAREN,
Token::LBrace => SyntaxKind::T_LBRACE,
Token::RBrace => SyntaxKind::T_RBRACE,
Token::LBracket => SyntaxKind::T_LBRACKET,
Token::RBracket => SyntaxKind::T_RBRACKET,
Token::Semi => SyntaxKind::T_SEMI,
Token::Comma => SyntaxKind::T_COMMA,
Token::Dot => SyntaxKind::T_DOT,
Token::ColonColon => SyntaxKind::T_COLON_COLON,
Token::Colon => SyntaxKind::T_COLON,
Token::Pipe => SyntaxKind::T_PIPE,
Token::Amp => SyntaxKind::T_AMP,
Token::EqEq => SyntaxKind::T_EQEQ,
Token::Eq => SyntaxKind::T_EQ,
Token::Implies => SyntaxKind::T_IMPLIES,
Token::Percent => SyntaxKind::T_PERCENT,
Token::Star => SyntaxKind::T_STAR,
Token::Slash => SyntaxKind::T_SLASH,
Token::Plus => SyntaxKind::T_PLUS,
Token::Minus => SyntaxKind::T_MINUS,
Token::Arrow => SyntaxKind::T_ARROW,
Token::FatArrow => SyntaxKind::T_FAT_ARROW,
Token::Lte => SyntaxKind::T_LTE,
Token::Spaceship => SyntaxKind::T_SPACESHIP,
Token::Gte => SyntaxKind::T_GTE,
Token::Lt => SyntaxKind::T_LT,
Token::Gt => SyntaxKind::T_GT,
Token::Neq => SyntaxKind::T_NEQ,
Token::Shr => SyntaxKind::T_SHR,
Token::Shl => SyntaxKind::T_SHL,
Token::Tilde => SyntaxKind::T_TILDE,
Token::At => SyntaxKind::T_AT,
Token::Hash => SyntaxKind::T_HASH,
Token::Apostrophe => SyntaxKind::T_APOSTROPHE,
};
out.push((kind, span));
}
out
}
Loading
Loading