Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ edition = "2024"
rowan = { version = "0.15", default-features = false }
num-derive = { version = "0.4", default-features = false }
num-traits = { version = "0.2", default-features = false, features = ["std"] }
logos = { version = "0.13", default-features = false, features = ["export_derive"] }

[dev-dependencies]
rstest = "0.18"

[lints.clippy]
pedantic = { level = "warn", priority = -1 }
Expand Down
5 changes: 4 additions & 1 deletion docs/parser-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ transparently.
Use `chumsky`'s text utilities (or integrate a `logos` lexer if preferred) to
convert the source text into a stream of `(SyntaxKind, Span)` pairs. Each span
records byte offsets so that the resulting CST can precisely mirror the input.
Whitespace and comments should produce tokens so they can be preserved.
Whitespace and comments should produce tokens so they can be preserved. The
current implementation opts for a small `logos` lexer because it keeps the token
definitions declarative while still interoperating smoothly with `chumsky`
parsers.

## 4. Construct the Parser with `chumsky`

Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@
#![forbid(unsafe_code)]

pub mod language;
pub mod tokenizer;

pub use language::{DdlogLanguage, SyntaxKind};
pub use tokenizer::{Span, tokenize};
263 changes: 263 additions & 0 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
//! Lexical analysis for `DDlog` source.
//!
//! This module exposes a `tokenize` function which converts raw source text into
//! a sequence of `(SyntaxKind, Span)` pairs. It uses the `logos` crate to
//! recognise tokens so that the CST can mirror the input exactly.

use logos::Logos;

use crate::SyntaxKind;

/// Byte range for a token within the source.
pub type Span = std::ops::Range<usize>;

#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
enum Token {
#[regex(r"[ \t\r\n]+")]
Whitespace,
#[regex(r"/\*([^*]|\*[^/])*\*/", priority = 2)]
#[regex(r"//[^\n]*")]
Comment,
#[regex(r"[A-Za-z_][A-Za-z0-9_]*")]
Ident,
#[regex(r"[0-9]+")]
Number,
#[regex(r#""([^"\\]|\\.)*""#)]
String,
#[token("(")]
LParen,
#[token(")")]
RParen,
#[token("{")]
LBrace,
#[token("}")]
RBrace,
#[token("[")]
LBracket,
#[token("]")]
RBracket,
#[token(";")]
Semi,
#[token(",")]
Comma,
#[token(".")]
Dot,
#[token("::")]
ColonColon,
#[token(":")]
Colon,
#[token("|")]
Pipe,
#[token("&")]
Amp,
#[token("==")]
EqEq,
#[token("=")]
Eq,
#[token(":-")]
Implies,
#[token("%")]
Percent,
#[token("*")]
Star,
#[token("/")]
Slash,
#[token("+")]
Plus,
#[token("-")]
Minus,
#[token("->")]
Arrow,
#[token("=>")]
FatArrow,
#[token("<=")]
Lte,
#[token("<=>")]
Spaceship,
#[token(">=")]
Gte,
#[token("<")]
Lt,
#[token(">")]
Gt,
#[token("!=")]
Neq,
#[token(">>")]
Shr,
#[token("<<")]
Shl,
#[token("~")]
Tilde,
#[token("@")]
At,
#[token("#")]
Hash,
#[token("'")]
Apostrophe,
}

/// Returns the `SyntaxKind` for a given identifier if it matches a recognised keyword.
///
/// If the input string corresponds to a DDlog or Rust keyword, returns the associated
/// `SyntaxKind` variant; otherwise, returns `None`.
///
/// # Examples
///
/// ```no_run
/// assert_eq!(keyword_kind("fn"), Some(SyntaxKind::K_FN));
/// assert_eq!(keyword_kind("foobar"), None);
/// ```
fn keyword_kind(ident: &str) -> Option<SyntaxKind> {
Some(match ident {
"abstract" => SyntaxKind::K_ABSTRACT,
"Aggregate" => SyntaxKind::K_AGGREGATE,
"and" => SyntaxKind::K_AND,
"apply" => SyntaxKind::K_APPLY,
"as" => SyntaxKind::K_AS,
"async" => SyntaxKind::K_ASYNC,
"await" => SyntaxKind::K_AWAIT,
"become" => SyntaxKind::K_BECOME,
"bigint" => SyntaxKind::K_BIGINT,
"bit" => SyntaxKind::K_BIT,
"bool" => SyntaxKind::K_BOOL,
"box" => SyntaxKind::K_BOX,
"break" => SyntaxKind::K_BREAK,
"const" => SyntaxKind::K_CONST,
"continue" => SyntaxKind::K_CONTINUE,
"crate" => SyntaxKind::K_CRATE,
"do" => SyntaxKind::K_DO,
"double" => SyntaxKind::K_DOUBLE,
"dyn" => SyntaxKind::K_DYN,
"else" => SyntaxKind::K_ELSE,
"extern" => SyntaxKind::K_EXTERN,
"false" => SyntaxKind::K_FALSE,
"final" => SyntaxKind::K_FINAL,
"fn" => SyntaxKind::K_FN,
"FlatMap" => SyntaxKind::K_FLATMAP,
"float" => SyntaxKind::K_FLOAT,
"for" => SyntaxKind::K_FOR,
"function" => SyntaxKind::K_FUNCTION,
"if" => SyntaxKind::K_IF,
"impl" => SyntaxKind::K_IMPL,
"import" => SyntaxKind::K_IMPORT,
"in" => SyntaxKind::K_IN,
"input" => SyntaxKind::K_INPUT,
"Inspect" => SyntaxKind::K_INSPECT,
"let" => SyntaxKind::K_LET,
"loop" => SyntaxKind::K_LOOP,
"macro" => SyntaxKind::K_MACRO,
"match" => SyntaxKind::K_MATCH,
"mod" => SyntaxKind::K_MOD,
"move" => SyntaxKind::K_MOVE,
"multiset" => SyntaxKind::K_MULTISET,
"mut" => SyntaxKind::K_MUT,
"not" => SyntaxKind::K_NOT,
"or" => SyntaxKind::K_OR,
"override" => SyntaxKind::K_OVERRIDE,
"output" => SyntaxKind::K_OUTPUT,
"priv" => SyntaxKind::K_PRIV,
"pub" => SyntaxKind::K_PUB,
"ref" => SyntaxKind::K_REF,
"relation" => SyntaxKind::K_RELATION,
"return" => SyntaxKind::K_RETURN,
"self" => SyntaxKind::K_SELF,
"Self" => SyntaxKind::K_SELF_TYPE,
"signed" => SyntaxKind::K_SIGNED,
"skip" => SyntaxKind::K_SKIP,
"static" => SyntaxKind::K_STATIC,
"stream" => SyntaxKind::K_STREAM,
"struct" => SyntaxKind::K_STRUCT,
"super" => SyntaxKind::K_SUPER,
"trait" => SyntaxKind::K_TRAIT,
"transformer" => SyntaxKind::K_TRANSFORMER,
"try" => SyntaxKind::K_TRY,
"true" => SyntaxKind::K_TRUE,
"type" => SyntaxKind::K_TYPE,
"typedef" => SyntaxKind::K_TYPEDEF,
"typeof" => SyntaxKind::K_TYPEOF,
"_" => SyntaxKind::K_UNDERSCORE,
"unsafe" => SyntaxKind::K_UNSAFE,
"unsized" => SyntaxKind::K_UNSIZED,
"use" => SyntaxKind::K_USE,
"var" => SyntaxKind::K_VAR,
"virtual" => SyntaxKind::K_VIRTUAL,
"where" => SyntaxKind::K_WHERE,
"while" => SyntaxKind::K_WHILE,
"yield" => SyntaxKind::K_YIELD,
_ => return None,
})
}

/// Converts DDlog source text into a sequence of syntax tokens with their corresponding byte spans.
///
/// Each token is represented as a `(SyntaxKind, Span)` pair, where `Span` is the byte range of the token in the input.
/// Whitespace and comments are preserved in the output. Identifiers that match keywords are assigned the appropriate
/// keyword kind; otherwise, they are treated as generic identifiers. Unrecognised or invalid tokens are marked with
/// `SyntaxKind::N_ERROR`.
///
/// # Examples
///
/// ```no_run
/// use ddlog_tokenizer::{tokenize, SyntaxKind};
///
/// let src = "relation R(x: bit<32>) // comment";
/// let tokens = tokenize(src);
/// assert!(tokens.iter().any(|(kind, _)| *kind == SyntaxKind::T_IDENT));
/// ```
#[must_use]
pub fn tokenize(src: &str) -> Vec<(SyntaxKind, Span)> {
let mut lexer = Token::lexer(src);
let mut out = Vec::new();
while let Some(result) = lexer.next() {
let span = lexer.span();
let text = src.get(span.clone()).unwrap_or("");
let Ok(token) = result else {
out.push((SyntaxKind::N_ERROR, span));
continue;
};
let kind = match token {
Token::Whitespace => SyntaxKind::T_WHITESPACE,
Token::Comment => SyntaxKind::T_COMMENT,
Token::Ident => keyword_kind(text).unwrap_or(SyntaxKind::T_IDENT),
Token::Number => SyntaxKind::T_NUMBER,
Token::String => SyntaxKind::T_STRING,
Token::LParen => SyntaxKind::T_LPAREN,
Token::RParen => SyntaxKind::T_RPAREN,
Token::LBrace => SyntaxKind::T_LBRACE,
Token::RBrace => SyntaxKind::T_RBRACE,
Token::LBracket => SyntaxKind::T_LBRACKET,
Token::RBracket => SyntaxKind::T_RBRACKET,
Token::Semi => SyntaxKind::T_SEMI,
Token::Comma => SyntaxKind::T_COMMA,
Token::Dot => SyntaxKind::T_DOT,
Token::ColonColon => SyntaxKind::T_COLON_COLON,
Token::Colon => SyntaxKind::T_COLON,
Token::Pipe => SyntaxKind::T_PIPE,
Token::Amp => SyntaxKind::T_AMP,
Token::EqEq => SyntaxKind::T_EQEQ,
Token::Eq => SyntaxKind::T_EQ,
Token::Implies => SyntaxKind::T_IMPLIES,
Token::Percent => SyntaxKind::T_PERCENT,
Token::Star => SyntaxKind::T_STAR,
Token::Slash => SyntaxKind::T_SLASH,
Token::Plus => SyntaxKind::T_PLUS,
Token::Minus => SyntaxKind::T_MINUS,
Token::Arrow => SyntaxKind::T_ARROW,
Token::FatArrow => SyntaxKind::T_FAT_ARROW,
Token::Lte => SyntaxKind::T_LTE,
Token::Spaceship => SyntaxKind::T_SPACESHIP,
Token::Gte => SyntaxKind::T_GTE,
Token::Lt => SyntaxKind::T_LT,
Token::Gt => SyntaxKind::T_GT,
Token::Neq => SyntaxKind::T_NEQ,
Token::Shr => SyntaxKind::T_SHR,
Token::Shl => SyntaxKind::T_SHL,
Token::Tilde => SyntaxKind::T_TILDE,
Token::At => SyntaxKind::T_AT,
Token::Hash => SyntaxKind::T_HASH,
Token::Apostrophe => SyntaxKind::T_APOSTROPHE,
};
out.push((kind, span));
}
out
}
Loading