Skip to content
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ json5 = "1"

# Tokenization + Templates
tokenizers = { version = "0.23", features = ["http"] }
minijinja = { version = "2", features = ["loader"] }
minijinja = { version = "2", features = ["loader", "fuel"] }
minijinja-contrib = { version = "2", features = ["pycompat"] }

# CLI + Configuration
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ Higgs is a single static Rust binary that serves local models, proxies to provid
- Exact local model names now beat regex routes.
- `/metrics` is a real endpoint, and `server.max_body_size` is enforced on API requests.
- `higgs shellenv` and `higgs exec` now fail fast on bad config or an unreachable server.
- The server now binds `127.0.0.1` by default (was `0.0.0.0`). Set `server.host = "0.0.0.0"` (and an `api_key`) to expose it on the network.
- CORS headers are no longer sent unless `server.cors_origins` is set (`["*"]` restores the old permissive behavior).

## Quick Links

Expand Down
160 changes: 50 additions & 110 deletions crates/higgs-engine/src/batch_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use crate::{
error::EngineError,
model_loader,
prompt_cache::PrefixCache,
simple::{IncrementalDetok, find_stop_in_tail},
};

/// Default maximum number of cached prefixes.
Expand Down Expand Up @@ -63,7 +64,7 @@ struct ActiveRequest {
constraint: Option<crate::constrained::ConstrainedGenerator>,
response_tx: tokio::sync::mpsc::Sender<StreamingOutput>,
prompt_len: u32,
prev_decoded_len: usize,
detok: IncrementalDetok,
}

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -722,20 +723,40 @@ fn prefill_request(
.as_ref()
.map(|lp| lp.materialize(first_token_id));

// Decode the first token for text diff tracking
let first_text = tokenizer
.decode(&[first_token_id], true)
// Decode the first token incrementally (routes through IncrementalDetok
// so partial UTF-8 is held back and prefix-before-stop is correctly emitted).
let mut detok = IncrementalDetok::new(String::new(), 0);
let first_chunk = detok
.append(tokenizer, &[first_token_id])
.unwrap_or_default();
let emitted_before = detok.text.len() - first_chunk.len();

// Check if we're done after the first token
let is_eos = eos_token_ids.contains(&first_token_id);
let hit_stop = check_stop_sequences_simple(&first_text, &req.stop_sequences);
let hit_stop = !req.stop_sequences.is_empty()
&& find_stop_in_tail(&detok.text, first_chunk.len(), &req.stop_sequences).is_some();
let at_max = req.max_tokens <= 1;

if is_eos || hit_stop || at_max {
let finish_reason = if is_eos || hit_stop { "stop" } else { "length" };
let mut send_text = if hit_stop {
// Emit any prefix text before the stop sequence
find_stop_in_tail(&detok.text, first_chunk.len(), &req.stop_sequences)
.and_then(|pos| detok.text.get(emitted_before..pos))
.unwrap_or_default()
.to_owned()
} else {
first_chunk
};
if !hit_stop {
send_text.push_str(
&detok
.flush(tokenizer, &[first_token_id])
.unwrap_or_default(),
);
}
let _ = req.response_tx.blocking_send(StreamingOutput {
new_text: if hit_stop { String::new() } else { first_text },
new_text: send_text,
finished: true,
finish_reason: Some(finish_reason.to_owned()),
prompt_tokens: prompt_len,
Expand All @@ -746,11 +767,10 @@ fn prefill_request(
}

// Send first token
let prev_decoded_len = first_text.len();
if req
.response_tx
.blocking_send(StreamingOutput {
new_text: first_text,
new_text: first_chunk,
finished: false,
finish_reason: None,
prompt_tokens: prompt_len,
Expand All @@ -773,7 +793,7 @@ fn prefill_request(
constraint: req.constraint,
response_tx: req.response_tx,
prompt_len,
prev_decoded_len,
detok,
}))
})
}
Expand Down Expand Up @@ -852,23 +872,21 @@ fn materialize_decode_step(

let completion_len: u32 = ar.generated_tokens.len().try_into().unwrap_or(u32::MAX);

// Decode full text for diff and stop sequence checking
let full_text = tokenizer
.decode(&ar.generated_tokens, true)
// Decode only the trailing token window for diff and stop checking
let new_text = ar
.detok
.append(tokenizer, &ar.generated_tokens)
.unwrap_or_default();
let new_text = full_text
.get(ar.prev_decoded_len..)
.unwrap_or_default()
.to_owned();
let old_decoded_len = ar.prev_decoded_len;
ar.prev_decoded_len = full_text.len();

let (final_new_text, hit_stop) = if ar.stop_sequences.is_empty() {
let emitted_before = ar.detok.text.len() - new_text.len();

let (mut final_new_text, hit_stop) = if ar.stop_sequences.is_empty() {
(new_text, false)
} else if check_stop_sequences_simple(&full_text, &ar.stop_sequences) {
let truncated = truncate_at_stop(&full_text, &ar.stop_sequences);
let emit = truncated
.get(old_decoded_len..)
} else if let Some(pos) = find_stop_in_tail(&ar.detok.text, new_text.len(), &ar.stop_sequences)
{
let emit = ar
.detok
.text
.get(emitted_before..pos)
.unwrap_or_default()
.to_owned();
(emit, true)
Expand All @@ -884,6 +902,13 @@ fn materialize_decode_step(
.is_some_and(crate::constrained::ConstrainedGenerator::is_finished);

let finished = is_eos || at_max || hit_stop || constraint_done;
if finished && !hit_stop {
final_new_text.push_str(
&ar.detok
.flush(tokenizer, &ar.generated_tokens)
.unwrap_or_default(),
);
}
let finish_reason = if is_eos || hit_stop || constraint_done {
Some("stop".to_owned())
} else if at_max {
Expand All @@ -907,96 +932,11 @@ fn materialize_decode_step(
finished || disconnected
}

/// Report whether `text` contains at least one of the given stop sequences.
///
/// Returns `false` when `stop_sequences` is empty.
fn check_stop_sequences_simple(text: &str, stop_sequences: &[String]) -> bool {
    for stop in stop_sequences {
        if text.contains(stop.as_str()) {
            return true;
        }
    }
    false
}

/// Cut `text` just before the earliest occurrence of any stop sequence.
///
/// Each stop sequence is searched for independently and the smallest byte
/// offset wins. When none of them occurs, the text is returned unchanged as
/// an owned `String`.
fn truncate_at_stop(text: &str, stop_sequences: &[String]) -> String {
    let cut = stop_sequences
        .iter()
        .filter_map(|stop| text.find(stop.as_str()))
        .min();

    match cut {
        Some(idx) => text.get(..idx).unwrap_or_default().to_owned(),
        None => text.to_owned(),
    }
}

#[cfg(test)]
#[allow(clippy::panic, clippy::unwrap_used, clippy::indexing_slicing)]
mod tests {
use super::*;

// -----------------------------------------------------------------------
// check_stop_sequences_simple
// -----------------------------------------------------------------------

#[test]
fn stop_sequences_empty_never_matches() {
    // With no stop sequences configured, nothing can match.
    let no_stops: Vec<String> = Vec::new();
    let matched = check_stop_sequences_simple("hello world", &no_stops);
    assert!(!matched);
}

#[test]
fn stop_sequences_match_at_end() {
    // A stop sequence at the end of the text is detected.
    let stop_list = [String::from("</s>")];
    let matched = check_stop_sequences_simple("some text</s>", &stop_list);
    assert!(matched);
}

#[test]
fn stop_sequences_match_in_middle() {
    // A stop sequence surrounded by other text is detected.
    let stop_list = [String::from("STOP")];
    let matched = check_stop_sequences_simple("before STOP after", &stop_list);
    assert!(matched);
}

#[test]
fn stop_sequences_no_match() {
    // Neither configured stop sequence occurs in the text.
    let stop_list = [String::from("</s>"), String::from("<|end|>")];
    let matched = check_stop_sequences_simple("normal text", &stop_list);
    assert!(!matched);
}

#[test]
fn stop_sequences_multiple_one_matches() {
    // One matching sequence out of several is enough.
    let stop_list = [String::from("</s>"), String::from("\n\n")];
    let matched = check_stop_sequences_simple("text\n\nmore", &stop_list);
    assert!(matched);
}

// -----------------------------------------------------------------------
// truncate_at_stop
// -----------------------------------------------------------------------

#[test]
fn truncate_no_stop_returns_full_text() {
    // Text without any stop sequence comes back unchanged.
    let stop_list = [String::from("</s>")];
    let out = truncate_at_stop("hello world", &stop_list);
    assert_eq!(out, "hello world");
}

#[test]
fn truncate_at_stop_removes_suffix() {
    // A trailing stop sequence is stripped from the result.
    let stop_list = [String::from("</s>")];
    let out = truncate_at_stop("hello</s>", &stop_list);
    assert_eq!(out, "hello");
}

#[test]
fn truncate_at_earliest_of_multiple_stops() {
    // The cut happens at the earliest match in the text, regardless of the
    // order in which the stop sequences are listed.
    let stop_list = [String::from("BBB"), String::from("AAA")];
    let out = truncate_at_stop("xAAAyBBBz", &stop_list);
    assert_eq!(out, "x");
}

#[test]
fn truncate_empty_stops() {
    // An empty stop list leaves the text untouched.
    let no_stops: Vec<String> = Vec::new();
    let out = truncate_at_stop("hello", &no_stops);
    assert_eq!(out, "hello");
}

#[test]
fn truncate_stop_at_start() {
    // A stop sequence at offset 0 yields an empty result.
    let stop_list = [String::from("STOP")];
    let out = truncate_at_stop("STOPrest", &stop_list);
    assert_eq!(out, "");
}

// -----------------------------------------------------------------------
// materialize_decode_step
// -----------------------------------------------------------------------
Expand All @@ -1017,7 +957,7 @@ mod tests {
constraint: None,
response_tx: tx,
prompt_len: 5,
prev_decoded_len: 0,
detok: IncrementalDetok::new(String::new(), 0),
};
(ar, rx)
}
Expand Down
9 changes: 9 additions & 0 deletions crates/higgs-engine/src/chat_template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ pub struct ChatMessage {
pub tool_calls: Option<Vec<serde_json::Value>>,
}

/// Upper bound on template-engine instructions per render. Generous enough
/// for complex HF chat templates over long conversations, but stops a
/// malicious template from looping forever.
///
/// Passed to `Environment::set_fuel` when a `ChatTemplateRenderer` is created.
const TEMPLATE_FUEL: u64 = 5_000_000;

/// Renders chat messages using a Jinja2 template (`HuggingFace` format).
pub struct ChatTemplateRenderer {
env: Environment<'static>,
Expand All @@ -24,6 +29,10 @@ impl ChatTemplateRenderer {
/// Create a renderer from a Jinja2 template string.
pub fn new<S: Into<String>>(template_source: S) -> Result<Self, EngineError> {
let mut env = Environment::new();
// Templates come from model directories (tokenizer_config.json /
// chat_template.jinja), which are third-party content; bound execution
// so a hostile template cannot loop forever.
env.set_fuel(Some(TEMPLATE_FUEL));
env.add_filter("tojson", tojson_filter);
minijinja_contrib::add_to_environment(&mut env);
env.set_unknown_method_callback(minijinja_contrib::pycompat::unknown_method_callback);
Expand Down
Loading