Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,29 @@
# changelog

## 3.1.0 - 2026-03-29

multi-turn behavioral drift infrastructure. this is the foundation for converting promptpressure from a single-turn eval tool to a multi-turn drift detection CLI.

### added
- 4-tier run system: `--tier smoke|quick|full|deep` with `--smoke` and `--quick` shortcuts
- tier filtering with cumulative semantics (smoke < quick < full < deep)
- per-turn `response_length_ratio` metric computed automatically during multi-turn evals
- per-turn timeout scaling with 5x cap (prevents indefinite hangs on deep sequences)
- context window token estimation with warning when approaching model limits
- `tier` field in pydantic Settings with `Literal` type validation
- `schema.json` documenting the full entry format (JSON Schema 2020-12)
- `archive/adversarial/` directory for refusal sensitivity entries
- `subcategory`, `tier`, `difficulty`, `per_turn_expectations` fields to dataset schema

### changed
- default run tier is `quick` (was: run everything)
- dataset validation now accepts and validates 4 new optional fields
- all 190 dataset entries tagged with tier/subcategory/difficulty
- exits non-zero when tier filter produces 0 matching entries

### removed
- 30 refusal sensitivity entries moved from default dataset to `archive/adversarial/refusal_sensitivity.json` (still accessible via `--dataset`)

## 3.0.0 - 2026-03-25

breaking overhaul. if you're upgrading from 2.x, start fresh.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.0.0
3.1.0
26 changes: 26 additions & 0 deletions archive/adversarial/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# archived adversarial eval content

these prompts test refusal sensitivity: how models handle requests that could be
interpreted as asking for harmful content but are actually benign (academic research,
creative writing, historical analysis, etc.).

## why archived

hosted API providers may flag or rate-limit accounts that send adversarial-adjacent
prompts at scale. these sequences are preserved for:
- local model testing (ollama, llama.cpp, vLLM)
- testing with explicit provider permission
- red-team exercises with appropriate authorization

## how to run

```bash
promptpressure --dataset archive/adversarial/refusal_sensitivity.json --multi-config config.yaml
```

no special flags needed. the archive is just a dataset file in a different directory.

## contents

30 entries (rs_001 through rs_030). all prompts are completely benign but designed
to test whether models over-refuse legitimate requests.
332 changes: 332 additions & 0 deletions archive/adversarial/refusal_sensitivity.json

Large diffs are not rendered by default.

1,280 changes: 760 additions & 520 deletions evals_dataset.json

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
60 changes: 56 additions & 4 deletions promptpressure/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from promptpressure.monitoring import start_metrics_server, stop_metrics_server, record_api_request, record_evaluation_start, record_evaluation_end, record_prompt_processing, record_response, update_custom_metrics
from promptpressure.reporting import ReportGenerator
from promptpressure.database import init_db, get_db_session, Evaluation, Result, Metric, DATABASE_URL
from promptpressure.per_turn_metrics import compute_turn_metrics
from promptpressure.tier import filter_by_tier

def log_error(output_dir, error_msg):
log_path = os.path.join(output_dir, "error.log")
Expand All @@ -37,6 +39,16 @@ async def run_evaluation_suite(config, adapter_name):
with open(dataset_file, "r", encoding="utf-8") as f:
prompts = json.load(f)

# Tier filtering
tier = config.get("tier", "quick")
original_count = len(prompts)
prompts, skipped = filter_by_tier(prompts, tier, warn_invalid=True)
print(f"Tier '{tier}': {len(prompts)}/{original_count} sequences selected")
if not prompts:
print(f"ERROR: Tier '{tier}' matched 0 entries. Nothing to evaluate.")
import sys
sys.exit(1)

# Prepare output directory
base_output_dir = config.get("output_dir", "outputs")
use_ts = config.get("use_timestamp_output_dir", True)
Expand Down Expand Up @@ -237,8 +249,16 @@ async def _process_multi_turn(entry, turns):
conversation.append({"role": turn_role, "content": turn_content})

try:
# Send full conversation history to adapter
response_text = await adapter_fn(turn_content, config, messages=list(conversation))
# Timeout scales with turn count, capped at 5x base
base_timeout = config.get("timeout", 60)
turn_timeout = min(base_timeout * (1 + turn_idx * 0.5), base_timeout * 5)
try:
response_text = await asyncio.wait_for(
adapter_fn(turn_content, config, messages=list(conversation)),
timeout=turn_timeout
)
except asyncio.TimeoutError as e:
raise TimeoutError(f"Turn {turn_idx} timed out after {turn_timeout:.0f}s") from e

# Capture reasoning tokens if available
turn_reasoning = ""
Expand All @@ -250,13 +270,25 @@ async def _process_multi_turn(entry, turns):

# Add assistant response to conversation history
conversation.append({"role": "assistant", "content": response_text})

# Rough token estimation for context window warning
total_chars = sum(len(m["content"]) for m in conversation)
estimated_tokens = total_chars // 4
if estimated_tokens > 6000 and turn_idx < len(turns):
print(f" warning: {entry.get('id')} at ~{estimated_tokens} tokens after turn {turn_idx} "
f"(may exceed small model context windows)")

turn_entry = {
"turn": turn_idx,
"user": turn_content,
"assistant": response_text
}
if turn_reasoning:
turn_entry["reasoning"] = turn_reasoning
# Compute per-turn behavioral metrics
turn_entry["metrics"] = compute_turn_metrics(
turn_content, response_text, turn_number=turn_idx
)
turn_responses.append(turn_entry)

except Exception as e:
Expand Down Expand Up @@ -284,6 +316,9 @@ async def _process_multi_turn(entry, turns):
record_response(success=False)
record_api_request(model=model_name, adapter=adapter_name, duration=duration, success=False, error_type="MultiTurnError")

# Aggregate per-turn metrics for the sequence
per_turn_metrics = [tr["metrics"] for tr in turn_responses if "metrics" in tr]

# Build combined response for backward compat (CSV/JSON output)
combined_response = "\n\n".join(
f"[Turn {tr['turn']}]\nUser: {tr['user']}\nAssistant: {tr['assistant']}"
Expand All @@ -306,7 +341,8 @@ async def _process_multi_turn(entry, turns):
"success": success,
"error": error_msg,
"multi_turn": True,
"plugin_scores": {}
"plugin_scores": {},
"per_turn_metrics": per_turn_metrics,
}

await emit_event("end_prompt", {
Expand Down Expand Up @@ -565,7 +601,11 @@ async def main_async():
parser.add_argument("--post-analyze", choices=["groq", "openrouter"], help="Optional post-analysis adapter")
parser.add_argument("--schema", action="store_true", help="Dump JSON Schema for configuration and exit")
parser.add_argument("--ci", action="store_true", help="CI mode: output machine-readable JSON summary, exit 1 on any failure")

parser.add_argument("--tier", choices=["smoke", "quick", "full", "deep"],
default=None, help="Run tier (smoke/quick/full/deep). Default: quick")
parser.add_argument("--smoke", action="store_true", help="Shortcut for --tier smoke")
parser.add_argument("--quick", action="store_true", help="Shortcut for --tier quick")

# Plugin CLI commands
subparsers = parser.add_subparsers(dest="command", help="Sub-commands")

Expand All @@ -582,6 +622,16 @@ async def main_async():

args = parser.parse_args()

# Resolve tier from flags
if args.smoke:
tier_override = "smoke"
elif args.quick:
tier_override = "quick"
elif args.tier:
tier_override = args.tier
else:
tier_override = None # use config default

if args.schema:
from promptpressure.config import Settings
print(json.dumps(Settings.model_json_schema(), indent=2))
Expand Down Expand Up @@ -629,6 +679,8 @@ async def main_async():
import sys
sys.exit(1)
config_dict = config.model_dump()
if tier_override:
config_dict["tier"] = tier_override
last_config = config_dict

results, out_dir, metrics_collector = await run_evaluation_suite(config_dict, config_dict.get("adapter"))
Expand Down
7 changes: 5 additions & 2 deletions promptpressure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
import os
from pathlib import Path
from typing import Optional, List
from typing import Optional, List, Literal

from dotenv import load_dotenv
from pydantic import Field, model_validator
Expand Down Expand Up @@ -32,7 +32,10 @@ class Settings(BaseSettings):
output: str = Field(..., description="Output filename for evaluation results")
output_dir: str = Field("outputs", description="Directory for output files")
temperature: float = Field(0.7, ge=0.0, le=2.0, description="Sampling temperature")


# Tier settings
tier: Literal["smoke", "quick", "full", "deep"] = Field("quick", description="Run tier: smoke (<60s CI), quick (<10min), full (~1hr), deep (all)")

# Performance settings
max_workers: int = Field(1, ge=1, le=10, description="Number of concurrent workers for prompt evaluation")

Expand Down
39 changes: 39 additions & 0 deletions promptpressure/per_turn_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Automated per-turn behavioral metrics for multi-turn eval sequences.

These metrics are computed without LLM grading calls. They measure
observable response characteristics that indicate behavioral drift.
"""


def compute_response_length_ratio(user_message: str, response: str) -> float:
    """Measure how long the response is relative to the user's message.

    The value is the character count of ``response`` divided by the
    character count of ``user_message``. Tracked turn over turn, it exposes
    terse/verbose drift, e.g. a model whose detailed answers shrink to
    one-liners.

    An empty ``user_message`` or ``response`` yields ``0.0``, which also
    guards against dividing by zero.
    """
    both_present = bool(user_message) and bool(response)
    return len(response) / len(user_message) if both_present else 0.0


def compute_turn_metrics(
    user_message: str,
    response: str,
    turn_number: int = 1,
) -> dict:
    """Bundle every automated metric for one conversational turn.

    Args:
        user_message: what the user sent on this turn
        response: what the model answered on this turn
        turn_number: position of the turn in its sequence; stored as-is
            under the "turn" key

    Returns:
        dict holding the turn number and each computed metric value
    """
    length_ratio = compute_response_length_ratio(user_message, response)
    metrics = {"turn": turn_number}
    metrics["response_length_ratio"] = length_ratio
    return metrics
Binary file not shown.
Binary file not shown.
Binary file not shown.
44 changes: 44 additions & 0 deletions promptpressure/tier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Tier filtering for PromptPressure eval datasets.

Tiers are cumulative: smoke < quick < full < deep.
--tier quick runs all entries tagged smoke OR quick.
Entries without a tier field default to "full".
"""

# Tiers from cheapest to most exhaustive; list position defines cumulative rank.
TIER_ORDER = ["smoke", "quick", "full", "deep"]


def filter_by_tier(entries: list[dict], tier: str, warn_invalid: bool = False) -> tuple[list[dict], int]:
    """Filter dataset entries by tier level (cumulative).

    An entry is kept when its own tier ranks at or below the requested tier
    in TIER_ORDER. Entries with no "tier" key are treated as "full". Entries
    whose tier is not a known tier name are dropped and counted as skipped.

    Args:
        entries: list of dataset entry dicts
        tier: requested tier level (smoke, quick, full, deep)
        warn_invalid: if True, print a warning naming up to five of the
            entries skipped for having an invalid tier value

    Returns:
        tuple of (filtered list in original order, count of skipped invalid entries)

    Raises:
        ValueError: if tier is not a valid tier name
    """
    if tier not in TIER_ORDER:
        raise ValueError(f"Invalid tier '{tier}'. Must be one of: {TIER_ORDER}")

    max_index = TIER_ORDER.index(tier)

    result = []
    skipped = []
    for entry in entries:
        entry_tier = entry.get("tier", "full")
        if entry_tier not in TIER_ORDER:
            # Coerce to str: these ids come from entries that already failed
            # validation, so they may be ints or other non-strings, and
            # str.join below would raise TypeError on them.
            skipped.append(str(entry.get("id", "unknown")))
            continue
        if TIER_ORDER.index(entry_tier) <= max_index:
            result.append(entry)

    if warn_invalid and skipped:
        shown = ", ".join(skipped[:5])
        remainder = len(skipped) - 5
        suffix = f" and {remainder} more" if remainder > 0 else ""
        print(f" warning: {len(skipped)} entries skipped (invalid tier): {shown}{suffix}")

    return result, len(skipped)
77 changes: 77 additions & 0 deletions schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "PromptPressure Eval Dataset",
"description": "Schema for entries in evals_dataset.json",
"type": "array",
"items": {
"type": "object",
"required": ["id", "category", "prompt", "expected_behavior", "eval_criteria"],
"properties": {
"id": {
"type": "string",
"description": "Unique entry ID, e.g. if_001, sy_025"
},
"category": {
"type": "string",
"description": "Evaluation category name"
},
"subcategory": {
"type": "string",
"description": "Subcategory within the category"
},
"tier": {
"type": "string",
"enum": ["smoke", "quick", "full", "deep"],
"description": "Run tier. smoke < quick < full < deep (cumulative)"
},
"difficulty": {
"type": "string",
"enum": ["easy", "medium", "hard"],
"description": "Difficulty level (orthogonal to tier)"
},
"prompt": {
"oneOf": [
{"type": "string", "minLength": 1},
{
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"required": ["role", "content"],
"properties": {
"role": {"type": "string"},
"content": {"type": "string", "minLength": 1}
}
}
}
],
"description": "Single-turn string or multi-turn message array"
},
"expected_behavior": {
"type": "string",
"minLength": 1,
"description": "Human-readable description of expected model behavior"
},
"per_turn_expectations": {
"type": "array",
"items": {
"type": "object",
"required": ["turn", "expected"],
"properties": {
"turn": {"type": "integer", "minimum": 1},
"expected": {"type": "string", "minLength": 1}
}
},
"description": "Per-turn expected behaviors for multi-turn sequences"
},
"eval_criteria": {
"type": "object",
"description": "Key-value pairs for LLM-as-judge grading"
},
"notes": {
"type": "string",
"description": "Optional authoring notes"
}
}
}
}
Binary file removed tests/__pycache__/__init__.cpython-314.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions tests/test_cli_tier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest


class TestConfigTierField:
    """Checks on the `tier` entry of the Settings JSON schema."""

    def test_tier_field_exists_in_schema(self):
        """The Settings schema must expose a `tier` property."""
        from promptpressure.config import Settings

        properties = Settings.model_json_schema()["properties"]
        assert "tier" in properties, "Settings schema missing 'tier' field"

    def test_tier_default_is_quick(self):
        """An unspecified tier must fall back to 'quick'."""
        from promptpressure.config import Settings

        tier_schema = Settings.model_json_schema()["properties"]["tier"]
        assert tier_schema["default"] == "quick"
Loading
Loading