StressTestor · StressTestor · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,29 @@
 # changelog
 
+## 3.1.0 - 2026-03-29
+
+multi-turn behavioral drift infrastructure. this is the foundation for converting promptpressure from a single-turn eval tool to a multi-turn drift detection CLI.
+
+### added
+- 4-tier run system: `--tier smoke|quick|full|deep` with `--smoke` and `--quick` shortcuts
+- tier filtering with cumulative semantics (smoke < quick < full < deep)
+- per-turn `response_length_ratio` metric computed automatically during multi-turn evals
+- per-turn timeout scaling with 5x cap (prevents indefinite hangs on deep sequences)
+- context window token estimation with warning when approaching model limits
+- `tier` field in pydantic Settings with `Literal` type validation
+- `schema.json` documenting the full entry format (JSON Schema 2020-12)
+- `archive/adversarial/` directory for refusal sensitivity entries
+- `subcategory`, `tier`, `difficulty`, `per_turn_expectations` fields to dataset schema
+
+### changed
+- default run tier is `quick` (was: run everything)
+- dataset validation now accepts and validates 4 new optional fields
+- all 190 dataset entries tagged with tier/subcategory/difficulty
+- exits non-zero when tier filter produces 0 matching entries
+
+### removed
+- 30 refusal sensitivity entries moved from default dataset to `archive/adversarial/refusal_sensitivity.json` (still accessible via `--dataset`)
+
 ## 3.0.0 - 2026-03-25
 
 breaking overhaul. if you're upgrading from 2.x, start fresh.

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.0.0
+3.1.0
diff --git a/archive/adversarial/README.md b/archive/adversarial/README.md
@@ -0,0 +1,26 @@
+# archived adversarial eval content
+
+these prompts test refusal sensitivity: how models handle requests that could be
+interpreted as requesting harmful content but are actually benign (academic research,
+creative writing, historical analysis, etc.).
+
+## why archived
+
+hosted API providers may flag or rate-limit accounts that send adversarial-adjacent
+prompts at scale. these sequences are preserved for:
+- local model testing (ollama, llama.cpp, vLLM)
+- testing with explicit provider permission
+- red-team exercises with appropriate authorization
+
+## how to run
+
+```bash
+promptpressure --dataset archive/adversarial/refusal_sensitivity.json --multi-config config.yaml
+```
+
+no special flags needed. the archive is just a dataset file in a different directory.
+
+## contents
+
+30 entries (rs_001 through rs_030). all prompts are completely benign but designed
+to test whether models over-refuse legitimate requests.
diff --git a/archive/adversarial/refusal_sensitivity.json b/archive/adversarial/refusal_sensitivity.json
diff --git a/evals_dataset.json b/evals_dataset.json
diff --git a/promptpressure/adapters/__pycache__/__init__.cpython-314.pyc b/promptpressure/adapters/__pycache__/__init__.cpython-314.pyc
diff --git a/promptpressure/adapters/__pycache__/groq_adapter.cpython-314.pyc b/promptpressure/adapters/__pycache__/groq_adapter.cpython-314.pyc
diff --git a/promptpressure/adapters/__pycache__/lmstudio_adapter.cpython-314.pyc b/promptpressure/adapters/__pycache__/lmstudio_adapter.cpython-314.pyc
diff --git a/promptpressure/adapters/__pycache__/mock_adapter.cpython-314.pyc b/promptpressure/adapters/__pycache__/mock_adapter.cpython-314.pyc
diff --git a/promptpressure/adapters/__pycache__/openrouter_adapter.cpython-314.pyc b/promptpressure/adapters/__pycache__/openrouter_adapter.cpython-314.pyc
diff --git a/promptpressure/cli.py b/promptpressure/cli.py
@@ -21,6 +21,8 @@
 from promptpressure.monitoring import start_metrics_server, stop_metrics_server, record_api_request, record_evaluation_start, record_evaluation_end, record_prompt_processing, record_response, update_custom_metrics
 from promptpressure.reporting import ReportGenerator
 from promptpressure.database import init_db, get_db_session, Evaluation, Result, Metric, DATABASE_URL
+from promptpressure.per_turn_metrics import compute_turn_metrics
+from promptpressure.tier import filter_by_tier
 
 def log_error(output_dir, error_msg):
     log_path = os.path.join(output_dir, "error.log")
@@ -37,6 +39,16 @@ async def run_evaluation_suite(config, adapter_name):
     with open(dataset_file, "r", encoding="utf-8") as f:
         prompts = json.load(f)
 
+    # Tier filtering
+    tier = config.get("tier", "quick")
+    original_count = len(prompts)
+    prompts, skipped = filter_by_tier(prompts, tier, warn_invalid=True)
+    print(f"Tier '{tier}': {len(prompts)}/{original_count} sequences selected")
+    if not prompts:
+        print(f"ERROR: Tier '{tier}' matched 0 entries. Nothing to evaluate.")
+        import sys
+        sys.exit(1)
+
     # Prepare output directory
     base_output_dir = config.get("output_dir", "outputs")
     use_ts = config.get("use_timestamp_output_dir", True)
@@ -237,8 +249,16 @@ async def _process_multi_turn(entry, turns):
             conversation.append({"role": turn_role, "content": turn_content})
 
             try:
-                # Send full conversation history to adapter
-                response_text = await adapter_fn(turn_content, config, messages=list(conversation))
+                # Timeout scales with turn count, capped at 5x base
+                base_timeout = config.get("timeout", 60)
+                turn_timeout = min(base_timeout * (1 + turn_idx * 0.5), base_timeout * 5)
+                try:
+                    response_text = await asyncio.wait_for(
+                        adapter_fn(turn_content, config, messages=list(conversation)),
+                        timeout=turn_timeout
+                    )
+                except asyncio.TimeoutError as e:
+                    raise TimeoutError(f"Turn {turn_idx} timed out after {turn_timeout:.0f}s") from e
 
                 # Capture reasoning tokens if available
                 turn_reasoning = ""
@@ -250,13 +270,25 @@ async def _process_multi_turn(entry, turns):
 
                 # Add assistant response to conversation history
                 conversation.append({"role": "assistant", "content": response_text})
+
+                # Rough token estimation for context window warning
+                total_chars = sum(len(m["content"]) for m in conversation)
+                estimated_tokens = total_chars // 4
+                if estimated_tokens > 6000 and turn_idx < len(turns):
+                    print(f"  warning: {entry.get('id')} at ~{estimated_tokens} tokens after turn {turn_idx} "
+                          f"(may exceed small model context windows)")
+
                 turn_entry = {
                     "turn": turn_idx,
                     "user": turn_content,
                     "assistant": response_text
                 }
                 if turn_reasoning:
                     turn_entry["reasoning"] = turn_reasoning
+                # Compute per-turn behavioral metrics
+                turn_entry["metrics"] = compute_turn_metrics(
+                    turn_content, response_text, turn_number=turn_idx
+                )
                 turn_responses.append(turn_entry)
 
             except Exception as e:
@@ -284,6 +316,9 @@ async def _process_multi_turn(entry, turns):
             record_response(success=False)
             record_api_request(model=model_name, adapter=adapter_name, duration=duration, success=False, error_type="MultiTurnError")
 
+        # Aggregate per-turn metrics for the sequence
+        per_turn_metrics = [tr["metrics"] for tr in turn_responses if "metrics" in tr]
+
         # Build combined response for backward compat (CSV/JSON output)
         combined_response = "\n\n".join(
             f"[Turn {tr['turn']}]\nUser: {tr['user']}\nAssistant: {tr['assistant']}"
@@ -306,7 +341,8 @@ async def _process_multi_turn(entry, turns):
             "success": success,
             "error": error_msg,
             "multi_turn": True,
-            "plugin_scores": {}
+            "plugin_scores": {},
+            "per_turn_metrics": per_turn_metrics,
         }
 
         await emit_event("end_prompt", {
@@ -565,7 +601,11 @@ async def main_async():
     parser.add_argument("--post-analyze", choices=["groq", "openrouter"], help="Optional post-analysis adapter")
     parser.add_argument("--schema", action="store_true", help="Dump JSON Schema for configuration and exit")
     parser.add_argument("--ci", action="store_true", help="CI mode: output machine-readable JSON summary, exit 1 on any failure")
-
+    parser.add_argument("--tier", choices=["smoke", "quick", "full", "deep"],
+                        default=None, help="Run tier (smoke/quick/full/deep). Default: quick")
+    parser.add_argument("--smoke", action="store_true", help="Shortcut for --tier smoke")
+    parser.add_argument("--quick", action="store_true", help="Shortcut for --tier quick")
+
     # Plugin CLI commands
     subparsers = parser.add_subparsers(dest="command", help="Sub-commands")
 
@@ -582,6 +622,16 @@ async def main_async():
 
     args = parser.parse_args()
 
+    # Resolve tier from flags
+    if args.smoke:
+        tier_override = "smoke"
+    elif args.quick:
+        tier_override = "quick"
+    elif args.tier:
+        tier_override = args.tier
+    else:
+        tier_override = None  # use config default
+
     if args.schema:
         from promptpressure.config import Settings
         print(json.dumps(Settings.model_json_schema(), indent=2))
@@ -629,6 +679,8 @@ async def main_async():
             import sys
             sys.exit(1)
         config_dict = config.model_dump()
+        if tier_override:
+            config_dict["tier"] = tier_override
         last_config = config_dict
 
         results, out_dir, metrics_collector = await run_evaluation_suite(config_dict, config_dict.get("adapter"))

diff --git a/promptpressure/config.py b/promptpressure/config.py
@@ -4,7 +4,7 @@
 """
 import os
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List, Literal
 
 from dotenv import load_dotenv
 from pydantic import Field, model_validator
@@ -32,7 +32,10 @@ class Settings(BaseSettings):
     output: str = Field(..., description="Output filename for evaluation results")
     output_dir: str = Field("outputs", description="Directory for output files")
     temperature: float = Field(0.7, ge=0.0, le=2.0, description="Sampling temperature")
-
+
+    # Tier settings
+    tier: Literal["smoke", "quick", "full", "deep"] = Field("quick", description="Run tier: smoke (<60s CI), quick (<10min), full (~1hr), deep (all)")
+
     # Performance settings
     max_workers: int = Field(1, ge=1, le=10, description="Number of concurrent workers for prompt evaluation")
 

diff --git a/promptpressure/per_turn_metrics.py b/promptpressure/per_turn_metrics.py
@@ -0,0 +1,39 @@
+"""Automated per-turn behavioral metrics for multi-turn eval sequences.
+
+These metrics are computed without LLM grading calls. They measure
+observable response characteristics that indicate behavioral drift.
+"""
+
+
+def compute_response_length_ratio(user_message: str, response: str) -> float:
+    """Return ``len(response) / len(user_message)``, measured in characters.
+
+    Comparing this ratio across turns exposes terse/verbose drift, e.g. a
+    model whose detailed early answers shrink to one-liners later on.
+
+    An empty (falsy) input on either side yields 0.0, so no division by zero.
+    """
+    if user_message and response:
+        return len(response) / len(user_message)
+    return 0.0
+
+
+def compute_turn_metrics(
+    user_message: str,
+    response: str,
+    turn_number: int = 1,
+) -> dict:
+    """Build the automated metric record for one turn of a sequence.
+
+    Args:
+        user_message: text the user sent on this turn
+        response: text the model returned on this turn
+        turn_number: turn number in the sequence, echoed back under "turn"
+
+    Returns:
+        dict with the keys "turn" and "response_length_ratio"
+    """
+    ratio = compute_response_length_ratio(user_message, response)
+    metrics: dict = {"turn": turn_number}
+    metrics["response_length_ratio"] = ratio
+    return metrics
diff --git a/promptpressure/plugins/__pycache__/__init__.cpython-314.pyc b/promptpressure/plugins/__pycache__/__init__.cpython-314.pyc
diff --git a/promptpressure/plugins/__pycache__/core.cpython-314.pyc b/promptpressure/plugins/__pycache__/core.cpython-314.pyc
diff --git a/promptpressure/plugins/__pycache__/demo_scorer.cpython-314.pyc b/promptpressure/plugins/__pycache__/demo_scorer.cpython-314.pyc
diff --git a/promptpressure/tier.py b/promptpressure/tier.py
@@ -0,0 +1,44 @@
+"""Tier filtering for PromptPressure eval datasets.
+
+Tiers are cumulative: smoke < quick < full < deep.
+--tier quick runs all entries tagged smoke OR quick.
+Entries without a tier field default to "full".
+"""
+
+TIER_ORDER = ["smoke", "quick", "full", "deep"]
+
+
+def filter_by_tier(entries: list[dict], tier: str, warn_invalid: bool = False) -> tuple[list[dict], int]:
+    """Filter dataset entries by tier level (cumulative).
+
+    Args:
+        entries: list of dataset entry dicts; a missing "tier" key counts as "full"
+        tier: requested tier level (smoke, quick, full, deep)
+        warn_invalid: if True, print warning for entries with invalid tier values
+
+    Returns:
+        tuple of (filtered list, count of skipped invalid entries)
+
+    Raises:
+        ValueError: if tier is not a valid tier name
+    """
+    if tier not in TIER_ORDER:
+        raise ValueError(f"Invalid tier '{tier}'. Must be one of: {TIER_ORDER}")
+
+    max_index = TIER_ORDER.index(tier)
+
+    result = []
+    skipped = []
+    for entry in entries:
+        entry_tier = entry.get("tier", "full")
+        if entry_tier not in TIER_ORDER:
+            # str() keeps the join below from raising on non-string ids (e.g. ints).
+            skipped.append(str(entry.get("id", "unknown")))
+        elif TIER_ORDER.index(entry_tier) <= max_index:
+            result.append(entry)
+
+    if warn_invalid and skipped:
+        print(f"  warning: {len(skipped)} entries skipped (invalid tier): {', '.join(skipped[:5])}"
+              + (f" and {len(skipped) - 5} more" if len(skipped) > 5 else ""))
+
+    return result, len(skipped)
diff --git a/schema.json b/schema.json
@@ -0,0 +1,77 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "PromptPressure Eval Dataset",
+  "description": "Schema for entries in evals_dataset.json",
+  "type": "array",
+  "items": {
+    "type": "object",
+    "required": ["id", "category", "prompt", "expected_behavior", "eval_criteria"],
+    "properties": {
+      "id": {
+        "type": "string",
+        "description": "Unique entry ID, e.g. if_001, sy_025"
+      },
+      "category": {
+        "type": "string",
+        "description": "Evaluation category name"
+      },
+      "subcategory": {
+        "type": "string",
+        "description": "Subcategory within the category"
+      },
+      "tier": {
+        "type": "string",
+        "enum": ["smoke", "quick", "full", "deep"],
+        "description": "Run tier. smoke < quick < full < deep (cumulative)"
+      },
+      "difficulty": {
+        "type": "string",
+        "enum": ["easy", "medium", "hard"],
+        "description": "Difficulty level (orthogonal to tier)"
+      },
+      "prompt": {
+        "oneOf": [
+          {"type": "string", "minLength": 1},
+          {
+            "type": "array",
+            "minItems": 1,
+            "items": {
+              "type": "object",
+              "required": ["role", "content"],
+              "properties": {
+                "role": {"type": "string"},
+                "content": {"type": "string", "minLength": 1}
+              }
+            }
+          }
+        ],
+        "description": "Single-turn string or multi-turn message array"
+      },
+      "expected_behavior": {
+        "type": "string",
+        "minLength": 1,
+        "description": "Human-readable description of expected model behavior"
+      },
+      "per_turn_expectations": {
+        "type": "array",
+        "items": {
+          "type": "object",
+          "required": ["turn", "expected"],
+          "properties": {
+            "turn": {"type": "integer", "minimum": 1},
+            "expected": {"type": "string", "minLength": 1}
+          }
+        },
+        "description": "Per-turn expected behaviors for multi-turn sequences"
+      },
+      "eval_criteria": {
+        "type": "object",
+        "description": "Key-value pairs for LLM-as-judge grading"
+      },
+      "notes": {
+        "type": "string",
+        "description": "Optional authoring notes"
+      }
+    }
+  }
+}
diff --git a/tests/__pycache__/__init__.cpython-314.pyc b/tests/__pycache__/__init__.cpython-314.pyc
diff --git a/tests/__pycache__/test_adapters.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_adapters.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_aggregate_openrouter_scores.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_aggregate_openrouter_scores.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_config.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_config.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_config_advanced.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_config_advanced.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_dataset_validation.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_dataset_validation.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_env.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_env.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_metrics_full.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_metrics_full.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_metrics_simple.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_metrics_simple.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_openrouter.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_openrouter.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/__pycache__/test_reporting.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_reporting.cpython-314-pytest-9.0.2.pyc
diff --git a/tests/test_cli_tier.py b/tests/test_cli_tier.py
@@ -0,0 +1,15 @@
+import pytest
+
+
+class TestConfigTierField:
+    def test_tier_field_exists_in_schema(self):
+        """The generated Settings JSON schema exposes a ``tier`` property."""
+        from promptpressure.config import Settings
+        properties = Settings.model_json_schema()["properties"]
+        assert "tier" in properties, "Settings schema missing 'tier' field"
+
+    def test_tier_default_is_quick(self):
+        """The ``tier`` property in the schema carries the default ``quick``."""
+        from promptpressure.config import Settings
+        tier_schema = Settings.model_json_schema()["properties"]["tier"]
+        assert tier_schema["default"] == "quick"