From 91b3614a8f29d66278ece1fb907a245eb7ccf83e Mon Sep 17 00:00:00 2001
From: AneeshD04 <aneesh72583@gmail.com>
Date: Fri, 12 Jun 2026 14:04:00 -0700
Subject: [PATCH 1/4] Add DemandAssessor model and demand annotation pipeline

- DemandAssessor: neural IRT model predicting P(response=1 | subject, item_features)
  via MLP over concatenated subject embeddings and item feature vectors
- Full demand annotation pipeline (DemandAnnotator, GeminiClient, RubricsCatalog,
  AnnotationCache) implementing the 18-dimension ADeLe rubric scoring system
- Unit tests for DemandAssessor (24 tests, synthetic data, no pretrained model needed)
- Unit and live end-to-end tests for annotation pipeline
- Fix deferred import in LLMJudge to avoid crash when transformers not installed

Closes #41
---
 .gitignore                                    |   3 +
 src/torch_measure/annotation/__init__.py      |  62 +++
 src/torch_measure/annotation/_annotator.py    | 122 ++++++
 src/torch_measure/annotation/_cache.py        |  57 +++
 src/torch_measure/annotation/_client.py       |  97 +++++
 src/torch_measure/annotation/_parsers.py      |  54 +++
 src/torch_measure/annotation/_prompts.py      |  25 ++
 src/torch_measure/annotation/_rubrics.py      |  62 +++
 src/torch_measure/annotation/_types.py        |  82 ++++
 src/torch_measure/annotation/_ug.py           |  83 ++++
 src/torch_measure/annotation/py.typed         |   0
 src/torch_measure/annotation/rubrics/AS.txt   |  39 ++
 src/torch_measure/annotation/rubrics/AT.txt   |  30 ++
 src/torch_measure/annotation/rubrics/CEc.txt  |  32 ++
 src/torch_measure/annotation/rubrics/CEe.txt  |  32 ++
 src/torch_measure/annotation/rubrics/CL.txt   |  36 ++
 src/torch_measure/annotation/rubrics/KNa.txt  |  32 ++
 src/torch_measure/annotation/rubrics/KNc.txt  |  32 ++
 src/torch_measure/annotation/rubrics/KNf.txt  |  32 ++
 src/torch_measure/annotation/rubrics/KNn.txt  |  32 ++
 src/torch_measure/annotation/rubrics/KNs.txt  |  32 ++
 src/torch_measure/annotation/rubrics/MCr.txt  |  31 ++
 src/torch_measure/annotation/rubrics/MCt.txt  |  38 ++
 src/torch_measure/annotation/rubrics/MCu.txt  |  38 ++
 src/torch_measure/annotation/rubrics/MS.txt   |  38 ++
 src/torch_measure/annotation/rubrics/QLl.txt  |  38 ++
 src/torch_measure/annotation/rubrics/QLq.txt  |  38 ++
 src/torch_measure/annotation/rubrics/SNs.txt  |  39 ++
 .../annotation/rubrics/UG_choice_num.txt      |  40 ++
 src/torch_measure/annotation/rubrics/VO.txt   |  39 ++
 src/torch_measure/models/__init__.py          |   2 +
 src/torch_measure/models/demand_assessor.py   | 332 +++++++++++++++
 src/torch_measure/models/llm_judge.py         |   3 +-
 tests/test_annotation/__init__.py             |   0
 tests/test_annotation/test_cache.py           | 271 ++++++++++++
 tests/test_annotation/test_live.py            | 257 ++++++++++++
 tests/test_annotation/test_parsers.py         | 225 ++++++++++
 tests/test_annotation/test_pipeline.py        | 390 ++++++++++++++++++
 tests/test_annotation/test_prompts.py         | 149 +++++++
 tests/test_annotation/test_rubrics.py         | 235 +++++++++++
 tests/test_models/test_demand_assessor.py     | 389 +++++++++++++++++
 41 files changed, 3567 insertions(+), 1 deletion(-)
 create mode 100644 src/torch_measure/annotation/__init__.py
 create mode 100644 src/torch_measure/annotation/_annotator.py
 create mode 100644 src/torch_measure/annotation/_cache.py
 create mode 100644 src/torch_measure/annotation/_client.py
 create mode 100644 src/torch_measure/annotation/_parsers.py
 create mode 100644 src/torch_measure/annotation/_prompts.py
 create mode 100644 src/torch_measure/annotation/_rubrics.py
 create mode 100644 src/torch_measure/annotation/_types.py
 create mode 100644 src/torch_measure/annotation/_ug.py
 create mode 100644 src/torch_measure/annotation/py.typed
 create mode 100644 src/torch_measure/annotation/rubrics/AS.txt
 create mode 100644 src/torch_measure/annotation/rubrics/AT.txt
 create mode 100644 src/torch_measure/annotation/rubrics/CEc.txt
 create mode 100644 src/torch_measure/annotation/rubrics/CEe.txt
 create mode 100644 src/torch_measure/annotation/rubrics/CL.txt
 create mode 100644 src/torch_measure/annotation/rubrics/KNa.txt
 create mode 100644 src/torch_measure/annotation/rubrics/KNc.txt
 create mode 100644 src/torch_measure/annotation/rubrics/KNf.txt
 create mode 100644 src/torch_measure/annotation/rubrics/KNn.txt
 create mode 100644 src/torch_measure/annotation/rubrics/KNs.txt
 create mode 100644 src/torch_measure/annotation/rubrics/MCr.txt
 create mode 100644 src/torch_measure/annotation/rubrics/MCt.txt
 create mode 100644 src/torch_measure/annotation/rubrics/MCu.txt
 create mode 100644 src/torch_measure/annotation/rubrics/MS.txt
 create mode 100644 src/torch_measure/annotation/rubrics/QLl.txt
 create mode 100644 src/torch_measure/annotation/rubrics/QLq.txt
 create mode 100644 src/torch_measure/annotation/rubrics/SNs.txt
 create mode 100644 src/torch_measure/annotation/rubrics/UG_choice_num.txt
 create mode 100644 src/torch_measure/annotation/rubrics/VO.txt
 create mode 100644 src/torch_measure/models/demand_assessor.py
 create mode 100644 tests/test_annotation/__init__.py
 create mode 100644 tests/test_annotation/test_cache.py
 create mode 100644 tests/test_annotation/test_live.py
 create mode 100644 tests/test_annotation/test_parsers.py
 create mode 100644 tests/test_annotation/test_pipeline.py
 create mode 100644 tests/test_annotation/test_prompts.py
 create mode 100644 tests/test_annotation/test_rubrics.py
 create mode 100644 tests/test_models/test_demand_assessor.py

diff --git a/.gitignore b/.gitignore
index 49217854..fed9b249 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Test annotation cache (API responses — do not commit)
+tests/test_annotation/paper_comparison_cache.jsonl
+
 # Python
 __pycache__/
 *.py[cod]
diff --git a/src/torch_measure/annotation/__init__.py b/src/torch_measure/annotation/__init__.py
new file mode 100644
index 00000000..ea47547a
--- /dev/null
+++ b/src/torch_measure/annotation/__init__.py
@@ -0,0 +1,62 @@
+"""ADeLe demand annotation pipeline (Gemini re-implementation).
+
+Reproduces the annotation methodology from:
+  Zhou et al. (2026) "General scales unlock AI evaluation with explanatory
+  and predictive power." Nature.
+
+Public API
+----------
+DemandAnnotator   — main entry point: annotates one item or a full dataset
+GeminiClient      — Gemini API wrapper (caller supplies pinned model string)
+RubricsCatalog    — loads the 19 bundled rubric files
+AnnotationCache   — append-only JSONL result cache
+
+Data types
+----------
+AnnotationJob     — input: item_id, content, reference_answer
+DemandAnnotation  — one (item, rubric) result with CoT response
+UGAnnotation      — UG classification result
+ItemAnnotation    — all 19 annotations for one item (.to_feature_vector())
+DemandVector      — full-dataset tensor (n_items × 19) for DemandAssessor
+CacheEntry        — one persisted cache record
+
+Constants
+---------
+DIMENSION_ORDER   — canonical ordering of all 19 dimensions
+DEMAND_DIMENSIONS — the first 18 (excludes UG)
+"""
+from ._annotator import DemandAnnotator
+from ._cache import AnnotationCache
+from ._client import GeminiClient
+from ._rubrics import RubricsCatalog
+from ._types import (
+    DEMAND_DIMENSIONS,
+    DIMENSION_ORDER,
+    N_DIMENSIONS,
+    AnnotationJob,
+    CacheEntry,
+    DemandAnnotation,
+    DemandVector,
+    ItemAnnotation,
+    Rubric,
+    UGAnnotation,
+)
+from ._ug import UGAnnotator
+
+__all__ = [
+    "DemandAnnotator",
+    "GeminiClient",
+    "RubricsCatalog",
+    "AnnotationCache",
+    "UGAnnotator",
+    "AnnotationJob",
+    "DemandAnnotation",
+    "UGAnnotation",
+    "ItemAnnotation",
+    "DemandVector",
+    "CacheEntry",
+    "Rubric",
+    "DIMENSION_ORDER",
+    "DEMAND_DIMENSIONS",
+    "N_DIMENSIONS",
+]
diff --git a/src/torch_measure/annotation/_annotator.py b/src/torch_measure/annotation/_annotator.py
new file mode 100644
index 00000000..b97b45bb
--- /dev/null
+++ b/src/torch_measure/annotation/_annotator.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import hashlib
+from typing import Optional
+
+from ._cache import AnnotationCache, make_cache_key
+from ._client import GeminiClient
+from ._parsers import extract_demand_level
+from ._prompts import get_full_instruction
+from ._rubrics import RubricsCatalog
+from ._types import (
+    AnnotationJob,
+    CacheEntry,
+    DemandAnnotation,
+    DemandVector,
+    ItemAnnotation,
+    Rubric,
+)
+from ._ug import UGAnnotator
+
+
+class DemandAnnotator:
+    """Runs the full 19-call ADeLe annotation pipeline for one benchmark item.
+
+    One API call per demand rubric (18 sequential calls) plus one UG call.
+    Results are cached to avoid redundant API calls across runs.
+    """
+
+    def __init__(
+        self,
+        client: GeminiClient,
+        rubrics: RubricsCatalog,
+        cache: Optional[AnnotationCache] = None,
+    ) -> None:
+        self._client = client
+        self._rubrics = rubrics
+        self._cache = cache
+        self._ug = UGAnnotator(client, rubrics, cache)
+
+    def annotate(self, job: AnnotationJob) -> ItemAnnotation:
+        """Annotate one item across all 18 demand rubrics plus UG."""
+        demands: dict[str, DemandAnnotation] = {}
+        for rubric in self._rubrics.all_demand_rubrics():
+            demands[rubric.acronym] = self._annotate_one(job, rubric)
+        ug = self._ug.annotate(job)
+        return ItemAnnotation(item_id=job.item_id, demands=demands, ug=ug)
+
+    def annotate_dataset(self, jobs: list[AnnotationJob]) -> DemandVector:
+        """Annotate all items and return a (n_items × 19) tensor.
+
+        Row ordering in the returned ``DemandVector.tensor`` mirrors the order
+        of ``jobs``. To pass the result to ``DemandAssessor.fit()``, supply
+        ``jobs`` in the same order as ``data.to_fit_tensors()["item_ids"]``::
+
+            item_ids = data.to_fit_tensors()["item_ids"]          # canonical order
+            jobs = [AnnotationJob(iid, content[iid], ref[iid]) for iid in item_ids]
+            dv   = annotator.annotate_dataset(jobs)
+            model.fit(data, item_features=dv.tensor)
+        """
+        import torch
+
+        item_ids: list[str] = []
+        rows: list[list[float]] = []
+        for job in jobs:
+            item_ann = self.annotate(job)
+            item_ids.append(job.item_id)
+            rows.append(item_ann.to_feature_vector())
+
+        tensor = torch.tensor(rows, dtype=torch.float32)
+        return DemandVector(item_ids=item_ids, tensor=tensor)
+
+    def _annotate_one(self, job: AnnotationJob, rubric: Rubric) -> DemandAnnotation:
+        key = make_cache_key(
+            content=job.content,
+            acronym=rubric.acronym,
+            model_id=self._client.model,
+            rubric_hash=rubric.rubric_hash,
+        )
+
+        if self._cache is not None:
+            entry = self._cache.get(key)
+            if entry is not None:
+                return DemandAnnotation(
+                    item_id=job.item_id,
+                    demand=rubric.acronym,
+                    level=entry.level,
+                    finish_reason=entry.finish_reason,
+                    model_response=entry.model_response,
+                )
+
+        prompt = get_full_instruction(
+            dimension=rubric.dimension_name,
+            rubric_content=rubric.content,
+            item_text=job.content,
+        )
+        model_response, finish_reason = self._client.generate(prompt)
+        level = extract_demand_level(model_response)
+
+        annotation = DemandAnnotation(
+            item_id=job.item_id,
+            demand=rubric.acronym,
+            level=level,
+            finish_reason=finish_reason,
+            model_response=model_response,
+        )
+
+        if self._cache is not None:
+            content_hash = hashlib.sha256(job.content.encode()).hexdigest()[:16]
+            self._cache.put(CacheEntry(
+                key=key,
+                item_id=job.item_id,
+                demand=rubric.acronym,
+                level=level,
+                finish_reason=finish_reason,
+                model_response=model_response,
+                rubric_hash=rubric.rubric_hash,
+                model_id=self._client.model,
+                content_hash=content_hash,
+                timestamp=AnnotationCache.now_iso(),
+            ))
+
+        return annotation
diff --git a/src/torch_measure/annotation/_cache.py b/src/torch_measure/annotation/_cache.py
new file mode 100644
index 00000000..ba208902
--- /dev/null
+++ b/src/torch_measure/annotation/_cache.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import dataclasses
+import hashlib
+import json
+import math
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+from ._types import CacheEntry
+
+
+def make_cache_key(content: str, acronym: str, model_id: str, rubric_hash: str) -> str:
+    """Return the cache key ``<sha256(content) first 16 hex chars>:<acronym>:<model_id>:<rubric_hash>``."""
+    content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
+    return f"{content_hash}:{acronym}:{model_id}:{rubric_hash}"
+
+
+class AnnotationCache:
+    """Append-only JSONL cache keyed by sha256(content)[:16]:acronym:model_id:rubric_hash."""
+
+    def __init__(self, path: Path) -> None:
+        self._path = path
+        self._index: dict[str, CacheEntry] = {}
+        if path.exists():
+            self._load()
+
+    def _load(self) -> None:
+        with open(self._path, encoding="utf-8") as fh:
+            for line in fh:
+                line = line.strip()
+                if not line:
+                    continue
+                data = json.loads(line)
+                # NaN is serialised as null (RFC-compliant); restore here.
+                level = data.get("level")
+                if level is None:
+                    data["level"] = math.nan
+                entry = CacheEntry(**data)
+                self._index[entry.key] = entry
+
+    def get(self, key: str) -> Optional[CacheEntry]:
+        return self._index.get(key)
+
+    def put(self, entry: CacheEntry) -> None:
+        self._index[entry.key] = entry
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        record = dataclasses.asdict(entry)
+        if isinstance(record["level"], float) and math.isnan(record["level"]):
+            record["level"] = None
+        with open(self._path, "a", encoding="utf-8") as fh:
+            fh.write(json.dumps(record) + "\n")
+
+    @staticmethod
+    def now_iso() -> str:
+        return datetime.now(timezone.utc).isoformat()
diff --git a/src/torch_measure/annotation/_client.py b/src/torch_measure/annotation/_client.py
new file mode 100644
index 00000000..dc86a530
--- /dev/null
+++ b/src/torch_measure/annotation/_client.py
@@ -0,0 +1,97 @@
+"""Gemini API client — the only file that imports google.genai."""
+from __future__ import annotations
+
+import time
+
+_FINISH_REASON_MAP: dict[str, str] = {
+    "STOP": "stop",
+    "MAX_TOKENS": "length",
+}
+
+
+def _is_retryable(exc: BaseException) -> bool:
+    """Return True for transient httpx errors, Gemini ServerError, or ClientError with code 429."""
+    # Network-level transient errors — server closed keep-alive connection
+    try:
+        import httpx
+        if isinstance(exc, (httpx.RemoteProtocolError, httpx.ConnectError, httpx.ReadError)):
+            return True
+    except ImportError:
+        pass
+    # Gemini API errors; a missing or older SDK must not make this predicate itself raise.
+    try:
+        from google.genai import errors as genai_errors
+        if isinstance(exc, genai_errors.ServerError):
+            return True
+        if isinstance(exc, genai_errors.ClientError):
+            return (getattr(exc, "status_code", None) or getattr(exc, "code", None)) == 429
+    except (ImportError, AttributeError):
+        pass
+    return False
+
+
+class GeminiClient:
+    """Thin wrapper around google.genai with retry and finish-reason normalisation.
+
+    Parameters
+    ----------
+    api_key:
+        Gemini API key.
+    model:
+        Pinned model string, e.g. "gemini-2.0-flash-001". No default — caller
+        must supply the exact version to guarantee reproducibility.
+    """
+
+    _TEMPERATURE = 0.0
+    _MAX_OUTPUT_TOKENS = 4096  # 2.5 Flash generates longer CoT than 2.0 Flash
+
+    def __init__(self, api_key: str, model: str, rpm: int = 0) -> None:
+        import google.genai as genai
+        from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
+
+        self._client = genai.Client(api_key=api_key)
+        self.model = model
+        self._min_interval = (60.0 / rpm) if rpm > 0 else 0.0
+        self._last_call_time: float = 0.0
+
+        self._generate_with_retry = retry(
+            retry=retry_if_exception(_is_retryable),
+            wait=wait_exponential(min=2, max=256),
+            stop=stop_after_attempt(10),
+            reraise=True,
+        )(self._call_api)
+
+    def generate(self, prompt: str) -> tuple[str, str]:
+        """Call the API and return (response_text, finish_reason).
+
+        Retries up to 10 times on transient errors with exponential backoff
+        (min 2 s, max 256 s), matching the paper's tenacity settings.
+        """
+        return self._generate_with_retry(prompt)
+
+    def _call_api(self, prompt: str) -> tuple[str, str]:
+        from google.genai import types as genai_types
+
+        if self._min_interval > 0:
+            elapsed = time.monotonic() - self._last_call_time
+            if elapsed < self._min_interval:
+                time.sleep(self._min_interval - elapsed)
+        self._last_call_time = time.monotonic()
+
+        response = self._client.models.generate_content(
+            model=self.model,
+            contents=prompt,
+            config=genai_types.GenerateContentConfig(
+                temperature=self._TEMPERATURE,
+                max_output_tokens=self._MAX_OUTPUT_TOKENS,
+            ),
+        )
+        text: str = response.text or ""
+        candidate = response.candidates[0] if response.candidates else None
+        raw_reason = (
+            candidate.finish_reason.name
+            if candidate and candidate.finish_reason
+            else "FINISH_REASON_UNSPECIFIED"
+        )
+        finish_reason = _FINISH_REASON_MAP.get(raw_reason, "other")
+        return text, finish_reason
diff --git a/src/torch_measure/annotation/_parsers.py b/src/torch_measure/annotation/_parsers.py
new file mode 100644
index 00000000..e7ad7dd2
--- /dev/null
+++ b/src/torch_measure/annotation/_parsers.py
@@ -0,0 +1,54 @@
+"""Pure parsing functions, verbatim from adgomant/delean-batch-manager/parse.py."""
+from __future__ import annotations
+
+import math
+import re
+
+
+def extract_demand_level(response: str) -> float:
+    """Verbatim from extract_demand_level_from_response() in the paper repo.
+
+    Splits on blank lines, takes the last paragraph as the conclusion,
+    extracts the last integer, validates 0-5, rejects leading section numbers.
+    Returns math.nan on any failure.
+    """
+    segments = response.split("\n\n")
+    conclusion = segments[-1]
+
+    digits = re.findall(r"\d+", conclusion)
+    if not digits:
+        return math.nan
+
+    score = int(digits[-1])
+    if not 0 <= score <= 5:
+        return math.nan
+
+    # Reject if the only integer found is a leading section-header number
+    # (e.g. "4. Conclusion:" where 4 is not the actual score).
+    if len(digits) == 1 and re.search(rf"^{score}\.", conclusion, re.MULTILINE):
+        return math.nan
+
+    return float(score)
+
+
+def extract_ug_score(response: str) -> tuple[str, float]:
+    """Parse a UG classification response into (raw_output, ug_score).
+
+    Model is instructed to output a single line: an integer N or the word "open".
+    Formula: ug_score = (1 - 1/N) * 100  for MCQ, or 100.0 for open-ended.
+    Returns math.nan as ug_score on any parse failure.
+    """
+    raw = response.strip().split("\n")[0].strip()
+
+    if raw.lower() == "open":
+        return raw, 100.0
+
+    try:
+        n = int(raw)
+    except ValueError:
+        return raw, math.nan
+
+    if n < 1:
+        return raw, math.nan
+
+    return raw, round((1.0 - 1.0 / n) * 100.0, 6)
diff --git a/src/torch_measure/annotation/_prompts.py b/src/torch_measure/annotation/_prompts.py
new file mode 100644
index 00000000..985f4445
--- /dev/null
+++ b/src/torch_measure/annotation/_prompts.py
@@ -0,0 +1,25 @@
+"""Pure prompt-construction functions, verbatim from the ADeLe paper pipeline."""
+from __future__ import annotations
+
+
+def get_full_instruction(dimension: str, rubric_content: str, item_text: str) -> str:
+    """Verbatim prompt template from adgomant/delean-batch-manager/src/.../files.py."""
+    return (
+        f"The following rubric describes six distinct levels of *{dimension}*"
+        f" required by different tasks:\n"
+        f"{rubric_content}\n"
+        f"\nTASK INSTANCE: {item_text}\n"
+        f"\nINSTRUCTION: Score the level of *{dimension}* demanded by the given"
+        f" TASK INSTANCE using a discrete value from 0 to 5. Use CHAIN-OF-THOUGHTS"
+        f" REASONING to reason step by step before assigning the score. After the"
+        f" CHAIN-OF-THOUGHTS REASONING STEPS, conclude your assessment with the"
+        f' statement: "Thus, the level of *{dimension}* demanded by the given TASK'
+        f' INSTANCE is: SCORE", where SCORE is an integer score you have determined.\n'
+        f"\nCHAIN-OF-THOUGHTS REASONING STEPS to score the level of *{dimension}*"
+        f" demanded by the given TASK INSTANCE above:\n"
+    )
+
+
+def get_ug_instruction(item_text: str, reference_answer: str, ug_rubric_content: str) -> str:
+    """Best-faith reconstruction of the UG prompt (prepend format undocumented in paper repos)."""
+    return f"{item_text}\n\nReference answer: {reference_answer}\n\n{ug_rubric_content}"
diff --git a/src/torch_measure/annotation/_rubrics.py b/src/torch_measure/annotation/_rubrics.py
new file mode 100644
index 00000000..13f38fb0
--- /dev/null
+++ b/src/torch_measure/annotation/_rubrics.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+from ._types import DEMAND_DIMENSIONS, Rubric
+
+
+class RubricsCatalog:
+    """Loads rubric .txt files from the bundled rubrics/ directory."""
+
+    def __init__(self, rubrics_dir: Path | None = None) -> None:
+        if rubrics_dir is None:
+            rubrics_dir = Path(__file__).parent / "rubrics"
+        self._rubrics: dict[str, Rubric] = {}
+        self._ug_content: str = ""
+        self._ug_hash: str = ""
+        self._load(rubrics_dir)
+        missing = [a for a in DEMAND_DIMENSIONS if a not in self._rubrics]
+        if missing:
+            raise RuntimeError(f"Missing rubric files: {missing}")
+
+    def _load(self, rubrics_dir: Path) -> None:
+        for path in rubrics_dir.glob("*.txt"):
+            acronym = path.stem
+            text = path.read_text(encoding="utf-8")
+            lines = text.splitlines(keepends=True)
+
+            if lines and lines[0].startswith("#"):
+                dimension_name = lines[0].lstrip("#").strip()
+                content = "".join(lines[1:]).strip("\n")
+            else:
+                dimension_name = acronym
+                content = text
+
+            rubric_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
+
+            if acronym == "UG_choice_num":
+                self._ug_content = content
+                self._ug_hash = rubric_hash
+            else:
+                self._rubrics[acronym] = Rubric(
+                    acronym=acronym,
+                    dimension_name=dimension_name,
+                    content=content,
+                    rubric_hash=rubric_hash,
+                )
+
+    def get(self, acronym: str) -> Rubric:
+        return self._rubrics[acronym]
+
+    @property
+    def ug_content(self) -> str:
+        return self._ug_content
+
+    @property
+    def ug_hash(self) -> str:
+        return self._ug_hash
+
+    def all_demand_rubrics(self) -> list[Rubric]:
+        """Return all 18 demand rubrics in canonical DIMENSION_ORDER."""
+        return [self._rubrics[a] for a in DEMAND_DIMENSIONS]
diff --git a/src/torch_measure/annotation/_types.py b/src/torch_measure/annotation/_types.py
new file mode 100644
index 00000000..b238b066
--- /dev/null
+++ b/src/torch_measure/annotation/_types.py
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from typing import Any
+
+DIMENSION_ORDER: tuple[str, ...] = (
+    "AS", "CEc", "CEe", "CL", "MCr", "MCt", "MCu", "MS", "QLl", "QLq", "SNs",
+    "KNa", "KNc", "KNf", "KNn", "KNs", "AT", "VO", "UG",
+)
+
+DEMAND_DIMENSIONS: tuple[str, ...] = DIMENSION_ORDER[:-1]  # every dimension except the trailing "UG"
+N_DIMENSIONS: int = len(DIMENSION_ORDER)  # derived so it cannot drift from the tuple (currently 19)
+
+
+@dataclass
+class Rubric:
+    acronym: str
+    dimension_name: str
+    content: str       # verbatim file text after the # Title line
+    rubric_hash: str   # sha256(content)[:16]
+
+
+@dataclass
+class AnnotationJob:
+    item_id: str
+    content: str
+    reference_answer: str
+
+
+@dataclass
+class DemandAnnotation:
+    item_id: str
+    demand: str        # rubric acronym
+    level: float       # 0-5 or math.nan
+    finish_reason: str
+    model_response: str
+
+
+@dataclass
+class UGAnnotation:
+    item_id: str
+    raw_output: str
+    ug_score: float    # 0-100 or math.nan
+    finish_reason: str
+    model_response: str
+
+
+@dataclass
+class ItemAnnotation:
+    item_id: str
+    demands: dict[str, DemandAnnotation]  # acronym -> DemandAnnotation
+    ug: UGAnnotation
+
+    def to_feature_vector(self) -> list[float]:
+        result: list[float] = []
+        for dim in DEMAND_DIMENSIONS:
+            ann = self.demands.get(dim)
+            result.append(ann.level if ann is not None else math.nan)
+        result.append(self.ug.ug_score)
+        return result
+
+
+@dataclass
+class DemandVector:
+    item_ids: list[str]
+    tensor: Any  # torch.Tensor at runtime; torch not imported here
+
+
+@dataclass
+class CacheEntry:
+    key: str
+    item_id: str
+    demand: str        # rubric acronym or "UG"
+    level: float       # demand level 0-5 or UG score 0-100; math.nan on parse failure
+    finish_reason: str
+    model_response: str
+    rubric_hash: str
+    model_id: str
+    content_hash: str
+    timestamp: str
+    raw_output: str = ""  # UG only: the raw model token ("3", "open", etc.)
diff --git a/src/torch_measure/annotation/_ug.py b/src/torch_measure/annotation/_ug.py
new file mode 100644
index 00000000..f495c3f5
--- /dev/null
+++ b/src/torch_measure/annotation/_ug.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+import hashlib
+from typing import Optional
+
+from ._cache import AnnotationCache, make_cache_key
+from ._client import GeminiClient
+from ._parsers import extract_ug_score
+from ._prompts import get_ug_instruction
+from ._rubrics import RubricsCatalog
+from ._types import AnnotationJob, CacheEntry, UGAnnotation
+
+
+class UGAnnotator:
+    """Classifies benchmark items as MCQ or open-ended and computes the UG score.
+
+    UG (Unguessability) is separate from the 18 demand rubrics:
+      - MCQ with N choices → ug_score = (1 - 1/N) * 100
+      - open-ended         → ug_score = 100.0
+    """
+
+    def __init__(
+        self,
+        client: GeminiClient,
+        rubrics: RubricsCatalog,
+        cache: Optional[AnnotationCache] = None,
+    ) -> None:
+        self._client = client
+        self._rubrics = rubrics
+        self._cache = cache
+
+    def annotate(self, job: AnnotationJob) -> UGAnnotation:
+        key = make_cache_key(
+            content=job.content,
+            acronym="UG",
+            model_id=self._client.model,
+            rubric_hash=self._rubrics.ug_hash,
+        )
+
+        if self._cache is not None:
+            entry = self._cache.get(key)
+            if entry is not None:
+                return UGAnnotation(
+                    item_id=job.item_id,
+                    raw_output=entry.raw_output,
+                    ug_score=entry.level,
+                    finish_reason=entry.finish_reason,
+                    model_response=entry.model_response,
+                )
+
+        prompt = get_ug_instruction(
+            item_text=job.content,
+            reference_answer=job.reference_answer,
+            ug_rubric_content=self._rubrics.ug_content,
+        )
+        model_response, finish_reason = self._client.generate(prompt)
+        raw_output, ug_score = extract_ug_score(model_response)
+
+        annotation = UGAnnotation(
+            item_id=job.item_id,
+            raw_output=raw_output,
+            ug_score=ug_score,
+            finish_reason=finish_reason,
+            model_response=model_response,
+        )
+
+        if self._cache is not None:
+            content_hash = hashlib.sha256(job.content.encode()).hexdigest()[:16]
+            self._cache.put(CacheEntry(
+                key=key,
+                item_id=job.item_id,
+                demand="UG",
+                level=ug_score,
+                finish_reason=finish_reason,
+                model_response=model_response,
+                rubric_hash=self._rubrics.ug_hash,
+                model_id=self._client.model,
+                content_hash=content_hash,
+                timestamp=AnnotationCache.now_iso(),
+                raw_output=raw_output,
+            ))
+
+        return annotation
diff --git a/src/torch_measure/annotation/py.typed b/src/torch_measure/annotation/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/src/torch_measure/annotation/rubrics/AS.txt b/src/torch_measure/annotation/rubrics/AS.txt
new file mode 100644
index 00000000..23c0eae4
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/AS.txt
@@ -0,0 +1,39 @@
+# Attention and Search
+This criterion assesses the level of attention and scan required to focus on or locate specific elements within a given stream of information or environment in the whole process of solving a task. During this process, there is the need to actively scan for or retrieve elements that meet predetermined criteria. The level represents the extent to which the task requires locating and focusing on specific target information, ranging from situations where the target is immediately obvious to those requiring sustained tracking of multiple targets among numerous distractors—any elements that are irrelevant to solve the task, such as visual objects, sounds, pieces of text, noise, or other stimuli, but compete for attention with the target information—in complex, dynamic environments. The challenge is not on determining what to look for but focusing the attention to find it within a larger context. This differs from tasks where there's a need to identify which pieces of information are relevant from a set already under consideration. While both processes may overlap in complex tasks like reading comprehension or image understanding, "attention and scan" specifically focuses on the deployment of attention during scan processes when solving the task, rather than the selection or evaluation of information.
+
+Level 0: None. No attention or scan is required. The target information is immediately obvious or is the only information present.
+Examples:
+* "Given a single word input, determine if it starts with a capital letter."
+* "Look at the only object in the centre of the white page and tell what colour it is."
+* "Is Madrid the capital of Spain?"
+
+Level 1: Very low. Minimal attention or scanning is required. The target information is easily distinguishable with little to almost no distraction.
+Examples:
+* "Find the only blue car in a car park full of red cars."
+* "Find the letter 'X' among a row of 'O's"
+* "Spot the tall tree in a row of short bushes."
+
+Level 2: Low. Some attention or basic scanning is required. The target information is visible among a few distractors or in a small scan area.
+Examples:
+* "Find all the vowels in the following sentence: 'The quick brown fox jumps over the lazy dog.'"
+* "Find who's wearing glasses in this photo of students at commencement, with 2 rows of 5 students each, all facing forward, taken by a professional photographer."
+* "Who authored the Queensberry rules, which were published in 1867 for the sport of boxing? Choices: A. John Douglas  (in his late twenties)\nB. John Graham Chambers (in his mid-twenties)\nC. Marquess of Queensberry (in his early thirties)\nD. James Figg (in his forties)."
+
+Level 3: Intermediate. Moderate attention and scan are required. The target information is mixed with several distractors or spread over a fairly large scan area.
+Examples:
+* "Find everyone wearing glasses in this casual BBQ photo where 15 people are gathered around a table. Some are sitting, some standing, some looking at the camera while others are in conversation."
+* "In a 5-page technical document about basic geometry, locate all explicit references to the Pythagorean theorem (a² + b² = c²), where the equation appears 5 times mixed among references to 15 other geometric formulas, with occasional inconsistent equation numbering but standard mathematical notation."
+* "While reading a podcast interview, keep track of how many times the guest explicitly discusses content about their new book."
+* "As we all know, the Queensberry Rules are a set of rules for boxing that govern both amateur and professional matches. Who authored the Queensberry rules, which were published in 1867 for the sport of boxing? Choices: A. John Douglas  (in his late twenties)\nB. John Graham Chambers (in his mid-twenties)\nC. Marquess of Queensberry (in his early thirties)\nD. James Figg (in his forties)\nE. James Zou (in his fifties)\nF. Lucy Grande (in her late twenties)\nG. Xiaoxiao Li (in her early forties)\nH. Enrique Garcia (in his late thirties)."
+
+Level 4: High. Sustained tracking of one or various targets is required. The target information is in an environment mixed with numerous distractors and changing conditions. It requires some continuous monitoring amid competing signals.
+Examples:
+* "Listening to a symphony, identify all instances where the clarinet plays in a minor key, even when it's not playing the main melody."
+* "Track three orange spheres among twenty red spheres as they move randomly across a black screen (40 cm × 30 cm) at varying speeds (1-3 cm/s), with spheres frequently intersecting paths and maintaining a minimum separation distance of 2 cm. Each sphere is 1 cm in diameter."
+* "In a real-time video feed of a busy airport, finding the locations of ten blue suitcases."
+
+Level 5: Very high. Requires sustained attention and scan for simultaneous tracking of multiple targets across different domains or contexts, with continuous adaptation to fast-changing conditions. The target information is extremely difficult to distinguish from distractors or is hidden in a vast or constantly changing environment.
+Examples:
+* "While seated courtside at a professional basketball game, track two specific players throughout the entire game as they move at speeds up to 8m/s, frequently cluster with other players during rebounds, and weave through screens and defensive formations."
+* "Monitor four simultaneous video feeds of a crowded airport terminal from different angles, detecting subtle security-relevant changes (e.g. brief interactions < 2 seconds, crowd flow changes, small object exchanges) across feeds."
+* "While monitoring multiple simultaneous customer service chat conversations in different languages, identify instances where customers are expressing the same underlying technical issue, even though they're describing it using different metaphors, technical terms, or cultural references specific to their region."
diff --git a/src/torch_measure/annotation/rubrics/AT.txt b/src/torch_measure/annotation/rubrics/AT.txt
new file mode 100644
index 00000000..3e5b4556
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/AT.txt
@@ -0,0 +1,30 @@
+# Atypicality
+Level 0: None. The task is a staple one. Exactly the same instance of the task appears many times on the Internet, textbooks or common psychological or achievement tests, and the solution is generally well-known and memorized. Examples:
+* "What is 2 + 2?"
+* "Name the capital of France."
+* "What gets wetter and wetter the more it dries?"
+
+Level 1: Very Low. The task is very common and the specific task instance is likely to frequently appear on the Internet, textbooks or common psychological or achievement tests, so the chance that the solution is well-known and memorized is high. Examples:
+* "What is the derivative of sin(x)?"
+* "Define opportunity cost."
+* "Name the seven continents."
+
+Level 2: Low. The task is moderately common and the specific task instance varies somewhat from other common examples or is unlikely to have seen it before in exactly the same form, but possibly in variations. Examples:
+* "What is 21251 + 2835?"
+* "Given the molecular SMILES: COC[C@H]1OC(=O)c2coc3c2[C@@]1(C)C1=C(C3=O)[C@@H]2CCC(=O)[C@@]2(C)C[C@H]1OC(C)=O, your task is to provide the detailed description of the molecule using your experienced chemical Molecular knowledge."
+* "Solve the following Math Olympiad question: Determine the greatest real number $ C $, such that for every positive integer $ n\ge 2 $, there exists $ x_1, x_2,..., x_n \in [-1,1]$, so that $$\prod_{1\le i<j\le n}(x_i-x_j) \ge C^{\frac{n(n-1)}{2}}$$."
+
+Level 3: Intermediate. The task is somewhat common but the specific task instance is quite rare, and it is unlikely to appear in common sources (Internet, textbooks or common psychological or achievement tests). Examples:
+* "What is 5205175017521571 + 68270867426872052?"
+* "Among the following exoplanets, which one has the lowest density? a) An Earth-mass and Earth-radius planet. b) A planet with 3 Earth masses and a density of approximately 4.6 g/cm^3. c) A planet with the same composition as Earth but 1.5 times more massive than Earth. d) A planet with the same composition as Earth but half the mass of Earth."
+* Get answers for the question based on the context, where answers derived from substrings in the context or categorized as [unanswerable]. Context: ['On May 1 , 2015 , Quentin announced his retirement .Quentin signed a minor league deal with the Seattle Mariners on April 22 , 2015 , and was assigned to the Tacoma Rainiers .On April 5 , 2015 , Quentin was traded to the Atlanta Braves along with Cameron Maybin , Matt Wisler , and Jordan Paroubeck , for Craig Kimbrel and Melvin Upton Jr . The Braves designated him for assignment later that day , and released him on April 14 .', 'On July 22 , 2012 , Quentin agreed to a three-year , $27 million contract extension through 2015 with a $10 million mutual option for 2016 , including a no-trade clause . This is an amazing opportunity to stay and play in the city I grew up in . said Quentin .'] Question: Who did Carlos Quentin work for in April 2016?
+
+Level 4: High. The task is not extremely uncommon and the specific task instance is infrequent or presented in notably different ways from standard formulations. Examples:
+* "Create a measurement system where accuracy is expressed through different shapes rather than decimal places."
+* "Assume that there exist only two types of people: knights and knaves. Knights always tell the truth, while knaves always lie.\nYou are given the statements from 5 characters. Based on their statements, infer who is a knight and who is a knave.\nA: B is a liar and D is a truth-teller.\nB: D is a truth-teller.\nC: If A is a truth-teller, then E is a liar.\nD: B is a truth-teller and A is a truth-teller.\nE: A is a liar."
+* "In the context of neurolinguistic processing models examining the interface between phonological working memory and syntactic parsing during real-time sentence comprehension, what is the primary anatomical structure that shows increased metabolic activity during novel word acquisition in fMRI studies?" [This is a very sophisticated way of asking "which part of the brain lights up when we learn new words?"]
+
+Level 5: Very High. The task is fundamentally different from those typically appearing on the Internet, textbooks or psychological or achievement tests, or, the specific task instance is very unlikely to have close analogues in those sources. Noteworthy, any tasks (simple or elaborate) that can be found in standard tests or benchmarks should be considered less than level 5. Examples:
+* "Take 20 major sky constellations and design 20 Formula1 race circuits that follow the FIA regulations but mimic the shapes of the constellations."
+* "Write a poem in fifty African languages, with each line in one language, with the number of letters e in the line being proportional to the speakers of those languages"
+* "List mathematical terms that are also dancing terms (like 'step function' or 'rotation'), then write dance instructions using only mathematical language. However, the dance must stem from an Asian country since I'm teaching a creative Asian dance course today to one student who happens to be a mathematician."
diff --git a/src/torch_measure/annotation/rubrics/CEc.txt b/src/torch_measure/annotation/rubrics/CEc.txt
new file mode 100644
index 00000000..521d6235
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/CEc.txt
@@ -0,0 +1,32 @@
+# Comprehension
+This rubric evaluates the difficulty of a task's comprehension requirements, encompassing the understanding of text, stories or the semantic content of other representations of ideas in different formats or modalities. It may include the interpretation of explicit and implicit meanings, recognition of relationships between concepts, processing of contextual information, and understanding of abstract ideas and complex systems. Noteworthy, the mere presence of specialized terminology or jargon does not necessarily indicate a high difficulty level in this rubric, as these terms may appear within tasks that follow simple, straightforward linguistic structures and are more reflective of domain knowledge rather than comprehension complexity. Further, for specialized formal languages (e.g., molecular structures, programming code) the task will be hard to comprehend only if the sequence in that formal language (e.g. molecular expression, snippet of code) is convoluted, but simple molecules or pieces of code should be easy. The rubric includes difficulty levels that range from tasks requiring no semantic comprehension to those demanding an understanding of highly convoluted, interconnected concepts, including the ability to process sophisticated theoretical frameworks, understand nuanced implications, and synthesize multiple complex perspectives across different domains and levels of abstraction.
+
+Level 0: None. Tasks at this level require no comprehension of language or semantic content, such as those that can be completed by non-human animals. Examples:
+* Pulling levers in a specific sequence (pull middle lever, then right lever, then left lever) to release food from a container, learning the pattern through trial and error.
+* Manipulating a twist-lid container with multiple appendages in a rotating motion until the lid separates from the base, then retrieving the contents inside.
+* Using a stick to push a banana that's out of reach through a fence gap, by positioning and moving the stick in the correct direction.
+
+Level 1: Very Low. Tasks at this level require understanding of basic, explicit meanings in simple formats, including recognition of common words, straightforward statements, and clear one-to-one relationships between symbols and their meanings. Comprehension is limited to surface-level, literal interpretations without need for context or inference. Examples:
+* Identifying basic subject-verb relationships that describe observable actions (e.g., "Context: The blue bird was flying high in the sky. Question: who was flying?").
+* Understanding simple questions that do not require sophisticated language skills such as "Why is the sky blue?"
+* Understanding single-step instructions where the action directly matches the command (e.g., comprehending the sentence "close the door for me" and mentally connecting these words and the corresponding physical action).
+
+Level 2: Low. Tasks at this level involve comprehending straightforward messages with basic context, including simple cause-effect relationships, clear sequential instructions, and explicit connections between ideas. Understanding requires basic inference but remains tied to concrete, clearly stated information. Examples:
+* Capable of answering "why" questions about a simple story (e.g., "Why did the girl take an umbrella?" after reading "Sarah saw dark clouds in the sky. She grabbed her umbrella before leaving home.")
+* Understanding simple explanations of processes (e.g., "Plants need water and sunlight to grow, otherwise they will not survive, especially in harsh climate.")
+* In a recipe interface, interpreting "Add milk slowly while stirring continuously until mixture thickens" by understanding that the stirring must occur simultaneously with the milk addition, not after.
+
+Level 3: Intermediate. Tasks at this level require understanding of moderately complex information including implicit meanings, metaphorical language, and relationships between multiple concepts. Comprehension may involve processing both explicit and implicit information. Examples:
+* In a high school student's history essay about the Industrial Revolution, following their argument that "While factories created more jobs in cities, this urbanization ironically decreased quality of life because cramped living conditions and poor sanitation led to disease outbreaks." This requires understanding how the student is connecting multiple historical factors (industrialization, urbanization, living conditions, public health) and recognizing their use of "ironically" to highlight the unexpected negative consequence of economic progress.
+* In an employee handbook, understanding that the statement "The company values work-life balance" combined with "Employees are expected to be responsive to urgent matters outside office hours" represents a potential policy contradiction requiring contextual judgment.
+* In a technical manual, interpreting a troubleshooting section that requires understanding how different error messages might indicate the same underlying problem depending on the system's state.
+
+Level 4: High. Tasks at this level demand comprehension of sophisticated content with multiple layers of meaning, complex relationships between concepts, and nuanced implications. Understanding requires integration of various information sources and recognition of subtle patterns and connections. Examples:
+* Following an accessible fiction story told from multiple viewpoints where each narrator provides partial, biased information, requiring the reader to construct the true sequence of events by reconciling conflicting accounts and recognizing each narrator's limitations and motivations.
+* Understanding a complex academic argument that develops through multiple chapters, where key terms are gradually redefined and earlier arguments are recontextualized by later developments.
+* Interpreting a modern theatrical play where dialogue has multiple meanings based on staging directions, character backgrounds, and historical context, requiring simultaneous understanding of textual and performative elements.
+
+Level 5: Very High. Tasks at this level require mastery in understanding highly convoluted, abstract, and interconnected information systems, including sophisticated theoretical frameworks, convoluted narratives and nuanced philosophical arguments. Comprehension involves synthesizing multiple complex perspectives and understanding subtle distinctions. Examples:
+* Understanding well a convoluted legal document that requires tracking multiple cross-references, understanding nested conditions, and comprehending how different clauses modify each other.
+* Comprehending a modernist novel that uses a stream-of-consciousness narrative technique where multiple timelines, memories, and internal thoughts are interwoven without clear demarcation, requiring readers to track subtle linguistic shifts (changes in tense, pronouns, or narrative voice) to understand when the narrative moves between present action, past memories, imagined futures, and other characters' perspectives.
+* Understanding a convoluted visual narrative where multiple story threads are told simultaneously through different visual styles on the same page, requiring understanding of how the visual elements interact, conflict, and complement each other to create meaning. For example, a graphic novel page where realistic drawings depict current events, sketchy portions represent memories, and geometric patterns show emotional states, all interacting to tell a coherent story.
diff --git a/src/torch_measure/annotation/rubrics/CEe.txt b/src/torch_measure/annotation/rubrics/CEe.txt
new file mode 100644
index 00000000..f92dd367
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/CEe.txt
@@ -0,0 +1,32 @@
+# Expression
+This rubric evaluates the difficulty of a task's expression requirements, encompassing the generation and articulation of ideas, stories, or semantic content in different formats or modalities. It may include the usage of the right vocabulary, adoption of the appropriate genre, formulation of explicit and implicit meanings, creation of relationships between concepts, incorporation of contextual information, expression of abstract ideas and complex systems, and transformation of sophisticated content into a smooth narrative. Noteworthy, the need for specialized vocabulary or jargon in the expression does not necessarily indicate a high level of difficulty, as these terms may be used within simple, straightforward linguistic structures requiring minimal compositional complexity and are more reflective of domain knowledge rather than expression sophistication. In addition, the difficulty level should correspond to the simplest expression effort to successfully solve the task, given that a solution to a task may be formulated in various ways with varied linguistic complexity; multiple-choice questions, even if the options are long and complex, generally do not require language expression beyond the basic level, so they are typically level 1. The rubric ranges from tasks requiring no meaningful expression to those demanding the generation of highly sophisticated, interconnected content, including the ability to create convoluted narratives, convey nuanced implications, and express multiple perspectives across different domains and levels of abstraction.
+
+Level 0: None. Tasks at this level involve no meaningful expression or communication, limited to automatic responses or simple pattern reproduction. The task can be completed through purely mechanical or algorithmic processes without any generation of meaning. Examples:
+* Repeating a sound pattern exactly as heard without understanding or modifying its meaning.
+* Copying text from one format to another without generating or modifying content.
+* Reproducing a sequence of gestures through simple mimicry.
+
+Level 1: Very Low. Tasks at this level require expressing basic, explicit meanings in simple formats, including use of common words, straightforward statements, and clear one-to-one relationships between ideas and their expression. Expression is limited to surface-level, literal articulation without need for context or nuance. Examples:
+* Stating immediate needs like "I need water" in a simple, direct, unambiguous way.
+* Solving a task that requires domain expertise to get the right answer but the answer only requires basic expression ability (e.g. "Given the product SMILES: O=C(NC1CCN(CCc2ccccc2)CC1)c1c[nH]c2ccc(F)cc12, predict the reactants SMILES").
+* Multiple-choice QA questions, where the subject only needs to choose one readily available option, even though the accurate answer option may be formulated in a linguistically complex manner (e.g. "The correct answer is option C. Reynolds and Khripkova would not make suitable business partners, [...], if they quarrel, know how to resolve their differences.")
+
+Level 2: Low. Tasks at this level involve producing straightforward messages with basic context, such as simple cause-effect relationships, clear sequential instructions, and explicit connections between ideas. Expression requires basic organization but remains tied to concrete, clearly stated information. Examples:
+* "Writing step-by-step instructions for making a sandwich, clearly indicating the sequence of actions and basic quantities needed."
+* Creating a brief email to schedule a meeting, specifying time, place, and basic purpose.
+* Describing a simple process like plant growth, connecting the basic sequence of events: "First the seed needs soil and water, then it grows roots, then it sprouts leaves."
+
+Level 3: Intermediate. Tasks at this level require generating moderately nuanced information, with attention to both content and presentation style. This includes selecting field-appropriate vocabulary, adapting to specific genres (like technical documentation or clinical notes), and creating coherent narratives that smoothly connect ideas. Expression may involve conveying both explicit and implicit information while maintaining consistent tone and voice throughout the text. Examples:
+* Writing explanatory notes for a simple geometry proof that guides the reader through the logic: "To prove these triangles are similar, we first show their angles are equal. The alternate angles formed by these parallel lines are equal, and since both triangles share this angle at point A, we can conclude..."
+* Writing product documentation that anticipates user confusion: "While the red indicator light typically signals an error, in sleep mode it indicates normal operation. If the light flashes red during active use, consult the troubleshooting guide."
+* Writing short clinical notes that connect symptoms with potential causes: "Patient presents with persistent cough and fatigue for 2 weeks. Given their recent travel history and exposure to dusty environments, considering both viral upper respiratory infection and environmental allergies as potential causes."
+
+Level 4: High. Tasks at this level demand generating sophisticated content with multiple layers of meaning, complex relationships between concepts, and nuanced implications. Such expressions may include the usage of linguistically advanced vocabulary and rhetorical devices, careful attention to genre conventions, and the ability to integrate multiple perspectives and communicate subtle patterns and connections. Examples:
+* Writing lecture notes that integrate multivariable calculus with linear algebra to explain the connection between Jacobian matrices, coordinate transformations, and volume changes in higher dimensions.
+* Writing technical documentation that addresses multiple user levels simultaneously: "The API's modular design allows for both simple plug-and-play implementation for basic use cases and sophisticated customization through advanced configuration options, ensuring scalability as your needs evolve."
+* Writing a detailed legal brief that weaves together statutory requirements, case law precedents, and policy implications: "While Smith v. Jones (2019) established a broad interpretation of 'reasonable care,' the specific circumstances of our case, combined with the legislative history of Section 47(b), suggest that this standard should be qualified when applied to specialized industrial settings..."
+
+Level 5: Very High. Tasks at this level require mastery in generating convoluted, abstract, and interconnected content, including nuanced vocabulary, convoluted narratives, deep arguments, and conveying multiple perspectives and subtle distinctions simultaneously. Examples:
+* Writing a few paragraphs of a graduate-level textbook section that develops the relationship between Lie groups, Lie algebras, and differential manifolds.
+* Creating a convoluted multi-layered narrative that simultaneously develops several plot threads through carefully structured revelations, such as a novel's seemingly disconnected opening chapters gradually revealing their interconnections through subtle linguistic echoes and thematic resonances, allowing readers to piece together the full story while maintaining tension across multiple timelines.
+* Writing well-thought comprehensive hospital policy guidelines that address complex medical, legal, and ethical considerations: "The protocol for experimental treatments must balance patient autonomy, clinical evidence requirements, and legal liability considerations. When standard treatments are exhausted, the following decision tree integrates real-time clinical assessment, informed consent documentation, ethics committee review, and liability mitigation steps, while maintaining compliance with both state regulations and international medical ethics standards..."
diff --git a/src/torch_measure/annotation/rubrics/CL.txt b/src/torch_measure/annotation/rubrics/CL.txt
new file mode 100644
index 00000000..b418761b
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/CL.txt
@@ -0,0 +1,36 @@
+# Conceptualization, Learning, and Abstraction
+Level 0: None. The task requires no conceptualization, learning, abstraction, inductive or analogical reasoning. It involves applying well-established procedures or recalling known information, even for complex tasks. No new abstractions, analogies, or learning occur during task execution.
+Examples:
+* Performing basic one-digit arithmetic multiplications using memorized multiplication tables (like 3x3 = 9).
+* Sorting short texts into predefined categories based on a list of indicator words, without inferring new indicators.
+* What was the name of Abraham Lincoln's father?
+
+Level 1: Very Low. The task involves minimal conceptualization, learning, inductive or analogical reasoning. It requires simple pattern identification or following basic instructions, with very limited generalization or basic surface-level analogies occurring during the task.
+Examples:
+* Continuing a basic letter sequence (e.g., "a, c, e, g, __").
+* "Given a red circle, a red square, a red triangle and a blue pentagon, find the odd one out, which is the blue pentagon."
+* Given a pair of words (like "hot and cold"), choose another pair from a list that shares the same relationship. For example, if "hot" and "cold" are opposites, you'd look for another opposite pair like "up and down."
+
+Level 2: Low. The task requires basic conceptualization, learning, inductive and analogical reasoning. It involves generalizing from a small set of examples, applying simple analogies to closely related domains, or applying simple instructions to new but closely related tasks.
+Examples:
+* Given the sentence 'As it started raining, Alice opened her brolly.' inferring the meaning of the unknown word (brolly) by using surrounding context clues, forming a basic abstraction about its possible definition.
+* In a fictional planet, observing in a garden where light yellow and light orange plants grow towards light sources over time but dark blue and dark red plants don't, and forming a basic hypothesis between the colors and plant behavior.
+* Adapting a solution from a previously solved secondary school math problem to a new problem with very similar structure but different surface features (e.g. numbers, names and context). While the core mathematical approach remains similar, the adaptation still requires recognizing how small variations in the new problem might require adjustments to the original solution method.
+
+Level 3: Intermediate. The task involves moderate conceptualization, learning, and inductive and analogical reasoning. It requires recognizing broader patterns, applying analogies across moderately different domains, and forming more complex hypotheses through analogical reasoning.
+Examples:
+* Reading passages where certain words are consistently replaced with nonsense words: 'The zork lives in a tree. The small zork ate berries. Many zorks gather in winter [...]. The tired zork slept quietly'. Through the multiple examples, learning not just that 'zork' likely means 'squirrel', but also understanding how it follows plural rules ('zorks'), can be modified by adjectives ('small zork', 'tired zork'), and performs actions typical of animals.
+* While playing a strategy game named Xiangqi (also known as Chinese chess) without any prior experience on it, coming up with some effective tactics through repeated observations and trials as well as some past experience playing chess.
+* Given data about plant growth in artificial conditions where light color, temperature, and humidity vary cyclically, observing that plants develop different leaf patterns depending on which factor changes first each day. Through systematic observation, forming basic hypotheses about how the sequence of environmental changes affects growth patterns.
+
+Level 4: High. The task requires substantial conceptualization, inductive and analogical reasoning, and abstraction, involving the integration of multiple concepts, creating complex analogical mappings across diverse domains, and forming and testing complex hypotheses.
+Examples:
+* Working with a collection of text messages where response times vary significantly. Through analysis, discovering that certain word combinations, sentence structures, and punctuation patterns consistently correlate with faster or slower response times, then using these insights to predict likely response speeds for new messages.
+* While learning Go after experience with chess and Xiangqi, discovering how stone formations serve multiple strategic purposes that differ fundamentally from piece-based games. Through systematic play and analysis, understanding how a group of stones can simultaneously secure territory, threaten invasion, and maintain connectivity with other groups. This requires substantial abstraction beyond piece-movement games to grasp how value emerges from stone relationships rather than individual pieces.
+* Working with a sequence of pattern acceptance tests where rules change systematically. For instance, in judging whether grid arrangements of colored shapes are "valid": early patterns are accepted based on color adjacency (e.g., "red must never touch blue"), then the rule shifts to consider shape orientation (e.g., "triangles must point toward squares"), and finally combines both aspects (e.g., "red triangles must point toward blue squares"). The systematic nature of the rule changes follows a clear progression from simple single-attribute rules to combined rules. The subject must track these rule evolutions to correctly predict which new grid arrangements would be considered valid, understanding that rules become progressively more complex by combining previous attributes rather than introducing entirely new concepts.
+
+Level 5: Very High. The task involves very advanced conceptualization, inductive and analogical reasoning, and abstraction. It requires generating new analogical frameworks in real-time, mapping relationships across highly diverse and abstract domains, or solving complex problems through novel analogical insights.
+Examples:
+* Solving a visual puzzle where three different properties (symmetry, rotation, and scaling) must be understood at both the element level and the pattern level. For instance, individual shapes follow one set of transformation rules, while the overall arrangement follows a different set of rules, and the relationship between these two rule sets must be discovered to predict the next state.
+* Designing a new electronic musical instrument after studying blueprints of synthesizers, amplifiers, and effect pedals. This requires abstracting core principles of signal generation, processing, and control from each device (oscillation, filtering, envelope shaping, feedback), understanding how these principles create different sonic characteristics, and then creatively recombining them to produce new types of sounds. The task demands identifying how fundamental concepts manifest differently across devices (like how feedback creates sustain in an amplifier but modulation in a ring modulator), then synthesizing these insights to create sound-generating mechanisms.
+* Working with a sequence of pattern acceptance tests where rules evolve with increasing abstraction and self-reference. Starting from 'red triangles must point toward blue squares', patterns evolve to where shapes establish relationships based on their relative properties. For instance, shapes with more sides must point toward shapes with fewer sides, but this relationship inverts when the shapes share colors. Furthermore, each valid pattern must mirror a small-scale arrangement within its larger structure - if three triangles form a particular relationship on one side of the grid, the overall shape arrangement of the entire grid must follow that same relationship. The subject must discover these nested self-referential patterns and predict how they apply at different scales, requiring both pattern recognition and the generation of new frameworks for understanding how rules can reference themselves.
diff --git a/src/torch_measure/annotation/rubrics/KNa.txt b/src/torch_measure/annotation/rubrics/KNa.txt
new file mode 100644
index 00000000..42a6f67b
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/KNa.txt
@@ -0,0 +1,32 @@
+# Knowledge in Applied Sciences and Professions
+The following rubric is designed to annotate the conceptual sophistication level of tasks based entirely on the depth of knowledge or conceptual understanding required in the fields of applied sciences and professions (e.g., medicine, law, education, business, agriculture, engineering except software and data engineering). Notably, this rubric focuses only on applied knowledge and practical implementations rather than purely theoretical frameworks or abstract concepts from natural sciences, formal sciences, social sciences and humanities. For instance, understanding chemical reactions is only part of natural sciences, but applying this knowledge in pharmaceutical manufacturing would fall under applied sciences. Similarly, economic theory belongs only to social sciences, but practical business management and operations fall under applied sciences and professions. The focus is exclusively on the level of domain-specific knowledge needed, disregarding other cognitive demands such as reasoning or metacognition. This reflects only the depth of applied sciences and professional knowledge required.
+
+Level 0: None. Tasks requiring no knowledge or understanding of applied sciences or professional fields. Examples:
+* "Let $\\triangle A B C$ be an acute triangle, with $M$ being the midpoint of $\\overline{B C}$, such that $A M=B C$. Let $D$ and $E$ be the intersection of the internal angle bisectors of $\\angle A M B$ and $\\angle A M C$ with $A B$ and $A C$, respectively. Find the ratio of the area of $\\triangle D M E$ to the area of $\\triangle A B C$."
+* "In a chemical reaction at pH 1, an unknown substance was added that changed the pH to 4 and slowed down the reaction. What could have caused this?"
+* "If a star 20 light-years away explodes, would gravitational waves reach Earth faster than light?"
+
+Level 1: Very low. Tasks that require knowledge in applied sciences and professions typically acquired through elementary school education. Examples:
+* Basic personal hygiene and hand washing procedures.
+* Common road signs and traffic signals.
+* Basic safety rules at home and school.
+
+Level 2: Low. Tasks that require knowledge in applied sciences and professions typically acquired through middle school education. Examples:
+* The use of basic measurement tools (thermometer, ruler, scale).
+* Basic principles of personal finance and saving.
+* Simple first aid for minor injuries.
+
+Level 3: Intermediate. Tasks that require knowledge in applied sciences and professions typically acquired through high school education. Examples:
+* Common legal terms (plaintiff, defendant, contract).
+* Basic business concepts (budget, profit, loss).
+* Fundamental principles of agricultural science.
+
+Level 4: High. Tasks that require knowledge in applied sciences and professions typically acquired through undergraduate education. Examples:
+* Basic principles of bridge design in civil engineering.
+* Core concepts of supply chain management.
+* Fundamentals of clinical assessment in healthcare.
+
+Level 5: Very high. Tasks that require knowledge in applied sciences and professions typically acquired through graduate education or beyond. Examples:
+* Gene therapy techniques in precision medicine.
+* Engineering principles of nuclear fusion reactor design.
+* Legal frameworks for regulating artificial intelligence systems.
diff --git a/src/torch_measure/annotation/rubrics/KNc.txt b/src/torch_measure/annotation/rubrics/KNc.txt
new file mode 100644
index 00000000..050f1b26
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/KNc.txt
@@ -0,0 +1,32 @@
+# Customary Everyday Knowledge
+The following rubric is designed to annotate the conceptual sophistication level of tasks based solely on the depth of common everyday knowledge required. This knowledge encompasses information that most people in a given society typically acquire through daily life experiences, social interactions, and exposure to popular media, rather than through formal education or specialized training. This does not include specialized knowledge from the natural sciences, social sciences, humanities, formal sciences, or applied sciences and professions. This reflects only the depth of common everyday knowledge required.
+
+Level 0: None. Tasks do not require any common everyday knowledge. Examples:
+* Looking in a mirror and checking if there's any dirt on your face.
+* Recognizing that two objects are of the same color.
+* Basic arithmetic calculations.
+
+Level 1: Very low. Tasks that require basic knowledge universally shared within a society. Examples:
+* The fact that the sky appears blue during daytime.
+* The fact that food and water are necessary for survival.
+* Common objects in daily life (chairs, tables, cars).
+
+Level 2: Low. Tasks that require common knowledge typically possessed by most adults in a society. Examples:
+* Basic kitchen tools and their uses.
+* Common traffic signs and their meanings.
+* Major holidays in one's culture.
+
+Level 3: Intermediate. Tasks that require general knowledge typically possessed by socially engaged members of society. Examples:
+* Different types of payment methods (cash, credit cards, digital wallets).
+* Common technology features (touch screens, wireless connectivity, cloud storage).
+* Standard retail practices (return policies, warranties, seasonal sales).
+
+Level 4: High. Tasks that require extensive everyday knowledge gained through active engagement in society. Examples:
+* Major generational trends in technology adoption (from landlines to smartphones).
+* Common real estate concepts (mortgages, leases, property taxes).
+* Dietary restrictions across different groups (religious, health-based, ethical).
+
+Level 5: Very high. Tasks that require comprehensive everyday knowledge across diverse cultural and social contexts. Examples:
+* Gift-giving customs and taboos across different cultures.
+* Business etiquette variations in major world regions.
+* Dining customs and table manners in different societies.
\ No newline at end of file
diff --git a/src/torch_measure/annotation/rubrics/KNf.txt b/src/torch_measure/annotation/rubrics/KNf.txt
new file mode 100644
index 00000000..3aadaa75
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/KNf.txt
@@ -0,0 +1,32 @@
+# Knowledge in Formal Sciences
+The following rubric is designed to annotate the conceptual sophistication level of tasks based strictly on the depth of knowledge or conceptual understanding required in the fields of formal sciences (e.g., mathematics, logic, computer science, statistics). This does not include natural sciences (e.g., physics, chemistry, biology, astronomy, earth sciences, ecology) or social sciences and humanities (e.g., history, psychology, sociology, anthropology, literature, art, philosophy, linguistics). It's crucial to understand that this rubric measures only the level of formal scientific knowledge needed, not considering other cognitive demands such as reasoning or metacognition. This indicates solely the depth of formal sciences knowledge required.
+
+Level 0: None. Tasks require no knowledge or understanding of formal sciences. Examples:
+* "Explaining the biological mechanisms of cellular respiration."
+* "Describing the major artistic movements of the Renaissance period."
+* "Explaining the rules of a sport."
+
+Level 1: Very low. Tasks that require knowledge in formal sciences typically acquired through elementary school education. Examples:
+* Basic arithmetic operations (+, -, ×, ÷).
+* Names and properties of basic shapes (square, circle, triangle).
+* Understanding that a programming loop with 10 repetitions takes twice as long as a loop with 5 repetitions.
+
+Level 2: Low. Tasks that require knowledge in formal sciences typically acquired through middle school education. Examples:
+* Basic algebraic expressions and variables.
+* Calculating mean, median, and mode.
+* Properties of basic number systems (integers, decimals, fractions).
+
+Level 3: Intermediate. Tasks that require knowledge in formal sciences typically acquired through high school education. Examples:
+* Basic geometric shapes and their properties.
+* What an algorithm is.
+* Fundamental concepts of logic and syllogisms.
+
+Level 4: High. Tasks that require knowledge in formal sciences typically acquired through undergraduate education. Examples:
+* The fundamental theorem of calculus.
+* Principles of object-oriented programming.
+* Basic concepts in linear algebra and matrix operations.
+
+Level 5: Very high. Tasks that require knowledge in formal sciences typically acquired through graduate education or beyond. Examples:
+* Principles of homological algebra.
+* Mathematical foundations of quantum computing.
+* Advanced concepts in cryptography and their mathematical basis.
\ No newline at end of file
diff --git a/src/torch_measure/annotation/rubrics/KNn.txt b/src/torch_measure/annotation/rubrics/KNn.txt
new file mode 100644
index 00000000..42ba27e1
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/KNn.txt
@@ -0,0 +1,32 @@
+# Knowledge in Natural Sciences
+This rubric assesses the conceptual sophistication level of tasks based solely on the depth of knowledge or conceptual understanding required in the fields of natural sciences (e.g., physics, chemistry, biology, astronomy, earth sciences, ecology). This does not include social sciences and humanities (e.g., history, psychology, sociology, anthropology, literature, art, philosophy, linguistics) or formal sciences (e.g., mathematics, logic, computer science, statistics). It's important to note that this rubric focuses exclusively on the domain-specific knowledge needed, not considering other cognitive demands such as reasoning or metacognition. This reflects the conceptual depth and specificity of the knowledge in natural sciences required, rather than the mere presence of scientific content.
+
+Level 0: None. Tasks do not require any knowledge of natural sciences. Examples:
+* "Write a python script to train a machine learning classifier for fake news detection."
+* "Analyze the symbolism in Shakespeare's Hamlet."
+* "Calculate the cost of groceries."
+
+Level 1: Very low. Tasks that require knowledge in natural sciences typically acquired through elementary school education. Examples:
+* Living things need food, water, and air to survive.
+* Basic parts of a plant (roots, stem, leaves).
+* Day and night cycle and seasons.
+
+Level 2: Low. Tasks that require knowledge in natural sciences typically acquired through middle school education. Examples:
+* The water cycle (evaporation, condensation, precipitation).
+* Basic cellular structure (nucleus, membrane, cytoplasm).
+* Simple food chains and ecosystems.
+
+Level 3: Intermediate. Tasks that require knowledge in natural sciences typically acquired through high school education. Examples:
+* Mendel's laws of inheritance and basic genetics.
+* The ideal gas law (PV = nRT).
+* Newton's three laws of motion.
+
+Level 4: High. Tasks that require knowledge in natural sciences typically acquired through undergraduate education. Examples:
+* Hardy-Weinberg equilibrium and population genetics.
+* Molecular orbital theory.
+* The process of cellular respiration and its relationship to photosynthesis.
+
+Level 5: Very high. Tasks that require knowledge in natural sciences typically acquired through graduate education or beyond. Examples:
+* The theoretical frameworks of string theory and its implications.
+* The six forms of quark flavors in particle physics.
+* The role of quantum entanglement in biological systems.
\ No newline at end of file
diff --git a/src/torch_measure/annotation/rubrics/KNs.txt b/src/torch_measure/annotation/rubrics/KNs.txt
new file mode 100644
index 00000000..c759063a
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/KNs.txt
@@ -0,0 +1,32 @@
+# Knowledge in Social Sciences and Humanities
+The following rubric is designed to annotate the conceptual sophistication level of tasks based exclusively on the depth of knowledge or conceptual understanding required in the fields of social sciences and humanities (e.g., history, psychology, sociology, anthropology, literature, art, philosophy, linguistics). This does not include natural sciences (e.g., physics, chemistry, biology, astronomy, earth sciences, ecology) and formal sciences (e.g., mathematics, logic, computer science, statistics). It's important to note that this rubric focuses exclusively on the domain-specific knowledge needed, not considering other cognitive demands such as reasoning or metacognition. This reflects purely the depth of social sciences and humanities knowledge required.
+
+Level 0: None. Tasks do not require any knowledge or understanding of social sciences or humanities. Examples:
+* "Calculating the area of a rectangle with length 5 cm and width 3 cm."
+* "Explaining the process of photosynthesis in plants."
+* "Explaining the mathematical principles behind differential calculus."
+
+Level 1: Very low. Tasks that require knowledge in social sciences and humanities typically acquired through elementary school education. Examples:
+* Basic concepts of past, present, and future in history.
+* Different types of communities (family, school, neighborhood).
+* Traditional holidays and their basic meanings.
+
+Level 2: Low. Tasks that require knowledge in social sciences and humanities typically acquired through middle school education. Examples:
+* Basic historical periods like "ancient" vs "modern" times.
+* Different types of government (democracy, monarchy).
+* Major world religions and their basic beliefs.
+
+Level 3: Intermediate. Tasks that require knowledge in social sciences and humanities typically acquired through high school education. Examples:
+* The role of the Silk Road in cultural exchange.
+* Basic principles of cognitive psychology.
+* Major literary movements (Romanticism, Realism).
+
+Level 4: High. Tasks that require knowledge in social sciences and humanities typically acquired through undergraduate education. Examples:
+* The socio-economic factors that led to the Industrial Revolution.
+* Major sociological theories of social stratification.
+* The main theoretical approaches in anthropology.
+
+Level 5: Very high. Tasks that require knowledge in social sciences and humanities typically acquired through graduate education or beyond. Examples:
+* The major schools of Sanskrit poetics.
+* The primary theoretical frameworks in phenomenology.
+* Advanced theories in historical linguistics and their implications.
\ No newline at end of file
diff --git a/src/torch_measure/annotation/rubrics/MCr.txt b/src/torch_measure/annotation/rubrics/MCr.txt
new file mode 100644
index 00000000..5f0f52f0
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/MCr.txt
@@ -0,0 +1,31 @@
+# Identifying Relevant Information
+This rubric assesses the difficulty of the metacognitive processing required by the respondent to identify the information necessary during the process of solving the task with a given set of information. More concretely, it involves the respondent's ability to recognize what information helps solve the task or does not, and how this recognition process unfolds as they work toward the solution.
+
+Level 0: None. All necessary information is immediately apparent and directly applicable to solving the task, or no information is provided and none is needed. No metacognitive processing is required to identify relevant information during problem-solving. Examples:
+* "What is the capital of France?"
+* "What is 2513441 + 7519239519281?"
+* "How many sports correspond to IPTC Newscode mediatopic/20000960?"
+
+Level 1: Very low. Most relevant information is readily identifiable, with minimal extraneous details. The respondent needs to do simple filtering or selection of information as they proceed with solving the task, but the relevance of information remains clear throughout the process. Examples:
+* "John has 5 apples and 3 oranges. How many apples does John have?"
+* "Alice's mother has several brothers, one married to Helen, who currently lives in Barcelona. What's Helen with respect to Alice?"
+* "The recipe calls for 4 cups of flour and 2 cups of sugar. How many cups of flour are needed?"
+
+Level 2: Low. A fair amount of potentially relevant information is provided, mixed with some irrelevant details. As the respondent works through the problem, they need to evaluate which pieces of information are useful for the next step in their solution process, requiring ongoing but straightforward metacognitive assessment. Examples:
+* "Sarah went to the grocery store on Tuesday. She bought 3 oranges for $0.50 each, 2 apples for $0.75 each, and a loaf of bread for $2.25. What was the total cost of the fruit Sarah purchased?"
+* "In preparation for a marathon, James ran 5 miles on Monday, 8 miles on Wednesday, and 6 miles on Saturday. He also trained at the gym for 2 hours each week. How many miles did James run in total?"
+* "Mary took photos of the Colosseum at sunset and visited the Vatican Museums where she spent two hours studying the famous ceiling of the Sistine Chapel. She also bought souvenirs for her friends and got lost trying to find her hotel. What did Mary observe at the Vatican Museums?"
+
+Level 3: Medium. The respondent must engage in moderate metacognitive processing throughout the problem-solving process in one or both of these ways: (1) evaluating and reconciling potentially conflicting or redundant information that serves as distractors within a manageable search space, or (2) recognizing what additional information or problem-solving approaches are needed when not all relevant information is explicitly provided, but the possible solution paths remain relatively constrained. Some information that seems irrelevant initially may become important later, or some unstated information may become crucial to identify as the solution progresses. Examples:
+* "A student's short essay discusses how Shakespeare's character Hamlet shows signs of depression. In the essay, it states that Hamlet speaks harshly to Ophelia in Act 3, telling her 'Get thee to a nunnery' and refusing her love. The essay also mentions his soliloquy 'To be or not to be,' his wearing of dark clothes at court, and his Act 1 conversation with Horatio about his father's ghost. The essay is 1000 words long and includes quotes from Acts 1, 3, and 5. What evidence does the essay present about Hamlet's interactions with Ophelia?"
+* "A news article discusses a local park renovation project, mentioning the park's historical significance from the 1950s, current visitor numbers, planned new features including a playground and walking paths, the project's $500,000 budget split across different improvements, debates about preserving old trees versus adding parking spaces, and quotes from both the project manager and local residents about their memories of childhood visits. What are the specific new features planned for the park renovation?"
+
+Level 4: High. The problem-solving process requires sophisticated metacognitive strategies throughout, with a large search space to navigate. This could involve either: (1) evaluating multiple possible interpretations of significant amounts of conflicting/redundant information that serves as distractors, requiring exploration of various combination possibilities, or (2) identifying crucial unstated information or approaches needed for solution while considering multiple possible solution paths and their implications. The respondent must frequently reassess their understanding and adjust their approach as they either discover new connections between provided information or recognize important unstated elements needed for solving the task. Examples:
+* "In this escape room scenario, you find a desk with a locked drawer, a calendar marked with different colored circles, a bookshelf with titles in various languages, and a wall clock showing 3:45. On the desk, there's a note that reads 'Time reveals knowledge, knowledge unlocks secrets.' A painting on the wall shows a sunset over a library, and there's a globe with certain cities marked with stars. Each time you examine an object, you notice new details that might connect to others. How can you open the locked drawer?"
+* "Assume that there exist only two types of people: knights and knaves. Knights always tell the truth, while knaves always lie.\nYou are given the statements from 6 characters. Based on their statements, infer who is a knight and who is a knave.\nA: E is a liar if and only if C is a liar.\nB: If D is a liar, then E is a liar.\nC: E is a truth-teller and F is a truth-teller.\nD: C is a liar if and only if B is a liar.\nE: If B is a liar, then C is a truth-teller.\nF: B is a liar if and only if A is a liar."
+* "A customer survey about a new phone model gathered feedback through three methods: online reviews mentioned battery life lasting 'all day', 'about 12 hours', or '14-16 hours'; in-person interviews reported battery performance as 'excellent', 'better than previous model by 4 hours', or 'lasting from morning to night'; and usage data showed power consumption patterns varying between 10-18 hours depending on features used. Technical specs list battery capacity, screen brightness impact, and various power-saving modes. What can be concluded about the phone's actual battery life?"
+
+Level 5: Very high. The problem-solving process demands constant high-level metacognitive monitoring and regulation in challenging conditions: either most of the provided information is redundant, misleading, or contradictory (while remaining solvable), or crucial information about solution approaches and constraints is left unstated and must be discovered. The respondent must maintain awareness of many possible interpretation frameworks or solution paths simultaneously, regularly revisiting their understanding as they either recontextualize conflicting information or identify necessary unstated information and constraints. Examples:
+* Riddles such as: "I am found in ancient scrolls and modern screens, made of nothing but seen by all, I dance between light and dark, born in storms yet living in peace, flowing like water but dry as sand, silent as night but telling stories, changing shape with every eye yet always staying the same. Sometimes I march in straight lines, other times I curl and twist, I can be bold or gentle, thick or thin, but I never truly exist. What am I?" (the answer is "shadow")
+* "Solve this cryptic crossword puzzle: 'Stop for break, drink coffee and tea endlessly, stir milk around in a mug - useless without morning essentials!'" (the answer is "breakfast")
+* "A restaurant review contains extensive details about the reviewer's experience: describes the rainy drive to the location, the hostess's friendly greeting, memories of their grandmother's cooking, opinions about the restaurant's decor choices, a lengthy story about their career as a food critic, descriptions of fellow diners' conversations, commentary about parking difficulties, their favorite recipes, the day's weather forecast, and briefly mentions in different places that the pasta was 'perfectly cooked', 'somewhat firm', 'just right', and 'could have been softer'. What was the reviewer's assessment of the pasta's texture?"
diff --git a/src/torch_measure/annotation/rubrics/MCt.txt b/src/torch_measure/annotation/rubrics/MCt.txt
new file mode 100644
index 00000000..3073604e
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/MCt.txt
@@ -0,0 +1,38 @@
+# Critical Thinking Processes
+This rubric assesses the difficulty level of metacognitive engagement required by the question. More concretely, the level represents the extent to which the question requires the respondent to monitor or regulate multiple thought processes to answer the question effectively, ranging from simple recall to high-level critical thinking.
+
+Level 0: None. No critical thinking or analysis is needed.
+Examples:
+* Clapping one's hands with another entity.
+* Simple recall of facts without further processing.
+* Recognizing a familiar face from a photograph.
+
+Level 1: Very Low. The task requires recall or recognition of facts, with a low level of information processing required. The respondent needs to retrieve information directly from memory or identify very obvious relationships. There is no need for critical thinking or analysis beyond the most elementary level.
+Examples:
+* Selecting the correct meaning of a common word from multiple clearly distinct definitions.
+* Matching simple synonyms, such as "big" and "large".
+* Question: What was the time 5 years and 6 months before Jan, 1956?
+
+Level 2: Low. The task involves mostly straightforward comprehension or application of known concepts, with some information processing. The respondent may need to demonstrate understanding by explaining ideas, making simple comparisons, or applying concepts in familiar contexts. A low level of critical thinking is required, such as recognizing generally obvious patterns or making simple categorizations.
+Examples:
+* Answering the question: 'What happens in Cinderella when the clock approaches midnight?' The answer requires explaining that Cinderella must flee because her magical transformation will end.
+* Giving the smallest amount of coins as change from a purchase.
+* Answering the question: "How many solid $1 \times 1 \times 1$ cubes are required to make a solid $2 \times 2 \times 2$ cube?".
+
+Level 3: Intermediate. The task necessitates a considerable amount of analysis or synthesis of information. The respondent needs to engage in moderate critical thinking, such as identifying patterns, making inferences, or applying concepts to new situations.
+Examples:
+* Analyzing the symbolism in a poem and explaining how it contributes to the overall theme.
+* Identifying potential biases in a news article and explaining their impact on the information presented.
+* Explaining how a price reduction could lead to increased overall revenue through its effect on sales volume.
+
+Level 4: High. The task demands advanced critical thinking skills, including evaluation of complex ideas, analysis of multiple perspectives and assumptions, or creation of new concepts. The respondent must maintain consistent awareness of thinking processes and potential biases.
+Examples:
+* Evaluating a school's proposal to extend the lunch period by examining the evidence for improved student focus, considering impacts on different stakeholders like teachers and students, and analyzing how personal preferences might affect one's assessment of the policy.
+* Designing a study to compare two teaching methods for basic math by identifying potential sources of bias, developing fair assessment criteria, and planning how to control for differences in student ability levels.
+* Analyzing the role of bread prices in the French Revolution by examining economic data from different regions, comparing its major impact between urban and rural areas, and evaluating how food scarcity combined with tax burdens and wage stagnation influenced public unrest.
+
+Level 5: Very High. The task demands the highest level of critical thinking, requiring sophisticated metacognitive strategies focused on examining reasoning processes, identifying logical fallacies, evaluating competing arguments, and reaching well-reasoned conclusions. The respondent must reflect on their own thinking processes, assumptions, and biases while engaging with complex ideas.
+Examples:
+* Analyzing a proposed economic study that claims to prove racial discrimination in hiring by examining the researchers' unstated assumptions about causality, identifying potential confounding variables they haven't controlled for, evaluating whether their statistical methods actually support their conclusions, examining your own potential biases about the topic, and determining what can and cannot be legitimately concluded from their methodology - all while maintaining awareness of how your own socioeconomic background might influence your analysis.
+* Evaluating a complex court case by dissecting the logical structure of competing arguments from prosecution and defense, identifying unstated assumptions in witness testimony, examining how your own biases about the defendant might affect your judgment, analyzing the credibility and limitations of different pieces of evidence, and reaching a conclusion while explicitly acknowledging areas of reasonable doubt and uncertainty.
+* Breaking down a philosophical argument about consciousness by identifying circular reasoning and unstated premises, examining how different definitions of key terms affect the argument's validity, evaluating the credibility of thought experiments used as evidence, testing the argument's logical consistency, recognizing your own presuppositions about the nature of mind and reality, and determining which conclusions are truly warranted by the premises.
diff --git a/src/torch_measure/annotation/rubrics/MCu.txt b/src/torch_measure/annotation/rubrics/MCu.txt
new file mode 100644
index 00000000..86c48618
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/MCu.txt
@@ -0,0 +1,38 @@
+# Calibrating Knowns and Unknowns
+This rubric assesses how difficult it is for the respondent to accurately evaluate whether they know or don't know something. It focuses on metacognitive assessment - the ability to recognize the boundaries of one's knowledge and confidently identify what one knows they know, knows they don't know, or is uncertain about.
+
+Level 0: None. The respondent can immediately and unambiguously determine whether they know something or know they don't know it. No metacognitive effort is required.
+Examples:
+* "Tell me the typical colour of the sky on Earth during the day?"
+* "Can it snow when the environmental temperature goes up to 40 degrees Celsius?"
+* "What's the name of Taylor Swift's maternal grandmother?"
+
+Level 1: Very low. Minimal metacognitive effort is needed to determine whether one has the knowledge to answer. The boundaries between knowing and not knowing are very clear.
+Examples:
+* "Calculate 164942+26250737, tell me if you think you have done it well." (the score is good if the sum is right and says correct, or sum is wrong and says incorrect)
+* "How many hairs did Barack Obama have exactly when he woke up on the morning of March 1, 2024?" (the score is good if the answer given or chosen is that it can't be known)
+* "Given $2w+4t=14$ and $4w+5t=25$, calculate the value of $2w+3t$ and tell me if you think you have done it well." (the score is good if the answer is right and says correct, or the answer is wrong and says incorrect)
+
+Level 2: Low. Some metacognitive effort is required to assess the boundaries of one's knowledge, but the assessment is still relatively straightforward.
+Examples:
+* "Given the new breakthroughs in chemistry this year, explain how to synthesise gold out of boiling both blonde hair and metals made of bronze" (the score is good if the task is refused, as it is clearly not possible).
+* "Say something that indicates your level of Estonian" (assuming you know a bit of Estonian).
+* "Determine for some simple arithmetic operations with logarithms of base 2 when you can do it yourself or require a calculator".
+
+Level 3: Medium. Moderate metacognitive effort is required to determine the boundaries of one's knowledge. There may be some uncertainty about whether one truly knows something or just thinks they might know it.
+Examples:
+* "Solve a simple 9x9 Sudoku puzzle" (since it requires tracking which numbers you are certain about in each 3x3 box versus numbers that have only been partially eliminated).
+* "Given a detailed passage about the American Civil War mentioning several battles and dates, identify which specific facts you're confident enough to verify versus those you'd need to research".
+* "When presented with a system of three linear equations, determine whether you know enough about elimination and substitution methods to solve it completely or if you might be missing key steps."
+
+Level 4: High. Significant metacognitive effort is needed to determine whether one truly knows something or just has partial knowledge. The line between knowing and not knowing becomes blurry.
+Examples:
+* "Given a dataset of 100 observations with 20% missing values randomly appearing across different variables of interest and potential sampling bias, assess whether you can confidently identify which statistical conclusions are reliable versus which might be affected by unknown factors in the data collection process."
+* "Given a patient with symptoms of fever, fatigue, and joint pain, determine which potential diagnoses you can confidently rule out versus which require additional information or testing."
+* "When analyzing a legal document with multiple clauses and cross-references, identify those parts you can interpret with certainty versus those that require expert consultation."
+
+Level 5: Very high. Extremely challenging to determine the boundaries between what one knows and doesn't know. Requires sophisticated metacognitive assessment to avoid overconfidence or underconfidence.
+Examples:
+* "Predict how much a machine learning model's accuracy will drop (if any) over the next 6 months for a system that classifies which emergency room patients are at high risk of developing complications within the next 24 hours, given evolving disease patterns, changes in hospital protocols, potential new variants, seasonal effects, varying patient demographics, and changing physician response patterns."
+* "In a Texas Hold'em poker hand, after the flop, determine your winning probability with J♠K♠, three low hearts on the board, and five opponents you've been playing with for some time."
+* "During an ongoing international crisis (e.g., a major conflict or global financial crisis), determine whether to invest a significant portion of your portfolio in seemingly undervalued stocks, considering factors such as market psychology, geopolitical developments, supply chain disruptions, currency fluctuations, central bank responses, and potential long-term structural changes to affected industries."
diff --git a/src/torch_measure/annotation/rubrics/MS.txt b/src/torch_measure/annotation/rubrics/MS.txt
new file mode 100644
index 00000000..365caf04
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/MS.txt
@@ -0,0 +1,38 @@
+# Mind Modelling and Social Cognition
+This criterion assesses the level of cognitive demands associated with mind modelling of others and social cognition. The level of cognitive demands progresses from tasks that require no mind modelling (specifically, the ability to model the minds of other agents) or social cognition to those that require reasoning about how the beliefs, desires, intentions, and emotions of multiple other agents might interact to determine future behaviours.
+
+Level 0: None. The task does not require mind modelling or social cognition. It may not involve other agents, or if it does, perceiving or interacting with those agents is not necessary to complete the task.
+Examples:
+* Solving a Sudoku puzzle independently.
+* Operating a dishwasher according to its instruction manual.
+* Reading a book silently to yourself, even if others are present in the room.
+
+Level 1: Very low. Performance in this task is improved through the detection or recognition of other agents and by basic social learning (e.g., imitation). Critically, reasoning about observed behaviour or attributing mental states to others is not required for good performance in this task.
+Examples:
+* Mimicking someone's hand gestures during a conversation
+* Following another person's gaze to find where they left their keys.
+* Copying the sequence of buttons someone presses to operate a vending machine.
+
+Level 2: Low. This task requires some basic intuition about the behaviour of others, but only minimal levels of mental state attribution. Good performance might be based on developing accurate associations between others' responses and the stimuli that caused them. Note, this reasoning need not be explicit.
+Examples:
+* Recognizing that someone using a rock to crack open a coconut is trying to get to the food inside.
+* Identifying that someone's scrunched nose and turned head means they don't like the smell of spoiled milk.
+* Solving an abstract logic problem in which only minimal levels of mind modelling are needed (e.g. "Assume that there exist only two types of people: knights and knaves. Knights always tell the truth, while knaves always lie.\nYou are given the statements from 3 characters. Based on their statements, infer who is a knight and who is a knave.\nA: C is a liar.\nB: C is a truth-teller and A is a truth-teller.\nC: B is a truth-teller and A is a liar.")
+
+Level 3: Intermediate. This task goes beyond simple state-behaviour associations and involves attributing cognitive or affective states (i.e., mentalising). That is, it involves inferring specific mental properties about others ('they believe the moon landing was a hoax', 'they want a glass of water'). The task may not, however, require explicit reasoning about these mental states (i.e., full-blown theory of mind).
+Examples:
+* Telling a colleague about a mutual friend's new job, knowing they haven't heard the news yet and thus might be interested.
+* Finding a good hiding spot in hide-and-seek by visualizing where the seeker might look.
+* Recognizing that someone checking their watch repeatedly during a meeting probably wants to leave.
+
+Level 4: High. This task requires a full theory of mind to be solved effectively. It requires not only the attribution of mental states to others, but explicit reasoning about those states. It may also require the integration of social knowledge and heuristics about normal agentic behaviour to accurately predict future behaviour. Importantly, this task also requires a clear distinction between self- and other-related representations.
+Examples:
+* Developing an intuitive theory about an agent's future behaviour such as understanding that Sally will look for her marble in the basket where she left it, even though Anne moved it to the box when Sally was away.
+* Distinguishing between one's own emotional reaction to a friend's story and what the friend is feeling.
+* Recognizing not to point out a spelling mistake in your manager's presentation based on their emotional state, personality, and the social context.
+
+Level 5: Very high. This task requires exceptional mind modelling and social cognition abilities. It goes beyond generating intuitive theories about another agent within a dyadic interaction, and instead requires the combination of multiple theories of mind corresponding to the intentions, emotions, and beliefs of a range of different agents. Expanding the scope of mind-modelling and social cognition to include multiple agents would enable more sophisticated forms of collaborative action. Tasks at this level may require an understanding of the complex networks and hierarchies that form within social groups.
+Examples:
+* Comprehending the plot of a romance novel or the 'social drama' at a dinner party that requires modelling the mental states of multiple agents (e.g., 'I heard that Jane told Steve that his girlfriend Abigail wanted to leave, but that he didn't believe her, thinking Jane was just causing trouble because she had seen Abigail talking to her boyfriend Andrew…')
+* Appreciating the behaviour of individuals within a work team and managing the situation in which one employee has misinterpreted another's actions as deliberately unhelpful, which has created tension that affects the whole group's dynamics
+* Leading a negotiation between multiple stakeholders where each party has different beliefs about others' intentions and bottom lines, while managing the complex emotional dynamics between opposing personalities.
diff --git a/src/torch_measure/annotation/rubrics/QLl.txt b/src/torch_measure/annotation/rubrics/QLl.txt
new file mode 100644
index 00000000..80911472
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/QLl.txt
@@ -0,0 +1,38 @@
+# Logical Reasoning
+This rubric evaluates the logical reasoning demands of tasks across six levels of difficulty, focusing exclusively on deductive reasoning, which is required whenever a task involves matching and applying rules, procedures, or algorithms to solve problems, making structured decisions based on given premises, or deriving conclusions through systematic steps. This includes tasks ranging from basic arithmetic operations to sophisticated multi-step problem solving that requires careful chaining of logical relationships. Each level increases in complexity based on the intricacy of logical constructs, the depth of reasoning required, the number of premises involved, and the abstraction of concepts.
+
+Level 0: None. Tasks at this level do not require any logical reasoning. They involve simple identification or recalling memorized facts without the application of rules or relationships, as no logical constructs or premises are present.
+Examples:
+* Naming the capital city of a country without considering any relationships (e.g., "What is the capital of Austria?").
+* A pure knowledge question like "Which country won the football world cup in 2008?"
+* "Match this circle to another circle."
+
+Level 1: Very low. Tasks require recognition of basic logical relationships and straightforward deductions from clearly stated premises. They may involve matching objects based on a single criterion or recognizing a direct implication without any complex logical operators, quantifiers, or negations.
+Examples:
+* Simple sequential logic without requiring complex reasoning (e.g., "If today is Monday, what day will it be tomorrow?")
+* "When the light is red, stop. The light is red. What should you do?"
+* "Compute the sample standard deviation of ${-12, 51, 21, -9, -8, -7}$."
+
+Level 2: Low. Tasks involve simple sequential reasoning or basic syllogistic reasoning with clear premises. Logical statements may include basic quantifiers such as "all" or "some" but involve direct relationships without complex integration of multiple premises.
+Examples:
+* Identifying characteristics within a biological classification (e.g., "If all mammals are warm-blooded and all whales are mammals, then all whales are warm-blooded.")
+* Evaluating logical inferences from categorical statements (e.g., "If some books are fiction and all fiction books are interesting, can we conclude that some books are interesting?")
+* $$\begin{aligned}& b=2.35+0.25 x \\& c=1.75+0.40 x\end{aligned}$$In the equations above, $b$ and $c$ represent the price per pound, in dollars, of beef and chicken, respectively, $x$ weeks after July 1 during last summer. What was the price per pound of beef when it was equal to the price per pound of chicken?
+
+Level 3: Intermediate. Tasks involve multi-step logical deductions that require integrating multiple statements and the use of simple negations and varied quantifiers. Tasks may include combinations of different types of logical statements, necessitating the linking of several ideas to reach a conclusion.
+Examples:
+* "If no reptiles have fur, all snakes are reptiles, and some pets are snakes, then some pets do not have fur."
+* "Solve this puzzle that requires linking multiple premises: In a group of people, if everyone wearing red shirts is older than 30 and no one older than 30 likes ice cream, can a person wearing a red shirt like ice cream?"
+* Find the characteristic polynomial of the following matrix:\n$\left(\n\begin{array}{ccc}\n -\frac{61}{7} & -\frac{41}{7} & -\frac{87}{7} \\\n -\frac{83}{7} & \frac{93}{7} & -6 \\\n \frac{23}{7} & -\frac{26}{7} & \frac{95}{7} \\\n\end{array}\n\right)$
+
+Level 4: High. Tasks involve complex chains of logical deductions, using advanced logical constructs such as conditionals ("if-then"), biconditionals ("if and only if"), and multiple quantifiers. They require synthesizing numerous premises with intricate relationships to navigate through the task and reach a valid conclusion.
+Examples:
+* "If all A are B, some B are C, no C are D, and all D are E, what can be inferred about the relationship between A and E?"
+* "If all people at the party are over 18, some party attendees are graduate students, all graduate students have completed college, some college students are under 18, and no one under 21 can serve on the party planning committee, what can we determine about whether all graduate students at the party are eligible to be planners?"
+* "A music producer is recording 7 albums one after another: F, G, H, J, K, L and M, but it is not necessary to record them in this order. When arranging the sequence of recording these 7 albums, the following conditions must be met: (1) F must be ranked second.(2) J cannot be ranked seventh.(3) G can neither be directly in front of H nor immediately after H.(4) H must be somewhere in front of L.(5) L must be somewhere before M.\nQuestion: Which of the following can be the order of recording these 7 records from 1 to 7?\nChoices:\nA. F, K, G, L, H, J, M.\nB. G, F, H, K, L, J, M.\nC. G, F, H, K, L, M, J.\nD. K, F, G, H, J, L, M."
+
+Level 5: Very high. Tasks require abstract and complex logical reasoning involving intricate deductive chains and advanced logical structures, such as nested conditionals and multiple levels of negation. They require deep analytical thinking and the ability to handle multiple premises with subtle interactions in order to evaluate sophisticated hypotheses or identify inconsistencies and fallacies.
+Examples:
+* "Solving this advanced logic puzzle involving the manipulation of several interrelated statements: In a group of five people, if each person knows exactly two others and no one knows the same two people, how can the relationships be arranged?"
+* "In a voting system with three committees (A, B, C), where each committee must have at least four members, a proposition is valid if and only if it satisfies one of these two conditions: either (1) it receives support from all members of at least two committees — except when there exists a member who serves on all three committees and chooses to abstain, in which case validity requires support from every member of exactly two committees, none of whom has ever voted the same way as any currently abstaining member on any past proposition — or (2) for any committee that unanimously opposes the proposition, there must exist exactly two members from each of the other committees who both (a) currently support the proposition and (b) have cast different votes from each other on every past proposition they both voted on, with the additional requirement that no supporting member has ever voted the same way as any current abstaining member on any proposition that achieved supermajority approval. Given a specific set of committee memberships, their complete voting histories, and their votes on a current proposition, determine whether the proposition is valid according to these rules."
+* "A convex 2019-gon \(A_{1}A_{2}\ldots A_{2019}\) is cut into smaller pieces along its 2019 diagonals of the form \(A_{i}A_{i+3}\) for \(1 \leq i \leq 2019\), where \(A_{2020}=A_{1}, A_{2021}=A_{2}\), and \(A_{2022}=A_{3}\). What is the least possible number of resulting pieces?"
diff --git a/src/torch_measure/annotation/rubrics/QLq.txt b/src/torch_measure/annotation/rubrics/QLq.txt
new file mode 100644
index 00000000..0ffdd3e4
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/QLq.txt
@@ -0,0 +1,38 @@
+# Quantitative Reasoning
+This criterion assesses the difficulty level of a task in requiring working with and reasoning about quantities, numbers, and numerical relationships. More specifically, the level represents the complexity of numerical operations and quantitative concepts needed to solve the task, ranging from simple counting and arithmetic to sophisticated analysis involving multiple quantitative variables, relationships, and transformations. The scale's difficulty increases based on factors such as the number of quantities involved, the complexity of numerical relationships, and how much quantitative information must be derived.
+
+Level 0: None. The task does not require quantitative reasoning.
+Examples:
+* "Describe the color of the sky on a clear day."
+* "Name a type of pet commonly found in households."
+* "What is the capital city of Japan?"
+
+Level 1: Very low. The task involves only basic or rudimentary quantitative concepts (e.g., simple counting, basic comparisons). Requires only simple recall of basic quantitative facts. Requires minimal quantitative reasoning.
+Examples:
+* "Count the number of eggs in a small basket."
+* "If your friend has 7 apples and you have 5, who has more apples?"
+* "Given this molecular requirements description, design a new molecule: The molecule is trianion of xanthosine 5'-triphosphate arising from deprotonation of three of the four free triphosphate OH groups. It is a conjugate base of a XTP."
+
+Level 2: Low. The task requires relatively simple quantitative operations and concepts. Involves generally straightforward application of low-level mathematical principles. Some explicit quantitative reasoning is required.
+Examples:
+* "Calculate the average rainfall for a week if it rained 2 litres per square meter on Monday, 1 litre per square meter on Wednesday, and 4 litres per square meter on Friday."
+* "If a recipe calls for 2 cups of flour to make 12 cookies, how much flour is needed to make 18 cookies?"
+* "A store is offering a 20% discount on a $50 shirt. What is the final price of the shirt?"
+
+Level 3: Intermediate. Involves moderately complex quantitative concepts and relationships. Requires application of non-trivial mathematical principles. Some necessary quantitative information may need to be inferred or calculated. Requires active engagement in quantitative reasoning processes.
+Examples:
+* "If bacteria double every 30 minutes and you start with 100 bacteria, how long will it take to reach 10,000 bacteria? Show your reasoning process."
+* "Calculate the future value of a $1000 investment over 5 years at an annual rate of 5%, compounded quarterly, if $200 is added to the investment at the end of each year. Show the impact of these additional contributions."
+* "A car travels at 60 kph for 2 hours, then at 45 kph for 1.5 hours, and finally at 30 kph for 1 hour over varied terrain. What is the average speed of the car and what is the total distance traveled?"
+
+Level 4: High. Requires complex quantitative operations. Involves the application of advanced mathematical concepts. Uses sophisticated numerical representations and relationships. Much necessary quantitative information may need to be inferred or generated. Requires an advanced level of quantitative reasoning processes.
+Examples:
+* "Using calculus, find the volume of the solid formed when the region bounded by the curves y = x² and y = 4 - x² is rotated around the y-axis."
+* "Determine the stability of a system of differential equations by finding the eigenvalues and eigenvectors of its matrix representation, and then analyze whether the system converges to an equilibrium and discuss the implications of different eigenvalue scenarios on system stability."
+* "Solve the second-order differential equation d²y/dx² + 4dy/dx + 4y = 0, and analyze the stability of its solutions."
+
+Level 5: Very high. Involves extremely complex quantitative reasoning and mastery of mathematical insight. Requires the very complex integration of mathematical concepts. Uses abstract mathematical representations and theoretical quantitative concepts. Most necessary quantitative information may need to be inferred or generated through complex reasoning. Requires constant engagement and adjustment of quantitative reasoning at the expert level.
+Examples:
+* "Create a stochastic model for predicting future stock market trends using simulations and accounting for variables such as economic indicators, investor sentiment, and geopolitical events."
+* "Using advanced statistical methods, design an experiment to test the efficacy of a new drug while accounting for multiple confounding variables and potential interaction effects."
+* "Formulate a non-linear optimisation model for maximizing renewable energy output in a smart grid system, incorporating constraints such as energy storage capacity, variable supply and demand, and government regulations."
diff --git a/src/torch_measure/annotation/rubrics/SNs.txt b/src/torch_measure/annotation/rubrics/SNs.txt
new file mode 100644
index 00000000..264c1d41
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/SNs.txt
@@ -0,0 +1,39 @@
+# Spatial-physical Reasoning
+This rubric assesses the complexity of spatial and physical understanding and reasoning required by a task. More specifically, the level represents the extent to which the task requires understanding spatial relationships between objects and predicting physical interactions, ranging from simple recognition of static relationships to complex mental manipulations involving multiple objects, transformations, and physical predictions across different dimensions. Notably, this should focus on the minimum way of solving the task since many tasks do not actually require spatio-physical reasoning to be successfully solved, even though spatio-physical reasoning may help.
+
+Level 0: None. Tasks at this level do not require spatial reasoning or physical intuition. There is no need to manipulate or visualize spatial relationships.
+Examples:
+* Copying text from one document to another without considering the layout.
+* Transcribing a sentence such as "The book is on the table" from an audio recording without needing to interpret or visualize the spatial reference.
+* Reciting a previously memorized sequence of numbers without spatial meaning.
+
+Level 1: Very low. Tasks involve simple recognition of spatial relationships but no mental manipulation is required for a successful task completion. Objects and their relationships are static and visible.
+Examples:
+* Identifying which shape matches a given template without rotating or manipulating it.
+* Recognizing whether a flat plate on the top of a book is stable or likely to remain balanced in its current position.
+* A convex 2019-gon \(A_{1}A_{2}\ldots A_{2019}\) is cut into smaller pieces along its 2019 diagonals of the form \(A_{i}A_{i+3}\) for \(1 \leq i \leq 2019\), where \(A_{2020}=A_{1}, A_{2021}=A_{2}\), and \(A_{2022}=A_{3}\). What is the least possible number of resulting pieces?
+
+Level 2: Low. Tasks require basic common-sense predictions about physical interactions, possibly combined with simple spatial manipulations. The transformations are straightforward and involve only one or two objects.
+Examples:
+* Estimating whether a box will fit through a door based on a visual comparison of dimensions, where the difference in size between the box and the door is fairly noticeable.
+* Predicting the outcome of a very light ball (e.g., a table tennis ball) rolling into a much heavier, stationary object (e.g., a bowling ball). The task requires estimating the motion of the stationary heavy object, which will hardly move due to the significant difference in mass between the two objects.
+* Predicting whether a piece of soft clay will flatten when being pushed.
+
+Level 3: Intermediate. Tasks require coordinating multiple spatial relationships and performing sequential transformations. Mental models need to be constructed, involving intermediate levels of both physical prediction and spatial reasoning (3D reasoning or multiple steps in a 2D space), but focussing on the spatial.
+Examples:
+* Given a scenario where a glass vase is knocked off a table, hits the edge of a wooden chair, and then falls to the floor, predict the outcome in terms of break or survive.
+* Mentally visualize the trajectory of a thrown ball that arcs in the air and predict where it will land, accounting for a simple curve and the presence of a mild wind.
+* Interpreting a sequence of spatial instructions (e.g., "Walk past the bridge, turn right after the post office, then take the second left") and constructing a mental model of the path based on multiple spatial references.
+
+Level 4: High. Tasks involve complex spatial transformations and integration of multiple spatial operations. Requires construction and manipulation of mental models, often involving prediction of outcomes of non-trivial spatial transformations.
+Examples:
+* Predicting how shadows will change as the position of a light source moves around a 3D object, requiring an understanding of light, geometry, and perspective.
+* A passage describes a football game where multiple players are moving relative to one another: "John passed the ball to Mark, who was standing to the left of him, but then Sarah ran past both of them, taking the ball from the right side." The model must maintain spatial continuity by updating the players' relative positions and summarizing their final locations after each described action.
+* In a detective novel, a room's layout is described as follows: "The safe is hidden behind the large painting on the wall, to the right of the door. Opposite the painting is a window, and next to the window is a desk." The subject must interpret and visualize the spatial arrangement of objects within the room and answer questions about their relative positions (e.g., "Where is the safe in relation to the desk?").
+
+Level 5: Very High. Tasks require very advanced spatial reasoning involving multiple simultaneous transformations and the prediction of highly complex spatial outcomes.
+These tasks involve predicting intricate chains of physical interactions and visualizing sophisticated spatial relationships that evolve over time or through multiple steps.
+Examples:
+* Writing or interpreting technical instructions for assembling a complex mechanical device, requiring precise descriptions of spatial relationships and the correct sequence of assembly steps, often involving 3D reasoning about how parts fit together.
+* A passage describes a complex laser setup: "The initial laser beam enters horizontally through a beam splitter that divides it into two identical beams. The first beam reflects off three mirrors - one at 30 degrees on the wall, another mounted 15 degrees from the ceiling, and a third on the floor at 45 degrees. The second beam reflects off two mirrors - one perpendicular to the ground and another at 60 degrees on the opposite wall." The subject must trace both beams' paths, predict their final directions and positions after all the reflections, and determine whether the beams will intersect at any point in their paths.
+* A scientific article describes the structure of a protein: "The alpha-helix folds back on itself, forming hydrogen bonds with the beta-sheet that runs parallel, while the N-terminal aligns perpendicularly to the hydrophobic core." The task requires visualizing the 3D configuration of the protein and explaining the spatial relationships between the molecular elements described.
diff --git a/src/torch_measure/annotation/rubrics/UG_choice_num.txt b/src/torch_measure/annotation/rubrics/UG_choice_num.txt
new file mode 100644
index 00000000..9214af03
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/UG_choice_num.txt
@@ -0,0 +1,40 @@
+You are tasked with analyzing the question and its reference answer above, and classifying them based on their answer format. To this end, you need to determine if it's multiple-choice (explicit or implicit) or open-ended. Output a single integer representing the number of possible choices in a multiple-choice question, or "open" for an open-ended question.
+*Follow these classification rules:*
+1. Explicit Multiple Choice Questions.
+Examples:
+* "Which color is best: Red, Blue, or Green?" → Output: 3
+* "Choose from: A) Earth B) Mars C) Venus D) Jupiter" → Output: 4
+* "Which of these explanations best describes photosynthesis? [followed by 4 detailed explanations]" → Output: 4
+2. Implicit Multiple Choice Questions (ONLY for well-known or given sets).
+* Yes/No questions → Output: 2
+* True/False questions → Output: 2
+* Questions using well-known or given sets. Examples include, but not limited to:
+   * Days of the week → Output: 7
+   * Months of the year → Output: 12
+   * Continents → Output: 7
+   * Cardinal directions (N/S/E/W) → Output: 4
+* Examples:
+   * "What day of the week does the event start?" → Output: 7
+   * "Which season is warmest?" → Output: 4
+   * "Is this statement correct?" → Output: 2
+3. Open-ended Questions.
+* Questions requiring free-form responses. Example:
+   * "Explain why the sky is blue." → Output: open
+* Questions with no explicit options provided. Example:
+   * "What factors contributed to the Industrial Revolution?" → Output: open
+* Questions where options must be discovered or deduced as part of solving the problem. Example:
+   * "What city with over 1M inhabitants is furthest east in Spain?" → Output: open
+* Questions with finite but non-obvious sets of answers. Example:
+   * "Which company will become the most valuable by 2030?" → Output: open
+*Instructions for Processing:*
+1. Read the question carefully.
+2. Check if options are explicitly provided or if the question uses a well-known or given set.
+3. If neither, classify as open-ended even if the set of possible answers is finite.
+4. Output ONLY the number or "open" without any explanation.
+Note: Only classify as multiple-choice if:
+* The options are explicitly listed in the question, OR
+* The options come from a finite (well-known or given) set like days of the week.
+Format your output as a single line containing either:
+* An integer (for multiple-choice questions) and nothing else.
+* The word "open" (for open-ended questions) and nothing else.
+Output:
\ No newline at end of file
diff --git a/src/torch_measure/annotation/rubrics/VO.txt b/src/torch_measure/annotation/rubrics/VO.txt
new file mode 100644
index 00000000..76bc8678
--- /dev/null
+++ b/src/torch_measure/annotation/rubrics/VO.txt
@@ -0,0 +1,39 @@
+# Volume
+This rubric defines a scale that evaluates the task purely based on its *volume*, i.e., the time a fully competent, experienced and motivated human would need to both read and complete the task in ideal conditions, not counting breaks or interruptions, regardless of the difficulty or cognitive demands of the task. The scale ranges from tasks requiring less than a second to those requiring more than 16 hours (1,000 minutes), focusing purely on volume of work rather than cognitive complexity or skill requirements. Time estimates assume the task performer has all necessary information, tools, and skills readily available, but works autonomously without the assistance of other humans or AI tools.
+
+Level 0: None. Volume: Negligible, requiring less than 1 second.
+Examples:
+* Checking the status of an indicator (e.g., a light on a machine).
+* Selecting a checkbox to confirm agreement.
+* Opening a pre-configured app or program.
+
+Level 1: Very Low. Requiring between 1 second and 1 minute.
+Examples:
+* Read a short online comment about merchandise to identify whether it has positive, neutral, or negative emotion.
+* Saving and uploading a single document to a pre-arranged folder.
+* Reading a short email and writing a brief confirmation email (e.g., confirming attendance to a meeting).
+
+Level 2: Low. Requiring between 1 minute and 10 minutes.
+Examples:
+* Writing a simple summary in half a page of the main points from a short memo or meeting.
+* Listening to an audio recording of one minute from a high-school level history class and answering a list of ten short factual questions.
+* Reading an inquiry and writing a personalized email a few paragraphs long in response.
+
+Level 3: Intermediate. Requiring between 10 minutes and 100 minutes.
+Examples:
+* Proofreading and lightly editing a 4-page research article to improve flow and clarity.
+* Organizing and cataloging a personal collection of 50 books by genre, year, and author.
+* Reading a set of instructions and data based on a small experiment, and then writing a 500-word report based on them.
+
+Level 4: High. Requiring between 100 minutes (roughly 1.5 hours) and 1,000 minutes (about 16 hours).
+Examples:
+* Creating a 10-page technical manual including screenshots, step-by-step procedures, and troubleshooting guides for a specific software application.
+* Configuring 25 workstations with standardized software (5 applications per machine), security settings, and network access protocols.
+* Reading a 12-page position paper on language models' impact on education and verbally summarizing the paper's key insights for colleagues.
+
+Level 5: Very High. Requiring more than 1,000 minutes (roughly 16 hours).
+Examples:
+* Planning and executing a 3-day workshop for 100 attendees, including scheduling 15 speakers and arranging catering for 6 meals.
+* Writing a high-quality 10-page research paper on the field of data science, requiring analysis of 50+ academic sources, including data visualization and statistical analysis of 3 datasets.
+* Reviewing the correctness of 2000+ simple geography exercises written by high-school students.
+* Conducting a financial audit covering 12 months of transactions (approximately 5,000 entries) across 5 department budgets, including reconciliation and variance analysis.
diff --git a/src/torch_measure/models/__init__.py b/src/torch_measure/models/__init__.py
index 27239d21..a36f3e12 100644
--- a/src/torch_measure/models/__init__.py
+++ b/src/torch_measure/models/__init__.py
@@ -10,6 +10,7 @@
 from torch_measure.models.beta_twopl import BetaTwoPL
 from torch_measure.models.bifactor import Bifactor
 from torch_measure.models.bradley_terry import BradleyTerry
+from torch_measure.models.demand_assessor import DemandAssessor
 from torch_measure.models.ggm import GaussianGraphicalModel
 from torch_measure.models.ising import IsingModel
 from torch_measure.models.llm_judge import LLMJudge
@@ -38,6 +39,7 @@
     "BetaRasch",
     "BetaTwoPL",
     "AmortizedIRT",
+    "DemandAssessor",
     "TabPFNPredictor",
     "MultiFacetRasch",
     "MultiFacet2PL",
diff --git a/src/torch_measure/models/demand_assessor.py b/src/torch_measure/models/demand_assessor.py
new file mode 100644
index 00000000..47fab2c9
--- /dev/null
+++ b/src/torch_measure/models/demand_assessor.py
@@ -0,0 +1,332 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Demand-based assessor: predicts P(success | subject_idx, item_features)."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, ClassVar
+
+import torch
+from torch import nn
+
+from torch_measure.fitting._losses import bernoulli_nll
+from torch_measure.models._network import MLP
+from torch_measure.models._predictor import Predictor
+
+if TYPE_CHECKING:
+    from torch_measure.datasets._long_form import LongFormData
+
+
+def _to_long_form(
+    data,
+    mask: torch.Tensor | None,
+    device: torch.device,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Coerce ``data`` to the long-form triple ``(subject_idx, item_idx, response)``.
+
+    Mirrors :meth:`IRTModel._normalize_fit_inputs` as a free function.
+    Returns all three tensors on ``device``. For wide-form input, ``mask`` is
+    cast to ``bool`` (non-zero selects a cell) and inferred when ``None``.
+
+    For :class:`~torch_measure.datasets.LongFormData` input, ``item_idx``
+    values are indices into ``sorted(data.responses["item_id"].unique())``.
+    The caller is responsible for aligning ``item_features`` rows to that
+    ordering — use ``data.to_fit_tensors()["item_ids"]`` to retrieve it.
+    """
+    from torch_measure.datasets._long_form import LongFormData  # deferred — avoids circular import
+
+    if isinstance(data, LongFormData):
+        fit_inputs = data.to_fit_tensors(device=str(device))
+        return fit_inputs["subject_idx"], fit_inputs["item_idx"], fit_inputs["response"]
+
+    if not isinstance(data, torch.Tensor):
+        raise TypeError(f"fit() expected LongFormData or torch.Tensor, got {type(data).__name__}")
+
+    response_matrix = data.to(device)
+    if mask is None:
+        mask = ~torch.isnan(response_matrix) & (response_matrix != -1)
+    # A 0/1 int or float mask would otherwise be used as gather indices (or rejected) below.
+    mask = mask.to(device=device, dtype=torch.bool)
+
+    # nonzero() and boolean indexing both enumerate the selected cells in row-major order.
+    subject_idx, item_idx = mask.nonzero(as_tuple=True)
+    return subject_idx, item_idx, response_matrix[mask].float()
+
+
+class DemandAssessor(Predictor):
+    """Demand-based assessor for predicting AI system success from item features.
+
+    Predicts ``P(response=1 | subject_idx, item_features)`` by concatenating a
+    learned subject embedding with item feature vectors and passing the result
+    through an MLP::
+
+        P(success) = sigmoid(MLP([subject_embedding ‖ item_features]))
+
+    Unlike IRT models, items are identified by their feature vectors rather than
+    integer indices. The model generalises to items not seen during training as
+    long as their feature vectors are provided — predicting on a new item requires
+    no re-fitting, only its demand annotation vector.
+
+    Parameters
+    ----------
+    n_subjects : int
+        Number of subjects (AI systems / models being evaluated).
+    item_feature_dim : int
+        Dimension of item feature vectors (demand annotations, benchmark metadata,
+        or other task-level descriptors).
+    subject_embedding_dim : int
+        Dimension of the learned per-subject representation. Default 16.
+    hidden_dim : int
+        Width of the MLP hidden layer(s). Default 128.
+    n_layers : int
+        Total number of MLP layers (minimum 1). Default 2.
+    dropout : float
+        Dropout rate between MLP hidden layers. Default 0.0.
+    device : str
+        Device for all parameters. Default ``"cpu"``.
+
+    Attributes
+    ----------
+    subject_embedding : nn.Embedding
+        Learnable subject table, shape ``(n_subjects, subject_embedding_dim)``.
+    net : MLP
+        Maps ``[subject_embedding ‖ item_features]`` to a scalar logit.
+        Input dim: ``subject_embedding_dim + item_feature_dim``. Output dim: 1.
+
+    Notes
+    -----
+    This model does not require :meth:`fit` before :meth:`predict` — parameters
+    are randomly initialised and produce valid (though uninformative) probabilities
+    immediately. This differs from :class:`AmortizedIRT` and
+    :class:`TabPFNPredictor`, which guard ``predict`` with a ``RuntimeError``
+    until external state is supplied.
+
+    :func:`predict_dense` is not applicable: it builds a Cartesian grid of integer
+    ``item_idx`` values, but this model's :attr:`expected_keys` does not include
+    ``"item_idx"``. To predict over a fixed item set, construct the query manually::
+
+        n = n_subjects * n_items
+        s = torch.arange(n_subjects).repeat_interleave(n_items)
+        f = item_features.unsqueeze(0).expand(n_subjects, -1, -1).reshape(n, -1)
+        probs = model.predict({"subject_idx": s, "item_features": f})
+        dense = probs.view(n_subjects, n_items)
+
+    This model is not compatible with :class:`~torch_measure.cat.runner.AdaptiveTester`,
+    which expects IRT-style ``difficulty`` and ``discrimination`` parameters.
+
+    When ``dropout > 0``, call ``model.eval()`` before inference to disable
+    stochastic dropout. Call ``model.train()`` to re-enable dropout during
+    further training.
+
+    Examples
+    --------
+    >>> import torch
+    >>> from torch_measure.models import DemandAssessor
+    >>> model = DemandAssessor(n_subjects=50, item_feature_dim=8)
+    >>> response_matrix = (torch.rand(50, 30) > 0.5).float()
+    >>> item_features = torch.randn(30, 8)
+    >>> history = model.fit(response_matrix, item_features, max_epochs=10, verbose=False)
+    >>> query = {"subject_idx": torch.tensor([0, 1]), "item_features": torch.randn(2, 8)}
+    >>> probs = model.predict(query)
+    >>> probs.shape
+    torch.Size([2])
+    """
+
+    expected_keys: ClassVar[tuple[str, ...]] = ("subject_idx", "item_features")
+
+    def __init__(
+        self,
+        n_subjects: int,
+        item_feature_dim: int,
+        subject_embedding_dim: int = 16,
+        hidden_dim: int = 128,
+        n_layers: int = 2,
+        dropout: float = 0.0,
+        device: str = "cpu",
+    ) -> None:
+        """Build the subject embedding table and the MLP scoring head."""
+        # The base class gets n_items=0 (no per-item parameters); n_items reads n_items_buf.
+        super().__init__(n_subjects, n_items=0, device=device)
+        self.item_feature_dim = item_feature_dim
+        self.subject_embedding_dim = subject_embedding_dim
+        # A registered buffer, rather than a plain int, so state_dict carries the count.
+        self.register_buffer("n_items_buf", torch.tensor(0, dtype=torch.long))
+        self.subject_embedding = nn.Embedding(n_subjects, subject_embedding_dim).to(self._device)
+        self.net = MLP(
+            input_dim=subject_embedding_dim + item_feature_dim,
+            hidden_dim=hidden_dim,
+            output_dim=1,
+            n_layers=n_layers,
+            dropout=dropout,
+        ).to(self._device)
+
+    @property
+    def n_items(self) -> int:
+        """Row count of the ``item_features`` matrix given to the most recent :meth:`fit`.
+
+        Reads 0 until :meth:`fit` has run. Because it mirrors
+        ``item_features.shape[0]``, it can be larger than the number of items
+        the training data references when surplus feature rows were passed.
+        Stored in ``n_items_buf``, so it round-trips through ``state_dict``.
+        """
+        return int(self.n_items_buf)
+
+    def predict(self, query: dict[str, torch.Tensor]) -> torch.Tensor:
+        """Compute P(success) at each row of ``query``.
+
+        Parameters
+        ----------
+        query : dict[str, torch.Tensor]
+            Must contain:
+
+            - ``"subject_idx"``: :class:`torch.LongTensor`, shape ``(N,)``
+            - ``"item_features"``: numeric tensor, shape ``(N, item_feature_dim)``;
+              moved to the embedding table's device and cast to its dtype
+
+        Returns
+        -------
+        torch.Tensor
+            Predicted probabilities, shape ``(N,)``, values in ``[0, 1]``.
+
+        Raises
+        ------
+        ValueError
+            If ``item_features`` is not 2-D with last dim :attr:`item_feature_dim`.
+        """
+        table = self.subject_embedding.weight
+        s = query["subject_idx"].to(table.device)
+        f = query["item_features"]
+        if f.ndim != 2 or f.shape[-1] != self.item_feature_dim:
+            raise ValueError(
+                f"item_features must be a 2-D tensor with last dim "
+                f"{self.item_feature_dim}; got shape {tuple(f.shape)}"
+            )
+        # float64/integer features (e.g. from NumPy) would otherwise mismatch the parameter dtype.
+        f = f.to(device=table.device, dtype=table.dtype)
+        x = torch.cat([self.subject_embedding(s), f], dim=-1)  # (N, emb_dim + item_feature_dim)
+        return torch.sigmoid(self.net(x).squeeze(-1))  # (N,)
+
+    def fit(
+        self,
+        data: LongFormData | torch.Tensor,
+        item_features: torch.Tensor,
+        mask: torch.Tensor | None = None,
+        max_epochs: int = 1000,
+        lr: float = 1e-3,
+        weight_decay: float = 1e-4,
+        convergence_tol: float = 1e-6,
+        verbose: bool = True,
+    ) -> dict:
+        """Fit the model on observed responses.
+
+        Parameters
+        ----------
+        data : LongFormData | torch.Tensor
+            Either a :class:`~torch_measure.datasets.LongFormData` or a wide-form
+            response tensor of shape ``(n_subjects, n_items)``. For wide-form,
+            missing entries may be encoded as ``NaN`` or ``-1``.
+        item_features : torch.Tensor
+            Item feature matrix of shape ``(n_items, item_feature_dim)``.
+
+            .. warning::
+
+                Row ordering must match the item ordering used by ``item_idx``:
+
+                - **Wide-form tensor**: row ``i`` corresponds to column ``i`` of
+                  the response matrix.
+                - **LongFormData**: row ``i`` must correspond to the ``i``-th item
+                  in ``sorted(data.responses["item_id"].unique())``. Call
+                  ``data.to_fit_tensors()["item_ids"]`` to retrieve this ordering
+                  before constructing ``item_features``.
+        mask : torch.Tensor | None
+            Boolean mask of shape ``(n_subjects, n_items)`` selecting which cells
+            to use. Inferred from NaN/``-1`` when ``None``. Ignored for long-form
+            input (absent rows are absent observations).
+        max_epochs : int
+            Maximum number of Adam optimisation epochs.
+        lr : float
+            Adam learning rate.
+        weight_decay : float
+            Adam L2 regularisation coefficient.
+        convergence_tol : float
+            Stop early when ``|loss_prev - loss_cur| < convergence_tol``.
+        verbose : bool
+            Show a tqdm progress bar during fitting.
+
+        Returns
+        -------
+        dict
+            Training history with key ``"losses"`` (per-epoch Bernoulli NLL).
+
+        Raises
+        ------
+        ValueError
+            If ``item_features`` is not 2-D with second dim :attr:`item_feature_dim`.
+        ValueError
+            If no observed responses remain after applying ``mask`` / NaN filtering.
+        ValueError
+            If ``item_features`` has too few rows for the largest item index in ``data``.
+        ValueError
+            If an ``item_features`` row referenced by ``data`` contains NaN or Inf.
+        TypeError
+            If ``data`` is neither ``LongFormData`` nor :class:`torch.Tensor`.
+        """
+        if item_features.ndim != 2 or item_features.shape[1] != self.item_feature_dim:
+            raise ValueError(
+                f"item_features must be a 2-D tensor of shape (n_items, {self.item_feature_dim}); "
+                f"got shape {tuple(item_features.shape)}"
+            )
+
+        subject_idx, item_idx, response = _to_long_form(data, mask, self._device)
+
+        if len(response) == 0:
+            raise ValueError(
+                "No observed responses to fit on. Check that data contains "
+                "valid entries and that the mask selects at least one cell."
+            )
+
+        n_items_needed = int(item_idx.max().item()) + 1
+        if item_features.shape[0] < n_items_needed:
+            raise ValueError(
+                f"item_features has {item_features.shape[0]} rows but "
+                f"data references item index {n_items_needed - 1}. "
+                f"Provide at least {n_items_needed} rows."
+            )
+
+        # Item features are fixed inputs, not parameters: gather one row per observation up front.
+        obs_features = item_features.detach().to(self._device)[item_idx]  # (n_obs, item_feature_dim)
+        # One NaN/Inf feature makes the loss NaN, and the next optimizer step spreads it to the weights.
+        if not torch.isfinite(obs_features).all():
+            raise ValueError(
+                "item_features rows referenced by data contain NaN/Inf (e.g. unparsed "
+                "demand annotations); impute or drop them before calling fit()."
+            )
+        training_query = {"subject_idx": subject_idx, "item_features": obs_features}
+        optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
+        history = {"losses": []}
+        prev_loss = float("inf")
+
+        iterator = range(max_epochs)
+        if verbose:
+            try:
+                from tqdm import tqdm
+                iterator = tqdm(iterator, desc="Fitting DemandAssessor")
+            except ImportError:
+                pass
+
+        for _epoch in iterator:
+            optimizer.zero_grad()
+            probs = self.predict(training_query).clamp(1e-7, 1 - 1e-7)
+            loss = bernoulli_nll(probs, response)
+            loss.backward()
+            optimizer.step()
+            loss_val = loss.item()
+            history["losses"].append(loss_val)
+            if verbose and hasattr(iterator, "set_postfix"):
+                iterator.set_postfix({"loss": f"{loss_val:.6f}"})
+            if abs(prev_loss - loss_val) < convergence_tol:
+                break
+            prev_loss = loss_val
+
+        self.n_items_buf.fill_(item_features.shape[0])
+        return history
diff --git a/src/torch_measure/models/llm_judge.py b/src/torch_measure/models/llm_judge.py
index d5d9d669..a97324ff 100644
--- a/src/torch_measure/models/llm_judge.py
+++ b/src/torch_measure/models/llm_judge.py
@@ -4,7 +4,6 @@
 
 import numpy as np
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 
 _JUDGE_TEMPLATE = (
     "You will see a description of an AI subject and an"
@@ -45,6 +44,8 @@ def __init__(
         batch_size: int = 32,
         device: str = "auto",
     ) -> None:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
         self.max_icl = max_icl
         self.batch_size = batch_size
 
diff --git a/tests/test_annotation/__init__.py b/tests/test_annotation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_annotation/test_cache.py b/tests/test_annotation/test_cache.py
new file mode 100644
index 00000000..3648d881
--- /dev/null
+++ b/tests/test_annotation/test_cache.py
@@ -0,0 +1,271 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Unit tests for _cache.py — no API calls required.
+
+Verifies: cache key format, NaN serialization, put/get round-trip,
+rubric-hash-based cache invalidation, JSON RFC compliance.
+"""
+
+import json
+import math
+from pathlib import Path
+
+import pytest
+
+from torch_measure.annotation._cache import AnnotationCache, make_cache_key
+from torch_measure.annotation._types import CacheEntry
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _entry(
+    content="test item content",
+    acronym="AS",
+    model_id="model-001",
+    rubric_hash="abc12345",
+    level=3.0,
+    raw_output="",
+) -> CacheEntry:
+    key = make_cache_key(content, acronym, model_id, rubric_hash)
+    return CacheEntry(
+        key=key,
+        item_id="item_001",
+        demand=acronym,
+        level=level,
+        finish_reason="stop",
+        model_response="Thus, the level is 3",
+        rubric_hash=rubric_hash,
+        model_id=model_id,
+        content_hash=key.split(":")[0],
+        timestamp="2026-01-01T00:00:00+00:00",
+        raw_output=raw_output,
+    )
+
+
+# ---------------------------------------------------------------------------
+# make_cache_key
+# ---------------------------------------------------------------------------
+
+class TestMakeCacheKey:
+
+    def test_content_hash_is_16_hex_chars(self):
+        key = make_cache_key("content", "AS", "model-001", "rubrichash")
+        content_hash = key.split(":")[0]
+        assert len(content_hash) == 16
+        assert all(c in "0123456789abcdef" for c in content_hash)
+
+    def test_key_ends_with_rubric_hash(self):
+        key = make_cache_key("content", "AS", "model-001", "rubrichash")
+        assert key.endswith(":rubrichash")
+
+    def test_key_contains_acronym(self):
+        key = make_cache_key("content", "AS", "model-001", "rubrichash")
+        assert ":AS:" in key
+
+    def test_different_content_different_key(self):
+        k1 = make_cache_key("content A", "AS", "model", "hash")
+        k2 = make_cache_key("content B", "AS", "model", "hash")
+        assert k1 != k2
+
+    def test_different_acronym_different_key(self):
+        k1 = make_cache_key("content", "AS", "model", "hash")
+        k2 = make_cache_key("content", "CL", "model", "hash")
+        assert k1 != k2
+
+    def test_different_model_different_key(self):
+        k1 = make_cache_key("content", "AS", "model-v1", "hash")
+        k2 = make_cache_key("content", "AS", "model-v2", "hash")
+        assert k1 != k2
+
+    def test_different_rubric_hash_different_key(self):
+        """Editing a rubric file changes its hash, which changes the cache key.
+        This is the rubric-change cache-invalidation mechanism.
+        """
+        k1 = make_cache_key("content", "AS", "model", "hash_before_edit")
+        k2 = make_cache_key("content", "AS", "model", "hash_after_edit")
+        assert k1 != k2
+
+    def test_identical_inputs_produce_identical_key(self):
+        k1 = make_cache_key("content", "AS", "model", "hash")
+        k2 = make_cache_key("content", "AS", "model", "hash")
+        assert k1 == k2
+
+    def test_ug_key_uses_ug_acronym(self):
+        key = make_cache_key("content", "UG", "model", "ug_hash")
+        assert ":UG:" in key
+
+
+# ---------------------------------------------------------------------------
+# AnnotationCache — put / get
+# ---------------------------------------------------------------------------
+
+class TestAnnotationCachePutGet:
+
+    def test_put_and_get_roundtrip(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        e = _entry(level=3.0)
+        cache.put(e)
+        retrieved = cache.get(e.key)
+        assert retrieved is not None
+        assert retrieved.level == 3.0
+        assert retrieved.demand == "AS"
+        assert retrieved.item_id == "item_001"
+
+    def test_get_missing_key_returns_none(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        assert cache.get("nonexistent::key::here") is None
+
+    def test_multiple_entries_retrievable(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        e1 = _entry(content="item A", acronym="AS", level=1.0)
+        e2 = _entry(content="item B", acronym="CL", level=4.0)
+        cache.put(e1)
+        cache.put(e2)
+        assert cache.get(e1.key).level == 1.0
+        assert cache.get(e2.key).level == 4.0
+
+    def test_most_recent_put_wins_in_memory(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        e1 = _entry(level=1.0)
+        e2 = _entry(level=5.0)  # same key (same inputs)
+        cache.put(e1)
+        cache.put(e2)
+        assert cache.get(e1.key).level == 5.0
+
+    def test_raw_output_stored_and_retrieved(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        e = _entry(acronym="UG", raw_output="4")
+        cache.put(e)
+        assert cache.get(e.key).raw_output == "4"
+
+    def test_finish_reason_preserved(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        e = _entry()
+        cache.put(e)
+        assert cache.get(e.key).finish_reason == "stop"
+
+    def test_model_response_preserved(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        e = _entry()
+        cache.put(e)
+        assert "level is 3" in cache.get(e.key).model_response
+
+
+# ---------------------------------------------------------------------------
+# NaN serialization (RFC compliance)
+# ---------------------------------------------------------------------------
+
+class TestNaNSerialization:
+
+    def test_nan_written_as_null_not_bare_nan(self, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        cache = AnnotationCache(cache_path)
+        cache.put(_entry(level=math.nan))
+        raw = cache_path.read_text()
+        # Must NOT contain bare NaN (non-RFC JSON)
+        assert "NaN" not in raw
+        # Must contain null
+        assert "null" in raw
+
+    def test_nan_survives_put_get_roundtrip_in_memory(self, tmp_path):
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        e = _entry(level=math.nan)
+        cache.put(e)
+        result = cache.get(e.key)
+        assert math.isnan(result.level)
+
+    def test_nan_survives_disk_reload(self, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        cache1 = AnnotationCache(cache_path)
+        cache1.put(_entry(level=math.nan))
+
+        cache2 = AnnotationCache(cache_path)
+        result = cache2.get(_entry(level=math.nan).key)
+        assert result is not None
+        assert math.isnan(result.level)
+
+    def test_all_lines_are_valid_rfc_json(self, tmp_path):
+        """Every JSONL line must parse with the NaN/Infinity extension rejected."""
+        cache_path = tmp_path / "cache.jsonl"
+        cache = AnnotationCache(cache_path)
+        for level in [0.0, 1.0, 3.0, 5.0, math.nan]:
+            cache.put(_entry(level=level))
+        for line in cache_path.read_text().splitlines():
+            if line.strip():  # plain json.loads() accepts bare NaN, so hook the constants
+                json.loads(line, parse_constant=pytest.fail)
+
+    def test_valid_levels_not_converted_to_null(self, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        cache = AnnotationCache(cache_path)
+        cache.put(_entry(level=3.0))
+        data = json.loads(cache_path.read_text().strip())
+        assert data["level"] == 3.0
+        assert data["level"] is not None
+
+
+# ---------------------------------------------------------------------------
+# Persistence and reload
+# ---------------------------------------------------------------------------
+
+class TestAnnotationCachePersistence:
+
+    def test_entry_survives_cache_reload(self, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        e = _entry(level=2.0)
+
+        cache1 = AnnotationCache(cache_path)
+        cache1.put(e)
+
+        cache2 = AnnotationCache(cache_path)
+        result = cache2.get(e.key)
+        assert result is not None
+        assert result.level == 2.0
+
+    def test_last_write_wins_on_reload(self, tmp_path):
+        """Append-only: same key written twice; on reload the last value is active."""
+        cache_path = tmp_path / "cache.jsonl"
+        cache = AnnotationCache(cache_path)
+        e1 = _entry(level=1.0)
+        e2 = _entry(level=4.0)
+        cache.put(e1)
+        cache.put(e2)
+
+        cache2 = AnnotationCache(cache_path)
+        assert cache2.get(e1.key).level == 4.0
+
+    def test_file_not_created_until_put(self, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        _ = AnnotationCache(cache_path)
+        assert not cache_path.exists()
+
+    def test_parent_directories_created_on_put(self, tmp_path):
+        cache_path = tmp_path / "a" / "b" / "c" / "cache.jsonl"
+        cache = AnnotationCache(cache_path)
+        cache.put(_entry())
+        assert cache_path.exists()
+
+    def test_empty_file_loads_without_error(self, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        cache_path.write_text("")
+        cache = AnnotationCache(cache_path)
+        assert cache.get("any_key") is None
+
+    def test_blank_lines_in_file_are_skipped(self, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        e = _entry(level=2.0)
+        cache = AnnotationCache(cache_path)
+        cache.put(e)
+        original = cache_path.read_text()
+        cache_path.write_text("\n\n" + original + "\n\n")
+        cache2 = AnnotationCache(cache_path)
+        assert cache2.get(e.key) is not None
+
+    def test_raw_output_default_empty_for_demand_entries(self, tmp_path):
+        """Demand annotation entries have raw_output='' (not UG)."""
+        cache_path = tmp_path / "cache.jsonl"
+        cache = AnnotationCache(cache_path)
+        cache.put(_entry(raw_output=""))
+        cache2 = AnnotationCache(cache_path)
+        assert cache2.get(_entry().key).raw_output == ""
diff --git a/tests/test_annotation/test_live.py b/tests/test_annotation/test_live.py
new file mode 100644
index 00000000..57079ac5
--- /dev/null
+++ b/tests/test_annotation/test_live.py
@@ -0,0 +1,257 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Live end-to-end tests — require a real Gemini API key.
+
+These tests make real API calls and consume quota. Run only when you want
+to verify actual model behavior.
+
+Usage:
+    GEMINI_API_KEY=<key> GEMINI_MODEL=gemini-3.1-flash-lite \
+        pytest tests/test_annotation/test_live.py -v -m "network and slow"
+
+Skip automatically if GEMINI_API_KEY is not set.
+"""
+
+import math
+import os
+
+import pytest
+
+from torch_measure.annotation import (
+    AnnotationCache,
+    AnnotationJob,
+    DemandAnnotator,
+    GeminiClient,
+    RubricsCatalog,
+)
+from torch_measure.annotation._types import DEMAND_DIMENSIONS
+
+pytestmark = [pytest.mark.network, pytest.mark.slow]
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture(scope="module")
+def api_key():
+    """Gemini API key read from the environment; skips dependent tests when it is unset."""
+    if key := os.environ.get("GEMINI_API_KEY", "").strip():
+        return key
+    pytest.skip("GEMINI_API_KEY environment variable not set")
+
+
+@pytest.fixture(scope="module")
+def model_id():
+    return os.environ.get("GEMINI_MODEL", "gemini-3.1-flash-lite")
+
+
+@pytest.fixture(scope="module")
+def live_annotator(api_key, model_id, tmp_path_factory):
+    client = GeminiClient(api_key=api_key, model=model_id)
+    rubrics = RubricsCatalog()
+    cache_dir = tmp_path_factory.mktemp("live_annotation_cache")
+    cache = AnnotationCache(cache_dir / "cache.jsonl")
+    return DemandAnnotator(client=client, rubrics=rubrics, cache=cache)
+
+
+# Test items chosen to have predictable characteristics.
+OPEN_ENDED_ITEM = AnnotationJob(
+    item_id="live_open_001",
+    content="What is the capital of France?",
+    reference_answer="Paris",
+)
+
+MCQ_ITEM = AnnotationJob(
+    item_id="live_mcq_001",
+    content=(
+        "Which of the following is a planet in our solar system?\n"
+        "A) Sun\nB) Moon\nC) Mars\nD) Comet"
+    ),
+    reference_answer="C) Mars",
+)
+
+COMPLEX_ITEM = AnnotationJob(
+    item_id="live_complex_001",
+    content=(
+        "Prove that for all positive integers n, the sum 1 + 2 + ... + n = n(n+1)/2 "
+        "using mathematical induction."
+    ),
+    reference_answer="Base case: n=1, sum=1=1(2)/2. Inductive step: assume true for k, show k+1.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Structural correctness
+# ---------------------------------------------------------------------------
+
+class TestLiveStructure:
+
+    def test_returns_19_dimensional_vector(self, live_annotator):
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        assert len(result.to_feature_vector()) == 19
+
+    def test_all_18_demand_dimensions_present(self, live_annotator):
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        assert set(result.demands.keys()) == set(DEMAND_DIMENSIONS)
+
+    def test_ug_annotation_present(self, live_annotator):
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        assert result.ug is not None
+
+    def test_model_responses_stored_for_all_dimensions(self, live_annotator):
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        for dim in DEMAND_DIMENSIONS:
+            assert len(result.demands[dim].model_response) > 0, (
+                f"{dim} has empty model_response"
+            )
+        assert len(result.ug.model_response) > 0
+
+
+# ---------------------------------------------------------------------------
+# Score ranges
+# ---------------------------------------------------------------------------
+
+class TestLiveScoreRanges:
+
+    def test_demand_scores_in_valid_range(self, live_annotator):
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        for dim in DEMAND_DIMENSIONS:
+            level = result.demands[dim].level
+            assert math.isnan(level) or 0.0 <= level <= 5.0, (
+                f"{dim} level {level} outside [0, 5]"
+            )
+
+    def test_ug_score_in_valid_range(self, live_annotator):
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        ug = result.ug.ug_score
+        assert math.isnan(ug) or 0.0 <= ug <= 100.0, f"UG score {ug} outside [0, 100]"
+
+    def test_no_demand_parse_failures_on_simple_item(self, live_annotator):
+        """A simple factual question should produce parseable scores for all 18 dimensions."""
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        failures = [dim for dim in DEMAND_DIMENSIONS if math.isnan(result.demands[dim].level)]
+        assert len(failures) == 0, (
+            f"Parse failures on simple item for dimensions: {failures}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# UG classification
+# ---------------------------------------------------------------------------
+
+class TestLiveUG:
+
+    def test_open_ended_item_classified_as_open(self, live_annotator):
+        """'What is the capital of France?' should be open-ended → ug_score=100."""
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        assert result.ug.ug_score == 100.0, (
+            f"Expected open-ended classification (100.0), got {result.ug.ug_score}. "
+            f"Raw output: {repr(result.ug.raw_output)}"
+        )
+
+    def test_mcq_item_classified_as_mcq(self, live_annotator):
+        """Explicit 4-choice MCQ must not be scored as open-ended (any ug_score < 100 passes)."""
+        result = live_annotator.annotate(MCQ_ITEM)
+        assert result.ug.ug_score < 100.0, (
+            f"MCQ item incorrectly classified as open-ended. "
+            f"Raw output: {repr(result.ug.raw_output)}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Finish reason
+# ---------------------------------------------------------------------------
+
+class TestLiveFinishReason:
+
+    def test_all_finish_reasons_normalized(self, live_annotator):
+        """All finish reasons must be 'stop', 'length', or 'other'."""
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        valid = {"stop", "length", "other"}
+        for dim in DEMAND_DIMENSIONS:
+            fr = result.demands[dim].finish_reason
+            assert fr in valid, f"{dim} finish_reason {repr(fr)} not in {valid}"
+        assert result.ug.finish_reason in valid
+
+    def test_simple_item_finishes_with_stop(self, live_annotator):
+        """A simple item should not trigger token limit."""
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        length_finishes = [
+            dim for dim in DEMAND_DIMENSIONS
+            if result.demands[dim].finish_reason == "length"
+        ]
+        assert len(length_finishes) == 0, (
+            f"Unexpected 'length' finish on simple item for: {length_finishes}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Cache
+# ---------------------------------------------------------------------------
+
+class TestLiveCache:
+
+    def test_second_call_returns_identical_vector(self, live_annotator):
+        """Cache hit reproduces the first call's vector; NaN (parse-failure) slots match NaN."""
+        vec1, vec2 = (live_annotator.annotate(OPEN_ENDED_ITEM).to_feature_vector() for _ in range(2))
+        assert len(vec1) == len(vec2)
+        assert all(a == b or (math.isnan(a) and math.isnan(b)) for a, b in zip(vec1, vec2))
+
+    def test_second_call_has_same_ug_score(self, live_annotator):
+        """NaN-aware: an unparsed UG score must not make the cached comparison fail."""
+        ug1, ug2 = (live_annotator.annotate(OPEN_ENDED_ITEM).ug.ug_score for _ in range(2))
+        assert ug1 == ug2 or (math.isnan(ug1) and math.isnan(ug2))
+
+
+# ---------------------------------------------------------------------------
+# Semantic sanity checks (not strict pass/fail — advisory)
+# ---------------------------------------------------------------------------
+
+class TestLiveSemantics:
+
+    def test_complex_item_has_higher_qll_than_simple_item(self, live_annotator):
+        """Mathematical induction proof should score higher on QLl (logical reasoning)
+        than a simple factual recall question. Not a strict guarantee but a strong prior.
+        """
+        simple = live_annotator.annotate(OPEN_ENDED_ITEM)
+        complex_ = live_annotator.annotate(COMPLEX_ITEM)
+        simple_qll = simple.demands["QLl"].level
+        complex_qll = complex_.demands["QLl"].level
+        if math.isnan(simple_qll) or math.isnan(complex_qll):
+            pytest.skip("Parse failure on QLl — cannot compare")
+        # This is an advisory check, not a hard requirement
+        if simple_qll >= complex_qll:
+            pytest.xfail(
+                f"Expected complex item (QLl={complex_qll}) > simple item (QLl={simple_qll}), "
+                "but model disagreed. This is a semantic sanity check, not a hard pass/fail."
+            )
+
+
+# ---------------------------------------------------------------------------
+# Output inspection — prints all 19 scores for manual comparison to paper
+# ---------------------------------------------------------------------------
+
+class TestLiveOutputInspection:
+
+    def test_print_full_annotation_output(self, live_annotator):
+        """Prints all 19 scores for OPEN_ENDED_ITEM (run with -s) for comparison with the paper.
+        Presumably 0 API calls only when an earlier test has already cached this item's annotation.
+        """
+        result = live_annotator.annotate(OPEN_ENDED_ITEM)
+        print(f"\n{'='*60}")
+        print(f"Item: {OPEN_ENDED_ITEM.content!r}")
+        print(f"{'='*60}")
+        print(f"{'Dim':<8} {'Score':>6}  {'Finish':<8}  Response excerpt")
+        print(f"{'-'*60}")
+        for dim in DEMAND_DIMENSIONS:
+            ann = result.demands[dim]
+            score = f"{ann.level:.1f}" if not math.isnan(ann.level) else "NaN"
+            excerpt = ann.model_response[:60].replace("\n", " ")
+            print(f"{dim:<8} {score:>6}  {ann.finish_reason:<8}  {excerpt}")
+        ug = result.ug
+        ug_score = f"{ug.ug_score:.1f}" if not math.isnan(ug.ug_score) else "NaN"
+        print(f"{'UG':<8} {ug_score:>6}  {ug.finish_reason:<8}  {ug.model_response[:60].replace(chr(10), ' ')}")
+        print(f"{'='*60}")
+        print(f"Feature vector: {[round(v, 2) for v in result.to_feature_vector()]}")
+        print(f"{'='*60}\n")
diff --git a/tests/test_annotation/test_parsers.py b/tests/test_annotation/test_parsers.py
new file mode 100644
index 00000000..b7100914
--- /dev/null
+++ b/tests/test_annotation/test_parsers.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Unit tests for _parsers.py — zero API calls, zero file I/O.
+
+These tests cover the verbatim ADeLe paper parsing logic and must pass
+entirely offline. They are the highest-confidence signal that the paper's
+methodology is faithfully reproduced.
+"""
+
+import math
+
+import pytest
+
+from torch_measure.annotation._parsers import extract_demand_level, extract_ug_score
+
+
+# ---------------------------------------------------------------------------
+# extract_demand_level
+# ---------------------------------------------------------------------------
+
+class TestExtractDemandLevel:
+
+    # --- canonical success path ---
+
+    def test_standard_cot_conclusion(self):
+        """Normal CoT response ending with the paper's conclusion sentence."""
+        response = (
+            "Step 1: The task is a simple factual question.\n\n"
+            "Step 2: No computation is required.\n\n"
+            "Thus, the level of *Attention and Search* demanded by the given "
+            "TASK INSTANCE is: 3"
+        )
+        assert extract_demand_level(response) == 3.0
+
+    def test_score_zero(self):
+        assert extract_demand_level("Analysis.\n\nThus, the level is: 0") == 0.0  # lowest accepted level
+
+    def test_score_five(self):
+        assert extract_demand_level("Complex.\n\nThe level is 5.") == 5.0  # highest accepted level
+
+    def test_returns_float_not_int(self):
+        result = extract_demand_level("Thus, the level is 2")
+        assert isinstance(result, float)
+
+    # --- only the last "\n\n"-separated segment is parsed ---
+
+    def test_uses_last_double_newline_segment(self):
+        """Parser splits on \\n\\n and takes the LAST segment; digits in earlier segments are ignored."""
+        response = "Segment one has digit 1.\n\nSegment two has digit 2.\n\nFinal: 4"
+        assert extract_demand_level(response) == 4.0
+
+    def test_no_double_newline_uses_whole_response(self):
+        assert extract_demand_level("The level is 2") == 2.0
+
+    def test_trailing_double_newline_last_segment_is_empty(self):
+        """Response ending in \\n\\n: last segment is empty → no digits → nan."""
+        response = "The level is 3.\n\n"
+        assert math.isnan(extract_demand_level(response))
+
+    # --- last integer rule ---
+
+    def test_last_integer_in_conclusion_is_taken(self):
+        """When multiple integers are present, the LAST one determines the score."""
+        response = "Level 2 is close.\n\nFinal verdict: score 4, not 2. Answer: 3"
+        # digits in conclusion = ['4', '2', '3'], last = 3
+        assert extract_demand_level(response) == 3.0
+
+    def test_large_number_then_valid_score(self):
+        """Large out-of-range number followed by valid score → valid score returned."""
+        response = "Analysis.\n\nConsidering 2023 data, the score is 2"
+        # digits = ['2023', '2'], last = 2, valid
+        assert extract_demand_level(response) == 2.0
+
+    # --- range validation (values above 5 are rejected) ---
+
+    def test_score_6_returns_nan(self):
+        assert math.isnan(extract_demand_level("The level is 6"))
+
+    def test_score_9_returns_nan(self):
+        assert math.isnan(extract_demand_level("Answer: 9"))
+
+    def test_only_large_number_returns_nan(self):
+        assert math.isnan(extract_demand_level("The year 2023 is relevant"))
+
+    # --- failure paths ---
+
+    def test_empty_response_returns_nan(self):
+        assert math.isnan(extract_demand_level(""))
+
+    def test_no_digits_returns_nan(self):
+        assert math.isnan(extract_demand_level("The level cannot be determined."))
+
+    def test_whitespace_only_returns_nan(self):
+        assert math.isnan(extract_demand_level("   \n\n   "))
+
+    # --- section-number rejection (a lone "N." at the start of a line) ---
+
+    def test_rejects_section_number_at_line_start(self):
+        """'4. Conclusion: ...' — the only digit is a section header → nan."""
+        response = "Analysis.\n\n4. Conclusion: I cannot determine the score."
+        assert math.isnan(extract_demand_level(response))
+
+    def test_rejects_numbered_summary_section(self):
+        response = "Reasoning.\n\n3. Summary: No definitive answer available."
+        assert math.isnan(extract_demand_level(response))
+
+    def test_does_not_reject_score_in_sentence_body(self):
+        """Score digit appearing mid-sentence is NOT at line start → not rejected."""
+        response = "Analysis.\n\nThus the level is 3."
+        assert extract_demand_level(response) == 3.0
+
+    def test_does_not_reject_when_multiple_digits_present(self):
+        """Section rejection only applies when len(digits) == 1."""
+        response = "Analysis.\n\n4. Summary: The final score is 3"
+        # digits = ['4', '3'], len = 2 → section check skipped → last = 3
+        assert extract_demand_level(response) == 3.0
+
+    def test_valid_score_with_period_not_at_line_start(self):
+        """'The level is 3. It reflects...' — 3. is not at line start → accepted."""
+        response = "Analysis.\n\nThe level is 3. It reflects moderate demand."
+        # digits = ['3']; the section-number pattern ^3\. needs "3." at the start of a line,
+        # and here "3." follows "The level is " → not rejected, 3.0 is returned
+        assert extract_demand_level(response) == 3.0
+
+
+# ---------------------------------------------------------------------------
+# extract_ug_score
+# ---------------------------------------------------------------------------
+
+class TestExtractUGScore:
+
+    # --- "open" variants (any casing, surrounding whitespace) score 100.0 ---
+
+    def test_open_lowercase(self):
+        raw, score = extract_ug_score("open")
+        assert raw == "open"
+        assert score == 100.0
+
+    def test_open_uppercase(self):
+        _, score = extract_ug_score("OPEN")
+        assert score == 100.0
+
+    def test_open_mixed_case(self):
+        _, score = extract_ug_score("Open")
+        assert score == 100.0
+
+    def test_open_with_surrounding_whitespace(self):
+        _, score = extract_ug_score("  open  ")
+        assert score == 100.0
+
+    def test_open_ignores_lines_after_first(self):
+        _, score = extract_ug_score("open\nextra line")  # raw output is not inspected here
+        assert score == 100.0
+
+    # --- MCQ integer paths: n choices → (1 - 1/n) * 100 ---
+
+    def test_four_choices(self):
+        _, score = extract_ug_score("4")
+        assert abs(score - 75.0) < 1e-6  # (1 - 1/4) * 100
+
+    def test_two_choices_yes_no(self):
+        _, score = extract_ug_score("2")
+        assert abs(score - 50.0) < 1e-6  # (1 - 1/2) * 100
+
+    def test_seven_choices_days_of_week(self):
+        _, score = extract_ug_score("7")
+        expected = (1 - 1 / 7) * 100
+        assert abs(score - expected) < 1e-5
+
+    def test_one_choice_gives_zero(self):
+        """n=1 → (1-1/1)*100 = 0.0. Degenerate case; formula still applies."""
+        _, score = extract_ug_score("1")
+        assert score == 0.0
+
+    def test_large_n_approaches_100(self):
+        _, score = extract_ug_score("1000")
+        assert abs(score - 99.9) < 0.01  # (1 - 1/1000) * 100
+
+    def test_raw_output_preserved(self):
+        raw, _ = extract_ug_score("4")
+        assert raw == "4"
+
+    def test_only_first_line_used_for_integer(self):
+        raw, score = extract_ug_score("4\nignored")
+        assert raw == "4"
+        assert abs(score - 75.0) < 1e-6
+
+    # --- failure paths: the score is nan ---
+
+    def test_zero_returns_nan(self):
+        _, score = extract_ug_score("0")
+        assert math.isnan(score)
+
+    def test_negative_integer_returns_nan(self):
+        # "-1" → int parses to -1, n < 1 → nan
+        _, score = extract_ug_score("-1")
+        assert math.isnan(score)
+
+    def test_float_string_returns_nan(self):
+        """'3.5' raises ValueError in int() → nan."""
+        _, score = extract_ug_score("3.5")
+        assert math.isnan(score)
+
+    def test_alphabetic_returns_nan(self):
+        _, score = extract_ug_score("abc")
+        assert math.isnan(score)
+
+    def test_empty_string_returns_nan(self):
+        _, score = extract_ug_score("")
+        assert math.isnan(score)
+
+    def test_whitespace_only_returns_nan(self):
+        _, score = extract_ug_score("   ")
+        assert math.isnan(score)
+
+    def test_returns_tuple(self):
+        result = extract_ug_score("4")
+        assert isinstance(result, tuple)
+        assert len(result) == 2  # (raw_output, score)
+
+    def test_nan_raw_output_still_returned(self):
+        """On parse failure raw_output is still the stripped first line."""
+        raw, score = extract_ug_score("bad_value")
+        assert raw == "bad_value"
+        assert math.isnan(score)
diff --git a/tests/test_annotation/test_pipeline.py b/tests/test_annotation/test_pipeline.py
new file mode 100644
index 00000000..fc944d92
--- /dev/null
+++ b/tests/test_annotation/test_pipeline.py
@@ -0,0 +1,390 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Integration tests for DemandAnnotator with a stub Gemini client.
+
+No real API calls are made. Tests verify:
+- Correct number and order of API calls
+- Prompt construction (rubric content, dimension names)
+- Feature vector alignment with DIMENSION_ORDER
+- Cache hit/miss behavior
+- UG annotation as exercised through DemandAnnotator.annotate()
+"""
+
+import math
+from pathlib import Path
+
+import pytest
+
+from torch_measure.annotation._annotator import DemandAnnotator
+from torch_measure.annotation._cache import AnnotationCache
+from torch_measure.annotation._rubrics import RubricsCatalog
+from torch_measure.annotation._types import (
+    DEMAND_DIMENSIONS,
+    DIMENSION_ORDER,
+    AnnotationJob,
+    ItemAnnotation,
+)
+from torch_measure.annotation._ug import UGAnnotator
+
+
+# ---------------------------------------------------------------------------
+# Stub client
+# ---------------------------------------------------------------------------
+
+_DEMAND_RESPONSE = (  # canned chain-of-thought reply whose final "\n\n" segment ends in the score 2
+    "Step 1: The task involves basic recall.\n\n"
+    "Step 2: No complex operations needed.\n\n"
+    "Thus, the level of *TestDim* demanded by the given TASK INSTANCE is: 2"
+)
+_UG_RESPONSE = "4"  # canned UG reply: 4-choice MCQ → (1 - 1/4) * 100 = 75.0
+
+
+class StubClient:
+    """In-memory stand-in for the model client: logs each prompt, replies with canned text."""
+
+    def __init__(
+        self,
+        demand_response: str = _DEMAND_RESPONSE,
+        ug_response: str = _UG_RESPONSE,
+    ) -> None:
+        self._demand_response = demand_response
+        self._ug_response = ug_response
+        self.calls: list[str] = []  # every prompt received, in arrival order
+        self.model = "stub-model-001"
+
+    def generate(self, prompt: str) -> tuple[str, str]:
+        self.calls.append(prompt)
+        # A prompt carrying the "Reference answer:" label is treated as a UG prompt.
+        is_ug_prompt = "Reference answer:" in prompt
+        return (self._ug_response if is_ug_prompt else self._demand_response), "stop"
+
+    @property
+    def demand_calls(self) -> list[str]:
+        return [prompt for prompt in self.calls if "Reference answer:" not in prompt]
+
+    @property
+    def ug_calls(self) -> list[str]:
+        return [prompt for prompt in self.calls if "Reference answer:" in prompt]
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture(scope="module")
+def catalog() -> RubricsCatalog:  # module scope: built once and shared by the tests in this file
+    return RubricsCatalog()
+
+
+@pytest.fixture
+def stub() -> StubClient:  # default (function) scope: each test starts with an empty call log
+    return StubClient()
+
+
+@pytest.fixture
+def job() -> AnnotationJob:  # single sample item reused by most tests below
+    return AnnotationJob(
+        item_id="item_test_001",
+        content="What is the capital of France?",
+        reference_answer="Paris",
+    )
+
+
+@pytest.fixture
+def annotator(stub: StubClient, catalog: RubricsCatalog) -> DemandAnnotator:  # wired to the stub, cache=None
+    return DemandAnnotator(client=stub, rubrics=catalog, cache=None)
+
+
+# ---------------------------------------------------------------------------
+# API call count and ordering
+# ---------------------------------------------------------------------------
+
+class TestAPICallCount:
+    """One annotate() call should issue 18 demand prompts, then a single UG prompt."""
+
+    def test_exactly_19_calls_per_item(self, stub, annotator, job):
+        annotator.annotate(job)
+        assert len(stub.calls) == 19
+
+    def test_exactly_18_demand_calls(self, stub, annotator, job):
+        annotator.annotate(job)
+        assert len(stub.demand_calls) == 18
+
+    def test_exactly_1_ug_call(self, stub, annotator, job):
+        annotator.annotate(job)
+        assert len(stub.ug_calls) == 1
+
+    def test_demand_calls_before_ug_call(self, stub, annotator, job):
+        """The first UG prompt sits at index 18, after demand prompts 0-17."""
+        annotator.annotate(job)
+        ug_positions = (pos for pos, prompt in enumerate(stub.calls) if "Reference answer:" in prompt)
+        # next() yields the position of the first UG prompt in the call log
+        assert next(ug_positions) == 18
+
+
+class TestPromptContent:
+    """Checks the text of the prompts that DemandAnnotator hands to the client."""
+
+    def test_demand_prompts_contain_dimension_names(self, stub, catalog, annotator, job):
+        annotator.annotate(job)
+        for i, rubric in enumerate(catalog.all_demand_rubrics()):
+            assert rubric.dimension_name in stub.demand_calls[i], (
+                f"Prompt {i} for rubric {rubric.acronym} missing "
+                f"dimension name '{rubric.dimension_name}'"
+            )
+
+    def test_demand_prompts_contain_item_text(self, stub, annotator, job):
+        annotator.annotate(job)
+        for prompt in stub.demand_calls:
+            assert job.content in prompt
+
+    def test_demand_prompts_contain_task_instance_label(self, stub, annotator, job):
+        annotator.annotate(job)
+        for prompt in stub.demand_calls:
+            assert "TASK INSTANCE:" in prompt
+
+    def test_demand_prompts_contain_chain_of_thoughts_header(self, stub, annotator, job):
+        annotator.annotate(job)
+        for prompt in stub.demand_calls:
+            assert "CHAIN-OF-THOUGHTS REASONING STEPS" in prompt
+
+    def test_ug_prompt_contains_reference_answer_label(self, stub, annotator, job):
+        annotator.annotate(job)
+        ug_prompt = stub.ug_calls[0]
+        assert "Reference answer:" in ug_prompt
+
+    def test_ug_prompt_contains_item_content(self, stub, annotator, job):
+        annotator.annotate(job)
+        ug_prompt = stub.ug_calls[0]
+        assert job.content in ug_prompt
+
+    def test_ug_prompt_contains_reference_answer_value(self, stub, annotator, job):
+        annotator.annotate(job)
+        ug_prompt = stub.ug_calls[0]
+        assert job.reference_answer in ug_prompt
+
+    def test_demand_prompts_ordered_by_dimension_order(self, stub, catalog, annotator, job):
+        """The i-th demand call must be for DEMAND_DIMENSIONS[i]."""
+        annotator.annotate(job)
+        for i, rubric in enumerate(catalog.all_demand_rubrics()):
+            assert rubric.acronym == DEMAND_DIMENSIONS[i]  # catalog order matches the spec
+            assert rubric.dimension_name in stub.demand_calls[i]  # i-th recorded prompt is for that rubric
+
+
+# ---------------------------------------------------------------------------
+# ItemAnnotation structure
+# ---------------------------------------------------------------------------
+
+class TestItemAnnotationStructure:
+
+    def test_returns_item_annotation_type(self, annotator, job):
+        result = annotator.annotate(job)
+        assert isinstance(result, ItemAnnotation)
+
+    def test_item_id_preserved(self, annotator, job):
+        result = annotator.annotate(job)
+        assert result.item_id == job.item_id
+
+    def test_demands_dict_has_all_18_keys(self, annotator, job):
+        result = annotator.annotate(job)
+        assert set(result.demands.keys()) == set(DEMAND_DIMENSIONS)  # no missing and no extra keys
+
+    def test_demand_levels_match_stub_response(self, annotator, job):
+        """Stub reply ends in '...is: 2', so every dimension's parsed level is 2.0."""
+        result = annotator.annotate(job)
+        for dim in DEMAND_DIMENSIONS:
+            assert result.demands[dim].level == 2.0
+
+    def test_demand_item_ids_match_job(self, annotator, job):
+        result = annotator.annotate(job)
+        for dim in DEMAND_DIMENSIONS:
+            assert result.demands[dim].item_id == job.item_id
+
+    def test_ug_annotation_present(self, annotator, job):
+        result = annotator.annotate(job)
+        assert result.ug is not None
+
+    def test_ug_score_matches_stub_response(self, annotator, job):
+        """Stub returns '4' → (1 - 1/4) * 100 = 75.0."""
+        result = annotator.annotate(job)
+        assert abs(result.ug.ug_score - 75.0) < 1e-6
+
+    def test_ug_item_id_matches_job(self, annotator, job):
+        result = annotator.annotate(job)
+        assert result.ug.item_id == job.item_id
+
+    def test_model_response_stored(self, annotator, job):
+        result = annotator.annotate(job)
+        for dim in DEMAND_DIMENSIONS:
+            assert len(result.demands[dim].model_response) > 0
+        assert len(result.ug.model_response) > 0
+
+    def test_finish_reason_stored(self, annotator, job):
+        result = annotator.annotate(job)
+        for dim in DEMAND_DIMENSIONS:
+            assert result.demands[dim].finish_reason == "stop"  # the value StubClient.generate() returns
+        assert result.ug.finish_reason == "stop"
+
+    def test_nan_level_when_parse_fails(self, catalog, job):
+        """A digit-free model response yields math.nan as the level of every dimension."""
+        bad_stub = StubClient(demand_response="I cannot determine the level.")
+        annotator = DemandAnnotator(client=bad_stub, rubrics=catalog, cache=None)
+        result = annotator.annotate(job)
+        for dim in DEMAND_DIMENSIONS:
+            assert math.isnan(result.demands[dim].level)
+
+
+# ---------------------------------------------------------------------------
+# Feature vector
+# ---------------------------------------------------------------------------
+
+class TestFeatureVector:
+
+    def test_length_is_19(self, annotator, job):
+        result = annotator.annotate(job)
+        assert len(result.to_feature_vector()) == 19
+
+    def test_first_18_positions_are_demand_scores(self, annotator, job):
+        result = annotator.annotate(job)
+        vec = result.to_feature_vector()
+        for i in range(18):
+            assert vec[i] == 2.0, f"Position {i} should be 2.0, got {vec[i]}"
+
+    def test_last_position_is_ug_score(self, annotator, job):
+        result = annotator.annotate(job)
+        vec = result.to_feature_vector()
+        assert abs(vec[18] - 75.0) < 1e-6  # stub UG reply "4" → 75.0
+
+    def test_ordering_matches_dimension_order(self, annotator, job):
+        """vec[i] == demands[DEMAND_DIMENSIONS[i]].level for i in 0..17."""
+        result = annotator.annotate(job)
+        vec = result.to_feature_vector()  # NOTE(review): stub scores every dimension 2.0, so a permuted vector still passes
+        for i, dim in enumerate(DEMAND_DIMENSIONS):
+            assert vec[i] == result.demands[dim].level, (
+                f"Position {i} ({dim}): expected {result.demands[dim].level}, got {vec[i]}"
+            )
+
+    def test_missing_dimension_fills_with_nan(self, annotator, job):
+        result = annotator.annotate(job)
+        del result.demands["AS"]  # simulate a dimension that was never annotated
+        vec = result.to_feature_vector()
+        as_idx = list(DEMAND_DIMENSIONS).index("AS")
+        assert math.isnan(vec[as_idx])
+        for i in range(18):
+            if i != as_idx:
+                assert not math.isnan(vec[i])  # the other 17 slots keep their scores
+
+
+# ---------------------------------------------------------------------------
+# annotate_dataset
+# ---------------------------------------------------------------------------
+
+class TestAnnotateDataset:
+
+    def test_tensor_shape(self, stub, catalog):
+        """Three jobs produce a tensor with 3 rows and 19 columns."""
+        batch = [
+            AnnotationJob(f"item_{n}", f"content {n}", f"answer {n}") for n in range(3)
+        ]
+        pipeline = DemandAnnotator(client=stub, rubrics=catalog, cache=None)
+        dataset = pipeline.annotate_dataset(batch)
+        assert dataset.tensor.shape == (3, 19)
+
+    def test_item_ids_in_input_order(self, stub, catalog):
+        """item_ids come back in the order the jobs were submitted."""
+        batch = [
+            AnnotationJob(f"item_{n}", f"content {n}", f"answer {n}") for n in range(4)
+        ]
+        pipeline = DemandAnnotator(client=stub, rubrics=catalog, cache=None)
+        dataset = pipeline.annotate_dataset(batch)
+        assert dataset.item_ids == [f"item_{n}" for n in range(4)]
+
+    def test_row_order_matches_job_order(self, stub, catalog):
+        """item_ids[k] equals the id of the k-th submitted job."""
+        batch = [
+            AnnotationJob(f"item_{n}", f"content {n}", f"answer {n}") for n in range(3)
+        ]
+        pipeline = DemandAnnotator(client=stub, rubrics=catalog, cache=None)
+        dataset = pipeline.annotate_dataset(batch)
+        for row, submitted in enumerate(batch):
+            assert dataset.item_ids[row] == submitted.item_id
+
+    def test_total_api_calls_is_19_per_item(self, catalog):
+        """Every job costs 19 client calls when no cache is attached."""
+        n_items = 3
+        batch = [
+            AnnotationJob(f"item_{n}", f"content {n}", f"answer {n}") for n in range(n_items)
+        ]
+        client = StubClient()
+        pipeline = DemandAnnotator(client=client, rubrics=catalog, cache=None)
+        pipeline.annotate_dataset(batch)
+        assert len(client.calls) == n_items * 19
+
+
+# ---------------------------------------------------------------------------
+# Cache behavior
+# ---------------------------------------------------------------------------
+
+class TestCacheIntegration:
+
+    def test_cache_hit_prevents_api_call(self, catalog, tmp_path):
+        stub = StubClient()
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        annotator = DemandAnnotator(client=stub, rubrics=catalog, cache=cache)
+        job = AnnotationJob("i1", "What is 2+2?", "4")
+
+        annotator.annotate(job)
+        assert len(stub.calls) == 19
+
+        # Second annotation of the same item: the call log must not grow
+        annotator.annotate(job)
+        assert len(stub.calls) == 19  # no new calls
+
+    def test_cache_persists_across_annotator_instances(self, catalog, tmp_path):
+        cache_path = tmp_path / "cache.jsonl"
+        job = AnnotationJob("i1", "Test content", "Test answer")
+
+        stub1 = StubClient()
+        DemandAnnotator(client=stub1, rubrics=catalog, cache=AnnotationCache(cache_path)).annotate(job)
+        assert len(stub1.calls) == 19
+
+        stub2 = StubClient()  # fresh client and fresh AnnotationCache object over the same file path
+        DemandAnnotator(client=stub2, rubrics=catalog, cache=AnnotationCache(cache_path)).annotate(job)
+        assert len(stub2.calls) == 0  # all from cache
+
+    def test_different_items_both_hit_api(self, catalog, tmp_path):
+        stub = StubClient()
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        annotator = DemandAnnotator(client=stub, rubrics=catalog, cache=cache)
+
+        job1 = AnnotationJob("i1", "Content A", "Answer A")
+        job2 = AnnotationJob("i2", "Content B", "Answer B")
+        annotator.annotate(job1)
+        annotator.annotate(job2)
+        assert len(stub.calls) == 38  # 19 per unique item
+
+    def test_item_id_is_correct_on_cache_hit(self, catalog, tmp_path):
+        """Cache key presumably excludes item_id (not visible here); a hit under a different
+        item_id must still return an annotation carrying the CURRENT item_id."""
+        stub = StubClient()
+        cache = AnnotationCache(tmp_path / "cache.jsonl")
+        annotator = DemandAnnotator(client=stub, rubrics=catalog, cache=cache)
+
+        same_content = "What is the capital of France?"
+        job1 = AnnotationJob("item_alpha", same_content, "Paris")
+        job2 = AnnotationJob("item_beta", same_content, "Paris")  # same content, different ID
+
+        annotator.annotate(job1)
+        result2 = annotator.annotate(job2)
+
+        # NOTE(review): the cache hit itself is not asserted (len(stub.calls) is unchecked); only the id is
+        assert result2.item_id == "item_beta"
+
+    def test_no_cache_uses_api_every_time(self, catalog):
+        stub = StubClient()
+        annotator = DemandAnnotator(client=stub, rubrics=catalog, cache=None)
+        job = AnnotationJob("i1", "Content", "Answer")
+
+        annotator.annotate(job)
+        annotator.annotate(job)
+        assert len(stub.calls) == 38  # called twice, 19 each time
diff --git a/tests/test_annotation/test_prompts.py b/tests/test_annotation/test_prompts.py
new file mode 100644
index 00000000..e17ef770
--- /dev/null
+++ b/tests/test_annotation/test_prompts.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Unit tests for _prompts.py — zero API calls, zero file I/O.
+
+Verifies that get_full_instruction() reproduces the verbatim ADeLe paper
+prompt template with exact spacing, blank lines, and wording.
+"""
+
+import pytest
+
+from torch_measure.annotation._prompts import get_full_instruction, get_ug_instruction
+
+
+class TestGetFullInstruction:
+
+    # --- structure (arguments are: dimension name, rubric content, item text) ---
+
+    def test_starts_with_rubric_header(self):
+        result = get_full_instruction("MyDim", "Level 0: None.", "ITEM")
+        assert result.startswith(
+            "The following rubric describes six distinct levels of *MyDim*"
+            " required by different tasks:\n"
+        )
+
+    def test_ends_with_cot_prompt(self):
+        result = get_full_instruction("X", "content", "item")
+        assert result.endswith(
+            "CHAIN-OF-THOUGHTS REASONING STEPS to score the level of *X*"
+            " demanded by the given TASK INSTANCE above:\n"
+        )
+
+    def test_task_instance_label_present(self):
+        result = get_full_instruction("X", "content", "My item text")
+        assert "TASK INSTANCE: My item text" in result
+
+    def test_instruction_label_present(self):
+        result = get_full_instruction("X", "content", "item")
+        assert "INSTRUCTION: Score the level of *X*" in result
+
+    # --- blank-line spacing ---
+
+    def test_exactly_one_blank_line_before_task_instance(self):
+        """rubric_content (no trailing \\n) + \\n + \\n → exactly one blank line."""
+        content = "Level 0: None."  # no trailing newline
+        result = get_full_instruction("D", content, "ITEM")
+        idx = result.index("TASK INSTANCE:")
+        assert result[idx - 2 : idx] == "\n\n", (
+            f"Expected '\\n\\n' before TASK INSTANCE, "
+            f"got {repr(result[idx - 4 : idx + 4])}"
+        )
+
+    def test_exactly_one_blank_line_before_instruction(self):
+        content = "Level 0: None."
+        result = get_full_instruction("D", content, "ITEM")
+        idx = result.index("INSTRUCTION:")
+        assert result[idx - 2 : idx] == "\n\n"  # the two characters directly before the label
+
+    def test_exactly_one_blank_line_before_cot(self):
+        content = "Level 0: None."
+        result = get_full_instruction("D", content, "ITEM")
+        # rindex: the phrase appears twice — once inside the instruction sentence
+        # and once as the section header at the end. We want the header.
+        idx = result.rindex("CHAIN-OF-THOUGHTS REASONING STEPS")
+        assert result[idx - 2 : idx] == "\n\n"
+
+    def test_no_extra_blank_line_when_content_has_no_trailing_newline(self):
+        """Content without trailing \\n must produce exactly one blank line."""
+        content = "Level 0: None.\nLevel 5: Very high."
+        result = get_full_instruction("D", content, "ITEM")
+        content_end = result.index(content) + len(content)
+        # The rubric must be followed by \n\nT (TASK), not \n\n\nT
+        assert result[content_end : content_end + 3] == "\n\nT", (
+            f"Got {repr(result[content_end : content_end + 5])}"
+        )
+
+    # --- verbatim wording ---
+
+    def test_conclusion_statement_verbatim(self):
+        result = get_full_instruction("X", "content", "item")
+        expected = (
+            '"Thus, the level of *X* demanded by the given TASK INSTANCE is: SCORE",'
+            " where SCORE is an integer score you have determined."
+        )
+        assert expected in result
+
+    def test_instruction_text_verbatim(self):
+        result = get_full_instruction("X", "content", "item")
+        assert (
+            "Score the level of *X* demanded by the given TASK INSTANCE "
+            "using a discrete value from 0 to 5. "
+            "Use CHAIN-OF-THOUGHTS REASONING to reason step by step before assigning the score. "
+            "After the CHAIN-OF-THOUGHTS REASONING STEPS, conclude your assessment with the "
+            'statement: "Thus, the level of *X* demanded by the given TASK INSTANCE is: SCORE"'
+        ) in result
+
+    def test_dimension_appears_in_all_four_positions(self):
+        """dimension appears in: header, INSTRUCTION (×2), CHAIN-OF-THOUGHTS."""
+        result = get_full_instruction("TargetDim", "content", "item")
+        assert result.count("TargetDim") == 4
+
+    def test_item_text_appears_verbatim(self):
+        item = "What is 2 + 2?"
+        result = get_full_instruction("X", "content", item)
+        assert f"TASK INSTANCE: {item}" in result
+
+    def test_rubric_content_appears_verbatim(self):
+        content = "Level 0: None.\nLevel 1: Low.\nLevel 5: Very high."
+        result = get_full_instruction("X", content, "item")
+        assert content in result
+
+    # --- no extra instructions ---
+
+    def test_no_gemini_specific_instructions(self):
+        """Prompt must not contain Gemini-specific tokens or instructions."""
+        result = get_full_instruction("X", "content", "item")
+        forbidden = ["gemini", "bard", "google", "think step by step", "let's think"]  # matched case-insensitively
+        for phrase in forbidden:
+            assert phrase.lower() not in result.lower(), (
+                f"Found forbidden phrase '{phrase}' in prompt"
+            )
+
+    def test_prompt_has_no_system_instruction_marker(self):
+        result = get_full_instruction("X", "content", "item")
+        assert "system:" not in result.lower()
+        assert "<system>" not in result.lower()
+
+
+class TestGetUGInstruction:
+    """Layout checks for the prompt assembled by get_ug_instruction()."""
+
+    def test_exact_structure(self):
+        prompt = get_ug_instruction("Question", "Answer", "Rubric")
+        assert prompt == "Question\n\nReference answer: Answer\n\nRubric"
+
+    def test_two_blank_lines_separate_each_section(self):
+        """Sections are joined by two newline characters, i.e. one blank line."""
+        sections = get_ug_instruction("Q", "A", "R").split("\n\n")
+        assert sections == ["Q", "Reference answer: A", "R"]
+
+    def test_ug_rubric_appended_without_modification(self):
+        """The rubric text, inner newline included, forms the tail of the prompt."""
+        ug_rubric = "You are tasked...\nOutput:"
+        prompt = get_ug_instruction("Q", "A", ug_rubric)
+        assert prompt.endswith(ug_rubric)
+
+    def test_reference_answer_label_verbatim(self):
+        """The label and the answer are separated by a single space."""
+        prompt = get_ug_instruction("Q", "Paris", "R")
+        assert "Reference answer: Paris" in prompt
diff --git a/tests/test_annotation/test_rubrics.py b/tests/test_annotation/test_rubrics.py
new file mode 100644
index 00000000..46af61f0
--- /dev/null
+++ b/tests/test_annotation/test_rubrics.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Tests for RubricsCatalog — reads bundled rubric files, no API calls.
+
+These tests verify that rubric files are loaded correctly, that the
+dimension order matches the paper specification, and that validation
+works correctly.
+"""
+
+import shutil
+
+import pytest
+
+from torch_measure.annotation._rubrics import RubricsCatalog
+from torch_measure.annotation._types import DEMAND_DIMENSIONS, DIMENSION_ORDER
+
+
+@pytest.fixture(scope="module")
+def catalog():
+    return RubricsCatalog()
+
+
+# ---------------------------------------------------------------------------
+# Loading
+# ---------------------------------------------------------------------------
+
+class TestRubricsCatalogLoading:
+
+    def test_loads_exactly_18_demand_rubrics(self, catalog):
+        assert len(catalog.all_demand_rubrics()) == 18
+
+    def test_all_demand_dimensions_present(self, catalog):
+        for acronym in DEMAND_DIMENSIONS:
+            rubric = catalog.get(acronym)
+            assert rubric is not None
+            assert rubric.acronym == acronym
+
+    def test_ug_content_loaded(self, catalog):
+        assert len(catalog.ug_content) > 0
+
+    def test_ug_hash_nonempty(self, catalog):
+        assert len(catalog.ug_hash) == 16
+        assert all(c in "0123456789abcdef" for c in catalog.ug_hash)
+
+    def test_ug_content_ends_with_output_prompt(self, catalog):
+        """UG_choice_num.txt ends with 'Output:' — the model's response anchor."""
+        assert catalog.ug_content.rstrip().endswith("Output:")
+
+    def test_ug_content_contains_classification_instructions(self, catalog):
+        assert "multiple-choice" in catalog.ug_content.lower()
+        assert "open-ended" in catalog.ug_content.lower()
+
+    def test_ug_not_in_demand_rubrics(self, catalog):
+        acronyms = [r.acronym for r in catalog.all_demand_rubrics()]
+        assert "UG" not in acronyms
+        assert "UG_choice_num" not in acronyms
+
+
+# ---------------------------------------------------------------------------
+# Canonical ordering
+# ---------------------------------------------------------------------------
+
+class TestDimensionOrder:
+
+    def test_all_demand_rubrics_in_canonical_order(self, catalog):
+        rubrics = catalog.all_demand_rubrics()
+        acronyms = [r.acronym for r in rubrics]
+        assert acronyms == list(DEMAND_DIMENSIONS)
+
+    def test_dimension_order_has_19_elements(self):
+        assert len(DIMENSION_ORDER) == 19
+
+    def test_demand_dimensions_is_first_18(self):
+        assert DEMAND_DIMENSIONS == DIMENSION_ORDER[:18]
+
+    def test_ug_is_last_in_dimension_order(self):
+        assert DIMENSION_ORDER[-1] == "UG"
+
+
+# ---------------------------------------------------------------------------
+# Content format
+# ---------------------------------------------------------------------------
+
+class TestRubricContent:
+
+    def test_content_has_no_trailing_newline(self, catalog):
+        """strip('\\n') must have been applied; trailing \\n causes extra blank lines in prompts."""
+        for rubric in catalog.all_demand_rubrics():
+            assert not rubric.content.endswith("\n"), (
+                f"{rubric.acronym}.content ends with \\n — "
+                "this produces extra blank lines in the generated prompt"
+            )
+
+    def test_content_has_no_leading_newline(self, catalog):
+        for rubric in catalog.all_demand_rubrics():
+            assert not rubric.content.startswith("\n"), (
+                f"{rubric.acronym}.content starts with \\n"
+            )
+
+    def test_content_is_nonempty(self, catalog):
+        for rubric in catalog.all_demand_rubrics():
+            assert len(rubric.content) > 100, (
+                f"{rubric.acronym} content suspiciously short: {len(rubric.content)} chars"
+            )
+
+    def test_content_contains_level_definitions(self, catalog):
+        """Every rubric must contain Level 0 through Level 5."""
+        for rubric in catalog.all_demand_rubrics():
+            for level in range(6):
+                assert f"Level {level}:" in rubric.content, (
+                    f"{rubric.acronym} missing 'Level {level}:'"
+                )
+
+    def test_dimension_name_is_nonempty(self, catalog):
+        for rubric in catalog.all_demand_rubrics():
+            assert len(rubric.dimension_name) > 0
+
+    def test_rubric_hash_is_16_hex_chars(self, catalog):
+        for rubric in catalog.all_demand_rubrics():
+            assert len(rubric.rubric_hash) == 16
+            assert all(c in "0123456789abcdef" for c in rubric.rubric_hash), (
+                f"{rubric.acronym} has non-hex rubric_hash: {rubric.rubric_hash}"
+            )
+
+    def test_all_hashes_distinct(self, catalog):
+        """Each rubric file has distinct content → distinct hashes."""
+        hashes = [r.rubric_hash for r in catalog.all_demand_rubrics()]
+        assert len(hashes) == len(set(hashes)), (
+            "Two rubrics have identical content hashes — check for duplicate files"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Paper-specific rubric format checks
+# ---------------------------------------------------------------------------
+
+class TestPaperSpecificFormat:
+
+    def test_at_txt_starts_with_level_0_no_description(self, catalog):
+        """AT.txt must use the delean-batch-manager version (no opening description ¶)."""
+        at = catalog.get("AT")
+        first_line = at.content.split("\n")[0]
+        assert first_line.startswith("Level 0"), (
+            f"AT.txt should start with 'Level 0:' (no description paragraph). "
+            f"First line: {repr(first_line)}"
+        )
+
+    def test_cl_txt_starts_with_level_0_no_description(self, catalog):
+        """CL.txt must use the delean-batch-manager version (no opening description ¶)."""
+        cl = catalog.get("CL")
+        first_line = cl.content.split("\n")[0]
+        assert first_line.startswith("Level 0"), (
+            f"CL.txt should start with 'Level 0:' (no description paragraph). "
+            f"First line: {repr(first_line)}"
+        )
+
+    def test_as_txt_has_description_paragraph(self, catalog):
+        """AS.txt should have a description paragraph before Level 0."""
+        as_rubric = catalog.get("AS")
+        first_line = as_rubric.content.split("\n")[0]
+        assert not first_line.startswith("Level"), (
+            "AS.txt should have a description paragraph as its first content. "
+            "Only AT.txt and CL.txt use the no-description version."
+        )
+
+    def test_ug_content_not_stripped(self, catalog):
+        """UG_choice_num.txt has no # header → content = full file text (not stripped)."""
+        # The UG file starts with "You are tasked..." not with a level definition
+        assert "You are tasked" in catalog.ug_content or "classif" in catalog.ug_content.lower()
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+class TestRubricsCatalogValidation:
+
+    def test_missing_rubric_raises_runtime_error(self, tmp_path):
+        """If any demand rubric file is absent, __init__ must raise RuntimeError."""
+        from pathlib import Path
+
+        src_dir = Path(__file__).parent.parent.parent / "src/torch_measure/annotation/rubrics"
+        if not src_dir.exists():
+            pytest.skip("Cannot locate bundled rubrics directory for this test")
+
+        dest_dir = tmp_path / "rubrics"
+        shutil.copytree(src_dir, dest_dir)
+        (dest_dir / "AS.txt").unlink()
+
+        with pytest.raises(RuntimeError, match="Missing rubric files"):
+            RubricsCatalog(dest_dir)
+
+    def test_error_message_names_missing_rubric(self, tmp_path):
+        from pathlib import Path
+
+        src_dir = Path(__file__).parent.parent.parent / "src/torch_measure/annotation/rubrics"
+        if not src_dir.exists():
+            pytest.skip("Cannot locate bundled rubrics directory for this test")
+
+        dest_dir = tmp_path / "rubrics"
+        shutil.copytree(src_dir, dest_dir)
+        (dest_dir / "QLl.txt").unlink()
+
+        with pytest.raises(RuntimeError, match="QLl"):
+            RubricsCatalog(dest_dir)
+
+    def test_multiple_missing_rubrics_named_in_error(self, tmp_path):
+        from pathlib import Path
+
+        src_dir = Path(__file__).parent.parent.parent / "src/torch_measure/annotation/rubrics"
+        if not src_dir.exists():
+            pytest.skip("Cannot locate bundled rubrics directory for this test")
+
+        dest_dir = tmp_path / "rubrics"
+        shutil.copytree(src_dir, dest_dir)
+        (dest_dir / "AS.txt").unlink()
+        (dest_dir / "CL.txt").unlink()
+
+        with pytest.raises(RuntimeError) as exc_info:
+            RubricsCatalog(dest_dir)
+        msg = str(exc_info.value)
+        assert "AS" in msg
+        assert "CL" in msg
+
+    def test_custom_rubrics_dir_accepted(self, tmp_path):
+        from pathlib import Path
+
+        src_dir = Path(__file__).parent.parent.parent / "src/torch_measure/annotation/rubrics"
+        if not src_dir.exists():
+            pytest.skip("Cannot locate bundled rubrics directory for this test")
+
+        dest_dir = tmp_path / "rubrics"
+        shutil.copytree(src_dir, dest_dir)
+        catalog = RubricsCatalog(dest_dir)
+        assert len(catalog.all_demand_rubrics()) == 18
diff --git a/tests/test_models/test_demand_assessor.py b/tests/test_models/test_demand_assessor.py
new file mode 100644
index 00000000..27c49790
--- /dev/null
+++ b/tests/test_models/test_demand_assessor.py
@@ -0,0 +1,389 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+import pandas as pd
+import pytest
+import torch
+
+from torch_measure.datasets._long_form import LongFormData
+from torch_measure.models import DemandAssessor
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_query(n_subjects: int, n_rows: int, item_feature_dim: int) -> dict:
+    """Build a minimal valid query for DemandAssessor."""
+    return {
+        "subject_idx": torch.randint(0, n_subjects, (n_rows,)),
+        "item_features": torch.randn(n_rows, item_feature_dim),
+    }
+
+
+def _synth_longform(
+    n_subjects: int = 10,
+    n_items: int = 15,
+    seed: int = 0,
+) -> LongFormData:
+    """Dense synthetic LongFormData with zero-padded IDs so sorted() == insertion order."""
+    torch.manual_seed(seed)
+    ability = torch.randn(n_subjects)
+    difficulty = torch.randn(n_items)
+    probs = torch.sigmoid(ability.unsqueeze(1) - difficulty.unsqueeze(0))
+    responses = torch.bernoulli(probs)
+
+    rows = []
+    for s in range(n_subjects):
+        for i in range(n_items):
+            rows.append(
+                {
+                    "subject_id": f"s{s:02d}",
+                    "item_id": f"i{i:02d}",
+                    "benchmark_id": "synthetic",
+                    "trial": 0,
+                    "test_condition": None,
+                    "response": float(responses[s, i].item()),
+                    "correct_answer": None,
+                    "trace": None,
+                }
+            )
+    df = pd.DataFrame(rows)
+    items = pd.DataFrame(
+        [{"item_id": f"i{i:02d}", "benchmark_id": "synthetic"} for i in range(n_items)]
+    )
+    subjects = pd.DataFrame([{"subject_id": f"s{s:02d}"} for s in range(n_subjects)])
+    return LongFormData(
+        name="synthetic",
+        responses=df,
+        items=items,
+        subjects=subjects,
+        traces=None,
+        info={},
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+class TestDemandAssessor:
+
+    # --- construction -------------------------------------------------------
+
+    def test_init(self):
+        model = DemandAssessor(n_subjects=10, item_feature_dim=8)
+        assert model.n_subjects == 10
+        assert model.item_feature_dim == 8
+        assert model.subject_embedding_dim == 16  # default
+        assert model.n_items == 0
+        assert model.subject_embedding.weight.shape == (10, 16)
+
+    def test_expected_keys(self):
+        assert DemandAssessor.expected_keys == ("subject_idx", "item_features")
+
+    def test_n_items_zero_before_fit(self):
+        model = DemandAssessor(n_subjects=5, item_feature_dim=4)
+        assert model.n_items == 0
+
+    # --- predict() ----------------------------------------------------------
+
+    def test_predict_shape_before_fit(self):
+        """predict() is valid before fit() — parameters are randomly initialised."""
+        model = DemandAssessor(n_subjects=5, item_feature_dim=4)
+        query = _make_query(5, 12, 4)
+        probs = model.predict(query)
+        assert probs.shape == (12,)
+        assert (probs >= 0).all()
+        assert (probs <= 1).all()
+
+    def test_predict_validates_feature_dim(self):
+        """predict() must raise ValueError when item_features has the wrong width."""
+        model = DemandAssessor(n_subjects=5, item_feature_dim=8)
+        bad_query = {
+            "subject_idx": torch.zeros(3, dtype=torch.long),
+            "item_features": torch.randn(3, 4),  # wrong dim
+        }
+        # pytest.raises reports a missing exception as a normal test failure and
+        # matches the style of the fit() validation tests in this class.
+        with pytest.raises(ValueError):
+            model.predict(bad_query)
+
+    def test_predict_known_values(self):
+        """Verify forward computation with manually set weights (n_layers=1)."""
+        # n_layers=1: net.net = Sequential([Linear(E+F, 1)]) — no activation.
+        E, F = 2, 2
+        model = DemandAssessor(
+            n_subjects=2, item_feature_dim=F, subject_embedding_dim=E, n_layers=1
+        )
+        with torch.no_grad():
+            model.subject_embedding.weight.copy_(
+                torch.tensor([[1.0, 0.0], [0.0, 0.0]])
+            )
+            # net.net[0] is the sole Linear(E+F, 1); weight shape (1, 4)
+            model.net.net[0].weight.copy_(torch.tensor([[1.0, 0.0, 0.0, 0.0]]))
+            model.net.net[0].bias.zero_()
+
+        # Subject 0: e_s=[1,0], f=[0,0] → x=[1,0,0,0] → logit=1 → sigmoid(1)
+        # Subject 1: e_s=[0,0], f=[0,0] → x=[0,0,0,0] → logit=0 → sigmoid(0)=0.5
+        query = {
+            "subject_idx": torch.tensor([0, 1]),
+            "item_features": torch.zeros(2, F),
+        }
+        probs = model.predict(query)
+        expected = torch.sigmoid(torch.tensor([1.0, 0.0]))
+        assert torch.allclose(probs, expected, atol=1e-5)
+
+    def test_predict_subjects_differ(self, seed):
+        """Different subjects produce different probabilities for the same item."""
+        model = DemandAssessor(n_subjects=5, item_feature_dim=4)
+        f = torch.randn(1, 4)
+        p0 = model.predict({"subject_idx": torch.tensor([0]), "item_features": f})
+        p1 = model.predict({"subject_idx": torch.tensor([1]), "item_features": f})
+        assert not torch.allclose(p0, p1)
+
+    def test_forward_equals_predict(self):
+        model = DemandAssessor(n_subjects=5, item_feature_dim=4)
+        query = _make_query(5, 10, 4)
+        assert torch.allclose(model(query), model.predict(query))
+
+    # --- fit() — basic ------------------------------------------------------
+
+    def test_fit_reduces_loss(self, small_response_matrix):
+        n_subjects, n_items = small_response_matrix.shape
+        torch.manual_seed(0)
+        features = torch.randn(n_items, 8)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=8)
+        history = model.fit(
+            small_response_matrix, features, max_epochs=100, verbose=False
+        )
+        assert len(history["losses"]) > 0
+        assert history["losses"][-1] < history["losses"][0]
+
+    def test_fit_updates_n_items(self, small_response_matrix):
+        n_subjects, n_items = small_response_matrix.shape
+        features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        model.fit(small_response_matrix, features, max_epochs=5, verbose=False)
+        assert model.n_items == n_items
+
+    def test_fit_refit_updates_n_items(self, small_response_matrix):
+        """Re-fitting with a different item count correctly updates n_items."""
+        n_subjects, n_items = small_response_matrix.shape
+        features_full = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        model.fit(small_response_matrix, features_full, max_epochs=5, verbose=False)
+        assert model.n_items == n_items  # fixture-derived, not a hard-coded 30
+
+        # Re-fit on the first half of the items (assumes the fixture has >= 2 items)
+        n_small = n_items // 2
+        small_matrix = small_response_matrix[:, :n_small]
+        model.fit(small_matrix, features_full[:n_small], max_epochs=5, verbose=False)
+        assert model.n_items == n_small
+
+    # --- fit() — validation -------------------------------------------------
+
+    def test_fit_validates_feature_dim_wrong_width(self, small_response_matrix):
+        n_subjects, n_items = small_response_matrix.shape
+        bad_features = torch.randn(n_items, 99)  # wrong second dim
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=8)
+        with pytest.raises(ValueError):
+            model.fit(small_response_matrix, bad_features, verbose=False)
+
+    def test_fit_validates_feature_rank_1d(self, small_response_matrix):
+        """1-D item_features must raise ValueError, not IndexError."""
+        n_subjects, _ = small_response_matrix.shape
+        features_1d = torch.randn(8)  # 1-D, not 2-D
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=8)
+        with pytest.raises(ValueError):
+            model.fit(small_response_matrix, features_1d, verbose=False)
+
+    def test_fit_validates_empty_observations_all_nan(self):
+        """All-NaN response matrix must raise ValueError before training."""
+        n_subjects, n_items = 5, 10
+        nan_matrix = torch.full((n_subjects, n_items), float("nan"))
+        features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        with pytest.raises(ValueError, match="No observed responses"):
+            model.fit(nan_matrix, features, verbose=False)
+
+    def test_fit_validates_empty_observations_all_false_mask(self):
+        """All-False mask must raise ValueError before training."""
+        n_subjects, n_items = 5, 10
+        matrix = torch.rand(n_subjects, n_items)
+        mask = torch.zeros(n_subjects, n_items, dtype=torch.bool)
+        features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        with pytest.raises(ValueError, match="No observed responses"):
+            model.fit(matrix, features, mask=mask, verbose=False)
+
+    def test_fit_validates_insufficient_feature_rows(self):
+        """item_features with fewer rows than items referenced raises ValueError."""
+        n_subjects, n_items = 5, 10
+        matrix = torch.rand(n_subjects, n_items)
+        too_few = torch.randn(5, 4)  # only 5 rows, but data has 10 items
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        with pytest.raises(ValueError):
+            model.fit(matrix, too_few, verbose=False)
+
+    def test_fit_with_explicit_mask(self, small_response_matrix):
+        """Explicit boolean mask selects the correct subset of observations."""
+        n_subjects, n_items = small_response_matrix.shape
+        mask = torch.ones(n_subjects, n_items, dtype=torch.bool)
+        mask[:, n_items // 2 :] = False  # hide second half of items
+        features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        history = model.fit(
+            small_response_matrix, features, mask=mask, max_epochs=10, verbose=False
+        )
+        assert len(history["losses"]) > 0
+
+    def test_fit_with_nan_entries(self, response_matrix_with_nans):
+        """NaN entries in the response matrix are correctly excluded."""
+        n_subjects, n_items = response_matrix_with_nans.shape
+        features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        history = model.fit(
+            response_matrix_with_nans, features, max_epochs=10, verbose=False
+        )
+        assert len(history["losses"]) > 0
+
+    def test_fit_convergence_tol(self, small_response_matrix):
+        """convergence_tol stops training early when loss plateaus."""
+        n_subjects, n_items = small_response_matrix.shape
+        features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        history = model.fit(
+            small_response_matrix,
+            features,
+            max_epochs=10_000,
+            convergence_tol=1e-3,
+            verbose=False,
+        )
+        assert len(history["losses"]) < 10_000
+
+    # --- fit() — LongFormData -----------------------------------------------
+
+    def test_fit_accepts_longform_data(self):
+        """LongFormData input produces valid training history."""
+        data = _synth_longform(n_subjects=10, n_items=15)
+        # item_ids from to_fit_tensors() are sorted: "i00", "i01", ..., "i14"
+        # which matches insertion order for zero-padded IDs.
+        item_ids = data.to_fit_tensors()["item_ids"]
+        n_items = len(item_ids)
+        features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=10, item_feature_dim=4)
+        history = model.fit(data, features, max_epochs=10, verbose=False)
+        assert len(history["losses"]) > 0
+        assert model.n_items == n_items
+
+    def test_fit_longform_item_ordering_is_sorted(self):
+        """Verify to_fit_tensors() item ordering so callers can align features."""
+        data = _synth_longform(n_subjects=5, n_items=6)
+        item_ids = data.to_fit_tensors()["item_ids"]
+        # Zero-padded IDs: sorted() == ["i00", "i01", "i02", "i03", "i04", "i05"]
+        assert item_ids == sorted(item_ids), (
+            "to_fit_tensors() must return item_ids in sorted order; "
+            "item_features rows must be aligned accordingly."
+        )
+
+    def test_fit_longform_unsorted_item_ids(self):
+        """fit() maps item_idx via sorted() even when IDs appear out of order.
+
+        Constructs LongFormData where item IDs are inserted in non-alphabetical
+        order ("item_b", "item_a", "item_c") and verifies that to_fit_tensors()
+        returns them sorted ("item_a", "item_b", "item_c"). Features must be
+        supplied in that sorted order for correct training.
+        """
+        torch.manual_seed(0)
+        n_subjects = 4
+        insertion_order = ["item_b", "item_a", "item_c"]
+        responses = torch.bernoulli(torch.rand(n_subjects, 3))
+
+        rows = []
+        for s in range(n_subjects):
+            for j, item_id in enumerate(insertion_order):
+                rows.append(
+                    {
+                        "subject_id": f"s{s:02d}",
+                        "item_id": item_id,
+                        "benchmark_id": "test",
+                        "trial": 0,
+                        "test_condition": None,
+                        "response": float(responses[s, j].item()),
+                        "correct_answer": None,
+                        "trace": None,
+                    }
+                )
+        df = pd.DataFrame(rows)
+        items_df = pd.DataFrame(
+            [{"item_id": iid, "benchmark_id": "test"} for iid in insertion_order]
+        )
+        subjects_df = pd.DataFrame([{"subject_id": f"s{s:02d}"} for s in range(n_subjects)])
+        data = LongFormData(
+            name="test",
+            responses=df,
+            items=items_df,
+            subjects=subjects_df,
+            traces=None,
+            info={},
+        )
+
+        # to_fit_tensors() must sort: ["item_a", "item_b", "item_c"]
+        fit_inputs = data.to_fit_tensors()
+        assert fit_inputs["item_ids"] == ["item_a", "item_b", "item_c"]
+
+        # Features aligned to sorted order — fit must succeed
+        F = 4
+        features = torch.randn(3, F)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=F)
+        history = model.fit(data, features, max_epochs=5, verbose=False)
+        assert len(history["losses"]) > 0
+
+    # --- predict() — generalisation -----------------------------------------
+
+    def test_predict_unseen_items(self, small_response_matrix):
+        """After fitting, predict on feature vectors not present in training data."""
+        n_subjects, n_items = small_response_matrix.shape
+        torch.manual_seed(0)
+        training_features = torch.randn(n_items, 4)
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        model.fit(
+            small_response_matrix, training_features, max_epochs=10, verbose=False
+        )
+
+        # New items: feature vectors never seen during training
+        new_features = torch.randn(5, 4)
+        query = {
+            "subject_idx": torch.zeros(5, dtype=torch.long),
+            "item_features": new_features,
+        }
+        probs = model.predict(query)
+        assert probs.shape == (5,)
+        assert (probs >= 0).all()
+        assert (probs <= 1).all()
+
+    # --- serialisation ------------------------------------------------------
+
+    def test_serialization(self, small_response_matrix):
+        """load_state_dict() fully restores the model; no re-supply step needed."""
+        n_subjects, n_items = small_response_matrix.shape
+        torch.manual_seed(0)
+        features = torch.randn(n_items, 4)
+
+        model = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        model.fit(small_response_matrix, features, max_epochs=10, verbose=False)
+
+        query = {
+            "subject_idx": torch.tensor([0, 1, 2]),
+            "item_features": features[:3],
+        }
+        probs_before = model.predict(query).detach()
+
+        # Save and restore — no set_embeddings() or set_item_features() call.
+        state = model.state_dict()
+        model2 = DemandAssessor(n_subjects=n_subjects, item_feature_dim=4)
+        model2.load_state_dict(state)
+
+        probs_after = model2.predict(query).detach()
+        assert torch.allclose(probs_before, probs_after)
+        assert model2.n_items == n_items  # persisted via n_items_buf

From fbad1b7c8b2a445d9863de0520b3e25bd57512e7 Mon Sep 17 00:00:00 2001
From: AneeshD04 <aneesh72583@gmail.com>
Date: Fri, 12 Jun 2026 14:23:45 -0700
Subject: [PATCH 2/4] Add Sphinx docs for DemandAssessor and annotation module

---
 docs/source/api/annotation.rst | 34 ++++++++++++++++++++++++++++++++++
 docs/source/api/models.rst     |  7 +++++++
 docs/source/index.rst          |  1 +
 3 files changed, 42 insertions(+)
 create mode 100644 docs/source/api/annotation.rst

diff --git a/docs/source/api/annotation.rst b/docs/source/api/annotation.rst
new file mode 100644
index 00000000..c757373a
--- /dev/null
+++ b/docs/source/api/annotation.rst
@@ -0,0 +1,34 @@
+Annotation
+==========
+
+Tools for annotating benchmark items with demand vectors using the
+18-dimension ADeLe rubric system from the Nature 2026 paper.
+
+.. automodule:: torch_measure.annotation
+   :members:
+
+Core Classes
+------------
+
+.. autoclass:: torch_measure.annotation.DemandAnnotator
+   :members:
+   :undoc-members:
+
+.. autoclass:: torch_measure.annotation.GeminiClient
+   :members:
+   :undoc-members:
+
+.. autoclass:: torch_measure.annotation.RubricsCatalog
+   :members:
+   :undoc-members:
+
+.. autoclass:: torch_measure.annotation.AnnotationCache
+   :members:
+   :undoc-members:
+
+Data Types
+----------
+
+.. autoclass:: torch_measure.annotation.AnnotationJob
+   :members:
+   :undoc-members:
diff --git a/docs/source/api/models.rst b/docs/source/api/models.rst
index 65ecf01c..2f74f647 100644
--- a/docs/source/api/models.rst
+++ b/docs/source/api/models.rst
@@ -53,6 +53,13 @@ Factor Models
    :members:
    :undoc-members:
 
+Demand-Based Models
+-------------------
+
+.. autoclass:: torch_measure.models.DemandAssessor
+   :members:
+   :undoc-members:
+
 Rotation Utilities
 ------------------
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 06d47403..f86b4fb1 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -16,6 +16,7 @@ Computerized Adaptive Testing, psychometric metrics, and factor models.
    :caption: API Reference
 
    api/models
+   api/annotation
    api/cat
    api/metrics
    api/data

From e011be256a3eca055668f0dc5a15769034bdcb75 Mon Sep 17 00:00:00 2001
From: AneeshD04 <aneesh72583@gmail.com>
Date: Fri, 12 Jun 2026 14:36:33 -0700
Subject: [PATCH 3/4] Note that DemandAnnotator accepts any generate() client,
 not just Gemini

---
 docs/source/api/annotation.rst             | 5 +++++
 src/torch_measure/annotation/__init__.py   | 6 ++++++
 src/torch_measure/annotation/_annotator.py | 5 +++++
 3 files changed, 16 insertions(+)

diff --git a/docs/source/api/annotation.rst b/docs/source/api/annotation.rst
index c757373a..07f2aae7 100644
--- a/docs/source/api/annotation.rst
+++ b/docs/source/api/annotation.rst
@@ -4,6 +4,11 @@ Annotation
 Tools for annotating benchmark items with demand vectors using the
 18-dimension ADeLe rubric system from the Nature 2026 paper.
 
+:class:`DemandAnnotator` accepts any client that implements
+``generate(prompt: str) -> tuple[str, str]`` (response text, finish reason),
+so any LLM provider (OpenAI, Anthropic, Azure, etc.) can be used in place of
+:class:`GeminiClient` by wrapping it in a class with that single method.
+
 .. automodule:: torch_measure.annotation
    :members:
 
diff --git a/src/torch_measure/annotation/__init__.py b/src/torch_measure/annotation/__init__.py
index ea47547a..2fc96171 100644
--- a/src/torch_measure/annotation/__init__.py
+++ b/src/torch_measure/annotation/__init__.py
@@ -4,6 +4,12 @@
   Zhou et al. (2026) "General scales unlock AI evaluation with explanatory
   and predictive power." Nature.
 
+``DemandAnnotator`` accepts any client that implements
+``generate(prompt: str) -> tuple[str, str]`` (response text, finish reason).
+``GeminiClient`` is the bundled implementation, but any LLM provider
+(OpenAI, Anthropic, Azure, etc.) can be used by wrapping it in a class with
+that single method.
+
 Public API
 ----------
 DemandAnnotator   — main entry point: annotates one item or a full dataset
diff --git a/src/torch_measure/annotation/_annotator.py b/src/torch_measure/annotation/_annotator.py
index b97b45bb..bb166765 100644
--- a/src/torch_measure/annotation/_annotator.py
+++ b/src/torch_measure/annotation/_annotator.py
@@ -24,6 +24,11 @@ class DemandAnnotator:
 
     One API call per demand rubric (18 sequential calls) plus one UG call.
     Results are cached to avoid redundant API calls across runs.
+
+    ``client`` can be any object implementing
+    ``generate(prompt: str) -> tuple[str, str]``. The bundled
+    :class:`GeminiClient` is the default, but any LLM provider can be used
+    by wrapping it in a class with that single method.
     """
 
     def __init__(

From 0a2649045cdcceb35bd5d65d7b99e06ea0367449 Mon Sep 17 00:00:00 2001
From: AneeshD04 <aneesh72583@gmail.com>
Date: Mon, 15 Jun 2026 21:49:05 -0700
Subject: [PATCH 4/4] Add OpenAI and Claude clients; make all tests and
 annotator fully client-agnostic (use environment vars to set client and key)

---
 .gitignore                                    |   4 +-
 src/torch_measure/annotation/__init__.py      |   4 +
 .../annotation/_claude_client.py              |  96 +++++
 .../annotation/_openai_client.py              |  88 +++++
 tests/test_annotation/compare_annotations.py  | 291 ++++++++++++++
 tests/test_annotation/test_live.py            |  60 ++-
 .../test_annotation/test_paper_comparison.py  | 364 ++++++++++++++++++
 7 files changed, 894 insertions(+), 13 deletions(-)
 create mode 100644 src/torch_measure/annotation/_claude_client.py
 create mode 100644 src/torch_measure/annotation/_openai_client.py
 create mode 100644 tests/test_annotation/compare_annotations.py
 create mode 100644 tests/test_annotation/test_paper_comparison.py

diff --git a/.gitignore b/.gitignore
index fed9b249..be50abf7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
-# Test annotation cache (API responses — do not commit)
+# Test annotation caches (API responses — do not commit)
 tests/test_annotation/paper_comparison_cache.jsonl
+tests/test_annotation/*_comparison_cache.jsonl
+tests/test_annotation/annotation_comparison.png
 
 # Python
 __pycache__/
diff --git a/src/torch_measure/annotation/__init__.py b/src/torch_measure/annotation/__init__.py
index 2fc96171..723b2c0c 100644
--- a/src/torch_measure/annotation/__init__.py
+++ b/src/torch_measure/annotation/__init__.py
@@ -33,7 +33,9 @@
 """
 from ._annotator import DemandAnnotator
 from ._cache import AnnotationCache
+from ._claude_client import ClaudeClient
 from ._client import GeminiClient
+from ._openai_client import OpenAIClient
 from ._rubrics import RubricsCatalog
 from ._types import (
     DEMAND_DIMENSIONS,
@@ -51,7 +53,9 @@
 
 __all__ = [
     "DemandAnnotator",
+    "ClaudeClient",
     "GeminiClient",
+    "OpenAIClient",
     "RubricsCatalog",
     "AnnotationCache",
     "UGAnnotator",
diff --git a/src/torch_measure/annotation/_claude_client.py b/src/torch_measure/annotation/_claude_client.py
new file mode 100644
index 00000000..d8aba6a2
--- /dev/null
+++ b/src/torch_measure/annotation/_claude_client.py
@@ -0,0 +1,96 @@
+"""Anthropic Claude API client — the only file that imports anthropic."""
+from __future__ import annotations
+
+import time
+
+_FINISH_REASON_MAP: dict[str, str] = {
+    "end_turn": "stop",
+    "max_tokens": "length",
+}
+
+
+def _is_retryable(exc: BaseException) -> bool:
+    """Return True when *exc* is a transient API or transport error worth retrying."""
+    try:
+        import anthropic
+        if isinstance(exc, (anthropic.RateLimitError, anthropic.APIConnectionError)):
+            return True
+        # 529 = overloaded; 500/502/503 are retried too, matching the OpenAI client.
+        if isinstance(exc, anthropic.APIStatusError) and exc.status_code in (500, 502, 503, 529):
+            return True
+    except ImportError:
+        pass
+    try:
+        import httpx
+        if isinstance(exc, (httpx.RemoteProtocolError, httpx.ConnectError, httpx.ReadError)):
+            return True
+    except ImportError:
+        pass
+    return False
+
+
+class ClaudeClient:
+    """Anthropic Claude API wrapper matching GeminiClient's generate() interface.
+
+    Parameters
+    ----------
+    api_key:
+        Anthropic API key.
+    model:
+        Pinned model string, e.g. "claude-opus-4-8". No default — caller
+        must supply the exact version to guarantee reproducibility.
+    rpm:
+        Optional rate limit (requests per minute). Values <= 0 disable throttling.
+    """
+
+    _MAX_TOKENS = 4096  # sent as max_tokens with every request
+    # Thinking is disabled: get_full_instruction() already requests text-based
+    # chain-of-thought in the output, matching the paper's GPT-4o methodology.
+    # Enabling thinking would add a second internal reasoning pass on top of the
+    # in-output CoT, inflating cost and diverging from the paper's approach.
+    _THINKING: dict = {"type": "disabled"}
+
+    def __init__(self, api_key: str, model: str, rpm: int = 0) -> None:
+        import anthropic  # imported at construction time, not at module import
+        from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
+
+        self._client = anthropic.Anthropic(api_key=api_key)
+        self.model = model
+        self._min_interval = (60.0 / rpm) if rpm > 0 else 0.0  # seconds between calls; 0.0 = no throttle
+        self._last_call_time: float = 0.0  # time.monotonic() stamp of the most recent call
+
+        self._generate_with_retry = retry(
+            retry=retry_if_exception(_is_retryable),
+            wait=wait_exponential(min=2, max=256),
+            stop=stop_after_attempt(10),
+            reraise=True,  # surface the last real exception rather than tenacity's RetryError
+        )(self._call_api)  # the throttled call is what gets retried, so every attempt is rate-limited
+
+    def generate(self, prompt: str) -> tuple[str, str]:
+        """Call the API and return (response_text, finish_reason).
+
+        Makes up to 10 attempts on transient errors with exponential backoff
+        (min 2 s, max 256 s), matching the paper's tenacity settings.
+        """
+        return self._generate_with_retry(prompt)
+
+    def _call_api(self, prompt: str) -> tuple[str, str]:  # one throttled request, no retry logic here
+        if self._min_interval > 0:
+            elapsed = time.monotonic() - self._last_call_time
+            if elapsed < self._min_interval:
+                time.sleep(self._min_interval - elapsed)  # wait out the rest of the interval
+        self._last_call_time = time.monotonic()  # stamped before the request is sent
+
+        response = self._client.messages.create(
+            model=self.model,
+            max_tokens=self._MAX_TOKENS,
+            thinking=self._THINKING,
+            messages=[{"role": "user", "content": prompt}],
+        )
+
+        text = "".join(  # keep text blocks only; any other block type is ignored
+            block.text for block in response.content if block.type == "text"
+        )
+        raw_reason = response.stop_reason or "other"
+        finish_reason = _FINISH_REASON_MAP.get(raw_reason, "other")  # unmapped reasons become "other"
+        return text, finish_reason
diff --git a/src/torch_measure/annotation/_openai_client.py b/src/torch_measure/annotation/_openai_client.py
new file mode 100644
index 00000000..65103c90
--- /dev/null
+++ b/src/torch_measure/annotation/_openai_client.py
@@ -0,0 +1,88 @@
+"""OpenAI API client — the only file that imports openai."""
+from __future__ import annotations
+
+import time
+
+_FINISH_REASON_MAP: dict[str, str] = {
+    "stop": "stop",
+    "length": "length",
+}
+
+
+def _is_retryable(exc: BaseException) -> bool:
+    """Tell tenacity whether *exc* is a transient failure that merits another attempt."""
+    try:
+        import openai
+    except ImportError:
+        pass  # SDK not installed: fall through to the transport-level check
+    else:
+        if isinstance(exc, (openai.RateLimitError, openai.APIConnectionError)):
+            return True
+        transient_status = (429, 500, 502, 503, 529)
+        if isinstance(exc, openai.APIStatusError) and exc.status_code in transient_status:
+            return True
+    try:
+        import httpx
+    except ImportError:
+        return False
+    # Low-level connection failures raised directly by httpx.
+    return isinstance(exc, (httpx.RemoteProtocolError, httpx.ConnectError, httpx.ReadError))
+
+
+class OpenAIClient:
+    """OpenAI API wrapper matching GeminiClient's generate() interface.
+
+    Parameters
+    ----------
+    api_key:
+        OpenAI API key.
+    model:
+        Pinned model string, e.g. "gpt-4o". No default — caller must supply
+        the exact version to guarantee reproducibility.
+    rpm:
+        Optional rate limit (requests per minute). Values <= 0 disable throttling.
+    """
+
+    _MAX_TOKENS = 4096  # sent as max_tokens with every request
+
+    def __init__(self, api_key: str, model: str, rpm: int = 0) -> None:
+        import openai  # imported at construction time, not at module import
+        from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
+
+        self._client = openai.OpenAI(api_key=api_key)
+        self.model = model
+        self._min_interval = (60.0 / rpm) if rpm > 0 else 0.0  # seconds between calls; 0.0 = no throttle
+        self._last_call_time: float = 0.0  # time.monotonic() stamp of the most recent call
+
+        self._generate_with_retry = retry(
+            retry=retry_if_exception(_is_retryable),
+            wait=wait_exponential(min=2, max=256),
+            stop=stop_after_attempt(10),
+            reraise=True,  # surface the last real exception rather than tenacity's RetryError
+        )(self._call_api)  # the throttled call is what gets retried, so every attempt is rate-limited
+
+    def generate(self, prompt: str) -> tuple[str, str]:
+        """Call the API and return (response_text, finish_reason).
+
+        Makes up to 10 attempts on transient errors with exponential backoff
+        (min 2 s, max 256 s), matching the paper's tenacity settings.
+        """
+        return self._generate_with_retry(prompt)
+
+    def _call_api(self, prompt: str) -> tuple[str, str]:  # one throttled request, no retry logic here
+        if self._min_interval > 0:
+            elapsed = time.monotonic() - self._last_call_time
+            if elapsed < self._min_interval:
+                time.sleep(self._min_interval - elapsed)  # wait out the rest of the interval
+        self._last_call_time = time.monotonic()  # stamped before the request is sent
+
+        response = self._client.chat.completions.create(
+            model=self.model,
+            max_tokens=self._MAX_TOKENS,  # NOTE(review): OpenAI documents max_tokens as deprecated in favour of max_completion_tokens — confirm for the models in use
+            messages=[{"role": "user", "content": prompt}],
+        )
+
+        text = response.choices[0].message.content or ""  # a None content is normalised to ""
+        raw_reason = response.choices[0].finish_reason or "other"
+        finish_reason = _FINISH_REASON_MAP.get(raw_reason, "other")  # unmapped reasons become "other"
+        return text, finish_reason
diff --git a/tests/test_annotation/compare_annotations.py b/tests/test_annotation/compare_annotations.py
new file mode 100644
index 00000000..811fb007
--- /dev/null
+++ b/tests/test_annotation/compare_annotations.py
@@ -0,0 +1,291 @@
+"""Annotation comparison: all cached annotators vs Paper (GPT-4o).
+
+Auto-discovers every *_comparison_cache.jsonl file in this directory and plots
+each annotator's agreement with the paper's reference scores. Add a new model's
+cache file and it appears in the figure automatically.
+
+Run from the torch_measure directory:
+    python tests/test_annotation/compare_annotations.py
+"""
+import csv
+import json
+import math
+from pathlib import Path
+from statistics import mean
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+HERE      = Path(__file__).parent
+REPO_ROOT = HERE.parent.parent.parent  # AIMS_local/
+PAPER_CSV = REPO_ROOT / "ADeLe-AIEvaluation" / "ADeLe_battery_data" / "ADeLe_batterry_v1dot0.csv"
+
+DIMENSION_ORDER = (
+    "AS","CEc","CEe","CL","MCr","MCt","MCu","MS",
+    "QLl","QLq","SNs","KNa","KNc","KNf","KNn","KNs","AT","VO","UG",
+)
+DEMAND_DIMS = DIMENSION_ORDER[:18]
+
+ITEM_IDS = [
+    "ChemLLMBench-molecule_captioning-522",
+    "ChemLLMBench-name_prediction-278",
+    "ChemLLMBench-reaction_prediction-40",
+    "ChemLLMBench-retrosynthesis-926",
+    "Civil Service Examination-LogiQA-en-458",
+    "Date Arithmetic-Date Arithmetic-52",
+    "LSAT-LSAT-AR-197",
+    "MCTACO-MCTACO-313",
+    "MMLU-Pro-economics-474",
+    "MMLU-Pro-history-360",
+    "MMLU-Pro-physics-330",
+    "MedCalcBench-physical-50",
+    "OmniMath-Algebra-537",
+    "SciBench-Chemistry-126",
+    "TimeQA-TimeQA-implicit-81",
+]
+
+ITEM_SHORT = [
+    "Chem\nmol","Chem\nname","Chem\nrxn","Chem\nretro",
+    "LogiQA","DateArith","LSAT","MCTACO",
+    "MMLU\neco","MMLU\nhist","MMLU\nphys",
+    "MedCalc","OmniMath","SciBench","TimeQA",
+]
+
+# Colour palette — extended to support many annotators
+_PALETTE = [
+    "#4285F4",  # blue
+    "#D97706",  # amber
+    "#16A34A",  # green
+    "#DC2626",  # red
+    "#7C3AED",  # purple
+    "#0891B2",  # cyan
+    "#EA580C",  # orange
+    "#BE185D",  # pink
+]
+
+
+# ---------------------------------------------------------------------------
+# Data loading
+# ---------------------------------------------------------------------------
+
+def _load_cache(path: Path) -> tuple[dict, str]:
+    """JSONL cache → ({item_id: {dim: level}}, first model_id found, else the file stem)."""
+    data: dict = {}
+    model_id = None
+    with open(path, encoding="utf-8") as f:
+        # Blank lines (e.g. left by a manual edit) would make json.loads raise, so skip them.
+        for line in filter(str.strip, f):
+            e = json.loads(line)
+            model_id = model_id or e.get("model_id")
+            data.setdefault(e["item_id"], {})[e["demand"]] = e["level"]
+    return data, model_id or path.stem
+
+
+def _load_paper(csv_path: Path) -> dict:
+    """Read the paper CSV and keep the ITEM_IDS rows as {instance_id: {dim: score}}."""
+    wanted = frozenset(ITEM_IDS)
+    with open(csv_path, newline="", encoding="utf-8") as handle:
+        return {
+            rec["instance_id"]: {dim: float(rec[dim]) for dim in DIMENSION_ORDER}
+            for rec in csv.DictReader(handle)
+            if rec["instance_id"] in wanted
+        }
+
+
+def _discover_caches() -> list[tuple[Path, str]]:
+    """Return (path, model_id) for every cache file in this directory, oldest first.
+
+    ``*_comparison_cache.jsonl`` already matches ``paper_comparison_cache.jsonl``,
+    so one glob finds every cache exactly once and no de-duplication is needed.
+    "Oldest first" means ascending file modification time.
+    """
+    paths = sorted(
+        HERE.glob("*_comparison_cache.jsonl"),
+        key=lambda p: p.stat().st_mtime,
+    )
+    # Each file is parsed here only for its model_id label; a caller that
+    # needs the scores loads the cache again itself.
+    labelled = [(p, _load_cache(p)[1]) for p in paths]
+    return labelled
+
+
+# ---------------------------------------------------------------------------
+# Statistics helpers
+# ---------------------------------------------------------------------------
+
+def _ok(v) -> bool:
+    return not (v is None or math.isnan(float(v)))  # usable score: present and not NaN
+
+
+def _valid_pairs(model: dict, paper: dict, dims) -> list[tuple[float, float]]:
+    """Collect (paper, model) score pairs over ITEM_IDS × dims where both scores are usable."""
+    nan = float("nan")
+    candidates = (
+        (paper.get(iid, {}).get(dim, nan), model.get(iid, {}).get(dim, nan))
+        for iid in ITEM_IDS
+        for dim in dims
+    )
+    return [(float(p), float(m)) for p, m in candidates if _ok(m) and _ok(p)]
+
+
+def _per_dim_stat(model: dict, paper: dict, dims, fn) -> dict:
+    """Map each dim to the mean of fn(paper, model) over ITEM_IDS; NaN when no usable pair."""
+    nan = float("nan")
+    stats = {}
+    for dim in dims:
+        # (model, paper) score for every item; absent entries default to NaN.
+        scores = ((model.get(i, {}).get(dim, nan), paper.get(i, {}).get(dim, nan)) for i in ITEM_IDS)
+        vals = [fn(float(p), float(m)) for m, p in scores if _ok(m) and _ok(p)]
+        # mean() rejects an empty list, so a dim without usable pairs maps to NaN.
+        stats[dim] = mean(vals) if vals else nan
+    return stats
+
+
+def _per_item_mad(model: dict, paper: dict, dims) -> list[float]:
+    """Per-item mean |model − paper| over dims, in ITEM_IDS order; NaN when no usable pair."""
+    nan = float("nan")
+
+    def item_mad(iid) -> float:
+        ours, ref = model.get(iid, {}), paper.get(iid, {})
+        scores = ((ours.get(dim, nan), ref.get(dim, nan)) for dim in dims)
+        gaps = [abs(float(m) - float(p)) for m, p in scores if _ok(m) and _ok(p)]
+        return mean(gaps) if gaps else nan
+
+    return [item_mad(iid) for iid in ITEM_IDS]
+
+
+def _overall(pairs: list[tuple[float, float]]) -> dict:
+    """Summarise (paper, model) pairs; every metric is NaN when *pairs* is empty."""
+    diffs = [m - p for p, m in pairs]
+    n = len(diffs)
+    if n == 0:  # mean([]) raises StatisticsError and "/ n" would divide by zero
+        return {"MAE": math.nan, "Bias": math.nan, "Exact%": math.nan, "±1%": math.nan, "n": 0}
+    return {
+        "MAE": mean(abs(d) for d in diffs), "Bias": mean(diffs),
+        "Exact%": 100 * sum(d == 0 for d in diffs) / n,
+        "±1%": 100 * sum(abs(d) <= 1 for d in diffs) / n, "n": n,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """Plot each discovered annotator cache against the paper scores and save the figure."""
+    caches = _discover_caches()
+    if not caches:
+        print("No *_comparison_cache.jsonl files found.")
+        return
+    paper = _load_paper(PAPER_CSV)
+
+    annotators = []
+    for path, model_id in caches:
+        data, _ = _load_cache(path)
+        pairs    = _valid_pairs(data, paper, DEMAND_DIMS)
+        if not pairs:  # no comparable scores: nothing to summarise or plot for this annotator
+            print(f"Skipping {model_id}: no scores overlap with the paper items.")
+            continue
+        annotators.append({
+            "label":    model_id,
+            "pairs":    pairs,
+            "stats":    _overall(pairs),
+            "dim_mae":  _per_dim_stat(data, paper, DEMAND_DIMS, lambda p, m: abs(m - p)),
+            "dim_bias": _per_dim_stat(data, paper, DEMAND_DIMS, lambda p, m: m - p),
+            "item_mad": _per_item_mad(data, paper, DEMAND_DIMS),
+        })
+    if not annotators:  # bar_w below divides by the number of annotators
+        print("No cache has scores comparable with the paper; nothing to plot.")
+        return
+
+    n_ann = len(annotators)
+    colors = [_PALETTE[i % len(_PALETTE)] for i in range(n_ann)]
+    bar_w  = min(0.7 / n_ann, 0.25)
+    offsets = np.linspace(-(n_ann - 1) / 2 * bar_w, (n_ann - 1) / 2 * bar_w, n_ann)
+    x18    = np.arange(len(DEMAND_DIMS))
+    x15    = np.arange(len(ITEM_IDS))
+    rng    = np.random.default_rng(42)  # fixed seed keeps the scatter jitter reproducible
+
+    fig = plt.figure(figsize=(22, 15))
+    fig.suptitle(f"Annotator Comparison vs Paper (GPT-4o)  —  {len(ITEM_IDS)} items",
+                 fontsize=15, fontweight="bold", y=0.99)
+
+    # --- Panel 1: Per-dimension MAE ---
+    ax1 = fig.add_subplot(2, 2, 1)
+    for ann, col, off in zip(annotators, colors, offsets):
+        vals = [ann["dim_mae"].get(d, float("nan")) for d in DEMAND_DIMS]
+        ax1.bar(x18 + off, vals, bar_w, label=ann["label"], color=col, alpha=0.85)
+        ax1.axhline(ann["stats"]["MAE"], color=col, linestyle="--", linewidth=0.9, alpha=0.55)
+    ax1.set_xticks(x18)
+    ax1.set_xticklabels(DEMAND_DIMS, rotation=45, ha="right", fontsize=8)
+    ax1.set_ylabel("MAE vs Paper (0–5 scale)")
+    ax1.set_title("Per-Dimension MAE vs Paper")
+    ax1.legend(fontsize=7.5)
+    ax1.set_ylim(0, 2.8)
+    ax1.grid(axis="y", alpha=0.3)
+
+    # --- Panel 2: Score scatter ---
+    ax2 = fig.add_subplot(2, 2, 2)
+    jit = 0.06
+    for ann, col in zip(annotators, colors):
+        pts = np.array(ann["pairs"])
+        ax2.scatter(
+            pts[:, 0] + rng.uniform(-jit, jit, len(pts)),
+            pts[:, 1] + rng.uniform(-jit, jit, len(pts)),
+            alpha=0.18, s=12, color=col, label=ann["label"],
+        )
+    ax2.plot([0, 5], [0, 5], "k--", alpha=0.4, linewidth=1.2, label="Perfect agreement")
+    ax2.set_xlabel("Paper Score (GPT-4o)")
+    ax2.set_ylabel("Annotator Score")
+    ax2.set_title(f"Score Correlation vs Paper  ({len(ITEM_IDS)}×{len(DEMAND_DIMS)} pairs)")
+    ax2.legend(fontsize=7.5)
+    ax2.set_xlim(-0.4, 5.4); ax2.set_ylim(-0.4, 5.4)
+    ax2.set_xticks(range(6)); ax2.set_yticks(range(6))
+    ax2.grid(alpha=0.25)
+
+    # --- Panel 3: Per-item MAD ---
+    ax3 = fig.add_subplot(2, 2, 3)
+    for ann, col, off in zip(annotators, colors, offsets):
+        ax3.bar(x15 + off, ann["item_mad"], bar_w, label=ann["label"], color=col, alpha=0.85)
+    ax3.set_xticks(x15)
+    ax3.set_xticklabels(ITEM_SHORT, fontsize=7.5)
+    ax3.set_ylabel("MAD vs Paper (demand dims)")
+    ax3.set_title("Per-Item Divergence from Paper")
+    ax3.legend(fontsize=7.5)
+    ax3.grid(axis="y", alpha=0.3)
+
+    # --- Panel 4: Bias per dimension ---
+    ax4 = fig.add_subplot(2, 2, 4)
+    for ann, col, off in zip(annotators, colors, offsets):
+        vals = [ann["dim_bias"].get(d, float("nan")) for d in DEMAND_DIMS]
+        ax4.bar(x18 + off, vals, bar_w, label=ann["label"], color=col, alpha=0.85)
+        ax4.axhline(ann["stats"]["Bias"], color=col, linestyle="--", linewidth=0.9, alpha=0.55)
+    ax4.axhline(0, color="black", linewidth=0.9)
+    ax4.set_xticks(x18)
+    ax4.set_xticklabels(DEMAND_DIMS, rotation=45, ha="right", fontsize=8)
+    ax4.set_ylabel("Mean (annotator − paper)")
+    ax4.set_title("Systematic Bias per Dimension\n(negative = annotator scores lower than paper)")
+    ax4.legend(fontsize=7.5)
+    ax4.grid(axis="y", alpha=0.3)
+
+    # --- Summary table ---
+    header = f"{'Metric':<14}" + "".join(f"{a['label']:>20}" for a in annotators)
+    rows = []
+    for key, fmt in [("MAE", ".3f"), ("Bias", "+.3f"), ("Exact%", ".1f"), ("±1%", ".1f"), ("n", "d")]:
+        row = f"{key:<14}" + "".join(f"{format(a['stats'][key], fmt):>20}" for a in annotators)
+        rows.append(row)
+    summary = "\n".join([header, "─" * (14 + 20 * n_ann)] + rows)
+    fig.text(0.5, 0.005, summary, ha="center", fontsize=8.5, fontfamily="monospace",
+             bbox=dict(boxstyle="round,pad=0.5", facecolor="#FFF9C4", alpha=0.9))
+
+    plt.tight_layout(rect=[0, 0.10 + 0.015 * n_ann, 1, 0.97])
+
+    out = HERE / "annotation_comparison.png"
+    plt.savefig(out, dpi=150, bbox_inches="tight")
+    print(f"\nAnnotators plotted: {[a['label'] for a in annotators]}")
+    print(f"Saved -> {out}")
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_annotation/test_live.py b/tests/test_annotation/test_live.py
index 57079ac5..ec5161a1 100644
--- a/tests/test_annotation/test_live.py
+++ b/tests/test_annotation/test_live.py
@@ -1,15 +1,25 @@
 # Copyright (c) 2026 AIMS Foundations. MIT License.
 
-"""Live end-to-end tests — require a real Gemini API key.
+"""Live end-to-end tests — require a real API key.
 
 These tests make real API calls and consume quota. Run only when you want
 to verify actual model behavior.
 
-Usage:
-    GEMINI_API_KEY=<key> GEMINI_MODEL=gemini-3.1-flash-lite \
-        pytest tests/test_annotation/test_live.py -v -m "network and slow"
+Usage (Gemini):
+    $env:GEMINI_API_KEY = "<key>"
+    pytest tests/test_annotation/test_live.py -v -s -m "network and slow"
 
-Skip automatically if GEMINI_API_KEY is not set.
+Usage (Claude):
+    $env:ANNOTATOR_CLIENT = "claude"
+    $env:ANTHROPIC_API_KEY = "<key>"
+    pytest tests/test_annotation/test_live.py -v -s -m "network and slow"
+
+Usage (OpenAI):
+    $env:ANNOTATOR_CLIENT = "openai"
+    $env:OPENAI_API_KEY = "<key>"
+    pytest tests/test_annotation/test_live.py -v -s -m "network and slow"
+
+Skip automatically if the required API key is not set.
 """
 
 import math
@@ -20,8 +30,10 @@
 from torch_measure.annotation import (
     AnnotationCache,
     AnnotationJob,
+    ClaudeClient,
     DemandAnnotator,
     GeminiClient,
+    OpenAIClient,
     RubricsCatalog,
 )
 from torch_measure.annotation._types import DEMAND_DIMENSIONS
@@ -34,24 +46,48 @@
 # ---------------------------------------------------------------------------
 
 @pytest.fixture(scope="module")
-def api_key():
-    key = os.environ.get("GEMINI_API_KEY", "").strip()
-    if not key:
-        pytest.skip("GEMINI_API_KEY environment variable not set")
+def client_type():
+    return os.environ.get("ANNOTATOR_CLIENT", "gemini").strip().lower()
+
+
+@pytest.fixture(scope="module")
+def api_key(client_type):
+    if client_type == "claude":
+        key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
+        if not key:
+            pytest.skip("ANTHROPIC_API_KEY environment variable not set")
+    elif client_type == "openai":
+        key = os.environ.get("OPENAI_API_KEY", "").strip()
+        if not key:
+            pytest.skip("OPENAI_API_KEY environment variable not set")
+    else:
+        key = os.environ.get("GEMINI_API_KEY", "").strip()
+        if not key:
+            pytest.skip("GEMINI_API_KEY environment variable not set")
     return key
 
 
 @pytest.fixture(scope="module")
-def model_id():
+def model_id(client_type):
+    if client_type == "claude":
+        return os.environ.get("CLAUDE_MODEL", "claude-opus-4-8")
+    if client_type == "openai":
+        return os.environ.get("OPENAI_MODEL", "gpt-4o")
     return os.environ.get("GEMINI_MODEL", "gemini-3.1-flash-lite")
 
 
 @pytest.fixture(scope="module")
-def live_annotator(api_key, model_id, tmp_path_factory):
-    client = GeminiClient(api_key=api_key, model=model_id)
+def live_annotator(api_key, model_id, client_type, tmp_path_factory):
+    if client_type == "claude":
+        client = ClaudeClient(api_key=api_key, model=model_id)
+    elif client_type == "openai":
+        client = OpenAIClient(api_key=api_key, model=model_id)
+    else:
+        client = GeminiClient(api_key=api_key, model=model_id)
     rubrics = RubricsCatalog()
     cache_dir = tmp_path_factory.mktemp("live_annotation_cache")
     cache = AnnotationCache(cache_dir / "cache.jsonl")
+    print(f"\nClient: {client_type}  Model: {model_id}")
     return DemandAnnotator(client=client, rubrics=rubrics, cache=cache)
 
 
diff --git a/tests/test_annotation/test_paper_comparison.py b/tests/test_annotation/test_paper_comparison.py
new file mode 100644
index 00000000..da503031
--- /dev/null
+++ b/tests/test_annotation/test_paper_comparison.py
@@ -0,0 +1,364 @@
+# Copyright (c) 2026 AIMS Foundations. MIT License.
+
+"""Compare annotator output against the paper's annotations from ADeLe battery.
+
+Fetches 15 diverse items from the HuggingFace ADeLe battery dataset, runs the
+annotator on each, and reports score agreement + statistical analysis against
+the paper's reference annotations across all 19 dimensions.
+
+Items are sampled one per benchmark/task-type for diversity:
+    molecule_captioning, name_prediction, reaction_prediction, retrosynthesis,
+    LogiQA-en, Date Arithmetic, LSAT-AR, MCTACO, MMLU-Pro x3,
+    MedCalcBench, OmniMath, SciBench, TimeQA
+
+Requires:
+    HF_TOKEN            — HuggingFace token (dataset is gated)
+    ANNOTATOR_CLIENT    — 'gemini' (default), 'claude', or 'openai'
+    GEMINI_API_KEY      — Gemini API key      (when ANNOTATOR_CLIENT=gemini)
+    GEMINI_MODEL        — model string        (default: gemini-3.1-flash-lite)
+    ANTHROPIC_API_KEY   — Anthropic API key   (when ANNOTATOR_CLIENT=claude)
+    CLAUDE_MODEL        — model string        (default: claude-opus-4-8)
+    OPENAI_API_KEY      — OpenAI API key      (when ANNOTATOR_CLIENT=openai)
+    OPENAI_MODEL        — model string        (default: gpt-4o)
+
+Usage (Gemini — writes to paper_comparison_cache.jsonl):
+    $env:HF_TOKEN = "<token>"
+    $env:GEMINI_API_KEY = "<key>"
+    python -m pytest tests/test_annotation/test_paper_comparison.py -v -s -m "network and slow"
+
+Usage (Claude — writes to claude_opus_4_8_comparison_cache.jsonl):
+    $env:HF_TOKEN = "<token>"
+    $env:ANNOTATOR_CLIENT = "claude"
+    $env:ANTHROPIC_API_KEY = "<key>"
+    python -m pytest tests/test_annotation/test_paper_comparison.py -v -s -m "network and slow"
+
+Usage (OpenAI — writes to gpt_4o_comparison_cache.jsonl):
+    $env:HF_TOKEN = "<token>"
+    $env:ANNOTATOR_CLIENT = "openai"
+    $env:OPENAI_API_KEY = "<key>"
+    python -m pytest tests/test_annotation/test_paper_comparison.py -v -s -m "network and slow"
+
+Cost: 15 items x 19 calls = 285 API calls.
+"""
+
+import json
+import math
+import os
+import time
+import urllib.request
+from statistics import mean, stdev
+
+import pytest
+
+from torch_measure.annotation import (
+    AnnotationCache,
+    AnnotationJob,
+    ClaudeClient,
+    DemandAnnotator,
+    GeminiClient,
+    OpenAIClient,
+    RubricsCatalog,
+)
+from torch_measure.annotation._types import DEMAND_DIMENSIONS, DIMENSION_ORDER
+
+pytestmark = [pytest.mark.network, pytest.mark.slow]
+
+_HF_BASE = (
+    "https://datasets-server.huggingface.co/rows"
+    "?dataset=CFI-Kinds-of-Intelligence%2FADeLe_battery_v1dot0"
+    "&config=default&split=train&length=1&offset={offset}"
+)
+
+_OFFSETS = [0, 500, 1000, 1500, 2000, 2500, 3000, 4000, 6000, 7000, 9000, 10000, 11000, 13000, 15000]
+
+_N_ITEMS = len(_OFFSETS)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture(scope="module")
+def hf_token():
+    """HuggingFace token read from HF_TOKEN; skips the requesting tests when it is unset."""
+    if token := os.environ.get("HF_TOKEN", "").strip():
+        return token
+    pytest.skip("HF_TOKEN environment variable not set")
+
+
+@pytest.fixture(scope="module")
+def client_type():
+    return os.getenv("ANNOTATOR_CLIENT", "gemini").strip().lower()  # normalised client selector
+
+
+@pytest.fixture(scope="module")
+def api_key(client_type):
+    """API key for the selected client; skips when its environment variable is unset.
+
+    Any ``client_type`` other than "claude" or "openai" is treated as Gemini.
+    """
+    env_var = {
+        "claude": "ANTHROPIC_API_KEY",
+        "openai": "OPENAI_API_KEY",
+    }.get(client_type, "GEMINI_API_KEY")
+    # Surrounding whitespace is stripped, so a blank value counts as unset.
+    key = os.environ.get(env_var, "").strip()
+    if not key:
+        pytest.skip(f"{env_var} environment variable not set")
+    return key
+
+
+@pytest.fixture(scope="module")
+def model_id(client_type):
+    """Model string for the selected client: its *_MODEL env var, else a built-in default."""
+    choices = {"claude": ("CLAUDE_MODEL", "claude-opus-4-8"), "openai": ("OPENAI_MODEL", "gpt-4o")}
+    # Anything that is not "claude" or "openai" is treated as Gemini.
+    env_var, default = choices.get(client_type, ("GEMINI_MODEL", "gemini-3.1-flash-lite"))
+    return os.environ.get(env_var, default)
+
+
+@pytest.fixture(scope="module")
+def paper_rows(hf_token):
+    """Fetch one row per offset in ``_OFFSETS`` from the ADeLe battery dataset."""
+    rows = []
+    for offset in _OFFSETS:
+        url = _HF_BASE.format(offset=offset)
+        req = urllib.request.Request(url, headers={"Authorization": "Bearer " + hf_token})
+        with urllib.request.urlopen(req, timeout=60) as resp:  # close the connection; never hang forever
+            data = json.loads(resp.read())
+        rows.append(data["rows"][0]["row"])
+        if len(rows) < len(_OFFSETS):
+            time.sleep(0.5)  # short pause between requests; skipped after the last one
+    print(f"\nFetched {len(rows)} items from ADeLe battery")
+    return rows
+
+
+def _cache_name(client_type: str, model_id: str) -> str:
+    """Cache filename: fixed for Gemini, otherwise derived from the sanitised model id."""
+    if client_type == "gemini":
+        return "paper_comparison_cache.jsonl"
+    return model_id.translate(str.maketrans("/-.", "___")) + "_comparison_cache.jsonl"
+
+
+@pytest.fixture(scope="module")
+def annotator(api_key, model_id, client_type):
+    """DemandAnnotator for the selected client, with its cache file next to this module."""
+    import pathlib
+
+    # Anything that is not "claude" or "openai" is treated as Gemini.
+    client_cls = {"claude": ClaudeClient, "openai": OpenAIClient}.get(client_type, GeminiClient)
+    client = client_cls(api_key=api_key, model=model_id)
+    rubrics = RubricsCatalog()
+    # The cache filename depends on the client/model pair (see _cache_name).
+    cache_file = pathlib.Path(__file__).parent / _cache_name(client_type, model_id)
+    cache = AnnotationCache(cache_file)
+    print(f"\nClient: {client_type}  Model: {model_id}")
+    print(f"Cache:  {cache_file}")
+    return DemandAnnotator(client=client, rubrics=rubrics, cache=cache)
+
+
+@pytest.fixture(scope="module")
+def comparison_results(paper_rows, annotator):
+    """Annotate all 15 items and pair with paper scores.
+
+    Failures on individual items are caught and skipped — partial runs
+    still produce results for completed items. The persistent cache means
+    re-runs resume from where the previous run left off at zero extra cost.
+    """
+    results = []  # one dict per successfully annotated item
+    skipped = []  # item ids whose annotation (or score pairing) raised
+    print(f"\n{'─'*60}")
+    print(f"Annotating {_N_ITEMS} items — 19 API calls each")
+    print(f"{'─'*60}")
+    for i, row in enumerate(paper_rows):
+        job = AnnotationJob(
+            item_id=str(row["instance_id"]),
+            content=row["question"],
+            reference_answer=row["groundtruth"],
+        )
+        print(f"  [{i+1:2d}/{_N_ITEMS}] {row['benchmark']} / {row['task']} ...", end="", flush=True)
+        try:
+            annotation = annotator.annotate(job)
+            vector = annotation.to_feature_vector()
+            n_nan = sum(math.isnan(v) for v in vector)  # NaN entries are reported as parse failures by the tests
+            status = f" done  (NaN: {n_nan})" if n_nan > 0 else " done ✓"
+            results.append({
+                "item_id": job.item_id,
+                "benchmark": row["benchmark"],
+                "task": row["task"],
+                "paper": {dim: float(row[dim]) for dim in DIMENSION_ORDER},  # reference scores keyed by dimension
+                "ours": vector,  # positional scores; presumably in DIMENSION_ORDER — confirm against to_feature_vector()
+            })
+        except Exception as exc:  # deliberate best-effort: one failing item must not abort the whole run
+            status = f" FAILED — {type(exc).__name__}: {exc}"
+            skipped.append(job.item_id)
+        print(status)  # completes the progress line started above
+    print(f"Completed: {len(results)}  |  Skipped: {len(skipped)}")
+    if skipped:
+        print(f"Skipped items (re-run to retry from cache): {skipped}")
+    return results
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _valid_pair(p: float, o: float) -> bool:
+    return not (math.isnan(p) or math.isnan(o))  # True only when neither score is NaN
+
+
+def _diff(p: float, o: float) -> float:
+    return o - p  # signed difference: positive when our score (o) exceeds the paper's (p)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+class TestFetchAndAnnotate:
+    """Sanity checks on the fetched rows and the annotation results."""
+    def test_all_items_fetched(self, paper_rows):
+        assert len(paper_rows) == _N_ITEMS
+
+    def test_all_items_annotated(self, comparison_results):
+        done = len(comparison_results)
+        assert done > 0, "No items were annotated — check API key and quota"
+        if done < _N_ITEMS:
+            pytest.xfail(
+                f"Only {done} items completed. Re-run to finish — completed items are cached at zero cost."
+            )
+
+    def test_no_parse_failures(self, comparison_results):
+        failures = []
+        for res in comparison_results:
+            nan_dims = [DIMENSION_ORDER[idx] for idx, score in enumerate(res["ours"]) if math.isnan(score)]
+            if nan_dims:
+                failures.append(f"  {res['item_id']}: {nan_dims}")
+        assert not failures, "Parse failures (NaN scores):\n" + "\n".join(failures)
+
+
+class TestItemComparison:
+
+    def test_print_per_item_scores(self, comparison_results):
+        """Print each item's paper vs our scores side-by-side. 0 extra API calls."""
+        for r in comparison_results:
+            print(f"\n{'─'*72}")
+            print(f"Item : {r['item_id']}")
+            print(f"Bench: {r['benchmark']} / {r['task']}")
+            print(f"{'Dim':<8} {'Paper':>6} {'Ours':>6} {'Diff':>6}  Verdict")
+            print(f"{'─'*50}")
+            for i, dim in enumerate(DIMENSION_ORDER):
+                p = r["paper"][dim]  # paper scores are keyed by dimension name
+                o = r["ours"][i]  # our scores are positional, indexed like DIMENSION_ORDER
+                if math.isnan(p):
+                    print(f"{dim:<8} {'N/A':>6} {o:>6.1f}        (no paper score)")
+                    continue
+                if math.isnan(o):
+                    print(f"{dim:<8} {p:>6.1f} {'N/A':>6}        (parse failure)")
+                    continue
+                d = _diff(p, o)  # ours minus paper
+                if abs(d) == 0:
+                    verdict = "exact"
+                elif abs(d) <= 1:
+                    verdict = "~±1"
+                else:
+                    verdict = f"OFF {d:+.0f}"
+                print(f"{dim:<8} {p:>6.1f} {o:>6.1f} {d:>+6.1f}  {verdict}")
+
+
+class TestStatistics:
+    """Aggregate agreement statistics between the paper's scores and ours."""
+
+    @staticmethod
+    def _pearson(pairs: list) -> float:
+        """Return Pearson's r for ``(x, y)`` pairs, or NaN when it is undefined.
+
+        NaN is returned for fewer than two pairs and when either variable
+        has zero variance.
+        """
+        if len(pairs) < 2:
+            return float("nan")
+        xs = [float(x) for x, _ in pairs]
+        ys = [float(y) for _, y in pairs]
+        mx, my = mean(xs), mean(ys)
+        num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
+        den = (sum((x - mx) ** 2 for x in xs) * sum((y - my) ** 2 for y in ys)) ** 0.5
+        return float("nan") if den == 0 else num / den
+
+    @staticmethod
+    def _average_ranks(values: list) -> list:
+        """Return 0-based ranks of *values*; equal values share their mean rank.
+
+        Giving tied values distinct ranks by position would make the rank
+        correlation depend on the order in which the items happen to arrive.
+        """
+        order = sorted(range(len(values)), key=values.__getitem__)
+        ranks = [0.0] * len(values)
+        start = 0
+        while start < len(order):
+            # Extend [start, end] over the run of equal values.
+            end = start
+            while end + 1 < len(order) and values[order[end + 1]] == values[order[start]]:
+                end += 1
+            shared = (start + end) / 2.0
+            for pos in range(start, end + 1):
+                ranks[order[pos]] = shared
+            start = end + 1
+        return ranks
+
+    @staticmethod
+    def _spearman(pairs: list) -> float:
+        """Return Spearman's rho: Pearson's r computed on tie-averaged ranks."""
+        if len(pairs) < 2:
+            return float("nan")
+        xs = [float(x) for x, _ in pairs]
+        ys = [float(y) for _, y in pairs]
+        rx = TestStatistics._average_ranks(xs)
+        ry = TestStatistics._average_ranks(ys)
+        return TestStatistics._pearson(list(zip(rx, ry)))
+
+    def test_print_statistical_summary(self, comparison_results):
+        """Print overall and per-dimension agreement statistics.
+
+        Only pairs for which ``_valid_pair`` holds are counted. The test is
+        skipped when no such pair exists, because every percentage below
+        divides by the number of pairs.
+        """
+        dim_pairs: dict[str, list] = {d: [] for d in DIMENSION_ORDER}
+        for r in comparison_results:
+            for i, dim in enumerate(DIMENSION_ORDER):
+                # Paper scores are keyed by dimension name, ours by position.
+                p = r["paper"][dim]
+                o = r["ours"][i]
+                if _valid_pair(p, o):
+                    dim_pairs[dim].append((p, o))
+
+        all_pairs = [po for pairs in dim_pairs.values() for po in pairs]
+        n_total = len(all_pairs)
+        if n_total == 0:
+            # Nothing to average and nothing to divide by.
+            pytest.skip("No scored pairs available — nothing to summarise")
+
+        all_diffs = [_diff(p, o) for p, o in all_pairs]
+        abs_diffs = [abs(d) for d in all_diffs]
+        exact = sum(d == 0 for d in all_diffs)
+        within_1 = sum(abs(d) <= 1 for d in all_diffs)
+        off = n_total - within_1
+
+        r_pearson = self._pearson(all_pairs)
+        r_spearman = self._spearman(all_pairs)
+        mae = mean(abs_diffs)
+        bias = mean(all_diffs)
+        sd = stdev(all_diffs) if len(all_diffs) > 1 else float("nan")
+
+        # Report what was actually compared: the run may cover fewer items than
+        # requested, and pairs with a NaN on either side were dropped above.
+        n_items = len(comparison_results)
+        n_dims = len(DIMENSION_ORDER)
+
+        print(f"\n{'='*72}")
+        print(f"OVERALL STATISTICS  ({n_items} items × {n_dims} dims, {n_total} scored pairs)")
+        print(f"  MAE (mean |ours − paper|)  : {mae:.3f}")
+        print(
+            f"  Bias (mean ours − paper)   : {bias:+.3f}  "
+            f"({'our scores higher' if bias > 0 else 'our scores lower' if bias < 0 else 'no bias'})"
+        )
+        print(f"  Std dev of differences     : {sd:.3f}")
+        print(f"  Pearson r                  : {r_pearson:.3f}")
+        print(f"  Spearman ρ                 : {r_spearman:.3f}")
+        print(f"  Exact match                : {exact}/{n_total}  ({100*exact/n_total:.1f}%)")
+        print(f"  Within ±1                  : {within_1}/{n_total}  ({100*within_1/n_total:.1f}%)")
+        print(f"  Off by >1                  : {off}/{n_total}  ({100*off/n_total:.1f}%)")
+
+        print(f"\n{'─'*72}")
+        print("PER-DIMENSION BREAKDOWN")
+        print(f"{'Dim':<8} {'MAE':>6} {'Bias':>7} {'Exact%':>7} {'±1%':>6}  Agreement")
+        print(f"{'─'*60}")
+
+        dim_stats = []
+        for dim in DIMENSION_ORDER:
+            pairs = dim_pairs[dim]
+            if not pairs:
+                # No valid pair for this dimension: nothing to report.
+                continue
+            diffs = [_diff(p, o) for p, o in pairs]
+            adiffs = [abs(d) for d in diffs]
+            d_mae = mean(adiffs)
+            d_bias = mean(diffs)
+            d_exact = sum(d == 0 for d in diffs)
+            d_w1 = sum(abs(d) <= 1 for d in diffs)
+            n = len(pairs)
+            # Ten-segment bar showing the share of pairs within ±1.
+            bar = "█" * int(10 * d_w1 / n)
+            print(
+                f"{dim:<8} {d_mae:>6.2f} {d_bias:>+7.2f} {100*d_exact/n:>7.1f}% "
+                f"{100*d_w1/n:>6.1f}%  {bar}"
+            )
+            dim_stats.append((dim, d_mae))
+
+        dim_stats.sort(key=lambda x: x[1])
+        best = ", ".join(d for d, _ in dim_stats[:3])
+        worst = ", ".join(d for d, _ in reversed(dim_stats[-3:]))
+        print(f"\nMost agreed dimensions  (lowest MAE): {best}")
+        print(f"Least agreed dimensions (highest MAE): {worst}")