aims-foundations · AneeshD04 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
+# Test annotation caches (API responses — do not commit)
+tests/test_annotation/paper_comparison_cache.jsonl
+tests/test_annotation/*_comparison_cache.jsonl
+tests/test_annotation/annotation_comparison.png
+
 # Python
 __pycache__/
 *.py[cod]

diff --git a/docs/source/api/annotation.rst b/docs/source/api/annotation.rst
@@ -0,0 +1,39 @@
+Annotation
+==========
+
+Tools for annotating benchmark items with demand vectors using the
+18-dimension ADeLe rubric system from the Nature 2026 paper.
+
+:class:`DemandAnnotator` accepts any client that exposes a ``model`` attribute
+(the model ID, used in cache keys) and implements
+``generate(prompt: str) -> tuple[str, str]``, so any LLM provider (OpenAI,
+Anthropic, Azure, etc.) can be wrapped and used in place of :class:`GeminiClient`.
+
+.. automodule:: torch_measure.annotation
+   :members:
+
+Core Classes
+------------
+
+.. autoclass:: torch_measure.annotation.DemandAnnotator
+   :members:
+   :undoc-members:
+
+.. autoclass:: torch_measure.annotation.GeminiClient
+   :members:
+   :undoc-members:
+
+.. autoclass:: torch_measure.annotation.RubricsCatalog
+   :members:
+   :undoc-members:
+
+.. autoclass:: torch_measure.annotation.AnnotationCache
+   :members:
+   :undoc-members:
+
+Data Types
+----------
+
+.. autoclass:: torch_measure.annotation.AnnotationJob
+   :members:
+   :undoc-members:
diff --git a/docs/source/api/models.rst b/docs/source/api/models.rst
@@ -53,6 +53,13 @@ Factor Models
    :members:
    :undoc-members:
 
+Demand-Based Models
+-------------------
+
+.. autoclass:: torch_measure.models.DemandAssessor
+   :members:
+   :undoc-members:
+
 Rotation Utilities
 ------------------
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -16,6 +16,7 @@ Computerized Adaptive Testing, psychometric metrics, and factor models.
    :caption: API Reference
 
    api/models
+   api/annotation
    api/cat
    api/metrics
    api/data

diff --git a/src/torch_measure/annotation/__init__.py b/src/torch_measure/annotation/__init__.py
@@ -0,0 +1,72 @@
+"""ADeLe demand annotation pipeline (Gemini re-implementation).
+
+Reproduces the annotation methodology from:
+  Zhou et al. (2026) "General scales unlock AI evaluation with explanatory
+  and predictive power." Nature.
+
+``DemandAnnotator`` accepts any client that exposes a ``model`` attribute
+(the model ID, used in cache keys) and implements
+``generate(prompt: str) -> tuple[str, str]`` (response text, finish reason).
+``GeminiClient``, ``ClaudeClient`` and ``OpenAIClient`` are bundled; any other
+LLM provider can be used by wrapping it in a class with that interface.
+
+Public API
+----------
+DemandAnnotator   — main entry point: annotates one item or a full dataset
+GeminiClient      — Gemini API wrapper (caller supplies pinned model string)
+RubricsCatalog    — loads the 19 bundled rubric files
+AnnotationCache   — append-only JSONL result cache
+
+Data types
+----------
+AnnotationJob     — input: item_id, content, reference_answer
+DemandAnnotation  — one (item, rubric) result with CoT response
+UGAnnotation      — UG classification result
+ItemAnnotation    — all 19 annotations for one item (.to_feature_vector())
+DemandVector      — full-dataset tensor (n_items × 19) for DemandAssessor
+CacheEntry        — one persisted cache record
+
+Constants
+---------
+DIMENSION_ORDER   — canonical ordering of all 19 dimensions
+DEMAND_DIMENSIONS — the first 18 (excludes UG)
+"""
+from ._annotator import DemandAnnotator
+from ._cache import AnnotationCache
+from ._claude_client import ClaudeClient
+from ._client import GeminiClient
+from ._openai_client import OpenAIClient
+from ._rubrics import RubricsCatalog
+from ._types import (
+    DEMAND_DIMENSIONS,
+    DIMENSION_ORDER,
+    N_DIMENSIONS,
+    AnnotationJob,
+    CacheEntry,
+    DemandAnnotation,
+    DemandVector,
+    ItemAnnotation,
+    Rubric,
+    UGAnnotation,
+)
+from ._ug import UGAnnotator
+
+__all__ = [
+    "DemandAnnotator",
+    "ClaudeClient",
+    "GeminiClient",
+    "OpenAIClient",
+    "RubricsCatalog",
+    "AnnotationCache",
+    "UGAnnotator",
+    "AnnotationJob",
+    "DemandAnnotation",
+    "UGAnnotation",
+    "ItemAnnotation",
+    "DemandVector",
+    "CacheEntry",
+    "Rubric",
+    "DIMENSION_ORDER",
+    "DEMAND_DIMENSIONS",
+    "N_DIMENSIONS",
+]
diff --git a/src/torch_measure/annotation/_annotator.py b/src/torch_measure/annotation/_annotator.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+import hashlib
+from typing import Optional
+
+from ._cache import AnnotationCache, make_cache_key
+from ._client import GeminiClient
+from ._parsers import extract_demand_level
+from ._prompts import get_full_instruction
+from ._rubrics import RubricsCatalog
+from ._types import (
+    DIMENSION_ORDER,
+    AnnotationJob,
+    CacheEntry,
+    DemandAnnotation,
+    DemandVector,
+    ItemAnnotation,
+    Rubric,
+)
+from ._ug import UGAnnotator
+
+
+class DemandAnnotator:
+    """Runs the full 19-call ADeLe annotation pipeline for one benchmark item.
+
+    One API call per demand rubric (18 sequential calls) plus one UG call.
+    Results are cached to avoid redundant API calls across runs.
+
+    ``client`` can be any object that exposes a ``model`` attribute (the
+    model ID, which becomes part of every cache key) and implements
+    ``generate(prompt: str) -> tuple[str, str]`` returning the response text
+    and the finish reason. :class:`GeminiClient` is one such client; any
+    other LLM provider can be used by wrapping it in a class with that
+    interface.
+    """
+
+    def __init__(
+        self,
+        client: GeminiClient,
+        rubrics: RubricsCatalog,
+        cache: Optional[AnnotationCache] = None,
+    ) -> None:
+        """Store the collaborators and build the UG annotator.
+
+        :param client: LLM client used for every rubric call; its ``model``
+            attribute is read when building cache keys.
+        :param rubrics: catalog supplying the demand rubrics.
+        :param cache: optional result cache; with ``None`` demand annotations
+            are neither looked up nor stored.
+        """
+        self._client = client
+        self._rubrics = rubrics
+        self._cache = cache
+        # The UG annotator is given the same client, rubrics and cache.
+        self._ug = UGAnnotator(client, rubrics, cache)
+
+    def annotate(self, job: AnnotationJob) -> ItemAnnotation:
+        """Annotate one item across all 18 demand rubrics plus UG."""
+        # Demand rubrics run first, in catalog order; the UG call comes last.
+        demands: dict[str, DemandAnnotation] = {
+            rubric.acronym: self._annotate_one(job, rubric)
+            for rubric in self._rubrics.all_demand_rubrics()
+        }
+        ug = self._ug.annotate(job)
+        return ItemAnnotation(item_id=job.item_id, demands=demands, ug=ug)
+
+    def annotate_dataset(self, jobs: list[AnnotationJob]) -> DemandVector:
+        """Annotate all items and return a (n_items × 19) tensor.
+
+        Row ordering in the returned ``DemandVector.tensor`` mirrors the order
+        of ``jobs``. To pass the result to ``DemandAssessor.fit()``, supply
+        ``jobs`` in the same order as ``data.to_fit_tensors()["item_ids"]``::
+
+            item_ids = data.to_fit_tensors()["item_ids"]          # canonical order
+            jobs = [AnnotationJob(iid, content[iid], ref[iid]) for iid in item_ids]
+            dv   = annotator.annotate_dataset(jobs)
+            model.fit(data, item_features=dv.tensor)
+
+        An empty ``jobs`` list yields an empty 2-D tensor with
+        ``len(DIMENSION_ORDER)`` columns rather than a 1-D one.
+        """
+        import torch
+
+        item_ids: list[str] = []
+        rows: list[list[float]] = []
+        for job in jobs:
+            item_ann = self.annotate(job)
+            item_ids.append(job.item_id)
+            rows.append(item_ann.to_feature_vector())
+
+        if rows:
+            tensor = torch.tensor(rows, dtype=torch.float32)
+        else:
+            # torch.tensor([]) is 1-D with shape (0,); keep the result 2-D so
+            # callers can still read tensor.shape[1] when there are no items.
+            # Assumes to_feature_vector() has one entry per DIMENSION_ORDER
+            # dimension — TODO confirm.
+            tensor = torch.empty((0, len(DIMENSION_ORDER)), dtype=torch.float32)
+        return DemandVector(item_ids=item_ids, tensor=tensor)
+
+    def _annotate_one(self, job: AnnotationJob, rubric: Rubric) -> DemandAnnotation:
+        """Annotate ``job`` against one demand rubric, consulting the cache first.
+
+        On a cache hit the stored level, finish reason and response are
+        returned under the current ``job.item_id``. Otherwise the client is
+        called once and, if a cache is configured, the result is persisted.
+        """
+        key = make_cache_key(
+            content=job.content,
+            acronym=rubric.acronym,
+            model_id=self._client.model,
+            rubric_hash=rubric.rubric_hash,
+        )
+
+        if self._cache is not None:
+            entry = self._cache.get(key)
+            if entry is not None:
+                return DemandAnnotation(
+                    item_id=job.item_id,
+                    demand=rubric.acronym,
+                    level=entry.level,
+                    finish_reason=entry.finish_reason,
+                    model_response=entry.model_response,
+                )
+
+        prompt = get_full_instruction(
+            dimension=rubric.dimension_name,
+            rubric_content=rubric.content,
+            item_text=job.content,
+        )
+        model_response, finish_reason = self._client.generate(prompt)
+        level = extract_demand_level(model_response)
+
+        annotation = DemandAnnotation(
+            item_id=job.item_id,
+            demand=rubric.acronym,
+            level=level,
+            finish_reason=finish_reason,
+            model_response=model_response,
+        )
+
+        if self._cache is not None:
+            # Same 16-hex-digit content digest that prefixes the cache key.
+            content_hash = hashlib.sha256(job.content.encode()).hexdigest()[:16]
+            self._cache.put(CacheEntry(
+                key=key,
+                item_id=job.item_id,
+                demand=rubric.acronym,
+                level=level,
+                finish_reason=finish_reason,
+                model_response=model_response,
+                rubric_hash=rubric.rubric_hash,
+                model_id=self._client.model,
+                content_hash=content_hash,
+                timestamp=AnnotationCache.now_iso(),
+            ))
+
+        return annotation
diff --git a/src/torch_measure/annotation/_cache.py b/src/torch_measure/annotation/_cache.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import dataclasses
+import hashlib
+import json
+import math
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+from ._types import CacheEntry
+
+
+def make_cache_key(content: str, acronym: str, model_id: str, rubric_hash: str) -> str:
+    """Return the key ``sha256(content)[:16]:acronym:model_id:rubric_hash``.
+
+    Only the first 16 hex digits of the content digest are used, so a key
+    changes whenever the item text, rubric acronym, model ID or rubric hash
+    changes.
+    """
+    content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
+    return f"{content_hash}:{acronym}:{model_id}:{rubric_hash}"
+
+
+class AnnotationCache:
+    """Append-only JSONL cache keyed by sha256(content)[:16]:acronym:model_id:rubric_hash.
+
+    Entries live in an in-memory index and are appended to the backing file
+    as one JSON object per line. When a key appears on several lines, the
+    last one wins on load.
+    """
+
+    def __init__(self, path: Path) -> None:
+        """Open the cache backed by ``path``, loading it if the file exists.
+
+        :param path: location of the JSONL file; a plain string is accepted
+            and converted to a ``Path``.
+        """
+        self._path = Path(path)
+        self._index: dict[str, CacheEntry] = {}
+        if self._path.exists():
+            self._load()
+
+    def _load(self) -> None:
+        """Fill the index from the JSONL file, skipping unreadable records.
+
+        A line that is not valid JSON, or that does not match the
+        ``CacheEntry`` fields (for example a record truncated by an
+        interrupted run), is reported with a ``RuntimeWarning`` and treated
+        as a cache miss instead of making the whole cache unloadable.
+        """
+        import warnings
+
+        with open(self._path, encoding="utf-8") as fh:
+            for line_no, line in enumerate(fh, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    data = json.loads(line)
+                    # NaN is serialised as null (RFC-compliant); restore here.
+                    if data.get("level") is None:
+                        data["level"] = math.nan
+                    entry = CacheEntry(**data)
+                except (ValueError, TypeError, AttributeError) as exc:
+                    # JSONDecodeError is a ValueError; TypeError/AttributeError
+                    # cover records that are not objects or whose fields do
+                    # not match CacheEntry.
+                    warnings.warn(
+                        f"{self._path}:{line_no}: skipping unreadable cache record ({exc})",
+                        RuntimeWarning,
+                        stacklevel=2,
+                    )
+                    continue
+                self._index[entry.key] = entry
+
+    def get(self, key: str) -> Optional[CacheEntry]:
+        """Return the cached entry for ``key``, or ``None`` if absent."""
+        return self._index.get(key)
+
+    def put(self, entry: CacheEntry) -> None:
+        """Record ``entry`` in the index and append it to the JSONL file."""
+        self._index[entry.key] = entry
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        record = dataclasses.asdict(entry)
+        # json.dumps would emit a bare NaN token, which is not valid JSON;
+        # store null instead and let _load turn it back into NaN.
+        if isinstance(record["level"], float) and math.isnan(record["level"]):
+            record["level"] = None
+        with open(self._path, "a", encoding="utf-8") as fh:
+            fh.write(json.dumps(record) + "\n")
+
+    @staticmethod
+    def now_iso() -> str:
+        """Return the current UTC time as an ISO 8601 string."""
+        return datetime.now(timezone.utc).isoformat()