Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Test annotation caches (API responses — do not commit)
tests/test_annotation/paper_comparison_cache.jsonl
tests/test_annotation/*_comparison_cache.jsonl
tests/test_annotation/annotation_comparison.png

# Python
__pycache__/
*.py[cod]
Expand Down
39 changes: 39 additions & 0 deletions docs/source/api/annotation.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
Annotation
==========

Tools for annotating benchmark items with demand vectors using the ADeLe
rubric system (18 demand dimensions plus the UG classification) from
Zhou et al. (2026), *Nature*.

:class:`DemandAnnotator` accepts any client that implements
``generate(prompt: str) -> tuple[str, str]`` and exposes a ``model``
attribute (the model identifier that is embedded in cache keys), so any LLM
provider (OpenAI, Anthropic, Azure, etc.) can be used in place of
:class:`GeminiClient` by wrapping it in a class with that method and attribute.

.. automodule:: torch_measure.annotation
:members:

Core Classes
------------

.. autoclass:: torch_measure.annotation.DemandAnnotator
:members:
:undoc-members:

.. autoclass:: torch_measure.annotation.GeminiClient
:members:
:undoc-members:

.. autoclass:: torch_measure.annotation.RubricsCatalog
:members:
:undoc-members:

.. autoclass:: torch_measure.annotation.AnnotationCache
:members:
:undoc-members:

Data Types
----------

.. autoclass:: torch_measure.annotation.AnnotationJob
:members:
:undoc-members:
7 changes: 7 additions & 0 deletions docs/source/api/models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ Factor Models
:members:
:undoc-members:

Demand-Based Models
-------------------

.. autoclass:: torch_measure.models.DemandAssessor
:members:
:undoc-members:

Rotation Utilities
------------------

Expand Down
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Computerized Adaptive Testing, psychometric metrics, and factor models.
:caption: API Reference

api/models
api/annotation
api/cat
api/metrics
api/data
Expand Down
72 changes: 72 additions & 0 deletions src/torch_measure/annotation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""ADeLe demand annotation pipeline (Gemini re-implementation).

Reproduces the annotation methodology from:
Zhou et al. (2026) "General scales unlock AI evaluation with explanatory
and predictive power." Nature.

``DemandAnnotator`` accepts any client that implements
``generate(prompt: str) -> tuple[str, str]`` (response text, finish reason)
and exposes a ``model`` attribute (the model identifier embedded in cache keys).
``GeminiClient`` is a bundled implementation, but any LLM provider
(OpenAI, Anthropic, Azure, etc.) can be used by wrapping it in a class with
that method and attribute.

Public API
----------
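DemandAnnotator — main entry point: annotates one item or a full dataset
UGAnnotator — performs the UG call that DemandAnnotator runs after the 18 demand calls
GeminiClient — Gemini API wrapper (caller supplies pinned model string)
ClaudeClient — additional bundled client (``_claude_client`` module)
OpenAIClient — additional bundled client (``_openai_client`` module)
RubricsCatalog — loads the 19 bundled rubric files
AnnotationCache — append-only JSONL result cache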

Data types
----------
AnnotationJob — input: item_id, content, reference_answer
Rubric — one rubric: acronym, dimension_name, content, rubric_hash
DemandAnnotation — one (item, rubric) result with CoT response
UGAnnotation — UG classification result
ItemAnnotation — all 19 annotations for one item (.to_feature_vector())
DemandVector — full-dataset tensor (n_items × 19) for DemandAssessor
CacheEntry — one persisted cache record

Constants
---------
DIMENSION_ORDER — canonical ordering of all 19 dimensions
DEMAND_DIMENSIONS — the first 18 (excludes UG)
N_DIMENSIONS — dimension-count constant (see ``_types``)
"""
from ._annotator import DemandAnnotator
from ._cache import AnnotationCache
from ._claude_client import ClaudeClient
from ._client import GeminiClient
from ._openai_client import OpenAIClient
from ._rubrics import RubricsCatalog
from ._types import (
DEMAND_DIMENSIONS,
DIMENSION_ORDER,
N_DIMENSIONS,
AnnotationJob,
CacheEntry,
DemandAnnotation,
DemandVector,
ItemAnnotation,
Rubric,
UGAnnotation,
)
from ._ug import UGAnnotator

# Names re-exported by ``from torch_measure.annotation import *``.
__all__ = [
    # Annotators, LLM clients, rubric catalog and result cache
    "DemandAnnotator",
    "ClaudeClient",
    "GeminiClient",
    "OpenAIClient",
    "RubricsCatalog",
    "AnnotationCache",
    "UGAnnotator",
    # Data types (all imported from ``._types``)
    "AnnotationJob",
    "DemandAnnotation",
    "UGAnnotation",
    "ItemAnnotation",
    "DemandVector",
    "CacheEntry",
    "Rubric",
    # Constants (also from ``._types``)
    "DIMENSION_ORDER",
    "DEMAND_DIMENSIONS",
    "N_DIMENSIONS",
]
127 changes: 127 additions & 0 deletions src/torch_measure/annotation/_annotator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from __future__ import annotations

import hashlib
from typing import Optional

from ._cache import AnnotationCache, make_cache_key
from ._client import GeminiClient
from ._parsers import extract_demand_level
from ._prompts import get_full_instruction
from ._rubrics import RubricsCatalog
from ._types import (
AnnotationJob,
CacheEntry,
DemandAnnotation,
DemandVector,
ItemAnnotation,
Rubric,
)
from ._ug import UGAnnotator


class DemandAnnotator:
    """Runs the full 19-call ADeLe annotation pipeline for one benchmark item.

    One API call per demand rubric (18 sequential calls) plus one UG call.
    Results are cached to avoid redundant API calls across runs.

    ``client`` can be any object that provides

    * ``generate(prompt: str) -> tuple[str, str]`` returning
      ``(response text, finish reason)``, and
    * a ``model`` attribute, which is embedded in every cache key and stored
      in every cache record.

    The bundled :class:`GeminiClient` is one such client, but any LLM provider
    can be used by wrapping it in a class with that interface.

    Args:
        client: LLM client used for every demand call; it is also handed to
            the internal :class:`UGAnnotator`.
        rubrics: Catalog whose ``all_demand_rubrics()`` yields the demand
            rubrics to apply.
        cache: Optional result cache. When ``None``, every annotation goes
            to the client and nothing is persisted.
    """

    def __init__(
        self,
        client: GeminiClient,
        rubrics: RubricsCatalog,
        cache: Optional[AnnotationCache] = None,
    ) -> None:
        self._client = client
        self._rubrics = rubrics
        self._cache = cache
        # The UG call shares the same client, rubrics and cache.
        self._ug = UGAnnotator(client, rubrics, cache)

    def annotate(self, job: AnnotationJob) -> ItemAnnotation:
        """Annotate one item across all 18 demand rubrics plus UG.

        Demand rubrics are processed in the order the catalog yields them;
        the UG annotation is requested last.
        """
        demands: dict[str, DemandAnnotation] = {
            rubric.acronym: self._annotate_one(job, rubric)
            for rubric in self._rubrics.all_demand_rubrics()
        }
        ug = self._ug.annotate(job)
        return ItemAnnotation(item_id=job.item_id, demands=demands, ug=ug)

    def annotate_dataset(self, jobs: list[AnnotationJob]) -> DemandVector:
        """Annotate all items and return a (n_items × 19) tensor.

        Row ordering in the returned ``DemandVector.tensor`` mirrors the order
        of ``jobs``. To pass the result to ``DemandAssessor.fit()``, supply
        ``jobs`` in the same order as ``data.to_fit_tensors()["item_ids"]``::

            item_ids = data.to_fit_tensors()["item_ids"]  # canonical order
            jobs = [AnnotationJob(iid, content[iid], ref[iid]) for iid in item_ids]
            dv = annotator.annotate_dataset(jobs)
            model.fit(data, item_features=dv.tensor)

        An empty ``jobs`` list yields a tensor of shape
        ``(0, len(DIMENSION_ORDER))`` so the result stays two-dimensional.
        """
        import torch

        item_ids: list[str] = []
        rows: list[list[float]] = []
        for job in jobs:
            item_ann = self.annotate(job)
            item_ids.append(job.item_id)
            rows.append(item_ann.to_feature_vector())

        if not rows:
            # torch.tensor([]) would be 1-D with shape (0,); build the empty
            # matrix explicitly so callers can still rely on tensor.shape[1].
            from ._types import DIMENSION_ORDER

            tensor = torch.empty((0, len(DIMENSION_ORDER)), dtype=torch.float32)
        else:
            tensor = torch.tensor(rows, dtype=torch.float32)
        return DemandVector(item_ids=item_ids, tensor=tensor)

    def _annotate_one(self, job: AnnotationJob, rubric: Rubric) -> DemandAnnotation:
        """Return the annotation of ``job`` under one demand ``rubric``.

        A cached entry is reused when present; otherwise the client is
        called once and, if a cache is configured, the result is stored.
        """
        key = make_cache_key(
            content=job.content,
            acronym=rubric.acronym,
            model_id=self._client.model,
            rubric_hash=rubric.rubric_hash,
        )

        if self._cache is not None:
            entry = self._cache.get(key)
            if entry is not None:
                # Cache hit: rebuild the annotation under the current job's
                # item_id rather than the one stored in the entry.
                return DemandAnnotation(
                    item_id=job.item_id,
                    demand=rubric.acronym,
                    level=entry.level,
                    finish_reason=entry.finish_reason,
                    model_response=entry.model_response,
                )

        prompt = get_full_instruction(
            dimension=rubric.dimension_name,
            rubric_content=rubric.content,
            item_text=job.content,
        )
        model_response, finish_reason = self._client.generate(prompt)
        level = extract_demand_level(model_response)

        annotation = DemandAnnotation(
            item_id=job.item_id,
            demand=rubric.acronym,
            level=level,
            finish_reason=finish_reason,
            model_response=model_response,
        )

        if self._cache is not None:
            # Truncated SHA-256 of the item text, stored alongside the key.
            content_hash = hashlib.sha256(job.content.encode()).hexdigest()[:16]
            self._cache.put(
                CacheEntry(
                    key=key,
                    item_id=job.item_id,
                    demand=rubric.acronym,
                    level=level,
                    finish_reason=finish_reason,
                    model_response=model_response,
                    rubric_hash=rubric.rubric_hash,
                    model_id=self._client.model,
                    content_hash=content_hash,
                    timestamp=AnnotationCache.now_iso(),
                )
            )

        return annotation
57 changes: 57 additions & 0 deletions src/torch_measure/annotation/_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

import dataclasses
import hashlib
import json
import math
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

from ._types import CacheEntry


def make_cache_key(content: str, acronym: str, model_id: str, rubric_hash: str) -> str:
    """Build the cache key ``<sha256(content)[:16]>:<acronym>:<model_id>:<rubric_hash>``.

    Only the first 16 hex characters of the content digest are kept; the
    remaining three fields are inserted verbatim, separated by colons.
    """
    full_digest = hashlib.sha256(content.encode("utf-8")).hexdigest()
    digest_prefix = full_digest[:16]
    return f"{digest_prefix}:{acronym}:{model_id}:{rubric_hash}"


class AnnotationCache:
    """Append-only JSONL cache keyed by sha256(content)[:16]:acronym:model_id:rubric_hash.

    Every ``put`` appends one JSON object per line to the backing file and
    updates an in-memory index; ``get`` is served from that index only.
    When the file already exists it is read once at construction time, and
    for duplicate keys the last line wins.

    Args:
        path: Location of the JSONL file, as a ``Path`` or a plain string.
            The file and its parent directories are created lazily on the
            first ``put``.

    Raises:
        json.JSONDecodeError: At construction, if an existing file contains
            a non-blank line that is not valid JSON.
    """

    def __init__(self, path: Path | str) -> None:
        # Coerce so that plain string paths are accepted as well as Path objects.
        self._path = Path(path)
        self._index: dict[str, CacheEntry] = {}
        if self._path.exists():
            self._load()

    def _load(self) -> None:
        """Populate the in-memory index from the JSONL file."""
        with open(self._path, encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                data = json.loads(line)
                # NaN is serialised as null (RFC-compliant); restore here.
                level = data.get("level")
                if level is None:
                    data["level"] = math.nan
                entry = CacheEntry(**data)
                self._index[entry.key] = entry

    def get(self, key: str) -> Optional[CacheEntry]:
        """Return the entry stored under ``key``, or ``None`` if absent."""
        return self._index.get(key)

    def put(self, entry: CacheEntry) -> None:
        """Index ``entry`` in memory and append it to the JSONL file."""
        self._index[entry.key] = entry
        self._path.parent.mkdir(parents=True, exist_ok=True)
        record = dataclasses.asdict(entry)
        # json.dumps would emit the non-standard token NaN; write null
        # instead and let _load turn it back into NaN.
        if isinstance(record["level"], float) and math.isnan(record["level"]):
            record["level"] = None
        with open(self._path, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(record) + "\n")

    @staticmethod
    def now_iso() -> str:
        """Return the current UTC time as a timezone-aware ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()
Loading