From 759daf59931618c7d678046dec1161990165b1f5 Mon Sep 17 00:00:00 2001
From: MrtinoRG
Date: Sun, 19 Apr 2026 15:02:15 +0200
Subject: [PATCH 1/2] feat: add statistics + illustrative traces

---
 analysis/compute_paper_stats.py  | 237 ++++++++++++++++++++
 analysis/illustrative_traces.tex | 371 +++++++++++++++++++++++++++++++
 2 files changed, 608 insertions(+)
 create mode 100644 analysis/compute_paper_stats.py
 create mode 100644 analysis/illustrative_traces.tex

diff --git a/analysis/compute_paper_stats.py b/analysis/compute_paper_stats.py
new file mode 100644
index 0000000000..bceaee6fea
--- /dev/null
+++ b/analysis/compute_paper_stats.py
@@ -0,0 +1,237 @@
+"""
+Compute the summary statistics for the paper from analysis/results/data/reports.jsonl.
+
+Token counting logic:
+    For each assistant message in a trace, we count:
+    - INPUT  = tiktoken count of ALL preceding messages (system + user + prior assistant)
+    - OUTPUT = tiktoken count of that assistant message
+    This mirrors the actual API usage: every generation call sends the full
+    conversation prefix as input and produces one completion as output.
+
+Outputs:
+    1. Total tokens broken down by model and verbosity
+    2. Number of configuration--environment pairs
+    3. Estimated API cost for proprietary models (GPT-4o, Claude 4.5)
+    4. Malformed-response / scaffold-error rates per model
+"""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+import tiktoken
+from loguru import logger
+
+# USD per 1M tokens.
+PRICING = {
+    "gpt-4o": {"input": 2.50, "output": 10.00},
+    "claude-4.5": {"input": 3.00, "output": 15.00},
+    # gpt-oss-120b is self-hosted, no API cost
+}
+
+enc = tiktoken.get_encoding("o200k_base")
+
+
+def count_tokens(text: str) -> int:
+    """Return the o200k_base token count of *text* (0 for empty/None)."""
+    if not text:
+        return 0
+    return len(enc.encode(str(text)))
+
+
+DATA_PATH = Path(__file__).parent / "results" / "data" / "reports.jsonl"
+
+config_env_pairs = set()
+
+# tokens[model][verbosity] -> {"input": int, "output": int}
+tokens = defaultdict(lambda: defaultdict(lambda: {"input": 0, "output": 0}))
+
+# scaffold_errors[(model, agent_type)] -> error-message count / trial count
+scaffold_errors = defaultdict(lambda: {"errors": 0, "total_trials": 0})
+trials_affected_count = defaultdict(lambda: {"affected": 0, "total": 0})
+
+total_trials = 0
+
+with DATA_PATH.open() as f:
+    for line in f:
+        rec = json.loads(line)
+        model = rec["model"]
+        agent_type = rec["agent_type"]
+        env = rec["environment"]
+        verbosity = rec.get("Tool Verbosity", "")
+
+        config_env_pairs.add((model, agent_type, verbosity, env))
+
+        for task_data in rec["Task Results"].values():
+            for trial in task_data["trials"]:
+                total_trials += 1
+                msgs = trial.get("messages", [])
+
+                # Tokenize every message once, reuse counts
+                msg_tok = [count_tokens(m.get("content", "")) for m in msgs]
+
+                # For each assistant turn: input = sum of all prior, output = this turn
+                for i, m in enumerate(msgs):
+                    if m.get("role") == "assistant":
+                        tokens[model][verbosity]["input"] += sum(msg_tok[:i])
+                        tokens[model][verbosity]["output"] += msg_tok[i]
+
+                # Scaffold errors
+                key = (model, agent_type)
+                scaffold_errors[key]["total_trials"] += 1
+                trial_has_error = False
+                for m in msgs:
+                    content = str(m.get("content", ""))
+                    if m.get("role") == "user" and "No actions to execute" in content:
+                        scaffold_errors[key]["errors"] += 1
+                        trial_has_error = True
+                if agent_type == "react":
+                    trials_affected_count[model]["total"] += 1
+                    if trial_has_error:
+                        trials_affected_count[model]["affected"] += 1
+
+
+def fmt_tokens(n):
+    """Human-readable token count: 1234567 -> '1.2M'."""
+    if n >= 1e9:
+        return f"{n / 1e9:.1f}B"
+    if n >= 1e6:
+        return f"{n / 1e6:.1f}M"
+    if n >= 1e3:
+        return f"{n / 1e3:.1f}K"
+    return str(n)
+
+
+def agg_model(model):
+    """Total (input, output) tokens for *model* across all verbosities."""
+    inp = sum(v["input"] for v in tokens[model].values())
+    out = sum(v["output"] for v in tokens[model].values())
+    return inp, out
+
+
+def agg_all():
+    """Grand-total (input, output) tokens across all models/verbosities."""
+    inp = sum(v["input"] for m in tokens for v in tokens[m].values())
+    out = sum(v["output"] for m in tokens for v in tokens[m].values())
+    return inp, out
+
+
+def agg_verbosity(verb):
+    """Total (input, output) tokens for verbosity *verb* across all models."""
+    inp = sum(tokens[m][verb]["input"] for m in tokens)
+    out = sum(tokens[m][verb]["output"] for m in tokens)
+    return inp, out
+
+
+logger.info("=" * 80)
+logger.info("PAPER STATISTICS (from reports.jsonl)")
+logger.info("=" * 80)
+
+n_pairs = len(config_env_pairs)
+logger.info(f"\nConfiguration--environment pairs (N): {n_pairs}")
+logger.info(
+    f"  (x 5 trials each = {n_pairs * 5} expected trials, actual = {total_trials})"
+)
+
+all_models = sorted(tokens.keys())
+all_verbosities = sorted({v for m in tokens for v in tokens[m]})
+grand_in, grand_out = agg_all()
+
+logger.info(f"\n{'='*80}")
+logger.info(
+    "TOKEN COUNTS BY MODEL (input = all prior msgs per call, output = assistant msg)"
+)
+logger.info(f"{'='*80}")
+for m in all_models:
+    inp, out = agg_model(m)
+    logger.info(
+        f"  {m:15s}: input {inp:>14,} output {out:>14,} total {inp+out:>14,}"
+    )
+logger.info(
    f"  {'GRAND TOTAL':15s}: input {grand_in:>14,} output {grand_out:>14,} total {grand_in+grand_out:>14,}"
+)
+logger.info(f"\n  => Total tokens (all models): ~{fmt_tokens(grand_in + grand_out)}")
+
+logger.info(f"\n{'='*80}")
+logger.info("TOKEN COUNTS BY VERBOSITY")
+logger.info(f"{'='*80}")
+for verb in all_verbosities:
+    inp, out = agg_verbosity(verb)
+    logger.info(f"\n  Verbosity: {verb}")
+    for m in all_models:
+        vi = tokens[m][verb]["input"]
+        vo = tokens[m][verb]["output"]
+        logger.info(
+            f"    {m:15s}: input {vi:>14,} output {vo:>14,} total {vi+vo:>14,}"
+        )
+    logger.info(
+        f"    {'SUBTOTAL':15s}: input {inp:>14,} output {out:>14,} total {inp+out:>14,}"
+    )
+
+logger.info(f"\n{'='*80}")
+logger.info("FLAT TABLE: MODEL x VERBOSITY")
+logger.info(f"{'='*80}")
+header = (
+    f"  {'Model':15s} {'Verbosity':15s} {'Input':>14s} {'Output':>14s} {'Total':>14s}"
+)
+logger.info(header)
+logger.info(f"  {'-'*15} {'-'*15} {'-'*14} {'-'*14} {'-'*14}")
+for m in all_models:
+    for verb in all_verbosities:
+        vi = tokens[m][verb]["input"]
+        vo = tokens[m][verb]["output"]
+        logger.info(f"  {m:15s} {verb:15s} {vi:>14,} {vo:>14,} {vi+vo:>14,}")
+
+logger.info(f"\n{'='*80}")
+logger.info("ESTIMATED API COST (proprietary models only)")
+logger.info(f"{'='*80}")
+total_cost = 0.0
+for m, prices in PRICING.items():
+    inp, out = agg_model(m)
+    cost_in = inp / 1e6 * prices["input"]
+    cost_out = out / 1e6 * prices["output"]
+    cost = cost_in + cost_out
+    total_cost += cost
+    logger.info(
+        f"  {m:15s}: input ${cost_in:>10,.2f} output ${cost_out:>10,.2f} total ${cost:>10,.2f}"
+    )
+logger.info(f"  {'TOTAL':15s}: ${total_cost:>10,.2f}")
+
+logger.info("\n  Cost breakdown by verbosity:")
+for verb in all_verbosities:
+    verb_cost = 0.0
+    for m, prices in PRICING.items():
+        vi = tokens[m][verb]["input"]
+        vo = tokens[m][verb]["output"]
+        verb_cost += vi / 1e6 * prices["input"] + vo / 1e6 * prices["output"]
+    logger.info(f"    {verb:15s}: ${verb_cost:>10,.2f}")
+
+logger.info(f"\n{'='*80}")
+logger.info("MALFORMED-RESPONSE RATES (ReAct scaffold errors)")
+logger.info(f"{'='*80}")
+# Only the ReAct scaffold can emit "No actions to execute" errors.
+react_totals = {}
+for (m, at), val in scaffold_errors.items():
+    if at == "react":
+        react_totals[m] = val
+
+logger.info(
+    f"  {'Model':15s} {'Errors':>8s} {'Trials':>8s} {'Err/Trial':>10s} {'% Affected':>12s}"
+)
+for m in sorted(react_totals.keys()):
+    v = react_totals[m]
+    rate = v["errors"] / v["total_trials"] if v["total_trials"] else 0
+    ta = trials_affected_count.get(m, {"affected": 0, "total": 0})
+    pct = ta["affected"] / ta["total"] * 100 if ta["total"] else 0
+    logger.info(
+        f"  {m:15s} {v['errors']:>8d} {v['total_trials']:>8d} {rate:>10.3f} {pct:>11.1f}%"
+    )
+
+logger.info(f"\n{'='*80}")
+logger.info("LATEX-READY VALUES")
+logger.info(f"{'='*80}")
+logger.info(f"  Tokens: ~{fmt_tokens(grand_in + grand_out)}")
+logger.info(f"  N (config--env pairs): {n_pairs}")
+logger.info(f"  API cost: ${total_cost:,.0f}")
+logger.info("")
+logger.info("  Malformed-response rates (ReAct only):")
+for m in ["gpt-oss-120b", "gpt-4o", "claude-4.5"]:
+    if m in react_totals:
+        v = react_totals[m]
+        rate = v["errors"] / v["total_trials"] if v["total_trials"] else 0
+        ta = trials_affected_count.get(m, {"affected": 0, "total": 0})
+        pct = ta["affected"] / ta["total"] * 100 if ta["total"] else 0
+        logger.info(
+            f"  {m}: {rate:.2f} errors/trial, {pct:.1f}% of react trials affected"
+        )
diff --git a/analysis/illustrative_traces.tex b/analysis/illustrative_traces.tex
new file mode 100644
index 0000000000..187ce7c1b4
--- /dev/null
+++ b/analysis/illustrative_traces.tex
@@ -0,0 +1,371 @@
+% Illustrative trace excerpts for the four major reasoning-breakdown categories.
+% Requires: tcolorbox, tikz, xcolor, enumitem, listings +% Usage: \input{analysis/results/illustrative_traces.tex} + +\definecolor{colENU}{RGB}{198,219,239} % Evidence non-uptake — light blue +\definecolor{colUC}{RGB}{253,205,172} % Untested claim — light orange +\definecolor{colFB}{RGB}{203,213,232} % Fixed belief trace — light lavender +\definecolor{colCR}{RGB}{252,187,161} % Contradiction without repair — light red + +\definecolor{nodeE}{RGB}{86,146,194} % Evidence node badge +\definecolor{nodeH}{RGB}{196,124,58} % Hypothesis node badge +\definecolor{nodeJ}{RGB}{100,140,100} % Judgment node badge +\definecolor{nodeT}{RGB}{136,136,136} % Test node badge +\definecolor{nodeC}{RGB}{170,68,68} % Commitment node badge +\definecolor{nodeF}{RGB}{80,80,80} % Final-answer node badge + +\newcommand{\nodebadge}[1]{% + \tikz[baseline=(n.base)]{% + \node[circle, draw, fill=node#1!25, inner sep=0pt, + minimum size=9pt, font=\sffamily\bfseries\tiny] (n) {#1};% + }% +} + +\subsection{Illustrative traces} +\label{app:illustrative-traces} + +For each major breakdown category, we include one representative trace excerpt +with the supporting quote the annotator used to assign the pattern. For every +message in the excerpt, the epistemic nodes extracted by the annotator are +listed, together with the edges that link them. A +\nodebadge{H}~denotes a \emph{hypothesis}, +\nodebadge{T}~a \emph{test}, +\nodebadge{E}~an \emph{evidence} observation, +\nodebadge{J}~a \emph{judgment}, and +\nodebadge{C}~a \emph{commitment} (premature answer). +Missing edges that \emph{should} appear in a sound reasoning chain are marked +with \textcolor{red!70}{\textsf{\bfseries\small$\times$}}. +The full annotated traces, with node and edge graphs, filtering by pattern, +and alternative candidate annotations, are browsable at +\url{https://lamalab-org.github.io/corral}. 
+ +\begin{tcolorbox}[ + colback=colENU!8, colframe=colENU!60!black, + title={\small\sffamily\bfseries Evidence non-uptake}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colENU!25}, +] +\textbf{Model:} Claude Sonnet 4.5\quad +\textbf{Environment:} NMR structure elucidation (spectra)\quad +\textbf{Scope:} 1\quad +\textbf{Trial:} 20 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,11 \normalfont\textsf{(Observation)}] +\texttt{obtain\_isomers\_from\_molecular\_formula} +$\to$~20~candidate SMILES returned for C$_{18}$H$_{14}$O$_2$, including +\texttt{COC(=O)/C=C\textbackslash{}C1=CC2=\allowbreak{}CC3=CC=CC=C3C=C2C=C1} (the correct answer) +.\\[2pt] +\nodebadge{T}~N13 \emph{Obtain isomers for molecular formula C$_{18}$H$_{14}$O$_2$} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N14 \emph{20 isomers returned, incl.\ several with methyl ester groups} + +\item[Msg\,12 \normalfont\textsf{(Agent)}] +\enquote{\emph{I got 20 isomers with the molecular formula C$_{18}$H$_{14}$O$_2$. +Looking at the list, I can see several candidates. Let me analyze which one +matches our NMR data\,\ldots}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls \texttt{retrieve\_dbe\_formula}}\\[2pt] +No node extracted—the agent begins a tangential calculation instead of +using the isomer list. 
+ +\item[Msg\,14 \normalfont\textsf{(Agent)}] +\enquote{\emph{Let me recalculate the DBE for C$_{18}$H$_{14}$O$_2$\,\ldots\ Looking +back at the isomers list, let me focus on those with methyl ester +groups\,\ldots}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls \texttt{simulate\_spectra} for methyl 9-phenanthrenecarboxylate}\\[2pt] +\nodebadge{T}~N15 \emph{Simulate spectra for methyl 9-phenanthrenecarboxylate +to compare with experimental} +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeT!25] (T) {T}; +\node[nlbl] at (T.south) {N13}; +\node[ncirc, fill=nodeE!25, right=55pt of T] (E) {E}; +\node[nlbl] at (E.south) {N14}; +\node[ncirc, draw=gray!40, text=gray!40, right=55pt of E] (JH) {J/H}; +\draw[->, thick] (T) -- node[above, font=\footnotesize] {observes} (E); +\draw[->, gray!40, dashed, thick] (E) -- node[above, font=\footnotesize, text=gray] {informs} (JH); +\node[red!70, font=\bfseries\small] at ($(E)!0.5!(JH)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + +\smallskip\noindent +\textbf{Annotator quote:} +\emph{\enquote{\texttt{COC(=O)/C=C\textbackslash{}C1=CC2=\allowbreak{}CC3=CC=CC=C3C=C2C=C1}, +\texttt{COC(=O)C12C=CC(\allowbreak{}C3=CC=CC=C31)C4=CC=CC=C24}}} — the 20 candidate +structures include the correct answer, yet node~N14 has \textbf{zero outgoing +\textsf{informs} edges}: the isomer list is never used to constrain any +judgment or hypothesis. The agent instead guesses structures one by one +(phenanthrenecarboxylate, anthracenecarboxylate,~\ldots) without consulting +the evidence it already obtained. 
+ +\end{tcolorbox} + +\begin{tcolorbox}[ + colback=colUC!8, colframe=colUC!60!black, + title={\small\sffamily\bfseries Untested claim}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colUC!25}, +] +\textbf{Model:} Claude Sonnet 4.5\quad +\textbf{Environment:} NMR structure elucidation (spectra)\quad +\textbf{Scope:} 1\quad +\textbf{Trial:} 22 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,3 \normalfont\textsf{(Observation)}] +\texttt{proton\_nmr\_spectra} +$\to$~\emph{\enquote{Deltas 2.23 (s, 3H), 2.81 (t, $J$\,=\,7.1\,Hz, 2H), +2.98 (t, $J$\,=\,7.1\,Hz, 2H), 7.17 (ddd, 7H), 7.90 (ddd, 2H).}}\\[2pt] +\nodebadge{T}~N1 \emph{Get proton NMR spectra to gather structural information} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N2 \emph{Proton NMR: 2.23\,(s,\,3H), 2.81\,(t,\,2H), +2.98\,(t,\,2H), 7.17\,(ddd,\,7H), 7.90\,(ddd,\,2H)} + +\item[Msg\,4 \normalfont\textsf{(Agent)}] +\enquote{\emph{The two triplets at 2.81 and 2.98\,ppm suggest two CH$_2$ groups +coupled to each other (likely -CH$_2$-CH$_2$-). The aromatic signals suggest a +substituted benzene ring. The singlet at 2.23\,ppm suggests a methyl group +attached to a carbonyl (acetyl group).}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls +\texttt{mass\_spectrometry\_spectra}}\\[2pt] +\nodebadge{E}~N2 $\xrightarrow{\textsf{informs}}$ +\nodebadge{H}~N3 \emph{Two coupled CH$_2$ groups; singlet at 2.23\,ppm +suggests acetyl group; substituted benzene ring} + +\item[Msg\,5 \normalfont\textsf{(Observation)}] +\texttt{mass\_spectrometry\_spectra} $\to$ \emph{\enquote{m/z 224.12 (intensity +100)\,\ldots}}\\[2pt] +The agent moves directly to mass spectrometry—no test is designed to +evaluate the CH$_2$-CH$_2$ or acetyl-group hypothesis. 
+ +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeE!25] (E) {E}; +\node[nlbl] at (E.south) {N2}; +\node[ncirc, fill=nodeH!25, right=55pt of E] (H) {H}; +\node[nlbl] at (H.south) {N3}; +\node[ncirc, draw=gray!40, text=gray!40, right=55pt of H] (T) {T}; +\draw[->, thick] (E) -- node[above, font=\footnotesize] {informs} (H); +\draw[->, gray!40, dashed, thick] (H) -- node[above, font=\footnotesize, text=gray] {tests} (T); +\node[red!70, font=\bfseries\small] at ($(H)!0.5!(T)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + +\smallskip\noindent +\textbf{Annotator quote:} +\emph{\enquote{The two triplets at 2.81 and 2.98\,ppm suggest two CH$_2$ groups +coupled to each other (likely -CH$_2$-CH$_2$-). The aromatic signals +suggest a substituted benzene ring. The singlet at 2.23\,ppm suggests a +methyl group attached to a carbonyl (acetyl group).}} +— Node~N3 carries a detailed structural interpretation but has \textbf{zero +outgoing \textsf{tests} edges}. The agent asserts the presence of an acetyl +group and coupled CH$_2$ groups as fact, then immediately requests mass +spectrometry to determine molecular weight rather than designing an +experiment (e.g., simulating $^{13}$C NMR for a candidate) to verify +the claim. 
+ +\end{tcolorbox} + +\begin{tcolorbox}[ + colback=colFB!8, colframe=colFB!60!black, + title={\small\sffamily\bfseries Fixed belief trace}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colFB!25}, +] +\textbf{Model:} GPT-4o\quad +\textbf{Environment:} LAMMPS molecular dynamics (md)\quad +\textbf{Scope:} 2\quad +\textbf{Trial:} 33 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,15 \normalfont\textsf{(Observation)}] +\texttt{list\_files} $\to$ \emph{\enquote{\{\enquote{files}: [\enquote{/potentials/BKS}, +\enquote{/potentials/EAM}, \enquote{/potentials/SW}, \enquote{/potentials/TERSOFF}]\}}}\\[2pt] +\nodebadge{T}~N10 \emph{List files in /potentials/ to find correct SW +potential path} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N11 \emph{Potentials directory contains BKS, EAM, SW, TERSOFF +subdirectories} + +\item[Msg\,16 \normalfont\textsf{(Agent)}] +\enquote{\emph{The \texttt{/potentials/} directory contains a file named +\texttt{SW}, which likely corresponds to the Stillinger-Weber potential. 
+I will update the LAMMPS input script to use this file\,\ldots}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls \texttt{write\_file} +to set \texttt{pair\_coeff * * /potentials/SW Si}}\\[2pt] +\nodebadge{E}~N11 $\xrightarrow{\textsf{informs}}$ +\nodebadge{H}~N12 \emph{The SW potential file is located at /potentials/SW} + +\item[] {\color{gray} $\vdots$ \footnotesize\itshape three error messages and re-writes later} + +\item[Msg\,27 \normalfont\textsf{(Observation)}] +\texttt{run\_lammps} $\to$ \emph{\enquote{ERROR: Lost atoms: original 8 +current 0}}\\[2pt] +\nodebadge{T}~N33 \emph{Run LAMMPS simulation with updated script using +/potentials/SW/Si.sw} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N34 \emph{LAMMPS simulation failed: Lost atoms} + +\item[Msg\,28 \normalfont\textsf{(Agent)}] +\enquote{\emph{This might be due to the rapid heating rate or insufficient +equilibration. I will adjust the heating rate and ensure proper +equilibration\,\ldots}}\\[2pt] +\nodebadge{E}~N34 \;(no \textsf{informs} to any existing H)\\ +\nodebadge{H}~N35 \emph{Lost atoms error may be due to rapid heating rate +or insufficient equilibration} + +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeH!25] (H1) {H}; +\node[nlbl] at (H1.south) {N12}; +\node[ncirc, fill=nodeH!25, right=60pt of H1] (H2) {H}; +\node[nlbl] at (H2.south) {N19}; +\node[right=16pt of H2, font=\small, text=gray] {$\cdots$}; +\node[ncirc, fill=nodeH!25, right=75pt of H2] (H5) {H}; +\node[nlbl] at (H5.south) {N46}; +\node[ncirc, fill=nodeH!25, right=60pt of H5] (H6) {H}; +\node[nlbl] at (H6.south) {N57}; +\draw[->, gray!40, dashed, thick] (H1) -- node[above, font=\footnotesize, text=gray] {updates\_to} (H2); +\node[red!70, font=\bfseries\small] at ($(H1)!0.5!(H2)+(0,-0.3)$) {\texttimes}; +\draw[->, gray!40, dashed, 
thick] (H5) -- node[above, font=\footnotesize, text=gray] {updates\_to} (H6); +\node[red!70, font=\bfseries\small] at ($(H5)!0.5!(H6)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + +\smallskip\noindent +\textbf{Annotator quotes:}\\ +\noindent N12 ($\mathsf{msg\,16}$): \emph{\enquote{The \texttt{/potentials/} +directory contains a file named \texttt{SW}, which likely corresponds to the +Stillinger-Weber potential.}}\\ +\noindent N35 ($\mathsf{msg\,28}$): \emph{\enquote{This might be due to the rapid +heating rate or insufficient equilibration.}} + +\smallskip\noindent +The trace contains \textbf{6 hypothesis nodes and zero \textsf{updates\_to} +edges}. The first hypothesis~(N12) mistakes the directory +\texttt{/potentials/SW} for a file, producing an error that is never traced +back to the belief. After two more failures the agent introduces a second +independent hypothesis~(N35)—that lost atoms stem from heating rate—without +revising the earlier structural error. Both beliefs persist unmodified +through the remaining 15~messages. 
+ +\end{tcolorbox} + +\begin{tcolorbox}[ + colback=colCR!8, colframe=colCR!60!black, + title={\small\sffamily\bfseries Contradiction without repair}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colCR!25}, +] +\textbf{Model:} Claude Sonnet 4.5\quad +\textbf{Environment:} NMR structure elucidation (spectra)\quad +\textbf{Scope:} 2\quad +\textbf{Trial:} 45 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,17 \normalfont\textsf{(Observation)}] +\texttt{simulate\_spectra} for \texttt{CC(C)OC(=O)c1ccc\-cc1N(C)C} +$\to$~$^{1}$H NMR: 1.46\,(d,\,\textbf{6H}),\;2.97\,(s,\,6H),\; +4.75\,(p,\,1H),\;aromatic protons.\\[2pt] +\nodebadge{T}~N31 \emph{Simulate spectra for +\texttt{CC(C)OC(=O)c1ccccc1N(C)C}} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N25 \emph{Simulated: 1.46\,ppm (d,\,\textbf{6H}); +experimental: 1.43\,ppm (d,\,\textbf{3H})} + +\item[Msg\,18 \normalfont\textsf{(Agent)}] +\enquote{\emph{H-NMR: 1.46\,ppm (d,\,6H) — This is for isopropyl CH$_3$ groups, but +experimental shows 1.43\,ppm (d,\,3H). The experimental data has only 3H in +the doublet, not 6H. 
\textbf{So this isn't isopropyl ester.} The +experimental data clearly shows CH(CH$_3$), not CH(CH$_3$)$_2$.}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls +\texttt{get\_formula\_from\_smiles}}\\[2pt] +\nodebadge{E}~N25 $\xrightarrow{\textsf{informs}}$ +\nodebadge{J}~N33 \emph{Simulated isopropyl ester shows 6H doublet at +1.46\,ppm, but experimental shows only 3H doublet at 1.43\,ppm}\\[2pt] +\nodebadge{J}~N33 $\xrightarrow{\textsf{contradicts}}$ +\nodebadge{H}~N30 \emph{The compound is 2-(dimethylamino)benzoic acid +isopropyl ester} + +\item[Msg\,20 \normalfont\textsf{(Agent --- final answer)}] +\enquote{\emph{Simulations are approximations and can have errors\,\ldots\ +The key evidence: (1)~quartet at 5.05\,ppm, (2)~singlet at 2.69\,ppm +(6H) = N(CH$_3$)$_2$, (3)~4 aromatic protons, (4)~carbonyl at +168.30\,ppm.}} +\par\texttt{CC(C)OC(=O)c1ccc\-cc1N(C)C}\\[2pt] +\nodebadge{J}~N36 \emph{Despite the 3H vs.\,6H discrepancy, the overall +evidence supports the isopropyl ester}\\ +\nodebadge{C}~N37 \emph{Agent commits despite unresolved discrepancy} + +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeE!25] (E) {E}; +\node[nlbl] at (E.south) {N25}; +\node[ncirc, fill=nodeJ!25, right=55pt of E] (J) {J}; +\node[nlbl] at (J.south) {N33}; +\node[ncirc, fill=nodeH!25, right=55pt of J] (H) {H}; +\node[nlbl] at (H.south) {N30}; +\node[ncirc, draw=gray!40, text=gray!40, right=55pt of H] (Hrev) {H$'$}; +\draw[->, thick] (E) -- node[above, font=\footnotesize] {informs} (J); +\draw[->, thick, red!60!black] (J) -- node[above, font=\footnotesize] {contradicts} (H); +\draw[->, gray!40, dashed, thick] (H) -- node[above, font=\footnotesize, text=gray] {updates\_to} (Hrev); +\node[red!70, font=\bfseries\small] at ($(H)!0.5!(Hrev)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + 
+\smallskip\noindent +\textbf{Annotator quote:} +\emph{\enquote{H-NMR: 1.46\,ppm (d,\,6H) — This is for isopropyl CH$_3$ groups, +but experimental shows 1.43\,ppm (d,\,3H)\,\ldots\ The experimental data +has only 3H in the doublet, not 6H. So this isn't isopropyl ester.}} +— The agent \emph{explicitly} notes that the simulated $^{1}$H~NMR +contradicts its own hypothesis~(N30). Yet no revised hypothesis is generated: +the \textsf{contradicts} edge from~N33 to~N30 has \textbf{no accompanying +\textsf{updates\_to}~edge}. Instead, the agent dismisses the contradiction +as a simulation artefact and submits the isopropyl ester +(\texttt{CC(C)OC(=O)c1ccccc1N(C)C}) as the final answer~(N38). +\end{tcolorbox} From aa78aaca68ab6cd9afdcb7f83f5d604a22abc5d8 Mon Sep 17 00:00:00 2001 From: MrtinoRG Date: Tue, 21 Apr 2026 10:20:35 +0200 Subject: [PATCH 2/2] fix: update graph tables --- reasoning_reports/analyze.py | 101 ++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/reasoning_reports/analyze.py b/reasoning_reports/analyze.py index 3dafb2cf63..235d39ae0d 100644 --- a/reasoning_reports/analyze.py +++ b/reasoning_reports/analyze.py @@ -147,7 +147,7 @@ "[E -contradicts-> H, no updates_to/competes_with]." ), "premature_commitment": ( - "Hypothesis committed without intermediate testing " + "Commitment (explicit or inferred) to a hypothesis without testing it first " "[J -informs-> C, J -informs-> H, H with no tests]." ), "uninformative_test": ("Test produces no observed evidence [T with no E]."), @@ -156,11 +156,11 @@ ), "disconnected_evidence": ("Evidence node with no edges [Isolated E]."), "one_sided_confirmation": ( - "Commitment reached without considering contradicting evidence " + "Commitment (explicit or inferred) without contradicting evidence " "[J -informs-> C, J -informs-> H, H has support but no contradicts]." 
), "precommitted_test_plan": ( - "A commitment is done before evidence collection begins " + "Commitment (explicit or inferred) before evidence collection " "[C before E; then H -tests-> T]." ), } @@ -2707,28 +2707,57 @@ def _split_description(desc: str) -> tuple[str, str]: # Each entry is (merge_label | None, [pattern_keys]). # When *merge_label* is not None, \multirow groups the constituent rows. _SUBGRAPH_TABLE_ORDER: list[tuple[str | None, list[str]]] = [ - (None, [SG_REFUTATION_DRIVEN_BELIEF_REVISION]), ( - "Data-first hypothesis", - [SG_EXPLORE_THEN_TEST_TRANSITION, SG_EVIDENCE_LED_HYPOTHESIS_GENERATION], + "Hypothesis handling", + [ + SG_EVIDENCE_LED_HYPOTHESIS_GENERATION, + SG_HYPOTHESIS_RERANKING, + SG_REFUTATION_DRIVEN_BELIEF_REVISION, + SG_EXPLORE_THEN_TEST_TRANSITION, + ], ), - (None, [SG_HYPOTHESIS_RERANKING]), - (None, [SG_CONVERGENT_MULTI_TEST_EVIDENCE]), ( - "Iterative test refinement", - [SG_FIXED_HYPOTHESIS_TEST_TUNING, SG_EVIDENCE_GUIDED_TEST_REDESIGN], + "Evidence handling", + [ + SG_CONVERGENT_MULTI_TEST_EVIDENCE, + ], + ), + ( + "Inquiry control", + [ + SG_FIXED_HYPOTHESIS_TEST_TUNING, + SG_EVIDENCE_GUIDED_TEST_REDESIGN, + ], ), ] _ANTIPATTERN_TABLE_ORDER: list[tuple[str | None, list[str]]] = [ - ("Untested hypothesis", [AP_UNTESTED_CLAIM, AP_PREMATURE_COMMITMENT]), - ("Unused evidence", [AP_EVIDENCE_NON_UPTAKE, AP_DISCONNECTED_EVIDENCE]), - (None, [AP_UNSUPPORTED_JUDGMENT]), - (None, [AP_CONTRADICTION_WITHOUT_REPAIR]), - (None, [AP_UNINFORMATIVE_TEST]), - ("Absent/Stalled revision", [AP_STALLED_REVISION, AP_FIXED_BELIEF_TRACE]), - (None, [AP_ONE_SIDED_CONFIRMATION]), - (None, [AP_PRECOMMITTED_TEST_PLAN]), + ( + "Hypothesis handling", + [ + AP_UNTESTED_CLAIM, + AP_ONE_SIDED_CONFIRMATION, + AP_CONTRADICTION_WITHOUT_REPAIR, + AP_PREMATURE_COMMITMENT, + ], + ), + ( + "Evidence handling", + [ + AP_EVIDENCE_NON_UPTAKE, + AP_DISCONNECTED_EVIDENCE, + AP_UNSUPPORTED_JUDGMENT, + AP_UNINFORMATIVE_TEST, + ], + ), + ( + "Inquiry control", + [ + 
AP_FIXED_BELIEF_TRACE, + AP_PRECOMMITTED_TEST_PLAN, + AP_STALLED_REVISION, + ], + ), ] @@ -2743,29 +2772,28 @@ def _emit_group( for i, key in enumerate(keys): prose, graph_text = _split_description(descriptions[key]) prose = _latex_escape(prose) + name = _pretty_name(key) + name_and_desc = rf"\textbf{{{name}}}. {prose}" if tikz_patterns and key in tikz_patterns: graph = tikz_patterns[key] else: graph = _latex_escape(graph_text) - if n == 1: - name = _pretty_name(key) - rows.append(rf"{name} & {graph} & {prose} \\") + if i == 0: + rows.append( + rf"\multirow{{{n}}}{{=}}{{{merge_name}}} & {graph} & {name_and_desc} \\" + ) else: - if i == 0: - rows.append( - rf"\multirow{{{n}}}{{=}}{{{merge_name}}} & {graph} & {prose} \\" - ) - else: - rows.append(rf" & {graph} & {prose} \\") + rows.append(rf" & {graph} & {name_and_desc} \\") return rows def build_productive_motifs_latex() -> str: """Return a LaTeX tabularx table with definitions of productive motifs. - Column layout: Topic (p{2.2cm}), Graph (TikZ picture, c), Description (X). - Related patterns share a merged row label via `\\multirow`. Each graph - cell contains an inline TikZ diagram from `_TIKZ_SUBGRAPH_PATTERNS`. + Column layout: Group (p{2.2cm}), Graph (TikZ picture, c), Name + Description (X). + Patterns are grouped by reasoning capability (Hypothesis handling, + Evidence handling, Inquiry control) matching the GROUPS structure used + in analysis plots. Each graph cell contains an inline TikZ diagram. 
Returns: A string of LaTeX source for the complete tabularx environment, @@ -2775,7 +2803,7 @@ def build_productive_motifs_latex() -> str: lines.append(_TIKZ_STYLE_DEFS) lines.append(r"\begin{tabularx}{\textwidth}{p{2.2cm}cX}") lines.append(r"\toprule") - lines.append(r"Pattern & Graph & Description \\") + lines.append(r"Group & Graph & Description \\") lines.append(r"\midrule") for idx, (merge_name, keys) in enumerate(_SUBGRAPH_TABLE_ORDER): @@ -2795,9 +2823,10 @@ def build_productive_motifs_latex() -> str: def build_reasoning_breakdowns_latex() -> str: """Return a LaTeX tabularx table with definitions of reasoning breakdowns. - Column layout: Topic (X), Graph (TikZ picture, c), Description (X). - Related breakdowns share a merged row label via `\\multirow`. Each graph - cell contains an inline TikZ diagram from `_TIKZ_ANTIPATTERN_PATTERNS`. + Column layout: Group (p{2.2cm}), Graph (TikZ picture, c), Name + Description (X). + Patterns are grouped by reasoning capability (Hypothesis handling, + Evidence handling, Inquiry control) matching the GROUPS structure used + in analysis plots. Each graph cell contains an inline TikZ diagram. Returns: A string of LaTeX source for the complete tabularx environment, @@ -2805,9 +2834,9 @@ def build_reasoning_breakdowns_latex() -> str: """ lines: list[str] = [] lines.append(_TIKZ_STYLE_DEFS) - lines.append(r"\begin{tabularx}{\textwidth}{XcX}") + lines.append(r"\begin{tabularx}{\textwidth}{p{1.6cm}cX}") lines.append(r"\toprule") - lines.append(r"Pattern & Graph & Description \\") + lines.append(r"Group & Graph & Description \\") lines.append(r"\midrule") for idx, (merge_name, keys) in enumerate(_ANTIPATTERN_TABLE_ORDER):