From 759daf59931618c7d678046dec1161990165b1f5 Mon Sep 17 00:00:00 2001
From: MrtinoRG
Date: Sun, 19 Apr 2026 15:02:15 +0200
Subject: [PATCH 1/2] feat: add statistics + illustrative traces

---
 analysis/compute_paper_stats.py  | 237 ++++++++++++++++++++
 analysis/illustrative_traces.tex | 371 +++++++++++++++++++++++++++++++
 2 files changed, 608 insertions(+)
 create mode 100644 analysis/compute_paper_stats.py
 create mode 100644 analysis/illustrative_traces.tex

diff --git a/analysis/compute_paper_stats.py b/analysis/compute_paper_stats.py
new file mode 100644
index 0000000000..bceaee6fea
--- /dev/null
+++ b/analysis/compute_paper_stats.py
@@ -0,0 +1,237 @@
+"""
+Compute the summary statistics for the paper from analysis/results/data/reports.jsonl.
+
+Token counting logic:
+    For each assistant message in a trace, we count:
+    - INPUT  = tiktoken count of ALL preceding messages (system + user + prior assistant)
+    - OUTPUT = tiktoken count of that assistant message
+    This mirrors the actual API usage: every generation call sends the full
+    conversation prefix as input and produces one completion as output.
+
+Outputs:
+    1. Total tokens broken down by model and verbosity
+    2. Number of configuration--environment pairs
+    3. Estimated API cost for proprietary models (GPT-4o, Claude 4.5)
+    4. Malformed-response / scaffold-error rates per model
+"""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+import tiktoken
+from loguru import logger
+
+# USD per 1M tokens.
+PRICING = {
+    "gpt-4o": {"input": 2.50, "output": 10.00},
+    "claude-4.5": {"input": 3.00, "output": 15.00},
+    # gpt-oss-120b is self-hosted, no API cost
+}
+
+enc = tiktoken.get_encoding("o200k_base")
+
+
+def count_tokens(text: str) -> int:
+    """Return the o200k_base token count of *text* (0 for empty/None)."""
+    if not text:
+        return 0
+    return len(enc.encode(str(text)))
+
+
+DATA_PATH = Path(__file__).parent / "results" / "data" / "reports.jsonl"
+
+config_env_pairs = set()
+
+# tokens[model][verbosity] -> {"input": int, "output": int}
+tokens = defaultdict(lambda: defaultdict(lambda: {"input": 0, "output": 0}))
+
+# scaffold_errors[(model, agent_type)] -> error-message count / trial count
+scaffold_errors = defaultdict(lambda: {"errors": 0, "total_trials": 0})
+trials_affected_count = defaultdict(lambda: {"affected": 0, "total": 0})
+
+total_trials = 0
+
+with DATA_PATH.open() as f:
+    for line in f:
+        rec = json.loads(line)
+        model = rec["model"]
+        agent_type = rec["agent_type"]
+        env = rec["environment"]
+        verbosity = rec.get("Tool Verbosity", "")
+
+        config_env_pairs.add((model, agent_type, verbosity, env))
+
+        for task_data in rec["Task Results"].values():
+            for trial in task_data["trials"]:
+                total_trials += 1
+                msgs = trial.get("messages", [])
+
+                # Tokenize every message once, reuse counts
+                msg_tok = [count_tokens(m.get("content", "")) for m in msgs]
+
+                # For each assistant turn: input = sum of all prior, output = this turn
+                for i, m in enumerate(msgs):
+                    if m.get("role") == "assistant":
+                        tokens[model][verbosity]["input"] += sum(msg_tok[:i])
+                        tokens[model][verbosity]["output"] += msg_tok[i]
+
+                # Scaffold errors
+                key = (model, agent_type)
+                scaffold_errors[key]["total_trials"] += 1
+                trial_has_error = False
+                for m in msgs:
+                    content = str(m.get("content", ""))
+                    if m.get("role") == "user" and "No actions to execute" in content:
+                        scaffold_errors[key]["errors"] += 1
+                        trial_has_error = True
+                if agent_type == "react":
+                    trials_affected_count[model]["total"] += 1
+                    if trial_has_error:
+                        trials_affected_count[model]["affected"] += 1
+
+
+def fmt_tokens(n):
+    """Human-readable token count: 1234567 -> '1.2M'."""
+    if n >= 1e9:
+        return f"{n / 1e9:.1f}B"
+    if n >= 1e6:
+        return f"{n / 1e6:.1f}M"
+    if n >= 1e3:
+        return f"{n / 1e3:.1f}K"
+    return str(n)
+
+
+def agg_model(model):
+    """Total (input, output) tokens for *model* across all verbosities."""
+    inp = sum(v["input"] for v in tokens[model].values())
+    out = sum(v["output"] for v in tokens[model].values())
+    return inp, out
+
+
+def agg_all():
+    """Grand-total (input, output) tokens across all models/verbosities."""
+    inp = sum(v["input"] for m in tokens for v in tokens[m].values())
+    out = sum(v["output"] for m in tokens for v in tokens[m].values())
+    return inp, out
+
+
+def agg_verbosity(verb):
+    """Total (input, output) tokens for verbosity *verb* across all models."""
+    inp = sum(tokens[m][verb]["input"] for m in tokens)
+    out = sum(tokens[m][verb]["output"] for m in tokens)
+    return inp, out
+
+
+logger.info("=" * 80)
+logger.info("PAPER STATISTICS (from reports.jsonl)")
+logger.info("=" * 80)
+
+n_pairs = len(config_env_pairs)
+logger.info(f"\nConfiguration--environment pairs (N): {n_pairs}")
+logger.info(
+    f"  (x 5 trials each = {n_pairs * 5} expected trials, actual = {total_trials})"
+)
+
+all_models = sorted(tokens.keys())
+all_verbosities = sorted({v for m in tokens for v in tokens[m]})
+grand_in, grand_out = agg_all()
+
+logger.info(f"\n{'='*80}")
+logger.info(
+    "TOKEN COUNTS BY MODEL (input = all prior msgs per call, output = assistant msg)"
+)
+logger.info(f"{'='*80}")
+for m in all_models:
+    inp, out = agg_model(m)
+    logger.info(
+        f"  {m:15s}: input {inp:>14,} output {out:>14,} total {inp+out:>14,}"
+    )
+logger.info(
    f"  {'GRAND TOTAL':15s}: input {grand_in:>14,} output {grand_out:>14,} total {grand_in+grand_out:>14,}"
+)
+logger.info(f"\n  => Total tokens (all models): ~{fmt_tokens(grand_in + grand_out)}")
+
+logger.info(f"\n{'='*80}")
+logger.info("TOKEN COUNTS BY VERBOSITY")
+logger.info(f"{'='*80}")
+for verb in all_verbosities:
+    inp, out = agg_verbosity(verb)
+    logger.info(f"\n  Verbosity: {verb}")
+    for m in all_models:
+        vi = tokens[m][verb]["input"]
+        vo = tokens[m][verb]["output"]
+        logger.info(
+            f"    {m:15s}: input {vi:>14,} output {vo:>14,} total {vi+vo:>14,}"
+        )
+    logger.info(
+        f"    {'SUBTOTAL':15s}: input {inp:>14,} output {out:>14,} total {inp+out:>14,}"
+    )
+
+logger.info(f"\n{'='*80}")
+logger.info("FLAT TABLE: MODEL x VERBOSITY")
+logger.info(f"{'='*80}")
+header = (
+    f"  {'Model':15s} {'Verbosity':15s} {'Input':>14s} {'Output':>14s} {'Total':>14s}"
+)
+logger.info(header)
+logger.info(f"  {'-'*15} {'-'*15} {'-'*14} {'-'*14} {'-'*14}")
+for m in all_models:
+    for verb in all_verbosities:
+        vi = tokens[m][verb]["input"]
+        vo = tokens[m][verb]["output"]
+        logger.info(f"  {m:15s} {verb:15s} {vi:>14,} {vo:>14,} {vi+vo:>14,}")
+
+logger.info(f"\n{'='*80}")
+logger.info("ESTIMATED API COST (proprietary models only)")
+logger.info(f"{'='*80}")
+total_cost = 0.0
+for m, prices in PRICING.items():
+    inp, out = agg_model(m)
+    cost_in = inp / 1e6 * prices["input"]
+    cost_out = out / 1e6 * prices["output"]
+    cost = cost_in + cost_out
+    total_cost += cost
+    logger.info(
+        f"  {m:15s}: input ${cost_in:>10,.2f} output ${cost_out:>10,.2f} total ${cost:>10,.2f}"
+    )
+logger.info(f"  {'TOTAL':15s}: ${total_cost:>10,.2f}")
+
+logger.info("\n  Cost breakdown by verbosity:")
+for verb in all_verbosities:
+    verb_cost = 0.0
+    for m, prices in PRICING.items():
+        vi = tokens[m][verb]["input"]
+        vo = tokens[m][verb]["output"]
+        verb_cost += vi / 1e6 * prices["input"] + vo / 1e6 * prices["output"]
+    logger.info(f"    {verb:15s}: ${verb_cost:>10,.2f}")
+
+logger.info(f"\n{'='*80}")
+logger.info("MALFORMED-RESPONSE RATES (ReAct scaffold errors)")
+logger.info(f"{'='*80}")
+# Only the ReAct scaffold can emit "No actions to execute" errors.
+react_totals = {}
+for (m, at), val in scaffold_errors.items():
+    if at == "react":
+        react_totals[m] = val
+
+logger.info(
+    f"  {'Model':15s} {'Errors':>8s} {'Trials':>8s} {'Err/Trial':>10s} {'% Affected':>12s}"
+)
+for m in sorted(react_totals.keys()):
+    v = react_totals[m]
+    rate = v["errors"] / v["total_trials"] if v["total_trials"] else 0
+    ta = trials_affected_count.get(m, {"affected": 0, "total": 0})
+    pct = ta["affected"] / ta["total"] * 100 if ta["total"] else 0
+    logger.info(
+        f"  {m:15s} {v['errors']:>8d} {v['total_trials']:>8d} {rate:>10.3f} {pct:>11.1f}%"
+    )
+
+logger.info(f"\n{'='*80}")
+logger.info("LATEX-READY VALUES")
+logger.info(f"{'='*80}")
+logger.info(f"  Tokens: ~{fmt_tokens(grand_in + grand_out)}")
+logger.info(f"  N (config--env pairs): {n_pairs}")
+logger.info(f"  API cost: ${total_cost:,.0f}")
+logger.info("")
+logger.info("  Malformed-response rates (ReAct only):")
+for m in ["gpt-oss-120b", "gpt-4o", "claude-4.5"]:
+    if m in react_totals:
+        v = react_totals[m]
+        rate = v["errors"] / v["total_trials"] if v["total_trials"] else 0
+        ta = trials_affected_count.get(m, {"affected": 0, "total": 0})
+        pct = ta["affected"] / ta["total"] * 100 if ta["total"] else 0
+        logger.info(
+            f"  {m}: {rate:.2f} errors/trial, {pct:.1f}% of react trials affected"
+        )
diff --git a/analysis/illustrative_traces.tex b/analysis/illustrative_traces.tex
new file mode 100644
index 0000000000..187ce7c1b4
--- /dev/null
+++ b/analysis/illustrative_traces.tex
@@ -0,0 +1,371 @@
+% Illustrative trace excerpts for the four major reasoning-breakdown categories.
+% Requires: tcolorbox, tikz, xcolor, enumitem, listings +% Usage: \input{analysis/results/illustrative_traces.tex} + +\definecolor{colENU}{RGB}{198,219,239} % Evidence non-uptake — light blue +\definecolor{colUC}{RGB}{253,205,172} % Untested claim — light orange +\definecolor{colFB}{RGB}{203,213,232} % Fixed belief trace — light lavender +\definecolor{colCR}{RGB}{252,187,161} % Contradiction without repair — light red + +\definecolor{nodeE}{RGB}{86,146,194} % Evidence node badge +\definecolor{nodeH}{RGB}{196,124,58} % Hypothesis node badge +\definecolor{nodeJ}{RGB}{100,140,100} % Judgment node badge +\definecolor{nodeT}{RGB}{136,136,136} % Test node badge +\definecolor{nodeC}{RGB}{170,68,68} % Commitment node badge +\definecolor{nodeF}{RGB}{80,80,80} % Final-answer node badge + +\newcommand{\nodebadge}[1]{% + \tikz[baseline=(n.base)]{% + \node[circle, draw, fill=node#1!25, inner sep=0pt, + minimum size=9pt, font=\sffamily\bfseries\tiny] (n) {#1};% + }% +} + +\subsection{Illustrative traces} +\label{app:illustrative-traces} + +For each major breakdown category, we include one representative trace excerpt +with the supporting quote the annotator used to assign the pattern. For every +message in the excerpt, the epistemic nodes extracted by the annotator are +listed, together with the edges that link them. A +\nodebadge{H}~denotes a \emph{hypothesis}, +\nodebadge{T}~a \emph{test}, +\nodebadge{E}~an \emph{evidence} observation, +\nodebadge{J}~a \emph{judgment}, and +\nodebadge{C}~a \emph{commitment} (premature answer). +Missing edges that \emph{should} appear in a sound reasoning chain are marked +with \textcolor{red!70}{\textsf{\bfseries\small$\times$}}. +The full annotated traces, with node and edge graphs, filtering by pattern, +and alternative candidate annotations, are browsable at +\url{https://lamalab-org.github.io/corral}. 
+ +\begin{tcolorbox}[ + colback=colENU!8, colframe=colENU!60!black, + title={\small\sffamily\bfseries Evidence non-uptake}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colENU!25}, +] +\textbf{Model:} Claude Sonnet 4.5\quad +\textbf{Environment:} NMR structure elucidation (spectra)\quad +\textbf{Scope:} 1\quad +\textbf{Trial:} 20 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,11 \normalfont\textsf{(Observation)}] +\texttt{obtain\_isomers\_from\_molecular\_formula} +$\to$~20~candidate SMILES returned for C$_{18}$H$_{14}$O$_2$, including +\texttt{COC(=O)/C=C\textbackslash{}C1=CC2=\allowbreak{}CC3=CC=CC=C3C=C2C=C1} (the correct answer) +.\\[2pt] +\nodebadge{T}~N13 \emph{Obtain isomers for molecular formula C$_{18}$H$_{14}$O$_2$} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N14 \emph{20 isomers returned, incl.\ several with methyl ester groups} + +\item[Msg\,12 \normalfont\textsf{(Agent)}] +\enquote{\emph{I got 20 isomers with the molecular formula C$_{18}$H$_{14}$O$_2$. +Looking at the list, I can see several candidates. Let me analyze which one +matches our NMR data\,\ldots}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls \texttt{retrieve\_dbe\_formula}}\\[2pt] +No node extracted—the agent begins a tangential calculation instead of +using the isomer list. 
+ +\item[Msg\,14 \normalfont\textsf{(Agent)}] +\enquote{\emph{Let me recalculate the DBE for C$_{18}$H$_{14}$O$_2$\,\ldots\ Looking +back at the isomers list, let me focus on those with methyl ester +groups\,\ldots}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls \texttt{simulate\_spectra} for methyl 9-phenanthrenecarboxylate}\\[2pt] +\nodebadge{T}~N15 \emph{Simulate spectra for methyl 9-phenanthrenecarboxylate +to compare with experimental} +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeT!25] (T) {T}; +\node[nlbl] at (T.south) {N13}; +\node[ncirc, fill=nodeE!25, right=55pt of T] (E) {E}; +\node[nlbl] at (E.south) {N14}; +\node[ncirc, draw=gray!40, text=gray!40, right=55pt of E] (JH) {J/H}; +\draw[->, thick] (T) -- node[above, font=\footnotesize] {observes} (E); +\draw[->, gray!40, dashed, thick] (E) -- node[above, font=\footnotesize, text=gray] {informs} (JH); +\node[red!70, font=\bfseries\small] at ($(E)!0.5!(JH)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + +\smallskip\noindent +\textbf{Annotator quote:} +\emph{\enquote{\texttt{COC(=O)/C=C\textbackslash{}C1=CC2=\allowbreak{}CC3=CC=CC=C3C=C2C=C1}, +\texttt{COC(=O)C12C=CC(\allowbreak{}C3=CC=CC=C31)C4=CC=CC=C24}}} — the 20 candidate +structures include the correct answer, yet node~N14 has \textbf{zero outgoing +\textsf{informs} edges}: the isomer list is never used to constrain any +judgment or hypothesis. The agent instead guesses structures one by one +(phenanthrenecarboxylate, anthracenecarboxylate,~\ldots) without consulting +the evidence it already obtained. 
+ +\end{tcolorbox} + +\begin{tcolorbox}[ + colback=colUC!8, colframe=colUC!60!black, + title={\small\sffamily\bfseries Untested claim}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colUC!25}, +] +\textbf{Model:} Claude Sonnet 4.5\quad +\textbf{Environment:} NMR structure elucidation (spectra)\quad +\textbf{Scope:} 1\quad +\textbf{Trial:} 22 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,3 \normalfont\textsf{(Observation)}] +\texttt{proton\_nmr\_spectra} +$\to$~\emph{\enquote{Deltas 2.23 (s, 3H), 2.81 (t, $J$\,=\,7.1\,Hz, 2H), +2.98 (t, $J$\,=\,7.1\,Hz, 2H), 7.17 (ddd, 7H), 7.90 (ddd, 2H).}}\\[2pt] +\nodebadge{T}~N1 \emph{Get proton NMR spectra to gather structural information} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N2 \emph{Proton NMR: 2.23\,(s,\,3H), 2.81\,(t,\,2H), +2.98\,(t,\,2H), 7.17\,(ddd,\,7H), 7.90\,(ddd,\,2H)} + +\item[Msg\,4 \normalfont\textsf{(Agent)}] +\enquote{\emph{The two triplets at 2.81 and 2.98\,ppm suggest two CH$_2$ groups +coupled to each other (likely -CH$_2$-CH$_2$-). The aromatic signals suggest a +substituted benzene ring. The singlet at 2.23\,ppm suggests a methyl group +attached to a carbonyl (acetyl group).}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls +\texttt{mass\_spectrometry\_spectra}}\\[2pt] +\nodebadge{E}~N2 $\xrightarrow{\textsf{informs}}$ +\nodebadge{H}~N3 \emph{Two coupled CH$_2$ groups; singlet at 2.23\,ppm +suggests acetyl group; substituted benzene ring} + +\item[Msg\,5 \normalfont\textsf{(Observation)}] +\texttt{mass\_spectrometry\_spectra} $\to$ \emph{\enquote{m/z 224.12 (intensity +100)\,\ldots}}\\[2pt] +The agent moves directly to mass spectrometry—no test is designed to +evaluate the CH$_2$-CH$_2$ or acetyl-group hypothesis. 
+ +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeE!25] (E) {E}; +\node[nlbl] at (E.south) {N2}; +\node[ncirc, fill=nodeH!25, right=55pt of E] (H) {H}; +\node[nlbl] at (H.south) {N3}; +\node[ncirc, draw=gray!40, text=gray!40, right=55pt of H] (T) {T}; +\draw[->, thick] (E) -- node[above, font=\footnotesize] {informs} (H); +\draw[->, gray!40, dashed, thick] (H) -- node[above, font=\footnotesize, text=gray] {tests} (T); +\node[red!70, font=\bfseries\small] at ($(H)!0.5!(T)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + +\smallskip\noindent +\textbf{Annotator quote:} +\emph{\enquote{The two triplets at 2.81 and 2.98\,ppm suggest two CH$_2$ groups +coupled to each other (likely -CH$_2$-CH$_2$-). The aromatic signals +suggest a substituted benzene ring. The singlet at 2.23\,ppm suggests a +methyl group attached to a carbonyl (acetyl group).}} +— Node~N3 carries a detailed structural interpretation but has \textbf{zero +outgoing \textsf{tests} edges}. The agent asserts the presence of an acetyl +group and coupled CH$_2$ groups as fact, then immediately requests mass +spectrometry to determine molecular weight rather than designing an +experiment (e.g., simulating $^{13}$C NMR for a candidate) to verify +the claim. 
+ +\end{tcolorbox} + +\begin{tcolorbox}[ + colback=colFB!8, colframe=colFB!60!black, + title={\small\sffamily\bfseries Fixed belief trace}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colFB!25}, +] +\textbf{Model:} GPT-4o\quad +\textbf{Environment:} LAMMPS molecular dynamics (md)\quad +\textbf{Scope:} 2\quad +\textbf{Trial:} 33 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,15 \normalfont\textsf{(Observation)}] +\texttt{list\_files} $\to$ \emph{\enquote{\{\enquote{files}: [\enquote{/potentials/BKS}, +\enquote{/potentials/EAM}, \enquote{/potentials/SW}, \enquote{/potentials/TERSOFF}]\}}}\\[2pt] +\nodebadge{T}~N10 \emph{List files in /potentials/ to find correct SW +potential path} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N11 \emph{Potentials directory contains BKS, EAM, SW, TERSOFF +subdirectories} + +\item[Msg\,16 \normalfont\textsf{(Agent)}] +\enquote{\emph{The \texttt{/potentials/} directory contains a file named +\texttt{SW}, which likely corresponds to the Stillinger-Weber potential. 
+I will update the LAMMPS input script to use this file\,\ldots}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls \texttt{write\_file} +to set \texttt{pair\_coeff * * /potentials/SW Si}}\\[2pt] +\nodebadge{E}~N11 $\xrightarrow{\textsf{informs}}$ +\nodebadge{H}~N12 \emph{The SW potential file is located at /potentials/SW} + +\item[] {\color{gray} $\vdots$ \footnotesize\itshape three error messages and re-writes later} + +\item[Msg\,27 \normalfont\textsf{(Observation)}] +\texttt{run\_lammps} $\to$ \emph{\enquote{ERROR: Lost atoms: original 8 +current 0}}\\[2pt] +\nodebadge{T}~N33 \emph{Run LAMMPS simulation with updated script using +/potentials/SW/Si.sw} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N34 \emph{LAMMPS simulation failed: Lost atoms} + +\item[Msg\,28 \normalfont\textsf{(Agent)}] +\enquote{\emph{This might be due to the rapid heating rate or insufficient +equilibration. I will adjust the heating rate and ensure proper +equilibration\,\ldots}}\\[2pt] +\nodebadge{E}~N34 \;(no \textsf{informs} to any existing H)\\ +\nodebadge{H}~N35 \emph{Lost atoms error may be due to rapid heating rate +or insufficient equilibration} + +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeH!25] (H1) {H}; +\node[nlbl] at (H1.south) {N12}; +\node[ncirc, fill=nodeH!25, right=60pt of H1] (H2) {H}; +\node[nlbl] at (H2.south) {N19}; +\node[right=16pt of H2, font=\small, text=gray] {$\cdots$}; +\node[ncirc, fill=nodeH!25, right=75pt of H2] (H5) {H}; +\node[nlbl] at (H5.south) {N46}; +\node[ncirc, fill=nodeH!25, right=60pt of H5] (H6) {H}; +\node[nlbl] at (H6.south) {N57}; +\draw[->, gray!40, dashed, thick] (H1) -- node[above, font=\footnotesize, text=gray] {updates\_to} (H2); +\node[red!70, font=\bfseries\small] at ($(H1)!0.5!(H2)+(0,-0.3)$) {\texttimes}; +\draw[->, gray!40, dashed, 
thick] (H5) -- node[above, font=\footnotesize, text=gray] {updates\_to} (H6); +\node[red!70, font=\bfseries\small] at ($(H5)!0.5!(H6)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + +\smallskip\noindent +\textbf{Annotator quotes:}\\ +\noindent N12 ($\mathsf{msg\,16}$): \emph{\enquote{The \texttt{/potentials/} +directory contains a file named \texttt{SW}, which likely corresponds to the +Stillinger-Weber potential.}}\\ +\noindent N35 ($\mathsf{msg\,28}$): \emph{\enquote{This might be due to the rapid +heating rate or insufficient equilibration.}} + +\smallskip\noindent +The trace contains \textbf{6 hypothesis nodes and zero \textsf{updates\_to} +edges}. The first hypothesis~(N12) mistakes the directory +\texttt{/potentials/SW} for a file, producing an error that is never traced +back to the belief. After two more failures the agent introduces a second +independent hypothesis~(N35)—that lost atoms stem from heating rate—without +revising the earlier structural error. Both beliefs persist unmodified +through the remaining 15~messages. 
+ +\end{tcolorbox} + +\begin{tcolorbox}[ + colback=colCR!8, colframe=colCR!60!black, + title={\small\sffamily\bfseries Contradiction without repair}, + fontupper=\small, + left=4pt, right=4pt, top=4pt, bottom=4pt, + boxrule=0.4pt, arc=2pt, + subtitle style={colback=colCR!25}, +] +\textbf{Model:} Claude Sonnet 4.5\quad +\textbf{Environment:} NMR structure elucidation (spectra)\quad +\textbf{Scope:} 2\quad +\textbf{Trial:} 45 + +\tcblower +\begin{description}[font=\sffamily\bfseries\small, leftmargin=12pt, style=sameline, itemsep=6pt] + +\item[Msg\,17 \normalfont\textsf{(Observation)}] +\texttt{simulate\_spectra} for \texttt{CC(C)OC(=O)c1ccc\-cc1N(C)C} +$\to$~$^{1}$H NMR: 1.46\,(d,\,\textbf{6H}),\;2.97\,(s,\,6H),\; +4.75\,(p,\,1H),\;aromatic protons.\\[2pt] +\nodebadge{T}~N31 \emph{Simulate spectra for +\texttt{CC(C)OC(=O)c1ccccc1N(C)C}} +$\xrightarrow{\textsf{observes}}$ +\nodebadge{E}~N25 \emph{Simulated: 1.46\,ppm (d,\,\textbf{6H}); +experimental: 1.43\,ppm (d,\,\textbf{3H})} + +\item[Msg\,18 \normalfont\textsf{(Agent)}] +\enquote{\emph{H-NMR: 1.46\,ppm (d,\,6H) — This is for isopropyl CH$_3$ groups, but +experimental shows 1.43\,ppm (d,\,3H). The experimental data has only 3H in +the doublet, not 6H. 
\textbf{So this isn't isopropyl ester.} The +experimental data clearly shows CH(CH$_3$), not CH(CH$_3$)$_2$.}}\\ +\textcolor{gray}{\footnotesize $\rightarrow$~calls +\texttt{get\_formula\_from\_smiles}}\\[2pt] +\nodebadge{E}~N25 $\xrightarrow{\textsf{informs}}$ +\nodebadge{J}~N33 \emph{Simulated isopropyl ester shows 6H doublet at +1.46\,ppm, but experimental shows only 3H doublet at 1.43\,ppm}\\[2pt] +\nodebadge{J}~N33 $\xrightarrow{\textsf{contradicts}}$ +\nodebadge{H}~N30 \emph{The compound is 2-(dimethylamino)benzoic acid +isopropyl ester} + +\item[Msg\,20 \normalfont\textsf{(Agent --- final answer)}] +\enquote{\emph{Simulations are approximations and can have errors\,\ldots\ +The key evidence: (1)~quartet at 5.05\,ppm, (2)~singlet at 2.69\,ppm +(6H) = N(CH$_3$)$_2$, (3)~4 aromatic protons, (4)~carbonyl at +168.30\,ppm.}} +\par\texttt{CC(C)OC(=O)c1ccc\-cc1N(C)C}\\[2pt] +\nodebadge{J}~N36 \emph{Despite the 3H vs.\,6H discrepancy, the overall +evidence supports the isopropyl ester}\\ +\nodebadge{C}~N37 \emph{Agent commits despite unresolved discrepancy} + +\end{description} + +\medskip +\begin{center} +\begin{tikzpicture}[ + ncirc/.style={circle, draw, inner sep=1pt, minimum size=18pt, + font=\small\sffamily\bfseries}, + nlbl/.style={font=\footnotesize, below=2pt}, +] +\node[ncirc, fill=nodeE!25] (E) {E}; +\node[nlbl] at (E.south) {N25}; +\node[ncirc, fill=nodeJ!25, right=55pt of E] (J) {J}; +\node[nlbl] at (J.south) {N33}; +\node[ncirc, fill=nodeH!25, right=55pt of J] (H) {H}; +\node[nlbl] at (H.south) {N30}; +\node[ncirc, draw=gray!40, text=gray!40, right=55pt of H] (Hrev) {H$'$}; +\draw[->, thick] (E) -- node[above, font=\footnotesize] {informs} (J); +\draw[->, thick, red!60!black] (J) -- node[above, font=\footnotesize] {contradicts} (H); +\draw[->, gray!40, dashed, thick] (H) -- node[above, font=\footnotesize, text=gray] {updates\_to} (Hrev); +\node[red!70, font=\bfseries\small] at ($(H)!0.5!(Hrev)+(0,-0.3)$) {\texttimes}; +\end{tikzpicture} +\end{center} + 
+\smallskip\noindent +\textbf{Annotator quote:} +\emph{\enquote{H-NMR: 1.46\,ppm (d,\,6H) — This is for isopropyl CH$_3$ groups, +but experimental shows 1.43\,ppm (d,\,3H)\,\ldots\ The experimental data +has only 3H in the doublet, not 6H. So this isn't isopropyl ester.}} +— The agent \emph{explicitly} notes that the simulated $^{1}$H~NMR +contradicts its own hypothesis~(N30). Yet no revised hypothesis is generated: +the \textsf{contradicts} edge from~N33 to~N30 has \textbf{no accompanying +\textsf{updates\_to}~edge}. Instead, the agent dismisses the contradiction +as a simulation artefact and submits the isopropyl ester +(\texttt{CC(C)OC(=O)c1ccccc1N(C)C}) as the final answer~(N38). +\end{tcolorbox} From aa78aaca68ab6cd9afdcb7f83f5d604a22abc5d8 Mon Sep 17 00:00:00 2001 From: MrtinoRG Date: Tue, 21 Apr 2026 10:20:35 +0200 Subject: [PATCH 2/2] fix: update graph tables --- reasoning_reports/analyze.py | 101 ++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/reasoning_reports/analyze.py b/reasoning_reports/analyze.py index 3dafb2cf63..235d39ae0d 100644 --- a/reasoning_reports/analyze.py +++ b/reasoning_reports/analyze.py @@ -147,7 +147,7 @@ "[E -contradicts-> H, no updates_to/competes_with]." ), "premature_commitment": ( - "Hypothesis committed without intermediate testing " + "Commitment (explicit or inferred) to a hypothesis without testing it first " "[J -informs-> C, J -informs-> H, H with no tests]." ), "uninformative_test": ("Test produces no observed evidence [T with no E]."), @@ -156,11 +156,11 @@ ), "disconnected_evidence": ("Evidence node with no edges [Isolated E]."), "one_sided_confirmation": ( - "Commitment reached without considering contradicting evidence " + "Commitment (explicit or inferred) without contradicting evidence " "[J -informs-> C, J -informs-> H, H has support but no contradicts]." 
), "precommitted_test_plan": ( - "A commitment is done before evidence collection begins " + "Commitment (explicit or inferred) before evidence collection " "[C before E; then H -tests-> T]." ), } @@ -2707,28 +2707,57 @@ def _split_description(desc: str) -> tuple[str, str]: # Each entry is (merge_label | None, [pattern_keys]). # When *merge_label* is not None, \multirow groups the constituent rows. _SUBGRAPH_TABLE_ORDER: list[tuple[str | None, list[str]]] = [ - (None, [SG_REFUTATION_DRIVEN_BELIEF_REVISION]), ( - "Data-first hypothesis", - [SG_EXPLORE_THEN_TEST_TRANSITION, SG_EVIDENCE_LED_HYPOTHESIS_GENERATION], + "Hypothesis handling", + [ + SG_EVIDENCE_LED_HYPOTHESIS_GENERATION, + SG_HYPOTHESIS_RERANKING, + SG_REFUTATION_DRIVEN_BELIEF_REVISION, + SG_EXPLORE_THEN_TEST_TRANSITION, + ], ), - (None, [SG_HYPOTHESIS_RERANKING]), - (None, [SG_CONVERGENT_MULTI_TEST_EVIDENCE]), ( - "Iterative test refinement", - [SG_FIXED_HYPOTHESIS_TEST_TUNING, SG_EVIDENCE_GUIDED_TEST_REDESIGN], + "Evidence handling", + [ + SG_CONVERGENT_MULTI_TEST_EVIDENCE, + ], + ), + ( + "Inquiry control", + [ + SG_FIXED_HYPOTHESIS_TEST_TUNING, + SG_EVIDENCE_GUIDED_TEST_REDESIGN, + ], ), ] _ANTIPATTERN_TABLE_ORDER: list[tuple[str | None, list[str]]] = [ - ("Untested hypothesis", [AP_UNTESTED_CLAIM, AP_PREMATURE_COMMITMENT]), - ("Unused evidence", [AP_EVIDENCE_NON_UPTAKE, AP_DISCONNECTED_EVIDENCE]), - (None, [AP_UNSUPPORTED_JUDGMENT]), - (None, [AP_CONTRADICTION_WITHOUT_REPAIR]), - (None, [AP_UNINFORMATIVE_TEST]), - ("Absent/Stalled revision", [AP_STALLED_REVISION, AP_FIXED_BELIEF_TRACE]), - (None, [AP_ONE_SIDED_CONFIRMATION]), - (None, [AP_PRECOMMITTED_TEST_PLAN]), + ( + "Hypothesis handling", + [ + AP_UNTESTED_CLAIM, + AP_ONE_SIDED_CONFIRMATION, + AP_CONTRADICTION_WITHOUT_REPAIR, + AP_PREMATURE_COMMITMENT, + ], + ), + ( + "Evidence handling", + [ + AP_EVIDENCE_NON_UPTAKE, + AP_DISCONNECTED_EVIDENCE, + AP_UNSUPPORTED_JUDGMENT, + AP_UNINFORMATIVE_TEST, + ], + ), + ( + "Inquiry control", + [ + 
AP_FIXED_BELIEF_TRACE, + AP_PRECOMMITTED_TEST_PLAN, + AP_STALLED_REVISION, + ], + ), ] @@ -2743,29 +2772,28 @@ def _emit_group( for i, key in enumerate(keys): prose, graph_text = _split_description(descriptions[key]) prose = _latex_escape(prose) + name = _pretty_name(key) + name_and_desc = rf"\textbf{{{name}}}. {prose}" if tikz_patterns and key in tikz_patterns: graph = tikz_patterns[key] else: graph = _latex_escape(graph_text) - if n == 1: - name = _pretty_name(key) - rows.append(rf"{name} & {graph} & {prose} \\") + if i == 0: + rows.append( + rf"\multirow{{{n}}}{{=}}{{{merge_name}}} & {graph} & {name_and_desc} \\" + ) else: - if i == 0: - rows.append( - rf"\multirow{{{n}}}{{=}}{{{merge_name}}} & {graph} & {prose} \\" - ) - else: - rows.append(rf" & {graph} & {prose} \\") + rows.append(rf" & {graph} & {name_and_desc} \\") return rows def build_productive_motifs_latex() -> str: """Return a LaTeX tabularx table with definitions of productive motifs. - Column layout: Topic (p{2.2cm}), Graph (TikZ picture, c), Description (X). - Related patterns share a merged row label via `\\multirow`. Each graph - cell contains an inline TikZ diagram from `_TIKZ_SUBGRAPH_PATTERNS`. + Column layout: Group (p{2.2cm}), Graph (TikZ picture, c), Name + Description (X). + Patterns are grouped by reasoning capability (Hypothesis handling, + Evidence handling, Inquiry control) matching the GROUPS structure used + in analysis plots. Each graph cell contains an inline TikZ diagram. 
Returns: A string of LaTeX source for the complete tabularx environment, @@ -2775,7 +2803,7 @@ def build_productive_motifs_latex() -> str: lines.append(_TIKZ_STYLE_DEFS) lines.append(r"\begin{tabularx}{\textwidth}{p{2.2cm}cX}") lines.append(r"\toprule") - lines.append(r"Pattern & Graph & Description \\") + lines.append(r"Group & Graph & Description \\") lines.append(r"\midrule") for idx, (merge_name, keys) in enumerate(_SUBGRAPH_TABLE_ORDER): @@ -2795,9 +2823,10 @@ def build_productive_motifs_latex() -> str: def build_reasoning_breakdowns_latex() -> str: """Return a LaTeX tabularx table with definitions of reasoning breakdowns. - Column layout: Topic (X), Graph (TikZ picture, c), Description (X). - Related breakdowns share a merged row label via `\\multirow`. Each graph - cell contains an inline TikZ diagram from `_TIKZ_ANTIPATTERN_PATTERNS`. + Column layout: Group (p{2.2cm}), Graph (TikZ picture, c), Name + Description (X). + Patterns are grouped by reasoning capability (Hypothesis handling, + Evidence handling, Inquiry control) matching the GROUPS structure used + in analysis plots. Each graph cell contains an inline TikZ diagram. Returns: A string of LaTeX source for the complete tabularx environment, @@ -2805,9 +2834,9 @@ def build_reasoning_breakdowns_latex() -> str: """ lines: list[str] = [] lines.append(_TIKZ_STYLE_DEFS) - lines.append(r"\begin{tabularx}{\textwidth}{XcX}") + lines.append(r"\begin{tabularx}{\textwidth}{p{1.6cm}cX}") lines.append(r"\toprule") - lines.append(r"Pattern & Graph & Description \\") + lines.append(r"Group & Graph & Description \\") lines.append(r"\midrule") for idx, (merge_name, keys) in enumerate(_ANTIPATTERN_TABLE_ORDER):