From 411e3f2571dd203600949c90ee59452d6b273c86 Mon Sep 17 00:00:00 2001 From: Alexander Kireev Date: Mon, 18 May 2026 02:38:25 +0700 Subject: [PATCH 1/5] test: consolidate test files into __tests__, drop stale duplicates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four test files existed twice — older copies at src/ root and newer, maintained copies under src/__tests__/. bun test ran both, so the same assertions executed redundantly. Remove the stale root copies and move shadow-eval.test.ts under __tests__ so every test lives in one place. Co-Authored-By: Claude Opus 4.7 --- src/{ => __tests__}/shadow-eval.test.ts | 2 +- src/ab-router.test.ts | 77 ------------- src/elo.test.ts | 76 ------------- src/skill-recommendations.test.ts | 109 ------------------- src/stage-router.test.ts | 138 ------------------------ 5 files changed, 1 insertion(+), 401 deletions(-) rename src/{ => __tests__}/shadow-eval.test.ts (96%) delete mode 100644 src/ab-router.test.ts delete mode 100644 src/elo.test.ts delete mode 100644 src/skill-recommendations.test.ts delete mode 100644 src/stage-router.test.ts diff --git a/src/shadow-eval.test.ts b/src/__tests__/shadow-eval.test.ts similarity index 96% rename from src/shadow-eval.test.ts rename to src/__tests__/shadow-eval.test.ts index ff5d0f7..ba7beef 100644 --- a/src/shadow-eval.test.ts +++ b/src/__tests__/shadow-eval.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from "bun:test"; -import { shadowDecide } from "./shadow-eval.ts"; +import { shadowDecide } from "../shadow-eval.ts"; describe("shadowDecide", () => { test("0 pairs → inconclusive", () => { diff --git a/src/ab-router.test.ts b/src/ab-router.test.ts deleted file mode 100644 index 5cc35c9..0000000 --- a/src/ab-router.test.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import { pickVariant } from "./ab-router.ts"; - -const EXP = { - slug: "test-exp", - variants: [ - { styleSlug: "a", weight: 50 }, - { styleSlug: "b", weight: 50 }, - ], -}; - -describe("pickVariant", () => { - test("returns a known variant slug", () => { - const result = pickVariant(EXP, "user-1"); - expect(["a", "b"]).toContain(result); - }); - - test("same userId always gets same variant (deterministic)", () => { - const r1 = pickVariant(EXP, "user-42"); - const r2 = pickVariant(EXP, "user-42"); - expect(r1).toBe(r2); - }); - - test("different userIds can get different variants", () => { - const results = new Set( - Array.from({ length: 20 }, (_, i) => pickVariant(EXP, `user-${i}`)), - ); - expect(results.size).toBeGreaterThan(1); - }); - - test("100% weight on one variant always returns it", () => { - const exp = { - slug: "one-sided", - variants: [{ styleSlug: "only", weight: 100 }], - }; - for (let i = 0; i < 50; i++) { - expect(pickVariant(exp, `u${i}`)).toBe("only"); - } - }); - - test("distribution is roughly proportional to weights", () => { - const exp = { - slug: "weighted", - variants: [ - { styleSlug: "heavy", weight: 80 }, - { styleSlug: "light", weight: 20 }, - ], - }; - const counts: Record = { heavy: 0, light: 0 }; - for (let i = 0; i < 500; i++) { - const v = pickVariant(exp, String(i)); - counts[v] = (counts[v] ?? 0) + 1; - } - // heavy should win ~80% of the time — allow ±10% slack - expect(counts.heavy).toBeGreaterThan(300); - expect(counts.light).toBeLessThan(200); - }); - - test("throws on empty variants", () => { - expect(() => pickVariant({ slug: "x", variants: [] }, "u")).toThrow(); - }); - - test("throws on zero total weight", () => { - expect(() => - pickVariant( - { slug: "x", variants: [{ styleSlug: "a", weight: 0 }] }, - "u", - ), - ).toThrow(); - }); - - test("numeric userId is treated same as its string equivalent", () => { - const byStr = pickVariant(EXP, "123"); - const byNum = pickVariant(EXP, 123); - expect(byStr).toBe(byNum); - }); -}); diff --git a/src/elo.test.ts b/src/elo.test.ts deleted file mode 100644 index 1a2639f..0000000 --- a/src/elo.test.ts +++ /dev/null @@ -1,76 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import { - actualScore, - ELO_BASELINE, - ELO_DEFAULT_K, - eloUpdate, - eloUpdatePair, - expectedScore, -} from "./elo.ts"; - -describe("actualScore", () => { - test("won → 1", () => expect(actualScore("won")).toBe(1)); - test("lost → 0", () => expect(actualScore("lost")).toBe(0)); - test("draw → 0.5", () => expect(actualScore("draw")).toBe(0.5)); -}); - -describe("expectedScore", () => { - test("equal ratings → 0.5", () => { - expect(expectedScore(1500, 1500)).toBeCloseTo(0.5); - }); - test("higher self → > 0.5", () => { - expect(expectedScore(1600, 1500)).toBeGreaterThan(0.5); - }); - test("lower self → < 0.5", () => { - expect(expectedScore(1400, 1500)).toBeLessThan(0.5); - }); -}); - -describe("eloUpdate", () => { - test("win from baseline raises rating", () => { - const next = eloUpdate(ELO_BASELINE, "won"); - expect(next).toBeGreaterThan(ELO_BASELINE); - }); - test("loss from baseline lowers rating", () => { - const next = eloUpdate(ELO_BASELINE, "lost"); - expect(next).toBeLessThan(ELO_BASELINE); - }); - test("draw from baseline changes by less than K/2", () => { - const next = eloUpdate(ELO_BASELINE, "draw"); - expect(Math.abs(next - ELO_BASELINE)).toBeLessThan(ELO_DEFAULT_K / 2); - }); - test("win + loss are symmetric around baseline", () => { - const win = eloUpdate(ELO_BASELINE, "won"); - const loss = eloUpdate(ELO_BASELINE, "lost"); - expect(win + loss).toBe(2 * ELO_BASELINE); - }); - test("win delta ≈ K*(1-0.5) = 16 at equal ratings", () => { - expect(eloUpdate(1500, "won")).toBe(1516); - expect(eloUpdate(1500, "lost")).toBe(1484); - }); - test("custom k and opponentRating respected", () => { - const next = eloUpdate(1500, "won", { k: 16, opponentRating: 1500 }); - expect(next).toBe(1508); - }); -}); - -describe("eloUpdatePair", () => { - test("symmetric: A wins → A up, B down", () => { - const { a, b } = eloUpdatePair(1500, 1500, "won"); - expect(a).toBeGreaterThan(1500); - expect(b).toBeLessThan(1500); - }); - test("sum of ratings is preserved (±1 rounding)", () => { - const { a, b } = eloUpdatePair(1500, 1500, "won"); - expect(Math.abs(a + b - 3000)).toBeLessThanOrEqual(1); - }); - test("draw at equal ratings leaves both unchanged", () => { - const { a, b } = eloUpdatePair(1500, 1500, "draw"); - expect(a).toBe(1500); - expect(b).toBe(1500); - }); - test("A wins → delta = -(B delta) within rounding", () => { - const { a, b } = eloUpdatePair(1600, 1400, "won"); - expect(Math.abs(a - 1600 + (b - 1400))).toBeLessThanOrEqual(1); - }); -}); diff --git a/src/skill-recommendations.test.ts b/src/skill-recommendations.test.ts deleted file mode 100644 index bab152a..0000000 --- a/src/skill-recommendations.test.ts +++ /dev/null @@ -1,109 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import { - rankSkillRecommendations, - wilsonLowerBound, -} from "./skill-recommendations.ts"; -import type { SkillAggregate, SkillRow } from "./store.ts"; - -describe("wilsonLowerBound", () => { - test("0 total → 0", () => expect(wilsonLowerBound(0, 0)).toBe(0)); - test("100% win rate returns positive lower bound", () => { - expect(wilsonLowerBound(10, 10)).toBeGreaterThan(0.7); - }); - test("0% win rate → 0", () => { - expect(wilsonLowerBound(0, 10)).toBe(0); - }); - test("50% rate, large sample → near 0.5", () => { - expect(wilsonLowerBound(500, 1000)).toBeCloseTo(0.47, 1); - }); - test("lower bound < observed rate", () => { - const lb = wilsonLowerBound(7, 10); - expect(lb).toBeLessThan(0.7); - expect(lb).toBeGreaterThan(0); - }); - test("more samples → tighter (higher lower bound) for same rate", () => { - const lb10 = wilsonLowerBound(5, 10); - const lb100 = wilsonLowerBound(50, 100); - expect(lb100).toBeGreaterThan(lb10); - }); -}); - -const makeSkill = (slug: string, family = "cialdini"): SkillRow => ({ - slug, - display_name: slug, - family, - prompt_fragment: "", - applicable_stages: [], - is_enabled: true, -}); - -const makeAgg = ( - slug: string, - wins: number, - losses: number, - draws = 0, -): SkillAggregate => ({ - skill_slug: slug, - wins, - losses, - draws, - count: wins + losses + draws, -}); - -describe("rankSkillRecommendations", () => { - test("returns empty when catalogue is empty", () => { - expect(rankSkillRecommendations([], [])).toEqual([]); - }); - - test("filters out disabled skills", () => { - const skill = { ...makeSkill("s1"), is_enabled: false }; - expect(rankSkillRecommendations([skill], [])).toHaveLength(0); - }); - - test("filters out noise family", () => { - const skill = makeSkill("noise-skill", "noise"); - expect(rankSkillRecommendations([skill], [])).toHaveLength(0); - }); - - test("skill with no aggregates has count=0, NaN rate", () => { - const [rec] = rankSkillRecommendations([makeSkill("s1")], []); - expect(rec?.count).toBe(0); - expect(rec?.observed_rate).toBeNaN(); - expect(rec?.confidence_lower).toBe(0); - expect(rec?.recommended).toBe(false); - }); - - test("skill below minSamples has confidence_lower=0", () => { - const [rec] = rankSkillRecommendations( - [makeSkill("s1")], - [makeAgg("s1", 3, 0)], - { minSamples: 5 }, - ); - expect(rec?.confidence_lower).toBe(0); - }); - - test("high win-rate skill is recommended once samples met", () => { - const [rec] = rankSkillRecommendations( - [makeSkill("s1")], - [makeAgg("s1", 8, 2)], - { minSamples: 5, acceptThreshold: 0.4 }, - ); - expect(rec?.recommended).toBe(true); - }); - - test("ranks high-confidence skill above low-confidence", () => { - const skills = [makeSkill("weak"), makeSkill("strong")]; - const aggs = [makeAgg("weak", 3, 7), makeAgg("strong", 9, 1)]; - const recs = rankSkillRecommendations(skills, aggs, { minSamples: 5 }); - expect(recs[0]?.slug).toBe("strong"); - }); - - test("draws count as 0.5 wins for observed_rate", () => { - const [rec] = rankSkillRecommendations( - [makeSkill("s1")], - [makeAgg("s1", 5, 5, 10)], - ); - // wins=5, draws=10 → successCount=10, total=20 → rate=0.5 - expect(rec?.observed_rate).toBeCloseTo(0.5); - }); -}); diff --git a/src/stage-router.test.ts b/src/stage-router.test.ts deleted file mode 100644 index 9b39214..0000000 --- a/src/stage-router.test.ts +++ /dev/null @@ -1,138 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import { nextStage } from "./stage-router.ts"; - -describe("nextStage — objection keywords", () => { - for (const word of [ - "но", - "боюсь", - "развод", - "обман", - "не уверен", - "страшно", - ]) { - test(`"${word}" → objection`, () => { - expect( - nextStage({ - turnNumber: 3, - currentStage: "qualify", - lastUserMessage: word, - }), - ).toBe("objection"); - }); - } -}); - -describe("nextStage — pitch keywords", () => { - for (const word of [ - "сколько", - "зарплата", - "вакансии", - "контракт", - "виза", - "условия", - ]) { - test(`"${word}" → pitch`, () => { - expect( - nextStage({ - turnNumber: 3, - currentStage: "qualify", - lastUserMessage: word, - }), - ).toBe("pitch"); - }); - } -}); - -describe("nextStage — agreement → close", () => { - for (const stage of ["pitch", "qualify", "objection"] as const) { - test(`"давай" from ${stage} → close`, () => { - expect( - nextStage({ - turnNumber: 5, - currentStage: stage, - lastUserMessage: "давай", - }), - ).toBe("close"); - }); - } - test("agreement from opener does NOT go to close", () => { - const result = nextStage({ - turnNumber: 5, - currentStage: "opener", - lastUserMessage: "ок", - }); - expect(result).not.toBe("close"); - }); -}); - -describe("nextStage — turn 1 fallback", () => { - test("turn 1, null stage → opener", () => { - expect( - nextStage({ - turnNumber: 1, - currentStage: null, - lastUserMessage: "привет", - }), - ).toBe("opener"); - }); - test("turn 1, existing stage preserved", () => { - expect( - nextStage({ - turnNumber: 1, - currentStage: "qualify", - lastUserMessage: "привет", - }), - ).toBe("qualify"); - }); -}); - -describe("nextStage — stage progression", () => { - test("opener → qualify on turn 2", () => { - expect( - nextStage({ - turnNumber: 2, - currentStage: "opener", - lastUserMessage: "интересно", - }), - ).toBe("qualify"); - }); - test("qualify stays on qualifier pattern", () => { - expect( - nextStage({ - turnNumber: 3, - currentStage: "qualify", - lastUserMessage: "мне 23 года, из Москвы", - }), - ).toBe("qualify"); - }); - test("close stays close", () => { - expect( - nextStage({ - turnNumber: 8, - currentStage: "close", - lastUserMessage: "думаю", - }), - ).toBe("close"); - }); -}); - -describe("nextStage — Cyrillic Unicode boundary", () => { - test("objection keyword inside sentence matches", () => { - expect( - nextStage({ - turnNumber: 3, - currentStage: "qualify", - lastUserMessage: "мне кажется это развод какой-то", - }), - ).toBe("objection"); - }); - test("pricing keyword inside sentence matches", () => { - expect( - nextStage({ - turnNumber: 3, - currentStage: "qualify", - lastUserMessage: "а сколько там платят?", - }), - ).toBe("pitch"); - }); -}); From 04f78e4296937b1403ca43696f664c1b396badb7 Mon Sep 17 00:00:00 2001 From: Alexander Kireev Date: Mon, 18 May 2026 02:41:31 +0700 Subject: [PATCH 2/5] refactor: extract shared tolerant LLM-JSON parser The same strip-think-tags / strip-code-fences / JSON.parse / extract-outer- block logic was reimplemented in four places (coach, judge, pairwise, stage-classifier). Extract it into src/llm-json.ts as extractJsonObject and route all four call sites through it; each caller keeps its own domain-specific normalization and last-resort regex fallback. Co-Authored-By: Claude Opus 4.7 --- src/__tests__/llm-json.test.ts | 48 +++++++++++++++++++++++++++++++++ src/coach.ts | 28 ++++--------------- src/llm-json.ts | 49 ++++++++++++++++++++++++++++++++++ src/self-play/judge.ts | 29 +++++++------------- src/self-play/pairwise.ts | 28 +++++++------------ src/stage-classifier.ts | 20 +++----------- 6 files changed, 124 insertions(+), 78 deletions(-) create mode 100644 src/__tests__/llm-json.test.ts create mode 100644 src/llm-json.ts diff --git a/src/__tests__/llm-json.test.ts b/src/__tests__/llm-json.test.ts new file mode 100644 index 0000000..819a08c --- /dev/null +++ b/src/__tests__/llm-json.test.ts @@ -0,0 +1,48 @@ +import { describe, expect, test } from "bun:test"; +import { extractJsonObject } from "../llm-json.ts"; + +describe("extractJsonObject", () => { + test("parses a clean JSON object", () => { + expect(extractJsonObject('{"a":1,"b":"x"}')).toEqual({ a: 1, b: "x" }); + }); + + test("strips a ```json code fence", () => { + expect(extractJsonObject('```json\n{"ok":true}\n```')).toEqual({ + ok: true, + }); + }); + + test("strips a plain ``` code fence", () => { + expect(extractJsonObject('```\n{"ok":false}\n```')).toEqual({ ok: false }); + }); + + test("strips a leading block", () => { + const raw = 'reasoning here\n{"stage":"pitch"}'; + expect(extractJsonObject(raw)).toEqual({ stage: "pitch" }); + }); + + test("extracts the object from surrounding prose", () => { + const raw = 'Ответ: {"winner":"a"} — надеюсь, помог'; + expect(extractJsonObject(raw)).toEqual({ winner: "a" }); + }); + + test("returns null for prose with no JSON object", () => { + expect(extractJsonObject("I cannot determine the outcome")).toBeNull(); + }); + + test("returns null for an empty string", () => { + expect(extractJsonObject("")).toBeNull(); + }); + + test("returns null for a JSON array (objects only)", () => { + expect(extractJsonObject("[1,2,3]")).toBeNull(); + }); + + test("returns null for malformed JSON", () => { + expect(extractJsonObject('{"a": }')).toBeNull(); + }); + + test("returns null for a non-string input", () => { + expect(extractJsonObject(undefined as unknown as string)).toBeNull(); + }); +}); diff --git a/src/coach.ts b/src/coach.ts index 775c10d..c1b3661 100644 --- a/src/coach.ts +++ b/src/coach.ts @@ -1,3 +1,4 @@ +import { extractJsonObject } from "./llm-json.ts"; import type { ISelfPlayMatchesRepo } from "./store.ts"; /** * Coach-LLM: reads recent self-play LOSSES and DRAWS for a style, @@ -216,31 +217,12 @@ export async function proposeStyleEdits( /** * Tolerant JSON parser. Strips code fences, attempts JSON.parse, falls - * back to extracting an outer object via regex. Always returns a valid - * CoachProposal (with raw output preserved on parse failure). + * back to extracting an outer object. Always returns a valid CoachProposal + * (with raw output preserved on parse failure). */ export function parseProposal(raw: string): CoachProposal { - const stripped = raw - .replace(/^```(?:json)?\s*/i, "") - .replace(/\s*```\s*$/i, "") - .trim(); - // First try a direct parse. - try { - const parsed = JSON.parse(stripped); - return normalizeProposal(parsed, raw); - } catch { - /* fall through */ - } - // Try to extract the outermost {...} block. - const m = stripped.match(/\{[\s\S]*\}/); - if (m) { - try { - const parsed = JSON.parse(m[0]); - return normalizeProposal(parsed, raw); - } catch { - /* fall through */ - } - } + const parsed = extractJsonObject(raw); + if (parsed) return normalizeProposal(parsed, raw); return { summary: "(coach output unparseable — see raw)", edits: {}, diff --git a/src/llm-json.ts b/src/llm-json.ts new file mode 100644 index 0000000..18bc9b8 --- /dev/null +++ b/src/llm-json.ts @@ -0,0 +1,49 @@ +/** + * Tolerant JSON-object extraction for LLM output. + * + * Models rarely return a clean JSON object: they wrap it in markdown code + * fences, prepend `...` reasoning, or surround it with prose. + * `extractJsonObject` strips all of that and returns the first parseable + * object — or `null` when nothing usable is found. + * + * Callers keep their own domain-specific normalization and last-resort + * regex fallback; this only handles the generic strip-and-parse step that + * was previously duplicated across coach / judge / pairwise / classifier. + */ + +function tryParseObject(s: string): Record | null { + try { + const parsed: unknown = JSON.parse(s); + if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) { + return parsed as Record; + } + } catch { + /* not valid JSON */ + } + return null; +} + +/** + * Strip think-tags and code fences, then return the first JSON object found + * — either the whole payload or the outermost `{...}` block embedded in it. + */ +export function extractJsonObject(raw: string): Record | null { + if (typeof raw !== "string") return null; + const stripped = raw + .replace(/[\s\S]*?<\/think>/gi, "") + .replace(/^\s*```(?:json|js)?\s*/i, "") + .replace(/\s*```\s*$/i, "") + .trim(); + + const direct = tryParseObject(stripped); + if (direct) return direct; + + // Fall back to the outermost { ... } block — handles leading prefixes + // ("Ответ:", "Result:") and trailing commentary around the object. + const start = stripped.indexOf("{"); + const end = stripped.lastIndexOf("}"); + if (start >= 0 && end > start) { + return tryParseObject(stripped.slice(start, end + 1)); + } + return null; +} diff --git a/src/self-play/judge.ts b/src/self-play/judge.ts index fe3e4fe..8654eff 100644 --- a/src/self-play/judge.ts +++ b/src/self-play/judge.ts @@ -13,6 +13,7 @@ */ import type { ChatClient, ChatMessage } from "@chatman-media/rag"; import type { EloOutcome } from "../elo.ts"; +import { extractJsonObject } from "../llm-json.ts"; export interface JudgeInput { /** Style under test (e.g. "marina-prime-v1"). */ @@ -93,28 +94,18 @@ export async function judgeMatch(input: JudgeInput): Promise { * falls back to regex match if necessary. Exported for tests. */ export function parseVerdict(raw: string): JudgeVerdict { - const stripped = raw - .replace(/[\s\S]*?<\/think>/gi, "") - .replace(/^```(?:json)?\s*/i, "") - .replace(/\s*```\s*$/i, "") - .trim(); - // Try direct parse. - try { - const parsed = JSON.parse(stripped); - if (parsed && typeof parsed === "object") { - const outcome = pickOutcome(parsed.outcome); - const reason = - typeof parsed.reason === "string" ? parsed.reason : "(no reason)"; - if (outcome) return { outcome, reason }; - } - } catch { - /* fall through */ + const parsed = extractJsonObject(raw); + if (parsed) { + const outcome = pickOutcome(parsed.outcome); + const reason = + typeof parsed.reason === "string" ? parsed.reason : "(no reason)"; + if (outcome) return { outcome, reason }; } // Regex fallback — find an "outcome": "..." pair anywhere. - const m = stripped.match(/"outcome"\s*:\s*"(won|lost|draw)"/i); + const m = raw.match(/"outcome"\s*:\s*"(won|lost|draw)"/i); if (m) { const outcome = (m[1] ?? "draw").toLowerCase() as EloOutcome; - const reasonMatch = stripped.match(/"reason"\s*:\s*"([^"]+)"/); + const reasonMatch = raw.match(/"reason"\s*:\s*"([^"]+)"/); return { outcome, reason: reasonMatch?.[1] ?? "(no reason)", @@ -122,7 +113,7 @@ export function parseVerdict(raw: string): JudgeVerdict { } console.warn( "[judge] unparseable output (first 300 chars):", - stripped.slice(0, 300), + raw.slice(0, 300), ); return { outcome: "draw", reason: "judge output unparseable", raw }; } diff --git a/src/self-play/pairwise.ts b/src/self-play/pairwise.ts index cfb1c0d..2e47fd1 100644 --- a/src/self-play/pairwise.ts +++ b/src/self-play/pairwise.ts @@ -11,6 +11,7 @@ import type { ChatClient, ChatMessage } from "@chatman-media/rag"; import type { EloOutcome } from "../elo.ts"; import { eloUpdatePair } from "../elo.ts"; +import { extractJsonObject } from "../llm-json.ts"; import type { IPairwiseMatchesRepo } from "../store.ts"; import type { Style } from "../types.ts"; import { @@ -117,28 +118,17 @@ export async function judgePairwise(args: { } export function parsePairwiseVerdict(raw: string): PairwiseVerdict { - const stripped = raw - .replace(/[\s\S]*?<\/think>/gi, "") - .replace(/^```(?:json)?\s*/i, "") - .replace(/\s*```\s*$/i, "") - .trim(); - try { - const parsed = JSON.parse(stripped); - if (parsed && typeof parsed === "object") { - const winner = pickWinner((parsed as Record).winner); - const reason = - typeof (parsed as Record).reason === "string" - ? ((parsed as Record).reason as string) - : "(no reason)"; - if (winner) return { winner, reason }; - } - } catch { - /* fall through to regex */ + const parsed = extractJsonObject(raw); + if (parsed) { + const winner = pickWinner(parsed.winner); + const reason = + typeof parsed.reason === "string" ? parsed.reason : "(no reason)"; + if (winner) return { winner, reason }; } - const m = stripped.match(/"winner"\s*:\s*"(a|b|draw)"/i); + const m = raw.match(/"winner"\s*:\s*"(a|b|draw)"/i); if (m) { const winner = (m[1] ?? "draw").toLowerCase() as PairwiseWinner; - const reasonMatch = stripped.match(/"reason"\s*:\s*"([^"]+)"/); + const reasonMatch = raw.match(/"reason"\s*:\s*"([^"]+)"/); return { winner, reason: reasonMatch?.[1] ?? "(no reason)" }; } return { winner: "draw", reason: "pairwise judge unparseable", raw }; diff --git a/src/stage-classifier.ts b/src/stage-classifier.ts index fd07c6e..e1d0244 100644 --- a/src/stage-classifier.ts +++ b/src/stage-classifier.ts @@ -1,4 +1,5 @@ import type { ChatClient } from "@chatman-media/rag"; +import { extractJsonObject } from "./llm-json.ts"; import { nextStage } from "./stage-router.ts"; import { FUNNEL_STAGES, type FunnelStage } from "./types.ts"; @@ -89,23 +90,8 @@ interface ParsedClassification { export function parseClassifierOutput( raw: string, ): ParsedClassification | null { - if (typeof raw !== "string") return null; - // Strip common code-fence wrappers. - let s = raw.trim(); - s = s.replace(/^```(?:json|js)?\s*/i, "").replace(/```\s*$/, ""); - // Locate the first { and matching last } — naive but works because the - // expected payload is a flat object with two scalar fields. - const start = s.indexOf("{"); - const end = s.lastIndexOf("}"); - if (start < 0 || end <= start) return null; - let parsed: unknown; - try { - parsed = JSON.parse(s.slice(start, end + 1)); - } catch { - return null; - } - if (typeof parsed !== "object" || parsed === null) return null; - const obj = parsed as Record; + const obj = extractJsonObject(raw); + if (!obj) return null; if (typeof obj.stage !== "string") return null; if (typeof obj.confidence !== "number" || !Number.isFinite(obj.confidence)) { return null; From 08459016955572e46b315612bc47193440e5cb08 Mon Sep 17 00:00:00 2001 From: Alexander Kireev Date: Mon, 18 May 2026 02:42:38 +0700 Subject: [PATCH 3/5] test: cover composeSystemPrompt prompt composition composeSystemPrompt is a pure, deterministic function with no test coverage. Add tests for persona/framework sections, the few-shot toggle, conditional KB-context injection, the human-persona disclosure branch, and conditional rendering of the persona facts section. Co-Authored-By: Claude Opus 4.7 --- src/__tests__/prompt.test.ts | 81 ++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 src/__tests__/prompt.test.ts diff --git a/src/__tests__/prompt.test.ts b/src/__tests__/prompt.test.ts new file mode 100644 index 0000000..53ea0bd --- /dev/null +++ b/src/__tests__/prompt.test.ts @@ -0,0 +1,81 @@ +import { describe, expect, test } from "bun:test"; +import { composeSystemPrompt } from "../prompt.ts"; +import { marinaPrime } from "../styles/marina-prime.ts"; + +describe("composeSystemPrompt — core sections", () => { + test("includes the persona name", () => { + expect(composeSystemPrompt(marinaPrime, "qualify")).toContain( + "Тебя зовут Марина", + ); + }); + + test("includes the framework blurb", () => { + expect(composeSystemPrompt(marinaPrime, "qualify")).toContain("NEPQ"); + }); + + test("separates sections with blank lines", () => { + expect(composeSystemPrompt(marinaPrime, "qualify")).toContain("\n\n"); + }); +}); + +describe("composeSystemPrompt — few-shot toggle", () => { + test("included by default", () => { + expect(composeSystemPrompt(marinaPrime, "qualify")).toContain( + "ПРИМЕРЫ ДИАЛОГА", + ); + }); + + test("omitted when includeFewShot is false", () => { + const prompt = composeSystemPrompt(marinaPrime, "qualify", null, { + includeFewShot: false, + }); + expect(prompt).not.toContain("ПРИМЕРЫ ДИАЛОГА"); + }); +}); + +describe("composeSystemPrompt — KB context", () => { + test("omitted when no context is provided", () => { + expect(composeSystemPrompt(marinaPrime, "qualify")).not.toContain( + "KB CONTEXT (актуальные факты", + ); + }); + + test("included verbatim when context is provided", () => { + const kb = "Корея: оклад ₩110 000 за смену."; + const prompt = composeSystemPrompt(marinaPrime, "qualify", kb); + expect(prompt).toContain("KB CONTEXT (актуальные факты"); + expect(prompt).toContain(kb); + }); +}); + +describe("composeSystemPrompt — human persona disclosure branch", () => { + test("evasive answer when botDisclosureOnDirectQuestion is false", () => { + expect(composeSystemPrompt(marinaPrime, "qualify")).toContain( + "отвечай уклончиво", + ); + }); + + test("honest answer when botDisclosureOnDirectQuestion is true", () => { + const style = structuredClone(marinaPrime); + style.guardrails.botDisclosureOnDirectQuestion = true; + const prompt = composeSystemPrompt(style, "qualify"); + expect(prompt).toContain("ЧЕСТНО ответь, что ты ИИ-ассистент"); + expect(prompt).not.toContain("отвечай уклончиво"); + }); +}); + +describe("composeSystemPrompt — persona facts", () => { + test("no facts section when persona has no facts", () => { + expect(composeSystemPrompt(marinaPrime, "qualify")).not.toContain( + "ЛИЧНЫЕ ФАКТЫ", + ); + }); + + test("facts section rendered when persona facts are present", () => { + const style = structuredClone(marinaPrime); + style.persona.facts = { возраст: "28", город: "Москва" }; + const prompt = composeSystemPrompt(style, "qualify"); + expect(prompt).toContain("ЛИЧНЫЕ ФАКТЫ"); + expect(prompt).toContain("город: Москва"); + }); +}); From dd7888bba06401ff197e1114f0ac242c815e0bf6 Mon Sep 17 00:00:00 2001 From: Alexander Kireev Date: Mon, 18 May 2026 02:44:04 +0700 Subject: [PATCH 4/5] test: cover stage-classifier parsing and regex fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stage-classifier.ts had no test coverage. Add tests for parseClassifierOutput (code fences, prose prefixes, percentage-style confidence clamping, malformed input) and for classifyStage's regex fallback paths — driven by a stub ChatClient — covering llm-error, parse-error, unknown-stage, low-confidence, and the happy LLM path. Co-Authored-By: Claude Opus 4.7 --- src/__tests__/stage-classifier.test.ts | 115 +++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 src/__tests__/stage-classifier.test.ts diff --git a/src/__tests__/stage-classifier.test.ts b/src/__tests__/stage-classifier.test.ts new file mode 100644 index 0000000..79d9b34 --- /dev/null +++ b/src/__tests__/stage-classifier.test.ts @@ -0,0 +1,115 @@ +import { describe, expect, test } from "bun:test"; +import type { ChatClient } from "@chatman-media/rag"; +import { classifyStage, parseClassifierOutput } from "../stage-classifier.ts"; + +/** Minimal ChatClient whose `complete` returns (or throws) a fixed value. */ +function stubChat(reply: string | (() => never)): ChatClient { + return { + async complete() { + if (typeof reply === "function") return reply(); + return reply; + }, + }; +} + +describe("parseClassifierOutput", () => { + test("parses a clean object", () => { + expect(parseClassifierOutput('{"stage":"pitch","confidence":0.9}')).toEqual( + { stage: "pitch", confidence: 0.9 }, + ); + }); + + test("strips a ```json code fence", () => { + const raw = '```json\n{"stage":"qualify","confidence":0.8}\n```'; + expect(parseClassifierOutput(raw)).toEqual({ + stage: "qualify", + confidence: 0.8, + }); + }); + + test("extracts the object past an 'Ответ:' prefix", () => { + const raw = 'Ответ: {"stage":"close","confidence":0.7}'; + expect(parseClassifierOutput(raw)).toEqual({ + stage: "close", + confidence: 0.7, + }); + }); + + test("clamps a percentage-style confidence (95 → 0.95)", () => { + expect(parseClassifierOutput('{"stage":"pitch","confidence":95}')).toEqual({ + stage: "pitch", + confidence: 0.95, + }); + }); + + test("returns null for malformed JSON", () => { + expect(parseClassifierOutput("not json at all")).toBeNull(); + }); + + test("returns null when stage field is missing", () => { + expect(parseClassifierOutput('{"confidence":0.9}')).toBeNull(); + }); + + test("returns null when confidence is not a number", () => { + expect( + parseClassifierOutput('{"stage":"pitch","confidence":"high"}'), + ).toBeNull(); + }); +}); + +describe("classifyStage — fallback paths", () => { + const base = { + userMessage: "сколько платят?", + currentStage: "qualify" as const, + turnNumber: 3, + }; + + test("LLM error → regex fallback with reason 'llm-error'", async () => { + const result = await classifyStage({ + ...base, + chat: stubChat(() => { + throw new Error("network down"); + }), + }); + expect(result.source).toBe("regex-fallback"); + expect(result.fallbackReason).toBe("llm-error"); + }); + + test("unparseable output → reason 'parse-error'", async () => { + const result = await classifyStage({ + ...base, + chat: stubChat("I have no idea"), + }); + expect(result.fallbackReason).toBe("parse-error"); + }); + + test("unknown stage → reason 'unknown-stage'", async () => { + const result = await classifyStage({ + ...base, + chat: stubChat('{"stage":"smalltalk","confidence":0.9}'), + }); + expect(result.fallbackReason).toBe("unknown-stage"); + }); + + test("below-threshold confidence → reason 'low-confidence'", async () => { + const result = await classifyStage({ + ...base, + chat: stubChat('{"stage":"pitch","confidence":0.3}'), + }); + expect(result.fallbackReason).toBe("low-confidence"); + }); +}); + +describe("classifyStage — LLM path", () => { + test("valid high-confidence verdict is taken as-is", async () => { + const result = await classifyStage({ + userMessage: "сколько платят?", + currentStage: "qualify", + turnNumber: 3, + chat: stubChat('{"stage":"pitch","confidence":0.92}'), + }); + expect(result.source).toBe("llm"); + expect(result.stage).toBe("pitch"); + expect(result.confidence).toBe(0.92); + }); +}); From 9e1a460ec3e74aba0304a69bd1fd0c2e8080e032 Mon Sep 17 00:00:00 2001 From: Alexander Kireev Date: Mon, 18 May 2026 02:45:46 +0700 Subject: [PATCH 5/5] fix: surface self-play persistence failures to callers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A failed match insert in runSelfPlayMatch and runPairwiseMatch was only console.warn'd — the id was silently left null, so callers running evaluation loops had no clear signal their results were never recorded. Add an explicit persisted boolean to SelfPlayMatchResult and PairwiseMatchResult so consumers can detect the data loss. Non-throwing, additive change. Co-Authored-By: Claude Opus 4.7 --- src/self-play/orchestrator.ts | 8 ++++++++ src/self-play/pairwise.ts | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/src/self-play/orchestrator.ts b/src/self-play/orchestrator.ts index bac24a3..87efe15 100644 --- a/src/self-play/orchestrator.ts +++ b/src/self-play/orchestrator.ts @@ -93,6 +93,12 @@ export interface SelfPlayMatchResult { fabricationsCaught: number; /** Row id in self_play_matches, or null when the insert failed. */ matchId: number | null; + /** + * Whether the match transcript was durably persisted. `false` means the + * insert threw and this result exists only in memory — callers running + * evaluation loops should treat the run as not recorded. + */ + persisted: boolean; /** Non-fatal errors collected during the match (e.g. skill grading failures). */ warnings: string[]; } @@ -333,9 +339,11 @@ async function finalize( leadId, fabricationsCaught, matchId: null, + persisted: false, warnings, }; result.matchId = await persistSelfPlayMatch(deps, result, verdict.reason); + result.persisted = result.matchId !== null; return result; } diff --git a/src/self-play/pairwise.ts b/src/self-play/pairwise.ts index 2e47fd1..342f95d 100644 --- a/src/self-play/pairwise.ts +++ b/src/self-play/pairwise.ts @@ -52,6 +52,12 @@ export interface PairwiseMatchResult { eloAAfter: number; eloBAfter: number; pairwiseId: number | null; + /** + * Whether the pairwise match was durably persisted. `false` means the + * insert threw and this result exists only in memory — callers running + * A/B evaluation loops should treat the comparison as not recorded. + */ + persisted: boolean; } const PAIRWISE_SYSTEM = (hint: string) => @@ -179,6 +185,7 @@ export async function runPairwiseMatch( if (newB !== bRating) await deps.ratings.setRating(input.styleBId, newB); let pairwiseId: number | null = null; + let persisted = false; try { pairwiseId = await deps.pairwiseMatches.insert({ matchAId: matchA.matchId ?? 0, @@ -189,6 +196,7 @@ export async function runPairwiseMatch( winner: verdict.winner, reason: verdict.reason, }); + persisted = true; } catch (err) { console.warn("[pairwise] failed to persist pairwise match:", err); } @@ -203,5 +211,6 @@ export async function runPairwiseMatch( eloAAfter: newA, eloBAfter: newB, pairwiseId, + persisted, }; }