forked from khalilgharbaoui/opencode-claude-code-plugin
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest-auto-continue.ts
More file actions
602 lines (546 loc) · 20.1 KB
/
test-auto-continue.ts
File metadata and controls
602 lines (546 loc) · 20.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
/**
* Unit tests for smart auto-continuation policy in
* src/claude-code-language-model.ts.
*/
import { test } from "node:test"
import assert from "node:assert/strict"
import { shouldAutoContinueIncompleteTurn } from "./src/claude-code-language-model.js"
/**
 * Builds the auto-continue state fixture passed as the first argument to
 * shouldAutoContinueIncompleteTurn.
 *
 * Defaults: "smart" mode, zero attempts, startedAt = 1000, zero
 * no-progress count. Any key in `overrides` replaces the default or is
 * added as an extra field.
 *
 * @param overrides - fields to replace or add on top of the defaults
 * @returns a fresh, loosely typed state object
 */
function state(overrides: Record<string, unknown> = {}) {
  const defaults = {
    enabled: "smart" as const,
    attempts: 0,
    startedAt: 1_000,
    noProgressCount: 0,
  }
  // Later sources win, so caller-supplied fields shadow the defaults.
  return Object.assign({}, defaults, overrides) as any
}
/**
 * Builds the turn-snapshot fixture passed as the second argument to
 * shouldAutoContinueIncompleteTurn.
 *
 * Defaults: empty text, no reasoning / tool / proxy activity, now = 1500.
 * Any key in `overrides` replaces the default or is added as an extra field.
 *
 * When `text` is supplied but `lastVisibleText` is not, `lastVisibleText`
 * mirrors `text`, so single-block test cases do not have to spell out both.
 *
 * @param overrides - fields to replace or add on top of the defaults
 * @returns a fresh, loosely typed snapshot object
 */
function snap(overrides: Record<string, unknown> = {}) {
  const mirrorsText =
    overrides.text !== undefined && overrides.lastVisibleText === undefined
  const snapshot: Record<string, unknown> = {
    text: "",
    lastVisibleText: "",
    hadReasoning: false,
    hadToolActivity: false,
    hadProxyActivity: false,
    now: 1_500,
    ...overrides,
    // Applied last so it also wins over an explicit `lastVisibleText: undefined`.
    ...(mirrorsText ? { lastVisibleText: overrides.text } : {}),
  }
  return snapshot as any
}
// ─── Core policy decisions ─────────────────────────────────────────────────

test("smart auto-continue is disabled by false", () => {
  const policy = state({ enabled: false })
  const turn = snap({ hadReasoning: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "disabled" })
})

test("continues reasoning-only result with no visible answer", () => {
  const policy = state()
  const turn = snap({ hadReasoning: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.equal(decision.continue, true)
  assert.equal(decision.reason, "activity-without-visible-answer")
})

test("continues tool activity without visible answer", () => {
  const policy = state()
  const turn = snap({ hadToolActivity: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  // Only the continue flag is pinned here; the reason is left unchecked.
  assert.equal(decision.continue, true)
})

test("continues non-final visible progress", () => {
  const policy = state()
  const turn = snap({
    text: "I found the relevant files and am checking the tests.",
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: true, reason: "non-final-progress" })
})

test("stops for final-looking visible answer", () => {
  const policy = state()
  const turn = snap({
    text: "Done. Implemented the fix and tests passed successfully.",
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("stops for question", () => {
  const policy = state()
  const turn = snap({
    text: "Which option do you want me to use?",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("stops for blocker", () => {
  const policy = state()
  const turn = snap({
    text: "I cannot proceed because the required token is missing.",
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "blocker" })
})

test("stops for errors", () => {
  const policy = state()
  const turn = snap({ isError: true, hadToolActivity: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "error" })
})

test("stops at max attempts", () => {
  const policy = state({ attempts: 8 })
  const turn = snap({ hadReasoning: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "max-attempts" })
})

test("stops when elapsed budget is exhausted", () => {
  // `now` sits 10 * 60 * 1000 + 1 units after `startedAt` (presumably ms).
  const policy = state({ startedAt: 0 })
  const turn = snap({ hadReasoning: true, now: 10 * 60 * 1000 + 1 })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "max-elapsed" })
})

test("stops on repeated no-progress continuation", () => {
  const turn = snap({ hadReasoning: true })

  // First pass: fresh state, so the turn is continued.
  const initial = shouldAutoContinueIncompleteTurn(state(), turn)
  assert.equal(initial.continue, true)

  // Second pass: the state already carries a signature describing this same
  // snapshot plus one recorded no-progress round, so the policy must stop.
  const seenSignature = JSON.stringify({
    text: "",
    reasoning: true,
    tools: false,
    proxy: false,
  })
  const repeated = shouldAutoContinueIncompleteTurn(
    state({ lastSignature: seenSignature, noProgressCount: 1 }),
    turn,
  )
  assert.deepEqual(repeated, { continue: false, reason: "no-progress" })
})

test("stops when there was no activity", () => {
  const policy = state()
  const turn = snap()
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "no-activity" })
})
// ─── Multi-block turns: full text vs. last visible text block ──────────────

test("ignores final-answer keywords in earlier text blocks", () => {
  // The earlier narration carries completion-style words ('implemented',
  // 'updated'), but the closing block is a mid-task pause, so the turn
  // should still be continued.
  const closingBlock = "Now checking the next set of files."
  const policy = state()
  const turn = snap({
    text: "I implemented the helper. Updated the search index. " + closingBlock,
    lastVisibleText: closingBlock,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.equal(decision.continue, true)
  assert.equal(decision.reason, "non-final-progress")
})

test("stops when the last text block looks like a final answer", () => {
  const closingBlock = "Done. Implemented the fix and tests passed successfully."
  const policy = state()
  const turn = snap({
    text: "Let me check the files. " + "Found three matches. " + closingBlock,
    lastVisibleText: closingBlock,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("question in any earlier text block still stops continuation", () => {
  // A question asked earlier in the turn blocks auto-continue even when the
  // closing block reads as mid-task: answering it is the user's job.
  const closingBlock = "Continuing with the first one for now."
  const policy = state()
  const turn = snap({
    text: "Which option do you want me to use? " + closingBlock,
    lastVisibleText: closingBlock,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})
// ─── v0.4.10 regression tests for tweaks 2, 3, 4, 5 ────────────────────────

test("v0.4.10 tweak 2: 'let me know if you'd like' stops as question", () => {
  // An indirect offer of next steps with no literal '?' (sim corpus case C03).
  const policy = state()
  const turn = snap({
    text: "Let me know if you'd like me to proceed with the cleanup phase or stop here.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.10 tweak 3: 'needs your approval' stops as blocker", () => {
  // 'needs your' carries the same intent as 'requires your' but was not
  // matched by the regex before 0.4.10 (sim corpus case D03).
  const policy = state()
  const turn = snap({
    text: "Needs your approval before I push the tag — auto-push is not enabled.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "blocker" })
})

test("v0.4.10 tweak 4: short completion (36 chars) stops as final-answer", () => {
  // Before 0.4.10 the 40-char floor classified this 36-char message as
  // non-final-progress; the floor was lowered to 30 (sim corpus case I01).
  const policy = state()
  const turn = snap({
    text: "Task is now completely done. Pushed.",
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("v0.4.10 tweak 5a: '?' anywhere in last block stops as question", () => {
  // Shape of a real fire from 2026-05-14T03:31: a long answer that asks a
  // question early, then lists options and ends with a period.
  const policy = state()
  const turn = snap({
    text: "Here's the plan. Want me to proceed with that? Concretely: 1. Do X. 2. Do Y. 3. Do Z. Say 'go' or push back on any step.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.10 tweak 5b: 'say go or push back' (no '?') stops as question", () => {
  // Soft-proceed phrasing with no '?' at all: verifies that the phrase-based
  // half of tweak 5 fires independently of the '?' check.
  const policy = state()
  const turn = snap({
    text: "Pick the option you want. Say 'go' to ship as planned, or push back on any specific step.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.10 tweak 5c: 'if you want to' stops as question", () => {
  // Reconstruction of the 02:48:11-style fire: a long analysis that closes
  // with a conditional action offer and no '?'.
  const policy = state()
  const turn = snap({
    text: "Three options are on the table. The recommendation is to leave DEBUG off. Consider option C if you want to re-enable DEBUG without UI noise.",
    hadReasoning: true,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.10 tweak 5d: A-class continues unaffected (no '?' or soft-proceed phrase)", () => {
  // Sanity check: mid-task narration without any question signal must keep
  // continuing. Guards against the '?' or phrase regex accidentally widening.
  const policy = state()
  const turn = snap({
    text: "Now I'll read the file. Then I'll diff against previous. Then summarize.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: true, reason: "non-final-progress" })
})
// ─── v0.4.11 regression tests ──────────────────────────────────────────────

test("v0.4.11 'ready when you are' stops as question", () => {
  // Real fire from 2026-05-14T04:00:41: a short answer ending in this
  // canonical 'your move' phrase fired 4-δ inappropriately on v0.4.10.
  const policy = state()
  const turn = snap({
    text: "The standing-by stub lives in training, not just the CLI's empty-turn behavior. Ready when you are.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.11 'standing by' stops as question (the meta-irony stub)", () => {
  // Commit 49345e3 originally tackled 'No input received. Standing by.' at
  // the message-builder layer by suppressing the CLI stub on empty turns.
  // This case covers the model producing the same idiom on its own at the
  // response layer.
  const policy = state()
  const turn = snap({
    text: "All done on my side; the rest is on you. Standing by.",
    hadReasoning: true,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.11 'let me know when' stops as question", () => {
  const policy = state()
  const turn = snap({
    text: "I've staged everything for the release. Let me know when you've reviewed.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})
// ─── v0.4.12 regression tests ──────────────────────────────────────────────
// Each case ends with a hand-off idiom and expects the "question" stop reason.

test("v0.4.12 'over to you' stops as question", () => {
  const policy = state()
  const turn = snap({
    text: "I've prepared the patch and tests are green. Over to you.",
    hadReasoning: true,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.12 'your turn' stops as question", () => {
  const policy = state()
  const turn = snap({
    text: "Reviewed the diff and flagged three concerns. Your turn to pick a direction.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.12 'all yours' stops as question", () => {
  const policy = state()
  const turn = snap({
    text: "Branch is rebased and the PR template filled. The rest is all yours.",
    hadReasoning: true,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.12 'let me know how' stops as question", () => {
  const policy = state()
  const turn = snap({
    text: "Three viable paths surfaced. Let me know how you'd like to proceed.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})

test("v0.4.12 'i'm here' stops as question", () => {
  const policy = state()
  const turn = snap({
    text: "All staged for the release. I'm here when you're ready to ship.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "question" })
})
// ─── v0.4.15 regression tests ──────────────────────────────────────────────

test("v0.4.15 'shipped' as final-answer keyword", () => {
  // Shape of the real 03:31 fire: a long completion narrative ending in a
  // 'shipped'-style verb that was missing from the v0.4.14 keyword list.
  const policy = state()
  const turn = snap({
    text: "v0.4.15 on npm, pin matches, 78/78 tests pass, sim corpus preserved as future leverage. Shipped.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("v0.4.15 'deployed/merged/tagged' as keywords", () => {
  const policy = state()
  const turn = snap({
    text: "Patch merged to master, tagged v0.4.15, deployed via CI. Restart at your convenience.",
    hadReasoning: true,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("v0.4.15 'pinned' as keyword", () => {
  const policy = state()
  const turn = snap({
    text: "Plugin pinned at @0.4.15 in opencode.jsonc. Restart loads it.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("v0.4.15 short 'We're done.' bypasses length floor", () => {
  // At 11 chars this is under the 30-char threshold and was missed before
  // v0.4.15; the strong-completion phrase override now catches it.
  const policy = state()
  const turn = snap({
    text: "We're done.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("v0.4.15 short 'All set.' bypasses length floor", () => {
  const policy = state()
  const turn = snap({
    text: "All set.",
    hadReasoning: true,
    hadToolActivity: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("v0.4.15 'tests pass' (present tense) stops as final-answer", () => {
  // The real 03:31 fire ended in "78/78 tests pass". The v0.4.14 regex only
  // matched the past tense ("tests passed"), so that fire was missed. The
  // text below is the actual 03:31 message.
  const policy = state()
  const turn = snap({
    text: "v0.4.13 on npm, pin matches, 78/78 tests pass, sim corpus + regression bench preserved as future leverage.",
    hadReasoning: true,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})
// ─── v0.4.16 end_turn stop_reason tests ────────────────────────────────────

test("v0.4.16 end_turn stop_reason short-circuits heuristic", () => {
  // A long, ambiguous mid-task narration with no completion keywords and
  // visible tool activity is still stopped at once when the Claude CLI
  // reports end_turn. This is the architectural alternative to chasing
  // soft-proceed idioms with regexes (v0.4.10-15).
  const narration =
    "Running the next probe to inspect the build output and confirm bundle sizes are roughly equal."
  const policy = state()
  const turn = snap({
    text: narration,
    hadReasoning: true,
    hadToolActivity: true,
    stopReason: "end_turn",
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "end-turn" })
})

test("v0.4.16 end_turn beats max-attempts (decided last)", () => {
  // end_turn also outranks the budget guards: once the model says it is
  // done, spending further attempts has no value.
  const policy = state({ attempts: 999 })
  const turn = snap({ stopReason: "end_turn", hadReasoning: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "end-turn" })
})

test("v0.4.16 end_turn does NOT beat genuine error", () => {
  // is_error keeps priority. Defensive: a CLI error must not be silently
  // reported as a clean stop.
  const policy = state()
  const turn = snap({ stopReason: "end_turn", isError: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "error" })
})

test("v0.4.16 end_turn does NOT beat abort", () => {
  const policy = state({ aborted: true })
  const turn = snap({ stopReason: "end_turn" })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "aborted" })
})
// ─── v0.4.17 stop_reason tests (plus v0.4.16 back-compat) ──────────────────

test("v0.4.17 max_tokens stop_reason stops via protocol signal", () => {
  // v0.4.17 treats ANY stop_reason value as authoritative. max_tokens means
  // the model was cut off, but the protocol still said stop.
  const policy = state()
  const turn = snap({
    text: "Working on it",
    hadReasoning: true,
    hadToolActivity: true,
    stopReason: "max_tokens",
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "max-tokens" })
})

test("v0.4.17 stop_sequence stops via protocol signal", () => {
  const policy = state()
  const turn = snap({ stopReason: "stop_sequence", hadReasoning: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "stop-sequence" })
})

test("v0.4.17 refusal stops via protocol signal", () => {
  const policy = state()
  const turn = snap({ stopReason: "refusal" })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "refusal" })
})

test("v0.4.17 pause_turn stops via protocol signal", () => {
  const policy = state()
  const turn = snap({ stopReason: "pause_turn", hadReasoning: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "pause-turn" })
})

test("v0.4.17 tool_use stops via protocol signal", () => {
  // Defensive: tool_use should not normally reach the result boundary (the
  // drain timer closes the stream first), but it is honored if it does.
  const policy = state()
  const turn = snap({ stopReason: "tool_use", hadToolActivity: true })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "tool-use" })
})

test("v0.4.17 unknown stop_reason still stops (forward-compat)", () => {
  // A stop_reason value Anthropic adds later is trusted as authoritative and
  // stops the turn; safer than running the keyword heuristic on an unknown
  // shape.
  const policy = state()
  const turn = snap({ stopReason: "future_value_we_dont_know" })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  const expected = {
    continue: false,
    reason: "future-value-we-dont-know",
  }
  assert.deepEqual(decision, expected)
})

test("v0.4.17 empty-string stop_reason falls through (falsy)", () => {
  // An empty string is falsy, so the heuristic runs exactly as it does for
  // null / undefined.
  const policy = state()
  const turn = snap({
    text: "We're done.",
    hadReasoning: true,
    stopReason: "",
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})

test("v0.4.16 missing stop_reason falls through (back-compat)", () => {
  // With stop_reason undefined or null the heuristic must run unchanged.
  // Protects CLI versions / code paths that do not surface the field.
  // Only the null case is exercised here.
  const policy = state()
  const turn = snap({
    text: "We're done.",
    hadReasoning: true,
    stopReason: null,
  })
  const decision = shouldAutoContinueIncompleteTurn(policy, turn)
  assert.deepEqual(decision, { continue: false, reason: "final-answer" })
})