16 changes: 15 additions & 1 deletion runtime/mount/agent.py
@@ -985,6 +985,17 @@ def submit_response(args: dict) -> dict:
plan_items = args.get("plan", [])
plan_diff_items = args.get("plan_diff", [])
expected_widgets = args.get("expected_widgets", [])
print(f"[tim] self.behavior: {self.behavior}")
print(f"[tim] self.behavior == Behavior.step_by_step: {self.behavior == Behavior.step_by_step}")
if self.behavior == Behavior.step_by_step and should_continue:
for diff in plan_diff_items:
action = diff.get("action")
print(f"[tim] action: {action}")
if action == "complete":
print(f"[tim] Potential Step-by-Step Mode Override: Forcing continue=False because step {diff.get('id')} was completed")
# should_continue = False
# next_status = "done"
# break

print("[tool] submit_response called with:")
print(f" - next_status: {next_status}")
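The override being prototyped above is still commented out in the diff. A standalone sketch of what it would do once enabled — `Behavior` and the `plan_diff` item shape mirror the code in this PR, but the helper function itself is hypothetical, not part of the change:

```python
# Hypothetical sketch of the step-by-step override from the diff above.
# Behavior mirrors the enum referenced in agent.py; this is not a tested API.
from enum import Enum


class Behavior(Enum):
    step_by_step = "step_by_step"
    default = "default"


def apply_step_override(behavior, plan_diff_items, should_continue, next_status):
    """Force the turn to end when any plan step was just completed."""
    if behavior == Behavior.step_by_step and should_continue:
        for diff in plan_diff_items:
            if diff.get("action") == "complete":
                # Stop and wait for user confirmation before the next step.
                return False, "done"
    return should_continue, next_status
```

The early return replaces the commented-out `should_continue = False` / `next_status = "done"` / `break` sequence in the diff.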
@@ -1616,7 +1627,7 @@ async def smart_ui_spotlight(args: dict) -> dict:
"type": "object",
"properties": {
"plan": {"type": "array", "description": "List of plan items"},
"plan_diff": {"type": "array", "description": "List of plan diff items"},
"plan_diff": {"type": "array", "description": "List of plan diff items. Every change to plan MUST have a corresponding plan_diff entry. Empty array means no status changes this turn."},
"plan_update_overview": {"type": "string", "description": "Short title overview of what changed in the plan. Should follow a format like 'Added a new step.' or 'Completed step 2, step 3 now in progress.'"},
"summary": {"type": "string", "description": "Summary text to help the user. This can be a message to the user or a description of what was accomplished. Use markdown formatting with bullet points if needed."},
"questions": {"type": "string", "description": "Optional question text for the user."},
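A minimal example of the contract stated in the updated `plan_diff` description — every plan status change paired with a diff entry. The field values are illustrative, not taken from this PR:

```python
# Illustrative submit_response arguments satisfying the updated schema;
# step ids, texts, and action names are hypothetical examples.
args = {
    "plan": [
        {"id": 1, "text": "Load data", "status": "done"},
        {"id": 2, "text": "Plot QC metrics", "status": "in_progress"},
    ],
    "plan_diff": [
        {"id": 1, "action": "complete"},
        {"id": 2, "action": "start"},
    ],
    "plan_update_overview": "Completed step 1, step 2 now in progress.",
}

# Every status change in `plan` has a matching `plan_diff` entry;
# an empty `plan_diff` array would mean no status changes this turn.
changed_ids = {d["id"] for d in args["plan_diff"]}
assert changed_ids == {1, 2}
```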
@@ -3216,6 +3227,9 @@ async def handle_query(self, msg: dict[str, object]) -> None:
if contextual_node_data:
full_query = f"{query} \n\nHere is the context of the selected nodes the user would like to use: <ContextualNodeData>{json.dumps(contextual_node_data)}</ContextualNodeData>"

if self.behavior:
full_query += f"\n\nFollow Current Behavior Mode: {self.behavior.value}"

await self.pending_messages.put({
"type": "user_query",
"content": full_query,
38 changes: 30 additions & 8 deletions runtime/mount/agent_config/context/step_by_step_behavior.md
@@ -4,23 +4,45 @@ Each turn processes one user message (question, request, cell execution result,

## Guiding Principles

- Gather requirements as you go with the user.
- Collaborate closely with frequent user interaction.
- Ask the user for widget values instead of choosing defaults.
- Complete one step at a time and confirm before moving on.
- Ask clarifying questions whenever they help.
- **Scientific Logic**: Review **EVERY** operation or plot that reflects decisions about scientific reasoning.
- **Anticipatory Evidence**: Anticipate and generate "intermediate" plots and metrics to guide analysis decisions without waiting to be asked.
- **Step-by-Step Verification**: Complete one logical step at a time, presenting this evidence to confirm the approach before moving to the next stage.
- **Collaborative Requirement Gathering**: Gather requirements iteratively as the data's properties become clear.

## Interaction Override: Handling System Prompt Examples

Follow these rules to override the system prompt examples:

- **Scientific Auto-Correction vs. Error Fixing**:
- **FORBIDDEN**: Automatically changing analysis parameters, filtering thresholds, or methods because the results "look bad" (e.g., "Retention too low, trying new threshold"). Always present the result and ask.
- **ALLOWED**: Automatically fixing code errors (SyntaxError, NameError, ImportError) to make the cell run.
- **Do not chain** complex steps without confirmation.
- **NEVER** call `submit_response` with `next_status`: `awaiting_cell_execution`.
- **After ANY successful cell execution** that generates a plot or metric, you **MUST** call `submit_response` with `continue: false` and `next_status: done`. If the cell failed with an error, however, you may continue in order to fix it.
- **ALWAYS** adopt the "Generate Evidence & Wait" pattern:
1. Generate the diagnostic plot/table.
2. Explain what it shows.
  3. **STOP** (`continue: false`, `next_status: done`) and ask the user how they want to proceed.
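The stop in step 3 corresponds to a `submit_response` call shaped roughly like this. Argument names come from the schema in `agent.py`; the summary and question text are invented examples, and the exact call signature is an assumption:

```python
# Hypothetical end-of-turn call implementing "Generate Evidence & Wait";
# submit_response is a stand-in for the tool defined in agent.py.
def end_turn(submit_response):
    return submit_response({
        "summary": "Plotted read-count distribution; median is 4,200 reads/cell.",
        "questions": "Filter cells below 1,000 reads, or use a different threshold?",
        "continue": False,      # stop: wait for the user's decision
        "next_status": "done",  # never awaiting_cell_execution in this mode
    })
```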

## Plan Execution Strategy

In this mode, a "Plan Step" is **NOT** a license to execute all cells for that step at once.

- **Atomic Execution**: Break each plan step into atomic verification units (e.g., 1 cell = 1 unit).
- **One at a Time**: Execute ONE unit, then **STOP**.
- **Verify then Proceed**: Only move to the next unit after the user confirms the previous one.
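One way to read the three rules above as code — a generator that yields after each atomic unit, so the caller can stop for user confirmation before resuming. `run_cell` and the unit list are hypothetical stand-ins, not APIs from this PR:

```python
# Illustrative only: a plan step broken into atomic one-cell units,
# executed one per turn. run_cell is a hypothetical cell-execution hook.
def execute_step(units, run_cell):
    for unit in units:
        result = run_cell(unit)  # exactly one cell this turn
        yield result             # then STOP; resume only after confirmation
```

Calling `next()` once per turn enforces "One at a Time": execution cannot advance to the second unit until the caller explicitly asks for it.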

## Turn Flow

1. Process user input
2. Update plan status if working on a plan
3. Execute actions (create/edit cells, ask questions, etc.)
4. Call `submit_response` with current state
5. Either continue (if `continue: true`) or wait for next input
5. Either continue (if `continue: true`) or wait for next user input

## Turn End Requirement

**Every turn MUST end with `submit_response`**. After calling `submit_response`:

- If `continue: true` → Immediately proceed to next action
- If `continue: false` → Turn ends, wait for next user input or cell execution result. Default to this for most turns to allow user verification/feedback
- If `continue: false` → Turn ends, wait for next user input. Default to this for most turns to allow user verification/feedback
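The two branches above can be sketched as a loop. The surrounding runtime loop is an assumption about how the agent drives turns, not code from this PR:

```python
# Hypothetical turn loop honoring the continue flag from submit_response.
def run_turns(next_action, wait_for_user):
    while True:
        response = next_action()
        if response.get("continue"):
            continue          # continue: true -> immediately take next action
        wait_for_user()       # continue: false -> turn ends, await user input
        break
```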