2 changes: 2 additions & 0 deletions .gitignore
@@ -42,3 +42,5 @@ test-verify/
*.db
*.db-shm
*.db-wal
mine_test.log
test-easy-output/
7 changes: 4 additions & 3 deletions AGENTS.md
@@ -28,6 +28,7 @@ src/
│ ├── pr_cache.rs # SQLite-backed PR deduplication cache
│ ├── progress.rs # Background progress monitor for long-running pipeline runs
│ ├── github_search.rs # GitHub Search API client (alternative PR source)
│ ├── tool_server.rs # Embedded Python HTTP tool server injected into Docker containers
│ └── workspace_validator.rs # Pre-export workspace validation (Docker-based)
├── llm/ # LLM integration layer
│ ├── litellm.rs # OpenAI-compatible API client (function calling, tools)
@@ -41,7 +42,7 @@ src/
├── export/ # Parquet dataset export + HuggingFace Hub upload
├── difficulty/ # Difficulty levels, resource limits, scoring
├── anti_hardcoding/ # Canary strings, sealed parameters, contamination detection
├── runner/ # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs]
├── runner/ # Agent runner infrastructure (sandbox, verifier, agent adapters) [not compiled — not declared in lib.rs; exists as scaffolding]
├── utils/ # JSON extraction from LLM responses
└── error.rs # Typed error hierarchy (thiserror)
```
@@ -135,13 +136,13 @@ Git hooks are in `.githooks/` and activated via `git config core.hooksPath .githooks`

3. **Never leak test plans into agent prompts** — The `prompt_rewriter.rs` module strips test-specific information from PR bodies before generating `prompt.md`. Any new prompt generation code must ensure `fail_to_pass` and `pass_to_pass` test commands are never visible to the agent being evaluated.

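A minimal illustration of this invariant (hypothetical helper; the real `prompt_rewriter.rs` interface may differ):

```rust
/// Hypothetical guard, not the actual prompt_rewriter API: after prompt.md is
/// generated, verify that no fail_to_pass / pass_to_pass command survived.
fn assert_no_test_leak(prompt_md: &str, test_commands: &[String]) {
    for cmd in test_commands {
        assert!(
            !prompt_md.contains(cmd.as_str()),
            "test command leaked into agent prompt: {cmd}"
        );
    }
}
```
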
4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: memory (512MB–2GB), CPU (1–4 cores), PIDs (100–500). Never create containers without limits.
4. **Docker containers must have resource limits** — All container creation must use `apply_resource_limits()` from `src/docker/resources.rs`. Difficulty-based limits are enforced: PIDs (100–500), storage (1–5 GB), network mode (none/internal). Never create containers without limits.

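A self-contained sketch of the intent behind this rule; the type names and per-level values below are assumptions, not the actual `resources.rs` contents:

```rust
/// Illustrative only: names and per-level values are assumed, not taken from
/// src/docker/resources.rs. The point of the rule is that limits are always
/// derived from difficulty and applied before any container is created.
#[derive(Clone, Copy)]
enum DifficultyLevel {
    Easy,
    Medium,
    Hard,
}

#[derive(Default)]
struct ContainerConfig {
    pids_limit: Option<i64>,
    storage_size: Option<String>, // e.g. "1G".."5G"
    network_mode: Option<String>, // "none" or "internal"
}

fn apply_resource_limits(mut cfg: ContainerConfig, level: DifficultyLevel) -> ContainerConfig {
    let (pids, storage, network) = match level {
        DifficultyLevel::Easy => (100, "1G", "none"),
        DifficultyLevel::Medium => (250, "3G", "internal"),
        DifficultyLevel::Hard => (500, "5G", "internal"),
    };
    cfg.pids_limit = Some(pids);
    cfg.storage_size = Some(storage.to_string());
    cfg.network_mode = Some(network.to_string());
    cfg
}
```
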
5. **Respect GitHub API rate limits (5000 req/h)** — The pipeline uses semaphore-based concurrency (no chunk barriers). Each candidate needs ~2 API calls for enrichment. Never add unbounded concurrent GitHub API calls. Use the existing concurrency limits (enrichment: 10x, pre-classification: 25x, deep processing: 8x).

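A sketch of the bounded-concurrency pattern, assuming the tokio runtime; `Candidate` and `enrich` are placeholders:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Placeholder for the real candidate type (PR metadata).
struct Candidate;

/// Placeholder for the enrichment step (~2 GitHub API calls per candidate).
async fn enrich(_candidate: Candidate) {}

/// Bounded-concurrency sketch: a semaphore caps in-flight enrichment work
/// instead of processing candidates in fixed-size chunks.
async fn enrich_all(candidates: Vec<Candidate>) {
    let permits = Arc::new(Semaphore::new(10)); // enrichment stage limit: 10x
    let mut handles = Vec::new();
    for candidate in candidates {
        // Wait for a free slot before spawning, so API usage stays bounded.
        let permit = permits.clone().acquire_owned().await.expect("semaphore closed");
        handles.push(tokio::spawn(async move {
            let _permit = permit; // released when this task finishes
            enrich(candidate).await;
        }));
    }
    for handle in handles {
        let _ = handle.await;
    }
}
```
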
6. **All async code must be `Send + Sync` compatible** — The codebase uses `Arc<dyn LlmProvider>` extensively. Trait objects must be `Send + Sync`. Never introduce `Rc`, `RefCell`, or non-Send types in async contexts.

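A reduced illustration (the real trait lives in the `llm` module; the `async_trait` crate is assumed here):

```rust
use std::sync::Arc;

/// Reduced stand-in for the real trait: the Send + Sync bound is what lets
/// Arc<dyn LlmProvider> cross task boundaries.
#[async_trait::async_trait]
trait LlmProvider: Send + Sync {
    async fn generate(&self, prompt: &str) -> String;
}

/// Compiles only because the trait object is Send + Sync; an Rc or RefCell
/// captured here would fail tokio::spawn's Send requirement.
fn spawn_generation(
    provider: Arc<dyn LlmProvider>,
    prompt: String,
) -> tokio::task::JoinHandle<String> {
    tokio::spawn(async move { provider.generate(&prompt).await })
}
```
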
7. **Serde rename conventions must be `snake_case`** — All serializable enums use `#[serde(rename_all = "snake_case")]`. Task status, difficulty levels, and all API-facing types must follow this convention for YAML/JSON compatibility.
7. **Serde rename conventions** — Most serializable enums use `#[serde(rename_all = "snake_case")]` (e.g., `SweTaskStatus`, `HarnessStatus`, `CheckType`). `DifficultyLevel` and `NetworkMode` in `src/difficulty/` use `#[serde(rename_all = "lowercase")]`. Follow the existing convention for each type for YAML/JSON compatibility.

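For example (variant names are placeholders; only the `rename_all` attributes matter):

```rust
use serde::{Deserialize, Serialize};

// Most API-facing enums: snake_case.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
enum HarnessStatus {
    NotStarted, // serializes as "not_started"
    InProgress, // serializes as "in_progress"
}

// Difficulty types in src/difficulty/: lowercase.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
enum NetworkMode {
    None,     // serializes as "none"
    Internal, // serializes as "internal"
}
```
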
8. **Anti-hardcoding mechanisms must be preserved** — The `anti_hardcoding/` module provides canary strings, sealed parameters, and process validation. Never bypass contamination detection. Any new task generation must embed canary strings via `CanaryConfig::generate()`.

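A hypothetical sketch of the pattern; the real `CanaryConfig` API may differ:

```rust
use std::time::{SystemTime, UNIX_EPOCH};

/// Stand-in only: the real CanaryConfig lives in anti_hardcoding/ and its
/// signature may differ. The point of the rule is that every generated task
/// embeds a unique marker that contamination checks can search for later.
struct CanaryConfig;

impl CanaryConfig {
    fn generate() -> String {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock before UNIX epoch")
            .as_nanos();
        format!("BENCHMARK-CANARY-{nanos:x}")
    }
}

/// Embed the canary into a generated artifact (here: a task README).
fn embed_canary(task_readme: &mut String) {
    task_readme.push_str(&format!("\n<!-- {} -->\n", CanaryConfig::generate()));
}
```
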
159 changes: 135 additions & 24 deletions src/llm/litellm.rs
@@ -324,16 +324,26 @@ impl LiteLlmClient {
/// * `api_base` - Base URL for the LiteLLM API (e.g., "http://localhost:4000")
/// * `api_key` - Optional API key for authentication
/// * `default_model` - Default model to use when none is specified
pub fn new(api_base: String, api_key: Option<String>, default_model: String) -> Self {
Self {
///
/// # Errors
///
/// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
pub fn new(
api_base: String,
api_key: Option<String>,
default_model: String,
) -> Result<Self, LlmError> {
Ok(Self {
api_base,
api_key,
default_model,
http_client: Client::builder()
.timeout(Duration::from_secs(300))
.build()
.expect("Failed to build HTTP client"),
}
.map_err(|e| {
LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
})?,
})
}

/// Create a new LiteLLM client pre-configured for OpenRouter.
@@ -347,16 +357,22 @@ impl LiteLlmClient {
/// A client configured with:
/// - api_base: "https://openrouter.ai/api/v1"
/// - default_model: "anthropic/claude-opus-4.5"
pub fn new_with_defaults(api_key: String) -> Self {
Self {
///
/// # Errors
///
/// Returns `LlmError::RequestFailed` if the HTTP client cannot be built.
pub fn new_with_defaults(api_key: String) -> Result<Self, LlmError> {
Ok(Self {
api_base: "https://openrouter.ai/api/v1".to_string(),
api_key: Some(api_key),
default_model: "anthropic/claude-opus-4.5".to_string(),
http_client: Client::builder()
.timeout(Duration::from_secs(300))
.build()
.expect("Failed to build HTTP client"),
}
.map_err(|e| {
LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
})?,
})
}

/// Create a new LiteLLM client from environment variables.
@@ -382,7 +398,9 @@ impl LiteLlmClient {
http_client: Client::builder()
.timeout(Duration::from_secs(300))
.build()
.expect("Failed to build HTTP client"),
.map_err(|e| {
LlmError::RequestFailed(format!("Failed to build HTTP client: {e}"))
})?,
})
}

@@ -490,6 +508,10 @@ struct ApiRequest {
top_p: Option<f64>,
#[serde(skip_serializing_if = "Option::is_none")]
response_format: Option<ResponseFormat>,
#[serde(skip_serializing_if = "Option::is_none")]
tools: Option<serde_json::Value>,
#[serde(skip_serializing_if = "Option::is_none")]
tool_choice: Option<serde_json::Value>,
}

/// Internal response structure from the OpenAI-compatible API.
@@ -506,14 +528,42 @@ struct ApiResponse {
struct ApiChoice {
index: u32,
message: ApiMessage,
finish_reason: String,
#[serde(default)]
finish_reason: Option<String>,
}

/// Internal message structure from the API response.
/// Supports reasoning models and tool calls.
#[derive(Debug, Deserialize)]
struct ApiMessage {
role: String,
#[serde(default)]
content: String,
#[serde(default)]
reasoning: Option<String>,
#[serde(default)]
reasoning_content: Option<String>,
#[serde(default)]
tool_calls: Option<Vec<ApiToolCall>>,
}

/// A tool call returned by the model.
#[derive(Debug, Deserialize)]
struct ApiToolCall {
#[serde(default)]
id: String,
#[serde(rename = "type", default)]
_tool_type: String,
function: ApiToolCallFunction,
}

/// Function details within a tool call response.
#[derive(Debug, Deserialize)]
struct ApiToolCallFunction {
#[serde(default)]
name: String,
#[serde(default)]
arguments: String,
}

/// Internal usage structure from the API response.
@@ -549,13 +599,29 @@ impl LlmProvider for LiteLlmClient {
request.model.clone()
};

let tools =
match request.tools {
Some(t) => Some(serde_json::to_value(&t).map_err(|e| {
LlmError::RequestFailed(format!("Failed to serialize tools: {e}"))
})?),
None => None,
};
let tool_choice = match request.tool_choice {
Some(tc) => Some(serde_json::to_value(&tc).map_err(|e| {
LlmError::RequestFailed(format!("Failed to serialize tool_choice: {e}"))
})?),
None => None,
};

let api_request = ApiRequest {
model: model.clone(),
messages: request.messages,
temperature: request.temperature,
max_tokens: request.max_tokens,
top_p: request.top_p,
response_format: request.response_format,
tools,
tool_choice,
};

let url = format!("{}/chat/completions", self.api_base);
@@ -617,15 +683,52 @@ impl LlmProvider for LiteLlmClient {
let choices = api_response
.choices
.into_iter()
.map(|choice| Choice {
index: choice.index,
message: Message {
role: choice.message.role,
content: choice.message.content,
tool_calls: None,
tool_call_id: None,
},
finish_reason: choice.finish_reason,
.map(|choice| {
let tool_calls_info = choice.message.tool_calls.as_ref().map(|tcs| {
tcs.iter()
.map(|tc| ToolCallInfo {
id: tc.id.clone(),
call_type: "function".to_string(),
function: ToolCallFunction {
name: tc.function.name.clone(),
arguments: tc.function.arguments.clone(),
},
})
.collect::<Vec<_>>()
});

let content = if let Some(ref tool_calls) = choice.message.tool_calls {
if let Some(first_call) = tool_calls.first() {
if !first_call.function.arguments.is_empty() {
first_call.function.arguments.clone()
} else {
choice.message.content.clone()
}
} else {
choice.message.content.clone()
}
} else if !choice.message.content.trim().is_empty() {
choice.message.content
} else if let Some(rc) = choice.message.reasoning_content {
if !rc.trim().is_empty() {
rc
} else {
choice.message.reasoning.unwrap_or_default()
}
} else {
choice.message.reasoning.unwrap_or_default()
};

Choice {
index: choice.index,
message: Message {
role: choice.message.role,
content,
tool_calls: tool_calls_info,
tool_call_id: None,
},
finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".to_string()),
}
})
.collect();

@@ -849,7 +952,8 @@ mod tests {
"http://localhost:4000".to_string(),
Some("test-key".to_string()),
"gpt-4".to_string(),
);
)
.unwrap();

assert_eq!(client.api_base(), "http://localhost:4000");
assert_eq!(client.default_model(), "gpt-4");
@@ -862,14 +966,15 @@ mod tests {
"http://localhost:4000".to_string(),
None,
"gpt-4".to_string(),
);
)
.unwrap();

assert!(!client.has_api_key());
}

#[test]
fn test_litellm_client_new_with_defaults() {
let client = LiteLlmClient::new_with_defaults("test-api-key".to_string());
let client = LiteLlmClient::new_with_defaults("test-api-key".to_string()).unwrap();

assert_eq!(client.api_base(), "https://openrouter.ai/api/v1");
assert_eq!(client.default_model(), "anthropic/claude-opus-4.5");
@@ -883,7 +988,8 @@ mod tests {
"http://localhost:65535".to_string(), // Use a port that's unlikely to have a server
None,
"gpt-4".to_string(),
);
)
.unwrap();

let request = GenerationRequest::new("gpt-4", vec![Message::user("test")]);
let result = client.generate(request).await;
@@ -903,13 +1009,17 @@
max_tokens: Some(1000),
top_p: None,
response_format: None,
tools: None,
tool_choice: None,
};

let json = serde_json::to_string(&request).expect("serialization should succeed");
assert!(json.contains("\"model\":\"gpt-4\""));
assert!(json.contains("\"temperature\":0.7"));
assert!(json.contains("\"max_tokens\":1000"));
assert!(!json.contains("top_p")); // Should be skipped because None
assert!(!json.contains("tools")); // Should be skipped because None
assert!(!json.contains("tool_choice")); // Should be skipped because None
}

#[tokio::test]
@@ -918,7 +1028,8 @@ mod tests {
"http://localhost:65535".to_string(),
None,
"gpt-4".to_string(),
);
)
.unwrap();
let cache = PromptCache::new(100);

// Create a request with a system prompt
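Not shown in the diff, but implied by the new signatures: call sites of these constructors must now handle a `Result` instead of relying on an internal panic. A hedged sketch of the typical call-site change (import paths assumed):

```rust
// Sketch of the call-site shape implied by the new constructor signatures.
use crate::llm::litellm::LiteLlmClient; // path assumed
use crate::llm::LlmError;               // path assumed

fn build_client() -> Result<LiteLlmClient, LlmError> {
    let client = LiteLlmClient::new(
        "http://localhost:4000".to_string(),
        Some("api-key".to_string()),
        "gpt-4".to_string(),
    )?; // builder failures now surface as LlmError::RequestFailed
    Ok(client)
}
```
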
1 change: 1 addition & 0 deletions src/swe/AGENTS.md
@@ -22,6 +22,7 @@ Core SWE mining pipeline. Fetches merged pull requests from GH Archive, enriches
| `pipeline.rs` | Streaming pipeline with semaphore-based concurrency |
| `github_search.rs` | `GitHubSearchClient` — GitHub Search API as alternative PR source (30 req/min) |
| `workspace_validator.rs` | `WorkspaceValidator` — pre-export Docker-based validation (install, tests, patch application) |
| `tool_server.rs` | Embedded Python HTTP tool server injected into Docker containers (read_file, list_dir, grep_files, search_files, apply_patch) |
| `pr_cache.rs` | SQLite-backed PR deduplication cache |
| `progress.rs` | `ProgressMonitor` — background progress logging for long-running pipeline runs |
