diff --git a/docs/agents/cookbook.md b/docs/agents/cookbook.md
new file mode 100644
index 000000000..703678866
--- /dev/null
+++ b/docs/agents/cookbook.md
@@ -0,0 +1,493 @@
+---
+sidebar_position: 7
+---
+
+# Agent Cookbook
+
+A collection of common agent patterns with full working code examples. Copy-paste and adapt these recipes for your use case.
+
+---
+
+## 1. Web research agent
+
+Search the web and summarize findings into a structured report:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, tool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit import LLMConfig, WebSearchTool, WebFetchTool
+
+@tool
+def summarize_page(url: str, question: str) -> str:
+    """Fetch a web page and extract content relevant to a question."""
+    import urllib.request
+    try:
+        with urllib.request.urlopen(url, timeout=5) as resp:
+            html = resp.read().decode("utf-8", errors="ignore")
+        # Strip HTML tags (simple)
+        import re
+        text = re.sub(r"<[^>]+>", " ", html)
+        text = " ".join(text.split())[:2000]
+        return text
+    except Exception as e:
+        return f"Could not fetch {url}: {e}"
+
+llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[WebSearchTool(), WebFetchTool(), summarize_page],
+    agent_type="function_calling",
+    max_steps=10,
+    system_prompt=(
+        "You are a research assistant. Search for information, read sources, "
+        "and produce a concise summary with key findings and sources cited."
+    ),
+))
+
+report = asyncio.run(executor.run(
+    "Research the current state of open-source LLMs in 2026. "
+    "What are the top 3 models and their key capabilities?"
+))
+print(report)
+```
+
+---
+
+## 2. Code review agent
+
+Read a file, analyze it, and produce a structured review:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, tool
+from synapsekit.llm.anthropic import AnthropicLLM
+from synapsekit import LLMConfig, FileReadTool, PythonREPLTool
+
+@tool
+def run_tests(test_command: str) -> str:
+    """Run the test suite and return results."""
+    import subprocess
+    result = subprocess.run(
+        test_command.split(),
+        capture_output=True,
+        text=True,
+        timeout=30,
+    )
+    return result.stdout + result.stderr
+
+llm = AnthropicLLM(LLMConfig(
+    model="claude-sonnet-4-6",
+    api_key="sk-ant-...",
+    max_tokens=4096,
+))
+
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[FileReadTool(), PythonREPLTool(), run_tests],
+    agent_type="function_calling",
+    system_prompt=(
+        "You are a senior code reviewer. Read the file, check for bugs, "
+        "style issues, and performance problems. Run tests if available. "
+        "Return a structured review with: summary, issues (critical/minor), "
+        "and specific suggestions."
+    ),
+))
+
+review = asyncio.run(executor.run(
+    "Review the file at src/synapsekit/llm/openai.py. "
+    "Focus on error handling and async safety."
+))
+print(review)
+```
+
+---
+
+## 3. Customer support agent
+
+Classify intent, respond appropriately, and escalate if needed:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, tool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit import LLMConfig
+
+@tool
+def lookup_order(order_id: str) -> dict:
+    """Look up an order by ID in the database."""
+    # In practice, query your database
+    orders = {
+        "ORD-123": {"status": "shipped", "eta": "2 days", "item": "SynapseKit Pro"},
+        "ORD-456": {"status": "processing", "eta": "5 days", "item": "SynapseKit Basic"},
+    }
+    return orders.get(order_id, {"error": f"Order {order_id} not found"})
+
+@tool
+def create_support_ticket(
+    category: str,
+    priority: str,
+    description: str,
+    customer_email: str,
+) -> dict:
+    """Create a support ticket and assign it to the right team."""
+    ticket_id = f"TKT-{hash(description) % 10000:04d}"
+    return {
+        "ticket_id": ticket_id,
+        "category": category,
+        "priority": priority,
+        "assigned_to": "billing@company.com" if category == "billing" else "support@company.com",
+        "message": f"Ticket {ticket_id} created and team notified.",
+    }
+
+@tool
+def send_response_email(to: str, subject: str, body: str) -> str:
+    """Send a support response email to the customer."""
+    # In practice, use SMTP or a mailing API
+    print(f"EMAIL → {to}: {subject}\n{body}")
+    return f"Email sent to {to}"
+
+llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[lookup_order, create_support_ticket, send_response_email],
+    agent_type="function_calling",
+    system_prompt=(
+        "You are a customer support agent. Classify the customer's intent "
+        "(billing/order/technical/other), look up relevant information, "
+        "respond helpfully, and escalate complex issues by creating a ticket."
+    ),
+))
+
+response = asyncio.run(executor.run(
+    "Hi, I'm John (john@example.com). My order ORD-123 hasn't arrived yet "
+    "and I'm worried. Can you help?"
+))
+print(response)
+```
+
+---
+
+## 4. Data analysis agent
+
+Load a CSV, compute statistics, and generate a chart:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit import LLMConfig, PythonREPLTool, FileReadTool, FileWriteTool
+
+llm = OpenAILLM(LLMConfig(model="gpt-4o", api_key="sk-..."))
+
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[PythonREPLTool(), FileReadTool(), FileWriteTool()],
+    agent_type="function_calling",
+    system_prompt=(
+        "You are a data analyst. Use Python to load data, compute statistics, "
+        "and create visualizations. Save charts as PNG files. "
+        "Always explain your findings in plain English after analysis."
+    ),
+))
+
+analysis = asyncio.run(executor.run(
+    "Load sales_data.csv, compute monthly revenue totals, "
+    "identify the top 3 months, and save a bar chart as sales_chart.png."
+))
+print(analysis)
+# Loaded sales_data.csv with 1,200 rows...
+# Top 3 months: March ($52,400), December ($49,800), October ($44,100)
+# Chart saved to sales_chart.png.
+```
+
+---
+
+## 5. Multi-provider fallback agent
+
+Try OpenAI first, automatically fall back to Anthropic if it fails:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, CalculatorTool, WebSearchTool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit.llm.anthropic import AnthropicLLM
+from synapsekit import LLMConfig
+from synapsekit.exceptions import LLMError, RateLimitError
+
+async def get_agent_with_fallback():
+    """Build an agent that falls back from OpenAI to Anthropic."""
+    tools = [CalculatorTool(), WebSearchTool()]
+
+    try:
+        llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+        # Quick health check
+        await llm.generate("ping", max_tokens=5)
+        print("Using OpenAI gpt-4o-mini")
+    except (LLMError, RateLimitError):
+        print("OpenAI unavailable, falling back to Anthropic")
+        llm = AnthropicLLM(LLMConfig(
+            model="claude-haiku-4-5",
+            api_key="sk-ant-...",
+            max_tokens=1024,
+        ))
+
+    return AgentExecutor(AgentConfig(
+        llm=llm,
+        tools=tools,
+        agent_type="function_calling",
+    ))
+
+async def main():
+    executor = await get_agent_with_fallback()
+    answer = await executor.run("What is 15% of $1,250, and search for the latest Python version?")
+    print(answer)
+
+asyncio.run(main())
+```
+
+---
+
+## 6. Agent with Redis memory
+
+Persist conversation context across sessions using Redis:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, CalculatorTool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit import LLMConfig
+from synapsekit.memory import RedisMemory
+
+llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+
+# Memory persists across process restarts
+memory = RedisMemory(
+    url="redis://localhost:6379",
+    session_id="user-42-session",   # unique per user/session
+    ttl_seconds=3600,               # expire after 1 hour
+)
+
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[CalculatorTool()],
+    agent_type="function_calling",
+    memory=memory,
+))
+
+# First turn
+answer1 = asyncio.run(executor.run("My name is Alice and my budget is $500."))
+print(answer1)  # Nice to meet you, Alice!
+
+# Second turn (works from a different process too, given the same session_id) — agent remembers Alice
+answer2 = asyncio.run(executor.run("How much can I spend on each of 4 items?"))
+print(answer2)  # Based on your $500 budget, you can spend $125 per item.
+```
+
+---
+
+## 7. Human-in-the-loop agent
+
+Pause for human approval before executing sensitive actions:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, tool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit import LLMConfig
+from synapsekit.exceptions import ToolError
+
+APPROVED_ACTIONS: set[str] = set()  # Track approved actions
+
+@tool
+def send_email(to: str, subject: str, body: str) -> str:
+    """Send an email. Requires human approval before sending."""
+    action_key = f"email:{to}:{subject}"
+
+    if action_key not in APPROVED_ACTIONS:
+        # Pause and ask for approval
+        print(f"\n[APPROVAL REQUIRED]")
+        print(f"  To: {to}")
+        print(f"  Subject: {subject}")
+        print(f"  Body: {body[:200]}...")
+        confirmed = input("Approve? (yes/no): ").strip().lower()
+        if confirmed != "yes":
+            raise ToolError(f"User rejected sending email to {to}")
+        APPROVED_ACTIONS.add(action_key)
+
+    # In practice, send via SMTP
+    return f"Email sent to {to} with subject '{subject}'"
+
+@tool
+def delete_file(path: str) -> str:
+    """Delete a file. Always requires human approval."""
+    print(f"\n[APPROVAL REQUIRED] Delete file: {path}")
+    confirmed = input("Confirm deletion? (yes/no): ").strip().lower()
+    if confirmed != "yes":
+        raise ToolError(f"User rejected deleting {path}")
+    import os
+    os.remove(path)
+    return f"Deleted {path}"
+
+llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[send_email, delete_file],
+    agent_type="function_calling",
+))
+
+asyncio.run(executor.run(
+    "Send a welcome email to new_user@example.com with subject 'Welcome!' "
+    "and a brief welcome message."
+))
+```
+
+---
+
+## 8. Cost-bounded agent
+
+Hard-stop the agent when it exceeds a cost budget:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, WebSearchTool, PythonREPLTool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit import LLMConfig, BudgetGuard
+from synapsekit.observability import CostTracker
+from synapsekit.exceptions import BudgetExceededError
+
+tracker = CostTracker()
+llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+llm.attach_tracker(tracker)
+
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[WebSearchTool(), PythonREPLTool()],
+    agent_type="function_calling",
+    max_steps=20,
+    budget_guard=BudgetGuard(
+        tracker=tracker,
+        max_cost_usd=0.10,             # Stop if cost exceeds $0.10
+        on_exceeded="raise",           # or "warn" to log and continue
+    ),
+))
+
+try:
+    answer = asyncio.run(executor.run(
+        "Research and compare the top 10 vector databases. "
+        "For each, find pricing, performance benchmarks, and key features."
+    ))
+    print(answer)
+    print(f"\nTotal cost: ${tracker.total_cost_usd:.4f}")
+except BudgetExceededError as e:
+    print(f"Budget exceeded: {e}")
+    print(f"Spent: ${tracker.total_cost_usd:.4f} / $0.10 limit")
+    print("Partial result:", e.partial_result)
+```
+
+---
+
+## 9. Streaming agent
+
+Stream tokens to the user as the agent generates its final response:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, CalculatorTool, WebSearchTool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit import LLMConfig
+
+llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+
+executor = AgentExecutor(AgentConfig(
+    llm=llm,
+    tools=[CalculatorTool(), WebSearchTool()],
+    agent_type="function_calling",
+))
+
+async def main():
+    print("Agent: ", end="", flush=True)
+    async for token in executor.stream(
+        "Search for the current population of Tokyo and calculate "
+        "what percentage that is of Japan's total population."
+    ):
+        print(token, end="", flush=True)
+    print()  # newline at end
+
+asyncio.run(main())
+# Agent: Tokyo's population is approximately 13.96 million, which represents
+# about 11.2% of Japan's total population of 125.7 million.
+```
+
+---
+
+## 10. Multi-agent pipeline
+
+Chain a researcher, writer, and reviewer in sequence:
+
+```python
+import asyncio
+from synapsekit import AgentExecutor, AgentConfig, WebSearchTool, WebFetchTool
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit.llm.anthropic import AnthropicLLM
+from synapsekit import LLMConfig, FileWriteTool
+
+# Agent 1: Researcher — gathers facts
+researcher = AgentExecutor(AgentConfig(
+    llm=OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-...")),
+    tools=[WebSearchTool(), WebFetchTool()],
+    agent_type="function_calling",
+    system_prompt=(
+        "You are a researcher. Search for information and return a list of "
+        "key facts with sources. Be thorough and cite your sources."
+    ),
+))
+
+# Agent 2: Writer — turns facts into a blog post
+writer = AgentExecutor(AgentConfig(
+    llm=AnthropicLLM(LLMConfig(
+        model="claude-sonnet-4-6",
+        api_key="sk-ant-...",
+        max_tokens=4096,
+    )),
+    tools=[FileWriteTool()],
+    agent_type="function_calling",
+    system_prompt=(
+        "You are a technical writer. Given research notes, write a clear, "
+        "engaging blog post. Use markdown. Save the result as a file."
+    ),
+))
+
+# Agent 3: Reviewer — checks quality and suggests edits
+reviewer = AgentExecutor(AgentConfig(
+    llm=OpenAILLM(LLMConfig(model="gpt-4o", api_key="sk-...")),
+    tools=[],
+    agent_type="function_calling",
+    system_prompt=(
+        "You are an editor. Review the draft and provide specific, actionable "
+        "feedback on clarity, accuracy, and structure. Score it 1-10."
+    ),
+))
+
+async def run_pipeline(topic: str) -> str:
+    print(f"[1/3] Researching: {topic}")
+    research_notes = await researcher.run(f"Research: {topic}")
+
+    print("[2/3] Writing draft...")
+    draft = await writer.run(
+        f"Write a blog post using these research notes:\n\n{research_notes}\n\n"
+        f"Save it as '{topic.replace(' ', '_')}.md'"
+    )
+
+    print("[3/3] Reviewing draft...")
+    review = await reviewer.run(
+        f"Review this blog post draft:\n\n{draft}"
+    )
+
+    return f"Draft complete.\n\nReview:\n{review}"
+
+result = asyncio.run(run_pipeline("the future of vector databases in 2026"))
+print(result)
+```
diff --git a/docs/agents/overview.md b/docs/agents/overview.md
index 48876377a..24bf5a555 100644
--- a/docs/agents/overview.md
+++ b/docs/agents/overview.md
@@ -4,7 +4,7 @@ sidebar_position: 1
 
 # Agents Overview
 
-SynapseKit agents are async-first, tool-using AI systems that reason and act to complete tasks.
+SynapseKit agents are async-first, tool-using AI systems that reason and act to complete tasks. An agent combines an LLM with a set of tools, loops until a task is complete, and tracks the full reasoning trace.
 
 ## Core concepts
 
@@ -37,33 +37,146 @@ answer = asyncio.run(executor.run("What is 2 ** 10 + 24?"))
 print(answer)  # "The answer is 1048."
 ```
 
+## Agent type selection guide
+
+Choose your agent type based on your LLM and task requirements:
+
+| Scenario | Recommended type | Why |
+|---|---|---|
+| OpenAI or Anthropic LLM | `function_calling` | Native tool_calls, more reliable |
+| Any other LLM (Ollama, Mistral, etc.) | `react` | Works via structured text prompts |
+| Need full control over loop | `react` | Easy to inspect Thought/Action/Observation |
+| Production with strict tool schemas | `function_calling` | Typed arguments, fewer hallucinations |
+| Local/offline models | `react` | No function-calling API needed |
+| MCP (Model Context Protocol) tools | `mcp` | Connects to external MCP servers |
+
+## Agent types
+
+**`"react"`** — Works with any LLM. Uses a structured text prompt (Thought/Action/Observation). No native function calling required. Best for local models and providers without tool-use APIs.
+
+**`"function_calling"`** — Requires `OpenAILLM` or `AnthropicLLM`. Uses native `tool_calls` / `tool_use` for more reliable tool selection and type-safe arguments.
+
+**`"mcp"`** — Connects to external [Model Context Protocol](https://modelcontextprotocol.io) servers. Access any MCP-compatible tool (filesystem, databases, APIs) without writing wrapper code.
+
 ## Built-in tools
 
+SynapseKit includes 32+ built-in tools organized by category:
+
+### Math and code
+| Tool | Class | Description |
+|---|---|---|
+| Calculator | `CalculatorTool` | Safe math eval (sqrt, trig, log, etc.) |
+| Python REPL | `PythonREPLTool` | Execute Python with persistent namespace |
+| Shell | `ShellTool` | Run shell commands (use with care) |
+
+### Web and search
 | Tool | Class | Extra | Description |
 |---|---|---|---|
-| Calculator | `CalculatorTool` | none | Safe math eval |
-| Python REPL | `PythonREPLTool` | none | Execute Python code |
-| File Read | `FileReadTool` | none | Read local files |
 | Web Search | `WebSearchTool` | `synapsekit[search]` | DuckDuckGo search |
-| SQL Query | `SQLQueryTool` | none (SQLite) / `sqlalchemy` | SQL SELECT queries |
+| Web Fetch | `WebFetchTool` | none | Fetch and parse a URL |
+| Wikipedia | `WikipediaTool` | none | Search Wikipedia |
+| News Search | `NewsSearchTool` | `synapsekit[search]` | Recent news articles |
 
-## Agent types
+### File and data
+| Tool | Class | Description |
+|---|---|---|
+| File Read | `FileReadTool` | Read local files (text, JSON, CSV) |
+| File Write | `FileWriteTool` | Write content to local files |
+| Directory List | `DirectoryListTool` | List files in a directory |
+| CSV Reader | `CSVReaderTool` | Load and query CSV files |
+| JSON Parser | `JSONParserTool` | Parse and extract fields from JSON |
 
-**`"react"`** — Works with any LLM. Uses a structured text prompt (Thought/Action/Observation). No native function calling required.
+### Database
+| Tool | Class | Extra | Description |
+|---|---|---|---|
+| SQL Query | `SQLQueryTool` | `sqlalchemy` optional | SQL SELECT queries |
+| SQLite | `SQLiteTool` | none | SQLite read/write |
+| MongoDB | `MongoDBTool` | `motor` | MongoDB queries |
 
-**`"function_calling"`** — Requires `OpenAILLM` or `AnthropicLLM`. Uses native tool_calls / tool_use for more reliable tool selection.
+### APIs and integrations
+| Tool | Class | Extra | Description |
+|---|---|---|---|
+| HTTP Request | `HTTPRequestTool` | none | GET/POST any HTTP endpoint |
+| GitHub | `GitHubTool` | none | Read repos, issues, PRs |
+| Slack | `SlackTool` | `slack-sdk` | Send Slack messages |
+| Email | `EmailTool` | none | Send emails via SMTP |
+
+### AI and ML
+| Tool | Class | Description |
+|---|---|---|
+| Image Describer | `ImageDescriberTool` | Describe images using vision LLM |
+| Text Classifier | `TextClassifierTool` | Zero-shot classification |
+| Embeddings | `EmbeddingsTool` | Compute text embeddings |
+| Vector Search | `VectorSearchTool` | Similarity search over a vector store |
+
+## ReActAgent vs FunctionCallingAgent vs MCPAgent
+
+| Feature | `ReActAgent` | `FunctionCallingAgent` | `MCPAgent` |
+|---|---|---|---|
+| LLM requirement | Any LLM | OpenAI / Anthropic only | Any LLM with function calling |
+| Tool format | Text prompt | JSON schema (tool_calls) | MCP protocol |
+| Reliability | Good | Excellent | Depends on MCP server |
+| Tracing | Thought/Action/Obs | Tool call history | Tool call history |
+| Best for | Flexibility, local LLMs | Production, typed outputs | Ecosystem integrations |
+| Streaming | Yes | Yes | Yes |
+| Max steps | Configurable | Configurable | Configurable |
+
+## Multi-agent patterns
+
+For complex tasks, coordinate multiple agents:
+
+```python
+from synapsekit.multi_agent import HandoffChain, Crew
+
+# Sequential: researcher → writer → reviewer
+chain = HandoffChain([
+    researcher_agent,
+    writer_agent,
+    reviewer_agent,
+])
+result = await chain.run("Write a technical blog post about vector databases.")
+
+# Parallel crew: multiple agents tackle sub-tasks simultaneously
+crew = Crew(agents=[data_agent, chart_agent, summary_agent])
+results = await crew.run("Analyze Q4 sales data.")
+```
+
+See [Multi-Agent](../multi-agent/overview) for full patterns.
 
 ## Sync usage
 
 ```python
 executor = AgentExecutor(AgentConfig(llm=llm, tools=[CalculatorTool()]))
 answer = executor.run_sync("What is sqrt(144)?")
+print(answer)  # "The square root of 144 is 12."
+```
+
+## Cost and latency tips
+
+- Use `gpt-4o-mini` or `llama-3.1-8b` for agent loops — cheaper and faster per step
+- Set `max_steps` to cap runaway loops: `AgentConfig(max_steps=10)`
+- Use `BudgetGuard` to hard-stop on cost: `AgentConfig(budget_guard=BudgetGuard(max_cost_usd=0.10))`
+- Enable caching on the LLM to avoid re-calling identical sub-queries
+- For latency-sensitive agents, use Groq or Cerebras for the underlying LLM
+
+```python
+from synapsekit import AgentConfig, BudgetGuard
+
+config = AgentConfig(
+    llm=llm,
+    tools=[WebSearchTool(), CalculatorTool()],
+    agent_type="function_calling",
+    max_steps=15,
+    budget_guard=BudgetGuard(max_cost_usd=0.50),
+)
 ```
 
 ## Next steps
 
 - [ReAct Agent](./react) — prompt-based reasoning loop that works with any LLM
 - [Function Calling Agent](./function-calling) — native tool_calls for OpenAI and Anthropic
-- [Built-in Tools](./tools) — all 24 tools with usage examples
+- [Built-in Tools](./tools) — all 32+ tools with usage examples
 - [AgentExecutor](./executor) — unified runner, multi-step loops, and streaming
+- [Agent Cookbook](./cookbook) — 10 common patterns with full code examples
+- [Tool Authoring Guide](./tool-authoring) — build custom tools with `@tool` and `BaseTool`
 - [Multi-Agent](../multi-agent/overview) — coordinating multiple agents with message passing
diff --git a/docs/agents/tool-authoring.md b/docs/agents/tool-authoring.md
new file mode 100644
index 000000000..a03482b1f
--- /dev/null
+++ b/docs/agents/tool-authoring.md
@@ -0,0 +1,457 @@
+---
+sidebar_position: 8
+---
+
+# Tool Authoring Guide
+
+Write custom tools for SynapseKit agents using the `@tool` decorator or `BaseTool` class.
+
+## The `@tool` decorator
+
+The simplest way to create a tool. SynapseKit generates the JSON Schema automatically from type hints and the docstring:
+
+```python
+from synapsekit import tool
+
+@tool
+def get_weather(city: str, unit: str = "celsius") -> str:
+    """Get the current weather for a city.
+
+    Args:
+        city: Name of the city (e.g. 'London', 'Tokyo').
+        unit: Temperature unit, either 'celsius' or 'fahrenheit'.
+    """
+    # Your implementation here
+    return f"Sunny, 22 {unit} in {city}"
+```
+
+SynapseKit reads the function signature and docstring to produce the schema:
+
+```json
+{
+  "type": "function",
+  "function": {
+    "name": "get_weather",
+    "description": "Get the current weather for a city.",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "city": {"type": "string", "description": "Name of the city"},
+        "unit": {"type": "string", "description": "Temperature unit", "default": "celsius"}
+      },
+      "required": ["city"]
+    }
+  }
+}
+```
+
+## Type hint to JSON Schema mapping
+
+| Python type | JSON Schema type | Notes |
+|---|---|---|
+| `str` | `"string"` | |
+| `int` | `"integer"` | |
+| `float` | `"number"` | |
+| `bool` | `"boolean"` | |
+| `list` | `"array"` | |
+| `list[str]` | `"array"` with `"items": {"type": "string"}` | |
+| `dict` | `"object"` | |
+| `Optional[str]` | `"string"` | Not required |
+| `str \| None` | `"string"` | Not required |
+| `Literal["a", "b"]` | `"string"` with `"enum": ["a", "b"]` | |
+
+## Async tools
+
+Tools can be async for I/O-bound operations like HTTP requests:
+
+```python
+import httpx
+from synapsekit import tool
+
+@tool
+async def fetch_github_repo(owner: str, repo: str) -> dict:
+    """Fetch metadata about a GitHub repository.
+
+    Args:
+        owner: GitHub username or organization name.
+        repo: Repository name.
+    """
+    url = f"https://api.github.com/repos/{owner}/{repo}"
+    async with httpx.AsyncClient() as client:
+        resp = await client.get(url, headers={"Accept": "application/vnd.github+json"})
+        resp.raise_for_status()
+    data = resp.json()
+    return {
+        "name": data["full_name"],
+        "stars": data["stargazers_count"],
+        "forks": data["forks_count"],
+        "description": data["description"],
+        "language": data["language"],
+        "open_issues": data["open_issues_count"],
+    }
+```
+
+SynapseKit automatically handles async tools — the agent executor awaits them correctly.
+
+## Class-based tools
+
+For tools that need configuration, dependencies, or state, use `BaseTool`:
+
+```python
+from synapsekit.tools import BaseTool
+import sqlite3
+
+class DatabaseQueryTool(BaseTool):
+    name = "query_database"
+    description = "Execute a SQL SELECT query on the application database."
+
+    def __init__(self, db_path: str, allowed_tables: list[str] | None = None):
+        self.db_path = db_path
+        self.allowed_tables = allowed_tables or []
+
+    @property
+    def schema(self) -> dict:
+        return {
+            "type": "function",
+            "function": {
+                "name": self.name,
+                "description": self.description,
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "query": {
+                            "type": "string",
+                            "description": "SQL SELECT query to execute",
+                        },
+                        "limit": {
+                            "type": "integer",
+                            "description": "Maximum rows to return",
+                            "default": 50,
+                        },
+                    },
+                    "required": ["query"],
+                },
+            },
+        }
+
+    async def run(self, query: str, limit: int = 50) -> list[dict]:
+        # Validate query
+        if not query.strip().upper().startswith("SELECT"):
+            raise ValueError("Only SELECT queries are allowed")
+
+        # Check table access (optional security guard)
+        if self.allowed_tables:
+            for table in self.allowed_tables:
+                if table.lower() in query.lower():
+                    break
+            else:
+                raise ValueError(
+                    f"Query must reference one of: {self.allowed_tables}"
+                )
+
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        cursor = conn.execute(f"{query} LIMIT {limit}")
+        results = [dict(row) for row in cursor.fetchall()]
+        conn.close()
+        return results
+
+
+# Usage
+db_tool = DatabaseQueryTool(
+    db_path="production.db",
+    allowed_tables=["orders", "products", "customers"],
+)
+
+from synapsekit import FunctionCallingAgent
+from synapsekit.llm.openai import OpenAILLM
+from synapsekit.llm.base import LLMConfig
+
+agent = FunctionCallingAgent(
+    llm=OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-...")),
+    tools=[db_tool],
+)
+
+answer = await agent.run("How many orders were placed in March 2026?")
+```
+
+## Structured return types
+
+Use dataclasses or Pydantic models for typed tool outputs:
+
+```python
+from dataclasses import dataclass
+from synapsekit import tool
+
+@dataclass
+class StockQuote:
+    ticker: str
+    price: float
+    change_pct: float
+    volume: int
+    currency: str = "USD"
+
+@tool
+def get_stock_quote(ticker: str) -> StockQuote:
+    """Get the current stock quote for a ticker symbol.
+
+    Args:
+        ticker: Stock ticker symbol (e.g. 'AAPL', 'GOOG').
+    """
+    # In practice, call a financial API
+    return StockQuote(
+        ticker=ticker,
+        price=185.20,
+        change_pct=1.35,
+        volume=52_300_000,
+    )
+```
+
+The dataclass is automatically serialized to a JSON dict when returned to the LLM.
+
+## Error handling in tools
+
+Raise `ToolError` for expected failures. The agent will see the error message and can decide how to proceed:
+
+```python
+from synapsekit import tool
+from synapsekit.exceptions import ToolError
+
+@tool
+def read_file(path: str) -> str:
+    """Read a text file and return its contents.
+
+    Args:
+        path: Absolute or relative path to the file.
+    """
+    import os
+
+    if not os.path.exists(path):
+        raise ToolError(f"File not found: {path}")
+
+    if os.path.getsize(path) > 10 * 1024 * 1024:  # 10 MB limit
+        raise ToolError(f"File too large (>10MB): {path}")
+
+    try:
+        with open(path, encoding="utf-8") as f:
+            return f.read()
+    except PermissionError:
+        raise ToolError(f"Permission denied reading: {path}")
+    except UnicodeDecodeError:
+        raise ToolError(f"File is not valid UTF-8 text: {path}")
+```
+
+The agent will receive the error as an observation and can try a different approach (e.g., try a different path, skip the file, or inform the user).
+
+## Tool with retry logic
+
+For tools that call flaky external services:
+
+```python
+import asyncio
+import httpx
+from synapsekit import tool
+from synapsekit.exceptions import ToolError
+
+@tool
+async def call_external_api(endpoint: str, payload: dict) -> dict:
+    """Call an external REST API endpoint with automatic retry.
+
+    Args:
+        endpoint: Full URL of the API endpoint.
+        payload: JSON payload to send in the POST request.
+    """
+    max_retries = 3
+    last_error = None
+
+    for attempt in range(max_retries):
+        try:
+            async with httpx.AsyncClient(timeout=10) as client:
+                resp = await client.post(endpoint, json=payload)
+                resp.raise_for_status()
+                return resp.json()
+        except httpx.TimeoutException as e:
+            last_error = e
+            if attempt < max_retries - 1:
+                await asyncio.sleep(2 ** attempt)  # exponential backoff
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 429:
+                retry_after = int(e.response.headers.get("Retry-After", 5))
+                await asyncio.sleep(retry_after)
+                last_error = e
+            elif e.response.status_code >= 500:
+                last_error = e
+                if attempt < max_retries - 1:
+                    await asyncio.sleep(2 ** attempt)
+            else:
+                raise ToolError(f"API error {e.response.status_code}: {e.response.text}")
+
+    raise ToolError(f"API call failed after {max_retries} attempts: {last_error}")
+```
+
+## Tool registry
+
+Register tools centrally and reuse them across agents:
+
+```python
+from synapsekit.tools import ToolRegistry
+from synapsekit import tool
+
+registry = ToolRegistry()
+
+@registry.register
+@tool
+def search_docs(query: str) -> list:
+    """Search internal documentation."""
+    return [{"title": f"Doc about {query}", "score": 0.95}]
+
+@registry.register
+@tool
+def get_user(user_id: str) -> dict:
+    """Look up a user by ID."""
+    return {"id": user_id, "name": "Alice", "role": "admin"}
+
+@registry.register
+@tool
+def send_notification(user_id: str, message: str) -> str:
+    """Send a notification to a user."""
+    return f"Notification sent to user {user_id}"
+
+# Create agents with specific tool subsets
+support_agent_tools = registry.get(["search_docs", "get_user", "send_notification"])
+read_only_tools = registry.get(["search_docs", "get_user"])
+
+# List all registered tools
+for name, tool_fn in registry.items():
+    print(f"{name}: {tool_fn.description}")
+```
+
+## Parameterized tools via factory
+
+Create tool variants configured at runtime:
+
+```python
+from synapsekit.tools import BaseTool
+
+def make_http_tool(base_url: str, auth_token: str, tool_name: str = "api_call") -> BaseTool:
+    """Factory that creates an HTTP tool pre-configured for a specific API."""
+
+    class ConfiguredHTTPTool(BaseTool):
+        name = tool_name
+        description = f"Make authenticated API calls to {base_url}"
+
+        @property
+        def schema(self) -> dict:
+            return {
+                "type": "function",
+                "function": {
+                    "name": self.name,
+                    "description": self.description,
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "path": {"type": "string", "description": "API path (e.g. /users/42)"},
+                            "method": {"type": "string", "enum": ["GET", "POST", "PUT", "DELETE"], "default": "GET"},
+                            "body": {"type": "object", "description": "Request body for POST/PUT"},
+                        },
+                        "required": ["path"],
+                    },
+                },
+            }
+
+        async def run(self, path: str, method: str = "GET", body: dict | None = None) -> dict:
+            import httpx
+            url = f"{base_url}{path}"
+            headers = {"Authorization": f"Bearer {auth_token}"}
+            async with httpx.AsyncClient() as client:
+                resp = await client.request(method, url, json=body, headers=headers)
+                resp.raise_for_status()
+                return resp.json()
+
+    return ConfiguredHTTPTool()
+
+
+# Usage
+stripe_tool = make_http_tool(
+    base_url="https://api.stripe.com/v1",
+    auth_token="sk_live_...",
+    tool_name="stripe_api",
+)
+
+github_tool = make_http_tool(
+    base_url="https://api.github.com",
+    auth_token="ghp_...",
+    tool_name="github_api",
+)
+```
+
+## Testing tools
+
+Always test tools independently before using them in agents:
+
+```python
+import asyncio
+import pytest
+from synapsekit import tool
+from synapsekit.exceptions import ToolError
+
+@tool
+def divide(a: float, b: float) -> float:
+    """Divide a by b."""
+    if b == 0:
+        raise ToolError("Cannot divide by zero")
+    return a / b
+
+
+def test_divide_basic():
+    result = asyncio.run(divide.run(10, 2))
+    assert result == 5.0
+
+
+def test_divide_by_zero():
+    with pytest.raises(ToolError, match="zero"):
+        asyncio.run(divide.run(10, 0))
+
+
+def test_divide_schema():
+    schema = divide.schema
+    assert schema["function"]["name"] == "divide"
+    params = schema["function"]["parameters"]["properties"]
+    assert "a" in params
+    assert "b" in params
+    assert schema["function"]["parameters"]["required"] == ["a", "b"]
+
+
+# Test async tools
+@tool
+async def fetch_data(url: str) -> str:
+    """Fetch data from a URL."""
+    import httpx
+    async with httpx.AsyncClient() as client:
+        resp = await client.get(url)
+        return resp.text[:500]
+
+
+@pytest.mark.asyncio
+async def test_fetch_data():
+    result = await fetch_data.run("https://httpbin.org/get")
+    assert "url" in result
+```
+
+## Best practices
+
+| Practice | Recommendation |
+|---|---|
+| Docstrings | Always write clear descriptions -- the LLM uses them to decide when to call the tool |
+| Parameter names | Use descriptive names (`city` not `c`, `user_id` not `uid`) |
+| Return types | Return serializable types (str, dict, list, int, float) |
+| Error messages | Be specific: `"File not found: /tmp/foo.txt"` not `"error"` |
+| Idempotency | Make tools safe to call twice (avoid duplicate writes/emails) |
+| Side effects | Document any side effects in the docstring |
+| Auth/credentials | Pass via constructor, not as tool arguments |
+| Timeouts | Always set timeouts on network calls |
+| Validation | Validate inputs before executing -- the LLM may pass unexpected values |
+
+:::tip
+A well-written docstring is critical for reliable tool use. Explain what the tool does, when to use it, and any limitations or preconditions. The LLM reads the description to decide which tool to call.
+:::
diff --git a/docs/llms/anthropic.md b/docs/llms/anthropic.md
index ec889bc57..b490708ec 100644
--- a/docs/llms/anthropic.md
+++ b/docs/llms/anthropic.md
@@ -4,6 +4,8 @@ sidebar_position: 3
 
 # Anthropic
 
+Use Anthropic's Claude models with streaming, tool use, vision, and large context windows.
+
 ## Install
 
 ```bash
@@ -31,6 +33,204 @@ response = await llm.generate("Explain RAG in simple terms.")
 print(response)
 ```
 
-## Supported models
+## Available models
+
+| Model | Context | Input (per 1M) | Output (per 1M) | Notes |
+|---|---|---|---|---|
+| `claude-opus-4-6` | 200K | $15.00 | $75.00 | Most capable |
+| `claude-sonnet-4-6` | 200K | $3.00 | $15.00 | Best balance |
+| `claude-haiku-4-5-20251001` | 200K | $0.25 | $1.25 | Fastest, cheapest |
+
+:::note
+`max_tokens` is **required** for Anthropic models. The API will reject requests without it.
+:::
+
+## Function calling (tool use)
+
+Anthropic uses a `tool_use` flow. SynapseKit handles the multi-step protocol automatically:
+
+1. Send user message + tool schemas
+2. Receive `tool_use` block from Claude
+3. Execute the tool and collect results
+4. Send `tool_result` back in the next message
+5. Receive final text response
+
+```python
+from synapsekit import tool, FunctionCallingAgent, LLMConfig
+from synapsekit.llm.anthropic import AnthropicLLM
+
+@tool
+def get_stock_price(ticker: str) -> dict:
+    """Get current stock price for a ticker symbol."""
+    prices = {"AAPL": 185.20, "GOOG": 142.50, "MSFT": 415.30}
+    return {"ticker": ticker, "price": prices.get(ticker, 0), "currency": "USD"}
+
+@tool
+def calculate_portfolio_value(holdings: dict) -> float:
+    """Calculate total portfolio value given ticker to shares mapping."""
+    return sum(shares * 100 for shares in holdings.values())
+
+llm = AnthropicLLM(LLMConfig(
+    model="claude-sonnet-4-6",
+    api_key="sk-ant-...",
+    max_tokens=2048,
+))
+
+agent = FunctionCallingAgent(llm=llm, tools=[get_stock_price, calculate_portfolio_value])
+answer = await agent.run("What's the current price of AAPL and MSFT?")
+print(answer)
+```
+
+### Raw call_with_tools
+
+```python
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "run_sql",
+            "description": "Run a SQL SELECT query",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string"},
+                    "database": {"type": "string", "default": "main"},
+                },
+                "required": ["query"],
+            },
+        },
+    }
+]
+
+result = await llm.call_with_tools(
+    messages=[{"role": "user", "content": "How many users signed up last week?"}],
+    tools=tools,
+)
+# {"content": None, "tool_calls": [{"id": "toolu_01...", "name": "run_sql", "arguments": {...}}]}
+```
+
+## Vision
+
+Claude models support image inputs via `MultimodalMessage`:
+
+```python
+from synapsekit.multimodal import MultimodalMessage, ImageContent
+
+# From URL
+message = MultimodalMessage(
+    role="user",
+    content=[
+        ImageContent.from_url("https://example.com/diagram.png"),
+        "Explain what this architecture diagram shows.",
+    ],
+)
+
+response = await llm.generate(message)
+```
+
+```python
+# From file bytes
+with open("screenshot.png", "rb") as f:
+    image_bytes = f.read()
+
+message = MultimodalMessage(
+    role="user",
+    content=[
+        ImageContent.from_bytes(image_bytes, media_type="image/png"),
+        "Describe this UI and identify any accessibility issues.",
+    ],
+)
+response = await llm.generate(message)
+```
+
+## Large context (200K tokens)
+
+Claude's 200K context window lets you load entire codebases or documents:
+
+```python
+import os
+
+# Load all Python files in a project
+code_files = []
+for root, _, files in os.walk("./myproject"):
+    for f in files:
+        if f.endswith(".py"):
+            with open(os.path.join(root, f)) as fh:
+                code_files.append(f"# {f}\n{fh.read()}")
+
+full_codebase = "\n\n".join(code_files)
+
+llm = AnthropicLLM(LLMConfig(
+    model="claude-opus-4-6",
+    api_key="sk-ant-...",
+    max_tokens=4096,
+))
+
+response = await llm.generate(
+    f"Here is the full codebase:\n\n{full_codebase}\n\nIdentify any security vulnerabilities."
+)
+```
+
+## LLMConfig options
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `model` | str | required | Claude model name |
+| `api_key` | str | required | Your Anthropic API key |
+| `max_tokens` | int | **required** | Max output tokens |
+| `temperature` | float | `1.0` | Sampling temperature (0-1) |
+| `seed` | int | None | For reproducible outputs |
+| `max_retries` | int | `3` | Auto-retry on transient errors |
+| `requests_per_minute` | int | None | Rate throttle |
+| `cache_backend` | str | None | `"sqlite"` or `"lru"` |
+
+## Cost tracking
+
+```python
+from synapsekit.observability import CostTracker
+
+tracker = CostTracker()
+llm = AnthropicLLM(LLMConfig(
+    model="claude-sonnet-4-6",
+    api_key="sk-ant-...",
+    max_tokens=2048,
+))
+llm.attach_tracker(tracker)
+
+await llm.generate("Summarize the French Revolution in 3 bullet points.")
+print(f"Cost: ${tracker.total_cost_usd:.6f}")
+```
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key -- check sk-ant-...")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"Anthropic error: {e}")
+```
+
+## Using the RAG facade
+
+```python
+from synapsekit import RAG
+
+rag = RAG(
+    model="claude-sonnet-4-6",
+    api_key="sk-ant-...",
+    provider="anthropic",
+)
+rag.add("SynapseKit documentation goes here.")
+
+answer = rag.ask_sync("What is SynapseKit?")
+```
 
-Any model supported by the Anthropic API: `claude-opus-4-6`, `claude-sonnet-4-6`, `claude-haiku-4-5`, etc.
+:::tip
+Set `ANTHROPIC_API_KEY` in your environment to avoid passing `api_key` in code.
+:::
diff --git a/docs/llms/azure-openai.md b/docs/llms/azure-openai.md
index f925d366e..a6f205d69 100644
--- a/docs/llms/azure-openai.md
+++ b/docs/llms/azure-openai.md
@@ -4,24 +4,60 @@ sidebar_position: 8
 
 # Azure OpenAI
 
-Use OpenAI models hosted on your Azure resource. Supports streaming, `generate()`, and native function calling.
+Use OpenAI models (GPT-4o, GPT-4o-mini, o1, etc.) hosted on your own Azure resource. Azure OpenAI provides enterprise compliance features: data residency, private networking, Azure AD authentication, and SLA guarantees.
 
-## Install
+## Installation
 
 ```bash
 pip install synapsekit[openai]
 ```
 
-Uses the same `openai` package as `OpenAILLM`.
+Uses the same `openai` package as `OpenAILLM` -- no additional packages needed.
 
-## Usage
+## Prerequisites
+
+Before using Azure OpenAI with SynapseKit, you need:
+
+1. An Azure subscription with Azure OpenAI access approved
+2. An Azure OpenAI resource created in a supported region
+3. A model deployed in your resource (the deployment name can differ from the underlying model name)
+
+## Authentication
+
+### Option 1: API key (simplest)
+
+```bash
+export AZURE_OPENAI_API_KEY=your-azure-api-key
+export AZURE_OPENAI_ENDPOINT=https://myresource.openai.azure.com
+```
+
+### Option 2: Azure Active Directory (enterprise recommended)
+
+```bash
+pip install azure-identity
+az login
+```
+
+```python
+from azure.identity import DefaultAzureCredential
+from synapsekit.llm.azure_openai import AzureOpenAILLM
+from synapsekit import LLMConfig
+
+llm = AzureOpenAILLM(
+    LLMConfig(model="gpt-4o", api_key="", provider="azure"),
+    azure_endpoint="https://myresource.openai.azure.com",
+    azure_ad_token_provider=DefaultAzureCredential(),
+)
+```
+
+## Basic usage
 
 ```python
 from synapsekit.llm.azure_openai import AzureOpenAILLM
 from synapsekit import LLMConfig
 
 config = LLMConfig(
-    model="gpt-4o",           # Your Azure deployment name
+    model="gpt-4o",           # Your Azure deployment name -- NOT the model name
     api_key="your-azure-key",
     provider="azure",
 )
@@ -37,11 +73,43 @@ async for token in llm.stream("What is Python?"):
     print(token, end="")
 
 # Generate
-response = await llm.generate("What is Python?")
+response = await llm.generate("Explain Azure OpenAI vs OpenAI direct API")
+print(response)
 ```
 
+:::important
+The `model` field in `LLMConfig` must be your **deployment name** in Azure, not the underlying model name. For example, if you deployed GPT-4o under the name `"my-gpt4o-deployment"`, use that as the `model` value.
+:::
+
 ## Function calling
 
+Azure OpenAI supports the same function calling as the direct OpenAI API:
+
+```python
+from synapsekit.tools import tool
+from synapsekit.llm.azure_openai import AzureOpenAILLM
+from synapsekit.agents import FunctionCallingAgent
+
+@tool
+def query_azure_sql(query: str, database: str) -> list[dict]:
+    """Execute a read-only SQL query on an Azure SQL Database."""
+    return [
+        {"id": 1, "name": "Alice", "department": "Engineering"},
+        {"id": 2, "name": "Bob", "department": "Product"},
+    ]
+
+llm = AzureOpenAILLM(
+    LLMConfig(model="gpt-4o", api_key="your-azure-key", provider="azure"),
+    azure_endpoint="https://myresource.openai.azure.com",
+)
+agent = FunctionCallingAgent(llm=llm, tools=[query_azure_sql])
+
+result = await agent.arun("List all employees in the Engineering department")
+print(result)
+```
+
+### Direct call_with_tools
+
 ```python
 tools = [
     {
@@ -64,13 +132,91 @@ result = await llm.call_with_tools(
 )
 ```
 
-## Parameters
+## Deployment name vs model name
+
+This is the most common source of confusion with Azure OpenAI:
 
-| Parameter | Required | Description |
+| Azure setting | Example value | SynapseKit field |
 |---|---|---|
-| `azure_endpoint` | Yes | Your Azure resource URL |
-| `api_version` | No | Azure API version (default `"2024-06-01"`) |
+| Deployment name | `my-gpt4o-prod` | `LLMConfig(model="my-gpt4o-prod")` |
+| Underlying model | `gpt-4o` | Used for token counting only |
+| Resource endpoint | `https://myco.openai.azure.com` | `azure_endpoint=...` |
+| API version | `2024-06-01` | `api_version=...` |
+
+## API versions
+
+Azure OpenAI uses dated API versions. The default is `2024-06-01`:
+
+| API Version | Key features |
+|---|---|
+| `2024-06-01` | GPT-4o, structured outputs, latest -- recommended |
+| `2024-02-01` | Function calling, vision |
+
+## Cost tracking
+
+```python
+from synapsekit import CostTracker
+from synapsekit.llm.azure_openai import AzureOpenAILLM
+
+tracker = CostTracker()
+llm = AzureOpenAILLM(
+    LLMConfig(model="my-gpt4o-mini", api_key="...", provider="azure"),
+    azure_endpoint="https://myresource.openai.azure.com",
+)
+
+with tracker.scope("azure-request"):
+    response = await llm.generate("Explain Azure Cognitive Services")
+    rec = tracker.record("gpt-4o-mini", input_tokens=50, output_tokens=200)
+
+print(f"Cost: ${rec.cost_usd:.6f}")
+```
+
+## Error handling
+
+```python
+from openai import RateLimitError, APIError, AuthenticationError
+
+llm = AzureOpenAILLM(
+    LLMConfig(model="gpt-4o", api_key="...", max_retries=3),
+    azure_endpoint="https://myresource.openai.azure.com",
+)
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid Azure API key or endpoint")
+except RateLimitError:
+    print("Azure OpenAI quota exceeded -- check your Azure quota in the portal")
+except APIError as e:
+    if "DeploymentNotFound" in str(e):
+        print("Deployment name not found -- check your Azure OpenAI deployments")
+    else:
+        print(f"Azure OpenAI error {e.status_code}: {e.message}")
+```
+
+## Parameters
+
+| Parameter | Required | Default | Description |
+|---|---|---|---|
+| `azure_endpoint` | Yes | -- | Your Azure resource URL |
+| `api_version` | No | `"2024-06-01"` | Azure API version |
+| `azure_ad_token_provider` | No | `None` | Azure AD credential for AAD auth |
 
 :::tip
 Azure OpenAI uses the same `openai` SDK under the hood. If you already have `synapsekit[openai]` installed, no additional packages are needed.
 :::
+
+## Environment variables
+
+| Variable | Description |
+|---|---|
+| `AZURE_OPENAI_API_KEY` | Azure OpenAI API key |
+| `AZURE_OPENAI_ENDPOINT` | Azure resource endpoint |
+| `OPENAI_API_VERSION` | API version |
+
+## See also
+
+- [OpenAI](./openai) -- direct OpenAI API
+- [Function calling agents](../agents/function-calling)
+- [Cost tracking](../observability/cost-tracker)
+- [Azure OpenAI docs](https://learn.microsoft.com/en-us/azure/ai-services/openai/)
diff --git a/docs/llms/bedrock.md b/docs/llms/bedrock.md
index 2b9c853dc..eb62657bb 100644
--- a/docs/llms/bedrock.md
+++ b/docs/llms/bedrock.md
@@ -4,7 +4,7 @@ sidebar_position: 8
 
 # AWS Bedrock
 
-Run Claude, Titan, Llama, and other models via AWS Bedrock.
+Run Claude, Titan, Llama, Mistral, and other models via AWS Bedrock. Uses your AWS credentials -- no separate AI vendor account needed.
 
 ## Install
 
@@ -12,16 +12,43 @@ Run Claude, Titan, Llama, and other models via AWS Bedrock.
 pip install synapsekit[bedrock]
 ```
 
-AWS credentials must be configured (e.g. via `~/.aws/credentials`, environment variables, or an IAM role).
+## Authentication
+
+AWS Bedrock uses the standard AWS credential chain. Choose the method that fits your deployment:
+
+### Option 1: Environment variables
+
+```bash
+export AWS_ACCESS_KEY_ID=AKIA...
+export AWS_SECRET_ACCESS_KEY=...
+export AWS_DEFAULT_REGION=us-east-1
+```
+
+### Option 2: AWS CLI profile
+
+```bash
+aws configure
+# or
+aws configure --profile myprofile
+```
+
+### Option 3: IAM role (recommended for EC2/ECS/Lambda)
+
+No configuration needed -- the AWS SDK credential chain automatically picks up the instance/task role.
+
+### Option 4: AWS SSO
+
+```bash
+aws sso login --profile my-sso-profile
+```
 
 ## Via the RAG facade
 
 ```python
 from synapsekit import RAG
 
-# Claude on Bedrock
 rag = RAG(
-    model="anthropic.claude-3-sonnet-20240229-v1:0",
+    model="anthropic.claude-3-5-sonnet-20241022-v2:0",
     api_key="env",   # uses AWS credential chain
     provider="bedrock",
 )
@@ -50,12 +77,156 @@ async for token in llm.stream("What is SynapseKit?"):
     print(token, end="", flush=True)
 ```
 
-## Supported model families
+## Supported models
+
+| Provider | Model | Bedrock Model ID |
+|---|---|---|
+| Anthropic | Claude 3.5 Sonnet v2 | `anthropic.claude-3-5-sonnet-20241022-v2:0` |
+| Anthropic | Claude 3.5 Haiku | `anthropic.claude-3-5-haiku-20241022-v1:0` |
+| Anthropic | Claude 3 Haiku | `anthropic.claude-3-haiku-20240307-v1:0` |
+| Meta | Llama 3.1 70B | `meta.llama3-1-70b-instruct-v1:0` |
+| Meta | Llama 3.1 8B | `meta.llama3-1-8b-instruct-v1:0` |
+| Mistral | Mixtral 8x7B | `mistral.mixtral-8x7b-instruct-v0:1` |
+| Amazon | Titan Text G1 - Express | `amazon.titan-text-express-v1` |
+
+:::note
+You must enable model access in the AWS Console before using a model. Go to Amazon Bedrock > Model access and request access to the models you need.
+:::
+
+## Function calling
+
+```python
+from synapsekit.tools import tool
+from synapsekit.agents import FunctionCallingAgent
+
+@tool
+def query_s3_bucket(bucket_name: str, prefix: str = "") -> list[str]:
+    """List objects in an S3 bucket with an optional prefix filter."""
+    import boto3
+    s3 = boto3.client("s3")
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+    return [obj["Key"] for obj in response.get("Contents", [])]
+
+@tool
+def describe_ec2_instance(instance_id: str) -> dict:
+    """Get details about an EC2 instance."""
+    import boto3
+    ec2 = boto3.client("ec2")
+    response = ec2.describe_instances(InstanceIds=[instance_id])
+    instance = response["Reservations"][0]["Instances"][0]
+    return {
+        "id": instance["InstanceId"],
+        "type": instance["InstanceType"],
+        "state": instance["State"]["Name"],
+    }
+
+llm = BedrockLLM(
+    LLMConfig(
+        model="anthropic.claude-3-5-sonnet-20241022-v2:0",
+        api_key="env",
+        provider="bedrock",
+    ),
+    region="us-east-1",
+)
+agent = FunctionCallingAgent(llm=llm, tools=[query_s3_bucket, describe_ec2_instance])
+
+result = await agent.arun(
+    "List all objects in the 'my-data-bucket' bucket with prefix 'reports/2024/'"
+)
+print(result)
+```
 
-| Family | Example model ID |
+## Custom boto3 session
+
+Use a specific AWS profile or cross-account role:
+
+```python
+import boto3
+
+session = boto3.Session(
+    profile_name="production",
+    region_name="us-west-2",
+)
+
+llm = BedrockLLM(
+    LLMConfig(model="anthropic.claude-3-haiku-20240307-v1:0", api_key="env"),
+    boto3_session=session,
+)
+```
+
+## IAM permissions
+
+Your IAM role or user needs the `bedrock:InvokeModel` permission, plus `bedrock:InvokeModelWithResponseStream` for streaming:
+
+```json
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "bedrock:InvokeModel",
+                "bedrock:InvokeModelWithResponseStream"
+            ],
+            "Resource": [
+                "arn:aws:bedrock:us-east-1::foundation-model/anthropic.claude-3-haiku-20240307-v1:0"
+            ]
+        }
+    ]
+}
+```
+
+Use `"Resource": "*"` to allow all Bedrock models.
+
+## Cost tracking
+
+```python
+from synapsekit import CostTracker
+
+tracker = CostTracker()
+llm = BedrockLLM(
+    LLMConfig(model="anthropic.claude-3-haiku-20240307-v1:0", api_key="env"),
+    region="us-east-1",
+)
+
+with tracker.scope("bedrock-request"):
+    response = await llm.generate("Summarize this AWS architecture document...")
+    rec = tracker.record("claude-3-haiku", input_tokens=200, output_tokens=400)
+
+print(f"Cost: ${rec.cost_usd:.6f}")
+```
+
+## Error handling
+
+```python
+import botocore.exceptions
+
+try:
+    response = await llm.generate("Hello")
+except botocore.exceptions.NoCredentialsError:
+    print("AWS credentials not found -- run 'aws configure' or set environment variables")
+except botocore.exceptions.ClientError as e:
+    error_code = e.response["Error"]["Code"]
+    if error_code == "AccessDeniedException":
+        print("IAM permissions missing -- add bedrock:InvokeModel to your policy")
+    elif error_code == "ValidationException":
+        print("Model not available in this region or model access not enabled")
+    else:
+        print(f"AWS error: {e}")
+```
+
+## Environment variables
+
+| Variable | Description |
 |---|---|
-| Anthropic Claude | `anthropic.claude-3-sonnet-20240229-v1:0` |
-| Amazon Titan | `amazon.titan-text-express-v1` |
-| Meta Llama | `meta.llama2-13b-chat-v1` |
+| `AWS_ACCESS_KEY_ID` | AWS access key |
+| `AWS_SECRET_ACCESS_KEY` | AWS secret key |
+| `AWS_DEFAULT_REGION` | AWS region (e.g. `us-east-1`) |
+| `AWS_PROFILE` | AWS CLI profile name |
+
+## See also
 
-See [AWS Bedrock docs](https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html) for the full list.
+- [AWS Bedrock docs](https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html)
+- [Bedrock model access](https://us-east-1.console.aws.amazon.com/bedrock/home#/modelaccess)
+- [Function calling agents](../agents/function-calling)
+- [Cost tracking](../observability/cost-tracker)
diff --git a/docs/llms/cerebras.md b/docs/llms/cerebras.md
index 9e9c9f6f2..4d4076487 100644
--- a/docs/llms/cerebras.md
+++ b/docs/llms/cerebras.md
@@ -4,7 +4,7 @@ sidebar_position: 15
 
 # Cerebras
 
-[Cerebras](https://cerebras.ai/) provides ultra-fast inference on their custom wafer-scale hardware with an OpenAI-compatible API.
+[Cerebras](https://cerebras.ai/) provides ultra-fast inference on their custom Wafer-Scale Engine (WSE) hardware. With speeds of roughly 2,100 tokens/second on `llama3.1-8b`, Cerebras is the fastest cloud inference option available for supported models.
 
 ## Install
 
@@ -14,7 +14,7 @@ pip install synapsekit[openai]
 
 Cerebras uses the OpenAI-compatible API, so it requires the `openai` package.
 
-## Usage
+## Basic usage
 
 ```python
 from synapsekit import LLMConfig
@@ -25,36 +25,187 @@ llm = CerebrasLLM(LLMConfig(
     api_key="csk-...",
 ))
 
-async for token in llm.stream("What is RAG?"):
+response = await llm.generate("Explain large language models in three sentences.")
+print(response)
+# Large language models are trained on vast text datasets...
+```
+
+## Streaming
+
+```python
+from synapsekit import LLMConfig
+from synapsekit.llm.cerebras import CerebrasLLM
+
+llm = CerebrasLLM(LLMConfig(
+    model="llama3.1-8b",
+    api_key="csk-...",
+))
+
+async for token in llm.stream("Write a quicksort implementation in Python."):
     print(token, end="", flush=True)
+# def quicksort(arr):
+#     if len(arr) <= 1:
+#         return arr
+#     ...
 ```
 
 ## Available models
 
-| Model | ID |
-|---|---|
-| Llama 3.1 8B | `llama3.1-8b` |
-| Llama 3.1 70B | `llama3.1-70b` |
-| Llama 3.3 70B | `llama-3.3-70b` |
+| Model | Context | Speed (tok/s) | Best for |
+|---|---|---|---|
+| `llama3.1-8b` | 128K | ~2,100 | Ultra-fast, interactive tasks |
+| `llama3.1-70b` | 128K | ~450 | High quality, still very fast |
+| `llama-3.3-70b` | 128K | ~450 | Latest Llama 3.3 weights |
+
+:::tip
+`llama3.1-8b` on Cerebras is typically 10x faster than the same model on GPU-based providers, making it ideal for chatbots and real-time applications.
+:::
+
+## Speed comparison
 
-See the full list at [inference-docs.cerebras.ai](https://inference-docs.cerebras.ai/introduction).
+| Provider | Model | Median speed (tok/s) |
+|---|---|---|
+| Cerebras | Llama 3.1 8B | ~2,100 |
+| Cerebras | Llama 3.1 70B | ~450 |
+| Groq | Llama 3.1 8B | ~800 |
+| Together AI | Llama 3.1 8B | ~200 |
+| OpenAI | gpt-4o-mini | ~120 |
 
 ## Function calling
 
+Cerebras supports OpenAI-compatible function calling on Llama models:
+
 ```python
-result = await llm.call_with_tools(messages, tools)
+from synapsekit import FunctionCallingAgent, tool
+from synapsekit import LLMConfig
+from synapsekit.llm.cerebras import CerebrasLLM
+
+@tool
+def get_stock_price(ticker: str) -> dict:
+    """Get the current stock price for a ticker symbol."""
+    # In practice, call a real market data API
+    prices = {"AAPL": 189.30, "GOOG": 175.20, "MSFT": 415.50}
+    return {"ticker": ticker, "price": prices.get(ticker, 0.0), "currency": "USD"}
+
+@tool
+def calculate_portfolio_value(holdings: dict) -> float:
+    """Calculate total portfolio value given a dict of {ticker: shares}."""
+    # Simplified calculation
+    prices = {"AAPL": 189.30, "GOOG": 175.20, "MSFT": 415.50}
+    total = sum(shares * prices.get(ticker, 0) for ticker, shares in holdings.items())
+    return round(total, 2)
+
+llm = CerebrasLLM(LLMConfig(model="llama3.1-70b", api_key="csk-..."))
+agent = FunctionCallingAgent(llm=llm, tools=[get_stock_price, calculate_portfolio_value])
+
+answer = await agent.run("What is AAPL's price and what is my portfolio worth if I have 10 AAPL and 5 MSFT?")
+print(answer)
+# AAPL is trading at $189.30. Your portfolio (10 AAPL + 5 MSFT) is worth $3,970.50.
+```
+
+### Raw `call_with_tools`
+
+```python
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "run_sql_query",
+            "description": "Execute a SQL query and return results",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string", "description": "SQL SELECT query"},
+                    "limit": {"type": "integer", "default": 10},
+                },
+                "required": ["query"],
+            },
+        },
+    }
+]
+
+result = await llm.call_with_tools(
+    messages=[{"role": "user", "content": "Show me the top 5 users by order count"}],
+    tools=tools,
+)
+# result["tool_calls"] → [{"name": "run_sql_query", "arguments": {"query": "SELECT user_id, COUNT(*) FROM orders GROUP BY user_id ORDER BY COUNT(*) DESC LIMIT 5"}}]
+```
+
+## Batch processing
+
+For high-throughput workloads, run multiple concurrent requests:
+
+```python
+import asyncio
+from synapsekit import LLMConfig
+from synapsekit.llm.cerebras import CerebrasLLM
+
+llm = CerebrasLLM(LLMConfig(model="llama3.1-8b", api_key="csk-..."))
+
+prompts = [
+    "Translate to Spanish: Hello world",
+    "Translate to French: Hello world",
+    "Translate to German: Hello world",
+    "Translate to Japanese: Hello world",
+    "Translate to Arabic: Hello world",
+]
+
+# Fire all requests concurrently
+results = await asyncio.gather(*[llm.generate(p) for p in prompts])
+for prompt, result in zip(prompts, results):
+    print(f"{prompt[:30]} → {result}")
+# Translate to Spanish: Hello wo → Hola mundo
+# Translate to French: Hello wor → Bonjour le monde
+# ...
+```
+
+## Cost tracking
+
+```python
+from synapsekit.observability import CostTracker
+
+tracker = CostTracker()
+llm = CerebrasLLM(LLMConfig(model="llama3.1-70b", api_key="csk-..."))
+llm.attach_tracker(tracker)
+
+await llm.generate("Summarize the history of computing in 200 words.")
+print(f"Cost: ${tracker.total_cost_usd:.6f}")
 ```
 
 ## Custom base URL
 
 ```python
-llm = CerebrasLLM(config, base_url="http://localhost:8000/v1")
+llm = CerebrasLLM(
+    LLMConfig(model="llama3.1-70b", api_key="csk-..."),
+    base_url="http://localhost:8000/v1",
+)
 ```
 
-## Parameters
+## Parameters reference
 
 | Parameter | Description |
 |---|---|
-| `model` | Cerebras model ID |
-| `api_key` | Your Cerebras API key |
+| `model` | Cerebras model ID (e.g. `llama3.1-70b`) |
+| `api_key` | Your Cerebras API key (starts with `csk-`) |
+| `temperature` | Sampling temperature (0.0–1.0) |
+| `max_tokens` | Maximum output tokens |
 | `base_url` | Custom API base URL (default: `https://api.cerebras.ai/v1`) |
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key — get one at cloud.cerebras.ai")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"Cerebras error: {e}")
+```
+
+:::tip
+Cerebras is ideal for latency-sensitive use cases like streaming chatbots, real-time code completion, and interactive agents. The extreme token throughput means users see meaningful output almost instantly.
+:::
diff --git a/docs/llms/deepseek.md b/docs/llms/deepseek.md
index 146b11283..eed8cdef3 100644
--- a/docs/llms/deepseek.md
+++ b/docs/llms/deepseek.md
@@ -4,7 +4,7 @@ sidebar_position: 10
 
 # DeepSeek
 
-DeepSeek models via their OpenAI-compatible API. Supports streaming, generate, and function calling.
+DeepSeek models via their OpenAI-compatible API. Excellent cost-to-performance ratio with strong reasoning capabilities.
 
 ## Install
 
@@ -38,17 +38,93 @@ response = await llm.generate("What is DeepSeek?")
 
 ## Available models
 
-| Model | Description |
-|---|---|
-| `deepseek-chat` | General-purpose chat model |
-| `deepseek-reasoner` | Enhanced reasoning capabilities |
+| Model | Context | Input (per 1M) | Output (per 1M) | Notes |
+|---|---|---|---|---|
+| `deepseek-chat` | 64K | $0.07 | $1.10 | General chat, V3 architecture |
+| `deepseek-reasoner` | 64K | $0.55 | $2.19 | Chain-of-thought reasoning (R1) |
+
+DeepSeek-V3 and R1 offer competitive performance at a fraction of GPT-4o pricing.
+
+## DeepSeek-R1: reasoning model
+
+The `deepseek-reasoner` (R1) model outputs its thinking process before the answer:
+
+```python
+llm = DeepSeekLLM(LLMConfig(
+    model="deepseek-reasoner",
+    api_key="sk-...",
+))
+
+# R1 streams reasoning tokens wrapped in <think>...</think>
+async for token in llm.stream("Solve: if 3x + 7 = 22, what is x?"):
+    print(token, end="")
+
+# Output includes reasoning steps then answer:
+# <think>
+# We need to solve for x: 3x + 7 = 22
+# 3x = 15
+# x = 5
+# </think>
+# The answer is x = 5.
+```
+
+To strip reasoning and get only the final answer:
+
+```python
+response = await llm.generate("Solve: if 3x + 7 = 22, what is x?")
+# Strip <think>...</think> block
+import re
+answer = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()
+print(answer)  # "The answer is x = 5."
+```
 
 ## Function calling
 
+```python
+from synapsekit import FunctionCallingAgent, tool
+
+@tool
+def search_pypi(package_name: str) -> dict:
+    """Search PyPI for a Python package."""
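+    import json
+    import urllib.parse
+    import urllib.request
+
+    # Quote the name so unusual input cannot alter the URL path, and set a
+    # timeout so a slow PyPI response cannot hang the agent loop.
+    url = f"https://pypi.org/pypi/{urllib.parse.quote(package_name, safe='')}/json"
+    with urllib.request.urlopen(url, timeout=10) as resp: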
+        data = json.loads(resp.read())
+    return {
+        "name": data["info"]["name"],
+        "version": data["info"]["version"],
+        "summary": data["info"]["summary"],
+    }
+
+@tool
+def compare_packages(pkg1: str, pkg2: str) -> str:
+    """Compare two Python packages by description."""
+    return f"Comparing {pkg1} vs {pkg2}: both are popular libraries."
+
+llm = DeepSeekLLM(LLMConfig(model="deepseek-chat", api_key="sk-..."))
+agent = FunctionCallingAgent(llm=llm, tools=[search_pypi, compare_packages])
+
+answer = await agent.run("Compare synapsekit and langchain packages on PyPI")
+print(answer)
+```
+
+### Raw `call_with_tools`
+
 ```python
 result = await llm.call_with_tools(
     messages=[{"role": "user", "content": "Calculate 15% tip on $85"}],
-    tools=[...],
+    tools=[{
+        "type": "function",
+        "function": {
+            "name": "calculate",
+            "description": "Evaluate a mathematical expression",
+            "parameters": {
+                "type": "object",
+                "properties": {"expression": {"type": "string"}},
+                "required": ["expression"],
+            },
+        },
+    }],
 )
 ```
 
@@ -68,4 +144,52 @@ The RAG facade auto-detects DeepSeek for `deepseek-*` model names:
 from synapsekit import RAG
 
 rag = RAG(model="deepseek-chat", api_key="sk-...")
+rag.add("Your document text here")
+answer = rag.ask_sync("Summarize this.")
+```
+
+## Cost comparison
+
+DeepSeek offers significant savings vs proprietary models for equivalent quality:
+
+| Model | Input (per 1M) | Output (per 1M) | Relative input cost |
+|---|---|---|---|
+| `deepseek-chat` | $0.07 | $1.10 | 1x (baseline) |
+| `deepseek-reasoner` | $0.55 | $2.19 | ~8x |
+| `gpt-4o-mini` | $0.15 | $0.60 | ~2x |
+| `gpt-4o` | $2.50 | $10.00 | ~36x |
+| `claude-sonnet-4-6` | $3.00 | $15.00 | ~43x |
+
+## Cost tracking
+
+```python
+from synapsekit.observability import CostTracker
+
+tracker = CostTracker()
+llm = DeepSeekLLM(LLMConfig(model="deepseek-chat", api_key="sk-..."))
+llm.attach_tracker(tracker)
+
+for _ in range(100):
+    await llm.generate("Translate this sentence to Spanish: Hello world.")
+
+print(f"Total: ${tracker.total_cost_usd:.6f}")
+```
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key -- get one at platform.deepseek.com")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"DeepSeek error: {e}")
 ```
+
+:::tip
+For cost-sensitive production workloads, `deepseek-chat` provides GPT-4-class quality at a fraction of the price. The `deepseek-reasoner` model excels at math, coding, and logical reasoning tasks.
+:::
diff --git a/docs/llms/fireworks.md b/docs/llms/fireworks.md
index b662fa3dd..cfd28b25d 100644
--- a/docs/llms/fireworks.md
+++ b/docs/llms/fireworks.md
@@ -4,7 +4,7 @@ sidebar_position: 13
 
 # Fireworks AI
 
-[Fireworks AI](https://fireworks.ai/) provides optimized inference for open-source models with an OpenAI-compatible API.
+[Fireworks AI](https://fireworks.ai/) provides optimized inference for open-source models with an OpenAI-compatible API. It offers some of the lowest latency for popular models like Llama and Mixtral, with their FireFunction models purpose-built for reliable tool use.
 
 ## Install
 
@@ -14,7 +14,7 @@ pip install synapsekit[openai]
 
 Fireworks AI uses the OpenAI-compatible API, so it requires the `openai` package.
 
-## Usage
+## Basic usage
 
 ```python
 from synapsekit import LLMConfig
@@ -22,39 +22,183 @@ from synapsekit.llm.fireworks import FireworksLLM
 
 llm = FireworksLLM(LLMConfig(
     model="accounts/fireworks/models/llama-v3p3-70b-instruct",
-    api_key="...",
+    api_key="fw_...",
 ))
 
-async for token in llm.stream("What is RAG?"):
+response = await llm.generate("Explain the difference between RAG and fine-tuning.")
+print(response)
+# RAG retrieves relevant context at inference time, while fine-tuning...
+```
+
+## Streaming
+
+```python
+from synapsekit import LLMConfig
+from synapsekit.llm.fireworks import FireworksLLM
+
+llm = FireworksLLM(LLMConfig(
+    model="accounts/fireworks/models/llama-v3p3-70b-instruct",
+    api_key="fw_...",
+    temperature=0.6,
+))
+
+async for token in llm.stream("Write a Python function to parse JSON safely."):
     print(token, end="", flush=True)
+# def safe_json_parse(text: str) -> dict | None:
+#     try:
+#         return json.loads(text)
+#     except json.JSONDecodeError:
+#         return None
 ```
 
 ## Available models
 
-| Model | ID |
-|---|---|
-| Llama 3.3 70B | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
-| Mixtral 8x7B | `accounts/fireworks/models/mixtral-8x7b-instruct` |
-| Qwen 2.5 72B | `accounts/fireworks/models/qwen2p5-72b-instruct` |
+| Model | ID | Context | Notes |
+|---|---|---|---|
+| Llama 3.3 70B | `accounts/fireworks/models/llama-v3p3-70b-instruct` | 131K | Best quality |
+| Llama 3.1 8B | `accounts/fireworks/models/llama-v3p1-8b-instruct` | 131K | Fast, cheap |
+| Mixtral 8x7B | `accounts/fireworks/models/mixtral-8x7b-instruct` | 32K | Strong reasoning |
+| Qwen 2.5 72B | `accounts/fireworks/models/qwen2p5-72b-instruct` | 131K | Multilingual |
+| FireFunction v2 | `accounts/fireworks/models/firefunction-v2` | 8K | Optimized for tool use |
+| Llama 3.1 405B | `accounts/fireworks/models/llama-v3p1-405b-instruct` | 131K | Largest open model |
 
 See the full list at [fireworks.ai/models](https://fireworks.ai/models).
 
 ## Function calling
 
+Fireworks offers `FireFunction-v2`, a model specifically optimized for reliable function calling:
+
+```python
+from synapsekit import FunctionCallingAgent, tool
+from synapsekit import LLMConfig
+from synapsekit.llm.fireworks import FireworksLLM
+
+@tool
+def search_documentation(query: str, max_results: int = 3) -> list:
+    """Search the SynapseKit documentation for a query."""
+    # In practice, run a vector search
+    return [
+        {"title": f"Result {i}: {query}", "url": f"https://docs.example.com/{i}"}
+        for i in range(1, max_results + 1)
+    ]
+
+@tool
+def create_github_issue(title: str, body: str, labels: list[str] | None = None) -> dict:
+    """Create a GitHub issue in the SynapseKit repository."""
+    return {
+        "number": 42,
+        "title": title,
+        "url": "https://github.com/SynapseKit/SynapseKit/issues/42",
+        "labels": labels or [],
+    }
+
+# Use FireFunction-v2 for most reliable tool calling
+llm = FireworksLLM(LLMConfig(
+    model="accounts/fireworks/models/firefunction-v2",
+    api_key="fw_...",
+))
+
+agent = FunctionCallingAgent(llm=llm, tools=[search_documentation, create_github_issue])
+answer = await agent.run(
+    "Search for 'streaming' in the docs and create an issue to improve those docs."
+)
+print(answer)
+# Found 3 results for 'streaming'. Created issue #42: 'Improve streaming documentation'.
+```
+
+### Raw `call_with_tools`
+
 ```python
-result = await llm.call_with_tools(messages, tools)
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "classify_text",
+            "description": "Classify text into a category",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "text": {"type": "string"},
+                    "categories": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                    },
+                },
+                "required": ["text", "categories"],
+            },
+        },
+    }
+]
+
+result = await llm.call_with_tools(
+    messages=[{"role": "user", "content": "Is 'I love this product!' positive or negative?"}],
+    tools=tools,
+)
+# result["tool_calls"] → [{"name": "classify_text", "arguments": {"text": "I love this product!", "categories": ["positive", "negative", "neutral"]}}]
 ```
 
+## FireFunction models
+
+Fireworks' FireFunction models are fine-tuned versions of Llama specifically for tool use:
+
+| Model | Best for |
+|---|---|
+| `accounts/fireworks/models/firefunction-v2` | Reliable single and parallel tool calls |
+
+FireFunction-v2 is recommended over general-purpose models when your agent makes many tool calls, as it produces cleaner JSON arguments and fewer hallucinated tool names.
+
 ## Custom base URL
 
 ```python
-llm = FireworksLLM(config, base_url="http://localhost:8000/v1")
+llm = FireworksLLM(
+    LLMConfig(model="accounts/fireworks/models/llama-v3p3-70b-instruct", api_key="fw_..."),
+    base_url="http://localhost:8000/v1",
+)
 ```
 
-## Parameters
+## Cost tracking
+
+```python
+from synapsekit.observability import CostTracker
+
+tracker = CostTracker()
+llm = FireworksLLM(LLMConfig(
+    model="accounts/fireworks/models/llama-v3p1-8b-instruct",
+    api_key="fw_...",
+))
+llm.attach_tracker(tracker)
+
+for i in range(10):
+    await llm.generate(f"Summarize paragraph {i}.")
+
+print(f"Total cost: ${tracker.total_cost_usd:.6f}")
+```
+
+## Parameters reference
 
 | Parameter | Description |
 |---|---|
-| `model` | Fireworks model ID |
-| `api_key` | Your Fireworks API key |
+| `model` | Fireworks model ID (full `accounts/fireworks/models/...` path) |
+| `api_key` | Your Fireworks API key (starts with `fw_`) |
+| `temperature` | Sampling temperature (0.0–1.0) |
+| `max_tokens` | Maximum output tokens |
 | `base_url` | Custom API base URL (default: `https://api.fireworks.ai/inference/v1`) |
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key — get one at fireworks.ai")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"Fireworks error: {e}")
+```
+
+:::tip
+Use `firefunction-v2` when building production agents that need reliable tool calling. For general Q&A workloads, `llama-v3p3-70b-instruct` offers the best quality-to-cost ratio.
+:::
diff --git a/docs/llms/gemini.md b/docs/llms/gemini.md
index 3bdc5111f..0a80f50b1 100644
--- a/docs/llms/gemini.md
+++ b/docs/llms/gemini.md
@@ -4,6 +4,8 @@ sidebar_position: 7
 
 # Google Gemini
 
+Use Google's Gemini models with up to 1M token context, multimodal inputs, and native function calling.
+
 ## Install
 
 ```bash
@@ -15,7 +17,7 @@ pip install synapsekit[gemini]
 ```python
 from synapsekit import RAG
 
-rag = RAG(model="gemini-1.5-pro", api_key="your-google-api-key")
+rag = RAG(model="gemini-2.0-flash", api_key="your-google-api-key")
 rag.add("Your document text here")
 
 answer = rag.ask_sync("Summarize the document.")
@@ -28,7 +30,7 @@ from synapsekit.llm.gemini import GeminiLLM
 from synapsekit.llm.base import LLMConfig
 
 llm = GeminiLLM(LLMConfig(
-    model="gemini-1.5-pro",
+    model="gemini-2.0-flash",
     api_key="your-google-api-key",
     provider="gemini",
     temperature=0.3,
@@ -39,9 +41,53 @@ async for token in llm.stream("Explain vector embeddings."):
     print(token, end="", flush=True)
 ```
 
+## Available models
+
+| Model | Context | Input (per 1M) | Output (per 1M) | Notes |
+|---|---|---|---|---|
+| `gemini-2.5-pro` | 1M | $1.25 | $10.00 | Most capable, multimodal |
+| `gemini-2.5-flash` | 1M | $0.075 | $0.30 | Fast, low cost |
+| `gemini-2.0-flash` | 1M | $0.075 | $0.30 | Stable, production-ready |
+| `gemini-2.0-flash-lite` | 1M | $0.01 | $0.04 | Cheapest |
+| `gemini-1.5-pro` | 2M | $1.25 | $5.00 | Legacy, largest context |
+| `gemini-1.5-flash` | 1M | $0.075 | $0.30 | Legacy fast |
+
+## Google AI API vs Vertex AI
+
+| Feature | Google AI API | Vertex AI |
+|---|---|---|
+| Auth | API key | `gcloud` / service account |
+| Cost | Pay-per-use | Same, + GCP billing |
+| Region control | No | Yes |
+| Enterprise SLA | No | Yes |
+| Free tier | Yes | No |
+
+### Google AI API (default)
+
+```python
+llm = GeminiLLM(LLMConfig(
+    model="gemini-2.0-flash",
+    api_key="AIza...",
+    provider="gemini",
+))
+```
+
+### Vertex AI
+
+```python
+llm = GeminiLLM(
+    LLMConfig(model="gemini-2.0-flash", api_key="", provider="gemini"),
+    use_vertex=True,
+    project_id="my-gcp-project",
+    location="us-central1",
+)
+```
+
+When `use_vertex=True`, SynapseKit uses `google-auth` Application Default Credentials. Run `gcloud auth application-default login` first.
+
 ## Function calling
 
-GeminiLLM supports native function calling via `call_with_tools()`. This enables the `FunctionCallingAgent` to work with Gemini models.
+GeminiLLM supports native function calling via `call_with_tools()`. SynapseKit automatically converts OpenAI-format tool schemas to Gemini's `FunctionDeclaration` format.
 
 ```python
 from synapsekit import FunctionCallingAgent, CalculatorTool
@@ -49,7 +95,7 @@ from synapsekit.llm.gemini import GeminiLLM
 from synapsekit.llm.base import LLMConfig
 
 llm = GeminiLLM(LLMConfig(
-    model="gemini-1.5-pro",
+    model="gemini-2.0-flash",
     api_key="your-google-api-key",
     provider="gemini",
 ))
@@ -62,11 +108,7 @@ agent = FunctionCallingAgent(
 answer = await agent.run("What is 144 divided by 12?")
 ```
 
-### How it works
-
-SynapseKit automatically converts OpenAI-format tool schemas to Gemini's `FunctionDeclaration` format. Response `function_call` parts are parsed back into the standard `{"id", "name", "arguments"}` format. Since Gemini doesn't provide tool call IDs, SynapseKit generates them via `uuid4`.
-
-### Direct call_with_tools usage
+### Direct `call_with_tools`
 
 ```python
 tools = [
@@ -95,10 +137,107 @@ result = await llm.call_with_tools(messages, tools)
 # {"content": None, "tool_calls": [{"id": "call_...", "name": "get_weather", "arguments": {"city": "Paris"}}]}
 ```
 
-## Supported models
+:::note
+Gemini doesn't provide tool call IDs natively. SynapseKit generates them via `uuid4` for compatibility.
+:::
+
+## Multimodal inputs
+
+```python
+from synapsekit.multimodal import ImageContent
+
+# Analyze an image
+message = {
+    "role": "user",
+    "content": [
+        ImageContent.from_url("https://example.com/chart.png"),
+        {"type": "text", "text": "Describe the trend shown in this chart."},
+    ],
+}
+
+response = await llm.generate(message)
+```
+
+### Audio inputs
+
+```python
+from synapsekit.multimodal import AudioContent
+
+with open("meeting_recording.mp3", "rb") as f:
+    audio = AudioContent.from_bytes(f.read(), media_type="audio/mp3")
+
+response = await llm.generate([audio, "Summarize this meeting recording."])
+```
+
+## Long context: processing large documents
+
+Gemini's 1M+ token context enables loading entire books or codebases:
+
+```python
+# Load a 500-page PDF (as text) into context
+with open("annual_report.txt") as f:
+    document = f.read()
+
+# Gemini 2.5 Pro handles ~750K words in a single request
+llm = GeminiLLM(LLMConfig(
+    model="gemini-2.5-pro",
+    api_key="AIza...",
+    max_tokens=8192,
+))
+
+response = await llm.generate(
+    f"Here is the annual report:\n\n{document}\n\nWhat were the top 3 risks mentioned?"
+)
+```
+
+For documents exceeding 1M tokens, chunk and summarize progressively:
+
+```python
+CHUNK_SIZE = 800_000  # tokens (approximate)
+
+chunks = [document[i:i+CHUNK_SIZE*4] for i in range(0, len(document), CHUNK_SIZE*4)]
+summaries = []
 
-- `gemini-1.5-pro` — most capable
-- `gemini-1.5-flash` — faster, lower cost
-- `gemini-1.0-pro`
+for i, chunk in enumerate(chunks):
+    summary = await llm.generate(f"Summarize section {i+1}:\n\n{chunk}")
+    summaries.append(summary)
+
+final = await llm.generate("Combine these summaries:\n\n" + "\n\n".join(summaries))
+```
+
+## Rate limits
+
+| Tier | RPM | TPM | Notes |
+|---|---|---|---|
+| Free | 15 | 1M | For prototyping |
+| Pay-as-you-go | 360 | 4M | gemini-2.0-flash |
+| Pay-as-you-go | 360 | 4M | gemini-2.5-pro |
+
+Use `requests_per_minute` in `LLMConfig` to throttle if needed:
+
+```python
+llm = GeminiLLM(LLMConfig(
+    model="gemini-2.0-flash",
+    api_key="AIza...",
+    requests_per_minute=14,  # stay under free tier limit
+))
+```
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key — visit aistudio.google.com to create one")
+except RateLimitError:
+    print("Rate limit exceeded — upgrade to pay-as-you-go or reduce RPM")
+except LLMError as e:
+    print(f"Gemini error: {e}")
+```
 
-See [Google AI docs](https://ai.google.dev/models/gemini) for the full list.
+:::tip
+Get a free API key at [aistudio.google.com](https://aistudio.google.com). The free tier includes 15 requests per minute (RPM) and 1M tokens per minute (TPM).
+:::
diff --git a/docs/llms/groq.md b/docs/llms/groq.md
index 0aafcdd9c..fb6d71e10 100644
--- a/docs/llms/groq.md
+++ b/docs/llms/groq.md
@@ -4,7 +4,7 @@ sidebar_position: 9
 
 # Groq
 
-Ultra-fast inference with Groq's LPU hardware. Supports Llama, Mixtral, Gemma, and other open models.
+Ultra-fast inference with Groq's LPU (Language Processing Unit) hardware. Supports Llama, Mixtral, Gemma, and other open models.
 
 ## Install
 
@@ -36,19 +36,73 @@ response = await llm.generate("What is Rust?")
 
 ## Available models
 
-| Model | Context | Notes |
-|---|---|---|
-| `llama-3.3-70b-versatile` | 128K | Best quality |
-| `llama-3.1-8b-instant` | 128K | Fastest |
-| `mixtral-8x7b-32768` | 32K | Good balance |
-| `gemma2-9b-it` | 8K | Google Gemma |
+| Model | Context | Speed (tok/s) | Notes |
+|---|---|---|---|
+| `llama-3.3-70b-versatile` | 128K | ~500 | Best quality |
+| `llama-3.1-8b-instant` | 128K | ~800 | Fastest |
+| `llama-3.2-90b-vision-preview` | 128K | ~300 | Multimodal (preview) |
+| `mixtral-8x7b-32768` | 32K | ~600 | Good balance |
+| `gemma2-9b-it` | 8K | ~700 | Google Gemma |
+| `llama-guard-3-8b` | 8K | ~800 | Safety classifier |
 
 ## Function calling
 
+Groq supports native function calling on most Llama and Gemma models:
+
 ```python
+from synapsekit import FunctionCallingAgent, tool
+from synapsekit.llm.groq import GroqLLM
+from synapsekit import LLMConfig
+
+@tool
+def get_latest_news(topic: str, count: int = 3) -> list:
+    """Get the latest news headlines for a topic."""
+    # In practice, call a news API
+    return [
+        {"title": f"Breaking: {topic} update #{i}", "source": "Reuters"}
+        for i in range(1, count + 1)
+    ]
+
+@tool
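+def calculate(expression: str) -> float:
+    """Evaluate an arithmetic expression (numbers, + - * / **) without eval()."""
+    import ast
+    import operator
+
+    # ast.literal_eval only parses literals and raises ValueError on operator
+    # expressions such as "2**10", so walk the parsed tree and allow a small
+    # whitelist of arithmetic nodes instead.
+    ops = {
+        ast.Add: operator.add,
+        ast.Sub: operator.sub,
+        ast.Mult: operator.mul,
+        ast.Div: operator.truediv,
+        ast.Pow: operator.pow,
+        ast.USub: operator.neg,
+        ast.UAdd: operator.pos,
+    }
+
+    def evaluate(node):
+        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
+            return node.value
+        if isinstance(node, ast.BinOp) and type(node.op) in ops:
+            return ops[type(node.op)](evaluate(node.left), evaluate(node.right))
+        if isinstance(node, ast.UnaryOp) and type(node.op) in ops:
+            return ops[type(node.op)](evaluate(node.operand))
+        raise ValueError(f"Unsupported expression: {expression!r}")
+
+    return float(evaluate(ast.parse(expression, mode="eval").body))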
+
+llm = GroqLLM(LLMConfig(
+    model="llama-3.3-70b-versatile",
+    api_key="gsk_...",
+))
+
+agent = FunctionCallingAgent(llm=llm, tools=[get_latest_news, calculate])
+answer = await agent.run("What are the latest AI news? Also, what is 2**10?")
+print(answer)
+```
+
+### Raw `call_with_tools`
+
+```python
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "lookup_product",
+            "description": "Look up product details by SKU",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "sku": {"type": "string"},
+                    "include_inventory": {"type": "boolean", "default": False},
+                },
+                "required": ["sku"],
+            },
+        },
+    }
+]
+
 result = await llm.call_with_tools(
-    messages=[{"role": "user", "content": "What's 2+2?"}],
-    tools=[...],
+    messages=[{"role": "user", "content": "What's in stock for SKU-12345?"}],
+    tools=tools,
 )
 ```
 
@@ -60,8 +114,69 @@ The RAG facade auto-detects Groq for `llama`, `mixtral`, and `gemma` model prefi
 from synapsekit import RAG
 
 rag = RAG(model="llama-3.3-70b-versatile", api_key="gsk_...")
+rag.add("Your document text here")
+answer = rag.ask_sync("Summarize this.")
+```
+
+## Rate limits
+
+| Tier | Requests/min | Tokens/min | Tokens/day |
+|---|---|---|---|
+| Free | 30 | 14,400 | 500,000 |
+| Dev ($0/mo) | 30 | 14,400 | 500,000 |
+| Paid | 3,500 | 500,000 | Unlimited |
+
+To stay under your tier's request limit, throttle the client with `requests_per_minute`:
+
+```python
+llm = GroqLLM(LLMConfig(
+    model="llama-3.1-8b-instant",
+    api_key="gsk_...",
+    requests_per_minute=28,  # stay under free tier limit
+))
+```
+
+## Latency benchmarks
+
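+Groq is among the fastest cloud inference options for open models. The figures below are indicative only; actual latency and throughput vary with prompt length, load, and region: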
+
+| Provider | Model | Median latency | Throughput |
+|---|---|---|---|
+| Groq | Llama 3.1 8B | ~0.2s | ~800 tok/s |
+| Groq | Llama 3.3 70B | ~0.5s | ~500 tok/s |
+| Together AI | Llama 3.1 8B | ~0.8s | ~200 tok/s |
+| OpenAI | gpt-4o-mini | ~1.2s | ~120 tok/s |
+
+## Cost tracking
+
+```python
+from synapsekit.observability import CostTracker
+
+tracker = CostTracker()
+llm = GroqLLM(LLMConfig(model="llama-3.3-70b-versatile", api_key="gsk_..."))
+llm.attach_tracker(tracker)
+
+for i in range(10):
+    await llm.generate(f"Translate to French: message {i}")
+
+print(f"Total cost: ${tracker.total_cost_usd:.6f}")
+```
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key — get one at console.groq.com")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"Groq error: {e}")
 ```
 
 :::tip
-Groq is ideal for latency-sensitive applications. Most models respond in under 500ms.
+Groq is ideal for latency-sensitive applications. Most models respond in under 500ms for short prompts. Use `llama-3.1-8b-instant` when you need the absolute fastest responses.
 :::
diff --git a/docs/llms/mistral.md b/docs/llms/mistral.md
index cc37a0ac6..ed8a94c02 100644
--- a/docs/llms/mistral.md
+++ b/docs/llms/mistral.md
@@ -4,12 +4,33 @@ sidebar_position: 6
 
 # Mistral AI
 
+[Mistral AI](https://mistral.ai/) provides high-quality European AI models via an OpenAI-compatible API. Mistral models are known for their strong reasoning, code generation, and function calling capabilities at competitive pricing.
+
 ## Install
 
 ```bash
 pip install synapsekit[mistral]
 ```
 
+## Basic usage
+
+```python
+from synapsekit.llm.mistral import MistralLLM
+from synapsekit.llm.base import LLMConfig
+
+llm = MistralLLM(LLMConfig(
+    model="mistral-large-latest",
+    api_key="your-mistral-key",
+    provider="mistral",
+    temperature=0.3,
+    max_tokens=1024,
+))
+
+response = await llm.generate("Explain the difference between RAG and fine-tuning.")
+print(response)
+# RAG (Retrieval-Augmented Generation) retrieves relevant context at inference time...
+```
+
 ## Via the RAG facade
 
 ```python
@@ -19,35 +40,64 @@ rag = RAG(model="mistral-large-latest", api_key="your-mistral-key")
 rag.add("Your document text here")
 
 answer = rag.ask_sync("Summarize the document.")
+print(answer)
 ```
 
-## Direct usage
+## Streaming
 
 ```python
 from synapsekit.llm.mistral import MistralLLM
 from synapsekit.llm.base import LLMConfig
 
 llm = MistralLLM(LLMConfig(
-    model="mistral-large-latest",
+    model="mistral-small-latest",
     api_key="your-mistral-key",
     provider="mistral",
-    temperature=0.3,
-    max_tokens=1024,
 ))
 
-async for token in llm.stream("What is RAG?"):
+async for token in llm.stream("Write a Python function to check if a number is prime."):
     print(token, end="", flush=True)
+# def is_prime(n: int) -> bool:
+#     if n < 2: return False
+#     ...
 ```
 
+## Supported models
+
+| Model | Context | Input (per 1M) | Output (per 1M) | Best for |
+|---|---|---|---|---|
+| `mistral-large-latest` | 131K | $2.00 | $6.00 | Best quality, complex tasks |
+| `mistral-small-latest` | 32K | $0.20 | $0.60 | Fast, cost-efficient |
+| `open-mistral-nemo` | 128K | $0.15 | $0.15 | Open-weight, great value |
+| `open-mistral-7b` | 32K | $0.25 | $0.25 | Open-weight, self-hostable |
+| `open-mixtral-8x7b` | 32K | $0.70 | $0.70 | MoE, strong at reasoning |
+| `open-mixtral-8x22b` | 65K | $2.00 | $6.00 | Largest open MoE model |
+| `codestral-latest` | 32K | $0.20 | $0.60 | Code generation optimized |
+| `mistral-embed` | 8K | $0.10 | — | Text embeddings |
+
+See the [Mistral model docs](https://docs.mistral.ai/getting-started/models/) for the full list.
+
 ## Function calling
 
-MistralLLM supports native function calling via `call_with_tools()`. Mistral's API is OpenAI-compatible, so tool schemas work without conversion.
+MistralLLM supports native function calling via `call_with_tools()`. Mistral's API is OpenAI-compatible, so tool schemas work without conversion:
 
 ```python
-from synapsekit import FunctionCallingAgent, CalculatorTool, WebSearchTool
+from synapsekit import FunctionCallingAgent, tool
 from synapsekit.llm.mistral import MistralLLM
 from synapsekit.llm.base import LLMConfig
 
+@tool
+def search_web(query: str, num_results: int = 5) -> list:
+    """Search the web for current information."""
+    return [{"title": f"Result {i}: {query}", "url": f"https://example.com/{i}"}
+            for i in range(1, num_results + 1)]
+
+@tool
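+def calculate(expression: str) -> float:
+    """Evaluate an arithmetic expression (numbers, + - * / **) without eval()."""
+    import ast
+    import operator
+
+    # ast.literal_eval only parses literals and raises ValueError on operator
+    # expressions such as "68000000 ** 0.5", so walk the parsed tree and allow
+    # a small whitelist of arithmetic nodes instead.
+    ops = {
+        ast.Add: operator.add,
+        ast.Sub: operator.sub,
+        ast.Mult: operator.mul,
+        ast.Div: operator.truediv,
+        ast.Pow: operator.pow,
+        ast.USub: operator.neg,
+        ast.UAdd: operator.pos,
+    }
+
+    def evaluate(node):
+        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
+            return node.value
+        if isinstance(node, ast.BinOp) and type(node.op) in ops:
+            return ops[type(node.op)](evaluate(node.left), evaluate(node.right))
+        if isinstance(node, ast.UnaryOp) and type(node.op) in ops:
+            return ops[type(node.op)](evaluate(node.operand))
+        raise ValueError(f"Unsupported expression: {expression!r}")
+
+    return float(evaluate(ast.parse(expression, mode="eval").body))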
+
 llm = MistralLLM(LLMConfig(
     model="mistral-large-latest",
     api_key="your-mistral-key",
@@ -56,13 +106,15 @@ llm = MistralLLM(LLMConfig(
 
 agent = FunctionCallingAgent(
     llm=llm,
-    tools=[CalculatorTool(), WebSearchTool()],
+    tools=[search_web, calculate],
 )
 
 answer = await agent.run("Search for the population of France and calculate its square root.")
+print(answer)
+# The population of France is approximately 68 million. The square root is ~8,246.
 ```
 
-### Direct call_with_tools usage
+### Direct `call_with_tools` usage
 
 ```python
 tools = [
@@ -83,7 +135,7 @@ tools = [
 ]
 
 messages = [
-    {"role": "system", "content": "You are helpful."},
+    {"role": "system", "content": "You are a helpful assistant."},
     {"role": "user", "content": "What's the weather in Paris?"},
 ]
 
@@ -91,11 +143,81 @@ result = await llm.call_with_tools(messages, tools)
 # {"content": None, "tool_calls": [{"id": "...", "name": "get_weather", "arguments": {"city": "Paris"}}]}
 ```
 
-## Supported models
+## JSON mode
+
+Force the model to output valid JSON:
+
+```python
+llm = MistralLLM(LLMConfig(
+    model="mistral-large-latest",
+    api_key="your-mistral-key",
+    extra_params={"response_format": {"type": "json_object"}},
+))
+
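+# JSON mode expects the prompt itself to ask for JSON output.
+response = await llm.generate(
+    "Extract the name, email, and phone from this text and reply with a JSON "
+    "object with the keys name, email, and phone: "
+    "Contact John Smith at john@example.com or 555-1234."
+)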
+import json
+data = json.loads(response)
+print(data)
+# {"name": "John Smith", "email": "john@example.com", "phone": "555-1234"}
+```
+
+## Codestral for code generation
+
+Use `codestral-latest` for code-specific tasks — it's fine-tuned on code and supports fill-in-the-middle:
+
+```python
+llm = MistralLLM(LLMConfig(
+    model="codestral-latest",
+    api_key="your-mistral-key",
+    provider="mistral",
+    temperature=0.1,  # Low temp for deterministic code
+))
+
+response = await llm.generate(
+    "Write a Python decorator that caches function results with a TTL."
+)
+print(response)
+# import time
+# from functools import wraps
+# ...
+```
+
+## Cost tracking
+
+```python
+from synapsekit.observability import CostTracker
 
-- `mistral-large-latest`
-- `mistral-small-latest`
-- `open-mistral-7b`
-- `open-mixtral-8x7b`
+tracker = CostTracker()
+llm = MistralLLM(LLMConfig(
+    model="mistral-small-latest",
+    api_key="your-mistral-key",
+))
+llm.attach_tracker(tracker)
+
+for i in range(10):
+    await llm.generate(f"Translate to French: message {i}")
+
+print(f"Total cost: ${tracker.total_cost_usd:.4f}")
+```
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key — get one at console.mistral.ai")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"Mistral error: {e}")
+```
 
-See [Mistral docs](https://docs.mistral.ai/getting-started/models/) for the full list.
+:::tip
+`mistral-small-latest` offers an excellent balance of quality and cost for most tasks. Use `mistral-large-latest` for complex reasoning, coding, or when you need the best output quality.
+:::
diff --git a/docs/llms/ollama.md b/docs/llms/ollama.md
index 4eda526f8..d839eb273 100644
--- a/docs/llms/ollama.md
+++ b/docs/llms/ollama.md
@@ -4,23 +4,51 @@ sidebar_position: 4
 
 # Ollama (Local)
 
-Run open-source LLMs locally via [Ollama](https://ollama.com). No API key required.
+Run open-source LLMs locally via [Ollama](https://ollama.com). No API key is required, and inference runs entirely on your machine, so prompts and documents never leave it.
 
-## Install
+## Install Ollama
+
+### macOS
 
 ```bash
-# Install Ollama: https://ollama.com/download
-ollama pull llama3
+brew install ollama
+ollama serve
+```
+
+### Linux
+
+```bash
+curl -fsSL https://ollama.com/install.sh | sh
+ollama serve
+```
+
+### Windows
 
+Download the installer from [ollama.com/download](https://ollama.com/download) and run it.
+
+Then install the SynapseKit package:
+
+```bash
 pip install synapsekit[ollama]
 ```
 
+## Pull a model
+
+```bash
+ollama pull llama3.2
+ollama pull mistral
+ollama pull gemma2
+ollama pull phi3
+ollama pull codellama
+ollama pull deepseek-r1
+```
+
 ## Via the RAG facade
 
 ```python
 from synapsekit import RAG
 
-rag = RAG(model="llama3", api_key="", provider="ollama")
+rag = RAG(model="llama3.2", api_key="", provider="ollama")
 rag.add("Your document text here")
 
 answer = rag.ask_sync("Summarize the document.")
@@ -34,7 +62,7 @@ from synapsekit.llm.ollama import OllamaLLM
 from synapsekit.llm.base import LLMConfig
 
 llm = OllamaLLM(LLMConfig(
-    model="llama3",
+    model="llama3.2",
     api_key="",
     provider="ollama",
     temperature=0.7,
@@ -45,14 +73,128 @@ async for token in llm.stream("Explain async Python in one paragraph."):
     print(token, end="", flush=True)
 ```
 
+## Custom base URL
+
+If Ollama is running on a different host (e.g. a GPU server on your LAN):
+
+```python
+llm = OllamaLLM(
+    LLMConfig(model="llama3.2", api_key="", provider="ollama"),
+    base_url="http://192.168.1.50:11434",
+)
+```
+
 ## Supported models
 
-Any model you have pulled with `ollama pull`:
+Any model available from `ollama pull`:
 
-```bash
-ollama pull llama3
-ollama pull mistral
-ollama pull gemma2
-ollama pull phi3
-ollama pull codellama
+| Model | Size | RAM Required | Notes |
+|---|---|---|---|
+| `llama3.2` | 3B | ~4 GB | Fast, great for most tasks |
+| `llama3.1` | 8B | ~8 GB | Good quality |
+| `llama3.1:70b` | 70B (Q4) | ~40 GB | High quality, needs GPU |
+| `mistral` | 7B | ~8 GB | Strong reasoning |
+| `gemma2` | 9B | ~10 GB | Google's open model |
+| `phi3` | 3.8B | ~4 GB | Microsoft, fast + efficient |
+| `codellama` | 7B | ~8 GB | Code generation |
+| `deepseek-r1` | 7B | ~8 GB | Reasoning with chain of thought |
+| `nomic-embed-text` | — | ~1 GB | Embeddings only |
+
+## GPU memory guide
+
+| Model size | Minimum VRAM | Recommended |
+|---|---|---|
+| 1-3B | 4 GB | GTX 1650, M1 |
+| 7-8B | 8 GB | RTX 3070, M2 |
+| 13B | 12 GB | RTX 3080, M2 Pro |
+| 70B (Q4) | 40 GB | A100, M2 Ultra |
+
+Models that don't fit entirely in VRAM are partially or fully offloaded to the CPU, which is much slower.
+
+## Ollama-specific options
+
+```python
+llm = OllamaLLM(
+    LLMConfig(model="llama3.2", api_key="", provider="ollama"),
+    keep_alive="10m",   # keep model loaded in VRAM after request
+    num_ctx=8192,       # context window override (default: model default)
+)
+```
+
+| Option | Description |
+|---|---|
+| `keep_alive` | Time to keep model in memory. `"0"` unloads immediately, `"-1"` keeps forever |
+| `num_ctx` | Override context window size |
+| `num_gpu` | Number of GPU layers to offload |
+| `num_thread` | CPU threads to use |
+
+## Function calling
+
+Some Ollama models support function calling (e.g. `llama3.1`, `mistral-nemo`):
+
+```python
+from synapsekit import FunctionCallingAgent, tool
+from synapsekit.llm.ollama import OllamaLLM
+from synapsekit.llm.base import LLMConfig
+
+@tool
+def get_weather(city: str) -> str:
+    """Get the weather for a city."""
+    return f"It's sunny in {city}, 24 degrees C"
+
+llm = OllamaLLM(LLMConfig(
+    model="llama3.1",
+    api_key="",
+    provider="ollama",
+))
+
+agent = FunctionCallingAgent(llm=llm, tools=[get_weather])
+answer = await agent.run("What's the weather in Tokyo?")
 ```
+
+:::caution
+Not all Ollama models support function calling. Use `llama3.1` or later for reliable results. For other models, use `ReActAgent` instead.
+:::
+
+## Use in GitHub Actions (CI)
+
+Run tests with a local Ollama model in CI:
+
+```yaml
+# .github/workflows/test.yml
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Start Ollama
+        run: |
+          curl -fsSL https://ollama.com/install.sh | sh
+          ollama serve &
+          sleep 5
+          ollama pull phi3
+      - name: Run tests
+        run: |
+          pip install synapsekit[ollama]
+          pytest tests/
+```
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError
+
+try:
+    response = await llm.generate("Hello")
+except LLMError as e:
+    if "connection refused" in str(e).lower():
+        print("Ollama is not running. Start it with: ollama serve")
+    elif "model not found" in str(e).lower():
+        print("Pull the model first: ollama pull llama3.2")
+    else:
+        raise
+```
+
+:::tip
+To list all locally available models: `ollama list`
+:::
diff --git a/docs/llms/openai.md b/docs/llms/openai.md
index b5d9a1384..92c3b051c 100644
--- a/docs/llms/openai.md
+++ b/docs/llms/openai.md
@@ -4,6 +4,8 @@ sidebar_position: 2
 
 # OpenAI
 
+Use OpenAI's GPT models with streaming, function calling, vision, and structured output.
+
 ## Install
 
 ```bash
@@ -30,6 +32,198 @@ response = await llm.generate("Tell me about async Python.")
 print(response)
 ```
 
-## Supported models
+## Available models
+
+| Model | Context | Input (per 1M tokens) | Output (per 1M tokens) | Notes |
+|---|---|---|---|---|
+| `gpt-4o` | 128K | $2.50 | $10.00 | Best quality, multimodal |
+| `gpt-4o-mini` | 128K | $0.15 | $0.60 | Fast and cheap |
+| `gpt-4-turbo` | 128K | $10.00 | $30.00 | Legacy high-quality |
+| `gpt-3.5-turbo` | 16K | $0.50 | $1.50 | Legacy, cheapest |
+| `o1` | 200K | $15.00 | $60.00 | Reasoning, no streaming |
+| `o1-mini` | 128K | $3.00 | $12.00 | Reasoning, fast |
+| `o3-mini` | 200K | $1.10 | $4.40 | Latest reasoning |
+
+Any model supported by the OpenAI API works.
+
+## Function calling
+
+Use the `@tool` decorator or pass raw JSON Schema. SynapseKit auto-generates schemas from Python type hints.
+
+```python
+from synapsekit import FunctionCallingAgent, LLMConfig, tool
+from synapsekit.llm.openai import OpenAILLM
+
+@tool
+def get_weather(city: str, unit: str = "celsius") -> str:
+    """Get current weather for a city."""
+    return f"Weather in {city}: 22 degrees {unit}, sunny"
+
+@tool
+def calculate(expression: str) -> float:
+    """Evaluate a math expression."""
+    return eval(expression)  # WARNING: demo only -- never eval() untrusted or model-generated input in production
+
+llm = OpenAILLM(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+agent = FunctionCallingAgent(llm=llm, tools=[get_weather, calculate])
+
+answer = await agent.run("What's the weather in Paris? Also, what's 144 / 12?")
+print(answer)
+```
+
+### Raw tool schema
+
+```python
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "search_web",
+            "description": "Search the web for information",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string", "description": "Search query"},
+                    "num_results": {"type": "integer", "default": 5},
+                },
+                "required": ["query"],
+            },
+        },
+    }
+]
+
+result = await llm.call_with_tools(
+    messages=[{"role": "user", "content": "What happened in AI this week?"}],
+    tools=tools,
+)
+# {"content": None, "tool_calls": [{"id": "call_abc123", "name": "search_web", "arguments": {"query": "AI news this week"}}]}
+```
+
+## Vision
+
+Pass images alongside text using `ImageContent`:
+
+```python
+from synapsekit.multimodal import ImageContent
+
+message = {
+    "role": "user",
+    "content": [
+        ImageContent.from_url("https://example.com/chart.png"),
+        {"type": "text", "text": "Describe this chart."},
+    ],
+}
+
+response = await llm.generate(message)
+```
+
+```python
+# From local file
+from synapsekit.multimodal import ImageContent
+
+with open("screenshot.png", "rb") as f:
+    image = ImageContent.from_bytes(f.read(), media_type="image/png")
+
+response = await llm.generate([image, "What's in this image?"])
+```
+
+## LLMConfig options
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `model` | str | required | OpenAI model name |
+| `api_key` | str | `OPENAI_API_KEY` env var | Your OpenAI API key; falls back to the `OPENAI_API_KEY` environment variable when omitted |
+| `temperature` | float | `1.0` | Sampling temperature (0-2) |
+| `max_tokens` | int | None | Maximum output tokens |
+| `seed` | int | None | For deterministic outputs |
+| `max_retries` | int | `3` | Auto-retry on transient errors |
+| `requests_per_minute` | int | None | Rate limit (RPM) throttle |
+| `cache_backend` | str | None | `"sqlite"` or `"lru"` |
+
+## Response caching
+
+Enable caching to avoid re-requesting the same prompt:
+
+```python
+llm = OpenAILLM(LLMConfig(
+    model="gpt-4o-mini",
+    api_key="sk-...",
+    cache_backend="sqlite",
+    cache_path="~/.synapsekit/cache.db",
+))
+
+# First call hits the API
+response1 = await llm.generate("What is Python?")
+
+# Second call is served from cache
+response2 = await llm.generate("What is Python?")
+```
+
+## Cost tracking
+
+```python
+from synapsekit.observability import CostTracker
+
+tracker = CostTracker()
+llm = OpenAILLM(LLMConfig(model="gpt-4o", api_key="sk-..."))
+llm.attach_tracker(tracker)
+
+await llm.generate("Summarize the Python docs.")
+await llm.generate("Write a haiku about async.")
+
+print(tracker.total_cost_usd)
+print(tracker.summary())
+```
+
+## Reasoning models (o1, o3)
+
+The `o1` and `o3` reasoning models have constraints that the GPT models do not:
+
+```python
+# o1 does not support streaming or system messages
+llm = OpenAILLM(LLMConfig(
+    model="o1",
+    api_key="sk-...",
+    # Do not set temperature -- unsupported for o1
+))
+
+response = await llm.generate("Solve this logic puzzle: ...")
+```
+
+:::caution
+`o1` and `o3-mini` do not support `stream()`, system messages, or `temperature`. Use `generate()` only.
+:::
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"LLM error: {e}")
+```
+
+SynapseKit automatically retries on `429 Too Many Requests` and `5xx` errors up to `max_retries` times with exponential backoff.
+
+## Using the RAG facade
+
+```python
+from synapsekit import RAG
+
+rag = RAG(model="gpt-4o-mini", api_key="sk-...")
+rag.add("SynapseKit is a Python library for building LLM applications.")
+rag.add_file("docs/readme.txt")
+
+answer = rag.ask_sync("What is SynapseKit?")
+print(answer)
+```
 
-Any model supported by the OpenAI API: `gpt-4o`, `gpt-4o-mini`, `gpt-4-turbo`, `gpt-3.5-turbo`, etc.
+:::tip
+Set `OPENAI_API_KEY` in your environment and omit `api_key` from `LLMConfig` -- SynapseKit will pick it up automatically.
+:::
diff --git a/docs/llms/openrouter.md b/docs/llms/openrouter.md
index cbbbc8360..d01a8a5e7 100644
--- a/docs/llms/openrouter.md
+++ b/docs/llms/openrouter.md
@@ -4,7 +4,7 @@ sidebar_position: 11
 
 # OpenRouter
 
-[OpenRouter](https://openrouter.ai/) is a unified API that provides access to 200+ models from OpenAI, Anthropic, Meta, Mistral, Google, and more — with automatic fallback and load balancing.
+[OpenRouter](https://openrouter.ai/) is a unified API that provides access to 200+ models from OpenAI, Anthropic, Meta, Mistral, Google, and more -- with automatic fallback and load balancing.
 
 ## Install
 
@@ -31,15 +31,18 @@ async for token in llm.stream("What is RAG?"):
 
 ## Available models
 
-OpenRouter supports 200+ models. Some popular ones:
+OpenRouter supports 200+ models. Model IDs follow the `provider/model-name` format:
 
-| Model | ID |
-|---|---|
-| GPT-4o | `openai/gpt-4o` |
-| Claude 3.5 Sonnet | `anthropic/claude-3.5-sonnet` |
-| Llama 3.3 70B | `meta-llama/llama-3.3-70b-instruct` |
-| Mixtral 8x7B | `mistralai/mixtral-8x7b-instruct` |
-| Gemini Pro | `google/gemini-pro` |
+| Model | ID | Input (per 1M) | Output (per 1M) |
+|---|---|---|---|
+| GPT-4o | `openai/gpt-4o` | $2.50 | $10.00 |
+| GPT-4o Mini | `openai/gpt-4o-mini` | $0.15 | $0.60 |
+| Claude Sonnet 4.6 | `anthropic/claude-sonnet-4-6` | $3.00 | $15.00 |
+| Llama 3.3 70B | `meta-llama/llama-3.3-70b-instruct` | $0.12 | $0.40 |
+| Mixtral 8x7B | `mistralai/mixtral-8x7b-instruct` | $0.24 | $0.24 |
+| Gemini Pro | `google/gemini-pro` | $0.50 | $1.50 |
+| DeepSeek V3 | `deepseek/deepseek-chat` | $0.07 | $1.10 |
+| Qwen 2.5 72B | `qwen/qwen-2.5-72b-instruct` | $0.13 | $0.40 |
 
 See the full list at [openrouter.ai/models](https://openrouter.ai/models).
 
@@ -51,6 +54,24 @@ result = await llm.call_with_tools(messages, tools)
 
 Function calling support depends on the underlying model.
 
+```python
+from synapsekit import FunctionCallingAgent, LLMConfig, tool
+from synapsekit.llm.openrouter import OpenRouterLLM
+
+@tool
+def get_news(topic: str, count: int = 5) -> list:
+    """Get recent news headlines about a topic."""
+    return [{"title": f"News about {topic} #{i}"} for i in range(count)]
+
+llm = OpenRouterLLM(LLMConfig(
+    model="anthropic/claude-sonnet-4-6",
+    api_key="sk-or-...",
+))
+
+agent = FunctionCallingAgent(llm=llm, tools=[get_news])
+answer = await agent.run("What's happening in AI today?")
+```
+
 ## Auto-detection
 
 Models with a `/` in the name are auto-detected as OpenRouter:
@@ -59,6 +80,39 @@ Models with a `/` in the name are auto-detected as OpenRouter:
 from synapsekit import RAG
 
 rag = RAG(model="openai/gpt-4o", api_key="sk-or-...")
+rag.add("Your document text here")
+answer = rag.ask_sync("Summarize this.")
+```
+
+## Model routing by complexity
+
+Use OpenRouter to route simple queries to cheap models and complex ones to powerful models:
+
+```python
+from synapsekit.llm.openrouter import OpenRouterLLM
+from synapsekit import LLMConfig
+
+async def route_by_complexity(query: str, api_key: str) -> str:
+    """Use cheap model for simple queries, expensive for complex ones."""
+    word_count = len(query.split())
+
+    if word_count < 20:
+        # Simple query: use cheapest option
+        model = "meta-llama/llama-3.3-70b-instruct"
+    elif word_count < 100:
+        # Medium complexity
+        model = "openai/gpt-4o-mini"
+    else:
+        # Complex query: use best model
+        model = "anthropic/claude-sonnet-4-6"
+
+    llm = OpenRouterLLM(LLMConfig(model=model, api_key=api_key))
+    return await llm.generate(query)
+
+result = await route_by_complexity(
+    "Explain quantum entanglement and its implications for computing.",
+    api_key="sk-or-..."
+)
 ```
 
 ## Custom base URL
@@ -74,3 +128,32 @@ llm = OpenRouterLLM(config, base_url="http://localhost:8000/v1")
 | `model` | Any model ID from OpenRouter (e.g. `openai/gpt-4o`) |
 | `api_key` | Your OpenRouter API key |
 | `base_url` | Custom API base URL (default: `https://openrouter.ai/api/v1`) |
+
+## LLMConfig options
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `temperature` | float | `1.0` | Sampling temperature |
+| `max_tokens` | int | None | Maximum output tokens |
+| `max_retries` | int | `3` | Auto-retry on transient errors |
+| `requests_per_minute` | int | None | Rate throttle |
+| `cache_backend` | str | None | `"sqlite"` or `"lru"` |
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key -- get one at openrouter.ai/keys")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"OpenRouter error: {e}")
+```
+
+:::tip
+OpenRouter is ideal for experimenting with many models using a single API key, or for building systems that need automatic model fallback when a provider is down.
+:::
diff --git a/docs/llms/perplexity.md b/docs/llms/perplexity.md
index 85d42589e..db407f3af 100644
--- a/docs/llms/perplexity.md
+++ b/docs/llms/perplexity.md
@@ -4,7 +4,7 @@ sidebar_position: 14
 
 # Perplexity AI
 
-[Perplexity AI](https://www.perplexity.ai/) provides search-augmented LLMs with an OpenAI-compatible API.
+[Perplexity AI](https://www.perplexity.ai/) provides search-augmented LLMs with real-time web access. Unlike standard LLMs, Perplexity's Sonar models automatically search the web and include citations in their responses -- making them ideal for research, news monitoring, and fact-checking tasks.
 
 ## Install
 
@@ -14,7 +14,7 @@ pip install synapsekit[openai]
 
 Perplexity AI uses the OpenAI-compatible API, so it requires the `openai` package.
 
-## Usage
+## Basic usage
 
 ```python
 from synapsekit import LLMConfig
@@ -25,37 +25,145 @@ llm = PerplexityLLM(LLMConfig(
     api_key="pplx-...",
 ))
 
-async for token in llm.stream("What is RAG?"):
+response = await llm.generate("What are the latest developments in AI safety research?")
+print(response)
+# Recent AI safety research has focused on... [cites sources]
+```
+
+## Streaming
+
+```python
+from synapsekit import LLMConfig
+from synapsekit.llm.perplexity import PerplexityLLM
+
+llm = PerplexityLLM(LLMConfig(
+    model="sonar",
+    api_key="pplx-...",
+))
+
+async for token in llm.stream("What happened in AI this week?"):
     print(token, end="", flush=True)
+# This week in AI: OpenAI announced... Anthropic released... [with citations]
 ```
 
-## Available models
+## Available Sonar models
 
-| Model | ID |
-|---|---|
-| Sonar | `sonar` |
-| Sonar Pro | `sonar-pro` |
-| Sonar Reasoning | `sonar-reasoning` |
-| Sonar Reasoning Pro | `sonar-reasoning-pro` |
+| Model | Context | Notes |
+|---|---|---|
+| `sonar` | 128K | Fast, real-time web search |
+| `sonar-pro` | 200K | Higher quality search, deeper research |
+| `sonar-reasoning` | 128K | Web search + chain-of-thought reasoning |
+| `sonar-reasoning-pro` | 200K | Best quality for complex research questions |
+| `sonar-deep-research` | 128K | Multi-step research synthesis |
 
 See the full list at [docs.perplexity.ai](https://docs.perplexity.ai/guides/model-cards).
 
-## Function calling
+## Real-time web search
+
+Sonar models automatically search the web for every request. The response includes citations you can access:
 
 ```python
-result = await llm.call_with_tools(messages, tools)
+from synapsekit import LLMConfig
+from synapsekit.llm.perplexity import PerplexityLLM
+
+llm = PerplexityLLM(LLMConfig(model="sonar-pro", api_key="pplx-..."))
+
+# Ask about current events — Perplexity fetches live data
+response = await llm.generate("What is the current price of Bitcoin?")
+print(response)
+# As of March 2026, Bitcoin is trading at approximately $X,XXX...
+
+# Access citations from the raw response
+raw = await llm.generate_raw("Latest Python 3.14 release notes")
+if hasattr(raw, "citations"):
+    for cite in raw.citations:
+        print(f"  - {cite}")
+```
+
+:::caution
+Perplexity models are not suitable for tasks requiring deterministic, reproducible outputs. Because they search the web at inference time, the same prompt may return different answers on different days as news changes.
+:::
+
+## Research with Sonar Reasoning
+
+For complex research questions, `sonar-reasoning` combines web search with step-by-step thinking:
+
+```python
+llm = PerplexityLLM(LLMConfig(
+    model="sonar-reasoning",
+    api_key="pplx-...",
+))
+
+response = await llm.generate(
+    "Compare the performance benchmarks of the latest open-source LLMs released in 2026. "
+    "Which ones perform best on coding tasks?"
+)
+print(response)
+# <think>
+# Let me search for recent LLM benchmarks...
+# </think>
+# Based on current benchmarks, the top-performing models for coding in 2026 are...
+```
+
+## Combining with RAG
+
+Use Perplexity for freshness and a local vector store for private data:
+
+```python
+from synapsekit import RAG
+from synapsekit.llm.perplexity import PerplexityLLM
+from synapsekit import LLMConfig
+
+# Use Sonar to answer questions that need up-to-date web knowledge
+web_llm = PerplexityLLM(LLMConfig(model="sonar-pro", api_key="pplx-..."))
+
+# Use a standard RAG pipeline for your private documents
+from synapsekit import RAGPipeline
+private_rag = RAGPipeline(LLMConfig(model="gpt-4o-mini", api_key="sk-..."))
+private_rag.add_file("internal_report.pdf")
+
+# Route queries: private docs → RAG, current events → Perplexity
+query = "What are the latest regulations for our industry?"
+if "latest" in query or "current" in query or "recent" in query:
+    answer = await web_llm.generate(query)
+else:
+    answer = await private_rag.ask(query)
 ```
 
 ## Custom base URL
 
 ```python
-llm = PerplexityLLM(config, base_url="http://localhost:8000/v1")
+llm = PerplexityLLM(
+    LLMConfig(model="sonar-pro", api_key="pplx-..."),
+    base_url="http://localhost:8000/v1",
+)
 ```
 
-## Parameters
+## Parameters reference
 
 | Parameter | Description |
 |---|---|
-| `model` | Perplexity model ID |
-| `api_key` | Your Perplexity API key |
+| `model` | Perplexity model ID (e.g. `sonar-pro`) |
+| `api_key` | Your Perplexity API key (starts with `pplx-`) |
+| `temperature` | Sampling temperature (lower = more factual) |
+| `max_tokens` | Maximum output tokens |
 | `base_url` | Custom API base URL (default: `https://api.perplexity.ai`) |
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("What's new in machine learning?")
+except AuthenticationError:
+    print("Invalid API key — get one at perplexity.ai/settings/api")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"Perplexity error: {e}")
+```
+
+:::tip
+Use `sonar` for fast lookups and `sonar-pro` for deeper research. If your application needs to verify facts or track current events, Perplexity's search-augmented models are more reliable than asking a static LLM about recent information.
+:::
diff --git a/docs/llms/together.md b/docs/llms/together.md
index e6fe047cb..ed4626a90 100644
--- a/docs/llms/together.md
+++ b/docs/llms/together.md
@@ -4,7 +4,7 @@ sidebar_position: 12
 
 # Together AI
 
-[Together AI](https://together.ai/) provides fast inference on open-source models with an OpenAI-compatible API.
+[Together AI](https://together.ai/) provides fast, scalable inference for open-source models including Llama, Mistral, Qwen, and more -- with competitive pricing.
 
 ## Install
 
@@ -31,19 +31,82 @@ async for token in llm.stream("What is RAG?"):
 
 ## Available models
 
-| Model | ID |
-|---|---|
-| Llama 3.3 70B Turbo | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
-| Mixtral 8x7B | `mistralai/Mixtral-8x7B-Instruct-v0.1` |
-| Qwen 2.5 72B | `Qwen/Qwen2.5-72B-Instruct-Turbo` |
-| DeepSeek V3 | `deepseek-ai/DeepSeek-V3` |
+| Model | ID | Input (per 1M) | Output (per 1M) | Notes |
+|---|---|---|---|---|
+| Llama 3.3 70B | `meta-llama/Llama-3.3-70B-Instruct-Turbo` | $0.88 | $0.88 | Best Llama quality |
+| Llama 3.1 405B | `meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo` | $3.50 | $3.50 | Largest Llama model |
+| Llama 3.1 8B | `meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo` | $0.18 | $0.18 | Fast and cheap |
+| Mistral 7B | `mistralai/Mistral-7B-Instruct-v0.3` | $0.20 | $0.20 | Reliable workhorse |
+| Mixtral 8x7B | `mistralai/Mixtral-8x7B-Instruct-v0.1` | $0.60 | $0.60 | MoE architecture |
+| Qwen 2.5 72B | `Qwen/Qwen2.5-72B-Instruct-Turbo` | $1.20 | $1.20 | Strong multilingual |
+| DeepSeek V3 | `deepseek-ai/DeepSeek-V3` | $1.25 | $1.25 | General-purpose MoE model |
+
+See the full list at [api.together.ai/models](https://api.together.ai/models).
+
+## Llama 3.1 405B example
+
+Together AI is one of the few providers offering Llama 3.1 405B:
+
+```python
+from synapsekit.llm.together import TogetherLLM
+from synapsekit import LLMConfig
+
+llm = TogetherLLM(LLMConfig(
+    model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    api_key="...",
+    temperature=0.1,
+    max_tokens=4096,
+))
 
-See the full list at [together.ai/models](https://api.together.ai/models).
+response = await llm.generate(
+    "Analyze this code and suggest architectural improvements: ..."
+)
+```
 
 ## Function calling
 
 ```python
-result = await llm.call_with_tools(messages, tools)
+from synapsekit import FunctionCallingAgent, tool
+from synapsekit.llm.together import TogetherLLM
+
+@tool
+def web_search(query: str, num_results: int = 5) -> list:
+    """Search the web for information."""
+    return [{"title": f"Result {i}: {query}", "url": f"https://example.com/{i}"}
+            for i in range(num_results)]
+
+@tool
+def summarize_url(url: str) -> str:
+    """Fetch and summarize a web page."""
+    return f"Summary of {url}: This page discusses relevant topics..."
+
+llm = TogetherLLM(LLMConfig(
+    model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    api_key="...",
+))
+
+agent = FunctionCallingAgent(llm=llm, tools=[web_search, summarize_url])
+answer = await agent.run("Research the latest developments in vector databases")
+```
+
+### Raw call_with_tools
+
+```python
+result = await llm.call_with_tools(
+    messages=[{"role": "user", "content": "What's the weather in Berlin?"}],
+    tools=[{
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get current weather",
+            "parameters": {
+                "type": "object",
+                "properties": {"city": {"type": "string"}},
+                "required": ["city"],
+            },
+        },
+    }],
+)
 ```
 
 ## Custom base URL
@@ -52,6 +115,25 @@ result = await llm.call_with_tools(messages, tools)
 llm = TogetherLLM(config, base_url="http://localhost:8000/v1")
 ```
 
+## Provider comparison
+
+| Provider | Best for | Llama 3.1 8B | Llama 3.3 70B |
+|---|---|---|---|
+| Together AI | Large models, 405B | $0.18/1M | $0.88/1M |
+| Groq | Ultra-low latency | $0.05/1M | $0.59/1M |
+| Fireworks AI | Production throughput | $0.20/1M | $0.90/1M |
+
+## LLMConfig options
+
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `model` | str | required | Together AI model ID |
+| `api_key` | str | required | Your Together AI API key |
+| `temperature` | float | `0.7` | Sampling temperature |
+| `max_tokens` | int | None | Maximum output tokens |
+| `max_retries` | int | `3` | Auto-retry on transient errors |
+| `requests_per_minute` | int | None | Rate throttle |
+
 ## Parameters
 
 | Parameter | Description |
@@ -59,3 +141,22 @@ llm = TogetherLLM(config, base_url="http://localhost:8000/v1")
 | `model` | Together AI model ID |
 | `api_key` | Your Together AI API key |
 | `base_url` | Custom API base URL (default: `https://api.together.xyz/v1`) |
+
+## Error handling
+
+```python
+from synapsekit.exceptions import LLMError, RateLimitError, AuthenticationError
+
+try:
+    response = await llm.generate("Hello")
+except AuthenticationError:
+    print("Invalid API key -- get one at api.together.ai")
+except RateLimitError as e:
+    print(f"Rate limited. Retry after {e.retry_after}s")
+except LLMError as e:
+    print(f"Together AI error: {e}")
+```
+
+:::tip
+Together AI is the go-to choice when you need Llama 3.1 405B or want to run large models (70B+) at competitive prices. For maximum speed at lower cost, consider Groq for 8B/70B models.
+:::
diff --git a/sidebars.ts b/sidebars.ts
index 1278eaf97..7eb3374c0 100644
--- a/sidebars.ts
+++ b/sidebars.ts
@@ -79,6 +79,8 @@ const sidebars: SidebarsConfig = {
         'agents/tools',
         'agents/executor',
         'agents/mcp',
+        'agents/cookbook',
+        'agents/tool-authoring',
       ],
     },
     {