feliperun · feliperun · Jun 9, 2026 · Jun 6, 2026 · Jun 9, 2026
diff --git a/backend/app/docs/sync.py b/backend/app/docs/sync.py
@@ -30,6 +30,7 @@
     ManifestDiff,
     SyncManifest,
     diff_manifests,
+    guess_mime,
     manifest_from_directory,
 )
 
@@ -67,11 +68,13 @@ class SyncResult:
 class LocalFile(BaseModel):
     """A single file in the /docs/sync payload."""
 
-    model_config = ConfigDict(extra="forbid")
+    model_config = ConfigDict(extra="ignore")
 
     path: str = Field(min_length=1)
     local_path: Path | None = None
     mime_type: str | None = None
+    sha256: str | None = None
+    size: int | None = None
 
 
 class DocsSyncRequest(BaseModel):
@@ -203,14 +206,25 @@ def _gcs_uri_for_directory(self, relpath: str, entry: FileEntry) -> str:
         return f"gs://{self._bucket}/{self._object_key(relpath, entry.sha256)}"
 
     def _manifest_from_payload(self, files: list[LocalFile]) -> SyncManifest:
-        from .manifest import compute_file_sha256, guess_mime
+        from .manifest import compute_file_sha256
 
         entries: dict[str, FileEntry] = {}
         for item in files:
             if item.local_path is None:
-                raise ValueError(
-                    f"file {item.path!r} has no local_path; payload uploads require resolved paths"
-                )
+                if item.sha256 is not None and item.size is not None:
+                    mime = item.mime_type or guess_mime(Path(item.path))
+                    entries[item.path] = FileEntry(
+                        sha256=item.sha256,
+                        size=item.size,
+                        gcs_uri=f"gs://{self._bucket}/{self._object_key(item.path, item.sha256)}",
+                        mime_type=mime,
+                        updated_at=datetime.now(tz=UTC),
+                    )
+                    continue
+                else:
+                    raise ValueError(
+                        f"file {item.path!r} has no local_path and missing sha256/size; payload uploads require resolved paths"
+                    )
             src = Path(item.local_path)
             if not src.exists():
                 raise FileNotFoundError(f"file not found: {src}")
@@ -240,11 +254,19 @@ async def _upload_changed(
             if local is not None:
                 await self._gcs.upload_file(self._bucket, key, local, entry.mime_type)
             else:
-                # Walked-from-directory case: re-read the file fresh.
                 src = self._docs_dir / relpath
-                with src.open("rb") as fh:
-                    data = fh.read()
-                await self._gcs.upload_bytes(self._bucket, key, data, entry.mime_type)
+                if src.is_file():
+                    with src.open("rb") as fh:
+                        data = fh.read()
+                    await self._gcs.upload_bytes(self._bucket, key, data, entry.mime_type)
+                    continue
+                # Remote sync: CLI posts sha256/size; bytes may already be in GCS.
+                existing = await self._gcs.download_bytes(self._bucket, key)
+                if existing is None:
+                    raise FileNotFoundError(
+                        f"file {relpath!r} not found under {self._docs_dir} "
+                        f"and not pre-uploaded to gs://{self._bucket}/{key}"
+                    )
 
     async def _delete_removed(self, diff: ManifestDiff, old: SyncManifest) -> None:
         for relpath in diff.removed:

diff --git a/cli/expert/commands/sync.py b/cli/expert/commands/sync.py
@@ -9,6 +9,7 @@
 
 import asyncio
 import hashlib
+import mimetypes
 from pathlib import Path
 from typing import Annotated, Any
 
@@ -37,6 +38,22 @@ def _iter_matching_files(
     return sorted(matched)
 
 
+_MIME_BY_SUFFIX = {
+    ".md": "text/markdown",
+    ".markdown": "text/markdown",
+    ".txt": "text/plain",
+    ".pdf": "application/pdf",
+}
+
+
+def _guess_mime(path: Path) -> str:
+    suffix = path.suffix.lower()
+    if suffix in _MIME_BY_SUFFIX:
+        return _MIME_BY_SUFFIX[suffix]
+    guessed, _ = mimetypes.guess_type(path.as_posix())
+    return guessed or "application/octet-stream"
+
+
 def _sha256(path: Path) -> str:
     digest = hashlib.sha256()
     with path.open("rb") as fh:
@@ -63,13 +80,11 @@ def _build_manifest(schema: AgentSchema, base_dir: Path) -> dict[str, Any]:
                 "path": str(rel),
                 "sha256": _sha256(file_path),
                 "size": file_path.stat().st_size,
+                "mime_type": _guess_mime(file_path),
             }
         )
-    return {
-        "agent_id": schema.agent_id,
-        "schema_version": schema.metadata.version,
-        "files": entries,
-    }
+    # Backend `DocsSyncRequest` only accepts `files` (extra=forbid on other keys).
+    return {"files": entries}
 
 
 async def _post_sync(

diff --git a/cli/expert/ui.py b/cli/expert/ui.py
@@ -77,10 +77,24 @@ def print_diff_table(diff: dict[str, Any]) -> None:
     table.add_column("SHA", no_wrap=True, style="dim")
     table.add_column("Size", justify="right", no_wrap=True, style="dim")
 
+    def _normalize(entries: list[Any]) -> list[dict[str, Any]]:
+        normalized: list[dict[str, Any]] = []
+        for entry in entries:
+            if isinstance(entry, str):
+                normalized.append({"path": entry})
+            elif isinstance(entry, dict):
+                normalized.append(entry)
+        return normalized
+
     actions: list[tuple[str, str, str, list[dict[str, Any]]]] = [
-        ("+", "green", "added", list(diff.get("added", []) or [])),
-        ("~", "yellow", "updated", list(diff.get("updated", []) or [])),
-        ("-", "red", "removed", list(diff.get("removed", []) or [])),
+        ("+", "green", "added", _normalize(list(diff.get("added", []) or []))),
+        (
+            "~",
+            "yellow",
+            "updated",
+            _normalize(list(diff.get("updated", []) or diff.get("changed", []) or [])),
+        ),
+        ("-", "red", "removed", _normalize(list(diff.get("removed", []) or []))),
     ]
     total = 0
     for glyph, color, _name, entries in actions:

diff --git a/docs/rfc/google-drive-connector.md b/docs/rfc/google-drive-connector.md
@@ -0,0 +1,93 @@
+# RFC: Google Drive connector for knowledge sync
+
+**Status:** draft  
+**Authors:** expert-agent maintainers  
+**Consumers:** private agent repos that mirror Drive folders into `docs/`
+
+## Problem
+
+Many teams keep canonical SOPs and manuals in Google Drive (Docs, PDFs, DOCX).
+The expert-agent pipeline today expects files under `<agent>/docs/` and syncs them
+to GCS + Gemini Context Cache via `expert sync` / `POST /docs/sync`.
+
+Private repos currently bridge this gap with ad-hoc scripts (e.g.
+`import-*-drive.py`). We need a **generic, framework-level** connector that:
+
+1. Lists files in an authorized Drive folder (recursive).
+2. Exports supported MIME types to `.md` / `.pdf` / `.txt`.
+3. Writes frontmatter metadata (`source_url`, `drive_file_id`, `slug`).
+4. Triggers the existing manifest sync (no duplicate cache logic).
+
+## Goals
+
+- Parametrize folder ID, credentials, and export map via `agent_schema.yaml`.
+- Reuse `DocsSyncService` — Drive is an **ingest** step, not a new cache path.
+- Support dry-run and incremental diff (skip files whose `modifiedTime` and `md5Checksum` are unchanged).
+- Work with user ADC and service accounts (`drive.readonly` scope).
+
+## Non-goals
+
+- Real-time Drive webhooks (phase 2).
+- Indexing Shared Drives without explicit folder ID (phase 2).
+- Storing Drive credentials in the agent container image.
+
+## Proposed schema extension
+
+```yaml
+spec:
+  knowledge:
+    reference_docs_dir: ./docs
+    include_patterns: ["*.md", "*.pdf"]
+    drive_sync:                    # optional
+      enabled: true
+      folder_id: "${DRIVE_FOLDER_ID}"
+      credentials_env: DRIVE_CREDENTIALS_JSON  # optional SA path
+      export:
+        google_docs: markdown
+        docx: pdf
+      frontmatter:
+        - source_url
+        - drive_file_id
+        - slug
+```
+
+## CLI surface
+
+```bash
+expert drive pull --agent my-expert          # dry-run report
+expert drive pull --agent my-expert --apply  # write docs/ + optional auto-sync
+expert drive pull --search "quality"         # discover candidate folders
+```
+
+## Backend changes
+
+None required for v1 if ingest remains CLI-side. Optional future endpoint:
+
+`POST /docs/ingest/drive` (admin-only) for Cloud Run environments without local
+`docs/` — out of scope for v1.
+
+## Security
+
+- Scope: request only `https://www.googleapis.com/auth/drive.readonly` (read-only, least privilege).
+- Folder ID is an allowlist boundary — document it in the agent README.
+- Never log file contents; log file IDs and slugs only.
+
+## Testing
+
+- Unit: MIME export map, slugify, frontmatter rendering.
+- Integration: mock Drive API (`googleapiclient` test double).
+- E2E: synthetic Drive folder in a test GCP project.
+
+## Rollout
+
+1. Merge RFC + schema fields (no default — opt-in).
+2. Implement `cli/expert/commands/drive.py`.
+3. Document in `example-schema/agent_schema.yaml`.
+4. Private repos delete interim `import-*-drive.py` scripts after adoption.
+
+## Open questions
+
+- Should `source_url` be mandatory in identity prompts (a private-repo concern) or
+  enforced through a schema-level `citation.required_fields` setting?
+- Minimum token count for Context Cache (Gemini ≥ 2048) — validate at
+  `expert validate` time when `context_cache.enabled: true`?
diff --git a/uv.lock b/uv.lock