diff --git a/backend/app/docs/sync.py b/backend/app/docs/sync.py index 506177d..691b1d3 100644 --- a/backend/app/docs/sync.py +++ b/backend/app/docs/sync.py @@ -30,6 +30,7 @@ ManifestDiff, SyncManifest, diff_manifests, + guess_mime, manifest_from_directory, ) @@ -67,11 +68,13 @@ class SyncResult: class LocalFile(BaseModel): """A single file in the /docs/sync payload.""" - model_config = ConfigDict(extra="forbid") + model_config = ConfigDict(extra="ignore") path: str = Field(min_length=1) local_path: Path | None = None mime_type: str | None = None + sha256: str | None = None + size: int | None = None class DocsSyncRequest(BaseModel): @@ -203,14 +206,25 @@ def _gcs_uri_for_directory(self, relpath: str, entry: FileEntry) -> str: return f"gs://{self._bucket}/{self._object_key(relpath, entry.sha256)}" def _manifest_from_payload(self, files: list[LocalFile]) -> SyncManifest: - from .manifest import compute_file_sha256, guess_mime + from .manifest import compute_file_sha256 entries: dict[str, FileEntry] = {} for item in files: if item.local_path is None: - raise ValueError( - f"file {item.path!r} has no local_path; payload uploads require resolved paths" - ) + if item.sha256 is not None and item.size is not None: + mime = item.mime_type or guess_mime(Path(item.path)) + entries[item.path] = FileEntry( + sha256=item.sha256, + size=item.size, + gcs_uri=f"gs://{self._bucket}/{self._object_key(item.path, item.sha256)}", + mime_type=mime, + updated_at=datetime.now(tz=UTC), + ) + continue + else: + raise ValueError( + f"file {item.path!r} has no local_path and missing sha256/size; payload uploads require resolved paths" + ) src = Path(item.local_path) if not src.exists(): raise FileNotFoundError(f"file not found: {src}") @@ -240,11 +254,19 @@ async def _upload_changed( if local is not None: await self._gcs.upload_file(self._bucket, key, local, entry.mime_type) else: - # Walked-from-directory case: re-read the file fresh. 
src = self._docs_dir / relpath - with src.open("rb") as fh: - data = fh.read() - await self._gcs.upload_bytes(self._bucket, key, data, entry.mime_type) + if src.is_file(): + with src.open("rb") as fh: + data = fh.read() + await self._gcs.upload_bytes(self._bucket, key, data, entry.mime_type) + continue + # Remote sync: CLI posts sha256/size; bytes may already be in GCS. + existing = await self._gcs.download_bytes(self._bucket, key) + if existing is None: + raise FileNotFoundError( + f"file {relpath!r} not found under {self._docs_dir} " + f"and not pre-uploaded to gs://{self._bucket}/{key}" + ) async def _delete_removed(self, diff: ManifestDiff, old: SyncManifest) -> None: for relpath in diff.removed: diff --git a/cli/expert/commands/sync.py b/cli/expert/commands/sync.py index 0396323..2f27865 100644 --- a/cli/expert/commands/sync.py +++ b/cli/expert/commands/sync.py @@ -9,6 +9,7 @@ import asyncio import hashlib +import mimetypes from pathlib import Path from typing import Annotated, Any @@ -37,6 +38,22 @@ def _iter_matching_files( return sorted(matched) +_MIME_BY_SUFFIX = { + ".md": "text/markdown", + ".markdown": "text/markdown", + ".txt": "text/plain", + ".pdf": "application/pdf", +} + + +def _guess_mime(path: Path) -> str: + suffix = path.suffix.lower() + if suffix in _MIME_BY_SUFFIX: + return _MIME_BY_SUFFIX[suffix] + guessed, _ = mimetypes.guess_type(path.as_posix()) + return guessed or "application/octet-stream" + + def _sha256(path: Path) -> str: digest = hashlib.sha256() with path.open("rb") as fh: @@ -63,13 +80,11 @@ def _build_manifest(schema: AgentSchema, base_dir: Path) -> dict[str, Any]: "path": str(rel), "sha256": _sha256(file_path), "size": file_path.stat().st_size, + "mime_type": _guess_mime(file_path), } ) - return { - "agent_id": schema.agent_id, - "schema_version": schema.metadata.version, - "files": entries, - } + # Backend `DocsSyncRequest` only accepts `files` (extra=forbid on other keys). 
+ return {"files": entries} async def _post_sync( diff --git a/cli/expert/ui.py b/cli/expert/ui.py index e4a58f6..3e2695c 100644 --- a/cli/expert/ui.py +++ b/cli/expert/ui.py @@ -77,10 +77,24 @@ def print_diff_table(diff: dict[str, Any]) -> None: table.add_column("SHA", no_wrap=True, style="dim") table.add_column("Size", justify="right", no_wrap=True, style="dim") + def _normalize(entries: list[Any]) -> list[dict[str, Any]]: + normalized: list[dict[str, Any]] = [] + for entry in entries: + if isinstance(entry, str): + normalized.append({"path": entry}) + elif isinstance(entry, dict): + normalized.append(entry) + return normalized + actions: list[tuple[str, str, str, list[dict[str, Any]]]] = [ - ("+", "green", "added", list(diff.get("added", []) or [])), - ("~", "yellow", "updated", list(diff.get("updated", []) or [])), - ("-", "red", "removed", list(diff.get("removed", []) or [])), + ("+", "green", "added", _normalize(list(diff.get("added", []) or []))), + ( + "~", + "yellow", + "updated", + _normalize(list(diff.get("updated", []) or diff.get("changed", []) or [])), + ), + ("-", "red", "removed", _normalize(list(diff.get("removed", []) or []))), ] total = 0 for glyph, color, _name, entries in actions: diff --git a/docs/rfc/google-drive-connector.md b/docs/rfc/google-drive-connector.md new file mode 100644 index 0000000..16da552 --- /dev/null +++ b/docs/rfc/google-drive-connector.md @@ -0,0 +1,93 @@ +# RFC: Google Drive connector for knowledge sync + +**Status:** draft +**Authors:** expert-agent maintainers +**Consumers:** private agent repos that mirror Drive folders into `docs/` + +## Problem + +Many teams keep canonical SOPs and manuals in Google Drive (Docs, PDFs, DOCX). +The expert-agent pipeline today expects files under `/docs/` and syncs them +to GCS + Gemini Context Cache via `expert sync` / `POST /docs/sync`. + +Private repos currently bridge this gap with ad-hoc scripts (e.g. +`import-*-drive.py`). 
We need a **generic, framework-level** connector that: + +1. Lists files in an authorized Drive folder (recursive). +2. Exports supported MIME types to `.md` / `.pdf` / `.txt`. +3. Writes frontmatter metadata (`source_url`, `drive_file_id`, `slug`). +4. Triggers the existing manifest sync (no duplicate cache logic). + +## Goals + +- Parametrize folder ID, credentials, and export map via `agent_schema.yaml`. +- Reuse `DocsSyncService` — Drive is an **ingest** step, not a new cache path. +- Support dry-run and incremental diff (skip files whose `modifiedTime` and `md5Checksum` are unchanged). +- Work with user Application Default Credentials (ADC) and service accounts (`drive.readonly` scope). + +## Non-goals + +- Real-time Drive webhooks (phase 2). +- Indexing Shared Drives without explicit folder ID (phase 2). +- Storing Drive credentials in the agent container image. + +## Proposed schema extension + +```yaml +spec: + knowledge: + reference_docs_dir: ./docs + include_patterns: ["*.md", "*.pdf"] + drive_sync: # optional + enabled: true + folder_id: "${DRIVE_FOLDER_ID}" + credentials_env: DRIVE_CREDENTIALS_JSON # optional SA path + export: + google_docs: markdown + docx: pdf + frontmatter: + - source_url + - drive_file_id + - slug +``` + +## CLI surface + +```bash +expert drive pull --agent my-expert # dry-run report +expert drive pull --agent my-expert --apply # write docs/ + optional auto-sync +expert drive pull --search "quality" # discover candidate folders +``` + +## Backend changes + +None required for v1 if ingest remains CLI-side. Optional future endpoint: + +`POST /docs/ingest/drive` (admin-only) for Cloud Run environments without local +`docs/` — out of scope for v1. + +## Security + +- Minimum scope: `https://www.googleapis.com/auth/drive.readonly`. +- Folder ID is an allowlist boundary — document this in the agent README. +- Never log file contents; log file IDs and slugs only. + +## Testing + +- Unit: MIME export map, slugify, frontmatter rendering. +- Integration: mock Drive API (`googleapiclient` test double). 
+- E2E: synthetic Drive folder in a test GCP project. + +## Rollout + +1. Merge RFC + schema fields (no default — opt-in). +2. Implement `cli/expert/commands/drive.py`. +3. Document in `example-schema/agent_schema.yaml`. +4. Private repos delete interim `import-*-drive.py` scripts after adoption. + +## Open questions + +- Should `source_url` be enforced in identity prompts (a private-repo concern) or + through a schema-level `citation.required_fields` setting? +- Should the Context Cache minimum token count (Gemini ≥ 2048) be validated at + `expert validate` time when `context_cache.enabled: true`? diff --git a/uv.lock b/uv.lock index 5b248d7..a56ed91 100644 --- a/uv.lock +++ b/uv.lock @@ -544,7 +544,7 @@ wheels = [ [[package]] name = "expert-agent" -version = "0.1.1" +version = "0.1.4" source = { editable = "." } dependencies = [ { name = "bcrypt" },