Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 31 additions & 9 deletions backend/app/docs/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ManifestDiff,
SyncManifest,
diff_manifests,
guess_mime,
manifest_from_directory,
)

Expand Down Expand Up @@ -67,11 +68,13 @@ class SyncResult:
class LocalFile(BaseModel):
    """A single file in the /docs/sync payload.

    ``path`` is the only required field and must be non-empty. The other
    fields are optional metadata a client may attach to the entry:
    a resolved ``local_path``, or a declared ``sha256`` / ``size`` /
    ``mime_type`` for the file's content.
    """

    # Unknown keys on a file entry are silently dropped instead of failing
    # validation. This is the single effective setting; a preceding
    # ``extra="forbid"`` assignment was immediately overwritten and is removed.
    model_config = ConfigDict(extra="ignore")

    # Path of the file as named in the payload; empty strings are rejected.
    path: str = Field(min_length=1)
    # Filesystem location of the file's bytes, when the sender resolved one.
    local_path: Path | None = None
    # Declared content type; None when the sender did not provide one.
    mime_type: str | None = None
    # Declared SHA-256 digest of the content; None when not provided.
    sha256: str | None = None
    # Declared size of the content (presumably bytes — confirm with the CLI).
    size: int | None = None


class DocsSyncRequest(BaseModel):
Expand Down Expand Up @@ -203,14 +206,25 @@ def _gcs_uri_for_directory(self, relpath: str, entry: FileEntry) -> str:
return f"gs://{self._bucket}/{self._object_key(relpath, entry.sha256)}"

def _manifest_from_payload(self, files: list[LocalFile]) -> SyncManifest:
from .manifest import compute_file_sha256, guess_mime
from .manifest import compute_file_sha256

entries: dict[str, FileEntry] = {}
for item in files:
if item.local_path is None:
raise ValueError(
f"file {item.path!r} has no local_path; payload uploads require resolved paths"
)
if item.sha256 is not None and item.size is not None:
mime = item.mime_type or guess_mime(Path(item.path))
entries[item.path] = FileEntry(
sha256=item.sha256,
size=item.size,
gcs_uri=f"gs://{self._bucket}/{self._object_key(item.path, item.sha256)}",
mime_type=mime,
updated_at=datetime.now(tz=UTC),
)
continue
else:
raise ValueError(
f"file {item.path!r} has no local_path and missing sha256/size; payload uploads require resolved paths"
)
src = Path(item.local_path)
if not src.exists():
raise FileNotFoundError(f"file not found: {src}")
Expand Down Expand Up @@ -240,11 +254,19 @@ async def _upload_changed(
if local is not None:
await self._gcs.upload_file(self._bucket, key, local, entry.mime_type)
else:
# Walked-from-directory case: re-read the file fresh.
src = self._docs_dir / relpath
with src.open("rb") as fh:
data = fh.read()
await self._gcs.upload_bytes(self._bucket, key, data, entry.mime_type)
if src.is_file():
with src.open("rb") as fh:
data = fh.read()
await self._gcs.upload_bytes(self._bucket, key, data, entry.mime_type)
continue
# Remote sync: CLI posts sha256/size; bytes may already be in GCS.
existing = await self._gcs.download_bytes(self._bucket, key)
if existing is None:
raise FileNotFoundError(
f"file {relpath!r} not found under {self._docs_dir} "
f"and not pre-uploaded to gs://{self._bucket}/{key}"
)

async def _delete_removed(self, diff: ManifestDiff, old: SyncManifest) -> None:
for relpath in diff.removed:
Expand Down
25 changes: 20 additions & 5 deletions cli/expert/commands/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import asyncio
import hashlib
import mimetypes
from pathlib import Path
from typing import Annotated, Any

Expand Down Expand Up @@ -37,6 +38,22 @@ def _iter_matching_files(
return sorted(matched)


_MIME_BY_SUFFIX = {
".md": "text/markdown",
".markdown": "text/markdown",
".txt": "text/plain",
".pdf": "application/pdf",
}


def _guess_mime(path: Path) -> str:
suffix = path.suffix.lower()
if suffix in _MIME_BY_SUFFIX:
return _MIME_BY_SUFFIX[suffix]
guessed, _ = mimetypes.guess_type(path.as_posix())
return guessed or "application/octet-stream"


def _sha256(path: Path) -> str:
digest = hashlib.sha256()
with path.open("rb") as fh:
Expand All @@ -63,13 +80,11 @@ def _build_manifest(schema: AgentSchema, base_dir: Path) -> dict[str, Any]:
"path": str(rel),
"sha256": _sha256(file_path),
"size": file_path.stat().st_size,
"mime_type": _guess_mime(file_path),
}
)
return {
"agent_id": schema.agent_id,
"schema_version": schema.metadata.version,
"files": entries,
}
# Backend `DocsSyncRequest` only accepts `files` (extra=forbid on other keys).
return {"files": entries}


async def _post_sync(
Expand Down
20 changes: 17 additions & 3 deletions cli/expert/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,24 @@ def print_diff_table(diff: dict[str, Any]) -> None:
table.add_column("SHA", no_wrap=True, style="dim")
table.add_column("Size", justify="right", no_wrap=True, style="dim")

def _normalize(entries: list[Any]) -> list[dict[str, Any]]:
normalized: list[dict[str, Any]] = []
for entry in entries:
if isinstance(entry, str):
normalized.append({"path": entry})
elif isinstance(entry, dict):
normalized.append(entry)
return normalized

actions: list[tuple[str, str, str, list[dict[str, Any]]]] = [
("+", "green", "added", list(diff.get("added", []) or [])),
("~", "yellow", "updated", list(diff.get("updated", []) or [])),
("-", "red", "removed", list(diff.get("removed", []) or [])),
("+", "green", "added", _normalize(list(diff.get("added", []) or []))),
(
"~",
"yellow",
"updated",
_normalize(list(diff.get("updated", []) or diff.get("changed", []) or [])),
),
("-", "red", "removed", _normalize(list(diff.get("removed", []) or []))),
]
total = 0
for glyph, color, _name, entries in actions:
Expand Down
93 changes: 93 additions & 0 deletions docs/rfc/google-drive-connector.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# RFC: Google Drive connector for knowledge sync

**Status:** draft
**Authors:** expert-agent maintainers
**Consumers:** private agent repos that mirror Drive folders into `docs/`

## Problem

Many teams keep canonical SOPs and manuals in Google Drive (Docs, PDFs, DOCX).
The expert-agent pipeline today expects files under `<agent>/docs/` and syncs them
to GCS + Gemini Context Cache via `expert sync` / `POST /docs/sync`.

Private repos currently bridge this gap with ad-hoc scripts (e.g.
`import-*-drive.py`). We need a **generic, framework-level** connector that:

1. Lists files in an authorized Drive folder (recursive).
2. Exports supported MIME types to `.md` / `.pdf` / `.txt`.
3. Writes frontmatter metadata (`source_url`, `drive_file_id`, `slug`).
4. Triggers the existing manifest sync (no duplicate cache logic).

## Goals

- Parametrize folder ID, credentials, and export map via `agent_schema.yaml`.
- Reuse `DocsSyncService` — Drive is an **ingest** step, not a new cache path.
- Support dry-run and incremental diff (skip files whose `modifiedTime` and `md5` are both unchanged).
- Work with user ADC and service accounts (`drive.readonly` scope).

## Non-goals

- Real-time Drive webhooks (phase 2).
- Indexing Shared Drives without explicit folder ID (phase 2).
- Storing Drive credentials in the agent container image.

## Proposed schema extension

```yaml
spec:
knowledge:
reference_docs_dir: ./docs
include_patterns: ["*.md", "*.pdf"]
drive_sync: # optional
enabled: true
folder_id: "${DRIVE_FOLDER_ID}"
credentials_env: DRIVE_CREDENTIALS_JSON # optional SA path
export:
google_docs: markdown
docx: pdf
frontmatter:
- source_url
- drive_file_id
- slug
```

## CLI surface

```bash
expert drive pull --agent my-expert # dry-run report
expert drive pull --agent my-expert --apply # write docs/ + optional auto-sync
expert drive pull --search "quality" # discover candidate folders
```

## Backend changes

None required for v1 if ingest remains CLI-side. Optional future endpoint:

`POST /docs/ingest/drive` (admin-only) for Cloud Run environments without local
`docs/` — out of scope for v1.

## Security

- Scope: `https://www.googleapis.com/auth/drive.readonly` minimum.
- Folder ID is an allowlist boundary — document in agent README.
- Never log file contents; log file IDs and slugs only.

## Testing

- Unit: MIME export map, slugify, frontmatter rendering.
- Integration: mock Drive API (`googleapiclient` test double).
- E2E: synthetic Drive folder in a test GCP project.

## Rollout

1. Merge RFC + schema fields (no default — opt-in).
2. Implement `cli/expert/commands/drive.py`.
3. Document in `example-schema/agent_schema.yaml`.
4. Private repos delete interim `import-*-drive.py` scripts after adoption.

## Open questions

- Should `source_url` be made mandatory in identity prompts (a concern of each
private repo) or enforced through a schema-level `citation.required_fields` setting?
- Minimum token count for Context Cache (Gemini ≥ 2048) — validate at
`expert validate` time when `context_cache.enabled: true`?
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading