Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions docs/TECH_DEBT.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,63 @@ Interpretation:

The hotspots above were checked against current code and targeted tests.

## Quick Wins (next 2-4 weeks)

This section prioritizes low-risk, high-impact improvements for the three
highest-impact active debts.

### A) Mutation saga monolítica

1. **Extract phase helpers without changing orchestration order**
- Create private functions for: preflight/profile checks, SQL apply, vector
apply, rollback/recovery.
- **Why:** Reduces cognitive load and blast radius while preserving current
write semantics.
2. **Introduce a small typed runtime config object**
- Resolve toggles from `Settings` once and pass a typed object through the
mutation path.
- **Why:** Removes repeated dynamic branching and makes behavior easier to
reason about and test.
3. **Add phase-scoped structured telemetry**
- Emit consistent phase markers (`phase`, `doc_count`, `strategy`) around saga
steps.
- **Why:** Makes production debugging faster before deeper refactors.

### B) Evaluación con sesgos metodológicos

1. **Preserve retrieval scores in the callback contract**
- Move from `Sequence[str]` to a scored shape (`external_id`, `score`).
- **Why:** Avoids lossy rank-only evaluation and unlocks better diagnostics.
2. **Stop silently dropping unknown IDs**
- Treat unknown IDs as explicit anomalies (counter + optional report) or as
non-relevant retrieved docs.
- **Why:** Prevents false confidence by surfacing corpus/index drift.
3. **Emit per-query breakdown in compare mode**
- Add query-level metrics output alongside aggregate deltas.
- **Why:** Quick visibility into regressions hidden by averages.

### C) Fragilidad de contratos CLI/HTTP

1. **Single DTO validation path for CLI + HTTP + MCP**
- Reuse the same typed input models in all transports.
- **Why:** Eliminates semantic drift and duplicated parsing logic.
2. **Unify visible defaults and error shape**
- Align default values and error payload contract across entrypoints.
- **Why:** Reduces user confusion and makes integration tests more stable.
3. **Add transport parity tests for critical commands**
- Golden tests asserting same payload => same behavior in CLI and HTTP.
- **Why:** Catches drift early with low implementation cost.

## Suggested execution order

1. **Week 1-2:** transport parity + shared DTO validation (fast risk reduction).
2. **Week 2-3:** evaluator unknown-ID handling + scored callback contract.
3. **Week 3-4:** mutation saga phase extraction (behavior-preserving slice).

This order is recommended because it first reduces outward-facing contract
fragility, then improves quality signals, and finally tackles internal
orchestration complexity with lower operational risk.

Targeted test command used during this review:

```bash
Expand Down
51 changes: 8 additions & 43 deletions src/local_rag_backend/cli_commands/docs/docs_mutate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
from typing import Any, cast

import click
from pydantic import ValidationError

from local_rag_backend.cli_commands.runtime import (
build_dense_embedder,
get_cli_container,
run_cli_mutation,
)
from local_rag_backend.core.use_cases.docs_mutation import (
MutationCoordinator,
MutationIntent,
MutationUpsertInput,
from local_rag_backend.core.services.docs_mutation_transport import (
build_docs_mutation_intent_from_raw,
)
from local_rag_backend.core.use_cases.docs_mutation import MutationCoordinator, MutationIntent
from local_rag_backend.core.use_cases.results import MutationSummary
from local_rag_backend.settings import settings

Expand All @@ -26,46 +26,11 @@ def _read_payload(payload_json: Path) -> dict[str, Any]:
raise ValueError("--json must contain a JSON object payload")
return cast("dict[str, Any]", payload)


def _build_intent(payload: dict[str, Any]) -> MutationIntent:
upserts_raw = payload.get("upserts") or []
delete_ids_raw = payload.get("delete_ids") or []
delete_external_ids_raw = payload.get("delete_external_ids") or []

if not isinstance(upserts_raw, list):
raise ValueError("payload.upserts must be a list")
if not isinstance(delete_ids_raw, list):
raise ValueError("payload.delete_ids must be a list")
if not isinstance(delete_external_ids_raw, list):
raise ValueError("payload.delete_external_ids must be a list")

upserts: list[MutationUpsertInput] = []
for item in upserts_raw:
if not isinstance(item, dict):
raise ValueError("each payload.upserts item must be an object")
md = item.get("metadata")
upserts.append(
MutationUpsertInput(
external_id=str(item.get("external_id") or "").strip(),
content=str(item.get("content") or "").strip(),
source_id=(
str(item.get("source_id")) if item.get("source_id") is not None else None
),
scope=(str(item.get("scope")) if item.get("scope") is not None else None),
snapshot_id=(
str(item.get("snapshot_id")) if item.get("snapshot_id") is not None else None
),
metadata=(cast("dict[str, Any]", md) if isinstance(md, dict) else None),
)
)

return MutationIntent(
op_id=str(payload.get("op_id") or "").strip(),
upserts=tuple(upserts),
delete_ids=tuple(str(v) for v in delete_ids_raw),
delete_external_ids=tuple(str(v) for v in delete_external_ids_raw),
source="cli:docs:mutate",
)
try:
return build_docs_mutation_intent_from_raw(payload, source="cli:docs:mutate")
except (ValidationError, ValueError) as exc:
raise ValueError(str(exc)) from exc


@click.command("mutate-docs")
Expand Down
127 changes: 127 additions & 0 deletions src/local_rag_backend/core/services/docs_mutation_transport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Shared transport validation/assembly for docs mutation payloads."""

from __future__ import annotations

from collections.abc import Mapping
from typing import Any

from pydantic import BaseModel, Field, field_validator, model_validator

from local_rag_backend.core.use_cases.docs_mutation import MutationIntent, MutationUpsertInput


class MutationUpsertPayload(BaseModel):
external_id: str = Field(..., min_length=1, max_length=512)
content: str = Field(..., min_length=1, max_length=20000)
source_id: str | None = Field(default=None, max_length=1024)
scope: str | None = Field(default=None, min_length=1, max_length=512)
snapshot_id: str | None = Field(default=None, min_length=1, max_length=512)
metadata: dict[str, Any] | None = None

@field_validator("external_id")
@classmethod
def _external_id_not_blank(cls, value: str) -> str:
value2 = value.strip()
if not value2:
raise ValueError("external_id must not be blank")
return value2

@field_validator("content")
@classmethod
def _content_not_blank(cls, value: str) -> str:
value2 = value.strip()
if not value2:
raise ValueError("content must not be blank")
return value2


class DocsMutationPayload(BaseModel):
op_id: str | None = Field(default=None, min_length=1, max_length=128)
upserts: list[MutationUpsertPayload] = Field(default_factory=list, max_length=256)
delete_ids: list[str] = Field(default_factory=list, max_length=2048)
delete_external_ids: list[str] = Field(default_factory=list, max_length=2048)

@field_validator("delete_ids")
@classmethod
def _normalize_delete_ids(cls, values: list[str]) -> list[str]:
normalized: list[str] = []
seen: set[str] = set()
for doc_id in values:
doc_id_s = str(doc_id).strip()
if not doc_id_s or doc_id_s in seen:
continue
seen.add(doc_id_s)
normalized.append(doc_id_s)
return normalized

@field_validator("delete_external_ids")
@classmethod
def _normalize_delete_external_ids(cls, values: list[str]) -> list[str]:
normalized: list[str] = []
seen: set[str] = set()
for external_id in values:
ext_s = str(external_id).strip()
if not ext_s or ext_s in seen:
continue
seen.add(ext_s)
normalized.append(ext_s)
return normalized

@model_validator(mode="after")
def _validate_payload(self) -> DocsMutationPayload:
if not self.upserts and not self.delete_ids and not self.delete_external_ids:
raise ValueError(
"docs/mutate requires at least one operation: upserts, delete_ids, or delete_external_ids"
)
upsert_ext_ids = {str(item.external_id).strip() for item in self.upserts}
conflict = upsert_ext_ids & set(self.delete_external_ids)
if conflict:
raise ValueError(
"upserts and delete_external_ids cannot target the same external_id values"
)
return self


def validate_docs_mutation_payload(payload: Mapping[str, Any]) -> DocsMutationPayload:
return DocsMutationPayload.model_validate(payload)


def build_docs_mutation_intent(
payload: DocsMutationPayload,
*,
source: str,
) -> MutationIntent:
return MutationIntent(
op_id=str(payload.op_id or "").strip(),
upserts=tuple(
MutationUpsertInput(
external_id=item.external_id,
content=item.content,
source_id=item.source_id,
scope=item.scope,
snapshot_id=item.snapshot_id,
metadata=item.metadata,
)
for item in payload.upserts
),
delete_ids=tuple(payload.delete_ids),
delete_external_ids=tuple(payload.delete_external_ids),
source=source,
)


def build_docs_mutation_intent_from_raw(
payload: Mapping[str, Any],
*,
source: str,
) -> MutationIntent:
return build_docs_mutation_intent(validate_docs_mutation_payload(payload), source=source)


__all__ = [
"DocsMutationPayload",
"MutationUpsertPayload",
"build_docs_mutation_intent",
"build_docs_mutation_intent_from_raw",
"validate_docs_mutation_payload",
]
Loading
Loading