Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Changed

- **Knowledge Base embedding model binding (breaking / migration)**
The Knowledge Base now treats the **Model Hub ID** as the single source of truth for embedding model identity:
- `collection_metadata.embedding_model_id` stores the Hub ID (trimmed; no other normalization).
- Embeddings tables are named by Hub ID: `embeddings_{to_model_tag(hub_id)}`.
- The `model` field stored alongside each embedding vector is the Hub ID.

**Migration / backward compatibility:** Older deployments may have created embeddings tables using the provider `model_name`
(e.g. `embeddings_text-embedding-v4`). During search and embedding reads, the system will **try the new Hub-ID table first**
and automatically **fall back to the legacy table name** derived from the resolved `model_name` when the new table is missing.
Rebuild/inference helpers were updated to prefer Hub IDs when they can be resolved from Model Hub metadata.

- **Knowledge Base upload: default parse method (breaking)**
The default parse method on the KB detail upload form is now `"default"` instead of `"pypdf"`. With `"default"`, the backend chooses the parser by file type (e.g. `.docx`, `.pdf`). If you relied on the previous default (always parsing with PyPDF), select `"pypdf"` explicitly in the parse method dropdown when uploading.

Expand Down
54 changes: 54 additions & 0 deletions scripts/set_nanwang_embedding_model_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from __future__ import annotations

import math
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict

import lancedb


def _clean_value(value: Any) -> Any:
if value is None:
return None
if isinstance(value, float) and math.isnan(value):
return None
return value


def main() -> None:
    """Set ``embedding_model_id`` on the '南网' row of ``collection_metadata``.

    The LanceDB directory is taken from the ``LANCEDB_DIR`` environment
    variable. The matching row is read, its ``embedding_model_id`` is replaced
    with the hard-coded target ID, ``updated_at`` is refreshed, and the row is
    written back by deleting the old row and adding the new one. The old and
    new IDs are printed.

    Raises:
        SystemExit: if ``LANCEDB_DIR`` is unset or does not exist, if no row
            or more than one row matches the collection name, or if the row
            cannot be read back after the write.
    """
    db_dir = os.environ.get("LANCEDB_DIR")
    if not db_dir:
        raise SystemExit("LANCEDB_DIR is not set")
    db_path = Path(db_dir).expanduser().resolve()
    print("LANCEDB_DIR =", str(db_path))
    if not db_path.exists():
        raise SystemExit("LANCEDB_DIR does not exist")

    # IMPORTANT: set to model hub ID so resolve_embedding_adapter can load it.
    target_model_id = "text-embedding-v4-openai-1"
    # One filter shared by the lookup, the delete and the read-back so the
    # three can never drift apart.
    row_filter = "name = '南网'"

    conn = lancedb.connect(str(db_path))
    meta = conn.open_table("collection_metadata")
    df = meta.search().where(row_filter).limit(10).to_pandas()
    if df is None or df.empty:
        raise SystemExit("collection_metadata 中找不到 '南网'")
    if len(df) > 1:
        # The delete below removes every matching row, but only one row is
        # written back; refuse to run rather than silently drop the others.
        raise SystemExit(
            f"collection_metadata has {len(df)} rows matching {row_filter}; "
            "refusing to rewrite an ambiguous match"
        )

    row: Dict[str, Any] = df.iloc[0].to_dict()
    print("old embedding_model_id =", row.get("embedding_model_id"))
    row["embedding_model_id"] = target_model_id
    # Naive datetime carrying the current UTC wall-clock time.
    row["updated_at"] = datetime.now(timezone.utc).replace(tzinfo=None)

    # Rebuild the record from the table's own column names: keys outside the
    # schema are dropped, absent ones become None, and NaN is mapped to None.
    schema_names = list(meta.schema.names)
    cleaned = {k: _clean_value(row.get(k)) for k in schema_names}

    # NOTE: delete + add is not atomic; a failure between the two calls
    # leaves the table without this row.
    meta.delete(row_filter)
    meta.add([cleaned])

    df2 = meta.search().where(row_filter).limit(10).to_pandas()
    if df2 is None or df2.empty:
        raise SystemExit(
            f"row matching {row_filter} is missing from collection_metadata "
            "after the rewrite"
        )
    print("new embedding_model_id =", df2.iloc[0].get("embedding_model_id"))


# Run the update only when executed as a script, so importing this module
# has no side effects.
if __name__ == "__main__":
    main()
Loading