xorbitsai · sqhyz55 · Mar 16, 2026 · Mar 17, 2026 · Mar 19, 2026 · Mar 24, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
+- **Knowledge Base embedding model binding (breaking / migration)**
+  The Knowledge Base now treats the **Model Hub ID** as the single source of truth for embedding model identity:
+  - `collection_metadata.embedding_model_id` stores the Hub ID (trimmed; no other normalization).
+  - Embeddings tables are named by Hub ID: `embeddings_{to_model_tag(hub_id)}`.
+  - The `model` field stored alongside each embedding vector is the Hub ID.
+
+  **Migration / backward compatibility:** Older deployments may have created embeddings tables using the provider `model_name`
+  (e.g. `embeddings_text-embedding-v4`). During search and embedding reads, the system will **try the new Hub-ID table first**
+  and automatically **fall back to the legacy table name** derived from the resolved `model_name` when the new table is missing.
+  Rebuild/inference helpers were updated to prefer Hub IDs when they can be resolved from Model Hub metadata.
+
 - **Knowledge Base upload: default parse method (breaking)**
   The default parse method on the KB detail upload form is now `"default"` instead of `"pypdf"`. The backend chooses the parser by file type (e.g. .docx, .pdf). If you rely on the previous default (always use PyPDF), select `"pypdf"` explicitly in the parse method dropdown when uploading.
 

diff --git a/scripts/set_nanwang_embedding_model_id.py b/scripts/set_nanwang_embedding_model_id.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import math
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict
+
+import lancedb
+
+
+def _clean_value(value: Any) -> Any:
+    if value is None:
+        return None
+    if isinstance(value, float) and math.isnan(value):
+        return None
+    return value
+
+
+def main() -> None:
+    db_dir = os.environ.get("LANCEDB_DIR")
+    if not db_dir:
+        raise SystemExit("LANCEDB_DIR is not set")
+    db_path = Path(db_dir).expanduser().resolve()
+    print("LANCEDB_DIR =", str(db_path))
+    if not db_path.exists():
+        raise SystemExit("LANCEDB_DIR does not exist")
+
+    # IMPORTANT: set to model hub ID so resolve_embedding_adapter can load it.
+    target_model_id = "text-embedding-v4-openai-1"
+
+    conn = lancedb.connect(str(db_path))
+    meta = conn.open_table("collection_metadata")
+    df = meta.search().where("name = '南网'").limit(10).to_pandas()
+    if df is None or df.empty:
+        raise SystemExit("collection_metadata 中找不到 '南网'")
+
+    row: Dict[str, Any] = df.iloc[0].to_dict()
+    print("old embedding_model_id =", row.get("embedding_model_id"))
+    row["embedding_model_id"] = target_model_id
+    row["updated_at"] = datetime.now(timezone.utc).replace(tzinfo=None)
+
+    schema_names = list(meta.schema.names)
+    cleaned = {k: _clean_value(row.get(k)) for k in schema_names}
+
+    meta.delete("name = '南网'")
+    meta.add([cleaned])
+
+    df2 = meta.search().where("name = '南网'").limit(10).to_pandas()
+    print("new embedding_model_id =", df2.iloc[0].get("embedding_model_id"))
+
+
+# Run the migration only when this file is executed as a script, so that
+# importing the module has no side effects on the database.
+if __name__ == "__main__":
+    main()