Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
37178f9
feat: add DomainContext model, CLI flag, and pipeline wiring
deanban Apr 14, 2026
af0cb44
feat: add staged L2 schemas, Stage A/B prompts, and trigger logic
deanban Apr 14, 2026
d81b66d
feat: wire A→B→C→merge pipeline with recovery and enriched vocab context
deanban Apr 14, 2026
d0c3756
feat: add eval harness — assertion dump, diff, telemetry, dev slice
deanban Apr 14, 2026
246c55c
feat: add domain-aware prompts and healthcare few-shot library
deanban Apr 14, 2026
7beea0a
test: add Stage C trigger, execution, merge, and partial failure tests
deanban Apr 14, 2026
676c610
feat: add cBioPortal + OMOP ingest pipeline and Databricks bridge
deanban Apr 19, 2026
b2577f7
chore: gitignore .wolf/ OpenWolf context directory
deanban Apr 19, 2026
a6180bb
feat: add sema eval CLI for dev-slice runner, diff, and milestone report
deanban Apr 19, 2026
bf2da3a
fix: populate real latency and token telemetry in staged L2 pipeline
deanban Apr 19, 2026
39e8b82
chore(eval): add dev_slice_poc.yaml matching current Databricks cBioP…
deanban Apr 19, 2026
9db97a5
fix: sanitize LLM-leaked type suffix from Stage B column names
deanban Apr 19, 2026
12dce77
fix(few-shot): add synonyms to Stage B examples and compact JSON format
deanban Apr 19, 2026
a92e2bf
eval: add dev-slice rollout artifacts for steps 2–5
deanban Apr 19, 2026
f67dd27
feat(ingest): add cBioPortal SV, CNA, gene-panel-matrix, and resource…
deanban Apr 19, 2026
181af3b
eval: expand dev slice to 12 tables, re-run full pipeline on GBM ingest
deanban Apr 20, 2026
005c72d
refactor: remove deprecated single-pass and two-pass L2 code (Task 11)
deanban Apr 20, 2026
6aa21d6
eval: post-cleanup sanity run on 12-table slice
deanban Apr 20, 2026
c5662fe
eval: verification run after Neo4j wipe + Task 11 cleanup
deanban Apr 20, 2026
a047c78
docs(eval): add step 6 milestone summary for 12-table POC slice
deanban Apr 21, 2026
88849d8
refactor: extract cBioPortal ingest and slice YAMLs to showcase/cbiop…
deanban Apr 21, 2026
6389a57
feat(few-shot): add generic base layer and split domain packs into mo…
deanban Apr 21, 2026
aaebf02
Merge remote-tracking branch 'origin/main' into dean/refactor/cbiopor…
deanban Apr 21, 2026
4cb42e1
ci: drop single-entry python matrix so test context matches branch rule
deanban Apr 22, 2026
f808639
feat(providers): native Databricks Mosaic AI provider
deanban Apr 22, 2026
ec23af4
Merge branch 'main' into dean/refactor/cbioportal-showcase-extract
deanban Apr 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 26 additions & 10 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,33 @@ NEO4J_URI=bolt://localhost:7687
NEO4J_USER=neo4j
NEO4J_PASSWORD=graphrag

# LLM provider: openrouter, anthropic, openai, databricks, custom
LLM_PROVIDER=openrouter
LLM_MODEL=anthropic/claude-sonnet-4
LLM_API_KEY=sk-or-...
# LLM_BASE_URL= # Only needed for databricks or custom providers
# LLM provider: databricks (default), openrouter, anthropic, openai, custom
# Default is databricks — LLM_MODEL is a Databricks serving-endpoint name
# (e.g., databricks-llama-4-maverick). LLM_API_KEY and LLM_BASE_URL are
# IGNORED under provider=databricks; auth is resolved via the Databricks SDK
# chain (DATABRICKS_HOST/DATABRICKS_TOKEN above, then ~/.databrickscfg profile).
LLM_PROVIDER=databricks
LLM_MODEL=databricks-llama-4-maverick
# For openrouter/anthropic/openai/custom, also set LLM_API_KEY (and LLM_BASE_URL
# for custom). Example openrouter config:
# LLM_PROVIDER=openrouter
# LLM_MODEL=google/gemini-3-flash-preview
# LLM_API_KEY=sk-or-...
# LLM_API_KEY=
# LLM_BASE_URL=

# Embedding provider: openrouter, openai, sentence-transformers, databricks, custom
EMBEDDING_PROVIDER=openrouter
EMBEDDING_MODEL=google/gemini-embedding-001
EMBEDDING_API_KEY=sk-or-...
# EMBEDDING_BASE_URL= # Only needed for databricks or custom providers
# Embedding provider: databricks (default), openrouter, openai, sentence-transformers, custom
# Default is databricks — EMBEDDING_MODEL is a Databricks serving-endpoint name
# (e.g., databricks-bge-large-en). EMBEDDING_API_KEY / EMBEDDING_BASE_URL are
# IGNORED under provider=databricks (same SDK auth chain as LLM).
EMBEDDING_PROVIDER=databricks
EMBEDDING_MODEL=databricks-bge-large-en
# Example openrouter embedding config:
# EMBEDDING_PROVIDER=openrouter
# EMBEDDING_MODEL=openai/text-embedding-ada-002
# EMBEDDING_API_KEY=sk-or-...
# EMBEDDING_API_KEY=
# EMBEDDING_BASE_URL=

# Ingest / data bridge (optional)
# INGEST_DUCKDB_PATH=~/.sema/poc.duckdb
Expand Down
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ coverage.xml
docs/
backups/

# Eval run logs (artifacts kept, logs are transient)
eval-runs/*.log
# Eval run artifacts — local by default (previously-tracked runs stay tracked)
eval-runs/
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,17 @@ uv run sema query --question "Average age of patients by cancer type"

All commands are run with `uv run sema` (or just `sema` if you installed with pip).

### LLM provider configuration

Sema supports `databricks` (default; Mosaic AI Model Serving), `openrouter`,
`anthropic`, `openai`, and `custom` (any OpenAI-compatible endpoint) as
LLM providers. Embedding providers are `databricks`, `openrouter`, `openai`,
`custom`, and `sentence-transformers`; `anthropic` is available for LLMs only.

For Databricks Mosaic-specific operation — endpoint discovery, supported vs
unsupported endpoints, profile-based auth, dimension-guard resolution, and
the baseline/candidate eval workflow — see
[`docs/runbooks/databricks-mosaic-provider.md`](docs/runbooks/databricks-mosaic-provider.md).

### `sema build`

Build the knowledge graph from your warehouse catalog.
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ dependencies = [
"langchain-anthropic>=0.3.0",
"langchain-openai>=0.3.0",
"langgraph>=0.2.0",
"databricks-langchain>=0.5.0",
"databricks-sdk>=0.30.0",
"fastapi[standard]",
"neo4j>=5.0.0",
"databricks-sql-connector[pyarrow]>=3.0.0",
Expand Down Expand Up @@ -58,6 +60,8 @@ module = [
"langchain_anthropic.*",
"sentence_transformers.*",
"databricks.*",
"databricks.sdk.*",
"databricks_langchain.*",
"loguru.*",
"sqlglot.*",
"pyarrow",
Expand Down
122 changes: 122 additions & 0 deletions scripts/embedding_smoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Measurement B — embedding smoke for OpenRouter (ada-002) vs Databricks (bge-large-en).

Prints per-provider output dimension and cosine-similarity table for a fixed
set of domain-term pairs from `showcase/cbioportal_to_omop/slices/dev_slice_poc.yaml`.

Run with each provider sourced in env:

source .env.openrouter-baseline
uv run python scripts/embedding_smoke.py --label openrouter

source .env.databricks-candidate
uv run python scripts/embedding_smoke.py --label databricks-mosaic
"""
from __future__ import annotations

import argparse
import math
import sys
from dataclasses import dataclass


# Fixed vocabulary that `run` embeds in a single batch. Every term used in
# RELATED_PAIRS must also appear here, because `run` looks pair members up in
# the term -> vector mapping built from this list.
DOMAIN_TERMS: list[str] = [
    "patient",
    "sample",
    "mutation",
    "structural variant",
    "copy number alteration",
    "gene panel matrix",
    "clinical supplemental hypoxia",
    "resource definition",
    "resource patient",
    "timeline sample acquisition",
    "timeline status",
    "timeline treatment",
    "overall survival months",
    "disease-free survival status",
    "tumor mutational burden",
    "microsatellite instability",
    "icd10 diagnosis code",
    "cancer stage",
    "drug regimen",
    "variant classification",
]

# Term pairs whose cosine similarity is reported for each provider. Every pair
# listed here is expected to be related (high similarity); no unrelated
# (low-similarity) control pairs are defined.
RELATED_PAIRS: list[tuple[str, str]] = [
    ("patient", "resource patient"),
    ("mutation", "variant classification"),
    ("structural variant", "mutation"),
    ("copy number alteration", "mutation"),
    ("tumor mutational burden", "mutation"),
    ("microsatellite instability", "mutation"),
    ("overall survival months", "disease-free survival status"),
    ("timeline sample acquisition", "sample"),
    ("timeline treatment", "drug regimen"),
    ("icd10 diagnosis code", "cancer stage"),
]


@dataclass
class SmokeReport:
    """Outcome of one embedding smoke run for a single provider."""

    # Free-form label identifying the run.
    label: str
    # Length of the embedding vectors produced in this run.
    dim: int
    # Cosine similarity keyed by the compared (term, term) pair.
    pair_scores: dict[tuple[str, str], float]

    def render(self) -> str:
        """Return the report as newline-joined plain text.

        The first three lines are the label, the dimension and a section
        header; one line per scored pair follows, with the score shown to
        four decimal places.
        """
        header = [
            f"label: {self.label}",
            f"dim: {self.dim}",
            "pair cosine similarities:",
        ]
        rows = [
            f" {left!r} vs {right!r}: {value:.4f}"
            for (left, right), value in self.pair_scores.items()
        ]
        return "\n".join(header + rows)


def _cosine(u: list[float], v: list[float]) -> float:
dot = sum(a * b for a, b in zip(u, v))
norm_u = math.sqrt(sum(a * a for a in u))
norm_v = math.sqrt(sum(b * b for b in v))
if norm_u == 0 or norm_v == 0:
return 0.0
return dot / (norm_u * norm_v)


def _embed_all(embedder, terms: list[str]) -> dict[str, list[float]]:
if hasattr(embedder, "embed_documents"):
vecs = embedder.embed_documents(terms)
elif hasattr(embedder, "encode"):
vecs = embedder.encode(terms)
else:
raise SystemExit("Embedder exposes no known batch method.")
return dict(zip(terms, (list(v) for v in vecs)))


def _build_embedder():
    """Build an embedder from an `EmbeddingConfig` created with no arguments.

    NOTE(review): the config is presumably populated from the environment
    (the module docstring tells users to source a provider env file first);
    confirm against `EmbeddingConfig`.
    """
    from sema.cli_factories import _get_embedder
    from sema.models.config import EmbeddingConfig

    config = EmbeddingConfig()
    return _get_embedder(config)


def run(label: str) -> SmokeReport:
    """Embed DOMAIN_TERMS and score every pair listed in RELATED_PAIRS.

    The reported dimension is the length of the first embedded vector.
    """
    vectors = _embed_all(_build_embedder(), DOMAIN_TERMS)
    first_vector = next(iter(vectors.values()))
    scores: dict[tuple[str, str], float] = {
        (left, right): _cosine(vectors[left], vectors[right])
        for left, right in RELATED_PAIRS
    }
    return SmokeReport(label=label, dim=len(first_vector), pair_scores=scores)


def main(argv: list[str]) -> int:
    """Parse *argv*, run the smoke measurement, and print the report.

    `--label` is required; argparse aborts when it is missing. Returns 0
    once the report has been printed.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--label", required=True, help="report label")
    label = cli.parse_args(argv).label
    print(run(label).render())
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
79 changes: 71 additions & 8 deletions src/sema/cli_factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,19 @@
Neo4jConfig,
)

# Substrings that mark a Databricks serving endpoint as unsupported: any model
# name containing one of them is rejected with a ValueError by
# `_reject_unsupported_databricks_endpoint`.
DATABRICKS_UNSUPPORTED_ENDPOINT_SUBSTRINGS: tuple[str, ...] = (
    "gpt-oss-",
    "-codex",
)


class DatabricksProviderAuthError(RuntimeError):
    """Signals that Databricks SDK credentials could not be resolved.

    Raised while a Databricks provider is being constructed. It is a
    dedicated, narrow subclass so that broad `except Exception` handlers on
    the retrieval path can single it out and re-raise it while still
    suppressing unrelated degradations.
    """


def _get_neo4j_driver(neo4j_config: Neo4jConfig) -> Any:
from neo4j import GraphDatabase
Expand All @@ -17,8 +30,58 @@ def _get_neo4j_driver(neo4j_config: Neo4jConfig) -> Any:
)


def _reject_unsupported_databricks_endpoint(model: str) -> None:
    """Raise ValueError when *model* names an unsupported Databricks endpoint.

    An endpoint is unsupported when its name contains any entry of
    DATABRICKS_UNSUPPORTED_ENDPOINT_SUBSTRINGS; the first matching entry is
    quoted in the error message. Returns None for every other name.
    """
    substring = next(
        (s for s in DATABRICKS_UNSUPPORTED_ENDPOINT_SUBSTRINGS if s in model),
        None,
    )
    if substring is None:
        return
    raise ValueError(
        f"Databricks endpoint '{model}' is not supported by sema: "
        f"endpoints matching '{substring}' use response shapes "
        "(reasoning-block content or the OpenAI Responses API) that "
        "LLMClient cannot currently consume. Pick a chat-completions "
        "endpoint (e.g., databricks-llama-4-maverick, "
        "databricks-gemma-3-12b)."
    )


def _force_databricks_auth() -> None:
    """Resolve Databricks SDK credentials eagerly, at factory-construction time.

    `ChatDatabricks` otherwise authenticates lazily on its first `invoke`.
    Because `LLMClient._probe_structured_output` catches Exception, a
    deferred auth failure would be misreported as "structured output not
    supported" rather than surfacing as an auth problem.

    Raises:
        DatabricksProviderAuthError: if creating the workspace client or
            fetching the current user fails for any reason; the original
            exception is chained as the cause.
    """
    from databricks.sdk import WorkspaceClient

    try:
        client = WorkspaceClient()
        client.current_user.me()
    except Exception as exc:
        message = (
            "Databricks SDK could not resolve credentials. Set "
            "DATABRICKS_HOST and DATABRICKS_TOKEN, or configure "
            "DATABRICKS_CONFIG_PROFILE to select a ~/.databrickscfg profile. "
            "See the Databricks SDK default auth chain for other options "
            f"(OAuth, service principal). Underlying error: {exc}"
        )
        raise DatabricksProviderAuthError(message) from exc


def _build_databricks_llm(llm_config: LLMConfig) -> Any:
    """Return a `ChatDatabricks` client for the endpoint in `llm_config.model`.

    The endpoint name is checked against the unsupported-endpoint list and
    Databricks credentials are verified before the client is constructed.
    """
    endpoint_name = llm_config.model
    _reject_unsupported_databricks_endpoint(endpoint_name)
    _force_databricks_auth()
    from databricks_langchain import ChatDatabricks

    return ChatDatabricks(endpoint=endpoint_name)


def _build_databricks_embedder(embedding_config: EmbeddingConfig) -> Any:
    """Return `DatabricksEmbeddings` for the endpoint in `embedding_config.model`.

    Databricks credentials are verified before the embedder is constructed.
    """
    _force_databricks_auth()
    from databricks_langchain import DatabricksEmbeddings

    endpoint_name = embedding_config.model
    return DatabricksEmbeddings(endpoint=endpoint_name)


def _get_llm(llm_config: LLMConfig) -> Any:
provider = llm_config.provider.lower()
if provider == "databricks":
return _build_databricks_llm(llm_config)

api_key = llm_config.api_key.get_secret_value()
timeout = llm_config.request_timeout

Expand All @@ -30,39 +93,40 @@ def _get_llm(llm_config: LLMConfig) -> Any:
base_url="https://openrouter.ai/api/v1",
request_timeout=timeout, # type: ignore[call-arg]
)
elif provider == "anthropic":
if provider == "anthropic":
from langchain_anthropic import ChatAnthropic
return ChatAnthropic(
model=llm_config.model,
api_key=api_key, # type: ignore[call-arg, arg-type]
timeout=float(timeout),
)
elif provider == "openai":
if provider == "openai":
from langchain_openai import ChatOpenAI
return ChatOpenAI(
model=llm_config.model,
api_key=api_key, # type: ignore[arg-type]
request_timeout=timeout, # type: ignore[call-arg]
)
elif provider in ("databricks", "custom"):
if provider == "custom":
from langchain_openai import ChatOpenAI
return ChatOpenAI(
model=llm_config.model,
api_key=api_key, # type: ignore[arg-type]
base_url=llm_config.base_url,
request_timeout=timeout, # type: ignore[call-arg]
)
else:
raise ValueError(f"Unknown LLM provider: {provider}")
raise ValueError(f"Unknown LLM provider: {provider}")


def _get_embedder(embedding_config: EmbeddingConfig) -> Any:
provider = embedding_config.provider.lower()

if provider == "databricks":
return _build_databricks_embedder(embedding_config)
if provider == "sentence-transformers":
from sentence_transformers import SentenceTransformer
return SentenceTransformer(embedding_config.model)
elif provider in ("openrouter", "openai", "databricks", "custom"):
if provider in ("openrouter", "openai", "custom"):
from langchain_openai import OpenAIEmbeddings
base_url = embedding_config.base_url
if provider == "openrouter":
Expand All @@ -72,5 +136,4 @@ def _get_embedder(embedding_config: EmbeddingConfig) -> Any:
api_key=embedding_config.api_key.get_secret_value(), # type: ignore[arg-type]
base_url=base_url,
)
else:
raise ValueError(f"Unknown embedding provider: {provider}")
raise ValueError(f"Unknown embedding provider: {provider}")
Loading
Loading