From a6ce50dca98a03138c310c3adff5e2715797f40f Mon Sep 17 00:00:00 2001
From: Rohit Ghumare
Date: Wed, 22 Apr 2026 12:38:54 +0100
Subject: [PATCH] fix(embedding): derive OpenAI provider dimensions from model

Follow-up to #186. The PR made model configurable via
OPENAI_EMBEDDING_MODEL but left dimensions hardcoded at 1536. A user
pointing at text-embedding-3-large (3072 dims) would see
provider.dimensions return 1536, which vector-store schemas and
hybrid-search weights rely on for correct sizing.

- Add MODEL_DIMENSIONS table: text-embedding-3-small=1536,
  text-embedding-3-large=3072, text-embedding-ada-002=1536.
- Make dimensions a computed readonly field.
- New OPENAI_EMBEDDING_DIMENSIONS env var for custom / self-hosted
  OpenAI-compatible endpoints not in the table. Positive integers only;
  reject non-numeric, partially numeric (e.g. "12abc", "1.5") and
  non-positive values with a clear error.
- Unknown model names fall back to 1536 with the explicit override
  available if the server returns a different size.
- Tests cover known models, dimension override, unknown-model fallback,
  and validation of the override env var.
---
 src/providers/embedding/openai.ts | 53 ++++++++++++++++++++++++++++++++++---
 test/embedding-provider.test.ts   | 55 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 4 deletions(-)

diff --git a/src/providers/embedding/openai.ts b/src/providers/embedding/openai.ts
index 99205c6..308479f 100644
--- a/src/providers/embedding/openai.ts
+++ b/src/providers/embedding/openai.ts
@@ -4,19 +4,60 @@ import { getEnvVar } from "../../config.js";
 const DEFAULT_BASE_URL = "https://api.openai.com";
 const DEFAULT_MODEL = "text-embedding-3-small";
 
+/**
+ * Known OpenAI embedding model dimensions. Extend as new models ship.
+ * Override in any case via OPENAI_EMBEDDING_DIMENSIONS for custom or
+ * self-hosted OpenAI-compatible endpoints returning non-standard sizes.
+ */
+const MODEL_DIMENSIONS: Record<string, number> = {
+  "text-embedding-3-small": 1536,
+  "text-embedding-3-large": 3072,
+  "text-embedding-ada-002": 1536,
+};
+
+const DEFAULT_DIMENSIONS = MODEL_DIMENSIONS[DEFAULT_MODEL] ?? 1536;
+
+/**
+ * Resolve the dimension count the provider reports for `model`.
+ *
+ * A non-blank `override` wins and must be a plain base-10 positive integer.
+ * It is matched against digits only because parseInt would silently accept
+ * partially numeric input such as "12abc" (12) or "1.5" (1). With no
+ * override, known models use MODEL_DIMENSIONS and unknown models fall back
+ * to DEFAULT_DIMENSIONS.
+ *
+ * @throws Error when `override` is set but is not a positive integer.
+ */
+function resolveDimensions(model: string, override: string | undefined): number {
+  const trimmed = override?.trim() ?? "";
+  if (trimmed.length > 0) {
+    const parsed = /^\d+$/.test(trimmed) ? Number(trimmed) : Number.NaN;
+    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
+      throw new Error(
+        `OPENAI_EMBEDDING_DIMENSIONS must be a positive integer, got: ${override}`,
+      );
+    }
+    return parsed;
+  }
+  return MODEL_DIMENSIONS[model] ?? DEFAULT_DIMENSIONS;
+}
+
 /**
  * OpenAI-compatible embedding provider.
  *
  * Required env vars:
- *   OPENAI_API_KEY         — API key
+ *   OPENAI_API_KEY              — API key
  *
  * Optional:
- *   OPENAI_BASE_URL        — base URL without path (default: https://api.openai.com)
- *   OPENAI_EMBEDDING_MODEL — model name (default: text-embedding-3-small)
+ *   OPENAI_BASE_URL             — base URL without path (default: https://api.openai.com)
+ *   OPENAI_EMBEDDING_MODEL      — model name (default: text-embedding-3-small)
+ *   OPENAI_EMBEDDING_DIMENSIONS — override reported dimensions (required for
+ *                                 custom / self-hosted models not in the
+ *                                 MODEL_DIMENSIONS table above)
  */
 export class OpenAIEmbeddingProvider implements EmbeddingProvider {
   readonly name = "openai";
-  readonly dimensions = 1536;
+  readonly dimensions: number;
   private apiKey: string;
   private baseUrl: string;
   private model: string;
@@ -28,5 +69,9 @@ export class OpenAIEmbeddingProvider implements EmbeddingProvider {
       getEnvVar("OPENAI_BASE_URL") || DEFAULT_BASE_URL;
     this.model =
       getEnvVar("OPENAI_EMBEDDING_MODEL") || DEFAULT_MODEL;
+    this.dimensions = resolveDimensions(
+      this.model,
+      getEnvVar("OPENAI_EMBEDDING_DIMENSIONS"),
+    );
   }
 
diff --git a/test/embedding-provider.test.ts b/test/embedding-provider.test.ts
index 2cf3271..48092eb 100644
--- a/test/embedding-provider.test.ts
+++ b/test/embedding-provider.test.ts
@@ -55,6 +55,7 @@ describe("OpenAIEmbeddingProvider", () => {
     process.env = { ...originalEnv };
     delete process.env["OPENAI_BASE_URL"];
     delete process.env["OPENAI_EMBEDDING_MODEL"];
+    delete process.env["OPENAI_EMBEDDING_DIMENSIONS"];
   });
 
   afterEach(() => {
@@ -103,4 +104,58 @@ describe("OpenAIEmbeddingProvider", () => {
 
     fetchSpy.mockRestore();
   });
+
+  it("derives dimensions from model in the known-models table", () => {
+    process.env["OPENAI_EMBEDDING_MODEL"] = "text-embedding-3-large";
+    const large = new OpenAIEmbeddingProvider("test-key");
+    expect(large.dimensions).toBe(3072);
+
+    process.env["OPENAI_EMBEDDING_MODEL"] = "text-embedding-ada-002";
+    const ada = new OpenAIEmbeddingProvider("test-key");
+    expect(ada.dimensions).toBe(1536);
+
+    process.env["OPENAI_EMBEDDING_MODEL"] = "text-embedding-3-small";
+    const small = new OpenAIEmbeddingProvider("test-key");
+    expect(small.dimensions).toBe(1536);
+  });
+
+  it("OPENAI_EMBEDDING_DIMENSIONS overrides the model-derived dimensions", () => {
+    process.env["OPENAI_EMBEDDING_MODEL"] = "text-embedding-3-large";
+    process.env["OPENAI_EMBEDDING_DIMENSIONS"] = "768";
+    const provider = new OpenAIEmbeddingProvider("test-key");
+    expect(provider.dimensions).toBe(768);
+  });
+
+  it("falls back to 1536 for unknown custom models", () => {
+    process.env["OPENAI_EMBEDDING_MODEL"] = "mystery-self-hosted-model";
+    const provider = new OpenAIEmbeddingProvider("test-key");
+    expect(provider.dimensions).toBe(1536);
+  });
+
+  it("rejects invalid OPENAI_EMBEDDING_DIMENSIONS values", () => {
+    process.env["OPENAI_EMBEDDING_DIMENSIONS"] = "not-a-number";
+    expect(() => new OpenAIEmbeddingProvider("test-key")).toThrow(
+      /OPENAI_EMBEDDING_DIMENSIONS must be a positive integer/,
+    );
+
+    process.env["OPENAI_EMBEDDING_DIMENSIONS"] = "-5";
+    expect(() => new OpenAIEmbeddingProvider("test-key")).toThrow(
+      /OPENAI_EMBEDDING_DIMENSIONS must be a positive integer/,
+    );
+
+    process.env["OPENAI_EMBEDDING_DIMENSIONS"] = "0";
+    expect(() => new OpenAIEmbeddingProvider("test-key")).toThrow(
+      /OPENAI_EMBEDDING_DIMENSIONS must be a positive integer/,
+    );
+
+    process.env["OPENAI_EMBEDDING_DIMENSIONS"] = "12abc";
+    expect(() => new OpenAIEmbeddingProvider("test-key")).toThrow(
+      /OPENAI_EMBEDDING_DIMENSIONS must be a positive integer/,
+    );
+
+    process.env["OPENAI_EMBEDDING_DIMENSIONS"] = "1.5";
+    expect(() => new OpenAIEmbeddingProvider("test-key")).toThrow(
+      /OPENAI_EMBEDDING_DIMENSIONS must be a positive integer/,
+    );
+  });
 });