diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..7e85350 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,161 @@ +{ + "name": "secondbrain-engine", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "secondbrain-engine", + "version": "0.1.0", + "dependencies": { + "pg": "^8.20.0" + } + }, + "node_modules/pg": { + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/pg/-/pg-8.20.0.tgz", + "integrity": "sha512-ldhMxz2r8fl/6QkXnBD3CR9/xg694oT6DZQ2s6c/RI28OjtSOpxnPrUCGOBJ46RCUxcWdx3p6kw/xnDHjKvaRA==", + "license": "MIT", + "dependencies": { + "pg-connection-string": "^2.12.0", + "pg-pool": "^3.13.0", + "pg-protocol": "^1.13.0", + "pg-types": "2.2.0", + "pgpass": "1.0.5" + }, + "engines": { + "node": ">= 16.0.0" + }, + "optionalDependencies": { + "pg-cloudflare": "^1.3.0" + }, + "peerDependencies": { + "pg-native": ">=3.0.1" + }, + "peerDependenciesMeta": { + "pg-native": { + "optional": true + } + } + }, + "node_modules/pg-cloudflare": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/pg-cloudflare/-/pg-cloudflare-1.3.0.tgz", + "integrity": "sha512-6lswVVSztmHiRtD6I8hw4qP/nDm1EJbKMRhf3HCYaqud7frGysPv7FYJ5noZQdhQtN2xJnimfMtvQq21pdbzyQ==", + "license": "MIT", + "optional": true + }, + "node_modules/pg-connection-string": { + "version": "2.12.0", + "resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.12.0.tgz", + "integrity": "sha512-U7qg+bpswf3Cs5xLzRqbXbQl85ng0mfSV/J0nnA31MCLgvEaAo7CIhmeyrmJpOr7o+zm0rXK+hNnT5l9RHkCkQ==", + "license": "MIT" + }, + "node_modules/pg-int8": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", + "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/pg-pool": { + "version": "3.13.0", + "resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.13.0.tgz", + "integrity": "sha512-gB+R+Xud1gLFuRD/QgOIgGOBE2KCQPaPwkzBBGC9oG69pHTkhQeIuejVIk3/cnDyX39av2AxomQiyPT13WKHQA==", + "license": "MIT", + "peerDependencies": { + "pg": ">=8.0" + } + }, + "node_modules/pg-protocol": { + "version": "1.13.0", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.13.0.tgz", + "integrity": "sha512-zzdvXfS6v89r6v7OcFCHfHlyG/wvry1ALxZo4LqgUoy7W9xhBDMaqOuMiF3qEV45VqsN6rdlcehHrfDtlCPc8w==", + "license": "MIT" + }, + "node_modules/pg-types": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", + "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", + "license": "MIT", + "dependencies": { + "pg-int8": "1.0.1", + "postgres-array": "~2.0.0", + "postgres-bytea": "~1.0.0", + "postgres-date": "~1.0.4", + "postgres-interval": "^1.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/pgpass": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/pgpass/-/pgpass-1.0.5.tgz", + "integrity": "sha512-FdW9r/jQZhSeohs1Z3sI1yxFQNFvMcnmfuj4WBMUTxOrAyLMaTcE1aAMBiTlbMNaXvBCQuVi0R7hd8udDSP7ug==", + "license": "MIT", + "dependencies": { + "split2": "^4.1.0" + } + }, + "node_modules/postgres-array": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", + "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/postgres-bytea": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.1.tgz", + "integrity": "sha512-5+5HqXnsZPE65IJZSMkZtURARZelel2oXUEO8rH83VS/hxH5vv1uHquPg5wZs8yMAfdv971IU+kcPUczi7NVBQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-date": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", + "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-interval": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", + "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", + "license": "MIT", + "dependencies": { + "xtend": "^4.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/split2": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/split2/-/split2-4.2.0.tgz", + "integrity": "sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==", + "license": "ISC", + "engines": { + "node": ">= 10.x" + } + }, + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "license": "MIT", + "engines": { + "node": ">=0.4" + } + } + } +} diff --git a/package.json b/package.json index 96ecf8a..5d67063 100644 --- a/package.json +++ b/package.json @@ -6,8 +6,11 @@ "type": "module", "scripts": { "typecheck": "tsc --project tsconfig.json", - "build": "tsc --project tsconfig.build.json", + "build": "tsc --project tsconfig.build.json && node scripts/copy-migrations.mjs", "test": "npm run build --silent && sh -c 'node --test \"$@\" test/*.test.mjs' --", "lint": "npx --yes oxlint@1.56.0 $(find src -type d -name upstream -prune -o -name '*.ts' -print) $(find test -name '*.mjs' -print)" + }, + "dependencies": { + "pg": "^8.20.0" } } diff --git a/scripts/copy-migrations.mjs b/scripts/copy-migrations.mjs new file mode 100644 index 0000000..49480ec --- /dev/null +++ b/scripts/copy-migrations.mjs @@ -0,0 +1,7 @@ +import { cpSync } from 'node:fs'; + +cpSync( + 'src/subsystems/supabase/migrations', + 'dist/subsystems/supabase/migrations', + { recursive: true }, +); diff --git a/src/@types/node/index.d.ts b/src/@types/node/index.d.ts index f41d2e5..dd53793 100644 --- a/src/@types/node/index.d.ts +++ b/src/@types/node/index.d.ts @@ -4,6 +4,10 @@ declare namespace NodeJS { SERVICE_NAME?: string; NODE_ENV?: string; PORT?: string; + RELATIONAL_BACKEND?: string; + DATABASE_URL?: string; + DATABASE_SCHEMA?: string; + POSTGRES_AUTO_MIGRATE?: string; } } @@ -11,6 +15,11 @@ declare const process: { env: NodeJS.ProcessEnv; }; +declare class Buffer extends Uint8Array { + static from(data: string | ArrayLike): Buffer; + toString(encoding?: string): string; +} + declare module "node:http" { export interface IncomingMessage { method?: string; @@ -26,7 +35,9 @@ declare module "node:http" { } export interface Server { + listening: boolean; listen(port?: number): Server; + close(callback?: (error?: Error) => void): Server; once(event: string, listener: (...args: any[]) => void): Server; off(event: string, listener: (...args: any[]) => void): Server; } @@ -38,3 +49,16 @@ declare module "node:http" { ) => void, ): Server; } + +declare module "node:crypto" { + export function timingSafeEqual(a: Buffer, b: Buffer): boolean; + export function createHash(algorithm: string): { + update(data: string): { digest(): Buffer }; + }; +} + +declare module "node:fs" { + export function readFileSync(path: string | URL, encoding: "utf8"): string; + export function readdirSync(path: string | URL): string[]; + export function existsSync(path: string | URL): boolean; +} diff --git a/src/@types/pg/index.d.ts b/src/@types/pg/index.d.ts new file mode 100644 index 0000000..61485cb --- /dev/null +++ b/src/@types/pg/index.d.ts @@ -0,0 +1,28 @@ +declare module "pg" { + export interface QueryResult { + rows: Row[]; + rowCount: number | null; + } + + export interface PoolConfig { + connectionString?: string; + } + + export class Pool { + constructor(config?: PoolConfig); + query( + text: string, + values?: readonly unknown[], + ): Promise>; + connect(): Promise; + end(): Promise; + } + + export interface PoolClient { + query( + text: string, + values?: readonly unknown[], + ): Promise>; + release(err?: Error | boolean): void; + } +} diff --git a/src/app/create-app.ts b/src/app/create-app.ts index 0462ae6..3b94169 100644 --- a/src/app/create-app.ts +++ b/src/app/create-app.ts @@ -3,24 +3,51 @@ import type { IncomingMessage, ServerResponse } from "node:http"; import { createHealthRoute, type HealthPayload } from "../api/routes/health.js"; import type { IngestionService } from "../ingestion/service.js"; import type { RetrievalService } from "../retrieval/service.js"; -import type { AppEnv } from "./env.js"; +import type { + WorkspaceContext, + WorkspaceContextResolver, +} from "../workspace/context.js"; +import type { AppEnv, PublicAppEnv } from "./env.js"; export interface AppServices { ingestion: IngestionService; retrieval: RetrievalService; } +export type ProtectedRequestInput = { + workspaceId?: string; + headers: Record; +}; + +export type ProtectedRequestResult = + | { ok: true; context: WorkspaceContext } + | { + ok: false; + statusCode: number; + error: { + code: string; + message: string; + workspaceId?: string; + requestId?: string; + correlationId?: string; + }; + }; + export interface CreateAppOptions { env: AppEnv; services: AppServices; + contextResolver?: WorkspaceContextResolver; now?: () => Date; } export interface SecondBrainEngineApp { - env: AppEnv; + env: PublicAppEnv; services: AppServices; health(): HealthPayload; handleNodeRequest(request: IncomingMessage, response: ServerResponse): void; + resolveProtectedRequest( + input: ProtectedRequestInput, + ): Promise; } export function createApp(options: CreateAppOptions): SecondBrainEngineApp { @@ -29,8 +56,10 @@ export function createApp(options: CreateAppOptions): SecondBrainEngineApp { now: options.now, }); + const { contextResolver } = options; + return { - env: options.env, + env: redactEnv(options.env), services: options.services, health, handleNodeRequest(request, response) { @@ -49,5 +78,121 @@ export function createApp(options: CreateAppOptions): SecondBrainEngineApp { }); response.end(JSON.stringify({ error: "Not Found" })); }, + async resolveProtectedRequest(input) { + if (!contextResolver) { + return { + ok: false, + statusCode: 401, + error: { + code: "auth_disabled", + message: "Protected routes require configured auth", + }, + }; + } + + const requestId = extractHeader(input.headers, "x-request-id"); + const correlationId = extractHeader(input.headers, "x-correlation-id"); + const authorizationHeader = extractHeader( + input.headers, + "authorization", + ); + + const result = await contextResolver.resolve({ + workspaceId: input.workspaceId, + authorizationHeader, + requestId, + correlationId, + }); + + if (result.ok) { + return { ok: true, context: result.context }; + } + + const statusCode = resolveStatusCode(result.code); + return { + ok: false, + statusCode, + error: { + code: result.code, + message: resolveErrorMessage(result.code), + ...("workspaceId" in result && result.workspaceId !== undefined + ? { workspaceId: result.workspaceId } + : {}), + ...("requestId" in result && result.requestId !== undefined + ? { requestId: result.requestId } + : {}), + ...("correlationId" in result && result.correlationId !== undefined + ? { correlationId: result.correlationId } + : {}), + }, + }; + }, }; } + +function redactEnv(env: AppEnv): PublicAppEnv { + const auth: PublicAppEnv["auth"] = + env.auth.mode === "service-token" + ? { mode: "service-token", actorId: env.auth.actorId } + : { mode: "disabled" }; + + const relational: PublicAppEnv["relational"] = + env.relational.backend === "postgres" + ? { + backend: "postgres", + schema: env.relational.schema, + autoMigrate: env.relational.autoMigrate, + } + : { backend: "memory" }; + + return { + serviceName: env.serviceName, + mode: env.mode, + port: env.port, + auth, + relational, + }; +} + +function extractHeader( + headers: Record, + name: string, +): string | undefined { + const lower = name.toLowerCase(); + for (const key of Object.keys(headers)) { + if (key.toLowerCase() === lower) { + const value = headers[key]; + return Array.isArray(value) ? value[0] : value; + } + } + + return undefined; +} + +function resolveStatusCode(code: string): number { + switch (code) { + case "workspace_required": + return 400; + case "workspace_forbidden": + return 403; + default: + return 401; + } +} + +function resolveErrorMessage(code: string): string { + switch (code) { + case "workspace_required": + return "Workspace target is required"; + case "missing_credentials": + return "Missing bearer credentials"; + case "invalid_credentials": + return "Invalid bearer credentials"; + case "auth_disabled": + return "Protected routes require configured auth"; + case "workspace_forbidden": + return "Workspace access denied"; + default: + return "Authorization failed"; + } +} diff --git a/src/app/env.ts b/src/app/env.ts index 7c1491b..223e129 100644 --- a/src/app/env.ts +++ b/src/app/env.ts @@ -1,9 +1,68 @@ export type AppMode = "development" | "test" | "production"; +export type AuthConfig = + | { mode: "disabled" } + | { mode: "service-token"; serviceToken: string; actorId: string }; + +export type PublicAuthConfig = + | { mode: "disabled" } + | { mode: "service-token"; actorId: string }; + +export type RelationalConfig = + | { backend: "memory" } + | { + backend: "postgres"; + databaseUrl: string; + schema?: string; + autoMigrate?: boolean; + }; + +export type PublicRelationalConfig = + | { backend: "memory" } + | { + backend: "postgres"; + schema?: string; + autoMigrate?: boolean; + }; + export interface AppEnv { serviceName: string; mode: AppMode; port: number; + auth: AuthConfig; + relational: RelationalConfig; +} + +export interface PublicAppEnv { + serviceName: string; + mode: AppMode; + port: number; + auth: PublicAuthConfig; + relational: PublicRelationalConfig; +} + +export function redactAppEnv(env: AppEnv): PublicAppEnv { + const auth: PublicAuthConfig = + env.auth.mode === "service-token" + ? { mode: "service-token", actorId: env.auth.actorId } + : { mode: "disabled" }; + + const relational: PublicRelationalConfig = + env.relational.backend === "postgres" + ? { + backend: "postgres", + schema: env.relational.schema, + autoMigrate: env.relational.autoMigrate, + } + : { backend: "memory" }; + + return { + serviceName: env.serviceName, + mode: env.mode, + port: env.port, + auth, + relational, + }; } export function loadAppEnv(source: NodeJS.ProcessEnv = process.env): AppEnv { @@ -11,6 +70,8 @@ export function loadAppEnv(source: NodeJS.ProcessEnv = process.env): AppEnv { serviceName: source.SERVICE_NAME?.trim() || "secondbrain-engine", mode: parseAppMode(source.NODE_ENV), port: parsePort(source.PORT), + auth: parseAuthConfig(source), + relational: parseRelationalConfig(source), }; } @@ -31,3 +92,38 @@ function parsePort(value: string | undefined): number { return 4000; } + +function parseAuthConfig(source: NodeJS.ProcessEnv): AuthConfig { + const mode = source.AUTH_MODE?.trim(); + + if (mode === "service-token") { + const serviceToken = source.AUTH_SERVICE_TOKEN?.trim() ?? ""; + const actorId = source.AUTH_SERVICE_ACTOR_ID?.trim() ?? "service"; + + return { mode: "service-token", serviceToken, actorId }; + } + + return { mode: "disabled" }; +} + +function parseRelationalConfig(source: NodeJS.ProcessEnv): RelationalConfig { + const backend = source.RELATIONAL_BACKEND?.trim(); + + if (backend === "postgres") { + const databaseUrl = source.DATABASE_URL?.trim(); + + if (!databaseUrl) { + throw new Error( + "RELATIONAL_BACKEND=postgres requires DATABASE_URL to be set", + ); + } + + const schema = source.DATABASE_SCHEMA?.trim() || undefined; + const autoMigrate = + source.POSTGRES_AUTO_MIGRATE?.trim().toLowerCase() === "true"; + + return { backend: "postgres", databaseUrl, schema, autoMigrate }; + } + + return { backend: "memory" }; +} diff --git a/src/index.ts b/src/index.ts index ef13a73..865f67c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,5 @@ import { createServer, type Server } from "node:http"; +import { createHash, timingSafeEqual } from "node:crypto"; import { type AppServices, @@ -6,20 +7,48 @@ import { type SecondBrainEngineApp, } from "./app/create-app.js"; import { type AppEnv, loadAppEnv } from "./app/env.js"; +import { + createExtractorRuntime, + type ExtractorRuntime, +} from "./ingestion/extractors/index.js"; import type { IngestionService } from "./ingestion/service.js"; import type { + AdaptiveRetrievalTrace, ContextPacket, + RetrievalCitation, + RetrievalQuery, RetrievalService, RetrievedContextChunk, } from "./retrieval/service.js"; import type { GraphMemoryPort } from "./subsystems/graphiti/port.js"; import type { VectorIndexPort } from "./subsystems/qdrant/port.js"; -import type { RelationalWorkspacePort } from "./subsystems/supabase/port.js"; +import { + createNoopReranker, + type RerankerPort, +} from "./subsystems/reranker/port.js"; +import { + isRepresentationAwareRelationalWorkspacePort, + type RelationalWorkspacePort, + type RepresentationAwareRelationalWorkspacePort, +} from "./subsystems/supabase/port.js"; +import { createPostgresRelationalWorkspaceRepository } from "./subsystems/supabase/postgres.js"; import { createInMemoryRelationalWorkspaceRepository } from "./subsystems/supabase/repository.js"; +import { + type ArtifactStoragePort, + createInMemoryArtifactStoragePort, +} from "./subsystems/supabase/storage-port.js"; +import type { + WorkspaceContext, + WorkspaceContextResolutionInput, + WorkspaceContextResolver, + WorkspaceResolutionResult, +} from "./workspace/context.js"; export interface Runtime { app: SecondBrainEngineApp; server: Server; + ready(): Promise; + close(): Promise; } export interface RuntimeOverrides { @@ -37,35 +66,103 @@ export interface RuntimeSubsystemOverrides { interface InternalSubsystemPorts { workspaceStore: RelationalWorkspacePort; + extractorRuntime: ExtractorRuntime; + artifactStorage: ArtifactStoragePort; graphMemory: GraphMemoryPort; vectorIndex: VectorIndexPort; + reranker?: RerankerPort; + initialize?: () => Promise; + close?: () => Promise; } +const ADAPTIVE_QUERY_STOP_WORDS = new Set([ + "a", + "about", + "an", + "and", + "can", + "do", + "explain", + "for", + "give", + "i", + "in", + "me", + "of", + "on", + "please", + "show", + "tell", + "the", + "to", + "we", + "what", + "you", +]); + +const WORKSPACE_STORE_SYMBOL = Symbol("workspaceStore"); + +type ServiceWithWorkspaceStore = { + [WORKSPACE_STORE_SYMBOL]?: RelationalWorkspacePort; +}; + +type WorkspaceStoreWithLifecycle = RelationalWorkspacePort & { + initialize?: () => Promise; + close?: () => Promise; +}; + export function createRuntime(overrides: RuntimeOverrides = {}): Runtime { const env = overrides.env ?? loadAppEnv(); - const defaultInternal = createDefaultInternalPorts(); - const internal: InternalSubsystemPorts = { - workspaceStore: - overrides.subsystems?.workspaceStore ?? defaultInternal.workspaceStore, - graphMemory: - overrides.subsystems?.graphMemory ?? defaultInternal.graphMemory, - vectorIndex: - overrides.subsystems?.vectorIndex ?? defaultInternal.vectorIndex, + const defaultInternal = createDefaultInternalPorts(env); + const overrideWorkspaceStore = overrides.subsystems?.workspaceStore; + const graphMemory = + overrides.subsystems?.graphMemory ?? defaultInternal.graphMemory; + const vectorIndex = + overrides.subsystems?.vectorIndex ?? defaultInternal.vectorIndex; + const usesDefaultWorkspaceStore = + overrideWorkspaceStore === undefined || + overrideWorkspaceStore === defaultInternal.workspaceStore; + const baseInternal: InternalSubsystemPorts = { + ...defaultInternal, + graphMemory, + vectorIndex, + initialize: usesDefaultWorkspaceStore + ? defaultInternal.initialize + : undefined, + close: usesDefaultWorkspaceStore ? defaultInternal.close : undefined, }; + const runtimeInternal = resolveRuntimeInternalPorts({ + auth: env.auth, + internal: baseInternal, + overrideWorkspaceStore, + services: overrides.services, + }); const services = createAppServices({ - internal, + internal: runtimeInternal, overrides: overrides.services, now: overrides.now, }); const app = createApp({ env, services, + contextResolver: createWorkspaceContextResolver({ + auth: env.auth, + workspaceStore: runtimeInternal.workspaceStore, + }), now: overrides.now, }); + const appServer = createServer(app.handleNodeRequest); return { app, - server: createServer(app.handleNodeRequest), + server: appServer, + async ready() { + await runtimeInternal.initialize?.(); + }, + async close() { + await closeNodeServer(appServer); + await runtimeInternal.close?.(); + }, }; } @@ -79,8 +176,14 @@ export async function startServer( env: port === undefined ? env : { ...env, port }, }); - await listen(runtime.server, runtime.app.env.port); - return runtime; + try { + await runtime.ready(); + await listen(runtime.server, runtime.app.env.port); + return runtime; + } catch (error) { + await runtime.close().catch(() => undefined); + throw error; + } } function createAppServices(input: { @@ -102,228 +205,1121 @@ function createAppServices(input: { }; } -export function createIngestionService(input: { +function resolveRuntimeInternalPorts(input: { + auth: AppEnv["auth"]; + internal: InternalSubsystemPorts; + overrideWorkspaceStore?: RelationalWorkspacePort; + services?: Partial; +}): InternalSubsystemPorts { + const taggedIngestionStore = getWorkspaceStoreFromService( + input.services?.ingestion, + ); + const taggedRetrievalStore = getWorkspaceStoreFromService( + input.services?.retrieval, + ); + const stores = [ + input.overrideWorkspaceStore, + taggedIngestionStore, + taggedRetrievalStore, + ].filter((store): store is RelationalWorkspacePort => store !== undefined); + const uniqueStores = [...new Set(stores)]; + + if (input.auth.mode !== "disabled" && uniqueStores.length > 1) { + throw new Error( + "auth-enabled runtime requires ingestion and retrieval services to share one workspace store", + ); + } + + if (input.auth.mode === "disabled") { + const workspaceStore = + input.overrideWorkspaceStore ?? + taggedIngestionStore ?? + taggedRetrievalStore ?? + input.internal.workspaceStore; + const lifecycleStores = [ + workspaceStore, + taggedIngestionStore, + taggedRetrievalStore, + ].filter((store): store is RelationalWorkspacePort => store !== undefined); + + return attachWorkspaceStoreLifecycle({ + internal: input.internal, + workspaceStore, + lifecycleStores, + }); + } + + if (uniqueStores.length === 1) { + const workspaceStore = uniqueStores[0]; + const hasTaggedService = + taggedIngestionStore !== undefined || taggedRetrievalStore !== undefined; + const hasUntaggedService = + (input.services?.ingestion !== undefined && + taggedIngestionStore === undefined) || + (input.services?.retrieval !== undefined && + taggedRetrievalStore === undefined); + + if (hasTaggedService && hasUntaggedService) { + throw new Error( + "auth-enabled runtime with custom services requires services created by secondbrain-engine factories or tagged workspace stores", + ); + } + + if ( + input.overrideWorkspaceStore !== undefined && + input.services?.ingestion !== undefined && + taggedIngestionStore === undefined + ) { + throw new Error( + "auth-enabled runtime with custom services requires services created by secondbrain-engine factories or tagged workspace stores", + ); + } + + if ( + input.overrideWorkspaceStore !== undefined && + input.services?.retrieval !== undefined && + taggedRetrievalStore === undefined + ) { + throw new Error( + "auth-enabled runtime with custom services requires services created by secondbrain-engine factories or tagged workspace stores", + ); + } + + return attachWorkspaceStoreLifecycle({ + internal: input.internal, + workspaceStore, + lifecycleStores: [workspaceStore], + }); + } + + if ( + input.services?.ingestion !== undefined || + input.services?.retrieval !== undefined + ) { + throw new Error( + "auth-enabled runtime with custom services requires services created by secondbrain-engine factories or tagged workspace stores", + ); + } + + return input.internal; +} + +function attachWorkspaceStoreLifecycle(input: { + internal: InternalSubsystemPorts; workspaceStore: RelationalWorkspacePort; - graphMemory: GraphMemoryPort; - vectorIndex: VectorIndexPort; - now?: () => Date; -}): IngestionService { + lifecycleStores?: RelationalWorkspacePort[]; +}): InternalSubsystemPorts { + const lifecycleStores = [ + ...(input.lifecycleStores ?? [input.workspaceStore]), + ].filter((store, index, stores) => stores.indexOf(store) === index); + + if ( + input.workspaceStore === input.internal.workspaceStore && + lifecycleStores.length === 1 && + lifecycleStores[0] === input.internal.workspaceStore + ) { + return input.internal; + } + + // When internal.initialize/close are defined they already manage internal.workspaceStore, + // so exclude it from the explicit per-store loops to avoid double-invocation. + const storesForInit = + input.internal.initialize !== undefined + ? lifecycleStores.filter((s) => s !== input.internal.workspaceStore) + : lifecycleStores; + const storesForClose = + input.internal.close !== undefined + ? lifecycleStores.filter((s) => s !== input.internal.workspaceStore) + : lifecycleStores; + + const hasStoreInitialize = storesForInit.some( + (store) => (store as WorkspaceStoreWithLifecycle).initialize !== undefined, + ); + const hasStoreClose = storesForClose.some( + (store) => (store as WorkspaceStoreWithLifecycle).close !== undefined, + ); + + const initialize = + hasStoreInitialize || input.internal.initialize !== undefined + ? async () => { + if (hasStoreInitialize) { + for (const store of storesForInit) { + await (store as WorkspaceStoreWithLifecycle).initialize?.(); + } + } + await input.internal.initialize?.(); + } + : undefined; + const close = + hasStoreClose || input.internal.close !== undefined + ? async () => { + const errors: unknown[] = []; + try { + await input.internal.close?.(); + } catch (error) { + errors.push(error); + } + if (hasStoreClose) { + for (const store of [...storesForClose].reverse()) { + try { + await (store as WorkspaceStoreWithLifecycle).close?.(); + } catch (error) { + errors.push(error); + } + } + } + if (errors.length === 1) { + throw errors[0]; + } + if (errors.length > 1) { + throw new AggregateError( + errors, + "Multiple errors occurred while closing lifecycle stores and internal resources.", + ); + } + } + : undefined; + return { - async ingest(command) { - const serviceTimestamp = - input.now?.().toISOString() ?? new Date().toISOString(); - const occurredAt = command.occurredAt ?? serviceTimestamp; - const hasAccess = await input.workspaceStore.validateAccess( - command.workspace, - ); + ...input.internal, + workspaceStore: input.workspaceStore, + initialize, + close, + }; +} - if (!hasAccess) { - throw new Error( - `workspace access denied for ${command.workspace.workspaceId}`, - ); +function withWorkspaceStore( + service: T, + workspaceStore: RelationalWorkspacePort, +): T { + Object.defineProperty(service, WORKSPACE_STORE_SYMBOL, { + value: workspaceStore, + enumerable: false, + configurable: false, + writable: false, + }); + + return service; +} + +function getWorkspaceStoreFromService( + service: IngestionService | RetrievalService | undefined, +): RelationalWorkspacePort | undefined { + return (service as ServiceWithWorkspaceStore | undefined)?.[ + WORKSPACE_STORE_SYMBOL + ]; +} + +function createWorkspaceContextResolver(input: { + auth: AppEnv["auth"]; + workspaceStore: RelationalWorkspacePort; +}): WorkspaceContextResolver { + const serviceTokenDigest = + input.auth.mode !== "disabled" + ? createHash("sha256").update(input.auth.serviceToken).digest() + : null; + + return { + async resolve( + resolutionInput: WorkspaceContextResolutionInput, + ): Promise { + const workspaceId = resolutionInput.workspaceId?.trim(); + const baseFailure = { + ...(resolutionInput.requestId === undefined + ? {} + : { requestId: resolutionInput.requestId }), + ...(resolutionInput.correlationId === undefined + ? {} + : { correlationId: resolutionInput.correlationId }), + ...(workspaceId === undefined || workspaceId === "" + ? {} + : { workspaceId }), + }; + + if (!workspaceId) { + return { + ok: false, + code: "workspace_required", + ...baseFailure, + }; } - const source = await input.workspaceStore.upsertSource({ - workspace: command.workspace, - source: command.source, - occurredAt, - }); + if (input.auth.mode === "disabled") { + return { + ok: false, + code: "auth_disabled", + ...baseFailure, + }; + } - const persistedItems = await input.workspaceStore.persistItems({ - workspace: command.workspace, - source, - items: command.items, - occurredAt, - }); + const bearerToken = parseBearerToken(resolutionInput.authorizationHeader); + if (bearerToken === null) { + return { + ok: false, + code: "missing_credentials", + ...baseFailure, + }; + } - try { - await input.graphMemory.upsertFacts({ - workspaceId: command.workspace.workspaceId, - sourceId: source.sourceId, - facts: persistedItems.map((item) => ({ - itemId: item.itemId, - content: item.content, - contentType: item.contentType, - })), - }); - } catch { - // Canonical relational persistence remains authoritative in this bead. + if ( + bearerToken === undefined || + serviceTokenDigest === null || + !timingSafeEqual( + createHash("sha256").update(bearerToken).digest(), + serviceTokenDigest, + ) + ) { + return { + ok: false, + code: "invalid_credentials", + ...baseFailure, + }; } - try { - await input.vectorIndex.upsertDocuments({ - workspaceId: command.workspace.workspaceId, - documents: persistedItems.map((item) => ({ - documentId: `${item.sourceId}:${item.itemId}`, - text: item.content, - embeddingModel: "pending", - metadata: { - workspaceId: command.workspace.workspaceId, - sourceId: source.sourceId, - itemId: item.itemId, - contentType: item.contentType, - }, - })), - }); - } catch { - // Canonical relational persistence remains authoritative in this bead. + const context: WorkspaceContext = { + workspaceId, + actor: { + actorId: input.auth.actorId, + role: "service", + }, + ...(resolutionInput.requestId === undefined + ? {} + : { requestId: resolutionInput.requestId }), + ...(resolutionInput.correlationId === undefined + ? {} + : { correlationId: resolutionInput.correlationId }), + }; + + const hasAccess = await input.workspaceStore.validateAccess(context); + if (!hasAccess) { + return { + ok: false, + code: "workspace_forbidden", + ...baseFailure, + }; } return { - workspaceId: command.workspace.workspaceId, - sourceId: source.sourceId, - acceptedItems: persistedItems.length, - status: "accepted", - receivedAt: serviceTimestamp, + ok: true, + context, }; }, }; } +function parseBearerToken( + authorizationHeader: string | undefined, +): string | null | undefined { + if (!authorizationHeader) { + return null; + } + + const match = /^Bearer\s+(.+)$/i.exec(authorizationHeader.trim()); + if (!match) { + return undefined; + } + + const token = match[1]?.trim(); + return token ? token : undefined; +} + +export function createIngestionService(input: { + workspaceStore: RelationalWorkspacePort; + graphMemory: GraphMemoryPort; + vectorIndex: VectorIndexPort; + now?: () => Date; +}): IngestionService { + return withWorkspaceStore( + { + async ingest(command) { + const serviceTimestamp = + input.now?.().toISOString() ?? new Date().toISOString(); + const occurredAt = command.occurredAt ?? serviceTimestamp; + const hasAccess = await input.workspaceStore.validateAccess( + command.workspace, + ); + + if (!hasAccess) { + throw new Error( + `workspace access denied for ${command.workspace.workspaceId}`, + ); + } + + const representationAwareStore = getRepresentationAwareWorkspaceStore( + input.workspaceStore, + ); + if ( + (command.artifacts?.length || + command.representations?.length || + command.items.some( + (item) => item.representationId !== undefined, + )) && + !representationAwareStore + ) { + throw new Error( + "workspace store does not support multimodal canonical persistence", + ); + } + + const persisted = representationAwareStore?.persistCanonicalIngestion + ? await representationAwareStore.persistCanonicalIngestion({ + workspace: command.workspace, + source: command.source, + artifacts: command.artifacts, + representations: command.representations, + items: command.items, + occurredAt, + }) + : await (async () => { + const source = await input.workspaceStore.upsertSource({ + workspace: command.workspace, + source: command.source, + occurredAt, + }); + + return { + source, + artifacts: [], + representations: [], + items: await input.workspaceStore.persistItems({ + workspace: command.workspace, + source, + items: command.items, + occurredAt, + }), + }; + })(); + const source = persisted.source; + const persistedItems = persisted.items; + + try { + await input.graphMemory.upsertFacts({ + workspaceId: command.workspace.workspaceId, + sourceId: source.sourceId, + facts: persistedItems.map((item) => ({ + itemId: item.itemId, + content: item.content, + contentType: item.contentType, + })), + }); + } catch { + // Canonical relational persistence remains authoritative in this bead. + } + + try { + await input.vectorIndex.upsertDocuments({ + workspaceId: command.workspace.workspaceId, + documents: persistedItems.map((item) => { + const metadata = { + workspaceId: command.workspace.workspaceId, + sourceId: source.sourceId, + contentType: item.contentType, + itemId: item.itemId, + occurredAt: item.occurredAt, + ...(item.packetKey === undefined + ? {} + : { packetKey: item.packetKey }), + ...(item.sectionKey === undefined + ? {} + : { sectionKey: item.sectionKey }), + }; + + return { + documentId: `${item.sourceId}:${item.itemId}`, + text: item.content, + embeddingModel: "pending", + metadata, + }; + }), + }); + } catch { + // Canonical relational persistence remains authoritative in this bead. + } + + return { + workspaceId: command.workspace.workspaceId, + sourceId: source.sourceId, + acceptedItems: persistedItems.length, + status: "accepted", + receivedAt: serviceTimestamp, + }; + }, + }, + input.workspaceStore, + ); +} + +function getRepresentationAwareWorkspaceStore( + workspaceStore: RelationalWorkspacePort, +): RepresentationAwareRelationalWorkspacePort | null { + return isRepresentationAwareRelationalWorkspacePort(workspaceStore) + ? workspaceStore + : null; +} + export function createRetrievalService( internal: InternalSubsystemPorts, ): RetrievalService { - return { - async retrieve(query) { - const limit = normalizeRetrievalLimit(query.limit); - const hasAccess = await internal.workspaceStore.validateAccess( - query.workspace, - ); - - if (!hasAccess) { - throw new Error( - `workspace access denied for ${query.workspace.workspaceId}`, + return withWorkspaceStore( + { + async retrieve(query) { + const reranker = internal.reranker ?? createNoopReranker(); + const limit = normalizeRetrievalLimit(query.limit); + const hasAccess = await internal.workspaceStore.validateAccess( + query.workspace, ); - } - const relationalSources = await internal.workspaceStore.findSources({ - workspaceId: query.workspace.workspaceId, - queryText: query.queryText, - sourceIds: query.sourceIds, - limit, - }); - const relationalItems = await internal.workspaceStore.findItems({ - workspaceId: query.workspace.workspaceId, - queryText: query.queryText, - sourceIds: query.sourceIds, - limit: limit * 3, - }); - let graphCandidates: Awaited< - ReturnType - > = []; - try { - graphCandidates = await internal.graphMemory.findRelatedContext({ - workspaceId: query.workspace.workspaceId, - queryText: query.queryText, + if (!hasAccess) { + throw new Error( + `workspace access denied for ${query.workspace.workspaceId}`, + ); + } + + const attemptedQueries = buildAdaptiveQueryVariants(query.queryText); + const adaptiveQueries = attemptedQueries.slice(1, 2); + const hasAdaptiveVariant = adaptiveQueries.length > 0; + const topPacketIdsPerPass: string[][] = []; + const initialAttempt = await runRetrievalAttempt({ + internal, + workspace: query.workspace, + queryText: attemptedQueries[0], limit, + sourceIds: query.sourceIds, + reranker, }); - } catch { - graphCandidates = []; - } - - let vectorHits: Awaited> = []; - try { - vectorHits = await internal.vectorIndex.search({ + topPacketIdsPerPass.push( + initialAttempt.rankedPackets.map((packet) => packet.packetId), + ); + let packets = initialAttempt.packets; + let rankedPackets = initialAttempt.rankedPackets; + + let adaptiveTrace: AdaptiveRetrievalTrace = { + strategy: "single-pass", + ran: false, + passes: 1, + triggerReason: getSinglePassTriggerReason( + initialAttempt.rankedPackets, + adaptiveQueries.length, + ), + queriesTried: [attemptedQueries[0]], + topPacketIdsPerPass, + }; + + if ( + limit > 0 && + hasAdaptiveVariant && + shouldRunAdaptivePass(initialAttempt.rankedPackets) + ) { + adaptiveTrace = { + strategy: "keyword-expansion", + ran: true, + passes: 1, + triggerReason: getAdaptiveTriggerReason( + initialAttempt.rankedPackets, + ), + queriesTried: [attemptedQueries[0]], + topPacketIdsPerPass, + }; + + for (const adaptiveQueryText of adaptiveQueries) { + const adaptiveAttempt = await runRetrievalAttempt({ + internal, + workspace: query.workspace, + queryText: adaptiveQueryText, + limit, + sourceIds: query.sourceIds, + reranker, + }); + topPacketIdsPerPass.push( + adaptiveAttempt.rankedPackets.map((packet) => packet.packetId), + ); + adaptiveTrace.queriesTried.push(adaptiveQueryText); + adaptiveTrace.passes += 1; + packets = mergeContextPackets(packets, adaptiveAttempt.packets); + } + + rankedPackets = + packets.length === 0 + ? [] + : await rankContextPackets({ + packets: packets.map((packet) => cloneContextPacket(packet)), + queryText: query.queryText, + limit, + reranker, + }); + } + + return { workspaceId: query.workspace.workspaceId, queryText: query.queryText, - limit, + packets: rankedPackets, + totalPackets: rankedPackets.length, + adaptive: adaptiveTrace, + }; + }, + }, + internal.workspaceStore, + ); +} + +function shouldRunAdaptivePass(packets: ContextPacket[]): boolean { + if (packets.length === 0) { + return true; + } + + const topPacket = packets[0]; + return topPacket.scores.graph === 0 && !hasGroundedVectorEvidence(topPacket); +} + +function getAdaptiveTriggerReason(packets: ContextPacket[]): string { + if (packets.length === 0) { + return "no grounded packets from initial pass"; + } + + const topPacket = packets[0]; + if ( + topPacket.scores.relational > 0 && + topPacket.scores.graph === 0 && + !hasGroundedVectorEvidence(topPacket) + ) { + return "no grounded packets from initial pass"; + } + + return "initial packet evidence weak"; +} + +function getSinglePassTriggerReason( + packets: ContextPacket[], + adaptiveQueryCount: number, +): string { + if (adaptiveQueryCount === 0 && shouldRunAdaptivePass(packets)) { + return "no distinct adaptive query variant available"; + } + + return "initial packet evidence strong enough"; +} + +function hasGroundedVectorEvidence(packet: ContextPacket): boolean { + return ( + packet.scores.vector > 0 && + packet.supportingChunks.some((chunk) => chunk.score > 0) + ); +} + +async function runRetrievalAttempt(input: { + internal: InternalSubsystemPorts; + workspace: WorkspaceContext; + queryText: string; + limit: number; + sourceIds?: RetrievalQuery["sourceIds"]; + reranker: RerankerPort; +}): Promise<{ packets: ContextPacket[]; rankedPackets: ContextPacket[] }> { + if (input.limit === 0) { + return { + packets: [], + rankedPackets: [], + }; + } + + const relationalSources = await input.internal.workspaceStore.findSources({ + workspaceId: input.workspace.workspaceId, + queryText: input.queryText, + sourceIds: input.sourceIds, + limit: input.limit, + }); + const relationalItems = await input.internal.workspaceStore.findItems({ + workspaceId: input.workspace.workspaceId, + queryText: input.queryText, + sourceIds: input.sourceIds, + limit: input.limit * 3, + }); + + let graphCandidates: Awaited< + ReturnType + > = []; + try { + graphCandidates = await input.internal.graphMemory.findRelatedContext({ + workspaceId: input.workspace.workspaceId, + queryText: input.queryText, + limit: input.limit, + }); + } catch { + graphCandidates = []; + } + + let vectorHits: Awaited> = []; + try { + vectorHits = await input.internal.vectorIndex.search({ + workspaceId: input.workspace.workspaceId, + queryText: input.queryText, + limit: input.limit, + sourceIds: input.sourceIds, + }); + } catch { + vectorHits = []; + } + + const expansionSourceIds = collectExpansionSourceIds({ + relationalItems, + graphCandidates, + vectorHits, + sourceIds: input.sourceIds, + }); + + const hydratedRelationalSources = + expansionSourceIds.length === 0 + ? [] + : await hydrateSourceCandidates({ + workspaceStore: input.internal.workspaceStore, + workspaceId: input.workspace.workspaceId, + sourceIds: expansionSourceIds, }); - } catch { - vectorHits = []; + const hydratedRelationalItems = await hydrateVectorMatchedItems({ + workspaceStore: input.internal.workspaceStore, + workspaceId: input.workspace.workspaceId, + vectorHits, + sourceIds: input.sourceIds, + }); + const hydratedGraphItems = await hydrateGraphMatchedItems({ + workspaceStore: input.internal.workspaceStore, + workspaceId: input.workspace.workspaceId, + graphCandidates, + sourceIds: input.sourceIds, + }); + const mergedRelationalItems = mergeItemCandidates( + mergeItemCandidates(relationalItems, hydratedRelationalItems), + hydratedGraphItems, + ); + + const vectorItemsByDocumentId = buildVectorItemsByDocumentId( + mergedRelationalItems, + ); + + const packets = buildContextPackets({ + relationalSources: mergeSourceCandidates( + relationalSources, + hydratedRelationalSources, + ), + relationalItems: mergedRelationalItems, + graphCandidates, + vectorHits, + vectorItemsByDocumentId, + sourceIds: input.sourceIds, + }); + + return { + packets, + rankedPackets: + packets.length === 0 + ? [] + : await rankContextPackets({ + packets: packets.map((packet) => cloneContextPacket(packet)), + queryText: input.queryText, + limit: input.limit, + reranker: input.reranker, + }), + }; +} + +function createDefaultInternalPorts(env: AppEnv): InternalSubsystemPorts { + // Initial upstream source slices are now vendored for all three foundations, but + // these adapters remain intentionally narrow until the internal wiring lands. + const shouldSeedBootstrapWorkspace = env.mode !== "production"; + const defaultWorkspaces = shouldSeedBootstrapWorkspace + ? [ + { + workspaceId: "default-workspace", + slug: "default-workspace", + displayName: "Default Workspace", + }, + ] + : []; + const defaultMemberships = shouldSeedBootstrapWorkspace + ? [ + { + workspaceId: "default-workspace", + actorId: "system", + role: "owner" as const, + }, + ] + : []; + + const postgresWorkspaceStore = + env.relational.backend === "postgres" + ? createPostgresRelationalWorkspaceRepository({ + connectionString: env.relational.databaseUrl, + schema: env.relational.schema, + autoMigrate: env.relational.autoMigrate, + workspaces: defaultWorkspaces, + memberships: defaultMemberships, + }) + : null; + const workspaceStore: RelationalWorkspacePort = + postgresWorkspaceStore ?? + createInMemoryRelationalWorkspaceRepository({ + workspaces: defaultWorkspaces, + memberships: defaultMemberships, + }); + + const graphMemory: GraphMemoryPort = { + async upsertFacts() { + return undefined; + }, + async findRelatedContext() { + return []; + }, + }; + + const vectorIndex: VectorIndexPort = { + async upsertDocuments() { + return undefined; + }, + async search() { + return []; + }, + }; + + return { + workspaceStore, + extractorRuntime: createExtractorRuntime(), + artifactStorage: createInMemoryArtifactStoragePort(), + graphMemory, + vectorIndex, + reranker: createNoopReranker(), + initialize: postgresWorkspaceStore + ? async () => { + await postgresWorkspaceStore.initialize(); + } + : undefined, + close: postgresWorkspaceStore + ? async () => { + await postgresWorkspaceStore.close(); + } + : undefined, + }; +} + +function closeNodeServer(server: Server): Promise { + return new Promise((resolve, reject) => { + if (!server.listening) { + resolve(); + return; + } + + try { + server.close((error?: Error) => { + if (error) { + reject(error); + return; + } + + resolve(); + }); + } catch (error) { + reject(error); + } + }); +} + +function buildAdaptiveQueryVariants(queryText: string): string[] { + const normalizedTerms = queryText + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter((term) => term.length > 0); + const normalizedQueryText = normalizedTerms.join(" ").trim(); + const focusedTerms = normalizedTerms.filter( + (term) => !ADAPTIVE_QUERY_STOP_WORDS.has(term), + ); + const focusedQueryText = focusedTerms.join(" ").trim(); + + if ( + focusedQueryText.length === 0 || + focusedQueryText === normalizedQueryText + ) { + return [queryText]; + } + + return Array.from( + new Set( + [queryText, focusedQueryText].filter( + (candidate) => candidate.trim().length > 0, + ), + ), + ); +} + +function mergeContextPackets( + basePackets: ContextPacket[], + extraPackets: ContextPacket[], +): ContextPacket[] { + const packetsById = new Map(); + + for (const packet of [...basePackets, ...extraPackets]) { + const existing = packetsById.get(packet.packetId); + + if (!existing) { + const cloned = cloneContextPacket(packet); + packetsById.set(packet.packetId, cloned); + continue; + } + + existing.title ??= packet.title; + existing.uri ??= packet.uri; + existing.kind ??= packet.kind; + existing.scores.relational = Math.max( + existing.scores.relational, + packet.scores.relational, + ); + existing.scores.graph = Math.max( + existing.scores.graph, + packet.scores.graph, + ); + existing.scores.vector = Math.max( + existing.scores.vector, + packet.scores.vector, + ); + existing.scores.combined = + existing.scores.relational + + existing.scores.graph + + existing.scores.vector; + const existingChunkOrder = existing.supportingChunks.slice(); + + for (const chunk of packet.supportingChunks) { + mergePacketChunk(existing, chunk); + } + existing.supportingChunks = mergeOrderedPacketChunks( + existingChunkOrder, + existing.supportingChunks, + packet.supportingChunks, + ); + + existing.citations = dedupeCitations([ + ...existing.citations, + ...packet.citations, + ]); + } + + for (const packet of packetsById.values()) { + packet.citations = dedupeCitations([ + ...packet.supportingChunks.flatMap((chunk) => chunk.citations), + ...packet.citations, + ]); + } + + return Array.from(packetsById.values()); +} + +function mergeOrderedPacketChunks( + existingChunksBeforeMerge: RetrievedContextChunk[], + mergedChunks: RetrievedContextChunk[], + incomingChunks: RetrievedContextChunk[], +): RetrievedContextChunk[] { + const mergedChunksById = new Map( + mergedChunks.map((chunk) => [chunk.chunkId, chunk]), + ); + const orderedChunks = existingChunksBeforeMerge.map( + (chunk) => mergedChunksById.get(chunk.chunkId) ?? chunk, + ); + + for (const [index, incomingChunk] of incomingChunks.entries()) { + const mergedIncomingChunk = + mergedChunksById.get(incomingChunk.chunkId) ?? incomingChunk; + + if ( + orderedChunks.some( + (chunk) => chunk.chunkId === mergedIncomingChunk.chunkId, + ) + ) { + continue; + } + + const previousKnownChunk = incomingChunks + .slice(0, index) + .reverse() + .find((chunk) => + orderedChunks.some((candidate) => candidate.chunkId === chunk.chunkId), + ); + const nextKnownChunk = incomingChunks + .slice(index + 1) + .find((chunk) => + orderedChunks.some((candidate) => candidate.chunkId === chunk.chunkId), + ); + + if (nextKnownChunk) { + const nextIndex = orderedChunks.findIndex( + (chunk) => chunk.chunkId === nextKnownChunk.chunkId, + ); + orderedChunks.splice(nextIndex, 0, mergedIncomingChunk); + continue; + } + + if (previousKnownChunk) { + const previousIndex = orderedChunks.findIndex( + (chunk) => chunk.chunkId === previousKnownChunk.chunkId, + ); + orderedChunks.splice(previousIndex + 1, 0, mergedIncomingChunk); + continue; + } + + orderedChunks.push(mergedIncomingChunk); + } + + return orderedChunks; +} + +function cloneContextPacket(packet: ContextPacket): ContextPacket { + return { + ...packet, + citations: packet.citations.map((citation) => ({ ...citation })), + supportingChunks: packet.supportingChunks.map((chunk) => ({ + ...chunk, + citations: chunk.citations.map((citation) => ({ ...citation })), + })), + scores: { ...packet.scores }, + }; +} + +function mergePacketChunk( + packet: ContextPacket, + chunk: RetrievedContextChunk, +): void { + const existing = packet.supportingChunks.find( + (candidate) => candidate.chunkId === chunk.chunkId, + ); + + if (!existing) { + packet.supportingChunks.push({ + ...chunk, + citations: chunk.citations.map((citation) => ({ ...citation })), + }); + return; + } + + existing.score = Math.max(existing.score, chunk.score); + existing.citations = dedupeCitations([ + ...existing.citations, + ...chunk.citations, + ]); +} + +async function rankContextPackets(input: { + packets: ContextPacket[]; + queryText: string; + limit: number; + reranker: RerankerPort; +}): Promise { + const fusedPackets = applyFusionScores(input.packets); + const shortlistLimit = input.limit > 0 ? input.limit * 3 : input.limit; + const shortlist = fusedPackets + .slice() + .sort((left, right) => { + return ( + right.scores.fused - left.scores.fused || + right.scores.combined - left.scores.combined || + left.packetId.localeCompare(right.packetId) + ); + }) + .slice(0, shortlistLimit); + + let rerankResults: Awaited> = []; + try { + rerankResults = await input.reranker.rerank({ + queryText: input.queryText, + packets: shortlist, + limit: input.limit, + }); + } catch { + rerankResults = await createNoopReranker().rerank({ + queryText: input.queryText, + packets: shortlist, + limit: input.limit, + }); + } + + const rerankScoreByPacketId = new Map( + rerankResults.map((result) => [result.packetId, result.score]), + ); + const omittedRerankScore = getOmittedRerankScore(rerankResults); + + for (const packet of shortlist) { + packet.scores.reranked = + rerankScoreByPacketId.get(packet.packetId) ?? omittedRerankScore; + } + + return shortlist + .sort((left, right) => { + const leftHasExplicitRerankScore = rerankScoreByPacketId.has( + left.packetId, + ); + const rightHasExplicitRerankScore = rerankScoreByPacketId.has( + right.packetId, + ); + + if (leftHasExplicitRerankScore !== rightHasExplicitRerankScore) { + return leftHasExplicitRerankScore ? -1 : 1; } - const expansionSourceIds = collectExpansionSourceIds({ - relationalItems, - graphCandidates, - vectorHits, - sourceIds: query.sourceIds, - }); + return ( + right.scores.reranked - left.scores.reranked || + right.scores.fused - left.scores.fused || + right.scores.combined - left.scores.combined || + left.packetId.localeCompare(right.packetId) + ); + }) + .slice(0, input.limit); +} - const hydratedRelationalSources = - expansionSourceIds.length === 0 - ? [] - : await hydrateSourceCandidates({ - workspaceStore: internal.workspaceStore, - workspaceId: query.workspace.workspaceId, - sourceIds: expansionSourceIds, - }); - const vectorItemsByDocumentId = await hydrateVectorItemsByDocumentId({ - workspaceStore: internal.workspaceStore, - workspaceId: query.workspace.workspaceId, - vectorHits, - sourceIds: query.sourceIds, +function applyFusionScores(packets: ContextPacket[]): ContextPacket[] { + const fusedScores = new Map(); + const rankConstant = 60; + const lanes: Array = [ + "relational", + "graph", + "vector", + ]; + + for (const lane of lanes) { + const rankedPackets = packets + .filter((packet) => packet.scores[lane] > 0) + .sort((left, right) => { + return ( + right.scores[lane] - left.scores[lane] || + left.packetId.localeCompare(right.packetId) + ); }); - const packets = - limit === 0 - ? [] - : buildContextPackets({ - relationalSources: mergeSourceCandidates( - relationalSources, - hydratedRelationalSources, - ), - relationalItems, - graphCandidates, - vectorHits, - vectorItemsByDocumentId, - sourceIds: query.sourceIds, - limit, - }); + rankedPackets.forEach((packet, index) => { + fusedScores.set( + packet.packetId, + (fusedScores.get(packet.packetId) ?? 0) + + 1 / (rankConstant + index + 1), + ); + }); + } - return { - workspaceId: query.workspace.workspaceId, - queryText: query.queryText, - packets, - totalPackets: packets.length, - }; - }, - }; -} + for (const packet of packets) { + packet.scores.fused = fusedScores.get(packet.packetId) ?? 0; + packet.scores.reranked = 0; + } -function createDefaultInternalPorts(): InternalSubsystemPorts { - // Initial upstream source slices are now vendored for all three foundations, but - // these adapters remain intentionally narrow until the internal wiring lands. - const workspaceStore: RelationalWorkspacePort = - createInMemoryRelationalWorkspaceRepository({ - workspaces: [ - { - workspaceId: "default-workspace", - slug: "default-workspace", - displayName: "Default Workspace", - }, - ], - memberships: [ - { - workspaceId: "default-workspace", - actorId: "system", - role: "owner", - }, - ], - }); + return packets; +} - const graphMemory: GraphMemoryPort = { - async upsertFacts() { - return undefined; - }, - async findRelatedContext() { - return []; - }, - }; +function getOmittedRerankScore( + rerankResults: Awaited>, +): number { + if (rerankResults.length === 0) { + return 0; + } - const vectorIndex: VectorIndexPort = { - async upsertDocuments() { - return undefined; - }, - async search() { - return []; - }, - }; + const lowestExplicitScore = rerankResults.reduce( + (lowest, result) => (result.score < lowest ? result.score : lowest), + rerankResults[0].score, + ); - return { - workspaceStore, - graphMemory, - vectorIndex, - }; + return Number.isFinite(lowestExplicitScore) + ? lowestExplicitScore - 1 + : lowestExplicitScore; } function listen(server: Server, port: number): Promise { @@ -356,16 +1352,50 @@ function buildContextPackets(input: { Awaited>[number]["item"] >; sourceIds?: string[]; - limit: number; }): ContextPacket[] { - const packetsBySource = new Map(); + const packetsByKey = new Map(); + const packetKeysBySourceId = new Map>(); + const chunkSortKeyById = new Map< + string, + { ordinal: number; itemId: string } + >(); + const sourceRelationalScoresBySourceId = new Map( + input.relationalSources.map((candidate) => [ + candidate.source.sourceId, + candidate.score, + ]), + ); + const sourceMetadataBySourceId = new Map( + input.relationalSources.map((candidate) => [ + candidate.source.sourceId, + candidate.source, + ]), + ); + const itemsBySourceItemId = new Map( + input.relationalItems.map((candidate) => [ + `${candidate.item.sourceId}:${candidate.item.itemId}`, + candidate.item, + ]), + ); const allowedSourceIds = input.sourceIds ? new Set(input.sourceIds) : undefined; for (const candidate of input.relationalSources) { - packetsBySource.set(candidate.source.sourceId, { - packetId: `packet:${candidate.source.sourceId}`, + const packetMapKey = getPacketMapKey( + candidate.source.sourceId, + candidate.source.sourceGroupKey, + ); + registerPacketSource( + packetKeysBySourceId, + candidate.source.sourceId, + packetMapKey, + ); + packetsByKey.set(packetMapKey, { + packetId: getPacketId( + candidate.source.sourceId, + candidate.source.sourceGroupKey, + ), sourceId: candidate.source.sourceId, title: candidate.source.title, uri: candidate.source.uri, @@ -377,17 +1407,58 @@ function buildContextPackets(input: { graph: 0, vector: 0, combined: candidate.score, + fused: 0, + reranked: 0, }, }); } for (const candidate of input.relationalItems) { - const packet = ensurePacket(packetsBySource, candidate.item.sourceId); + const chunkId = `${candidate.item.sourceId}:${candidate.item.itemId}`; + const existingChunkSortKey = chunkSortKeyById.get(chunkId); + const nextChunkSortKey = { + ordinal: candidate.item.ordinal ?? Number.MAX_SAFE_INTEGER, + itemId: candidate.item.itemId, + }; + + if ( + existingChunkSortKey === undefined || + nextChunkSortKey.ordinal < existingChunkSortKey.ordinal || + (nextChunkSortKey.ordinal === existingChunkSortKey.ordinal && + nextChunkSortKey.itemId.localeCompare(existingChunkSortKey.itemId) < 0) + ) { + chunkSortKeyById.set(chunkId, nextChunkSortKey); + } + + const packetMapKey = getPacketMapKey( + candidate.item.sourceId, + candidate.item.packetKey ?? candidate.item.sourceId, + ); + registerPacketSource( + packetKeysBySourceId, + candidate.item.sourceId, + packetMapKey, + ); + const packet = ensurePacket( + packetsByKey, + packetMapKey, + getPacketId( + candidate.item.sourceId, + candidate.item.packetKey ?? candidate.item.sourceId, + ), + candidate.item.sourceId, + ); + applyPacketSourceMetadata( + packet, + sourceMetadataBySourceId.get(candidate.item.sourceId), + ); const chunk = createChunkFromItem(candidate.item, candidate.score); appendChunk(packet, chunk); + const sourceRelationalScore = + sourceRelationalScoresBySourceId.get(candidate.item.sourceId) ?? 0; packet.scores.relational = Math.max( packet.scores.relational, - candidate.score, + sourceRelationalScore + candidate.score, ); packet.scores.combined = packet.scores.relational + packet.scores.graph + packet.scores.vector; @@ -395,7 +1466,16 @@ function buildContextPackets(input: { for (const candidate of input.graphCandidates) { const packetSourceId = candidate.sourceId; - const existingPacket = packetsBySource.get(packetSourceId); + const sourceMetadata = sourceMetadataBySourceId.get(packetSourceId); + const matchedItem = candidate.itemId + ? itemsBySourceItemId.get(`${packetSourceId}:${candidate.itemId}`) + : undefined; + const anchorPacketMapKey = matchedItem + ? getPacketMapKey( + packetSourceId, + matchedItem.packetKey ?? matchedItem.sourceId, + ) + : getPacketMapKey(packetSourceId, sourceMetadata?.sourceGroupKey); if ( allowedSourceIds !== undefined && @@ -404,14 +1484,16 @@ function buildContextPackets(input: { continue; } - if (!existingPacket) { + const packet = packetsByKey.get(anchorPacketMapKey); + + if (!packet) { continue; } - if (!appendGraphChunk(existingPacket, candidate)) { + applyPacketSourceMetadata(packet, sourceMetadata); + if (!appendGraphChunk(packet, candidate)) { continue; } - const packet = existingPacket; packet.scores.graph = Math.max(packet.scores.graph, candidate.score); packet.scores.combined = packet.scores.relational + packet.scores.graph + packet.scores.vector; @@ -422,25 +1504,66 @@ function buildContextPackets(input: { continue; } - if (!packetsBySource.has(hit.sourceId)) { + const vectorItem = + input.vectorItemsByDocumentId.get( + vectorHitKey(hit.sourceId, hit.documentId, hit.itemId), + ) ?? + (hit.itemId === undefined + ? input.vectorItemsByDocumentId.get(hit.documentId) + : undefined); + const packetMapKey = vectorItem + ? getPacketMapKey(hit.sourceId, vectorItem.packetKey ?? hit.sourceId) + : resolveFallbackVectorPacketMapKey({ + packetKeysBySourceId, + sourceId: hit.sourceId, + sourceGroupKey: sourceMetadataBySourceId.get(hit.sourceId) + ?.sourceGroupKey, + }); + + if ( + packetMapKey && + allowedSourceIds !== undefined && + !Array.from(packetKeysBySourceId.entries()).some( + ([sourceId, packetKeys]) => + allowedSourceIds.has(sourceId) && packetKeys.has(packetMapKey), + ) + ) { + continue; + } + + if (!packetMapKey) { continue; } - const packet = ensurePacket(packetsBySource, hit.sourceId); - const vectorItem = input.vectorItemsByDocumentId.get( - vectorHitKey(hit.sourceId, hit.documentId, hit.itemId), + const packet = packetsByKey.get(packetMapKey); + if (!packet) { + continue; + } + applyPacketSourceMetadata( + packet, + sourceMetadataBySourceId.get(hit.sourceId), ); if (vectorItem) { const chunk = createChunkFromItem(vectorItem, hit.score); - appendChunk(packet, chunk); - packet.scores.vector = Math.max(packet.scores.vector, hit.score); - packet.scores.combined = - packet.scores.relational + packet.scores.graph + packet.scores.vector; + mergePacketChunk(packet, chunk); + } else if (packet.supportingChunks.length === 0) { + appendChunk(packet, createChunkFromVectorHit(hit)); + } else { + packet.citations.push({ + sourceId: hit.sourceId, + snippet: hit.snippet, + location: hit.itemId ?? hit.documentId, + itemId: hit.itemId, + }); } + + packet.scores.vector = Math.max(packet.scores.vector, hit.score); + packet.scores.combined = + packet.scores.relational + packet.scores.graph + packet.scores.vector; } - for (const packet of packetsBySource.values()) { + for (const packet of packetsByKey.values()) { if (packet.supportingChunks.length > 0) { continue; } @@ -455,10 +1578,15 @@ function buildContextPackets(input: { } } - return Array.from(packetsBySource.values()) + return Array.from(packetsByKey.values()) + .map((packet) => orderPacketChunks(packet, chunkSortKeyById)) .filter((packet) => packet.supportingChunks.length > 0) - .sort((left, right) => right.scores.combined - left.scores.combined) - .slice(0, input.limit); + .sort((left, right) => { + return ( + right.scores.combined - left.scores.combined || + left.packetId.localeCompare(right.packetId) + ); + }); } function collectExpansionSourceIds(input: { @@ -507,6 +1635,27 @@ function collectExpansionSourceIds(input: { return Array.from(collected.values()); } +function mergeItemCandidates( + base: Awaited>, + extra: Awaited>, +): Awaited> { + const merged = new Map< + string, + Awaited>[number] + >(); + + for (const candidate of [...base, ...extra]) { + const key = `${candidate.item.sourceId}:${candidate.item.itemId}`; + const existing = merged.get(key); + + if (!existing || candidate.score > existing.score) { + merged.set(key, candidate); + } + } + + return Array.from(merged.values()); +} + async function hydrateSourceCandidates(input: { workspaceStore: RelationalWorkspacePort; workspaceId: string; @@ -526,17 +1675,12 @@ async function hydrateSourceCandidates(input: { return hydrated.filter((candidate) => candidate !== null); } -async function hydrateVectorItemsByDocumentId(input: { +async function hydrateVectorMatchedItems(input: { workspaceStore: RelationalWorkspacePort; workspaceId: string; vectorHits: Awaited>; sourceIds?: string[]; -}): Promise< - Map< - string, - Awaited>[number]["item"] - > -> { +}): Promise>> { const allowedSourceIds = input.sourceIds ? new Set(input.sourceIds) : undefined; @@ -567,26 +1711,158 @@ async function hydrateVectorItemsByDocumentId(input: { ); } - const lookup = new Map< + const candidates = new Map< string, - Awaited>[number]["item"] + Awaited>[number] >(); - for (const hit of input.vectorHits.filter((hit) => - allowedSourceIds === undefined ? true : allowedSourceIds.has(hit.sourceId), - )) { - const matchedItem = itemsBySource - .get(hit.sourceId) - ?.find((item) => matchesVectorHit(item, hit)); + for (const hit of input.vectorHits) { + if (allowedSourceIds !== undefined && !allowedSourceIds.has(hit.sourceId)) { + continue; + } + + const persistedItems = itemsBySource.get(hit.sourceId) ?? []; + const matchedItem = persistedItems.find((item) => + matchesVectorHit(item, hit), + ); + + if (!matchedItem) { + continue; + } - if (matchedItem) { - lookup.set( - vectorHitKey(hit.sourceId, hit.documentId, hit.itemId), - matchedItem, + const packetItems = persistedItems + .filter( + (item) => + (item.packetKey ?? item.sourceId) === + (matchedItem.packetKey ?? matchedItem.sourceId), + ) + .sort( + (left, right) => + (left.ordinal ?? Number.MAX_SAFE_INTEGER) - + (right.ordinal ?? Number.MAX_SAFE_INTEGER) || + left.itemId.localeCompare(right.itemId), ); + + for (const item of packetItems) { + const candidateKey = `${item.sourceId}:${item.itemId}`; + if (!candidates.has(candidateKey)) { + candidates.set(candidateKey, { item, score: 0 }); + } + } + } + + return Array.from(candidates.values()); +} + +async function hydrateGraphMatchedItems(input: { + workspaceStore: RelationalWorkspacePort; + workspaceId: string; + graphCandidates: Awaited>; + sourceIds?: string[]; +}): Promise>> { + const allowedSourceIds = input.sourceIds + ? new Set(input.sourceIds) + : undefined; + const sourceIds = Array.from( + new Set( + input.graphCandidates + .map((candidate) => candidate.sourceId) + .filter((sourceId) => + allowedSourceIds === undefined + ? true + : allowedSourceIds.has(sourceId), + ), + ), + ); + + const itemsBySource = new Map< + string, + Awaited> + >(); + + for (const sourceId of sourceIds) { + itemsBySource.set( + sourceId, + await input.workspaceStore.getPersistedItems({ + workspaceId: input.workspaceId, + sourceId, + }), + ); + } + + const candidates = new Map< + string, + Awaited>[number] + >(); + + for (const candidate of input.graphCandidates) { + if ( + candidate.itemId === undefined || + (allowedSourceIds !== undefined && + !allowedSourceIds.has(candidate.sourceId)) + ) { + continue; + } + + const persistedItems = itemsBySource.get(candidate.sourceId) ?? []; + const matchedItem = persistedItems.find( + (item) => item.itemId === candidate.itemId, + ); + + if (!matchedItem) { + continue; + } + + const packetItems = persistedItems + .filter( + (item) => + (item.packetKey ?? item.sourceId) === + (matchedItem.packetKey ?? matchedItem.sourceId), + ) + .sort( + (left, right) => + (left.ordinal ?? Number.MAX_SAFE_INTEGER) - + (right.ordinal ?? Number.MAX_SAFE_INTEGER) || + left.itemId.localeCompare(right.itemId), + ); + + for (const item of packetItems) { + const candidateKey = `${item.sourceId}:${item.itemId}`; + if (!candidates.has(candidateKey)) { + candidates.set(candidateKey, { item, score: 0 }); + } } } + return Array.from(candidates.values()); +} + +function buildVectorItemsByDocumentId( + items: Awaited>, +): Map< + string, + Awaited>[number]["item"] +> { + const lookup = new Map< + string, + Awaited>[number]["item"] + >(); + + for (const candidate of items) { + lookup.set( + vectorHitKey( + candidate.item.sourceId, + `${candidate.item.sourceId}:${candidate.item.itemId}`, + candidate.item.itemId, + ), + candidate.item, + ); + lookup.set( + `${candidate.item.sourceId}:${candidate.item.itemId}`, + candidate.item, + ); + } + return lookup; } @@ -613,6 +1889,33 @@ function vectorHitKey( : `${sourceId}:${documentId}:${itemId}`; } +function resolveFallbackVectorPacketMapKey(input: { + packetKeysBySourceId: Map>; + sourceId: string; + sourceGroupKey?: string; +}): string | undefined { + const packetMapKeys = Array.from( + input.packetKeysBySourceId.get(input.sourceId) ?? [], + ); + + if (packetMapKeys.length === 1) { + return packetMapKeys[0]; + } + + if (input.sourceGroupKey !== undefined) { + const anchorPacketMapKey = getPacketMapKey( + input.sourceId, + input.sourceGroupKey, + ); + + if (packetMapKeys.includes(anchorPacketMapKey)) { + return anchorPacketMapKey; + } + } + + return undefined; +} + function mergeSourceCandidates( base: Awaited>, extra: Awaited>, @@ -642,17 +1945,19 @@ function normalizeRetrievalLimit(limit?: number): number { } function ensurePacket( - packetsBySource: Map, + packetsByKey: Map, + packetKey: string, + packetId: string, sourceId: string, ): ContextPacket { - const existing = packetsBySource.get(sourceId); + const existing = packetsByKey.get(packetKey); if (existing) { return existing; } const created: ContextPacket = { - packetId: `packet:${sourceId}`, + packetId, sourceId, citations: [], supportingChunks: [], @@ -661,13 +1966,55 @@ function ensurePacket( graph: 0, vector: 0, combined: 0, + fused: 0, + reranked: 0, }, }; - packetsBySource.set(sourceId, created); + packetsByKey.set(packetKey, created); return created; } +function applyPacketSourceMetadata( + packet: ContextPacket, + source?: { title?: string; uri?: string; kind?: string }, +): void { + if (!source) { + return; + } + + packet.title ??= source.title; + packet.uri ??= source.uri; + packet.kind ??= source.kind; +} + +function getPacketMapKey(sourceId: string, packetKey?: string): string { + return packetKey && packetKey !== sourceId + ? `${sourceId}:${packetKey}` + : sourceId; +} + +function getPacketId(sourceId: string, packetKey?: string): string { + return packetKey && packetKey !== sourceId + ? `packet:${sourceId}:${packetKey}` + : `packet:${sourceId}`; +} + +function registerPacketSource( + packetKeysBySourceId: Map>, + sourceId: string, + packetMapKey: string, +): void { + const existing = packetKeysBySourceId.get(sourceId); + + if (existing) { + existing.add(packetMapKey); + return; + } + + packetKeysBySourceId.set(sourceId, new Set([packetMapKey])); +} + function createChunkFromItem( item: Awaited< ReturnType @@ -682,7 +2029,8 @@ function createChunkFromItem( { sourceId: item.sourceId, snippet: item.content, - location: item.itemId, + location: item.provenanceLocation ?? item.itemId, + itemId: item.itemId, }, ], }; @@ -746,6 +2094,27 @@ function appendGraphChunk( return true; } +function createChunkFromVectorHit( + hit: Awaited>[number], +): RetrievedContextChunk { + const location = hit.itemId ?? hit.documentId; + const snippet = hit.snippet.length > 0 ? hit.snippet : location; + + return { + chunkId: `vector:${hit.sourceId}:${location}`, + text: snippet, + score: hit.score, + citations: [ + { + sourceId: hit.sourceId, + snippet, + location, + itemId: hit.itemId, + }, + ], + }; +} + function appendChunk( packet: ContextPacket, chunk: RetrievedContextChunk, @@ -761,3 +2130,49 @@ function appendChunk( packet.supportingChunks.push(chunk); packet.citations.push(...chunk.citations); } + +function orderPacketChunks( + packet: ContextPacket, + chunkSortKeyById: Map, +): ContextPacket { + const existingCitations = packet.citations.slice(); + packet.supportingChunks.sort((left, right) => { + const leftSortKey = chunkSortKeyById.get(left.chunkId) ?? { + ordinal: Number.MAX_SAFE_INTEGER, + itemId: left.chunkId, + }; + const rightSortKey = chunkSortKeyById.get(right.chunkId) ?? { + ordinal: Number.MAX_SAFE_INTEGER, + itemId: right.chunkId, + }; + + return ( + leftSortKey.ordinal - rightSortKey.ordinal || + leftSortKey.itemId.localeCompare(rightSortKey.itemId) + ); + }); + packet.citations = dedupeCitations([ + ...packet.supportingChunks.flatMap((chunk) => chunk.citations), + ...existingCitations, + ]); + return packet; +} + +function dedupeCitations(citations: RetrievalCitation[]): RetrievalCitation[] { + const deduped = new Map(); + + for (const citation of citations) { + const key = [ + citation.sourceId, + citation.itemId ?? "", + citation.location ?? "", + citation.snippet, + ].join("|"); + + if (!deduped.has(key)) { + deduped.set(key, citation); + } + } + + return Array.from(deduped.values()); +} diff --git a/src/ingestion/extractors/index.ts b/src/ingestion/extractors/index.ts new file mode 100644 index 0000000..c47bd2a --- /dev/null +++ b/src/ingestion/extractors/index.ts @@ -0,0 +1,9 @@ +export interface ExtractorRuntime { + readonly initialized: boolean; +} + +export function createExtractorRuntime(): ExtractorRuntime { + return { + initialized: true, + }; +} diff --git a/src/ingestion/representations.ts b/src/ingestion/representations.ts new file mode 100644 index 0000000..b3d1159 --- /dev/null +++ b/src/ingestion/representations.ts @@ -0,0 +1,127 @@ +import type { + IngestionMetadataValue, + SourceRepresentation, + SourceRepresentationType, +} from "./service.js"; + +export const defaultRepresentationTypes: SourceRepresentationType[] = [ + "normalized-text", + "html-cleaned", + "pdf-page-text", + "ocr-text", + "table-projection", + "transcript-segment", + "meeting-note-segment", + "caption-text", +]; + +export function normalizeRepresentationMetadata( + metadata: Record | undefined, +): Record { + if (!metadata) { + return {}; + } + + return structuredClone(metadata); +} + +export interface RepresentationExtractor { + type: SourceRepresentationType; + extract(input: { + sourceId: string; + artifactId?: string; + metadata?: Record; + }): Promise; +} + +export interface RepresentationExtractorRegistry { + listTypes(): SourceRepresentationType[]; + register(extractor: RepresentationExtractor): void; + get(type: SourceRepresentationType): RepresentationExtractor | null; +} + +export function createDefaultExtractorRegistry(): RepresentationExtractorRegistry { + const registry = new Map(); + + for (const type of defaultRepresentationTypes) { + registry.set(type, { + type, + async extract(input) { + return [ + { + representationId: createRepresentationId({ + sourceId: input.sourceId, + type, + artifactId: input.artifactId, + metadata: input.metadata, + }), + representationType: type, + artifactId: input.artifactId, + metadata: normalizeRepresentationMetadata(input.metadata), + }, + ]; + }, + }); + } + + return { + listTypes() { + return [...registry.keys()].sort(); + }, + register(extractor) { + registry.set(extractor.type, extractor); + }, + get(type) { + return registry.get(type) ?? null; + }, + }; +} + +function createRepresentationId(input: { + sourceId: string; + type: SourceRepresentationType; + artifactId?: string; + metadata?: Record; +}): string { + const fingerprint = hashStableString( + stableSerializeMetadata({ + artifactId: input.artifactId ?? null, + metadata: input.metadata ?? null, + }), + ); + + return `${input.sourceId}:${input.type}:${fingerprint}`; +} + +function hashStableString(value: string): string { + let hash = 0xcbf29ce484222325n; + const prime = 0x100000001b3n; + const mask = 0xffffffffffffffffn; + + for (const codePoint of value) { + hash ^= BigInt(codePoint.codePointAt(0) ?? 0); + hash = (hash * prime) & mask; + } + + return hash.toString(16).padStart(16, "0"); +} + +function stableSerializeMetadata(value: unknown): string { + if (Array.isArray(value)) { + return `[${(value as unknown[]) + .map((entry) => stableSerializeMetadata(entry)) + .join(",")}]`; + } + + if (value && typeof value === "object") { + return `{${Object.entries(value as Record) + .sort(([left], [right]) => left.localeCompare(right)) + .map( + ([key, entry]) => + `${JSON.stringify(key)}:${stableSerializeMetadata(entry)}`, + ) + .join(",")}}`; + } + + return JSON.stringify(value); +} diff --git a/src/ingestion/service.ts b/src/ingestion/service.ts index 465aad0..f3144a3 100644 --- a/src/ingestion/service.ts +++ b/src/ingestion/service.ts @@ -2,11 +2,43 @@ import type { WorkspaceContext } from "../workspace/context.js"; export type IngestionMetadataValue = string | number | boolean | null; +export type SourceRepresentationType = + | "normalized-text" + | "html-cleaned" + | "pdf-page-text" + | "ocr-text" + | "table-projection" + | "transcript-segment" + | "meeting-note-segment" + | "caption-text" + | (string & {}); + +export interface SourceArtifact { + artifactId: string; + artifactType: string; + mimeType?: string; + storageUri?: string; + checksum?: string; + metadata?: Record; +} + +export interface SourceRepresentation { + representationId: string; + representationType: SourceRepresentationType; + artifactId?: string; + mimeType?: string; + language?: string; + metadata?: Record; +} + export interface IngestionSource { sourceId: string; kind: "document" | "message" | "note" | "web"; uri?: string; title?: string; + rootSourceId?: string; + parentSourceId?: string; + sourceGroupKey?: string; } export interface IngestionItem { @@ -14,6 +46,12 @@ export interface IngestionItem { content: string; contentType: "text/plain" | "text/markdown"; metadata?: Record; + representationId?: string; + ordinal?: number; + parentItemId?: string; + packetKey?: string; + sectionKey?: string; + provenanceLocation?: string; } export interface IngestionCommand { @@ -21,6 +59,8 @@ export interface IngestionCommand { source: IngestionSource; items: IngestionItem[]; occurredAt?: string; + artifacts?: SourceArtifact[]; + representations?: SourceRepresentation[]; } export interface IngestionReceipt { diff --git a/src/retrieval/service.ts b/src/retrieval/service.ts index 2922139..bf3ead4 100644 --- a/src/retrieval/service.ts +++ b/src/retrieval/service.ts @@ -4,6 +4,7 @@ export interface RetrievalCitation { sourceId: string; snippet: string; location?: string; + itemId?: string; } export interface RetrievedContextChunk { @@ -26,9 +27,20 @@ export interface ContextPacket { graph: number; vector: number; combined: number; + fused: number; + reranked: number; }; } +export interface AdaptiveRetrievalTrace { + strategy: "single-pass" | "keyword-expansion"; + ran: boolean; + passes: number; + triggerReason: string; + queriesTried: string[]; + topPacketIdsPerPass: string[][]; +} + export interface RetrievalQuery { workspace: WorkspaceContext; queryText: string; @@ -42,6 +54,7 @@ export interface RetrievalResult { queryText: string; packets: ContextPacket[]; totalPackets: number; + adaptive: AdaptiveRetrievalTrace; } export interface RetrievalService { diff --git a/src/subsystems/qdrant/port.ts b/src/subsystems/qdrant/port.ts index b7bb0d3..097c037 100644 --- a/src/subsystems/qdrant/port.ts +++ b/src/subsystems/qdrant/port.ts @@ -33,5 +33,6 @@ export interface VectorIndexPort { workspaceId: string; queryText: string; limit: number; + sourceIds?: string[]; }): Promise; } diff --git a/src/subsystems/reranker/port.ts b/src/subsystems/reranker/port.ts new file mode 100644 index 0000000..9dfe888 --- /dev/null +++ b/src/subsystems/reranker/port.ts @@ -0,0 +1,22 @@ +import type { ContextPacket } from "../../retrieval/service.js"; + +export interface RerankerPort { + rerank(input: { + queryText: string; + packets: ContextPacket[]; + limit: number; + }): Promise>; +} + +export function createNoopReranker(): RerankerPort { + return { + async rerank({ packets, limit }) { + return packets + .slice(0, limit > 0 ? limit : packets.length) + .map((packet, index) => ({ + packetId: packet.packetId, + score: packets.length - index, + })); + }, + }; +} diff --git a/src/subsystems/supabase/migrations/0001_workspace_foundation.sql b/src/subsystems/supabase/migrations/0001_workspace_foundation.sql new file mode 100644 index 0000000..27228f0 --- /dev/null +++ b/src/subsystems/supabase/migrations/0001_workspace_foundation.sql @@ -0,0 +1,99 @@ +create schema if not exists secondbrain_engine; + +create table if not exists secondbrain_engine.workspaces ( + workspace_id text primary key, + slug text not null, + display_name text not null +); + +create table if not exists secondbrain_engine.workspace_memberships ( + workspace_id text not null references secondbrain_engine.workspaces (workspace_id) on delete cascade, + actor_id text not null, + role text not null, + primary key (workspace_id, actor_id) +); + +create table if not exists secondbrain_engine.sources ( + workspace_id text not null references secondbrain_engine.workspaces (workspace_id) on delete cascade, + source_id text not null, + root_source_id text, + parent_source_id text, + source_group_key text, + kind text not null, + uri text, + title text, + first_occurred_at timestamptz not null, + last_occurred_at timestamptz not null, + item_count integer not null default 0, + primary key (workspace_id, source_id) +); + +create index if not exists sources_workspace_lookup_idx + on secondbrain_engine.sources (workspace_id); + +create table if not exists secondbrain_engine.source_artifacts ( + workspace_id text not null, + source_id text not null, + artifact_id text not null, + artifact_type text not null, + mime_type text, + storage_uri text, + checksum text, + metadata jsonb not null default '{}'::jsonb, + created_at timestamptz not null, + updated_at timestamptz not null, + primary key (workspace_id, source_id, artifact_id), + foreign key (workspace_id, source_id) + references secondbrain_engine.sources (workspace_id, source_id) + on delete cascade +); + +create table if not exists secondbrain_engine.source_representations ( + workspace_id text not null, + source_id text not null, + representation_id text not null, + representation_type text not null, + artifact_id text, + mime_type text, + language text, + metadata jsonb not null default '{}'::jsonb, + created_at timestamptz not null, + updated_at timestamptz not null, + primary key (workspace_id, source_id, representation_id), + foreign key (workspace_id, source_id) + references secondbrain_engine.sources (workspace_id, source_id) + on delete cascade, + foreign key (workspace_id, source_id, artifact_id) + references secondbrain_engine.source_artifacts (workspace_id, source_id, artifact_id) + on delete cascade +); + +comment on column secondbrain_engine.source_representations.artifact_id is 'Nullable to allow derived representations without a stored artifact; the foreign key is enforced only when artifact_id is populated.'; + +create index if not exists source_representations_artifact_lookup_idx + on secondbrain_engine.source_representations (workspace_id, source_id, artifact_id); + +create table if not exists secondbrain_engine.ingestion_items ( + workspace_id text not null, + source_id text not null, + item_id text not null, + representation_id text, + ordinal integer, + parent_item_id text, + packet_key text, + section_key text, + provenance_location text, + content text not null, + content_type text not null, + metadata jsonb not null default '{}'::jsonb, + occurred_at timestamptz not null, + created_at timestamptz not null, + updated_at timestamptz not null, + primary key (workspace_id, source_id, item_id), + foreign key (workspace_id, source_id) + references secondbrain_engine.sources (workspace_id, source_id) + on delete cascade, + foreign key (workspace_id, source_id, representation_id) + references secondbrain_engine.source_representations (workspace_id, source_id, representation_id) + on delete cascade +); diff --git a/src/subsystems/supabase/port.ts b/src/subsystems/supabase/port.ts index ff91149..5ccff76 100644 --- a/src/subsystems/supabase/port.ts +++ b/src/subsystems/supabase/port.ts @@ -1,11 +1,16 @@ import type { IngestionItem, IngestionSource, + SourceArtifact, + SourceRepresentation, } from "../../ingestion/service.js"; import type { WorkspaceContext } from "../../workspace/context.js"; import type { + CanonicalMultimodalRecordSet, IngestionItemRecord, + SourceArtifactRecord, SourceRecord, + SourceRepresentationRecord, WorkspaceRecord, } from "./schema.js"; @@ -27,12 +32,25 @@ export interface RelationalItemCandidate { * port keeps the app-facing runtime boundary narrow and internal-only. */ +export const relationalCanonicalEntityNames = [ + "source", + "artifact", + "representation", + "retrieval-item", +] as const; + export const relationalWorkspacePortMethodNames = [ "getWorkspace", "validateAccess", + "persistCanonicalIngestion", "upsertSource", + "persistArtifacts", + "persistRepresentations", "persistItems", "getPersistedSource", + "getPersistedArtifacts", + "getPersistedRepresentations", + "getCanonicalMultimodalRecordSet", "getPersistedItems", "findSources", "findItems", @@ -75,3 +93,57 @@ export interface RelationalWorkspacePort { }): Promise; reset(): Promise; } + +export interface CanonicalIngestionPersistenceResult { + source: SourceRecord; + artifacts: SourceArtifactRecord[]; + representations: SourceRepresentationRecord[]; + items: IngestionItemRecord[]; +} + +export interface RepresentationAwareRelationalWorkspacePort + extends RelationalWorkspacePort { + persistCanonicalIngestion(input: { + workspace: WorkspaceContext; + source: IngestionSource; + artifacts?: SourceArtifact[]; + representations?: SourceRepresentation[]; + items: IngestionItem[]; + occurredAt: string; + }): Promise; + persistArtifacts?(input: { + workspace: WorkspaceContext; + source: SourceRecord; + artifacts: SourceArtifact[]; + occurredAt: string; + }): Promise; + persistRepresentations?(input: { + workspace: WorkspaceContext; + source: SourceRecord; + representations: SourceRepresentation[]; + occurredAt: string; + }): Promise; + getPersistedArtifacts?(input: { + workspaceId: string; + sourceId: string; + }): Promise; + getPersistedRepresentations?(input: { + workspaceId: string; + sourceId: string; + }): Promise; + getCanonicalMultimodalRecordSet(input: { + workspaceId: string; + sourceId: string; + }): Promise; +} + +export function isRepresentationAwareRelationalWorkspacePort( + port: RelationalWorkspacePort, +): port is RepresentationAwareRelationalWorkspacePort { + return ( + typeof (port as RepresentationAwareRelationalWorkspacePort) + .persistCanonicalIngestion === "function" && + typeof (port as RepresentationAwareRelationalWorkspacePort) + .getCanonicalMultimodalRecordSet === "function" + ); +} diff --git a/src/subsystems/supabase/postgres.ts b/src/subsystems/supabase/postgres.ts new file mode 100644 index 0000000..9e10399 --- /dev/null +++ b/src/subsystems/supabase/postgres.ts @@ -0,0 +1,1293 @@ +import * as fs from "node:fs"; +import { readFileSync } from "node:fs"; + +import { Pool, type PoolClient } from "pg"; + +import type { + IngestionItem, + IngestionSource, + SourceArtifact, + SourceRepresentation, +} from "../../ingestion/service.js"; +import type { WorkspaceContext } from "../../workspace/context.js"; +import type { + CanonicalIngestionPersistenceResult, + RepresentationAwareRelationalWorkspacePort, +} from "./port.js"; +import { + dedupeIngestionItems, + scoreItemRecordForQuery, + scoreSourceRecordForQuery, +} from "./repository.js"; +import type { + CanonicalMultimodalRecordSet, + IngestionItemRecord, + SourceArtifactRecord, + SourceRecord, + SourceRepresentationRecord, + WorkspaceMembershipRecord, + WorkspaceRecord, +} from "./schema.js"; + +const DEFAULT_SCHEMA_NAME = "secondbrain_engine"; +const WORKSPACE_MIGRATION = loadWorkspaceMigrationTemplate(); +type QueryExecutor = Pool | PoolClient; + +export interface CreatePostgresRelationalWorkspaceRepositoryInput { + connectionString: string; + schema?: string; + autoMigrate?: boolean; + workspaces?: WorkspaceRecord[]; + memberships?: WorkspaceMembershipRecord[]; + pool?: Pool; +} + +export interface PostgresRelationalWorkspaceRepository + extends RepresentationAwareRelationalWorkspacePort { + initialize(): Promise; + close(): Promise; +} + +export function createPostgresRelationalWorkspaceRepository( + input: CreatePostgresRelationalWorkspaceRepositoryInput, +): PostgresRelationalWorkspaceRepository { + const schema = normalizeSchemaName(input.schema ?? "secondbrain_engine"); + const pool = + input.pool ?? + new Pool({ + connectionString: input.connectionString, + }); + const ownsPool = input.pool === undefined; + let initialized = false; + let initializePromise: Promise | undefined; + + const releaseClient = (client: PoolClient, error?: unknown): void => { + if (error !== undefined) { + client.release(error instanceof Error ? error : new Error(String(error))); + return; + } + + client.release(); + }; + + const ensureReady = async () => { + if (initialized) { + return; + } + + if (!initializePromise) { + initializePromise = (async () => { + await pool.query("select 1"); + + if (input.autoMigrate) { + await pool.query(renderMigration(schema)); + } else { + await assertRequiredSchemaObjects(pool, schema); + } + + await seedWorkspaceFixtures(pool, schema, input.workspaces ?? []); + await seedMembershipFixtures(pool, schema, input.memberships ?? []); + initialized = true; + })().catch((error) => { + initializePromise = undefined; + throw error; + }); + } + + await initializePromise; + }; + + return { + async initialize() { + await ensureReady(); + }, + async close() { + if (!ownsPool) return; + await pool.end(); + }, + async getWorkspace(workspaceId) { + await ensureReady(); + const result = await pool.query<{ + workspace_id: string; + slug: string; + display_name: string; + }>( + `select workspace_id, slug, display_name + from ${qualify(schema, "workspaces")} + where workspace_id = $1`, + [workspaceId], + ); + + return result.rows[0] ? mapWorkspaceRecord(result.rows[0]) : null; + }, + async validateAccess(context) { + await ensureReady(); + const membership = await pool.query( + `select 1 + from ${qualify(schema, "workspace_memberships")} + where workspace_id = $1 and actor_id = $2`, + [context.workspaceId, context.actor.actorId], + ); + + return (membership.rowCount ?? 0) > 0; + }, + async persistCanonicalIngestion({ + workspace, + source, + artifacts, + representations, + items, + occurredAt, + }) { + await ensureReady(); + const client = await pool.connect(); + let releaseError: unknown; + + try { + await client.query("begin"); + await assertWorkspaceAccess(client, schema, workspace); + const persistedSource = await upsertSourceRecord( + client, + schema, + workspace, + source, + occurredAt, + ); + const persistedArtifacts = artifacts?.length + ? await persistArtifactRecords( + client, + schema, + workspace, + persistedSource, + artifacts, + occurredAt, + ) + : []; + const persistedRepresentations = representations?.length + ? await persistRepresentationRecords( + client, + schema, + workspace, + persistedSource, + representations, + occurredAt, + ) + : []; + const persistedItems = await persistItemRecords( + client, + schema, + workspace, + persistedSource, + items, + occurredAt, + ); + await updateSourceItemCount( + client, + schema, + workspace.workspaceId, + persistedSource.sourceId, + occurredAt, + ); + const refreshedSource = await getPersistedSourceRecord( + client, + schema, + workspace.workspaceId, + persistedSource.sourceId, + ); + await client.query("commit"); + + return { + source: refreshedSource ?? persistedSource, + artifacts: persistedArtifacts, + representations: persistedRepresentations, + items: persistedItems, + } satisfies CanonicalIngestionPersistenceResult; + } catch (error) { + releaseError = error; + try { + await client.query("rollback"); + } catch (rollbackError) { + releaseError = rollbackError; + } + throw error; + } finally { + releaseClient(client, releaseError); + } + }, + async upsertSource({ workspace, source, occurredAt }) { + await ensureReady(); + await assertWorkspaceAccess(pool, schema, workspace); + return upsertSourceRecord(pool, schema, workspace, source, occurredAt); + }, + async persistArtifacts({ workspace, source, artifacts, occurredAt }) { + await ensureReady(); + await assertWorkspaceAccess(pool, schema, workspace); + return persistArtifactRecords( + pool, + schema, + workspace, + source, + artifacts, + occurredAt, + ); + }, + async persistRepresentations({ + workspace, + source, + representations, + occurredAt, + }) { + await ensureReady(); + await assertWorkspaceAccess(pool, schema, workspace); + return persistRepresentationRecords( + pool, + schema, + workspace, + source, + representations, + occurredAt, + ); + }, + async persistItems({ workspace, source, items, occurredAt }) { + await ensureReady(); + await assertWorkspaceAccess(pool, schema, workspace); + const client = await pool.connect(); + let releaseError: unknown; + + try { + await client.query("begin"); + const providedSourceMatchesWorkspace = + source.workspaceId === workspace.workspaceId; + const normalizedSource = + (await getPersistedSourceRecord( + client, + schema, + workspace.workspaceId, + source.sourceId, + )) ?? + (await upsertSourceRecord( + client, + schema, + workspace, + { + sourceId: source.sourceId, + rootSourceId: source.rootSourceId, + parentSourceId: source.parentSourceId, + sourceGroupKey: source.sourceGroupKey, + kind: source.kind, + uri: providedSourceMatchesWorkspace ? source.uri : undefined, + title: providedSourceMatchesWorkspace ? source.title : undefined, + }, + occurredAt, + )); + const persistedItems = await persistItemRecords( + client, + schema, + workspace, + normalizedSource, + items, + occurredAt, + ); + await updateSourceItemCount( + client, + schema, + workspace.workspaceId, + normalizedSource.sourceId, + occurredAt, + ); + + await client.query("commit"); + return persistedItems; + } catch (error) { + releaseError = error; + try { + await client.query("rollback"); + } catch (rollbackError) { + releaseError = rollbackError; + } + throw error; + } finally { + releaseClient(client, releaseError); + } + }, + async getPersistedSource({ workspaceId, sourceId }) { + await ensureReady(); + return getPersistedSourceRecord(pool, schema, workspaceId, sourceId); + }, + async getPersistedArtifacts({ workspaceId, sourceId }) { + await ensureReady(); + const result = await pool.query( + `select workspace_id, source_id, artifact_id, artifact_type, mime_type, storage_uri, checksum, metadata, created_at, updated_at + from ${qualify(schema, "source_artifacts")} + where workspace_id = $1 and source_id = $2 + order by artifact_id`, + [workspaceId, sourceId], + ); + + return result.rows.map((row) => mapArtifactRecord(row)); + }, + async getPersistedRepresentations({ workspaceId, sourceId }) { + await ensureReady(); + const result = await pool.query( + `select workspace_id, source_id, representation_id, representation_type, artifact_id, mime_type, language, metadata, created_at, updated_at + from ${qualify(schema, "source_representations")} + where workspace_id = $1 and source_id = $2 + order by representation_id`, + [workspaceId, sourceId], + ); + + return result.rows.map((row) => mapRepresentationRecord(row)); + }, + async getPersistedItems({ workspaceId, sourceId }) { + await ensureReady(); + const result = await pool.query<{ + workspace_id: string; + source_id: string; + item_id: string; + representation_id: string | null; + ordinal: number | null; + parent_item_id: string | null; + packet_key: string | null; + section_key: string | null; + provenance_location: string | null; + content: string; + content_type: IngestionItemRecord["contentType"]; + metadata: Record | null; + occurred_at: Date; + created_at: Date; + updated_at: Date; + }>( + `select workspace_id, source_id, item_id, representation_id, ordinal, parent_item_id, packet_key, + section_key, provenance_location, content, content_type, metadata, + occurred_at, created_at, updated_at + from ${qualify(schema, "ingestion_items")} + where workspace_id = $1 and source_id = $2 + order by coalesce(ordinal, 2147483647), item_id`, + [workspaceId, sourceId], + ); + + return result.rows.map((row) => + mapItemRecord({ + workspace_id: row.workspace_id, + source_id: row.source_id, + item_id: row.item_id, + representation_id: row.representation_id, + ordinal: row.ordinal, + parent_item_id: row.parent_item_id, + packet_key: row.packet_key, + section_key: row.section_key, + provenance_location: row.provenance_location, + content: row.content, + content_type: row.content_type, + metadata: row.metadata, + occurred_at: row.occurred_at, + created_at: row.created_at, + updated_at: row.updated_at, + }), + ); + }, + async getCanonicalMultimodalRecordSet({ + workspaceId, + sourceId, + }: { + workspaceId: string; + sourceId: string; + }) { + await ensureReady(); + const source = await this.getPersistedSource({ workspaceId, sourceId }); + if (!source) { + return null; + } + + const artifactsGetter = this.getPersistedArtifacts; + const representationsGetter = this.getPersistedRepresentations; + if (!artifactsGetter || !representationsGetter) { + return null; + } + + const artifacts = await artifactsGetter({ + workspaceId, + sourceId, + }); + const representations = await representationsGetter({ + workspaceId, + sourceId, + }); + const retrievalItems = await this.getPersistedItems({ + workspaceId, + sourceId, + }); + + return { + source, + artifacts, + representations, + retrievalItems, + } satisfies CanonicalMultimodalRecordSet; + }, + async findSources({ workspaceId, queryText, sourceIds, limit }) { + await ensureReady(); + const sources = await getSourcesForSearch( + pool, + schema, + workspaceId, + queryText, + sourceIds, + limit, + ); + const queryTerms = tokenizeQuery(queryText); + + return sources + .map((source) => ({ + source, + score: scoreSourceRecordForQuery(source, queryTerms), + })) + .filter((candidate) => candidate.score > 0) + .sort((left, right) => right.score - left.score) + .slice(0, limit); + }, + async findItems({ workspaceId, queryText, sourceIds, limit }) { + await ensureReady(); + const items = await getItemsForSearch( + pool, + schema, + workspaceId, + queryText, + sourceIds, + limit, + ); + const queryTerms = tokenizeQuery(queryText); + + return items + .map((item) => ({ + item, + score: scoreItemRecordForQuery(item, queryTerms), + })) + .filter((candidate) => candidate.score > 0) + .sort((left, right) => right.score - left.score) + .slice(0, limit); + }, + async reset() { + await ensureReady(); + await pool.query( + `truncate table ${qualify(schema, "ingestion_items")}, ${qualify(schema, "source_representations")}, ${qualify(schema, "source_artifacts")}, ${qualify(schema, "sources")} restart identity cascade`, + ); + }, + }; +} + +type ArtifactRow = { + workspace_id: string; + source_id: string; + artifact_id: string; + artifact_type: string; + mime_type: string | null; + storage_uri: string | null; + checksum: string | null; + metadata: Record | null; + created_at: Date; + updated_at: Date; +}; + +type RepresentationRow = { + workspace_id: string; + source_id: string; + representation_id: string; + representation_type: string; + artifact_id: string | null; + mime_type: string | null; + language: string | null; + metadata: Record | null; + created_at: Date; + updated_at: Date; +}; + +async function assertWorkspaceAccess( + executor: QueryExecutor, + schema: string, + context: WorkspaceContext, +): Promise { + const result = await executor.query( + `select 1 + from ${qualify(schema, "workspace_memberships")} + where workspace_id = $1 and actor_id = $2`, + [context.workspaceId, context.actor.actorId], + ); + + if ((result.rowCount ?? 0) === 0) { + throw new Error(`workspace access denied for ${context.workspaceId}`); + } +} + +async function upsertSourceRecord( + executor: QueryExecutor, + schema: string, + workspace: WorkspaceContext, + source: IngestionSource, + occurredAt: string, +): Promise { + assertNonEmptyIdentifier(source.sourceId, "sourceId"); + const result = await executor.query<{ + workspace_id: string; + source_id: string; + root_source_id: string | null; + parent_source_id: string | null; + source_group_key: string | null; + kind: SourceRecord["kind"]; + uri: string | null; + title: string | null; + first_occurred_at: Date; + last_occurred_at: Date; + item_count: number; + }>( + `insert into ${qualify(schema, "sources")} as existing ( + workspace_id, + source_id, + root_source_id, + parent_source_id, + source_group_key, + kind, + uri, + title, + first_occurred_at, + last_occurred_at, + item_count + ) values ($1, $2, coalesce($3, $2), $4, coalesce($5, $2), $6, $7, $8, $9::timestamptz, $10::timestamptz, $11) + on conflict (workspace_id, source_id) do update set + root_source_id = case when $3::text is null then existing.root_source_id else $3 end, + parent_source_id = coalesce($4, existing.parent_source_id), + source_group_key = case when $5::text is null then existing.source_group_key else $5 end, + kind = existing.kind, + uri = coalesce($7, existing.uri), + title = coalesce($8, existing.title), + first_occurred_at = least(existing.first_occurred_at, excluded.first_occurred_at), + last_occurred_at = greatest(existing.last_occurred_at, excluded.last_occurred_at), + item_count = existing.item_count + returning workspace_id, source_id, root_source_id, parent_source_id, source_group_key, + kind, uri, title, first_occurred_at, last_occurred_at, item_count`, + [ + workspace.workspaceId, + source.sourceId, + source.rootSourceId ?? null, + source.parentSourceId ?? null, + source.sourceGroupKey ?? null, + source.kind, + source.uri ?? null, + source.title ?? null, + occurredAt, + occurredAt, + 0, + ], + ); + + return mapSourceRecord(result.rows[0]); +} + +async function persistArtifactRecords( + executor: QueryExecutor, + schema: string, + workspace: WorkspaceContext, + source: SourceRecord, + artifacts: SourceArtifact[], + occurredAt: string, +): Promise { + if (artifacts.length === 0) { + return []; + } + + const values: unknown[] = []; + const rowsSql = artifacts.map((artifact, index) => { + assertNonEmptyIdentifier(artifact.artifactId, "artifactId"); + const metadataJson = + artifact.metadata === undefined + ? null + : JSON.stringify(artifact.metadata); + const offset = index * 9; + values.push( + workspace.workspaceId, + source.sourceId, + artifact.artifactId, + artifact.artifactType, + artifact.mimeType ?? null, + artifact.storageUri ?? null, + artifact.checksum ?? null, + metadataJson, + occurredAt, + ); + + return `($${offset + 1}, $${offset + 2}, $${offset + 3}, $${offset + 4}, $${offset + 5}, $${offset + 6}, $${offset + 7}, coalesce($${offset + 8}::jsonb, '{}'::jsonb), $${offset + 9}::timestamptz, $${offset + 9}::timestamptz)`; + }); + + const result = await executor.query( + `insert into ${qualify(schema, "source_artifacts")} ( + workspace_id, + source_id, + artifact_id, + artifact_type, + mime_type, + storage_uri, + checksum, + metadata, + created_at, + updated_at + ) values ${rowsSql.join(", ")} + on conflict (workspace_id, source_id, artifact_id) do update set + artifact_type = excluded.artifact_type, + mime_type = coalesce(excluded.mime_type, ${qualify(schema, "source_artifacts")}.mime_type), + storage_uri = coalesce(excluded.storage_uri, ${qualify(schema, "source_artifacts")}.storage_uri), + checksum = coalesce(excluded.checksum, ${qualify(schema, "source_artifacts")}.checksum), + metadata = case + when excluded.metadata = '{}'::jsonb then ${qualify(schema, "source_artifacts")}.metadata + else excluded.metadata + end, + updated_at = excluded.updated_at + returning workspace_id, source_id, artifact_id, artifact_type, mime_type, storage_uri, checksum, metadata, created_at, updated_at`, + values, + ); + + return result.rows + .map((row) => mapArtifactRecord(row)) + .sort((left, right) => left.artifactId.localeCompare(right.artifactId)); +} + +async function persistRepresentationRecords( + executor: QueryExecutor, + schema: string, + workspace: WorkspaceContext, + source: SourceRecord, + representations: SourceRepresentation[], + occurredAt: string, +): Promise { + const persisted: SourceRepresentationRecord[] = []; + + for (const representation of representations) { + assertNonEmptyIdentifier( + representation.representationId, + "representationId", + ); + if (representation.artifactId !== undefined) { + assertNonEmptyIdentifier(representation.artifactId, "artifactId"); + await assertArtifactExists( + executor, + schema, + workspace.workspaceId, + source.sourceId, + representation.artifactId, + ); + } + + const metadataJson = + representation.metadata === undefined + ? null + : JSON.stringify(representation.metadata); + const result = await executor.query( + `insert into ${qualify(schema, "source_representations")} ( + workspace_id, + source_id, + representation_id, + representation_type, + artifact_id, + mime_type, + language, + metadata, + created_at, + updated_at + ) values ( + $1, $2, $3, $4, $5, $6, $7, coalesce($8::jsonb, '{}'::jsonb), $9::timestamptz, $9::timestamptz + ) + on conflict (workspace_id, source_id, representation_id) do update set + representation_type = excluded.representation_type, + artifact_id = coalesce(excluded.artifact_id, ${qualify(schema, "source_representations")}.artifact_id), + mime_type = coalesce(excluded.mime_type, ${qualify(schema, "source_representations")}.mime_type), + language = coalesce(excluded.language, ${qualify(schema, "source_representations")}.language), + metadata = case + when $8::jsonb is null then ${qualify(schema, "source_representations")}.metadata + else $8::jsonb + end, + updated_at = excluded.updated_at + returning workspace_id, source_id, representation_id, representation_type, artifact_id, mime_type, language, metadata, created_at, updated_at`, + [ + workspace.workspaceId, + source.sourceId, + representation.representationId, + representation.representationType, + representation.artifactId ?? null, + representation.mimeType ?? null, + representation.language ?? null, + metadataJson, + occurredAt, + ], + ); + + persisted.push(mapRepresentationRecord(result.rows[0])); + } + + return persisted; +} + +async function persistItemRecords( + executor: QueryExecutor, + schema: string, + workspace: WorkspaceContext, + source: SourceRecord, + items: IngestionItem[], + occurredAt: string, +): Promise { + const uniqueItems = dedupeIngestionItems(items); + const persistedItems: IngestionItemRecord[] = []; + + for (const [ordinal, item] of uniqueItems.entries()) { + assertNonEmptyIdentifier(item.itemId, "itemId"); + if (item.representationId !== undefined) { + assertNonEmptyIdentifier(item.representationId, "representationId"); + } + const result = await executor.query<{ + workspace_id: string; + source_id: string; + item_id: string; + representation_id: string | null; + ordinal: number | null; + parent_item_id: string | null; + packet_key: string | null; + section_key: string | null; + provenance_location: string | null; + content: string; + content_type: IngestionItemRecord["contentType"]; + metadata: Record | null; + occurred_at: Date; + created_at: Date; + updated_at: Date; + }>( + `insert into ${qualify(schema, "ingestion_items")} ( + workspace_id, + source_id, + item_id, + representation_id, + ordinal, + parent_item_id, + packet_key, + section_key, + provenance_location, + content, + content_type, + metadata, + occurred_at, + created_at, + updated_at + ) values ( + $1, + $2, + $3, + $4, + coalesce($5::int, $14::int), + $6, + coalesce($7, $15, $2), + $8, + $9, + $10, + $11, + $12::jsonb, + $13::timestamptz, + $13::timestamptz, + $13::timestamptz + ) + on conflict (workspace_id, source_id, item_id) do update set + representation_id = coalesce($4, ${qualify(schema, "ingestion_items")}.representation_id), + ordinal = case when $5::int is null then ${qualify(schema, "ingestion_items")}.ordinal else $5::int end, + parent_item_id = coalesce($6, ${qualify(schema, "ingestion_items")}.parent_item_id), + packet_key = case when $7::text is null then ${qualify(schema, "ingestion_items")}.packet_key else $7::text end, + section_key = coalesce($8, ${qualify(schema, "ingestion_items")}.section_key), + provenance_location = coalesce($9, ${qualify(schema, "ingestion_items")}.provenance_location), + content = excluded.content, + content_type = excluded.content_type, + metadata = excluded.metadata, + occurred_at = excluded.occurred_at, + updated_at = excluded.updated_at + returning workspace_id, source_id, item_id, representation_id, ordinal, parent_item_id, packet_key, + section_key, provenance_location, content, content_type, metadata, + occurred_at, created_at, updated_at`, + [ + workspace.workspaceId, + source.sourceId, + item.itemId, + item.representationId ?? null, + item.ordinal ?? null, + item.parentItemId ?? null, + item.packetKey ?? null, + item.sectionKey ?? null, + item.provenanceLocation ?? null, + item.content, + item.contentType, + JSON.stringify(item.metadata ?? {}), + occurredAt, + ordinal, + source.sourceGroupKey ?? null, + ], + ); + + persistedItems.push(mapItemRecord(result.rows[0])); + } + + return persistedItems; +} + +async function updateSourceItemCount( + executor: QueryExecutor, + schema: string, + workspaceId: string, + sourceId: string, + occurredAt: string, +): Promise { + await executor.query( + `update ${qualify(schema, "sources")} + set item_count = ( + select count(*)::int + from ${qualify(schema, "ingestion_items")} + where workspace_id = $1 and source_id = $2 + ), + last_occurred_at = greatest(last_occurred_at, $3::timestamptz) + where workspace_id = $1 and source_id = $2`, + [workspaceId, sourceId, occurredAt], + ); +} + +async function getPersistedSourceRecord( + executor: QueryExecutor, + schema: string, + workspaceId: string, + sourceId: string, +): Promise { + const result = await executor.query<{ + workspace_id: string; + source_id: string; + root_source_id: string | null; + parent_source_id: string | null; + source_group_key: string | null; + kind: SourceRecord["kind"]; + uri: string | null; + title: string | null; + first_occurred_at: Date; + last_occurred_at: Date; + item_count: number; + }>( + `select workspace_id, source_id, root_source_id, parent_source_id, source_group_key, + kind, uri, title, first_occurred_at, last_occurred_at, item_count + from ${qualify(schema, "sources")} + where workspace_id = $1 and source_id = $2`, + [workspaceId, sourceId], + ); + + return result.rows[0] ? mapSourceRecord(result.rows[0]) : null; +} + +async function assertArtifactExists( + executor: QueryExecutor, + schema: string, + workspaceId: string, + sourceId: string, + artifactId: string, +): Promise { + const result = await executor.query( + `select 1 + from ${qualify(schema, "source_artifacts")} + where workspace_id = $1 and source_id = $2 and artifact_id = $3`, + [workspaceId, sourceId, artifactId], + ); + + if ((result.rowCount ?? 0) === 0) { + throw new Error(`artifact ${artifactId} not found for source ${sourceId}`); + } +} + +function assertNonEmptyIdentifier(value: string, label: string): void { + if (value.trim().length === 0) { + throw new Error(`${label} must not be empty`); + } +} + +async function seedWorkspaceFixtures( + pool: Pool, + schema: string, + workspaces: WorkspaceRecord[], +): Promise { + for (const workspace of workspaces) { + await pool.query( + `insert into ${qualify(schema, "workspaces")} (workspace_id, slug, display_name) + values ($1, $2, $3) + on conflict (workspace_id) do update set slug = excluded.slug, display_name = excluded.display_name`, + [workspace.workspaceId, workspace.slug, workspace.displayName], + ); + } +} + +async function seedMembershipFixtures( + pool: Pool, + schema: string, + memberships: WorkspaceMembershipRecord[], +): Promise { + for (const membership of memberships) { + await pool.query( + `insert into ${qualify(schema, "workspace_memberships")} (workspace_id, actor_id, role) + values ($1, $2, $3) + on conflict (workspace_id, actor_id) do update set role = excluded.role`, + [membership.workspaceId, membership.actorId, membership.role], + ); + } +} + +async function getSourcesForSearch( + pool: Pool, + schema: string, + workspaceId: string, + queryText: string, + sourceIds?: string[], + limit?: number, +): Promise { + const queryTerms = tokenizeQuery(queryText); + if (queryTerms.length === 0) { + return []; + } + + const hasSourceFilter = sourceIds !== undefined; + const patterns = queryTerms.map((term) => `%${term}%`); + const result = await pool.query<{ + workspace_id: string; + source_id: string; + root_source_id: string | null; + parent_source_id: string | null; + source_group_key: string | null; + kind: SourceRecord["kind"]; + uri: string | null; + title: string | null; + first_occurred_at: Date; + last_occurred_at: Date; + item_count: number; + lexical_score: number; + }>( + `select workspace_id, source_id, root_source_id, parent_source_id, source_group_key, + kind, uri, title, first_occurred_at, last_occurred_at, item_count, + ( + select count(*)::int + from unnest($4::text[]) as pattern + where lower(concat_ws(' ', coalesce(title, ''), coalesce(uri, ''), kind)) like pattern + ) as lexical_score + from ${qualify(schema, "sources")} + where workspace_id = $1 + and ($2::boolean = false or source_id = any($3::text[])) + and lower(concat_ws(' ', coalesce(title, ''), coalesce(uri, ''), kind)) like any($4::text[]) + order by lexical_score desc, source_id + limit $5`, + [ + workspaceId, + hasSourceFilter, + sourceIds ?? [], + patterns, + Math.max((limit ?? 0) * 20, 50), + ], + ); + + return result.rows + .map((row): SourceRecord => mapSourceRecord(row)) + .sort((left, right) => left.sourceId.localeCompare(right.sourceId)); +} + +async function getItemsForSearch( + pool: Pool, + schema: string, + workspaceId: string, + queryText: string, + sourceIds?: string[], + limit?: number, +): Promise { + const queryTerms = tokenizeQuery(queryText); + if (queryTerms.length === 0) { + return []; + } + + const hasSourceFilter = sourceIds !== undefined; + const patterns = queryTerms.map((term) => `%${term}%`); + const result = await pool.query<{ + workspace_id: string; + source_id: string; + item_id: string; + representation_id: string | null; + ordinal: number | null; + parent_item_id: string | null; + packet_key: string | null; + section_key: string | null; + provenance_location: string | null; + content: string; + content_type: IngestionItemRecord["contentType"]; + metadata: Record | null; + occurred_at: Date; + created_at: Date; + updated_at: Date; + lexical_score: number; + }>( + `select workspace_id, source_id, item_id, representation_id, ordinal, parent_item_id, packet_key, + section_key, provenance_location, content, content_type, metadata, + occurred_at, created_at, updated_at, + ( + select count(*)::int + from unnest($4::text[]) as pattern + where lower(concat_ws(' ', content, content_type, coalesce(metadata::text, ''))) like pattern + ) as lexical_score + from ${qualify(schema, "ingestion_items")} + where workspace_id = $1 + and ($2::boolean = false or source_id = any($3::text[])) + and lower(concat_ws(' ', content, content_type, coalesce(metadata::text, ''))) like any($4::text[]) + order by lexical_score desc, coalesce(ordinal, 2147483647), item_id + limit $5`, + [ + workspaceId, + hasSourceFilter, + sourceIds ?? [], + patterns, + Math.max((limit ?? 0) * 20, 50), + ], + ); + + return result.rows + .map((row): IngestionItemRecord => mapItemRecord(row)) + .sort((left, right) => { + const ordinalDiff = + (left.ordinal ?? Number.MAX_SAFE_INTEGER) - + (right.ordinal ?? Number.MAX_SAFE_INTEGER); + if (ordinalDiff !== 0) { + return ordinalDiff; + } + + return left.itemId.localeCompare(right.itemId); + }); +} + +function mapWorkspaceRecord(row: { + workspace_id: string; + slug: string; + display_name: string; +}): WorkspaceRecord { + return { + workspaceId: row.workspace_id, + slug: row.slug, + displayName: row.display_name, + }; +} + +function mapSourceRecord(row: { + workspace_id: string; + source_id: string; + root_source_id: string | null; + parent_source_id: string | null; + source_group_key: string | null; + kind: SourceRecord["kind"]; + uri: string | null; + title: string | null; + first_occurred_at: Date; + last_occurred_at: Date; + item_count: number; +}): SourceRecord { + return { + workspaceId: row.workspace_id, + sourceId: row.source_id, + rootSourceId: row.root_source_id ?? undefined, + parentSourceId: row.parent_source_id ?? undefined, + sourceGroupKey: row.source_group_key ?? undefined, + kind: row.kind, + uri: row.uri ?? undefined, + title: row.title ?? undefined, + firstOccurredAt: row.first_occurred_at.toISOString(), + lastOccurredAt: row.last_occurred_at.toISOString(), + itemCount: row.item_count, + }; +} + +function mapArtifactRecord(row: ArtifactRow): SourceArtifactRecord { + return { + workspaceId: row.workspace_id, + sourceId: row.source_id, + artifactId: row.artifact_id, + artifactType: row.artifact_type as SourceArtifactRecord["artifactType"], + mimeType: row.mime_type ?? undefined, + storageUri: row.storage_uri ?? undefined, + checksum: row.checksum ?? undefined, + metadata: sanitizeMetadata(row.metadata), + createdAt: row.created_at.toISOString(), + updatedAt: row.updated_at.toISOString(), + }; +} + +function mapRepresentationRecord( + row: RepresentationRow, +): SourceRepresentationRecord { + return { + workspaceId: row.workspace_id, + sourceId: row.source_id, + representationId: row.representation_id, + representationType: + row.representation_type as SourceRepresentationRecord["representationType"], + artifactId: row.artifact_id ?? undefined, + mimeType: row.mime_type ?? undefined, + language: row.language ?? undefined, + metadata: sanitizeMetadata(row.metadata), + createdAt: row.created_at.toISOString(), + updatedAt: row.updated_at.toISOString(), + }; +} + +function mapItemRecord(row: { + workspace_id: string; + source_id: string; + item_id: string; + representation_id: string | null; + ordinal: number | null; + parent_item_id: string | null; + packet_key: string | null; + section_key: string | null; + provenance_location: string | null; + content: string; + content_type: IngestionItemRecord["contentType"]; + metadata: Record | null; + occurred_at: Date; + created_at: Date; + updated_at: Date; +}): IngestionItemRecord { + return { + workspaceId: row.workspace_id, + sourceId: row.source_id, + itemId: row.item_id, + representationId: row.representation_id ?? undefined, + ordinal: row.ordinal ?? undefined, + parentItemId: row.parent_item_id ?? undefined, + packetKey: row.packet_key ?? undefined, + sectionKey: row.section_key ?? undefined, + provenanceLocation: row.provenance_location ?? undefined, + content: row.content, + contentType: row.content_type, + metadata: sanitizeMetadata(row.metadata), + occurredAt: row.occurred_at.toISOString(), + createdAt: row.created_at.toISOString(), + updatedAt: row.updated_at.toISOString(), + }; +} + +function sanitizeMetadata( + metadata: Record | null, +): IngestionItemRecord["metadata"] { + if (!metadata) { + return {}; + } + + return structuredClone(metadata) as IngestionItemRecord["metadata"]; +} + +function renderMigration(schema: string): string { + return WORKSPACE_MIGRATION.replace( + new RegExp(`\\b${DEFAULT_SCHEMA_NAME}\\b`, "g"), + quoteIdentifier(schema), + ); +} + +function qualify(schema: string, table: string): string { + return `${quoteIdentifier(schema)}.${quoteIdentifier(table)}`; +} + +function normalizeSchemaName(schema: string): string { + const trimmed = schema.trim(); + if (!trimmed) { + throw new Error("postgres schema must not be empty"); + } + + if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(trimmed)) { + throw new Error(`invalid postgres schema name: ${schema}`); + } + + return trimmed; +} + +function quoteIdentifier(value: string): string { + return `"${value.replace(/"/g, '""')}"`; +} + +function loadWorkspaceMigrationTemplate(): string { + const candidateDirectories = [ + new URL("./migrations/0001_workspace_foundation.sql", import.meta.url), + new URL( + "../../../src/subsystems/supabase/migrations/0001_workspace_foundation.sql", + import.meta.url, + ), + ].map((url) => new URL(".", url)); + const migrations = new Map(); + + for (const directory of candidateDirectories) { + try { + if (!fs.existsSync(directory)) { + continue; + } + + for (const entry of fs.readdirSync(directory)) { + if (!entry.endsWith(".sql")) { + continue; + } + + migrations.set(entry, readFileSync(new URL(entry, directory), "utf8")); + } + } catch {} + } + + if (migrations.size > 0) { + return Array.from(migrations.entries()) + .sort(([left], [right]) => left.localeCompare(right)) + .map(([, sql]) => sql.trim()) + .join("\n\n"); + } + + throw new Error("unable to load workspace migration template"); +} + +async function assertRequiredSchemaObjects( + pool: Pool, + schema: string, +): Promise { + await pool.query( + `select workspace_id, slug, display_name + from ${qualify(schema, "workspaces")} + limit 1`, + ); + await pool.query( + `select workspace_id, actor_id, role + from ${qualify(schema, "workspace_memberships")} + limit 1`, + ); + await pool.query( + `select workspace_id, source_id, root_source_id, parent_source_id, source_group_key, + kind, uri, title, first_occurred_at, last_occurred_at, item_count + from ${qualify(schema, "sources")} + limit 1`, + ); + await pool.query( + `select workspace_id, source_id, artifact_id, artifact_type, mime_type, storage_uri, + checksum, metadata, created_at, updated_at + from ${qualify(schema, "source_artifacts")} + limit 1`, + ); + await pool.query( + `select workspace_id, source_id, representation_id, representation_type, artifact_id, + mime_type, language, metadata, created_at, updated_at + from ${qualify(schema, "source_representations")} + limit 1`, + ); + await pool.query( + `select workspace_id, source_id, item_id, representation_id, ordinal, parent_item_id, + packet_key, section_key, provenance_location, content, content_type, + metadata, occurred_at, created_at, updated_at + from ${qualify(schema, "ingestion_items")} + limit 1`, + ); +} + +function tokenizeQuery(queryText: string): string[] { + return queryText + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter((term) => term.length > 0); +} diff --git a/src/subsystems/supabase/repository.ts b/src/subsystems/supabase/repository.ts index 8519f1b..2ee2edc 100644 --- a/src/subsystems/supabase/repository.ts +++ b/src/subsystems/supabase/repository.ts @@ -1,9 +1,12 @@ import type { IngestionItem } from "../../ingestion/service.js"; import type { WorkspaceContext } from "../../workspace/context.js"; -import type { RelationalWorkspacePort } from "./port.js"; +import type { RepresentationAwareRelationalWorkspacePort } from "./port.js"; import type { + CanonicalMultimodalRecordSet, IngestionItemRecord, + SourceArtifactRecord, SourceRecord, + SourceRepresentationRecord, WorkspaceMembershipRecord, WorkspaceRecord, } from "./schema.js"; @@ -12,6 +15,8 @@ interface RepositoryState { workspaces: Map; memberships: Map; sources: Map; + artifacts: Map; + representations: Map; items: Map; } @@ -22,7 +27,7 @@ export interface CreateInMemoryRelationalWorkspaceRepositoryInput { export function createInMemoryRelationalWorkspaceRepository( input: CreateInMemoryRelationalWorkspaceRepositoryInput = {}, -): RelationalWorkspacePort { +): RepresentationAwareRelationalWorkspacePort { const state: RepositoryState = { workspaces: new Map( (input.workspaces ?? []).map((workspace) => [ @@ -37,6 +42,8 @@ export function createInMemoryRelationalWorkspaceRepository( ]), ), sources: new Map(), + artifacts: new Map(), + representations: new Map(), items: new Map(), }; @@ -57,9 +64,62 @@ export function createInMemoryRelationalWorkspaceRepository( return membership !== undefined; }, + async persistCanonicalIngestion({ + workspace, + source, + artifacts, + representations, + items, + occurredAt, + }) { + const snapshot = snapshotMutableRepositoryState(state); + + try { + const persistedSource = await this.upsertSource({ + workspace, + source, + occurredAt, + }); + const persistedArtifacts = artifacts?.length + ? ((await this.persistArtifacts?.({ + workspace, + source: persistedSource, + artifacts, + occurredAt, + })) ?? []) + : []; + const persistedRepresentations = representations?.length + ? ((await this.persistRepresentations?.({ + workspace, + source: persistedSource, + representations, + occurredAt, + })) ?? []) + : []; + const persistedItems = await this.persistItems({ + workspace, + source: persistedSource, + items, + occurredAt, + }); + + return { + source: + state.sources.get( + sourceKey(workspace.workspaceId, persistedSource.sourceId), + ) ?? persistedSource, + artifacts: persistedArtifacts, + representations: persistedRepresentations, + items: persistedItems, + }; + } catch (error) { + restoreMutableRepositoryState(state, snapshot); + throw error; + } + }, async upsertSource({ workspace, source, occurredAt }) { assertWorkspaceAccess(state, workspace); - + assertNonEmptyIdentifier(source.sourceId, "sourceId"); const key = sourceKey(workspace.workspaceId, source.sourceId); const existing = state.sources.get(key); const occurredAtTime = timestampToMillis(occurredAt); @@ -83,6 +143,11 @@ export function createInMemoryRelationalWorkspaceRepository( const next: SourceRecord = { workspaceId: workspace.workspaceId, sourceId: source.sourceId, + rootSourceId: + source.rootSourceId ?? existing?.rootSourceId ?? source.sourceId, + parentSourceId: source.parentSourceId ?? existing?.parentSourceId, + sourceGroupKey: + source.sourceGroupKey ?? existing?.sourceGroupKey ?? source.sourceId, kind: existing?.kind ?? source.kind, uri: source.uri ?? existing?.uri, title: source.title ?? existing?.title, @@ -96,65 +161,236 @@ export function createInMemoryRelationalWorkspaceRepository( }, async persistItems({ workspace, source, items, occurredAt }) { assertWorkspaceAccess(state, workspace); - - const key = sourceKey(workspace.workspaceId, source.sourceId); - const existingSource = state.sources.get(key); - const providedSourceMatchesWorkspace = - source.workspaceId === workspace.workspaceId; - const sourceRecord = existingSource ?? { - workspaceId: workspace.workspaceId, - sourceId: source.sourceId, - kind: source.kind, - uri: providedSourceMatchesWorkspace ? source.uri : undefined, - title: providedSourceMatchesWorkspace ? source.title : undefined, - firstOccurredAt: occurredAt, - lastOccurredAt: occurredAt, - itemCount: 0, - }; - const uniqueItems = dedupeItems(items); - - const persisted = uniqueItems.map((item) => - persistItemRecord({ - state, - workspace, - source: sourceRecord, - item, - occurredAt, - }), + const snapshot = snapshotMutableRepositoryState(state); + + try { + const key = sourceKey(workspace.workspaceId, source.sourceId); + const existingSource = state.sources.get(key); + const providedSourceMatchesWorkspace = + source.workspaceId === workspace.workspaceId; + const sourceRecord = existingSource ?? { + workspaceId: workspace.workspaceId, + sourceId: source.sourceId, + rootSourceId: source.rootSourceId ?? source.sourceId, + parentSourceId: source.parentSourceId, + sourceGroupKey: source.sourceGroupKey ?? source.sourceId, + kind: source.kind, + uri: providedSourceMatchesWorkspace ? source.uri : undefined, + title: providedSourceMatchesWorkspace ? source.title : undefined, + firstOccurredAt: occurredAt, + lastOccurredAt: occurredAt, + itemCount: 0, + }; + const uniqueItems = dedupeIngestionItems(items); + + const persisted = uniqueItems.map((item, ordinal) => + persistItemRecord({ + state, + workspace, + source: sourceRecord, + item, + ordinal, + occurredAt, + }), + ); + + const relatedItems = Array.from(state.items.values()).filter( + (record) => + record.workspaceId === workspace.workspaceId && + record.sourceId === source.sourceId, + ); + + const nextSource = { + ...sourceRecord, + itemCount: relatedItems.length, + lastOccurredAt: + timestampToMillis(occurredAt) > + timestampToMillis(sourceRecord.lastOccurredAt) + ? occurredAt + : sourceRecord.lastOccurredAt, + }; + + state.sources.set(key, nextSource); + + return persisted.map((record) => ({ + ...record, + sourceId: nextSource.sourceId, + })); + } catch (error) { + restoreMutableRepositoryState(state, snapshot); + throw error; + } + }, + async persistArtifacts({ workspace, source, artifacts, occurredAt }) { + assertWorkspaceAccess(state, workspace); + const persistedSource = state.sources.get( + sourceKey(workspace.workspaceId, source.sourceId), ); + if (!persistedSource) { + throw new Error(`source ${source.sourceId} not found`); + } - const relatedItems = Array.from(state.items.values()).filter( - (record) => - record.workspaceId === workspace.workspaceId && - record.sourceId === source.sourceId, + const persisted = artifacts.map((artifact) => { + assertNonEmptyIdentifier(artifact.artifactId, "artifactId"); + const existing = state.artifacts.get( + artifactKey( + workspace.workspaceId, + source.sourceId, + artifact.artifactId, + ), + ); + const record: SourceArtifactRecord = { + workspaceId: workspace.workspaceId, + sourceId: source.sourceId, + artifactId: artifact.artifactId, + artifactType: artifact.artifactType, + mimeType: artifact.mimeType ?? existing?.mimeType, + storageUri: artifact.storageUri ?? existing?.storageUri, + checksum: artifact.checksum ?? existing?.checksum, + metadata: structuredClone( + artifact.metadata ?? existing?.metadata ?? {}, + ), + createdAt: existing?.createdAt ?? occurredAt, + updatedAt: occurredAt, + }; + + state.artifacts.set( + artifactKey( + workspace.workspaceId, + source.sourceId, + artifact.artifactId, + ), + record, + ); + return record; + }); + + return persisted; + }, + async persistRepresentations({ + workspace, + source, + representations, + occurredAt, + }) { + assertWorkspaceAccess(state, workspace); + const persistedSource = state.sources.get( + sourceKey(workspace.workspaceId, source.sourceId), ); + if (!persistedSource) { + throw new Error(`source ${source.sourceId} not found`); + } - const nextSource = { - ...sourceRecord, - itemCount: relatedItems.length, - lastOccurredAt: - timestampToMillis(occurredAt) > - timestampToMillis(sourceRecord.lastOccurredAt) - ? occurredAt - : sourceRecord.lastOccurredAt, - }; - - state.sources.set(key, nextSource); - - return persisted.map((record) => ({ - ...record, - sourceId: nextSource.sourceId, - })); + const persisted = representations.map((representation) => { + assertNonEmptyIdentifier( + representation.representationId, + "representationId", + ); + if (representation.artifactId !== undefined) { + assertNonEmptyIdentifier(representation.artifactId, "artifactId"); + const relatedArtifact = state.artifacts.get( + artifactKey( + workspace.workspaceId, + source.sourceId, + representation.artifactId, + ), + ); + if (!relatedArtifact) { + throw new Error( + `artifact ${representation.artifactId} not found for source ${source.sourceId}`, + ); + } + } + + const existing = state.representations.get( + representationKey( + workspace.workspaceId, + source.sourceId, + representation.representationId, + ), + ); + const record: SourceRepresentationRecord = { + workspaceId: workspace.workspaceId, + sourceId: source.sourceId, + representationId: representation.representationId, + representationType: representation.representationType, + artifactId: representation.artifactId ?? existing?.artifactId, + mimeType: representation.mimeType ?? existing?.mimeType, + language: representation.language ?? existing?.language, + metadata: structuredClone( + representation.metadata ?? existing?.metadata ?? {}, + ), + createdAt: existing?.createdAt ?? occurredAt, + updatedAt: occurredAt, + }; + + state.representations.set( + representationKey( + workspace.workspaceId, + source.sourceId, + representation.representationId, + ), + record, + ); + return record; + }); + + return persisted; }, async getPersistedSource({ workspaceId, sourceId }) { return state.sources.get(sourceKey(workspaceId, sourceId)) ?? null; }, + async getPersistedArtifacts({ workspaceId, sourceId }) { + return Array.from(state.artifacts.values()) + .filter( + (record) => + record.workspaceId === workspaceId && record.sourceId === sourceId, + ) + .sort((a, b) => a.artifactId.localeCompare(b.artifactId)); + }, + async getPersistedRepresentations({ workspaceId, sourceId }) { + return Array.from(state.representations.values()) + .filter( + (record) => + record.workspaceId === workspaceId && record.sourceId === sourceId, + ) + .sort((a, b) => a.representationId.localeCompare(b.representationId)); + }, async getPersistedItems({ workspaceId, sourceId }) { return Array.from(state.items.values()).filter( (record) => record.workspaceId === workspaceId && record.sourceId === sourceId, ); }, + async getCanonicalMultimodalRecordSet({ + workspaceId, + sourceId, + }: { + workspaceId: string; + sourceId: string; + }) { + const source = await this.getPersistedSource({ workspaceId, sourceId }); + if (!source) { + return null; + } + + const artifacts = + (await this.getPersistedArtifacts?.({ workspaceId, sourceId })) ?? []; + const representations = + (await this.getPersistedRepresentations?.({ workspaceId, sourceId })) ?? + []; + const retrievalItems = await this.getPersistedItems({ + workspaceId, + sourceId, + }); + + return { + source, + artifacts, + representations, + retrievalItems, + } satisfies CanonicalMultimodalRecordSet; + }, async findSources({ workspaceId, queryText, sourceIds, limit }) { const queryTerms = tokenizeQuery(queryText); @@ -165,7 +401,7 @@ export function createInMemoryRelationalWorkspaceRepository( ) .map((source) => ({ source, - score: scoreSourceCandidate(source, queryTerms), + score: scoreSourceRecordForQuery(source, queryTerms), })) .filter((candidate) => candidate.score > 0) .sort((left, right) => right.score - left.score) @@ -182,7 +418,7 @@ export function createInMemoryRelationalWorkspaceRepository( ) .map((item) => ({ item, - score: scoreItemCandidate(item, queryTerms), + score: scoreItemRecordForQuery(item, queryTerms), })) .filter((candidate) => candidate.score > 0) .sort((left, right) => right.score - left.score) @@ -191,6 +427,8 @@ export function createInMemoryRelationalWorkspaceRepository( }, async reset() { state.sources.clear(); + state.artifacts.clear(); + state.representations.clear(); state.items.clear(); }, }; @@ -217,19 +455,47 @@ function persistItemRecord(input: { workspace: WorkspaceContext; source: SourceRecord; item: IngestionItem; + ordinal: number; occurredAt: string; }): IngestionItemRecord { + assertNonEmptyIdentifier(input.item.itemId, "itemId"); const key = itemKey( input.workspace.workspaceId, input.source.sourceId, input.item.itemId, ); const existing = input.state.items.get(key); + if (input.item.representationId !== undefined) { + assertNonEmptyIdentifier(input.item.representationId, "representationId"); + const relatedRepresentation = input.state.representations.get( + representationKey( + input.workspace.workspaceId, + input.source.sourceId, + input.item.representationId, + ), + ); + if (!relatedRepresentation) { + throw new Error( + `representation ${input.item.representationId} not found for source ${input.source.sourceId}`, + ); + } + } const record: IngestionItemRecord = { workspaceId: input.workspace.workspaceId, sourceId: input.source.sourceId, itemId: input.item.itemId, + representationId: input.item.representationId ?? existing?.representationId, + ordinal: input.item.ordinal ?? existing?.ordinal ?? input.ordinal, + parentItemId: input.item.parentItemId ?? existing?.parentItemId, + packetKey: + input.item.packetKey ?? + existing?.packetKey ?? + input.source.sourceGroupKey ?? + input.source.sourceId, + sectionKey: input.item.sectionKey ?? existing?.sectionKey, + provenanceLocation: + input.item.provenanceLocation ?? existing?.provenanceLocation, content: input.item.content, contentType: input.item.contentType, metadata: structuredClone(input.item.metadata ?? {}), @@ -258,18 +524,103 @@ function itemKey( return `${workspaceId}:${sourceId}:${itemId}`; } -function dedupeItems(items: IngestionItem[]): IngestionItem[] { +function artifactKey( + workspaceId: string, + sourceId: string, + artifactId: string, +): string { + return `${workspaceId}:${sourceId}:artifact:${artifactId}`; +} + +function representationKey( + workspaceId: string, + sourceId: string, + representationId: string, +): string { + return `${workspaceId}:${sourceId}:representation:${representationId}`; +} + +function assertNonEmptyIdentifier(value: string, label: string): void { + if (value.trim().length === 0) { + throw new Error(`${label} must not be empty`); + } +} + +function snapshotMutableRepositoryState( + state: RepositoryState, +): Pick< + RepositoryState, + "sources" | "artifacts" | "representations" | "items" +> { + return { + sources: cloneRecordMap(state.sources), + artifacts: cloneRecordMap(state.artifacts), + representations: cloneRecordMap(state.representations), + items: cloneRecordMap(state.items), + }; +} + +function restoreMutableRepositoryState( + state: RepositoryState, + snapshot: Pick< + RepositoryState, + "sources" | "artifacts" | "representations" | "items" + >, +): void { + state.sources = snapshot.sources; + state.artifacts = snapshot.artifacts; + state.representations = snapshot.representations; + state.items = snapshot.items; +} + +function cloneRecordMap(input: Map): Map { + return new Map( + Array.from(input.entries()).map(([key, value]) => [ + key, + structuredClone(value), + ]), + ); +} + +export function dedupeIngestionItems(items: IngestionItem[]): IngestionItem[] { const seen = new Map(); for (const item of items) { - if (!seen.has(item.itemId)) { - seen.set(item.itemId, item); - } + const existing = seen.get(item.itemId); + seen.set(item.itemId, mergeIngestionItems(existing, item)); } return Array.from(seen.values()); } +function mergeIngestionItems( + existing: IngestionItem | undefined, + incoming: IngestionItem, +): IngestionItem { + if (!existing) { + return structuredClone(incoming); + } + + // Duplicate item ids use a first-wins merge: keep the earliest ingestion lineage + // fields and only fill gaps from later duplicates, while cloning the result so + // dedupe does not mutate caller-owned objects. + return { + ...existing, + representationId: existing.representationId ?? incoming.representationId, + ordinal: existing.ordinal ?? incoming.ordinal, + parentItemId: existing.parentItemId ?? incoming.parentItemId, + packetKey: existing.packetKey ?? incoming.packetKey, + sectionKey: existing.sectionKey ?? incoming.sectionKey, + provenanceLocation: + existing.provenanceLocation ?? incoming.provenanceLocation, + metadata: + existing.metadata === undefined || + Object.keys(existing.metadata).length === 0 + ? structuredClone(incoming.metadata ?? existing.metadata ?? {}) + : structuredClone(existing.metadata), + }; +} + function timestampToMillis(value: string): number { return new Date(value).getTime(); } @@ -281,19 +632,19 @@ function tokenizeQuery(queryText: string): string[] { .filter((term) => term.length > 0); } -function scoreSourceCandidate( +export function scoreSourceRecordForQuery( source: SourceRecord, queryTerms: string[], ): number { return scoreTextFields([source.title, source.uri, source.kind], queryTerms); } -function scoreItemCandidate( +export function scoreItemRecordForQuery( item: IngestionItemRecord, queryTerms: string[], ): number { - const metadataValues = Object.values(item.metadata).map((value) => - value === null ? "" : String(value), + const metadataValues = Object.values(item.metadata).flatMap((value) => + flattenMetadataSearchValues(value), ); return scoreTextFields( @@ -302,6 +653,61 @@ function scoreItemCandidate( ); } +function flattenMetadataSearchValues(value: unknown): string[] { + if (value === null || value === undefined) { + return []; + } + + if ( + typeof value === "string" || + typeof value === "number" || + typeof value === "boolean" + ) { + return [String(value)]; + } + + if (Array.isArray(value)) { + return value.flatMap((entry) => flattenMetadataSearchValues(entry)); + } + + if (typeof value === "object") { + return [stableSerializeSearchValue(value as Record)]; + } + + return [String(value)]; +} + +function stableSerializeSearchValue( + value: Record | unknown[], +): string { + if (Array.isArray(value)) { + return `[${value.map((entry) => serializeSearchEntry(entry)).join(",")}]`; + } + + return `{${Object.entries(value) + .sort(([left], [right]) => left.localeCompare(right)) + .map( + ([key, entry]) => `${JSON.stringify(key)}:${serializeSearchEntry(entry)}`, + ) + .join(",")}}`; +} + +function serializeSearchEntry(value: unknown): string { + if (value === null || value === undefined) { + return JSON.stringify(value); + } + + if (Array.isArray(value)) { + return stableSerializeSearchValue(value); + } + + if (typeof value === "object") { + return stableSerializeSearchValue(value as Record); + } + + return JSON.stringify(value); +} + function scoreTextFields( fields: Array, queryTerms: string[], @@ -310,12 +716,13 @@ function scoreTextFields( return 0; } - const haystack = fields - .filter((value): value is string => value !== undefined) - .join(" ") - .toLowerCase(); + const haystackTokens = new Set( + fields + .filter((value): value is string => value !== undefined) + .flatMap((value) => tokenizeQuery(value)), + ); return queryTerms.reduce((score, term) => { - return haystack.includes(term) ? score + 1 : score; + return haystackTokens.has(term) ? score + 1 : score; }, 0); } diff --git a/src/subsystems/supabase/schema.ts b/src/subsystems/supabase/schema.ts index 2629bcd..ba1dc73 100644 --- a/src/subsystems/supabase/schema.ts +++ b/src/subsystems/supabase/schema.ts @@ -32,6 +32,9 @@ export interface WorkspaceMembershipRecord { export const sourceRecordShape = { workspaceId: "string", sourceId: "string", + rootSourceId: "string?", + parentSourceId: "string?", + sourceGroupKey: "string?", kind: "source-kind", uri: "string?", title: "string?", @@ -43,6 +46,9 @@ export const sourceRecordShape = { export interface SourceRecord { workspaceId: string; sourceId: string; + rootSourceId?: string; + parentSourceId?: string; + sourceGroupKey?: string; kind: IngestionSource["kind"]; uri?: string; title?: string; @@ -51,10 +57,68 @@ export interface SourceRecord { itemCount: number; } +export const sourceArtifactRecordShape = { + workspaceId: "string", + sourceId: "string", + artifactId: "string", + artifactType: "artifact-type", + mimeType: "string?", + storageUri: "string?", + checksum: "string?", + metadata: "json", + createdAt: "iso-datetime", + updatedAt: "iso-datetime", +} as const; + +export interface SourceArtifactRecord { + workspaceId: string; + sourceId: string; + artifactId: string; + artifactType: string; + mimeType?: string; + storageUri?: string; + checksum?: string; + metadata: Record; + createdAt: string; + updatedAt: string; +} + +export const sourceRepresentationRecordShape = { + workspaceId: "string", + sourceId: "string", + representationId: "string", + representationType: "representation-type", + artifactId: "string?", + mimeType: "string?", + language: "string?", + metadata: "json", + createdAt: "iso-datetime", + updatedAt: "iso-datetime", +} as const; + +export interface SourceRepresentationRecord { + workspaceId: string; + sourceId: string; + representationId: string; + representationType: string; + artifactId?: string; + mimeType?: string; + language?: string; + metadata: Record; + createdAt: string; + updatedAt: string; +} + export const ingestionItemRecordShape = { workspaceId: "string", sourceId: "string", itemId: "string", + representationId: "string?", + ordinal: "number?", + parentItemId: "string?", + packetKey: "string?", + sectionKey: "string?", + provenanceLocation: "string?", content: "string", contentType: "content-type", metadata: "json", @@ -67,6 +131,12 @@ export interface IngestionItemRecord { workspaceId: string; sourceId: string; itemId: string; + representationId?: string; + ordinal?: number; + parentItemId?: string; + packetKey?: string; + sectionKey?: string; + provenanceLocation?: string; content: string; contentType: IngestionItem["contentType"]; metadata: Record; @@ -74,3 +144,12 @@ export interface IngestionItemRecord { createdAt: string; updatedAt: string; } + +export interface CanonicalMultimodalRecordSet { + source: SourceRecord; + artifacts: SourceArtifactRecord[]; + representations: SourceRepresentationRecord[]; + retrievalItems: IngestionItemRecord[]; +} + +export const retrievalItemRecordShape = ingestionItemRecordShape; diff --git a/src/subsystems/supabase/storage-port.ts b/src/subsystems/supabase/storage-port.ts new file mode 100644 index 0000000..01019fd --- /dev/null +++ b/src/subsystems/supabase/storage-port.ts @@ -0,0 +1,57 @@ +export interface StoredArtifact { + workspaceId: string; + sourceId: string; + artifactId: string; + body: Buffer; + mimeType?: string; +} + +export interface ArtifactStoragePort { + putArtifact(input: StoredArtifact): Promise; + getArtifact(input: { + workspaceId: string; + sourceId: string; + artifactId: string; + }): Promise; + deleteArtifact(input: { + workspaceId: string; + sourceId: string; + artifactId: string; + }): Promise; +} + +export function createInMemoryArtifactStoragePort(): ArtifactStoragePort { + // Test/local-only store: artifacts stay in-memory and unbounded, so this is not for production persistence. + const artifacts = new Map(); + + return { + async putArtifact(input) { + artifacts.set(makeArtifactKey(input), { + ...input, + body: Buffer.from(input.body), + }); + }, + async getArtifact(input) { + const existing = artifacts.get(makeArtifactKey(input)); + if (!existing) { + return null; + } + + return { + ...existing, + body: Buffer.from(existing.body), + }; + }, + async deleteArtifact(input) { + artifacts.delete(makeArtifactKey(input)); + }, + }; +} + +function makeArtifactKey(input: { + workspaceId: string; + sourceId: string; + artifactId: string; +}): string { + return `${input.workspaceId}\0${input.sourceId}\0${input.artifactId}`; +} diff --git a/src/workspace/context.ts b/src/workspace/context.ts index 50ed9ca..41e5fda 100644 --- a/src/workspace/context.ts +++ b/src/workspace/context.ts @@ -12,9 +12,29 @@ export interface WorkspaceContext { correlationId?: string; } +export interface WorkspaceContextResolutionInput { + workspaceId?: string; + authorizationHeader?: string; + requestId?: string; + correlationId?: string; +} + +export type WorkspaceResolutionResult = + | { + ok: true; + context: WorkspaceContext; + } + | ({ + ok: false; + code: string; + } & Partial<{ + workspaceId: string; + requestId: string; + correlationId: string; + }>); + export interface WorkspaceContextResolver { - resolve(input: { - workspaceId: string; - actorId: string; - }): Promise; + resolve( + input: WorkspaceContextResolutionInput, + ): Promise; } diff --git a/test/ingestion.relational.test.mjs b/test/ingestion.relational.test.mjs index 762f25f..54cdf42 100644 --- a/test/ingestion.relational.test.mjs +++ b/test/ingestion.relational.test.mjs @@ -3,8 +3,18 @@ import assert from 'node:assert/strict'; import * as schema from '../dist/subsystems/supabase/schema.js'; import { createIngestionService } from '../dist/index.js'; -import { relationalWorkspacePortMethodNames } from '../dist/subsystems/supabase/port.js'; +import { + relationalCanonicalEntityNames, + isRepresentationAwareRelationalWorkspacePort, + relationalWorkspacePortMethodNames, +} from '../dist/subsystems/supabase/port.js'; import { createInMemoryRelationalWorkspaceRepository } from '../dist/subsystems/supabase/repository.js'; +import { + createDefaultExtractorRegistry, + defaultRepresentationTypes, + normalizeRepresentationMetadata, +} from '../dist/ingestion/representations.js'; +import { createInMemoryArtifactStoragePort } from '../dist/subsystems/supabase/storage-port.js'; test('relational schema exports canonical workspace/source/item entities', () => { assert.ok(schema.workspaceRecordShape); @@ -13,13 +23,53 @@ test('relational schema exports canonical workspace/source/item entities', () => assert.ok(schema.ingestionItemRecordShape); }); +test('relational schema exports canonical source, artifact, representation, and retrieval item entities', () => { + assert.deepEqual(relationalCanonicalEntityNames, [ + 'source', + 'artifact', + 'representation', + 'retrieval-item', + ]); + assert.ok(schema.sourceArtifactRecordShape); + assert.ok(schema.sourceRepresentationRecordShape); + assert.ok(schema.retrievalItemRecordShape); + assert.equal(schema.sourceArtifactRecordShape.artifactType, 'artifact-type'); + assert.equal( + schema.sourceRepresentationRecordShape.representationType, + 'representation-type', + ); + assert.equal(schema.retrievalItemRecordShape.representationId, 'string?'); + assert.deepEqual( + schema.ingestionItemRecordShape, + schema.retrievalItemRecordShape, + ); +}); + +test('ingestion schema exposes retrieval-oriented source and item fields', () => { + assert.equal(schema.sourceRecordShape.rootSourceId, 'string?'); + assert.equal(schema.sourceRecordShape.parentSourceId, 'string?'); + assert.equal(schema.sourceRecordShape.sourceGroupKey, 'string?'); + + assert.equal(schema.ingestionItemRecordShape.ordinal, 'number?'); + assert.equal(schema.ingestionItemRecordShape.parentItemId, 'string?'); + assert.equal(schema.ingestionItemRecordShape.packetKey, 'string?'); + assert.equal(schema.ingestionItemRecordShape.sectionKey, 'string?'); + assert.equal(schema.ingestionItemRecordShape.provenanceLocation, 'string?'); +}); + test('relational port supports canonical source and item persistence', () => { assert.deepEqual(relationalWorkspacePortMethodNames, [ 'getWorkspace', 'validateAccess', + 'persistCanonicalIngestion', 'upsertSource', + 'persistArtifacts', + 'persistRepresentations', 'persistItems', 'getPersistedSource', + 'getPersistedArtifacts', + 'getPersistedRepresentations', + 'getCanonicalMultimodalRecordSet', 'getPersistedItems', 'findSources', 'findItems', @@ -27,6 +77,188 @@ test('relational port supports canonical source and item persistence', () => { ]); }); +test('foundation exports extractor-ready representation helpers and storage boundary', async () => { + assert.deepEqual(defaultRepresentationTypes, [ + 'normalized-text', + 'html-cleaned', + 'pdf-page-text', + 'ocr-text', + 'table-projection', + 'transcript-segment', + 'meeting-note-segment', + 'caption-text', + ]); + + assert.deepEqual( + normalizeRepresentationMetadata({ + pageNumber: 1, + flags: ['ocr', 'derived'], + nested: { region: 'page-1' }, + }), + { + flags: ['ocr', 'derived'], + nested: { region: 'page-1' }, + pageNumber: 1, + }, + ); + + const registry = createDefaultExtractorRegistry(); + assert.deepEqual(registry.listTypes(), [ + 'caption-text', + 'html-cleaned', + 'meeting-note-segment', + 'normalized-text', + 'ocr-text', + 'pdf-page-text', + 'table-projection', + 'transcript-segment', + ]); + + const storage = createInMemoryArtifactStoragePort(); + await storage.putArtifact({ + workspaceId: 'ws-1', + sourceId: 'source-1', + artifactId: 'artifact-1', + body: Buffer.from('artifact payload'), + mimeType: 'application/pdf', + }); + const loaded = await storage.getArtifact({ + workspaceId: 'ws-1', + sourceId: 'source-1', + artifactId: 'artifact-1', + }); + assert.equal(loaded?.mimeType, 'application/pdf'); + assert.equal(loaded?.body.toString('utf8'), 'artifact payload'); + + await storage.putArtifact({ + workspaceId: 'ws:1', + sourceId: 'src', + artifactId: 'art', + body: Buffer.from('first collision candidate'), + }); + await storage.putArtifact({ + workspaceId: 'ws', + sourceId: '1:src', + artifactId: 'art', + body: Buffer.from('second collision candidate'), + }); + const firstCollisionCandidate = await storage.getArtifact({ + workspaceId: 'ws:1', + sourceId: 'src', + artifactId: 'art', + }); + const secondCollisionCandidate = await storage.getArtifact({ + workspaceId: 'ws', + sourceId: '1:src', + artifactId: 'art', + }); + assert.equal( + firstCollisionCandidate?.body.toString('utf8'), + 'first collision candidate', + ); + assert.equal( + secondCollisionCandidate?.body.toString('utf8'), + 'second collision candidate', + ); + + const repository = createInMemoryRelationalWorkspaceRepository(); + assert.equal(isRepresentationAwareRelationalWorkspacePort(repository), true); +}); + +test('default extractor registry derives stable distinct representation ids from input context', async () => { + const registry = createDefaultExtractorRegistry(); + const extractor = registry.get('pdf-page-text'); + + assert.ok(extractor); + + const first = await extractor.extract({ + sourceId: 'source-1', + artifactId: 'artifact-1', + metadata: { pageNumber: 1 }, + }); + const second = await extractor.extract({ + sourceId: 'source-1', + artifactId: 'artifact-1', + metadata: { pageNumber: 2 }, + }); + const repeated = await extractor.extract({ + sourceId: 'source-1', + artifactId: 'artifact-1', + metadata: { pageNumber: 1 }, + }); + + assert.notEqual(first[0].representationId, second[0].representationId); + assert.equal(first[0].representationId, repeated[0].representationId); +}); + +test('default extractor registry keeps representation ids bounded for large metadata', async () => { + const registry = createDefaultExtractorRegistry(); + const extractor = registry.get('pdf-page-text'); + + assert.ok(extractor); + + const [representation] = await extractor.extract({ + sourceId: 'source-1', + artifactId: 'artifact-1', + metadata: { + pageNumber: 1, + payload: 'x'.repeat(4096), + }, + }); + + assert.match(representation.representationId, /^source-1:pdf-page-text:[a-f0-9]{16}$/); +}); + +test('representation-aware port guard requires canonical multimodal getter', () => { + const incompletePort = { + async getWorkspace() { + return null; + }, + async validateAccess() { + return true; + }, + async persistCanonicalIngestion() { + throw new Error('not implemented'); + }, + async upsertSource() { + throw new Error('not implemented'); + }, + async persistArtifacts() { + return []; + }, + async persistRepresentations() { + return []; + }, + async persistItems() { + return []; + }, + async getPersistedSource() { + return null; + }, + async getPersistedArtifacts() { + return []; + }, + async getPersistedRepresentations() { + return []; + }, + async getPersistedItems() { + return []; + }, + async findSources() { + return []; + }, + async findItems() { + return []; + }, + async reset() {}, + }; + + assert.equal( + isRepresentationAwareRelationalWorkspacePort(incompletePort), + false, + ); +}); + test('relational repository persists a canonical source and item records', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ @@ -84,6 +316,38 @@ test('relational repository persists a canonical source and item records', async assert.equal(persistedItems[0].content, 'hello world'); }); +test('in-memory multimodal helpers reject missing source parity with postgres', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + await assert.rejects( + () => + repository.persistArtifacts({ + workspace, + source: { sourceId: 'missing-source' }, + artifacts: [ + { + artifactId: 'artifact-1', + artifactType: 'raw-file', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /source missing-source not found/, + ); +}); + test('relational repository reuses canonical source identity on re-ingestion', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ @@ -303,6 +567,112 @@ test('ingest persists canonical source and item data through the relational laye assert.equal(persistedItems.length, 1); }); +test('ingest persists artifacts and representations before retrieval item fanout', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const graphCalls = []; + const vectorCalls = []; + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts(input) { + graphCalls.push({ + input, + snapshot: await repository.getCanonicalMultimodalRecordSet?.({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }), + }); + }, + }, + vectorIndex: { + async upsertDocuments(input) { + vectorCalls.push({ + input, + snapshot: await repository.getCanonicalMultimodalRecordSet?.({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }), + }); + }, + }, + now: () => new Date('2026-03-23T00:00:00.000Z'), + }); + + const receipt = await service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'pdf', + title: 'Quarterly Review', + }, + artifacts: [ + { + artifactId: 'artifact-1', + artifactType: 'pdf-original', + mimeType: 'application/pdf', + storageUri: 'memory://artifact-1.pdf', + metadata: { labels: ['finance'] }, + }, + ], + representations: [ + { + representationId: 'representation-1', + representationType: 'pdf-page-text', + artifactId: 'artifact-1', + metadata: { pageNumber: 1 }, + }, + ], + items: [ + { + itemId: 'item-1', + representationId: 'representation-1', + content: 'quarterly review text projection', + contentType: 'text/plain', + metadata: { pageNumber: 1 }, + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + assert.equal(receipt.acceptedItems, 1); + const multimodal = await repository.getCanonicalMultimodalRecordSet?.({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + assert.equal(multimodal?.artifacts.length, 1); + assert.equal(multimodal?.representations.length, 1); + assert.equal(multimodal?.retrievalItems[0].representationId, 'representation-1'); + assert.equal(graphCalls.length, 1); + assert.equal(vectorCalls.length, 1); + assert.ok( + graphCalls.every( + (call) => + call.snapshot?.artifacts.length === 1 && + call.snapshot?.representations.length === 1 && + call.snapshot?.retrievalItems[0].representationId === 'representation-1', + ), + ); + assert.ok( + vectorCalls.every( + (call) => + call.snapshot?.artifacts.length === 1 && + call.snapshot?.representations.length === 1 && + call.snapshot?.retrievalItems[0].representationId === 'representation-1', + ), + ); +}); + test('ingest derives the receipt and downstream fanout from persisted canonical items', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ @@ -375,6 +745,64 @@ test('ingest derives the receipt and downstream fanout from persisted canonical assert.equal(receipt.receivedAt, '2026-03-23T01:00:00.000Z'); }); +test('ingest preserves later duplicate lineage fields while collapsing duplicate item ids', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, + now: () => new Date('2026-03-23T01:00:00.000Z'), + }); + + await service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + }, + representations: [ + { + representationId: 'representation-1', + representationType: 'normalized-text', + }, + ], + items: [ + { + itemId: 'item-1', + content: 'hello world', + contentType: 'text/plain', + }, + { + itemId: 'item-1', + representationId: 'representation-1', + content: 'duplicate should collapse', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const persistedItems = await repository.getPersistedItems({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.equal(persistedItems.length, 1); + assert.equal(persistedItems[0].content, 'hello world'); + assert.equal(persistedItems[0].representationId, 'representation-1'); +}); + test('ingest rejects unauthorized workspace writes before persistence', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ @@ -514,56 +942,158 @@ test('relational repository rejects direct item writes without workspace members ); }); -test('relational repository normalizes source identity to the target workspace during direct item writes', async () => { +test('relational repository rejects artifact writes without workspace membership', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, - { workspaceId: 'ws-2', slug: 'ws-2', displayName: 'Workspace Two' }, ], memberships: [ - { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, - { workspaceId: 'ws-2', actorId: 'user-2', role: 'member' }, + { workspaceId: 'ws-1', actorId: 'owner-1', role: 'owner' }, ], }); - const workspaceOne = { + const authorizedWorkspace = { workspaceId: 'ws-1', - actor: { actorId: 'user-1', role: 'member' }, + actor: { actorId: 'owner-1', role: 'owner' }, }; - const workspaceTwo = { - workspaceId: 'ws-2', - actor: { actorId: 'user-2', role: 'member' }, + const unauthorizedWorkspace = { + workspaceId: 'ws-1', + actor: { actorId: 'intruder', role: 'member' }, }; - const foreignSource = await repository.upsertSource({ - workspace: workspaceTwo, + const source = await repository.upsertSource({ + workspace: authorizedWorkspace, source: { - sourceId: 'shared-source', + sourceId: 'source-1', kind: 'document', - title: 'Workspace Two Source', - uri: 'https://example.com/ws-2', }, occurredAt: '2026-03-23T00:00:00.000Z', }); - await repository.persistItems({ - workspace: workspaceOne, - source: foreignSource, - items: [ - { - itemId: 'item-1', - content: 'workspace one content', - contentType: 'text/plain', - }, - ], - occurredAt: '2026-03-24T00:00:00.000Z', - }); + await assert.rejects( + () => + repository.persistArtifacts?.({ + workspace: unauthorizedWorkspace, + source, + artifacts: [ + { + artifactId: 'artifact-unauthorized', + artifactType: 'pdf-original', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /access/i, + ); - const workspaceOneSource = await repository.getPersistedSource({ + const multimodal = await repository.getCanonicalMultimodalRecordSet?.({ workspaceId: 'ws-1', - sourceId: 'shared-source', + sourceId: 'source-1', }); - const workspaceTwoSource = await repository.getPersistedSource({ + assert.equal(multimodal?.artifacts.length ?? 0, 0); +}); + +test('relational repository rejects representation writes without workspace membership', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'owner-1', role: 'owner' }, + ], + }); + + const authorizedWorkspace = { + workspaceId: 'ws-1', + actor: { actorId: 'owner-1', role: 'owner' }, + }; + const unauthorizedWorkspace = { + workspaceId: 'ws-1', + actor: { actorId: 'intruder', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace: authorizedWorkspace, + source: { + sourceId: 'source-1', + kind: 'document', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await assert.rejects( + () => + repository.persistRepresentations?.({ + workspace: unauthorizedWorkspace, + source, + representations: [ + { + representationId: 'representation-unauthorized', + representationType: 'normalized-text', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /access/i, + ); + + const multimodal = await repository.getCanonicalMultimodalRecordSet?.({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + assert.equal(multimodal?.representations.length ?? 0, 0); +}); + +test('relational repository normalizes source identity to the target workspace during direct item writes', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + { workspaceId: 'ws-2', slug: 'ws-2', displayName: 'Workspace Two' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + { workspaceId: 'ws-2', actorId: 'user-2', role: 'member' }, + ], + }); + + const workspaceOne = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + const workspaceTwo = { + workspaceId: 'ws-2', + actor: { actorId: 'user-2', role: 'member' }, + }; + + const foreignSource = await repository.upsertSource({ + workspace: workspaceTwo, + source: { + sourceId: 'shared-source', + kind: 'document', + title: 'Workspace Two Source', + uri: 'https://example.com/ws-2', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistItems({ + workspace: workspaceOne, + source: foreignSource, + items: [ + { + itemId: 'item-1', + content: 'workspace one content', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-24T00:00:00.000Z', + }); + + const workspaceOneSource = await repository.getPersistedSource({ + workspaceId: 'ws-1', + sourceId: 'shared-source', + }); + const workspaceTwoSource = await repository.getPersistedSource({ workspaceId: 'ws-2', sourceId: 'shared-source', }); @@ -577,6 +1107,78 @@ test('relational repository normalizes source identity to the target workspace d assert.equal(workspaceTwoSource?.itemCount, 0); }); +test('relational repository rolls back in-memory direct item writes when a later item fails validation', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Atomic Source', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistRepresentations?.({ + workspace, + source, + representations: [ + { + representationId: 'representation-1', + representationType: 'normalized-text', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await assert.rejects( + () => + repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + representationId: 'representation-1', + content: 'valid content', + contentType: 'text/plain', + }, + { + itemId: 'item-2', + representationId: 'missing-representation', + content: 'invalid content', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /representation missing-representation not found/i, + ); + + const persistedItems = await repository.getPersistedItems({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.deepEqual( + persistedItems.map((item) => item.itemId), + [], + ); +}); + test('ingest fans out canonical identifiers to Graphiti and Qdrant after persistence', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ @@ -647,18 +1249,320 @@ test('ingest fans out canonical identifiers to Graphiti and Qdrant after persist documentId: 'source-1:item-1', text: 'hello world', embeddingModel: 'pending', - metadata: { - workspaceId: 'ws-1', - sourceId: 'source-1', - itemId: 'item-1', - contentType: 'text/plain', - }, + metadata: { + workspaceId: 'ws-1', + sourceId: 'source-1', + itemId: 'item-1', + contentType: 'text/plain', + occurredAt: '2026-03-23T00:00:00.000Z', + packetKey: 'source-1', }, + }, ], }, ]); }); +test('ingestion persists retrieval-oriented defaults for source and items', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, + now: () => new Date('2026-03-23T02:00:00.000Z'), + }); + + await service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Packet Source', + }, + items: [ + { + itemId: 'item-1', + content: 'first paragraph', + contentType: 'text/plain', + }, + { + itemId: 'item-2', + content: 'second paragraph', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T01:00:00.000Z', + }); + + const persistedSource = await repository.getPersistedSource({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + const persistedItems = await repository.getPersistedItems({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.equal(persistedSource?.rootSourceId, 'source-1'); + assert.equal(persistedSource?.sourceGroupKey, 'source-1'); + assert.deepEqual( + persistedItems.map((item) => ({ + itemId: item.itemId, + ordinal: item.ordinal, + packetKey: item.packetKey, + })), + [ + { itemId: 'item-1', ordinal: 0, packetKey: 'source-1' }, + { itemId: 'item-2', ordinal: 1, packetKey: 'source-1' }, + ], + ); +}); + +test('ingest rejects orphan representation references consistently in memory', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, + now: () => new Date('2026-03-23T00:00:00.000Z'), + }); + + await assert.rejects( + () => + service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'pdf', + }, + items: [ + { + itemId: 'item-1', + representationId: 'missing-representation', + content: 'orphan text', + contentType: 'text/plain', + }, + ], + }), + /representation missing-representation not found/, + ); +}); + +test('ingest rejects empty-string representation references consistently in memory', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, + now: () => new Date('2026-03-23T00:00:00.000Z'), + }); + + await assert.rejects( + () => + service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'pdf', + }, + items: [ + { + itemId: 'item-1', + representationId: '', + content: 'orphan text', + contentType: 'text/plain', + }, + ], + }), + /representationId must not be empty/, + ); +}); + +test('ingest rejects empty-string canonical ids in memory repository', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await assert.rejects( + () => + repository.persistArtifacts({ + workspace, + source, + artifacts: [ + { + artifactId: ' ', + artifactType: 'raw-file', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /artifactId must not be empty/, + ); +}); + +test('in-memory lexical retrieval matches nested metadata values', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { sourceId: 'source-1', kind: 'image', title: 'Screenshot' }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'visual grounding text', + contentType: 'text/plain', + metadata: { + region: { label: 'invoice-footer' }, + }, + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const candidates = await repository.findItems({ + workspaceId: 'ws-1', + queryText: 'invoice footer', + limit: 5, + }); + + assert.equal(candidates.length, 1); + assert.equal(candidates[0].item.itemId, 'item-1'); +}); + +test('ingestion preserves an existing custom packetKey when re-ingesting without one', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, + now: () => new Date('2026-03-23T05:00:00.000Z'), + }); + + await service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + sourceGroupKey: 'source-default', + }, + items: [ + { + itemId: 'item-1', + content: 'first paragraph', + contentType: 'text/plain', + packetKey: 'custom-packet', + }, + ], + }); + + await service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + sourceGroupKey: 'source-default', + }, + items: [ + { + itemId: 'item-1', + content: 'updated paragraph', + contentType: 'text/plain', + }, + ], + }); + + const persistedItems = await repository.getPersistedItems({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.equal(persistedItems[0].packetKey, 'custom-packet'); +}); + test('ingest still returns accepted after non-authoritative fanout failures', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ @@ -765,30 +1669,102 @@ test('ingest still attempts vector fanout when graph fanout fails', async () => test('createRuntime exposes the active ingestion service through the runtime app', async () => { const { createRuntime } = await import('../dist/index.js'); const runtime = createRuntime({ + env: { + serviceName: 'secondbrain-engine', + mode: 'development', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, now: () => new Date('2026-03-23T00:00:00.000Z'), }); - assert.equal(typeof runtime.app.services.ingestion.ingest, 'function'); + try { + assert.equal(typeof runtime.app.services.ingestion.ingest, 'function'); - const receipt = await runtime.app.services.ingestion.ingest({ - workspace: { - workspaceId: 'default-workspace', - actor: { actorId: 'system', role: 'owner' }, - }, - source: { - sourceId: 'runtime-source', - kind: 'document', - }, - items: [ - { - itemId: 'runtime-item', - content: 'runtime content', - contentType: 'text/plain', + const receipt = await runtime.app.services.ingestion.ingest({ + workspace: { + workspaceId: 'default-workspace', + actor: { actorId: 'system', role: 'owner' }, }, - ], + source: { + sourceId: 'runtime-source', + kind: 'document', + }, + items: [ + { + itemId: 'runtime-item', + content: 'runtime content', + contentType: 'text/plain', + }, + ], + }); + + assert.equal(receipt.workspaceId, 'default-workspace'); + assert.equal(receipt.sourceId, 'runtime-source'); + assert.equal(receipt.receivedAt, '2026-03-23T00:00:00.000Z'); + } finally { + await runtime.close(); + } +}); + +test('ingest rejects representation-linked items for non-representation-aware stores', async () => { + const service = createIngestionService({ + workspaceStore: { + async getWorkspace() { + return { + workspaceId: 'ws-1', + slug: 'ws-1', + displayName: 'Workspace One', + }; + }, + async validateAccess() { + return true; + }, + async upsertSource() { + throw new Error('should not reach upsertSource'); + }, + async persistItems() { + throw new Error('should not reach persistItems'); + }, + async getPersistedSource() { + return null; + }, + async getPersistedItems() { + return []; + }, + async findSources() { + return []; + }, + async findItems() { + return []; + }, + async reset() {}, + }, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, }); - assert.equal(receipt.workspaceId, 'default-workspace'); - assert.equal(receipt.sourceId, 'runtime-source'); - assert.equal(receipt.receivedAt, '2026-03-23T00:00:00.000Z'); + await assert.rejects( + () => + service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + }, + items: [ + { + itemId: 'item-1', + representationId: 'representation-1', + content: 'grounded text', + contentType: 'text/plain', + }, + ], + }), + /workspace store does not support multimodal canonical persistence/, + ); }); diff --git a/test/postgres.repository.test.mjs b/test/postgres.repository.test.mjs new file mode 100644 index 0000000..c463d0a --- /dev/null +++ b/test/postgres.repository.test.mjs @@ -0,0 +1,1428 @@ +import test, { after, before } from 'node:test'; +import assert from 'node:assert/strict'; +import { execFileSync, spawnSync } from 'node:child_process'; +import { randomUUID } from 'node:crypto'; +import { readFileSync } from 'node:fs'; +import { setTimeout as delay } from 'node:timers/promises'; +import { Client } from 'pg'; + +import { + createIngestionService, + createRuntime, + createRetrievalService, +} from '../dist/index.js'; +import { loadAppEnv } from '../dist/app/env.js'; +import { createPostgresRelationalWorkspaceRepository } from '../dist/subsystems/supabase/postgres.js'; + +const dockerSkipReason = getDockerSkipReason(); + +let postgres; + +before(async () => { + if (dockerSkipReason) { + return; + } + + postgres = await startPostgresContainer(); +}); + +after(async () => { + if (!postgres) { + return; + } + + await postgres.stop(); + postgres = undefined; +}); + +test( + 'postgres repository persists canonical source and item records across adapter recreation', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const schema = nextSchemaName(); + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + const repository = createRepository({ + schema, + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Durable Source', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'durable relational grounding', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + } finally { + await repository.close(); + } + + const reopened = createRepository({ schema }); + try { + await reopened.initialize(); + + assert.equal(await reopened.validateAccess(workspace), true); + + const persistedSource = await reopened.getPersistedSource({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + const persistedItems = await reopened.getPersistedItems({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.equal(persistedSource?.itemCount, 1); + assert.equal(persistedItems.length, 1); + assert.equal(persistedItems[0].packetKey, 'source-1'); + } finally { + await reopened.close(); + } + }, +); + +test( + 'postgres repository persists artifacts, representations, and retrieval item lineage', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const schema = nextSchemaName(); + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + const repository = createRepository({ + schema, + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'pdf', + title: 'Quarterly Review', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const artifacts = await repository.persistArtifacts({ + workspace, + source, + artifacts: [ + { + artifactId: 'artifact-1', + artifactType: 'pdf-original', + mimeType: 'application/pdf', + storageUri: 'memory://artifact-1.pdf', + metadata: { + pageCount: 7, + labels: ['finance', 'review'], + }, + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const representations = await repository.persistRepresentations({ + workspace, + source, + representations: [ + { + representationId: 'representation-1', + representationType: 'pdf-page-text', + artifactId: 'artifact-1', + mimeType: 'text/plain', + metadata: { + pageNumber: 1, + bounds: { x: 10, y: 20 }, + }, + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const items = await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + representationId: 'representation-1', + content: 'quarterly review text projection', + contentType: 'text/plain', + metadata: { + pageNumber: 1, + labels: ['finance', 'review'], + bounds: { x: 10, y: 20 }, + }, + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + assert.equal(artifacts[0].artifactId, 'artifact-1'); + assert.deepEqual(artifacts[0].metadata.labels, ['finance', 'review']); + assert.equal(representations[0].artifactId, 'artifact-1'); + assert.deepEqual(representations[0].metadata.bounds, { x: 10, y: 20 }); + assert.equal(items[0].representationId, 'representation-1'); + assert.deepEqual(items[0].metadata.bounds, { x: 10, y: 20 }); + } finally { + await repository.close(); + } + + const reopened = createRepository({ schema }); + try { + await reopened.initialize(); + + const persistedArtifacts = await reopened.getPersistedArtifacts({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + const persistedRepresentations = + await reopened.getPersistedRepresentations({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + const persistedItems = await reopened.getPersistedItems({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.equal(persistedArtifacts.length, 1); + assert.deepEqual(persistedArtifacts[0].metadata.labels, ['finance', 'review']); + assert.equal(persistedRepresentations.length, 1); + assert.deepEqual(persistedRepresentations[0].metadata.bounds, { x: 10, y: 20 }); + assert.equal(persistedItems[0].representationId, 'representation-1'); + assert.deepEqual(persistedItems[0].metadata.bounds, { x: 10, y: 20 }); + } finally { + await reopened.close(); + } + }, +); + +test( + 'postgres repository rejects orphan artifact references in representations', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'pdf', + title: 'Quarterly Review', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await assert.rejects( + () => + repository.persistRepresentations({ + workspace, + source, + representations: [ + { + representationId: 'representation-1', + representationType: 'pdf-page-text', + artifactId: 'missing-artifact', + mimeType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /artifact missing-artifact not found/, + ); + } finally { + await repository.close(); + } + }, +); + +test( + 'postgres repository preserves artifact and representation metadata on partial re-persist', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const schema = nextSchemaName(); + const repository = createRepository({ + schema, + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + try { + await repository.initialize(); + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'pdf', + title: 'Quarterly Review', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistArtifacts({ + workspace, + source, + artifacts: [ + { + artifactId: 'artifact-1', + artifactType: 'pdf-original', + mimeType: 'application/pdf', + metadata: { labels: ['finance'], pageCount: 7 }, + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + await repository.persistRepresentations({ + workspace, + source, + representations: [ + { + representationId: 'representation-1', + representationType: 'pdf-page-text', + artifactId: 'artifact-1', + metadata: { pageNumber: 1, bounds: { x: 10, y: 20 } }, + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistArtifacts({ + workspace, + source, + artifacts: [ + { + artifactId: 'artifact-1', + artifactType: 'pdf-original', + mimeType: 'application/pdf', + }, + ], + occurredAt: '2026-03-24T00:00:00.000Z', + }); + await repository.persistRepresentations({ + workspace, + source, + representations: [ + { + representationId: 'representation-1', + representationType: 'pdf-page-text', + artifactId: 'artifact-1', + mimeType: 'text/plain', + }, + ], + occurredAt: '2026-03-24T00:00:00.000Z', + }); + + const artifacts = await repository.getPersistedArtifacts({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + const representations = await repository.getPersistedRepresentations({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.deepEqual(artifacts[0].metadata, { + labels: ['finance'], + pageCount: 7, + }); + assert.deepEqual(representations[0].metadata, { + pageNumber: 1, + bounds: { x: 10, y: 20 }, + }); + } finally { + await repository.close(); + } + }, +); + +test( + 'postgres repository preserves canonical source kind on re-ingestion', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + try { + await repository.initialize(); + + await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Original Kind Source', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const reingested = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'pdf', + }, + occurredAt: '2026-03-24T00:00:00.000Z', + }); + + assert.equal(reingested.kind, 'document'); + + const persisted = await repository.getPersistedSource({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }); + + assert.equal(persisted?.kind, 'document'); + } finally { + await repository.close(); + } + }, +); + +test( + 'postgres repository normalizes source identity to the target workspace during direct item writes', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + { workspaceId: 'ws-2', slug: 'ws-2', displayName: 'Workspace Two' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + { workspaceId: 'ws-2', actorId: 'user-2', role: 'member' }, + ], + }); + + const workspaceOne = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + const workspaceTwo = { + workspaceId: 'ws-2', + actor: { actorId: 'user-2', role: 'member' }, + }; + + try { + await repository.initialize(); + + const foreignSource = await repository.upsertSource({ + workspace: workspaceTwo, + source: { + sourceId: 'shared-source', + kind: 'document', + title: 'Workspace Two Source', + uri: 'https://example.com/ws-2', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistItems({ + workspace: workspaceOne, + source: foreignSource, + items: [ + { + itemId: 'item-1', + content: 'workspace one content', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-24T00:00:00.000Z', + }); + + const workspaceOneSource = await repository.getPersistedSource({ + workspaceId: 'ws-1', + sourceId: 'shared-source', + }); + const workspaceTwoSource = await repository.getPersistedSource({ + workspaceId: 'ws-2', + sourceId: 'shared-source', + }); + + assert.equal(workspaceOneSource?.workspaceId, 'ws-1'); + assert.equal(workspaceOneSource?.title, undefined); + assert.equal(workspaceOneSource?.uri, undefined); + assert.equal(workspaceOneSource?.itemCount, 1); + assert.equal(workspaceTwoSource?.workspaceId, 'ws-2'); + assert.equal(workspaceTwoSource?.title, 'Workspace Two Source'); + assert.equal(workspaceTwoSource?.itemCount, 0); + } finally { + await repository.close(); + } + }, +); + +test( + 'postgres repository rejects empty-string canonical ids', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await assert.rejects( + () => + repository.persistArtifacts({ + workspace, + source, + artifacts: [ + { + artifactId: ' ', + artifactType: 'raw-file', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /artifactId must not be empty/, + ); + } finally { + await repository.close(); + } + }, +); + +test( + 'postgres ingestion rolls back canonical writes when a later persistence step fails', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, + now: () => new Date('2026-03-23T00:00:00.000Z'), + }); + + await assert.rejects(() => + service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'pdf', + title: 'Quarterly Review', + }, + artifacts: [ + { + artifactId: 'artifact-1', + artifactType: 'pdf-original', + mimeType: 'application/pdf', + }, + ], + items: [ + { + itemId: 'item-1', + representationId: 'missing-representation', + content: 'orphan text', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /ingestion_items_workspace_id_source_id_representation_id_fkey/i, + ); + + assert.equal( + await repository.getPersistedSource({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }), + null, + ); + assert.deepEqual( + await repository.getPersistedArtifacts({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }), + [], + ); + assert.deepEqual( + await repository.getPersistedItems({ + workspaceId: 'ws-1', + sourceId: 'source-1', + }), + [], + ); + } finally { + await repository.close(); + } + }, +); + +test('postgres repository migration artifact defines the canonical workspace tables', () => { + const migration = readFileSync( + new URL('../src/subsystems/supabase/migrations/0001_workspace_foundation.sql', import.meta.url), + 'utf8', + ); + + assert.match(migration, /create table if not exists secondbrain_engine\.workspaces/i); + assert.match(migration, /create table if not exists secondbrain_engine\.workspace_memberships/i); + assert.match(migration, /create table if not exists secondbrain_engine\.sources/i); + assert.match(migration, /create table if not exists secondbrain_engine\.ingestion_items/i); + assert.match( + migration, + /create index if not exists sources_workspace_lookup_idx\s+on secondbrain_engine\.sources \(workspace_id\)/i, + ); + assert.match( + migration, + /comment on column secondbrain_engine\.source_representations\.artifact_id is 'Nullable to allow derived representations without a stored artifact; the foreign key is enforced only when artifact_id is populated\.'/i, + ); + + const packagedMigration = readFileSync( + new URL('../dist/subsystems/supabase/migrations/0001_workspace_foundation.sql', import.meta.url), + 'utf8', + ); + + assert.equal(packagedMigration, migration); +}); + +test('postgres auto-migrate applies all packaged sql migrations in lexical order', async () => { + const schema = nextSchemaName(); + const repository = createRepository({ schema }); + + try { + await repository.initialize(); + + const client = new Client({ connectionString: postgres.connectionString }); + await client.connect(); + try { + const result = await client.query( + `select count(*)::int as count + from pg_indexes + where schemaname = $1 and indexname = $2`, + [schema, 'sources_workspace_lookup_idx'], + ); + + assert.equal(result.rows[0]?.count, 1); + } finally { + await client.end(); + } + } finally { + await repository.close(); + } +}); + +test( + 'postgres repository preserves membership gating and grounded retrieval behavior', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + assert.equal( + await repository.validateAccess({ + workspaceId: 'ws-1', + actor: { actorId: 'intruder', role: 'service' }, + }), + false, + ); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Packet Grounding Notes', + uri: 'https://example.com/postgres-packet', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'postgres packet grounding from canonical truth', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const reingested = await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-2', + content: 'new grounded delta only', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-24T00:00:00.000Z', + }); + + assert.equal(reingested.length, 1); + assert.equal(reingested[0].itemId, 'item-2'); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'postgres packet grounding', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].sourceId, 'source-1'); + assert.equal(result.packets[0].citations.length > 0, true); + } finally { + await repository.close(); + } + }, +); + +test( + 'createRuntime can select the postgres relational backend from env', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const runtime = createRuntime({ + env: loadAppEnv({ + RELATIONAL_BACKEND: 'postgres', + DATABASE_URL: postgres.connectionString, + DATABASE_SCHEMA: nextSchemaName(), + POSTGRES_AUTO_MIGRATE: 'true', + }), + now: () => new Date('2026-03-23T00:00:00.000Z'), + }); + + try { + const workspace = { + workspaceId: 'default-workspace', + actor: { actorId: 'system', role: 'owner' }, + }; + + const receipt = await runtime.app.services.ingestion.ingest({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Runtime Source', + }, + items: [ + { + itemId: 'item-1', + content: 'runtime postgres grounding', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + const result = await runtime.app.services.retrieval.retrieve({ + workspace, + queryText: 'runtime postgres grounding', + limit: 5, + }); + + assert.equal(receipt.acceptedItems, 1); + assert.equal(result.totalPackets, 1); + + const persisted = await runtime.app.services.retrieval.retrieve({ + workspace, + queryText: 'runtime source', + limit: 5, + }); + + assert.equal(persisted.totalPackets, 1); + + const runtimeRepository = createRepository({ + schema: runtime.app.env.relational.schema, + }); + + try { + await runtimeRepository.initialize(); + + const persistedSource = await runtimeRepository.getPersistedSource({ + workspaceId: 'default-workspace', + sourceId: 'source-1', + }); + + assert.equal(persistedSource?.sourceId, 'source-1'); + } finally { + await runtimeRepository.close(); + } + } finally { + await runtime.close(); + } + }, +); + +test( + 'createRuntime postgres runtime can eagerly initialize before use', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const runtime = createRuntime({ + env: loadAppEnv({ + RELATIONAL_BACKEND: 'postgres', + DATABASE_URL: postgres.connectionString, + DATABASE_SCHEMA: nextSchemaName(), + POSTGRES_AUTO_MIGRATE: 'true', + }), + now: () => new Date('2026-03-23T00:00:00.000Z'), + }); + + try { + await runtime.ready(); + + const runtimeRepository = createRepository({ + schema: runtime.app.env.relational.schema, + }); + + try { + await runtimeRepository.initialize(); + + const defaultWorkspace = await runtimeRepository.getWorkspace( + 'default-workspace', + ); + + assert.equal(defaultWorkspace?.workspaceId, 'default-workspace'); + } finally { + await runtimeRepository.close(); + } + } finally { + await runtime.close(); + } + }, +); + +test('postgres repository initialize fails fast when schema is missing and automigrate is disabled', async () => { + const queries = []; + const repository = createPostgresRelationalWorkspaceRepository({ + connectionString: 'postgresql://127.0.0.1:1/secondbrain', + schema: 'missing_schema', + autoMigrate: false, + pool: { + async query(sql) { + queries.push(sql); + if (String(sql).includes('packet_key')) { + throw new Error('column "packet_key" does not exist'); + } + + return { rowCount: 1, rows: [] }; + }, + async end() {}, + }, + }); + + try { + await assert.rejects( + () => repository.initialize(), + /packet_key/, + ); + assert.equal( + queries.some((sql) => String(sql).includes('packet_key')), + true, + ); + } finally { + await repository.close().catch(() => undefined); + } +}); + +test('postgres repository treats null membership rowCount as denied access for writes', async () => { + const queries = []; + const repository = createPostgresRelationalWorkspaceRepository({ + connectionString: 'postgresql://127.0.0.1:1/secondbrain', + schema: 'rowcount_null_access', + autoMigrate: true, + pool: { + async query(sql) { + const text = String(sql); + queries.push(text); + if (text.trim() === 'select 1') { + return { rowCount: 1, rows: [{ ok: 1 }] }; + } + if (text.includes('workspace_memberships')) { + return { rowCount: null, rows: [] }; + } + if (text.includes('insert into') || text.includes('update ')) { + throw new Error(`unexpected downstream query: ${text}`); + } + + return { rowCount: 1, rows: [] }; + }, + async end() {}, + }, + }); + + try { + await assert.rejects( + () => + repository.upsertSource({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /workspace access denied/i, + ); + assert.equal( + queries.some((sql) => sql.includes('workspace_memberships')), + true, + ); + } finally { + await repository.close().catch(() => undefined); + } +}); + +test('postgres repository treats null artifact rowCount as a missing artifact', async () => { + const repository = createPostgresRelationalWorkspaceRepository({ + connectionString: 'postgresql://127.0.0.1:1/secondbrain', + schema: 'rowcount_null_artifact', + autoMigrate: true, + pool: { + async query(sql) { + const text = String(sql); + if (text.trim() === 'select 1') { + return { rowCount: 1, rows: [{ ok: 1 }] }; + } + if (text.includes('workspace_memberships')) { + return { rowCount: 1, rows: [] }; + } + if ( + text.includes('from rowcount_null_artifact.source_artifacts') || + text.includes('from "rowcount_null_artifact"."source_artifacts"') + ) { + return { rowCount: null, rows: [] }; + } + if (text.includes('insert into') || text.includes('update ')) { + throw new Error(`unexpected downstream query: ${text}`); + } + + return { rowCount: 1, rows: [] }; + }, + async end() {}, + }, + }); + + try { + await assert.rejects( + () => + repository.persistRepresentations({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + workspaceId: 'ws-1', + sourceId: 'source-1', + rootSourceId: 'source-1', + sourceGroupKey: 'source-1', + kind: 'pdf', + firstOccurredAt: '2026-03-23T00:00:00.000Z', + lastOccurredAt: '2026-03-23T00:00:00.000Z', + itemCount: 0, + }, + representations: [ + { + representationId: 'representation-1', + representationType: 'pdf-page-text', + artifactId: 'artifact-1', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /artifact artifact-1 not found/i, + ); + } finally { + await repository.close().catch(() => undefined); + } +}); + +test('postgres repository discards the client when a transaction write fails', async () => { + const releaseCalls = []; + const repository = createPostgresRelationalWorkspaceRepository({ + connectionString: 'postgresql://127.0.0.1:1/secondbrain', + schema: 'discard_failed_client', + autoMigrate: true, + pool: { + async query(sql) { + const text = String(sql); + if (text.trim() === 'select 1') { + return { rowCount: 1, rows: [{ ok: 1 }] }; + } + if (text.includes('workspace_memberships')) { + return { rowCount: 1, rows: [{ ok: 1 }] }; + } + + return { rowCount: 1, rows: [{ ok: 1 }] }; + }, + async connect() { + return { + async query(sql) { + const text = String(sql); + if (text === 'begin' || text === 'rollback') { + return { rowCount: null, rows: [] }; + } + if (text.includes('from "discard_failed_client"."sources"')) { + return { rowCount: 0, rows: [] }; + } + if (text.includes('insert into "discard_failed_client"."sources"')) { + return { + rowCount: 1, + rows: [ + { + workspace_id: 'ws-1', + source_id: 'source-1', + root_source_id: 'source-1', + parent_source_id: null, + source_group_key: 'source-1', + kind: 'document', + uri: null, + title: null, + first_occurred_at: new Date('2026-03-23T00:00:00.000Z'), + last_occurred_at: new Date('2026-03-23T00:00:00.000Z'), + item_count: 0, + }, + ], + }; + } + if (text.includes('insert into "discard_failed_client"."ingestion_items"')) { + throw new Error('write failed'); + } + + return { rowCount: 1, rows: [] }; + }, + release(error) { + releaseCalls.push(error); + }, + }; + }, + async end() {}, + }, + }); + + try { + await assert.rejects( + () => + repository.persistItems({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + workspaceId: 'ws-1', + sourceId: 'source-1', + rootSourceId: 'source-1', + sourceGroupKey: 'source-1', + kind: 'document', + firstOccurredAt: '2026-03-23T00:00:00.000Z', + lastOccurredAt: '2026-03-23T00:00:00.000Z', + itemCount: 0, + }, + items: [ + { + itemId: 'item-1', + content: 'write failure', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }), + /write failed/, + ); + assert.equal(releaseCalls.length, 1); + assert.match(String(releaseCalls[0]), /write failed/); + } finally { + await repository.close().catch(() => undefined); + } +}); + +test( + 'postgres-backed retrieval hydrates graph and vector matches outside the initial lexical top-N window', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + for (let index = 1; index <= 20; index += 1) { + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: `source-${index}`, + kind: 'document', + title: `Source ${index}`, + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: `neutral content ${index}`, + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + } + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return [ + { + entityId: 'entity-20', + sourceId: 'source-20', + summary: 'graph match', + score: 2, + }, + ]; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return [ + { + documentId: 'source-20:item-1', + score: 3, + snippet: 'strong vector match', + sourceId: 'source-20', + }, + ]; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'no lexical overlap here', + limit: 5, + }); + + assert.ok(result.packets.some((packet) => packet.sourceId === 'source-20')); + } finally { + await repository.close(); + } + }, +); + +test( + 'postgres lexical retrieval still returns the strongest late-sorting source match', + { skip: dockerSkipReason, timeout: 120000 }, + async () => { + const repository = createRepository({ + schema: nextSchemaName(), + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + try { + await repository.initialize(); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + for (let index = 0; index < 60; index += 1) { + const sourceId = `source-${String(index).padStart(3, '0')}`; + const source = await repository.upsertSource({ + workspace, + source: { + sourceId, + kind: 'document', + title: + index === 59 + ? 'zeta multimodal canonical retrieval strongest match' + : `alpha canonical match ${index}`, + }, + occurredAt: '2026-03-23T00:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: + index === 59 + ? 'multimodal canonical retrieval strongest evidence' + : 'canonical evidence', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T00:00:00.000Z', + }); + } + + const result = await repository.findSources({ + workspaceId: 'ws-1', + queryText: 'multimodal canonical retrieval strongest', + limit: 1, + }); + + assert.equal(result.length, 1); + assert.equal(result[0].source.sourceId, 'source-059'); + } finally { + await repository.close(); + } + }, +); + +function createRepository(input) { + if (!postgres) { + throw new Error(dockerSkipReason ?? 'docker is unavailable for postgres tests'); + } + + return createPostgresRelationalWorkspaceRepository({ + connectionString: postgres.connectionString, + schema: input.schema, + autoMigrate: true, + workspaces: input.workspaces, + memberships: input.memberships, + }); +} + +function nextSchemaName() { + return `test_${randomUUID().replace(/-/g, '_')}`; +} + +function getDockerSkipReason() { + const result = spawnSync('docker', ['info'], { + encoding: 'utf8', + stdio: 'ignore', + }); + + return result.status === 0 ? undefined : 'docker is unavailable for postgres tests'; +} + +async function startPostgresContainer() { + const containerId = execFileSync( + 'docker', + [ + 'run', + '-d', + '--rm', + '-e', + 'POSTGRES_PASSWORD=postgres', + '-e', + 'POSTGRES_USER=postgres', + '-e', + 'POSTGRES_DB=secondbrain', + '-P', + 'postgres:16-alpine', + ], + { encoding: 'utf8' }, + ).trim(); + + try { + const portOutput = execFileSync( + 'docker', + ['port', containerId, '5432/tcp'], + { encoding: 'utf8' }, + ).trim(); + const portMatch = portOutput.match(/:(\d+)$/); + + if (!portMatch) { + throw new Error(`unable to determine postgres port: ${portOutput}`); + } + + const connectionString = `postgresql://postgres:postgres@127.0.0.1:${portMatch[1]}/secondbrain`; + await waitForPostgres(connectionString); + + return { + containerId, + connectionString, + async stop() { + execFileSync('docker', ['rm', '-f', containerId], { stdio: 'ignore' }); + }, + }; + } catch (error) { + execFileSync('docker', ['rm', '-f', containerId], { stdio: 'ignore' }); + throw error; + } +} + +async function waitForPostgres(connectionString) { + const deadline = Date.now() + 60000; + + while (Date.now() < deadline) { + const repository = createPostgresRelationalWorkspaceRepository({ + connectionString, + schema: nextSchemaName(), + autoMigrate: true, + }); + try { + await repository.initialize(); + return; + } catch { + await delay(1000); + } finally { + await repository.close().catch(() => undefined); + } + } + + throw new Error('timed out waiting for postgres container readiness'); +} diff --git a/test/retrieval.hybrid.test.mjs b/test/retrieval.hybrid.test.mjs index 5afc020..9bb08a5 100644 --- a/test/retrieval.hybrid.test.mjs +++ b/test/retrieval.hybrid.test.mjs @@ -748,7 +748,7 @@ test('retrieval ignores graph-only candidates that cannot hydrate a canonical so assert.equal(result.totalPackets, 0); }); -test('retrieval drops vector-only hits that cannot be grounded to canonical items', async () => { +test('retrieval preserves vector-only hits as fallback packets when canonical items are missing', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, @@ -798,8 +798,18 @@ test('retrieval drops vector-only hits that cannot be grounded to canonical item limit: 5, }); - assert.equal(result.totalPackets, 0); - assert.equal(result.packets.length, 0); + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-1'); + assert.equal(result.packets[0].scores.vector, 0.7); + assert.equal(result.packets[0].supportingChunks.length, 1); + assert.equal( + result.packets[0].supportingChunks[0].chunkId, + 'vector:source-1:opaque-missing-item', + ); + assert.equal( + result.packets[0].supportingChunks[0].text, + 'stale vector snippet', + ); }); test('retrieval does not double count vector-grounded items as relational score', async () => { @@ -934,6 +944,865 @@ test('retrieval grounds vector hits to the matched item instead of unrelated sou ), ); }); +test('retrieval reranks packets after hybrid recall instead of using raw combined scores alone', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const sourceOne = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Architecture Packet', + }, + occurredAt: '2026-03-23T06:00:00.000Z', + }); + const sourceTwo = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-2', + kind: 'document', + title: 'Nebula Lens Packet', + }, + occurredAt: '2026-03-23T06:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source: sourceOne, + items: [ + { + itemId: 'item-1', + content: 'architecture operational runbook', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T06:00:00.000Z', + }); + await repository.persistItems({ + workspace, + source: sourceTwo, + items: [ + { + itemId: 'item-1', + content: 'nebula lens retrieval grounding', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T06:00:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return [ + { + entityId: 'entity-1', + sourceId: 'source-1', + summary: 'strong graph support', + score: 2.5, + }, + { + entityId: 'entity-2', + sourceId: 'source-2', + summary: 'weak graph support', + score: 0.1, + }, + ]; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return [ + { + documentId: 'source-1:item-1', + score: 3, + snippet: 'architecture operational runbook', + sourceId: 'source-1', + }, + { + documentId: 'source-2:item-1', + score: 0.5, + snippet: 'nebula lens retrieval grounding', + sourceId: 'source-2', + }, + ]; + }, + }, + reranker: { + async rerank() { + return [ + { packetId: 'packet:source-2', score: 100 }, + { packetId: 'packet:source-1', score: 10 }, + ]; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'nebula lens architecture', + limit: 5, + }); + + assert.deepEqual(result.packets.map((packet) => packet.packetId), [ + 'packet:source-2', + 'packet:source-1', + ]); + assert.ok( + result.packets[0].citations.every( + (citation) => citation.sourceId === 'source-2', + ), + ); + assert.ok( + result.packets[0].supportingChunks.some( + (chunk) => chunk.chunkId === 'source-2:item-1', + ), + ); + assert.ok(result.packets[0].scores.reranked > result.packets[1].scores.reranked); + assert.ok(result.packets[0].scores.fused > 0); + assert.ok(result.packets[1].scores.fused > 0); + assert.ok(result.packets[0].scores.fused < result.packets[1].scores.fused); +}); + +test('retrieval keeps reranker-preferred packets that would otherwise fall below additive pre-ranking cutoffs', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + for (const sourceDefinition of [ + { + sourceId: 'source-1', + title: 'Alpha Packet', + content: 'alpha beta gamma delta', + }, + { + sourceId: 'source-2', + title: 'Beta Packet', + content: 'alpha beta gamma', + }, + { + sourceId: 'source-3', + title: 'Gamma Packet', + content: 'alpha beta', + }, + { + sourceId: 'source-4', + title: 'Nebula Packet', + content: 'nebula lens only', + }, + ]) { + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: sourceDefinition.sourceId, + kind: 'document', + title: sourceDefinition.title, + }, + occurredAt: '2026-03-23T08:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: sourceDefinition.content, + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T08:00:00.000Z', + }); + } + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return [ + { + documentId: 'source-4:item-1', + score: 0.4, + snippet: 'nebula lens only', + sourceId: 'source-4', + }, + ]; + }, + }, + reranker: { + async rerank() { + return [ + { packetId: 'packet:source-4', score: 100 }, + { packetId: 'packet:source-1', score: 10 }, + ]; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'alpha beta gamma delta', + limit: 1, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-4'); + assert.equal(result.packets[0].sourceId, 'source-4'); + assert.ok(result.packets[0].scores.fused < 1); + assert.ok(result.packets[0].scores.reranked > 0); +}); + +test('retrieval falls back to deterministic no-op reranking when the configured reranker throws', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const sourceOne = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Deterministic Fallback Packet', + }, + occurredAt: '2026-03-23T07:00:00.000Z', + }); + const sourceTwo = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-2', + kind: 'document', + title: 'Secondary Packet', + }, + occurredAt: '2026-03-23T07:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source: sourceOne, + items: [ + { + itemId: 'item-1', + content: 'nebula lens fallback grounding', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T07:00:00.000Z', + }); + await repository.persistItems({ + workspace, + source: sourceTwo, + items: [ + { + itemId: 'item-1', + content: 'auxiliary context only', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T07:00:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return [ + { + entityId: 'entity-1', + sourceId: 'source-1', + summary: 'nebula lens graph support', + score: 1.5, + }, + { + entityId: 'entity-2', + sourceId: 'source-2', + summary: 'secondary graph support', + score: 0.5, + }, + ]; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return [ + { + documentId: 'source-1:item-1', + score: 2, + snippet: 'nebula lens fallback grounding', + sourceId: 'source-1', + }, + { + documentId: 'source-2:item-1', + score: 0.2, + snippet: 'auxiliary context only', + sourceId: 'source-2', + }, + ]; + }, + }, + reranker: { + async rerank() { + throw new Error('reranker unavailable'); + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'nebula lens fallback', + limit: 2, + }); + + assert.deepEqual(result.packets.map((packet) => packet.packetId), [ + 'packet:source-1', + 'packet:source-2', + ]); + assert.deepEqual( + result.packets.map((packet) => packet.scores.reranked), + [2, 1], + ); + assert.ok(result.packets[0].scores.fused >= result.packets[1].scores.fused); + assert.ok( + result.packets[0].citations.every( + (citation) => citation.sourceId === 'source-1', + ), + ); +}); + +test('retrieval runs one bounded keyword-expansion pass when initial packet evidence is weak', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Adaptive Retrieval Packet', + }, + occurredAt: '2026-03-23T10:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'catalog entry alpha', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T10:00:00.000Z', + }); + + const graphQueries = []; + const vectorQueries = []; + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext(input) { + graphQueries.push(input.queryText); + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search(input) { + vectorQueries.push(input.queryText); + + if (input.queryText === 'nebula lens grounding') { + return [ + { + documentId: 'source-1:item-1', + score: 0.95, + snippet: 'nebula lens grounding', + sourceId: 'source-1', + }, + ]; + } + + return []; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'tell me about nebula lens grounding', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.deepEqual(result.packets.map((packet) => packet.packetId), [ + 'packet:source-1', + ]); + assert.ok( + result.packets[0].supportingChunks.some( + (chunk) => chunk.chunkId === 'source-1:item-1', + ), + ); + assert.deepEqual(graphQueries, [ + 'tell me about nebula lens grounding', + 'nebula lens grounding', + ]); + assert.deepEqual(vectorQueries, [ + 'tell me about nebula lens grounding', + 'nebula lens grounding', + ]); + assert.equal(new Set([...graphQueries, ...vectorQueries]).size, 2); + assert.equal(result.adaptive?.ran, true); + assert.equal(result.adaptive?.strategy, 'keyword-expansion'); + assert.equal(result.adaptive?.passes, 2); + assert.equal( + result.adaptive?.triggerReason, + 'no grounded packets from initial pass', + ); + assert.deepEqual(result.adaptive?.queriesTried, [ + 'tell me about nebula lens grounding', + 'nebula lens grounding', + ]); + assert.deepEqual(result.adaptive?.topPacketIdsPerPass, [ + [], + ['packet:source-1'], + ]); +}); + +test('retrieval skips adaptive expansion when initial packet evidence is already strong', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Strong Base Packet', + }, + occurredAt: '2026-03-23T11:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'nebula lens grounding', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T11:00:00.000Z', + }); + + const graphQueries = []; + const vectorQueries = []; + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext(input) { + graphQueries.push(input.queryText); + return [ + { + entityId: 'entity-1', + sourceId: 'source-1', + summary: 'nebula lens grounding', + score: 0.9, + }, + ]; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search(input) { + vectorQueries.push(input.queryText); + return [ + { + documentId: 'source-1:item-1', + score: 0.95, + snippet: 'nebula lens grounding', + sourceId: 'source-1', + }, + ]; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'nebula lens grounding', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.deepEqual(graphQueries, ['nebula lens grounding']); + assert.deepEqual(vectorQueries, ['nebula lens grounding']); + assert.equal(result.adaptive?.ran, false); + assert.equal(result.adaptive?.strategy, 'single-pass'); + assert.equal(result.adaptive?.passes, 1); + assert.equal( + result.adaptive?.triggerReason, + 'initial packet evidence strong enough', + ); + assert.deepEqual(result.adaptive?.queriesTried, ['nebula lens grounding']); + assert.deepEqual(result.adaptive?.topPacketIdsPerPass, [['packet:source-1']]); +}); + +test('retrieval preserves sourceIds filtering across adaptive retrieval attempts', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const sourceOne = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Allowed Packet', + }, + occurredAt: '2026-03-23T12:00:00.000Z', + }); + const sourceTwo = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-2', + kind: 'document', + title: 'Filtered Packet', + }, + occurredAt: '2026-03-23T12:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source: sourceOne, + items: [ + { + itemId: 'item-1', + content: 'catalog entry alpha', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T12:00:00.000Z', + }); + await repository.persistItems({ + workspace, + source: sourceTwo, + items: [ + { + itemId: 'item-1', + content: 'nebula lens grounding', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T12:00:00.000Z', + }); + + const vectorCalls = []; + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search(input) { + vectorCalls.push({ + queryText: input.queryText, + workspaceId: input.workspaceId, + sourceIds: input.sourceIds, + }); + + if (input.queryText === 'nebula lens grounding') { + return [ + { + documentId: 'source-2:item-1', + score: 0.95, + snippet: 'nebula lens grounding', + sourceId: 'source-2', + }, + ]; + } + + return []; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'tell me about nebula lens grounding', + limit: 5, + sourceIds: ['source-1'], + }); + + assert.equal(result.totalPackets, 0); + assert.equal(result.adaptive?.ran, true); + assert.equal(result.adaptive?.passes, 2); + assert.deepEqual(result.adaptive?.queriesTried, [ + 'tell me about nebula lens grounding', + 'nebula lens grounding', + ]); + assert.deepEqual(result.adaptive?.topPacketIdsPerPass, [[], []]); + assert.deepEqual( + vectorCalls.map((call) => call.queryText), + ['tell me about nebula lens grounding', 'nebula lens grounding'], + ); + assert.deepEqual( + vectorCalls.map((call) => call.sourceIds), + [['source-1'], ['source-1']], + ); +}); + +test('retrieval still runs bounded adaptive expansion when the first pass is weak but non-empty', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const sourceOne = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Weak First Pass Packet', + }, + occurredAt: '2026-03-23T14:00:00.000Z', + }); + const sourceTwo = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-2', + kind: 'document', + title: 'Adaptive Follow-up Packet', + }, + occurredAt: '2026-03-23T14:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source: sourceOne, + items: [ + { + itemId: 'item-1', + content: 'tell me about weak retrieval fallback', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T14:00:00.000Z', + }); + await repository.persistItems({ + workspace, + source: sourceTwo, + items: [ + { + itemId: 'item-1', + content: 'catalog anchor only', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T14:00:00.000Z', + }); + + const vectorQueries = []; + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search(input) { + vectorQueries.push(input.queryText); + + if (input.queryText === 'nebula lens grounding') { + return [ + { + documentId: 'source-2:item-1', + score: 0.99, + snippet: 'catalog anchor only', + sourceId: 'source-2', + }, + ]; + } + + return []; + }, + }, + reranker: { + async rerank({ packets }) { + return packets.map((packet, index) => ({ + packetId: packet.packetId, + score: + packet.packetId === 'packet:source-2' ? 100 : packets.length - index, + })); + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'tell me about nebula lens grounding', + limit: 1, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-2'); + assert.equal(result.adaptive?.ran, true); + assert.equal(result.adaptive?.passes, 2); + assert.deepEqual(vectorQueries, [ + 'tell me about nebula lens grounding', + 'nebula lens grounding', + ]); + assert.deepEqual(result.adaptive?.topPacketIdsPerPass, [ + ['packet:source-1'], + ['packet:source-2'], + ]); +}); + +test('retrieval reports single-pass trace when no distinct adaptive query variant exists', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const vectorQueries = []; + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search(input) { + vectorQueries.push(input.queryText); + return []; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'nebula lens grounding', + limit: 5, + }); + + assert.equal(result.totalPackets, 0); + assert.equal(result.adaptive?.ran, false); + assert.equal(result.adaptive?.strategy, 'single-pass'); + assert.equal(result.adaptive?.passes, 1); + assert.deepEqual(result.adaptive?.queriesTried, ['nebula lens grounding']); + assert.deepEqual(result.adaptive?.topPacketIdsPerPass, [[]]); + assert.deepEqual(vectorQueries, ['nebula lens grounding']); +}); test('retrieval hydrates opaque vector documentIds with canonical itemIds', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ diff --git a/test/retrieval.packet.test.mjs b/test/retrieval.packet.test.mjs index 8df21a1..1b87865 100644 --- a/test/retrieval.packet.test.mjs +++ b/test/retrieval.packet.test.mjs @@ -1,7 +1,11 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import { createRetrievalService, createRuntime } from '../dist/index.js'; +import { + createIngestionService, + createRetrievalService, + createRuntime, +} from '../dist/index.js'; import { createInMemoryRelationalWorkspaceRepository } from '../dist/subsystems/supabase/repository.js'; test('retrieval result is packet-first rather than chunk-first', async () => { @@ -170,3 +174,607 @@ test('retrieval stays workspace-scoped even when other workspaces have similar c assert.deepEqual(result.packets.map((packet) => packet.sourceId), ['source-1']); }); + +test('ingestion vector payload includes explicit item identity and grouping hints', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const vectorCalls = []; + + const runtime = createRuntime({ + services: { + ingestion: createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { + async upsertDocuments(input) { + vectorCalls.push(input); + }, + async search() { + return []; + }, + }, + now: () => new Date('2026-03-23T03:00:00.000Z'), + }), + }, + }); + + await runtime.app.services.ingestion.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Packet Source', + }, + items: [ + { + itemId: 'item-1', + content: 'first paragraph', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T01:00:00.000Z', + }); + + assert.deepEqual(vectorCalls[0].documents[0].metadata, { + workspaceId: 'ws-1', + sourceId: 'source-1', + contentType: 'text/plain', + itemId: 'item-1', + occurredAt: '2026-03-23T01:00:00.000Z', + packetKey: 'source-1', + }); +}); + +test('retrieval packet grouping prefers persisted packetKey when present', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const service = createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { async upsertDocuments() {} }, + now: () => new Date('2026-03-23T04:00:00.000Z'), + }); + + await service.ingest({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Grouped Packet Source', + sourceGroupKey: 'packet-anchor-1', + }, + items: [ + { + itemId: 'item-1', + content: 'retrieval packet key first item', + contentType: 'text/plain', + packetKey: 'packet-anchor-1', + }, + { + itemId: 'item-2', + content: 'retrieval packet key second item', + contentType: 'text/plain', + packetKey: 'packet-anchor-1', + }, + ], + occurredAt: '2026-03-23T01:00:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace: { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }, + queryText: 'packet key', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-1:packet-anchor-1'); + assert.equal(result.packets[0].title, 'Grouped Packet Source'); + assert.deepEqual( + result.packets[0].supportingChunks.map((chunk) => chunk.chunkId), + ['source-1:item-1', 'source-1:item-2'], + ); +}); + +test('retrieval expands vector-matched packets with sibling supporting chunks from the same packet', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Contextual Retrieval Packet', + sourceGroupKey: 'packet-anchor-1', + uri: 'https://example.com/contextual-packet', + }, + occurredAt: '2026-03-23T05:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'supporting architectural context that only appears through packet expansion', + contentType: 'text/plain', + packetKey: 'packet-anchor-1', + }, + { + itemId: 'item-2', + content: 'nebula-lens reranker clue', + contentType: 'text/plain', + packetKey: 'packet-anchor-1', + }, + ], + occurredAt: '2026-03-23T05:00:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return [ + { + documentId: 'source-1:item-2', + score: 0.95, + snippet: 'nebula-lens reranker clue', + sourceId: 'source-1', + }, + ]; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'nebula-lens', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-1:packet-anchor-1'); + assert.deepEqual( + result.packets[0].supportingChunks.map((chunk) => chunk.chunkId), + ['source-1:item-1', 'source-1:item-2'], + ); + assert.ok( + result.packets[0].citations.some( + (citation) => + citation.sourceId === 'source-1' && citation.itemId === 'item-1', + ), + ); + assert.ok( + result.packets[0].citations.some( + (citation) => + citation.sourceId === 'source-1' && citation.itemId === 'item-2', + ), + ); +}); + +test('document-only vector hits keep vector scores on the matched packet when the source anchor differs', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Anchored Packet Source', + sourceGroupKey: 'source-anchor', + }, + occurredAt: '2026-03-23T08:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'first packet filler', + contentType: 'text/plain', + packetKey: 'packet-one', + }, + { + itemId: 'item-2', + content: 'second packet vector match', + contentType: 'text/plain', + packetKey: 'packet-two', + }, + ], + occurredAt: '2026-03-23T08:00:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return [ + { + documentId: 'source-1:item-2', + score: 0.91, + snippet: 'second packet vector match', + sourceId: 'source-1', + }, + ]; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'opaque semantic retrieval', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-1:packet-two'); + assert.equal(result.packets[0].scores.vector, 0.91); + assert.equal( + result.packets[0].supportingChunks.find( + (chunk) => chunk.chunkId === 'source-1:item-2', + )?.score, + 0.91, + ); + assert.ok( + result.packets[0].supportingChunks.some( + (chunk) => chunk.chunkId === 'source-1:item-2', + ), + ); +}); + +test('graph hits anchor to the matched item packet when graph candidates provide itemId', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Neutral Packet Source', + sourceGroupKey: 'source-anchor', + }, + occurredAt: '2026-03-23T08:30:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'first packet filler', + contentType: 'text/plain', + packetKey: 'packet-one', + }, + { + itemId: 'item-2', + content: 'custom focus body', + contentType: 'text/plain', + packetKey: 'packet-two', + }, + { + itemId: 'item-3', + content: 'related sibling body', + contentType: 'text/plain', + packetKey: 'packet-two', + }, + ], + occurredAt: '2026-03-23T08:30:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return [ + { + entityId: 'entity-2', + sourceId: 'source-1', + itemId: 'item-2', + summary: 'graph-only packet-two anchor', + score: 0.8, + }, + ]; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'graph-only anchor', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-1:packet-two'); + assert.deepEqual( + result.packets[0].supportingChunks.map((chunk) => chunk.chunkId), + ['source-1:item-2', 'source-1:item-3', 'graph:source-1:item-2'], + ); +}); + +test('retrieval preserves fallback vector citations when chunk ordering normalizes grounded chunks', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Packet With Stale Vector Hit', + uri: 'https://example.com/stale-vector-hit', + }, + occurredAt: '2026-03-23T09:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'grounded relational content', + contentType: 'text/plain', + }, + ], + occurredAt: '2026-03-23T09:00:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return [ + { + documentId: 'source-1:item-missing', + score: 0.9, + snippet: 'stale vector grounding snippet', + sourceId: 'source-1', + }, + ]; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'grounded relational content', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-1'); + assert.deepEqual( + result.packets[0].supportingChunks.map((chunk) => chunk.chunkId), + ['source-1:item-1'], + ); + assert.ok( + result.packets[0].citations.some( + (citation) => + citation.sourceId === 'source-1' && + citation.location === 'source-1:item-missing' && + citation.snippet === 'stale vector grounding snippet', + ), + ); +}); + +test('adaptive retrieval preserves packet grounding and source-linked citations across attempts', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }, + ], + memberships: [ + { workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }, + ], + }); + + const workspace = { + workspaceId: 'ws-1', + actor: { actorId: 'user-1', role: 'member' }, + }; + + const source = await repository.upsertSource({ + workspace, + source: { + sourceId: 'source-1', + kind: 'document', + title: 'Adaptive Packet Source', + uri: 'https://example.com/adaptive-packet', + sourceGroupKey: 'adaptive-packet-1', + }, + occurredAt: '2026-03-23T13:00:00.000Z', + }); + + await repository.persistItems({ + workspace, + source, + items: [ + { + itemId: 'item-1', + content: 'tell me about adaptive retrieval context', + contentType: 'text/plain', + packetKey: 'adaptive-packet-1', + }, + { + itemId: 'item-2', + content: 'catalog entry alpha anchor', + contentType: 'text/plain', + packetKey: 'adaptive-packet-1', + }, + ], + occurredAt: '2026-03-23T13:00:00.000Z', + }); + + const retrieval = createRetrievalService({ + workspaceStore: repository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search(input) { + if (input.queryText === 'nebula lens grounding') { + return [ + { + documentId: 'source-1:item-2', + score: 0.98, + snippet: 'catalog entry alpha anchor', + sourceId: 'source-1', + }, + ]; + } + + return []; + }, + }, + }); + + const result = await retrieval.retrieve({ + workspace, + queryText: 'tell me about nebula lens grounding', + limit: 5, + }); + + assert.equal(result.totalPackets, 1); + assert.equal(result.packets[0].packetId, 'packet:source-1:adaptive-packet-1'); + assert.deepEqual( + result.packets[0].supportingChunks.map((chunk) => chunk.chunkId), + ['source-1:item-1', 'source-1:item-2'], + ); + assert.ok( + result.packets[0].citations.some( + (citation) => citation.sourceId === 'source-1' && citation.itemId === 'item-1', + ), + ); + assert.ok( + result.packets[0].citations.some( + (citation) => citation.sourceId === 'source-1' && citation.itemId === 'item-2', + ), + ); + assert.equal(result.adaptive?.ran, true); + assert.equal(result.adaptive?.passes, 2); + assert.deepEqual(result.adaptive?.topPacketIdsPerPass, [ + ['packet:source-1:adaptive-packet-1'], + ['packet:source-1:adaptive-packet-1'], + ]); +}); diff --git a/test/runtime.test.mjs b/test/runtime.test.mjs index 3aa9b20..937d2e5 100644 --- a/test/runtime.test.mjs +++ b/test/runtime.test.mjs @@ -2,9 +2,28 @@ import test from 'node:test'; import assert from 'node:assert/strict'; import { createServer } from 'node:http'; -import { createRuntime, startServer } from '../dist/index.js'; +import { + createIngestionService, + createRetrievalService, + createRuntime, + startServer, +} from '../dist/index.js'; +import { loadAppEnv } from '../dist/app/env.js'; import { createInMemoryRelationalWorkspaceRepository } from '../dist/subsystems/supabase/repository.js'; +const CLEAN_RELATIONAL_ENV = { + SERVICE_NAME: 'secondbrain-engine', + NODE_ENV: 'development', + PORT: '4000', +}; + +const SERVICE_TOKEN_AUTH_ENV = { + RELATIONAL_BACKEND: 'memory', + AUTH_MODE: 'service-token', + AUTH_SERVICE_TOKEN: 'top-secret', + AUTH_SERVICE_ACTOR_ID: 'service-agent', +}; + async function closeServer(server) { await new Promise((resolve, reject) => { server.close((error) => { @@ -18,14 +37,738 @@ async function closeServer(server) { }); } +async function withCleanRelationalEnv(run) { + // Create a "clean" snapshot of the current environment without mutating + // process.env, so this helper is safe even when tests run concurrently. + const cleanEnv = { + ...process.env, + }; + + delete cleanEnv.RELATIONAL_BACKEND; + delete cleanEnv.DATABASE_URL; + delete cleanEnv.DATABASE_SCHEMA; + delete cleanEnv.POSTGRES_AUTO_MIGRATE; + delete cleanEnv.AUTH_MODE; + delete cleanEnv.AUTH_SERVICE_TOKEN; + delete cleanEnv.AUTH_SERVICE_ACTOR_ID; + + return await run(cleanEnv); +} + test('createRuntime wires the minimal app boundary', async () => { const runtime = createRuntime({ + env: { + serviceName: CLEAN_RELATIONAL_ENV.SERVICE_NAME, + mode: 'development', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, now: () => new Date('2026-03-23T00:00:00.000Z'), }); assert.equal(runtime.app.health().service, 'secondbrain-engine'); assert.equal(runtime.app.health().status, 'ok'); assert.equal(runtime.server.listening, false); + await runtime.close(); +}); + +test('createRuntime redacts postgres credentials from the public app env', async () => { + const runtime = createRuntime({ + env: loadAppEnv({ + RELATIONAL_BACKEND: 'postgres', + DATABASE_URL: 'postgresql://postgres:postgres@127.0.0.1:5432/secondbrain', + DATABASE_SCHEMA: 'engine_test', + POSTGRES_AUTO_MIGRATE: 'true', + }), + }); + + try { + assert.equal(runtime.app.env.relational.backend, 'postgres'); + assert.equal(runtime.app.env.relational.schema, 'engine_test'); + assert.equal('databaseUrl' in runtime.app.env.relational, false); + } finally { + await runtime.close(); + } +}); + +test('createRuntime redacts private service-token auth config from the public app env', async () => { + const runtime = createRuntime({ + env: loadAppEnv(SERVICE_TOKEN_AUTH_ENV), + }); + + try { + assert.deepEqual(runtime.app.env.auth, { + mode: 'service-token', + actorId: 'service-agent', + }); + assert.equal('serviceToken' in runtime.app.env.auth, false); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver rejects missing credentials', async () => { + const runtime = createRuntime({ + env: loadAppEnv(SERVICE_TOKEN_AUTH_ENV), + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: 'default-workspace', + headers: {}, + }); + + assert.deepEqual(result, { + ok: false, + statusCode: 401, + error: { + code: 'missing_credentials', + message: 'Missing bearer credentials', + workspaceId: 'default-workspace', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver rejects malformed bearer credentials', async () => { + const runtime = createRuntime({ + env: loadAppEnv(SERVICE_TOKEN_AUTH_ENV), + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: 'default-workspace', + headers: { authorization: 'Bearer' }, + }); + + assert.deepEqual(result, { + ok: false, + statusCode: 401, + error: { + code: 'invalid_credentials', + message: 'Invalid bearer credentials', + workspaceId: 'default-workspace', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver rejects protected requests when auth is disabled', async () => { + const runtime = createRuntime({ + env: { + serviceName: CLEAN_RELATIONAL_ENV.SERVICE_NAME, + mode: 'development', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: 'default-workspace', + headers: { + authorization: 'Bearer top-secret', + 'x-request-id': 'req-disabled', + 'x-correlation-id': 'corr-disabled', + }, + }); + + assert.deepEqual(result, { + ok: false, + statusCode: 401, + error: { + code: 'auth_disabled', + message: 'Protected routes require configured auth', + requestId: 'req-disabled', + correlationId: 'corr-disabled', + workspaceId: 'default-workspace', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver requires a workspace target', async () => { + const runtime = createRuntime({ + env: loadAppEnv(SERVICE_TOKEN_AUTH_ENV), + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + headers: { authorization: 'Bearer top-secret' }, + }); + + assert.deepEqual(result, { + ok: false, + statusCode: 400, + error: { + code: 'workspace_required', + message: 'Workspace target is required', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver rejects blank workspace targets', async () => { + const runtime = createRuntime({ + env: loadAppEnv(SERVICE_TOKEN_AUTH_ENV), + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: ' ', + headers: { authorization: 'Bearer top-secret' }, + }); + + assert.deepEqual(result, { + ok: false, + statusCode: 400, + error: { + code: 'workspace_required', + message: 'Workspace target is required', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver maps a valid bearer token to the configured member service actor', async () => { + const runtime = createRuntime({ + env: loadAppEnv({ + ...SERVICE_TOKEN_AUTH_ENV, + AUTH_SERVICE_ACTOR_ID: 'system', + }), + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: 'default-workspace', + headers: { + authorization: 'Bearer top-secret', + 'x-request-id': 'req-123', + 'x-correlation-id': 'corr-456', + 'x-actor-id': 'spoofed', + 'x-workspace-role': 'owner', + }, + }); + + assert.deepEqual(result, { + ok: true, + context: { + workspaceId: 'default-workspace', + actor: { + actorId: 'system', + role: 'service', + }, + requestId: 'req-123', + correlationId: 'corr-456', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver accepts case-insensitive auth headers', async () => { + const runtime = createRuntime({ + env: loadAppEnv({ + ...SERVICE_TOKEN_AUTH_ENV, + AUTH_SERVICE_ACTOR_ID: 'system', + }), + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: 'default-workspace', + headers: { + Authorization: 'bearer top-secret', + 'X-Request-Id': 'req-case', + 'X-Correlation-Id': 'corr-case', + }, + }); + + assert.deepEqual(result, { + ok: true, + context: { + workspaceId: 'default-workspace', + actor: { + actorId: 'system', + role: 'service', + }, + requestId: 'req-case', + correlationId: 'corr-case', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver preserves request metadata on forbidden membership failures', async () => { + const runtime = createRuntime({ + env: loadAppEnv(SERVICE_TOKEN_AUTH_ENV), + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: 'default-workspace', + headers: { + authorization: 'Bearer top-secret', + 'x-request-id': 'req-forbidden', + 'x-correlation-id': 'corr-forbidden', + }, + }); + + assert.deepEqual(result, { + ok: false, + statusCode: 403, + error: { + code: 'workspace_forbidden', + message: 'Workspace access denied', + requestId: 'req-forbidden', + correlationId: 'corr-forbidden', + workspaceId: 'default-workspace', + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime protected request resolver reuses an overridden ingestion service workspace store', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'system', role: 'owner' }], + }); + + const runtime = createRuntime({ + env: loadAppEnv({ + ...SERVICE_TOKEN_AUTH_ENV, + AUTH_SERVICE_ACTOR_ID: 'system', + }), + services: { + ingestion: createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }), + }, + }); + + try { + const result = await runtime.app.resolveProtectedRequest({ + workspaceId: 'ws-1', + headers: { authorization: 'Bearer top-secret' }, + }); + + assert.deepEqual(result, { + ok: true, + context: { + workspaceId: 'ws-1', + actor: { + actorId: 'system', + role: 'service', + }, + }, + }); + } finally { + await runtime.close(); + } +}); + +test('createRuntime rejects conflicting overridden service workspace stores when auth is enabled', async () => { + const ingestionRepository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'system', role: 'owner' }], + }); + const retrievalRepository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-2', slug: 'ws-2', displayName: 'Workspace Two' }], + memberships: [{ workspaceId: 'ws-2', actorId: 'system', role: 'owner' }], + }); + + assert.throws( + () => + createRuntime({ + env: loadAppEnv({ + ...SERVICE_TOKEN_AUTH_ENV, + AUTH_SERVICE_ACTOR_ID: 'system', + }), + services: { + ingestion: createIngestionService({ + workspaceStore: ingestionRepository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }), + retrieval: createRetrievalService({ + workspaceStore: retrievalRepository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + artifactStorage: { + async storeArtifact() { + return undefined; + }, + }, + extractorRuntime: { + registry: [], + runExtractors() { + return Promise.resolve([]); + }, + }, + }), + }, + }), + /auth-enabled runtime requires ingestion and retrieval services to share one workspace store/, + ); +}); + +test('createRuntime rejects untagged custom services when auth is enabled', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'system', role: 'owner' }], + }); + + assert.throws( + () => + createRuntime({ + env: loadAppEnv({ + ...SERVICE_TOKEN_AUTH_ENV, + AUTH_SERVICE_ACTOR_ID: 'system', + }), + workspaceStore: repository, + services: { + ingestion: { + async ingest() { + throw new Error('not used'); + }, + }, + }, + }), + /auth-enabled runtime with custom services requires services created by secondbrain-engine factories or tagged workspace stores/, + ); +}); + +test('createRuntime rejects mixed tagged and untagged custom services when auth is enabled', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'system', role: 'owner' }], + }); + + assert.throws( + () => + createRuntime({ + env: loadAppEnv({ + ...SERVICE_TOKEN_AUTH_ENV, + AUTH_SERVICE_ACTOR_ID: 'system', + }), + services: { + ingestion: createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }), + retrieval: { + async retrieve() { + throw new Error('not used'); + }, + }, + }, + }), + /auth-enabled runtime with custom services requires services created by secondbrain-engine factories or tagged workspace stores/, + ); +}); + +test('createRuntime allows conflicting custom service stores when auth is disabled', async () => { + const ingestionRepository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }], + }); + const retrievalRepository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-2', slug: 'ws-2', displayName: 'Workspace Two' }], + memberships: [{ workspaceId: 'ws-2', actorId: 'user-2', role: 'member' }], + }); + + const runtime = createRuntime({ + env: { + serviceName: CLEAN_RELATIONAL_ENV.SERVICE_NAME, + mode: 'development', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, + services: { + ingestion: createIngestionService({ + workspaceStore: ingestionRepository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }), + retrieval: createRetrievalService({ + workspaceStore: retrievalRepository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + artifactStorage: { + async storeArtifact() { + return undefined; + }, + }, + extractorRuntime: { + registry: [], + runExtractors() { + return Promise.resolve([]); + }, + }, + }), + }, + }); + + await runtime.close(); +}); + +test('createRuntime uses the active workspace store lifecycle hooks', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'system', role: 'owner' }], + }); + let initialized = 0; + let closed = 0; + repository.initialize = async () => { + initialized += 1; + }; + repository.close = async () => { + closed += 1; + }; + + const runtime = createRuntime({ + env: loadAppEnv({ + ...SERVICE_TOKEN_AUTH_ENV, + AUTH_SERVICE_ACTOR_ID: 'system', + }), + subsystems: { + workspaceStore: repository, + }, + }); + + await runtime.ready(); + await runtime.close(); + + assert.equal(initialized, 1); + assert.equal(closed, 1); +}); + +test('createRuntime uses tagged custom-service lifecycle hooks when auth is disabled', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }], + }); + let initialized = 0; + let closed = 0; + repository.initialize = async () => { + initialized += 1; + }; + repository.close = async () => { + closed += 1; + }; + + const runtime = createRuntime({ + env: { + serviceName: CLEAN_RELATIONAL_ENV.SERVICE_NAME, + mode: 'development', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, + services: { + ingestion: createIngestionService({ + workspaceStore: repository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }), + }, + }); + + await runtime.ready(); + await runtime.close(); + + assert.equal(initialized, 1); + assert.equal(closed, 1); +}); + +test('createRuntime preserves explicit workspaceStore lifecycle hooks alongside auth-disabled custom services', async () => { + const overrideRepository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-override', slug: 'ws-override', displayName: 'Override Workspace' }], + memberships: [{ workspaceId: 'ws-override', actorId: 'user-1', role: 'member' }], + }); + const ingestionRepository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-1', slug: 'ws-1', displayName: 'Workspace One' }], + memberships: [{ workspaceId: 'ws-1', actorId: 'user-1', role: 'member' }], + }); + const retrievalRepository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [{ workspaceId: 'ws-2', slug: 'ws-2', displayName: 'Workspace Two' }], + memberships: [{ workspaceId: 'ws-2', actorId: 'user-2', role: 'member' }], + }); + let initialized = 0; + let closed = 0; + overrideRepository.initialize = async () => { + initialized += 1; + }; + overrideRepository.close = async () => { + closed += 1; + }; + + const runtime = createRuntime({ + env: { + serviceName: CLEAN_RELATIONAL_ENV.SERVICE_NAME, + mode: 'development', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, + subsystems: { + workspaceStore: overrideRepository, + }, + services: { + ingestion: createIngestionService({ + workspaceStore: ingestionRepository, + graphMemory: { async upsertFacts() {} }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + }), + retrieval: createRetrievalService({ + workspaceStore: retrievalRepository, + graphMemory: { + async upsertFacts() {}, + async findRelatedContext() { + return []; + }, + }, + vectorIndex: { + async upsertDocuments() {}, + async search() { + return []; + }, + }, + artifactStorage: { + async storeArtifact() { + return undefined; + }, + }, + extractorRuntime: { + registry: [], + runExtractors() { + return Promise.resolve([]); + }, + }, + }), + }, + }); + + await runtime.ready(); + await runtime.close(); + + assert.equal(initialized, 1); + assert.equal(closed, 1); +}); + +test('createRuntime production mode does not authorize the bootstrap default workspace', async () => { + const runtime = createRuntime({ + env: { + serviceName: CLEAN_RELATIONAL_ENV.SERVICE_NAME, + mode: 'production', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, + }); + + try { + await assert.rejects( + () => + runtime.app.services.retrieval.retrieve({ + workspace: { + workspaceId: 'default-workspace', + actor: { actorId: 'system', role: 'owner' }, + }, + queryText: 'what context do we have?', + }), + /workspace access denied/, + ); + } finally { + await runtime.close(); + } +}); + +test('runtime.close can be used as a detached callback on an idle server', async () => { + const runtime = createRuntime({ + env: { + serviceName: CLEAN_RELATIONAL_ENV.SERVICE_NAME, + mode: 'development', + port: 4000, + relational: { backend: 'memory' }, + auth: { mode: 'disabled' }, + }, + now: () => new Date('2026-03-23T00:00:00.000Z'), + }); + const close = runtime.close; + + await assert.doesNotReject(() => close()); }); test('createRuntime accepts subsystem overrides for non-default workspace stores', async () => { @@ -79,6 +822,35 @@ test('createRuntime accepts subsystem overrides for non-default workspace stores assert.equal(result.packets[0].sourceId, 'custom-source'); }); +test('createRuntime does not initialize the default postgres store when workspaceStore is overridden', async () => { + const repository = createInMemoryRelationalWorkspaceRepository({ + workspaces: [ + { workspaceId: 'ws-custom', slug: 'ws-custom', displayName: 'Custom Workspace' }, + ], + memberships: [ + { workspaceId: 'ws-custom', actorId: 'user-1', role: 'member' }, + ], + }); + + const runtime = createRuntime({ + env: loadAppEnv({ + RELATIONAL_BACKEND: 'postgres', + DATABASE_URL: 'postgresql://127.0.0.1:1/secondbrain', + DATABASE_SCHEMA: 'engine_test', + POSTGRES_AUTO_MIGRATE: 'true', + }), + subsystems: { + workspaceStore: repository, + }, + }); + + try { + await assert.doesNotReject(() => runtime.ready()); + } finally { + await runtime.close(); + } +}); + test('startServer accepts subsystem overrides for non-default workspace stores', async () => { const repository = createInMemoryRelationalWorkspaceRepository({ workspaces: [ @@ -123,13 +895,74 @@ test('startServer accepts subsystem overrides for non-default workspace stores', }); test('startServer listens successfully on an ephemeral port', async () => { - const runtime = await startServer(0); + const runtime = await withCleanRelationalEnv(() => startServer(0)); + const address = runtime.server.address(); + + assert.ok(address); + assert.notEqual(typeof address, 'string'); + + await runtime.close(); +}); + +test('startServer keeps /health public without auth credentials', async () => { + const runtime = await withCleanRelationalEnv(() => startServer(0)); const address = runtime.server.address(); assert.ok(address); assert.notEqual(typeof address, 'string'); - await closeServer(runtime.server); + try { + const response = await fetch(`http://127.0.0.1:${address.port}/health`); + const payload = await response.json(); + + assert.equal(response.status, 200); + assert.equal(payload.status, 'ok'); + assert.equal(payload.service, 'secondbrain-engine'); + } finally { + await runtime.close(); + } +}); + +test('startServer ignores ambient auth env when booted via withCleanRelationalEnv', async () => { + const original = { + AUTH_MODE: process.env.AUTH_MODE, + AUTH_SERVICE_TOKEN: process.env.AUTH_SERVICE_TOKEN, + AUTH_SERVICE_ACTOR_ID: process.env.AUTH_SERVICE_ACTOR_ID, + }; + + process.env.AUTH_MODE = 'service-token'; + delete process.env.AUTH_SERVICE_TOKEN; + delete process.env.AUTH_SERVICE_ACTOR_ID; + + let runtime; + + try { + runtime = await withCleanRelationalEnv(() => startServer(0)); + const address = runtime.server.address(); + + assert.ok(address); + assert.notEqual(typeof address, 'string'); + } finally { + if (runtime) { + await runtime.close(); + } + + for (const [key, value] of Object.entries(original)) { + if (value === undefined) { + delete process.env[key]; + } else { + process.env[key] = value; + } + } + } +}); + +test('runtime.close shuts down a listening server', async () => { + const runtime = await withCleanRelationalEnv(() => startServer(0)); + + await runtime.close(); + + assert.equal(runtime.server.listening, false); }); test('startServer rejects async bind failures', async () => { @@ -141,6 +974,20 @@ test('startServer rejects async bind failures', async () => { assert.ok(address); assert.notEqual(typeof address, 'string'); - await assert.rejects(() => startServer(address.port), /EADDRINUSE/); + await assert.rejects( + () => withCleanRelationalEnv(() => startServer(address.port)), + /EADDRINUSE/, + ); await closeServer(blocker); }); + +test('createRuntime surfaces postgres env configuration errors before boot', async () => { + assert.throws( + () => + loadAppEnv({ + RELATIONAL_BACKEND: 'postgres', + DATABASE_SCHEMA: 'engine_test', + }), + /RELATIONAL_BACKEND=postgres requires DATABASE_URL/, + ); +});