Skip to content

Commit a68afab

Browse files
committed
#123: Add hippocampus Chunker + minimal ingest pipeline (PageBuilder, Ingest, tests)
1 parent cafb4c8 commit a68afab

3 files changed

Lines changed: 266 additions & 0 deletions

File tree

hippocampus/Ingest.ts

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import type { Book, ModelProfile, MetadataStore, VectorStore } from "../core/types";
2+
import { hashText } from "../core/crypto/hash";
3+
import type { KeyPair } from "../core/crypto/sign";
4+
import { EmbeddingRunner } from "../embeddings/EmbeddingRunner";
5+
import { chunkText } from "./Chunker";
6+
import { buildPage } from "./PageBuilder";
7+
import { runPromotionSweep } from "../core/SalienceEngine";
8+
9+
/**
 * Dependencies and parameters required to run a text ingest.
 */
export interface IngestOptions {
  /** Model profile that drives chunking and supplies the embedding dimension. */
  modelProfile: ModelProfile;
  /** Produces one embedding per chunk via `embed(chunks)`. */
  embeddingRunner: EmbeddingRunner;
  /** Store the raw embedding vectors are appended to; returns per-vector offsets. */
  vectorStore: VectorStore;
  /** Persists pages, page-activity records, and the resulting book. */
  metadataStore: MetadataStore;
  /** Key pair used to sign each page; the public key is recorded on the page. */
  keyPair: KeyPair;
  /** Ingest timestamp in epoch milliseconds; defaults to `Date.now()`. */
  now?: number;
}
17+
18+
/**
 * Outcome of `ingestText`.
 */
export interface IngestResult {
  /** Signed pages built from each chunk, in chunk order. */
  pages: Array<Awaited<ReturnType<typeof buildPage>>>;
  /** Book grouping all ingested pages; `undefined` when the text produced no chunks. */
  book?: Book;
}
22+
23+
export async function ingestText(
24+
text: string,
25+
options: IngestOptions,
26+
): Promise<IngestResult> {
27+
const {
28+
modelProfile,
29+
embeddingRunner,
30+
vectorStore,
31+
metadataStore,
32+
keyPair,
33+
now = Date.now(),
34+
} = options;
35+
36+
const chunks = chunkText(text, modelProfile);
37+
if (chunks.length === 0) {
38+
return { pages: [], book: undefined };
39+
}
40+
41+
const createdAt = new Date(now).toISOString();
42+
43+
// Precompute page IDs (content hashes) so we can link prev/next before signing.
44+
const pageIds = await Promise.all(chunks.map((c) => hashText(c)));
45+
46+
const embeddings = await embeddingRunner.embed(chunks);
47+
if (embeddings.length !== chunks.length) {
48+
throw new Error("Embedding provider returned unexpected number of embeddings");
49+
}
50+
51+
const offsets: number[] = [];
52+
for (const embedding of embeddings) {
53+
const offset = await vectorStore.appendVector(embedding);
54+
offsets.push(offset);
55+
}
56+
57+
const pages = await Promise.all(
58+
chunks.map(async (content, idx) => {
59+
const prevPageId = idx > 0 ? pageIds[idx - 1] : null;
60+
const nextPageId = idx < pageIds.length - 1 ? pageIds[idx + 1] : null;
61+
62+
return buildPage({
63+
content,
64+
embedding: embeddings[idx],
65+
embeddingOffset: offsets[idx],
66+
embeddingDim: modelProfile.embeddingDimension,
67+
creatorPubKey: keyPair.publicKey,
68+
signingKey: keyPair.signingKey,
69+
prevPageId,
70+
nextPageId,
71+
createdAt,
72+
});
73+
}),
74+
);
75+
76+
// Persist pages and activity records.
77+
for (const page of pages) {
78+
await metadataStore.putPage(page);
79+
await metadataStore.putPageActivity({
80+
pageId: page.pageId,
81+
queryHitCount: 0,
82+
lastQueryAt: createdAt,
83+
});
84+
}
85+
86+
// Build a simple book containing all pages.
87+
const bookId = await hashText(pageIds.join("|"));
88+
const book: Book = {
89+
bookId,
90+
pageIds,
91+
medoidPageId: pageIds[0],
92+
meta: {},
93+
};
94+
await metadataStore.putBook(book);
95+
96+
// Run hotpath promotion for the newly ingested pages.
97+
await runPromotionSweep(pageIds, metadataStore);
98+
99+
return { pages, book };
100+
}

hippocampus/PageBuilder.ts

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import type { Hash, Page } from "../core/types";
2+
import type { KeyPair } from "../core/crypto/sign";
3+
import { hashBinary, hashText } from "../core/crypto/hash";
4+
import { signData } from "../core/crypto/sign";
5+
6+
/**
 * Inputs needed to assemble and sign a single Page.
 */
export interface BuildPageOptions {
  /** Raw chunk text stored on the page; its hash becomes the page id. */
  content: string;
  /** Embedding vector for the content. Length must equal `embeddingDim`. */
  embedding: Float32Array;
  /** Offset of the vector within the vector store. */
  embeddingOffset: number;
  /** Declared embedding dimension; a mismatch makes `buildPage` throw. */
  embeddingDim: number;
  /** Public key recorded on the page as its creator. */
  creatorPubKey: string;
  /** Key used to sign the canonical page representation. */
  signingKey: CryptoKey;
  /** Id of the previous page in the chain; defaults to null. */
  prevPageId?: Hash | null;
  /** Id of the next page in the chain; defaults to null. */
  nextPageId?: Hash | null;
  /** ISO-8601 creation timestamp; defaults to the current time. */
  createdAt?: string;
}
17+
18+
/**
19+
* Build a Page entity from content + embedding.
20+
*
21+
* Creates deterministic `pageId`/`contentHash` from content, a `vectorHash` from
22+
* the raw embedding bytes, and signs the page using the provided key.
23+
*/
24+
export async function buildPage(options: BuildPageOptions): Promise<Page> {
25+
const {
26+
content,
27+
embedding,
28+
embeddingOffset,
29+
embeddingDim,
30+
creatorPubKey,
31+
signingKey,
32+
prevPageId = null,
33+
nextPageId = null,
34+
createdAt = new Date().toISOString(),
35+
} = options;
36+
37+
if (embedding.length !== embeddingDim) {
38+
throw new Error(
39+
`Embedding dimension mismatch: expected ${embeddingDim}, got ${embedding.length}`,
40+
);
41+
}
42+
43+
const contentHash = await hashText(content);
44+
const pageId = contentHash;
45+
46+
const rawVector = embedding.buffer.slice(
47+
embedding.byteOffset,
48+
embedding.byteOffset + embedding.byteLength,
49+
);
50+
const vectorHash = await hashBinary(rawVector);
51+
52+
const unsignedPage = {
53+
pageId,
54+
content,
55+
embeddingOffset,
56+
embeddingDim,
57+
contentHash,
58+
vectorHash,
59+
prevPageId: prevPageId ?? null,
60+
nextPageId: nextPageId ?? null,
61+
creatorPubKey,
62+
createdAt,
63+
} as const;
64+
65+
// Deterministic canonical representation used for signing.
66+
const canonical = canonicalizePageForSigning(unsignedPage);
67+
const signature = await signData(canonical, signingKey);
68+
69+
return {
70+
...unsignedPage,
71+
signature,
72+
};
73+
}
74+
75+
function canonicalizePageForSigning(page: Omit<Page, "signature">): string {
76+
// Keep key order stable for deterministic signing.
77+
return JSON.stringify({
78+
pageId: page.pageId,
79+
content: page.content,
80+
embeddingOffset: page.embeddingOffset,
81+
embeddingDim: page.embeddingDim,
82+
contentHash: page.contentHash,
83+
vectorHash: page.vectorHash,
84+
prevPageId: page.prevPageId ?? null,
85+
nextPageId: page.nextPageId ?? null,
86+
creatorPubKey: page.creatorPubKey,
87+
createdAt: page.createdAt,
88+
});
89+
}

tests/hippocampus/Ingest.test.ts

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import { describe, expect, it, beforeEach } from "vitest";
2+
import { IDBFactory, IDBKeyRange as FakeIDBKeyRange } from "fake-indexeddb";
3+
4+
import { IndexedDbMetadataStore } from "../../storage/IndexedDbMetadataStore";
5+
import { MemoryVectorStore } from "../../storage/MemoryVectorStore";
6+
import { DeterministicDummyEmbeddingBackend } from "../../embeddings/DeterministicDummyEmbeddingBackend";
7+
import { EmbeddingRunner } from "../../embeddings/EmbeddingRunner";
8+
import { generateKeyPair } from "../../core/crypto/sign";
9+
import { ingestText } from "../../hippocampus/Ingest";
10+
import type { ModelProfile } from "../../core/ModelProfile";
11+
12+
// Monotonic counter combined with a timestamp so every test opens a distinct
// IndexedDB database and state never leaks between runs.
let dbCounter = 0;
function freshDbName(): string {
  return `cortex-ingest-test-${Date.now()}-${++dbCounter}`;
}

describe("hippocampus ingest", () => {
  beforeEach(() => {
    // Install a fresh fake IndexedDB (and its key-range helper) on the global
    // object before each test, isolating stored state per test case.
    (globalThis as Record<string, unknown>)["indexedDB"] = new IDBFactory();
    (globalThis as Record<string, unknown>)["IDBKeyRange"] = FakeIDBKeyRange;
  });

  it("persists pages, metadata, and book records", async () => {
    const metadataStore = await IndexedDbMetadataStore.open(freshDbName());
    const vectorStore = new MemoryVectorStore();
    const keyPair = await generateKeyPair();

    // Deterministic embedding backend so the ingest result is reproducible.
    const backend = new DeterministicDummyEmbeddingBackend({ dimension: 4 });
    const runner = new EmbeddingRunner(async () => ({
      backend,
      selectedKind: "dummy" as const,
      reason: "forced" as const,
      supportedKinds: ["dummy" as const],
      measurements: [],
    }));

    // Small maxChunkTokens forces the input below to split into multiple chunks.
    const profile: ModelProfile = {
      modelId: "test-model",
      embeddingDimension: 4,
      contextWindowTokens: 64,
      truncationTokens: 48,
      maxChunkTokens: 5,
      source: "metadata",
    };

    const text = "One two three four five six seven eight nine ten.";

    const result = await ingestText(text, {
      modelProfile: profile,
      embeddingRunner: runner,
      vectorStore,
      metadataStore,
      keyPair,
    });

    // Ten words at 5 tokens/chunk should yield at least two pages.
    expect(result.pages.length).toBeGreaterThanOrEqual(2);

    // Stored page should match returned page
    const stored = await metadataStore.getPage(result.pages[0].pageId);
    expect(stored).toEqual(result.pages[0]);

    // Activity record should be initialized
    const activity = await metadataStore.getPageActivity(result.pages[0].pageId);
    expect(activity).toEqual({
      pageId: result.pages[0].pageId,
      queryHitCount: 0,
      lastQueryAt: result.pages[0].createdAt,
    });

    // Book should contain the pages
    const storedBook = await metadataStore.getBook(result.book!.bookId);
    expect(storedBook).toEqual(result.book);

    // Vector store should have data stored for each page
    expect(vectorStore.byteLength).toBeGreaterThan(0);
  });
});

0 commit comments

Comments
 (0)