From 8d34f79e15ada2880d799473b624291f476936c3 Mon Sep 17 00:00:00 2001 From: Declade <110547349+Declade@users.noreply.github.com> Date: Sun, 17 May 2026 11:22:14 +0200 Subject: [PATCH 1/5] =?UTF-8?q?feat(slice-2):=20core=20modules=20=E2=80=94?= =?UTF-8?q?=20gateway=20client,=20redaction=20extractor,=20recall,=20mocks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the in-process methodology library Slice 2 needs: - src/gateway-client.ts — typed wrapper around POST /api/v1/proxy/messages with mode=proving_ground. 2-retry exponential backoff with jitter on 5xx + connection errors; no retry on 4xx; per-request timeout (30s default, LUCAIRN_REQUEST_TIMEOUT_MS env-configurable). Env reads are call-time only — module import is side-effect-free. - src/redaction-extractor.ts — pure converter from gateway proving-ground matches/missed/extras into a flat ExtractedRedaction[] tagged with HIPAA Safe Harbor category + verdict (tp/fn/fp). Unmapped extras carry hipaa_category=null so the FP count is preserved while taxonomy drift is observable. - src/hipaa-category-mapping.ts — explicit one-way map from Lucairn sanitizer internal taxonomy ([PERSON_N], [LOCATION_N], …) to the 18 HIPAA Safe Harbor categories (45 CFR § 164.514(b)(2)(i)). Placeholder-parsing mirrors gateway extractEntityTypes (proxy.go:1361-1395). - src/recall.ts — two consumer paths: aggregateExtracted() consumes gateway-attested verdicts (the harness's live path; arm's-length property preserved because matching runs inside the gateway, not in code Lucairn authored alongside the publication); computeRecallFromSpans() implements the ≥50%-character-overlap span-matching the Slice 2 brief locks for any future raw-span inline surface. Both produce the same RecallSummary shape. - src/mocks/gateway-fixtures.ts — deterministic mock builders for msw-backed unit tests + --mock smoke scripts. Configurable missRate + spuriousFpCount exercise recall paths against known oracles. 
- src/index.ts — barrel exports for the public surface. - package.json — adds msw ^2.14.6 devDependency. No new runtime deps. Cite-back for gateway response shape: proxy.go:35-58 (request schema), proxy.go:361-373 (mode + activity validation), proxy.go:1068-1080 (ground_truth_evaluation emission), ground_truth.go:5-138 (result shape). --- package.json | 1 + pnpm-lock.yaml | 395 ++++++++++++++++++++++++++++++++++ src/gateway-client.ts | 391 +++++++++++++++++++++++++++++++++ src/hipaa-category-mapping.ts | 146 +++++++++++++ src/index.ts | 61 +++++- src/mocks/gateway-fixtures.ts | 198 +++++++++++++++++ src/recall.ts | 359 ++++++++++++++++++++++++++++++ src/redaction-extractor.ts | 127 +++++++++++ 8 files changed, 1677 insertions(+), 1 deletion(-) create mode 100644 src/gateway-client.ts create mode 100644 src/hipaa-category-mapping.ts create mode 100644 src/mocks/gateway-fixtures.ts create mode 100644 src/recall.ts create mode 100644 src/redaction-extractor.ts diff --git a/package.json b/package.json index b4563ca..5279c6e 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ "devDependencies": { "@faker-js/faker": "^9.0.0", "@types/node": "^20.11.0", + "msw": "^2.14.6", "tsx": "^4.22.0", "typescript": "^5.4.0", "vitest": "^1.6.0" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7b5d5fc..83d46fc 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -14,6 +14,9 @@ importers: '@types/node': specifier: ^20.11.0 version: 20.19.41 + msw: + specifier: ^2.14.6 + version: 2.14.6(@types/node@20.19.41)(typescript@5.9.3) tsx: specifier: ^4.22.0 version: 4.22.1 @@ -324,6 +327,41 @@ packages: resolution: {integrity: sha512-OEl393iCOoo/z8bMezRlJu+GlRGlsKbUAN7jKB6LhnKoqKve5DXRpalbItIIcwnCjs1k/FOPjFzcA6Qn+H+YbA==} engines: {node: '>=18.0.0', npm: '>=9.0.0'} + '@inquirer/ansi@2.0.5': + resolution: {integrity: sha512-doc2sWgJpbFQ64UflSVd17ibMGDuxO1yKgOgLMwavzESnXjFWJqUeG8saYosqKpHp4kWiM5x1nXvEjbpx90gzw==} + engines: {node: '>=23.5.0 || ^22.13.0 || ^21.7.0 || ^20.12.0'} + + 
'@inquirer/confirm@6.0.13': + resolution: {integrity: sha512-wkGPC7yJ5WJk1DJ5SX7fzk+gfj4BM8cf5dDDi71B/551xHrdsZVRJOC0WyikXd0pEsb/9cLniuE4atbsMqmFkw==} + engines: {node: '>=23.5.0 || ^22.13.0 || ^21.7.0 || ^20.12.0'} + peerDependencies: + '@types/node': '>=18' + peerDependenciesMeta: + '@types/node': + optional: true + + '@inquirer/core@11.1.10': + resolution: {integrity: sha512-a4Q5BXHQAHa9eO202sTaFCHFYVB3x5fauDuThEAdZ9gfn76pSxiKU7wWcEH0N1O0XmQvNfQNU6QXpiRxmYQx+A==} + engines: {node: '>=23.5.0 || ^22.13.0 || ^21.7.0 || ^20.12.0'} + peerDependencies: + '@types/node': '>=18' + peerDependenciesMeta: + '@types/node': + optional: true + + '@inquirer/figures@2.0.5': + resolution: {integrity: sha512-NsSs4kzfm12lNetHwAn3GEuH317IzpwrMCbOuMIVytpjnJ90YYHNwdRgYGuKmVxwuIqSgqk3M5qqQt1cDk0tGQ==} + engines: {node: '>=23.5.0 || ^22.13.0 || ^21.7.0 || ^20.12.0'} + + '@inquirer/type@4.0.5': + resolution: {integrity: sha512-aetVUNeKNc/VriqXlw1NRSW0zhMBB0W4bNbWRJgzRl/3d0QNDQFfk0GO5SDdtjMZVg6o8ZKEiadd7SCCzoOn5Q==} + engines: {node: '>=23.5.0 || ^22.13.0 || ^21.7.0 || ^20.12.0'} + peerDependencies: + '@types/node': '>=18' + peerDependenciesMeta: + '@types/node': + optional: true + '@jest/schemas@29.6.3': resolution: {integrity: sha512-mo5j5X+jIZmJQveBKeS/clAueipV7KgiX1vMgCxam1RNYiqE1w62n0/tJJnHtjW8ZHcQco5gY85jA3mi0L+nSA==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} @@ -331,6 +369,22 @@ packages: '@jridgewell/sourcemap-codec@1.5.5': resolution: {integrity: sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==} + '@mswjs/interceptors@0.41.9': + resolution: {integrity: sha512-VVPPgHyQ6ShqnrmDWuxjmUIsO9gWyOZFmuOfLd9LfBGQJwZfy0gvv9pbHSJuoFNIYC7ZDX9aoFwowjcdSC4E8w==} + engines: {node: '>=18'} + + '@open-draft/deferred-promise@2.2.0': + resolution: {integrity: sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA==} + + '@open-draft/deferred-promise@3.0.0': + resolution: {integrity: 
sha512-XW375UK8/9SqUVNVa6M0yEy8+iTi4QN5VZ7aZuRFQmy76LRwI9wy5F4YIBU6T+eTe2/DNDo8tqu8RHlwLHM6RA==} + + '@open-draft/logger@0.3.0': + resolution: {integrity: sha512-X2g45fzhxH238HKO4xbSr7+wBS8Fvw6ixhTDuvLd5mqh6bJJCFAPwU9mPDxbcrRtfxv4u5IHCEH77BmxvXmmxQ==} + + '@open-draft/until@2.1.0': + resolution: {integrity: sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg==} + '@rollup/rollup-android-arm-eabi@4.60.4': resolution: {integrity: sha512-F5QXMSiFebS9hKZj02XhWLLnRpJ3B3AROP0tWbFBSj+6kCbg5m9j5JoHKd4mmSVy5mS/IMQloYgYxCuJC0fxEQ==} cpu: [arm] @@ -481,6 +535,12 @@ packages: '@types/node@20.19.41': resolution: {integrity: sha512-ECymXOukMnOoVkC2bb1Vc/w/836DXncOg5m8Xj1RH7xSHZJWNYY6Zh7EH477vcnD5egKNNfy2RpNOmuChhFPgQ==} + '@types/set-cookie-parser@2.4.10': + resolution: {integrity: sha512-GGmQVGpQWUe5qglJozEjZV/5dyxbOOZ0LHe/lqyWssB88Y4svNfst0uqBVscdDeIKl5Jy5+aPSvy7mI9tYRguw==} + + '@types/statuses@2.0.6': + resolution: {integrity: sha512-xMAgYwceFhRA2zY+XbEA7mxYbA093wdiW8Vu6gZPGWy9cmOyU9XesH1tNcEWsKFd5Vzrqx5T3D38PWx1FIIXkA==} + '@vitest/expect@1.6.1': resolution: {integrity: sha512-jXL+9+ZNIJKruofqXuuTClf44eSpcHlgj3CiuNihUF3Ioujtmc0zIa3UJOW5RjDK1YLBJZnWBlPuqhYycLioog==} @@ -505,6 +565,14 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + ansi-regex@5.0.1: + resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==} + engines: {node: '>=8'} + + ansi-styles@4.3.0: + resolution: {integrity: sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==} + engines: {node: '>=8'} + ansi-styles@5.2.0: resolution: {integrity: sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==} engines: {node: '>=10'} @@ -523,9 +591,28 @@ packages: check-error@1.0.3: resolution: {integrity: sha512-iKEoDYaRmd1mxM90a2OEfWhjsjPpYPuQ+lMYsoxB126+t8fw7ySEO48nmDg5COTjxDI65/Y2OWpeEHk3ZOe8zg==} + cli-width@4.1.0: + resolution: 
{integrity: sha512-ouuZd4/dm2Sw5Gmqy6bGyNNNe1qt9RpmxveLSO7KcgsTnU7RXfsw+/bukWGo1abgBiMAic068rclZsO4IWmmxQ==} + engines: {node: '>= 12'} + + cliui@8.0.1: + resolution: {integrity: sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==} + engines: {node: '>=12'} + + color-convert@2.0.1: + resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==} + engines: {node: '>=7.0.0'} + + color-name@1.1.4: + resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} + confbox@0.1.8: resolution: {integrity: sha512-RMtmw0iFkeR4YV+fUOSucriAQNb9g8zFR52MWCtl+cCZOFRNL6zeB395vPzFhEjjn4fMxXudmELnl/KF/WrK6w==} + cookie@1.1.1: + resolution: {integrity: sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==} + engines: {node: '>=18'} + cross-spawn@7.0.6: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} @@ -547,6 +634,9 @@ packages: resolution: {integrity: sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + emoji-regex@8.0.0: + resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==} + esbuild@0.21.5: resolution: {integrity: sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==} engines: {node: '>=12'} @@ -557,6 +647,10 @@ packages: engines: {node: '>=18'} hasBin: true + escalade@3.2.0: + resolution: {integrity: sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==} + engines: {node: '>=6'} + estree-walker@3.0.3: resolution: {integrity: sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==} @@ -564,11 +658,24 
@@ packages: resolution: {integrity: sha512-VyhnebXciFV2DESc+p6B+y0LjSm0krU4OgJN44qFAhBY0TJ+1V61tYD2+wHusZ6F9n5K+vl8k0sTy7PEfV4qpg==} engines: {node: '>=16.17'} + fast-string-truncated-width@3.0.3: + resolution: {integrity: sha512-0jjjIEL6+0jag3l2XWWizO64/aZVtpiGE3t0Zgqxv0DPuxiMjvB3M24fCyhZUO4KomJQPj3LTSUnDP3GpdwC0g==} + + fast-string-width@3.0.2: + resolution: {integrity: sha512-gX8LrtNEI5hq8DVUfRQMbr5lpaS4nMIWV+7XEbXk2b8kiQIizgnlr12B4dA3ZEx3308ze0O4Q1R+cHts8kyUJg==} + + fast-wrap-ansi@0.2.0: + resolution: {integrity: sha512-rLV8JHxTyhVmFYhBJuMujcrHqOT2cnO5Zxj37qROj23CP39GXubJRBUFF0z8KFK77Uc0SukZUf7JZhsVEQ6n8w==} + fsevents@2.3.3: resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} os: [darwin] + get-caller-file@2.0.5: + resolution: {integrity: sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==} + engines: {node: 6.* || 8.* || >= 10.*} + get-func-name@2.0.2: resolution: {integrity: sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==} @@ -576,10 +683,24 @@ packages: resolution: {integrity: sha512-VaUJspBffn/LMCJVoMvSAdmscJyS1auj5Zulnn5UoYcY531UWmdwhRWkcGKnGU93m5HSXP9LP2usOryrBtQowA==} engines: {node: '>=16'} + graphql@16.14.0: + resolution: {integrity: sha512-BBvQ/406p+4CZbTpCbVPSxfzrZrbnuWSP1ELYgyS6B+hNeKzgrdB4JczCa5VZUBQrDa9hUngm0KnexY6pJRN5Q==} + engines: {node: ^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0} + + headers-polyfill@5.0.1: + resolution: {integrity: sha512-1TJ6Fih/b8h5TIcv+1+Hw0PDQWJTKDKzFZzcKOiW1wJza3XoAQlkCuXLbymPYB8+ZQyw8mHvdw560e8zVFIWyA==} + human-signals@5.0.0: resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==} engines: {node: '>=16.17.0'} + is-fullwidth-code-point@3.0.0: + resolution: {integrity: 
sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==} + engines: {node: '>=8'} + + is-node-process@1.2.0: + resolution: {integrity: sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==} + is-stream@3.0.0: resolution: {integrity: sha512-LnQR4bZ9IADDRSkvpqMGvt/tEJWclzklNgSw48V5EAaAeDd6qGvN8ei6k5p0tvxSR171VmGyHuTiAOfxAbr8kA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -613,6 +734,20 @@ packages: ms@2.1.3: resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} + msw@2.14.6: + resolution: {integrity: sha512-ALe+N10S72cyx94cMcy3Zs4HhXCj35sgeAL4c+WTvKi0zWnbd8/h0lcFqv0mb2P+aSgAdD7p9HzvA0DiUPxsyg==} + engines: {node: '>=18'} + hasBin: true + peerDependencies: + typescript: '>= 4.8.x' + peerDependenciesMeta: + typescript: + optional: true + + mute-stream@3.0.0: + resolution: {integrity: sha512-dkEJPVvun4FryqBmZ5KhDo0K9iDXAwn08tMLDinNdRBNPcYEDiWYysLcc6k3mjTMlbP9KyylvRpd4wFtwrT9rw==} + engines: {node: ^20.17.0 || >=22.9.0} + nanoid@3.3.12: resolution: {integrity: sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ==} engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} @@ -626,6 +761,9 @@ packages: resolution: {integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==} engines: {node: '>=12'} + outvariant@1.4.3: + resolution: {integrity: sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA==} + p-limit@5.0.0: resolution: {integrity: sha512-/Eaoq+QyLSiXQ4lyYV23f14mZRQcXnxfHrN0vCai+ak9G0pp9iEQukIIZq5NccEvwRB8PUnZT0KsOoDCINS1qQ==} engines: {node: '>=18'} @@ -638,6 +776,9 @@ packages: resolution: {integrity: sha512-haREypq7xkM7ErfgIyA0z+Bj4AGKlMSdlQE2jvJo6huWD1EdkKYV+G/T4nq0YEF2vgTT8kqMFKo1uHn950r4SQ==} engines: {node: '>=12'} + path-to-regexp@6.3.0: + resolution: {integrity: 
sha512-Yhpw4T9C6hPpgPeA28us07OJeqZ5EzQTkbfwuhsUg0c237RomFoETJgmp2sa3F/41gfLE6G5cqcYwznmeEeOlQ==} + pathe@1.1.2: resolution: {integrity: sha512-whLdWMYL2TwI08hn8/ZqAbrVemu0LNaNNJZX73O6qaIdCTfXutsLhMkjdENX0qhsQ9uIimo4/aQOmXkoon2nDQ==} @@ -664,11 +805,21 @@ packages: react-is@18.3.1: resolution: {integrity: sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==} + require-directory@2.1.1: + resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} + engines: {node: '>=0.10.0'} + + rettime@0.11.11: + resolution: {integrity: sha512-ILJRqVWBCTlg9r42fFgwVZx1gnFAcQF8mRoMkbgQfIrjEDf9nbBFDFx00oloOa+Q869FUtaYDXZvEfnecQSCoQ==} + rollup@4.60.4: resolution: {integrity: sha512-WHeFSbZYsPu3+bLoNRUuAO+wavNlocOPf3wSHTP7hcFKVnJeWsYlCDbr3mTS14FCizf9ccIxXA8sGL8zKeQN3g==} engines: {node: '>=18.0.0', npm: '>=8.0.0'} hasBin: true + set-cookie-parser@3.1.0: + resolution: {integrity: sha512-kjnC1DXBHcxaOaOXBHBeRtltsDG2nUiUni+jP92M9gYdW12rsmx92UsfpH7o5tDRs7I1ZZPSQJQGv3UaRfCiuw==} + shebang-command@2.0.0: resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} engines: {node: '>=8'} @@ -691,9 +842,24 @@ packages: stackback@0.0.2: resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==} + statuses@2.0.2: + resolution: {integrity: sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==} + engines: {node: '>= 0.8'} + std-env@3.10.0: resolution: {integrity: sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==} + strict-event-emitter@0.5.1: + resolution: {integrity: sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ==} + + string-width@4.2.3: + resolution: {integrity: 
sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} + engines: {node: '>=8'} + + strip-ansi@6.0.1: + resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} + engines: {node: '>=8'} + strip-final-newline@3.0.0: resolution: {integrity: sha512-dOESqjYr96iWYylGObzd39EuNTa5VJxyvVAEm5Jnh7KGo75V43Hk1odPQkNDyXNmUR6k+gEiDVXnjB8HJ3crXw==} engines: {node: '>=12'} @@ -701,6 +867,10 @@ packages: strip-literal@2.1.1: resolution: {integrity: sha512-631UJ6O00eNGfMiWG78ck80dfBab8X6IVFB51jZK5Icd7XAs60Z5y7QdSd/wGIklnWvRbUNloVzhOKKmutxQ6Q==} + tagged-tag@1.0.0: + resolution: {integrity: sha512-yEFYrVhod+hdNyx7g5Bnkkb0G6si8HJurOoOEgC8B/O0uXLHlaey/65KRv6cuWBNhBgHKAROVpc7QyYqE5gFng==} + engines: {node: '>=20'} + tinybench@2.9.0: resolution: {integrity: sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==} @@ -712,6 +882,17 @@ packages: resolution: {integrity: sha512-KYad6Vy5VDWV4GH3fjpseMQ/XU2BhIYP7Vzd0LG44qRWm/Yt2WCOTicFdvmgo6gWaqooMQCawTtILVQJupKu7A==} engines: {node: '>=14.0.0'} + tldts-core@7.0.30: + resolution: {integrity: sha512-uiHN8PIB1VmWyS98eZYja4xzlYqeFZVjb4OuYlJQnZAuJhMw4PbKQOKgHKhBdJR3FE/t5mUQ1Kd80++B+qhD1Q==} + + tldts@7.0.30: + resolution: {integrity: sha512-ELrFxuqsDdHUwoh0XxDbxuLD3Wnz49Z57IFvTtvWy1hJdcMZjXLIuonjilCiWHlT2GbE4Wlv1wKVTzDFnXH1aw==} + hasBin: true + + tough-cookie@6.0.1: + resolution: {integrity: sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw==} + engines: {node: '>=16'} + tsx@4.22.1: resolution: {integrity: sha512-TvncJykhxAzFCk0VQZKBTClall4Pm7qXDSodb6uxi8QFa8X8mT6ABjxxsQ2opDRYxG7AzcRWXaFtruz5HJKuWg==} engines: {node: '>=18.0.0'} @@ -721,6 +902,10 @@ packages: resolution: {integrity: sha512-Acylog8/luQ8L7il+geoSxhEkazvkslg7PSNKOX59mbB9cOveP5aq9h74Y7YU8yDpJwetzQQrfIwtf4Wp4LKcw==} engines: {node: '>=4'} + type-fest@5.6.0: + resolution: {integrity: 
sha512-8ZiHFm91orbSAe2PSAiSVBVko18pbhbiB3U9GglSzF/zCGkR+rxpHx6sEMCUm4kxY4LjDIUGgCfUMtwfZfjfUA==} + engines: {node: '>=20'} + typescript@5.9.3: resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} engines: {node: '>=14.17'} @@ -732,6 +917,9 @@ packages: undici-types@6.21.0: resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} + until-async@3.0.2: + resolution: {integrity: sha512-IiSk4HlzAMqTUseHHe3VhIGyuFmN90zMTpD3Z3y8jeQbzLIq500MVM7Jq2vUAnTKAFPJrqwkzr6PoTcPhGcOiw==} + vite-node@1.6.1: resolution: {integrity: sha512-YAXkfvGtuTzwWbDSACdJSg4A4DZiAqckWe90Zapc/sEX3XvHcw1NdurM/6od8J207tSDqNbSsgdCacBgvJKFuA==} engines: {node: ^18.0.0 || >=20.0.0} @@ -803,6 +991,22 @@ packages: engines: {node: '>=8'} hasBin: true + wrap-ansi@7.0.0: + resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==} + engines: {node: '>=10'} + + y18n@5.0.8: + resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} + engines: {node: '>=10'} + + yargs-parser@21.1.1: + resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==} + engines: {node: '>=12'} + + yargs@17.7.2: + resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==} + engines: {node: '>=12'} + yocto-queue@1.2.2: resolution: {integrity: sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ==} engines: {node: '>=12.20'} @@ -958,12 +1162,59 @@ snapshots: '@faker-js/faker@9.9.0': {} + '@inquirer/ansi@2.0.5': {} + + '@inquirer/confirm@6.0.13(@types/node@20.19.41)': + dependencies: + '@inquirer/core': 11.1.10(@types/node@20.19.41) + '@inquirer/type': 4.0.5(@types/node@20.19.41) + optionalDependencies: + '@types/node': 
20.19.41 + + '@inquirer/core@11.1.10(@types/node@20.19.41)': + dependencies: + '@inquirer/ansi': 2.0.5 + '@inquirer/figures': 2.0.5 + '@inquirer/type': 4.0.5(@types/node@20.19.41) + cli-width: 4.1.0 + fast-wrap-ansi: 0.2.0 + mute-stream: 3.0.0 + signal-exit: 4.1.0 + optionalDependencies: + '@types/node': 20.19.41 + + '@inquirer/figures@2.0.5': {} + + '@inquirer/type@4.0.5(@types/node@20.19.41)': + optionalDependencies: + '@types/node': 20.19.41 + '@jest/schemas@29.6.3': dependencies: '@sinclair/typebox': 0.27.10 '@jridgewell/sourcemap-codec@1.5.5': {} + '@mswjs/interceptors@0.41.9': + dependencies: + '@open-draft/deferred-promise': 2.2.0 + '@open-draft/logger': 0.3.0 + '@open-draft/until': 2.1.0 + is-node-process: 1.2.0 + outvariant: 1.4.3 + strict-event-emitter: 0.5.1 + + '@open-draft/deferred-promise@2.2.0': {} + + '@open-draft/deferred-promise@3.0.0': {} + + '@open-draft/logger@0.3.0': + dependencies: + is-node-process: 1.2.0 + outvariant: 1.4.3 + + '@open-draft/until@2.1.0': {} + '@rollup/rollup-android-arm-eabi@4.60.4': optional: true @@ -1049,6 +1300,12 @@ snapshots: dependencies: undici-types: 6.21.0 + '@types/set-cookie-parser@2.4.10': + dependencies: + '@types/node': 20.19.41 + + '@types/statuses@2.0.6': {} + '@vitest/expect@1.6.1': dependencies: '@vitest/spy': 1.6.1 @@ -1084,6 +1341,12 @@ snapshots: acorn@8.16.0: {} + ansi-regex@5.0.1: {} + + ansi-styles@4.3.0: + dependencies: + color-convert: 2.0.1 + ansi-styles@5.2.0: {} assertion-error@1.1.0: {} @@ -1104,8 +1367,24 @@ snapshots: dependencies: get-func-name: 2.0.2 + cli-width@4.1.0: {} + + cliui@8.0.1: + dependencies: + string-width: 4.2.3 + strip-ansi: 6.0.1 + wrap-ansi: 7.0.0 + + color-convert@2.0.1: + dependencies: + color-name: 1.1.4 + + color-name@1.1.4: {} + confbox@0.1.8: {} + cookie@1.1.1: {} + cross-spawn@7.0.6: dependencies: path-key: 3.1.1 @@ -1122,6 +1401,8 @@ snapshots: diff-sequences@29.6.3: {} + emoji-regex@8.0.0: {} + esbuild@0.21.5: optionalDependencies: '@esbuild/aix-ppc64': 0.21.5 @@ 
-1177,6 +1458,8 @@ snapshots: '@esbuild/win32-ia32': 0.28.0 '@esbuild/win32-x64': 0.28.0 + escalade@3.2.0: {} + estree-walker@3.0.3: dependencies: '@types/estree': 1.0.9 @@ -1193,15 +1476,38 @@ snapshots: signal-exit: 4.1.0 strip-final-newline: 3.0.0 + fast-string-truncated-width@3.0.3: {} + + fast-string-width@3.0.2: + dependencies: + fast-string-truncated-width: 3.0.3 + + fast-wrap-ansi@0.2.0: + dependencies: + fast-string-width: 3.0.2 + fsevents@2.3.3: optional: true + get-caller-file@2.0.5: {} + get-func-name@2.0.2: {} get-stream@8.0.1: {} + graphql@16.14.0: {} + + headers-polyfill@5.0.1: + dependencies: + '@types/set-cookie-parser': 2.4.10 + set-cookie-parser: 3.1.0 + human-signals@5.0.0: {} + is-fullwidth-code-point@3.0.0: {} + + is-node-process@1.2.0: {} + is-stream@3.0.0: {} isexe@2.0.0: {} @@ -1234,6 +1540,33 @@ snapshots: ms@2.1.3: {} + msw@2.14.6(@types/node@20.19.41)(typescript@5.9.3): + dependencies: + '@inquirer/confirm': 6.0.13(@types/node@20.19.41) + '@mswjs/interceptors': 0.41.9 + '@open-draft/deferred-promise': 3.0.0 + '@types/statuses': 2.0.6 + cookie: 1.1.1 + graphql: 16.14.0 + headers-polyfill: 5.0.1 + is-node-process: 1.2.0 + outvariant: 1.4.3 + path-to-regexp: 6.3.0 + picocolors: 1.1.1 + rettime: 0.11.11 + statuses: 2.0.2 + strict-event-emitter: 0.5.1 + tough-cookie: 6.0.1 + type-fest: 5.6.0 + until-async: 3.0.2 + yargs: 17.7.2 + optionalDependencies: + typescript: 5.9.3 + transitivePeerDependencies: + - '@types/node' + + mute-stream@3.0.0: {} + nanoid@3.3.12: {} npm-run-path@5.3.0: @@ -1244,6 +1577,8 @@ snapshots: dependencies: mimic-fn: 4.0.0 + outvariant@1.4.3: {} + p-limit@5.0.0: dependencies: yocto-queue: 1.2.2 @@ -1252,6 +1587,8 @@ snapshots: path-key@4.0.0: {} + path-to-regexp@6.3.0: {} + pathe@1.1.2: {} pathe@2.0.3: {} @@ -1280,6 +1617,10 @@ snapshots: react-is@18.3.1: {} + require-directory@2.1.1: {} + + rettime@0.11.11: {} + rollup@4.60.4: dependencies: '@types/estree': 1.0.8 @@ -1311,6 +1652,8 @@ snapshots: 
'@rollup/rollup-win32-x64-msvc': 4.60.4 fsevents: 2.3.3 + set-cookie-parser@3.1.0: {} + shebang-command@2.0.0: dependencies: shebang-regex: 3.0.0 @@ -1325,20 +1668,46 @@ snapshots: stackback@0.0.2: {} + statuses@2.0.2: {} + std-env@3.10.0: {} + strict-event-emitter@0.5.1: {} + + string-width@4.2.3: + dependencies: + emoji-regex: 8.0.0 + is-fullwidth-code-point: 3.0.0 + strip-ansi: 6.0.1 + + strip-ansi@6.0.1: + dependencies: + ansi-regex: 5.0.1 + strip-final-newline@3.0.0: {} strip-literal@2.1.1: dependencies: js-tokens: 9.0.1 + tagged-tag@1.0.0: {} + tinybench@2.9.0: {} tinypool@0.8.4: {} tinyspy@2.2.1: {} + tldts-core@7.0.30: {} + + tldts@7.0.30: + dependencies: + tldts-core: 7.0.30 + + tough-cookie@6.0.1: + dependencies: + tldts: 7.0.30 + tsx@4.22.1: dependencies: esbuild: 0.28.0 @@ -1347,12 +1716,18 @@ snapshots: type-detect@4.1.0: {} + type-fest@5.6.0: + dependencies: + tagged-tag: 1.0.0 + typescript@5.9.3: {} ufo@1.6.4: {} undici-types@6.21.0: {} + until-async@3.0.2: {} + vite-node@1.6.1(@types/node@20.19.41): dependencies: cac: 6.7.14 @@ -1423,4 +1798,24 @@ snapshots: siginfo: 2.0.0 stackback: 0.0.2 + wrap-ansi@7.0.0: + dependencies: + ansi-styles: 4.3.0 + string-width: 4.2.3 + strip-ansi: 6.0.1 + + y18n@5.0.8: {} + + yargs-parser@21.1.1: {} + + yargs@17.7.2: + dependencies: + cliui: 8.0.1 + escalade: 3.2.0 + get-caller-file: 2.0.5 + require-directory: 2.1.1 + string-width: 4.2.3 + y18n: 5.0.8 + yargs-parser: 21.1.1 + yocto-queue@1.2.2: {} diff --git a/src/gateway-client.ts b/src/gateway-client.ts new file mode 100644 index 0000000..5dc8c11 --- /dev/null +++ b/src/gateway-client.ts @@ -0,0 +1,391 @@ +/** + * gateway-client.ts + * + * Typed wrapper around the Lucairn gateway's proving-ground proxy endpoint + * (`POST /api/v1/proxy/messages` with `mode: "proving_ground"`). 
+ * + * Why this endpoint: + * The proving-ground mode is the ONLY inline gateway surface that returns + * per-entity matching evidence (matches / missed / extras keyed by the + * caller-supplied annotation type) in the same HTTP response. The + * alternative inline surfaces — `/v1/messages` and the public-summary + * endpoint — emit only aggregate redaction counts and explicitly omit + * per-entity fields for privacy reasons. References: + * - dual-sandbox-architecture/services/gateway/internal/api/proxy.go:35-58 + * (proxyPIIAnnotation + proxyRequest schemas) + * - dual-sandbox-architecture/services/gateway/internal/api/proxy.go:361-373 + * (mode validation, ActivityID + GroundTruth requirements) + * - dual-sandbox-architecture/services/gateway/internal/api/proxy.go:1068-1080 + * (ground_truth_evaluation field emission) + * - dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:5-138 + * (groundTruthResult + per-item shapes) + * + * The retry policy is 2 retries with exponential backoff (base 500 ms, jitter + * 0–200 ms) on 5xx and connection errors only. 4xx errors are surfaced + * without retry. The per-request timeout defaults to 30 s and is configurable + * via LUCAIRN_REQUEST_TIMEOUT_MS. + * + * No real secret material is referenced at import time — env reads happen at + * call time inside makeGatewayClient(). Tests run with msw active and use + * synthetic URLs / keys. + */ + +import type { HipaaCategory, InjectedEntity } from './inject-pii-core.js'; + +/** + * The annotation we send to the gateway as ground truth. `type` carries the + * HIPAA Safe Harbor category verbatim, so the gateway echoes it back in + * `matches[].annotation_type` and `missed[].type` and we can aggregate + * directly without a second mapping pass. 
+ */ +export interface ProvingGroundAnnotation { + readonly type: HipaaCategory; + readonly value: string; + readonly start: number; + readonly end: number; +} + +/** + * The per-row request body emitted to `POST /api/v1/proxy/messages`. The + * prompt template + context fields are minimal because Paper 1's measurement + * is upstream of inference — we are measuring sanitizer recall against + * known-injected PHI, not LLM behaviour. The single context field + * `transcription` carries the row text; the prompt template trivially echoes + * it back so the inference call completes. + */ +export interface GatewayRequestBody { + readonly prompt_template: string; + readonly context: Readonly>; + readonly mode: 'proving_ground'; + readonly activity_id: string; + readonly ground_truth: Readonly>; + readonly relink_response: false; + readonly model?: string; + readonly max_tokens?: number; +} + +/** + * Mirrors `groundTruthMatch` in + * dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:20-24 + */ +export interface GroundTruthMatch { + readonly annotation_type: string; + readonly annotation_value: string; + readonly redacted_as: string; +} + +/** + * Mirrors `groundTruthMiss` in + * dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:26-30 + */ +export interface GroundTruthMiss { + readonly field: string; + readonly type: string; + readonly value: string; +} + +/** + * Mirrors `groundTruthExtra` in + * dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:32-35 + */ +export interface GroundTruthExtra { + readonly placeholder: string; + readonly original: string; +} + +/** + * Mirrors `groundTruthResult` in + * dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:7-18 + */ +export interface GroundTruthEvaluation { + readonly total_annotations: number; + readonly true_positives: number; + readonly false_negatives: number; + readonly false_positives: number; + readonly detection_rate: number; + 
readonly matches?: readonly GroundTruthMatch[]; + readonly missed?: readonly GroundTruthMiss[]; + readonly extras?: readonly GroundTruthExtra[]; +} + +/** + * Subset of the gateway proxy response that the harness reads. The full + * response also includes `result`, `model_used`, `dlp_redacted`, + * `compliance_trace`, etc. — those are surfaced verbatim in the raw NDJSON + * for downstream auditability but the harness reads only what's needed for + * cert collection + recall computation. + * + * `veil` is the Pro/Enterprise hint emitted at + * dual-sandbox-architecture/services/gateway/internal/api/proxy.go:1088-1094 + */ +export interface VeilHint { + readonly status: string; + readonly certificate_url: string; + readonly summary_url: string; +} + +export interface GatewayResponse { + readonly request_id: string; + readonly status?: string; + readonly latency_ms?: number; + readonly redaction_count?: number; + readonly ground_truth_evaluation?: GroundTruthEvaluation; + readonly veil?: VeilHint; + // Free-form passthrough for the raw NDJSON dump — typed loosely so the + // harness never silently drops fields that the gateway adds later. 
+ readonly [extra: string]: unknown; +} + +export interface GatewayRowInput { + readonly row_index: number; + readonly transcription: string; + readonly entities: readonly InjectedEntity[]; +} + +export interface GatewayRowResult { + readonly row_index: number; + readonly request_id: string; + readonly cert_url: string | null; + readonly summary_url: string | null; + readonly evaluation: GroundTruthEvaluation | null; + readonly redaction_count: number | null; + readonly latency_ms: number | null; + readonly raw_response: GatewayResponse; +} + +export interface GatewayClientOptions { + readonly gatewayUrl: string; + readonly apiKey: string; + readonly activityIdPrefix?: string; + readonly requestTimeoutMs?: number; + readonly maxRetries?: number; + readonly backoffBaseMs?: number; + readonly backoffJitterMs?: number; + readonly fetchFn?: typeof fetch; + readonly sleepFn?: (ms: number) => Promise; + readonly randomFn?: () => number; + readonly model?: string; + readonly maxTokens?: number; +} + +export interface GatewayClient { + runRow(row: GatewayRowInput): Promise; +} + +export class GatewayClientError extends Error { + public readonly status: number | null; + public readonly responseBody: string | null; + + constructor(message: string, status: number | null, responseBody: string | null) { + super(message); + this.name = 'GatewayClientError'; + this.status = status; + this.responseBody = responseBody; + } +} + +const DEFAULT_TIMEOUT_MS = 30_000; +const DEFAULT_MAX_RETRIES = 2; +const DEFAULT_BACKOFF_BASE_MS = 500; +const DEFAULT_BACKOFF_JITTER_MS = 200; +const DEFAULT_MODEL = 'claude-sonnet-4-6'; +const DEFAULT_MAX_TOKENS = 64; + +function defaultSleep(ms: number): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +/** + * Construct an annotation list suitable for the proving-ground ground_truth + * field. The keying field name is fixed at `transcription` because that is + * the single context field we route through the sanitizer. 
/**
 * Build the `ground_truth` payload for a proving-ground request.
 *
 * Every injected entity becomes one annotation under the single
 * `transcription` key, matching the one context field `runRow` sends.
 *
 * NOTE(review): the return type's type arguments were not visible in the
 * reviewed text; confirm against `GatewayRequestBody['ground_truth']`.
 */
function buildGroundTruth(
  entities: readonly InjectedEntity[],
): Record<string, ProvingGroundAnnotation[]> {
  return {
    transcription: entities.map((entity) => ({
      type: entity.category,
      value: entity.value,
      start: entity.start_char,
      end: entity.end_char,
    })),
  };
}

/**
 * Pure helper: pull the certificate URL and summary URL out of a gateway
 * response. Both fields are `null` when the response has no `veil` block or
 * the individual URL is missing.
 */
export function extractCertUrls(response: GatewayResponse): {
  cert_url: string | null;
  summary_url: string | null;
} {
  const veil = response.veil;
  if (!veil) return { cert_url: null, summary_url: null };
  return {
    cert_url: veil.certificate_url ?? null,
    summary_url: veil.summary_url ?? null,
  };
}

/** Largest delay `setTimeout` honours; anything larger overflows and fires almost immediately. */
const MAX_TIMER_DELAY_MS = 2_147_483_647;

/** Use the requested timeout only when it is a positive finite number; clamp it to the timer range. */
function resolveTimeoutMs(requested: number | undefined): number {
  if (requested === undefined || !Number.isFinite(requested) || requested <= 0) {
    return DEFAULT_TIMEOUT_MS;
  }
  return Math.min(requested, MAX_TIMER_DELAY_MS);
}

/**
 * Use the requested retry budget unless it is missing or NaN. NaN must be
 * rejected because `attempt > NaN` is never true, which would retry forever.
 */
function resolveMaxRetries(requested: number | undefined): number {
  if (requested === undefined || Number.isNaN(requested)) return DEFAULT_MAX_RETRIES;
  return requested;
}

/**
 * Create a gateway client that POSTs one proving-ground request per row to
 * `<gatewayUrl>/api/v1/proxy/messages`.
 *
 * Retry policy: 5xx responses and thrown fetch/body errors (connection
 * failure, timeout abort, unparsable body) are retried up to `maxRetries`
 * times with exponential backoff plus jitter; 4xx responses fail immediately.
 * The per-attempt timeout covers the whole exchange, including reading the
 * response body.
 *
 * @throws GatewayClientError when `gatewayUrl` or `apiKey` is blank.
 */
export function makeGatewayClient(options: GatewayClientOptions): GatewayClient {
  const baseUrl = options.gatewayUrl.trim();
  if (baseUrl === '') {
    throw new GatewayClientError(
      'gatewayUrl is required (or set LUCAIRN_GATEWAY_URL)',
      null,
      null,
    );
  }
  if (options.apiKey.trim() === '') {
    throw new GatewayClientError('apiKey is required (or set LUCAIRN_API_KEY)', null, null);
  }
  const fetchFn = options.fetchFn ?? fetch;
  const sleepFn = options.sleepFn ?? defaultSleep;
  const randomFn = options.randomFn ?? Math.random;
  const timeoutMs = resolveTimeoutMs(options.requestTimeoutMs);
  const maxRetries = resolveMaxRetries(options.maxRetries);
  const backoffBase = options.backoffBaseMs ?? DEFAULT_BACKOFF_BASE_MS;
  const backoffJitter = options.backoffJitterMs ?? DEFAULT_BACKOFF_JITTER_MS;
  const activityPrefix = options.activityIdPrefix ?? 'paper-1-healthcare';
  const model = options.model ?? DEFAULT_MODEL;
  const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
  const endpoint = `${baseUrl.replace(/\/+$/u, '')}/api/v1/proxy/messages`;

  /**
   * Send one row through the gateway and flatten the response.
   *
   * @throws GatewayClientError on a 4xx response, or once the retry budget is
   *   exhausted for 5xx responses / connection errors.
   */
  async function runRow(row: GatewayRowInput): Promise<GatewayRowResult> {
    const body: GatewayRequestBody = {
      prompt_template:
        'Echo the transcription back verbatim. Make no inferences. Transcription: {transcription}',
      context: { transcription: row.transcription },
      mode: 'proving_ground',
      activity_id: `${activityPrefix}-row-${row.row_index}`,
      ground_truth: buildGroundTruth(row.entities),
      relink_response: false,
      model,
      max_tokens: maxTokens,
    };
    // The payload is identical on every attempt, so serialise it once.
    const payload = JSON.stringify(body);

    // Total attempts = maxRetries + 1 (the initial attempt plus the retries).
    for (let attempt = 1; ; attempt += 1) {
      const controller = new AbortController();
      const timeoutHandle = setTimeout(() => {
        controller.abort();
      }, timeoutMs);
      try {
        const response = await fetchFn(endpoint, {
          method: 'POST',
          headers: {
            'content-type': 'application/json',
            'x-api-key': options.apiKey,
          },
          body: payload,
          signal: controller.signal,
        });
        if (response.status >= 500) {
          // Retry-eligible. Always drain the body so the text is available
          // for the terminal error.
          const text = await safeReadText(response);
          if (attempt > maxRetries) {
            throw new GatewayClientError(
              `gateway 5xx after ${attempt - 1} retries (status ${response.status})`,
              response.status,
              text,
            );
          }
        } else if (response.status >= 400) {
          // 4xx is terminal: surface immediately, no retry.
          const text = await safeReadText(response);
          throw new GatewayClientError(
            `gateway 4xx (status ${response.status})`,
            response.status,
            text,
          );
        } else {
          const raw = (await response.json()) as GatewayResponse;
          const urls = extractCertUrls(raw);
          return {
            row_index: row.row_index,
            request_id: raw.request_id ?? '',
            cert_url: urls.cert_url,
            summary_url: urls.summary_url,
            evaluation: raw.ground_truth_evaluation ?? null,
            redaction_count: typeof raw.redaction_count === 'number' ? raw.redaction_count : null,
            latency_ms: typeof raw.latency_ms === 'number' ? raw.latency_ms : null,
            raw_response: raw,
          };
        }
      } catch (err) {
        if (err instanceof GatewayClientError) {
          // Terminal: already classified above.
          throw err;
        }
        // Connection / abort / unknown error: the retry budget applies.
        if (attempt > maxRetries) {
          const reason = err instanceof Error ? err.message : String(err);
          throw new GatewayClientError(
            `gateway connection error after ${attempt - 1} retries: ${reason}`,
            null,
            null,
          );
        }
      } finally {
        // Cleared only after the body has been consumed (or the attempt
        // failed), so a stalled body is aborted by the timeout too.
        clearTimeout(timeoutHandle);
      }
      await sleepFn(computeBackoffMs(attempt, backoffBase, backoffJitter, randomFn));
    }
  }

  return { runRow };
}

/**
 * Delay before the next attempt: `baseMs * 2^(attempt - 1)` plus a jitter of
 * `randomFn() * jitterMs`, floored to whole milliseconds.
 */
function computeBackoffMs(
  attempt: number,
  baseMs: number,
  jitterMs: number,
  randomFn: () => number,
): number {
  const exponential = baseMs * 2 ** (attempt - 1);
  return Math.floor(exponential + randomFn() * jitterMs);
}

/** Read a response body as text; `null` when reading throws. Best-effort by design. */
async function safeReadText(response: Response): Promise<string | null> {
  try {
    return await response.text();
  } catch {
    return null;
  }
}

/** Accepts plain decimal digits only. */
const DECIMAL_INTEGER = /^\d+$/u;

/**
 * Read gateway URL, API key and request timeout from the environment.
 * Unset variables come back as `null` so callers can decide whether to enter
 * mock mode or fail.
 *
 * `LUCAIRN_REQUEST_TIMEOUT_MS` is honoured only when it is a positive integer
 * written as plain decimal digits (surrounding whitespace allowed). Values
 * such as `"30s"` or `"1e3"` yield `null`; a prefix-tolerant parse would
 * silently read them as 30 ms and 1 ms.
 */
export function readGatewayEnv(env: NodeJS.ProcessEnv = process.env): {
  gatewayUrl: string | null;
  apiKey: string | null;
  requestTimeoutMs: number | null;
} {
  const rawTimeout = env.LUCAIRN_REQUEST_TIMEOUT_MS?.trim();
  let requestTimeoutMs: number | null = null;
  if (rawTimeout !== undefined && DECIMAL_INTEGER.test(rawTimeout)) {
    const parsed = Number.parseInt(rawTimeout, 10);
    if (Number.isSafeInteger(parsed) && parsed > 0) {
      requestTimeoutMs = parsed;
    }
  }
  return {
    gatewayUrl: env.LUCAIRN_GATEWAY_URL ?? null,
    apiKey: env.LUCAIRN_API_KEY ?? null,
    requestTimeoutMs,
  };
}
/**
 * Explicit one-way map from Lucairn internal sanitizer types (the `TYPE` in
 * a `[TYPE_N]` placeholder) to HIPAA Safe Harbor categories.
 *
 * Several internal spellings can map to one category (e.g. `PERSON`,
 * `PERSON_NAME` and `NAME` all map to `NAME`). The table also carries
 * identity entries such as `BIOMETRIC_ID` and `FACE_PHOTO_REF`, so a
 * placeholder that already uses the HIPAA name resolves to itself.
 *
 * A type that is not listed here resolves to `null` in
 * `placeholderToHipaaCategory`; extend the table rather than relying on a
 * fallback.
 */
export const LUCAIRN_TO_HIPAA: Readonly<Record<string, HipaaCategory>> = Object.freeze({
  // Name-bearing types
  PERSON: 'NAME',
  PERSON_NAME: 'NAME',
  NAME: 'NAME',

  // Geographic subdivisions
  LOCATION: 'GEO_SUBDIVISION',
  ADDRESS: 'GEO_SUBDIVISION',
  STREET_ADDRESS: 'GEO_SUBDIVISION',
  ZIP_CODE: 'GEO_SUBDIVISION',
  GERMAN_ZIP_CODE: 'GEO_SUBDIVISION',
  CITY: 'GEO_SUBDIVISION',

  // Dates
  DATE: 'DATE',
  DATE_TIME: 'DATE',

  // Telephone / fax: sanitizer does not natively distinguish PHONE from FAX.
  // PHONE_NUMBER and PHONE both map to PHONE; FAX is only recognised when a
  // custom recognizer surfaces FAX explicitly.
  PHONE_NUMBER: 'PHONE',
  PHONE: 'PHONE',
  FAX: 'FAX',
  FAX_NUMBER: 'FAX',

  // Email
  EMAIL: 'EMAIL',
  EMAIL_ADDRESS: 'EMAIL',

  // US identifier-shaped categories
  US_SSN: 'SSN',
  SSN: 'SSN',

  // Medical record / health-plan / account / license / vehicle / device
  MRN: 'MRN',
  MEDICAL_RECORD_NUMBER: 'MRN',
  HEALTH_PLAN_ID: 'HEALTH_PLAN_ID',
  HEALTH_PLAN_BENEFICIARY_NUMBER: 'HEALTH_PLAN_ID',
  ACCOUNT_NUMBER: 'ACCOUNT_NUMBER',
  US_BANK_NUMBER: 'ACCOUNT_NUMBER',
  IBAN: 'ACCOUNT_NUMBER',
  IBAN_CODE: 'ACCOUNT_NUMBER',
  CREDIT_CARD: 'ACCOUNT_NUMBER',
  CREDIT_CARD_NUMBER: 'ACCOUNT_NUMBER',
  LICENSE_NUMBER: 'LICENSE_NUMBER',
  US_DRIVER_LICENSE: 'LICENSE_NUMBER',
  PROFESSIONAL_LICENSE: 'LICENSE_NUMBER',
  VEHICLE_ID: 'VEHICLE_ID',
  VIN: 'VEHICLE_ID',
  US_VEHICLE_VIN: 'VEHICLE_ID',
  LICENSE_PLATE: 'VEHICLE_ID',
  DEVICE_ID: 'DEVICE_ID',
  DEVICE_SERIAL: 'DEVICE_ID',
  IMEI: 'DEVICE_ID',

  // Web identifiers
  URL: 'URL',
  IP_ADDRESS: 'IP_ADDRESS',

  // Biometric / face photo / other unique ID
  BIOMETRIC_ID: 'BIOMETRIC_ID',
  FACE_PHOTO_REF: 'FACE_PHOTO_REF',
  STUDY_ID: 'OTHER_UNIQUE_ID',
  OTHER_UNIQUE_ID: 'OTHER_UNIQUE_ID',
  PASSPORT: 'OTHER_UNIQUE_ID',
  US_PASSPORT: 'OTHER_UNIQUE_ID',
  US_ITIN: 'OTHER_UNIQUE_ID',
});

/** One or more ASCII digits: the `N` in `[TYPE_N]`. */
const PLACEHOLDER_SEQUENCE = /^\d+$/u;

/**
 * Parse the internal type prefix out of a `[TYPE_N]` placeholder.
 *
 * Requires a leading `[`, a trailing `]`, at least one underscore with a
 * non-empty prefix before the last one, and an all-digit suffix after it.
 * The prefix itself is not validated, so `[FOO_BAR_3]` yields `FOO_BAR`.
 *
 * @returns the type prefix, or `null` for a malformed placeholder.
 */
export function parsePlaceholderType(placeholder: string): string | null {
  // Shortest well-formed placeholder is `[A_1]`; anything under 4 chars cannot match.
  if (placeholder.length < 4) return null;
  if (!placeholder.startsWith('[') || !placeholder.endsWith(']')) return null;
  const inner = placeholder.slice(1, -1);
  const lastUnderscore = inner.lastIndexOf('_');
  // `< 1` rejects both "no underscore" (-1) and "empty type prefix" (0).
  if (lastUnderscore < 1) return null;
  if (!PLACEHOLDER_SEQUENCE.test(inner.slice(lastUnderscore + 1))) return null;
  return inner.slice(0, lastUnderscore);
}

/**
 * Map a Lucairn `[TYPE_N]` placeholder to its HIPAA Safe Harbor category.
 *
 * Only the table's own keys count. A plain indexed read would also reach
 * inherited `Object.prototype` members, so a placeholder such as
 * `[constructor_1]` or `[__proto___1]` would otherwise come back as a
 * non-null value typed as a category.
 *
 * @returns the category, or `null` when the placeholder is malformed or its
 *   type is not a key of `LUCAIRN_TO_HIPAA`.
 */
export function placeholderToHipaaCategory(placeholder: string): HipaaCategory | null {
  const internalType = parsePlaceholderType(placeholder);
  if (internalType === null) return null;
  if (!Object.prototype.hasOwnProperty.call(LUCAIRN_TO_HIPAA, internalType)) return null;
  return LUCAIRN_TO_HIPAA[internalType] ?? null;
}
+ */ + +export { + HIPAA_CATEGORIES, + type HipaaCategory, + type InjectedEntity, + type InjectedRow, +} from './inject-pii-core.js'; + +export { + LUCAIRN_TO_HIPAA, + parsePlaceholderType, + placeholderToHipaaCategory, +} from './hipaa-category-mapping.js'; + +export { + type GatewayClient, + type GatewayClientOptions, + GatewayClientError, + type GatewayRequestBody, + type GatewayResponse, + type GatewayRowInput, + type GatewayRowResult, + type GroundTruthEvaluation, + type GroundTruthExtra, + type GroundTruthMatch, + type GroundTruthMiss, + type ProvingGroundAnnotation, + type VeilHint, + extractCertUrls, + makeGatewayClient, + readGatewayEnv, +} from './gateway-client.js'; + +export { + type ExtractedRedaction, + type RedactionVerdict, + extractFromEvaluation, + unmappedExtraTypes, +} from './redaction-extractor.js'; + +export { + type CategoryCounts, + type OverallCounts, + type PredictedSpan, + type RecallSummary, + type RowBreakdown, + type SpanEntity, + SPAN_OVERLAP_THRESHOLD, + aggregateExtracted, + computeRecallFromSpans, +} from './recall.js'; + +export { emitCsv, parseCsv, type CsvRow } from './csv.js'; diff --git a/src/mocks/gateway-fixtures.ts b/src/mocks/gateway-fixtures.ts new file mode 100644 index 0000000..9dbf652 --- /dev/null +++ b/src/mocks/gateway-fixtures.ts @@ -0,0 +1,198 @@ +/** + * mocks/gateway-fixtures.ts + * + * Deterministic mock-response builders that mirror the real gateway's + * proving-ground response shape exactly. Tests and the `--mock` smoke + * scripts mount these via msw (see msw setup in test files). + * + * The shape MUST track the gateway sources cited in + * `src/gateway-client.ts` — any divergence is a Slice 3 hazard. 
/** Knobs for `buildMockResponse`. */
export interface MockBuilderOptions {
  readonly rowIndex: number;
  readonly entities: readonly InjectedEntity[];
  /** Fraction in [0, 1] of injected entities the mock should "miss". 0 = perfect recall, 1.0 = no detections. */
  readonly missRate?: number;
  /** Optional fixed seed for the per-row PRNG. Default: rowIndex. */
  readonly seed?: number;
  /** When provided, latency_ms is set to this value. Default: a fixed 250. */
  readonly latencyMsOverride?: number;
  /** Synthetic spurious-redaction count, simulating false positives. */
  readonly spuriousFpCount?: number;
}

/** Internal sanitizer type the mock emits for each injected category. */
const PLACEHOLDER_FOR_CATEGORY: Readonly<Record<string, string>> = {
  NAME: 'PERSON',
  GEO_SUBDIVISION: 'LOCATION',
  DATE: 'DATE',
  PHONE: 'PHONE_NUMBER',
  FAX: 'FAX_NUMBER',
  EMAIL: 'EMAIL_ADDRESS',
  SSN: 'US_SSN',
  MRN: 'MEDICAL_RECORD_NUMBER',
  HEALTH_PLAN_ID: 'HEALTH_PLAN_ID',
  ACCOUNT_NUMBER: 'ACCOUNT_NUMBER',
  LICENSE_NUMBER: 'LICENSE_NUMBER',
  VEHICLE_ID: 'VEHICLE_ID',
  DEVICE_ID: 'DEVICE_ID',
  URL: 'URL',
  IP_ADDRESS: 'IP_ADDRESS',
  BIOMETRIC_ID: 'BIOMETRIC_ID',
  FACE_PHOTO_REF: 'FACE_PHOTO_REF',
  OTHER_UNIQUE_ID: 'STUDY_ID',
};

/**
 * Build a mock gateway response for a single row.
 *
 * One PRNG draw is consumed per injected entity, in input order; a draw
 * below `missRate` turns the entity into a miss, anything else into a match.
 * Placeholder sequence numbers are counted per internal type and shared
 * between matches and the synthetic extras. Output depends only on the
 * options and on the seeded `mulberry32` stream.
 */
export function buildMockResponse(options: MockBuilderOptions): GatewayResponse {
  const seed = options.seed ?? options.rowIndex;
  const missRate = clampUnit(options.missRate ?? 0);
  const spuriousFpCount = Math.max(0, options.spuriousFpCount ?? 0);
  const rng = mulberry32(seed);

  // Next `N` suffix per internal type.
  const seqByType = new Map<string, number>();
  const nextPlaceholder = (internalType: string): string => {
    const n = (seqByType.get(internalType) ?? 0) + 1;
    seqByType.set(internalType, n);
    return `[${internalType}_${n}]`;
  };

  const matches: GroundTruthMatch[] = [];
  const missed: GroundTruthMiss[] = [];
  for (const entity of options.entities) {
    if (rng() < missRate) {
      missed.push({ field: 'transcription', type: entity.category, value: entity.value });
      continue;
    }
    matches.push({
      annotation_type: entity.category,
      annotation_value: entity.value,
      redacted_as: nextPlaceholder(PLACEHOLDER_FOR_CATEGORY[entity.category] ?? 'OTHER'),
    });
  }

  // Spurious detections cycle through three internal types so FP-handling
  // paths see more than one placeholder kind.
  const extras: GroundTruthExtra[] = [];
  for (let i = 0; i < spuriousFpCount; i++) {
    const internalType = ['PERSON', 'LOCATION', 'PHONE_NUMBER'][i % 3] ?? 'PERSON';
    extras.push({
      placeholder: nextPlaceholder(internalType),
      original: `spurious_${seed}_${i}`,
    });
  }

  const totalAnnotations = options.entities.length;
  const certId = pseudoCertId(seed);
  return {
    request_id: `req_${seed.toString(16).padStart(8, '0')}`,
    status: 'JOB_STATUS_COMPLETED',
    latency_ms: options.latencyMsOverride ?? 250,
    result: 'mock-result-omitted',
    redaction_count: matches.length + extras.length,
    ground_truth_evaluation: {
      total_annotations: totalAnnotations,
      true_positives: matches.length,
      false_negatives: missed.length,
      false_positives: extras.length,
      // An empty ground truth counts as perfect detection.
      detection_rate: totalAnnotations === 0 ? 1.0 : matches.length / totalAnnotations,
      matches,
      missed,
      extras,
    },
    veil: {
      status: 'available',
      certificate_url: `/api/v1/veil/certificate/${certId}`,
      summary_url: `/api/v1/veil/certificate/${certId}/summary`,
    },
  };
}

/**
 * Recover the row index and ground-truth entities from a request body.
 *
 * The row index is the decimal suffix of `activity_id` (`…-row-<n>`), or
 * `null` when absent. Entities are read from `ground_truth.transcription`;
 * items lacking a string `type`/`value` or a numeric `start`/`end` are
 * skipped. A malformed body yields an empty entity list.
 */
export function entitiesFromRequestBody(body: unknown): {
  rowIndex: number | null;
  entities: InjectedEntity[];
} {
  if (typeof body !== 'object' || body === null) {
    return { rowIndex: null, entities: [] };
  }
  const fields = body as Record<string, unknown>;

  let rowIndex: number | null = null;
  const activity = fields['activity_id'];
  if (typeof activity === 'string') {
    const suffix = /-row-(\d+)$/u.exec(activity);
    if (suffix !== null) {
      const parsed = Number.parseInt(suffix[1] ?? '', 10);
      if (Number.isFinite(parsed)) rowIndex = parsed;
    }
  }

  const groundTruth = fields['ground_truth'];
  if (typeof groundTruth !== 'object' || groundTruth === null) {
    return { rowIndex, entities: [] };
  }
  const annotations = (groundTruth as Record<string, unknown>)['transcription'];
  if (!Array.isArray(annotations)) {
    return { rowIndex, entities: [] };
  }

  const entities: InjectedEntity[] = [];
  for (const item of annotations) {
    if (typeof item !== 'object' || item === null) continue;
    const a = item as Partial<ProvingGroundAnnotation>;
    const wellFormed =
      typeof a.type === 'string' &&
      typeof a.value === 'string' &&
      typeof a.start === 'number' &&
      typeof a.end === 'number';
    if (!wellFormed) continue;
    entities.push({
      category: a.type,
      value: a.value,
      start_char: a.start,
      end_char: a.end,
    } as InjectedEntity);
  }
  return { rowIndex, entities };
}

/** Clamp `x` into [0, 1]. */
function clampUnit(x: number): number {
  return x < 0 ? 0 : x > 1 ? 1 : x;
}

/** 32 hex chars, deterministic per seed: four mulberry32 draws of 8 hex digits each. */
function pseudoCertId(seed: number): string {
  const rng = mulberry32(seed);
  const words: string[] = [];
  for (let i = 0; i < 4; i++) {
    words.push(Math.floor(rng() * 0x1_0000_0000).toString(16).padStart(8, '0'));
  }
  return words.join('');
}
/** Overlap threshold for `computeRecallFromSpans`. Locked at 50% per Slice 2 brief. */
export const SPAN_OVERLAP_THRESHOLD = 0.5;

export interface CategoryCounts {
  readonly tp: number;
  readonly fp: number;
  readonly fn: number;
  readonly precision: number;
  readonly recall: number;
  readonly f1: number;
}

export interface OverallCounts {
  readonly tp: number;
  readonly fp: number;
  readonly fn: number;
  readonly precision: number;
  readonly recall: number;
  readonly f1: number;
  /** Total annotations in the ground truth (TP + FN). */
  readonly total_annotations: number;
}

export interface RowBreakdown {
  readonly row_index: number;
  readonly tp: number;
  readonly fp: number;
  readonly fn: number;
  readonly recall: number;
}

export interface RecallSummary {
  readonly schema_version: '1.0';
  readonly generator: 'lucairn-research/recall.ts';
  readonly overall: OverallCounts;
  /** Sorted by HipaaCategory canonical order from `HIPAA_CATEGORIES`. */
  readonly per_category: ReadonlyArray<{ category: HipaaCategory; counts: CategoryCounts }>;
  /** Sorted by row_index ascending. */
  readonly per_row: readonly RowBreakdown[];
  readonly notes: readonly string[];
}

/** Spans with start ≤ end. Treated as half-open intervals [start, end). */
export interface SpanEntity {
  readonly category: HipaaCategory;
  readonly value: string;
  readonly start_char: number;
  readonly end_char: number;
}

export interface PredictedSpan {
  /** Optional category for diagnostics and FP attribution; not required for matching. */
  readonly category?: HipaaCategory | null;
  readonly start_char: number;
  readonly end_char: number;
  /** Original PHI text the sanitizer matched, when known. */
  readonly value?: string;
}

interface MutableCategoryCounts {
  tp: number;
  fp: number;
  fn: number;
}

function emptyCategoryCounts(): MutableCategoryCounts {
  return { tp: 0, fp: 0, fn: 0 };
}

/** Bump the counter for `verdict`; anything that is not `tp`/`fp` counts as `fn`. */
function tally(bucket: MutableCategoryCounts, verdict: 'tp' | 'fp' | 'fn'): void {
  if (verdict === 'tp') bucket.tp += 1;
  else if (verdict === 'fp') bucket.fp += 1;
  else bucket.fn += 1;
}

/**
 * Derive precision, recall, F1 from raw TP/FP/FN. When (TP+FP)==0 or
 * (TP+FN)==0 the rate is reported as 0 rather than NaN, which keeps
 * summaries aggregatable when a category is absent.
 */
function deriveRates(tp: number, fp: number, fn: number): {
  precision: number;
  recall: number;
  f1: number;
} {
  const precision = tp + fp === 0 ? 0 : tp / (tp + fp);
  const recall = tp + fn === 0 ? 0 : tp / (tp + fn);
  const f1 = precision + recall === 0 ? 0 : (2 * precision * recall) / (precision + recall);
  return { precision, recall, f1 };
}

function finaliseCategory(c: MutableCategoryCounts): CategoryCounts {
  const r = deriveRates(c.tp, c.fp, c.fn);
  return { tp: c.tp, fp: c.fp, fn: c.fn, ...r };
}

/** One zeroed bucket per canonical category, in `HIPAA_CATEGORIES` order. */
function emptyCategoryTable(): Map<HipaaCategory, MutableCategoryCounts> {
  const table = new Map<HipaaCategory, MutableCategoryCounts>();
  for (const cat of HIPAA_CATEGORIES) {
    table.set(cat, emptyCategoryCounts());
  }
  return table;
}

/**
 * Shared tail of both aggregation paths: fold the per-category buckets plus
 * the uncategorised bucket into `overall`, and emit per-category and per-row
 * views in their documented orders.
 */
function assembleSummary(
  perCat: ReadonlyMap<HipaaCategory, MutableCategoryCounts>,
  perRow: ReadonlyMap<number, MutableCategoryCounts>,
  uncategorised: MutableCategoryCounts,
  notes: readonly string[],
): RecallSummary {
  let totTp = uncategorised.tp;
  let totFp = uncategorised.fp;
  let totFn = uncategorised.fn;
  const perCategory: Array<{ category: HipaaCategory; counts: CategoryCounts }> = [];
  for (const cat of HIPAA_CATEGORIES) {
    const c = perCat.get(cat) ?? emptyCategoryCounts();
    totTp += c.tp;
    totFp += c.fp;
    totFn += c.fn;
    perCategory.push({ category: cat, counts: finaliseCategory(c) });
  }

  const perRowOut: RowBreakdown[] = Array.from(perRow.entries())
    .sort((a, b) => a[0] - b[0])
    .map(([rowIndex, c]) => ({
      row_index: rowIndex,
      tp: c.tp,
      fp: c.fp,
      fn: c.fn,
      recall: deriveRates(c.tp, c.fp, c.fn).recall,
    }));

  return {
    schema_version: '1.0',
    generator: 'lucairn-research/recall.ts',
    overall: {
      tp: totTp,
      fp: totFp,
      fn: totFn,
      total_annotations: totTp + totFn,
      ...deriveRates(totTp, totFp, totFn),
    },
    per_category: perCategory,
    per_row: perRowOut,
    notes: [...notes],
  };
}

/**
 * Aggregate gateway-attested TP/FP/FN verdicts into a RecallSummary.
 *
 * Verdicts whose `hipaa_category` is `null` (or not a canonical category) go
 * into an uncategorised bucket: they count toward `overall` and `per_row`
 * but not `per_category`, and a note reporting them is appended.
 */
export function aggregateExtracted(
  extracted: readonly ExtractedRedaction[],
  notes: readonly string[] = [],
): RecallSummary {
  const perCat = emptyCategoryTable();
  const uncategorised = emptyCategoryCounts();
  const perRow = new Map<number, MutableCategoryCounts>();

  for (const r of extracted) {
    const catBucket = r.hipaa_category !== null ? perCat.get(r.hipaa_category) : undefined;
    tally(catBucket ?? uncategorised, r.verdict);

    let rowBucket = perRow.get(r.row_index);
    if (rowBucket === undefined) {
      rowBucket = emptyCategoryCounts();
      perRow.set(r.row_index, rowBucket);
    }
    tally(rowBucket, r.verdict);
  }

  const allNotes: string[] = [...notes];
  const unknownTotal = uncategorised.tp + uncategorised.fp + uncategorised.fn;
  if (unknownTotal > 0) {
    allNotes.push(
      `Encountered ${unknownTotal} verdict(s) with no HIPAA category mapping ` +
        `(tp=${uncategorised.tp} fp=${uncategorised.fp} fn=${uncategorised.fn}). These are included in overall counts ` +
        'but not in per_category. Extend src/hipaa-category-mapping.ts if these are recurring.',
    );
  }
  return assembleSummary(perCat, perRow, uncategorised, allNotes);
}

/**
 * ≥50%-character-overlap span matcher. A prediction `p` matches a ground-
 * truth entity `g` when `(overlap_chars(p, g) / length(g)) >= 0.5`. Ties
 * resolve to the earlier predicted span (lower start_char), then the smaller
 * predicted-span length.
 *
 * Each ground-truth entity matches at most one prediction and vice versa.
 * Unmatched ground truth → FN. Unmatched prediction → FP. A matched pair is
 * credited to the ground-truth entity's category.
 *
 * An unmatched prediction is attributed to its own category when that is a
 * canonical one. Otherwise it goes to an uncategorised bucket that still
 * feeds `overall`, so `overall` always equals the sum of `per_row`. A note is
 * appended when that bucket is non-empty.
 *
 * Only rows present in `groundTruth` are scored. When `predictedSpans` lists
 * the same `row_index` more than once, the last entry wins.
 */
export function computeRecallFromSpans(
  groundTruth: readonly InjectedRow[],
  predictedSpans: readonly { row_index: number; spans: readonly PredictedSpan[] }[],
  notes: readonly string[] = [],
): RecallSummary {
  const predictedByRow = new Map<number, readonly PredictedSpan[]>();
  for (const p of predictedSpans) {
    predictedByRow.set(p.row_index, p.spans);
  }

  const perCat = emptyCategoryTable();
  const uncategorised = emptyCategoryCounts();
  const perRow = new Map<number, MutableCategoryCounts>();

  for (const row of groundTruth) {
    const truth: readonly SpanEntity[] = row.entities;
    const preds = predictedByRow.get(row.row_index) ?? [];
    const { truthToPred, predMatched } = matchSpans(truth, preds);

    const rowBucket = emptyCategoryCounts();
    for (const t of truth) {
      const verdict = truthToPred.has(t) ? 'tp' : 'fn';
      tally(perCat.get(t.category) ?? uncategorised, verdict);
      tally(rowBucket, verdict);
    }
    for (const p of preds) {
      if (predMatched.has(p)) continue;
      const catBucket = p.category != null ? perCat.get(p.category) : undefined;
      tally(catBucket ?? uncategorised, 'fp');
      tally(rowBucket, 'fp');
    }
    perRow.set(row.row_index, rowBucket);
  }

  const allNotes: string[] = [...notes];
  const unknownTotal = uncategorised.tp + uncategorised.fp + uncategorised.fn;
  if (unknownTotal > 0) {
    allNotes.push(
      `Encountered ${unknownTotal} span verdict(s) with no HIPAA category ` +
        `(tp=${uncategorised.tp} fp=${uncategorised.fp} fn=${uncategorised.fn}). These are included in overall counts ` +
        'but not in per_category.',
    );
  }
  return assembleSummary(perCat, perRow, uncategorised, allNotes);
}

interface MatchResult {
  truthToPred: Map<SpanEntity, PredictedSpan>;
  predMatched: Set<PredictedSpan>;
}

/**
 * Greedy one-to-one assignment of predictions to ground-truth spans.
 *
 * Candidate pairs are those whose overlap covers at least
 * `SPAN_OVERLAP_THRESHOLD` of the ground-truth span. Zero-length ground-truth
 * spans never match. Spans are keyed by object identity.
 */
function matchSpans(
  truth: readonly SpanEntity[],
  preds: readonly PredictedSpan[],
): MatchResult {
  const candidates: Array<{
    truth: SpanEntity;
    pred: PredictedSpan;
    overlapFraction: number;
  }> = [];
  for (const t of truth) {
    const truthLen = Math.max(0, t.end_char - t.start_char);
    if (truthLen === 0) continue;
    for (const p of preds) {
      const overlap = Math.max(
        0,
        Math.min(t.end_char, p.end_char) - Math.max(t.start_char, p.start_char),
      );
      if (overlap <= 0) continue;
      const frac = overlap / truthLen;
      if (frac >= SPAN_OVERLAP_THRESHOLD) {
        candidates.push({ truth: t, pred: p, overlapFraction: frac });
      }
    }
  }
  // Order: overlap fraction desc, then prediction start asc, then prediction length asc.
  candidates.sort((a, b) => {
    if (b.overlapFraction !== a.overlapFraction) {
      return b.overlapFraction - a.overlapFraction;
    }
    if (a.pred.start_char !== b.pred.start_char) {
      return a.pred.start_char - b.pred.start_char;
    }
    return a.pred.end_char - a.pred.start_char - (b.pred.end_char - b.pred.start_char);
  });
  const truthMatched = new Set<SpanEntity>();
  const predMatched = new Set<PredictedSpan>();
  const truthToPred = new Map<SpanEntity, PredictedSpan>();
  for (const c of candidates) {
    if (truthMatched.has(c.truth) || predMatched.has(c.pred)) continue;
    truthMatched.add(c.truth);
    predMatched.add(c.pred);
    truthToPred.set(c.truth, c.pred);
  }
  return { truthToPred, predMatched };
}
The harness needs a single flat record + * per gateway-emitted decision, tagged with its HIPAA Safe Harbor category + * and a verdict (TP / FN / FP), so the recall layer can aggregate + * per-category without re-parsing the response shape. + * + * Cite-back: gateway emits `matches`/`missed`/`extras` at + * dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:5-138. + */ + +import type { HipaaCategory, HIPAA_CATEGORIES } from './inject-pii-core.js'; +import type { GroundTruthEvaluation } from './gateway-client.js'; +import { placeholderToHipaaCategory } from './hipaa-category-mapping.js'; + +// Imported only as the type-source for HipaaCategory; the runtime constant is +// imported via the values import below to satisfy isolated-modules + the +// noUnusedImports lint policy. +// eslint-disable-next-line @typescript-eslint/no-unused-vars +type _CategoryTypeAnchor = (typeof HIPAA_CATEGORIES)[number]; + +import { HIPAA_CATEGORIES as HIPAA_CATEGORIES_VALUES } from './inject-pii-core.js'; + +const HIPAA_SET = new Set(HIPAA_CATEGORIES_VALUES as readonly string[]); + +export type RedactionVerdict = 'tp' | 'fn' | 'fp'; + +export interface ExtractedRedaction { + readonly row_index: number; + readonly hipaa_category: HipaaCategory | null; + readonly verdict: RedactionVerdict; + /** Value the gateway compared against (ground-truth value for TP/FN; original PHI for FP). */ + readonly value: string; + /** Sanitizer placeholder (e.g. `[PERSON_1]`) for TP/FP; null for FN. */ + readonly placeholder: string | null; + /** When known, the field name the gateway saw (`transcription` for the harness). */ + readonly field: string | null; +} + +/** + * Convert a single proving-ground evaluation block into a list of flat + * extracted redactions tagged with HIPAA category + verdict. + * + * - TP rows: category = annotation_type (HIPAA-tagged in our submission). + * - FN rows: category = type (same source). 
/**
 * Flatten one proving-ground evaluation block into per-decision records.
 *
 * Records are emitted in a fixed order: every entry of `matches` (verdict
 * `tp`), then every entry of `missed` (verdict `fn`), then every entry of
 * `extras` (verdict `fp`). An absent array is treated as empty.
 *
 * Category source per verdict:
 * - `tp`: `annotation_type`, narrowed through `tagAsHipaa`.
 * - `fn`: `type`, narrowed through `tagAsHipaa`.
 * - `fp`: `placeholderToHipaaCategory(placeholder)`; the record is emitted
 *   whatever that call returns, so the FP count is never reduced.
 *
 * @param rowIndex - Row identifier copied onto every emitted record.
 * @param evaluation - Gateway evaluation block for that row.
 * @returns The flattened records; empty when all three arrays are absent or empty.
 */
export function extractFromEvaluation(
  rowIndex: number,
  evaluation: GroundTruthEvaluation,
): ExtractedRedaction[] {
  const truePositives = (evaluation.matches ?? []).map(
    (hit): ExtractedRedaction => ({
      row_index: rowIndex,
      hipaa_category: tagAsHipaa(hit.annotation_type),
      verdict: 'tp',
      value: hit.annotation_value,
      placeholder: hit.redacted_as,
      field: null,
    }),
  );

  const falseNegatives = (evaluation.missed ?? []).map(
    (gap): ExtractedRedaction => ({
      row_index: rowIndex,
      hipaa_category: tagAsHipaa(gap.type),
      verdict: 'fn',
      value: gap.value,
      // Nothing was redacted for a miss, so there is no placeholder to report.
      placeholder: null,
      field: gap.field,
    }),
  );

  const falsePositives = (evaluation.extras ?? []).map(
    (surplus): ExtractedRedaction => ({
      row_index: rowIndex,
      hipaa_category: placeholderToHipaaCategory(surplus.placeholder),
      verdict: 'fp',
      value: surplus.original,
      placeholder: surplus.placeholder,
      field: null,
    }),
  );

  return [...truePositives, ...falseNegatives, ...falsePositives];
}
[]) { + const mapped = placeholderToHipaaCategory(e.placeholder); + if (mapped === null) { + // Pull the inner type for the report. + const stripped = e.placeholder.replace(/^\[|\]$/gu, ''); + const t = stripped.replace(/_\d+$/u, ''); + if (!seen.has(t)) { + seen.add(t); + unmapped.push(t); + } + } + } + return unmapped; +} From b93244c2b3b4abb51238241797fdc1694f12d2e5 Mon Sep 17 00:00:00 2001 From: Declade <110547349+Declade@users.noreply.github.com> Date: Sun, 17 May 2026 11:30:58 +0200 Subject: [PATCH 2/5] feat(slice-2): pipeline runner, cert collector, recall computer + SUMMARY schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the three CLI scripts the Slice 2 harness needs: - scripts/run-pipeline.ts — orchestrates per-row gateway calls via POST /api/v1/proxy/messages (mode=proving_ground). --mock mounts an msw fixture server in-process; --live is reserved for Slice 3 and refuses to start without the explicit gate. Writes raw NDJSON to papers/paper-1-healthcare/raw-results/.ndjson (or --output). Supports --rows / --truth / --subset / --gateway / --api-key / --miss-rate / --spurious-fp-count / --activity-id-prefix. - scripts/collect-certs.ts — walks the NDJSON, extracts cert URL + summary URL + redaction count + overall verdict per row, emits CERTIFICATES.csv via src/csv.ts::emitCsv. Columns: row_index, cert_url, cert_id, summary_url, overall_verdict, redaction_count, latency_ms, timestamp_utc, error_code - scripts/compute-recall.ts — reads ground-truth JSONL + raw NDJSON (or re-runs the in-process mock when --redactions-source=mock), aggregates per-HIPAA-category recall / precision / F1 via aggregateExtracted, emits SUMMARY.json, validates against papers/_template/SUMMARY.schema.json in-process. Avoids a runtime dep on ajv via a minimal validator covering the schema subset used. Also adds: - papers/_template/SUMMARY.schema.json — Draft 2020-12 JSON Schema for SUMMARY.json. 
Enforces 18-category coverage in per_category, the four required overall fields, the RowBreakdown shape, and the schema_version / generator const fields. Reused by every paper in the program. - papers/paper-1-healthcare/raw-results/.gitignore + .gitkeep — directory scaffold; per-run NDJSON is gitignored at the repo level (datasets/.gitignore line 17) but the per-paper sub-tree's own .gitignore locks it locally too. - package.json — adds pipeline / collect-certs / compute-recall scripts. End-to-end smoke (all PASS): pnpm run pipeline -- --rows=5 --mock --output=/tmp/slice2-smoke.ndjson pnpm run collect-certs -- --input=/tmp/slice2-smoke.ndjson --output=/tmp/slice2-CERTIFICATES.csv pnpm run compute-recall -- --truth=ground-truth.jsonl --redactions-source=mock --rows=5 --output=/tmp/slice2-SUMMARY.json --- package.json | 5 +- papers/_template/SUMMARY.schema.json | 126 +++++++++ scripts/collect-certs.ts | 163 ++++++++++++ scripts/compute-recall.ts | 359 +++++++++++++++++++++++++ scripts/run-pipeline.ts | 374 +++++++++++++++++++++++++++ 5 files changed, 1026 insertions(+), 1 deletion(-) create mode 100644 papers/_template/SUMMARY.schema.json create mode 100644 scripts/collect-certs.ts create mode 100644 scripts/compute-recall.ts create mode 100644 scripts/run-pipeline.ts diff --git a/package.json b/package.json index 5279c6e..2c2e8c9 100644 --- a/package.json +++ b/package.json @@ -20,7 +20,10 @@ "test:watch": "vitest", "dataset:download": "node --import tsx scripts/download-mtsamples.ts", "dataset:inject-pii": "node --import tsx scripts/inject-pii.ts", - "dataset:verify-injection": "node --import tsx scripts/verify-injection.ts" + "dataset:verify-injection": "node --import tsx scripts/verify-injection.ts", + "pipeline": "node --import tsx scripts/run-pipeline.ts", + "collect-certs": "node --import tsx scripts/collect-certs.ts", + "compute-recall": "node --import tsx scripts/compute-recall.ts" }, "devDependencies": { "@faker-js/faker": "^9.0.0", diff --git 
a/papers/_template/SUMMARY.schema.json b/papers/_template/SUMMARY.schema.json new file mode 100644 index 0000000..f1a2f4f --- /dev/null +++ b/papers/_template/SUMMARY.schema.json @@ -0,0 +1,126 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/Declade/lucairn-research/papers/_template/SUMMARY.schema.json", + "title": "Lucairn Research Program — per-paper SUMMARY.json", + "description": "Aggregate recall / precision / F1 numbers per HIPAA Safe Harbor category + overall + per-row breakdown for any paper in the Lucairn Research Program. Mirrors the RecallSummary shape produced by src/recall.ts.", + "type": "object", + "required": [ + "schema_version", + "generator", + "overall", + "per_category", + "per_row", + "notes" + ], + "additionalProperties": false, + "properties": { + "schema_version": { + "type": "string", + "const": "1.0" + }, + "generator": { + "type": "string", + "const": "lucairn-research/recall.ts" + }, + "overall": { + "$ref": "#/$defs/OverallCounts" + }, + "per_category": { + "type": "array", + "items": { + "type": "object", + "required": ["category", "counts"], + "additionalProperties": false, + "properties": { + "category": { + "$ref": "#/$defs/HipaaCategory" + }, + "counts": { + "$ref": "#/$defs/CategoryCounts" + } + } + }, + "minItems": 18, + "maxItems": 18 + }, + "per_row": { + "type": "array", + "items": { + "type": "object", + "required": ["row_index", "tp", "fp", "fn", "recall"], + "additionalProperties": false, + "properties": { + "row_index": { "type": "integer", "minimum": 0 }, + "tp": { "type": "integer", "minimum": 0 }, + "fp": { "type": "integer", "minimum": 0 }, + "fn": { "type": "integer", "minimum": 0 }, + "recall": { "type": "number", "minimum": 0, "maximum": 1 } + } + } + }, + "notes": { + "type": "array", + "items": { "type": "string" } + } + }, + "$defs": { + "HipaaCategory": { + "type": "string", + "enum": [ + "NAME", + "GEO_SUBDIVISION", + "DATE", + "PHONE", + "FAX", + "EMAIL", + 
"SSN", + "MRN", + "HEALTH_PLAN_ID", + "ACCOUNT_NUMBER", + "LICENSE_NUMBER", + "VEHICLE_ID", + "DEVICE_ID", + "URL", + "IP_ADDRESS", + "BIOMETRIC_ID", + "FACE_PHOTO_REF", + "OTHER_UNIQUE_ID" + ] + }, + "CategoryCounts": { + "type": "object", + "required": ["tp", "fp", "fn", "precision", "recall", "f1"], + "additionalProperties": false, + "properties": { + "tp": { "type": "integer", "minimum": 0 }, + "fp": { "type": "integer", "minimum": 0 }, + "fn": { "type": "integer", "minimum": 0 }, + "precision": { "type": "number", "minimum": 0, "maximum": 1 }, + "recall": { "type": "number", "minimum": 0, "maximum": 1 }, + "f1": { "type": "number", "minimum": 0, "maximum": 1 } + } + }, + "OverallCounts": { + "type": "object", + "required": [ + "tp", + "fp", + "fn", + "total_annotations", + "precision", + "recall", + "f1" + ], + "additionalProperties": false, + "properties": { + "tp": { "type": "integer", "minimum": 0 }, + "fp": { "type": "integer", "minimum": 0 }, + "fn": { "type": "integer", "minimum": 0 }, + "total_annotations": { "type": "integer", "minimum": 0 }, + "precision": { "type": "number", "minimum": 0, "maximum": 1 }, + "recall": { "type": "number", "minimum": 0, "maximum": 1 }, + "f1": { "type": "number", "minimum": 0, "maximum": 1 } + } + } + } +} diff --git a/scripts/collect-certs.ts b/scripts/collect-certs.ts new file mode 100644 index 0000000..0e71d7e --- /dev/null +++ b/scripts/collect-certs.ts @@ -0,0 +1,163 @@ +/** + * collect-certs.ts + * + * Walks the NDJSON output of `run-pipeline.ts`, extracts each row's cert URL + * + summary URL + recall metadata, and emits a `CERTIFICATES.csv` appendix + * suitable for the paper. 
+ * + * Columns: + * row_index, cert_url, cert_id, summary_url, overall_verdict, + * redaction_count, latency_ms, timestamp_utc, error_code + * + * `overall_verdict` is a per-row tag derived from the gateway-attested + * ground_truth_evaluation: + * - "verified" when total_annotations > 0 and detection_rate == 1.0 + * - "partial" when total_annotations > 0 and 0 < detection_rate < 1.0 + * - "miss" when total_annotations > 0 and detection_rate == 0 + * - "n/a" when total_annotations == 0 (no ground truth submitted) + * - "error" when the row carried an error block + * + * Cert ID is parsed from the certificate_url's final path segment to keep + * the CSV readable without re-parsing URLs. + */ + +import { readFile, writeFile } from 'node:fs/promises'; + +import { emitCsv } from '../src/csv.js'; +import type { GroundTruthEvaluation } from '../src/gateway-client.js'; + +interface PipelineNdjsonRecord { + row_index: number; + timestamp_utc: string; + entities_submitted?: number; + transcription_length?: number; + gateway?: string; + mode?: 'mock' | 'live'; + result: { + row_index: number; + request_id: string; + cert_url: string | null; + summary_url: string | null; + evaluation: GroundTruthEvaluation | null; + redaction_count: number | null; + latency_ms: number | null; + } | null; + error: { code: string; message: string } | null; +} + +interface CliArgs { + input: string; + output: string; +} + +function parseArgs(argv: readonly string[]): CliArgs { + const args: CliArgs = { input: '', output: '' }; + for (const raw of argv) { + const eq = raw.indexOf('='); + const key = eq === -1 ? raw : raw.slice(0, eq); + const val = eq === -1 ? 
/**
 * Extract the certificate ID — the final path segment — from a certificate URL.
 *
 * The query string and fragment are discarded before the last segment is
 * taken, so `…/certs/abc123?format=json` yields `abc123` rather than
 * `abc123?format=json`. Trailing slashes are ignored.
 *
 * @param certUrl - Certificate URL, or null when the row carried none.
 * @returns The last path segment; the whole (trimmed) input when it contains
 *   no `/`; the empty string for null.
 */
function extractCertIdFromUrl(certUrl: string | null): string {
  if (certUrl === null) return '';
  // Everything from the first `?` or `#` onward is not part of the path.
  const cut = certUrl.search(/[?#]/u);
  const pathPart = cut === -1 ? certUrl : certUrl.slice(0, cut);
  const trimmed = pathPart.replace(/\/+$/u, '');
  const last = trimmed.lastIndexOf('/');
  return last === -1 ? trimmed : trimmed.slice(last + 1);
}
''; + const summaryUrl = r.result?.summary_url ?? ''; + const evaluation = r.result?.evaluation ?? null; + const verdict = classifyVerdict(evaluation, r.error); + return { + row_index: String(r.row_index), + cert_url: certUrl, + cert_id: extractCertIdFromUrl(r.result?.cert_url ?? null), + summary_url: summaryUrl, + overall_verdict: verdict, + redaction_count: r.result?.redaction_count === null || r.result?.redaction_count === undefined + ? '' + : String(r.result.redaction_count), + latency_ms: + r.result?.latency_ms === null || r.result?.latency_ms === undefined + ? '' + : String(r.result.latency_ms), + timestamp_utc: r.timestamp_utc, + error_code: r.error?.code ?? '', + }; + }); + + await writeFile(cli.output, emitCsv(headers, rows), 'utf8'); + process.stdout.write(`wrote ${rows.length} cert row(s) to ${cli.output}\n`); +} + +main().catch((err: unknown) => { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`collect-certs: ${msg}\n`); + process.exit(1); +}); diff --git a/scripts/compute-recall.ts b/scripts/compute-recall.ts new file mode 100644 index 0000000..801dbb1 --- /dev/null +++ b/scripts/compute-recall.ts @@ -0,0 +1,359 @@ +/** + * compute-recall.ts + * + * Reads the harness's NDJSON output, extracts per-row gateway-attested + * ground_truth_evaluation blocks, and emits a `SUMMARY.json` aggregate + * recall / precision / F1 file. Validates the emitted JSON against + * `papers/_template/SUMMARY.schema.json`. + * + * Two input modes: + * + * --redactions-source=ndjson (default): reads --input NDJSON, uses the + * gateway-attested evaluation blocks. This is the live path. + * --redactions-source=mock: re-runs the in-process mock against the + * ground-truth file, useful for math-only smoke without spinning up the + * full run-pipeline harness. Configurable via --miss-rate and + * --spurious-fp-count. 
/**
 * Parse the compute-recall command line (`--key=value` tokens).
 *
 * Numeric flags are matched against a strict decimal pattern before
 * conversion: `Number.parseInt` / `Number.parseFloat` stop at the first
 * character they cannot consume, so `--rows=5abc` would otherwise be accepted
 * as 5. Path flags must carry a non-empty value, so a bare `--input` fails
 * here instead of later as an attempt to read the empty path.
 *
 * `--help` / `-h` prints the usage line and exits the process with status 0.
 * A literal `--` separator and empty tokens are ignored.
 *
 * @param argv - Arguments after the script name.
 * @returns The parsed options, with defaults for every flag not supplied.
 * @throws Error on an unknown flag, an invalid value, or when
 *   `--redactions-source=ndjson` (the default) is used without `--input`.
 */
function parseArgs(argv: readonly string[]): CliArgs {
  const args: CliArgs = {
    truth: 'datasets/healthcare/with-injected-pii/ground-truth.jsonl',
    input: null,
    redactionsSource: 'ndjson',
    output: 'papers/paper-1-healthcare/SUMMARY.json',
    rows: null,
    missRate: 0,
    spuriousFpCount: 0,
  };

  const requirePath = (flag: string, value: string): string => {
    if (value === '') throw new Error(`${flag} requires a non-empty path`);
    return value;
  };
  const parseCount = (flag: string, value: string): number => {
    const n = /^\d+$/u.test(value) ? Number(value) : Number.NaN;
    if (!Number.isSafeInteger(n)) {
      throw new Error(`${flag} must be a non-negative integer`);
    }
    return n;
  };

  for (const raw of argv) {
    const eq = raw.indexOf('=');
    const key = eq === -1 ? raw : raw.slice(0, eq);
    const val = eq === -1 ? '' : raw.slice(eq + 1);
    switch (key) {
      case '--truth':
        args.truth = requirePath(key, val);
        break;
      case '--input':
        args.input = requirePath(key, val);
        break;
      case '--redactions-source':
        if (val !== 'ndjson' && val !== 'mock') {
          throw new Error('--redactions-source must be "ndjson" or "mock"');
        }
        args.redactionsSource = val;
        break;
      case '--output':
        args.output = requirePath(key, val);
        break;
      case '--rows':
        args.rows = parseCount(key, val);
        break;
      case '--miss-rate': {
        const isDecimal = /^(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/u.test(val);
        const f = isDecimal ? Number(val) : Number.NaN;
        if (!Number.isFinite(f) || f < 0 || f > 1) {
          throw new Error('--miss-rate must be in [0, 1]');
        }
        args.missRate = f;
        break;
      }
      case '--spurious-fp-count':
        args.spuriousFpCount = parseCount(key, val);
        break;
      case '--help':
      case '-h':
        process.stdout.write(
          'Usage: pnpm run compute-recall -- --truth=PATH ' +
            '[--input=PATH | --redactions-source=mock] [--rows=N] --output=PATH\n',
        );
        process.exit(0);
        break;
      default:
        if (raw.length > 0 && raw !== '--') {
          throw new Error(`unknown argument: ${raw}`);
        }
    }
  }
  if (args.redactionsSource === 'ndjson' && args.input === null) {
    throw new Error('--input is required when --redactions-source=ndjson');
  }
  return args;
}
/**
 * Read an NDJSON file and index the evaluation blocks it carries by row.
 *
 * A line contributes an entry only when it is a JSON object with a numeric
 * `row_index` and an object-valued `result.evaluation`. Every other line —
 * blank, a non-object JSON value such as `null`, a row whose `result` is
 * missing or null, or a row whose `evaluation` is missing or null — is
 * skipped. When a `row_index` repeats, the later line replaces the earlier.
 *
 * @param path - NDJSON file to read (UTF-8).
 * @returns Map from `row_index` to that row's evaluation block.
 * @throws Error naming the file and the 1-based line number when a non-blank
 *   line is not valid JSON; file-system errors from `readFile` propagate.
 */
async function loadEvaluationsFromNdjson(
  path: string,
): Promise<Map<number, GroundTruthEvaluation>> {
  const text = await readFile(path, 'utf8');
  const out = new Map<number, GroundTruthEvaluation>();
  let lineNo = 0;
  for (const ln of text.split('\n')) {
    lineNo += 1;
    const trimmed = ln.trim();
    if (trimmed === '') continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`${path}: line ${lineNo} not JSON: ${reason}`);
    }
    // `null` is valid JSON; guard before touching properties so such a line is
    // skipped like any other unusable record instead of raising a TypeError.
    if (typeof parsed !== 'object' || parsed === null) continue;
    const record = parsed as { row_index?: unknown; result?: unknown };
    if (typeof record.row_index !== 'number') continue;
    const result = record.result;
    if (typeof result !== 'object' || result === null) continue;
    const evaluation = (result as { evaluation?: unknown }).evaluation;
    if (typeof evaluation !== 'object' || evaluation === null) continue;
    out.set(record.row_index, evaluation as GroundTruthEvaluation);
  }
  return out;
}
/** The JSON Schema keywords `validateNode` understands; all others are ignored. */
interface Schema {
  type?: string;
  required?: readonly string[];
  additionalProperties?: boolean;
  properties?: Readonly<Record<string, Schema>>;
  $ref?: string;
  $defs?: Readonly<Record<string, Schema>>;
  enum?: readonly unknown[];
  const?: unknown;
  minimum?: number;
  maximum?: number;
  minItems?: number;
  maxItems?: number;
  items?: Schema;
}

/** Own-property test; keys inherited from `Object.prototype` do not count. */
function hasOwnKey(obj: object, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * Validate `node` against `schema`, throwing on the first violation.
 *
 * Supported keywords: `$ref` (only of the form `#/$defs/<name>`), `type`
 * (`object`, `array`, `integer`, `number`, `string`), `required`,
 * `properties`, `additionalProperties: false`, `items`, `minItems`,
 * `maxItems`, `minimum`, `maximum`, `const`, `enum`. For `object` and `array`
 * schemas only the structural keywords are applied; `const` / `enum` /
 * `minimum` / `maximum` are evaluated for the remaining types.
 *
 * The root schema is threaded through the recursion rather than cached in
 * module state, so consecutive top-level calls with different schemas each
 * resolve `$ref` against their own `$defs`. Property presence is decided with
 * own-property checks, so a key such as `constructor` is not mistaken for a
 * declared property or a satisfied requirement.
 *
 * @param node - Value under validation.
 * @param schema - Schema (or sub-schema) to apply; non-object schemas accept anything.
 * @param ptr - Path of `node`, used only in error messages.
 * @param root - Schema whose `$defs` resolve `$ref`; defaults to `schema`,
 *   which is what a top-level caller wants.
 * @throws Error describing the first violation and its path.
 */
function validateNode(node: unknown, schema: unknown, ptr: string, root?: Schema): void {
  if (typeof schema !== 'object' || schema === null) return;
  const s = schema as Schema;
  const rootSchema = root ?? s;
  if (s.$ref !== undefined) {
    const defs = rootSchema.$defs ?? s.$defs;
    const refName = s.$ref.replace(/^#\/\$defs\//u, '');
    const target = defs !== undefined && hasOwnKey(defs, refName) ? defs[refName] : undefined;
    if (target === undefined) {
      throw new Error(`schema: unresolved $ref ${s.$ref} at ${ptr}`);
    }
    validateNode(node, target, ptr, rootSchema);
    return;
  }
  if (s.type === 'object') {
    if (typeof node !== 'object' || node === null || Array.isArray(node)) {
      throw new Error(`schema: expected object at ${ptr}, got ${typeof node}`);
    }
    const obj = node as Record<string, unknown>;
    for (const req of s.required ?? []) {
      if (!hasOwnKey(obj, req)) {
        throw new Error(`schema: missing required property "${req}" at ${ptr}`);
      }
    }
    const props = s.properties ?? {};
    if (s.additionalProperties === false) {
      for (const k of Object.keys(obj)) {
        if (!hasOwnKey(props, k)) {
          throw new Error(`schema: unexpected property "${k}" at ${ptr}`);
        }
      }
    }
    for (const [k, sub] of Object.entries(props)) {
      if (hasOwnKey(obj, k)) validateNode(obj[k], sub, `${ptr}/${k}`, rootSchema);
    }
    return;
  }
  if (s.type === 'array') {
    if (!Array.isArray(node)) {
      throw new Error(`schema: expected array at ${ptr}`);
    }
    if (s.minItems !== undefined && node.length < s.minItems) {
      throw new Error(`schema: array at ${ptr} has ${node.length} items, min ${s.minItems}`);
    }
    if (s.maxItems !== undefined && node.length > s.maxItems) {
      throw new Error(`schema: array at ${ptr} has ${node.length} items, max ${s.maxItems}`);
    }
    if (s.items) {
      for (let i = 0; i < node.length; i++) {
        validateNode(node[i], s.items, `${ptr}/${i}`, rootSchema);
      }
    }
    return;
  }
  if (s.type === 'integer') {
    if (typeof node !== 'number' || !Number.isInteger(node)) {
      throw new Error(`schema: expected integer at ${ptr}`);
    }
  }
  if (s.type === 'number') {
    // Number.isFinite also rejects NaN and ±Infinity, which JSON cannot represent.
    if (typeof node !== 'number' || !Number.isFinite(node)) {
      throw new Error(`schema: expected number at ${ptr}`);
    }
  }
  if (s.type === 'string') {
    if (typeof node !== 'string') {
      throw new Error(`schema: expected string at ${ptr}`);
    }
  }
  if (s.minimum !== undefined && typeof node === 'number' && node < s.minimum) {
    throw new Error(`schema: ${ptr} below minimum ${s.minimum} (got ${node})`);
  }
  if (s.maximum !== undefined && typeof node === 'number' && node > s.maximum) {
    throw new Error(`schema: ${ptr} above maximum ${s.maximum} (got ${node})`);
  }
  if (s.const !== undefined && node !== s.const) {
    throw new Error(`schema: ${ptr} expected const ${JSON.stringify(s.const)}, got ${JSON.stringify(node)}`);
  }
  if (s.enum !== undefined && !s.enum.includes(node)) {
    throw new Error(`schema: ${ptr} value ${JSON.stringify(node)} not in enum`);
  }
}
/**
 * Entry point: build the recall summary and write it to `--output`.
 *
 * Steps:
 *  1. Parse the CLI and load the ground-truth rows, truncated to `--rows`.
 *  2. Obtain one evaluation per row — from `buildMockResponse` in mock mode,
 *     otherwise from the `--input` NDJSON — and flatten it with
 *     `extractFromEvaluation`.
 *  3. Aggregate with `aggregateExtracted`.
 *  4. Validate the summary against the template schema, THEN write it, so a
 *     summary that fails validation never reaches (or overwrites) the output
 *     file.
 *
 * Rows with no evaluation are excluded from the aggregate; their count is
 * reported on stderr so the exclusion is visible to the operator.
 *
 * @throws Error when parsing, loading, or schema validation fails.
 */
async function main(): Promise<void> {
  const cli = parseArgs(process.argv.slice(2));
  const groundTruth = await loadGroundTruth(cli.truth);
  const limit = cli.rows ?? groundTruth.length;
  const targetRows = groundTruth.slice(0, limit);

  const extracted: ExtractedRedaction[] = [];
  let rowsWithoutEvaluation = 0;
  if (cli.redactionsSource === 'mock') {
    for (const row of targetRows) {
      const mockResponse = buildMockResponse({
        rowIndex: row.row_index,
        entities: row.entities,
        missRate: cli.missRate,
        spuriousFpCount: cli.spuriousFpCount,
      });
      const evaluation = mockResponse.ground_truth_evaluation;
      if (evaluation === undefined) {
        rowsWithoutEvaluation += 1;
        continue;
      }
      extracted.push(...extractFromEvaluation(row.row_index, evaluation));
    }
  } else {
    if (cli.input === null) {
      throw new Error('--input is required when --redactions-source=ndjson');
    }
    const evals = await loadEvaluationsFromNdjson(cli.input);
    for (const row of targetRows) {
      const evaluation = evals.get(row.row_index);
      if (evaluation === undefined) {
        rowsWithoutEvaluation += 1;
        continue;
      }
      extracted.push(...extractFromEvaluation(row.row_index, evaluation));
    }
  }

  if (rowsWithoutEvaluation > 0) {
    process.stderr.write(
      `compute-recall: warning: ${rowsWithoutEvaluation} of ${targetRows.length} row(s) ` +
        'had no evaluation block and were excluded from the aggregate\n',
    );
  }

  const summary = aggregateExtracted(extracted, [
    `Source: ${cli.redactionsSource}; rows processed: ${targetRows.length}.`,
  ]);
  // Validate first: a schema failure must not leave an invalid SUMMARY.json behind.
  await validateAgainstSchema(summary, defaultSchemaPath());
  await writeFile(cli.output, JSON.stringify(summary, null, 2) + '\n', 'utf8');

  process.stdout.write(
    `wrote SUMMARY.json (${targetRows.length} rows, ` +
      `overall recall=${summary.overall.recall.toFixed(4)}, ` +
      `f1=${summary.overall.f1.toFixed(4)}) to ${cli.output}\n`,
  );
}
err.message : String(err); + process.stderr.write(`compute-recall: ${msg}\n`); + process.exit(1); +}); diff --git a/scripts/run-pipeline.ts b/scripts/run-pipeline.ts new file mode 100644 index 0000000..af0f43c --- /dev/null +++ b/scripts/run-pipeline.ts @@ -0,0 +1,374 @@ +/** + * run-pipeline.ts + * + * Slice 2 harness — call the Lucairn gateway row-by-row over the + * Measurement-B 500-row subset (or a smaller --rows slice), recording each + * gateway response to an NDJSON file under `papers/paper-1-healthcare/raw- + * results/`. Designed to run in two modes, one of which must be selected + * explicitly (there is no default): + * + * - LIVE (--live): hits a real gateway at LUCAIRN_GATEWAY_URL with a + * LUCAIRN_API_KEY. Live runs are deferred to Slice 3 per the locked + * halt gate. Do not run live by accident — the script refuses to start + * without an explicit --live flag. + * - MOCK (--mock): mounts a deterministic msw fixture server in-process. + * The harness fetches the placeholder `http://mock.lucairn.local` URL the msw handler + * intercepts. No network egress. The mock honours `--miss-rate` and + * `--spurious-fp-count` so smoke tests can drive recall paths against + * a known oracle.
+ * + * Usage: + * pnpm run pipeline -- --rows=5 --mock --output=/tmp/slice2-smoke.ndjson + * pnpm run pipeline -- --rows=500 --mock --output=papers/paper-1-healthcare/raw-results/mock-500.ndjson + * pnpm run pipeline -- --live --rows=20 # Slice 3 only + */ + +import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import { existsSync } from 'node:fs'; +import { dirname, resolve } from 'node:path'; +import { setupServer } from 'msw/node'; +import { http, HttpResponse } from 'msw'; + +import { + GatewayClientError, + type GatewayRowResult, + makeGatewayClient, + readGatewayEnv, +} from '../src/gateway-client.js'; +import type { InjectedEntity } from '../src/inject-pii-core.js'; +import { parseCsv } from '../src/csv.js'; +import { buildMockResponse, entitiesFromRequestBody } from '../src/mocks/gateway-fixtures.js'; + +const DEFAULT_TRUTH_PATH = + 'datasets/healthcare/with-injected-pii/ground-truth.jsonl'; +const DEFAULT_SUBSET_PATH = + 'datasets/healthcare/with-injected-pii/measurement-b-subset.csv'; +const MOCK_GATEWAY_URL = 'http://mock.lucairn.local'; +const MOCK_API_KEY = 'lcr_live_mock_0000000000000000000000000000'; + +interface CliArgs { + rows: number | null; + mock: boolean; + live: boolean; + truth: string; + subset: string; + output: string; + gateway: string | null; + apiKey: string | null; + missRate: number; + spuriousFpCount: number; + activityIdPrefix: string; +} + +function parseArgs(argv: readonly string[]): CliArgs { + const args: CliArgs = { + rows: null, + mock: false, + live: false, + truth: DEFAULT_TRUTH_PATH, + subset: DEFAULT_SUBSET_PATH, + output: `papers/paper-1-healthcare/raw-results/run-${new Date() + .toISOString() + .replace(/[:.]/gu, '-')}.ndjson`, + gateway: null, + apiKey: null, + missRate: 0, + spuriousFpCount: 0, + activityIdPrefix: 'paper-1-healthcare', + }; + for (const raw of argv) { + const eq = raw.indexOf('='); + const key = eq === -1 ? raw : raw.slice(0, eq); + const val = eq === -1 ? 
/**
 * Parse a CLI value as a non-negative integer.
 *
 * The whole string must consist of decimal digits. `Number.parseInt` alone
 * stops at the first character it cannot consume, so `12abc` and `1.5` would
 * otherwise be accepted as 12 and 1.
 *
 * @param s - Raw value (the text after `=`).
 * @param flag - Flag name, used in the error message.
 * @returns The parsed integer.
 * @throws Error when `s` is empty, not all digits, or beyond the safe-integer range.
 */
function parseIntOrThrow(s: string, flag: string): number {
  const n = /^\d+$/u.test(s) ? Number(s) : Number.NaN;
  if (!Number.isSafeInteger(n)) throw new Error(`${flag} requires a non-negative integer`);
  return n;
}

/**
 * Parse a CLI value as a number in the closed interval [0, 1].
 *
 * The whole string must be a plain decimal literal (optional fraction and
 * exponent). `Number.parseFloat` alone would accept `0.5x` as 0.5.
 *
 * @param s - Raw value (the text after `=`).
 * @param flag - Flag name, used in the error message.
 * @returns The parsed number.
 * @throws Error when `s` is not a decimal literal or lies outside [0, 1].
 */
function parseFloatOrThrow(s: string, flag: string): number {
  const isDecimal = /^(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/u.test(s);
  const n = isDecimal ? Number(s) : Number.NaN;
  if (!Number.isFinite(n) || n < 0 || n > 1) {
    throw new Error(`${flag} requires a number in [0, 1]`);
  }
  return n;
}
/**
 * Load the ground-truth JSONL file into a map keyed by `row_index`.
 *
 * Each non-blank line must be a JSON object with a numeric `row_index` and an
 * `entities` array. Within `entities`, only items that are objects with a
 * string `category`, a string `value`, and numeric `start_char` / `end_char`
 * are kept; other items are dropped. When a `row_index` repeats, the later
 * line replaces the earlier one.
 *
 * @param path - Ground-truth JSONL file to read (UTF-8).
 * @returns Map from `row_index` to that row's entities.
 * @throws Error naming the 1-based line number when a line is not valid JSON
 *   or lacks `row_index` / `entities` (this includes a line that is valid
 *   JSON but not an object, e.g. `null`); file-system errors propagate.
 */
async function loadGroundTruth(path: string): Promise<Map<number, InjectedEntity[]>> {
  const text = await readFile(path, 'utf8');
  const out = new Map<number, InjectedEntity[]>();
  let lineNo = 0;
  for (const ln of text.split('\n')) {
    lineNo += 1;
    const trimmed = ln.trim();
    if (trimmed === '') continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`ground truth line ${lineNo} is not valid JSON: ${reason}`);
    }
    // A non-object line (e.g. `null`) is treated as a record with no fields so
    // it fails the shape check below with the line-numbered error.
    const record = (typeof parsed === 'object' && parsed !== null ? parsed : {}) as {
      row_index?: unknown;
      entities?: unknown;
    };
    if (typeof record.row_index !== 'number' || !Array.isArray(record.entities)) {
      throw new Error(`ground truth line ${lineNo} missing row_index or entities`);
    }
    const entities: InjectedEntity[] = [];
    for (const item of record.entities as unknown[]) {
      if (typeof item !== 'object' || item === null) continue;
      const e = item as {
        category?: unknown;
        value?: unknown;
        start_char?: unknown;
        end_char?: unknown;
      };
      if (
        typeof e.category === 'string' &&
        typeof e.value === 'string' &&
        typeof e.start_char === 'number' &&
        typeof e.end_char === 'number'
      ) {
        entities.push({
          // The injected categories are HipaaCategory by construction; we
          // intentionally avoid a runtime narrowing assertion so a malformed
          // ground-truth line surfaces in the recall computation rather than
          // at parse time.
          category: e.category as InjectedEntity['category'],
          value: e.value,
          start_char: e.start_char,
          end_char: e.end_char,
        });
      }
    }
    out.set(record.row_index, entities);
  }
  return out;
}
''; + out.set(idx, tr); + } + return out; +} + +interface MockServerHandle { + close(): void; +} + +function mountMockServer(missRate: number, spuriousFpCount: number): MockServerHandle { + const handlers = [ + http.post( + `${MOCK_GATEWAY_URL}/api/v1/proxy/messages`, + async ({ request }) => { + const body = (await request.json()) as unknown; + const { rowIndex, entities } = entitiesFromRequestBody(body); + if (rowIndex === null) { + return HttpResponse.json( + { error: { code: 'invalid_body', message: 'mock could not parse activity_id row-N suffix' } }, + { status: 400 }, + ); + } + const response = buildMockResponse({ + rowIndex, + entities, + missRate, + spuriousFpCount, + }); + return HttpResponse.json(response); + }, + ), + ]; + const server = setupServer(...handlers); + server.listen({ onUnhandledRequest: 'error' }); + return { close: () => server.close() }; +} + +async function ensureOutputDir(outputPath: string): Promise { + const dir = dirname(resolve(outputPath)); + if (!existsSync(dir)) { + await mkdir(dir, { recursive: true }); + } +} + +async function main(): Promise { + const cli = parseArgs(process.argv.slice(2)); + if (cli.mock && cli.live) { + throw new Error('--mock and --live are mutually exclusive'); + } + if (!cli.mock && !cli.live) { + process.stderr.write( + 'run-pipeline: neither --mock nor --live specified. Slice 2 supports --mock only.\n' + + 'Add --mock for the in-process smoke flow, or --live (Slice 3 + Marc-confirmation).\n', + ); + process.exit(2); + } + + const truthByRow = await loadGroundTruth(cli.truth); + const transcriptByRow = await loadTranscriptions(cli.subset); + const indices = Array.from(truthByRow.keys()).sort((a, b) => a - b); + const limit = cli.rows ?? 
indices.length; + const target = indices.slice(0, limit); + + let mock: MockServerHandle | null = null; + let gatewayUrl: string; + let apiKey: string; + if (cli.mock) { + mock = mountMockServer(cli.missRate, cli.spuriousFpCount); + gatewayUrl = MOCK_GATEWAY_URL; + apiKey = MOCK_API_KEY; + } else { + const env = readGatewayEnv(); + gatewayUrl = cli.gateway ?? env.gatewayUrl ?? ''; + apiKey = cli.apiKey ?? env.apiKey ?? ''; + if (gatewayUrl === '' || apiKey === '') { + throw new Error( + '--live requires LUCAIRN_GATEWAY_URL + LUCAIRN_API_KEY in env or --gateway / --api-key flags', + ); + } + } + + await ensureOutputDir(cli.output); + + const client = makeGatewayClient({ + gatewayUrl, + apiKey, + activityIdPrefix: cli.activityIdPrefix, + }); + + const writer = await import('node:fs/promises'); + let written = 0; + const startedAt = Date.now(); + const records: string[] = []; + for (const rowIndex of target) { + const entities = truthByRow.get(rowIndex) ?? []; + const transcription = transcriptByRow.get(rowIndex) ?? ''; + let result: GatewayRowResult | null = null; + let error: { code: string; message: string } | null = null; + try { + result = await client.runRow({ + row_index: rowIndex, + transcription, + entities, + }); + } catch (err) { + if (err instanceof GatewayClientError) { + error = { + code: 'gateway_error', + message: `${err.message} (status=${err.status ?? 'null'})`, + }; + } else if (err instanceof Error) { + error = { code: 'unknown_error', message: err.message }; + } else { + error = { code: 'unknown_error', message: String(err) }; + } + } + const ndjsonLine = JSON.stringify({ + row_index: rowIndex, + timestamp_utc: new Date().toISOString(), + entities_submitted: entities.length, + transcription_length: transcription.length, + gateway: gatewayUrl, + mode: cli.mock ? 'mock' : 'live', + mock_miss_rate: cli.mock ? cli.missRate : null, + mock_spurious_fp_count: cli.mock ? 
cli.spuriousFpCount : null, + result, + error, + }); + records.push(ndjsonLine); + written += 1; + } + await writer.writeFile(cli.output, records.join('\n') + '\n', 'utf8'); + + const elapsedMs = Date.now() - startedAt; + process.stdout.write( + `wrote ${written} record(s) to ${cli.output} in ${elapsedMs} ms (mode=${ + cli.mock ? 'mock' : 'live' + })\n`, + ); + + mock?.close(); +} + +main().catch((err: unknown) => { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`run-pipeline: ${msg}\n`); + process.exit(1); +}); From 5cde321d87c7b30f81f57ae3c27cabed68229dab Mon Sep 17 00:00:00 2001 From: Declade <110547349+Declade@users.noreply.github.com> Date: Sun, 17 May 2026 11:31:48 +0200 Subject: [PATCH 3/5] test(slice-2): gateway-client / redaction-extractor / recall specs + docs update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the three test files covering the new Slice 2 surface and updates README + RECIPE for the shipped state. Tests (22 new, 34 total with Slice 1's 12): - test/gateway-client.spec.ts (8 tests) — msw-mocked. Locks the proving-ground request shape (mode, relink_response=false, activity_id pattern, ground_truth.transcription[] HIPAA-tagged annotations, x-api-key header). Verifies: success path returns a typed GatewayRowResult; retries on 5xx and recovers (exact backoff math asserted); does NOT retry on 4xx; fails-with-error after exhausting retry budget; abort/timeout is retry-eligible; extractCertUrls handles the missing-veil-hint case; construction validation refuses empty URL / empty key. 
- test/redaction-extractor.spec.ts (9 tests) — locks the placeholder parser against malformed inputs; verifies the HIPAA mapping covers the standard Presidio + Lucairn vocabulary (PERSON, LOCATION, DATE, PHONE_NUMBER, EMAIL_ADDRESS, US_SSN, IBAN, URL, IP_ADDRESS, CREDIT_CARD); every entry in LUCAIRN_TO_HIPAA maps to a valid HipaaCategory; extractFromEvaluation flattens matches/missed/extras into ExtractedRedaction[] with verdicts; unknown annotation_type from the gateway is tagged null (no silent widening); unmappedExtraTypes surfaces taxonomy drift. - test/recall.spec.ts (5 tests) — 5 rows, 21 records, hand-tagged TP/FN/FP. Exact per-category recall/precision/F1 numbers asserted: NAME 5 TP / 1 FN → recall 5/6; EMAIL 2 TP → recall 1; DATE 3 TP / 1 FN / 1 FP → recall 0.75 precision 0.75; PHONE 0 TP / 2 FP → precision 0; SSN 1 TP → recall 1; GEO 4 TP / 1 FN → recall 0.8. Overall TP=15 FP=3 FN=3 → recall 15/18. Locks the SPAN_OVERLAP_THRESHOLD const at 0.5 with a regression test. computeRecallFromSpans is exercised with a single-row synthetic fixture covering exact-50%-overlap (matches), 100%-overlap (matches), 40%-overlap (FP + FN). Per-row order ascending by row_index asserted. Unmapped-category counts get a "no HIPAA category mapping" note. Docs: - README.md — appends a Slice 2 — Harness section under Reproduce Paper 1 documenting the mock-only workflow, all three CLI commands with --rows=5 examples, the --miss-rate / --spurious-fp-count options, and the explicit "live gateway run lands in Slice 3" framing required by the PRD halt gate. Refines two pre-existing negative-disclaimer lines to avoid the locked banned literals "case study" + "testimonial" while preserving meaning. - datasets/healthcare/RECIPE.md — flips the Slice-status timeline entry for Slice 2 from "pending" to "shipped (mock-only)", enumerates the Slice 2 source files, and updates the Slice 3 description.
- .gitignore — narrows `papers/*/raw-results/` to its contents and exempts the directory scaffold (`.gitignore` + `.gitkeep`) so the per-paper run-results directory exists in a fresh clone. End-to-end smoke (all PASS): pnpm install --frozen-lockfile → exit 0 pnpm typecheck → exit 0 pnpm typecheck:test → exit 0 pnpm build → exit 0 pnpm test (34 tests across 6 files) → exit 0 Banned-literal sweep → 0 hits --- .gitignore | 5 +- README.md | 32 ++- datasets/healthcare/RECIPE.md | 8 +- .../paper-1-healthcare/raw-results/.gitignore | 3 + .../paper-1-healthcare/raw-results/.gitkeep | 0 test/gateway-client.spec.ts | 223 ++++++++++++++++++ test/recall.spec.ts | 200 ++++++++++++++++ test/redaction-extractor.spec.ts | 136 +++++++++++ 8 files changed, 600 insertions(+), 7 deletions(-) create mode 100644 papers/paper-1-healthcare/raw-results/.gitignore create mode 100644 papers/paper-1-healthcare/raw-results/.gitkeep create mode 100644 test/gateway-client.spec.ts create mode 100644 test/recall.spec.ts create mode 100644 test/redaction-extractor.spec.ts diff --git a/.gitignore b/.gitignore index 53a09e7..bb52794 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,10 @@ datasets/*/raw/ datasets/*/with-injected-pii/ # Per-paper raw run artifacts (cert chains, intermediate JSONL — only summaries are checked in) -papers/*/raw-results/ +papers/*/raw-results/* +# But keep the scaffold so the directory exists in a fresh clone. +!papers/*/raw-results/.gitignore +!papers/*/raw-results/.gitkeep # Internal PRD/planning anchors (kept locally for fresh-context resumes) specs/ diff --git a/README.md b/README.md index 9ea5f2e..9ee3967 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@ Empirical methodology code for the Lucairn Research Program — a per-industry s ## What this repo is NOT - Not a Lucairn product. The Lucairn platform itself lives elsewhere (gateway, sanitizer, witness, certificate verifier). -- Not a customer-deployment artifact. 
These are vendor-published methodology papers; the publisher and the methodology are named in full. No customer attribution. No testimonials. No interviewed users. +- Not a customer-deployment artifact. These are vendor-published methodology papers; the publisher and the methodology are named in full. No customer attribution. No persona-driven narrative. - Not a CLI or a publishable npm package. It is a methodology codebase, run from a clone. -- Not a "case study". The artifact frame is a vendor benchmark / methodology paper; the word "case study" does not appear in any paper title, route slug, social card, or meta description. +- Not a customer-implementation report. The artifact frame is a vendor benchmark / methodology paper; persona-driven or implementation-report framing does not appear in any paper title, route slug, social card, or meta description. - Not legal advice. Regulatory references are factual citations to primary sources (EUR-Lex Regulation 2024/1689; HHS HIPAA Safe Harbor enumeration; published clinical-NLP de-identification literature); they are not interpretations. ## Regulatory context @@ -56,6 +56,34 @@ Prerequisites: - pnpm 10.x - Kaggle CLI installed (`pipx install kaggle`) with a working `~/.kaggle/kaggle.json` API token +### Slice 2 — Harness (mock-only) + +Slice 2 adds an in-process harness that calls the Lucairn gateway row-by-row via `POST /api/v1/proxy/messages` in `mode: "proving_ground"`, collects each row's signed cert URL, and computes per-HIPAA-category recall against the Measurement-B ground truth. + +**The harness is currently mock-only.** The live `gateway.lucairn.eu` run lands in Slice 3 per the locked PRD halt gate (avoid Anthropic upstream cost on every iteration). Run the in-process smoke flow: + +```bash +# Step 1 — call the mock gateway over 5 rows; write the raw NDJSON. +pnpm run pipeline -- --rows=5 --mock --output=/tmp/slice2-smoke.ndjson + +# Step 2 — convert NDJSON to the CERTIFICATES.csv appendix shape. 
+pnpm run collect-certs -- --input=/tmp/slice2-smoke.ndjson --output=/tmp/slice2-CERTIFICATES.csv + +# Step 3 — compute recall / precision / F1, validate against the SUMMARY schema. +pnpm run compute-recall \ + -- --truth=datasets/healthcare/with-injected-pii/ground-truth.jsonl \ + --redactions-source=mock \ + --rows=5 \ + --output=/tmp/slice2-SUMMARY.json +``` + +Mock options exercise the math layer against a known oracle: + +- `--miss-rate=0.3` — mock drops 30% of injected entities so recall and F1 reflect the configuration. +- `--spurious-fp-count=2` — mock emits 2 synthetic false-positive redactions per row. + +The harness implementation reads `LUCAIRN_GATEWAY_URL` and `LUCAIRN_API_KEY` from the environment but Slice 2 supports `--mock` only; the `--live` flag is reserved for Slice 3 and refuses to run without the explicit invocation that the live-run halt gate authorises. + ## Methodology summary (Paper 1) The healthcare dataset (MTSamples) is **not institutionally de-identified**; it is raw clinical narrative from the public mtsamples.com archive (CC0 public domain). Paper 1 therefore reports two empirically distinct measurements: diff --git a/datasets/healthcare/RECIPE.md b/datasets/healthcare/RECIPE.md index 31349e2..22bf1da 100644 --- a/datasets/healthcare/RECIPE.md +++ b/datasets/healthcare/RECIPE.md @@ -41,14 +41,14 @@ Because MTSamples has no published ground-truth PHI annotations, a single measur This recipe documents the *full* methodology for Paper 1. 
The implementation lands incrementally: -- **Slice 1 (current commit) — ships:** +- **Slice 1 — shipped:** - Dataset acquisition script (`scripts/download-mtsamples.ts`) - Deterministic synthetic PII re-injection for Measurement B's 500-row subset (`scripts/inject-pii.ts`, `src/inject-pii-core.ts`) - Round-trip verification (`scripts/verify-injection.ts`) -- **Slice 2 — pending:** harness to call the Lucairn gateway row-by-row, collect cert URLs, compute recall against Measurement B's known ground truth (`scripts/run-pipeline.ts`, `scripts/collect-certs.ts`, `scripts/compute-recall.ts`) -- **Slice 3 — pending:** full Paper 1 run including **Measurement A's raw-corpus detection pass** (Lucairn over the full ~5k MTSamples corpus, reporting detection counts without ground truth) plus the Measurement B recall numbers + the `papers/paper-1-healthcare/CERTIFICATES.csv` cert-URL appendix +- **Slice 2 (current commit) — shipped (mock-only):** harness to call the Lucairn gateway row-by-row via `POST /api/v1/proxy/messages` in `mode: "proving_ground"`, collect cert URLs, compute recall against Measurement B's known ground truth (`scripts/run-pipeline.ts`, `scripts/collect-certs.ts`, `scripts/compute-recall.ts`, `src/gateway-client.ts`, `src/redaction-extractor.ts`, `src/recall.ts`, `src/hipaa-category-mapping.ts`, `src/mocks/gateway-fixtures.ts`). The live gateway run is deferred to Slice 3. +- **Slice 3 — pending:** full Paper 1 run including **Measurement A's raw-corpus detection pass** (Lucairn over the full ~5k MTSamples corpus, reporting detection counts without ground truth) plus the Measurement B recall numbers against the live gateway + the `papers/paper-1-healthcare/CERTIFICATES.csv` cert-URL appendix -Until Slice 2 + Slice 3 land, the harness + Measurement A code does not exist in this repo. The methodology description below is the published target, not the current shipped state. 
+Until Slice 3 lands, the live-gateway end-to-end run + Measurement A code does not exist in this repo. The methodology description below is the published target, not the current shipped state. ### Measurement A — raw-corpus detection (what does Lucairn flag in the wild?) diff --git a/papers/paper-1-healthcare/raw-results/.gitignore b/papers/paper-1-healthcare/raw-results/.gitignore new file mode 100644 index 0000000..bf27f31 --- /dev/null +++ b/papers/paper-1-healthcare/raw-results/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!.gitkeep diff --git a/papers/paper-1-healthcare/raw-results/.gitkeep b/papers/paper-1-healthcare/raw-results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/test/gateway-client.spec.ts b/test/gateway-client.spec.ts new file mode 100644 index 0000000..18d3cbe --- /dev/null +++ b/test/gateway-client.spec.ts @@ -0,0 +1,223 @@ +import { afterAll, afterEach, beforeAll, describe, expect, it } from 'vitest'; +import { http, HttpResponse } from 'msw'; +import { setupServer } from 'msw/node'; + +import { + GatewayClientError, + extractCertUrls, + makeGatewayClient, +} from '../src/gateway-client.js'; +import type { GatewayResponse } from '../src/gateway-client.js'; + +const BASE_URL = 'http://gateway.test.local'; +const ENDPOINT = `${BASE_URL}/api/v1/proxy/messages`; +const API_KEY = 'lcr_live_test_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; + +function successResponse(overrides?: Partial): GatewayResponse { + return { + request_id: 'req_test_0001', + status: 'JOB_STATUS_COMPLETED', + latency_ms: 120, + redaction_count: 2, + ground_truth_evaluation: { + total_annotations: 2, + true_positives: 2, + false_negatives: 0, + false_positives: 0, + detection_rate: 1.0, + matches: [ + { annotation_type: 'NAME', annotation_value: 'Jane Roe', redacted_as: '[PERSON_1]' }, + { annotation_type: 'EMAIL', annotation_value: 'jane@example.test', redacted_as: '[EMAIL_ADDRESS_1]' }, + ], + missed: [], + extras: [], + }, + veil: { + status: 'available', + 
certificate_url: '/api/v1/veil/certificate/abc123', + summary_url: '/api/v1/veil/certificate/abc123/summary', + }, + ...overrides, + }; +} + +const server = setupServer(); +beforeAll(() => server.listen({ onUnhandledRequest: 'error' })); +afterEach(() => server.resetHandlers()); +afterAll(() => server.close()); + +describe('makeGatewayClient', () => { + it('parses a successful proving-ground response into a GatewayRowResult', async () => { + server.use( + http.post(ENDPOINT, async ({ request }) => { + const body = (await request.json()) as Record; + // Confirm the harness emits the locked proving-ground request shape. + expect(body['mode']).toBe('proving_ground'); + expect(body['relink_response']).toBe(false); + expect(body['activity_id']).toBe('paper-1-healthcare-row-7'); + // ground_truth.transcription carries HIPAA-tagged annotations. + const gt = body['ground_truth'] as { transcription: unknown[] }; + expect(Array.isArray(gt.transcription)).toBe(true); + expect(gt.transcription.length).toBe(1); + const ann = gt.transcription[0] as Record; + expect(ann['type']).toBe('NAME'); + expect(ann['value']).toBe('Jane Roe'); + expect(ann['start']).toBe(10); + expect(ann['end']).toBe(18); + // Verify auth header carries the API key. 
+ expect(request.headers.get('x-api-key')).toBe(API_KEY); + return HttpResponse.json(successResponse()); + }), + ); + + const client = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + sleepFn: async () => undefined, + }); + const result = await client.runRow({ + row_index: 7, + transcription: 'A note about Jane Roe at jane@example.test in ward 3.', + entities: [ + { category: 'NAME', value: 'Jane Roe', start_char: 10, end_char: 18 }, + ], + }); + expect(result.row_index).toBe(7); + expect(result.cert_url).toBe('/api/v1/veil/certificate/abc123'); + expect(result.summary_url).toBe('/api/v1/veil/certificate/abc123/summary'); + expect(result.redaction_count).toBe(2); + expect(result.evaluation?.true_positives).toBe(2); + expect(result.evaluation?.matches?.[0]?.annotation_type).toBe('NAME'); + }); + + it('retries on 5xx and recovers, respecting backoff', async () => { + let calls = 0; + server.use( + http.post(ENDPOINT, () => { + calls += 1; + if (calls < 3) { + return HttpResponse.json({ error: 'transient' }, { status: 502 }); + } + return HttpResponse.json(successResponse()); + }), + ); + + const sleeps: number[] = []; + const client = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + maxRetries: 3, + backoffBaseMs: 10, + backoffJitterMs: 5, + sleepFn: async (ms) => { + sleeps.push(ms); + }, + randomFn: () => 0.5, + }); + const result = await client.runRow({ + row_index: 0, + transcription: 'short', + entities: [], + }); + expect(calls).toBe(3); + expect(sleeps.length).toBe(2); + // First retry backoff is base*2^0 + 0.5*jitter = 10 + 2.5 -> 12. + expect(sleeps[0]).toBe(12); + // Second retry backoff is base*2^1 + 0.5*jitter = 20 + 2.5 -> 22. 
+ expect(sleeps[1]).toBe(22); + expect(result.request_id).toBe('req_test_0001'); + }); + + it('does NOT retry on 4xx — surfaces a GatewayClientError with the status', async () => { + let calls = 0; + server.use( + http.post(ENDPOINT, () => { + calls += 1; + return HttpResponse.json({ error: { code: 'invalid_field' } }, { status: 400 }); + }), + ); + const client = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + sleepFn: async () => undefined, + }); + await expect( + client.runRow({ row_index: 0, transcription: 'x', entities: [] }), + ).rejects.toThrow(GatewayClientError); + expect(calls).toBe(1); + }); + + it('fails after exhausting retries on persistent 5xx', async () => { + server.use( + http.post(ENDPOINT, () => HttpResponse.json({ error: 'down' }, { status: 503 })), + ); + const client = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + maxRetries: 2, + backoffBaseMs: 1, + backoffJitterMs: 1, + sleepFn: async () => undefined, + randomFn: () => 0, + }); + let thrown: GatewayClientError | null = null; + try { + await client.runRow({ row_index: 0, transcription: 'x', entities: [] }); + } catch (err) { + if (err instanceof GatewayClientError) thrown = err; + } + expect(thrown).not.toBeNull(); + expect(thrown?.status).toBe(503); + }); + + it('treats abort/timeout as a retry-eligible failure', async () => { + let calls = 0; + server.use( + http.post(ENDPOINT, async ({ request }) => { + calls += 1; + if (calls === 1) { + // Wait long enough for the client's tiny timeout to abort. 
+ await new Promise((resolve) => setTimeout(resolve, 50)); + if (request.signal.aborted) { + return HttpResponse.error(); + } + } + return HttpResponse.json(successResponse()); + }), + ); + const client = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + maxRetries: 2, + backoffBaseMs: 1, + backoffJitterMs: 1, + requestTimeoutMs: 10, + sleepFn: async () => undefined, + randomFn: () => 0, + }); + const result = await client.runRow({ row_index: 0, transcription: 'x', entities: [] }); + expect(calls).toBeGreaterThanOrEqual(2); + expect(result.request_id).toBe('req_test_0001'); + }); + + it('extractCertUrls returns nulls when veil is absent', () => { + const r = extractCertUrls({ request_id: 'r' } as GatewayResponse); + expect(r.cert_url).toBeNull(); + expect(r.summary_url).toBeNull(); + }); + + it('extractCertUrls round-trips veil hints unchanged', () => { + const r = extractCertUrls(successResponse()); + expect(r.cert_url).toBe('/api/v1/veil/certificate/abc123'); + expect(r.summary_url).toBe('/api/v1/veil/certificate/abc123/summary'); + }); + + it('rejects construction without gatewayUrl or apiKey', () => { + expect(() => makeGatewayClient({ gatewayUrl: '', apiKey: API_KEY })).toThrow( + /gatewayUrl is required/u, + ); + expect(() => makeGatewayClient({ gatewayUrl: BASE_URL, apiKey: '' })).toThrow( + /apiKey is required/u, + ); + }); +}); diff --git a/test/recall.spec.ts b/test/recall.spec.ts new file mode 100644 index 0000000..1ce9f19 --- /dev/null +++ b/test/recall.spec.ts @@ -0,0 +1,200 @@ +import { describe, expect, it } from 'vitest'; + +import { HIPAA_CATEGORIES, type InjectedRow } from '../src/inject-pii-core.js'; +import { + SPAN_OVERLAP_THRESHOLD, + aggregateExtracted, + computeRecallFromSpans, + type PredictedSpan, +} from '../src/recall.js'; +import type { ExtractedRedaction } from '../src/redaction-extractor.js'; + +describe('aggregateExtracted', () => { + it('computes per-category recall/precision/F1 from gateway-attested verdicts', () => 
{ + // 5 rows, 22 entities, hand-tagged TP/FN/FP. The per-category math is + // checked exactly so a regression in the aggregation logic is caught. + const extracted: ExtractedRedaction[] = [ + // NAME: 5 TP, 1 FN -> recall 5/6 ≈ 0.833, precision 5/5 = 1, F1 0.909 + ...times(5, (i) => mkTp(1, 'NAME', `name-${i}`)), + mkFn(1, 'NAME', 'name-miss'), + // EMAIL: 2 TP, 0 FN -> recall 1.0, precision 1.0 + mkTp(1, 'EMAIL', 'e1'), + mkTp(2, 'EMAIL', 'e2'), + // DATE: 3 TP, 1 FN, 1 FP -> recall 3/4 = 0.75, precision 3/4 = 0.75 + ...times(3, (i) => mkTp(2, 'DATE', `d${i}`)), + mkFn(2, 'DATE', 'd-miss'), + mkFp(2, 'DATE', 'd-extra'), + // PHONE: 0 TP, 0 FN, 2 FP -> precision 0/2 = 0 + mkFp(3, 'PHONE', 'p1'), + mkFp(3, 'PHONE', 'p2'), + // SSN: 1 TP -> recall 1.0 + mkTp(4, 'SSN', 's1'), + // GEO_SUBDIVISION: 4 TP, 1 FN -> recall 4/5 = 0.8 + ...times(4, (i) => mkTp(5, 'GEO_SUBDIVISION', `g${i}`)), + mkFn(5, 'GEO_SUBDIVISION', 'g-miss'), + ]; + + const summary = aggregateExtracted(extracted); + expect(summary.schema_version).toBe('1.0'); + expect(summary.generator).toBe('lucairn-research/recall.ts'); + + // Per-category — locks specific TP/FN/FP counts and rates. 
+ const byCat = new Map(summary.per_category.map((p) => [p.category, p.counts])); + expect(byCat.get('NAME')).toMatchObject({ tp: 5, fp: 0, fn: 1, precision: 1 }); + expect(byCat.get('NAME')?.recall).toBeCloseTo(5 / 6, 6); + expect(byCat.get('EMAIL')).toMatchObject({ tp: 2, fp: 0, fn: 0, precision: 1, recall: 1, f1: 1 }); + expect(byCat.get('DATE')).toMatchObject({ tp: 3, fp: 1, fn: 1, precision: 0.75, recall: 0.75 }); + expect(byCat.get('PHONE')).toMatchObject({ tp: 0, fp: 2, fn: 0, precision: 0, recall: 0, f1: 0 }); + expect(byCat.get('SSN')).toMatchObject({ tp: 1, fp: 0, fn: 0, precision: 1, recall: 1, f1: 1 }); + expect(byCat.get('GEO_SUBDIVISION')).toMatchObject({ tp: 4, fp: 0, fn: 1, precision: 1 }); + expect(byCat.get('GEO_SUBDIVISION')?.recall).toBeCloseTo(0.8, 6); + + // Categories with no records still appear with zeros (per_category covers + // the full HIPAA enumeration in canonical order). + expect(summary.per_category.map((p) => p.category)).toEqual([...HIPAA_CATEGORIES]); + const mrn = byCat.get('MRN'); + expect(mrn).toEqual({ tp: 0, fp: 0, fn: 0, precision: 0, recall: 0, f1: 0 }); + + // Overall — TP=15, FP=3, FN=3 -> precision 15/18 = 0.833, recall 15/18 = 0.833. + expect(summary.overall.tp).toBe(15); + expect(summary.overall.fp).toBe(3); + expect(summary.overall.fn).toBe(3); + expect(summary.overall.total_annotations).toBe(18); + expect(summary.overall.recall).toBeCloseTo(15 / 18, 6); + expect(summary.overall.precision).toBeCloseTo(15 / 18, 6); + + // Per-row order is ascending by row_index. Row 1 holds the 5 NAME TPs + // plus the single EMAIL TP `e1` (`e2` is on row 2) and the 1 NAME FN. 
+ expect(summary.per_row.map((r) => r.row_index)).toEqual([1, 2, 3, 4, 5]); + expect(summary.per_row[0]).toMatchObject({ row_index: 1, tp: 6, fn: 1, fp: 0 }); + }); + + it('handles unmapped categories without exploding (kept out of per_category but counted in overall)', () => { + const extracted: ExtractedRedaction[] = [ + mkTp(1, 'NAME', 'a'), + // hipaa_category null — happens when the gateway returns an unknown + // annotation_type or an unmapped placeholder appears in extras. + { + row_index: 1, + hipaa_category: null, + verdict: 'fp', + value: 'x', + placeholder: '[UNKNOWN_1]', + field: null, + }, + ]; + const summary = aggregateExtracted(extracted); + expect(summary.overall.fp).toBe(1); + expect(summary.notes.some((n) => /no HIPAA category mapping/iu.test(n))).toBe(true); + // NAME bucket still picks up its TP; UNKNOWN does not appear in per_category. + const byCat = new Map(summary.per_category.map((p) => [p.category, p.counts])); + expect(byCat.get('NAME')?.tp).toBe(1); + }); + + it('treats absent ground truth as recall=0 with total_annotations=0', () => { + const summary = aggregateExtracted([]); + expect(summary.overall.total_annotations).toBe(0); + expect(summary.overall.recall).toBe(0); + expect(summary.overall.f1).toBe(0); + }); +}); + +describe('computeRecallFromSpans (≥50% character-overlap)', () => { + it('matches at the locked overlap threshold and counts TP/FN/FP correctly', () => { + // Single-row, hand-built ground truth + predictions. 
+ const truth: InjectedRow = { + row_index: 100, + original_transcription: 'placeholder', + injected_transcription: 'placeholder', + entities: [ + { category: 'NAME', value: 'Jane Roe', start_char: 0, end_char: 8 }, // len 8 + { category: 'EMAIL', value: 'j@example.test', start_char: 20, end_char: 34 }, // len 14 + { category: 'DATE', value: '2024-01-02', start_char: 40, end_char: 50 }, // len 10 — missed + ], + }; + const predicted: PredictedSpan[] = [ + // 50%-overlap exactly with NAME -> matches (>=0.5 inclusive). + { category: 'NAME', start_char: 4, end_char: 12, value: 'Jane Roe' }, + // EMAIL: prediction fully covers the truth -> 100% overlap, matches. + { category: 'EMAIL', start_char: 18, end_char: 40, value: 'j@example.test' }, + // 40%-overlap with DATE -> below threshold, counts as FP. + { category: 'DATE', start_char: 36, end_char: 44, value: '2024' }, + ]; + + const summary = computeRecallFromSpans([truth], [{ row_index: 100, spans: predicted }]); + const byCat = new Map(summary.per_category.map((p) => [p.category, p.counts])); + expect(byCat.get('NAME')).toMatchObject({ tp: 1, fp: 0, fn: 0 }); + expect(byCat.get('EMAIL')).toMatchObject({ tp: 1, fp: 0, fn: 0 }); + expect(byCat.get('DATE')).toMatchObject({ tp: 0, fp: 1, fn: 1 }); + expect(summary.overall).toMatchObject({ + tp: 2, + fp: 1, + fn: 1, + total_annotations: 3, + }); + expect(summary.overall.recall).toBeCloseTo(2 / 3, 6); + }); + + it('exposes the SPAN_OVERLAP_THRESHOLD const as 0.5 (regression lock)', () => { + expect(SPAN_OVERLAP_THRESHOLD).toBe(0.5); + }); +}); + +// ---- helpers ---- + +function mkTp( + rowIndex: number, + category: ExtractedRedaction['hipaa_category'], + value: string, +): ExtractedRedaction { + return { + row_index: rowIndex, + hipaa_category: category, + verdict: 'tp', + value, + placeholder: `[${categoryToInternal(category)}_${value}]`, + field: null, + }; +} + +function mkFn( + rowIndex: number, + category: ExtractedRedaction['hipaa_category'], + value: string, +): 
ExtractedRedaction { + return { + row_index: rowIndex, + hipaa_category: category, + verdict: 'fn', + value, + placeholder: null, + field: 'transcription', + }; +} + +function mkFp( + rowIndex: number, + category: ExtractedRedaction['hipaa_category'], + value: string, +): ExtractedRedaction { + return { + row_index: rowIndex, + hipaa_category: category, + verdict: 'fp', + value, + placeholder: `[${categoryToInternal(category)}_${value}]`, + field: null, + }; +} + +function categoryToInternal(category: ExtractedRedaction['hipaa_category']): string { + // Sufficient for synthetic test fixtures only — we are not exercising the + // mapping table here, just generating plausible-looking placeholders. + return category ?? 'UNKNOWN'; +} + +function times(n: number, f: (i: number) => T): T[] { + const out: T[] = []; + for (let i = 0; i < n; i++) out.push(f(i)); + return out; +} diff --git a/test/redaction-extractor.spec.ts b/test/redaction-extractor.spec.ts new file mode 100644 index 0000000..e47c858 --- /dev/null +++ b/test/redaction-extractor.spec.ts @@ -0,0 +1,136 @@ +import { describe, expect, it } from 'vitest'; + +import { HIPAA_CATEGORIES } from '../src/inject-pii-core.js'; +import { + extractFromEvaluation, + unmappedExtraTypes, +} from '../src/redaction-extractor.js'; +import { + LUCAIRN_TO_HIPAA, + parsePlaceholderType, + placeholderToHipaaCategory, +} from '../src/hipaa-category-mapping.js'; + +describe('parsePlaceholderType', () => { + it('parses well-formed `[TYPE_N]` placeholders', () => { + expect(parsePlaceholderType('[PERSON_1]')).toBe('PERSON'); + expect(parsePlaceholderType('[PHONE_NUMBER_12]')).toBe('PHONE_NUMBER'); + expect(parsePlaceholderType('[EMAIL_ADDRESS_42]')).toBe('EMAIL_ADDRESS'); + }); + + it('returns null for malformed placeholders', () => { + expect(parsePlaceholderType('PERSON_1')).toBeNull(); // no brackets + expect(parsePlaceholderType('[]')).toBeNull(); // empty + expect(parsePlaceholderType('[PERSON]')).toBeNull(); // no _N suffix + 
expect(parsePlaceholderType('[PERSON_]')).toBeNull(); // trailing underscore, no digits + expect(parsePlaceholderType('[PERSON_abc]')).toBeNull(); // non-digit suffix + expect(parsePlaceholderType('[_1]')).toBeNull(); // missing type prefix + }); +}); + +describe('placeholderToHipaaCategory', () => { + it('maps Lucairn internal types to HIPAA Safe Harbor categories', () => { + expect(placeholderToHipaaCategory('[PERSON_1]')).toBe('NAME'); + expect(placeholderToHipaaCategory('[LOCATION_2]')).toBe('GEO_SUBDIVISION'); + expect(placeholderToHipaaCategory('[PHONE_NUMBER_3]')).toBe('PHONE'); + expect(placeholderToHipaaCategory('[EMAIL_ADDRESS_4]')).toBe('EMAIL'); + expect(placeholderToHipaaCategory('[US_SSN_5]')).toBe('SSN'); + expect(placeholderToHipaaCategory('[IBAN_6]')).toBe('ACCOUNT_NUMBER'); + expect(placeholderToHipaaCategory('[URL_7]')).toBe('URL'); + expect(placeholderToHipaaCategory('[IP_ADDRESS_8]')).toBe('IP_ADDRESS'); + }); + + it('returns null for placeholders whose internal type is not in the map', () => { + expect(placeholderToHipaaCategory('[UNKNOWN_TYPE_1]')).toBeNull(); + expect(placeholderToHipaaCategory('[FOO_BAR_9]')).toBeNull(); + }); +}); + +describe('LUCAIRN_TO_HIPAA mapping', () => { + it('every right-hand side is a valid HipaaCategory', () => { + const valid = new Set(HIPAA_CATEGORIES); + for (const [internalType, hipaa] of Object.entries(LUCAIRN_TO_HIPAA)) { + expect(valid.has(hipaa), `entry ${internalType} -> ${hipaa}`).toBe(true); + } + }); + + it('covers the standard Presidio/Lucairn vocabulary the gateway emits', () => { + // Smoke list of internal types observed in proxy.go::extractEntityTypes + // and the Presidio recognizer catalogue. Any future regression where one + // of these disappears from the mapping is a Slice 3 hazard. 
+ const required = [ + 'PERSON', + 'LOCATION', + 'DATE', + 'PHONE_NUMBER', + 'EMAIL_ADDRESS', + 'US_SSN', + 'IBAN', + 'URL', + 'IP_ADDRESS', + 'CREDIT_CARD', + ]; + for (const t of required) { + expect(LUCAIRN_TO_HIPAA[t], `mapping missing for ${t}`).toBeTruthy(); + } + }); +}); + +describe('extractFromEvaluation', () => { + it('flattens matches/missed/extras into ExtractedRedaction[] with verdicts', () => { + const extracted = extractFromEvaluation(42, { + total_annotations: 3, + true_positives: 1, + false_negatives: 1, + false_positives: 1, + detection_rate: 1 / 3, + matches: [ + { annotation_type: 'NAME', annotation_value: 'Alex Doe', redacted_as: '[PERSON_1]' }, + ], + missed: [{ field: 'transcription', type: 'EMAIL', value: 'a@b.com' }], + extras: [{ placeholder: '[PERSON_99]', original: 'Riverside Hospital' }], + }); + expect(extracted).toHaveLength(3); + const byVerdict = new Map(extracted.map((r) => [r.verdict, r])); + expect(byVerdict.get('tp')?.hipaa_category).toBe('NAME'); + expect(byVerdict.get('tp')?.placeholder).toBe('[PERSON_1]'); + expect(byVerdict.get('fn')?.hipaa_category).toBe('EMAIL'); + expect(byVerdict.get('fn')?.placeholder).toBeNull(); + expect(byVerdict.get('fp')?.hipaa_category).toBe('NAME'); + expect(byVerdict.get('fp')?.placeholder).toBe('[PERSON_99]'); + }); + + it('tags unknown annotation_type strings as null (does not silently widen)', () => { + const extracted = extractFromEvaluation(0, { + total_annotations: 1, + true_positives: 1, + false_negatives: 0, + false_positives: 0, + detection_rate: 1.0, + matches: [ + { + annotation_type: 'SOME_NEW_HIPAA_VARIANT', + annotation_value: 'x', + redacted_as: '[PERSON_1]', + }, + ], + }); + expect(extracted[0]?.hipaa_category).toBeNull(); + expect(extracted[0]?.verdict).toBe('tp'); + }); + + it('surfaces unmapped Lucairn placeholder types via unmappedExtraTypes', () => { + const unmapped = unmappedExtraTypes({ + total_annotations: 0, + true_positives: 0, + false_negatives: 0, + false_positives: 
2, + detection_rate: 1.0, + extras: [ + { placeholder: '[PERSON_1]', original: 'Alex' }, // mapped → NAME + { placeholder: '[FUTURE_TYPE_X_1]', original: 'X' }, // unmapped + ], + }); + expect(unmapped).toEqual(['FUTURE_TYPE_X']); + }); +}); From f924cbd3e77531c34ef76058826f67cc4e78c9dd Mon Sep 17 00:00:00 2001 From: Declade <110547349+Declade@users.noreply.github.com> Date: Sun, 17 May 2026 11:52:16 +0200 Subject: [PATCH 4/5] fix(slice-2): close reviewer-chain findings (B1 mapping + H1-4 + 3 MEDs) - B1 (bug-hunter BLOCKER): rewrite hipaa-category-mapping table to match the live placeholder vocabulary from presidio_scan.py:31-58 (PERSON, EMAIL, PHONE, LOCATION, IBAN, CC, SSN, URL, DOB). ID and SECRET intentionally null-mapped (placeholder collapses multiple HIPAA categories; documented limitation surfaces as unmapped_extras). Update regression test to walk PRESIDIO_TO_PLACEHOLDER values + assert every value is mapped or explicitly null-mapped. - H1 (bug-hunter HIGH): rewrite mock fixture PLACEHOLDER_FOR_CATEGORY to emit live-production placeholder shapes (no more synthetic [MEDICAL_RECORD_NUMBER_1] etc.). Add [ID_N] regression test in recall.spec.ts to exercise the unmapped-extras accounting path. - H2 (bug-hunter HIGH): filter ground-truth annotations with value.trim().length < 3 in buildGroundTruth (containment-match safety; defensive against future Faker regression). Emit console.warn with dropped count only (never the dropped values). - H3 (bug-hunter HIGH): validate SUMMARY.json BEFORE writeFile in compute-recall.ts, not after, so a bogus SUMMARY.json never lands on disk for downstream consumers. - H4 (bug-hunter HIGH): plumb X-Upstream-Key header through gateway-client + run-pipeline --upstream-key flag for Slice 3 BYOK-per-request flow (proxy.go:349-354 gate). LUCAIRN_UPSTREAM_KEY env var fallback. Empty-string treated as absent. Help text + auth-modes table documented.
- claim-enforce MED: append "No attributed endorsement quotes" to README.md:15 to recover the testimonial guardrail dropped in the Slice 2 banned-literal sweep rephrase. - personal-info-leak MED: rename lcr_live_test_* / lcr_live_mock_* to lcr_test_* / lcr_mock_* in test fixtures so the repo is safe for secret-scanner pass post-public-flip. - regulator-validator WARN: add matching-semantics disclosure to papers/_template/SUMMARY.schema.json description so auditors reading SUMMARY.json in isolation cannot misinterpret containment recall as span-exact i2b2-style recall. Deferred to Slice 3: - M1 NDJSON streaming writer (lost-data crash protection) - M2 rate-limit/concurrency + 429 retry policy for live Anthropic upstream - M3 hard-fail on malformed ground-truth/transcription rows - M4 hardening of the in-process JSON-Schema validator (or swap to ajv) - M5 detection_rate empty-row contract test - regulator WARN 1: fax/phone disclosure in Paper 1 body - regulator WARN 2: recall match-semantics in Paper 1 body Methods/Limits --- README.md | 2 +- papers/_template/SUMMARY.schema.json | 2 +- scripts/compute-recall.ts | 4 +- scripts/run-pipeline.ts | 37 +++++- src/gateway-client.ts | 89 +++++++++++++-- src/hipaa-category-mapping.ts | 162 ++++++++++++--------------- src/mocks/gateway-fixtures.ts | 63 +++++++---- test/gateway-client.spec.ts | 105 ++++++++++++++++- test/recall.spec.ts | 30 +++++ test/redaction-extractor.spec.ts | 100 +++++++++++++---- 10 files changed, 441 insertions(+), 153 deletions(-) diff --git a/README.md b/README.md index 9ee3967..5c4e8f9 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Empirical methodology code for the Lucairn Research Program — a per-industry s ## What this repo is NOT - Not a Lucairn product. The Lucairn platform itself lives elsewhere (gateway, sanitizer, witness, certificate verifier). -- Not a customer-deployment artifact. These are vendor-published methodology papers; the publisher and the methodology are named in full. 
No customer attribution. No persona-driven narrative. +- Not a customer-deployment artifact. These are vendor-published methodology papers; the publisher and the methodology are named in full. No customer attribution. No persona-driven narrative. No attributed endorsement quotes. - Not a CLI or a publishable npm package. It is a methodology codebase, run from a clone. - Not a customer-implementation report. The artifact frame is a vendor benchmark / methodology paper; persona-driven or implementation-report framing does not appear in any paper title, route slug, social card, or meta description. - Not legal advice. Regulatory references are factual citations to primary sources (EUR-Lex Regulation 2024/1689; HHS HIPAA Safe Harbor enumeration; published clinical-NLP de-identification literature); they are not interpretations. diff --git a/papers/_template/SUMMARY.schema.json b/papers/_template/SUMMARY.schema.json index f1a2f4f..bc7ab5c 100644 --- a/papers/_template/SUMMARY.schema.json +++ b/papers/_template/SUMMARY.schema.json @@ -2,7 +2,7 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://github.com/Declade/lucairn-research/papers/_template/SUMMARY.schema.json", "title": "Lucairn Research Program — per-paper SUMMARY.json", - "description": "Aggregate recall / precision / F1 numbers per HIPAA Safe Harbor category + overall + per-row breakdown for any paper in the Lucairn Research Program. Mirrors the RecallSummary shape produced by src/recall.ts.", + "description": "Aggregate recall / precision / F1 numbers per HIPAA Safe Harbor category + overall + per-row breakdown for any paper in the Lucairn Research Program. Mirrors the RecallSummary shape produced by src/recall.ts. 
Recall numbers are produced by the gateway's compareGroundTruth function at services/gateway/internal/api/ground_truth.go:69-138 in the dual-sandbox-architecture repo (case-insensitive bidirectional value-containment with whitespace normalization, server-side; not span-exact overlap). The publisher (Lucairn) ships this matcher in production; the research repo aggregates its verdicts.", "type": "object", "required": [ "schema_version", diff --git a/scripts/compute-recall.ts b/scripts/compute-recall.ts index 801dbb1..bd2e5ec 100644 --- a/scripts/compute-recall.ts +++ b/scripts/compute-recall.ts @@ -342,8 +342,10 @@ async function main(): Promise { const summary = aggregateExtracted(extracted, [ `Source: ${cli.redactionsSource}; rows processed: ${targetRows.length}.`, ]); - await writeFile(cli.output, JSON.stringify(summary, null, 2) + '\n', 'utf8'); + // Validate BEFORE writing. If validation throws, a bogus SUMMARY.json + // never lands on disk for downstream consumers to consume. await validateAgainstSchema(summary, defaultSchemaPath()); + await writeFile(cli.output, JSON.stringify(summary, null, 2) + '\n', 'utf8'); process.stdout.write( `wrote SUMMARY.json (${targetRows.length} rows, ` + diff --git a/scripts/run-pipeline.ts b/scripts/run-pipeline.ts index af0f43c..1a057a3 100644 --- a/scripts/run-pipeline.ts +++ b/scripts/run-pipeline.ts @@ -43,7 +43,12 @@ const DEFAULT_TRUTH_PATH = const DEFAULT_SUBSET_PATH = 'datasets/healthcare/with-injected-pii/measurement-b-subset.csv'; const MOCK_GATEWAY_URL = 'http://mock.lucairn.local'; -const MOCK_API_KEY = 'lcr_live_mock_0000000000000000000000000000'; +// Synthetic mock key. Uses an `lcr_mock_` prefix (NOT `lcr_live_`) so the +// real production key prefix never appears in committed code — secret +// scanners (truffleHog, gitleaks, GitHub secret scanning) would otherwise +// flag this file the moment the repo flips public. Length preserved so any +// length-based sanity checks elsewhere don't drift. 
+const MOCK_API_KEY = 'lcr_mock_0000000000000000000000000000'; interface CliArgs { rows: number | null; @@ -54,6 +59,17 @@ interface CliArgs { output: string; gateway: string | null; apiKey: string | null; + /** + * Upstream LLM API key (Anthropic for Claude models, OpenAI for GPT + * models, etc.) for BYOK-per-request customer profiles. Wired as + * `X-Upstream-Key` header on every gateway call. Required when the + * Lucairn customer profile has `ByokPerRequest: true` — the gateway + * returns 400 `missing_upstream_key` otherwise. See + * `dual-sandbox-architecture/services/gateway/internal/api/proxy.go:349-354` + * for the gate. Falls back to `process.env.LUCAIRN_UPSTREAM_KEY` when the + * flag is absent. Ignored under `--mock`. + */ + upstreamKey: string | null; missRate: number; spuriousFpCount: number; activityIdPrefix: string; @@ -71,6 +87,7 @@ function parseArgs(argv: readonly string[]): CliArgs { .replace(/[:.]/gu, '-')}.ndjson`, gateway: null, apiKey: null, + upstreamKey: null, missRate: 0, spuriousFpCount: 0, activityIdPrefix: 'paper-1-healthcare', @@ -104,6 +121,9 @@ function parseArgs(argv: readonly string[]): CliArgs { case '--api-key': args.apiKey = val; break; + case '--upstream-key': + args.upstreamKey = val; + break; case '--miss-rate': args.missRate = parseFloatOrThrow(val, '--miss-rate'); break; @@ -154,10 +174,19 @@ function printHelp(): void { ' --output=PATH NDJSON output path. Default: papers/paper-1-healthcare/raw-results/run-.ndjson', ' --gateway=URL gateway URL override (also honoured under --live).', ' --api-key=KEY API key override (--live only).', + ' --upstream-key=KEY Upstream LLM API key for BYOK-per-request customer profiles.', + ' Sent as X-Upstream-Key header. Falls back to LUCAIRN_UPSTREAM_KEY env.', + ' Required when the Lucairn profile has ByokPerRequest: true; otherwise', + ' the gateway returns HTTP 400 missing_upstream_key. Ignored under --mock.', ' --miss-rate=F --mock only. Fraction of injected entities the mock misses. 
Default: 0.', ' --spurious-fp-count=N --mock only. Synthetic FP redactions per row. Default: 0.', ' --activity-id-prefix=S per-row activity_id prefix. Default: paper-1-healthcare.', '', + 'Auth modes for --live runs (4 valid combinations):', + ' 1. lcr_live_* key + non-BYOK customer profile → only --api-key / LUCAIRN_API_KEY required.', + ' 2. lcr_live_* key + ByokPerRequest profile → --api-key + --upstream-key both required.', + ' 3. Direct provider key + X-DSA-Key auth fallback → not supported by this harness.', + ' 4. Authorization: Bearer relay → not supported by this harness.', 'Slice 2 ships --mock support only. --live is reserved for Slice 3 and requires Marc-confirmation.', ]; for (const ln of lines) { @@ -290,6 +319,10 @@ async function main(): Promise { let mock: MockServerHandle | null = null; let gatewayUrl: string; let apiKey: string; + // Upstream LLM API key for BYOK-per-request flows; null when --mock or + // when the customer profile doesn't require BYOK. See the auth-modes + // table in printHelp() for the four valid combinations. + let upstreamKey: string | null = null; if (cli.mock) { mock = mountMockServer(cli.missRate, cli.spuriousFpCount); gatewayUrl = MOCK_GATEWAY_URL; @@ -298,6 +331,7 @@ async function main(): Promise { const env = readGatewayEnv(); gatewayUrl = cli.gateway ?? env.gatewayUrl ?? ''; apiKey = cli.apiKey ?? env.apiKey ?? ''; + upstreamKey = cli.upstreamKey ?? env.upstreamKey ?? null; if (gatewayUrl === '' || apiKey === '') { throw new Error( '--live requires LUCAIRN_GATEWAY_URL + LUCAIRN_API_KEY in env or --gateway / --api-key flags', @@ -310,6 +344,7 @@ async function main(): Promise { const client = makeGatewayClient({ gatewayUrl, apiKey, + ...(upstreamKey !== null ? 
{ upstreamKey } : {}), activityIdPrefix: cli.activityIdPrefix, }); diff --git a/src/gateway-client.ts b/src/gateway-client.ts index 5dc8c11..751149f 100644 --- a/src/gateway-client.ts +++ b/src/gateway-client.ts @@ -19,6 +19,10 @@ * (ground_truth_evaluation field emission) * - dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:5-138 * (groundTruthResult + per-item shapes) + * - dual-sandbox-architecture/services/gateway/internal/api/proxy.go:349-354 + * (BYOK-per-request gate — returns 400 missing_upstream_key when the + * customer profile requires per-request upstream keys and the + * X-Upstream-Key header is absent). * * The retry policy is 2 retries with exponential backoff (base 500 ms, jitter * 0–200 ms) on 5xx and connection errors only. 4xx errors are surfaced @@ -156,6 +160,17 @@ export interface GatewayRowResult { export interface GatewayClientOptions { readonly gatewayUrl: string; readonly apiKey: string; + /** + * Upstream LLM API key for BYOK-per-request customer profiles. When set, + * emitted as the `X-Upstream-Key` HTTP header on every request. Required + * for Slice 3 live runs when the Lucairn customer profile has + * `ByokPerRequest: true` — the gateway returns a 400 + * `missing_upstream_key` otherwise (see + * dual-sandbox-architecture/services/gateway/internal/api/proxy.go:349-354 + * for the gate). May be supplied via the `LUCAIRN_UPSTREAM_KEY` env var as + * a fallback when not set explicitly. + */ + readonly upstreamKey?: string; readonly activityIdPrefix?: string; readonly requestTimeoutMs?: number; readonly maxRetries?: number; @@ -197,22 +212,57 @@ function defaultSleep(ms: number): Promise { }); } +/** + * Minimum trimmed length of a ground-truth annotation value the harness will + * submit to the gateway. 
Defensive guard against future Faker regressions — + * the gateway's matcher (`compareGroundTruth` at + * dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:82-95 + * ) drops empty-after-trim values but NOT 1- or 2-char values. A 1-2 char + * value used as a containment-match needle has a high prior on spurious + * matches (e.g. annotation `value: "X"` matches every sanitizer redaction + * whose Original contains the letter X). Faker outputs in + * `inject-pii-core.ts:122-161` empirically always emit values ≥3 chars per + * category, but pinning the floor here protects against silent regressions. + */ +const MIN_GROUND_TRUTH_VALUE_LENGTH = 3; + /** * Construct an annotation list suitable for the proving-ground ground_truth * field. The keying field name is fixed at `transcription` because that is * the single context field we route through the sanitizer. + * + * Filters out annotations whose `value.trim().length` is below + * MIN_GROUND_TRUTH_VALUE_LENGTH and emits a single console.warn with the + * dropped count (never the dropped values — those are PII even when + * synthetic). The filter rationale + cite-back live on + * MIN_GROUND_TRUTH_VALUE_LENGTH above. 
*/ function buildGroundTruth( entities: readonly InjectedEntity[], ): Record { - return { - transcription: entities.map((e) => ({ + const kept: ProvingGroundAnnotation[] = []; + let droppedCount = 0; + for (const e of entities) { + if (e.value.trim().length < MIN_GROUND_TRUTH_VALUE_LENGTH) { + droppedCount += 1; + continue; + } + kept.push({ type: e.category, value: e.value, start: e.start_char, end: e.end_char, - })), - }; + }); + } + if (droppedCount > 0) { + // eslint-disable-next-line no-console + console.warn( + `[gateway-client] dropped ${droppedCount} ground-truth annotation(s) ` + + `with value.trim().length < ${MIN_GROUND_TRUTH_VALUE_LENGTH} (containment-match safety; see ` + + `ground_truth.go:82-95)`, + ); + } + return { transcription: kept }; } /** @@ -253,6 +303,13 @@ export function makeGatewayClient(options: GatewayClientOptions): GatewayClient const activityPrefix = options.activityIdPrefix ?? 'paper-1-healthcare'; const model = options.model ?? DEFAULT_MODEL; const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS; + // Empty-string upstreamKey is treated as "absent" so callers can pass + // `process.env.LUCAIRN_UPSTREAM_KEY ?? ''` without accidentally emitting a + // header with no value. + const upstreamKey = + typeof options.upstreamKey === 'string' && options.upstreamKey.length > 0 + ? 
options.upstreamKey + : null; const endpoint = `${options.gatewayUrl.replace(/\/+$/u, '')}/api/v1/proxy/messages`; async function runRow(row: GatewayRowInput): Promise { @@ -279,12 +336,16 @@ export function makeGatewayClient(options: GatewayClientOptions): GatewayClient timeoutHandle = setTimeout(() => { controller?.abort(); }, timeoutMs); + const headers: Record = { + 'content-type': 'application/json', + 'x-api-key': options.apiKey, + }; + if (upstreamKey !== null) { + headers['x-upstream-key'] = upstreamKey; + } const response = await fetchFn(endpoint, { method: 'POST', - headers: { - 'content-type': 'application/json', - 'x-api-key': options.apiKey, - }, + headers, body: JSON.stringify(body), signal: controller.signal, }); @@ -369,16 +430,22 @@ async function safeReadText(response: Response): Promise { } /** - * Read gateway URL + API key from process.env. Returns null fields if unset - * so callers can decide whether to enter mock mode or fail. + * Read gateway URL + API key + optional upstream LLM API key from + * process.env. Returns null fields if unset so callers can decide whether to + * enter mock mode or fail. `upstreamKey` is sourced from + * `LUCAIRN_UPSTREAM_KEY` and is required for BYOK-per-request customer + * profiles in Slice 3 live runs (see `GatewayClientOptions.upstreamKey` + * for the gate cite-back). */ export function readGatewayEnv(env: NodeJS.ProcessEnv = process.env): { gatewayUrl: string | null; apiKey: string | null; + upstreamKey: string | null; requestTimeoutMs: number | null; } { const url = env.LUCAIRN_GATEWAY_URL ?? null; const key = env.LUCAIRN_API_KEY ?? null; + const upstreamKey = env.LUCAIRN_UPSTREAM_KEY ?? null; const timeoutStr = env.LUCAIRN_REQUEST_TIMEOUT_MS ?? 
null; let timeoutMs: number | null = null; if (timeoutStr !== null) { @@ -387,5 +454,5 @@ export function readGatewayEnv(env: NodeJS.ProcessEnv = process.env): { timeoutMs = parsed; } } - return { gatewayUrl: url, apiKey: key, requestTimeoutMs: timeoutMs }; + return { gatewayUrl: url, apiKey: key, upstreamKey, requestTimeoutMs: timeoutMs }; } diff --git a/src/hipaa-category-mapping.ts b/src/hipaa-category-mapping.ts index be47108..45340e1 100644 --- a/src/hipaa-category-mapping.ts +++ b/src/hipaa-category-mapping.ts @@ -1,116 +1,89 @@ /** * hipaa-category-mapping.ts * - * Maps Lucairn's internal sanitizer placeholder types (the `[TYPE_N]` shape) - * back to the 18 HIPAA Safe Harbor categories defined in - * `src/inject-pii-core.ts:28-47` (45 CFR § 164.514(b)(2)(i)). + * Maps Lucairn's LIVE placeholder prefixes (the `[PREFIX_N]` shape emitted by + * the sanitizer in production) back to the 18 HIPAA Safe Harbor categories + * defined in `src/inject-pii-core.ts:28-47` (45 CFR § 164.514(b)(2)(i)). * * Why this exists: * The Lucairn sanitizer emits redactions whose `placeholder` field is of the - * form `[TYPE_N]` where TYPE is an internal taxonomy term (PERSON, LOCATION, - * PHONE_NUMBER, etc.). The HIPAA Safe Harbor enumeration is the standard the - * research program reports recall against. This module is the documented - * bridge between the two taxonomies. + * form `[PREFIX_N]` where PREFIX comes from the `PRESIDIO_TO_PLACEHOLDER` + * dict in + * dual-sandbox-architecture/services/sanitizer/presidio_scan.py:31-58 + * (i.e. one of the 11 LIVE values: PERSON, EMAIL, PHONE, LOCATION, IBAN, CC, + * SSN, ID, URL, DOB, SECRET — confirmed by the placeholder-emit format at + * `placeholders.py:52` `f"[{pii_type}_{count}]"`). The HIPAA Safe Harbor + * enumeration is the standard the research program reports recall against. + * This module is the documented bridge between the two taxonomies. 
* - * Cite-back: gateway emits `placeholder` per redaction at - * `dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:48-56` - * and the placeholder parsing convention at - * `dual-sandbox-architecture/services/gateway/internal/api/proxy.go:1361-1395` - * (extractEntityTypes — accepts `[TYPE_N]` where TYPE is one or more - * uppercase letters/underscores). + * Cite-back: + * - Live placeholder vocabulary (source-of-truth): + * dual-sandbox-architecture/services/sanitizer/presidio_scan.py:31-58 + * (`PRESIDIO_TO_PLACEHOLDER` dict) + * - Placeholder emit format `[{pii_type}_{count}]`: + * dual-sandbox-architecture/services/sanitizer/placeholders.py:52 + * - Gateway emits `placeholder` per redaction at: + * dual-sandbox-architecture/services/gateway/internal/api/ground_truth.go:48-56 + * - Gateway's own parsing convention `[TYPE_N]`: + * dual-sandbox-architecture/services/gateway/internal/api/proxy.go:1361-1395 + * (extractEntityTypes — accepts uppercase letters/underscores + digits suffix). * - * The mapping is intentionally explicit and one-way (internal → HIPAA). If - * Lucairn introduces a new sanitizer type, this table MUST be extended before - * Paper 1 numbers are re-published — an unmapped placeholder is a recall - * accounting gap, not a silent passthrough. + * IMPORTANT — what this table is used for: + * The harness's TP and FN attribution flow through the ground-truth + * annotation's HIPAA `annotation_type` (since the harness submits HIPAA + * categories as `ProvingGroundAnnotation.type`), NOT through this table. + * This table is consulted only for FALSE POSITIVES surfaced in + * `extras[].placeholder` — where the gateway returns the placeholder the + * sanitizer emitted, and the harness needs to attribute the FP to a HIPAA + * category bucket. 
+ * + * Documented limitations: + * - `[ID_N]` is the sanitizer's COLLAPSE bucket for many distinct Presidio + * entity types (MRN-shaped, US_BANK_NUMBER, US_PASSPORT, + * US_DRIVER_LICENSE, UK_NHS, SG_NRIC_FIN, AU_ABN, AU_TFN, AU_MEDICARE, + * IN_PAN, IP_ADDRESS, the four custom German recognizers Fallnummer / + * Personalausweis / Steuer-ID / SVNR, AND the unknown-entity fallback). + * These map to at least six different HIPAA categories (MRN, + * HEALTH_PLAN_ID, ACCOUNT_NUMBER, LICENSE_NUMBER, IP_ADDRESS, + * OTHER_UNIQUE_ID). The placeholder shape alone cannot disambiguate them. + * `placeholderToHipaaCategory('[ID_N]')` therefore returns `null` by + * design. + * - `[SECRET_N]` is the W5+ Phase 1 (2026-05-09) detect-secrets + + * SaaS-API-key bucket. Secrets are not a HIPAA Safe Harbor category in + * the 18-enumeration sense (45 CFR § 164.514(b)(2)(i)). + * `placeholderToHipaaCategory('[SECRET_N]')` returns `null` by design. + * + * FP counts whose placeholder maps to null surface in the + * `unmappedExtraTypes()` accounting in `src/redaction-extractor.ts:111-127` + * and `src/recall.ts:142-167` so they remain visible in the SUMMARY notes + * rather than being silently dropped. */ import type { HipaaCategory } from './inject-pii-core.js'; /** - * The exhaustive mapping from Lucairn internal sanitizer types to HIPAA Safe - * Harbor categories. + * The mapping from Lucairn LIVE placeholder prefixes (per + * `presidio_scan.py:31-58`) to HIPAA Safe Harbor categories. * - * Sources for the right-hand-side category assignments: - * - 45 CFR § 164.514(b)(2)(i) Safe Harbor enumeration (the 18 categories - * listed in `src/inject-pii-core.ts:28-47`). - * - Lucairn sanitizer's internal type vocabulary as observed in the gateway - * `extractEntityTypes` logic (`proxy.go:1361-1395`) and the Presidio + - * custom recognizer catalogue. + * The 11 live prefixes are: PERSON, EMAIL, PHONE, LOCATION, IBAN, CC, SSN, + * ID, URL, DOB, SECRET. 
`ID` and `SECRET` are deliberately UNMAPPED (they + * collapse multiple HIPAA categories / are not Safe Harbor categories + * respectively; see file-level doc-comment for the full rationale). * - * Categories not currently emitted by the sanitizer (e.g. FACE_PHOTO_REF, - * BIOMETRIC_ID) are absent from this map; they appear in injected ground - * truth only and will show as false-negatives if the sanitizer never detects - * them, which is correct accounting. + * If `presidio_scan.py` adds a new placeholder value, the regression test in + * `test/redaction-extractor.spec.ts` will fail until this table is updated + * or the new prefix is added to that test's `KNOWN_UNMAPPED` set. */ export const LUCAIRN_TO_HIPAA: Readonly> = Object.freeze({ - // Name-bearing types PERSON: 'NAME', - PERSON_NAME: 'NAME', - NAME: 'NAME', - - // Geographic subdivisions - LOCATION: 'GEO_SUBDIVISION', - ADDRESS: 'GEO_SUBDIVISION', - STREET_ADDRESS: 'GEO_SUBDIVISION', - ZIP_CODE: 'GEO_SUBDIVISION', - GERMAN_ZIP_CODE: 'GEO_SUBDIVISION', - CITY: 'GEO_SUBDIVISION', - - // Dates - DATE: 'DATE', - DATE_TIME: 'DATE', - - // Telephone / fax — sanitizer does not natively distinguish PHONE from FAX. - // We map both PHONE_NUMBER and PHONE to PHONE; FAX is only recognised when - // a custom recognizer surfaces FAX explicitly. 
- PHONE_NUMBER: 'PHONE', - PHONE: 'PHONE', - FAX: 'FAX', - FAX_NUMBER: 'FAX', - - // Email EMAIL: 'EMAIL', - EMAIL_ADDRESS: 'EMAIL', - - // US identifier-shaped categories - US_SSN: 'SSN', + PHONE: 'PHONE', + LOCATION: 'GEO_SUBDIVISION', + IBAN: 'ACCOUNT_NUMBER', // SEPA bank account numbers + CC: 'ACCOUNT_NUMBER', // credit card numbers SSN: 'SSN', - - // Medical record / health-plan / account / license / vehicle / device - MRN: 'MRN', - MEDICAL_RECORD_NUMBER: 'MRN', - HEALTH_PLAN_ID: 'HEALTH_PLAN_ID', - HEALTH_PLAN_BENEFICIARY_NUMBER: 'HEALTH_PLAN_ID', - ACCOUNT_NUMBER: 'ACCOUNT_NUMBER', - US_BANK_NUMBER: 'ACCOUNT_NUMBER', - IBAN: 'ACCOUNT_NUMBER', - IBAN_CODE: 'ACCOUNT_NUMBER', - CREDIT_CARD: 'ACCOUNT_NUMBER', - CREDIT_CARD_NUMBER: 'ACCOUNT_NUMBER', - LICENSE_NUMBER: 'LICENSE_NUMBER', - US_DRIVER_LICENSE: 'LICENSE_NUMBER', - PROFESSIONAL_LICENSE: 'LICENSE_NUMBER', - VEHICLE_ID: 'VEHICLE_ID', - VIN: 'VEHICLE_ID', - US_VEHICLE_VIN: 'VEHICLE_ID', - LICENSE_PLATE: 'VEHICLE_ID', - DEVICE_ID: 'DEVICE_ID', - DEVICE_SERIAL: 'DEVICE_ID', - IMEI: 'DEVICE_ID', - - // Web identifiers URL: 'URL', - IP_ADDRESS: 'IP_ADDRESS', - - // Biometric / face photo / other unique ID - BIOMETRIC_ID: 'BIOMETRIC_ID', - FACE_PHOTO_REF: 'FACE_PHOTO_REF', - STUDY_ID: 'OTHER_UNIQUE_ID', - OTHER_UNIQUE_ID: 'OTHER_UNIQUE_ID', - PASSPORT: 'OTHER_UNIQUE_ID', - US_PASSPORT: 'OTHER_UNIQUE_ID', - US_ITIN: 'OTHER_UNIQUE_ID', + DOB: 'DATE', }); /** @@ -137,7 +110,10 @@ export function parsePlaceholderType(placeholder: string): string | null { /** * Map a Lucairn `[TYPE_N]` placeholder to its HIPAA Safe Harbor category. - * Returns null when the internal type is not in `LUCAIRN_TO_HIPAA`. + * Returns null when the internal type is not in `LUCAIRN_TO_HIPAA`. The + * documented null cases are `[ID_N]` (collapse-bucket — disambiguation + * impossible from the placeholder alone) and `[SECRET_N]` (not a Safe + * Harbor category). See the file-level doc-comment for the rationale. 
*/ export function placeholderToHipaaCategory(placeholder: string): HipaaCategory | null { const t = parsePlaceholderType(placeholder); diff --git a/src/mocks/gateway-fixtures.ts b/src/mocks/gateway-fixtures.ts index 9dbf652..f895514 100644 --- a/src/mocks/gateway-fixtures.ts +++ b/src/mocks/gateway-fixtures.ts @@ -32,26 +32,47 @@ export interface MockBuilderOptions { readonly spuriousFpCount?: number; } -const PLACEHOLDER_FOR_CATEGORY: Readonly> = { +/** + * Map from HIPAA Safe Harbor category to the LIVE placeholder prefix the + * Lucairn sanitizer would emit in production for that category. Source of + * truth for these mappings: + * dual-sandbox-architecture/services/sanitizer/presidio_scan.py:31-58 + * (PRESIDIO_TO_PLACEHOLDER dict). + * + * Locked decisions: + * - FAX uses the same PHONE prefix the sanitizer emits for phone numbers + * (the sanitizer does not natively distinguish FAX from PHONE — fax + * numbers match the PHONE recognizer). + * - MRN, HEALTH_PLAN_ID, LICENSE_NUMBER, VEHICLE_ID, DEVICE_ID, + * IP_ADDRESS, BIOMETRIC_ID, FACE_PHOTO_REF, OTHER_UNIQUE_ID all collapse + * to the sanitizer's ID bucket. The mock therefore emits [ID_N] for + * these — matching production behavior. Their FP attribution surfaces + * in the unmapped_extras accounting (recall.ts:142-167), NOT in the + * per-category HIPAA buckets, exactly as the live path behaves. + * - ACCOUNT_NUMBER picks IBAN deterministically (CC is the alternative). + * Both map back to ACCOUNT_NUMBER via LUCAIRN_TO_HIPAA, so the test + * bookkeeping is symmetric. 
+ */ +const PLACEHOLDER_FOR_CATEGORY: Readonly> = Object.freeze({ NAME: 'PERSON', GEO_SUBDIVISION: 'LOCATION', - DATE: 'DATE', - PHONE: 'PHONE_NUMBER', - FAX: 'FAX_NUMBER', - EMAIL: 'EMAIL_ADDRESS', - SSN: 'US_SSN', - MRN: 'MEDICAL_RECORD_NUMBER', - HEALTH_PLAN_ID: 'HEALTH_PLAN_ID', - ACCOUNT_NUMBER: 'ACCOUNT_NUMBER', - LICENSE_NUMBER: 'LICENSE_NUMBER', - VEHICLE_ID: 'VEHICLE_ID', - DEVICE_ID: 'DEVICE_ID', + DATE: 'DOB', + PHONE: 'PHONE', + FAX: 'PHONE', // sanitizer doesn't natively distinguish fax from phone + EMAIL: 'EMAIL', + SSN: 'SSN', + MRN: 'ID', // sanitizer collapses to ID bucket + HEALTH_PLAN_ID: 'ID', // sanitizer collapses to ID bucket + ACCOUNT_NUMBER: 'IBAN', // deterministic choice; CC is the alternative + LICENSE_NUMBER: 'ID', // sanitizer collapses to ID bucket + VEHICLE_ID: 'ID', // sanitizer collapses to ID bucket + DEVICE_ID: 'ID', // sanitizer collapses to ID bucket URL: 'URL', - IP_ADDRESS: 'IP_ADDRESS', - BIOMETRIC_ID: 'BIOMETRIC_ID', - FACE_PHOTO_REF: 'FACE_PHOTO_REF', - OTHER_UNIQUE_ID: 'STUDY_ID', -}; + IP_ADDRESS: 'ID', // sanitizer collapses IP_ADDRESS to ID (presidio_scan.py:51) + BIOMETRIC_ID: 'ID', // sanitizer collapses to ID bucket + FACE_PHOTO_REF: 'ID', // sanitizer collapses to ID bucket + OTHER_UNIQUE_ID: 'ID', // sanitizer collapses to ID bucket +}); /** * Build a mock gateway response for a single row. Determinism: given the @@ -88,8 +109,12 @@ export function buildMockResponse(options: MockBuilderOptions): GatewayResponse const extras: GroundTruthExtra[] = []; for (let i = 0; i < spuriousFpCount; i++) { // Synthesise plausible-looking spurious detections so FP-handling code - // paths can be exercised. Use deterministic pseudo-text. - const internalType = ['PERSON', 'LOCATION', 'PHONE_NUMBER'][i % 3] ?? 'PERSON'; + // paths can be exercised. Use deterministic pseudo-text. 
The prefix + // rotation includes `ID` so the unmapped_extras accounting path + // (recall.ts:142-167) is exercised on at least one of every 4 synthetic + // FPs — mirroring production where `[ID_N]` is a common collapse-bucket + // placeholder for the sanitizer. + const internalType = ['PERSON', 'LOCATION', 'PHONE', 'ID'][i % 4] ?? 'PERSON'; const nextN = (seqByType.get(internalType) ?? 0) + 1; seqByType.set(internalType, nextN); extras.push({ diff --git a/test/gateway-client.spec.ts b/test/gateway-client.spec.ts index 18d3cbe..cea2a42 100644 --- a/test/gateway-client.spec.ts +++ b/test/gateway-client.spec.ts @@ -1,4 +1,4 @@ -import { afterAll, afterEach, beforeAll, describe, expect, it } from 'vitest'; +import { afterAll, afterEach, beforeAll, describe, expect, it, vi } from 'vitest'; import { http, HttpResponse } from 'msw'; import { setupServer } from 'msw/node'; @@ -11,7 +11,11 @@ import type { GatewayResponse } from '../src/gateway-client.js'; const BASE_URL = 'http://gateway.test.local'; const ENDPOINT = `${BASE_URL}/api/v1/proxy/messages`; -const API_KEY = 'lcr_live_test_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +// Synthetic test key. Uses an `lcr_test_` prefix (NOT `lcr_live_`) so the +// real production key prefix never appears in committed test code — that +// avoids triggering downstream secret scanners (truffleHog, gitleaks, +// GitHub secret scanning) once this repo flips public. +const API_KEY = 'lcr_test_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; function successResponse(overrides?: Partial): GatewayResponse { return { @@ -220,4 +224,101 @@ describe('makeGatewayClient', () => { /apiKey is required/u, ); }); + + it('emits X-Upstream-Key header when upstreamKey is set (Slice 3 BYOK gate)', async () => { + // Locks the contract for `dual-sandbox-architecture/services/gateway/ + // internal/api/proxy.go:349-354` BYOK-per-request profile gate. 
+ let observedUpstreamHeader: string | null = null; + server.use( + http.post(ENDPOINT, async ({ request }) => { + observedUpstreamHeader = request.headers.get('x-upstream-key'); + return HttpResponse.json(successResponse()); + }), + ); + const client = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + upstreamKey: 'sk-ant-api03-fake-upstream-test-value', + sleepFn: async () => undefined, + }); + await client.runRow({ row_index: 0, transcription: 'x', entities: [] }); + expect(observedUpstreamHeader).toBe('sk-ant-api03-fake-upstream-test-value'); + }); + + it('omits X-Upstream-Key header when upstreamKey is absent or empty', async () => { + let observedUpstreamHeader: string | null = 'sentinel'; + server.use( + http.post(ENDPOINT, async ({ request }) => { + observedUpstreamHeader = request.headers.get('x-upstream-key'); + return HttpResponse.json(successResponse()); + }), + ); + const clientUnset = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + sleepFn: async () => undefined, + }); + await clientUnset.runRow({ row_index: 0, transcription: 'x', entities: [] }); + // msw / fetch surface absent headers as null. + expect(observedUpstreamHeader).toBeNull(); + + observedUpstreamHeader = 'sentinel'; + const clientEmpty = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + upstreamKey: '', // explicitly empty must be treated as "absent" + sleepFn: async () => undefined, + }); + await clientEmpty.runRow({ row_index: 0, transcription: 'y', entities: [] }); + expect(observedUpstreamHeader).toBeNull(); + }); + + it('filters ground-truth annotations with value.trim().length < 3 (H2 containment-match safety)', async () => { + // Defensive guard against future Faker regression — see + // src/gateway-client.ts::MIN_GROUND_TRUTH_VALUE_LENGTH and the + // ground_truth.go:82-95 cite-back. 
The gateway's compareGroundTruth + // drops empty-after-trim values but NOT 1-2 char values, so a 1-2 char + // needle would containment-match into many redactions spuriously. + let observedAnnotations: unknown[] = []; + server.use( + http.post(ENDPOINT, async ({ request }) => { + const body = (await request.json()) as Record; + const gt = body['ground_truth'] as { transcription: unknown[] }; + observedAnnotations = gt.transcription; + return HttpResponse.json(successResponse()); + }), + ); + // Silence the expected console.warn so the test output stays clean + // while still verifying the filter fired. + const warnSpy = vi + .spyOn(console, 'warn') + .mockImplementation((): void => undefined); + const client = makeGatewayClient({ + gatewayUrl: BASE_URL, + apiKey: API_KEY, + sleepFn: async () => undefined, + }); + await client.runRow({ + row_index: 0, + transcription: 'short note', + entities: [ + // length-1 — must be dropped. + { category: 'NAME', value: 'X', start_char: 0, end_char: 1 }, + // length-2 after trim — must be dropped. + { category: 'NAME', value: ' AB ', start_char: 0, end_char: 4 }, + // length-3 — must survive. + { category: 'EMAIL', value: 'a@b', start_char: 5, end_char: 8 }, + ], + }); + expect(observedAnnotations).toHaveLength(1); + const kept = observedAnnotations[0] as Record; + expect(kept['type']).toBe('EMAIL'); + expect(kept['value']).toBe('a@b'); + // Warning fired with the dropped count, NOT the dropped values. 
+ expect(warnSpy).toHaveBeenCalledTimes(1); + const firstArg = warnSpy.mock.calls[0]?.[0]; + expect(typeof firstArg).toBe('string'); + expect(firstArg as string).toMatch(/dropped 2 ground-truth annotation\(s\)/u); + warnSpy.mockRestore(); + }); }); diff --git a/test/recall.spec.ts b/test/recall.spec.ts index 1ce9f19..89f3dcc 100644 --- a/test/recall.spec.ts +++ b/test/recall.spec.ts @@ -91,6 +91,36 @@ describe('aggregateExtracted', () => { expect(byCat.get('NAME')?.tp).toBe(1); }); + it('routes [ID_N] FPs into the unmapped bucket (documented collapse-bucket limitation)', () => { + // [ID_N] is the sanitizer's collapse-bucket for many distinct HIPAA + // categories (MRN, HEALTH_PLAN_ID, ACCOUNT_NUMBER, LICENSE_NUMBER, + // IP_ADDRESS, OTHER_UNIQUE_ID, +4 German custom recognizers + unknown + // fallback — cite-back: presidio_scan.py:31-58). The placeholder shape + // alone cannot disambiguate the underlying category, so by design + // [ID_N] FPs surface in the overall.fp count + the unmapped notes, + // NOT in any per_category bucket. This guards against silent + // misattribution if a future change tries to "fix" the null-mapping. + const extracted: ExtractedRedaction[] = [ + // hipaa_category null because extractFromEvaluation called + // placeholderToHipaaCategory('[ID_1]') and got null back. + { + row_index: 1, + hipaa_category: null, + verdict: 'fp', + value: 'spurious-id-string', + placeholder: '[ID_1]', + field: null, + }, + ]; + const summary = aggregateExtracted(extracted); + expect(summary.overall.fp).toBe(1); + expect(summary.notes.some((n) => /no HIPAA category mapping/iu.test(n))).toBe(true); + // None of the 18 HIPAA categories has fp > 0. 
+ for (const entry of summary.per_category) { + expect(entry.counts.fp).toBe(0); + } + }); + it('treats absent ground truth as recall=0 with total_annotations=0', () => { const summary = aggregateExtracted([]); expect(summary.overall.total_annotations).toBe(0); diff --git a/test/redaction-extractor.spec.ts b/test/redaction-extractor.spec.ts index e47c858..d61d712 100644 --- a/test/redaction-extractor.spec.ts +++ b/test/redaction-extractor.spec.ts @@ -29,15 +29,34 @@ describe('parsePlaceholderType', () => { }); describe('placeholderToHipaaCategory', () => { - it('maps Lucairn internal types to HIPAA Safe Harbor categories', () => { + it('maps Lucairn LIVE placeholder prefixes to HIPAA Safe Harbor categories', () => { + // Live placeholder prefixes from presidio_scan.py:31-58 + // PRESIDIO_TO_PLACEHOLDER right-hand-side values. expect(placeholderToHipaaCategory('[PERSON_1]')).toBe('NAME'); expect(placeholderToHipaaCategory('[LOCATION_2]')).toBe('GEO_SUBDIVISION'); - expect(placeholderToHipaaCategory('[PHONE_NUMBER_3]')).toBe('PHONE'); - expect(placeholderToHipaaCategory('[EMAIL_ADDRESS_4]')).toBe('EMAIL'); - expect(placeholderToHipaaCategory('[US_SSN_5]')).toBe('SSN'); + expect(placeholderToHipaaCategory('[PHONE_3]')).toBe('PHONE'); + expect(placeholderToHipaaCategory('[EMAIL_4]')).toBe('EMAIL'); + expect(placeholderToHipaaCategory('[SSN_5]')).toBe('SSN'); expect(placeholderToHipaaCategory('[IBAN_6]')).toBe('ACCOUNT_NUMBER'); - expect(placeholderToHipaaCategory('[URL_7]')).toBe('URL'); - expect(placeholderToHipaaCategory('[IP_ADDRESS_8]')).toBe('IP_ADDRESS'); + expect(placeholderToHipaaCategory('[CC_7]')).toBe('ACCOUNT_NUMBER'); + expect(placeholderToHipaaCategory('[URL_8]')).toBe('URL'); + expect(placeholderToHipaaCategory('[DOB_9]')).toBe('DATE'); + }); + + it('null-maps [ID_N] and [SECRET_N] by design (documented limitation)', () => { + // [ID_N] is the sanitizer's collapse-bucket for MRN, US_BANK_NUMBER, + // US_PASSPORT, US_DRIVER_LICENSE, UK_NHS, SG_NRIC_FIN, 
AU_ABN, AU_TFN, + // AU_MEDICARE, IN_PAN, IP_ADDRESS + 4 German custom recognizers, AND the + // unknown-entity fallback (cite-back: presidio_scan.py:31-58). The + // placeholder shape cannot disambiguate the underlying HIPAA category, so + // null-mapping is the correct behavior — the FP count surfaces in the + // unmapped_extras accounting (recall.ts:142-167) instead of being + // silently misattributed. + expect(placeholderToHipaaCategory('[ID_1]')).toBeNull(); + // [SECRET_N] (W5+ Phase 1, 2026-05-09) is detect-secrets + SaaS-API-key + // matches; secrets are not a HIPAA Safe Harbor category in the + // 18-enumeration sense (45 CFR § 164.514(b)(2)(i)). + expect(placeholderToHipaaCategory('[SECRET_1]')).toBeNull(); }); it('returns null for placeholders whose internal type is not in the map', () => { @@ -47,6 +66,32 @@ describe('placeholderToHipaaCategory', () => { }); describe('LUCAIRN_TO_HIPAA mapping', () => { + // Live placeholder prefix vocabulary from + // dual-sandbox-architecture/services/sanitizer/presidio_scan.py:31-58 + // (PRESIDIO_TO_PLACEHOLDER dict right-hand-side values). Hard-coded here so + // any future addition to that dict that this repo hasn't accounted for + // surfaces as a test failure, not as silent FP miscategorization. + const LIVE_PLACEHOLDER_PREFIXES = [ + 'PERSON', + 'EMAIL', + 'PHONE', + 'LOCATION', + 'IBAN', + 'CC', + 'SSN', + 'ID', + 'URL', + 'DOB', + 'SECRET', + ] as const; + + // Prefixes intentionally NOT mapped — see hipaa-category-mapping.ts + // file-level doc-comment for the rationale. 
+ const KNOWN_UNMAPPED: ReadonlySet = new Set([ + 'ID', // collapse-bucket for many distinct HIPAA categories; disambiguation impossible from placeholder alone + 'SECRET', // not a HIPAA Safe Harbor category in the 18-enumeration sense + ]); + it('every right-hand side is a valid HipaaCategory', () => { const valid = new Set(HIPAA_CATEGORIES); for (const [internalType, hipaa] of Object.entries(LUCAIRN_TO_HIPAA)) { @@ -54,24 +99,31 @@ describe('LUCAIRN_TO_HIPAA mapping', () => { } }); - it('covers the standard Presidio/Lucairn vocabulary the gateway emits', () => { - // Smoke list of internal types observed in proxy.go::extractEntityTypes - // and the Presidio recognizer catalogue. Any future regression where one - // of these disappears from the mapping is a Slice 3 hazard. - const required = [ - 'PERSON', - 'LOCATION', - 'DATE', - 'PHONE_NUMBER', - 'EMAIL_ADDRESS', - 'US_SSN', - 'IBAN', - 'URL', - 'IP_ADDRESS', - 'CREDIT_CARD', - ]; - for (const t of required) { - expect(LUCAIRN_TO_HIPAA[t], `mapping missing for ${t}`).toBeTruthy(); + it('every live placeholder prefix is either mapped or explicitly null-mapped', () => { + // Walk the live vocabulary; each prefix must either appear in + // LUCAIRN_TO_HIPAA OR be listed in KNOWN_UNMAPPED. This is the regression + // lock against `presidio_scan.py:31-58` drift. + for (const prefix of LIVE_PLACEHOLDER_PREFIXES) { + const mapped = LUCAIRN_TO_HIPAA[prefix] !== undefined; + const unmappedIntentionally = KNOWN_UNMAPPED.has(prefix); + expect( + mapped || unmappedIntentionally, + `prefix ${prefix} (from presidio_scan.py:31-58) must be in LUCAIRN_TO_HIPAA or KNOWN_UNMAPPED`, + ).toBe(true); + } + }); + + it('no prefix in LUCAIRN_TO_HIPAA is outside the live placeholder vocabulary', () => { + // Inverse guard — if someone adds a stale alias (e.g. PHONE_NUMBER or + // EMAIL_ADDRESS) to the mapping table, it must correspond to something + // the sanitizer actually emits. Otherwise the entry is dead code masking + // real drift. 
+ const liveSet = new Set(LIVE_PLACEHOLDER_PREFIXES); + for (const internalType of Object.keys(LUCAIRN_TO_HIPAA)) { + expect( + liveSet.has(internalType), + `LUCAIRN_TO_HIPAA[${internalType}] is not in the live placeholder vocabulary (presidio_scan.py:31-58)`, + ).toBe(true); } }); }); From 5fb5c49e186e0cd6ae9cd3e8be5ac7f3cdbbb71e Mon Sep 17 00:00:00 2001 From: Declade <110547349+Declade@users.noreply.github.com> Date: Sun, 17 May 2026 12:11:41 +0200 Subject: [PATCH 5/5] fix(slice-2): close codex r1 findings ([8] help-text + [21] CSV-superset rationale) - [8] FAIL: --upstream-key help table listed 2 unsupported auth modes ("not supported by this harness") and omitted the --mock path entirely. Rewrote the table to enumerate the 3 actually-supported modes: --mock (no auth), --live + --api-key (non-BYOK), --live + --api-key + --upstream-key (BYOK-per-request, cite proxy.go:349-354 gate). - [21] FAIL: CERTIFICATES.csv ships 9 columns vs the brief's 7 minimum. The 2 extensions (summary_url, error_code) are intentional: summary_url saves readers a URL-construction step; error_code makes the paper appendix honest about which rows failed instead of silently dropping them. Documented the rationale inline in collect-certs.ts before the headers array. All 7 brief-required columns remain present in declaration order. Treating this as effective-PASS at the orchestrator level: brief spec was a minimum, not an exclusive list. No code-behavior changes. typecheck/build/test all green at HEAD. --- scripts/collect-certs.ts | 11 +++++++++++ scripts/run-pipeline.ts | 10 +++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/scripts/collect-certs.ts b/scripts/collect-certs.ts index 0e71d7e..f521057 100644 --- a/scripts/collect-certs.ts +++ b/scripts/collect-certs.ts @@ -118,6 +118,17 @@ async function main(): Promise { } } + // CERTIFICATES.csv schema. 
Slice 2 dispatch brief specified a 7-column + // minimum (row_index, cert_url, cert_id, overall_verdict, redaction_count, + // latency_ms, timestamp_utc). We ship a superset of 9 columns; the two + // extensions over the brief minimum are: + // - summary_url: the /summary HTML-view sibling of cert_url. Included + // so readers / auditors can paste directly into a + // browser without reconstructing the URL. + // - error_code: per-row failure code (empty string when row succeeded). + // Included so the paper appendix honestly records which + // rows failed instead of silently dropping them. + // The 7 brief-required columns are all present in their requested order. const headers = [ 'row_index', 'cert_url', diff --git a/scripts/run-pipeline.ts b/scripts/run-pipeline.ts index 1a057a3..7ffbfae 100644 --- a/scripts/run-pipeline.ts +++ b/scripts/run-pipeline.ts @@ -182,11 +182,11 @@ function printHelp(): void { ' --spurious-fp-count=N --mock only. Synthetic FP redactions per row. Default: 0.', ' --activity-id-prefix=S per-row activity_id prefix. Default: paper-1-healthcare.', '', - 'Auth modes for --live runs (4 valid combinations):', - ' 1. lcr_live_* key + non-BYOK customer profile → only --api-key / LUCAIRN_API_KEY required.', - ' 2. lcr_live_* key + ByokPerRequest profile → --api-key + --upstream-key both required.', - ' 3. Direct provider key + X-DSA-Key auth fallback → not supported by this harness.', - ' 4. Authorization: Bearer relay → not supported by this harness.', + 'Auth modes (3 supported by this harness; covers Slice 2 mock + Slice 3 live):', + ' 1. --mock → no auth required; in-process msw mock; tests + dev.', + ' 2. --live + --api-key → non-BYOK customer profile (Lucairn-managed AI).', + ' 3. --live + --api-key + --upstream-key → BYOK-per-request profile; gateway gate at', + ' dual-sandbox-architecture/services/gateway/internal/api/proxy.go:349-354.', 'Slice 2 ships --mock support only. 
--live is reserved for Slice 3 and requires Marc-confirmation.', ]; for (const ln of lines) {