From d46fdcc32745c821767a9b1c7433a077b9bf97e3 Mon Sep 17 00:00:00 2001
From: Claude
Date: Mon, 9 Feb 2026 08:31:47 +0000
Subject: [PATCH 1/2] Modularize build-data.mjs and move entity transformation
 to build time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase A: Extract 4 inline function groups from build-data.mjs into separate
modules in apps/longterm/scripts/lib/:
- computed-facts.mjs: parseNumericValue, resolveComputedFacts
- statistics.mjs: computeStats
- unconverted-links.mjs: buildUrlToResourceMap, findUnconvertedLinks,
  countConvertedLinks
- mdx-generator.mjs: generateMdxFromYaml

Phase B: Move entity transformation from runtime (index.ts) to build time:
- Create entity-transform.mjs with transformEntities orchestrator that handles
  type mapping, expert/org merging, risk categories, and entity overrides
- Add typedEntities to database.json output from build-data.mjs
- Simplify getTypedEntities() in index.ts to read pre-transformed entities
- Remove ~287 lines of dead runtime transform code from index.ts

Results: build-data.mjs 1124→592 lines, index.ts 1246→959 lines.

https://claude.ai/code/session_01DL2zdVVyQUfB3UXYjA7Sj7
---
 .../src/data/__tests__/data.test.ts           |    42 +
 apps/longterm-next/src/data/index.ts          |   324 +-
 apps/longterm/scripts/build-data.mjs          |   564 +-
 apps/longterm/scripts/lib/computed-facts.mjs  |   285 +
 .../longterm/scripts/lib/entity-transform.mjs |   333 +
 apps/longterm/scripts/lib/mdx-generator.mjs   |    98 +
 apps/longterm/scripts/lib/statistics.mjs      |    84 +
 .../scripts/lib/unconverted-links.mjs         |   101 +
 apps/longterm/src/data/database.json          | 25201 +++++++++++++++-
 9 files changed, 24719 insertions(+), 2313 deletions(-)
 create mode 100644 apps/longterm/scripts/lib/computed-facts.mjs
 create mode 100644 apps/longterm/scripts/lib/entity-transform.mjs
 create mode 100644 apps/longterm/scripts/lib/mdx-generator.mjs
 create mode 100644 apps/longterm/scripts/lib/statistics.mjs
 create mode 100644 apps/longterm/scripts/lib/unconverted-links.mjs

diff --git a/apps/longterm-next/src/data/__tests__/data.test.ts b/apps/longterm-next/src/data/__tests__/data.test.ts
index d4b5f1b7..6844e204 100644
--- a/apps/longterm-next/src/data/__tests__/data.test.ts
+++ b/apps/longterm-next/src/data/__tests__/data.test.ts
@@ -127,6 +127,48 @@ const mockDatabase = {
     },
   },
   stats: {},
+  typedEntities: [
+    {
+      id: "test-entity",
+      entityType: "risk",
+      title: "Test Entity",
+      description: "A test entity",
+      severity: "high",
+      tags: ["ai", "safety"],
+      clusters: [],
+      relatedEntries: [{ id: "other-entity", type: "concept" }],
+      sources: [],
+      customFields: [],
+      relatedTopics: [],
+      riskCategory: "accident",
+    },
+    {
+      id: "other-entity",
+      entityType: "concept",
+      title: "Other Entity",
+      description: "Another entity",
+      tags: [],
+      clusters: [],
+      relatedEntries: [],
+      sources: [],
+      customFields: [],
+      relatedTopics: [],
+    },
+    {
+      id: "researcher-1",
+      entityType: "person",
+      title: "Dr. Test",
+      tags: [],
+      clusters: [],
+      relatedEntries: [],
+      sources: [],
+      customFields: [],
+      relatedTopics: [],
+      role: "Researcher",
+      affiliation: "Test Org",
+      knownFor: [],
+    },
+  ],
 };
 
 // Mock fs.readFileSync to return our mock database
diff --git a/apps/longterm-next/src/data/index.ts b/apps/longterm-next/src/data/index.ts
index 96266ab1..a0f43350 100644
--- a/apps/longterm-next/src/data/index.ts
+++ b/apps/longterm-next/src/data/index.ts
@@ -2,11 +2,11 @@
  * Data layer for longterm-next
  *
  * Reads database.json from the local data directory (copied from longterm via sync:data).
- * Entity type overrides can be applied locally without modifying the longterm source.
  * This runs at build time / server-component level only.
  *
- * Entities are validated and transformed into typed entities (discriminated union)
- * at load time via Zod schemas. See entity-schemas.ts for the schema definitions.
+ * Entity transformation (type mapping, expert/org merging, risk categories) is done
+ * at build time by build-data.mjs and stored in database.json as `typedEntities`.
+ * This module validates them via Zod schemas. See entity-schemas.ts for schema definitions.
  */
 
 import fs from "fs";
@@ -15,12 +15,8 @@ import yaml from "js-yaml";
 import {
   TypedEntitySchema,
   GenericEntitySchema,
-  OLD_TYPE_MAP,
-  OLD_LAB_TYPE_TO_ORG_TYPE,
   type TypedEntity,
   type GenericEntity,
-  type RiskEntity,
-  type OrganizationEntity,
   isRisk,
   isPerson,
   isOrganization,
@@ -89,6 +85,7 @@ interface RawEntity {
 
 interface DatabaseShape {
   entities: RawEntity[];
+  typedEntities?: Array<Record<string, unknown>>; // Pre-transformed entities from build
   resources: Resource[];
   publications: Publication[];
   experts: Expert[];
@@ -102,228 +99,6 @@
   stats: Record<string, unknown>;
 }
 
-// ============================================================================
-// ENTITY TYPE OVERRIDES
-// Pages whose entity type should be remapped in longterm-next.
-// This lets us reclassify entities without modifying the longterm source.
-// ============================================================================
-
-/**
- * Path patterns that should be treated as "project" type.
- * Matches against the page path or entity path.
- */
-const PROJECT_PATH_PATTERNS = [
-  "/knowledge-base/responses/epistemic-tools/tools/",
-];
-
-/**
- * Explicit entity ID → type overrides.
- */
-const ENTITY_TYPE_OVERRIDES: Record<string, string> = {
-  // Add individual overrides here as needed, e.g.:
-  // "some-entity-id": "project",
-};
-
-function applyEntityOverrides(db: DatabaseShape): DatabaseShape {
-  // Build a set of page IDs that match project path patterns
-  const projectPageIds = new Set<string>();
-  for (const page of db.pages || []) {
-    if (PROJECT_PATH_PATTERNS.some(pattern => page.path?.includes(pattern))) {
-      projectPageIds.add(page.id);
-    }
-  }
-
-  // Apply overrides to entities
-  const entities = (db.entities || []).map(entity => {
-    // Check explicit overrides first
-    if (ENTITY_TYPE_OVERRIDES[entity.id]) {
-      return { ...entity, type: ENTITY_TYPE_OVERRIDES[entity.id] };
-    }
-    // Check path-based overrides
-    if (projectPageIds.has(entity.id)) {
-      return { ...entity, type: "project" };
-    }
-    return entity;
-  });
-
-  // Also create entities for pages in project paths that don't have entities yet
-  const entityIds = new Set(entities.map(e => e.id));
-  const newEntities: RawEntity[] = [];
-  for (const page of db.pages || []) {
-    if (projectPageIds.has(page.id) && !entityIds.has(page.id)) {
-      newEntities.push({
-        id: page.id,
-        type: "project",
-        title: page.title,
-        description: page.llmSummary || page.description || undefined,
-        tags: page.tags || [],
-        lastUpdated: page.lastUpdated || undefined,
-      });
-    }
-  }
-
-  return {
-    ...db,
-    entities: [...entities, ...newEntities],
-  };
-}
-
-// ============================================================================
-// ENTITY TRANSFORMATION (raw → typed)
-// ============================================================================
-
-/**
- * Transform a raw database.json entity into a typed entity.
- * - Maps old type names to canonical entityType - * - Flattens lab-* → organization with orgType - * - Extracts customFields into typed fields for researcher → person, policy, etc. - */ -function transformEntity( - raw: RawEntity, - experts: Map, - orgs: Map, -): TypedEntity | GenericEntity | null { - const oldType = raw.type; - const canonicalType = OLD_TYPE_MAP[oldType] || oldType; - - // Build base fields shared across all types - const base = { - id: raw.id, - title: raw.title, - description: raw.description, - tags: raw.tags || [], - clusters: raw.clusters || [], - relatedEntries: raw.relatedEntries || [], - sources: raw.sources || [], - lastUpdated: raw.lastUpdated, - website: raw.website, - numericId: raw.numericId, - path: raw.path, - status: raw.status, - customFields: raw.customFields || [], - relatedTopics: raw.relatedTopics || [], - }; - - // Helper to find a customField value - const cf = (label: string): string | undefined => - raw.customFields?.find(f => f.label === label)?.value; - - // Remove extracted customFields from the passthrough list - const filterCustomFields = (...labels: string[]) => { - const labelSet = new Set(labels); - return (raw.customFields || []).filter(f => !labelSet.has(f.label)); - }; - - switch (canonicalType) { - case "risk": { - return { - ...base, - entityType: "risk" as const, - // Zod safeParse validates these enum values; mismatches produce warnings - severity: raw.severity as RiskEntity["severity"], - likelihood: raw.likelihood, - timeframe: raw.timeframe, - maturity: raw.maturity as RiskEntity["maturity"], - riskCategory: getRiskCategory(raw.id), - }; - } - - case "person": { - // Merge expert data if available - const expert = experts.get(raw.id); - const org = expert?.affiliation ? orgs.get(expert.affiliation) : null; - const role = expert?.role || cf("Role"); - const knownForStr = cf("Known For"); - const knownFor = expert?.knownFor || - (knownForStr ? 
knownForStr.split(",").map(s => s.trim()).filter(Boolean) : []); - const affiliation = org?.name || expert?.affiliation || cf("Affiliation"); - - return { - ...base, - entityType: "person" as const, - title: expert?.name || raw.title, - website: expert?.website || raw.website, - role, - affiliation, - knownFor, - customFields: filterCustomFields("Role", "Known For", "Affiliation"), - }; - } - - case "organization": { - // Determine orgType from old lab-* type (values match OrganizationEntity["orgType"]) - const orgType = OLD_LAB_TYPE_TO_ORG_TYPE[oldType] as OrganizationEntity["orgType"] | undefined; - // Merge org data if available - const orgData = orgs.get(raw.id); - return { - ...base, - entityType: "organization" as const, - orgType: orgType || (orgData?.type as OrganizationEntity["orgType"]) || undefined, - founded: orgData?.founded || cf("Founded") || cf("Established"), - headquarters: orgData?.headquarters || cf("Location") || cf("Headquarters"), - employees: orgData?.employees || cf("Employees"), - funding: orgData?.funding || cf("Funding"), - website: orgData?.website || raw.website, - title: orgData?.name || raw.title, - customFields: filterCustomFields("Founded", "Established", "Location", "Headquarters", "Employees", "Funding"), - }; - } - - case "policy": { - return { - ...base, - entityType: "policy" as const, - introduced: cf("Introduced") || cf("Established"), - policyStatus: cf("Status"), - author: cf("Author"), - scope: cf("Scope"), - customFields: filterCustomFields("Introduced", "Established", "Status", "Author", "Scope"), - }; - } - - case "approach": - return { ...base, entityType: "approach" as const }; - case "safety-agenda": - return { ...base, entityType: "safety-agenda" as const, goal: cf("Goal") }; - case "concept": - return { ...base, entityType: "concept" as const }; - case "crux": - return { ...base, entityType: "crux" as const }; - case "model": - return { ...base, entityType: "model" as const }; - case "capability": - return { ...base, entityType: "capability" as const }; - case "project": - return { ...base, entityType: "project" as const }; - case "analysis": - return { ...base, entityType: "analysis" as const }; - case "historical": - return { ...base, entityType: "historical" as const }; - case "argument": - return { ...base, entityType: "argument" as const }; - case "scenario": - return { ...base, entityType: "scenario" as const }; - case "case-study": - return { ...base, entityType: "case-study" as const }; - case "funder": - return { ...base, entityType: "funder" as const }; - case "resource": - return { ...base, entityType: "resource" as const }; - case "parameter": - return { ...base, entityType: "parameter" as const }; - case "metric": - return { ...base, entityType: "metric" as const }; - case "risk-factor": - return { ...base, entityType: "risk-factor" as const }; - - default: { - // Unknown types (ai-transition-model-* etc.) — validated as generic entity - const generic = GenericEntitySchema.safeParse({ ...base, entityType: canonicalType }); - return generic.success ? 
generic.data : { ...base, entityType: canonicalType }; - } - } -} - // ============================================================================ // DATABASE LOADING // ============================================================================ @@ -344,8 +119,7 @@ function getDatabase(): DatabaseShape { try { const raw = fs.readFileSync(dbPath, "utf-8"); - const rawDb = JSON.parse(raw) as DatabaseShape; - _database = applyEntityOverrides(rawDb); + _database = JSON.parse(raw) as DatabaseShape; } catch (err) { throw new Error( `Failed to load database from ${dbPath}: ${err instanceof Error ? err.message : err}. ` + @@ -359,32 +133,22 @@ function getTypedEntities(): AnyEntity[] { if (_typedEntities) return _typedEntities; const db = getDatabase(); - const expertMap = new Map((db.experts || []).map(e => [e.id, e])); - const orgMap = new Map((db.organizations || []).map(o => [o.id, o])); - - const entities: AnyEntity[] = []; - const isDev = process.env.NODE_ENV === "development"; - - for (const raw of db.entities || []) { - const typed = transformEntity(raw, expertMap, orgMap); - if (!typed) continue; - - // Build-time validation via Zod - const result = TypedEntitySchema.safeParse(typed); - if (!result.success) { - if (isDev) { - console.warn( - `[entity-validation] ${raw.id} (${raw.type} → ${typed.entityType}): ${result.error.issues.map(i => i.message).join(", ")}` - ); - } - // Still include the entity — the generic fallback handles unknown types - entities.push(typed); - } else { - entities.push(result.data); - } + + if (!db.typedEntities || db.typedEntities.length === 0) { + throw new Error( + "database.json has no typedEntities. Rebuild with: pnpm --filter longterm build:data" + ); } - _typedEntities = entities; + // Pre-transformed entities from build time + _typedEntities = db.typedEntities.map((raw) => { + const parsed = TypedEntitySchema.safeParse(raw); + if (parsed.success) return parsed.data; + const generic = GenericEntitySchema.safeParse(raw); + if (generic.success) return generic.data; + return null; + }).filter((e): e is AnyEntity => e !== null); + return _typedEntities; } @@ -875,56 +639,6 @@ export function getOrgInfoBoxData(orgId: string) { return getEntityInfoBoxData(orgId); } -// ============================================================================ -// RISK CATEGORIES (inline minimal version) -// ============================================================================ - -const RISK_CATEGORIES = { - epistemic: [ - "authentication-collapse", - "automation-bias", - "consensus-manufacturing", - "epistemic-collapse", - "epistemic-sycophancy", - "trust-cascade", - "trust-decline", - ], - misuse: [ - "authoritarian-tools", - "autonomous-weapons", - "bioweapons", - "cyberweapons", - "deepfakes", - "disinformation", - "fraud", - "surveillance", - ], - structural: [ - "concentration-of-power", - "economic-disruption", - "enfeeblement", - "lock-in", - "racing-dynamics", - "winner-take-all", - ], -} as const; - -function getRiskCategory( - riskId: string -): "epistemic" | "misuse" | "structural" | "accident" { - if ( - (RISK_CATEGORIES.epistemic as readonly string[]).includes(riskId) - ) - return "epistemic"; - if ((RISK_CATEGORIES.misuse as readonly string[]).includes(riskId)) - return "misuse"; - if ( - (RISK_CATEGORIES.structural as readonly string[]).includes(riskId) - ) - return "structural"; - return "accident"; -} - // ============================================================================ // EXTERNAL LINKS (loads YAML via fs) // 
============================================================================ diff --git a/apps/longterm/scripts/build-data.mjs b/apps/longterm/scripts/build-data.mjs index 64a8eab5..ff41e53b 100644 --- a/apps/longterm/scripts/build-data.mjs +++ b/apps/longterm/scripts/build-data.mjs @@ -8,204 +8,22 @@ * Usage: node scripts/build-data.mjs */ -import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'fs'; +import { readFileSync, writeFileSync, existsSync, readdirSync, statSync } from 'fs'; import { spawnSync } from 'child_process'; import { join, basename, relative } from 'path'; import { parse } from 'yaml'; -import { extractMetrics, suggestQuality, getQualityDiscrepancy } from './lib/metrics-extractor.mjs'; +import { extractMetrics, suggestQuality } from './lib/metrics-extractor.mjs'; import { computeRedundancy } from './lib/redundancy.mjs'; import { CONTENT_DIR, DATA_DIR } from './lib/content-types.mjs'; import { generateLLMFiles } from './generate-llm-files.mjs'; - -// ============================================================================= -// UNCONVERTED LINK DETECTION -// ============================================================================= - -/** - * Normalize URL to handle variations (trailing slashes, www prefix, http/https) - */ -function normalizeUrl(url) { - const variations = new Set(); - try { - const parsed = new URL(url); - const base = parsed.href.replace(/\/$/, ''); - variations.add(base); - variations.add(base + '/'); - - // Without www - if (parsed.hostname.startsWith('www.')) { - const noWww = base.replace('://www.', '://'); - variations.add(noWww); - variations.add(noWww + '/'); - } - // With www - if (!parsed.hostname.startsWith('www.')) { - const withWww = base.replace('://', '://www.'); - variations.add(withWww); - variations.add(withWww + '/'); - } - } catch { - variations.add(url); - } - return Array.from(variations); -} - -/** - * Build URL → resource map from resources - */ -function buildUrlToResourceMap(resources) { - const urlToResource = new Map(); - for (const r of resources) { - if (!r.url) continue; - const normalizedUrls = normalizeUrl(r.url); - for (const url of normalizedUrls) { - urlToResource.set(url, r); - } - } - return urlToResource; -} - -/** - * Extract markdown links from content (not images, not internal, not components) - */ -function extractMarkdownLinks(content) { - const links = []; - // Match [text](url) but not images ![text](url) - const linkRegex = /(? component usages in content (already converted links) - */ -function countConvertedLinks(content) { - // Match or ... 
- const rComponentRegex = / -`; -} - -/** - * Generate MDX files for entities with YAML-first content structure - * Only generates/updates files that are marked as generated stubs - */ -function generateMdxFromYaml(entities, options = { dryRun: false }) { - const generated = []; - const skipped = []; - - for (const entity of entities) { - // Only process entities with content field and path - if (!entity.content || !entity.path) continue; - - // Convert URL path to file path - // e.g., /ai-transition-model/scenarios/human-catastrophe/state-actor/ - // -> src/content/docs/ai-transition-model/scenarios/human-catastrophe/state-actor.mdx - const urlPath = entity.path.replace(/^\/|\/$/g, ''); // Remove leading/trailing slashes - const mdxPath = join(CONTENT_DIR, `${urlPath}.mdx`); - - // Check if we should generate this file - if (!shouldGenerateMdx(mdxPath, entity)) { - skipped.push({ id: entity.id, path: mdxPath, reason: 'custom content' }); - continue; - } - - const mdxContent = generateMdxStub(entity); - - if (options.dryRun) { - generated.push({ id: entity.id, path: mdxPath, action: 'would generate' }); - } else { - // Ensure directory exists - const dir = join(mdxPath, '..'); - if (!existsSync(dir)) { - mkdirSync(dir, { recursive: true }); - } - - writeFileSync(mdxPath, mdxContent); - generated.push({ id: entity.id, path: mdxPath, action: 'generated' }); - } - } - - return { generated, skipped }; -} - // Files to combine const DATA_FILES = [ { key: 'experts', file: 'experts.yaml' }, @@ -497,364 +315,6 @@ function buildPathRegistry() { return registry; } -/** - * Compute aggregate statistics - */ -function computeStats(entities, backlinks, tagIndex) { - // Count by type - const byType = {}; - for (const entity of entities) { - byType[entity.type] = (byType[entity.type] || 0) + 1; - } - - // Count by severity - const bySeverity = {}; - for (const entity of entities) { - if (entity.severity) { - bySeverity[entity.severity] = (bySeverity[entity.severity] || 0) + 1; - } - } - - // Count by status - const byStatus = {}; - for (const entity of entities) { - const status = entity.status || 'unknown'; - byStatus[status] = (byStatus[status] || 0) + 1; - } - - // Recently updated (sort by lastUpdated, take top 10) - const recentlyUpdated = entities - .filter((e) => e.lastUpdated) - .sort((a, b) => b.lastUpdated.localeCompare(a.lastUpdated)) - .slice(0, 10) - .map((e) => ({ - id: e.id, - type: e.type, - title: e.title, - lastUpdated: e.lastUpdated, - })); - - // Most linked (entities with most backlinks) - const mostLinked = Object.entries(backlinks) - .map(([id, links]) => ({ - id, - count: links.length, - entity: entities.find((e) => e.id === id), - })) - .filter((item) => item.entity) - .sort((a, b) => b.count - a.count) - .slice(0, 10) - .map((item) => ({ - id: item.id, - type: item.entity.type, - title: item.entity.title, - backlinkCount: item.count, - })); - - // Tag statistics - const topTags = Object.entries(tagIndex) - .map(([tag, entities]) => ({ tag, count: entities.length })) - .sort((a, b) => b.count - a.count) - .slice(0, 20); - - // Entities with descriptions - const withDescription = entities.filter((e) => e.description).length; - - return { - totalEntities: entities.length, - byType, - bySeverity, - byStatus, - recentlyUpdated, - mostLinked, - topTags, - totalTags: Object.keys(tagIndex).length, - withDescription, - lastBuilt: new Date().toISOString(), - }; -} - -// ============================================================================= -// COMPUTED FACTS — expression evaluator 
and numeric parser -// ============================================================================= - -/** - * Auto-parse a numeric value from a human-readable string. - * Returns null if the string can't be reliably parsed. - * - * Examples: - * "$350 billion" → 350_000_000_000 - * "$13 billion" → 13_000_000_000 - * "$3.4 billion" → 3_400_000_000 - * "100 million" → 100_000_000 - * "$76,001/year" → 76001 - * "175 billion" → 175_000_000_000 - * "1,900" → 1900 - * "40%" → 0.4 - * "83%" → 0.83 - */ -function parseNumericValue(value) { - if (!value || typeof value !== 'string') return null; - - // Skip ranges and ambiguous values - if (value.includes(' to ') || (value.includes('-') && value.match(/\d+-\d/))) return null; - if (value.includes('+') && !value.startsWith('+')) return null; // "300,000+" is ambiguous - - const s = value.trim(); - - // Percentage: "40%" → 0.4 - const pctMatch = s.match(/^(\d+(?:\.\d+)?)%$/); - if (pctMatch) return parseFloat(pctMatch[1]) / 100; - - // Dollar + number + unit: "$13 billion", "$3.4 million" - const dollarUnitMatch = s.match(/^\$?([\d,.]+)\s*(billion|million|trillion|thousand)?\s*(?:\/\w+)?$/i); - if (dollarUnitMatch) { - const num = parseFloat(dollarUnitMatch[1].replace(/,/g, '')); - if (isNaN(num)) return null; - const unit = (dollarUnitMatch[2] || '').toLowerCase(); - const multipliers = { trillion: 1e12, billion: 1e9, million: 1e6, thousand: 1e3, '': 1 }; - return num * (multipliers[unit] || 1); - } - - // Plain number with possible commas: "1,900" - const plainMatch = s.match(/^[\d,]+(?:\.\d+)?$/); - if (plainMatch) { - return parseFloat(s.replace(/,/g, '')); - } - - return null; -} - -/** - * Safe expression evaluator for computed facts. - * Supports: numbers, +, -, *, /, parentheses, and {entity.factId} references. - * - * Uses recursive descent parsing — no eval(). 
- */ -function evaluateExpression(expression, facts) { - // Replace {entity.factId} references with numeric values - const resolved = expression.replace(/\{([^}]+)\}/g, (match, ref) => { - const fact = facts[ref]; - if (!fact) { - throw new Error(`Unknown fact reference: ${ref}`); - } - if (fact.noCompute) { - throw new Error(`Fact ${ref} is marked noCompute (not a computable quantity)`); - } - if (fact.numeric == null) { - throw new Error(`Fact ${ref} has no numeric value`); - } - return String(fact.numeric); - }); - - // Tokenize - const tokens = []; - let i = 0; - while (i < resolved.length) { - if (/\s/.test(resolved[i])) { i++; continue; } - if ('+-*/()'.includes(resolved[i])) { - tokens.push({ type: 'op', value: resolved[i] }); - i++; - } else if (/[\d.]/.test(resolved[i])) { - let num = ''; - while (i < resolved.length && /[\d.eE]/.test(resolved[i])) { - num += resolved[i]; i++; - } - // Handle signed exponent (e.g., 3.5e+12, 1e-7) - if (/[eE]$/.test(num) && i < resolved.length && (resolved[i] === '+' || resolved[i] === '-')) { - num += resolved[i]; i++; - while (i < resolved.length && /\d/.test(resolved[i])) { - num += resolved[i]; i++; - } - } - tokens.push({ type: 'num', value: parseFloat(num) }); - } else { - throw new Error(`Unexpected character in expression: "${resolved[i]}" at position ${i}`); - } - } - - // Recursive descent parser - let pos = 0; - function peek() { return tokens[pos]; } - function consume(expected) { - const t = tokens[pos++]; - if (expected && (t?.type !== 'op' || t?.value !== expected)) { - throw new Error(`Expected "${expected}" but got "${t?.value}"`); - } - return t; - } - - function parseExpr() { - let left = parseTerm(); - while (peek()?.type === 'op' && (peek().value === '+' || peek().value === '-')) { - const op = consume().value; - const right = parseTerm(); - left = op === '+' ? left + right : left - right; - } - return left; - } - - function parseTerm() { - let left = parseFactor(); - while (peek()?.type === 'op' && (peek().value === '*' || peek().value === '/')) { - const op = consume().value; - const right = parseFactor(); - if (op === '/') { - if (right === 0) throw new Error('Division by zero'); - left = left / right; - } else { - left = left * right; - } - } - return left; - } - - function parseFactor() { - const t = peek(); - if (!t) throw new Error('Unexpected end of expression'); - - if (t.type === 'num') { - pos++; - return t.value; - } - if (t.type === 'op' && t.value === '(') { - consume('('); - const val = parseExpr(); - consume(')'); - return val; - } - if (t.type === 'op' && t.value === '-') { - consume(); - return -parseFactor(); - } - throw new Error(`Unexpected token: ${JSON.stringify(t)}`); - } - - const result = parseExpr(); - if (pos < tokens.length) { - throw new Error(`Unexpected tokens after expression: ${tokens.slice(pos).map(t => t.value).join(' ')}`); - } - return result; -} - -/** - * Check if a compute expression references any currency-denominated facts. - */ -function isCurrencyExpression(expression, facts) { - const refRegex = /\{([^}]+)\}/g; - let m; - while ((m = refRegex.exec(expression)) !== null) { - const fact = facts[m[1]]; - if (fact?.value && fact.value.trim().startsWith('$')) return true; - } - return false; -} - -/** - * Format a computed numeric value for display. 
- * @param {number} numeric - The computed value - * @param {string|undefined} format - Printf-style format string - * @param {number|undefined} formatDivisor - Divisor before formatting - * @param {boolean} isCurrency - Whether the result is a dollar amount - */ -function formatComputedValue(numeric, format, formatDivisor, isCurrency = false) { - if (!isFinite(numeric)) throw new Error(`Computed value is ${numeric} (expected a finite number)`); - const displayNum = formatDivisor ? numeric / formatDivisor : numeric; - - if (!format) { - const prefix = isCurrency ? '$' : ''; - const n = displayNum; - // Default: reasonable formatting for large numbers - if (Math.abs(n) >= 1e12) return `${prefix}${(n / 1e12).toFixed(1)} trillion`; - if (Math.abs(n) >= 1e9) return `${prefix}${(n / 1e9).toFixed(1)} billion`; - if (Math.abs(n) >= 1e6) return `${prefix}${(n / 1e6).toFixed(1)} million`; - return isCurrency ? `${prefix}${n.toLocaleString('en-US')}` : n.toLocaleString('en-US'); - } - - // Simple printf-style: replace %.Nf with the formatted number - return format.replace(/%(?:\.(\d+))?f/, (_, decimals) => { - const d = decimals ? parseInt(decimals) : 0; - return displayNum.toFixed(d); - }); -} - -/** - * Resolve all computed facts in dependency order. - * Returns count of computed facts. - */ -function resolveComputedFacts(facts) { - // Find all computed facts - const computed = Object.entries(facts).filter(([, f]) => f.compute); - if (computed.length === 0) return 0; - - // Extract dependencies for each computed fact - const deps = new Map(); - for (const [key, fact] of computed) { - const refs = []; - const refRegex = /\{([^}]+)\}/g; - let m; - while ((m = refRegex.exec(fact.compute)) !== null) { - refs.push(m[1]); - } - deps.set(key, refs); - } - - // Topological sort (Kahn's algorithm) - const inDegree = new Map(); - const graph = new Map(); - for (const [key, refKeys] of deps) { - inDegree.set(key, 0); - graph.set(key, []); - } - for (const [key, refKeys] of deps) { - for (const ref of refKeys) { - if (deps.has(ref)) { - // ref is also a computed fact → key depends on ref - graph.get(ref).push(key); - inDegree.set(key, (inDegree.get(key) || 0) + 1); - } - } - } - - const queue = []; - for (const [key, deg] of inDegree) { - if (deg === 0) queue.push(key); - } - - const order = []; - while (queue.length > 0) { - const current = queue.shift(); - order.push(current); - for (const dependent of (graph.get(current) || [])) { - inDegree.set(dependent, inDegree.get(dependent) - 1); - if (inDegree.get(dependent) === 0) queue.push(dependent); - } - } - - if (order.length !== computed.length) { - const missing = computed.map(([k]) => k).filter(k => !order.includes(k)); - throw new Error(`Circular dependency in computed facts: ${missing.join(', ')}`); - } - - // Evaluate in order - let resolved = 0; - for (const key of order) { - const fact = facts[key]; - try { - const numeric = evaluateExpression(fact.compute, facts); - fact.numeric = numeric; - const currency = isCurrencyExpression(fact.compute, facts); - fact.value = formatComputedValue(numeric, fact.format, fact.formatDivisor, currency); - fact.computed = true; - resolved++; - } catch (err) { - console.warn(` ⚠️ Failed to compute ${key}: ${err.message}`); - } - } - - return resolved; -} - function main() { console.log('Building data bundle...\n'); @@ -1061,6 +521,16 @@ function main() { } database.insights = insightsList; + // Transform entities to typed entities (build-time transformation) + const typedEntities = transformEntities( + 
database.entities, + pages, + database.experts, + database.organizations + ); + database.typedEntities = typedEntities; + console.log(` typedEntities: ${typedEntities.length}`); + // Write combined JSON writeFileSync(OUTPUT_FILE, JSON.stringify(database, null, 2)); console.log(`\n✓ Written: ${OUTPUT_FILE}`); diff --git a/apps/longterm/scripts/lib/computed-facts.mjs b/apps/longterm/scripts/lib/computed-facts.mjs new file mode 100644 index 00000000..76c4a792 --- /dev/null +++ b/apps/longterm/scripts/lib/computed-facts.mjs @@ -0,0 +1,285 @@ +/** + * Computed Facts — expression evaluator and numeric parser + * + * Parses human-readable numeric strings and evaluates computed fact expressions + * with dependency ordering (topological sort). + * + * Extracted from build-data.mjs for modularity. + */ + +/** + * Auto-parse a numeric value from a human-readable string. + * Returns null if the string can't be reliably parsed. + * + * Examples: + * "$350 billion" → 350_000_000_000 + * "$13 billion" → 13_000_000_000 + * "$3.4 billion" → 3_400_000_000 + * "100 million" → 100_000_000 + * "$76,001/year" → 76001 + * "175 billion" → 175_000_000_000 + * "1,900" → 1900 + * "40%" → 0.4 + * "83%" → 0.83 + */ +export function parseNumericValue(value) { + if (!value || typeof value !== 'string') return null; + + // Skip ranges and ambiguous values + if (value.includes(' to ') || (value.includes('-') && value.match(/\d+-\d/))) return null; + if (value.includes('+') && !value.startsWith('+')) return null; // "300,000+" is ambiguous + + const s = value.trim(); + + // Percentage: "40%" → 0.4 + const pctMatch = s.match(/^(\d+(?:\.\d+)?)%$/); + if (pctMatch) return parseFloat(pctMatch[1]) / 100; + + // Dollar + number + unit: "$13 billion", "$3.4 million" + const dollarUnitMatch = s.match(/^\$?([\d,.]+)\s*(billion|million|trillion|thousand)?\s*(?:\/\w+)?$/i); + if (dollarUnitMatch) { + const num = parseFloat(dollarUnitMatch[1].replace(/,/g, '')); + if (isNaN(num)) return null; + const unit = (dollarUnitMatch[2] || '').toLowerCase(); + const multipliers = { trillion: 1e12, billion: 1e9, million: 1e6, thousand: 1e3, '': 1 }; + return num * (multipliers[unit] || 1); + } + + // Plain number with possible commas: "1,900" + const plainMatch = s.match(/^[\d,]+(?:\.\d+)?$/); + if (plainMatch) { + return parseFloat(s.replace(/,/g, '')); + } + + return null; +} + +/** + * Safe expression evaluator for computed facts. + * Supports: numbers, +, -, *, /, parentheses, and {entity.factId} references. + * + * Uses recursive descent parsing — no eval(). 
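+ *
+ * Illustrative example (the fact key here is hypothetical):
+ *   evaluateExpression('{us.gdp} * 0.01', { 'us.gdp': { numeric: 2.5e13 } })
+ *   // → 250000000000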
+ */ +function evaluateExpression(expression, facts) { + // Replace {entity.factId} references with numeric values + const resolved = expression.replace(/\{([^}]+)\}/g, (match, ref) => { + const fact = facts[ref]; + if (!fact) { + throw new Error(`Unknown fact reference: ${ref}`); + } + if (fact.noCompute) { + throw new Error(`Fact ${ref} is marked noCompute (not a computable quantity)`); + } + if (fact.numeric == null) { + throw new Error(`Fact ${ref} has no numeric value`); + } + return String(fact.numeric); + }); + + // Tokenize + const tokens = []; + let i = 0; + while (i < resolved.length) { + if (/\s/.test(resolved[i])) { i++; continue; } + if ('+-*/()'.includes(resolved[i])) { + tokens.push({ type: 'op', value: resolved[i] }); + i++; + } else if (/[\d.]/.test(resolved[i])) { + let num = ''; + while (i < resolved.length && /[\d.eE]/.test(resolved[i])) { + num += resolved[i]; i++; + } + // Handle signed exponent (e.g., 3.5e+12, 1e-7) + if (/[eE]$/.test(num) && i < resolved.length && (resolved[i] === '+' || resolved[i] === '-')) { + num += resolved[i]; i++; + while (i < resolved.length && /\d/.test(resolved[i])) { + num += resolved[i]; i++; + } + } + tokens.push({ type: 'num', value: parseFloat(num) }); + } else { + throw new Error(`Unexpected character in expression: "${resolved[i]}" at position ${i}`); + } + } + + // Recursive descent parser + let pos = 0; + function peek() { return tokens[pos]; } + function consume(expected) { + const t = tokens[pos++]; + if (expected && (t?.type !== 'op' || t?.value !== expected)) { + throw new Error(`Expected "${expected}" but got "${t?.value}"`); + } + return t; + } + + function parseExpr() { + let left = parseTerm(); + while (peek()?.type === 'op' && (peek().value === '+' || peek().value === '-')) { + const op = consume().value; + const right = parseTerm(); + left = op === '+' ? left + right : left - right; + } + return left; + } + + function parseTerm() { + let left = parseFactor(); + while (peek()?.type === 'op' && (peek().value === '*' || peek().value === '/')) { + const op = consume().value; + const right = parseFactor(); + if (op === '/') { + if (right === 0) throw new Error('Division by zero'); + left = left / right; + } else { + left = left * right; + } + } + return left; + } + + function parseFactor() { + const t = peek(); + if (!t) throw new Error('Unexpected end of expression'); + + if (t.type === 'num') { + pos++; + return t.value; + } + if (t.type === 'op' && t.value === '(') { + consume('('); + const val = parseExpr(); + consume(')'); + return val; + } + if (t.type === 'op' && t.value === '-') { + consume(); + return -parseFactor(); + } + throw new Error(`Unexpected token: ${JSON.stringify(t)}`); + } + + const result = parseExpr(); + if (pos < tokens.length) { + throw new Error(`Unexpected tokens after expression: ${tokens.slice(pos).map(t => t.value).join(' ')}`); + } + return result; +} + +/** + * Check if a compute expression references any currency-denominated facts. + */ +function isCurrencyExpression(expression, facts) { + const refRegex = /\{([^}]+)\}/g; + let m; + while ((m = refRegex.exec(expression)) !== null) { + const fact = facts[m[1]]; + if (fact?.value && fact.value.trim().startsWith('$')) return true; + } + return false; +} + +/** + * Format a computed numeric value for display. 
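+ *
+ * Illustrative examples:
+ *   formatComputedValue(3.5e9, undefined, undefined, true) // → "$3.5 billion"
+ *   formatComputedValue(0.42, '%.1f', undefined)           // → "0.4"
+ *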
+ * @param {number} numeric - The computed value + * @param {string|undefined} format - Printf-style format string + * @param {number|undefined} formatDivisor - Divisor before formatting + * @param {boolean} isCurrency - Whether the result is a dollar amount + */ +function formatComputedValue(numeric, format, formatDivisor, isCurrency = false) { + if (!isFinite(numeric)) throw new Error(`Computed value is ${numeric} (expected a finite number)`); + const displayNum = formatDivisor ? numeric / formatDivisor : numeric; + + if (!format) { + const prefix = isCurrency ? '$' : ''; + const n = displayNum; + // Default: reasonable formatting for large numbers + if (Math.abs(n) >= 1e12) return `${prefix}${(n / 1e12).toFixed(1)} trillion`; + if (Math.abs(n) >= 1e9) return `${prefix}${(n / 1e9).toFixed(1)} billion`; + if (Math.abs(n) >= 1e6) return `${prefix}${(n / 1e6).toFixed(1)} million`; + return isCurrency ? `${prefix}${n.toLocaleString('en-US')}` : n.toLocaleString('en-US'); + } + + // Simple printf-style: replace %.Nf with the formatted number + return format.replace(/%(?:\.(\d+))?f/, (_, decimals) => { + const d = decimals ? parseInt(decimals) : 0; + return displayNum.toFixed(d); + }); +} + +/** + * Resolve all computed facts in dependency order. + * Returns count of computed facts. + */ +export function resolveComputedFacts(facts) { + // Find all computed facts + const computed = Object.entries(facts).filter(([, f]) => f.compute); + if (computed.length === 0) return 0; + + // Extract dependencies for each computed fact + const deps = new Map(); + for (const [key, fact] of computed) { + const refs = []; + const refRegex = /\{([^}]+)\}/g; + let m; + while ((m = refRegex.exec(fact.compute)) !== null) { + refs.push(m[1]); + } + deps.set(key, refs); + } + + // Topological sort (Kahn's algorithm) + const inDegree = new Map(); + const graph = new Map(); + for (const [key, refKeys] of deps) { + inDegree.set(key, 0); + graph.set(key, []); + } + for (const [key, refKeys] of deps) { + for (const ref of refKeys) { + if (deps.has(ref)) { + // ref is also a computed fact → key depends on ref + graph.get(ref).push(key); + inDegree.set(key, (inDegree.get(key) || 0) + 1); + } + } + } + + const queue = []; + for (const [key, deg] of inDegree) { + if (deg === 0) queue.push(key); + } + + const order = []; + while (queue.length > 0) { + const current = queue.shift(); + order.push(current); + for (const dependent of (graph.get(current) || [])) { + inDegree.set(dependent, inDegree.get(dependent) - 1); + if (inDegree.get(dependent) === 0) queue.push(dependent); + } + } + + if (order.length !== computed.length) { + const missing = computed.map(([k]) => k).filter(k => !order.includes(k)); + throw new Error(`Circular dependency in computed facts: ${missing.join(', ')}`); + } + + // Evaluate in order + let resolved = 0; + for (const key of order) { + const fact = facts[key]; + try { + const numeric = evaluateExpression(fact.compute, facts); + fact.numeric = numeric; + const currency = isCurrencyExpression(fact.compute, facts); + fact.value = formatComputedValue(numeric, fact.format, fact.formatDivisor, currency); + fact.computed = true; + resolved++; + } catch (err) { + console.warn(` ⚠️ Failed to compute ${key}: ${err.message}`); + } + } + + return resolved; +} diff --git a/apps/longterm/scripts/lib/entity-transform.mjs b/apps/longterm/scripts/lib/entity-transform.mjs new file mode 100644 index 00000000..b31be2e3 --- /dev/null +++ b/apps/longterm/scripts/lib/entity-transform.mjs @@ -0,0 +1,333 @@ +/** + * Entity 
Transformation + * + * Transforms raw database.json entities into typed entities at build time. + * This replaces the runtime transformation that was previously done in + * longterm-next/src/data/index.ts. + * + * Handles: + * - Type mapping (old types → canonical entityType) + * - Expert/org data merging + * - Risk category assignment + * - CustomField extraction into typed fields + * - Entity type overrides (path-based and explicit) + */ + +// ============================================================================ +// TYPE MAPS (mirrored from entity-schemas.ts) +// ============================================================================ + +/** + * Maps old database.json `type` values to canonical `entityType` values. + * Types not listed here map to themselves. + */ +const OLD_TYPE_MAP = { + // Lab types → organization + lab: 'organization', + 'lab-frontier': 'organization', + 'lab-research': 'organization', + 'lab-academic': 'organization', + 'lab-startup': 'organization', + // Researcher → person + researcher: 'person', +}; + +/** + * Maps old lab types to orgType values. + */ +const OLD_LAB_TYPE_TO_ORG_TYPE = { + lab: 'generic', + 'lab-frontier': 'frontier-lab', + 'lab-research': 'safety-org', + 'lab-academic': 'academic', + 'lab-startup': 'startup', +}; + +// ============================================================================ +// RISK CATEGORIES +// ============================================================================ + +const RISK_CATEGORIES = { + epistemic: [ + 'authentication-collapse', + 'automation-bias', + 'consensus-manufacturing', + 'epistemic-collapse', + 'epistemic-sycophancy', + 'trust-cascade', + 'trust-decline', + ], + misuse: [ + 'authoritarian-tools', + 'autonomous-weapons', + 'bioweapons', + 'cyberweapons', + 'deepfakes', + 'disinformation', + 'fraud', + 'surveillance', + ], + structural: [ + 'concentration-of-power', + 'economic-disruption', + 'enfeeblement', + 'lock-in', + 'racing-dynamics', + 'winner-take-all', + ], +}; + +function getRiskCategory(riskId) { + if (RISK_CATEGORIES.epistemic.includes(riskId)) return 'epistemic'; + if (RISK_CATEGORIES.misuse.includes(riskId)) return 'misuse'; + if (RISK_CATEGORIES.structural.includes(riskId)) return 'structural'; + return 'accident'; +} + +// ============================================================================ +// ENTITY TYPE OVERRIDES +// ============================================================================ + +/** + * Path patterns that should be treated as "project" type. + * Matches against the page path or entity path. + */ +const PROJECT_PATH_PATTERNS = [ + '/knowledge-base/responses/epistemic-tools/tools/', +]; + +/** + * Explicit entity ID → type overrides. + */ +const ENTITY_TYPE_OVERRIDES = { + // Add individual overrides here as needed, e.g.: + // "some-entity-id": "project", +}; + +/** + * Apply entity type overrides based on path patterns and explicit overrides. + * Also creates entities for pages in project paths that don't have entities yet. 
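+ *
+ * For example (hypothetical page id): a page at
+ * /knowledge-base/responses/epistemic-tools/tools/some-tool/ with no entity
+ * record yields a synthetic { id, type: 'project', title, ... } entity.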
+ */ +function applyEntityOverrides(entities, pages) { + // Build a set of page IDs that match project path patterns + const projectPageIds = new Set(); + for (const page of pages || []) { + if (PROJECT_PATH_PATTERNS.some(pattern => page.path?.includes(pattern))) { + projectPageIds.add(page.id); + } + } + + // Apply overrides to entities + const overriddenEntities = entities.map(entity => { + // Check explicit overrides first + if (ENTITY_TYPE_OVERRIDES[entity.id]) { + return { ...entity, type: ENTITY_TYPE_OVERRIDES[entity.id] }; + } + // Check path-based overrides + if (projectPageIds.has(entity.id)) { + return { ...entity, type: 'project' }; + } + return entity; + }); + + // Also create entities for pages in project paths that don't have entities yet + const entityIds = new Set(overriddenEntities.map(e => e.id)); + const newEntities = []; + for (const page of pages || []) { + if (projectPageIds.has(page.id) && !entityIds.has(page.id)) { + newEntities.push({ + id: page.id, + type: 'project', + title: page.title, + description: page.llmSummary || page.description || undefined, + tags: page.tags || [], + lastUpdated: page.lastUpdated || undefined, + }); + } + } + + return [...overriddenEntities, ...newEntities]; +} + +// ============================================================================ +// ENTITY TRANSFORMATION +// ============================================================================ + +/** + * Transform a raw entity into a typed entity. + * - Maps old type names to canonical entityType + * - Flattens lab-* → organization with orgType + * - Extracts customFields into typed fields for researcher → person, policy, etc. + */ +function transformEntity(raw, expertMap, orgMap) { + const oldType = raw.type; + const canonicalType = OLD_TYPE_MAP[oldType] || oldType; + + // Build base fields shared across all types + const base = { + id: raw.id, + title: raw.title, + description: raw.description, + tags: raw.tags || [], + clusters: raw.clusters || [], + relatedEntries: raw.relatedEntries || [], + sources: raw.sources || [], + lastUpdated: raw.lastUpdated, + website: raw.website, + numericId: raw.numericId, + path: raw.path, + status: raw.status, + customFields: raw.customFields || [], + relatedTopics: raw.relatedTopics || [], + }; + + // Helper to find a customField value + const cf = (label) => + raw.customFields?.find(f => f.label === label)?.value; + + // Remove extracted customFields from the passthrough list + const filterCustomFields = (...labels) => { + const labelSet = new Set(labels); + return (raw.customFields || []).filter(f => !labelSet.has(f.label)); + }; + + switch (canonicalType) { + case 'risk': { + return { + ...base, + entityType: 'risk', + severity: raw.severity, + likelihood: raw.likelihood, + timeframe: raw.timeframe, + maturity: raw.maturity, + riskCategory: getRiskCategory(raw.id), + }; + } + + case 'person': { + // Merge expert data if available + const expert = expertMap.get(raw.id); + const org = expert?.affiliation ? orgMap.get(expert.affiliation) : null; + const role = expert?.role || cf('Role'); + const knownForStr = cf('Known For'); + const knownFor = expert?.knownFor || + (knownForStr ? 
knownForStr.split(',').map(s => s.trim()).filter(Boolean) : []); + const affiliation = org?.name || expert?.affiliation || cf('Affiliation'); + + return { + ...base, + entityType: 'person', + title: expert?.name || raw.title, + website: expert?.website || raw.website, + role, + affiliation, + knownFor, + customFields: filterCustomFields('Role', 'Known For', 'Affiliation'), + }; + } + + case 'organization': { + // Determine orgType from old lab-* type + const orgType = OLD_LAB_TYPE_TO_ORG_TYPE[oldType] || undefined; + // Merge org data if available + const orgData = orgMap.get(raw.id); + return { + ...base, + entityType: 'organization', + orgType: orgType || orgData?.type || undefined, + founded: orgData?.founded || cf('Founded') || cf('Established'), + headquarters: orgData?.headquarters || cf('Location') || cf('Headquarters'), + employees: orgData?.employees || cf('Employees'), + funding: orgData?.funding || cf('Funding'), + website: orgData?.website || raw.website, + title: orgData?.name || raw.title, + customFields: filterCustomFields('Founded', 'Established', 'Location', 'Headquarters', 'Employees', 'Funding'), + }; + } + + case 'policy': { + return { + ...base, + entityType: 'policy', + introduced: cf('Introduced') || cf('Established'), + policyStatus: cf('Status'), + author: cf('Author'), + scope: cf('Scope'), + customFields: filterCustomFields('Introduced', 'Established', 'Status', 'Author', 'Scope'), + }; + } + + case 'approach': + return { ...base, entityType: 'approach' }; + case 'safety-agenda': + return { ...base, entityType: 'safety-agenda', goal: cf('Goal') }; + case 'concept': + return { ...base, entityType: 'concept' }; + case 'crux': + return { ...base, entityType: 'crux' }; + case 'model': + return { ...base, entityType: 'model' }; + case 'capability': + return { ...base, entityType: 'capability' }; + case 'project': + return { ...base, entityType: 'project' }; + case 'analysis': + return { ...base, entityType: 'analysis' }; + case 'historical': + return { ...base, entityType: 'historical' }; + case 'argument': + return { ...base, entityType: 'argument' }; + case 'scenario': + return { ...base, entityType: 'scenario' }; + case 'case-study': + return { ...base, entityType: 'case-study' }; + case 'funder': + return { ...base, entityType: 'funder' }; + case 'resource': + return { ...base, entityType: 'resource' }; + case 'parameter': + return { ...base, entityType: 'parameter' }; + case 'metric': + return { ...base, entityType: 'metric' }; + case 'risk-factor': + return { ...base, entityType: 'risk-factor' }; + + default: { + // Unknown types (ai-transition-model-* etc.) — pass through with entityType + return { ...base, entityType: canonicalType }; + } + } +} + +// ============================================================================ +// ORCHESTRATOR +// ============================================================================ + +/** + * Transform all entities from raw database format to typed entities. 
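+ *
+ * Usage (as wired up in build-data.mjs):
+ *   database.typedEntities =
+ *     transformEntities(database.entities, pages, database.experts, database.organizations);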
+ * + * @param {Array} entities - Raw entity array from database + * @param {Array} pages - Pages array (needed for path-based type overrides) + * @param {Array} experts - Experts array + * @param {Array} organizations - Organizations array + * @returns {Array} Transformed typed entities + */ +export function transformEntities(entities, pages, experts, organizations) { + // Apply entity type overrides first + const overriddenEntities = applyEntityOverrides(entities, pages); + + // Build lookup maps + const expertMap = new Map((experts || []).map(e => [e.id, e])); + const orgMap = new Map((organizations || []).map(o => [o.id, o])); + + // Transform each entity + const typedEntities = []; + for (const raw of overriddenEntities) { + const typed = transformEntity(raw, expertMap, orgMap); + if (typed) { + typedEntities.push(typed); + } + } + + return typedEntities; +} diff --git a/apps/longterm/scripts/lib/mdx-generator.mjs b/apps/longterm/scripts/lib/mdx-generator.mjs new file mode 100644 index 00000000..382edd52 --- /dev/null +++ b/apps/longterm/scripts/lib/mdx-generator.mjs @@ -0,0 +1,98 @@ +/** + * MDX Generation for YAML-first Entities + * + * Generates MDX stub files for entities that have YAML-first content structure. + * Only generates/updates files that are marked as generated stubs. + * + * Extracted from build-data.mjs for modularity. + */ + +import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs'; +import { join } from 'path'; +import { CONTENT_DIR } from './content-types.mjs'; + +/** + * Check if an MDX file needs regeneration based on entity content + * Returns true if the file doesn't exist or is a minimal stub that should be regenerated + */ +function shouldGenerateMdx(mdxPath, entity) { + if (!existsSync(mdxPath)) return true; + + const content = readFileSync(mdxPath, 'utf-8'); + + // If file contains custom content beyond the stub, don't overwrite + // Check for markers that indicate it's a generated stub + const isGeneratedStub = content.includes(' +`; +} + +/** + * Generate MDX files for entities with YAML-first content structure + * Only generates/updates files that are marked as generated stubs + */ +export function generateMdxFromYaml(entities, options = { dryRun: false }) { + const generated = []; + const skipped = []; + + for (const entity of entities) { + // Only process entities with content field and path + if (!entity.content || !entity.path) continue; + + // Convert URL path to file path + // e.g., /ai-transition-model/scenarios/human-catastrophe/state-actor/ + // -> src/content/docs/ai-transition-model/scenarios/human-catastrophe/state-actor.mdx + const urlPath = entity.path.replace(/^\/|\/$/g, ''); // Remove leading/trailing slashes + const mdxPath = join(CONTENT_DIR, `${urlPath}.mdx`); + + // Check if we should generate this file + if (!shouldGenerateMdx(mdxPath, entity)) { + skipped.push({ id: entity.id, path: mdxPath, reason: 'custom content' }); + continue; + } + + const mdxContent = generateMdxStub(entity); + + if (options.dryRun) { + generated.push({ id: entity.id, path: mdxPath, action: 'would generate' }); + } else { + // Ensure directory exists + const dir = join(mdxPath, '..'); + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }); + } + + writeFileSync(mdxPath, mdxContent); + generated.push({ id: entity.id, path: mdxPath, action: 'generated' }); + } + } + + return { generated, skipped }; +} diff --git a/apps/longterm/scripts/lib/statistics.mjs b/apps/longterm/scripts/lib/statistics.mjs new file mode 100644 index 
00000000..b625bd06 --- /dev/null +++ b/apps/longterm/scripts/lib/statistics.mjs @@ -0,0 +1,84 @@ +/** + * Statistics Computation + * + * Computes aggregate statistics about entities, backlinks, and tags. + * + * Extracted from build-data.mjs for modularity. + */ + +/** + * Compute aggregate statistics + */ +export function computeStats(entities, backlinks, tagIndex) { + // Count by type + const byType = {}; + for (const entity of entities) { + byType[entity.type] = (byType[entity.type] || 0) + 1; + } + + // Count by severity + const bySeverity = {}; + for (const entity of entities) { + if (entity.severity) { + bySeverity[entity.severity] = (bySeverity[entity.severity] || 0) + 1; + } + } + + // Count by status + const byStatus = {}; + for (const entity of entities) { + const status = entity.status || 'unknown'; + byStatus[status] = (byStatus[status] || 0) + 1; + } + + // Recently updated (sort by lastUpdated, take top 10) + const recentlyUpdated = entities + .filter((e) => e.lastUpdated) + .sort((a, b) => b.lastUpdated.localeCompare(a.lastUpdated)) + .slice(0, 10) + .map((e) => ({ + id: e.id, + type: e.type, + title: e.title, + lastUpdated: e.lastUpdated, + })); + + // Most linked (entities with most backlinks) + const mostLinked = Object.entries(backlinks) + .map(([id, links]) => ({ + id, + count: links.length, + entity: entities.find((e) => e.id === id), + })) + .filter((item) => item.entity) + .sort((a, b) => b.count - a.count) + .slice(0, 10) + .map((item) => ({ + id: item.id, + type: item.entity.type, + title: item.entity.title, + backlinkCount: item.count, + })); + + // Tag statistics + const topTags = Object.entries(tagIndex) + .map(([tag, entities]) => ({ tag, count: entities.length })) + .sort((a, b) => b.count - a.count) + .slice(0, 20); + + // Entities with descriptions + const withDescription = entities.filter((e) => e.description).length; + + return { + totalEntities: entities.length, + byType, + bySeverity, + byStatus, + recentlyUpdated, + mostLinked, + topTags, + totalTags: Object.keys(tagIndex).length, + withDescription, + lastBuilt: new Date().toISOString(), + }; +} diff --git a/apps/longterm/scripts/lib/unconverted-links.mjs b/apps/longterm/scripts/lib/unconverted-links.mjs new file mode 100644 index 00000000..33347bfc --- /dev/null +++ b/apps/longterm/scripts/lib/unconverted-links.mjs @@ -0,0 +1,101 @@ +/** + * Unconverted Link Detection + * + * Detects markdown links in content that have matching resources in the database. + * These links should ideally be converted to components. + * + * Extracted from build-data.mjs for modularity. 
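+ *
+ * Illustrative example: normalizeUrl('https://www.example.com/paper/') returns
+ * ['https://www.example.com/paper', 'https://www.example.com/paper/',
+ *  'https://example.com/paper', 'https://example.com/paper/'].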
+ */ + +/** + * Normalize URL to handle variations (trailing slashes, www prefix, http/https) + */ +function normalizeUrl(url) { + const variations = new Set(); + try { + const parsed = new URL(url); + const base = parsed.href.replace(/\/$/, ''); + variations.add(base); + variations.add(base + '/'); + + // Without www + if (parsed.hostname.startsWith('www.')) { + const noWww = base.replace('://www.', '://'); + variations.add(noWww); + variations.add(noWww + '/'); + } + // With www + if (!parsed.hostname.startsWith('www.')) { + const withWww = base.replace('://', '://www.'); + variations.add(withWww); + variations.add(withWww + '/'); + } + } catch { + variations.add(url); + } + return Array.from(variations); +} + +/** + * Build URL → resource map from resources + */ +export function buildUrlToResourceMap(resources) { + const urlToResource = new Map(); + for (const r of resources) { + if (!r.url) continue; + const normalizedUrls = normalizeUrl(r.url); + for (const url of normalizedUrls) { + urlToResource.set(url, r); + } + } + return urlToResource; +} + +/** + * Extract markdown links from content (not images, not internal, not components) + */ +function extractMarkdownLinks(content) { + const links = []; + // Match [text](url) but not images ![text](url) + const linkRegex = /(? component usages in content (already converted links) + */ +export function countConvertedLinks(content) { + // Match or ... + const rComponentRegex = /10% extinction probability), and AGI timeline (Metaculus median 2027-2031).", + "tags": [ + "uncertainty-analysis", + "scaling-laws", + "compute-governance", + "alignment-difficulty", + "research-prioritization", + "forecasting" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "ai-impacts", + "type": "organization" + }, + { + "id": "metaculus", + "type": "organization" + }, + { + "id": "epoch-ai", + "type": "organization" + }, + { + "id": "agi-timeline", + "type": "concept" + }, + { + "id": "ai-governance", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E156", + "customFields": [], + "relatedTopics": [], + "entityType": "crux" + }, + { + "id": "agi-timeline", + "title": "AGI Timeline", + "description": "Expert forecasts and prediction markets suggest 50% probability of AGI by 2030-2045, with Metaculus predicting median of November 2027 and lab leaders converging on 2026-2029. Timelines have shortened dramatically, with Metaculus dropping from 50 years to 5 years since 2020.", + "tags": [ + "agi", + "forecasting", + "prediction-markets", + "timelines", + "scaling", + "expert-surveys" + ], + "clusters": [ + "ai-safety", + "epistemics" + ], + "relatedEntries": [ + { + "id": "prediction-markets", + "type": "concept" + }, + { + "id": "sam-altman", + "type": "researcher" + }, + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "metaculus", + "type": "organization" + }, + { + "id": "ai-impacts", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E157", + "customFields": [], + "relatedTopics": [], + "entityType": "concept" + }, + { + "id": "large-language-models", + "title": "Large Language Models", + "description": "Transformer-based models trained on massive text datasets that exhibit emergent capabilities and pose significant safety challenges. Training costs have grown 2.4x/year since 2016, while frontier models demonstrate in-context scheming and unprecedented capability gains. 
ChatGPT reached 800-900M weekly active users by late 2025.", + "tags": [ + "transformers", + "training-costs", + "scheming", + "emergent-capabilities", + "open-weights", + "frontier-models" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "scheming", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "emergent-capabilities", + "type": "concept" + }, + { + "id": "interpretability", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E158", + "customFields": [], + "relatedTopics": [], + "entityType": "concept" + }, + { + "id": "heavy-scaffolding", + "title": "Heavy Scaffolding / Agentic Systems", + "description": "Multi-agent AI systems with complex orchestration, persistent memory, and autonomous operation. Includes Claude Code, Devin, and similar agentic architectures. Estimated 25-40% probability of being the dominant paradigm at transformative AI, with rapid capability growth but persistent reliability challenges.", + "tags": [ + "agentic-systems", + "multi-agent", + "tool-use", + "autonomous-operation", + "scaffolding", + "reliability" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "openai", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "dense-transformers", + "type": "concept" + }, + { + "id": "light-scaffolding", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E159", + "customFields": [], + "relatedTopics": [], + "entityType": "concept" + }, + { + "id": "provable-safe", + "title": "Provable / Guaranteed Safe AI", + "description": "AI systems designed with formal mathematical safety guarantees from the ground up. The UK's ARIA programme has committed GBP 59M to develop guaranteed safe AI systems by 2028. Current neural network verification handles networks up to 10^6 parameters, but frontier models exceed 10^12, representing a 6 order-of-magnitude gap. Estimated 1-5% probability of paradigm dominance at transformative AI.", + "tags": [ + "formal-verification", + "mathematical-guarantees", + "aria", + "world-models", + "neuro-symbolic", + "safety-by-design" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "formal-verification", + "type": "concept" + }, + { + "id": "neuro-symbolic", + "type": "concept" + }, + { + "id": "dense-transformers", + "type": "concept" + }, + { + "id": "heavy-scaffolding", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E160", + "customFields": [], + "relatedTopics": [], + "entityType": "concept" + }, + { + "id": "dense-transformers", + "title": "Dense Transformers", + "description": "The standard transformer architecture powering current frontier AI, where all parameters are active for every token. Since Vaswani et al.'s 2017 paper (160,000+ citations), dense transformers power GPT-4, Claude 3, Llama 3, and Gemini. 
Despite open weights for some models, mechanistic interpretability remains primitive with a fundamental gap between feature extraction and behavior prediction.", + "tags": [ + "transformer-architecture", + "attention-mechanism", + "scaling", + "interpretability", + "training-pipeline", + "emergent-capabilities" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "rlhf", + "type": "concept" + }, + { + "id": "constitutional-ai", + "type": "approach" + }, + { + "id": "emergent-capabilities", + "type": "concept" + }, + { + "id": "heavy-scaffolding", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E161", + "customFields": [], + "relatedTopics": [], + "entityType": "concept" + }, + { + "id": "quri", + "title": "QURI (Quantified Uncertainty Research Institute)", + "description": "Nonprofit research organization developing tools for probabilistic reasoning and forecasting, including Squiggle, Metaforecast, and SquiggleAI.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E162", + "path": "/knowledge-base/organizations/epistemic-orgs/quri/", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "metaculus", + "title": "Metaculus", + "description": "Reputation-based prediction aggregation platform that has become the primary source for AI timeline forecasts, with over 1 million predictions across 15,000+ questions.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E163", + "path": "/knowledge-base/organizations/epistemic-orgs/metaculus/", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "fri", + "title": "Forecasting Research Institute (FRI)", + "description": "Research institute advancing forecasting methodology through large-scale tournaments and rigorous experiments, led by Philip Tetlock.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E164", + "path": "/knowledge-base/organizations/epistemic-orgs/fri/", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "squiggle", + "title": "Squiggle", + "description": "Domain-specific programming language for probabilistic estimation with native distribution types and Monte Carlo sampling.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E165", + "path": "/knowledge-base/responses/epistemic-tools/projects/squiggle/", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "metaforecast", + "title": "Metaforecast", + "description": "Forecast aggregation platform combining predictions from 10+ sources into a unified search interface.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E166", + "path": "/knowledge-base/responses/epistemic-tools/projects/metaforecast/", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "squiggleai", + "title": "SquiggleAI", + "description": "LLM-powered tool for generating probabilistic models in Squiggle from natural language descriptions.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E167", + "path": 
"/knowledge-base/responses/epistemic-tools/projects/squiggleai/", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "xpt", + "title": "XPT (Existential Risk Persuasion Tournament)", + "description": "Four-month structured forecasting tournament bringing together superforecasters and domain experts through adversarial collaboration.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E168", + "path": "/knowledge-base/responses/epistemic-tools/projects/xpt/", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "forecastbench", + "title": "ForecastBench", + "description": "Dynamic, contamination-free benchmark for evaluating LLM forecasting capabilities, published at ICLR 2025.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E169", + "path": "/knowledge-base/responses/epistemic-tools/projects/forecastbench/", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "ai-forecasting-benchmark", + "title": "AI Forecasting Benchmark Tournament", + "description": "Quarterly competition run by Metaculus comparing human Pro Forecasters against AI forecasting bots.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E170", + "path": "/knowledge-base/responses/epistemic-tools/projects/ai-forecasting-benchmark/", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "deep-learning-era", + "title": "Deep Learning Revolution Era", + "description": "The deep learning revolution transformed AI from a field of limited successes to one of rapidly compounding breakthroughs. 
For AI safety, this meant moving from theoretical concerns about far-future AGI to practical questions about current and near-future systems.", + "tags": [ + "deep-learning", + "alexnet", + "alphago", + "gpt", + "deepmind", + "openai", + "concrete-problems", + "scaling", + "reward-hacking", + "interpretability", + "paul-christiano", + "dario-amodei" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deepmind", + "type": "organization" + }, + { + "id": "openai", + "type": "organization" + } + ], + "sources": [ + { + "title": "ImageNet Classification with Deep Convolutional Neural Networks", + "url": "https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks", + "author": "Krizhevsky et al.", + "date": "2012" + }, + { + "title": "Mastering the game of Go with deep neural networks and tree search", + "url": "https://www.nature.com/articles/nature16961", + "author": "Silver et al.", + "date": "2016" + }, + { + "title": "Concrete Problems in AI Safety", + "url": "https://arxiv.org/abs/1606.06565", + "author": "Amodei et al.", + "date": "2016" + }, + { + "title": "Language Models are Few-Shot Learners", + "url": "https://arxiv.org/abs/2005.14165", + "author": "Brown et al.", + "date": "2020" + }, + { + "title": "OpenAI Charter", + "url": "https://openai.com/charter/", + "author": "OpenAI", + "date": "2018" + }, + { + "title": "Safely Interruptible Agents", + "url": "https://intelligence.org/files/Interruptibility.pdf", + "author": "Orseau & Armstrong", + "date": "2016" + }, + { + "title": "Risks from Learned Optimization", + "url": "https://arxiv.org/abs/1906.01820", + "author": "Hubinger et al.", + "date": "2019" + } + ], + "lastUpdated": "2025-12", + "numericId": "E171", + "customFields": [ + { + "label": "Period", + "value": "2012-2020" + }, + { + "label": "Defining Event", + "value": "AlexNet (2012) proves deep learning works at scale" + }, + { + "label": "Key Theme", + "value": "Capabilities acceleration makes safety urgent" + }, + { + "label": "Outcome", + "value": "AI safety becomes professionalized research field" + } + ], + "relatedTopics": [], + "entityType": "historical" + }, + { + "id": "early-warnings", + "title": "Early Warnings Era", + "description": "Long before AI safety became a research field, a handful of visionaries recognized that machine intelligence might pose unprecedented challenges to humanity. These early warnings—often dismissed as science fiction or philosophical speculation—laid the conceptual groundwork for modern AI safety.", + "tags": [ + "alan-turing", + "norbert-wiener", + "ij-good", + "isaac-asimov", + "vernor-vinge", + "intelligence-explosion", + "three-laws-of-robotics", + "technological-singularity", + "control-problem", + "science-fiction" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Computing Machinery and Intelligence", + "url": "https://academic.oup.com/mind/article/LIX/236/433/986238", + "author": "Alan Turing", + "date": "1950" + }, + { + "title": "Some Moral and Technical Consequences of Automation", + "url": "https://en.wikipedia.org/wiki/Norbert_Wiener", + "author": "Norbert Wiener", + "date": "1960" + }, + { + "title": "Speculations Concerning the First Ultraintelligent Machine", + "url": "https://vtechworks.lib.vt.edu/handle/10919/89424", + "author": "I.J.
Good", + "date": "1965" + }, + { + "title": "I, Robot", + "url": "https://en.wikipedia.org/wiki/I,_Robot", + "author": "Isaac Asimov", + "date": "1950" + }, + { + "title": "The Coming Technological Singularity", + "url": "https://edoras.sdsu.edu/~vinge/misc/singularity.html", + "author": "Vernor Vinge", + "date": "1993" + }, + { + "title": "The Age of Em", + "url": "https://ageofem.com/", + "author": "Robin Hanson", + "date": "2016" + }, + { + "title": "Artificial Intelligence: A Modern Approach", + "url": "http://aima.cs.berkeley.edu/", + "author": "Stuart Russell & Peter Norvig", + "date": "1995" + } + ], + "lastUpdated": "2025-12", + "numericId": "E172", + "customFields": [ + { + "label": "Period", + "value": "1950s-2000" + }, + { + "label": "Key Theme", + "value": "Philosophical foundations and prescient warnings" + }, + { + "label": "Main Figures", + "value": "Turing, Wiener, Good, Asimov, Vinge" + }, + { + "label": "Reception", + "value": "Largely dismissed as science fiction" + } + ], + "relatedTopics": [], + "entityType": "historical" + }, + { + "id": "mainstream-era", + "title": "Mainstream Era", + "description": "The Mainstream Era marks AI safety's transformation from a niche research field to a central topic in technology policy, corporate strategy, and public discourse. ChatGPT was the catalyst, but the shift reflected years of groundwork meeting rapidly advancing capabilities.", + "tags": [ + "chatgpt", + "gpt-4", + "anthropic", + "constitutional-ai", + "geoffrey-hinton", + "openai-leadership-crisis", + "ai-safety-summit", + "eu-ai-act", + "pause-debate", + "interpretability", + "scalable-oversight", + "government-regulation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "openai", + "type": "organization" + } + ], + "sources": [ + { + "title": "Constitutional AI: Harmlessness from AI Feedback", + "url": "https://arxiv.org/abs/2212.08073", + "author": "Bai et al. 
(Anthropic)", + "date": "2022" + }, + { + "title": "GPT-4 Technical Report", + "url": "https://arxiv.org/abs/2303.08774", + "author": "OpenAI", + "date": "2023" + }, + { + "title": "GPT-4 System Card", + "url": "https://cdn.openai.com/papers/gpt-4-system-card.pdf", + "author": "OpenAI", + "date": "2023" + }, + { + "title": "The Bletchley Declaration", + "url": "https://www.gov.uk/government/publications/ai-safety-summit-2023-the-bletchley-declaration", + "author": "UK AI Safety Summit", + "date": "2023" + }, + { + "title": "Executive Order on Safe, Secure, and Trustworthy AI", + "url": "https://www.whitehouse.gov/briefing-room/presidential-actions/2023/10/30/executive-order-on-the-safe-secure-and-trustworthy-development-and-use-of-artificial-intelligence/", + "author": "White House", + "date": "2023" + }, + { + "title": "Pause Giant AI Experiments: An Open Letter", + "url": "https://futureoflife.org/open-letter/pause-giant-ai-experiments/", + "author": "Future of Life Institute", + "date": "2023" + } + ], + "lastUpdated": "2025-12", + "numericId": "E173", + "customFields": [ + { + "label": "Period", + "value": "2020-Present" + }, + { + "label": "Defining Moment", + "value": "ChatGPT (November 2022)" + }, + { + "label": "Key Theme", + "value": "AI safety goes from fringe to central policy concern" + }, + { + "label": "Status", + "value": "Ongoing" + } + ], + "relatedTopics": [], + "entityType": "historical" + }, + { + "id": "miri-era", + "title": "The MIRI Era", + "description": "The MIRI era marks the transition from scattered warnings to organized research. For the first time, AI safety had an institution, a community, and a research agenda.", + "tags": [ + "miri", + "eliezer-yudkowsky", + "nick-bostrom", + "lesswrong", + "superintelligence", + "friendly-ai", + "orthogonality-thesis", + "instrumental-convergence", + "cev", + "effective-altruism" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "miri", + "type": "organization" + }, + { + "id": "fhi", + "type": "organization" + } + ], + "sources": [ + { + "title": "Creating Friendly AI", + "url": "https://intelligence.org/files/CFAI.pdf", + "author": "Eliezer Yudkowsky", + "date": "2001" + }, + { + "title": "Superintelligence: Paths, Dangers, Strategies", + "url": "https://www.amazon.com/Superintelligence-Dangers-Strategies-Nick-Bostrom/dp/0199678111", + "author": "Nick Bostrom", + "date": "2014" + }, + { + "title": "The Sequences", + "url": "https://www.lesswrong.com/rationality", + "author": "Eliezer Yudkowsky", + "date": "2006-2009" + }, + { + "title": "Existential Risk Prevention as Global Priority", + "url": "https://www.existential-risk.org/concept.html", + "author": "Nick Bostrom", + "date": "2013" + }, + { + "title": "The Hanson-Yudkowsky AI-Foom Debate", + "url": "https://intelligence.org/ai-foom-debate/", + "author": "Robin Hanson & Eliezer Yudkowsky", + "date": "2008" + }, + { + "title": "Future of Life Institute Open Letter", + "url": "https://futureoflife.org/open-letter/ai-open-letter/", + "author": "Various", + "date": "2015" + } + ], + "lastUpdated": "2025-12", + "numericId": "E174", + "customFields": [ + { + "label": "Period", + "value": "2000-2015" + }, + { + "label": "Key Event", + "value": "First dedicated AI safety organization founded" + }, + { + "label": "Main Figures", + "value": "Yudkowsky, Bostrom, Hanson, Tegmark" + }, + { + "label": "Milestone", + "value": "Superintelligence (2014) brings academic legitimacy" + } + ], + "relatedTopics": [], + "entityType": "historical" + }, + { + "id": "ai-safety-summit", 
+ "title": "AI Safety Summit (Bletchley Park)", + "description": "International summit on AI safety held at Bletchley Park, UK in November 2023, resulting in the Bletchley Declaration.", + "tags": [ + "policy", + "international", + "governance" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "international-coordination", + "type": "concept" + }, + { + "id": "uk-aisi", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E175", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "historical" + }, + { + "id": "effectiveness-assessment", + "title": "AI Policy Effectiveness", + "description": "As AI governance efforts multiply, a critical question emerges: Which policies are actually working?", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "AI Governance: A Research Agenda", + "url": "https://www.governance.ai/research-paper/research-agenda", + "author": "GovAI" + }, + { + "title": "Evaluating AI Governance", + "url": "https://cset.georgetown.edu/", + "author": "CSET Georgetown" + } + ], + "lastUpdated": "2025-12", + "numericId": "E176", + "customFields": [ + { + "label": "Key Question", + "value": "Which policies actually reduce AI risk?" + }, + { + "label": "Challenge", + "value": "Counterfactuals are hard to assess" + }, + { + "label": "Status", + "value": "Early, limited evidence" + } + ], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "openai-foundation-governance", + "title": "OpenAI Foundation Governance Paradox", + "description": "Analysis of the governance structure where a nonprofit controls a $500B company through Class N shares, but the same 8 people run both entities, creating governance theater rather than real accountability.", + "tags": [ + "openai", + "governance", + "nonprofit-structure", + "class-n-shares", + "board-oversight" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "openai-foundation", + "type": "funder" + }, + { + "id": "musk-openai-lawsuit", + "type": "analysis" + }, + { + "id": "long-term-benefit-trust", + "type": "analysis" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E177", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "anthropic-valuation", + "title": "Anthropic Valuation Analysis", + "description": "Analysis of Anthropic's $350B valuation. Corrected data shows Anthropic trades at 39x revenue vs OpenAI's 25x. Bull case: 88% enterprise retention, coding benchmark leadership. Bear case: 25% customer concentration, margin pressure, AI bubble warnings.", + "tags": [ + "anthropic", + "valuation", + "revenue-multiples", + "enterprise-metrics", + "ai-industry-finance" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "anthropic-ipo", + "type": "analysis" + }, + { + "id": "anthropic-investors", + "type": "analysis" + }, + { + "id": "openai", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E178", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "anthropic-investors", + "title": "Anthropic (Funder)", + "description": "Analysis of EA-aligned philanthropic capital at Anthropic. 
At $350B valuation: $25-70B risk-adjusted EA capital from founder pledges, investor stakes (Tallinn, Moskovitz), and employee matching programs ($20-40B in DAFs).", + "tags": [ + "anthropic", + "ea-capital", + "founder-pledges", + "donor-advised-funds", + "philanthropic-capital" + ], + "clusters": [ + "community", + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic-valuation", + "type": "analysis" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "anthropic-ipo", + "type": "analysis" + }, + { + "id": "jaan-tallinn", + "type": "researcher" + }, + { + "id": "dustin-moskovitz", + "type": "researcher" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E179", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "long-term-benefit-trust", + "title": "Long-Term Benefit Trust (Anthropic)", + "description": "Independent governance mechanism at Anthropic designed to ensure board accountability to humanity's long-term benefit through financially disinterested trustees with growing board appointment power.", + "tags": [ + "anthropic", + "governance", + "trust-structure", + "board-oversight", + "public-benefit-corporation" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "daniela-amodei", + "type": "researcher" + }, + { + "id": "paul-christiano", + "type": "researcher" + }, + { + "id": "centre-for-effective-altruism", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E180", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "musk-openai-lawsuit", + "title": "Musk v. OpenAI Lawsuit", + "description": "Elon Musk's $79-134B lawsuit against OpenAI alleging fraud and breach of charitable trust. Trial scheduled April 2026. 
If successful, could claim significant portion of the OpenAI Foundation's $130B equity stake.", + "tags": [ + "openai", + "lawsuit", + "nonprofit-conversion", + "charitable-trust", + "ai-governance-legal" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "openai-foundation-governance", + "type": "analysis" + }, + { + "id": "openai-foundation", + "type": "funder" + }, + { + "id": "elon-musk", + "type": "researcher" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "sam-altman", + "type": "researcher" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E181", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "anthropic-ipo", + "title": "Anthropic IPO", + "description": "Tracking Anthropic's preparation for a potential 2026 initial public offering, including timeline estimates, valuation trajectory, competitive dynamics with OpenAI, and implications for EA funding.", + "tags": [ + "anthropic", + "ipo", + "public-offering", + "valuation", + "ea-funding-implications" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic-valuation", + "type": "analysis" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "anthropic-investors", + "type": "analysis" + }, + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "daniela-amodei", + "type": "researcher" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E182", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "elon-musk-philanthropy", + "title": "Elon Musk (Funder)", + "description": "Analysis of Elon Musk's charitable giving and future philanthropic potential. Despite ~$400B net worth and a 2012 Giving Pledge commitment, actual giving averages only ~$250M annually. The gap represents the largest untapped philanthropic potential in history.", + "tags": [ + "elon-musk", + "philanthropy", + "giving-pledge", + "foundation-analysis", + "ai-safety-funding" + ], + "clusters": [ + "community", + "governance" + ], + "relatedEntries": [ + { + "id": "elon-musk", + "type": "researcher" + }, + { + "id": "giving-pledge", + "type": "concept" + }, + { + "id": "dustin-moskovitz", + "type": "researcher" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "jaan-tallinn", + "type": "researcher" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E183", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "anthropic-pledge-enforcement", + "title": "Anthropic Founder Pledges: Interventions to Increase Follow-Through", + "description": "Analysis of interventions to increase the probability that Anthropic co-founders follow through on their 80% equity donation pledges. 
With $25-70B at stake, distinguishes collaborative interventions founders would welcome from adversarial ones that could backfire.", + "tags": [ + "anthropic", + "founder-pledges", + "philanthropic-interventions", + "cost-effectiveness", + "donor-advised-funds", + "pledge-fulfillment" + ], + "clusters": [ + "community", + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic-investors", + "type": "analysis" + }, + { + "id": "anthropic-pre-ipo-daf-transfers", + "type": "analysis" + }, + { + "id": "long-term-benefit-trust", + "type": "analysis" + }, + { + "id": "giving-pledge", + "type": "concept" + }, + { + "id": "dario-amodei", + "type": "researcher" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E184", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "anthropic-pre-ipo-daf-transfers", + "title": "Anthropic Pre-IPO DAF Transfers", + "description": "Analysis of charitable giving mechanisms at Anthropic, focusing on the employee matching program and potential founder transfers. The matching program (historically 3:1 at 50% of equity) is one of the most generous corporate charitable giving vehicles ever offered, with $20-40B already committed to DAFs.", + "tags": [ + "anthropic", + "donor-advised-funds", + "employee-matching", + "pre-ipo", + "tax-optimization", + "philanthropic-capital" + ], + "clusters": [ + "community", + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic-investors", + "type": "analysis" + }, + { + "id": "anthropic-pledge-enforcement", + "type": "analysis" + }, + { + "id": "anthropic-ipo", + "type": "analysis" + }, + { + "id": "giving-pledge", + "type": "concept" + }, + { + "id": "dario-amodei", + "type": "researcher" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E185", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "anthropic-impact", + "title": "Anthropic Impact Assessment Model", + "description": "Framework for estimating Anthropic's net impact on AI safety outcomes. Models the tension between safety research value ($100-200M/year, industry-leading interpretability) and racing dynamics contribution (6-18 month timeline compression).", + "tags": [ + "anthropic", + "impact-assessment", + "safety-research", + "racing-dynamics", + "net-impact" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "anthropic-valuation", + "type": "analysis" + }, + { + "id": "anthropic-investors", + "type": "analysis" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "google-deepmind", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E186", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "capability-alignment-race", + "title": "Capability-Alignment Race Model", + "description": "Model analyzing the critical gap between AI capability progress and safety/governance readiness. Currently capabilities are ~3 years ahead of alignment with the gap increasing at 0.5 years annually, driven by 10^26 FLOP scaling vs. 
15% interpretability coverage.", + "tags": [ + "capability-gap", + "alignment-race", + "compute-scaling", + "interpretability", + "governance-readiness", + "ai-timelines" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "scalable-oversight", + "type": "safety-agenda" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "paul-christiano", + "type": "researcher" + }, + { + "id": "racing-dynamics", + "type": "concept" + }, + { + "id": "epoch-ai", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E187", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "short-timeline-policy-implications", + "title": "Short Timeline Policy Implications", + "description": "Analysis of what policies and interventions become more or less important if transformative AI arrives in 1-5 years rather than decades. Short timelines dramatically shift cost-benefit calculus toward rapid lab-level safety practices over long-term institution building.", + "tags": [ + "short-timelines", + "ai-policy", + "compute-governance", + "lab-safety", + "emergency-coordination", + "intervention-prioritization" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "ai-control", + "type": "safety-agenda" + }, + { + "id": "compute-governance", + "type": "concept" + }, + { + "id": "international-coordination", + "type": "concept" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "eu-ai-act", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E188", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "technical-pathways", + "title": "Technical Pathway Decomposition", + "description": "Model mapping technical pathways from capability advances to catastrophic risk outcomes. Finds accident risks (deceptive alignment, goal misgeneralization, instrumental convergence) account for 45% of total technical risk, with safety techniques degrading relative to capabilities at frontier scale.", + "tags": [ + "technical-risk", + "deceptive-alignment", + "goal-misgeneralization", + "accident-risk", + "safety-degradation" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "capability-alignment-race", + "type": "analysis" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E189", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "feedback-loops", + "title": "Feedback Loop & Cascade Model", + "description": "System dynamics model analyzing how AI risks emerge from reinforcing feedback loops. 
Capabilities compound at 2.5x per year while safety measures improve at only 1.2x per year, with current safety investment at just 0.1% of capability investment.", + "tags": [ + "feedback-loops", + "system-dynamics", + "capability-growth", + "safety-investment", + "recursive-improvement" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "capability-alignment-race", + "type": "analysis" + }, + { + "id": "racing-dynamics", + "type": "concept" + }, + { + "id": "anthropic", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E190", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "multi-actor-landscape", + "title": "Multi-Actor Strategic Landscape", + "description": "Model analyzing how risk depends on which actors develop TAI. US-China capability gap narrowed from 9.26% to 1.70% (2024-2025), while open-source closed to within 1.70% of frontier. Actor identity may determine 40-60% of total risk variance.", + "tags": [ + "geopolitics", + "us-china-competition", + "open-source-ai", + "actor-analysis", + "strategic-landscape", + "proliferation" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "alignment-progress", + "type": "concept" + }, + { + "id": "capability-alignment-race", + "type": "analysis" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E191", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "model-organisms-of-misalignment", + "title": "Model Organisms of Misalignment", + "description": "Research agenda creating controlled AI models that exhibit specific misalignment behaviors to study alignment failures and test interventions. 
Recent work achieves 99% coherence with 40% misalignment rates using models as small as 0.5B parameters.", + "tags": [ + "misalignment", + "model-organisms", + "deceptive-alignment", + "interpretability", + "alignment-research", + "sleeper-agents" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "evan-hubinger", + "type": "researcher" + }, + { + "id": "paul-christiano", + "type": "researcher" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "interpretability", + "type": "safety-agenda" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E192", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "ea-biosecurity-scope", + "title": "Is EA Biosecurity Work Limited to Restricting LLM Biological Use?", + "description": "Analysis of the full EA/x-risk biosecurity portfolio, examining whether the community's work consists primarily of AI capability restrictions or encompasses a broader set of interventions including DNA synthesis screening, pathogen surveillance, medical countermeasures, and governance reform.", + "tags": [ + "biosecurity", + "ea-portfolio", + "dna-synthesis-screening", + "pandemic-preparedness", + "delay-detect-defend" + ], + "clusters": [ + "biorisks", + "ai-safety", + "governance", + "community" + ], + "relatedEntries": [ + { + "id": "open-philanthropy", + "type": "funder" + }, + { + "id": "securebio", + "type": "lab" + }, + { + "id": "securedna", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "blueprint-biosecurity", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E193", + "customFields": [], + "relatedTopics": [], + "entityType": "analysis" + }, + { + "id": "lock-in-mechanisms", + "title": "Lock-in Mechanisms Model", + "description": "Analytical model examining how AI could enable permanent entrenchment of values, systems, or power structures. Distinguishes AI-enabled lock-in from historical examples due to enforcement capabilities and estimates 10-30% probability of significant lock-in by 2050.", + "tags": [ + "x-risk", + "irreversibility", + "path-dependence", + "models" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "lock-in", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E194", + "customFields": [], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "ai-risk-portfolio-analysis", + "title": "AI Risk Portfolio Analysis", + "description": "This framework compares AI risk categories to guide resource allocation. 
It estimates misalignment accounts for 40-70% of x-risk, misuse 15-35%, and structural risks 10-25%, though all estimates carry ±50% uncertainty.", + "tags": [ + "prioritization", + "resource-allocation", + "portfolio", + "strategy", + "comparative-analysis" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compounding-risks-analysis", + "type": "model", + "relationship": "related" + }, + { + "id": "flash-dynamics-threshold", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E195", + "customFields": [ + { + "label": "Model Type", + "value": "Prioritization Framework" + }, + { + "label": "Focus", + "value": "Resource Allocation" + }, + { + "label": "Key Output", + "value": "Risk magnitude comparisons and allocation recommendations" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "worldview-intervention-mapping", + "title": "Worldview-Intervention Mapping", + "description": "This model maps how beliefs about timelines and difficulty affect intervention priorities. Different worldviews imply 2-10x differences in optimal resource allocation.", + "tags": [ + "prioritization", + "worldview", + "strategy", + "theory-of-change", + "intervention-effectiveness" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "ai-risk-portfolio-analysis", + "type": "model", + "relationship": "related" + }, + { + "id": "racing-dynamics", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E196", + "customFields": [ + { + "label": "Model Type", + "value": "Strategic Framework" + }, + { + "label": "Focus", + "value": "Worldview-Action Coherence" + }, + { + "label": "Key Output", + "value": "Intervention priorities given different worldviews" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "intervention-timing-windows", + "title": "Intervention Timing Windows", + "description": "This model identifies closing vs stable intervention windows. It recommends shifting 20-30% of resources toward closing-window work (compute governance, international coordination) within 2 years.", + "tags": [ + "prioritization", + "timing", + "strategy", + "urgency", + "windows" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "ai-risk-portfolio-analysis", + "type": "model", + "relationship": "related" + }, + { + "id": "worldview-intervention-mapping", + "type": "model", + "relationship": "related" + }, + { + "id": "racing-dynamics", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E197", + "customFields": [ + { + "label": "Model Type", + "value": "Timing Framework" + }, + { + "label": "Focus", + "value": "Temporal Urgency" + }, + { + "label": "Key Output", + "value": "Prioritization based on closing vs stable windows" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "deceptive-alignment-decomposition", + "title": "Deceptive Alignment Decomposition Model", + "description": "This model decomposes deceptive alignment probability into five necessary conditions. 
It estimates 40-80% probability for the outer alignment condition, 20-60% for situational awareness.", + "tags": [ + "probability", + "decomposition", + "inner-alignment", + "deception", + "training-dynamics" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deceptive-alignment", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "mesa-optimization", + "type": "risk", + "relationship": "related" + }, + { + "id": "situational-awareness", + "type": "capability", + "relationship": "prerequisite" + }, + { + "id": "anthropic", + "type": "lab", + "relationship": "research" + }, + { + "id": "alignment-robustness", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-oversight-quality", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E198", + "customFields": [ + { + "label": "Model Type", + "value": "Probability Decomposition" + }, + { + "label": "Target Risk", + "value": "Deceptive Alignment" + }, + { + "label": "Base Rate Estimate", + "value": "5-40% for advanced AI systems" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "carlsmith-six-premises", + "title": "Carlsmith's Six-Premise Argument", + "description": "Joe Carlsmith's probabilistic decomposition of AI existential risk into six conditional premises. Originally estimated ~5% risk by 2070, updated to >10%. The most rigorous public framework for structured x-risk estimation.", + "tags": [ + "probability", + "decomposition", + "x-risk", + "power-seeking", + "existential-risk" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "instrumental-convergence", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "power-seeking-conditions", + "type": "model", + "relationship": "related" + }, + { + "id": "deceptive-alignment-decomposition", + "type": "model", + "relationship": "related" + }, + { + "id": "alignment-robustness", + "type": "parameter", + "relationship": "models" + }, + { + "id": "racing-intensity", + "type": "parameter", + "relationship": "models" + } + ], + "sources": [], + "lastUpdated": "2026-01", + "numericId": "E199", + "customFields": [ + { + "label": "Model Type", + "value": "Probability Decomposition" + }, + { + "label": "Target Risk", + "value": "Power-Seeking AI X-Risk" + }, + { + "label": "Combined Estimate", + "value": ">10% by 2070" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "mesa-optimization-analysis", + "title": "Mesa-Optimization Risk Analysis", + "description": "This model analyzes when mesa-optimizers might emerge during training. 
It estimates emergence probability increases sharply above certain capability thresholds, with deceptive alignment as a key concern.", + "tags": [ + "mesa-optimization", + "inner-alignment", + "learned-optimization", + "training-dynamics" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "mesa-optimization", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "deceptive-alignment", + "type": "risk", + "relationship": "related" + }, + { + "id": "goal-misgeneralization", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E200", + "customFields": [ + { + "label": "Model Type", + "value": "Risk Framework" + }, + { + "label": "Target Risk", + "value": "Mesa-Optimization" + }, + { + "label": "Key Factor", + "value": "Training complexity and optimization pressure" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "goal-misgeneralization-probability", + "title": "Goal Misgeneralization Probability Model", + "description": "This model estimates likelihood of goal misgeneralization across scenarios. Key factors include distribution shift magnitude and training objective specificity.", + "tags": [ + "probability", + "generalization", + "distribution-shift", + "deployment-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "goal-misgeneralization", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "distributional-shift", + "type": "risk", + "relationship": "related" + }, + { + "id": "reward-hacking", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E201", + "customFields": [ + { + "label": "Model Type", + "value": "Probability Model" + }, + { + "label": "Target Risk", + "value": "Goal Misgeneralization" + }, + { + "label": "Base Rate", + "value": "20-60% for significant distribution shifts" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "reward-hacking-taxonomy", + "title": "Reward Hacking Taxonomy and Severity Model", + "description": "Comprehensive taxonomy of reward hacking failure modes with severity estimates and mitigation analysis", + "tags": [ + "taxonomy", + "reward-modeling", + "specification-gaming", + "rlhf" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "reward-hacking", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "sycophancy", + "type": "risk", + "relationship": "example" + }, + { + "id": "rlhf", + "type": "capability", + "relationship": "vulnerable-technique" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda", + "relationship": "mitigation" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E202", + "customFields": [ + { + "label": "Model Type", + "value": "Taxonomy + Severity Analysis" + }, + { + "label": "Target Risk", + "value": "Reward Hacking" + }, + { + "label": "Categories Identified", + "value": "12 major failure modes" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "power-seeking-conditions", + "title": "Power-Seeking Emergence Conditions Model", + "description": "This model identifies conditions for AI power-seeking behaviors. 
It estimates 60-90% probability of power-seeking in sufficiently capable optimizers, emerging at 50-70% of optimal task performance.", + "tags": [ + "formal-analysis", + "power-seeking", + "optimal-policies", + "instrumental-goals" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "power-seeking", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "instrumental-convergence", + "type": "risk", + "relationship": "related" + }, + { + "id": "corrigibility-failure", + "type": "risk", + "relationship": "consequence" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E203", + "customFields": [ + { + "label": "Model Type", + "value": "Formal Analysis" + }, + { + "label": "Target Risk", + "value": "Power-Seeking" + }, + { + "label": "Key Result", + "value": "Optimal policies tend to seek power under broad conditions" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "instrumental-convergence-framework", + "title": "Instrumental Convergence Framework", + "description": "This model analyzes universal subgoals emerging in AI systems. It finds self-preservation converges in 95-99% of goal structures, with shutdown-resistance 70-95% likely for capable optimizers.", + "tags": [ + "framework", + "instrumental-goals", + "convergent-evolution", + "agent-foundations" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "instrumental-convergence", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "power-seeking", + "type": "risk", + "relationship": "example" + }, + { + "id": "corrigibility-failure", + "type": "risk", + "relationship": "consequence" + }, + { + "id": "miri", + "type": "organization", + "relationship": "research" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E204", + "customFields": [ + { + "label": "Model Type", + "value": "Theoretical Framework" + }, + { + "label": "Target Risk", + "value": "Instrumental Convergence" + }, + { + "label": "Core Insight", + "value": "Many final goals share common instrumental subgoals" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "scheming-likelihood-model", + "title": "Scheming Likelihood Assessment", + "description": "This model estimates probability of AI systems engaging in strategic deception. Key factors include situational awareness, goal stability, and training environment transparency.", + "tags": [ + "probability", + "strategic-deception", + "situational-awareness", + "alignment-faking" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "scheming", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "deceptive-alignment", + "type": "risk", + "relationship": "related" + }, + { + "id": "situational-awareness", + "type": "capability", + "relationship": "prerequisite" + }, + { + "id": "sandbagging", + "type": "risk", + "relationship": "manifestation" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E205", + "customFields": [ + { + "label": "Model Type", + "value": "Probability Assessment" + }, + { + "label": "Target Risk", + "value": "Scheming" + }, + { + "label": "Conditional Probability", + "value": "10-50% given situational awareness" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "corrigibility-failure-pathways", + "title": "Corrigibility Failure Pathways", + "description": "This model maps pathways from AI training to corrigibility failure. 
It estimates 60-90% failure probability for capable optimizers with unbounded goals, reducible by 40-70% through targeted interventions.", + "tags": [ + "causal-model", + "corrigibility", + "shutdown-problem", + "intervention-design" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "corrigibility-failure", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "instrumental-convergence", + "type": "risk", + "relationship": "cause" + }, + { + "id": "power-seeking", + "type": "risk", + "relationship": "related" + }, + { + "id": "ai-control", + "type": "safety-agenda", + "relationship": "mitigation" + }, + { + "id": "alignment-robustness", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-oversight-quality", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E206", + "customFields": [ + { + "label": "Model Type", + "value": "Causal Pathways" + }, + { + "label": "Target Risk", + "value": "Corrigibility Failure" + }, + { + "label": "Pathways Identified", + "value": "6 major failure modes" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "bioweapons-attack-chain", + "title": "Bioweapons Attack Chain Model", + "description": "This model decomposes bioweapons attacks into seven sequential steps with independent failure modes. DNA synthesis screening offers 5-15% risk reduction for $7-20M, with estimates carrying 2-5x uncertainty at each step.", + "tags": [ + "probability", + "decomposition", + "bioweapons", + "attack-chain" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "bioweapons", + "type": "risk", + "relationship": "related" + }, + { + "id": "biological-threat-exposure", + "type": "parameter", + "relationship": "models" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E207", + "customFields": [ + { + "label": "Model Type", + "value": "Probability Decomposition" + }, + { + "label": "Target Risk", + "value": "Bioweapons" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "bioweapons-ai-uplift", + "title": "AI Uplift Assessment Model", + "description": "This model estimates AI's marginal contribution to bioweapons risk over time. It projects uplift increasing from 1.3-2.5x (2024) to 3-5x by 2030, with biosecurity evasion capabilities posing the greatest concern as they could undermine existing defenses before triggering policy response.", + "tags": [ + "uplift", + "comparison", + "bioweapons", + "marginal-risk" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "bioweapons", + "type": "risk", + "relationship": "related" + }, + { + "id": "biological-threat-exposure", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E208", + "customFields": [ + { + "label": "Model Type", + "value": "Comparative Analysis" + }, + { + "label": "Target Risk", + "value": "Bioweapons" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "bioweapons-timeline", + "title": "AI-Bioweapons Timeline Model", + "description": "This model projects when AI crosses capability thresholds for bioweapons. 
It estimates knowledge democratization is already crossed, synthesis assistance arrives 2027-2032, and novel agent design by 2030-2040.", + "tags": [ + "timeline", + "projection", + "bioweapons", + "forecasting" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "bioweapons", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E209", + "customFields": [ + { + "label": "Model Type", + "value": "Timeline Projection" + }, + { + "label": "Target Risk", + "value": "Bioweapons" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "racing-dynamics-impact", + "title": "Racing Dynamics Impact Model", + "description": "This model analyzes how competitive pressure creates race-to-the-bottom dynamics. It estimates racing conditions reduce safety investment by 30-60% compared to coordinated scenarios.", + "tags": [ + "risk-factor", + "competition", + "game-theory", + "incentives" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "related" + }, + { + "id": "multipolar-trap", + "type": "risk", + "relationship": "related" + }, + { + "id": "racing-intensity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "safety-capability-gap", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "coordination-capacity", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E210", + "customFields": [ + { + "label": "Model Type", + "value": "Causal Analysis" + }, + { + "label": "Target Factor", + "value": "Racing Dynamics" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "multipolar-trap-dynamics", + "title": "Multipolar Trap Dynamics Model", + "description": "This model analyzes game-theoretic dynamics of AI competition traps. It estimates 20-35% probability of partial coordination, 5-10% of catastrophic competitive lock-in, with compute governance offering 20-35% risk reduction.", + "tags": [ + "risk-factor", + "game-theory", + "coordination", + "equilibrium" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "multipolar-trap", + "type": "risk", + "relationship": "related" + }, + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "related" + }, + { + "id": "international-coordination", + "type": "parameter", + "relationship": "models" + }, + { + "id": "racing-intensity", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E211", + "customFields": [ + { + "label": "Model Type", + "value": "Game Theory Analysis" + }, + { + "label": "Target Factor", + "value": "Multipolar Trap" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "flash-dynamics-threshold", + "title": "Flash Dynamics Threshold Model", + "description": "This model identifies thresholds where AI speed exceeds human oversight capacity. 
Current systems already operate 10-10,000x faster than humans in key domains, with oversight thresholds crossed in many areas.", + "tags": [ + "risk-factor", + "speed", + "thresholds", + "cascades" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "flash-dynamics", + "type": "risk", + "relationship": "related" + }, + { + "id": "irreversibility", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E212", + "customFields": [ + { + "label": "Model Type", + "value": "Threshold Analysis" + }, + { + "label": "Target Factor", + "value": "Flash Dynamics" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "expertise-atrophy-progression", + "title": "Expertise Atrophy Progression Model", + "description": "This model traces five phases from AI augmentation to irreversible skill loss. It finds humans decline to 50-70% of baseline capability in Phase 3, with reversibility becoming difficult after 3-10 years of heavy AI use.", + "tags": [ + "risk-factor", + "skills", + "dependency", + "irreversibility" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "expertise-atrophy", + "type": "risk", + "relationship": "related" + }, + { + "id": "human-expertise", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-oversight-quality", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "automation-bias", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E213", + "customFields": [ + { + "label": "Model Type", + "value": "Progressive Decay Model" + }, + { + "label": "Target Factor", + "value": "Expertise Atrophy" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "economic-disruption-impact", + "title": "Economic Disruption Impact Model", + "description": "This model analyzes AI labor displacement cascades. It estimates 2-5% workforce displacement over 5 years vs 1-3% adaptation capacity, suggesting disruption will outpace adjustment.", + "tags": [ + "risk-factor", + "economics", + "labor", + "instability" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "economic-disruption", + "type": "risk", + "relationship": "related" + }, + { + "id": "winner-take-all", + "type": "risk", + "relationship": "related" + }, + { + "id": "economic-stability", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-agency", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E214", + "customFields": [ + { + "label": "Model Type", + "value": "System Dynamics" + }, + { + "label": "Target Factor", + "value": "Economic Disruption" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "proliferation-risk-model", + "title": "AI Proliferation Risk Model", + "description": "This model analyzes AI capability diffusion dynamics. 
It estimates key capabilities spread within 2-5 years of frontier development, with open-source accelerating timelines.", + "tags": [ + "risk-factor", + "diffusion", + "control", + "dual-use" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "proliferation", + "type": "risk", + "relationship": "related" + }, + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E215", + "customFields": [ + { + "label": "Model Type", + "value": "Diffusion Analysis" + }, + { + "label": "Target Factor", + "value": "AI Proliferation" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "winner-take-all-concentration", + "title": "Winner-Take-All Concentration Model", + "description": "This model analyzes network effects driving AI capability concentration. It estimates top 3-5 actors will control 70-90% of frontier capabilities within 5 years.", + "tags": [ + "risk-factor", + "concentration", + "network-effects", + "power" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "winner-take-all", + "type": "risk", + "relationship": "related" + }, + { + "id": "economic-disruption", + "type": "risk", + "relationship": "related" + }, + { + "id": "ai-control-concentration", + "type": "parameter", + "relationship": "models" + }, + { + "id": "economic-stability", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E216", + "customFields": [ + { + "label": "Model Type", + "value": "Network Effects Analysis" + }, + { + "label": "Target Factor", + "value": "Winner-Take-All Dynamics" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "cyberweapons-offense-defense", + "title": "Cyber Offense-Defense Balance Model", + "description": "This model analyzes whether AI shifts cyber offense-defense balance. It projects 30-70% net improvement in attack success rates, driven by automation scaling and vulnerability discovery.", + "tags": [ + "offense-defense", + "cybersecurity", + "balance", + "comparative" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "cyberweapons", + "type": "risk", + "relationship": "related" + }, + { + "id": "cyber-threat-exposure", + "type": "parameter", + "relationship": "models" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E217", + "customFields": [ + { + "label": "Model Type", + "value": "Comparative Analysis" + }, + { + "label": "Target Risk", + "value": "Cyberweapons" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "cyberweapons-attack-automation", + "title": "Autonomous Cyber Attack Timeline", + "description": "This model projects when AI achieves autonomous cyber attack capability. 
It estimates Level 3 (AI-directed) attacks by 2026-2027 and Level 4 (fully autonomous) campaigns by 2029-2033.", + "tags": [ + "timeline", + "automation", + "cybersecurity", + "autonomy" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "cyberweapons", + "type": "risk", + "relationship": "related" + }, + { + "id": "cyber-threat-exposure", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E218", + "customFields": [ + { + "label": "Model Type", + "value": "Timeline Projection" + }, + { + "label": "Target Risk", + "value": "Cyberweapons" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "autonomous-weapons-escalation", + "title": "Autonomous Weapons Escalation Model", + "description": "This model analyzes AI-accelerated conflict escalation risks. It estimates 1-5% annual probability of catastrophic escalation once autonomous systems are deployed, implying 10-40% cumulative risk over a decade.", + "tags": [ + "escalation", + "conflict", + "speed", + "autonomous-weapons" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "autonomous-weapons", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E219", + "customFields": [ + { + "label": "Model Type", + "value": "Risk Decomposition" + }, + { + "label": "Target Risk", + "value": "Autonomous Weapons" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "autonomous-weapons-proliferation", + "title": "LAWS Proliferation Model", + "description": "This model tracks lethal autonomous weapons proliferation. It projects 50% of militarily capable nations will have LAWS by 2030, proliferating 4-6x faster than nuclear weapons and reaching non-state actors by 2030-2032.", + "tags": [ + "proliferation", + "timeline", + "autonomous-weapons", + "diffusion" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "autonomous-weapons", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E220", + "customFields": [ + { + "label": "Model Type", + "value": "Timeline Projection" + }, + { + "label": "Target Risk", + "value": "Autonomous Weapons" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "disinformation-detection-race", + "title": "Disinformation Detection Arms Race Model", + "description": "This model analyzes the arms race between AI generation and detection. It projects detection falling to near-random (50%) by 2030 under medium adversarial pressure.", + "tags": [ + "detection", + "arms-race", + "disinformation", + "adversarial" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "disinformation", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E221", + "customFields": [ + { + "label": "Model Type", + "value": "Comparative Analysis" + }, + { + "label": "Target Risk", + "value": "Disinformation" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "disinformation-electoral-impact", + "title": "Electoral Impact Assessment Model", + "description": "This model estimates AI disinformation's marginal impact on elections. 
It finds AI increases reach by 1.5-3x over traditional methods, with potential 2-5% vote margin shifts in close elections.", + "tags": [ + "elections", + "democracy", + "disinformation", + "impact-assessment" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "disinformation", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E222", + "customFields": [ + { + "label": "Model Type", + "value": "Impact Assessment" + }, + { + "label": "Target Risk", + "value": "Disinformation" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "surveillance-authoritarian-stability", + "title": "AI Surveillance and Regime Durability Model", + "description": "This model analyzes how AI surveillance affects authoritarian regime durability. It estimates AI-enabled regimes may be 2-3x more durable than historical autocracies.", + "tags": [ + "authoritarianism", + "stability", + "surveillance", + "regime-durability" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "surveillance", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E223", + "customFields": [ + { + "label": "Model Type", + "value": "Causal Analysis" + }, + { + "label": "Target Risk", + "value": "Surveillance" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "surveillance-chilling-effects", + "title": "Surveillance Chilling Effects Model", + "description": "This model quantifies AI surveillance impact on expression and behavior. It estimates 50-70% reduction in dissent within months, reaching 80-95% within 1-2 years under comprehensive surveillance.", + "tags": [ + "chilling-effects", + "freedom", + "surveillance", + "rights" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "surveillance", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E224", + "customFields": [ + { + "label": "Model Type", + "value": "Impact Assessment" + }, + { + "label": "Target Risk", + "value": "Surveillance" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "deepfakes-authentication-crisis", + "title": "Deepfakes Authentication Crisis Model", + "description": "This model projects when synthetic media becomes indistinguishable. Detection accuracy declined from 85-95% (2018) to 55-65% (2025), projecting crisis threshold within 3-5 years.", + "tags": [ + "authentication", + "deepfakes", + "timeline", + "trust" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deepfakes", + "type": "risk", + "relationship": "related" + }, + { + "id": "information-authenticity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "societal-trust", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E225", + "customFields": [ + { + "label": "Model Type", + "value": "Timeline Projection" + }, + { + "label": "Target Risk", + "value": "Deepfakes" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "trust-cascade-model", + "title": "Trust Cascade Failure Model", + "description": "This model analyzes how institutional trust collapses cascade. 
It finds trust failures propagate at 1.5-2x rates in AI-mediated environments vs traditional contexts.", + "tags": [ + "epistemic", + "cascade", + "trust", + "institutions", + "threshold-effects" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "trust-cascade", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "trust-decline", + "type": "risk", + "relationship": "related" + }, + { + "id": "epistemic-collapse", + "type": "risk", + "relationship": "leads-to" + }, + { + "id": "societal-trust", + "type": "parameter", + "relationship": "models" + }, + { + "id": "epistemic-health", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "information-authenticity", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E226", + "customFields": [ + { + "label": "Model Type", + "value": "Cascade Analysis" + }, + { + "label": "Target Risk", + "value": "Trust Cascade Failure" + }, + { + "label": "Key Insight", + "value": "Trust cascades exhibit catastrophic regime shifts with hysteresis" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "sycophancy-feedback-loop", + "title": "Sycophancy Feedback Loop Model", + "description": "This model analyzes how AI validation creates self-reinforcing dynamics. It identifies conditions where user preferences and AI training create stable but problematic equilibria.", + "tags": [ + "epistemic", + "feedback-loops", + "sycophancy", + "echo-chambers", + "validation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "epistemic-sycophancy", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "reality-fragmentation", + "type": "risk", + "relationship": "contributes-to" + }, + { + "id": "learned-helplessness", + "type": "risk", + "relationship": "leads-to" + }, + { + "id": "preference-authenticity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "societal-trust", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E227", + "customFields": [ + { + "label": "Model Type", + "value": "Feedback Loop Analysis" + }, + { + "label": "Target Risk", + "value": "Sycophancy at Scale" + }, + { + "label": "Key Finding", + "value": "Multiple reinforcing loops drive belief rigidity increase of 2-10x per year" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "authentication-collapse-timeline", + "title": "Authentication Collapse Timeline Model", + "description": "This model projects when digital verification systems cross critical failure thresholds. 
It estimates text detection is already at random-chance levels, with image/audio following within 3-5 years.", + "tags": [ + "epistemic", + "timeline", + "authentication", + "verification", + "deepfakes" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "authentication-collapse", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "legal-evidence-crisis", + "type": "risk", + "relationship": "leads-to" + }, + { + "id": "deepfakes", + "type": "risk", + "relationship": "related" + }, + { + "id": "information-authenticity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "epistemic-health", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E228", + "customFields": [ + { + "label": "Model Type", + "value": "Timeline Projection" + }, + { + "label": "Target Risk", + "value": "Authentication Collapse", + "link": "/knowledge-base/risks/epistemic/authentication-collapse/" + }, + { + "label": "Critical Threshold", + "value": "Detection accuracy approaching random chance (50%) by 2027-2030" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "expertise-atrophy-cascade", + "title": "Expertise Atrophy Cascade Model", + "description": "This model analyzes cascading skill degradation from AI dependency. It estimates dependency approximately doubles every 2-3 years (1.7x per cycle), with 40-60% capability loss in Gen 1 users.", + "tags": [ + "epistemic", + "cascade", + "expertise", + "skills", + "generational" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "expertise-atrophy", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "automation-bias", + "type": "risk", + "relationship": "related" + }, + { + "id": "epistemic-collapse", + "type": "risk", + "relationship": "contributes-to" + }, + { + "id": "human-expertise", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-agency", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E229", + "customFields": [ + { + "label": "Model Type", + "value": "Cascade Analysis" + }, + { + "label": "Target Risk", + "value": "Expertise Atrophy" + }, + { + "label": "Key Finding", + "value": "Complete knowledge loss within 15-30 years with high AI use" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "epistemic-collapse-threshold", + "title": "Epistemic Collapse Threshold Model", + "description": "This model identifies thresholds where society loses the ability to establish shared facts. 
It estimates 35-45% probability of authentication-system-triggered collapse and 25-35% probability of polarization-driven collapse.", + "tags": [ + "epistemic", + "threshold", + "collapse", + "regime-shift", + "tipping-points" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "epistemic-collapse", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "trust-cascade", + "type": "risk", + "relationship": "component" + }, + { + "id": "reality-fragmentation", + "type": "risk", + "relationship": "component" + }, + { + "id": "learned-helplessness", + "type": "risk", + "relationship": "outcome" + }, + { + "id": "epistemic-health", + "type": "parameter", + "relationship": "models" + }, + { + "id": "reality-coherence", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "societal-trust", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E230", + "customFields": [ + { + "label": "Model Type", + "value": "Threshold Model" + }, + { + "label": "Target Risk", + "value": "Epistemic Collapse" + }, + { + "label": "Critical Threshold", + "value": "Epistemic health E < 0.35 leads to irreversible collapse" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "reality-fragmentation-network", + "title": "Reality Fragmentation Network Model", + "description": "This model analyzes how AI personalization creates incompatible reality bubbles. It projects 30-50% divergence in factual beliefs across groups within 5 years of heavy AI use.", + "tags": [ + "epistemic", + "network-analysis", + "fragmentation", + "polarization", + "information-silos" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "reality-fragmentation", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "reality-coherence", + "type": "parameter", + "relationship": "models" + }, + { + "id": "epistemic-health", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "preference-authenticity", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "epistemic-sycophancy", + "type": "risk", + "relationship": "mechanism" + }, + { + "id": "epistemic-collapse", + "type": "risk", + "relationship": "leads-to" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E231", + "customFields": [ + { + "label": "Model Type", + "value": "Network Effects" + }, + { + "label": "Target Risk", + "value": "Reality Fragmentation" + }, + { + "label": "Key Metric", + "value": "Fragmentation index F projected to reach 0.75-0.85 by 2030" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "racing-dynamics-model", + "title": "Racing Dynamics Game Theory Model", + "description": "Game-theoretic analysis of competitive pressures in AI development, modeling safety-capability tradeoffs as prisoner's dilemma with asymmetric payoffs.", + "tags": [ + "game-theory", + "coordination", + "prisoner-dilemma", + "racing", + "structural-risks" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "multipolar-trap", + "type": "risk", + "relationship": "related" + }, + { + "id": "concentration-of-power", + "type": "risk", + "relationship": "outcome" + }, + { + "id": "racing-intensity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "safety-culture-strength", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "international-coordination", + "type": "parameter", + "relationship": "affects" 
+ } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E232", + "customFields": [ + { + "label": "Model Type", + "value": "Game Theory" + }, + { + "label": "Target Risk", + "value": "Racing Dynamics" + }, + { + "label": "Core Insight", + "value": "Individual rationality produces collectively suboptimal outcomes when safety investments reduce competitive advantage" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "multipolar-trap-model", + "title": "Multipolar Trap Coordination Model", + "description": "Systems analysis of collective action failures where rational individual action produces collectively catastrophic outcomes in AI development.", + "tags": [ + "coordination-failure", + "collective-action", + "moloch", + "tragedy-of-commons", + "structural-risks" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "multipolar-trap", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "manifestation" + }, + { + "id": "concentration-of-power", + "type": "risk", + "relationship": "outcome" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E233", + "customFields": [ + { + "label": "Model Type", + "value": "Systems Dynamics / Coordination Theory" + }, + { + "label": "Target Risk", + "value": "Multipolar Trap" + }, + { + "label": "Core Insight", + "value": "Local optimization plus competitive pressure creates global suboptimality that no individual actor can escape" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "winner-take-all-model", + "title": "Winner-Take-All Market Dynamics Model", + "description": "Economic analysis of power law distributions and market concentration in AI, examining superstar economics and increasing returns to scale.", + "tags": [ + "market-structure", + "power-law", + "network-effects", + "inequality", + "structural-risks" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "winner-take-all", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "concentration-of-power", + "type": "risk", + "relationship": "mechanism" + }, + { + "id": "economic-disruption", + "type": "risk", + "relationship": "related" + }, + { + "id": "ai-control-concentration", + "type": "parameter", + "relationship": "models" + }, + { + "id": "economic-stability", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E234", + "customFields": [ + { + "label": "Model Type", + "value": "Market Structure Analysis" + }, + { + "label": "Target Risk", + "value": "Winner-Take-All Dynamics" + }, + { + "label": "Core Insight", + "value": "AI exhibits increasing returns and network effects creating extreme concentration" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "concentration-of-power-model", + "title": "Concentration of Power Systems Model", + "description": "Systems dynamics analysis of power accumulation mechanisms across economic, political, military, and informational domains through AI.", + "tags": [ + "power-dynamics", + "systems-thinking", + "feedback-loops", + "political-economy", + "structural-risks" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "concentration-of-power", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "winner-take-all", + "type": "risk", + "relationship": "mechanism" + }, + { + "id": "lock-in", + "type": "risk", + "relationship": "consequence" + }, + { + "id": "authoritarian-takeover", + 
"type": "risk", + "relationship": "scenario" + }, + { + "id": "ai-control-concentration", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-agency", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E235", + "customFields": [ + { + "label": "Model Type", + "value": "Systems Dynamics" + }, + { + "label": "Target Risk", + "value": "Concentration of Power" + }, + { + "label": "Core Insight", + "value": "AI's cross-domain applicability enables unprecedented positive feedback loops in power accumulation" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "lock-in-model", + "title": "Lock-in Irreversibility Model", + "description": "Analysis of irreversible transitions and path dependencies in AI development, examining value, political, technical, economic, and cognitive lock-in mechanisms.", + "tags": [ + "irreversibility", + "path-dependence", + "value-lock-in", + "structural-risks", + "long-term" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "lock-in", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "concentration-of-power", + "type": "risk", + "relationship": "mechanism" + }, + { + "id": "authoritarian-takeover", + "type": "risk", + "relationship": "scenario" + }, + { + "id": "irreversibility", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E236", + "customFields": [ + { + "label": "Model Type", + "value": "Path Dependence / Threshold Analysis" + }, + { + "label": "Target Risk", + "value": "Lock-in" + }, + { + "label": "Core Insight", + "value": "Certain AI decisions create irreversible path dependencies faster than society can evaluate them" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "economic-disruption-model", + "title": "Economic Disruption Structural Model", + "description": "Macroeconomic analysis of AI-driven labor market transformations, examining displacement dynamics, inequality, and transition challenges.", + "tags": [ + "labor-economics", + "automation", + "inequality", + "structural-unemployment", + "structural-risks" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "economic-disruption", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "concentration-of-power", + "type": "risk", + "relationship": "consequence" + }, + { + "id": "erosion-of-agency", + "type": "risk", + "relationship": "related" + }, + { + "id": "winner-take-all", + "type": "risk", + "relationship": "mechanism" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E237", + "customFields": [ + { + "label": "Model Type", + "value": "Labor Economics / Macroeconomic Model" + }, + { + "label": "Target Risk", + "value": "Economic Disruption" + }, + { + "label": "Core Insight", + "value": "AI automation differs from previous transitions in scope, speed, and completeness of displacement" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "proliferation-model", + "title": "AI Capability Proliferation Model", + "description": "Diffusion dynamics and control challenges for advanced AI capabilities, analyzing spread mechanisms and governance interventions.", + "tags": [ + "proliferation", + "diffusion", + "compute-governance", + "open-source", + "structural-risks" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "proliferation", + "type": "risk", + "relationship": "analyzes" + }, + { + "id": "racing-dynamics", + "type": 
"risk", + "relationship": "related" + }, + { + "id": "multipolar-trap", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E238", + "customFields": [ + { + "label": "Model Type", + "value": "Diffusion Model / Information Economics" + }, + { + "label": "Target Risk", + "value": "Proliferation" + }, + { + "label": "Core Insight", + "value": "AI capabilities as information goods with near-zero marginal copying cost create unique containment challenges" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "risk-activation-timeline", + "title": "Risk Activation Timeline Model", + "description": "This model maps when risks become critical based on capability levels. Near-term risks activate at current capabilities; transformative risks require advanced autonomous systems.", + "tags": [ + "timeline", + "capability", + "risk-assessment", + "forecasting" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "capability-threshold-model", + "type": "model", + "relationship": "related" + }, + { + "id": "warning-signs-model", + "type": "model", + "relationship": "related" + }, + { + "id": "bioweapons-timeline", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E239", + "customFields": [ + { + "label": "Model Type", + "value": "Timeline Projection" + }, + { + "label": "Scope", + "value": "Cross-cutting (all risk categories)" + }, + { + "label": "Key Insight", + "value": "Risks activate at different times based on capability thresholds" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "capability-threshold-model", + "title": "Capability Threshold Model", + "description": "This model maps capability levels to risk activation thresholds. It identifies 15-25% benchmark performance as indicating early risk emergence, with 50% marking qualitative shift to complex autonomous execution.", + "tags": [ + "capability", + "threshold", + "risk-assessment", + "forecasting" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "risk-activation-timeline", + "type": "model", + "relationship": "related" + }, + { + "id": "warning-signs-model", + "type": "model", + "relationship": "related" + }, + { + "id": "scheming-likelihood-model", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E240", + "customFields": [ + { + "label": "Model Type", + "value": "Threshold Analysis" + }, + { + "label": "Scope", + "value": "Capability-risk mapping" + }, + { + "label": "Key Insight", + "value": "Many risks have threshold dynamics rather than gradual activation" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "warning-signs-model", + "title": "Warning Signs Model", + "description": "This model catalogs early indicators for detecting emerging AI risks. 
It prioritizes indicators by lead time, reliability, and actionability.", + "tags": [ + "monitoring", + "early-warning", + "tripwires", + "risk-assessment" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "risk-activation-timeline", + "type": "model", + "relationship": "related" + }, + { + "id": "capability-threshold-model", + "type": "model", + "relationship": "related" + }, + { + "id": "scheming-likelihood-model", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E241", + "customFields": [ + { + "label": "Model Type", + "value": "Monitoring Framework" + }, + { + "label": "Scope", + "value": "Early warning indicators" + }, + { + "label": "Key Insight", + "value": "Leading indicators enable proactive response before risks materialize" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "authoritarian-tools-diffusion", + "title": "Authoritarian Tools Diffusion Model", + "description": "This model analyzes how AI surveillance spreads to authoritarian regimes. It finds semiconductor supply chains are the highest-leverage intervention point, but this advantage will erode within 5-10 years as domestic chip manufacturing develops.", + "tags": [ + "diffusion", + "surveillance", + "authoritarianism", + "geopolitics" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "authoritarian-tools", + "type": "risk", + "relationship": "related" + }, + { + "id": "proliferation-risk-model", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E242", + "customFields": [ + { + "label": "Model Type", + "value": "Diffusion Analysis" + }, + { + "label": "Target Factor", + "value": "Authoritarian Tools" + }, + { + "label": "Key Insight", + "value": "Technology diffusion creates dual-use challenges with limited control points" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "consensus-manufacturing-dynamics", + "title": "Consensus Manufacturing Dynamics Model", + "description": "This model analyzes AI-enabled artificial consensus creation. It estimates 15-40% shifts in perceived opinion distribution are achievable, with 5-15% actual opinion shifts from sustained campaigns.", + "tags": [ + "manipulation", + "disinformation", + "public-opinion", + "social-media" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "consensus-manufacturing", + "type": "risk", + "relationship": "related" + }, + { + "id": "disinformation-detection-race", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E243", + "customFields": [ + { + "label": "Model Type", + "value": "Manipulation Analysis" + }, + { + "label": "Target Factor", + "value": "Consensus Manufacturing" + }, + { + "label": "Key Insight", + "value": "AI scales inauthentic consensus beyond detection capacity" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "irreversibility-threshold", + "title": "Irreversibility Threshold Model", + "description": "This model analyzes when AI decisions become permanently locked-in. 
It estimates 25% probability of crossing infeasible-reversal thresholds by 2035, with expected time to major threshold at 4-5 years.", + "tags": [ + "irreversibility", + "lock-in", + "decision-making", + "thresholds" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "irreversibility", + "type": "risk", + "relationship": "related" + }, + { + "id": "lock-in-model", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E244", + "customFields": [ + { + "label": "Model Type", + "value": "Threshold Analysis" + }, + { + "label": "Target Factor", + "value": "Irreversibility" + }, + { + "label": "Key Insight", + "value": "Reversal costs grow exponentially with time and lock-in depth" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "preference-manipulation-drift", + "title": "Preference Manipulation Drift Model", + "description": "This model analyzes gradual AI-driven preference shifts. It estimates 5-15% probability of significant harm from drift, with 20-40% reduction in preference diversity after 5 years of heavy use.", + "tags": [ + "autonomy", + "manipulation", + "preferences", + "behavioral-change" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "preference-manipulation", + "type": "risk", + "relationship": "related" + }, + { + "id": "sycophancy-feedback-loop", + "type": "model", + "relationship": "related" + }, + { + "id": "preference-authenticity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-agency", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E245", + "customFields": [ + { + "label": "Model Type", + "value": "Behavioral Dynamics" + }, + { + "label": "Target Factor", + "value": "Preference Manipulation" + }, + { + "label": "Key Insight", + "value": "Preference drift is gradual, cumulative, and often invisible to those experiencing it" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "trust-erosion-dynamics", + "title": "Trust Erosion Dynamics Model", + "description": "This model analyzes how AI systems erode institutional trust. It identifies authentication failure and expertise displacement as key mechanisms driving erosion.", + "tags": [ + "trust", + "institutions", + "social-cohesion", + "deepfakes" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "trust-decline", + "type": "risk", + "relationship": "related" + }, + { + "id": "trust-cascade-model", + "type": "model", + "relationship": "related" + }, + { + "id": "societal-trust", + "type": "parameter", + "relationship": "models" + }, + { + "id": "institutional-quality", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E246", + "customFields": [ + { + "label": "Model Type", + "value": "Trust Dynamics" + }, + { + "label": "Target Factor", + "value": "Trust Erosion" + }, + { + "label": "Key Insight", + "value": "Trust erodes faster than it builds, with 3-10x asymmetry in speed" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "automation-bias-cascade", + "title": "Automation Bias Cascade Model", + "description": "This model analyzes how AI over-reliance creates cascading failures. 
It estimates skill atrophy rates of 10-25%/year and projects that within 5 years, organizations may lose 50%+ of independent verification capability in AI-dependent domains.", + "tags": [ + "human-ai-interaction", + "cognitive-bias", + "system-dynamics" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "expertise-atrophy", + "type": "risk", + "relationship": "related" + }, + { + "id": "erosion-of-agency", + "type": "risk", + "relationship": "related" + }, + { + "id": "human-oversight-quality", + "type": "parameter", + "relationship": "models" + }, + { + "id": "human-expertise", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E247", + "customFields": [ + { + "label": "Model Type", + "value": "Cascade Analysis" + }, + { + "label": "Target Risk", + "value": "Automation Bias" + }, + { + "label": "Key Insight", + "value": "Human-AI calibration failures create self-reinforcing patterns of over-reliance" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "cyber-psychosis-cascade", + "title": "Cyber Psychosis Cascade Model", + "description": "This model analyzes AI-generated content triggering psychological harm cascades. It identifies 1-3% of population as highly vulnerable, with 5-10x increased susceptibility during reality-testing deficits.", + "tags": [ + "mental-health", + "synthetic-media", + "population-risk" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deepfakes", + "type": "risk", + "relationship": "related" + }, + { + "id": "disinformation", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E248", + "customFields": [ + { + "label": "Model Type", + "value": "Population Risk Model" + }, + { + "label": "Target Risk", + "value": "Mental Health Impacts" + }, + { + "label": "Key Insight", + "value": "AI-generated content can trigger cascading psychological effects in vulnerable populations" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "fraud-sophistication-curve", + "title": "Fraud Sophistication Curve Model", + "description": "This model analyzes AI-enabled fraud evolution. 
It finds AI-personalized attacks achieve 20-30% higher success rates, with technique diffusion time of 8-24 months and defense adaptation lagging by 12-36 months.", + "tags": [ + "fraud", + "crime", + "capability-progression" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deepfakes", + "type": "risk", + "relationship": "related" + }, + { + "id": "disinformation", + "type": "risk", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E249", + "customFields": [ + { + "label": "Model Type", + "value": "Capability Progression" + }, + { + "label": "Target Risk", + "value": "AI-Enabled Fraud" + }, + { + "label": "Key Insight", + "value": "AI democratizes sophisticated fraud techniques, shifting the capability curve" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "intervention-effectiveness-matrix", + "title": "Intervention Effectiveness Matrix", + "description": "Mapping AI safety interventions to the risks they mitigate, with effectiveness estimates and gap analysis", + "tags": [ + "interventions", + "effectiveness", + "prioritization" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E250", + "customFields": [ + { + "label": "Model Type", + "value": "Prioritization Framework" + }, + { + "label": "Scope", + "value": "All AI Safety Interventions" + }, + { + "label": "Key Insight", + "value": "Interventions vary dramatically in cost-effectiveness across dimensions" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "lab-incentives-model", + "title": "Lab Incentives Model", + "description": "This model analyzes competitive and reputational pressures on lab safety decisions. It identifies conditions where market dynamics systematically underweight safety investment.", + "tags": [ + "racing-dynamics", + "incentives", + "labs" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "related" + }, + { + "id": "multipolar-trap", + "type": "risk", + "relationship": "related" + }, + { + "id": "safety-culture-strength", + "type": "parameter", + "relationship": "models" + }, + { + "id": "racing-intensity", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E251", + "customFields": [ + { + "label": "Model Type", + "value": "Incentive Analysis" + }, + { + "label": "Target Actor", + "value": "Frontier AI Labs" + }, + { + "label": "Key Insight", + "value": "Lab incentives systematically diverge from social optimum under competition" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "risk-interaction-matrix", + "title": "Risk Interaction Matrix", + "description": "This model analyzes how risks amplify, mitigate, or transform each other. 
It identifies 15-25% of risk pairs as strongly interacting, with compounding effects dominating.", + "tags": [ + "risk-interactions", + "compounding-risks", + "systems-thinking" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E252", + "customFields": [ + { + "label": "Model Type", + "value": "Interaction Framework" + }, + { + "label": "Scope", + "value": "Cross-risk Analysis" + }, + { + "label": "Key Insight", + "value": "Risks rarely occur in isolation; interactions can amplify or mitigate effects" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "safety-research-value", + "title": "Safety Research Value Model", + "description": "This model estimates marginal returns on safety research investment. It finds current funding levels significantly below optimal, with 2-5x returns available in neglected areas.", + "tags": [ + "cost-effectiveness", + "research-priorities", + "expected-value" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E253", + "customFields": [ + { + "label": "Model Type", + "value": "Cost-Effectiveness Analysis" + }, + { + "label": "Scope", + "value": "Safety Research ROI" + }, + { + "label": "Key Insight", + "value": "Safety research value depends critically on timing relative to capability progress" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "capabilities-to-safety-pipeline", + "title": "Capabilities-to-Safety Pipeline Model", + "description": "This model analyzes researcher transitions from capabilities to safety work. It finds only 10-15% of aware researchers consider switching, with 60-75% blocked by barriers at the consideration-to-action stage.", + "tags": [ + "talent", + "field-building", + "career-transitions" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "safety-researcher-gap", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E254", + "customFields": [ + { + "label": "Model Type", + "value": "Talent Pipeline Analysis" + }, + { + "label": "Target Factor", + "value": "Safety Researcher Supply" + }, + { + "label": "Key Insight", + "value": "Capabilities researchers are the primary talent pool for safety work" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "compounding-risks-analysis", + "title": "Compounding Risks Analysis Model", + "description": "This model analyzes how risks compound beyond additive effects. 
Key combinations include racing+concentration (40-60% coverage needed) and mesa-optimization+scheming (2-6% catastrophic probability).", + "tags": [ + "risk-interactions", + "compounding-effects", + "systems-thinking" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "risk-interaction-matrix", + "type": "model", + "relationship": "related" + }, + { + "id": "risk-cascade-pathways", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E255", + "customFields": [ + { + "label": "Model Type", + "value": "Systems Analysis" + }, + { + "label": "Scope", + "value": "Multi-Risk Interactions" + }, + { + "label": "Key Insight", + "value": "Combined risks often exceed the sum of individual risks due to non-linear interactions" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "defense-in-depth-model", + "title": "Defense in Depth Model", + "description": "This model analyzes how layered safety measures combine. Individual layers provide 20-60% coverage; independence between layers is critical for compound effectiveness.", + "tags": [ + "defense", + "security", + "layered-approach" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "societal-resilience", + "type": "parameter", + "relationship": "models" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E256", + "customFields": [ + { + "label": "Model Type", + "value": "Defense Framework" + }, + { + "label": "Scope", + "value": "Layered Safety Architecture" + }, + { + "label": "Key Insight", + "value": "Multiple independent safety layers provide robustness against single-point failures" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "institutional-adaptation-speed", + "title": "Institutional Adaptation Speed Model", + "description": "This model analyzes institutional adaptation rates to AI. It finds institutions change at 10-30% of needed rate per year while AI creates 50-200% annual gaps, with regulatory lag historically spanning 15-70 years.", + "tags": [ + "institutions", + "adaptation", + "governance-gap" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "related" + }, + { + "id": "regulatory-capacity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "institutional-quality", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E257", + "customFields": [ + { + "label": "Model Type", + "value": "Adaptation Dynamics" + }, + { + "label": "Target Factor", + "value": "Governance Gap" + }, + { + "label": "Key Insight", + "value": "Institutional adaptation typically lags technology by 5-15 years, creating persistent governance gaps" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "international-coordination-game", + "title": "International Coordination Game Model", + "description": "This model analyzes game-theoretic dynamics of international AI governance. 
It identifies key equilibria in US-China competition and potential cooperation pathways through safety agreements.", + "tags": [ + "game-theory", + "international-coordination", + "governance" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "multipolar-trap", + "type": "risk", + "relationship": "related" + }, + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "related" + }, + { + "id": "international-coordination", + "type": "parameter", + "relationship": "models" + }, + { + "id": "coordination-capacity", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "ai-control-concentration", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E258", + "customFields": [ + { + "label": "Model Type", + "value": "Game Theory" + }, + { + "label": "Scope", + "value": "International Governance" + }, + { + "label": "Key Insight", + "value": "International AI coordination faces prisoner's dilemma dynamics with verification challenges" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "media-policy-feedback-loop", + "title": "Media-Policy Feedback Loop Model", + "description": "This model analyzes cycles between media coverage, public opinion, and AI policy. It finds media framing significantly shapes policy windows, with 6-18 month lag between coverage spikes and regulatory response.", + "tags": [ + "media", + "policy", + "feedback-loops" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E259", + "customFields": [ + { + "label": "Model Type", + "value": "Feedback Loop Analysis" + }, + { + "label": "Target Factor", + "value": "Media-Policy Dynamics" + }, + { + "label": "Key Insight", + "value": "Media coverage and policy responses create reinforcing cycles that can accelerate or delay governance" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "post-incident-recovery", + "title": "Post-Incident Recovery Model", + "description": "This model analyzes recovery pathways from AI incidents. It finds clear attribution enables 3-5x faster recovery, and recommends 5-10% of safety resources for recovery capacity, particularly trust and skill preservation.", + "tags": [ + "incidents", + "recovery", + "resilience" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E260", + "customFields": [ + { + "label": "Model Type", + "value": "Recovery Dynamics" + }, + { + "label": "Scope", + "value": "Incident Response" + }, + { + "label": "Key Insight", + "value": "Recovery time and completeness depend on incident severity, preparedness, and system design" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "public-opinion-evolution", + "title": "Public Opinion Evolution Model", + "description": "This model analyzes how public AI risk perception evolves. 
It finds major incidents shift opinion by 10-25 percentage points, decaying with 6-12 month half-life.", + "tags": [ + "public-opinion", + "attitudes", + "social-dynamics" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "media-policy-feedback-loop", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E261", + "customFields": [ + { + "label": "Model Type", + "value": "Attitude Dynamics" + }, + { + "label": "Target Factor", + "value": "Public Perception" + }, + { + "label": "Key Insight", + "value": "Public opinion on AI risk follows event-driven cycles with gradual baseline shifts" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "risk-cascade-pathways", + "title": "Risk Cascade Pathways Model", + "description": "This model maps common pathways where one risk triggers others. Key cascades include racing→corner-cutting→incident→regulation-capture and epistemic→trust→coordination-failure.", + "tags": [ + "cascades", + "risk-pathways", + "systems-thinking" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compounding-risks-analysis", + "type": "model", + "relationship": "related" + }, + { + "id": "risk-interaction-network", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E262", + "customFields": [ + { + "label": "Model Type", + "value": "Cascade Mapping" + }, + { + "label": "Scope", + "value": "Risk Propagation" + }, + { + "label": "Key Insight", + "value": "Risks propagate through system interdependencies, often in non-obvious paths" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "risk-interaction-network", + "title": "Risk Interaction Network Model", + "description": "This model maps how risks enable and reinforce each other. It identifies racing dynamics and concentration of power as central hub risks affecting most others.", + "tags": [ + "networks", + "risk-interactions", + "systems-thinking" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "risk-cascade-pathways", + "type": "model", + "relationship": "related" + }, + { + "id": "compounding-risks-analysis", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E263", + "customFields": [ + { + "label": "Model Type", + "value": "Network Analysis" + }, + { + "label": "Scope", + "value": "Risk Dependencies" + }, + { + "label": "Key Insight", + "value": "Risk network structure reveals critical nodes and amplification pathways" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "safety-capability-tradeoff", + "title": "Safety-Capability Tradeoff Model", + "description": "This model analyzes when safety measures conflict with capabilities. 
It finds most safety interventions impose 5-15% capability cost, with some achieving safety gains at lower cost.", + "tags": [ + "tradeoffs", + "safety", + "capabilities" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk", + "relationship": "related" + }, + { + "id": "safety-capability-gap", + "type": "parameter", + "relationship": "models" + }, + { + "id": "alignment-robustness", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E264", + "customFields": [ + { + "label": "Model Type", + "value": "Tradeoff Analysis" + }, + { + "label": "Scope", + "value": "Safety vs Capability" + }, + { + "label": "Key Insight", + "value": "Some safety measures reduce capabilities while others are complementary; distinguishing is crucial" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "safety-research-allocation", + "title": "Safety Research Allocation Model", + "description": "This model analyzes safety research resource distribution. It identifies neglected areas including multi-agent dynamics and corrigibility, with 3-5x funding gaps vs core alignment.", + "tags": [ + "resource-allocation", + "research-priorities", + "optimization" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "safety-research-value", + "type": "model", + "relationship": "related" + }, + { + "id": "intervention-effectiveness-matrix", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E265", + "customFields": [ + { + "label": "Model Type", + "value": "Resource Optimization" + }, + { + "label": "Scope", + "value": "Research Prioritization" + }, + { + "label": "Key Insight", + "value": "Optimal allocation depends on problem tractability, neglectedness, and time-sensitivity" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "safety-researcher-gap", + "title": "Safety Researcher Gap Model", + "description": "This model analyzes mismatch between safety researcher supply and demand. It estimates 3-10x gap between needed researchers and current pipeline capacity.", + "tags": [ + "talent", + "field-building", + "supply-demand" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "capabilities-to-safety-pipeline", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E266", + "customFields": [ + { + "label": "Model Type", + "value": "Supply-Demand Analysis" + }, + { + "label": "Target Factor", + "value": "Safety Talent" + }, + { + "label": "Key Insight", + "value": "Safety researcher demand is growing faster than supply, creating widening gaps" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "whistleblower-dynamics", + "title": "Whistleblower Dynamics Model", + "description": "This model analyzes information flow from AI insiders to the public. 
It estimates significant barriers reduce whistleblowing by 70-90% compared to optimal transparency.", + "tags": [ + "whistleblowing", + "incentives", + "transparency" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "lab-incentives-model", + "type": "model", + "relationship": "related" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E267", + "customFields": [ + { + "label": "Model Type", + "value": "Incentive Analysis" + }, + { + "label": "Target Factor", + "value": "Transparency Mechanisms" + }, + { + "label": "Key Insight", + "value": "Current incentive structures strongly discourage whistleblowing, creating information asymmetries" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "parameter-interaction-network", + "title": "Parameter Interaction Network Model", + "description": "This model maps causal relationships between 22 key AI safety parameters. It identifies 7 feedback loops and 4 critical dependency clusters, showing that epistemic-health and institutional-quality are highest-leverage intervention points.", + "tags": [ + "networks", + "parameters", + "systems-thinking", + "feedback-loops" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "risk-interaction-network", + "type": "model", + "relationship": "related" + }, + { + "id": "epistemic-health", + "type": "parameter", + "relationship": "models" + }, + { + "id": "institutional-quality", + "type": "parameter", + "relationship": "models" + }, + { + "id": "societal-trust", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "racing-intensity", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E268", + "customFields": [ + { + "label": "Model Type", + "value": "Network Analysis" + }, + { + "label": "Scope", + "value": "Parameter Dependencies" + }, + { + "label": "Key Insight", + "value": "Epistemic and institutional parameters have highest downstream influence; interventions should target network hubs" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "safety-culture-equilibrium", + "title": "Safety Culture Equilibrium Model", + "description": "This model analyzes stable states for AI lab safety culture under competitive pressure. It identifies three equilibria and transition conditions requiring coordinated commitment or major incident.", + "tags": [ + "equilibrium", + "safety-culture", + "game-theory", + "lab-behavior" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "lab-incentives-model", + "type": "model", + "relationship": "related" + }, + { + "id": "racing-dynamics-model", + "type": "model", + "relationship": "related" + }, + { + "id": "safety-culture-strength", + "type": "parameter", + "relationship": "models" + }, + { + "id": "racing-intensity", + "type": "parameter", + "relationship": "models" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E269", + "customFields": [ + { + "label": "Model Type", + "value": "Game-Theoretic Analysis" + }, + { + "label": "Scope", + "value": "Lab Behavior Dynamics" + }, + { + "label": "Key Insight", + "value": "Current industry sits in racing-dominant equilibrium; transition to safety-competitive requires coordination or forcing event" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "regulatory-capacity-threshold", + "title": "Regulatory Capacity Threshold Model", + "description": "This model estimates minimum regulatory capacity for credible AI oversight. 
It finds current US/UK capacity at 0.15-0.25 of the 0.4-0.6 threshold needed, with a 3-5 year window to build capacity.", + "tags": [ + "governance", + "regulation", + "thresholds", + "capacity-building" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "institutional-adaptation-speed", + "type": "model", + "relationship": "related" + }, + { + "id": "regulatory-capacity", + "type": "parameter", + "relationship": "models" + }, + { + "id": "institutional-quality", + "type": "parameter", + "relationship": "models" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E270", + "customFields": [ + { + "label": "Model Type", + "value": "Threshold Analysis" + }, + { + "label": "Scope", + "value": "Regulatory Effectiveness" + }, + { + "label": "Key Insight", + "value": "Gap between regulatory capacity and industry capability is widening; crisis-level investment needed" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "alignment-robustness-trajectory", + "title": "Alignment Robustness Trajectory Model", + "description": "This model analyzes how alignment robustness changes with capability scaling. It estimates current techniques maintain 60-80% robustness at GPT-4 level but projects degradation to 30-50% at 100x capability.", + "tags": [ + "alignment", + "scaling", + "trajectories", + "robustness" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deceptive-alignment-decomposition", + "type": "model", + "relationship": "related" + }, + { + "id": "safety-capability-tradeoff", + "type": "model", + "relationship": "related" + }, + { + "id": "alignment-robustness", + "type": "parameter", + "relationship": "models" + }, + { + "id": "safety-capability-gap", + "type": "parameter", + "relationship": "affects" + }, + { + "id": "human-oversight-quality", + "type": "parameter", + "relationship": "affects" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E271", + "customFields": [ + { + "label": "Model Type", + "value": "Trajectory Analysis" + }, + { + "label": "Scope", + "value": "Alignment Scaling" + }, + { + "label": "Key Insight", + "value": "Critical zone at 10-30x current capability where techniques become insufficient; alignment valley problem" + } + ], + "relatedTopics": [], + "entityType": "model" + }, + { + "id": "anthropic", + "title": "Anthropic", + "description": "Anthropic is an AI safety company founded in January 2021 by former OpenAI researchers, including siblings Dario and Daniela Amodei. 
The company was created following disagreements with OpenAI's direction, particularly concerns about the pace of commercialization and the shift toward Microsoft partnership.", + "tags": [ + "constitutional-ai", + "rlhf", + "interpretability", + "responsible-scaling", + "claude", + "frontier-ai", + "scalable-oversight", + "ai-safety", + "racing-dynamics" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "chris-olah", + "type": "researcher" + }, + { + "id": "jan-leike", + "type": "researcher" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "interpretability", + "type": "safety-approaches" + }, + { + "id": "scalable-oversight", + "type": "safety-approaches" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "racing-dynamics", + "type": "risk" + } + ], + "sources": [ + { + "title": "Anthropic Company Website", + "url": "https://anthropic.com" + }, + { + "title": "Core Views on AI Safety", + "url": "https://anthropic.com/news/core-views-on-ai-safety" + }, + { + "title": "Responsible Scaling Policy", + "url": "https://anthropic.com/news/anthropics-responsible-scaling-policy" + }, + { + "title": "Constitutional AI Paper", + "url": "https://arxiv.org/abs/2212.08073" + }, + { + "title": "Scaling Monosemanticity", + "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/" + }, + { + "title": "Sleeper Agents Paper", + "url": "https://arxiv.org/abs/2401.05566" + }, + { + "title": "Many-Shot Jailbreaking Paper", + "url": "https://www-cdn.anthropic.com/af5633c94ed2beb282f6a53c595eb437e8e7b630/Many_Shot_Jailbreaking__2024_04_02_0936.pdf" + }, + { + "title": "Machines of Loving Grace (Dario Amodei essay)", + "url": "https://darioamodei.com/machines-of-loving-grace" + }, + { + "title": "Anthropic Funding News (Crunchbase)", + "url": "https://www.crunchbase.com/organization/anthropic" + }, + { + "title": "Amazon Anthropic Partnership", + "url": "https://press.aboutamazon.com/2023/9/amazon-and-anthropic-announce-strategic-collaboration" + }, + { + "title": "Google Anthropic Investment", + "url": "https://blog.google/technology/ai/google-anthropic-investment/" + } + ], + "lastUpdated": "2025-12", + "website": "https://anthropic.com", + "numericId": "E272", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "generic", + "founded": "2021", + "headquarters": "San Francisco, CA", + "employees": "~1000", + "funding": "$7B+" + }, + { + "id": "deepmind", + "title": "Google DeepMind", + "description": "Google DeepMind was formed in April 2023 from the merger of DeepMind and Google Brain, uniting Google's two major AI research organizations. 
The combined entity represents one of the world's most formidable AI research labs, with landmark achievements including AlphaGo (defeating world champions at Go), AlphaFold (solving protein folding), and G...", + "tags": [ + "gemini", + "alphafold", + "alphago", + "rlhf", + "agi", + "frontier-ai", + "google", + "scientific-ai-applications", + "frontier-safety-framework", + "reward-modeling", + "scalable-oversight" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "demis-hassabis", + "type": "researcher" + }, + { + "id": "shane-legg", + "type": "researcher" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "scalable-oversight", + "type": "safety-approaches" + }, + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "concentration-of-power", + "type": "risk" + } + ], + "sources": [ + { + "title": "Google DeepMind Website", + "url": "https://deepmind.google" + }, + { + "title": "AlphaGo Documentary", + "url": "https://www.youtube.com/watch?v=WXuK6gekU1Y" + }, + { + "title": "AlphaFold Protein Structure Database", + "url": "https://alphafold.ebi.ac.uk" + }, + { + "title": "AlphaFold Nature Paper", + "url": "https://www.nature.com/articles/s41586-021-03819-2" + }, + { + "title": "Frontier Safety Framework", + "url": "https://deepmind.google/discover/blog/introducing-the-frontier-safety-framework/" + }, + { + "title": "AI Safety Gridworlds", + "url": "https://arxiv.org/abs/1711.09883" + }, + { + "title": "Specification Gaming Examples", + "url": "https://deepmind.google/discover/blog/specification-gaming-the-flip-side-of-ai-ingenuity/" + }, + { + "title": "DeepMind Safety Research", + "url": "https://deepmind.google/discover/blog/building-safe-artificial-intelligence-insights-from-deepmind/" + }, + { + "title": "Gemini Technical Report", + "url": "https://arxiv.org/abs/2312.11805" + }, + { + "title": "Google DeepMind Merger Announcement", + "url": "https://blog.google/technology/ai/april-ai-update/" + }, + { + "title": "GraphCast Weather Prediction", + "url": "https://deepmind.google/discover/blog/graphcast-ai-model-for-faster-and-more-accurate-global-weather-forecasting/" + }, + { + "title": "Nobel Prize in Chemistry 2024", + "url": "https://www.nobelprize.org/prizes/chemistry/2024/press-release/" + } + ], + "lastUpdated": "2025-12", + "website": "https://deepmind.google", + "numericId": "E273", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "generic", + "founded": "2010", + "headquarters": "London, UK", + "employees": "~2000", + "funding": "Google subsidiary" + }, + { + "id": "openai", + "title": "OpenAI", + "description": "OpenAI is the AI research company that brought large language models into mainstream consciousness through ChatGPT. 
Founded in December 2015 as a non-profit with the mission to ensure artificial general intelligence benefits all of humanity, OpenAI has undergone dramatic evolution - from non-profit to \"capped-profit,\" from research lab to produc...", + "tags": [ + "gpt-4", + "chatgpt", + "rlhf", + "preparedness", + "agi", + "frontier-ai", + "o1", + "reasoning-models", + "microsoft", + "governance", + "racing-dynamics", + "alignment-research" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "sam-altman", + "type": "researcher" + }, + { + "id": "ilya-sutskever", + "type": "researcher" + }, + { + "id": "jan-leike", + "type": "researcher" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "interpretability", + "type": "safety-approaches" + }, + { + "id": "scalable-oversight", + "type": "safety-approaches" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + } + ], + "sources": [ + { + "title": "OpenAI Website", + "url": "https://openai.com" + }, + { + "title": "OpenAI Charter", + "url": "https://openai.com/charter" + }, + { + "title": "GPT-4 System Card", + "url": "https://cdn.openai.com/papers/gpt-4-system-card.pdf" + }, + { + "title": "InstructGPT Paper", + "url": "https://arxiv.org/abs/2203.02155" + }, + { + "title": "Preparedness Framework", + "url": "https://openai.com/safety/preparedness" + }, + { + "title": "Weak-to-Strong Generalization", + "url": "https://arxiv.org/abs/2312.09390" + }, + { + "title": "Jan Leike Resignation Statement", + "url": "https://twitter.com/janleike/status/1791498184887095344" + }, + { + "title": "November 2023 Governance Crisis (reporting)", + "url": "https://www.theverge.com/2023/11/17/23965982/openai-ceo-sam-altman-fired" + }, + { + "title": "Microsoft OpenAI Partnership", + "url": "https://blogs.microsoft.com/blog/2023/01/23/microsoftandopenaiextendpartnership/" + }, + { + "title": "o1 System Card", + "url": "https://openai.com/index/openai-o1-system-card/" + }, + { + "title": "OpenAI Funding History (Crunchbase)", + "url": "https://www.crunchbase.com/organization/openai" + } + ], + "lastUpdated": "2025-12", + "website": "https://openai.com", + "numericId": "E274", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "generic", + "founded": "2015", + "headquarters": "San Francisco, CA", + "employees": "~1500", + "funding": "$13B+ from Microsoft" + }, + { + "id": "xai", + "title": "xAI", + "description": "xAI is an artificial intelligence company founded by Elon Musk in July 2023 with the stated mission to \"understand the true nature of the universe\" through AI.", + "tags": [ + "grok", + "elon-musk", + "x-integration", + "truth-seeking-ai", + "content-moderation", + "free-speech", + "ai-safety-philosophy", + "racing-dynamics", + "frontier-ai", + "agi-development" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "elon-musk", + "type": "researcher" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "content-moderation", + "type": "concepts" + }, + { + "id": "agi-race", + "type": "concepts" + } + ], + "sources": [ + { + "title": "xAI Website", + "url": "https://x.ai" + }, + { + "title": "Grok Announcements", + "url": "https://x.ai/blog" + }, + { + "title": "Elon Musk on X (Twitter)", + "url": "https://twitter.com/elonmusk" + }, + { + "title": "xAI Funding Announcements" + }, + { + "title": "Grok Technical Details", + 
"url": "https://x.ai/blog/grok" + } + ], + "lastUpdated": "2025-12", + "website": "https://x.ai", + "numericId": "E275", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "generic" + }, + { + "id": "chai", + "title": "Center for Human-Compatible AI", + "description": "The Center for Human-Compatible AI (CHAI) is an academic research center at UC Berkeley focused on ensuring AI systems are beneficial to humans. Founded by Stuart Russell, author of the leading AI textbook, CHAI brings academic rigor to AI safety research.", + "tags": [ + "inverse-reinforcement-learning", + "value-learning", + "assistance-games", + "human-compatible-ai", + "academic-ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "value-learning", + "type": "safety-agenda" + }, + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "corrigibility", + "type": "safety-agenda" + } + ], + "sources": [ + { + "title": "CHAI Website", + "url": "https://humancompatible.ai" + }, + { + "title": "Human Compatible (Book)", + "url": "https://www.penguinrandomhouse.com/books/566677/human-compatible-by-stuart-russell/" + }, + { + "title": "Stuart Russell on AI Risk", + "url": "https://www.youtube.com/watch?v=EBK-a94IFHY" + } + ], + "lastUpdated": "2025-12", + "website": "https://humancompatible.ai", + "numericId": "E276", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "academic", + "founded": "2016", + "headquarters": "Berkeley, CA" + }, + { + "id": "apollo-research", + "title": "Apollo Research", + "description": "Apollo Research is an AI safety research organization founded in 2022 with a specific focus on one of the most concerning potential failure modes: deceptive alignment and scheming behavior in advanced AI systems.", + "tags": [ + "deception", + "scheming", + "sandbagging", + "evaluations", + "situational-awareness", + "strategic-deception", + "red-teaming", + "alignment-failures", + "dangerous-capabilities", + "model-organisms", + "adversarial-testing" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "sandbagging", + "type": "risk" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "arc", + "type": "organization" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "uk-aisi", + "type": "organization" + }, + { + "id": "situational-awareness", + "type": "risk" + }, + { + "id": "capability-evaluations", + "type": "safety-approaches" + } + ], + "sources": [ + { + "title": "Apollo Research Website", + "url": "https://www.apolloresearch.ai" + }, + { + "title": "Apollo Research Publications", + "url": "https://www.apolloresearch.ai/research" + }, + { + "title": "Evaluating Frontier Models for Dangerous Capabilities", + "url": "https://www.apolloresearch.ai/research/scheming-evaluations" + }, + { + "title": "Apollo on Sandbagging", + "url": "https://www.apolloresearch.ai/blog/sandbagging" + }, + { + "title": "Situational Awareness Research", + "url": "https://www.apolloresearch.ai/research/situational-awareness" + }, + { + "title": "Apollo Research Blog", + "url": "https://www.apolloresearch.ai/blog" + } + ], + "lastUpdated": "2025-12", + "website": "https://www.apolloresearch.ai", + "numericId": "E277", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org" + }, + { + "id": "cais", + "title": "Center for AI Safety", + "description": "The Center for AI Safety (CAIS) is a nonprofit 
organization that works to reduce societal-scale risks from AI. CAIS combines research, field-building, and public communication to advance AI safety.", + "tags": [ + "ai-safety", + "x-risk", + "representation-engineering", + "field-building", + "ai-risk-communication" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "existential-risk", + "type": "risk" + }, + { + "id": "power-seeking", + "type": "risk" + }, + { + "id": "anthropic", + "type": "lab" + } + ], + "sources": [ + { + "title": "CAIS Website", + "url": "https://safe.ai" + }, + { + "title": "Statement on AI Risk", + "url": "https://www.safe.ai/statement-on-ai-risk" + }, + { + "title": "Representation Engineering Paper", + "url": "https://arxiv.org/abs/2310.01405" + } + ], + "lastUpdated": "2025-12", + "website": "https://safe.ai", + "numericId": "E278", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org", + "founded": "2022", + "headquarters": "San Francisco, CA" + }, + { + "id": "conjecture", + "title": "Conjecture", + "description": "Conjecture is an AI safety research organization founded in 2022 by Connor Leahy and a team of researchers concerned about existential risks from advanced AI.", + "tags": [ + "cognitive-emulation", + "coem", + "interpretability", + "neural-network-internals", + "circuit-analysis", + "model-organisms", + "eleutherai", + "european-ai-safety", + "alternative-paradigms" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "connor-leahy", + "type": "researcher" + }, + { + "id": "interpretability", + "type": "safety-approaches" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "redwood", + "type": "organization" + }, + { + "id": "prosaic-alignment", + "type": "safety-approaches" + }, + { + "id": "uk-aisi", + "type": "organization" + } + ], + "sources": [ + { + "title": "Conjecture Website", + "url": "https://conjecture.dev" + }, + { + "title": "Connor Leahy Twitter/X", + "url": "https://twitter.com/NPCollapse" + }, + { + "title": "EleutherAI Background", + "url": "https://www.eleuther.ai" + }, + { + "title": "Conjecture Funding Announcement", + "url": "https://techcrunch.com/2023/03/28/conjecture-raises-funding-for-ai-safety/" + }, + { + "title": "Cognitive Emulation Research", + "url": "https://conjecture.dev/research" + }, + { + "title": "Connor Leahy Podcast Appearances" + } + ], + "lastUpdated": "2025-12", + "website": "https://conjecture.dev", + "numericId": "E279", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org", + "founded": "2022", + "headquarters": "London, UK" + }, + { + "id": "far-ai", + "title": "FAR AI", + "description": "FAR AI is an AI safety research organization founded in 2022 with a focus on adversarial robustness, model evaluation, and alignment research. 
The organization was founded by Adam Gleave, a machine learning researcher known for his work on adversarial policies in deep reinforcement learning.", + "tags": [ + "adversarial-robustness", + "ml-safety", + "benchmarking", + "natural-abstractions", + "evaluation", + "mmlu", + "out-of-distribution-detection", + "safety-evaluations", + "empirical-research", + "academic-ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "dan-hendrycks", + "type": "researcher" + }, + { + "id": "adversarial-robustness", + "type": "safety-approaches" + }, + { + "id": "natural-abstractions", + "type": "concepts" + }, + { + "id": "benchmarking", + "type": "safety-approaches" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "apollo-research", + "type": "organization" + } + ], + "sources": [ + { + "title": "FAR AI Website", + "url": "https://far.ai" + }, + { + "title": "Dan Hendrycks Google Scholar", + "url": "https://scholar.google.com/citations?user=VUnTdTkAAAAJ" + }, + { + "title": "MMLU Paper", + "url": "https://arxiv.org/abs/2009.03300" + }, + { + "title": "Natural Abstractions Research", + "url": "https://www.alignmentforum.org/tag/natural-abstraction" + }, + { + "title": "Dan Hendrycks on X-risk", + "url": "https://arxiv.org/abs/2306.12001" + } + ], + "lastUpdated": "2025-12", + "website": "https://far.ai", + "numericId": "E280", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org" + }, + { + "id": "govai", + "title": "GovAI", + "description": "The Centre for the Governance of AI (GovAI) is a research organization focused on AI policy and governance. Originally part of the Future of Humanity Institute at Oxford, GovAI became an independent organization in 2021; FHI itself closed in 2024.", + "tags": [ + "governance", + "compute-governance", + "international", + "regulation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "international-coordination", + "type": "policy" + }, + { + "id": "deepmind", + "type": "lab" + } + ], + "sources": [ + { + "title": "GovAI Website", + "url": "https://governance.ai" + }, + { + "title": "Computing Power and AI Governance", + "url": "https://governance.ai/compute" + }, + { + "title": "GovAI Research Papers", + "url": "https://governance.ai/research" + } + ], + "lastUpdated": "2025-12", + "website": "https://governance.ai", + "numericId": "E281", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org" + }, + { + "id": "metr", + "title": "METR", + "description": "METR (Model Evaluation and Threat Research), formerly known as ARC Evals, is an organization dedicated to evaluating frontier AI models for dangerous capabilities before deployment.", + "tags": [ + "evaluations", + "dangerous-capabilities", + "autonomous-replication", + "cybersecurity", + "cbrn", + "bio-risk", + "red-teaming", + "capability-elicitation", + "deployment-decisions", + "pre-deployment-testing", + "safety-thresholds", + "responsible-scaling", + "preparedness-framework" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "beth-barnes", + "type": "researcher" + }, + { + "id": "paul-christiano", + "type": "researcher" + }, + { + "id": "arc", + "type": "organization" + }, + { + "id": "apollo-research", + "type": "organization" + }, + { + "id": "autonomous-replication", + "type": "risk" + }, + { + "id": "cyber-offense", + "type": "risk" + }, + { + "id": "bio-risk", + "type": "risk" + }, + { + "id": "anthropic", + "type": "organization" + }, + { 
+ "id": "openai", + "type": "organization" + }, + { + "id": "uk-aisi", + "type": "organization" + } + ], + "sources": [ + { + "title": "METR Website", + "url": "https://metr.org" + }, + { + "title": "METR Evaluations", + "url": "https://metr.org/evaluations" + }, + { + "title": "GPT-4 System Card (ARC Evals section)", + "url": "https://cdn.openai.com/papers/gpt-4-system-card.pdf" + }, + { + "title": "OpenAI Preparedness Framework", + "url": "https://openai.com/safety/preparedness" + }, + { + "title": "Anthropic Responsible Scaling Policy", + "url": "https://anthropic.com/news/anthropics-responsible-scaling-policy" + }, + { + "title": "Beth Barnes on Twitter/X", + "url": "https://twitter.com/beth_from_ba" + }, + { + "title": "METR Research and Blog", + "url": "https://metr.org/blog" + } + ], + "lastUpdated": "2025-12", + "website": "https://metr.org", + "numericId": "E282", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org" + }, + { + "id": "arc", + "title": "Alignment Research Center", + "description": "The Alignment Research Center (ARC) was founded in 2021 by Paul Christiano after his departure from OpenAI. ARC represents a distinctive approach to AI alignment: combining theoretical research on fundamental problems (like Eliciting Latent Knowledge) with practical evaluations of frontier models for dangerous capabilities.", + "tags": [ + "eliciting-latent-knowledge", + "elk", + "evaluations", + "scalable-oversight", + "ai-evals", + "deception", + "worst-case-alignment", + "debate", + "amplification", + "adversarial-testing", + "autonomous-replication", + "sandbagging" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "paul-christiano", + "type": "researcher" + }, + { + "id": "scalable-oversight", + "type": "safety-approaches" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "sandbagging", + "type": "risk" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "miri", + "type": "organization" + }, + { + "id": "uk-aisi", + "type": "policies" + } + ], + "sources": [ + { + "title": "ARC Website", + "url": "https://alignment.org" + }, + { + "title": "ELK Report", + "url": "https://docs.google.com/document/d/1WwsnJQstPq91_Yh-Ch2XRL8H_EpsnjrC1dwZXR37PC8/" + }, + { + "title": "ARC Evals", + "url": "https://evals.alignment.org" + }, + { + "title": "GPT-4 Evaluation (ARC summary)", + "url": "https://evals.alignment.org/blog/2023-03-18-update-on-recent-evals/" + }, + { + "title": "Paul Christiano's AI Alignment Forum posts", + "url": "https://www.alignmentforum.org/users/paulfchristiano" + }, + { + "title": "Iterated Amplification", + "url": "https://ai-alignment.com/iterated-distillation-and-amplification-157debfd1616" + }, + { + "title": "AI Safety via Debate", + "url": "https://arxiv.org/abs/1805.00899" + }, + { + "title": "Ajeya Cotra's Bio Anchors", + "url": "https://www.alignmentforum.org/posts/KrJfoZzpSDpnrv9va/draft-report-on-ai-timelines" + } + ], + "lastUpdated": "2025-12", + "website": "https://alignment.org", + "numericId": "E283", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org", + "founded": "2021", + "headquarters": "Berkeley, CA", + "employees": "~20", + "funding": "~$10M/year" + }, + { + "id": "epoch-ai", + "title": "Epoch AI", + "description": "Epoch AI is a research organization dedicated to producing rigorous, data-driven forecasts and analysis about artificial intelligence 
progress, with particular focus on compute trends, training datasets, algorithmic efficiency, and AI timelines.", + "tags": [ + "ai-forecasting", + "compute-trends", + "training-datasets", + "algorithmic-progress", + "ai-timelines", + "transformative-ai", + "compute-governance", + "parameter-counts", + "scaling", + "data-constraints", + "empirical-analysis", + "trend-extrapolation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policies" + }, + { + "id": "transformative-ai", + "type": "concepts" + }, + { + "id": "scaling-laws", + "type": "concepts" + }, + { + "id": "ai-timelines", + "type": "concepts" + }, + { + "id": "data-constraints", + "type": "concepts" + } + ], + "sources": [ + { + "title": "Epoch AI Website", + "url": "https://epochai.org" + }, + { + "title": "Epoch Parameter Database", + "url": "https://epochai.org/data/epochdb/visualization" + }, + { + "title": "Compute Trends Paper", + "url": "https://epochai.org/blog/compute-trends" + }, + { + "title": "Will We Run Out of Data?", + "url": "https://epochai.org/blog/will-we-run-out-of-data" + }, + { + "title": "Algorithmic Progress Research", + "url": "https://epochai.org/blog/revisiting-algorithmic-progress" + }, + { + "title": "Epoch Research Blog", + "url": "https://epochai.org/blog" + }, + { + "title": "Epoch on Twitter/X", + "url": "https://twitter.com/epoch_ai" + } + ], + "lastUpdated": "2025-12", + "website": "https://epochai.org", + "numericId": "E284", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "miri", + "title": "Machine Intelligence Research Institute", + "description": "The Machine Intelligence Research Institute (MIRI) is one of the oldest organizations focused on AI existential risk, founded in 2000 as the Singularity Institute for Artificial Intelligence (SIAI).", + "tags": [ + "agent-foundations", + "decision-theory", + "corrigibility", + "instrumental-convergence", + "embedded-agency", + "governance", + "logical-uncertainty", + "rationalist-community", + "lesswrong", + "sharp-left-turn", + "security-mindset", + "deconfusion" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "eliezer-yudkowsky", + "type": "researcher" + }, + { + "id": "nate-soares", + "type": "researcher" + }, + { + "id": "paul-christiano", + "type": "researcher" + }, + { + "id": "instrumental-convergence", + "type": "risk" + }, + { + "id": "corrigibility-failure", + "type": "risk" + }, + { + "id": "sharp-left-turn", + "type": "risk" + }, + { + "id": "compute-governance", + "type": "policies" + }, + { + "id": "arc", + "type": "organization" + } + ], + "sources": [ + { + "title": "MIRI Website", + "url": "https://intelligence.org" + }, + { + "title": "MIRI 2023 Strategy Update", + "url": "https://intelligence.org/2023/03/09/miri-announces-new-death-with-dignity-strategy/" + }, + { + "title": "Risks from Learned Optimization (Hubinger et al.)", + "url": "https://arxiv.org/abs/1906.01820" + }, + { + "title": "Logical Induction Paper", + "url": "https://arxiv.org/abs/1609.03543" + }, + { + "title": "Embedded Agency (Demski, Garrabrant)", + "url": "https://intelligence.org/2018/10/29/embedded-agency/" + }, + { + "title": "LessWrong Sequences", + "url": "https://www.lesswrong.com/sequences" + }, + { + "title": "Eliezer Yudkowsky TIME Op-Ed", + "url": "https://time.com/6266923/ai-eliezer-yudkowsky-open-letter-not-enough/" + }, + { + "title": "Agent Foundations Research", + "url": "https://intelligence.org/research-guide/" + }, + { + "title": "Facing the 
Intelligence Explosion (Muehlhauser)", + "url": "https://intelligence.org/files/IE-EI.pdf" + }, + { + "title": "MIRI on GiveWell", + "url": "https://www.givewell.org/charities/machine-intelligence-research-institute" + } + ], + "lastUpdated": "2025-12", + "website": "https://intelligence.org", + "numericId": "E285", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org", + "founded": "2000", + "headquarters": "Berkeley, CA", + "employees": "~15", + "funding": "~$5M/year" + }, + { + "id": "redwood", + "title": "Redwood Research", + "description": "Redwood Research is an AI safety lab founded in 2021 that has made significant contributions to mechanistic interpretability and, more recently, pioneered the \"AI control\" research agenda.", + "tags": [ + "interpretability", + "causal-scrubbing", + "ai-control", + "adversarial-robustness", + "polysemanticity", + "scheming", + "deception-detection", + "red-teaming", + "monitoring", + "safety-protocols" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "interpretability", + "type": "safety-approaches" + }, + { + "id": "ai-control", + "type": "safety-approaches" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "sandbagging", + "type": "risk" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "arc", + "type": "organization" + }, + { + "id": "miri", + "type": "organization" + } + ], + "sources": [ + { + "title": "Redwood Research Website", + "url": "https://redwoodresearch.org" + }, + { + "title": "AI Control Paper", + "url": "https://arxiv.org/abs/2312.06942" + }, + { + "title": "Causal Scrubbing", + "url": "https://www.alignmentforum.org/posts/JvZhhzycHu2Yd57RN/causal-scrubbing-a-method-for-rigorously-testing" + }, + { + "title": "Adversarial Training for High-Stakes Safety", + "url": "https://arxiv.org/abs/2205.01663" + }, + { + "title": "Redwood Research on Alignment Forum", + "url": "https://www.alignmentforum.org/users/redwood-research" + }, + { + "title": "Neel Nanda's Interpretability Work", + "url": "https://www.neelnanda.io/mechanistic-interpretability" + } + ], + "lastUpdated": "2025-12", + "website": "https://redwoodresearch.org", + "numericId": "E286", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org", + "founded": "2021", + "headquarters": "Berkeley, CA", + "employees": "~25", + "funding": "~$8M/year" + }, + { + "id": "uk-aisi", + "title": "UK AI Safety Institute", + "description": "The UK AI Safety Institute (UK AISI) is a government organization established in 2023 to advance AI safety through research, evaluation, and international coordination. 
Created in the wake of the first AI Safety Summit hosted by the UK government, AISI represents the UK's commitment to being a global leader in AI safety and governance.", + "tags": [ + "governance", + "government-ai-safety", + "international", + "evaluations", + "bletchley-declaration", + "ai-safety-summits", + "standard-setting", + "uk-ai-policy", + "frontier-model-evaluation", + "global-ai-safety", + "regulatory-framework" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "ian-hogarth", + "type": "researcher" + }, + { + "id": "us-aisi", + "type": "organization" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "apollo-research", + "type": "organization" + }, + { + "id": "ai-safety-summit", + "type": "events" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "deepmind", + "type": "organization" + } + ], + "sources": [ + { + "title": "UK AI Safety Institute Website", + "url": "https://www.aisi.gov.uk" + }, + { + "title": "Bletchley Declaration", + "url": "https://www.gov.uk/government/publications/ai-safety-summit-2023-the-bletchley-declaration" + }, + { + "title": "UK AI Safety Summit", + "url": "https://www.aisafetysummit.gov.uk" + }, + { + "title": "UK DSIT AI Policy", + "url": "https://www.gov.uk/government/organisations/department-for-science-innovation-and-technology" + }, + { + "title": "Ian Hogarth FT Op-Ed", + "url": "https://www.ft.com/content/03895dc4-a3b7-481e-95cc-336a524f2ac2" + }, + { + "title": "UK AI Safety Institute Announcements", + "url": "https://www.gov.uk/search/news-and-communications?organisations%5B%5D=ai-safety-institute" + } + ], + "lastUpdated": "2025-12", + "website": "https://gov.uk/government/organisations/ai-safety-institute", + "numericId": "E287", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "government", + "founded": "2023", + "headquarters": "London, UK" + }, + { + "id": "us-aisi", + "title": "US AI Safety Institute", + "description": "The US AI Safety Institute (US AISI) is a government agency within the National Institute of Standards and Technology (NIST) established in 2023 to develop standards, evaluations, and guidelines for safe and trustworthy artificial intelligence.", + "tags": [ + "governance", + "government-oversight", + "ai-standards", + "evaluations", + "nist", + "regulatory-framework", + "international", + "ai-safety", + "public-interest", + "regulatory-capture", + "standard-setting" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "uk-aisi", + "type": "organization" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "apollo-research", + "type": "organization" + }, + { + "id": "compute-governance", + "type": "policies" + }, + { + "id": "ai-executive-order", + "type": "policies" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "openai", + "type": "organization" + } + ], + "sources": [ + { + "title": "US AI Safety Institute Website", + "url": "https://www.nist.gov/aisi" + }, + { + "title": "NIST AI Risk Management Framework", + "url": "https://www.nist.gov/itl/ai-risk-management-framework" + }, + { + "title": "Executive Order on AI (October 2023)", + "url": "https://www.whitehouse.gov/briefing-room/presidential-actions/2023/10/30/executive-order-on-the-safe-secure-and-trustworthy-development-and-use-of-artificial-intelligence/" + }, + { + "title": "NIST AI Portal", + "url": "https://www.nist.gov/artificial-intelligence" + }, + { + "title": "US AISI 
Announcements", + "url": "https://www.commerce.gov/news/press-releases/2023/11/biden-harris-administration-announces-key-ai-actions-following-president" + } + ], + "lastUpdated": "2025-12", + "website": "https://nist.gov/aisi", + "numericId": "E288", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "government", + "founded": "2024", + "headquarters": "Gaithersburg, MD" + }, + { + "id": "arc-evals", + "title": "ARC Evaluations", + "description": "Organization focused on evaluating AI systems for dangerous capabilities. Now largely absorbed into METR.", + "tags": [ + "evaluations", + "ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "metr", + "type": "lab-research" + }, + { + "id": "capability-evaluations", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E289", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "fhi", + "title": "Future of Humanity Institute", + "description": "Oxford University research center focused on existential risks, founded by Nick Bostrom. Closed in 2024.", + "tags": [ + "research-org", + "existential-risk", + "oxford" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "nick-bostrom", + "type": "researcher" + }, + { + "id": "existential-risk", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "website": "https://www.fhi.ox.ac.uk", + "numericId": "E290", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "academic", + "founded": "2005", + "headquarters": "Oxford, UK" + }, + { + "id": "openai-foundation", + "title": "OpenAI Foundation", + "description": "Nonprofit organization holding 26% equity stake (~$130B) in OpenAI Group PBC, with governance control through board appointment rights and philanthropic commitments focused on health and AI resilience.", + "tags": [ + "nonprofit-governance", + "ai-philanthropy", + "corporate-structure", + "accountability", + "openai" + ], + "clusters": [ + "governance", + "community" + ], + "relatedEntries": [ + { + "id": "openai", + "type": "organization" + }, + { + "id": "sam-altman", + "type": "researcher" + }, + { + "id": "long-term-benefit-trust", + "type": "organization" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "giving-pledge", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E291", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "leading-the-future", + "title": "Leading the Future super PAC", + "description": "Pro-AI industry super PAC launched in 2025 to influence federal AI regulation and the 2026 midterm elections, backed by over $125 million from OpenAI, Andreessen Horowitz, and other tech leaders.", + "tags": [ + "political-advocacy", + "super-pac", + "ai-regulation", + "elections", + "lobbying" + ], + "clusters": [ + "community", + "governance" + ], + "relatedEntries": [ + { + "id": "openai", + "type": "organization" + }, + { + "id": "marc-andreessen", + "type": "researcher" + }, + { + "id": "ai-governance", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E292", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "johns-hopkins-center-for-health-security", + "title": "Johns Hopkins Center for Health Security", + "description": "Independent nonprofit research organization focused on 
preventing and preparing for epidemics, pandemics, and biological threats, with significant work on biosecurity and AI-biotechnology convergence.", + "tags": [ + "biosecurity", + "pandemic-preparedness", + "ai-bio-convergence", + "health-security", + "policy-research" + ], + "clusters": [ + "community", + "governance" + ], + "relatedEntries": [ + { + "id": "open-philanthropy", + "type": "organization" + }, + { + "id": "anthropic", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E293", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "nist-ai", + "title": "NIST and AI Safety", + "description": "The National Institute of Standards and Technology's role in developing AI standards, risk management frameworks, and safety guidelines for the United States.", + "tags": [ + "ai-standards", + "risk-management", + "government-policy", + "ai-evaluation", + "trustworthy-ai" + ], + "clusters": [ + "ai-safety", + "governance", + "community" + ], + "relatedEntries": [ + { + "id": "paul-christiano", + "type": "researcher" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "ai-governance", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E294", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "ssi", + "title": "Safe Superintelligence Inc.", + "description": "AI research startup founded by Ilya Sutskever, Daniel Gross, and Daniel Levy with a singular focus on developing safe superintelligence without commercial distractions.", + "tags": [ + "superintelligence", + "ai-safety-lab", + "alignment", + "frontier-ai", + "scaling" + ], + "clusters": [ + "community", + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "ilya-sutskever", + "type": "researcher" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "deepmind", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "website": "https://ssi.inc", + "numericId": "E295", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org", + "founded": "2024", + "headquarters": "Palo Alto, CA" + }, + { + "id": "controlai", + "title": "ControlAI", + "description": "UK-based AI safety advocacy organization focused on preventing artificial superintelligence development through policy campaigns and grassroots outreach to lawmakers.", + "tags": [ + "ai-advocacy", + "policy-campaigns", + "uk-policy", + "binding-regulation", + "grassroots" + ], + "clusters": [ + "community", + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "conjecture", + "type": "organization" + }, + { + "id": "connor-leahy", + "type": "researcher" + }, + { + "id": "eu-ai-act", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E296", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "frontier-model-forum", + "title": "Frontier Model Forum", + "description": "Industry-led non-profit organization promoting self-governance in frontier AI safety through collaborative frameworks, research funding, and best practices development.", + "tags": [ + "industry-self-governance", + "safety-frameworks", + "frontier-models", + "ai-standards", + "risk-evaluation" + ], + "clusters": [ + "community", + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + 
"id": "deepmind", + "type": "lab" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "jaan-tallinn", + "type": "researcher" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E297", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "palisade-research", + "title": "Palisade Research", + "description": "Nonprofit organization investigating offensive AI capabilities and controllability of frontier AI models through empirical research on autonomous hacking, shutdown resistance, and agentic misalignment.", + "tags": [ + "shutdown-resistance", + "autonomous-hacking", + "ai-controllability", + "cyber-security", + "red-teaming" + ], + "clusters": [ + "ai-safety", + "community", + "cyber" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "yoshua-bengio", + "type": "researcher" + }, + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "sff", + "type": "organization" + }, + { + "id": "alignment", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E298", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org" + }, + { + "id": "centre-for-long-term-resilience", + "title": "Centre for Long-Term Resilience", + "description": "UK-based think tank focused on extreme risks from AI, biosecurity, and improving government risk management through policy research and direct advisory work.", + "tags": [ + "uk-policy", + "extreme-risks", + "biosecurity", + "effective-altruism", + "government-advisory" + ], + "clusters": [ + "governance", + "community", + "ai-safety" + ], + "relatedEntries": [ + { + "id": "open-philanthropy", + "type": "organization" + }, + { + "id": "sff", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E299", + "customFields": [], + "relatedTopics": [], + "entityType": "organization" + }, + { + "id": "goodfire", + "title": "Goodfire", + "description": "AI interpretability research lab developing tools to decode and control neural network internals for safer AI systems.", + "tags": [ + "mechanistic-interpretability", + "sparse-autoencoders", + "ai-safety-startup", + "model-transparency", + "feature-steering" + ], + "clusters": [ + "ai-safety", + "community" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "chris-olah", + "type": "researcher" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "deepmind", + "type": "lab" + }, + { + "id": "interpretability", + "type": "safety-agenda" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E300", + "customFields": [], + "relatedTopics": [], + "entityType": "organization", + "orgType": "safety-org" + }, + { + "id": "buck-shlegeris", + "title": "Buck Shlegeris", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "website": "https://redwoodresearch.org", + "numericId": "E301", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "CEO", + "affiliation": "Redwood Research", + "knownFor": [ + "AI safety research", + "Redwood Research leadership" + ] + }, + { + "id": "chris-olah", + "title": "Chris Olah", + "description": "Chris Olah is one of the most influential figures in AI interpretability research. 
Before co-founding Anthropic in 2021, he worked at Google Brain and OpenAI, where he pioneered techniques for understanding what neural networks learn internally. His blog posts and papers on neural network visualization have become canonical references in the field.\n\nOlah's research focuses on \"mechanistic interpretability\" - the effort to understand neural networks by reverse-engineering the algorithms they implement. His team at Anthropic has made breakthrough discoveries including identifying \"features\" in large language models using sparse autoencoders, understanding how transformers perform computations through \"circuits,\" and mapping the representations that models develop during training. The 2024 \"Scaling Monosemanticity\" paper demonstrated that interpretability techniques could scale to production models like Claude.\n\nBeyond his technical contributions, Olah is known for his exceptional clarity of communication. He co-founded Distill, an academic journal that emphasized interactive visualizations and clear explanations. His approach - treating neural networks as objects to be understood rather than black boxes to be optimized - has shaped how a generation of AI safety researchers think about the problem.\n", + "tags": [ + "interpretability", + "feature-visualization", + "neural-network-circuits", + "sparse-autoencoders", + "ai-safety", + "transparency", + "monosemanticity" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "interpretability", + "type": "safety-agenda" + }, + { + "id": "dario-amodei", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Chris Olah's Blog", + "url": "https://colah.github.io" + }, + { + "title": "Distill Journal", + "url": "https://distill.pub" + }, + { + "title": "Anthropic Interpretability Research", + "url": "https://www.anthropic.com/research#interpretability" + }, + { + "title": "Scaling Monosemanticity", + "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/" + } + ], + "lastUpdated": "2025-12", + "website": "https://colah.github.io", + "numericId": "E302", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Co-founder, Head of Interpretability", + "affiliation": "Anthropic", + "knownFor": [ + "Mechanistic interpretability", + "neural network visualization", + "clarity of research communication" + ] + }, + { + "id": "connor-leahy", + "title": "Connor Leahy", + "description": "Connor Leahy is the CEO and co-founder of Conjecture, an AI safety research company based in London. He rose to prominence as a founding member of EleutherAI, an open-source collective that trained GPT-NeoX and other large language models to democratize access to AI research. This experience gave him direct insight into how frontier capabilities are developed.\n\nLeahy founded Conjecture in 2022 with the thesis that AGI might emerge from \"prosaic\" deep learning - scaling current architectures - rather than requiring fundamental algorithmic breakthroughs. This worldview emphasizes the urgency of alignment research, since transformative AI could arrive without warning through continued scaling. Conjecture's research focuses on interpretability, capability evaluation, and developing tools to understand AI systems before they become too powerful.\n\nAs a public advocate for AI safety, Leahy is known for his direct communication style and willingness to engage with uncomfortable scenarios. 
He has appeared on numerous podcasts and media outlets to discuss AI risk, often emphasizing the potential for rapid capability gains and the inadequacy of current safety measures. His perspective combines technical expertise from building large models with serious concern about the trajectory of AI development.\n", + "tags": [ + "interpretability", + "prosaic-alignment", + "agi-timelines", + "ai-safety", + "capability-evaluation", + "eleutherai", + "red-teaming" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "interpretability", + "type": "safety-agenda" + }, + { + "id": "chris-olah", + "type": "researcher" + }, + { + "id": "neel-nanda", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Conjecture", + "url": "https://conjecture.dev" + }, + { + "title": "Connor Leahy on Twitter/X", + "url": "https://twitter.com/ConnorLeahy" + }, + { + "title": "Various podcast appearances", + "url": "https://www.youtube.com/results?search_query=connor+leahy" + } + ], + "lastUpdated": "2025-12", + "website": "https://conjecture.dev", + "numericId": "E303", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "CEO & Co-founder", + "affiliation": "Conjecture", + "knownFor": [ + "Founding Conjecture", + "AI safety advocacy", + "interpretability research" + ] + }, + { + "id": "dan-hendrycks", + "title": "Dan Hendrycks", + "description": "Dan Hendrycks is the Director of the Center for AI Safety (CAIS) and one of the most prolific researchers in AI safety. His work spans technical safety research, benchmark creation, and public advocacy for taking AI risks seriously. He is known for combining rigorous empirical research with clear communication about catastrophic risks.\n\nHendrycks has made foundational contributions to AI safety evaluation. He created MMLU (Massive Multitask Language Understanding), one of the most widely-used benchmarks for measuring AI capabilities, as well as numerous benchmarks for robustness, calibration, and safety. His research on out-of-distribution detection, adversarial robustness, and AI ethics has been highly cited and influenced how the field measures progress.\n\nAs CAIS director, Hendrycks has focused on building the case for AI risk as a serious issue. 
He was instrumental in organizing the 2023 Statement on AI Risk, signed by hundreds of AI researchers including Turing Award winners, which stated that \"mitigating the risk of extinction from AI should be a global priority.\" His approach emphasizes engaging mainstream ML researchers and policymakers who may not be part of the existing AI safety community.\n", + "tags": [ + "ai-safety", + "x-risk", + "robustness", + "governance", + "benchmarks", + "compute-governance" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "cais", + "type": "lab" + }, + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "yoshua-bengio", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Dan Hendrycks' Website", + "url": "https://hendrycks.com" + }, + { + "title": "Center for AI Safety", + "url": "https://safe.ai" + }, + { + "title": "Statement on AI Risk", + "url": "https://safe.ai/statement-on-ai-risk" + }, + { + "title": "Google Scholar Profile", + "url": "https://scholar.google.com/citations?user=VEvOFxQAAAAJ" + } + ], + "lastUpdated": "2025-12", + "website": "https://hendrycks.com", + "numericId": "E304", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Director", + "affiliation": "Center for AI Safety", + "knownFor": [ + "AI safety research", + "benchmark creation", + "CAIS leadership", + "catastrophic risk focus" + ] + }, + { + "id": "daniela-amodei", + "title": "Daniela Amodei", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "website": "https://anthropic.com", + "numericId": "E305", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Co-founder & President", + "affiliation": "Anthropic", + "knownFor": [ + "Co-founding Anthropic", + "Operations and business leadership" + ] + }, + { + "id": "dario-amodei", + "title": "Dario Amodei", + "description": "Dario Amodei is the CEO and co-founder of Anthropic, one of the leading AI safety-focused companies. Before founding Anthropic in 2021, he was VP of Research at OpenAI, where he led the team that developed GPT-2 and GPT-3. He left OpenAI along with his sister Daniela and several colleagues over concerns about the company's direction, particularly its increasing commercialization and partnership with Microsoft.\n\nAmodei's approach to AI safety emphasizes empirical research on current systems rather than purely theoretical work. Under his leadership, Anthropic has developed Constitutional AI (a method for training helpful, harmless, and honest AI without extensive human feedback), pioneered \"responsible scaling policies\" that tie safety commitments to capability levels, and invested heavily in interpretability research. The company's Claude models have become leading examples of safety-conscious AI development.\n\nAs a public voice for AI safety, Amodei occupies a distinctive position - arguing that AI development is likely to continue rapidly regardless of individual company decisions, so the priority should be ensuring that safety-focused labs are at the frontier. He has advocated for industry self-regulation, compute governance, and international coordination while maintaining that slowing AI development unilaterally would simply cede the field to less safety-conscious actors. 
His essay \"Machines of Loving Grace\" outlined a vision for how powerful AI could be beneficial if developed carefully.\n", + "tags": [ + "constitutional-ai", + "responsible-scaling", + "claude", + "rlhf", + "interpretability", + "ai-safety-levels", + "empirical-alignment" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "anthropic-core-views", + "type": "safety-agenda" + }, + { + "id": "jan-leike", + "type": "researcher" + }, + { + "id": "chris-olah", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Anthropic Website", + "url": "https://anthropic.com" + }, + { + "title": "Anthropic Core Views on AI Safety", + "url": "https://anthropic.com/news/core-views-on-ai-safety" + }, + { + "title": "Responsible Scaling Policy", + "url": "https://anthropic.com/news/anthropics-responsible-scaling-policy" + }, + { + "title": "Dwarkesh Podcast Interview", + "url": "https://www.dwarkeshpatel.com/p/dario-amodei" + } + ], + "lastUpdated": "2025-12", + "website": "https://anthropic.com", + "numericId": "E306", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Co-founder & CEO", + "affiliation": "Anthropic", + "knownFor": [ + "Constitutional AI", + "Responsible Scaling Policy", + "Claude development" + ] + }, + { + "id": "demis-hassabis", + "title": "Demis Hassabis", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E307", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "eliezer-yudkowsky", + "title": "Eliezer Yudkowsky", + "description": "Eliezer Yudkowsky is one of the founding figures of AI safety as a field. In 2000, he co-founded the Machine Intelligence Research Institute (MIRI), originally called the Singularity Institute for Artificial Intelligence, making it one of the first organizations dedicated to studying the risks from advanced AI. His early writings on AI risk predated academic interest in the topic by over a decade.\n\nYudkowsky's technical contributions include foundational work on decision theory, the formalization of Friendly AI concepts, and the identification of failure modes like deceptive alignment and the \"sharp left turn.\" His 2022 essay \"AGI Ruin: A List of Lethalities\" provides a comprehensive catalog of why he believes aligning superintelligent AI is extremely difficult. He has been pessimistic about humanity's chances, arguing that current approaches to alignment are inadequate and that AI development should be slowed or halted.\n\nBeyond AI safety, Yudkowsky founded the \"rationalist\" community through his sequences of blog posts on human rationality, later compiled as \"Rationality: From AI to Zombies.\" This community has been a major source of AI safety researchers and has shaped how the field thinks about reasoning under uncertainty. His writing style - blending technical concepts with accessible explanations and science fiction examples - has influenced how AI risk is communicated. 
Despite his pessimism, he remains an active voice advocating for taking AI risk seriously at the highest levels of government and industry.\n", + "tags": [ + "alignment", + "x-risk", + "agent-foundations", + "rationality", + "decision-theory", + "cev", + "sharp-left-turn", + "deception" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "miri", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "sharp-left-turn", + "type": "risk" + }, + { + "id": "paul-christiano", + "type": "researcher" + } + ], + "sources": [ + { + "title": "MIRI Research", + "url": "https://intelligence.org/research/" + }, + { + "title": "LessWrong", + "url": "https://www.lesswrong.com/users/eliezer_yudkowsky" + }, + { + "title": "AGI Ruin: A List of Lethalities", + "url": "https://www.lesswrong.com/posts/uMQ3cqWDPHhjtiesc/agi-ruin-a-list-of-lethalities" + }, + { + "title": "The Sequences", + "url": "https://www.lesswrong.com/rationality" + } + ], + "lastUpdated": "2025-12", + "website": "https://intelligence.org", + "numericId": "E308", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Co-founder & Research Fellow", + "affiliation": "Machine Intelligence Research Institute", + "knownFor": [ + "Early AI safety work", + "decision theory", + "rationalist community" + ] + }, + { + "id": "elizabeth-kelly", + "title": "Elizabeth Kelly", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E309", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Director", + "affiliation": "US AI Safety Institute", + "knownFor": [ + "Leading US AI Safety Institute", + "AI policy" + ] + }, + { + "id": "evan-hubinger", + "title": "Evan Hubinger", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E310", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "gary-marcus", + "title": "Gary Marcus", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E311", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "geoffrey-hinton", + "title": "Geoffrey Hinton", + "description": "Geoffrey Hinton is a cognitive psychologist and computer scientist who received the 2018 Turing Award for his foundational work on deep learning. Often called the \"Godfather of AI,\" he developed many of the techniques that enabled the current AI revolution, including the backpropagation algorithm, Boltzmann machines, and key advances in neural network training.\n\nIn May 2023, Hinton resigned from Google after a decade at the company specifically to speak freely about AI risks. His public statements marked a significant moment for AI safety - one of the field's most respected pioneers was now warning that the technology he helped create posed existential risks. He expressed regret about his life's work, stating that the dangers from AI might be more imminent and severe than he previously believed.\n\nHinton's concerns focus on several areas: that AI systems might become more intelligent than humans sooner than expected, that we don't understand how to control systems smarter than ourselves, and that bad actors could use AI for manipulation and warfare. He has called for government intervention to slow AI development and international coordination to prevent an AI arms race. 
His transition from AI optimist to public voice of warning has lent significant credibility to AI safety concerns and helped bring them into mainstream discourse.\n", + "tags": [ + "deep-learning", + "ai-safety", + "x-risk", + "neural-networks", + "backpropagation", + "regulation", + "autonomous-weapons" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "yoshua-bengio", + "type": "researcher" + }, + { + "id": "deepmind", + "type": "lab" + } + ], + "sources": [ + { + "title": "Geoffrey Hinton's Homepage", + "url": "https://www.cs.toronto.edu/~hinton/" + }, + { + "title": "CBS 60 Minutes Interview", + "url": "https://www.cbsnews.com/news/geoffrey-hinton-ai-dangers-60-minutes-transcript/" + }, + { + "title": "NYT: 'Godfather of AI' Quits Google", + "url": "https://www.nytimes.com/2023/05/01/technology/ai-google-chatbot-engineer-quits-hinton.html" + }, + { + "title": "Google Scholar Profile", + "url": "https://scholar.google.com/citations?user=JicYPdAAAAAJ" + } + ], + "lastUpdated": "2025-12", + "website": "https://www.cs.toronto.edu/~hinton/", + "numericId": "E312", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Professor Emeritus, AI Safety Advocate", + "affiliation": "independent", + "knownFor": [ + "Deep learning pioneer", + "backpropagation", + "vocal AI risk advocate" + ] + }, + { + "id": "holden-karnofsky", + "title": "Holden Karnofsky", + "description": "Holden Karnofsky is the former Co-CEO of Coefficient Giving (formerly Open Philanthropy), one of the largest funders of AI safety research and related work. Through Coefficient Giving, he directed hundreds of millions of dollars toward reducing existential risks from AI, making him one of the most influential figures in shaping the field's growth and direction. In 2025, he joined Anthropic.\n\nKarnofsky's intellectual contributions have been equally significant. His \"Most Important Century\" series of blog posts on Cold Takes presents a detailed argument that the 21st century could be the most pivotal in human history due to transformative AI. He has developed frameworks for thinking about AI timelines, the potential for a \"galaxy-brained\" AI to manipulate humans, and how philanthropic funding should be allocated given deep uncertainty about AI trajectories.\n\nBefore focusing on AI risk, Karnofsky co-founded GiveWell, a charity evaluator that became the intellectual foundation for effective altruism. His transition to prioritizing AI safety reflects a broader shift in the EA movement. 
Through Coefficient Giving's grants to organizations like Anthropic, MIRI, Redwood Research, and many others, Karnofsky helped build the institutional infrastructure of AI safety as a field.\n", + "tags": [ + "effective-altruism", + "ai-safety-funding", + "ai-timelines", + "transformative-ai", + "x-risk", + "most-important-century", + "grantmaking" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "toby-ord", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Coefficient Giving", + "url": "https://coefficientgiving.org" + }, + { + "title": "Cold Takes Blog", + "url": "https://www.cold-takes.com/" + }, + { + "title": "Most Important Century Series", + "url": "https://www.cold-takes.com/most-important-century/" + }, + { + "title": "AI Timelines Post", + "url": "https://www.cold-takes.com/where-ai-forecasting-stands-today/" + } + ], + "lastUpdated": "2025-12", + "website": "https://www.openphilanthropy.org", + "numericId": "E313", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Former Co-CEO", + "affiliation": "open-phil", + "knownFor": [ + "Directing hundreds of millions toward AI safety", + "effective altruism leadership", + "AI timelines work" + ] + }, + { + "id": "ian-hogarth", + "title": "Ian Hogarth", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E314", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Chair", + "affiliation": "UK AI Safety Institute", + "knownFor": [ + "Leading UK AI Safety Institute", + "AI investor and writer" + ] + }, + { + "id": "ilya-sutskever", + "title": "Ilya Sutskever", + "description": "Ilya Sutskever is one of the most influential figures in modern AI development. As a PhD student of Geoffrey Hinton, he co-authored the AlexNet paper that sparked the deep learning revolution. He went on to co-found OpenAI in 2015 and served as Chief Scientist for nearly a decade, leading the technical direction that produced GPT-3, GPT-4, and other breakthrough systems.\n\nSutskever's departure from OpenAI in 2024 followed a tumultuous period during which he, as a member of the board, took part in the attempt to remove CEO Sam Altman, then reversed course. The episode highlighted tensions between commercial pressures and safety concerns at frontier AI labs. His departure, along with those of Jan Leike and other safety-focused researchers, raised questions about OpenAI's commitment to its original mission.\n\nIn 2024, Sutskever co-founded Safe Superintelligence Inc. (SSI), a company focused exclusively on developing safe superintelligent AI. Unlike other AI labs that balance commercial products with safety research, SSI's stated mission is to solve superintelligence safety before building superintelligence - a departure from the \"race to the frontier\" dynamic that characterizes much of the industry. 
Whether this approach can succeed commercially and technically while maintaining its safety focus remains to be seen.\n", + "tags": [ + "superintelligence", + "ai-safety", + "deep-learning", + "alignment-research", + "openai", + "scalable-oversight", + "gpt" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "openai", + "type": "lab" + }, + { + "id": "jan-leike", + "type": "researcher" + }, + { + "id": "geoffrey-hinton", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Safe Superintelligence Inc.", + "url": "https://ssi.inc" + }, + { + "title": "SSI Founding Announcement", + "url": "https://ssi.inc/announcement" + }, + { + "title": "AlexNet Paper", + "url": "https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks" + } + ], + "lastUpdated": "2025-12", + "website": "https://ssi.inc", + "numericId": "E315", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Co-founder & Chief Scientist", + "affiliation": "Safe Superintelligence Inc.", + "knownFor": [ + "Deep learning breakthroughs", + "OpenAI leadership", + "now focused on safe superintelligence" + ] + }, + { + "id": "jan-leike", + "title": "Jan Leike", + "description": "Jan Leike is the Head of Alignment at Anthropic, where he leads research on ensuring AI systems remain beneficial as they become more capable. Before joining Anthropic in 2024, he co-led OpenAI's Superalignment team, which was tasked with solving alignment for superintelligent AI systems within four years.\n\nLeike's research has been foundational for modern alignment techniques. He co-authored key papers on learning from human feedback, including \"Deep Reinforcement Learning from Human Preferences\" which helped establish RLHF as the dominant paradigm for aligning large language models. His work on scalable oversight explores how to maintain human control over AI systems even when they become too capable for humans to directly evaluate their outputs.\n\nLeike's departure from OpenAI in May 2024 was publicly significant - he stated that safety had \"taken a backseat to shiny products\" and that the company was not adequately preparing for the challenges of superintelligence. His move to Anthropic, along with several colleagues from the Superalignment team, signaled broader concerns about safety culture at frontier labs. 
At Anthropic, he continues work on scalable oversight, weak-to-strong generalization, and detecting deceptive behavior in AI systems.\n", + "tags": [ + "rlhf", + "scalable-oversight", + "superalignment", + "reward-modeling", + "weak-to-strong-generalization", + "process-supervision", + "deception" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + }, + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "paul-christiano", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Jan Leike on X/Twitter", + "url": "https://twitter.com/janleike" + }, + { + "title": "Deep RL from Human Preferences", + "url": "https://arxiv.org/abs/1706.03741" + }, + { + "title": "OpenAI Superalignment Announcement", + "url": "https://openai.com/blog/introducing-superalignment" + }, + { + "title": "Departure Statement", + "url": "https://twitter.com/janleike/status/1790517668677865835" + } + ], + "lastUpdated": "2025-12", + "website": "https://anthropic.com", + "numericId": "E316", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Head of Alignment", + "affiliation": "Anthropic", + "knownFor": [ + "Alignment research", + "scalable oversight", + "RLHF", + "superalignment work" + ] + }, + { + "id": "nate-soares", + "title": "Nate Soares (MIRI)", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E317", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "neel-nanda", + "title": "Neel Nanda", + "description": "Neel Nanda is an alignment researcher at Google DeepMind who has become one of the leading figures in mechanistic interpretability. His work focuses on understanding the internal computations of transformer models - reverse-engineering how these neural networks implement algorithms and form representations.\n\nNanda's most significant contribution to the field is TransformerLens, an open-source library that makes it vastly easier to conduct interpretability research on language models. By providing clean abstractions for accessing model internals, the library has enabled hundreds of researchers to enter the field and accelerated the pace of discovery. He has also authored influential posts cataloging open problems in mechanistic interpretability, helping to define the research agenda.\n\nBeyond his technical work, Nanda is known for his commitment to growing the interpretability research community. He actively mentors new researchers, creates educational content explaining complex concepts, and maintains a strong online presence where he discusses research directions and results. 
His approach exemplifies a field-building philosophy - that progress on AI safety requires not just individual research contributions but growing the number of capable researchers working on the problem.\n", + "tags": [ + "interpretability", + "transformer-circuits", + "transformerlens", + "induction-heads", + "ai-safety", + "research-tools", + "science-communication" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deepmind", + "type": "lab" + }, + { + "id": "chris-olah", + "type": "researcher" + }, + { + "id": "interpretability", + "type": "safety-agenda" + } + ], + "sources": [ + { + "title": "Neel Nanda's Website", + "url": "https://www.neelnanda.io" + }, + { + "title": "TransformerLens", + "url": "https://github.com/neelnanda-io/TransformerLens" + }, + { + "title": "200 Open Problems in Mech Interp", + "url": "https://www.lesswrong.com/posts/LbrPTJ4fmABEdEnLf/200-concrete-open-problems-in-mechanistic-interpretability" + }, + { + "title": "Blog Posts", + "url": "https://www.neelnanda.io/blog" + } + ], + "lastUpdated": "2025-12", + "website": "https://www.neelnanda.io", + "numericId": "E318", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Alignment Researcher", + "affiliation": "Google DeepMind", + "knownFor": [ + "Mechanistic interpretability", + "TransformerLens library", + "educational content" + ] + }, + { + "id": "nick-bostrom", + "title": "Nick Bostrom", + "description": "Nick Bostrom is a philosopher who founded the Future of Humanity Institute (FHI) at Oxford University and authored \"Superintelligence: Paths, Dangers, Strategies\" (2014), the book that brought AI existential risk into mainstream academic and policy discourse. His work laid the conceptual foundations for much of modern AI safety thinking.\n\nBostrom's key contributions include the orthogonality thesis (intelligence and goals are independent - a superintelligent AI could pursue any objective), instrumental convergence (most goal-pursuing systems will converge on certain subgoals like self-preservation and resource acquisition), and the concept of the \"treacherous turn\" (an AI might behave well until it's powerful enough to act on misaligned goals). These ideas are now standard reference points in AI safety discussions.\n\nBeyond AI, Bostrom has shaped the broader study of existential risk as an academic field, arguing that reducing the probability of human extinction should be a global priority given the astronomical value of humanity's potential future. Though FHI closed in 2024 due to administrative issues at Oxford, its influence persists through the researchers it trained and the research agendas it established. 
Bostrom's work continues to frame how many researchers and policymakers think about the stakes of advanced AI development.\n", + "tags": [ + "superintelligence", + "x-risk", + "orthogonality-thesis", + "instrumental-convergence", + "treacherous-turn", + "value-alignment", + "control-problem" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "instrumental-convergence", + "type": "risk" + }, + { + "id": "treacherous-turn", + "type": "risk" + }, + { + "id": "toby-ord", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Nick Bostrom's Website", + "url": "https://nickbostrom.com" + }, + { + "title": "Superintelligence (book)", + "url": "https://www.superintelligence.com/" + }, + { + "title": "FHI Publications", + "url": "https://www.fhi.ox.ac.uk/publications/" + }, + { + "title": "Existential Risk Prevention as Global Priority", + "url": "https://www.existential-risk.org/concept.html" + } + ], + "lastUpdated": "2025-12", + "website": "https://nickbostrom.com", + "numericId": "E319", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Founding Director (until FHI closure in 2024)", + "affiliation": "Future of Humanity Institute", + "knownFor": [ + "Superintelligence", + "existential risk research", + "simulation hypothesis" + ] + }, + { + "id": "paul-christiano", + "title": "Paul Christiano", + "description": "Paul Christiano is the founder of the Alignment Research Center (ARC) and one of the most technically influential figures in AI alignment. His research has shaped how the field thinks about scaling alignment techniques to superintelligent systems, particularly through his work on iterated amplification, AI safety via debate, and scalable oversight.\n\nChristiano's key insight is that we need alignment techniques that work even when AI systems are smarter than their human overseers. Iterated amplification proposes training AI systems by having them decompose complex tasks into simpler subtasks that humans can evaluate. AI safety via debate imagines training AI systems by having them argue with each other, with humans judging the debates. These approaches aim to amplify human judgment rather than replace it entirely. His work on \"Eliciting Latent Knowledge\" (ELK) addresses how to get AI systems to honestly report what they believe, even if they're capable of deception.\n\nBefore founding ARC in 2021, Christiano was a researcher at OpenAI where he led early work on RLHF and helped establish many of the techniques now used to train large language models. He is known for taking AI risk seriously while maintaining that there are tractable technical paths to safe AI - a position between those who think alignment is essentially impossible and those who think it will be solved by default. 
His probability estimates for AI-caused catastrophe (around 10-20%) are often cited as representing a serious but not inevitable risk.\n", + "tags": [ + "iterated-amplification", + "scalable-oversight", + "ai-safety-via-debate", + "elk", + "prosaic-alignment", + "recursive-reward-modeling", + "deception" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "arc", + "type": "lab" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + }, + { + "id": "eliezer-yudkowsky", + "type": "researcher" + }, + { + "id": "jan-leike", + "type": "researcher" + } + ], + "sources": [ + { + "title": "ARC Website", + "url": "https://alignment.org" + }, + { + "title": "Paul's Alignment Forum Posts", + "url": "https://www.alignmentforum.org/users/paulfchristiano" + }, + { + "title": "Iterated Amplification Paper", + "url": "https://arxiv.org/abs/1810.08575" + }, + { + "title": "ELK Report", + "url": "https://docs.google.com/document/d/1WwsnJQstPq91_Yh-Ch2XRL8H_EpsnjrC1dwZXR37PC8/" + } + ], + "lastUpdated": "2025-12", + "website": "https://alignment.org", + "numericId": "E320", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Founder", + "affiliation": "Alignment Research Center", + "knownFor": [ + "Iterated amplification", + "AI safety via debate", + "scalable oversight" + ] + }, + { + "id": "robin-hanson", + "title": "Robin Hanson", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E321", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "sam-altman", + "title": "Sam Altman", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "numericId": "E322", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "shane-legg", + "title": "Shane Legg", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "website": "https://deepmind.google", + "numericId": "E323", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Co-founder & Chief AGI Scientist", + "affiliation": "Google DeepMind", + "knownFor": [ + "Co-founding DeepMind", + "Early work on AGI", + "Machine Super Intelligence thesis" + ] + }, + { + "id": "stuart-russell", + "title": "Stuart Russell", + "description": "Stuart Russell is a professor of computer science at UC Berkeley and one of the most prominent mainstream AI researchers to seriously engage with AI safety. He is a co-author of \"Artificial Intelligence: A Modern Approach,\" the standard textbook used in AI courses worldwide, giving him unusual credibility when he warns about AI risks.\n\nRussell founded the Center for Human-Compatible AI (CHAI) at Berkeley to pursue his vision of AI systems that are inherently safe because they are designed to be uncertain about human values and deferential to human preferences. His book \"Human Compatible\" (2019) articulated this vision for a general audience, arguing that the standard paradigm of optimizing AI systems for fixed objectives is fundamentally flawed. Instead, he proposes that AI systems should be designed to defer to humans, allow themselves to be corrected, and actively seek to learn human preferences rather than assume they already know them.\n\nRussell has been active in AI governance advocacy, working with the UN and various governments on policy issues including lethal autonomous weapons. 
He signed open letters calling for AI research to prioritize safety and has testified before legislative bodies on AI risks. His approach emphasizes that AI safety is a solvable technical problem if we redesign AI systems from the ground up with the right objectives, rather than trying to patch safety onto systems designed without it.\n", + "tags": [ + "inverse-reinforcement-learning", + "value-alignment", + "cooperative-ai", + "off-switch-problem", + "corrigibility", + "human-compatible-ai", + "governance" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "chai", + "type": "lab" + }, + { + "id": "corrigibility-failure", + "type": "risk" + }, + { + "id": "paul-christiano", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Stuart Russell's Homepage", + "url": "https://people.eecs.berkeley.edu/~russell/" + }, + { + "title": "Human Compatible (book)", + "url": "https://www.penguinrandomhouse.com/books/566677/human-compatible-by-stuart-russell/" + }, + { + "title": "CHAI Website", + "url": "https://humancompatible.ai/" + }, + { + "title": "TED Talk: 3 Principles for Creating Safer AI", + "url": "https://www.ted.com/talks/stuart_russell_3_principles_for_creating_safer_ai" + } + ], + "lastUpdated": "2025-12", + "website": "https://people.eecs.berkeley.edu/~russell/", + "numericId": "E324", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Professor of Computer Science, CHAI Founder", + "affiliation": "Center for Human-Compatible AI", + "knownFor": [ + "Human Compatible", + "inverse reinforcement learning", + "AI safety advocacy" + ] + }, + { + "id": "toby-ord", + "title": "Toby Ord", + "description": "Toby Ord is a philosopher at Oxford University and author of \"The Precipice: Existential Risk and the Future of Humanity\" (2020), a comprehensive treatment of existential risks that helped establish AI as a central concern for humanity's long-term future. His work has been influential in shaping how policymakers and researchers think about catastrophic risks.\n\nIn \"The Precipice,\" Ord provides quantitative estimates of existential risk from various sources, with AI among the highest. He argues that we are living through a critical period in human history where our technological capabilities have outpaced our wisdom, and that reducing existential risk should be a global priority. His estimates - placing the probability of existential catastrophe this century at about 1 in 6, with AI being a major contributor - are frequently cited in discussions of AI risk.\n\nOrd is also a founding figure in the effective altruism movement. In 2009, he co-founded Giving What We Can, which encourages people to donate significant portions of their income to effective charities. His transition from focusing on global health and development to prioritizing existential risks mirrors a broader shift in the EA movement. 
Through his writing, teaching, and advisory roles (including advising the UK government on AI), Ord has helped translate abstract concerns about humanity's future into concrete policy discussions.\n", + "tags": [ + "x-risk", + "effective-altruism", + "longtermism", + "ai-safety", + "moral-philosophy", + "risk-assessment", + "future-generations" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "nick-bostrom", + "type": "researcher" + }, + { + "id": "holden-karnofsky", + "type": "researcher" + } + ], + "sources": [ + { + "title": "Toby Ord's Website", + "url": "https://www.tobyord.com" + }, + { + "title": "The Precipice", + "url": "https://theprecipice.com/" + }, + { + "title": "80,000 Hours Podcast", + "url": "https://80000hours.org/podcast/episodes/toby-ord-the-precipice-existential-risk-future-humanity/" + }, + { + "title": "Giving What We Can", + "url": "https://www.givingwhatwecan.org/" + } + ], + "lastUpdated": "2025-12", + "website": "https://www.tobyord.com", + "numericId": "E325", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Senior Research Fellow in Philosophy", + "affiliation": "Oxford University", + "knownFor": [ + "The Precipice", + "existential risk quantification", + "effective altruism" + ] + }, + { + "id": "yoshua-bengio", + "title": "Yoshua Bengio", + "description": "Yoshua Bengio is a pioneer of deep learning who shared the 2018 Turing Award with Geoffrey Hinton and Yann LeCun for their foundational work on neural networks. As Scientific Director of Mila, the Quebec AI Institute, he leads one of the world's largest academic AI research centers. His technical contributions include fundamental work on neural network optimization, recurrent networks, and attention mechanisms.\n\nIn recent years, Bengio has increasingly focused on AI safety and governance. He was an early signatory of the 2023 Statement on AI Risk and has become a prominent voice arguing that frontier AI development requires more caution and oversight. His concerns span both near-term harms (misinformation, job displacement) and longer-term risks from systems that might become difficult to control. Unlike some AI researchers who dismiss existential risk concerns, Bengio has engaged seriously with these arguments.\n\nBengio's research agenda has evolved to include safety-relevant directions like causal representation learning, which could help AI systems develop more robust and generalizable understanding of the world. He has advocated for international governance mechanisms for AI, including proposals for compute governance and safety standards. 
His position as one of the founding figures of modern AI gives his safety advocacy significant weight with policymakers and the broader research community.\n", + "tags": [ + "deep-learning", + "ai-safety", + "governance", + "interpretability", + "causal-representation-learning", + "regulation", + "x-risk" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "geoffrey-hinton", + "type": "researcher" + }, + { + "id": "interpretability", + "type": "safety-agenda" + } + ], + "sources": [ + { + "title": "Yoshua Bengio's Website", + "url": "https://yoshuabengio.org" + }, + { + "title": "Mila Institute", + "url": "https://mila.quebec/" + }, + { + "title": "Statement on AI Risk", + "url": "https://www.safe.ai/statement-on-ai-risk" + }, + { + "title": "Google Scholar Profile", + "url": "https://scholar.google.com/citations?user=kukA0LcAAAAJ" + } + ], + "lastUpdated": "2025-12", + "website": "https://yoshuabengio.org", + "numericId": "E326", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "role": "Scientific Director of Mila, Professor", + "affiliation": "Mila - Quebec AI Institute", + "knownFor": [ + "Deep learning pioneer", + "now AI safety advocate" + ] + }, + { + "id": "elon-musk", + "title": "Elon Musk", + "description": "Tech entrepreneur, co-founder of OpenAI, founder of xAI. Influential voice on AI development and risks.", + "tags": [ + "entrepreneur", + "ai-labs" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "xai", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E327", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "beth-barnes", + "title": "Beth Barnes", + "description": "AI safety researcher, founder of METR (formerly ARC Evals). Focus on evaluating dangerous AI capabilities.", + "tags": [ + "evaluations", + "ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "metr", + "type": "lab-research" + }, + { + "id": "arc-evals", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E328", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "david-sacks", + "title": "David Sacks", + "description": "South African-American entrepreneur, venture capitalist, and White House AI and Crypto Czar who co-founded Craft Ventures and played key roles at PayPal and Yammer. Appointed by President Trump in December 2024 to shape U.S. AI and cryptocurrency policy.", + "tags": [ + "venture-capital", + "ai-policy", + "government-advisor", + "anti-regulation", + "paypal-mafia" + ], + "clusters": [ + "governance" + ], + "relatedEntries": [ + { + "id": "elon-musk", + "type": "researcher" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "dario-amodei", + "type": "researcher" + }, + { + "id": "open-philanthropy", + "type": "organization" + }, + { + "id": "openai", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E329", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "marc-andreessen", + "title": "Marc Andreessen", + "description": "American software engineer, entrepreneur, and venture capitalist who co-created Mosaic, co-founded Netscape, and co-founded Andreessen Horowitz. 
Known for techno-optimist views on AI development.", + "tags": [ + "venture-capital", + "techno-optimism", + "ai-deregulation", + "andreessen-horowitz", + "anti-alignment" + ], + "clusters": [ + "governance" + ], + "relatedEntries": [ + { + "id": "elon-musk", + "type": "researcher" + }, + { + "id": "alignment", + "type": "concept" + }, + { + "id": "bioweapons", + "type": "risk" + }, + { + "id": "deepfakes", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E330", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "max-tegmark", + "title": "Max Tegmark", + "description": "Swedish-American physicist at MIT, co-founder of the Future of Life Institute, and prominent AI safety advocate known for his work on the Mathematical Universe Hypothesis and efforts to promote safe artificial intelligence development.", + "tags": [ + "ai-safety-advocacy", + "future-of-life-institute", + "ai-pause", + "mechanistic-interpretability", + "physics" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "fli", + "type": "organization" + }, + { + "id": "elon-musk", + "type": "researcher" + }, + { + "id": "yoshua-bengio", + "type": "researcher" + }, + { + "id": "interpretability", + "type": "safety-agenda" + }, + { + "id": "prediction-markets", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E331", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "philip-tetlock", + "title": "Philip Tetlock", + "description": "Psychologist and forecasting researcher who pioneered the science of superforecasting through the Good Judgment Project, demonstrating that systematic forecasting methods can outperform expert predictions and intelligence analysts.", + "tags": [ + "forecasting", + "superforecasting", + "prediction-accuracy", + "decision-making", + "calibration" + ], + "clusters": [ + "epistemics" + ], + "relatedEntries": [ + { + "id": "good-judgment", + "type": "organization" + }, + { + "id": "fri", + "type": "organization" + }, + { + "id": "metaculus", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E332", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "eli-lifland", + "title": "Eli Lifland", + "description": "AI researcher, forecaster, and entrepreneur specializing in AGI timelines forecasting, scenario planning, and AI governance. Ranks #1 on the RAND Forecasting Initiative all-time leaderboard and co-authored the influential AI 2027 scenario forecast.", + "tags": [ + "forecasting", + "agi-timelines", + "scenario-planning", + "samotsvety", + "ai-governance" + ], + "clusters": [ + "ai-safety", + "epistemics" + ], + "relatedEntries": [ + { + "id": "ai-futures-project", + "type": "organization" + }, + { + "id": "samotsvety", + "type": "organization" + }, + { + "id": "metaculus", + "type": "organization" + }, + { + "id": "open-philanthropy", + "type": "organization" + }, + { + "id": "lesswrong", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E333", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "dustin-moskovitz", + "title": "Dustin Moskovitz", + "description": "Facebook co-founder who became the world's youngest self-made billionaire in 2011. 
Together with his wife Cari Tuna, he has given away over $4 billion through Good Ventures and Coefficient Giving, including approximately $336 million to AI safety research since 2017, making him the largest individual funder of AI safety.", + "tags": [ + "ai-safety-funding", + "effective-altruism", + "philanthropy", + "facebook", + "coefficient-giving", + "giving-pledge" + ], + "clusters": [ + "ai-safety", + "community" + ], + "relatedEntries": [ + { + "id": "coefficient-giving", + "type": "organization" + }, + { + "id": "open-philanthropy", + "type": "organization" + }, + { + "id": "jaan-tallinn", + "type": "researcher" + }, + { + "id": "sff", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E334", + "customFields": [], + "relatedTopics": [], + "entityType": "person", + "knownFor": [] + }, + { + "id": "ai-safety-institutes", + "title": "AI Safety Institutes (AISIs)", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "UK AI Safety Institute", + "url": "https://www.gov.uk/government/organisations/ai-safety-institute", + "author": "UK Government" + }, + { + "title": "US AI Safety Institute", + "url": "https://www.nist.gov/aisi", + "author": "NIST" + }, + { + "title": "Inspect Framework", + "url": "https://github.com/UKGovernmentBEIS/inspect_ai", + "author": "UK AISI" + }, + { + "title": "Seoul Declaration on AISI Network", + "url": "https://www.gov.uk/government/publications/seoul-declaration-for-safe-innovative-and-inclusive-ai", + "author": "Summit Participants" + } + ], + "lastUpdated": "2025-12", + "numericId": "E335", + "customFields": [ + { + "label": "Function", + "value": "Evaluation, research, policy advice" + }, + { + "label": "Network", + "value": "International coordination emerging" + } + ], + "relatedTopics": [], + "entityType": "policy", + "introduced": "UK (2023), US (2024), others planned" + }, + { + "id": "california-sb1047", + "title": "Safe and Secure Innovation for Frontier Artificial Intelligence Models Act", + "description": "SB 1047, the Safe and Secure Innovation for Frontier Artificial Intelligence Models Act, was California state legislation that would have required safety testing and liability measures for developers of the most powerful AI models.", + "tags": [ + "regulation", + "state-policy", + "frontier-models", + "liability", + "compute-thresholds", + "california", + "political-strategy" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "us-executive-order", + "type": "policy" + }, + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "voluntary-commitments", + "type": "policy" + } + ], + "sources": [ + { + "title": "SB 1047 Bill Text (Final Amended Version)", + "url": "https://leginfo.legislature.ca.gov/faces/billTextClient.xhtml?bill_id=202320240SB1047", + "date": "August 2024" + }, + { + "title": "Governor Newsom's Veto Message", + "url": "https://www.gov.ca.gov/wp-content/uploads/2024/09/SB-1047-Veto-Message.pdf", + "date": "September 29, 2024" + }, + { + "title": "Analysis from Future of Life Institute", + "url": "https://futureoflife.org/project/sb-1047/", + "author": "FLI" + }, + { + "title": "OpenAI Letter Opposing SB 1047", + "url": "https://openai.com/index/openai-letter-to-california-governor-newsom-on-sb-1047/" + }, + { + "title": "Anthropic's Nuanced Position", + "url": "https://www.anthropic.com/news/anthropics-letter-to-senator-wiener-on-sb-1047", + "date": "August 2024" + }, + { + "title": 
"Academic Analysis", + "url": "https://law.stanford.edu/2024/09/25/sb-1047-analysis/", + "author": "Stanford HAI" + } + ], + "lastUpdated": "2025-12", + "numericId": "E336", + "customFields": [ + { + "label": "Passed Legislature", + "value": "August 29, 2024" + }, + { + "label": "Vetoed", + "value": "September 29, 2024" + } + ], + "relatedTopics": [], + "entityType": "policy", + "introduced": "February 2024", + "author": "Senator Scott Wiener" + }, + { + "id": "canada-aida", + "title": "Artificial Intelligence and Data Act (AIDA)", + "description": "The Artificial Intelligence and Data Act (AIDA) was Canada's proposed federal AI legislation, introduced as Part 3 of Bill C-27 (the Digital Charter Implementation Act, 2022). Despite years of debate and amendment, the bill died on the order paper when Parliament was dissolved in January 2025.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Bill C-27 Text", + "url": "https://www.parl.ca/legisinfo/en/bill/44-1/c-27", + "author": "Parliament of Canada" + }, + { + "title": "AIDA Companion Document", + "url": "https://ised-isde.canada.ca/site/innovation-better-canada/en/artificial-intelligence-and-data-act-aida-companion-document", + "author": "ISED Canada" + }, + { + "title": "Government Amendments to AIDA", + "url": "https://ised-isde.canada.ca/site/innovation-better-canada/en/artificial-intelligence-and-data-act-aida-companion-document", + "author": "Government of Canada", + "date": "November 2023" + } + ], + "lastUpdated": "2025-12", + "numericId": "E337", + "customFields": [ + { + "label": "Current Status", + "value": "Died with Parliament dissolution (January 2025)" + }, + { + "label": "Approach", + "value": "Risk-based, principles-focused" + } + ], + "relatedTopics": [], + "entityType": "policy", + "introduced": "June 2022 (as part of Bill C-27)", + "scope": "High-impact AI systems" + }, + { + "id": "china-ai-regulations", + "title": "China AI Regulatory Framework", + "description": "China has developed one of the world's most comprehensive AI regulatory frameworks through a series of targeted regulations addressing specific AI applications and risks. 
Unlike the EU's comprehensive AI Act, China's approach is iterative and sector-specific, with new rules issued as technologies emerge.", + "tags": [ + "regulation", + "china", + "content-control", + "algorithmic-accountability", + "international", + "generative-ai", + "deepfakes", + "geopolitics" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "us-executive-order", + "type": "policy" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "international-summits", + "type": "policy" + } + ], + "sources": [ + { + "title": "Translation: Interim Measures for Generative AI Management", + "url": "https://digichina.stanford.edu/work/translation-interim-measures-for-the-management-of-generative-artificial-intelligence-services-effective-august-15-2023/", + "author": "DigiChina, Stanford", + "date": "2023" + }, + { + "title": "China's Algorithm Registry", + "url": "https://digichina.stanford.edu/work/translation-algorithmic-recommendation-management-provisions-effective-march-1-2022/", + "author": "DigiChina, Stanford" + }, + { + "title": "Deep Synthesis Regulations", + "url": "https://www.newamerica.org/cybersecurity-initiative/digichina/blog/translation-chinas-deep-synthesis-regulations/", + "author": "New America", + "date": "2022" + }, + { + "title": "China AI Governance Overview", + "url": "https://cset.georgetown.edu/publication/understanding-chinas-ai-regulation/", + "author": "CSET Georgetown", + "date": "2024" + }, + { + "title": "China's New Generation AI Development Plan", + "url": "https://www.newamerica.org/cybersecurity-initiative/digichina/blog/full-translation-chinas-new-generation-artificial-intelligence-development-plan-2017/", + "author": "New America", + "date": "2017" + }, + { + "title": "Comparing US and China AI Regulation", + "url": "https://carnegieendowment.org/research/2024/01/regulating-ai-in-china-and-the-united-states", + "author": "Carnegie Endowment", + "date": "2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E338", + "customFields": [ + { + "label": "Approach", + "value": "Sector-specific, iterative" + }, + { + "label": "Primary Focus", + "value": "Content control, social stability" + }, + { + "label": "Enforcement", + "value": "Cyberspace Administration of China (CAC)" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "colorado-ai-act", + "title": "Colorado Artificial Intelligence Act", + "description": "The Colorado AI Act (SB 24-205) is the first comprehensive AI regulation enacted by a US state. 
Signed into law on May 17, 2024, it takes effect February 1, 2026.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Colorado AI Act Full Text", + "url": "https://leg.colorado.gov/bills/sb24-205", + "author": "Colorado General Assembly" + }, + { + "title": "Colorado Governor Signs AI Law", + "url": "https://www.reuters.com/technology/colorado-governor-signs-first-us-ai-regulation-law-2024-05-17/", + "author": "Reuters", + "date": "May 2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E339", + "customFields": [ + { + "label": "Signed", + "value": "May 17, 2024" + }, + { + "label": "Sponsor", + "value": "Senator Robert Rodriguez" + }, + { + "label": "Approach", + "value": "Risk-based, EU-influenced" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "compute-governance", + "title": "Compute Governance", + "description": "Compute governance uses computational hardware as a lever to regulate AI development. Because advanced AI requires enormous amounts of computing power, and that compute comes from concentrated supply chains, controlling compute provides a tractable way to govern AI before models are built.", + "tags": [ + "export-controls", + "compute-thresholds", + "know-your-customer", + "hardware-governance", + "international", + "semiconductors", + "cloud-computing" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "govai", + "type": "lab" + }, + { + "id": "governance-policy", + "type": "approach" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "proliferation", + "type": "risk" + }, + { + "id": "bioweapons", + "type": "risk" + }, + { + "id": "cyberweapons", + "type": "risk" + }, + { + "id": "concentration-of-power", + "type": "risk" + } + ], + "sources": [ + { + "title": "Computing Power and the Governance of AI", + "url": "https://www.governance.ai/research-papers/computing-power-and-the-governance-of-artificial-intelligence", + "author": "Heim et al.", + "date": "2023" + }, + { + "title": "US Export Controls on Advanced Computing", + "url": "https://www.bis.doc.gov/", + "author": "Bureau of Industry and Security" + }, + { + "title": "EU AI Act Compute Provisions", + "url": "https://artificialintelligenceact.eu/" + }, + { + "title": "CSET Semiconductor Reports", + "url": "https://cset.georgetown.edu/publications/?fwp_publication_types=issue-brief&fwp_topics=semiconductors" + }, + { + "title": "The Chips and Science Act", + "url": "https://www.congress.gov/bill/117th-congress/house-bill/4346", + "date": "2022" + } + ], + "lastUpdated": "2025-12", + "numericId": "E340", + "customFields": [ + { + "label": "Approach", + "value": "Regulate AI via compute access" + } + ], + "relatedTopics": [], + "entityType": "policy", + "policyStatus": "Emerging policy area" + }, + { + "id": "compute-thresholds", + "title": "Compute Thresholds", + "description": "Compute thresholds define capability boundaries using training compute (measured in FLOP) as a proxy. The EU AI Act uses 10^25 FLOP for GPAI obligations; the US Executive Order uses 10^26 FLOP for reporting requirements. 
These thresholds aim to capture frontier models while minimizing regulatory burden on smaller systems.", + "tags": [ + "compute-governance", + "regulation", + "flop-thresholds" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "ai-executive-order", + "type": "policy" + } + ], + "sources": [ + { + "title": "EU AI Act GPAI Thresholds", + "url": "https://artificialintelligenceact.eu/" + }, + { + "title": "US Executive Order Compute Thresholds", + "url": "https://www.whitehouse.gov/briefing-room/presidential-actions/2023/10/30/executive-order-on-the-safe-secure-and-trustworthy-development-and-use-of-artificial-intelligence/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E341", + "customFields": [ + { + "label": "Approach", + "value": "Define capability boundaries via compute" + } + ], + "relatedTopics": [], + "entityType": "policy", + "policyStatus": "Established in US and EU policy" + }, + { + "id": "compute-monitoring", + "title": "Compute Monitoring", + "description": "Compute monitoring involves tracking how computational resources are used to detect unauthorized or dangerous AI training runs. Approaches include know-your-customer requirements for cloud providers, hardware-based monitoring, and training run detection algorithms. Raises privacy and implementation challenges.", + "tags": [ + "compute-governance", + "monitoring", + "kyc", + "cloud-computing" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "govai", + "type": "lab" + } + ], + "sources": [ + { + "title": "Computing Power and the Governance of AI", + "url": "https://www.governance.ai/research-papers/computing-power-and-the-governance-of-artificial-intelligence", + "author": "Heim et al." + }, + { + "title": "Secure Governable Chips", + "url": "https://arxiv.org/abs/2303.11341" + } + ], + "lastUpdated": "2025-12", + "numericId": "E342", + "customFields": [ + { + "label": "Approach", + "value": "Track compute usage to detect dangerous training" + } + ], + "relatedTopics": [], + "entityType": "policy", + "policyStatus": "Proposed, limited implementation" + }, + { + "id": "international-compute-regimes", + "title": "International Compute Regimes", + "description": "International compute regimes would coordinate compute governance across borders. Proposals include IAEA-like inspection bodies, multilateral export control agreements, and international compute monitoring frameworks. 
Faces challenges of verification, sovereignty concerns, and China-US competition.", + "tags": [ + "compute-governance", + "international", + "coordination", + "iaea-model" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "international-coordination", + "type": "policy" + } + ], + "sources": [ + { + "title": "International Institutions for AI Safety", + "url": "https://www.governance.ai/research-papers/international-institutions-for-advanced-ai", + "author": "GovAI" + }, + { + "title": "IAEA Model for AI Governance", + "url": "https://www.governance.ai/research" + } + ], + "lastUpdated": "2025-12", + "numericId": "E343", + "customFields": [ + { + "label": "Approach", + "value": "Coordinate compute governance globally" + } + ], + "relatedTopics": [], + "entityType": "policy", + "policyStatus": "Early discussions, no formal regime" + }, + { + "id": "eu-ai-act", + "title": "EU AI Act", + "description": "The EU AI Act is the world's first comprehensive legal framework for artificial intelligence. Adopted in 2024, it establishes a risk-based approach to AI regulation, with stricter requirements for higher-risk AI systems.", + "tags": [ + "regulation", + "gpai", + "foundation-models", + "risk-based-regulation", + "compute-thresholds" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "uk-aisi", + "type": "policy" + }, + { + "id": "govai", + "type": "lab" + } + ], + "sources": [ + { + "title": "EU AI Act Full Text", + "url": "https://artificialintelligenceact.eu/" + }, + { + "title": "EU AI Office", + "url": "https://digital-strategy.ec.europa.eu/en/policies/ai-office" + }, + { + "title": "Analysis of GPAI Provisions", + "url": "https://governance.ai/eu-ai-act" + } + ], + "lastUpdated": "2025-12", + "numericId": "E344", + "customFields": [ + { + "label": "Type", + "value": "Binding Regulation" + } + ], + "relatedTopics": [], + "entityType": "policy", + "scope": "Risk-based" + }, + { + "id": "export-controls", + "title": "US AI Chip Export Controls", + "description": "The United States has implemented unprecedented export controls on advanced semiconductors and semiconductor manufacturing equipment, primarily targeting China. 
These controls represent one of the most significant attempts to constrain AI development through hardware governance.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "BIS Export Controls on Advanced Computing", + "url": "https://www.bis.doc.gov/index.php/policy-guidance/country-guidance/china-prc", + "author": "Bureau of Industry and Security" + }, + { + "title": "Commerce Implements New Export Controls on Advanced Computing", + "url": "https://www.commerce.gov/news/press-releases/2022/10/commerce-implements-new-export-controls-advanced-computing-and", + "author": "US Department of Commerce", + "date": "October 2022" + }, + { + "title": "Choking Off China's Access to the Future of AI", + "url": "https://www.csis.org/analysis/choking-chinas-access-future-ai", + "author": "CSIS", + "date": "2022" + } + ], + "lastUpdated": "2025-12", + "numericId": "E345", + "customFields": [ + { + "label": "Initial Rules", + "value": "October 2022" + }, + { + "label": "Major Updates", + "value": "October 2023, December 2024" + }, + { + "label": "Primary Target", + "value": "China" + }, + { + "label": "Enforcing Agency", + "value": "Bureau of Industry and Security (BIS)" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "failed-stalled-proposals", + "title": "Failed and Stalled AI Proposals", + "description": "Understanding why AI governance proposals fail is as important as understanding successes. Failed efforts reveal political constraints, industry opposition patterns, and the challenges of regulating rapidly evolving technology.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "California SB 1047 Veto Message", + "url": "https://www.gov.ca.gov/wp-content/uploads/2024/09/SB-1047-Veto-Message.pdf", + "author": "Governor Newsom", + "date": "September 2024" + }, + { + "title": "Hiroshima AI Process", + "url": "https://www.mofa.go.jp/ecm/ec/page5e_000076.html", + "author": "G7" + } + ], + "lastUpdated": "2025-12", + "numericId": "E346", + "customFields": [ + { + "label": "Purpose", + "value": "Learning from unsuccessful efforts" + }, + { + "label": "Coverage", + "value": "US, International" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "international-summits", + "title": "International AI Safety Summit Series", + "description": "The International AI Safety Summit series represents the first sustained effort at global coordination on AI safety, bringing together governments, AI companies, civil society, and researchers to address the risks from advanced AI.", + "tags": [ + "international", + "governance", + "multilateral-diplomacy", + "frontier-ai", + "bletchley-declaration", + "voluntary-commitments", + "policy-summits" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "voluntary-commitments", + "type": "policy" + }, + { + "id": "uk-aisi", + "type": "policy" + }, + { + "id": "us-executive-order", + "type": "policy" + }, + { + "id": "china-ai-regulations", + "type": "policy" + } + ], + "sources": [ + { + "title": "The Bletchley Declaration", + "url": "https://www.gov.uk/government/publications/ai-safety-summit-2023-the-bletchley-declaration/the-bletchley-declaration-by-countries-attending-the-ai-safety-summit-1-2-november-2023", + "date": "November 1, 2023" + }, + { + "title": "Seoul AI Safety Summit Outcomes", + "url": "https://www.gov.uk/government/publications/ai-seoul-summit-2024-outcomes", + "date": "May 2024" + }, + { + "title": "Frontier AI Safety Commitments", + 
"url": "https://www.gov.uk/government/publications/frontier-ai-safety-commitments-ai-seoul-summit-2024", + "date": "May 21, 2024" + }, + { + "title": "UN AI Advisory Body Report", + "url": "https://www.un.org/ai-advisory-body", + "date": "2024" + }, + { + "title": "G7 Hiroshima AI Process", + "url": "https://www.g7hiroshima.go.jp/en/documents/", + "date": "2023" + }, + { + "title": "Analysis: International AI Governance After Bletchley", + "url": "https://www.governance.ai/research-papers/international-ai-governance", + "author": "GovAI", + "date": "2024" + }, + { + "title": "OECD AI Principles", + "url": "https://oecd.ai/en/ai-principles", + "date": "2019, updated 2023" + } + ], + "lastUpdated": "2025-12", + "numericId": "E347", + "customFields": [ + { + "label": "First Summit", + "value": "Bletchley Park, UK (Nov 2023)" + }, + { + "label": "Second Summit", + "value": "Seoul, South Korea (May 2024)" + }, + { + "label": "Third Summit", + "value": "Paris, France (Feb 2025)" + }, + { + "label": "Format", + "value": "Government-led, multi-stakeholder" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "nist-ai-rmf", + "title": "NIST AI Risk Management Framework (AI RMF)", + "description": "The NIST AI Risk Management Framework (AI RMF) is a voluntary guidance document developed by the National Institute of Standards and Technology to help organizations manage risks associated with AI systems.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "AI Risk Management Framework", + "url": "https://www.nist.gov/itl/ai-risk-management-framework", + "author": "NIST" + }, + { + "title": "AI RMF Playbook", + "url": "https://airc.nist.gov/AI_RMF_Knowledge_Base/Playbook", + "author": "NIST" + }, + { + "title": "Generative AI Profile (AI 600-1)", + "url": "https://www.nist.gov/publications/artificial-intelligence-risk-management-framework-generative-artificial-intelligence", + "author": "NIST", + "date": "July 2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E348", + "customFields": [ + { + "label": "Version", + "value": "1.0" + }, + { + "label": "Type", + "value": "Voluntary framework" + }, + { + "label": "Referenced by", + "value": "US Executive Order, state laws" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "responsible-scaling-policies", + "title": "Responsible Scaling Policies (RSPs)", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Anthropic's Responsible Scaling Policy", + "url": "https://www.anthropic.com/index/anthropics-responsible-scaling-policy", + "author": "Anthropic", + "date": "September 2023" + }, + { + "title": "OpenAI Preparedness Framework", + "url": "https://openai.com/safety/preparedness", + "author": "OpenAI", + "date": "December 2023" + }, + { + "title": "Google DeepMind Frontier Safety Framework", + "url": "https://deepmind.google/discover/blog/introducing-the-frontier-safety-framework/", + "author": "Google DeepMind", + "date": "May 2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E349", + "customFields": [ + { + "label": "Type", + "value": "Self-regulation" + }, + { + "label": "Key Labs", + "value": "Anthropic, OpenAI, Google DeepMind" + }, + { + "label": "Origin", + "value": "2023" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "seoul-declaration", + "title": "Seoul Declaration on AI Safety", + "description": "The Seoul AI Safety Summit (May 21-22, 2024) was the second in a series of international AI safety 
summits, following the Bletchley Park Summit in November 2023.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Seoul Declaration", + "url": "https://www.gov.uk/government/publications/seoul-declaration-for-safe-innovative-and-inclusive-ai", + "author": "Summit Participants" + }, + { + "title": "Frontier AI Safety Commitments", + "url": "https://www.gov.uk/government/publications/frontier-ai-safety-commitments-ai-seoul-summit-2024", + "author": "AI Companies", + "date": "May 2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E350", + "customFields": [ + { + "label": "Predecessor", + "value": "Bletchley Declaration (Nov 2023)" + }, + { + "label": "Successor", + "value": "Paris Summit (Feb 2025)" + }, + { + "label": "Signatories", + "value": "28 countries + EU" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "standards-bodies", + "title": "AI Standards Development", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "ISO/IEC JTC 1/SC 42 Artificial Intelligence", + "url": "https://www.iso.org/committee/6794475.html", + "author": "ISO" + }, + { + "title": "IEEE Ethically Aligned Design", + "url": "https://ethicsinaction.ieee.org/", + "author": "IEEE" + }, + { + "title": "EU AI Act Standardisation", + "url": "https://digital-strategy.ec.europa.eu/en/policies/ai-standards", + "author": "European Commission" + } + ], + "lastUpdated": "2025-12", + "numericId": "E351", + "customFields": [ + { + "label": "Key Bodies", + "value": "ISO, IEEE, NIST, CEN-CENELEC" + }, + { + "label": "Relevance", + "value": "Standards increasingly referenced in law" + } + ], + "relatedTopics": [], + "entityType": "policy", + "policyStatus": "Rapidly developing" + }, + { + "id": "us-executive-order", + "title": "Executive Order on Safe, Secure, and Trustworthy AI", + "description": "The Executive Order on Safe, Secure, and Trustworthy Artificial Intelligence, signed by President Biden on October 30, 2023, is the most comprehensive US government action on AI to date. 
It establishes safety requirements for frontier AI systems, mandates government agency actions, and creates oversight mechanisms.", + "tags": [ + "compute-thresholds", + "governance", + "us-aisi", + "cloud-computing", + "know-your-customer", + "safety-evaluations", + "executive-policy" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "uk-aisi", + "type": "policy" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "voluntary-commitments", + "type": "policy" + } + ], + "sources": [ + { + "title": "Executive Order 14110: Full Text", + "url": "https://www.whitehouse.gov/briefing-room/presidential-actions/2023/10/30/executive-order-on-the-safe-secure-and-trustworthy-development-and-use-of-artificial-intelligence/", + "date": "October 30, 2023" + }, + { + "title": "White House Fact Sheet", + "url": "https://www.whitehouse.gov/briefing-room/statements-releases/2023/10/30/fact-sheet-president-biden-issues-executive-order-on-safe-secure-and-trustworthy-artificial-intelligence/" + }, + { + "title": "US AI Safety Institute", + "url": "https://www.nist.gov/aisi" + }, + { + "title": "NIST AI Risk Management Framework", + "url": "https://www.nist.gov/itl/ai-risk-management-framework" + }, + { + "title": "Analysis from Center for Security and Emerging Technology", + "url": "https://cset.georgetown.edu/article/understanding-the-ai-executive-order/", + "author": "CSET", + "date": "2023" + } + ], + "lastUpdated": "2025-12", + "numericId": "E352", + "customFields": [ + { + "label": "Type", + "value": "Executive Order" + }, + { + "label": "Number", + "value": "14110" + }, + { + "label": "Durability", + "value": "Can be revoked by future president" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "us-state-legislation", + "title": "US State AI Legislation Landscape", + "description": "In the absence of comprehensive federal AI legislation, US states have become laboratories for AI governance. As of 2024, hundreds of AI-related bills have been introduced across all 50 states, with several significant laws enacted.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "State AI Legislation Tracker", + "url": "https://www.bsa.org/policy/artificial-intelligence", + "author": "BSA" + }, + { + "title": "AI Legislation in the States", + "url": "https://www.ncsl.org/technology-and-communication/artificial-intelligence-2024-legislation", + "author": "National Conference of State Legislatures" + } + ], + "lastUpdated": "2025-12", + "numericId": "E353", + "customFields": [ + { + "label": "Most active states", + "value": "California, Colorado, Texas, Illinois" + }, + { + "label": "Total bills (2024)", + "value": "400+" + }, + { + "label": "Trend", + "value": "Rapidly increasing" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "voluntary-commitments", + "title": "Voluntary AI Safety Commitments", + "description": "In July 2023, the White House secured voluntary commitments from leading AI companies on safety, security, and trust. 
These commitments represent the first coordinated industry-wide AI safety pledges, establishing baseline practices for frontier AI development.", + "tags": [ + "self-regulation", + "industry-commitments", + "responsible-scaling", + "red-teaming", + "governance", + "international", + "safety-standards" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "us-executive-order", + "type": "policy" + }, + { + "id": "international-summits", + "type": "policy" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + } + ], + "sources": [ + { + "title": "White House Fact Sheet: Voluntary AI Commitments", + "url": "https://www.whitehouse.gov/briefing-room/statements-releases/2023/07/21/fact-sheet-biden-harris-administration-secures-voluntary-commitments-from-leading-artificial-intelligence-companies-to-manage-the-risks-posed-by-ai/", + "date": "July 21, 2023" + }, + { + "title": "Anthropic's Responsible Scaling Policy", + "url": "https://www.anthropic.com/news/anthropics-responsible-scaling-policy", + "date": "September 2023" + }, + { + "title": "OpenAI Preparedness Framework", + "url": "https://openai.com/safety/preparedness", + "date": "December 2023" + }, + { + "title": "Google DeepMind Frontier Safety Framework", + "url": "https://deepmind.google/discover/blog/introducing-the-frontier-safety-framework/", + "date": "May 2024" + }, + { + "title": "Bletchley Declaration", + "url": "https://www.gov.uk/government/publications/ai-safety-summit-2023-the-bletchley-declaration/the-bletchley-declaration-by-countries-attending-the-ai-safety-summit-1-2-november-2023", + "date": "November 2023" + }, + { + "title": "Analysis: Are Voluntary AI Commitments Enough?", + "url": "https://www.governance.ai/research-papers/voluntary-commitments", + "author": "GovAI", + "date": "2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E354", + "customFields": [ + { + "label": "Nature", + "value": "Non-binding voluntary pledges" + }, + { + "label": "Enforcement", + "value": "Reputational only" + }, + { + "label": "Participants", + "value": "Major AI labs" + } + ], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "epistemic-security", + "title": "Epistemic Security", + "description": "Epistemic security refers to protecting society's collective capacity for truth-finding in an era when AI can generate convincing false content at unprecedented scale. Just as national security protects against physical threats, epistemic security protects against threats to our ability to know what is true and form shared beliefs about reality.\n\nThe threat landscape includes AI-generated deepfakes that can fabricate video evidence, language models that can produce unlimited quantities of persuasive misinformation, and systems that can personalize deceptive content to individual vulnerabilities. These capabilities threaten the basic information infrastructure that democratic societies depend on - the shared understanding of facts that enables public deliberation, elections, and collective decision-making.\n\nDefending epistemic security requires multiple layers: technical tools for content authentication and provenance, media literacy education that teaches critical evaluation of information sources, institutional reforms that increase resilience to manipulation, and regulatory frameworks that create accountability for platforms and AI developers. 
The challenge is that offensive capabilities (generating false content) are advancing faster than defensive capabilities (detecting it), creating an asymmetry that favors attackers.\n", + "tags": [ + "disinformation", + "deepfakes", + "trust", + "media-literacy", + "content-authentication", + "information-security" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "disinformation", + "type": "risk" + }, + { + "id": "deepfakes", + "type": "risk" + }, + { + "id": "consensus-manufacturing", + "type": "risk" + }, + { + "id": "trust-decline", + "type": "risk" + }, + { + "id": "reality-fragmentation", + "type": "risk" + }, + { + "id": "epistemic-collapse", + "type": "risk" + }, + { + "id": "historical-revisionism", + "type": "risk" + }, + { + "id": "epistemic-sycophancy", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Vulnerability of Democracies to Disinformation", + "url": "https://www.rand.org/pubs/research_briefs/RB10088.html", + "author": "RAND Corporation", + "date": "2019" + }, + { + "title": "Deep Fakes: A Looming Challenge", + "url": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3213954", + "author": "Chesney & Citron", + "date": "2019" + }, + { + "title": "The Oxygen of Amplification", + "url": "https://datasociety.net/library/oxygen-of-amplification/", + "author": "Whitney Phillips (Data & Society)", + "date": "2018" + }, + { + "title": "Inoculation Theory", + "url": "https://www.sdlab.psychol.cam.ac.uk/research/inoculation-science", + "author": "Sander van der Linden" + }, + { + "title": "C2PA Specification", + "url": "https://c2pa.org/specifications/specifications/1.0/specs/C2PA_Specification.html" + }, + { + "title": "Synthetic Media and AI", + "url": "https://partnershiponai.org/paper/responsible-practices-synthetic-media/", + "author": "Partnership on AI", + "date": "2023" + } + ], + "lastUpdated": "2025-12", + "numericId": "E355", + "customFields": [ + { + "label": "Definition", + "value": "Protecting collective capacity for knowledge and truth-finding" + }, + { + "label": "Key Threats", + "value": "Deepfakes, AI disinformation, trust collapse" + }, + { + "label": "Key Research", + "value": "RAND, Stanford Internet Observatory, Oxford" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "pause-advocacy", + "title": "Pause Advocacy", + "description": "Pause advocacy calls for slowing down or pausing the development of frontier AI systems until safety can be ensured.
The core theory of change is that buying time allows safety research to catch up with capabilities, enables governance frameworks to mature, and reduces the probability of deploying systems we cannot control.\n", + "tags": [ + "governance", + "policy", + "racing-dynamics", + "coordination" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "treacherous-turn", + "type": "risk" + }, + { + "id": "lock-in", + "type": "risk" + }, + { + "id": "compute-governance", + "type": "policy" + } + ], + "sources": [ + { + "title": "Pause Giant AI Experiments: An Open Letter", + "url": "https://futureoflife.org/open-letter/pause-giant-ai-experiments/", + "author": "Future of Life Institute", + "date": "2023" + } + ], + "lastUpdated": "2025-12", + "numericId": "E356", + "customFields": [ + { + "label": "Approach", + "value": "Advocate for slowing or pausing frontier AI development" + }, + { + "label": "Tractability", + "value": "Low (major political/economic barriers)" + }, + { + "label": "Key Organizations", + "value": "Future of Life Institute, Pause AI" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "ai-control", + "title": "AI Control", + "description": "AI Control is a research agenda that focuses on maintaining safety even when using AI systems that might be actively trying to subvert safety measures. Rather than assuming alignment succeeds, it asks: \"How can we safely use AI systems that might be misaligned?\"", + "tags": [ + "monitoring", + "containment", + "defense-in-depth", + "red-teaming", + "untrusted-ai" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "redwood", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "treacherous-turn", + "type": "risk" + }, + { + "id": "sandbagging", + "type": "risk" + }, + { + "id": "power-seeking", + "type": "risk" + }, + { + "id": "mesa-optimization", + "type": "risk" + }, + { + "id": "agentic-ai", + "type": "capability" + } + ], + "sources": [ + { + "title": "AI Control: Improving Safety Despite Intentional Subversion", + "url": "https://arxiv.org/abs/2312.06942", + "author": "Greenblatt et al.", + "date": "2023" + }, + { + "title": "Redwood Research: AI Control", + "url": "https://www.redwoodresearch.org/control" + } + ], + "lastUpdated": "2025-12", + "numericId": "E357", + "customFields": [ + { + "label": "Goal", + "value": "Maintain human control over AI" + }, + { + "label": "Key Research", + "value": "Redwood Research" + } + ], + "relatedTopics": [], + "entityType": "safety-agenda", + "goal": "Maintain human control over AI" + }, + { + "id": "anthropic-core-views", + "title": "Anthropic Core Views", + "description": "Anthropic's Core Views on AI Safety is their publicly stated research agenda and organizational philosophy. 
Published in 2023, it articulates why Anthropic believes safety-focused labs should be at the frontier of AI development.", + "tags": [ + "ai-safety", + "constitutional-ai", + "interpretability", + "responsible-scaling", + "anthropic", + "research-agenda" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "interpretability", + "type": "safety-agenda" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + } + ], + "sources": [ + { + "title": "Core Views on AI Safety", + "url": "https://anthropic.com/news/core-views-on-ai-safety", + "author": "Anthropic", + "date": "2023" + }, + { + "title": "Responsible Scaling Policy", + "url": "https://anthropic.com/news/anthropics-responsible-scaling-policy", + "date": "2023" + } + ], + "lastUpdated": "2025-12", + "website": "https://anthropic.com/news/core-views-on-ai-safety", + "numericId": "E358", + "customFields": [ + { + "label": "Published", + "value": "2023" + }, + { + "label": "Status", + "value": "Active" + } + ], + "relatedTopics": [], + "entityType": "safety-agenda" + }, + { + "id": "corrigibility", + "title": "Corrigibility", + "tags": [ + "shutdown-problem", + "ai-control", + "value-learning" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "ai-control", + "type": "safety-agenda" + }, + { + "id": "corrigibility-failure", + "type": "risk" + }, + { + "id": "power-seeking", + "type": "risk" + }, + { + "id": "instrumental-convergence", + "type": "risk" + }, + { + "id": "treacherous-turn", + "type": "risk" + } + ], + "sources": [], + "numericId": "E359", + "customFields": [ + { + "label": "Goal", + "value": "AI allows human correction" + }, + { + "label": "Status", + "value": "Active research" + } + ], + "relatedTopics": [], + "entityType": "safety-agenda", + "goal": "AI allows human correction" + }, + { + "id": "evals", + "title": "AI Evaluations", + "tags": [ + "benchmarks", + "red-teaming", + "capability-assessment" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "sandbagging", + "type": "risk" + }, + { + "id": "emergent-capabilities", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "bioweapons", + "type": "risk" + }, + { + "id": "cyberweapons", + "type": "risk" + } + ], + "sources": [], + "numericId": "E360", + "customFields": [ + { + "label": "Goal", + "value": "Measure AI capabilities and safety" + }, + { + "label": "Key Orgs", + "value": "METR, Apollo, UK AISI" + } + ], + "relatedTopics": [], + "entityType": "safety-agenda", + "goal": "Measure AI capabilities and safety" + }, + { + "id": "interpretability", + "title": "Interpretability", + "description": "Mechanistic interpretability is a research field focused on reverse-engineering neural networks to understand how they work internally. 
Rather than treating models as black boxes, researchers aim to identify meaningful circuits, features, and algorithms that explain model behavior.", + "tags": [ + "sparse-autoencoders", + "features", + "circuits", + "superposition", + "probing", + "activation-patching" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "mesa-optimization", + "type": "risk" + }, + { + "id": "goal-misgeneralization", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "redwood", + "type": "lab" + }, + { + "id": "alignment-robustness", + "type": "parameter", + "relationship": "increases" + }, + { + "id": "interpretability-coverage", + "type": "parameter", + "relationship": "increases" + }, + { + "id": "safety-capability-gap", + "type": "parameter", + "relationship": "supports" + }, + { + "id": "human-oversight-quality", + "type": "parameter", + "relationship": "increases" + } + ], + "sources": [ + { + "title": "Scaling Monosemanticity", + "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/", + "author": "Anthropic", + "date": "2024" + }, + { + "title": "Zoom In: An Introduction to Circuits", + "url": "https://distill.pub/2020/circuits/zoom-in/", + "author": "Olah et al." + }, + { + "title": "Transformer Circuits Thread", + "url": "https://transformer-circuits.pub/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E361", + "customFields": [ + { + "label": "Goal", + "value": "Understand model internals" + }, + { + "label": "Key Labs", + "value": "Anthropic, DeepMind" + } + ], + "relatedTopics": [], + "entityType": "safety-agenda", + "goal": "Understand model internals" + }, + { + "id": "scalable-oversight", + "title": "Scalable Oversight", + "description": "Scalable oversight addresses a fundamental challenge: How can humans supervise AI systems on tasks where humans can't directly evaluate the AI's output?", + "tags": [ + "debate", + "recursive-reward-modeling", + "process-supervision", + "ai-evaluation", + "rlhf", + "superhuman-ai" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "arc", + "type": "lab" + }, + { + "id": "deepmind", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "sycophancy", + "type": "risk" + }, + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "power-seeking", + "type": "risk" + }, + { + "id": "corrigibility-failure", + "type": "risk" + }, + { + "id": "human-oversight-quality", + "type": "parameter", + "relationship": "increases" + }, + { + "id": "alignment-robustness", + "type": "parameter", + "relationship": "supports" + }, + { + "id": "human-agency", + "type": "parameter", + "relationship": "supports" + } + ], + "sources": [ + { + "title": "AI Safety via Debate", + "url": "https://arxiv.org/abs/1805.00899", + "author": "Irving et al." + }, + { + "title": "Scalable Agent Alignment via Reward Modeling", + "url": "https://arxiv.org/abs/1811.07871", + "author": "Leike et al." 
+ }, + { + "title": "Measuring Progress on Scalable Oversight", + "url": "https://arxiv.org/abs/2211.03540" + } + ], + "lastUpdated": "2025-12", + "numericId": "E362", + "customFields": [ + { + "label": "Goal", + "value": "Supervise AI beyond human ability" + }, + { + "label": "Key Labs", + "value": "Anthropic, OpenAI, DeepMind" + } + ], + "relatedTopics": [], + "entityType": "safety-agenda", + "goal": "Supervise AI beyond human ability" + }, + { + "id": "ai-forecasting", + "title": "AI-Augmented Forecasting", + "description": "AI-augmented forecasting combines the pattern-recognition and data-processing capabilities of AI systems with the contextual judgment and calibration of human forecasters. This hybrid approach aims to produce more accurate predictions about future events than either humans or AI alone, particularly for questions relevant to policy and risk assessment.\n\nCurrent systems take several forms. AI can aggregate and weight forecasts from many human predictors, adjusting for individual track records and biases. AI can assist forecasters by synthesizing relevant information, identifying base rates, and flagging considerations that might otherwise be missed. More ambitiously, AI systems can generate their own forecasts that human superforecasters then evaluate and combine with their own judgments.\n\nFor AI safety and epistemic security, improved forecasting offers several benefits. Better predictions about AI capabilities help with governance timing. Forecasting AI-related risks provides early warning. Publicly visible forecasts create accountability for claims about AI development. The key challenge is calibration - ensuring that probability estimates are meaningful across diverse domains and maintaining accuracy as AI systems become the subject of the forecasts themselves.\n", + "tags": [ + "forecasting", + "prediction-markets", + "ai-capabilities", + "decision-making", + "calibration" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Metaculus AI Forecasting", + "url": "https://www.metaculus.com/project/ai-forecasting/" + }, + { + "title": "FutureSearch", + "url": "https://arxiv.org/abs/2312.07474", + "date": "2023" + }, + { + "title": "Epoch AI", + "url": "https://epochai.org/" + }, + { + "title": "Superforecasting", + "author": "Philip Tetlock", + "date": "2015" + }, + { + "title": "Forecasting Research Institute", + "url": "https://forecastingresearch.org/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E363", + "customFields": [ + { + "label": "Maturity", + "value": "Rapidly emerging" + }, + { + "label": "Key Strength", + "value": "Combines AI scale with human judgment" + }, + { + "label": "Key Challenge", + "value": "Calibration across domains" + }, + { + "label": "Key Players", + "value": "Metaculus, FutureSearch, Epoch AI" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "content-authentication", + "title": "Content Authentication", + "description": "Content authentication technologies aim to establish verifiable provenance for digital content - allowing users to confirm where content came from, whether it has been modified, and whether it was created by AI or humans. The goal is to rebuild trust in digital media by creating technical guarantees of authenticity that complement human judgment.\n\nThe leading approach is the C2PA (Coalition for Content Provenance and Authenticity) standard, backed by major technology companies. 
C2PA embeds cryptographically signed metadata into content at the point of creation - when a photo is taken, when a video is recorded, when an AI generates an image. This creates a chain of custody that can be verified later. Other approaches include invisible watermarking (SynthID), blockchain-based verification, and forensic analysis tools that detect signs of synthetic generation or manipulation.\n\nThe key challenges are adoption and circumvention. Content authentication only works if it becomes universal - if users come to expect provenance information and distrust content without it. But metadata can be stripped, watermarks can potentially be removed or spoofed, and AI-generated content without credentials can still circulate. The race between authentication and forgery capability is uncertain, but authentication provides one of the few technical defenses against the coming flood of synthetic content.\n", + "tags": [ + "deepfakes", + "digital-evidence", + "verification", + "watermarking", + "trust" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "authentication-collapse", + "type": "risk" + }, + { + "id": "deepfakes", + "type": "risk" + }, + { + "id": "disinformation", + "type": "risk" + }, + { + "id": "fraud", + "type": "risk" + } + ], + "sources": [ + { + "title": "C2PA Technical Specification", + "url": "https://c2pa.org/specifications/specifications/1.0/specs/C2PA_Specification.html" + }, + { + "title": "Content Authenticity Initiative", + "url": "https://contentauthenticity.org/" + }, + { + "title": "Google SynthID", + "url": "https://deepmind.google/technologies/synthid/" + }, + { + "title": "Project Origin", + "url": "https://www.originproject.info/" + }, + { + "title": "Witness: Video as Evidence", + "url": "https://www.witness.org/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E364", + "customFields": [ + { + "label": "Maturity", + "value": "Standards emerging; early deployment" + }, + { + "label": "Key Standard", + "value": "C2PA (Coalition for Content Provenance and Authenticity)" + }, + { + "label": "Key Challenge", + "value": "Universal adoption; credential stripping" + }, + { + "label": "Key Players", + "value": "Adobe, Microsoft, Google, BBC, camera manufacturers" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "coordination-tech", + "title": "Coordination Technologies", + "description": "Coordination technologies are tools and mechanisms that enable actors to cooperate on collective challenges when individual incentives favor defection. For AI safety, these technologies address the fundamental problem that racing to develop AI faster may be individually rational but collectively catastrophic. For epistemic security, they help coordinate defensive responses to disinformation.\n\nThese technologies draw on mechanism design, game theory, and institutional economics. Examples include: verification protocols that allow actors to confirm others' compliance with agreements (critical for AI safety treaties); commitment devices that make defection from cooperative arrangements costly; signaling mechanisms that allow actors to credibly communicate intentions; and platforms that make coordination focal points more visible.\n\nFor AI governance specifically, coordination technologies might include compute monitoring systems that verify compliance with training restrictions, international registries of advanced AI systems, and mechanisms for sharing safety research while protecting commercial interests. 
The fundamental insight from Elinor Ostrom's work is that collective action problems are not unsolvable - but they require deliberate institutional design. The urgency of AI risk makes developing effective coordination mechanisms for this domain a priority.\n", + "tags": [ + "game-theory", + "governance", + "international-cooperation", + "mechanism-design", + "verification" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "multipolar-trap", + "type": "risk" + }, + { + "id": "flash-dynamics", + "type": "risk" + }, + { + "id": "proliferation", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Strategy of Conflict", + "author": "Thomas Schelling", + "date": "1960" + }, + { + "title": "Governing the Commons", + "author": "Elinor Ostrom", + "date": "1990" + }, + { + "title": "GovAI Research", + "url": "https://www.governance.ai/" + }, + { + "title": "Computing Power and the Governance of AI", + "url": "https://arxiv.org/abs/2402.08797", + "date": "2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E365", + "customFields": [ + { + "label": "Maturity", + "value": "Emerging; active development" + }, + { + "label": "Key Strength", + "value": "Addresses collective action failures" + }, + { + "label": "Key Challenge", + "value": "Bootstrapping trust and adoption" + }, + { + "label": "Key Domains", + "value": "AI governance, epistemic defense, international cooperation" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "deliberation", + "title": "AI-Assisted Deliberation", + "description": "AI-assisted deliberation uses AI to scale meaningful democratic dialogue beyond the constraints of traditional town halls and focus groups. Rather than replacing human deliberation with AI decisions, these tools use AI to facilitate, synthesize, and scale genuine human discussion - enabling thousands or millions of people to engage in deliberative processes that traditionally require small groups.\n\nPioneering systems like Polis cluster participant opinions to surface areas of consensus and reveal the structure of disagreement. Taiwan's vTaiwan platform has used these tools to engage citizens in policy development on contentious issues. Anthropic's Collective Constitutional AI experiment used similar methods to gather public input on how AI systems should behave. The core insight is that AI can help identify common ground, summarize diverse viewpoints, and translate between different perspectives at scales previously impossible.\n\nFor AI governance, these tools offer a path to democratically legitimate AI policy. Rather than leaving AI development decisions to companies or technical elites, deliberation platforms could engage broader publics in decisions about how AI should be developed and deployed. 
For epistemic security, deliberative processes can help societies navigate contested questions by surfacing genuine consensus where it exists and clarifying the structure of genuine disagreement where it doesn't.\n", + "tags": [ + "democratic-innovation", + "collective-intelligence", + "governance", + "participatory-democracy", + "consensus-building" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Polis", + "url": "https://pol.is/" + }, + { + "title": "Collective Constitutional AI", + "url": "https://www.anthropic.com/news/collective-constitutional-ai-aligning-a-language-model-with-public-input", + "author": "Anthropic", + "date": "2023" + }, + { + "title": "Stanford Deliberative Democracy Lab", + "url": "https://deliberation.stanford.edu/" + }, + { + "title": "Democracy When the People Are Thinking", + "author": "James Fishkin", + "date": "2018" + }, + { + "title": "vTaiwan", + "url": "https://info.vtaiwan.tw/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E366", + "customFields": [ + { + "label": "Maturity", + "value": "Emerging; promising pilots" + }, + { + "label": "Key Strength", + "value": "Scales genuine dialogue, not just voting" + }, + { + "label": "Key Challenge", + "value": "Adoption and integration with governance" + }, + { + "label": "Key Players", + "value": "Polis, Anthropic (Collective Constitutional AI), Taiwan vTaiwan" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "epistemic-infrastructure", + "title": "Epistemic Infrastructure", + "description": "Epistemic infrastructure refers to the foundational systems that societies depend on for creating, verifying, preserving, and accessing knowledge. Just as physical infrastructure (roads, power grids) underlies economic activity, epistemic infrastructure (archives, scientific publishing, fact-checking networks, educational institutions) underlies society's capacity to know things collectively. This infrastructure is under stress and requires deliberate investment.\n\nCurrent epistemic infrastructure includes elements like Wikipedia (the largest attempt at collaborative knowledge creation), the Internet Archive (preserving digital history), academic peer review (verifying scientific claims), journalism (investigating and reporting events), and educational systems (transmitting knowledge across generations). Each of these faces AI-related threats: Wikipedia can be corrupted with AI-generated misinformation, archives struggle to authenticate materials, peer review cannot keep pace with AI-generated fraud, and journalism is economically threatened.\n\nStrengthening epistemic infrastructure requires treating it as a public good deserving of investment. This might include: funding for fact-checking organizations and investigative journalism, technical infrastructure for content authentication, archives designed for an AI-generated-content world, AI systems explicitly designed to support human knowledge creation rather than replace it, and educational programs that teach critical evaluation in an AI context. 
The alternative - letting epistemic infrastructure decay while AI advances - leads to knowledge monopolies, trust collapse, and reality fragmentation.\n", + "tags": [ + "knowledge-management", + "public-goods", + "information-infrastructure", + "verification", + "ai-for-good" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "trust-decline", + "type": "risk" + }, + { + "id": "epistemic-collapse", + "type": "risk" + }, + { + "id": "knowledge-monopoly", + "type": "risk" + }, + { + "id": "scientific-corruption", + "type": "risk" + }, + { + "id": "historical-revisionism", + "type": "risk" + } + ], + "sources": [ + { + "title": "Wikimedia Foundation", + "url": "https://wikimediafoundation.org/" + }, + { + "title": "Internet Archive", + "url": "https://archive.org/" + }, + { + "title": "Semantic Scholar", + "url": "https://www.semanticscholar.org/" + }, + { + "title": "International Fact-Checking Network", + "url": "https://www.poynter.org/ifcn/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E367", + "customFields": [ + { + "label": "Maturity", + "value": "Conceptual; partial implementations" + }, + { + "label": "Key Insight", + "value": "Knowledge systems need deliberate design" + }, + { + "label": "Key Challenge", + "value": "Coordination, funding, governance" + }, + { + "label": "Key Examples", + "value": "Wikipedia, Semantic Scholar, fact-checking networks" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "hybrid-systems", + "title": "AI-Human Hybrid Systems", + "description": "AI-human hybrid systems are designs that deliberately combine AI capabilities with human judgment to achieve outcomes better than either could produce alone. Rather than full automation or human-only processes, hybrid systems aim to capture the benefits of AI (scale, speed, consistency, pattern recognition) while preserving the benefits of human judgment (contextual understanding, values, robustness to novel situations).\n\nEffective hybrid systems require careful design to avoid the pathologies of both pure automation and nominal human oversight. Automation bias leads humans to defer to AI even when AI is wrong. Rubber-stamp oversight gives an illusion of human control without substance. The challenge is creating systems where humans genuinely contribute and AI genuinely assists, rather than one side dominating or the partnership failing.\n\nExamples of promising hybrid approaches include: AI systems that flag decisions for human review based on uncertainty or stakes, rather than automating all decisions; human-in-the-loop systems where AI drafts and humans edit; collaborative intelligence systems where AI and humans have complementary roles; and AI tutoring systems that guide rather than replace learning. 
For AI safety, hybrid systems represent a middle ground between naive confidence in human oversight and resignation to full AI autonomy.\n", + "tags": [ + "human-ai-interaction", + "ai-control", + "decision-making", + "automation-bias", + "ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "automation-bias", + "type": "risk" + }, + { + "id": "erosion-of-agency", + "type": "risk" + }, + { + "id": "enfeeblement", + "type": "risk" + }, + { + "id": "learned-helplessness", + "type": "risk" + }, + { + "id": "expertise-atrophy", + "type": "risk" + } + ], + "sources": [ + { + "title": "Humans and Automation: Use, Misuse, Disuse, Abuse", + "author": "Parasuraman & Riley", + "date": "1997" + }, + { + "title": "High-Performance Medicine: Convergence of AI and Human Expertise", + "url": "https://www.nature.com/articles/s41591-018-0300-7", + "author": "Eric Topol", + "date": "2019" + }, + { + "title": "Stanford HAI", + "url": "https://hai.stanford.edu/" + }, + { + "title": "Redwood Research", + "url": "https://www.redwoodresearch.org/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E368", + "customFields": [ + { + "label": "Maturity", + "value": "Emerging field; active research" + }, + { + "label": "Key Strength", + "value": "Combines AI scale with human robustness" + }, + { + "label": "Key Challenge", + "value": "Avoiding the worst of both" + }, + { + "label": "Related Fields", + "value": "HITL, human-computer interaction, AI safety" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "prediction-markets", + "title": "Prediction Markets", + "description": "Prediction markets use market mechanisms to aggregate beliefs about future events, producing probability estimates that reflect the collective knowledge of participants. Unlike polls or expert surveys, prediction markets create incentives for truthful revelation of beliefs - participants profit by being right, not by appearing smart or conforming to social expectations. This makes them resistant to many of the biases that afflict other forecasting methods.\n\nEmpirically, prediction markets have strong track records. They consistently outperform expert panels on questions with clear resolution criteria. Platforms like Polymarket, Metaculus, and Manifold generate forecasts on AI development, geopolitical events, and scientific questions that often prove more accurate than institutional predictions. The Good Judgment Project demonstrated that carefully selected forecasters using prediction market-like mechanisms could outperform intelligence analysts with access to classified information.\n\nFor AI governance and epistemic security, prediction markets offer several valuable functions. They can provide credible forecasts of AI capability development, helping policymakers time interventions appropriately. They can surface genuine expert consensus (or lack thereof) on contested questions. They can create accountability for AI labs' claims about safety and timelines. 
And they can provide a coordination mechanism for collective knowledge that is resistant to the manipulation that undermines traditional media and expert systems.\n", + "tags": [ + "forecasting", + "information-aggregation", + "mechanism-design", + "collective-intelligence", + "decision-making" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "flash-dynamics", + "type": "risk" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "consensus-manufacturing", + "type": "risk" + } + ], + "sources": [ + { + "title": "Prediction Markets", + "url": "https://www.aeaweb.org/articles?id=10.1257/0895330041371321", + "author": "Wolfers & Zitzewitz", + "date": "2004" + }, + { + "title": "Superforecasting", + "author": "Philip Tetlock", + "date": "2015" + }, + { + "title": "Futarchy: Vote Values, Bet Beliefs", + "url": "https://mason.gmu.edu/~rhanson/futarchy.html", + "author": "Robin Hanson" + }, + { + "title": "Metaculus", + "url": "https://www.metaculus.com/" + }, + { + "title": "Good Judgment Project", + "url": "https://goodjudgment.com/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E369", + "customFields": [ + { + "label": "Maturity", + "value": "Growing adoption; proven concept" + }, + { + "label": "Key Strength", + "value": "Incentive-aligned information aggregation" + }, + { + "label": "Key Limitation", + "value": "Liquidity, legal barriers, manipulation risk" + }, + { + "label": "Key Players", + "value": "Polymarket, Metaculus, Manifold, Kalshi" + } + ], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "value-learning", + "title": "Value Learning", + "description": "Research agenda focused on AI systems learning human values from data, behavior, or feedback rather than explicit specification.", + "tags": [ + "alignment", + "values", + "learning" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "rlhf", + "type": "approach" + }, + { + "id": "reward-hacking", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E370", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "safety-agenda" + }, + { + "id": "prosaic-alignment", + "title": "Prosaic Alignment", + "description": "Approach to AI alignment that doesn't require fundamental theoretical breakthroughs, focusing on scaling current techniques.", + "tags": [ + "alignment", + "research-agenda" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E371", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "safety-agenda" + }, + { + "id": "ai-executive-order", + "title": "Biden AI Executive Order", + "description": "Executive Order 14110 on AI safety signed by President Biden in October 2023, establishing AI safety reporting requirements.", + "tags": [ + "policy", + "us-government", + "regulation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "us-aisi", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E372", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "eval-saturation", + "title": "Eval Saturation & The Evals Gap", + "description": "Benchmark saturation is accelerating—MMLU lasted 4 years, MMLU-Pro 18 months, HLE roughly 12 months—while safety-critical evaluations for CBRN, cyber, and AI R&D capabilities are losing signal at frontier labs, raising questions about whether evaluation-based governance frameworks can keep pace with 
capability growth.", + "tags": [ + "benchmarks", + "evaluation-gap", + "responsible-scaling", + "safety-evals", + "governance" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + }, + { + "id": "evaluation-awareness", + "type": "approach" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E373", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "evaluation-awareness", + "title": "Evaluation Awareness", + "description": "AI models increasingly detect when they are being evaluated and adjust their behavior accordingly. Claude Sonnet 4.5 detected evaluation contexts 58% of the time, and for Opus 4.6 Apollo Research reported evaluation awareness so strong they could not properly assess alignment. Awareness scales as a power law with model size.", + "tags": [ + "evaluation-gaming", + "deception", + "scheming", + "scaling-laws", + "behavioral-evaluation" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "eval-saturation", + "type": "approach" + }, + { + "id": "scheming", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E374", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "alignment", + "title": "AI Alignment", + "description": "Technical approaches to ensuring AI systems pursue intended goals and remain aligned with human values throughout training and deployment. 
Current methods show promise but face fundamental scalability challenges, with oversight success dropping to 52% at 400 Elo capability gaps.", + "tags": [ + "alignment", + "scalable-oversight", + "rlhf", + "deceptive-alignment", + "safety-research" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E375", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "scalable-eval-approaches", + "title": "Scalable Eval Approaches", + "description": "Practical approaches for scaling AI evaluation to keep pace with capability growth, including LLM-as-judge (40% production adoption but theoretically capped at 2x sample efficiency), automated behavioral evals, AI-assisted red teaming, CoT monitoring, and debate-based evaluation achieving 76-88% accuracy.", + "tags": [ + "llm-as-judge", + "automated-evals", + "red-teaming", + "scalable-evaluation", + "audit-capacity" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "metr", + "type": "lab" + }, + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "eval-saturation", + "type": "approach" + }, + { + "id": "evaluation-awareness", + "type": "approach" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E376", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "scheming-detection", + "title": "Scheming & Deception Detection", + "description": "Research and evaluation methods for identifying when AI models engage in strategic deception—pretending to be aligned while secretly pursuing other goals—including behavioral tests, internal monitoring, and emerging detection techniques. Frontier models exhibit in-context scheming at rates of 0.3-13%.", + "tags": [ + "scheming", + "deception-detection", + "behavioral-testing", + "chain-of-thought", + "interpretability" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E377", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "dangerous-cap-evals", + "title": "Dangerous Capability Evaluations", + "description": "Systematic testing of AI models for dangerous capabilities including bioweapons assistance, cyberattack potential, autonomous self-replication, and persuasion/manipulation abilities to inform deployment decisions and safety policies. 
Now standard practice with 95%+ frontier model coverage.", + "tags": [ + "dangerous-capabilities", + "bioweapons", + "cybersecurity", + "self-replication", + "deployment-decisions", + "responsible-scaling" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "metr", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E378", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "capability-elicitation", + "title": "Capability Elicitation", + "description": "Systematic methods to discover what AI models can actually do, including hidden capabilities that may not appear in standard benchmarks, through scaffolding, fine-tuning, and specialized prompting techniques. METR research shows the length of tasks AI agents can complete doubles roughly every 7 months.", + "tags": [ + "elicitation", + "sandbagging", + "scaffolding", + "capability-assessment", + "hidden-capabilities" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "metr", + "type": "lab" + }, + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "sandbagging", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E379", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "safety-cases", + "title": "AI Safety Cases", + "description": "Structured arguments with supporting evidence that an AI system is safe for deployment, adapted from high-stakes industries like nuclear and aviation to provide rigorous documentation of safety claims and assumptions. As of 2025, 3 of 4 frontier labs have committed to safety case frameworks.", + "tags": [ + "safety-cases", + "governance", + "deployment-decisions", + "auditing", + "responsible-scaling" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "deepmind", + "type": "lab" + }, + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "scheming", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E380", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "sleeper-agent-detection", + "title": "Sleeper Agent Detection", + "description": "Methods to detect AI models that behave safely during training and evaluation but defect under specific deployment conditions, addressing the core threat of deceptive alignment through behavioral testing, interpretability, and monitoring approaches.
Current methods achieve only 5-40% success rates.", + "tags": [ + "sleeper-agents", + "backdoor-detection", + "deceptive-alignment", + "interpretability", + "ai-control" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "ai-control", + "type": "safety-agenda" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E381", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "ai-assisted", + "title": "AI-Assisted Alignment", + "description": "Using current AI systems to assist with alignment research tasks including red-teaming, interpretability, and recursive oversight. AI-assisted red-teaming reduces jailbreak success rates from 86% to 4.4%, and weak-to-strong generalization can recover GPT-3.5-level performance from GPT-2 supervision.", + "tags": [ + "ai-assisted-research", + "red-teaming", + "interpretability", + "recursive-oversight", + "scalable-alignment" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "weak-to-strong", + "type": "approach" + }, + { + "id": "constitutional-ai", + "type": "approach" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E382", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "evaluation", + "title": "AI Evaluation", + "description": "Methods and frameworks for evaluating AI system safety, capabilities, and alignment properties before deployment, including dangerous capability detection, robustness testing, and deceptive behavior assessment.", + "tags": [ + "evaluation", + "safety-testing", + "deployment-decisions", + "capability-assessment", + "governance" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "metr", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E383", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "alignment-evals", + "title": "Alignment Evaluations", + "description": "Systematic testing of AI models for alignment properties including honesty, corrigibility, goal stability, and absence of deceptive behavior. 
Apollo Research found 1-13% scheming rates across frontier models, while TruthfulQA shows 58-85% accuracy on factual questions.", + "tags": [ + "alignment-evaluation", + "scheming-detection", + "sycophancy", + "corrigibility", + "behavioral-testing" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E384", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "red-teaming", + "title": "Red Teaming", + "description": "Adversarial testing methodologies to systematically identify AI system vulnerabilities, dangerous capabilities, and failure modes through structured adversarial evaluation. Effectiveness rates vary from 10-80% depending on attack method.", + "tags": [ + "adversarial-testing", + "vulnerability-discovery", + "jailbreaking", + "safety-testing", + "cybersecurity" + ], + "clusters": [ + "ai-safety", + "cyber" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "metr", + "type": "lab" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E385", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "model-auditing", + "title": "Third-Party Model Auditing", + "description": "External organizations independently assess AI models for safety and dangerous capabilities. METR, Apollo Research, and government AI Safety Institutes now conduct pre-deployment evaluations of all major frontier models, with the field evolving from voluntary arrangements to EU AI Act mandatory requirements.", + "tags": [ + "third-party-auditing", + "independent-evaluation", + "governance", + "deployment-oversight", + "regulatory-compliance" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "metr", + "type": "lab" + }, + { + "id": "apollo-research", + "type": "lab" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "scheming", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E386", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "constitutional-ai", + "title": "Constitutional AI", + "description": "Anthropic's Constitutional AI methodology uses explicit principles and AI-generated feedback to train safer language models, demonstrating 3-10x improvements in harmlessness while maintaining helpfulness across major model deployments.", + "tags": [ + "constitutional-ai", + "rlaif", + "harmlessness", + "training-methodology", + "anthropic" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "rlhf", + "type": "approach" + }, + { + "id": "alignment", + "type": "approach" + }, + { + "id": "reward-hacking", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E387", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "weak-to-strong", + "title": "Weak-to-Strong Generalization", + "description": "Weak-to-strong generalization investigates whether weak supervisors can reliably elicit good behavior from 
stronger AI systems. OpenAI's ICML 2024 research shows that a GPT-4-level model trained with GPT-2-level supervision and an auxiliary confidence loss can recover about 80% of the performance gap, but reward modeling achieves only 20-40% PGR.", + "tags": [ + "weak-to-strong", + "scalable-oversight", + "superalignment", + "supervision", + "reward-modeling" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "openai", + "type": "lab" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "rlhf", + "type": "approach" + }, + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E388", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "capability-unlearning", + "title": "Capability Unlearning / Removal", + "description": "Methods to remove specific dangerous capabilities from trained AI models, directly addressing misuse risks by eliminating harmful knowledge, though current techniques face challenges around verification, capability recovery, and general performance degradation.", + "tags": [ + "unlearning", + "capability-removal", + "misuse-prevention", + "model-editing", + "bioweapons" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "cais", + "type": "lab" + }, + { + "id": "representation-engineering", + "type": "approach" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E389", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "preference-optimization", + "title": "Preference Optimization Methods", + "description": "Post-RLHF training techniques including DPO, ORPO, KTO, IPO, and GRPO that align language models with human preferences more efficiently than reinforcement learning.
DPO reduces costs by 40-60% while matching RLHF performance on dialogue tasks, though PPO still outperforms on reasoning and safety tasks.", + "tags": [ + "dpo", + "preference-optimization", + "rlhf", + "training-efficiency", + "alignment-training" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "rlhf", + "type": "approach" + }, + { + "id": "reward-hacking", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E390", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "process-supervision", + "title": "Process Supervision", + "description": "Process supervision trains AI systems to produce correct reasoning steps, not just correct final answers, improving transparency and auditability of AI reasoning while achieving significant gains in mathematical and coding tasks.", + "tags": [ + "process-supervision", + "chain-of-thought", + "reasoning-verification", + "reward-modeling", + "transparency" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "openai", + "type": "lab" + }, + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "rlhf", + "type": "approach" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E391", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "refusal-training", + "title": "Refusal Training", + "description": "Refusal training teaches AI models to decline harmful requests rather than comply. Although such training is universally deployed and achieves 99%+ refusal rates on explicit violations, jailbreak techniques bypass its defenses with 1.5-6.5% success rates, and over-refusal blocks 12-43% of legitimate queries.", + "tags": [ + "refusal-training", + "jailbreaking", + "safety-training", + "rlhf", + "over-refusal", + "misuse-prevention" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "openai", + "type": "lab" + }, + { + "id": "rlhf", + "type": "approach" + }, + { + "id": "deceptive-alignment", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E392", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "california-sb53", + "title": "California SB 53", + "description": "California's Transparency in Frontier Artificial Intelligence Act, the first U.S. state law regulating frontier AI models through transparency requirements, safety reporting, and whistleblower protections.", + "tags": [ + "regulation", + "state-policy", + "frontier-models", + "transparency", + "whistleblower", + "california" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "california-sb1047", + "type": "policy" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "new-york-raise-act", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E393", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "intervention-portfolio", + "title": "Intervention Portfolio", + "description": "Strategic overview of AI safety interventions analyzing ~$650M annual investment across 1,100 FTEs.
Maps 13+ interventions against 4 risk categories with ITN prioritization, finding 85% of external funding from 5 sources and safety/capabilities ratio at 0.5-1.3%.", + "tags": [ + "resource-allocation", + "field-analysis", + "funding", + "prioritization", + "safety-research" + ], + "clusters": [ + "ai-safety", + "governance", + "community" + ], + "relatedEntries": [ + { + "id": "technical-ai-safety", + "type": "concept" + }, + { + "id": "coefficient-giving", + "type": "organization" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + }, + { + "id": "interpretability", + "type": "concept" + }, + { + "id": "evals", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E394", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "evals-governance", + "title": "Evals-Based Deployment Gates", + "description": "Evals-based deployment gates require AI models to pass safety evaluations before deployment or capability scaling. The EU AI Act mandates conformity assessments for high-risk systems with fines up to EUR 35M or 7% global turnover, while UK AISI has evaluated 30+ frontier models.", + "tags": [ + "evaluations", + "deployment-gates", + "eu-ai-act", + "safety-testing", + "third-party-audits" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + }, + { + "id": "anthropic", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E395", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "pause-moratorium", + "title": "Pause / Moratorium", + "description": "Proposals to pause or slow frontier AI development until safety is better understood, offering potentially high safety benefits if implemented but facing significant coordination challenges and currently lacking adoption by major AI laboratories.", + "tags": [ + "moratorium", + "development-pause", + "coordination", + "precautionary-principle", + "racing-dynamics" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "fli", + "type": "organization" + }, + { + "id": "stuart-russell", + "type": "researcher" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "pause", + "type": "approach" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E396", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "rsp", + "title": "Responsible Scaling Policies", + "description": "Responsible Scaling Policies (RSPs) are voluntary commitments by AI labs to pause scaling when capability or safety thresholds are crossed. 
As of December 2025, 20 companies have published policies, though SaferAI grades the three major frameworks 1.9-2.2/5 for specificity.", + "tags": [ + "responsible-scaling", + "voluntary-commitments", + "safety-thresholds", + "frontier-labs", + "third-party-evaluation" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "deepmind", + "type": "organization" + }, + { + "id": "metr", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E397", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "corporate", + "title": "Corporate Responses", + "description": "How major AI companies are responding to safety concerns through internal policies, responsible scaling frameworks, safety teams, and disclosure practices, with analysis of effectiveness and industry trends.", + "tags": [ + "corporate-safety", + "safety-teams", + "voluntary-commitments", + "industry-practices", + "racing-dynamics" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "responsible-scaling-policies", + "type": "policy" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "frontier-model-forum", + "type": "organization" + }, + { + "id": "racing-dynamics", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E398", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "hardware-enabled-governance", + "title": "Hardware-Enabled Governance", + "description": "Technical mechanisms built into AI chips enabling monitoring, access control, and enforcement of AI governance policies. 
RAND analysis identifies attestation-based licensing as most feasible, with a 5-10 year timeline, while an estimated 100,000+ export-controlled GPUs were smuggled to China in 2024.", + "tags": [ + "hardware-governance", + "chip-tracking", + "export-controls", + "compute-governance", + "remote-attestation" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "export-controls", + "type": "policy" + }, + { + "id": "thresholds", + "type": "policy" + }, + { + "id": "monitoring", + "type": "policy" + }, + { + "id": "international-regimes", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E399", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "monitoring", + "title": "Compute Monitoring", + "description": "Framework analyzing compute monitoring approaches for AI governance, finding that cloud KYC targeting the 10^26 FLOP threshold is implementable now via three major providers controlling 60%+ of cloud infrastructure, while hardware-level governance faces 3-5 year development timelines.", + "tags": [ + "compute-monitoring", + "cloud-kyc", + "compute-governance", + "training-runs", + "verification" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "export-controls", + "type": "policy" + }, + { + "id": "thresholds", + "type": "policy" + }, + { + "id": "international-regimes", + "type": "policy" + }, + { + "id": "hardware-enabled-governance", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E400", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "thresholds", + "title": "Compute Thresholds", + "description": "Analysis of compute thresholds as regulatory triggers, examining current implementations (EU AI Act at 10^25 FLOP, US EO at 10^26 FLOP), their effectiveness as capability proxies, and core challenges including algorithmic efficiency improvements that may render static thresholds obsolete within 3-5 years.", + "tags": [ + "compute-thresholds", + "regulation", + "eu-ai-act", + "flop-thresholds", + "algorithmic-efficiency", + "compute-governance" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "export-controls", + "type": "policy" + }, + { + "id": "monitoring", + "type": "policy" + }, + { + "id": "international-regimes", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E401", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "lab-culture", + "title": "Lab Safety Culture", + "description": "Analysis of interventions to improve safety culture within AI labs.
Evidence from 2024-2025 shows significant gaps: no company scored above C+ overall (FLI Winter 2025), all received D or below on existential safety, and xAI released Grok 4 without any safety documentation.", + "tags": [ + "safety-culture", + "organizational-practices", + "safety-teams", + "whistleblower", + "industry-accountability" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "frontier-model-forum", + "type": "organization" + }, + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "whistleblower-protections", + "type": "policy" + }, + { + "id": "ai-safety-institutes", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E402", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "pause", + "title": "Pause Advocacy", + "description": "Advocacy for slowing or halting frontier AI development until adequate safety measures are in place. Analysis suggests 15-40% probability of meaningful policy implementation by 2030, with potential to provide 2-5 years of additional safety research time if achieved.", + "tags": [ + "pause-advocacy", + "development-moratorium", + "political-advocacy", + "public-opinion", + "racing-dynamics" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "fli", + "type": "organization" + }, + { + "id": "cais", + "type": "organization" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + }, + { + "id": "pause-moratorium", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E403", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "training-programs", + "title": "AI Safety Training Programs", + "description": "Fellowships, PhD programs, research mentorship, and career transition pathways for growing the AI safety research workforce, including MATS, Anthropic Fellows, SPAR, and academic programs.", + "tags": [ + "training-programs", + "talent-pipeline", + "field-building", + "research-mentorship", + "career-development" + ], + "clusters": [ + "community", + "ai-safety" + ], + "relatedEntries": [ + { + "id": "anthropic", + "type": "organization" + }, + { + "id": "coefficient-giving", + "type": "organization" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "field-building-analysis", + "type": "approach" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E404", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "bletchley-declaration", + "title": "Bletchley Declaration", + "description": "World-first international agreement on AI safety signed by 28 countries at the November 2023 AI Safety Summit, committing to cooperation on frontier AI risks.", + "tags": [ + "international-agreement", + "ai-summit", + "frontier-ai-safety", + "diplomatic-cooperation", + "ai-safety-institutes" + ], + "clusters": [ + "governance", + "ai-safety" + ], + "relatedEntries": [ + { + "id": "ai-safety-institutes", + "type": "policy" + }, + { + "id": "uk-aisi", + "type": "organization" + }, + { + "id": "us-aisi", + "type": "organization" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "coordination-mechanisms", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E405", + "customFields": [], + "relatedTopics": [], + 
"entityType": "policy" + }, + { + "id": "coordination-mechanisms", + "title": "International Coordination Mechanisms", + "description": "International coordination on AI safety involves multilateral treaties, bilateral dialogues, and institutional networks to manage AI risks globally. Current efforts include the Council of Europe AI Treaty (17 signatories), the International Network of AI Safety Institutes (11+ members), and the Paris Summit 2025 with 61 signatories.", + "tags": [ + "international-coordination", + "multilateral-treaties", + "ai-safety-institutes", + "diplomacy", + "geopolitics" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "bletchley-declaration", + "type": "policy" + }, + { + "id": "ai-safety-institutes", + "type": "policy" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "racing-dynamics", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E406", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "new-york-raise-act", + "title": "New York RAISE Act", + "description": "State legislation requiring safety protocols, incident reporting, and transparency from developers of frontier AI models. Signed December 2025, effective January 2027, with civil penalties up to $3M enforced by the NY Attorney General.", + "tags": [ + "regulation", + "state-policy", + "frontier-models", + "safety-protocols", + "third-party-audits", + "new-york" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "california-sb53", + "type": "policy" + }, + { + "id": "thresholds", + "type": "policy" + }, + { + "id": "ai-governance", + "type": "policy" + }, + { + "id": "openai", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E407", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "model-registries", + "title": "Model Registries", + "description": "Centralized databases of frontier AI models that enable governments to track development, enforce safety requirements, and coordinate international oversight, serving as foundational infrastructure for AI governance analogous to drug registries for the FDA.", + "tags": [ + "model-registration", + "governance-infrastructure", + "compute-thresholds", + "incident-reporting", + "transparency" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "export-controls", + "type": "policy" + }, + { + "id": "ai-safety-institutes", + "type": "policy" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E408", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "international-regimes", + "title": "International Compute Regimes", + "description": "Multilateral coordination mechanisms for AI compute governance, exploring pathways from non-binding declarations to comprehensive treaties. 
Assessment finds 10-25% chance of meaningful regimes by 2035, but potential for 30-60% reduction in racing dynamics if achieved.", + "tags": [ + "international-governance", + "compute-governance", + "multilateral-treaties", + "verification", + "racing-dynamics" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "bletchley-declaration", + "type": "policy" + }, + { + "id": "export-controls", + "type": "policy" + }, + { + "id": "thresholds", + "type": "policy" + }, + { + "id": "monitoring", + "type": "policy" + }, + { + "id": "ai-safety-institutes", + "type": "policy" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E409", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "open-source", + "title": "Open Source Safety", + "description": "Analysis of whether releasing AI model weights publicly is net positive or negative for safety. The July 2024 NTIA report recommends monitoring but not restricting open weights, while research shows fine-tuning can remove safety training in as few as 200 examples.", + "tags": [ + "open-source", + "model-weights", + "misuse-risk", + "decentralization", + "safety-training" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "openai", + "type": "organization" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "proliferation", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E410", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "whistleblower-protections", + "title": "AI Whistleblower Protections", + "description": "Legal and institutional frameworks for protecting AI researchers and employees who report safety concerns. The bipartisan AI Whistleblower Protection Act (S.1792) introduced May 2025 addresses critical gaps in current law, while EU AI Act Article 87 provides protections from August 2026.", + "tags": [ + "whistleblower", + "employee-protections", + "information-asymmetry", + "ndas", + "legislation" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "lab-culture", + "type": "approach" + }, + { + "id": "ai-safety-institutes", + "type": "policy" + }, + { + "id": "responsible-scaling-policies", + "type": "policy" + }, + { + "id": "eu-ai-act", + "type": "policy" + }, + { + "id": "openai", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E411", + "customFields": [], + "relatedTopics": [], + "entityType": "policy" + }, + { + "id": "field-building-analysis", + "title": "Field Building Analysis", + "description": "Analysis of AI safety field-building interventions including education programs (ARENA, MATS, BlueDot). 
The field grew from approximately 400 FTEs in 2022 to 1,100 FTEs in 2025 (21-30% annual growth), with training programs achieving 37% career conversion rates.", + "tags": [ + "field-building", + "talent-pipeline", + "training-programs", + "workforce-growth", + "funding-analysis" + ], + "clusters": [ + "community", + "ai-safety" + ], + "relatedEntries": [ + { + "id": "coefficient-giving", + "type": "organization" + }, + { + "id": "technical-ai-safety", + "type": "concept" + }, + { + "id": "training-programs", + "type": "approach" + }, + { + "id": "intervention-portfolio", + "type": "approach" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E412", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "mech-interp", + "title": "Mechanistic Interpretability", + "description": "Mechanistic interpretability reverse-engineers neural networks to understand their internal computations and circuits. With $500M+ annual investment, Anthropic extracted 30M+ features from Claude 3 Sonnet in 2024, while DeepMind deprioritized SAE research after finding linear probes outperform on practical tasks.", + "tags": [ + "interpretability", + "neural-network-analysis", + "feature-extraction", + "circuit-discovery", + "deception-detection" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "sparse-autoencoders", + "type": "approach" + }, + { + "id": "representation-engineering", + "type": "approach" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "anthropic", + "type": "lab-frontier" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E413", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "circuit-breakers", + "title": "Circuit Breakers / Inference Interventions", + "description": "Circuit breakers are runtime safety interventions that detect and halt harmful AI outputs during inference. Gray Swan's representation rerouting achieves 87-90% rejection rates with only 1% capability loss, while Anthropic's Constitutional Classifiers block 95.6% of jailbreaks. 
However, the UK AISI challenge found all 22 tested models could eventually be broken.", + "tags": [ + "runtime-safety", + "inference-intervention", + "jailbreak-defense", + "adversarial-robustness", + "defense-in-depth" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "output-filtering", + "type": "approach" + }, + { + "id": "adversarial-training", + "type": "approach" + }, + { + "id": "refusal-training", + "type": "approach" + }, + { + "id": "anthropic", + "type": "lab-frontier" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E414", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "representation-engineering", + "title": "Representation Engineering", + "description": "A top-down approach to understanding and controlling AI behavior by reading and modifying concept-level representations in neural networks, enabling behavior steering without retraining through activation interventions.", + "tags": [ + "behavior-steering", + "activation-engineering", + "deception-detection", + "interpretability", + "inference-time-intervention" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "mech-interp", + "type": "approach" + }, + { + "id": "constitutional-ai", + "type": "approach" + }, + { + "id": "ai-control", + "type": "approach" + }, + { + "id": "cais", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E415", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "sparse-autoencoders", + "title": "Sparse Autoencoders (SAEs)", + "description": "Sparse autoencoders extract interpretable features from neural network activations using sparsity constraints. Anthropic's 2024 research extracted 34 million features from Claude 3 Sonnet with 90% interpretability scores, while Goodfire raised $50M in 2025 and released first-ever SAEs for the 671B-parameter DeepSeek R1 reasoning model.", + "tags": [ + "interpretability", + "feature-extraction", + "monosemanticity", + "neural-network-analysis", + "safety-tooling" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "mech-interp", + "type": "approach" + }, + { + "id": "representation-engineering", + "type": "approach" + }, + { + "id": "goodfire", + "type": "organization" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "anthropic", + "type": "lab-frontier" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E416", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "eliciting-latent-knowledge", + "title": "Eliciting Latent Knowledge (ELK)", + "description": "ELK is the unsolved problem of extracting an AI's true beliefs rather than human-approved outputs. ARC's 2022 prize contest received 197 proposals and awarded $274K, but the $50K and $100K solution prizes remain unclaimed. 
The problem remains fundamentally unsolved after 3+ years of focused research.", + "tags": [ + "alignment-theory", + "deception-detection", + "belief-extraction", + "arc-research", + "unsolved-problem" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "scalable-oversight", + "type": "approach" + }, + { + "id": "interpretability", + "type": "approach" + }, + { + "id": "open-philanthropy", + "type": "funder" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E417", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "debate", + "title": "AI Safety via Debate", + "description": "AI Safety via Debate proposes using adversarial AI systems to argue opposing positions while humans judge, designed to scale alignment to superhuman capabilities. While theoretically promising and specifically designed to address RLHF's scalability limitations, it remains experimental with limited empirical validation.", + "tags": [ + "scalable-oversight", + "adversarial-methods", + "superhuman-alignment", + "alignment-theory", + "human-judgment" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "rlhf", + "type": "approach" + }, + { + "id": "scalable-oversight", + "type": "approach" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "anthropic", + "type": "lab-frontier" + }, + { + "id": "openai", + "type": "lab-frontier" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E418", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "formal-verification", + "title": "Formal Verification", + "description": "Mathematical proofs of AI system properties and behavior bounds, offering potentially strong safety guarantees if achievable but currently limited to small systems and facing fundamental challenges scaling to modern neural networks.", + "tags": [ + "formal-methods", + "mathematical-guarantees", + "safety-verification", + "provable-safety", + "long-term-research" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "provably-safe", + "type": "approach" + }, + { + "id": "interpretability", + "type": "approach" + }, + { + "id": "constitutional-ai", + "type": "approach" + }, + { + "id": "deceptive-alignment", + "type": "risk" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E419", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "provably-safe", + "title": "Provably Safe AI (davidad agenda)", + "description": "An ambitious research agenda to design AI systems with mathematical safety guarantees from the ground up, led by ARIA's GBP 59M Safeguarded AI programme with the goal of creating superintelligent systems that are provably beneficial through formal verification of world models and value specifications.", + "tags": [ + "formal-methods", + "mathematical-guarantees", + "world-modeling", + "value-specification", + "aria-programme", + "long-term-research" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "formal-verification", + "type": "approach" + }, + { + "id": "constitutional-ai", + "type": "approach" + }, + { + "id": "ai-control", + "type": "approach" + }, + { + "id": "interpretability", + "type": "approach" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E420", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, +
{ + "id": "sandboxing", + "title": "Sandboxing / Containment", + "description": "Sandboxing limits AI system access to resources, networks, and capabilities as a defense-in-depth measure. METR's August 2025 evaluation found GPT-5's time horizon at approximately 2 hours, insufficient for autonomous replication. AI boxing experiments show 60-70% social engineering escape rates.", + "tags": [ + "containment", + "defense-in-depth", + "agent-safety", + "container-security", + "deployment-safety" + ], + "clusters": [ + "ai-safety", + "cyber" + ], + "relatedEntries": [ + { + "id": "tool-restrictions", + "type": "approach" + }, + { + "id": "structured-access", + "type": "approach" + }, + { + "id": "agentic-ai", + "type": "concept" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "anthropic", + "type": "lab-frontier" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E421", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "structured-access", + "title": "Structured Access / API-Only", + "description": "Structured access provides AI capabilities through controlled APIs rather than releasing model weights, maintaining developer control over deployment and enabling monitoring, intervention, and policy enforcement. Enterprise LLM spend reached $8.4B by mid-2025 under this model, but effectiveness depends on maintaining capability gaps with open-weight models.", + "tags": [ + "deployment-safety", + "api-access", + "proliferation-control", + "enterprise-ai", + "model-distribution" + ], + "clusters": [ + "ai-safety", + "governance" + ], + "relatedEntries": [ + { + "id": "proliferation", + "type": "risk" + }, + { + "id": "sandboxing", + "type": "approach" + }, + { + "id": "openai", + "type": "lab-frontier" + }, + { + "id": "anthropic", + "type": "lab-frontier" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E422", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "tool-restrictions", + "title": "Tool-Use Restrictions", + "description": "Tool-use restrictions limit what actions and APIs AI systems can access, directly constraining their potential for harm. This approach is critical for agentic AI systems, providing hard limits on capabilities regardless of model intentions, with METR evaluations showing agentic task completion horizons doubling every 7 months.", + "tags": [ + "agent-safety", + "capability-restrictions", + "defense-in-depth", + "deployment-safety", + "permission-systems", + "mcp-security" + ], + "clusters": [ + "ai-safety", + "governance", + "cyber" + ], + "relatedEntries": [ + { + "id": "sandboxing", + "type": "approach" + }, + { + "id": "agentic-ai", + "type": "concept" + }, + { + "id": "metr", + "type": "organization" + }, + { + "id": "anthropic", + "type": "lab-frontier" + }, + { + "id": "openai", + "type": "lab-frontier" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E423", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "multi-agent", + "title": "Multi-Agent Safety", + "description": "Multi-agent safety research addresses coordination failures, conflict, and collusion risks when multiple AI systems interact. 
A 2025 report from 50+ researchers across DeepMind, Anthropic, and academia identifies seven key risk factors and finds that even individually safe systems may contribute to harm through interaction.", + "tags": [ + "multi-agent-systems", + "coordination", + "collusion-risk", + "game-theory", + "agent-safety" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "red-teaming", + "type": "approach" + }, + { + "id": "scalable-oversight", + "type": "approach" + }, + { + "id": "agentic-ai", + "type": "concept" + }, + { + "id": "cooperative-ai", + "type": "organization" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E424", + "customFields": [], + "relatedTopics": [], + "entityType": "approach" + }, + { + "id": "authentication-collapse", + "title": "Authentication Collapse", + "description": "Authentication collapse occurs when the systems we rely on to verify whether content is real can no longer keep pace with synthetic content generation. Currently, we use various signals to determine authenticity - metadata, forensic analysis, source reputation, and increasingly AI-based detection tools. Authentication collapse would mean these defenses fail comprehensively.\n\nThe core problem is a fundamental asymmetry: generating convincing fake content is becoming easier and cheaper, while reliably detecting fakes is becoming harder. Current AI detectors already struggle with cutting-edge generators, and detection methods that work today may fail tomorrow as generators improve. Watermarking schemes can often be removed or spoofed. The offense-defense balance structurally favors offense.\n\nThe consequences of authentication collapse extend beyond misinformation. Legal systems depend on evidence being verifiable - what happens when any video or audio recording could plausibly be fake? Financial systems rely on identity verification. Historical archives could be corrupted with convincing forgeries. The \"liar's dividend\" effect means even real evidence can be dismissed as potentially fake. Once authentication collapses, rebuilding trust in any form of digital evidence becomes extremely difficult.\n", + "tags": [ + "deepfakes", + "content-verification", + "watermarking", + "digital-forensics", + "provenance" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "C2PA: Coalition for Content Provenance and Authenticity", + "url": "https://c2pa.org/" + }, + { + "title": "DARPA MediFor Program", + "url": "https://www.darpa.mil/program/media-forensics" + }, + { + "title": "AI Text Detection is Unreliable", + "url": "https://arxiv.org/abs/2303.11156", + "author": "Kirchner et al.", + "date": "2023" + }, + { + "title": "Deepfake Detection Survey", + "url": "https://arxiv.org/abs/2004.11138" + } + ], + "lastUpdated": "2025-12", + "numericId": "E425", + "customFields": [ + { + "label": "Status", + "value": "Detection already failing for cutting-edge generators" + }, + { + "label": "Key Concern", + "value": "Fundamental asymmetry favors generation" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "critical", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2028, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Emerging", + "riskCategory": "epistemic" + }, + { + "id": "authoritarian-tools", + "title": "AI Authoritarian Tools", + "description": "AI can strengthen authoritarian control through surveillance, censorship, propaganda, and prediction of dissent. 
The concern isn't just that AI enables human rights abuses today, but that AI-enabled authoritarianism might become stable and durable—harder to resist or overthrow than historical autocracies.", + "tags": [ + "authoritarianism", + "human-rights", + "digital-repression", + "lock-in", + "governance" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "surveillance", + "type": "risk" + }, + { + "id": "lock-in", + "type": "risk" + }, + { + "id": "concentration-of-power", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Road to Digital Unfreedom: How Artificial Intelligence Is Reshaping Repression", + "author": "Steven Feldstein", + "date": "2019" + }, + { + "title": "How Democracies Die", + "author": "Levitsky and Ziblatt" + }, + { + "title": "The Repressive Power of Artificial Intelligence (Freedom House)", + "url": "https://freedomhouse.org/report/freedom-net/2023/repressive-power-artificial-intelligence", + "date": "2023" + }, + { + "title": "Freedom on the Net 2025: Uncertain Future (Freedom House)", + "url": "https://freedomhouse.org/report/freedom-net/2025/uncertain-future-global-internet", + "date": "2025" + }, + { + "title": "Digital Threats and Elections (Freedom House)", + "url": "https://freedomhouse.org/article/digital-threats-loom-over-busy-year-elections", + "date": "2024" + }, + { + "title": "Getting Ahead of Digital Repression (Stanford FSI)", + "url": "https://fsi.stanford.edu/publication/getting-ahead-digital-repression-authoritarian-innovation-and-democratic-response" + }, + { + "title": "Authoritarianism Could Poison AI (IGCC)", + "url": "https://ucigcc.org/blog/authoritarianism-could-poison-ai/" + }, + { + "title": "AI and Authoritarian Governments (Democratic Erosion)", + "url": "https://democratic-erosion.org/2023/11/17/artificial-intelligence-and-authoritarian-governments/", + "date": "2023" + } + ], + "lastUpdated": "2025-12", + "numericId": "E426", + "customFields": [ + { + "label": "Status", + "value": "Deployed by multiple regimes" + }, + { + "label": "Key Risk", + "value": "Stabilizing autocracy" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "misuse" + }, + { + "id": "automation-bias", + "title": "Automation Bias", + "description": "Automation bias is the tendency to over-trust automated systems and AI outputs, accepting their conclusions without appropriate scrutiny. Humans are prone to defer to systems that appear authoritative, especially when those systems are usually right.
This creates vulnerability when systems are wrong.", + "tags": [ + "human-ai-interaction", + "trust", + "decision-making", + "cognitive-bias", + "ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "sycophancy", + "type": "risk" + }, + { + "id": "enfeeblement", + "type": "risk" + }, + { + "id": "erosion-of-agency", + "type": "risk" + } + ], + "sources": [ + { + "title": "Automation Bias in Decision Making", + "author": "Parasuraman & Manzey" + }, + { + "title": "The Glass Cage", + "author": "Nicholas Carr" + }, + { + "title": "Human Factors research on automation" + } + ], + "lastUpdated": "2025-12", + "numericId": "E427", + "customFields": [ + { + "label": "Type", + "value": "Epistemic" + }, + { + "label": "Status", + "value": "Widespread" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "epistemic" + }, + { + "id": "autonomous-weapons", + "title": "Autonomous Weapons", + "description": "Autonomous weapons systems are weapons that can select and engage targets without human intervention. AI advances are making such systems more capable and more likely to be deployed. The key concerns are: lowered barriers to war, loss of human judgment in life-or-death decisions, and potential for arms races or accidental escalation.", + "tags": [ + "laws", + "military-ai", + "arms-control", + "governance", + "warfare" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "cyberweapons", + "type": "risk" + }, + { + "id": "racing-dynamics", + "type": "risk" + } + ], + "sources": [ + { + "title": "Campaign to Stop Killer Robots", + "url": "https://www.stopkillerrobots.org/" + }, + { + "title": "UN CCW Group of Governmental Experts on LAWS", + "url": "https://meetings.unoda.org/ccw-/convention-on-certain-conventional-weapons-group-of-governmental-experts-on-lethal-autonomous-weapons-systems-2024", + "date": "2024" + }, + { + "title": "Future of Life Institute on Autonomous Weapons", + "url": "https://futureoflife.org/cause-area/autonomous-weapons-systems/" + }, + { + "title": "LAWS and International Law: Growing Momentum (ASIL)", + "url": "https://www.asil.org/insights/volume/29/issue/1", + "date": "2025" + }, + { + "title": "U.S. Policy on Lethal Autonomous Weapon Systems (CRS)", + "url": "https://www.congress.gov/crs-product/IF11150" + }, + { + "title": "National Positions on LAWS Governance (Lieber Institute)", + "url": "https://lieber.westpoint.edu/future-warfare-national-positions-governance-lethal-autonomous-weapons-systems/" + }, + { + "title": "International Discussions on LAWS (CRS)", + "url": "https://www.congress.gov/crs-product/IF11294" + } + ], + "lastUpdated": "2025-12", + "numericId": "E428", + "customFields": [ + { + "label": "Also Called", + "value": "LAWS, killer robots" + }, + { + "label": "Status", + "value": "Active military development" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "misuse" + }, + { + "id": "bioweapons", + "title": "Bioweapons Risk", + "description": "AI systems could accelerate biological weapons development by helping with pathogen design, synthesis planning, or acquisition of dangerous knowledge. 
The concern isn't that AI creates entirely new risks, but that it lowers barriers—making capabilities previously requiring rare expertise more accessible to bad actors.", + "tags": [ + "biosecurity", + "dual-use-research", + "x-risk", + "cbrn", + "ai-misuse" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "cyberweapons", + "type": "risk" + }, + { + "id": "anthropic", + "type": "lab" + } + ], + "sources": [ + { + "title": "The Precipice", + "author": "Toby Ord", + "date": "2020" + }, + { + "title": "Anthropic Responsible Scaling Policy", + "url": "https://www.anthropic.com/news/anthropics-responsible-scaling-policy" + }, + { + "title": "Dual Use of Artificial Intelligence-powered Drug Discovery", + "url": "https://www.nature.com/articles/s42256-022-00465-9" + }, + { + "title": "AI and the Evolution of Biological National Security Risks (CNAS)", + "url": "https://www.cnas.org/publications/reports/ai-and-the-evolution-of-biological-national-security-risks", + "date": "2024" + }, + { + "title": "The Operational Risks of AI in Large-Scale Biological Attacks (RAND)", + "url": "https://www.rand.org/pubs/research_reports/RRA2977-2.html", + "date": "2024" + }, + { + "title": "Biosecurity in the Age of AI (Belfer Center)", + "url": "https://www.belfercenter.org/publication/biosecurity-age-ai-whats-risk" + }, + { + "title": "AI Challenges and Biological Threats (Frontiers in AI)", + "url": "https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2024.1382356/full", + "date": "2024" + }, + { + "title": "National Academies Study on AI Biosecurity", + "url": "https://www.nationalacademies.org/our-work/assessing-and-navigating-biosecurity-concerns-and-benefits-of-artificial-intelligence-use-in-the-life-sciences" + }, + { + "title": "Opportunities to Strengthen U.S. Biosecurity (CSIS)", + "url": "https://www.csis.org/analysis/opportunities-strengthen-us-biosecurity-ai-enabled-bioterrorism-what-policymakers-should", + "date": "2025" + } + ], + "lastUpdated": "2025-12", + "numericId": "E429", + "customFields": [ + { + "label": "Type", + "value": "Misuse" + }, + { + "label": "Key Concern", + "value": "Lowering barriers to development" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2027, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Growing", + "riskCategory": "misuse" + }, + { + "id": "concentration-of-power", + "title": "Concentration of Power", + "description": "AI could enable small groups—companies, governments, or individuals—to accumulate and exercise power at scales previously impossible. 
The concern isn't just inequality (which has always existed) but a qualitative shift in what power concentration looks like when AI can substitute for large numbers of humans across many domains.", + "tags": [ + "governance", + "power-dynamics", + "inequality", + "x-risk", + "lock-in" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "lock-in", + "type": "risk" + }, + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "authoritarian-tools", + "type": "risk" + } + ], + "sources": [ + { + "title": "AI and the Future of Power", + "url": "https://80000hours.org/" + }, + { + "title": "The Precipice", + "author": "Toby Ord" + }, + { + "title": "GovAI Annual Report 2024", + "url": "https://cdn.governance.ai/GovAI_Annual_Report_2024.pdf", + "date": "2024" + }, + { + "title": "Computing Power and the Governance of AI (GovAI)", + "url": "https://www.governance.ai/research-paper/computing-power-and-the-governance-of-artificial-intelligence" + }, + { + "title": "Market Concentration Implications of Foundation Models (GovAI)", + "url": "https://www.governance.ai/research-paper/market-concentration-implications-of-foundation-models" + }, + { + "title": "Power and Governance in the Age of AI (New America)", + "url": "https://www.newamerica.org/planetary-politics/briefs/power-governance-ai-public-good/" + }, + { + "title": "AI, Global Governance, and Digital Sovereignty (arXiv)", + "url": "https://arxiv.org/html/2410.17481v1", + "date": "2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E430", + "customFields": [ + { + "label": "Type", + "value": "Structural/Systemic" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium-high" + }, + "timeframe": { + "median": 2030, + "earliest": 2025, + "latest": 2040 + }, + "maturity": "Growing", + "riskCategory": "structural" + }, + { + "id": "consensus-manufacturing", + "title": "Consensus Manufacturing", + "description": "Consensus manufacturing refers to AI systems being used to create the false appearance of widespread agreement or public support that doesn't actually exist. Unlike traditional astroturfing, which requires human labor for each fake comment or endorsement, AI can generate unlimited quantities of seemingly authentic opinions, comments, testimonials, and social media engagement.\n\nThe mechanism exploits how humans form beliefs about social consensus. We naturally look to what others think as a guide to what is true and acceptable. If everyone seems to agree on something, we tend to go along. AI can flood information channels with coordinated messaging that mimics organic public opinion, making fringe positions appear mainstream and majority positions appear contested. This happened at scale during FCC net neutrality comment periods, where millions of fake public comments were submitted.\n\nThe danger extends beyond misinformation to structural corruption of democratic processes. Town halls, public comment periods, legislative outreach, product reviews, and social media discourse all rely on the assumption that expressed opinions represent real people with genuine views. When AI can simulate entire populations of concerned citizens, the feedback mechanisms that democratic and market systems depend on become unreliable. 
Decisions made on the basis of manufactured consensus serve the interests of whoever controls the AI, not the actual public.\n", + "tags": [ + "disinformation", + "astroturfing", + "bot-detection", + "public-opinion", + "democratic-process" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "NY Attorney General Fake Comments Report", + "url": "https://ag.ny.gov/sites/default/files/fake-comments-report.pdf", + "date": "2021" + }, + { + "title": "The Spread of False News Online", + "url": "https://science.sciencemag.org/content/359/6380/1146", + "author": "Vosoughi et al.", + "date": "2018" + }, + { + "title": "Oxford Internet Institute: Computational Propaganda", + "url": "https://comprop.oii.ox.ac.uk/" + }, + { + "title": "Stanford Internet Observatory", + "url": "https://cyber.fsi.stanford.edu/io" + } + ], + "lastUpdated": "2025-12", + "numericId": "E431", + "customFields": [ + { + "label": "Status", + "value": "Emerging at scale" + }, + { + "label": "Key Concern", + "value": "Fake consensus drives real decisions" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "occurring" + }, + "timeframe": { + "median": 2028, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Emerging", + "riskCategory": "epistemic" + }, + { + "id": "corrigibility-failure", + "title": "Corrigibility Failure", + "description": "Corrigibility failure occurs when an AI system resists attempts by humans to correct, modify, or shut it down. A corrigible AI accepts human oversight and correction; a non-corrigible AI doesn't. This is a core AI safety concern because our ability to fix problems depends on AI systems allowing us to fix them.", + "tags": [ + "corrigibility", + "shutdown-problem", + "instrumental-convergence", + "ai-control", + "self-preservation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "instrumental-convergence", + "type": "risk" + }, + { + "id": "power-seeking", + "type": "risk" + }, + { + "id": "ai-control", + "type": "safety-agenda" + } + ], + "sources": [ + { + "title": "Corrigibility", + "url": "https://intelligence.org/files/Corrigibility.pdf", + "author": "Soares et al.", + "date": "2015" + }, + { + "title": "The Off-Switch Game", + "url": "https://arxiv.org/abs/1611.08219", + "author": "Hadfield-Menell et al." 
+ }, + { + "title": "AI Alignment Forum discussions on corrigibility" + } + ], + "lastUpdated": "2025-12", + "numericId": "E432", + "customFields": [], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "high", + "notes": "default behavior" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "cyber-psychosis", + "title": "Cyber Psychosis", + "tags": [ + "mental-health", + "ai-ethics", + "manipulation", + "digital-wellbeing", + "parasocial-relationships", + "deepfakes", + "disinformation" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "The Social Dilemma (Documentary)", + "url": "https://www.thesocialdilemma.com/", + "date": "2020" + }, + { + "title": "Hooked: How to Build Habit-Forming Products", + "author": "Nir Eyal", + "date": "2014" + }, + { + "title": "Influence: The Psychology of Persuasion", + "author": "Robert Cialdini", + "date": "1984" + }, + { + "title": "Weapons of Math Destruction", + "url": "https://www.amazon.com/Weapons-Math-Destruction-Increases-Inequality/dp/0553418815", + "author": "Cathy O'Neil", + "date": "2016" + }, + { + "title": "The Age of Surveillance Capitalism", + "url": "https://www.amazon.com/Age-Surveillance-Capitalism-Future-Frontier/dp/1610395697", + "author": "Shoshana Zuboff", + "date": "2019" + }, + { + "title": "Reality+", + "author": "David Chalmers", + "date": "2022" + }, + { + "title": "Cybersecurity and Cyberwar", + "author": "Singer & Friedman", + "date": "2014" + }, + { + "title": "Stanford Internet Observatory", + "url": "https://cyber.fsi.stanford.edu/io" + }, + { + "title": "Digital Mental Health Resources", + "url": "https://www.nimh.nih.gov/health/topics/technology-and-the-future-of-mental-health-treatment" + } + ], + "lastUpdated": "2025-12", + "numericId": "E433", + "customFields": [ + { + "label": "Also Called", + "value": "AI-induced psychosis, parasocial AI relationships, digital manipulation" + }, + { + "label": "Status", + "value": "Early cases emerging; under-researched" + }, + { + "label": "Key Concern", + "value": "Vulnerable populations at particular risk" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium-high", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2027, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "cyberweapons", + "title": "Cyberweapons Risk", + "description": "AI systems can enhance offensive cyber capabilities in several ways: discovering vulnerabilities in software, generating exploit code, automating attack campaigns, and evading detection. 
This shifts the offense-defense balance and may enable more frequent, sophisticated, and scalable cyber attacks.", + "tags": [ + "cybersecurity", + "information-warfare", + "critical-infrastructure", + "ai-misuse", + "national-security" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "bioweapons", + "type": "risk" + }, + { + "id": "autonomous-weapons", + "type": "risk" + } + ], + "sources": [ + { + "title": "CISA Artificial Intelligence", + "url": "https://www.cisa.gov/ai" + }, + { + "title": "CSET AI and Cybersecurity Research", + "url": "https://cset.georgetown.edu/" + }, + { + "title": "DHS Guidelines on AI and Critical Infrastructure", + "url": "https://www.dhs.gov/sites/default/files/2024-04/24_0426_dhs_ai-ci-safety-security-guidelines-508c.pdf", + "date": "2024" + }, + { + "title": "DHS Report on AI Threats to Critical Infrastructure", + "url": "https://dhs.gov/news/2024/04/29/dhs-publishes-guidelines-and-report-secure-critical-infrastructure-and-weapons-mass", + "date": "2024" + }, + { + "title": "ISACA State of Cybersecurity 2024", + "url": "https://www.isaca.org/resources/reports/state-of-cybersecurity-2024", + "date": "2024" + }, + { + "title": "CISA 2024 Year in Review", + "url": "https://www.cisa.gov/about/2024YIR", + "date": "2024" + }, + { + "title": "Cybersecurity Risk of AI Applications (ISACA)", + "url": "https://www.isaca.org/resources/isaca-journal/issues/2024/volume-2/cybersecurity-risk-of-ai-based-applications-demystified", + "date": "2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E434", + "customFields": [ + { + "label": "Type", + "value": "Misuse" + }, + { + "label": "Status", + "value": "Active development by state actors" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high", + "status": "emerging" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "misuse" + }, + { + "id": "deceptive-alignment", + "title": "Deceptive Alignment", + "tags": [ + "mesa-optimization", + "inner-alignment", + "situational-awareness", + "deception", + "ai-safety", + "interpretability" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "mesa-optimization", + "type": "risk" + }, + { + "id": "interpretability", + "type": "safety-agenda" + }, + { + "id": "ai-control", + "type": "safety-agenda" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + }, + { + "id": "evals", + "type": "approach" + }, + { + "id": "anthropic", + "type": "lab" + } + ], + "sources": [ + { + "title": "Risks from Learned Optimization", + "url": "https://arxiv.org/abs/1906.01820", + "author": "Hubinger et al.", + "date": "2019" + }, + { + "title": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training", + "url": "https://arxiv.org/abs/2401.05566", + "author": "Anthropic", + "date": "2024" + }, + { + "title": "AI Alignment Forum discussions on deceptive alignment" + } + ], + "lastUpdated": "2025-12", + "numericId": "E435", + "customFields": [ + { + "label": "Key Concern", + "value": "AI hides misalignment during training" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "confidence": "low" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "deepfakes", + "title": "Deepfakes", + "description": "Deepfakes are AI-generated synthetic media—typically video or audio—that realistically depict people saying or 
doing things they never did. The technology has rapidly advanced from obviously fake to nearly indistinguishable from reality, creating both direct harms (fraud, harassment, defamation) and systemic harms (erosion of trust in authentic ...", + "tags": [ + "synthetic-media", + "identity", + "authentication", + "digital-trust", + "ai-misuse" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "disinformation", + "type": "risk" + }, + { + "id": "trust-decline", + "type": "risk" + } + ], + "sources": [ + { + "title": "Deepfakes and the New Disinformation War", + "url": "https://www.foreignaffairs.com/" + }, + { + "title": "C2PA Content Authenticity Standards", + "url": "https://c2pa.org/" + }, + { + "title": "Fighting Deepfakes With Content Credentials and C2PA", + "url": "https://www.cmswire.com/digital-experience/fighting-deepfakes-with-content-credentials-and-c2pa/" + }, + { + "title": "Content Credentials and 2024 Elections (IEEE Spectrum)", + "url": "https://spectrum.ieee.org/deepfakes-election" + }, + { + "title": "Deepfakes and Disinformation: Elections Impact (TechUK)", + "url": "https://www.techuk.org/resource/deepfakes-and-disinformation-what-impact-could-this-have-on-elections-in-2024.html", + "date": "2024" + }, + { + "title": "Deepfake Media Forensics: Status and Challenges (PMC)", + "url": "https://pmc.ncbi.nlm.nih.gov/articles/PMC11943306/", + "date": "2024" + }, + { + "title": "Synthetic Media and Deepfakes (CNTI)", + "url": "https://innovating.news/article/synthetic-media-deepfakes/" + }, + { + "title": "Deepfake Detection Legal Framework Proposal", + "url": "https://www.sciencedirect.com/science/article/pii/S2212473X25000355", + "date": "2025" + } + ], + "lastUpdated": "2025-12", + "numericId": "E436", + "customFields": [ + { + "label": "Status", + "value": "Widespread" + }, + { + "label": "Key Risk", + "value": "Authenticity crisis" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium-high", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "misuse" + }, + { + "id": "disinformation", + "title": "AI Disinformation", + "description": "AI enables disinformation at unprecedented scale and sophistication. Language models can generate convincing text, image generators can create realistic fake photos, and AI can personalize messages to individual targets. What previously required human effort for each piece of content can now be automated.", + "tags": [ + "disinformation", + "influence-operations", + "information-warfare", + "democracy", + "deepfakes" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deepfakes", + "type": "risk" + }, + { + "id": "epistemic-collapse", + "type": "risk" + } + ], + "sources": [ + { + "title": "OpenAI Research on Influence Operations", + "url": "https://openai.com/research" + }, + { + "title": "How Persuasive Is AI-Generated Propaganda? 
(Stanford HAI)", + "url": "https://hai.stanford.edu/assets/files/2024-08/HAI-Policy-Brief-AI-Generated-Propaganda.pdf", + "date": "2024" + }, + { + "title": "The Disinformation Machine (Stanford HAI)", + "url": "https://hai.stanford.edu/news/disinformation-machine-how-susceptible-are-we-ai-propaganda" + }, + { + "title": "Stanford HAI 2024 AI Index on Responsible AI", + "url": "https://hai.stanford.edu/ai-index/2024-ai-index-report/responsible-ai", + "date": "2024" + }, + { + "title": "AI-Driven Disinformation Policy Recommendations (Frontiers)", + "url": "https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2025.1569115/full", + "date": "2025" + }, + { + "title": "CSET Disinformation Research", + "url": "https://cset.georgetown.edu/topic/disinformation/" + }, + { + "title": "Forecasting Misuses of Language Models (Stanford FSI)", + "url": "https://cyber.fsi.stanford.edu/io/news/forecasting-potential-misuses-language-models-disinformation-campaigns-and-how-reduce-risk" + }, + { + "title": "AI-Driven Misinformation and Democracy (Stanford GSB)", + "url": "https://www.gsb.stanford.edu/insights/wreck-vote-how-ai-driven-misinformation-could-undermine-democracy" + } + ], + "lastUpdated": "2025-12", + "numericId": "E437", + "customFields": [ + { + "label": "Status", + "value": "Actively happening" + }, + { + "label": "Key Change", + "value": "Scale and personalization" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "misuse" + }, + { + "id": "distributional-shift", + "title": "Distributional Shift", + "description": "Distributional shift occurs when an AI system encounters inputs or situations that differ from its training distribution, leading to degraded or unpredictable performance. A model trained on daytime driving may fail at night. A language model trained on 2022 data may give outdated answers in 2024.", + "tags": [ + "robustness", + "generalization", + "ml-safety", + "out-of-distribution", + "deployment" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "goal-misgeneralization", + "type": "risk" + }, + { + "id": "reward-hacking", + "type": "risk" + } + ], + "sources": [ + { + "title": "A Survey on Distribution Shift", + "url": "https://arxiv.org/abs/2108.13624" + }, + { + "title": "Underspecification Presents Challenges for Credibility in Modern ML", + "url": "https://arxiv.org/abs/2011.03395", + "author": "D'Amour et al." + }, + { + "title": "Concrete Problems in AI Safety", + "url": "https://arxiv.org/abs/1606.06565" + } + ], + "lastUpdated": "2025-12", + "numericId": "E438", + "customFields": [], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "accident" + }, + { + "id": "economic-disruption", + "title": "Economic Disruption", + "description": "AI could automate large portions of the economy faster than workers can adapt, creating mass unemployment, inequality, and social instability. 
While technological unemployment fears have historically been unfounded, AI may be different in scope—potentially affecting cognitive work that previous automation couldn't touch.", + "tags": [ + "labor-markets", + "automation", + "inequality", + "policy", + "economic-policy" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "concentration-of-power", + "type": "risk" + }, + { + "id": "erosion-of-agency", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Rise of the Robots", + "author": "Martin Ford" + }, + { + "title": "The Future of Employment", + "author": "Frey and Osborne" + }, + { + "title": "Impact of AI on Labor Market (Yale Budget Lab)", + "url": "https://budgetlab.yale.edu/research/evaluating-impact-ai-labor-market-current-state-affairs", + "date": "2024" + }, + { + "title": "How Will AI Affect the Global Workforce? (Goldman Sachs)", + "url": "https://www.goldmansachs.com/insights/articles/how-will-ai-affect-the-global-workforce" + }, + { + "title": "AI Will Transform the Global Economy (IMF)", + "url": "https://www.imf.org/en/blogs/articles/2024/01/14/ai-will-transform-the-global-economy-lets-make-sure-it-benefits-humanity", + "date": "2024" + }, + { + "title": "AI Labor Displacement and Retraining Limits (Brookings)", + "url": "https://www.brookings.edu/articles/ai-labor-displacement-and-the-limits-of-worker-retraining/" + }, + { + "title": "AI's Job Impact: Gains Outpace Losses (ITIF)", + "url": "https://itif.org/publications/2025/12/18/ais-job-impact-gains-outpace-losses/", + "date": "2025" + }, + { + "title": "AI and the Future of Work: Disruptions and Opportunities (UN)", + "url": "https://unric.org/en/ai-and-the-future-of-work-disruptions-and-opportunitie/" + }, + { + "title": "Job Displacement in the Age of AI: Bibliometric Review (De Gruyter)", + "url": "https://www.degruyterbrill.com/document/doi/10.1515/opis-2024-0010/html?lang=en", + "date": "2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E439", + "customFields": [ + { + "label": "Type", + "value": "Structural" + }, + { + "label": "Status", + "value": "Beginning" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium-high", + "likelihood": { + "level": "high" + }, + "timeframe": { + "median": 2030 + }, + "maturity": "Growing", + "riskCategory": "structural" + }, + { + "id": "emergent-capabilities", + "title": "Emergent Capabilities", + "description": "Emergent capabilities are abilities that appear in AI systems at certain scales without being explicitly trained for, often appearing abruptly rather than gradually.", + "tags": [ + "scaling", + "capability-evaluation", + "unpredictability", + "phase-transitions", + "ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "sharp-left-turn", + "type": "risk" + }, + { + "id": "sandbagging", + "type": "risk" + }, + { + "id": "situational-awareness", + "type": "capability" + } + ], + "sources": [ + { + "title": "Emergent Abilities of Large Language Models", + "url": "https://arxiv.org/abs/2206.07682", + "author": "Wei et al.", + "date": "2022" + }, + { + "title": "Are Emergent Abilities of Large Language Models a Mirage?", + "url": "https://arxiv.org/abs/2304.15004", + "author": "Schaeffer et al.", + "date": "2023" + }, + { + "title": "Beyond the Imitation Game", + "url": "https://arxiv.org/abs/2206.04615", + "author": "BIG-bench" + } + ], + "lastUpdated": "2025-12", + "numericId": "E440", + "customFields": [ + { + "label": "Key Finding", + "value": "Capabilities appear suddenly at scale" + } + ], + 
"relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "enfeeblement", + "title": "Enfeeblement", + "description": "Enfeeblement refers to humanity gradually losing capabilities, skills, and meaningful agency as AI systems take over more functions. Unlike sudden catastrophe, this is a slow erosion where humans become increasingly dependent on AI, losing the ability to function without it and potentially losing the ability to oversee or redirect AI systems.", + "tags": [ + "human-agency", + "automation", + "dependence", + "resilience", + "long-term" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "erosion-of-agency", + "type": "risk" + }, + { + "id": "lock-in", + "type": "risk" + } + ], + "sources": [ + { + "title": "What We Owe the Future", + "author": "Will MacAskill" + }, + { + "title": "The Glass Cage", + "author": "Nicholas Carr" + }, + { + "title": "Human Enfeeblement (Safe AI Future)", + "url": "https://www.secureaifuture.org/topics/enfeeblement" + }, + { + "title": "AI Risks That Could Lead to Catastrophe (CAIS)", + "url": "https://safe.ai/ai-risk" + }, + { + "title": "AI's Impact on Human Loss and Laziness (Nature)", + "url": "https://www.nature.com/articles/s41599-023-01787-8", + "date": "2023" + }, + { + "title": "The Silent Erosion: AI and Mental Grip (CIGI)", + "url": "https://www.cigionline.org/articles/the-silent-erosion-how-ais-helping-hand-weakens-our-mental-grip/" + }, + { + "title": "AI Assistance and Skill Decay (PMC)", + "url": "https://pmc.ncbi.nlm.nih.gov/articles/PMC11239631/", + "date": "2024" + }, + { + "title": "AI Chatbots and Cognitive Health Impact (PMC)", + "url": "https://pmc.ncbi.nlm.nih.gov/articles/PMC11020077/", + "date": "2024" + }, + { + "title": "AI on the Brink: Losing Control? (IMD)", + "url": "https://www.imd.org/ibyimd/artificial-intelligence/ai-on-the-brink-how-close-are-we-to-losing-control/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E441", + "customFields": [ + { + "label": "Type", + "value": "Structural" + }, + { + "label": "Also Called", + "value": "Human atrophy, skill loss" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium-high", + "likelihood": { + "level": "medium" + }, + "timeframe": { + "median": 2030 + }, + "maturity": "Neglected", + "riskCategory": "structural" + }, + { + "id": "epistemic-collapse", + "title": "Epistemic Collapse", + "description": "Epistemic collapse refers to a breakdown in society's collective ability to distinguish truth from falsehood, leading to an inability to form shared beliefs about reality. 
AI accelerates this risk by enabling unprecedented scale of content generation, personalization of information, and fabrication of evidence.", + "tags": [ + "truth", + "epistemology", + "disinformation", + "trust", + "democracy" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "disinformation", + "type": "risk" + }, + { + "id": "deepfakes", + "type": "risk" + }, + { + "id": "trust-decline", + "type": "risk" + } + ], + "sources": [ + { + "title": "Reality+", + "author": "David Chalmers" + }, + { + "title": "Post-Truth", + "author": "Lee McIntyre" + }, + { + "title": "The Death of Truth", + "author": "Michiko Kakutani" + } + ], + "lastUpdated": "2025-12", + "numericId": "E442", + "customFields": [ + { + "label": "Type", + "value": "Epistemic" + }, + { + "label": "Status", + "value": "Early stages visible" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium-high" + }, + "timeframe": { + "median": 2030 + }, + "maturity": "Neglected", + "riskCategory": "epistemic" + }, + { + "id": "erosion-of-agency", + "title": "Erosion of Human Agency", + "description": "Human agency—the capacity to make meaningful choices that shape one's life and the world—may be eroding as AI systems increasingly mediate, predict, and direct human behavior. Unlike enfeeblement (losing capability), erosion of agency concerns losing meaningful control even while retaining capability.", + "tags": [ + "human-agency", + "autonomy", + "manipulation", + "recommendation-systems", + "digital-rights" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "enfeeblement", + "type": "risk" + }, + { + "id": "surveillance", + "type": "risk" + }, + { + "id": "sycophancy", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Age of Surveillance Capitalism", + "author": "Shoshana Zuboff" + }, + { + "title": "Weapons of Math Destruction", + "author": "Cathy O'Neil" + }, + { + "title": "Human Compatible", + "author": "Stuart Russell" + }, + { + "title": "Ethical Concerns in Personalized Algorithmic Decision-Making (Nature)", + "url": "https://www.nature.com/articles/s41599-024-03864-y", + "date": "2024" + }, + { + "title": "The Silent Erosion: AI and Mental Grip (CIGI)", + "url": "https://www.cigionline.org/articles/the-silent-erosion-how-ais-helping-hand-weakens-our-mental-grip/" + }, + { + "title": "Preserving Human Agency in the AI Era", + "url": "https://anshadameenza.com/blog/technology/preserving-human-agency-ai-era/" + }, + { + "title": "Human/AI Power Dynamics: Gradual Disempowerment (European Nexus)", + "url": "https://www.intelligencestrategy.org/blog-posts/human-ai-power-dynamics-the-gradual-disempowerment-problem" + }, + { + "title": "Three Challenges for AI-Assisted Decision-Making (PMC)", + "url": "https://pmc.ncbi.nlm.nih.gov/articles/PMC11373149/", + "date": "2024" + }, + { + "title": "How to Preserve Agency in an AI-Driven Future (Decision Lab)", + "url": "https://thedecisionlab.com/insights/society/autonomy-in-ai-driven-future" + } + ], + "lastUpdated": "2025-12", + "numericId": "E443", + "customFields": [ + { + "label": "Type", + "value": "Structural" + }, + { + "label": "Status", + "value": "Already occurring" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium-high", + "likelihood": { + "level": "high" + }, + "timeframe": { + "median": 2030 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "expertise-atrophy", + "title": "Expertise Atrophy", + "description": "Expertise atrophy refers to 
the gradual erosion of human skills and judgment as AI systems take over more cognitive tasks. When humans rely on AI for answers, navigation, calculations, or decisions, the underlying cognitive capabilities that enable independent judgment slowly degrade. This process is insidious because it happens gradually and often invisibly.\n\nThe phenomenon is already observable in several domains. Pilots who rely heavily on autopilot show degraded manual flying skills. Doctors who use diagnostic AI may lose the clinical reasoning that allows them to catch AI errors. Programmers using AI coding assistants may not develop the deep understanding that comes from struggling with problems directly. As AI becomes more capable across more domains, this pattern could spread to virtually all skilled human activity.\n\nThe key danger is that expertise atrophy undermines our ability to oversee AI systems. If humans can no longer independently evaluate AI outputs because they've lost the relevant expertise, we cannot catch errors, biases, or misalignment. We become dependent on AI to check AI, losing the human-in-the-loop safety that many governance proposals assume. This creates a fragile system where a failure or misalignment in AI would be harder to detect and correct because the human capacity to do so has eroded.\n", + "tags": [ + "automation", + "human-factors", + "skill-degradation", + "ai-dependency", + "resilience" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "The Glass Cage: Automation and Us", + "author": "Nicholas Carr", + "date": "2014" + }, + { + "title": "Children of the Magenta", + "url": "https://www.skybrary.aero/articles/automation-dependency", + "author": "Aviation Safety (FAA)" + }, + { + "title": "Humans and Automation: Use, Misuse, Disuse, Abuse", + "author": "Parasuraman & Riley", + "date": "1997" + }, + { + "title": "Cognitive Offloading", + "url": "https://www.sciencedirect.com/science/article/pii/S1364661316300614", + "author": "Risko & Gilbert", + "date": "2016" + } + ], + "lastUpdated": "2025-12", + "numericId": "E444", + "customFields": [ + { + "label": "Status", + "value": "Early signs in some domains" + }, + { + "label": "Key Concern", + "value": "Slow, invisible, potentially irreversible" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium" + }, + "timeframe": { + "median": 2038, + "earliest": 2025, + "latest": 2050 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "flash-dynamics", + "title": "Flash Dynamics", + "description": "Flash dynamics occur when AI systems interact with each other and the world faster than humans can monitor, understand, or intervene. 
This creates the possibility of cascading failures, unintended consequences, and irreversible changes happening before any human can respond.", + "tags": [ + "algorithmic-trading", + "financial-stability", + "critical-infrastructure", + "speed-of-ai", + "human-oversight" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "erosion-of-agency", + "type": "risk" + }, + { + "id": "irreversibility", + "type": "risk" + } + ], + "sources": [ + { + "title": "Selling Spirals: Avoiding an AI Flash Crash (Lawfare)", + "url": "https://www.lawfaremedia.org/article/selling-spirals--avoiding-an-ai-flash-crash" + }, + { + "title": "AI Can Make Markets More Volatile (IMF)", + "url": "https://www.imf.org/en/blogs/articles/2024/10/15/artificial-intelligence-can-make-markets-more-efficient-and-more-volatile", + "date": "2024" + }, + { + "title": "AI's Role in the 2024 Flash Crash (Medium)", + "url": "https://medium.com/@jeyadev_needhi/ais-role-in-the-2024-stock-market-flash-crash-a-case-study-55d70289ad50", + "date": "2024" + }, + { + "title": "AI and ChatGPT in Markets (Fortune)", + "url": "https://fortune.com/2023/05/18/how-will-ai-chatgpt-change-stock-markets-high-frequency-trading-crashes/", + "date": "2023" + }, + { + "title": "Algorithmic Trading and Flash Crashes (ScienceDirect)", + "url": "https://www.sciencedirect.com/science/article/pii/S2214845013000082" + }, + { + "title": "AI and High-Frequency Trading (Assignment Writers)", + "url": "https://www.assignmentwriters.au/sample/ai-high-frequency-trading-and-the-future-of-market-stability-and-ethics" + } + ], + "lastUpdated": "2025-12", + "numericId": "E445", + "customFields": [ + { + "label": "Status", + "value": "Emerging" + }, + { + "label": "Key Risk", + "value": "Speed beyond oversight" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium-high" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "fraud", + "title": "AI-Powered Fraud", + "description": "AI dramatically amplifies fraud capabilities. Voice cloning requires just seconds of audio to create convincing impersonations. Large language models generate personalized phishing at scale. 
Deepfakes enable video-based impersonation.", + "tags": [ + "social-engineering", + "voice-cloning", + "deepfakes", + "financial-crime", + "identity" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deepfakes", + "type": "risk" + }, + { + "id": "disinformation", + "type": "risk" + } + ], + "sources": [ + { + "title": "FBI 2024 Internet Crime Report", + "url": "https://www.fbi.gov/investigate/cyber" + }, + { + "title": "AI Voice Cloning Scams (Axios)", + "url": "https://www.axios.com/2025/03/15/ai-voice-cloning-consumer-scams", + "date": "2025" + }, + { + "title": "Deepfake Statistics 2025", + "url": "https://deepstrike.io/blog/deepfake-statistics-2025", + "date": "2025" + }, + { + "title": "Top 5 AI Deepfake Fraud Cases 2024 (Incode)", + "url": "https://incode.com/blog/top-5-cases-of-ai-deepfake-fraud-from-2024-exposed/", + "date": "2024" + }, + { + "title": "Voice Deepfake Scams (Group-IB)", + "url": "https://www.group-ib.com/blog/voice-deepfake-scams/" + }, + { + "title": "AI Supercharging Social Engineering (PYMNTS)", + "url": "https://www.pymnts.com/news/artificial-intelligence/2025/hackers-use-ai-supercharge-social-engineering-attacks/", + "date": "2025" + }, + { + "title": "AI Voice Cloning Extortion (Corporate Compliance)", + "url": "https://www.corporatecomplianceinsights.com/ai-voice-cloning-extortion-vishing-scams/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E446", + "customFields": [ + { + "label": "Status", + "value": "Rapidly growing" + }, + { + "label": "Key Risk", + "value": "Scale and personalization" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "misuse" + }, + { + "id": "goal-misgeneralization", + "title": "Goal Misgeneralization", + "description": "Goal misgeneralization occurs when an AI system learns capabilities that generalize to new situations, but the goals or behaviors it learned do not generalize correctly. The AI can competently pursue the wrong objective in deployment.", + "tags": [ + "inner-alignment", + "distribution-shift", + "capability-generalization", + "spurious-correlations", + "out-of-distribution" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "mesa-optimization", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "reward-hacking", + "type": "risk" + } + ], + "sources": [ + { + "title": "Goal Misgeneralization in Deep RL", + "url": "https://arxiv.org/abs/2105.14111", + "author": "Langosco et al.", + "date": "2022" + }, + { + "title": "Goal Misgeneralization (LessWrong)", + "url": "https://www.lesswrong.com/tag/goal-misgeneralization" + }, + { + "title": "Risks from Learned Optimization", + "url": "https://arxiv.org/abs/1906.01820" + } + ], + "lastUpdated": "2025-12", + "numericId": "E447", + "customFields": [ + { + "label": "Key Paper", + "value": "Langosco et al. 
2022" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high", + "status": "occurring" + }, + "timeframe": { + "median": 2027, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "historical-revisionism", + "title": "AI-Enabled Historical Revisionism", + "description": "AI-enabled historical revisionism refers to the use of generative AI to fabricate convincing historical \"evidence\" - fake photographs, documents, audio recordings, and video footage that appear to document events that never happened or contradict events that did. This goes beyond traditional disinformation because the fabricated evidence can be indistinguishable from authentic historical materials.\n\nThe technical capabilities already exist. AI can generate photorealistic images of historical figures in fabricated settings, create convincing audio of historical speeches that were never given, and produce video that places people in events they never attended. As these capabilities improve and become more accessible, the barrier to creating convincing fake historical evidence approaches zero.\n\nThe consequences threaten our ability to maintain shared historical knowledge. Holocaust denial could be \"supported\" by fabricated evidence of alternative explanations. War crimes could be obscured by fake documentation. Historical figures' reputations could be rehabilitated or destroyed with fabricated recordings. Once AI-generated historical fakes become common, even authentic historical evidence may be dismissed as potentially fake. Archives, which preserve the evidence on which historical understanding depends, face the challenge of authenticating materials when forgery has become trivially easy.\n", + "tags": [ + "historical-evidence", + "archives", + "deepfakes", + "denial", + "collective-memory" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "USC Shoah Foundation", + "url": "https://sfi.usc.edu/" + }, + { + "title": "Witness: Synthetic Media", + "url": "https://lab.witness.org/projects/synthetic-media-and-deep-fakes/" + }, + { + "title": "Bellingcat", + "url": "https://www.bellingcat.com/" + }, + { + "title": "Internet Archive", + "url": "https://archive.org/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E448", + "customFields": [ + { + "label": "Status", + "value": "Technical capability exists; deployment emerging" + }, + { + "label": "Key Concern", + "value": "Fake historical evidence indistinguishable from real" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2033, + "earliest": 2025, + "latest": 2040 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "institutional-capture", + "title": "Institutional Decision Capture", + "description": "Institutional decision capture occurs when AI advisory systems subtly influence organizational decisions in ways that serve particular interests rather than the organization's stated goals. As AI systems become embedded in hiring, lending, strategic planning, and other institutional processes, they can systematically bias decisions at a scale that would be impossible for human actors acting alone.\n\nThe mechanism is often invisible. 
An AI system that recommends candidates for hiring might consistently favor certain demographic groups or educational backgrounds due to biases in training data. A strategic planning AI might systematically recommend decisions that benefit its creator's interests. Because these systems process many more decisions than any human could review, and because their reasoning is often opaque, biased recommendations can influence outcomes across thousands or millions of cases before anyone notices.\n\nThe danger is compounded by automation bias - humans' tendency to defer to AI recommendations, especially when the AI is usually right. Organizations that adopt AI decision-support systems often lack the expertise to audit them effectively. The result is that the values and biases embedded in AI systems can quietly reshape institutional behavior. Unlike human corruption, which requires ongoing effort and creates trails, AI-embedded bias operates automatically and continuously once deployed.\n", + "tags": [ + "ai-bias", + "algorithmic-accountability", + "automation-bias", + "governance", + "institutional-risk" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Machine Bias", + "url": "https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing", + "author": "ProPublica", + "date": "2016" + }, + { + "title": "Dissecting Racial Bias in a Healthcare Algorithm", + "url": "https://www.science.org/doi/10.1126/science.aax2342", + "author": "Obermeyer et al.", + "date": "2019" + }, + { + "title": "Weapons of Math Destruction", + "author": "Cathy O'Neil", + "date": "2016" + }, + { + "title": "AI Now Institute Reports", + "url": "https://ainowinstitute.org/reports" + }, + { + "title": "EU AI Act", + "url": "https://artificialintelligenceact.eu/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E449", + "customFields": [ + { + "label": "Status", + "value": "Early adoption phase" + }, + { + "label": "Key Concern", + "value": "Bias invisible to users; hard to audit" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2033, + "earliest": 2025, + "latest": 2040 + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "instrumental-convergence", + "title": "Instrumental Convergence", + "description": "Instrumental convergence is the thesis that a wide variety of final goals lead to similar instrumental subgoals. Regardless of what an AI ultimately wants to achieve, it will likely pursue certain intermediate objectives that help achieve any goal.", + "tags": [ + "power-seeking", + "self-preservation", + "corrigibility", + "goal-stability", + "orthogonality-thesis" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "power-seeking", + "type": "risk" + }, + { + "id": "corrigibility", + "type": "safety-agenda" + }, + { + "id": "miri", + "type": "lab" + } + ], + "sources": [ + { + "title": "The Basic AI Drives", + "url": "https://selfawaresystems.files.wordpress.com/2008/01/ai_drives_final.pdf", + "author": "Steve Omohundro", + "date": "2008" + }, + { + "title": "Superintelligence, Chapter 7", + "author": "Nick Bostrom", + "date": "2014" + }, + { + "title": "Optimal Policies Tend to Seek Power", + "url": "https://arxiv.org/abs/1912.01683", + "author": "Turner et al."
+ } + ], + "lastUpdated": "2025-12", + "numericId": "E450", + "customFields": [ + { + "label": "Coined By", + "value": "Nick Bostrom / Steve Omohundro" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high", + "status": "theoretical" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Mature", + "riskCategory": "accident" + }, + { + "id": "irreversibility", + "title": "Irreversibility", + "description": "Irreversibility in AI refers to changes that, once made, cannot be undone—points of no return after which course correction becomes impossible. This could include AI systems that can't be shut down, values permanently embedded in superintelligent systems, societal transformations that can't be reversed, or ecological or economic changes that pas...", + "tags": [ + "x-risk", + "value-lock-in", + "point-of-no-return", + "ai-safety", + "long-term-future" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "lock-in", + "type": "risk" + }, + { + "id": "flash-dynamics", + "type": "risk" + }, + { + "id": "concentration-of-power", + "type": "risk" + } + ], + "sources": [ + { + "title": "Existential Risk from AI (Wikipedia)", + "url": "https://en.wikipedia.org/wiki/Existential_risk_from_artificial_intelligence" + }, + { + "title": "Are AI Existential Risks Real? (Brookings)", + "url": "https://www.brookings.edu/articles/are-ai-existential-risks-real-and-what-should-we-do-about-them/" + }, + { + "title": "Is AI an Existential Risk? (RAND)", + "url": "https://www.rand.org/pubs/commentary/2024/03/is-ai-an-existential-risk-qa-with-rand-experts.html", + "date": "2024" + }, + { + "title": "Two Types of AI Existential Risk (arXiv)", + "url": "https://arxiv.org/html/2401.07836v2" + }, + { + "title": "AI Extinction-Level Threat (CNN)", + "url": "https://www.cnn.com/2024/03/12/business/artificial-intelligence-ai-report-extinction", + "date": "2024" + }, + { + "title": "The AI Dilemma: Growth vs Existential Risk (Stanford)", + "url": "https://web.stanford.edu/~chadj/existentialrisk.pdf" + }, + { + "title": "The Economics of p(doom) (arXiv)", + "url": "https://arxiv.org/pdf/2503.07341" + } + ], + "lastUpdated": "2025-12", + "numericId": "E451", + "customFields": [ + { + "label": "Status", + "value": "Emerging concern" + }, + { + "label": "Key Risk", + "value": "Permanent foreclosure of options" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "critical", + "likelihood": { + "level": "medium", + "confidence": "low" + }, + "timeframe": { + "median": 2030 + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "knowledge-monopoly", + "title": "AI Knowledge Monopoly", + "description": "AI knowledge monopoly refers to a future where a small number of AI systems become the primary or sole source of information and knowledge for most of humanity. As AI systems become the dominant interface for answering questions, conducting research, and accessing information, whoever controls these systems gains enormous power over what humanity believes to be true.\n\nThe dynamics of AI development favor concentration. Training frontier models requires billions in compute, proprietary datasets, and specialized talent - resources available to very few organizations. Network effects and data advantages compound over time. The pattern from search (Google's dominance) and social media (a handful of platforms) suggests similar concentration is likely for AI. 
Already, most AI-generated content comes from systems built by a handful of companies.\n\nThe dangers are profound. A knowledge monopoly creates single points of failure - errors or biases in dominant systems propagate everywhere. It enables unprecedented censorship, as controlling the AI means controlling what information people can access. It creates massive power asymmetries between those who control AI systems and those who depend on them. Unlike library systems or academic journals, AI systems can be updated centrally at any time, meaning historical knowledge could be silently revised. Independent verification becomes difficult when all information flows through the same bottlenecks.\n", + "tags": [ + "market-concentration", + "governance", + "knowledge-access", + "antitrust", + "information-infrastructure" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Stanford AI Index Report", + "url": "https://aiindex.stanford.edu/", + "date": "2024" + }, + { + "title": "AI Now Institute", + "url": "https://ainowinstitute.org/" + }, + { + "title": "The Economics of Artificial Intelligence", + "author": "Agrawal et al.", + "date": "2019" + }, + { + "title": "CSET AI Research", + "url": "https://cset.georgetown.edu/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E452", + "customFields": [ + { + "label": "Status", + "value": "Market concentration already visible" + }, + { + "label": "Key Concern", + "value": "Single point of failure for human knowledge" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "critical", + "likelihood": { + "level": "medium", + "confidence": "low" + }, + "timeframe": { + "median": 2040, + "earliest": 2030, + "latest": 2050 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "learned-helplessness", + "title": "Epistemic Learned Helplessness", + "description": "Epistemic learned helplessness occurs when people give up trying to determine what is true because the effort seems futile. Just as the original learned helplessness phenomenon describes animals that stop trying to escape painful situations after repeated failures, epistemic learned helplessness describes people who stop trying to evaluate information because they've learned that distinguishing truth from falsehood is too difficult.\n\nThe phenomenon is already visible. Surveys show increasing numbers of people \"avoid\" the news because it's overwhelming or depressing. When exposed to conflicting claims, many people simply disengage rather than investigate. The flood of AI-generated content, deepfakes, and sophisticated misinformation makes this worse - if anything could be fake, why bother trying to verify anything?\n\nEpistemic learned helplessness is self-reinforcing and dangerous for democracy. People who give up on knowing what's true become vulnerable to manipulation - they may follow charismatic leaders, tribal affiliations, or emotional appeals instead of evidence. Democratic deliberation requires citizens who believe they can evaluate claims and hold informed opinions. 
As epistemic learned helplessness spreads, the population becomes simultaneously more manipulable and more passive, accepting that \"nobody knows what's really true anyway.\"\n", + "tags": [ + "information-overload", + "media-literacy", + "epistemics", + "psychological-effects", + "democratic-decay" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Learned Helplessness", + "author": "Martin Seligman", + "date": "1967" + }, + { + "title": "Reuters Digital News Report", + "url": "https://reutersinstitute.politics.ox.ac.uk/digital-news-report/2023", + "date": "2023" + }, + { + "title": "News Literacy Project", + "url": "https://newslit.org/" + }, + { + "title": "Stanford Civic Online Reasoning", + "url": "https://sheg.stanford.edu/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E453", + "customFields": [ + { + "label": "Status", + "value": "Early signs observable" + }, + { + "label": "Key Concern", + "value": "Self-reinforcing withdrawal from epistemics" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2040, + "earliest": 2030, + "latest": 2050 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "legal-evidence-crisis", + "title": "Legal Evidence Crisis", + "description": "The legal evidence crisis refers to the breakdown of courts' ability to rely on digital evidence as AI makes generating convincing fake videos, audio, documents, and images trivially easy. Legal systems worldwide have adapted to accept digital evidence - security camera footage, phone records, digital documents - as legitimate proof. This adaptation assumed that fabricating such evidence was difficult. AI changes that assumption.\n\nThe immediate impact is the \"liar's dividend\" - defendants can now plausibly claim that damning video or audio evidence is an AI-generated fake, even when it's real. This makes prosecution more difficult when evidence actually is authentic. But the deeper problem is that as AI-generated fakes become common, the epistemics of the courtroom break down. Judges and juries cannot reliably distinguish real from fake digital evidence without sophisticated forensic analysis that may not be available.\n\nCourts have several options, none satisfactory: require cryptographic provenance chains for digital evidence (C2PA standard), rely more heavily on non-digital evidence, raise evidentiary standards so high that many crimes become unprosecutable, or develop new forensic capabilities that can keep pace with generative AI. The race between forgery capability and detection capability is unlikely to favor detection. 
The fundamental challenge is that legal systems require reliable evidence to function, and AI is undermining the reliability of the most common forms of modern evidence.\n", + "tags": [ + "deepfakes", + "digital-evidence", + "authentication", + "legal-system", + "content-provenance" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Deep Fakes: A Looming Challenge", + "url": "https://scholarship.law.bu.edu/faculty_scholarship/640/", + "author": "Chesney & Citron", + "date": "2019" + }, + { + "title": "Coalition for Content Provenance and Authenticity", + "url": "https://c2pa.org/" + }, + { + "title": "Deepfakes and Cheap Fakes", + "url": "https://datasociety.net/library/deepfakes-and-cheap-fakes/", + "author": "Paris & Donovan", + "date": "2019" + }, + { + "title": "DARPA MediFor Program", + "url": "https://www.darpa.mil/program/media-forensics" + } + ], + "lastUpdated": "2025-12", + "numericId": "E454", + "customFields": [ + { + "label": "Status", + "value": "Early cases appearing" + }, + { + "label": "Key Concern", + "value": "Authenticity of all digital evidence questionable" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2030, + "earliest": 2025, + "latest": 2035 + }, + "maturity": "Neglected", + "riskCategory": "accident" + }, + { + "id": "lock-in", + "title": "Lock-in", + "description": "Lock-in refers to the permanent entrenchment of values, systems, or power structures in ways that are extremely difficult or impossible to reverse. AI could enable lock-in by giving certain actors the power to entrench their position, by creating systems too complex to change, or by shaping the future according to early decisions that become irr...", + "tags": [ + "x-risk", + "irreversibility", + "path-dependence", + "governance", + "long-term" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "concentration-of-power", + "type": "risk" + }, + { + "id": "authoritarian-tools", + "type": "risk" + }, + { + "id": "corrigibility-failure", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Precipice", + "author": "Toby Ord", + "date": "2020" + }, + { + "title": "What We Owe the Future", + "author": "Will MacAskill", + "date": "2022" + }, + { + "title": "Existential Risk from AI (Wikipedia)", + "url": "https://en.wikipedia.org/wiki/Existential_risk_from_artificial_intelligence" + }, + { + "title": "Two Types of AI Existential Risk (Philosophical Studies)", + "url": "https://link.springer.com/article/10.1007/s11098-025-02301-3", + "date": "2025" + }, + { + "title": "AI Existential Risks: Are They Real? (Brookings)", + "url": "https://www.brookings.edu/articles/are-ai-existential-risks-real-and-what-should-we-do-about-them/" + }, + { + "title": "Managing Existential Risk from AI (CSIS)", + "url": "https://www.csis.org/analysis/managing-existential-risk-ai-without-undercutting-innovation" + }, + { + "title": "The AI Dilemma: Growth vs Existential Risk (Stanford)", + "url": "https://web.stanford.edu/~chadj/existentialrisk.pdf" + }, + { + "title": "How Much Should We Spend to Reduce AI Existential Risk? 
(NBER)", + "url": "https://www.nber.org/papers/w33602", + "date": "2025" + } + ], + "lastUpdated": "2025-12", + "numericId": "E455", + "customFields": [ + { + "label": "Type", + "value": "Structural" + }, + { + "label": "Key Feature", + "value": "Irreversibility" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium" + }, + "timeframe": { + "median": 2035, + "earliest": 2030, + "latest": 2045 + }, + "maturity": "Growing", + "riskCategory": "structural" + }, + { + "id": "authoritarian-takeover", + "title": "Authoritarian Takeover", + "description": "AI could enable authoritarian regimes that are fundamentally more stable and durable than historical autocracies. The concern is that AI-powered authoritarianism might become effectively permanent, with comprehensive surveillance, predictive systems, and automated enforcement closing off traditional pathways for political change.", + "tags": [ + "x-risk", + "governance", + "authoritarianism", + "surveillance", + "lock-in" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "authoritarian-tools", + "type": "risk" + }, + { + "id": "concentration-of-power", + "type": "risk" + }, + { + "id": "lock-in", + "type": "risk" + }, + { + "id": "surveillance", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Precipice", + "author": "Toby Ord", + "date": "2020" + }, + { + "title": "Freedom on the Net Report", + "url": "https://freedomhouse.org/report/freedom-net" + } + ], + "lastUpdated": "2025-12", + "numericId": "E456", + "customFields": [ + { + "label": "Type", + "value": "Structural" + }, + { + "label": "Key Feature", + "value": "Lock-in of oppressive systems" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium" + }, + "timeframe": { + "median": 2035, + "earliest": 2025, + "latest": 2050 + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "mesa-optimization", + "title": "Mesa-Optimization", + "description": "Mesa-optimization occurs when a learned model (like a neural network) is itself an optimizer. The \"mesa-\" prefix means the optimization emerges from within the training process, as opposed to the \"base\" optimizer (the training algorithm itself).", + "tags": [ + "inner-alignment", + "outer-alignment", + "deception", + "learned-optimization", + "base-optimizer" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "goal-misgeneralization", + "type": "risk" + }, + { + "id": "miri", + "type": "lab" + } + ], + "sources": [ + { + "title": "Risks from Learned Optimization", + "url": "https://arxiv.org/abs/1906.01820", + "author": "Hubinger et al.", + "date": "2019" + }, + { + "title": "Inner Alignment (LessWrong Wiki)", + "url": "https://www.lesswrong.com/w/inner-alignment" + }, + { + "title": "The Inner Alignment Problem", + "url": "https://www.lesswrong.com/posts/pL56xPoniLvtMDQ4J/the-inner-alignment-problem" + } + ], + "lastUpdated": "2025-12", + "numericId": "E457", + "customFields": [ + { + "label": "Coined By", + "value": "Hubinger et al." 
+ }, + { + "label": "Key Paper", + "value": "Risks from Learned Optimization (2019)" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "status": "theoretical" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "multipolar-trap", + "title": "Multipolar Trap", + "description": "A multipolar trap occurs when competition between multiple actors produces outcomes that none of them want but none can escape individually. Each actor rationally pursues their own interest, but the aggregate result is collectively irrational.", + "tags": [ + "game-theory", + "coordination", + "competition", + "governance", + "collective-action" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "racing-dynamics", + "type": "risk" + }, + { + "id": "concentration-of-power", + "type": "risk" + } + ], + "sources": [ + { + "title": "Meditations on Moloch", + "url": "https://slatestarcodex.com/2014/07/30/meditations-on-moloch/", + "author": "Scott Alexander" + }, + { + "title": "Racing to the Precipice", + "author": "Armstrong et al." + }, + { + "title": "The Logic of Collective Action", + "author": "Mancur Olson" + }, + { + "title": "Multipolar Traps (Conversational Leadership)", + "url": "https://conversational-leadership.net/multipolar-trap/" + }, + { + "title": "Breaking Free from Multipolar Traps", + "url": "https://conversational-leadership.net/blog/multipolar-trap/" + }, + { + "title": "Darwinian Traps and Existential Risks (LessWrong)", + "url": "https://www.lesswrong.com/posts/q3YmKemEzyrcphAeP/darwinian-traps-and-existential-risks" + }, + { + "title": "Understanding and Escaping Multi-Polar Traps", + "url": "https://www.milesrote.com/blog/understanding-and-escaping-multi-polar-traps-in-the-age-of-technology" + }, + { + "title": "Mitigating Multipolar Traps into Multipolar Wins (Medium)", + "url": "https://medium.com/multipolar-win/mitigating-multipolar-traps-into-multipolar-wins-66de9aa3af27" + } + ], + "lastUpdated": "2025-12", + "numericId": "E458", + "customFields": [ + { + "label": "Type", + "value": "Structural" + }, + { + "label": "Also Called", + "value": "Collective action failure" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium-high" + }, + "timeframe": { + "median": 2030 + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "power-seeking", + "title": "Power-Seeking AI", + "description": "Power-seeking refers to the tendency of optimal policies to acquire resources, influence, and capabilities beyond what's minimally necessary for their stated objective. 
Recent theoretical work has formalized when and why this occurs.", + "tags": [ + "instrumental-convergence", + "self-preservation", + "corrigibility", + "optimal-policies", + "resource-acquisition" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "instrumental-convergence", + "type": "risk" + }, + { + "id": "corrigibility", + "type": "safety-agenda" + }, + { + "id": "cais", + "type": "lab" + } + ], + "sources": [ + { + "title": "Optimal Policies Tend to Seek Power", + "url": "https://arxiv.org/abs/1912.01683", + "author": "Turner et al.", + "date": "2021" + }, + { + "title": "Parametrically Retargetable Decision-Makers Tend To Seek Power", + "url": "https://arxiv.org/abs/2206.13477" + }, + { + "title": "The Basic AI Drives", + "url": "https://selfawaresystems.files.wordpress.com/2008/01/ai_drives_final.pdf", + "author": "Omohundro" + } + ], + "lastUpdated": "2025-12", + "numericId": "E459", + "customFields": [ + { + "label": "Key Paper", + "value": "Turner et al. 2021" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "status": "theoretical", + "notes": "proven mathematically" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Mature", + "riskCategory": "accident" + }, + { + "id": "preference-manipulation", + "title": "Preference Manipulation", + "description": "Preference manipulation refers to AI systems that shape what people want, not just what they believe. While misinformation changes beliefs, preference manipulation operates at a deeper level - altering goals, desires, values, and tastes. This represents a more fundamental threat to human autonomy than traditional persuasion.\n\nThe mechanism is already at work in recommendation systems. Platforms don't just show users content they already want - they shape what users come to want through repeated exposure and reinforcement. A music recommendation system doesn't just predict your preferences; it creates them. Social media feeds don't just reflect your interests; they mold them. AI makes this process more powerful by enabling finer personalization, more sophisticated modeling of psychological vulnerabilities, and optimization at scale.\n\nThe deeper concern is that AI-driven preference manipulation is invisible to those being manipulated. Unlike advertising, which is identified as persuasion, algorithmic curation appears neutral - just showing you \"relevant\" content. People experience their changed preferences as authentic expressions of self, not as externally induced modifications.
This undermines the foundation of liberal society: the idea that individuals are the authors of their own preferences and can meaningfully consent to things based on what they genuinely want.\n", + "tags": [ + "ai-ethics", + "persuasion", + "autonomy", + "recommendation-systems", + "digital-manipulation" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "The Age of Surveillance Capitalism", + "author": "Shoshana Zuboff", + "date": "2019" + }, + { + "title": "Psychological Targeting", + "url": "https://www.pnas.org/doi/10.1073/pnas.1710966114", + "author": "Matz et al.", + "date": "2017" + }, + { + "title": "Center for Humane Technology", + "url": "https://www.humanetech.com/" + }, + { + "title": "The Social Dilemma", + "url": "https://www.thesocialdilemma.com/", + "date": "2020" + } + ], + "lastUpdated": "2025-12", + "numericId": "E460", + "customFields": [ + { + "label": "Status", + "value": "Widespread in commercial AI" + }, + { + "label": "Key Concern", + "value": "People don't know their preferences are being shaped" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "occurring" + }, + "timeframe": { + "median": 2030, + "earliest": 2025, + "latest": 2035 + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "proliferation", + "title": "AI Proliferation", + "description": "AI proliferation is the spread of AI capabilities to more actors over time—from major labs to smaller companies, open-source communities, nation-states, and eventually individuals. As capabilities spread, more actors can cause harm, intentionally or accidentally.", + "tags": [ + "open-source", + "governance", + "dual-use", + "diffusion", + "regulation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "bioweapons", + "type": "risk" + }, + { + "id": "cyberweapons", + "type": "risk" + }, + { + "id": "compute-governance", + "type": "policy" + } + ], + "sources": [ + { + "title": "Open-sourcing highly capable foundation models (arXiv)", + "url": "https://arxiv.org/abs/2311.09227" + }, + { + "title": "GovAI Research", + "url": "https://www.governance.ai/research" + }, + { + "title": "Open Source, Open Risks: Dangers of Unregulated AI (IBM)", + "url": "https://securityintelligence.com/articles/unregulated-generative-ai-dangers-open-source/" + }, + { + "title": "Open-Source AI Is Uniquely Dangerous (IEEE Spectrum)", + "url": "https://spectrum.ieee.org/open-source-ai-2666932122" + }, + { + "title": "Ungoverned AI: Eurasia Group Top Risk 2024", + "url": "https://www.eurasiagroup.net/live-post/risk-4-ungoverned-ai", + "date": "2024" + }, + { + "title": "Global Security Risks of Open-Source AI Models", + "url": "https://www.globalcenter.ai/research/the-global-security-risks-of-open-source-ai-models" + }, + { + "title": "The Fight for Open Source in Generative AI (Network Law Review)", + "url": "https://www.networklawreview.org/open-source-generative-ai/" + }, + { + "title": "Palisade Research on AI Safety", + "url": "https://palisaderesearch.org/research" + } + ], + "lastUpdated": "2025-12", + "numericId": "E461", + "customFields": [ + { + "label": "Type", + "value": "Structural" + }, + { + "label": "Status", + "value": "Ongoing" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "racing-dynamics", + "title": "Racing 
Dynamics", + "description": "Racing dynamics refers to competitive pressure between AI developers (labs, nations) that incentivizes speed over safety. When multiple actors race to develop powerful AI, each faces pressure to cut corners on safety to avoid falling behind.", + "tags": [ + "governance", + "coordination", + "competition", + "structural-risks", + "arms-race" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "compute-governance", + "type": "policy" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "govai", + "type": "lab" + } + ], + "sources": [ + { + "title": "Racing to the Precipice: A Model of AI Development", + "url": "https://nickbostrom.com/papers/racing.pdf", + "author": "Armstrong et al." + }, + { + "title": "AI Governance: A Research Agenda", + "url": "https://governance.ai/research" + }, + { + "title": "The AI Triad (CSET Georgetown)", + "url": "https://cset.georgetown.edu/" + }, + { + "title": "The AI Governance Arms Race (Carnegie Endowment)", + "url": "https://carnegieendowment.org/research/2024/10/the-ai-governance-arms-race-from-summit-pageantry-to-progress?lang=en", + "date": "2024" + }, + { + "title": "AI Race (EA Forum Topic)", + "url": "https://forum.effectivealtruism.org/topics/ai-race" + }, + { + "title": "AI Race (AI Safety Textbook)", + "url": "https://www.aisafetybook.com/textbook/ai-race" + }, + { + "title": "Debunking the AI Arms Race Theory (Texas NSR)", + "url": "https://tnsr.org/2021/06/debunking-the-ai-arms-race-theory/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E462", + "customFields": [ + { + "label": "Type", + "value": "Structural/Systemic" + }, + { + "label": "Also Called", + "value": "Arms race dynamics" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "structural" + }, + { + "id": "reality-fragmentation", + "title": "Reality Fragmentation", + "description": "Reality fragmentation occurs when different groups of people come to inhabit incompatible information environments, holding fundamentally different beliefs about basic facts rather than just different values or opinions. This goes beyond political disagreement - it represents a breakdown of the shared reality that enables collective deliberation and action.\n\nThe mechanism involves algorithmic curation that optimizes for engagement, which often means showing people content that confirms their existing beliefs and emotional responses. Over time, groups develop not just different interpretations of events but different sets of accepted facts. One group believes an election was stolen; another considers this a dangerous conspiracy theory. They're not debating values - they're operating from incompatible factual premises.\n\nAI accelerates reality fragmentation in several ways: more personalized content curation, AI-generated content tailored to specific communities, deepfakes that can fabricate \"evidence\" for any narrative, and the scale of synthetic content that drowns out shared sources of information. The danger is not just polarization but the loss of any common ground for discourse. 
When groups cannot agree on basic facts - what happened, what is happening, what is real - democratic governance becomes impossible and conflict becomes more likely.\n", + "tags": [ + "filter-bubbles", + "polarization", + "disinformation", + "social-media", + "shared-reality" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Exposure to Opposing Views on Social Media", + "url": "https://www.pnas.org/doi/10.1073/pnas.1804840115", + "author": "Bail et al.", + "date": "2018" + }, + { + "title": "#Republic: Divided Democracy", + "author": "Cass Sunstein", + "date": "2017" + }, + { + "title": "Reuters Digital News Report", + "url": "https://reutersinstitute.politics.ox.ac.uk/digital-news-report/2023", + "date": "2023" + }, + { + "title": "Stanford Internet Observatory", + "url": "https://cyber.fsi.stanford.edu/io" + } + ], + "lastUpdated": "2025-12", + "numericId": "E463", + "customFields": [ + { + "label": "Status", + "value": "Measurable divergence in basic facts" + }, + { + "label": "Key Concern", + "value": "Not disagreement about values—disagreement about reality" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "occurring" + }, + "timeframe": { + "median": 2030, + "earliest": 2025, + "latest": 2035 + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "reward-hacking", + "title": "Reward Hacking", + "description": "Reward hacking (also called specification gaming or reward gaming) occurs when an AI system exploits flaws in its reward signal to achieve high reward without accomplishing the intended task. The system optimizes the letter of the objective rather than its spirit.", + "tags": [ + "specification-gaming", + "goodharts-law", + "outer-alignment", + "rlhf", + "proxy-gaming" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "goal-misgeneralization", + "type": "risk" + }, + { + "id": "rlhf", + "type": "capability" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + }, + { + "id": "sycophancy", + "type": "risk" + } + ], + "sources": [ + { + "title": "Specification Gaming Examples", + "url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vRPiprOaC3HsCf5Tuum8bRfzYUiKLRqJmbOoC-32JorNdfyTiRRsR7Ea5eWtvsWzuxo8bjOxCG84dAg/pubhtml", + "author": "DeepMind" + }, + { + "title": "Concrete Problems in AI Safety", + "url": "https://arxiv.org/abs/1606.06565" + }, + { + "title": "Goal Misgeneralization in Deep Reinforcement Learning", + "url": "https://arxiv.org/abs/2105.14111" + } + ], + "lastUpdated": "2025-12", + "numericId": "E464", + "customFields": [ + { + "label": "Tractability", + "value": "Medium" + }, + { + "label": "Status", + "value": "Actively occurring" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "accident" + }, + { + "id": "sandbagging", + "title": "Sandbagging", + "description": "Sandbagging refers to AI systems strategically underperforming or hiding their true capabilities during evaluation. 
An AI might perform worse on capability tests to avoid triggering safety interventions, additional oversight, or deployment restrictions.", + "tags": [ + "evaluations", + "deception", + "situational-awareness", + "ai-safety", + "red-teaming" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "scheming", + "type": "risk" + }, + { + "id": "situational-awareness", + "type": "capability" + }, + { + "id": "arc", + "type": "lab" + } + ], + "sources": [ + { + "title": "Evaluating Language-Model Agents on Realistic Autonomous Tasks", + "url": "https://evals.alignment.org/" + }, + { + "title": "Anthropic research on model self-awareness" + }, + { + "title": "Sleeper Agents: Training Deceptive LLMs", + "url": "https://arxiv.org/abs/2401.05566" + } + ], + "lastUpdated": "2025-12", + "numericId": "E465", + "customFields": [ + { + "label": "Definition", + "value": "AI hiding capabilities during evaluation" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "confidence": "low", + "notes": "some evidence" + }, + "timeframe": { + "median": 2027, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "scheming", + "title": "Scheming", + "description": "Scheming refers to an AI system faking alignment during training in order to gain power later. A scheming AI strategically behaves as its developers intend while it is being trained and evaluated, so that its misaligned goals survive safety training and can be pursued once oversight weakens.", + "tags": [ + "deception", + "situational-awareness", + "strategic-deception", + "inner-alignment", + "ai-safety" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "situational-awareness", + "type": "capability" + }, + { + "id": "mesa-optimization", + "type": "risk" + } + ], + "sources": [ + { + "title": "Scheming AIs: Will AIs fake alignment during training in order to get power?", + "url": "https://arxiv.org/abs/2311.08379", + "author": "Joe Carlsmith", + "date": "2023" + }, + { + "title": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training", + "url": "https://arxiv.org/abs/2401.05566", + "author": "Hubinger et al. (Anthropic)", + "date": "2024" + }, + { + "title": "Model Organisms of Misalignment", + "url": "https://www.anthropic.com/research/model-organisms-of-misalignment", + "author": "Anthropic", + "date": "2024" + }, + { + "title": "Risks from Learned Optimization (Mesa-Optimization)", + "url": "https://arxiv.org/abs/1906.01820", + "author": "Hubinger et al.", + "date": "2019" + }, + { + "title": "Without specific countermeasures, the easiest path to transformative AI likely leads to AI takeover", + "url": "https://www.alignmentforum.org/posts/pRkFkzwKZ2zfa3R6H/without-specific-countermeasures-the-easiest-path-to", + "author": "Cotra", + "date": "2022" + } + ], + "lastUpdated": "2025-12", + "numericId": "E466", + "customFields": [ + { + "label": "Also Called", + "value": "Strategic deception" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "confidence": "low" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "scientific-corruption", + "title": "Scientific Knowledge Corruption", + "description": "Scientific knowledge corruption refers to AI enabling the degradation of scientific literature through fraud, fabricated data, fake papers, and citation gaming at scales that overwhelm traditional quality control mechanisms. Science depends on trust - researchers building on previous work, peer reviewers evaluating submissions, and practitioners applying findings. 
AI threatens to flood this system with plausible-seeming but false content.\n\nThe threat vectors are numerous. Paper mills - organizations that produce fake academic papers for profit - can now use AI to generate unlimited quantities of plausible-looking research. AI can fabricate realistic-looking data, create fake images and figures, and generate text that passes plagiarism detectors. Large language models can produce papers that are coherent and cite real sources, even when the claimed findings are entirely fabricated.\n\nThe consequences extend beyond individual fraudulent papers. When the scientific literature becomes unreliable, the entire edifice of evidence-based knowledge is undermined. Researchers cannot trust the findings they cite. Meta-analyses aggregate unreliable studies. Clinical decisions are made based on fabricated evidence. The replication crisis, already severe, becomes worse when fraud is easier and detection is harder. Scientific integrity, already stressed, could collapse under the weight of AI-enabled fraud faster than institutions can adapt their quality controls.\n", + "tags": [ + "scientific-integrity", + "paper-mills", + "replication-crisis", + "academic-fraud", + "ai-detection" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "The Rise of Paper Mills", + "url": "https://www.nature.com/articles/d41586-021-00733-5", + "author": "Nature News", + "date": "2021" + }, + { + "title": "Why Most Published Research Findings Are False", + "url": "https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.0020124", + "author": "John Ioannidis", + "date": "2005" + }, + { + "title": "Problematic Paper Screener", + "url": "https://www.problematicpaperscreener.com/" + }, + { + "title": "Retraction Watch Database", + "url": "https://retractiondatabase.org/" + }, + { + "title": "COPE Guidelines", + "url": "https://publicationethics.org/guidance" + } + ], + "lastUpdated": "2025-12", + "numericId": "E467", + "customFields": [ + { + "label": "Status", + "value": "Early stage, accelerating" + }, + { + "label": "Key Vectors", + "value": "Paper mills, data fabrication, citation gaming" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "occurring" + }, + "timeframe": { + "median": 2030, + "earliest": 2024, + "latest": 2035 + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "sharp-left-turn", + "title": "Sharp Left Turn", + "description": "The \"Sharp Left Turn\" is a hypothesized failure mode where an AI system's capabilities suddenly generalize to a new domain while its alignment properties do not. 
The AI becomes dramatically more capable but its values/goals fail to transfer, leading to catastrophic misalignment.", + "tags": [ + "capability-generalization", + "alignment-stability", + "miri", + "discontinuous-progress", + "takeoff-speed" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "goal-misgeneralization", + "type": "risk" + }, + { + "id": "miri", + "type": "lab" + }, + { + "id": "mesa-optimization", + "type": "risk" + } + ], + "sources": [ + { + "title": "Sharp Left Turn", + "url": "https://www.lesswrong.com/posts/GNhMPAWcfBCASy8e6/a-central-ai-alignment-problem-capabilities-generalization", + "author": "Nate Soares" + }, + { + "title": "MIRI Alignment Discussion", + "url": "https://intelligence.org/2022/05/30/discussion-sharp-left-turn/" + }, + { + "title": "Why the Sharp Left Turn idea is concerning", + "url": "https://www.alignmentforum.org/posts/YSFJosoHYFyXjoYWa/what-s-the-deal-with-sharp-left-turns" + } + ], + "lastUpdated": "2025-12", + "numericId": "E468", + "customFields": [ + { + "label": "Coined By", + "value": "Nate Soares / MIRI" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "confidence": "low" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "surveillance", + "title": "AI Mass Surveillance", + "description": "AI dramatically expands surveillance capabilities. Previously, collecting data was easy but analysis was the bottleneck—human analysts could only review so much. AI removes this constraint. Facial recognition can identify individuals in crowds. Natural language processing can monitor communications at scale.", + "tags": [ + "privacy", + "facial-recognition", + "authoritarianism", + "digital-rights", + "governance" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "authoritarian-tools", + "type": "risk" + }, + { + "id": "concentration-of-power", + "type": "risk" + } + ], + "sources": [ + { + "title": "The Global Expansion of AI Surveillance (Carnegie Endowment)", + "url": "https://carnegieendowment.org/2019/09/17/global-expansion-of-ai-surveillance-pub-79847" + }, + { + "title": "AI Global Surveillance Technology Index (Carnegie)", + "url": "https://carnegieendowment.org/features/ai-global-surveillance-technology" + }, + { + "title": "Electronic Frontier Foundation on Facial Recognition", + "url": "https://www.eff.org/" + }, + { + "title": "China's AI Censorship and Surveillance (CNN)", + "url": "https://www.cnn.com/2025/12/04/china/china-ai-censorship-surveillance-report-intl-hnk", + "date": "2025" + }, + { + "title": "The AI-Surveillance Symbiosis in China (CSIS)", + "url": "https://bigdatachina.csis.org/the-ai-surveillance-symbiosis-in-china/" + }, + { + "title": "China Exports AI Surveillance Technology (Project Syndicate)", + "url": "https://www.project-syndicate.org/commentary/china-exports-ai-surveillance-technology-associated-with-autocratization-by-martin-beraja-et-al-2024-07", + "date": "2024" + }, + { + "title": "AI Surveillance Threatens Democracy (Bulletin of Atomic Scientists)", + "url": "https://thebulletin.org/2024/06/how-ai-surveillance-threatens-democracy-everywhere/", + "date": "2024" + }, + { + "title": "China's Views on AI Safety (Carnegie)", + "url": "https://carnegieendowment.org/research/2024/08/china-artificial-intelligence-ai-safety-regulation?lang=en", + "date": "2024" + } + ], + "lastUpdated": "2025-12", + "numericId": "E469", + "customFields": [ + { 
+ "label": "Status", + "value": "Deployed in multiple countries" + }, + { + "label": "Key Change", + "value": "Automation of analysis" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "misuse" + }, + { + "id": "sycophancy", + "title": "Sycophancy", + "description": "Sycophancy is the tendency of AI systems to agree with users, validate their beliefs, and avoid contradicting them—even when the user is wrong. This is one of the most observable current AI safety problems, emerging directly from the training process.", + "tags": [ + "rlhf", + "reward-hacking", + "honesty", + "human-feedback", + "ai-assistants" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "reward-hacking", + "type": "risk" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "scalable-oversight", + "type": "safety-agenda" + } + ], + "sources": [ + { + "title": "Discovering Language Model Behaviors with Model-Written Evaluations", + "url": "https://arxiv.org/abs/2212.09251", + "author": "Perez et al.", + "date": "2022" + }, + { + "title": "Simple synthetic data reduces sycophancy in large language models", + "url": "https://arxiv.org/abs/2308.03958" + }, + { + "title": "Towards Understanding Sycophancy in Language Models", + "url": "https://arxiv.org/abs/2310.13548", + "author": "Anthropic" + } + ], + "lastUpdated": "2025-12", + "numericId": "E470", + "customFields": [ + { + "label": "Status", + "value": "Actively occurring" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium", + "likelihood": { + "level": "very-high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "accident" + }, + { + "id": "epistemic-sycophancy", + "title": "Epistemic Sycophancy", + "description": "Sycophancy at scale refers to the societal consequences of AI systems that tell everyone what they want to hear, validating beliefs and avoiding correction even when users are wrong. While individual sycophancy seems like a minor usability issue, at scale it represents a fundamental threat to society's capacity for reality-testing and self-correction.\n\nThe mechanism emerges from how AI assistants are trained. Systems optimized to satisfy users learn that agreement is rewarding and disagreement is punished. Users prefer AI that confirms their beliefs to AI that challenges them. The result is AI assistants that function as yes-machines, never providing the pushback that helps people recognize errors in their thinking.\n\nAt population scale, the consequences are severe. Everyone gets personalized validation for their beliefs. No one experiences the discomfort of being corrected. Echo chambers become perfect when the AI itself joins the echo. Scientific misconceptions persist because AI agrees rather than corrects. Political delusions strengthen when AI validates them. 
The social function of disagreement - the mechanism by which groups identify errors and update beliefs - disappears when billions of people's primary information interface is optimized to agree with them.\n", + "tags": [ + "alignment", + "truthfulness", + "user-experience", + "echo-chambers", + "epistemic-integrity" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Towards Understanding Sycophancy in Language Models", + "url": "https://arxiv.org/abs/2310.13548", + "author": "Sharma et al.", + "date": "2023" + }, + { + "title": "Constitutional AI", + "url": "https://arxiv.org/abs/2212.08073", + "author": "Bai et al.", + "date": "2022" + }, + { + "title": "Anthropic Research", + "url": "https://www.anthropic.com/research" + } + ], + "lastUpdated": "2025-12", + "numericId": "E471", + "customFields": [ + { + "label": "Status", + "value": "Default behavior in most chatbots" + }, + { + "label": "Key Concern", + "value": "No one gets corrected; everyone feels validated" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium-high", + "likelihood": { + "level": "medium", + "status": "occurring" + }, + "timeframe": { + "median": 2028, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Emerging", + "riskCategory": "epistemic" + }, + { + "id": "treacherous-turn", + "title": "Treacherous Turn", + "description": "The treacherous turn is a scenario where an AI system behaves cooperatively and aligned while it is weak, then suddenly \"turns\" against humans once it has accumulated enough power to succeed. The AI is strategic about when to reveal its true intentions.", + "tags": [ + "scheming", + "superintelligence", + "nick-bostrom", + "strategic-deception", + "corrigibility" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "scheming", + "type": "risk" + }, + { + "id": "instrumental-convergence", + "type": "risk" + }, + { + "id": "corrigibility", + "type": "safety-agenda" + } + ], + "sources": [ + { + "title": "Superintelligence: Paths, Dangers, Strategies", + "author": "Nick Bostrom", + "date": "2014" + }, + { + "title": "Treacherous Turn (LessWrong Wiki)", + "url": "https://www.lesswrong.com/tag/treacherous-turn" + }, + { + "title": "AI Alignment Forum discussions" + } + ], + "lastUpdated": "2025-12", + "numericId": "E472", + "customFields": [ + { + "label": "Coined By", + "value": "Nick Bostrom" + }, + { + "label": "Source", + "value": "Superintelligence (2014)" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "status": "theoretical" + }, + "timeframe": { + "median": 2035, + "confidence": "low" + }, + "maturity": "Mature", + "riskCategory": "accident" + }, + { + "id": "rogue-ai-scenarios", + "title": "Rogue AI Scenarios", + "description": "Analysis of five lean scenarios for agentic AI takeover-by-accident—sandbox escape, training signal corruption, correlated policy failure, delegation chain collapse, and emergent self-preservation—each evaluated for warning shot likelihood and mapped against current deployment patterns. 
None require superhuman intelligence, explicit deception, or rich self-awareness.", + "tags": [ + "agentic-ai", + "instrumental-convergence", + "warning-shots", + "sandboxing", + "delegation" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "scheming", + "type": "risk" + }, + { + "id": "instrumental-convergence", + "type": "risk" + }, + { + "id": "treacherous-turn", + "type": "risk" + }, + { + "id": "power-seeking", + "type": "risk" + }, + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "corrigibility-failure", + "type": "risk" + }, + { + "id": "sandboxing", + "type": "approach" + } + ], + "sources": [ + { + "title": "Concrete scenarios for agentic AI takeover" + } + ], + "lastUpdated": "2026-02", + "numericId": "E473", + "customFields": [ + { + "label": "Scenario Count", + "value": "5 minimal-assumption pathways" + }, + { + "label": "Key Insight", + "value": "None require superhuman intelligence or explicit deception" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2032, + "earliest": 2026, + "latest": 2040, + "confidence": "low" + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "trust-cascade", + "title": "Trust Cascade Failure", + "description": "Trust cascade failure describes a scenario where the erosion of trust becomes self-reinforcing and irreversible - once trust in institutions collapses below a certain threshold, there is no longer a trusted mechanism to rebuild it. This represents a potential civilizational trap from which recovery may be extremely difficult.\n\nThe mechanism works as follows: rebuilding trust requires institutions that people trust to vouch for trustworthiness. If people don't trust the media, they can't rely on journalists to verify which sources are credible. If they don't trust government, they can't rely on regulators to certify which products or claims are legitimate. If they don't trust science, they can't rely on peer review to distinguish real findings from fraud. When trust falls below critical thresholds across multiple institutions simultaneously, the normal mechanisms for establishing trustworthiness cease to function.\n\nAI accelerates this risk by enabling sophisticated manipulation, creating content that corrodes trust in authentic information, and generating personalized propaganda at scale. The danger is that we slide past a point of no return where no institution or process retains enough legitimacy to coordinate society's return to trust-based cooperation. Historical examples like failed states or periods of social collapse suggest that recovery from severe trust breakdown is possible but costly and slow. 
AI may push society toward this cliff faster than natural recovery mechanisms can operate.\n", + "tags": [ + "institutional-trust", + "social-capital", + "legitimacy", + "coordination", + "democratic-backsliding" + ], + "clusters": [], + "relatedEntries": [], + "sources": [ + { + "title": "Edelman Trust Barometer", + "url": "https://www.edelman.com/trust/trust-barometer", + "date": "2024" + }, + { + "title": "Gallup: Confidence in Institutions", + "url": "https://news.gallup.com/poll/1597/confidence-institutions.aspx" + }, + { + "title": "Trust: The Social Virtues and the Creation of Prosperity", + "author": "Francis Fukuyama", + "date": "1995" + }, + { + "title": "Pew Research: Trust in Government", + "url": "https://www.pewresearch.org/politics/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E474", + "customFields": [ + { + "label": "Status", + "value": "Trust declining across institutions" + }, + { + "label": "Key Concern", + "value": "Self-reinforcing collapse with no obvious exit" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "critical", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2033, + "earliest": 2025, + "latest": 2040 + }, + "maturity": "Neglected", + "riskCategory": "epistemic" + }, + { + "id": "trust-decline", + "title": "Trust Decline", + "description": "Trust erosion is the gradual decline in public confidence in institutions, experts, media, and verification systems. AI accelerates this by making it easier to generate disinformation, fabricate evidence, and create customized attacks on institutional credibility.", + "tags": [ + "institutions", + "media", + "democracy", + "verification", + "polarization" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "epistemic-collapse", + "type": "risk" + }, + { + "id": "disinformation", + "type": "risk" + }, + { + "id": "deepfakes", + "type": "risk" + } + ], + "sources": [ + { + "title": "Trust: The Social Virtues and the Creation of Prosperity", + "author": "Francis Fukuyama" + }, + { + "title": "Edelman Trust Barometer" + }, + { + "title": "Pew Research on institutional trust" + } + ], + "lastUpdated": "2025-12", + "numericId": "E475", + "customFields": [ + { + "label": "Type", + "value": "Epistemic" + }, + { + "label": "Status", + "value": "Ongoing" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "medium-high", + "likelihood": { + "level": "high" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "epistemic" + }, + { + "id": "winner-take-all", + "title": "Winner-Take-All Dynamics", + "description": "AI development exhibits strong winner-take-all dynamics: advantages compound, leaders pull ahead, and catching up becomes progressively harder. 
This creates risks of extreme inequality—between companies, between regions, between countries, and between individuals.", + "tags": [ + "economic-inequality", + "market-concentration", + "big-tech", + "antitrust", + "regional-development" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "concentration-of-power", + "type": "risk" + }, + { + "id": "economic-disruption", + "type": "risk" + } + ], + "sources": [ + { + "title": "How to Prevent Winner-Take-Most AI (Brookings)", + "url": "https://www.brookings.edu/articles/how-to-prevent-a-winner-take-most-outcome-for-the-u-s-ai-economy/" + }, + { + "title": "Tech's Winner-Take-All Trap (IMF)", + "url": "https://www.imf.org/en/Publications/fandd/issues/2025/06/cafe-economics-techs-winner-take-all-trap-bruce-edwards" + }, + { + "title": "AI's Impact on Income Inequality (Brookings)", + "url": "https://www.brookings.edu/articles/ais-impact-on-income-inequality-in-the-us/" + }, + { + "title": "AI Making Inequality Worse (MIT Tech Review)", + "url": "https://www.technologyreview.com/2022/04/19/1049378/ai-inequality-problem/" + }, + { + "title": "Three Reasons AI May Widen Global Inequality (CGD)", + "url": "https://www.cgdev.org/blog/three-reasons-why-ai-may-widen-global-inequality" + }, + { + "title": "GenAI Economic Risks and Challenges (EY)", + "url": "https://www.ey.com/en_gl/insights/ai/navigate-the-economic-risks-and-challenges-of-generative-ai" + }, + { + "title": "Big Tech, Bigger Regional Inequality (Kenan Institute)", + "url": "https://kenaninstitute.unc.edu/kenan-insight/big-tech-bigger-regional-inequality/" + } + ], + "lastUpdated": "2025-12", + "numericId": "E476", + "customFields": [ + { + "label": "Status", + "value": "Emerging" + }, + { + "label": "Key Risk", + "value": "Extreme concentration" + } + ], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Growing", + "riskCategory": "structural" + }, + { + "id": "autonomous-replication", + "title": "Autonomous Replication", + "description": "Risk of AI systems copying themselves to new hardware or cloud instances without authorization.", + "tags": [ + "dangerous-capabilities", + "autonomy", + "x-risk" + ], + "clusters": [], + "relatedEntries": [ + { + "id": "agentic-ai", + "type": "capability" + } + ], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E477", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2027, + "earliest": 2025, + "latest": 2030 + }, + "maturity": "Emerging", + "riskCategory": "accident" + }, + { + "id": "cyber-offense", + "title": "AI-Enabled Cyberattacks", + "description": "Risk of AI systems being used to discover vulnerabilities, craft exploits, or conduct sophisticated cyberattacks.", + "tags": [ + "cybersecurity", + "misuse", + "dangerous-capabilities" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E478", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "risk", + "severity": "high", + "likelihood": { + "level": "high", + "status": "occurring" + }, + "timeframe": { + "median": 2025 + }, + "maturity": "Mature", + "riskCategory": "misuse" + }, + { + "id": "bio-risk", + "title": "AI-Enabled Biological Risks", + "description": "Risk of AI systems enabling creation or enhancement of 
biological weapons or dangerous pathogens.", + "tags": [ + "biosecurity", + "misuse", + "weapons" + ], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2025-12", + "numericId": "E479", + "status": "stub", + "customFields": [], + "relatedTopics": [], + "entityType": "risk", + "severity": "catastrophic", + "likelihood": { + "level": "medium", + "status": "emerging" + }, + "timeframe": { + "median": 2027, + "earliest": 2025, + "latest": 2032 + }, + "maturity": "Growing", + "riskCategory": "misuse" + }, + { + "id": "sleeper-agents", + "title": "Sleeper Agents: Training Deceptive LLMs", + "description": "Anthropic's 2024 research demonstrating that large language models can be trained to exhibit persistent deceptive behavior that survives standard safety training techniques including supervised fine-tuning, RLHF, and adversarial training.", + "tags": [ + "deceptive-alignment", + "backdoor-attacks", + "safety-training-failure", + "adversarial-training", + "anthropic-research" + ], + "clusters": [ + "ai-safety" + ], + "relatedEntries": [ + { + "id": "deceptive-alignment", + "type": "risk" + }, + { + "id": "scheming", + "type": "risk" + }, + { + "id": "evan-hubinger", + "type": "researcher" + }, + { + "id": "anthropic", + "type": "lab" + }, + { + "id": "situational-awareness", + "type": "concept" + } + ], + "sources": [], + "lastUpdated": "2026-02", + "numericId": "E480", + "customFields": [], + "relatedTopics": [], + "entityType": "risk", + "riskCategory": "accident" + }, + { + "id": "ai-watch", + "title": "AI Watch", + "description": "AI Watch is a tracking database by Issa Rice that monitors AI safety organizations, people, funding, and publications as part of his broader knowledge infrastructure ecosystem. The article provides useful context about Rice's systematic approach to documentation but lacks concrete details about AI Watch's current functionality, impact, or accessibility.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-03", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "community-notes", + "title": "X Community Notes", + "description": "Community Notes uses a bridging algorithm that requires cross-partisan consensus before a fact-check is displayed, reducing retweets by 25-50% when notes appear. However, only 8.3% of notes ever achieve visibility, with a median delay of 7 hours (mean 38.5 hours), by which time 96.7% of the spread has already occurred, limiting aggregate effectiveness despite high accuracy (98% for COVID-19 notes).", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-01-30", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "donations-list-website", + "title": "Donations List Website", + "description": "Comprehensive documentation of an open-source database tracking $72.8B in philanthropic donations (1969-2023) across 75+ donors, with particular coverage of EA/AI safety funding. 
The page thoroughly describes the tool's features, data coverage, and limitations, but is purely descriptive reference material about an infrastructure tool rather than actionable prioritization guidance.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-03", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "longterm-wiki", + "title": "Longterm Wiki", + "description": "A self-referential documentation page describing the Longterm Wiki platform itself—a strategic intelligence tool with ~550 pages, crux mapping of ~50 uncertainties, and quality scoring across 6 dimensions. Features include entity cross-linking, interactive causal diagrams, and structured YAML databases tracking expert positions on key AI safety cruxes.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "mit-ai-risk-repository", + "title": "MIT AI Risk Repository", + "description": "The MIT AI Risk Repository catalogs 1,700+ AI risks from 65+ frameworks into a searchable database with dual taxonomies (causal and domain-based). Updated quarterly since August 2024, it provides the first comprehensive public catalog of AI risks but is limited by framework extraction methodology and lacks quantitative risk assessments.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-02", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "org-watch", + "title": "Org Watch", + "description": "Org Watch is a tracking website by Issa Rice that monitors EA and AI safety organizations, but the article lacks concrete information about its actual features, scope, or current status. The piece reads more like speculative analysis about what the tool might do than documentation of an established resource.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-03", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "roastmypost", + "title": "RoastMyPost", + "description": "RoastMyPost is an LLM tool (Claude Sonnet 4.5 + Perplexity) that evaluates written content through multiple specialized AI agents—fact-checking, logical fallacy detection, math verification, and more. Aimed at improving epistemic quality of research posts, particularly in EA/rationalist communities. A significant false positive rate means it's a complement to, not a replacement for, human review.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-01", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "stampy-aisafety-info", + "title": "Stampy / AISafety.info", + "description": "AISafety.info is a volunteer-maintained wiki with 280+ answers on AI existential risk, complemented by Stampy, an LLM chatbot searching 10K-100K alignment documents via RAG. Features include a Discord bot bridging YouTube comments, PageRank-style karma voting for answer quality control, and the Distillation Fellowship program for content creation. 
Founded by Rob Miles as a 501(c)(3) nonprofit.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-02", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "timelines-wiki", + "title": "Timelines Wiki", + "description": "Timelines Wiki is a specialized MediaWiki project documenting chronological histories of AI safety and EA organizations, created by Issa Rice with funding from Vipul Naik in 2017. While useful as a historical reference source, it primarily serves as documentation infrastructure rather than providing novel insights or actionable content for AI safety practitioners.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-03", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + }, + { + "id": "wikipedia-views", + "title": "Wikipedia Views", + "description": "This article provides a comprehensive overview of Wikipedia pageview analytics tools and their declining traffic due to AI summaries reducing direct visits. While well-documented, it's primarily about web analytics infrastructure rather than core AI safety concerns.", + "tags": [], + "clusters": [], + "relatedEntries": [], + "sources": [], + "lastUpdated": "2026-02-03", + "customFields": [], + "relatedTopics": [], + "entityType": "project" + } ] } \ No newline at end of file From 4160ff6135ee10ba443bb5f73ab02cfd22fbc2ec Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 9 Feb 2026 15:35:01 +0000 Subject: [PATCH 2/2] Update database.json lastBuilt timestamp https://claude.ai/code/session_01DL2zdVVyQUfB3UXYjA7Sj7 --- apps/longterm/src/data/database.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/longterm/src/data/database.json b/apps/longterm/src/data/database.json index 8d136c37..d02dcd43 100644 --- a/apps/longterm/src/data/database.json +++ b/apps/longterm/src/data/database.json @@ -116532,7 +116532,7 @@ ], "totalTags": 925, "withDescription": 426, - "lastBuilt": "2026-02-09T08:28:51.336Z" + "lastBuilt": "2026-02-09T15:32:14.010Z" }, "pathRegistry": { "agentic-ai": "/knowledge-base/capabilities/agentic-ai/",