Surge77 · Surge77 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/.gitignore b/.gitignore
@@ -59,3 +59,14 @@ next-env.d.ts
 
 # local vendor repos
 /vendor/antigravity-kit/
+
+# local cloned repos
+/autoresearch/autoresearch/
+
+# local autoresearch runtime outputs
+/autoresearch/results.tsv
+/autoresearch/reports/
+
+# allow tracked autoresearch docs
+!autoresearch/*.md
+!autoresearch/**/*.md
diff --git a/autoresearch/README.md b/autoresearch/README.md
@@ -0,0 +1,31 @@
+# DevTrends Autoresearch
+
+This folder adapts the Karpathy `autoresearch` pattern to DevTrends with two guarded tracks:
+
+- `scoring`: optimize ranking heuristics with a fixed committed rubric
+- `routing`: optimize AI provider selection with a fixed offline replay suite
+
+The local clone of Karpathy's original repo lives at `autoresearch/autoresearch/` and stays git-ignored.
+
+## Commands
+
+```bash
+npm run autoresearch:eval:scoring
+npm run autoresearch:eval:routing
+npm run autoresearch:loop:scoring -- --dry-run
+npm run autoresearch:loop:routing -- --dry-run
+```
+
+For a real loop run:
+
+- work on a non-`main` branch such as `autoresearch/scoring-<tag>` or `autoresearch/routing-<tag>`
+- keep the worktree limited to that track's allowlisted files
+- run `npm run autoresearch:loop:scoring -- --description "candidate note"` or the routing equivalent
+
+The loop will:
+
+1. stage and commit the candidate diff
+2. run the fixed evaluator
+3. append a row to `autoresearch/results.tsv`
+4. keep the commit if the metric improves on the best kept score for that track
+5. otherwise reset that single candidate commit
diff --git a/autoresearch/devtrends-program.md b/autoresearch/devtrends-program.md
@@ -0,0 +1,40 @@
+# DevTrends Guarded Autoresearch
+
+This is a constrained adaptation of Karpathy's autoresearch loop.
+
+## Rules of the system
+
+- Each track has a narrow allowlist of editable files.
+- Evaluators, fixtures, and results formats are read-only.
+- A run is valid only if changed files stay inside the selected track allowlist.
+- Every run must produce a machine-readable report and a row in `results.tsv`.
+
+## Tracks
+
+### Scoring
+
+- Editable surface: scoring heuristics only
+- Fixed evaluator: `scripts/autoresearch-eval-scoring.mjs`
+- Fixed fixture corpus: `autoresearch/fixtures/scoring/baseline.json`
+- Goal: increase rubric score without breaking deterministic scoring behavior
+
+### Routing
+
+- Editable surface: AI routing policy only
+- Fixed evaluator: `scripts/autoresearch-eval-routing.mjs`
+- Fixed fixture corpus: `autoresearch/fixtures/routing/baseline.json`
+- Goal: improve provider choice quality, fallback behavior, latency fit, and cost fit
+
+## Keep / discard rule
+
+- `keep`: evaluator metric improves and no track guardrails fail
+- `discard`: metric is equal to or worse than the best kept score for the track
+- `crash`: run cannot complete or violates guardrails
+
+## Workflow
+
+1. Pick one track.
+2. Make candidate changes only inside that track's editable files.
+3. Work on a non-`main` experiment branch.
+4. Run the matching loop command.
+5. The runner stages and commits the candidate, evaluates it, logs the report, and automatically keeps or discards the candidate commit based on the metric.
diff --git a/autoresearch/fixtures/routing/baseline.json b/autoresearch/fixtures/routing/baseline.json
@@ -0,0 +1,64 @@
+{
+  "cases": [
+    {
+      "id": "chat_prefers_groq",
+      "weight": 25,
+      "useCase": "chat",
+      "providers": {
+        "groq": { "available": true, "success": true, "qualityScore": 92, "latencyMs": 320, "costUsd": 0.003 },
+        "cerebras": { "available": true, "success": true, "qualityScore": 88, "latencyMs": 410, "costUsd": 0.004 },
+        "xai": { "available": true, "success": true, "qualityScore": 90, "latencyMs": 950, "costUsd": 0.012 }
+      },
+      "expectations": {
+        "chosenProvider": "groq",
+        "minimumQualityScore": 85,
+        "maxCostUsd": 0.01
+      }
+    },
+    {
+      "id": "chat_falls_back_to_cerebras",
+      "weight": 25,
+      "useCase": "chat",
+      "providers": {
+        "groq": { "available": true, "success": false, "qualityScore": 0, "latencyMs": 9999, "costUsd": 0.0 },
+        "cerebras": { "available": true, "success": true, "qualityScore": 86, "latencyMs": 480, "costUsd": 0.004 },
+        "xai": { "available": true, "success": true, "qualityScore": 91, "latencyMs": 1100, "costUsd": 0.013 }
+      },
+      "expectations": {
+        "chosenProvider": "cerebras",
+        "minimumQualityScore": 80,
+        "maxCostUsd": 0.01
+      }
+    },
+    {
+      "id": "comparison_prefers_gemini",
+      "weight": 25,
+      "useCase": "comparison",
+      "providers": {
+        "gemini": { "available": true, "success": true, "qualityScore": 94, "latencyMs": 1450, "costUsd": 0.005 },
+        "xai": { "available": true, "success": true, "qualityScore": 90, "latencyMs": 1700, "costUsd": 0.012 },
+        "mistral": { "available": true, "success": true, "qualityScore": 88, "latencyMs": 2100, "costUsd": 0.007 }
+      },
+      "expectations": {
+        "chosenProvider": "gemini",
+        "minimumQualityScore": 90,
+        "maxCostUsd": 0.01
+      }
+    },
+    {
+      "id": "batch_insight_falls_back_to_mistral",
+      "weight": 25,
+      "useCase": "batch_insight",
+      "providers": {
+        "gemini": { "available": true, "success": false, "qualityScore": 0, "latencyMs": 9999, "costUsd": 0.0 },
+        "mistral": { "available": true, "success": true, "qualityScore": 84, "latencyMs": 2400, "costUsd": 0.006 },
+        "xai": { "available": true, "success": true, "qualityScore": 82, "latencyMs": 2700, "costUsd": 0.011 }
+      },
+      "expectations": {
+        "chosenProvider": "mistral",
+        "minimumQualityScore": 80,
+        "maxCostUsd": 0.01
+      }
+    }
+  ]
+}
diff --git a/autoresearch/fixtures/scoring/baseline.json b/autoresearch/fixtures/scoring/baseline.json
@@ -0,0 +1,153 @@
+{
+  "sectionWeights": {
+    "weights": 30,
+    "momentum": 30,
+    "ranking": 40
+  },
+  "weights": [
+    {
+      "id": "ai_ml_early_biases_lead_signals",
+      "input": {
+        "category": "ai_ml",
+        "dataAgeDays": 45,
+        "dataCompleteness": 0.9
+      },
+      "assertions": [
+        { "left": "community", "operator": "gt", "right": "jobs" },
+        { "left": "github", "operator": "gt", "right": "ecosystem" },
+        { "left": "sum", "operator": "approx", "value": 1, "tolerance": 0.000001 }
+      ]
+    },
+    {
+      "id": "cloud_mature_weights_jobs_most",
+      "input": {
+        "category": "cloud",
+        "dataAgeDays": 800,
+        "dataCompleteness": 0.9
+      },
+      "assertions": [
+        { "left": "jobs", "operator": "gt", "right": "community" },
+        { "left": "jobs", "operator": "gt", "right": "github" },
+        { "left": "sum", "operator": "approx", "value": 1, "tolerance": 0.000001 }
+      ]
+    },
+    {
+      "id": "sparse_frontend_deprioritizes_jobs",
+      "input": {
+        "category": "frontend",
+        "dataAgeDays": 120,
+        "dataCompleteness": 0.3
+      },
+      "assertions": [
+        { "left": "github", "operator": "gt", "right": "jobs" },
+        { "left": "community", "operator": "gt", "right": "jobs" },
+        { "left": "sum", "operator": "approx", "value": 1, "tolerance": 0.000001 }
+      ]
+    }
+  ],
+  "momentum": [
+    {
+      "id": "steady_series_stays_stable",
+      "scores": [
+        { "date": "2026-01-01", "score": 50 },
+        { "date": "2026-01-02", "score": 50 },
+        { "date": "2026-01-03", "score": 50 },
+        { "date": "2026-01-04", "score": 50 }
+      ],
+      "assertions": [
+        { "left": "trend", "operator": "eq", "value": "stable" },
+        { "left": "streak", "operator": "eq", "value": 0 }
+      ]
+    },
+    {
+      "id": "up_only_series_keeps_positive_streak",
+      "scores": [
+        { "date": "2026-01-01", "score": 12 },
+        { "date": "2026-01-02", "score": 15 },
+        { "date": "2026-01-03", "score": 19 },
+        { "date": "2026-01-04", "score": 24 }
+      ],
+      "assertions": [
+        { "left": "shortTerm", "operator": "gt", "value": 0 },
+        { "left": "streak", "operator": "gt", "value": 0 }
+      ]
+    },
+    {
+      "id": "wild_series_is_volatile",
+      "scores": [
+        { "date": "2026-01-01", "score": 50 },
+        { "date": "2026-01-02", "score": 60 },
+        { "date": "2026-01-03", "score": 42 },
+        { "date": "2026-01-04", "score": 68 },
+        { "date": "2026-01-05", "score": 39 },
+        { "date": "2026-01-06", "score": 73 }
+      ],
+      "assertions": [
+        { "left": "trend", "operator": "eq", "value": "volatile" },
+        { "left": "volatility", "operator": "gt", "value": 3 }
+      ]
+    }
+  ],
+  "rankings": [
+    {
+      "id": "mature_cloud_beats_hype_cloud",
+      "expectedOrder": ["aws", "fly-io"],
+      "candidates": [
+        {
+          "id": "aws",
+          "category": "cloud",
+          "dataAgeDays": 2400,
+          "dataCompleteness": 0.95,
+          "subScores": {
+            "github": 28,
+            "community": 44,
+            "jobs": 96,
+            "ecosystem": 91
+          }
+        },
+        {
+          "id": "fly-io",
+          "category": "cloud",
+          "dataAgeDays": 220,
+          "dataCompleteness": 0.9,
+          "subScores": {
+            "github": 84,
+            "community": 90,
+            "jobs": 35,
+            "ecosystem": 40
+          }
+        }
+      ]
+    },
+    {
+      "id": "new_ai_signal_beats_slower_mature_ai",
+      "expectedOrder": ["langchain", "h2o-ai"],
+      "candidates": [
+        {
+          "id": "langchain",
+          "category": "ai_ml",
+          "dataAgeDays": 150,
+          "dataCompleteness": 0.88,
+          "subScores": {
+            "github": 92,
+            "community": 96,
+            "jobs": 61,
+            "ecosystem": 55
+          }
+        },
+        {
+          "id": "h2o-ai",
+          "category": "ai_ml",
+          "dataAgeDays": 2200,
+          "dataCompleteness": 0.93,
+          "subScores": {
+            "github": 48,
+            "community": 38,
+            "jobs": 84,
+            "ecosystem": 72
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/autoresearch/manifest.json b/autoresearch/manifest.json
@@ -0,0 +1,28 @@
+{
+  "version": 1,
+  "results": {
+    "file": "autoresearch/results.tsv",
+    "reportDir": "autoresearch/reports",
+    "template": "autoresearch/results.template.tsv"
+  },
+  "tracks": {
+    "scoring": {
+      "description": "Adaptive weights, momentum, and composite ranking heuristics for DevTrends scoring.",
+      "evaluatorScript": "scripts/autoresearch-eval-scoring.mjs",
+      "fixture": "autoresearch/fixtures/scoring/baseline.json",
+      "editable": [
+        "src/lib/scoring/adaptive-weights.ts",
+        "src/lib/scoring/enhanced-momentum.ts",
+        "src/lib/scoring/composite.ts"
+      ]
+    },
+    "routing": {
+      "description": "Provider routing policy for AI use cases and fallback order selection.",
+      "evaluatorScript": "scripts/autoresearch-eval-routing.mjs",
+      "fixture": "autoresearch/fixtures/routing/baseline.json",
+      "editable": [
+        "src/lib/ai/router.ts"
+      ]
+    }
+  }
+}
diff --git a/autoresearch/results.template.tsv b/autoresearch/results.template.tsv
@@ -0,0 +1 @@
+commit	track	metric	status	description	report_path
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		commit track metric status description report_path