Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,14 @@ next-env.d.ts

# local vendor repos
/vendor/antigravity-kit/

# local cloned repos
/autoresearch/autoresearch/

# local autoresearch runtime outputs
/autoresearch/results.tsv
/autoresearch/reports/

# allow tracked autoresearch docs
!autoresearch/*.md
!autoresearch/**/*.md
31 changes: 31 additions & 0 deletions autoresearch/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# DevTrends Autoresearch

This folder adapts the Karpathy `autoresearch` pattern to DevTrends with two guarded tracks:

- `scoring`: optimize ranking heuristics with a fixed committed rubric
- `routing`: optimize AI provider selection with a fixed offline replay suite

The clone of Karpathy's original repo remains ignored at `autoresearch/autoresearch/`.

## Commands

```bash
npm run autoresearch:eval:scoring
npm run autoresearch:eval:routing
npm run autoresearch:loop:scoring -- --dry-run
npm run autoresearch:loop:routing -- --dry-run
```

For a real loop run:

- work on a non-`main` branch such as `autoresearch/scoring-<tag>` or `autoresearch/routing-<tag>`
- keep the worktree limited to that track's allowlisted files
- run `npm run autoresearch:loop:scoring -- --description "candidate note"` or the routing equivalent

The loop will:

1. stage and commit the candidate diff
2. run the fixed evaluator
3. append a row to `autoresearch/results.tsv`
4. keep the commit if the metric improves on the best kept score for that track
5. otherwise reset that single candidate commit
40 changes: 40 additions & 0 deletions autoresearch/devtrends-program.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# DevTrends Guarded Autoresearch

This is a constrained adaptation of Karpathy's autoresearch loop.

## Rules of the system

- Each track has a narrow allowlist of editable files.
- Evaluators, fixtures, and the results format are read-only.
- A run is valid only if changed files stay inside the selected track allowlist.
- Every run must produce a machine-readable report and a row in `results.tsv`.

## Tracks

### Scoring

- Editable surface: scoring heuristics only
- Fixed evaluator: `scripts/autoresearch-eval-scoring.mjs`
- Fixed fixture corpus: `autoresearch/fixtures/scoring/baseline.json`
- Goal: increase rubric score without breaking deterministic scoring behavior

### Routing

- Editable surface: AI routing policy only
- Fixed evaluator: `scripts/autoresearch-eval-routing.mjs`
- Fixed fixture corpus: `autoresearch/fixtures/routing/baseline.json`
- Goal: improve provider choice quality, fallback behavior, latency fit, and cost fit

## Keep / discard rule

- `keep`: evaluator metric improves and no track guardrails fail
- `discard`: metric is equal or worse
- `crash`: run cannot complete or violates guardrails

## Workflow

1. Pick one track.
2. Make candidate changes only inside that track's editable files.
3. Work on a non-`main` experiment branch.
4. Run the matching loop command.
5. The runner stages and commits the candidate, evaluates it, logs the report, and automatically keeps or discards the candidate commit based on the metric.
64 changes: 64 additions & 0 deletions autoresearch/fixtures/routing/baseline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"cases": [
{
"id": "chat_prefers_groq",
"weight": 25,
"useCase": "chat",
"providers": {
"groq": { "available": true, "success": true, "qualityScore": 92, "latencyMs": 320, "costUsd": 0.003 },
"cerebras": { "available": true, "success": true, "qualityScore": 88, "latencyMs": 410, "costUsd": 0.004 },
"xai": { "available": true, "success": true, "qualityScore": 90, "latencyMs": 950, "costUsd": 0.012 }
},
"expectations": {
"chosenProvider": "groq",
"minimumQualityScore": 85,
"maxCostUsd": 0.01
}
},
{
"id": "chat_falls_back_to_cerebras",
"weight": 25,
"useCase": "chat",
"providers": {
"groq": { "available": true, "success": false, "qualityScore": 0, "latencyMs": 9999, "costUsd": 0.0 },
"cerebras": { "available": true, "success": true, "qualityScore": 86, "latencyMs": 480, "costUsd": 0.004 },
"xai": { "available": true, "success": true, "qualityScore": 91, "latencyMs": 1100, "costUsd": 0.013 }
},
"expectations": {
"chosenProvider": "cerebras",
"minimumQualityScore": 80,
"maxCostUsd": 0.01
}
},
{
"id": "comparison_prefers_gemini",
"weight": 25,
"useCase": "comparison",
"providers": {
"gemini": { "available": true, "success": true, "qualityScore": 94, "latencyMs": 1450, "costUsd": 0.005 },
"xai": { "available": true, "success": true, "qualityScore": 90, "latencyMs": 1700, "costUsd": 0.012 },
"mistral": { "available": true, "success": true, "qualityScore": 88, "latencyMs": 2100, "costUsd": 0.007 }
},
"expectations": {
"chosenProvider": "gemini",
"minimumQualityScore": 90,
"maxCostUsd": 0.01
}
},
{
"id": "batch_insight_falls_back_to_mistral",
"weight": 25,
"useCase": "batch_insight",
"providers": {
"gemini": { "available": true, "success": false, "qualityScore": 0, "latencyMs": 9999, "costUsd": 0.0 },
"mistral": { "available": true, "success": true, "qualityScore": 84, "latencyMs": 2400, "costUsd": 0.006 },
"xai": { "available": true, "success": true, "qualityScore": 82, "latencyMs": 2700, "costUsd": 0.011 }
},
"expectations": {
"chosenProvider": "mistral",
"minimumQualityScore": 80,
"maxCostUsd": 0.01
}
}
]
}
153 changes: 153 additions & 0 deletions autoresearch/fixtures/scoring/baseline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
{
"sectionWeights": {
"weights": 30,
"momentum": 30,
"ranking": 40
},
"weights": [
{
"id": "ai_ml_early_biases_lead_signals",
"input": {
"category": "ai_ml",
"dataAgeDays": 45,
"dataCompleteness": 0.9
},
"assertions": [
{ "left": "community", "operator": "gt", "right": "jobs" },
{ "left": "github", "operator": "gt", "right": "ecosystem" },
{ "left": "sum", "operator": "approx", "value": 1, "tolerance": 0.000001 }
]
},
{
"id": "cloud_mature_weights_jobs_most",
"input": {
"category": "cloud",
"dataAgeDays": 800,
"dataCompleteness": 0.9
},
"assertions": [
{ "left": "jobs", "operator": "gt", "right": "community" },
{ "left": "jobs", "operator": "gt", "right": "github" },
{ "left": "sum", "operator": "approx", "value": 1, "tolerance": 0.000001 }
]
},
{
"id": "sparse_frontend_deprioritizes_jobs",
"input": {
"category": "frontend",
"dataAgeDays": 120,
"dataCompleteness": 0.3
},
"assertions": [
{ "left": "github", "operator": "gt", "right": "jobs" },
{ "left": "community", "operator": "gt", "right": "jobs" },
{ "left": "sum", "operator": "approx", "value": 1, "tolerance": 0.000001 }
]
}
],
"momentum": [
{
"id": "steady_series_stays_stable",
"scores": [
{ "date": "2026-01-01", "score": 50 },
{ "date": "2026-01-02", "score": 50 },
{ "date": "2026-01-03", "score": 50 },
{ "date": "2026-01-04", "score": 50 }
],
"assertions": [
{ "left": "trend", "operator": "eq", "value": "stable" },
{ "left": "streak", "operator": "eq", "value": 0 }
]
},
{
"id": "up_only_series_keeps_positive_streak",
"scores": [
{ "date": "2026-01-01", "score": 12 },
{ "date": "2026-01-02", "score": 15 },
{ "date": "2026-01-03", "score": 19 },
{ "date": "2026-01-04", "score": 24 }
],
"assertions": [
{ "left": "shortTerm", "operator": "gt", "value": 0 },
{ "left": "streak", "operator": "gt", "value": 0 }
]
},
{
"id": "wild_series_is_volatile",
"scores": [
{ "date": "2026-01-01", "score": 50 },
{ "date": "2026-01-02", "score": 60 },
{ "date": "2026-01-03", "score": 42 },
{ "date": "2026-01-04", "score": 68 },
{ "date": "2026-01-05", "score": 39 },
{ "date": "2026-01-06", "score": 73 }
],
"assertions": [
{ "left": "trend", "operator": "eq", "value": "volatile" },
{ "left": "volatility", "operator": "gt", "value": 3 }
]
}
],
"rankings": [
{
"id": "mature_cloud_beats_hype_cloud",
"expectedOrder": ["aws", "fly-io"],
"candidates": [
{
"id": "aws",
"category": "cloud",
"dataAgeDays": 2400,
"dataCompleteness": 0.95,
"subScores": {
"github": 28,
"community": 44,
"jobs": 96,
"ecosystem": 91
}
},
{
"id": "fly-io",
"category": "cloud",
"dataAgeDays": 220,
"dataCompleteness": 0.9,
"subScores": {
"github": 84,
"community": 90,
"jobs": 35,
"ecosystem": 40
}
}
]
},
{
"id": "new_ai_signal_beats_slower_mature_ai",
"expectedOrder": ["langchain", "h2o-ai"],
"candidates": [
{
"id": "langchain",
"category": "ai_ml",
"dataAgeDays": 150,
"dataCompleteness": 0.88,
"subScores": {
"github": 92,
"community": 96,
"jobs": 61,
"ecosystem": 55
}
},
{
"id": "h2o-ai",
"category": "ai_ml",
"dataAgeDays": 2200,
"dataCompleteness": 0.93,
"subScores": {
"github": 48,
"community": 38,
"jobs": 84,
"ecosystem": 72
}
}
]
}
]
}
28 changes: 28 additions & 0 deletions autoresearch/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"version": 1,
"results": {
"file": "autoresearch/results.tsv",
"reportDir": "autoresearch/reports",
"template": "autoresearch/results.template.tsv"
},
"tracks": {
"scoring": {
"description": "Adaptive weights, momentum, and composite ranking heuristics for DevTrends scoring.",
"evaluatorScript": "scripts/autoresearch-eval-scoring.mjs",
"fixture": "autoresearch/fixtures/scoring/baseline.json",
"editable": [
"src/lib/scoring/adaptive-weights.ts",
"src/lib/scoring/enhanced-momentum.ts",
"src/lib/scoring/composite.ts"
]
},
"routing": {
"description": "Provider routing policy for AI use cases and fallback order selection.",
"evaluatorScript": "scripts/autoresearch-eval-routing.mjs",
"fixture": "autoresearch/fixtures/routing/baseline.json",
"editable": [
"src/lib/ai/router.ts"
]
}
}
}
1 change: 1 addition & 0 deletions autoresearch/results.template.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
commit track metric status description report_path
Loading
Loading