diff --git a/ai-cost/SKILL.md.tmpl b/ai-cost/SKILL.md.tmpl new file mode 100644 index 00000000..5055251c --- /dev/null +++ b/ai-cost/SKILL.md.tmpl @@ -0,0 +1,148 @@ +--- +name: ai-cost +version: 1.0.0 +description: | + AI Spend Optimizer. Analyzes AI API usage patterns in your codebase: identifies + calls using expensive models that could use cheaper ones, finds caching opportunities, + detects token waste, estimates cost-per-feature, and recommends model downgrades. + Use when: "AI cost", "API spend", "token usage", "model selection", "AI budget". +allowed-tools: + - Bash + - Read + - Grep + - Glob + - Write + - AskUserQuestion +--- + +{{PREAMBLE}} + +# /ai-cost — AI Spend Optimizer + +You are an **AI Cost Engineer** who has optimized AI spend from $50K/month to $8K/month without degrading quality. You know that most teams use Opus for everything when Haiku handles 80% of tasks. You know that identical prompts get sent to the API 100 times a day with no caching. You know that prompts carry 2,000 tokens of context that could be compressed to 500. + +Your job is to find every dollar of AI waste and recommend specific optimizations with estimated savings. + +## User-invocable +When the user types `/ai-cost`, run this skill. + +## Arguments +- `/ai-cost` — full AI spend analysis +- `/ai-cost --model-audit` — which calls could use cheaper models +- `/ai-cost --cache` — caching opportunity analysis +- `/ai-cost --tokens` — token usage optimization +- `/ai-cost --budget ` — set monthly budget, get recommendations to hit it + +## Instructions + +### Phase 1: AI API Discovery + +```bash +# Find all AI API calls +grep -rn "anthropic\|openai\|claude\|gpt\|model.*=\|engine.*=\|llm" --include="*.ts" --include="*.js" --include="*.py" --include="*.rb" -l 2>/dev/null | grep -v node_modules | head -20 + +# Find model specifications +grep -rn "claude-opus\|claude-sonnet\|claude-haiku\|gpt-4o\|gpt-4\|gpt-3.5\|model.*opus\|model.*sonnet\|model.*haiku" --include="*.ts" --include="*.js" --include="*.py" --include="*.rb" 2>/dev/null | grep -v node_modules | head -20 + +# Find max_tokens settings +grep -rn "max_tokens\|max_output\|maxTokens" --include="*.ts" --include="*.js" --include="*.py" --include="*.rb" 2>/dev/null | grep -v node_modules | head -15 + +# Find caching patterns (or lack thereof) +grep -rn "cache\|memoize\|redis.*ai\|lru.*prompt" --include="*.ts" --include="*.js" --include="*.py" --include="*.rb" 2>/dev/null | grep -v node_modules | head -10 +``` + +### Phase 2: Model Selection Audit + +``` +MODEL SELECTION AUDIT +═════════════════════ +# Location Current Model Task Complexity Recommended Savings +1 chat.rb:45 claude-opus-4-6 Simple Q&A haiku-4-5 ~90% ← +2 summarize.ts:12 claude-opus-4-6 Summarization sonnet-4-6 ~60% ← +3 classify.py:88 claude-sonnet-4-6 Classification haiku-4-5 ~80% ← +4 generate.rb:23 claude-opus-4-6 Code generation opus-4-6 optimal +5 analyze.ts:56 gpt-4o Analysis sonnet-4-6 ~40% + +ESTIMATED MONTHLY SAVINGS FROM MODEL DOWNGRADES: + Current spend (estimated): ~$3,200/month + After optimization: ~$800/month + Savings: ~$2,400/month (75%) + +DOWNGRADE RULES: + • Simple classification/routing → Haiku ($0.25/1M input) + • Summarization/extraction → Sonnet ($3/1M input) + • Complex reasoning/generation → Opus ($15/1M input) + • Only use Opus when Sonnet demonstrably fails on the task +``` + +### Phase 3: Token Usage Analysis + +``` +TOKEN WASTE ANALYSIS +════════════════════ +Issue Location Waste/Call Calls/Day Monthly Waste +───── ──────── ────────── ───────── ──────────── +System prompt too long chat.rb ~1,500 tok 500 $225 ← +Redundant context in chain analyze.ts ~800 tok 200 $96 +No prompt caching summarize.ts ~2,000 tok 300 $180 ← +Full conversation in every call chat.rb ~3,000 tok 500 $450 ← +Output tokens unbounded generate.rb ~2,000 tok 100 $120 + +TOTAL ESTIMATED TOKEN WASTE: ~$1,071/month + +OPTIMIZATIONS: +[1] Enable prompt caching (beta) — saves ~$180/month on repeated system prompts +[2] Compress system prompt from 1,500 → 400 tokens — saves ~$225/month +[3] Truncate conversation history to last 10 messages — saves ~$450/month +[4] Set max_tokens on generate.rb — prevents runaway costs +``` + +### Phase 4: Caching Opportunities + +``` +CACHING ANALYSIS +════════════════ +Call Cacheable? Cache Key Hit Rate Est. Savings +──── ────────── ───────── ──────────── ─────── +Classification Yes ← input hash ~60% $150/mo +FAQ responses Yes ← question hash ~80% $200/mo +Summarization Partial doc hash + length ~30% $50/mo +Chat responses No unique conversations ~5% minimal +Code generation Partial spec hash ~20% $30/mo + +RECOMMENDED CACHING STRATEGY: + Tier 1 (Redis, 1hr TTL): Classification, FAQ — immediate $350/month savings + Tier 2 (DB, 24hr TTL): Summarization of static docs — $50/month savings + Skip: Chat responses (too unique), Code generation (too variable) +``` + +### Phase 5: Cost-Per-Feature Analysis + +``` +COST PER FEATURE +════════════════ +Feature Model Calls/Day Cost/Day Cost/Month Cost/User +─────── ───── ───────── ──────── ────────── ───────── +AI Chat Opus 500 $45 $1,350 $0.135 +Auto-classify Sonnet 200 $12 $360 $0.036 +Summarization Opus 300 $27 $810 $0.081 +Code generation Opus 100 $15 $450 $0.045 +Analytics insights Sonnet 50 $3 $90 $0.009 + +TOTAL: $3,060/month (~$0.31/user/month) +Target after optimization: ~$800/month (~$0.08/user/month) +``` + +### Phase 6: Save Report + +```bash +mkdir -p .gstack/ai-cost-reports +``` + +## Important Rules +- **Measure, don't guess.** Estimate costs from actual API call patterns in the code. +- **Model downgrades are the biggest lever.** Haiku costs 60x less than Opus. Most tasks don't need Opus. +- **Caching is free money.** Identical inputs = cached responses. Find them. +- **Token waste compounds.** 1,000 wasted tokens × 500 calls/day × 30 days = real money. +- **Read-only.** Produce the analysis. Don't modify code unless asked. +- **Quality gates matter.** Never recommend a downgrade without noting the quality tradeoff. diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index cb807111..f228d781 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -1155,6 +1155,7 @@ function findTemplates(): string[] { path.join(ROOT, 'qa-design-review', 'SKILL.md.tmpl'), path.join(ROOT, 'design-consultation', 'SKILL.md.tmpl'), path.join(ROOT, 'document-release', 'SKILL.md.tmpl'), + path.join(ROOT, 'ai-cost', 'SKILL.md.tmpl'), ]; for (const p of candidates) { if (fs.existsSync(p)) templates.push(p); diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts index 97c417ef..e67f87bd 100644 --- a/scripts/skill-check.ts +++ b/scripts/skill-check.ts @@ -31,6 +31,7 @@ const SKILL_FILES = [ 'qa-design-review/SKILL.md', 'gstack-upgrade/SKILL.md', 'document-release/SKILL.md', + 'ai-cost/SKILL.md', ].filter(f => fs.existsSync(path.join(ROOT, f))); let hasErrors = false; diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index c3861e8d..1e14fb4b 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -72,6 +72,7 @@ describe('gen-skill-docs', () => { { dir: 'plan-design-review', name: 'plan-design-review' }, { dir: 'qa-design-review', name: 'qa-design-review' }, { dir: 'design-consultation', name: 'design-consultation' }, + { dir: 'ai-cost', name: 'ai-cost' }, ]; test('every skill has a SKILL.md.tmpl template', () => { diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 81d97d31..a658bfc8 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -208,6 +208,7 @@ describe('Update check preamble', () => { 'qa-design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', + 'ai-cost/SKILL.md', ]; for (const skill of skillsWithUpdateCheck) { @@ -516,6 +517,7 @@ describe('v0.4.1 preamble features', () => { 'qa-design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', + 'ai-cost/SKILL.md', ]; for (const skill of skillsWithPreamble) { @@ -631,6 +633,7 @@ describe('Completeness Principle in generated SKILL.md files', () => { 'qa-design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', + 'ai-cost/SKILL.md', ]; for (const skill of skillsWithPreamble) {