From 119c217ef762cbfb1ed2f0d94c3d8eb76b59eea1 Mon Sep 17 00:00:00 2001 From: DevCalebR Date: Sun, 1 Mar 2026 21:11:58 -0500 Subject: [PATCH 1/4] hardening: G8 add correlation IDs and alert hooks --- .env.example | 5 ++ README.md | 1 + RUNBOOK.md | 12 +++- app/api/stripe/webhook/route.ts | 42 ++++++++++--- app/api/twilio/sms/route.ts | 47 ++++++++++----- app/api/twilio/status/route.ts | 65 +++++++++++++------- app/api/twilio/voice/route.ts | 27 ++++++--- docs/PRODUCTION_ENV.md | 4 ++ docs/PRODUCTION_READINESS_GAPS.md | 47 +++++++++++++++ lib/observability.ts | 98 +++++++++++++++++++++++++++++++ lib/twilio-logging.ts | 13 +++- tests/observability.test.ts | 42 +++++++++++++ 12 files changed, 351 insertions(+), 52 deletions(-) create mode 100644 lib/observability.ts create mode 100644 tests/observability.test.ts diff --git a/.env.example b/.env.example index be5086f..84dd23c 100644 --- a/.env.example +++ b/.env.example @@ -36,6 +36,11 @@ PORTFOLIO_DEMO_MODE= # RATE_LIMIT_STRIPE_UNAUTH_MAX=40 # RATE_LIMIT_PROTECTED_API_MAX=80 +# Optional observability + alerting +# ALERT_WEBHOOK_URL= +# ALERT_WEBHOOK_TOKEN= +# ALERT_WEBHOOK_TIMEOUT_MS=4000 + # Vercel system envs (auto-set on Vercel; optional locally for fallback testing only) # VERCEL_ENV= # VERCEL_URL= diff --git a/README.md b/README.md index 80ec288..e1dfedd 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ When a customer calls a business's Twilio number and the forwarded call is misse - SMS compliance commands (`STOP` / `START` / `HELP`) with DB-backed opt-out state - Call recording enabled on forwarded calls + recording metadata captured on callbacks - Twilio webhook protection: production-enforced `X-Twilio-Signature` validation, with shared-token fallback only in non-production +- Webhook observability baseline: correlation IDs (`X-Correlation-Id`), centralized `app.error` reporting, optional alert webhook dispatch ## Local Setup diff --git a/RUNBOOK.md b/RUNBOOK.md index 23c2f11..725d0f8 100644 --- a/RUNBOOK.md +++ b/RUNBOOK.md @@ -42,6 +42,7 @@ - Vercel runtime logs: - API route logs for `/api/twilio/voice`, `/api/twilio/status`, `/api/twilio/sms` - Look for structured prefixes: `twilio.voice`, `twilio.status`, `twilio.sms`, `twilio.messaging`, `twilio.webhook-auth` + - Look for centralized error events: `app.error` (includes `correlationId`, `source`, `event`, and metadata) - Twilio Console: - Phone Number webhook logs / Debugger - Call Logs and Recordings @@ -49,6 +50,16 @@ - Neon: - Query activity / connection issues (if DB errors occur) +## Observability + Alerts + +- Every Twilio/Stripe webhook response now includes `X-Correlation-Id`. +- For incident triage, capture the correlation ID from provider delivery logs and search Vercel logs for that ID. +- Optional alert wiring: + 1. Set `ALERT_WEBHOOK_URL` in Vercel (Slack/PagerDuty/incident gateway endpoint). + 2. Optionally set `ALERT_WEBHOOK_TOKEN` if your endpoint requires bearer auth. + 3. Optionally set `ALERT_WEBHOOK_TIMEOUT_MS` (default `4000`). + 4. Redeploy and induce a safe synthetic webhook failure in non-production to confirm alert delivery. + ## Common Failure Modes - Twilio webhooks return `401` @@ -67,4 +78,3 @@ - `DATABASE_URL` / `DIRECT_DATABASE_URL` swapped - Missing `sslmode=require` - `DIRECT_DATABASE_URL` accidentally using Neon pooler host - diff --git a/app/api/stripe/webhook/route.ts b/app/api/stripe/webhook/route.ts index e0922d6..a8869e0 100644 --- a/app/api/stripe/webhook/route.ts +++ b/app/api/stripe/webhook/route.ts @@ -2,6 +2,7 @@ import { NextResponse } from 'next/server'; import Stripe from 'stripe'; import { db } from '@/lib/db'; +import { getCorrelationIdFromRequest, reportApplicationError, withCorrelationIdHeader } from '@/lib/observability'; import { RATE_LIMIT_STRIPE_AUTH_MAX, RATE_LIMIT_STRIPE_UNAUTH_MAX, RATE_LIMIT_WINDOW_MS } from '@/lib/rate-limit-config'; import { buildRateLimitHeaders, consumeRateLimit, getClientIpAddress } from '@/lib/rate-limit'; import { getStripe } from '@/lib/stripe'; @@ -85,10 +86,12 @@ async function handleCheckoutCompleted(session: Stripe.Checkout.Session) { export async function POST(request: Request) { const clientIp = getClientIpAddress(request); + const correlationId = getCorrelationIdFromRequest(request); + const withCorrelation = (response: NextResponse) => withCorrelationIdHeader(response, correlationId); const signature = request.headers.get('stripe-signature'); const webhookSecret = process.env.STRIPE_WEBHOOK_SECRET; if (!signature || !webhookSecret) { - return NextResponse.json({ error: 'Missing Stripe webhook configuration' }, { status: 400 }); + return withCorrelation(NextResponse.json({ error: 'Missing Stripe webhook configuration' }, { status: 400 })); } const payload = await request.text(); @@ -106,16 +109,29 @@ export async function POST(request: Request) { if (!unauthRateLimit.allowed) { console.warn('Stripe webhook rate-limited (invalid signature burst)', { clientIp, + correlationId, decision: 'reject_429', }); - return NextResponse.json( + return withCorrelation( + NextResponse.json( { error: 'Too many invalid webhook attempts' }, { status: 429, headers: buildRateLimitHeaders(unauthRateLimit) } + ) ); } const message = error instanceof Error ? error.message : 'Invalid webhook signature'; - return NextResponse.json({ error: message }, { status: 400 }); + reportApplicationError({ + source: 'stripe.webhook', + event: 'invalid_signature', + correlationId, + error, + alert: false, + metadata: { + clientIp, + }, + }); + return withCorrelation(NextResponse.json({ error: message }, { status: 400 })); } const authRateLimit = consumeRateLimit({ @@ -126,10 +142,13 @@ export async function POST(request: Request) { if (!authRateLimit.allowed) { console.warn('Stripe webhook rate-limited', { clientIp, + correlationId, eventType: event.type, decision: 'reject_429', }); - return NextResponse.json({ error: 'Rate limit exceeded' }, { status: 429, headers: buildRateLimitHeaders(authRateLimit) }); + return withCorrelation( + NextResponse.json({ error: 'Rate limit exceeded' }, { status: 429, headers: buildRateLimitHeaders(authRateLimit) }) + ); } try { @@ -168,9 +187,18 @@ export async function POST(request: Request) { break; } } catch (error) { - console.error('Stripe webhook handler error', error); - return NextResponse.json({ error: 'Webhook processing failed' }, { status: 500 }); + reportApplicationError({ + source: 'stripe.webhook', + event: 'handler_error', + correlationId, + error, + metadata: { + clientIp, + eventType: event.type, + }, + }); + return withCorrelation(NextResponse.json({ error: 'Webhook processing failed' }, { status: 500 })); } - return NextResponse.json({ received: true }); + return withCorrelation(NextResponse.json({ received: true })); } diff --git a/app/api/twilio/sms/route.ts b/app/api/twilio/sms/route.ts index 6393158..941243a 100644 --- a/app/api/twilio/sms/route.ts +++ b/app/api/twilio/sms/route.ts @@ -3,6 +3,7 @@ import { NextResponse } from 'next/server'; import { findBusinessByTwilioNumber } from '@/lib/business'; import { db } from '@/lib/db'; +import { getCorrelationIdFromRequest, withCorrelationIdHeader } from '@/lib/observability'; import { normalizePhoneNumber } from '@/lib/phone'; import { RATE_LIMIT_TWILIO_AUTH_MAX, RATE_LIMIT_TWILIO_UNAUTH_MAX, RATE_LIMIT_WINDOW_MS } from '@/lib/rate-limit-config'; import { buildRateLimitHeaders, consumeRateLimit, getClientIpAddress } from '@/lib/rate-limit'; @@ -49,6 +50,8 @@ function rateLimitSmsResponse(retryAfterSeconds: number) { export async function POST(request: Request) { let messageSid: string | null = null; + const correlationId = getCorrelationIdFromRequest(request); + const withCorrelation = (response: Response) => withCorrelationIdHeader(response, correlationId); try { const formData = await request.formData(); const payload = Object.fromEntries(formData.entries()) as Record; @@ -64,18 +67,19 @@ export async function POST(request: Request) { }); if (!rateLimit.allowed) { logTwilioWarn('sms', 'webhook_unauthorized_rate_limited', { + correlationId, eventType: 'inbound_sms', decision: 'reject_429', clientIp, }); - return new NextResponse( + return withCorrelation(new NextResponse( JSON.stringify({ error: 'Too many unauthorized requests' }), { status: 429, headers: { 'Content-Type': 'application/json', ...buildRateLimitHeaders(rateLimit) } } - ); + )); } - logTwilioWarn('sms', 'webhook_unauthorized', { decision: 'reject_401' }); - return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }); + logTwilioWarn('sms', 'webhook_unauthorized', { correlationId, decision: 'reject_401' }); + return withCorrelation(NextResponse.json({ error: 'Unauthorized' }, { status: 401 })); } const authRateLimit = consumeRateLimit({ @@ -85,6 +89,7 @@ export async function POST(request: Request) { }); if (!authRateLimit.allowed) { logTwilioWarn('sms', 'webhook_rate_limited', { + correlationId, eventType: 'inbound_sms', decision: 'reject_429', accountSid: accountSid || null, @@ -94,7 +99,7 @@ export async function POST(request: Request) { Object.entries(buildRateLimitHeaders(authRateLimit)).forEach(([name, value]) => { response.headers.set(name, value); }); - return response; + return withCorrelation(response); } const to = normalizePhoneNumber(formField(formData, 'To')); @@ -104,6 +109,7 @@ export async function POST(request: Request) { logTwilioInfo('sms', 'webhook_received', { messageSid, + correlationId, eventType: 'inbound_sms', decision: 'processing', }); @@ -111,20 +117,22 @@ export async function POST(request: Request) { if (!to || !from) { logTwilioWarn('sms', 'missing_required_fields', { messageSid, + correlationId, eventType: 'inbound_sms', decision: 'noop_missing_to_or_from', }); - return xmlOk(); + return withCorrelation(xmlOk()); } const business = await findBusinessByTwilioNumber(to); if (!business) { logTwilioWarn('sms', 'business_not_found', { messageSid, + correlationId, eventType: 'inbound_sms', decision: 'noop_business_not_found', }); - return xmlOk(); + return withCorrelation(xmlOk()); } const inbound = await persistInboundMessage({ @@ -148,6 +156,7 @@ export async function POST(request: Request) { if (compliance.handled) { logTwilioInfo('sms', 'compliance_keyword_handled', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, command: compliance.command, @@ -155,18 +164,19 @@ export async function POST(request: Request) { duplicateInbound: inbound.duplicate, decision: 'reply_compliance_message', }); - return xmlOk(compliance.replyText); + return withCorrelation(xmlOk(compliance.replyText)); } if (inbound.duplicate) { logTwilioInfo('sms', 'duplicate_message_retry', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: null, decision: 'noop_duplicate', }); - return xmlOk(); + return withCorrelation(xmlOk()); } const lead = @@ -186,11 +196,12 @@ export async function POST(request: Request) { if (!lead) { logTwilioInfo('sms', 'no_matching_lead', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, decision: 'noop_no_lead_thread', }); - return xmlOk(); + return withCorrelation(xmlOk()); } await db.message.update({ @@ -209,6 +220,7 @@ export async function POST(request: Request) { if (!isSubscriptionActive(business.subscriptionStatus) || lead.billingRequired || !business.twilioPhoneNumber) { logTwilioInfo('sms', 'automation_blocked', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: lead.id, @@ -218,7 +230,7 @@ export async function POST(request: Request) { ? 'noop_billing_required' : 'noop_missing_twilio_number', }); - return xmlOk(); + return withCorrelation(xmlOk()); } const transition = advanceLeadConversation(lead, body, business); @@ -237,6 +249,7 @@ export async function POST(request: Request) { logTwilioInfo('sms', 'state_machine_transition', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: updatedLead.id, @@ -272,6 +285,7 @@ export async function POST(request: Request) { if (ownerSend.suppressed) { logTwilioWarn('sms', 'owner_notification_suppressed', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: updatedLead.id, @@ -284,6 +298,7 @@ export async function POST(request: Request) { }); logTwilioInfo('sms', 'owner_notification_sent', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: updatedLead.id, @@ -296,6 +311,7 @@ export async function POST(request: Request) { 'owner_notification_failed', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: updatedLead.id, @@ -318,6 +334,7 @@ export async function POST(request: Request) { if (leadSend.suppressed) { logTwilioWarn('sms', 'lead_reply_suppressed', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: updatedLead.id, @@ -334,6 +351,7 @@ export async function POST(request: Request) { logTwilioInfo('sms', 'lead_reply_sent', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: updatedLead.id, @@ -346,6 +364,7 @@ export async function POST(request: Request) { 'lead_reply_send_failed', { messageSid, + correlationId, eventType: 'inbound_sms', businessId: business.id, leadId: updatedLead.id, @@ -355,9 +374,9 @@ export async function POST(request: Request) { ); } - return xmlOk(); + return withCorrelation(xmlOk()); } catch (error) { - logTwilioError('sms', 'route_error', { messageSid, eventType: 'inbound_sms', decision: 'return_retryable_503' }, error); - return retryableErrorResponse(); + logTwilioError('sms', 'route_error', { messageSid, correlationId, eventType: 'inbound_sms', decision: 'return_retryable_503' }, error); + return withCorrelation(retryableErrorResponse()); } } diff --git a/app/api/twilio/status/route.ts b/app/api/twilio/status/route.ts index 13c017f..c6114b7 100644 --- a/app/api/twilio/status/route.ts +++ b/app/api/twilio/status/route.ts @@ -3,6 +3,7 @@ import { SubscriptionStatus } from '@prisma/client'; import { findBusinessByTwilioNumber } from '@/lib/business'; import { db } from '@/lib/db'; +import { getCorrelationIdFromRequest, withCorrelationIdHeader } from '@/lib/observability'; import { normalizePhoneNumber } from '@/lib/phone'; import { RATE_LIMIT_TWILIO_AUTH_MAX, RATE_LIMIT_TWILIO_UNAUTH_MAX, RATE_LIMIT_WINDOW_MS } from '@/lib/rate-limit-config'; import { buildRateLimitHeaders, consumeRateLimit, getClientIpAddress } from '@/lib/rate-limit'; @@ -56,6 +57,8 @@ function rateLimitStatusResponse(retryAfterSeconds: number) { export async function POST(request: Request) { let callSid: string | null = null; let dialCallSid: string | null = null; + const correlationId = getCorrelationIdFromRequest(request); + const withCorrelation = (response: Response) => withCorrelationIdHeader(response, correlationId); try { const formData = await request.formData(); const payload = Object.fromEntries(formData.entries()) as Record; @@ -71,18 +74,19 @@ export async function POST(request: Request) { }); if (!rateLimit.allowed) { logTwilioWarn('status', 'webhook_unauthorized_rate_limited', { + correlationId, eventType: 'dial_status_callback', decision: 'reject_429', clientIp, }); - return new NextResponse( + return withCorrelation(new NextResponse( JSON.stringify({ error: 'Too many unauthorized requests' }), { status: 429, headers: { 'Content-Type': 'application/json', ...buildRateLimitHeaders(rateLimit) } } - ); + )); } - logTwilioWarn('status', 'webhook_unauthorized', { decision: 'reject_401' }); - return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }); + logTwilioWarn('status', 'webhook_unauthorized', { correlationId, decision: 'reject_401' }); + return withCorrelation(NextResponse.json({ error: 'Unauthorized' }, { status: 401 })); } const authRateLimit = consumeRateLimit({ @@ -92,6 +96,7 @@ export async function POST(request: Request) { }); if (!authRateLimit.allowed) { logTwilioWarn('status', 'webhook_rate_limited', { + correlationId, eventType: 'dial_status_callback', decision: 'reject_429', accountSid: accountSid || null, @@ -101,7 +106,7 @@ export async function POST(request: Request) { Object.entries(buildRateLimitHeaders(authRateLimit)).forEach(([name, value]) => { response.headers.set(name, value); }); - return response; + return withCorrelation(response); } const to = normalizePhoneNumber(formField(formData, 'To')); @@ -114,6 +119,7 @@ export async function POST(request: Request) { logTwilioInfo('status', 'webhook_received', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', dialCallStatus: dialCallStatus || null, recordingSid: recording?.recordingSid ?? null, @@ -141,23 +147,25 @@ export async function POST(request: Request) { logTwilioInfo('status', 'recording_metadata_persisted_only', { callSid, + correlationId, eventType: 'recording_status_callback', recordingSid: recording?.recordingSid ?? null, recordingStatus: recording?.recordingStatus ?? null, decision: updated.count > 0 ? 'update_call_recording_metadata' : 'noop_call_not_found', }); - return xmlOk(); + return withCorrelation(xmlOk()); } if (!to || !callSid) { logTwilioWarn('status', 'missing_required_fields', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', decision: 'noop_missing_to_or_callSid', }); - return xmlOk(); + return withCorrelation(xmlOk()); } const business = await findBusinessByTwilioNumber(to); @@ -165,10 +173,11 @@ export async function POST(request: Request) { logTwilioWarn('status', 'business_not_found', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', decision: 'noop_business_not_found', }); - return xmlOk(); + return withCorrelation(xmlOk()); } const answered = dialCallStatus.trim().toLowerCase() === 'completed'; @@ -211,6 +220,7 @@ export async function POST(request: Request) { logTwilioInfo('status', 'call_upserted', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, answered, @@ -222,6 +232,7 @@ export async function POST(request: Request) { logTwilioInfo('status', 'recording_metadata_persisted', { callSid, dialCallSid, + correlationId, eventType: 'recording_status_callback', businessId: business.id, recordingSid: recording?.recordingSid ?? null, @@ -234,11 +245,12 @@ export async function POST(request: Request) { logTwilioInfo('status', 'not_missed_noop', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, decision: 'noop_not_missed', }); - return xmlOk(); + return withCorrelation(xmlOk()); } const callerPhone = from || formField(formData, 'From'); @@ -260,6 +272,7 @@ export async function POST(request: Request) { logTwilioInfo('status', 'lead_created_for_missed_call', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, @@ -269,6 +282,7 @@ export async function POST(request: Request) { logTwilioInfo('status', 'lead_reused_for_retry', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, @@ -283,24 +297,26 @@ export async function POST(request: Request) { logTwilioInfo('status', 'billing_inactive_no_sms', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, decision: 'noop_billing_inactive', }); - return xmlOk(); + return withCorrelation(xmlOk()); } if (!business.twilioPhoneNumber || lead.smsStartedAt) { logTwilioInfo('status', 'already_started_or_missing_twilio_number', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, decision: lead.smsStartedAt ? 'noop_retry_sms_already_started' : 'noop_missing_twilio_number', }); - return xmlOk(); + return withCorrelation(xmlOk()); } try { @@ -309,6 +325,7 @@ export async function POST(request: Request) { logTwilioWarn('status', 'usage_limit_reached', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, @@ -328,12 +345,13 @@ export async function POST(request: Request) { logTwilioInfo('status', 'usage_limit_owner_notify_already_recorded', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, decision: 'noop_usage_limit_notification_already_recorded', }); - return xmlOk(); + return withCorrelation(xmlOk()); } if (business.notifyPhone) { @@ -342,12 +360,13 @@ export async function POST(request: Request) { logTwilioInfo('status', 'usage_limit_owner_notify_already_claimed', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, decision: 'noop_usage_limit_notification_already_claimed', }); - return xmlOk(); + return withCorrelation(xmlOk()); } try { @@ -365,16 +384,18 @@ export async function POST(request: Request) { logTwilioWarn('status', 'usage_limit_owner_notify_suppressed', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, decision: 'owner_notification_suppressed_opted_out', }); - return xmlOk(); + return withCorrelation(xmlOk()); } logTwilioInfo('status', 'usage_limit_owner_notified', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, @@ -399,6 +420,7 @@ export async function POST(request: Request) { { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, @@ -409,7 +431,7 @@ export async function POST(request: Request) { } } - return xmlOk(); + return withCorrelation(xmlOk()); } const prompt = getServicePrompt(business); @@ -425,12 +447,13 @@ export async function POST(request: Request) { logTwilioWarn('status', 'initial_missed_call_sms_suppressed', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, decision: 'skip_opted_out_recipient', }); - return xmlOk(); + return withCorrelation(xmlOk()); } await db.lead.update({ @@ -447,6 +470,7 @@ export async function POST(request: Request) { logTwilioInfo('status', 'initial_missed_call_sms_started', { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, @@ -459,6 +483,7 @@ export async function POST(request: Request) { { callSid, dialCallSid, + correlationId, eventType: 'dial_status_callback', businessId: business.id, leadId: lead.id, @@ -473,17 +498,17 @@ export async function POST(request: Request) { lastInteractionAt: new Date(), }, }); - return retryableErrorResponse(); + return withCorrelation(retryableErrorResponse()); } - return xmlOk(); + return withCorrelation(xmlOk()); } catch (error) { logTwilioError( 'status', 'route_error', - { callSid, dialCallSid, eventType: 'dial_status_callback', decision: 'return_retryable_503' }, + { callSid, dialCallSid, correlationId, eventType: 'dial_status_callback', decision: 'return_retryable_503' }, error ); - return retryableErrorResponse(); + return withCorrelation(retryableErrorResponse()); } } diff --git a/app/api/twilio/voice/route.ts b/app/api/twilio/voice/route.ts index 005dd0b..e246790 100644 --- a/app/api/twilio/voice/route.ts +++ b/app/api/twilio/voice/route.ts @@ -2,6 +2,7 @@ import { NextResponse } from 'next/server'; import { findBusinessByTwilioNumber } from '@/lib/business'; import { db } from '@/lib/db'; +import { getCorrelationIdFromRequest, withCorrelationIdHeader } from '@/lib/observability'; import { normalizePhoneNumber } from '@/lib/phone'; import { RATE_LIMIT_TWILIO_AUTH_MAX, RATE_LIMIT_TWILIO_UNAUTH_MAX, RATE_LIMIT_WINDOW_MS } from '@/lib/rate-limit-config'; import { buildRateLimitHeaders, consumeRateLimit, getClientIpAddress } from '@/lib/rate-limit'; @@ -43,6 +44,8 @@ function rateLimitVoiceResponse(retryAfterSeconds: number) { export async function POST(request: Request) { let callSid: string | null = null; + const correlationId = getCorrelationIdFromRequest(request); + const withCorrelation = (response: NextResponse) => withCorrelationIdHeader(response, correlationId); try { const formData = await request.formData(); const payload = Object.fromEntries(formData.entries()) as Record; @@ -58,18 +61,19 @@ export async function POST(request: Request) { if (!rateLimit.allowed) { logTwilioWarn('voice', 'webhook_unauthorized_rate_limited', { callSid, + correlationId, eventType: 'incoming_call', decision: 'reject_429', clientIp, }); - return new NextResponse( + return withCorrelation(new NextResponse( JSON.stringify({ error: 'Too many unauthorized requests' }), { status: 429, headers: { 'Content-Type': 'application/json', ...buildRateLimitHeaders(rateLimit) } } - ); + )); } - logTwilioWarn('voice', 'webhook_unauthorized', { decision: 'reject_401' }); - return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }); + logTwilioWarn('voice', 'webhook_unauthorized', { correlationId, decision: 'reject_401' }); + return withCorrelation(NextResponse.json({ error: 'Unauthorized' }, { status: 401 })); } const to = normalizePhoneNumber(formField(formData, 'To')); @@ -85,6 +89,7 @@ export async function POST(request: Request) { if (!rateLimit.allowed) { logTwilioWarn('voice', 'webhook_rate_limited', { callSid, + correlationId, eventType: 'incoming_call', decision: 'reject_429', accountSid: accountSid || null, @@ -94,11 +99,12 @@ export async function POST(request: Request) { Object.entries(buildRateLimitHeaders(rateLimit)).forEach(([name, value]) => { response.headers.set(name, value); }); - return response; + return withCorrelation(response); } logTwilioInfo('voice', 'webhook_received', { callSid, + correlationId, eventType: 'incoming_call', decision: 'processing', }); @@ -107,6 +113,7 @@ export async function POST(request: Request) { if (!business) { logTwilioWarn('voice', 'business_not_found', { callSid, + correlationId, eventType: 'incoming_call', decision: 'respond_not_configured', }); @@ -115,7 +122,7 @@ export async function POST(request: Request) { response.say('Sorry, this number is not configured.'); response.hangup(); }); - return new NextResponse(xml, { headers: { 'Content-Type': 'text/xml' } }); + return withCorrelation(new NextResponse(xml, { headers: { 'Content-Type': 'text/xml' } })); } if (callSid) { @@ -140,6 +147,7 @@ export async function POST(request: Request) { logTwilioInfo('voice', 'call_persisted', { callSid, + correlationId, eventType: 'incoming_call', businessId: business.id, decision: 'upsert_call', @@ -160,18 +168,19 @@ export async function POST(request: Request) { logTwilioInfo('voice', 'twiml_returned', { callSid, + correlationId, eventType: 'incoming_call', businessId: business.id, decision: 'dial_forwarding_number_with_recording', }); - return new NextResponse(xml, { headers: { 'Content-Type': 'text/xml' } }); + return withCorrelation(new NextResponse(xml, { headers: { 'Content-Type': 'text/xml' } })); } catch (error) { - logTwilioError('voice', 'route_error', { callSid, eventType: 'incoming_call', decision: 'fallback_hangup' }, error); + logTwilioError('voice', 'route_error', { callSid, correlationId, eventType: 'incoming_call', decision: 'fallback_hangup' }, error); const xml = voiceTwiML((response) => { response.say('Sorry, we are having trouble connecting your call right now.'); response.hangup(); }); - return new NextResponse(xml, { headers: { 'Content-Type': 'text/xml' } }); + return withCorrelation(new NextResponse(xml, { headers: { 'Content-Type': 'text/xml' } })); } } diff --git a/docs/PRODUCTION_ENV.md b/docs/PRODUCTION_ENV.md index 3cf8d81..a5fc7bd 100644 --- a/docs/PRODUCTION_ENV.md +++ b/docs/PRODUCTION_ENV.md @@ -36,6 +36,9 @@ This project uses `NEXT_PUBLIC_APP_URL` as the single canonical app origin for s | `RATE_LIMIT_STRIPE_AUTH_MAX` | Server-only | Optional | Vercel | Max Stripe webhook requests per window for valid-signed traffic. Default `240`. | | `RATE_LIMIT_STRIPE_UNAUTH_MAX` | Server-only | Optional | Vercel | Max Stripe webhook requests per window for invalid-signature traffic. Default `40`. | | `RATE_LIMIT_PROTECTED_API_MAX` | Server-only | Optional | Vercel | Max requests per window for protected Stripe mutation APIs (`/api/stripe/checkout`, `/api/stripe/portal`). Default `80`. | +| `ALERT_WEBHOOK_URL` | Server-only | Optional | Vercel / Ops | If set, critical application errors are POSTed to this webhook for alert fan-out (Slack/Pager/incident gateway). | +| `ALERT_WEBHOOK_TOKEN` | Server-only | Optional | Vercel / Ops | Optional bearer token added to alert webhook requests as `Authorization: Bearer `. | +| `ALERT_WEBHOOK_TIMEOUT_MS` | Server-only | Optional | Vercel / Ops | Timeout for alert webhook dispatch. Default `4000` ms. | ## Runtime Validation (Production) @@ -51,6 +54,7 @@ The app now validates required server env vars at runtime in production via `lib - Production: `TWILIO_VALIDATE_SIGNATURE=true` is required and token-only auth is rejected - Non-production: signature mode can fall back to shared-token auth for local/dev workflows - Rate limiting defaults are tuned to avoid blocking normal Twilio/Stripe provider traffic while still throttling abusive bursts. Tune limits only if you observe false positives in logs. +- Error reporting emits structured `app.error` logs and, when configured, dispatches alert payloads to `ALERT_WEBHOOK_URL`. - `NEXT_PUBLIC_APP_URL` is the canonical value and should be set explicitly. If it is missing/invalid, the app can temporarily fall back to Vercel system env vars (`VERCEL_URL` / `VERCEL_PROJECT_PRODUCTION_URL`) to avoid auth-page crashes, but webhook/redirect behavior should still use an explicit `NEXT_PUBLIC_APP_URL`. ## Vercel: Preview vs Production diff --git a/docs/PRODUCTION_READINESS_GAPS.md b/docs/PRODUCTION_READINESS_GAPS.md index b2b63af..d25cfee 100644 --- a/docs/PRODUCTION_READINESS_GAPS.md +++ b/docs/PRODUCTION_READINESS_GAPS.md @@ -297,3 +297,50 @@ Dependencies: G4 (recommended) - `docs/PRODUCTION_READINESS_GAPS.md` - Commit SHA: - `80a23c3` + +- 2026-03-02 - G8 (DONE) + - Branch: `hardening/g8-observability` + - What changed: + - Added shared observability primitives in `lib/observability.ts`: + - request correlation ID extraction (`x-correlation-id` / `x-request-id`) + - response correlation header injection (`X-Correlation-Id`) + - centralized structured error reporting (`app.error`) + - optional alert webhook dispatch (`ALERT_WEBHOOK_URL`, optional bearer token + timeout) + - Wired Twilio webhook routes to include correlation IDs in key logs and responses: + - `app/api/twilio/voice/route.ts` + - `app/api/twilio/status/route.ts` + - `app/api/twilio/sms/route.ts` + - Wired Stripe webhook route for centralized error reporting + correlation header: + - `app/api/stripe/webhook/route.ts` + - Upgraded `lib/twilio-logging.ts` error path to feed centralized reporting (`app.error`) while preserving existing `twilio.*` logs. + - Added observability unit tests in `tests/observability.test.ts`. + - Documented alert/correlation operations in: + - `.env.example` + - `docs/PRODUCTION_ENV.md` + - `RUNBOOK.md` + - `README.md` + - Ops notes: + - Correlation IDs can now be traced from provider delivery attempts to Vercel logs. + - Alerts are wired as an optional webhook sink and do not block request handling. + - Commands run + results: + - `npm test` -> PASS (30/30) + - `npm run lint` -> PASS + - `npm run build` -> PASS + - `npm run typecheck` -> PASS + - `npm run env:check` -> PASS + - `npm run db:validate` -> PASS + - Files touched: + - `lib/observability.ts` + - `lib/twilio-logging.ts` + - `app/api/twilio/voice/route.ts` + - `app/api/twilio/status/route.ts` + - `app/api/twilio/sms/route.ts` + - `app/api/stripe/webhook/route.ts` + - `tests/observability.test.ts` + - `.env.example` + - `docs/PRODUCTION_ENV.md` + - `RUNBOOK.md` + - `README.md` + - `docs/PRODUCTION_READINESS_GAPS.md` + - Commit SHA: + - `PENDING` diff --git a/lib/observability.ts b/lib/observability.ts new file mode 100644 index 0000000..e60b6f1 --- /dev/null +++ b/lib/observability.ts @@ -0,0 +1,98 @@ +type ErrorReportPayload = { + source: string; + event: string; + correlationId: string; + error: string; + metadata: Record; + timestamp: string; +}; + +type ReportErrorInput = { + source: string; + event: string; + correlationId: string; + error?: unknown; + metadata?: Record; + alert?: boolean; +}; + +function toErrorMessage(error: unknown) { + if (error instanceof Error) return error.message; + return String(error); +} + +function sanitizeCorrelationId(value: string | null | undefined) { + const trimmed = value?.trim(); + if (!trimmed) return null; + if (trimmed.length > 128) return trimmed.slice(0, 128); + return trimmed; +} + +function generateCorrelationId() { + return `req_${Math.random().toString(36).slice(2, 10)}_${Date.now().toString(36)}`; +} + +function buildAlertPayload(payload: ErrorReportPayload) { + return { + text: `[CallbackCloser] ${payload.source}.${payload.event} (${payload.correlationId})`, + ...payload, + }; +} + +async function dispatchAlert(payload: ErrorReportPayload) { + const webhookUrl = process.env.ALERT_WEBHOOK_URL?.trim(); + if (!webhookUrl) return; + + const timeoutMsRaw = process.env.ALERT_WEBHOOK_TIMEOUT_MS?.trim(); + const timeoutMs = timeoutMsRaw ? Number.parseInt(timeoutMsRaw, 10) : 4_000; + const token = process.env.ALERT_WEBHOOK_TOKEN?.trim(); + + try { + await fetch(webhookUrl, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...(token ? { Authorization: `Bearer ${token}` } : {}), + }, + body: JSON.stringify(buildAlertPayload(payload)), + signal: AbortSignal.timeout(Number.isFinite(timeoutMs) ? Math.max(timeoutMs, 1_000) : 4_000), + }); + } catch (error) { + console.error('app.alert_dispatch_failed', { + source: payload.source, + event: payload.event, + correlationId: payload.correlationId, + error: toErrorMessage(error), + }); + } +} + +export function getCorrelationIdFromRequest(request: Pick) { + return ( + sanitizeCorrelationId(request.headers.get('x-correlation-id')) || + sanitizeCorrelationId(request.headers.get('x-request-id')) || + generateCorrelationId() + ); +} + +export function withCorrelationIdHeader(response: T, correlationId: string) { + response.headers.set('X-Correlation-Id', correlationId); + return response; +} + +export function reportApplicationError(input: ReportErrorInput) { + const errorMessage = input.error === undefined ? 'unknown_error' : toErrorMessage(input.error); + const payload: ErrorReportPayload = { + source: input.source, + event: input.event, + correlationId: input.correlationId, + error: errorMessage, + metadata: input.metadata ?? {}, + timestamp: new Date().toISOString(), + }; + + console.error('app.error', payload); + + if (input.alert === false) return; + void dispatchAlert(payload); +} diff --git a/lib/twilio-logging.ts b/lib/twilio-logging.ts index 4ce5f79..336ab25 100644 --- a/lib/twilio-logging.ts +++ b/lib/twilio-logging.ts @@ -1,3 +1,5 @@ +import { reportApplicationError } from './observability.ts'; + type TwilioLogRoute = 'voice' | 'sms' | 'status' | 'messaging' | 'webhook-auth'; type TwilioLogLevel = 'info' | 'warn' | 'error'; @@ -24,5 +26,14 @@ export function logTwilioWarn(route: TwilioLogRoute, event: string, fields: Twil } export function logTwilioError(route: TwilioLogRoute, event: string, fields: TwilioLogFields = {}, error?: unknown) { - write('error', route, event, error ? { ...fields, error: errorMessage(error) } : fields); + const payload = error ? { ...fields, error: errorMessage(error) } : fields; + const observedError = error ?? (typeof payload.error === 'string' ? payload.error : undefined); + write('error', route, event, payload); + reportApplicationError({ + source: `twilio.${route}`, + event, + correlationId: typeof fields.correlationId === 'string' ? fields.correlationId : 'n/a', + error: observedError, + metadata: payload, + }); } diff --git a/tests/observability.test.ts b/tests/observability.test.ts new file mode 100644 index 0000000..41d7c9e --- /dev/null +++ b/tests/observability.test.ts @@ -0,0 +1,42 @@ +import assert from 'node:assert/strict'; +import test from 'node:test'; + +import { getCorrelationIdFromRequest, reportApplicationError, withCorrelationIdHeader } from '../lib/observability.ts'; + +test('getCorrelationIdFromRequest prefers explicit x-correlation-id header', () => { + const request = new Request('https://example.com', { + headers: { + 'x-correlation-id': 'corr-explicit-123', + 'x-request-id': 'req-fallback-456', + }, + }); + + assert.equal(getCorrelationIdFromRequest(request), 'corr-explicit-123'); +}); + +test('getCorrelationIdFromRequest falls back to x-request-id when needed', () => { + const request = new Request('https://example.com', { + headers: { + 'x-request-id': 'req-fallback-456', + }, + }); + + assert.equal(getCorrelationIdFromRequest(request), 'req-fallback-456'); +}); + +test('withCorrelationIdHeader sets response header', () => { + const response = withCorrelationIdHeader(new Response('ok'), 'corr-abc'); + assert.equal(response.headers.get('x-correlation-id'), 'corr-abc'); +}); + +test('reportApplicationError is safe when alerts are disabled', () => { + reportApplicationError({ + source: 'test.route', + event: 'synthetic_error', + correlationId: 'corr-test', + error: new Error('boom'), + alert: false, + }); + + assert.equal(true, true); +}); From 31fc23b6963e037b6c8606970f20e1e171cc5675 Mon Sep 17 00:00:00 2001 From: DevCalebR Date: Sun, 1 Mar 2026 21:12:08 -0500 Subject: [PATCH 2/4] docs: record G8 commit sha in gap changelog --- docs/PRODUCTION_READINESS_GAPS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/PRODUCTION_READINESS_GAPS.md b/docs/PRODUCTION_READINESS_GAPS.md index d25cfee..f762ae9 100644 --- a/docs/PRODUCTION_READINESS_GAPS.md +++ b/docs/PRODUCTION_READINESS_GAPS.md @@ -343,4 +343,4 @@ Dependencies: G4 (recommended) - `README.md` - `docs/PRODUCTION_READINESS_GAPS.md` - Commit SHA: - - `PENDING` + - `119c217` From 334eaaa5bae26bd8a6024e7945b89483d03b7d60 Mon Sep 17 00:00:00 2001 From: DevCalebR Date: Sun, 1 Mar 2026 23:22:34 -0500 Subject: [PATCH 3/4] ci: use valid Clerk placeholder key in non-prod builds --- .github/workflows/ci.yml | 2 +- app/layout.tsx | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 870f441..737f197 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: NEXT_PUBLIC_APP_URL: https://example.com DATABASE_URL: postgresql://postgres:postgres@localhost:5432/callbackcloser?sslmode=require DIRECT_DATABASE_URL: postgresql://postgres:postgres@localhost:5432/callbackcloser?sslmode=require - NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY: pk_test_placeholder + NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY: pk_test_Y2xlcmsuZXhhbXBsZS5jb20k CLERK_SECRET_KEY: sk_test_placeholder STRIPE_SECRET_KEY: sk_test_placeholder STRIPE_WEBHOOK_SECRET: whsec_placeholder diff --git a/app/layout.tsx b/app/layout.tsx index 0266fed..6e718a7 100644 --- a/app/layout.tsx +++ b/app/layout.tsx @@ -6,6 +6,8 @@ import { validateServerEnv } from '@/lib/env.server'; import './globals.css'; +const CLERK_PREVIEW_FALLBACK_KEY = 'pk_test_Y2xlcmsuZXhhbXBsZS5jb20k'; + const manrope = Manrope({ subsets: ['latin'], variable: '--font-sans', @@ -16,11 +18,30 @@ export const metadata: Metadata = { description: 'Missed Call -> Booked Job SMS follow-up', }; +function isLikelyValidClerkPublishableKey(value: string) { + return /^pk_(test|live)_[A-Za-z0-9+/=_-]+$/.test(value); +} + +function resolveClerkPublishableKey() { + const configured = process.env.NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY?.trim() ?? ''; + if (configured && isLikelyValidClerkPublishableKey(configured)) { + return configured; + } + + const allowPreviewFallback = process.env.NODE_ENV !== 'production' || process.env.VERCEL_ENV === 'preview'; + if (allowPreviewFallback) { + return CLERK_PREVIEW_FALLBACK_KEY; + } + + return configured; +} + export default function RootLayout({ children }: { children: React.ReactNode }) { validateServerEnv(); + const clerkPublishableKey = resolveClerkPublishableKey(); return ( - + {children} From 93a48dba4af18d733a4975bed01e642c6a237d0f Mon Sep 17 00:00:00 2001 From: DevCalebR Date: Sun, 1 Mar 2026 23:29:39 -0500 Subject: [PATCH 4/4] ci: run verify job on Node 22 for strip-types support --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 737f197..664ea91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: - uses: actions/setup-node@v4 with: - node-version: 20 + node-version: 22 cache: npm - run: npm ci