Precise dual rate limiting for LLM APIs (RPM + TPM)
@aid-on/llm-throttle is a high-precision rate limiting library specialized for LLM API calls. It enforces both RPM (Requests Per Minute) and TPM (Tokens Per Minute) limits at the same time, making efficient use of your API quota.
- Dual Rate Limiting: Simultaneously manages both RPM and TPM
- Token Bucket Algorithm: Smoothed rate limiting with burst handling (see the sketch after this list)
- Real-time Adjustment: Post-adjustment based on actual token consumption
- Detailed Metrics: Usage visualization and efficiency tracking
- Full TypeScript Support: Type-safe development experience
- Zero Dependencies: Lightweight design with no external library dependencies
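
Both limits are enforced with the token bucket technique: each bucket refills continuously at the configured per-minute rate and never holds more than its capacity, which is what allows short bursts while keeping the long-run rate at the limit. The sketch below is illustrative only (it is not the library's internal code) and assumes a simple capacity/refill model:

```typescript
// Illustrative token bucket (not the library's internals). The bucket refills
// continuously at `refillPerMs` and is capped at `capacity`.
class TokenBucket {
  private level: number;
  private lastRefill: number;

  constructor(
    private capacity: number,              // e.g. burstTPM (burst ceiling)
    private refillPerMs: number,           // e.g. tpm / 60_000 (steady rate)
    private now: () => number = Date.now
  ) {
    this.level = capacity;
    this.lastRefill = this.now();
  }

  tryConsume(amount: number): boolean {
    // Top the bucket up for the time elapsed since the last call, then try to spend.
    const t = this.now();
    this.level = Math.min(this.capacity, this.level + (t - this.lastRefill) * this.refillPerMs);
    this.lastRefill = t;
    if (this.level < amount) return false;
    this.level -= amount;
    return true;
  }
}
```

In this reading, `burstRPM`/`burstTPM` play the role of bucket capacity and `rpm`/`tpm` the refill rate; `LLMThrottle` keeps one bucket for requests and one for tokens and admits a call only when both have room.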
Install from npm:

```bash
npm install @aid-on/llm-throttle
```

Create a limiter with your per-minute budgets, reserve an estimated token count before each call, and reconcile with the actual usage afterwards:

```typescript
import { LLMThrottle } from '@aid-on/llm-throttle';

// Configure rate limits
const limiter = new LLMThrottle({
  rpm: 60,    // 60 requests per minute
  tpm: 10000  // 10,000 tokens per minute
});

// Check before request
const requestId = 'unique-request-id';
const estimatedTokens = 1500;

if (limiter.consume(requestId, estimatedTokens)) {
  // Execute API call
  const response = await callLLMAPI();

  // Adjust with actual token usage
  const actualTokens = response.usage.total_tokens;
  limiter.adjustConsumption(requestId, actualTokens);
} else {
  console.log('Rate limit reached');
}
```

Short bursts above the steady per-minute rate can be allowed with `burstRPM` and `burstTPM`:

```typescript
const limiter = new LLMThrottle({
  rpm: 60,
  tpm: 10000,
  burstRPM: 120,   // Allow up to 120 requests in short bursts
  burstTPM: 20000  // Allow up to 20,000 tokens in short bursts
});
```

If you prefer exceptions, `consumeOrThrow()` throws a `RateLimitError` that reports which limit was hit and when capacity will be available:

```typescript
import { RateLimitError } from '@aid-on/llm-throttle';

try {
  limiter.consumeOrThrow(requestId, estimatedTokens);
  // API call processing
} catch (error) {
  if (error instanceof RateLimitError) {
    console.log(`Limit reason: ${error.reason}`);
    console.log(`Available in: ${error.availableIn}ms`);
  }
}
```

Inspect current usage and estimation accuracy with `getMetrics()`:

```typescript
const metrics = limiter.getMetrics();

console.log('RPM usage:', metrics.rpm.percentage + '%');
console.log('TPM usage:', metrics.tpm.percentage + '%');
console.log('Average tokens/request:', metrics.consumptionHistory.averageTokensPerRequest);
console.log('Estimation accuracy:', metrics.efficiency);
```

To check the limits without consuming anything, call `canProcess()` first:

```typescript
const check = limiter.canProcess(estimatedTokens);

if (check.allowed) {
  // Can process
  limiter.consume(requestId, estimatedTokens);
} else {
  console.log(`Limit reason: ${check.reason}`);
  console.log(`Available in: ${check.availableIn}ms`);
}
```

Constructor: `new LLMThrottle(config: DualRateLimitConfig)`

Methods:

- `canProcess(estimatedTokens: number): RateLimitCheckResult` - Check if processing is possible
- `consume(requestId: string, estimatedTokens: number, metadata?: Record<string, unknown>): boolean` - Consume tokens
- `consumeOrThrow(requestId: string, estimatedTokens: number, metadata?: Record<string, unknown>): void` - Throw an error on consumption failure
- `adjustConsumption(requestId: string, actualTokens: number): void` - Adjust with actual consumption
- `getMetrics(): RateLimitMetrics` - Get usage metrics
- `getConsumptionHistory(): ConsumptionRecord[]` - Get consumption history
- `reset(): void` - Reset limiter state
- `setHistoryRetention(ms: number): void` - Set history retention period
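
Since the check result reports how long until capacity frees up, a common pattern is to wait for `availableIn` and retry. A minimal sketch; the `withThrottle` helper below is hypothetical and not part of the package:

```typescript
import { LLMThrottle } from '@aid-on/llm-throttle';

// Hypothetical helper: pre-check, wait out the reported delay once,
// then reserve the estimate and run the call.
async function withThrottle<T>(
  limiter: LLMThrottle,
  requestId: string,
  estimatedTokens: number,
  call: () => Promise<T>
): Promise<T> {
  const check = limiter.canProcess(estimatedTokens);
  if (!check.allowed && check.availableIn !== undefined) {
    // availableIn is reported in milliseconds, as in the examples above.
    const delay = check.availableIn;
    await new Promise((resolve) => setTimeout(resolve, delay));
  }
  if (!limiter.consume(requestId, estimatedTokens)) {
    throw new Error(`Still rate limited for request ${requestId}`);
  }
  return call();
}
```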
```typescript
interface DualRateLimitConfig {
  rpm: number;
  tpm: number;
  burstRPM?: number;
  burstTPM?: number;
  clock?: () => number;
}
```
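
The optional `clock` field makes the limiter's time source injectable. Assuming it returns a millisecond timestamp that drives refill (an assumption, not documented behavior), it can be used for deterministic tests along these lines:

```typescript
import { LLMThrottle } from '@aid-on/llm-throttle';

// Fake, manually advanced clock for a deterministic test.
let now = 0;
const limiter = new LLMThrottle({
  rpm: 60,
  tpm: 10000,
  clock: () => now
});

// Exhaust the 60-requests-per-minute budget.
for (let i = 0; i < 60; i++) {
  limiter.consume(`req-${i}`, 100);
}
console.log(limiter.canProcess(100).allowed); // expected: false (rpm_limit)

// Jump one minute ahead; both budgets should be available again.
now += 60_000;
console.log(limiter.canProcess(100).allowed); // expected: true
```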
```typescript
interface RateLimitCheckResult {
  allowed: boolean;
  reason?: 'rpm_limit' | 'tpm_limit';
  availableIn?: number;
  availableTokens?: {
    rpm: number;
    tpm: number;
  };
}

interface RateLimitMetrics {
  rpm: {
    used: number;
    available: number;
    limit: number;
    percentage: number;
  };
  tpm: {
    used: number;
    available: number;
    limit: number;
    percentage: number;
  };
  efficiency: number;
  consumptionHistory: {
    count: number;
    averageTokensPerRequest: number;
    totalTokens: number;
  };
}
```

Throttling chat completions made with the official OpenAI SDK:

```typescript
import OpenAI from 'openai';
import { LLMThrottle } from '@aid-on/llm-throttle';
const openai = new OpenAI();
const limiter = new LLMThrottle({
  rpm: 500,  // Example OpenAI Tier 1 limits
  tpm: 10000
});
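
// Hypothetical token estimator (not part of the package): a rough
// characters-per-token heuristic. Use a real tokenizer such as tiktoken
// when accurate counts matter.
function estimateTokens(messages: any[]): number {
  const text = messages.map((m) => String(m.content ?? '')).join(' ');
  return Math.ceil(text.length / 4); // ~4 characters per token for English text
}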
async function chatCompletion(messages: any[], requestId: string) {
  const estimatedTokens = estimateTokens(messages); // Custom estimation logic

  if (!limiter.consume(requestId, estimatedTokens)) {
    throw new Error('Rate limit reached');
  }

  try {
    const response = await openai.chat.completions.create({
      model: 'gpt-3.5-turbo',
      messages
    });

    // Adjust with actual usage
    const actualTokens = response.usage?.total_tokens || estimatedTokens;
    limiter.adjustConsumption(requestId, actualTokens);

    return response;
  } catch (error) {
    // Adjust the record to zero actual tokens if the call fails
    limiter.adjustConsumption(requestId, 0);
    throw error;
  }
}
```

Managing multiple providers with independent limiters:

```typescript
import { LLMThrottle, RateLimitError } from '@aid-on/llm-throttle';

class APIManager {
  private limiters = new Map<string, LLMThrottle>();

  constructor() {
    // Service-specific limit configuration
    this.limiters.set('openai', new LLMThrottle({
      rpm: 500, tpm: 10000
    }));
    this.limiters.set('anthropic', new LLMThrottle({
      rpm: 1000, tpm: 20000
    }));
  }

  async callAPI(service: string, requestId: string, estimatedTokens: number) {
    const limiter = this.limiters.get(service);
    if (!limiter) throw new Error(`Unknown service: ${service}`);

    const check = limiter.canProcess(estimatedTokens);
    if (!check.allowed) {
      throw new RateLimitError(
        `Rate limit exceeded for ${service}: ${check.reason}`,
        check.reason!,
        check.availableIn!
      );
    }

    limiter.consume(requestId, estimatedTokens);
    // API call processing...
  }
}
```

Run the test suite with:

```bash
npm test
```

MIT License