Token Budget Management: Setting and Enforcing Per-User and Per-Request Limits
Build a token budget management system with per-user quotas, per-request limits, enforcement middleware, and graceful degradation. Prevent cost overruns while maintaining service quality for your AI agents.
Why Token Budgets Are Essential
Without token budgets, a single bad prompt or a burst of traffic can consume your entire monthly LLM budget in hours. Unlike traditional API rate limiting (which caps request count), token budgets cap the actual cost driver: token consumption. A rate limit of 100 requests per minute does not prevent a single request from consuming 100,000 tokens.
Token budget management gives you three levels of control: per-request limits (prevent individual runaway calls), per-user quotas (fair resource allocation), and system-wide budgets (total spend caps).
Per-Request Token Limits
from dataclasses import dataclass
from typing import Optional
@dataclass
class TokenBudget:
    """Token ceilings applied to a single LLM request."""

    # Largest accepted prompt (input) size, in tokens.
    max_input_tokens: int = 8000

    # Largest completion (output) size, in tokens.
    max_output_tokens: int = 2000

    # Overall token budget for the request.
    max_total_tokens: int = 10000
# Per-request token budgets, keyed by subscription tier name.
TIER_BUDGETS = {
    "free": TokenBudget(
        max_input_tokens=2000,
        max_output_tokens=500,
        max_total_tokens=2500,
    ),
    "pro": TokenBudget(
        max_input_tokens=8000,
        max_output_tokens=2000,
        max_total_tokens=10000,
    ),
    "enterprise": TokenBudget(
        max_input_tokens=32000,
        max_output_tokens=4000,
        max_total_tokens=36000,
    ),
}
class TokenBudgetEnforcer:
    """Check a single request against the per-request budget of its tier."""

    def validate_request(
        self,
        estimated_input_tokens: int,
        tier: str = "pro",
    ) -> dict:
        """Decide whether a request fits within its tier's token budget.

        Args:
            estimated_input_tokens: Estimated size of the prompt, in tokens.
            tier: Key into ``TIER_BUDGETS``. Unknown tiers use the "free"
                budget.

        Returns:
            On rejection: ``{"allowed": False, "reason", "suggestion"}``.
            On success: ``{"allowed": True, "max_output_tokens",
            "remaining_budget"}``, where ``max_output_tokens`` never exceeds
            what is left of the tier's total budget.
        """
        budget = TIER_BUDGETS.get(tier, TIER_BUDGETS["free"])

        if estimated_input_tokens > budget.max_input_tokens:
            return {
                "allowed": False,
                "reason": f"Input tokens ({estimated_input_tokens}) exceed "
                          f"limit ({budget.max_input_tokens})",
                "suggestion": "Reduce context length or upgrade plan",
            }

        remaining = budget.max_total_tokens - estimated_input_tokens
        # Enforce the total budget as well as the output cap: when
        # max_total_tokens < max_input_tokens + max_output_tokens, the
        # output allowance has to shrink to whatever is actually left.
        output_cap = max(0, min(budget.max_output_tokens, remaining))
        return {
            "allowed": True,
            "max_output_tokens": output_cap,
            "remaining_budget": remaining,
        }
Per-User Quota System
Track cumulative token usage per user with rolling windows (daily, monthly) and enforce quotas.
flowchart TD
START["Token Budget Management: Setting and Enforcing Pe…"] --> A
A["Why Token Budgets Are Essential"]
A --> B
B["Per-Request Token Limits"]
B --> C
C["Per-User Quota System"]
C --> D
D["FastAPI Middleware for Budget Enforceme…"]
D --> E
E["Graceful Degradation"]
E --> F
F["Budget Alerts"]
F --> G
G["FAQ"]
G --> DONE["Key Takeaways"]
style START fill:#4f46e5,stroke:#4338ca,color:#fff
style DONE fill:#059669,stroke:#047857,color:#fff
import time
from collections import defaultdict
from typing import Dict
class UserQuotaManager:
    """Track per-user token usage in memory and enforce rolling quotas.

    Each recorded call is stored as a ``{"tokens", "timestamp"}`` entry in a
    per-user list. Quotas are checked against the sum of entries inside a
    rolling 24-hour window and a rolling 30-day window.

    NOTE: entries are never pruned, so the usage lists grow with every
    recorded call.
    """

    DAILY_WINDOW_SECONDS = 86400
    MONTHLY_WINDOW_SECONDS = 86400 * 30
    # Applied to users that have no explicit quota set. Treated as read-only.
    DEFAULT_QUOTA = {"daily": 100_000, "monthly": 2_000_000}

    def __init__(self):
        self.usage: Dict[str, list] = defaultdict(list)
        self.quotas: Dict[str, dict] = {}

    def set_quota(self, user_id: str, daily_tokens: int, monthly_tokens: int):
        """Set (or replace) the daily and monthly token quota for a user."""
        self.quotas[user_id] = {
            "daily": daily_tokens,
            "monthly": monthly_tokens,
        }

    def record_usage(self, user_id: str, tokens: int):
        """Append a usage entry for ``user_id`` stamped with the current time."""
        self.usage[user_id].append({
            "tokens": tokens,
            "timestamp": time.time(),
        })

    def get_usage(self, user_id: str, window_seconds: int) -> int:
        """Return the tokens recorded for ``user_id`` in the last ``window_seconds``."""
        cutoff = time.time() - window_seconds
        # .get() avoids creating an empty defaultdict entry for unknown users.
        entries = self.usage.get(user_id, [])
        return sum(e["tokens"] for e in entries if e["timestamp"] > cutoff)

    def check_quota(self, user_id: str, requested_tokens: int) -> dict:
        """Check whether ``requested_tokens`` fits in the user's quotas.

        The daily quota is checked before the monthly one.

        Returns:
            On rejection: ``{"allowed": False, "reason", "used", "limit",
            "resets_in_seconds"}``.
            On success: ``{"allowed": True, "daily_remaining",
            "monthly_remaining"}``, both computed as if the request had
            already been spent.
        """
        quota = self.quotas.get(user_id, self.DEFAULT_QUOTA)
        daily_used = self.get_usage(user_id, self.DAILY_WINDOW_SECONDS)
        monthly_used = self.get_usage(user_id, self.MONTHLY_WINDOW_SECONDS)

        if daily_used + requested_tokens > quota["daily"]:
            return {
                "allowed": False,
                "reason": "daily_quota_exceeded",
                "used": daily_used,
                "limit": quota["daily"],
                "resets_in_seconds": self._next_reset(
                    user_id, self.DAILY_WINDOW_SECONDS
                ),
            }

        if monthly_used + requested_tokens > quota["monthly"]:
            return {
                "allowed": False,
                "reason": "monthly_quota_exceeded",
                "used": monthly_used,
                "limit": quota["monthly"],
                "resets_in_seconds": self._next_reset(
                    user_id, self.MONTHLY_WINDOW_SECONDS
                ),
            }

        return {
            "allowed": True,
            "daily_remaining": quota["daily"] - daily_used - requested_tokens,
            "monthly_remaining": quota["monthly"] - monthly_used - requested_tokens,
        }

    def _next_reset(self, user_id: str, window: int) -> int:
        """Return seconds until the oldest in-window entry leaves the window.

        This is when usage first starts to drop, not necessarily when the
        request will fit. Returns 0 when the user has no entries inside the
        window (for example when the request alone exceeds the quota).
        """
        now = time.time()  # read once so the filter and the result agree
        cutoff = now - window
        oldest_in_window = min(
            (
                e["timestamp"]
                for e in self.usage.get(user_id, [])
                if e["timestamp"] > cutoff
            ),
            # Entries may exist but all be older than the window; a bare
            # min() would raise ValueError on the empty sequence.
            default=None,
        )
        if oldest_in_window is None:
            return 0
        return max(0, int(oldest_in_window + window - now))
FastAPI Middleware for Budget Enforcement
from fastapi import Request, HTTPException
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import JSONResponse
class TokenBudgetMiddleware(BaseHTTPMiddleware):
    """Enforce per-user token quotas on requests under ``/api/agent``.

    Other paths pass through untouched. For agent paths the quota is checked
    before the request is handled, and usage is recorded afterwards.

    NOTE(security): ``X-User-ID`` and ``X-Estimated-Tokens`` are read from the
    incoming request and are therefore client-controlled. Presumably an
    upstream auth layer sets or validates them; confirm before relying on
    this for enforcement.
    """

    def __init__(self, app, quota_manager: UserQuotaManager):
        super().__init__(app)
        self.quota_manager = quota_manager

    @staticmethod
    def _parse_token_count(raw, default: int) -> int:
        """Parse a header value as a non-negative int, else return ``default``."""
        try:
            value = int(raw)
        except (TypeError, ValueError):
            # Missing or malformed header: fall back rather than fail with a 500.
            return default
        # A negative count would lower the checked or recorded usage.
        return value if value >= 0 else default

    async def dispatch(self, request: Request, call_next):
        """Check the caller's quota, run the request, then record token usage.

        Returns a 429 JSON response (``{"detail": {...}}``) when the quota
        check fails; otherwise returns the downstream response.
        """
        if not request.url.path.startswith("/api/agent"):
            return await call_next(request)

        user_id = request.headers.get("X-User-ID", "anonymous")
        estimated_tokens = self._parse_token_count(
            request.headers.get("X-Estimated-Tokens"), default=1000
        )

        check = self.quota_manager.check_quota(user_id, estimated_tokens)
        if not check["allowed"]:
            # Return the response directly: an HTTPException raised inside
            # BaseHTTPMiddleware is not seen by the app's exception handlers
            # and would reach the client as a 500.
            retry_after = check.get("resets_in_seconds")
            headers = (
                {"Retry-After": str(retry_after)}
                if retry_after is not None
                else None
            )
            return JSONResponse(
                status_code=429,
                content={
                    "detail": {
                        "error": "token_quota_exceeded",
                        "reason": check["reason"],
                        "used": check.get("used"),
                        "limit": check.get("limit"),
                    },
                },
                headers=headers,
            )

        response = await call_next(request)
        # Use the handler-reported count when present, else the estimate.
        actual_tokens = self._parse_token_count(
            response.headers.get("X-Tokens-Used"), default=estimated_tokens
        )
        self.quota_manager.record_usage(user_id, actual_tokens)
        return response
Graceful Degradation
When a user approaches their quota, degrade gracefully instead of cutting off service entirely.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
class GracefulDegradation:
    """Choose a model configuration based on how much daily quota is used."""

    def __init__(self, quota_manager: UserQuotaManager):
        self.quota_manager = quota_manager

    def get_degraded_config(self, user_id: str) -> dict:
        """Return the model settings ``user_id`` should get right now.

        Below 70% daily usage the full configuration is returned, below 90%
        a reduced one, and otherwise a minimal one. When the quota check
        fails, a configuration with no model is returned.
        """
        # A zero-token probe reports the current state without reserving anything.
        status = self.quota_manager.check_quota(user_id, 0)
        if not status["allowed"]:
            return {"model": None, "max_tokens": 0, "message": "Quota exceeded"}

        remaining_today = status.get("daily_remaining", 0)
        user_quota = self.quota_manager.quotas.get(user_id, {})
        daily_limit = user_quota.get("daily", 100_000)

        # A falsy limit counts as fully used and avoids dividing by zero.
        if daily_limit:
            usage_pct = 1 - (remaining_today / daily_limit)
        else:
            usage_pct = 1

        if usage_pct < 0.70:
            return {"model": "gpt-4o", "max_tokens": 2000, "tier": "full"}
        if usage_pct < 0.90:
            return {"model": "gpt-4o-mini", "max_tokens": 1000, "tier": "reduced"}
        return {"model": "gpt-4o-mini", "max_tokens": 500, "tier": "minimal"}
Budget Alerts
class BudgetAlertSystem:
    """Emit one alert per user for each usage threshold that is crossed.

    Thresholds that have already fired for a user are remembered and are not
    reported again until ``reset`` is called.
    """

    def __init__(self, thresholds: Optional[list[float]] = None):
        """Create the alert system.

        Args:
            thresholds: Usage ratios that trigger an alert. ``None`` or an
                empty list selects the defaults (50%, 75%, 90%, 100%).
        """
        self.thresholds = thresholds or [0.50, 0.75, 0.90, 1.00]
        # user_id -> thresholds that have already fired for that user.
        self.alerted: dict[str, set] = defaultdict(set)

    def check_alerts(self, user_id: str, used: int, limit: int) -> list[str]:
        """Return alert messages for thresholds newly reached by ``user_id``.

        One message is produced per newly reached threshold, so a jump across
        several thresholds yields several messages. A non-positive ``limit``
        is treated as 100% used.
        """
        ratio = used / limit if limit > 0 else 1.0
        fired = self.alerted[user_id]
        alerts = []
        for threshold in self.thresholds:
            if ratio >= threshold and threshold not in fired:
                fired.add(threshold)
                alerts.append(
                    f"User {user_id} has used {ratio:.0%} of token budget "
                    f"({used:,} / {limit:,} tokens)"
                )
        return alerts

    def reset(self, user_id: Optional[str] = None) -> None:
        """Forget fired thresholds so alerts can be raised again.

        Call this when a budget period rolls over; without it a threshold
        fires at most once per user for the lifetime of this object.

        Args:
            user_id: User to reset, or ``None`` to reset every user.
        """
        if user_id is None:
            self.alerted.clear()
        else:
            self.alerted.pop(user_id, None)
FAQ
How do I estimate token count before sending a request?
Use the tiktoken library for accurate counts with OpenAI models: len(tiktoken.encoding_for_model("gpt-4o").encode(text)). For a fast approximation without dependencies, divide the word count by 0.75 (roughly 1.33 tokens per English word). For typical English prose, the approximation is usually within 10–15% of the actual count; code and non-English text can deviate more.
Should I enforce budgets on the client side or server side?
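Always enforce on the server side — client-side checks are easily bypassed. You can add client-side estimation for a better user experience (showing remaining quota in the UI), but the server must be the authority. The middleware pattern shown above ensures every request to the agent endpoints passes through budget validation. In production, take the user ID from the authenticated session and compute the token estimate on the server rather than trusting client-supplied headers such as X-User-ID and X-Estimated-Tokens.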
How do I handle token budgets for multi-turn conversations?
Track cumulative tokens across the conversation, not just per-message. Each turn adds the full conversation history as input tokens plus the new output. Set a conversation-level budget (for example, 50,000 total tokens) and either summarize history or end the conversation when the budget is reached.
#TokenBudget #RateLimiting #CostControls #Middleware #UsageManagement #AgenticAI #LearnAI #AIEngineering
Written by
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.