How to Build Token Budget Management Systems
Set per-team and per-project AI token budgets with automatic alerts.
Jay Banlasan
The AI Systems Guy
One team at a client's company ran an experimental AI workflow on a Friday afternoon and burned through their entire month's token allocation by Saturday morning. No alert fired. No one noticed until the invoice came. Building AI token budget management and tracking stops this from happening. Every team gets a budget, every call counts against it, and when they're running low the system tells them before it becomes a problem.
Token budgets do more than prevent cost overruns. They force teams to think carefully about which AI tasks are worth running. When tokens have a visible ceiling, people stop automating low-value work just because they can.
What You Need Before Starting
- Python 3.10+
- Redis for real-time counters (`pip install redis`)
- SQLite for historical records
- Your AI provider's SDK
Step 1: Define the Budget Schema
Budgets live at three levels: global (total spend cap), team (per department), and project (per workflow).
import json
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class TokenBudget:
    """A monthly token allowance for one scope (global, team, or project).

    Raises:
        ValueError: if ``monthly_token_limit`` is not positive or
            ``alert_threshold`` is outside ``[0, 1]``.
    """

    id: str  # e.g. "team:marketing" or "project:lead-enrichment"
    name: str
    monthly_token_limit: int  # total tokens (input + output)
    alert_threshold: float  # 0.8 = alert at 80%
    hard_cap: bool  # True = block requests at limit, False = alert only
    owner_email: Optional[str] = None
    notes: Optional[str] = None

    def __post_init__(self) -> None:
        # Usage is divided by the limit wherever a percentage is computed,
        # so a non-positive limit would surface later as ZeroDivisionError
        # (or a nonsensical negative percentage). Fail at definition time.
        if self.monthly_token_limit <= 0:
            raise ValueError(
                f"monthly_token_limit must be positive, got {self.monthly_token_limit!r}"
            )
        # The threshold is a fraction, not a percentage: a value like 80
        # would never be reached and the alert would silently never fire.
        if not 0.0 <= self.alert_threshold <= 1.0:
            raise ValueError(
                f"alert_threshold must be between 0 and 1, got {self.alert_threshold!r}"
            )
# Example budgets: two advisory team budgets, one hard-capped batch project,
# and the hard-capped global ceiling.
# NOTE(review): the owner_email values look like redacted placeholders —
# replace them with real addresses before relying on email alerts.
BUDGETS = [
    TokenBudget(
        id="team:marketing",
        name="Marketing Team",
        monthly_token_limit=5_000_000,
        alert_threshold=0.8,
        hard_cap=False,
        owner_email="[email protected]",
    ),
    TokenBudget(
        id="team:engineering",
        name="Engineering Team",
        monthly_token_limit=20_000_000,
        alert_threshold=0.9,
        hard_cap=False,
        owner_email="[email protected]",
    ),
    TokenBudget(
        id="project:nightly",
        name="Nightly Enrichment",
        monthly_token_limit=10_000_000,
        alert_threshold=0.85,
        hard_cap=True,
        owner_email=None,
    ),
    TokenBudget(
        id="global",
        name="Global Cap",
        monthly_token_limit=50_000_000,
        alert_threshold=0.9,
        hard_cap=True,
        owner_email="[email protected]",
    ),
]
The hard_cap flag is important. Some budgets should be advisory (alert but don't block). Others, like nightly batch jobs, should stop cold when they hit the limit to prevent surprise overruns.
Step 2: Build Redis-Based Real-Time Counters
Redis gives you atomic increments with near-zero latency. This is what makes per-request budget enforcement practical.
import logging
from datetime import datetime, timezone

import redis
# Shared Redis client used by the budget counters and the alert markers.
r = redis.Redis(
    host="localhost",
    port=6379,
    db=1,
)
def month_key(budget_id: str) -> str:
month = datetime.utcnow().strftime("%Y-%m")
return f"budget:{budget_id}:{month}"
def add_tokens(budget_id: str, tokens: int) -> int:
    """Add ``tokens`` to this month's counter for ``budget_id``.

    The increment and the TTL refresh are sent in a single pipeline so they
    cost one round trip and the key is never left without an expiry if the
    process stops between the two commands.

    Args:
        budget_id: Budget whose counter is incremented.
        tokens: Number of tokens to add.

    Returns:
        The counter value after the increment.
    """
    key = month_key(budget_id)
    with r.pipeline() as pipe:
        pipe.incrby(key, tokens)
        # The TTL is refreshed on every write: the key expires 60 days
        # after the last increment, comfortably past the end of its month.
        pipe.expire(key, 60 * 24 * 3600)
        new_total, _ = pipe.execute()
    return new_total
def get_usage(budget_id: str) -> int:
    """Return the tokens counted so far this month for ``budget_id``.

    Yields 0 when Redis holds no value for the current month's key.
    """
    raw = r.get(month_key(budget_id))
    if not raw:
        return 0
    return int(raw)
def reset_budget(budget_id: str):
    """Delete the current month's counter for ``budget_id``.

    Manual reset — use sparingly.
    """
    r.delete(month_key(budget_id))
Step 3: Build the Budget Enforcement Layer
This wraps every AI call and enforces limits before and after.
import anthropic
import os
# Module-level Anthropic client used by ai_call_with_budget.
# NOTE(review): constructed with no arguments — presumably the SDK picks up
# its API key from the environment; confirm against the SDK documentation.
_ai = anthropic.Anthropic()
class BudgetExceededError(Exception):
    """Raised when a hard-capped budget is already exhausted for the month."""
def get_budget_by_id(budget_id: str) -> TokenBudget | None:
    """Return the first entry of ``BUDGETS`` whose ``id`` equals ``budget_id``.

    Returns ``None`` when no budget matches.
    """
    for candidate in BUDGETS:
        if candidate.id == budget_id:
            return candidate
    return None
def ai_call_with_budget(prompt: str, budget_id: str,
                        model: str = "claude-haiku-3") -> str:
    """Send ``prompt`` to the model and charge the tokens to ``budget_id``.

    Every call is counted against the named budget and against the
    ``"global"`` counter. Before the request, any of those budgets that is
    hard-capped and already at or over its limit blocks the call. After the
    request, each budget that has reached its alert threshold is passed to
    ``send_budget_alert``. Because the check happens before the request, a
    single call can still push a budget past its limit.

    Args:
        prompt: Text sent as the single user message.
        budget_id: ``id`` of a budget listed in ``BUDGETS``.
        model: Model identifier handed to the SDK unchanged.
            NOTE(review): confirm the default is an identifier the
            provider currently accepts.

    Returns:
        The text of the first content block in the response.

    Raises:
        ValueError: if ``budget_id`` matches no configured budget.
        BudgetExceededError: if the named budget or the global budget is
            hard-capped and exhausted.
    """
    global_id = "global"

    budget = get_budget_by_id(budget_id)
    if not budget:
        raise ValueError(f"Unknown budget: {budget_id}")

    # The global budget is checked alongside the named one. Without this, a
    # team on an advisory budget could spend straight through the global
    # hard cap, and the global alert threshold would never be evaluated.
    watched = [budget]
    if budget_id != global_id:
        global_budget = get_budget_by_id(global_id)
        if global_budget:
            watched.append(global_budget)

    # Pre-flight check
    for b in watched:
        current = get_usage(b.id)
        if b.hard_cap and current >= b.monthly_token_limit:
            raise BudgetExceededError(
                f"Budget '{b.name}' is exhausted "
                f"({current:,} / {b.monthly_token_limit:,} tokens)"
            )

    # Make the call
    response = _ai.messages.create(
        model=model,
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    )
    used = response.usage.input_tokens + response.usage.output_tokens

    # Update counters for this budget AND global. When the caller charges
    # the global budget directly, increment it once rather than twice.
    new_totals = {budget_id: add_tokens(budget_id, used)}
    if budget_id != global_id:
        new_totals[global_id] = add_tokens(global_id, used)

    # Check alert thresholds after the call
    for b in watched:
        new_total = new_totals[b.id]
        new_pct = new_total / b.monthly_token_limit
        if new_pct >= b.alert_threshold:
            send_budget_alert(b, new_total, new_pct)

    return response.content[0].text
Step 4: Build the Alert System
Alerts fire once when the threshold is crossed, not on every subsequent request.
import requests, smtplib
from email.mime.text import MIMEText
def alert_already_sent(budget_id: str) -> bool:
    """Return True if this month's alert marker exists for ``budget_id``."""
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    month = datetime.now(timezone.utc).strftime('%Y-%m')
    key = f"budget:alert_sent:{budget_id}:{month}"
    # The client reports EXISTS as a count of matching keys; coerce it so
    # the function really returns the bool its signature promises.
    return bool(r.exists(key))
def mark_alert_sent(budget_id: str):
    """Record that this month's alert for ``budget_id`` has been sent.

    The marker key includes the current UTC month, so a new month starts
    with no marker regardless of the TTL.
    """
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    month = datetime.now(timezone.utc).strftime('%Y-%m')
    key = f"budget:alert_sent:{budget_id}:{month}"
    r.setex(key, 35 * 24 * 3600, "1")  # 35-day TTL outlives any month
# Alert delivery settings, read once from the environment at import time.
# Slack delivery is skipped when SLACK_WEBHOOK is unset; email delivery is
# skipped when GMAIL_ADDRESS is unset.
SLACK_WEBHOOK = os.environ.get("SLACK_WEBHOOK_URL")
GMAIL_ADDRESS = os.environ.get("GMAIL_ADDRESS")
GMAIL_PASSWORD = os.environ.get("GMAIL_APP_PASSWORD")
def send_budget_alert(budget: TokenBudget, used: int, pct: float):
    """Notify Slack and the budget owner that ``budget`` crossed its threshold.

    Does nothing if this month's alert was already marked as sent. Each
    channel is attempted independently, and delivery errors are logged
    rather than raised: this runs after a completed AI call, so a failing
    webhook or mail server must not discard that call's response or stop
    the other channel from being tried.

    Args:
        budget: The budget that reached its alert threshold.
        used: Tokens consumed so far this month.
        pct: ``used`` as a fraction of the monthly limit (0.8 = 80%).
    """
    if alert_already_sent(budget.id):
        return
    # Marked before sending so later requests do not repeat the alert.
    mark_alert_sent(budget.id)

    msg = (f"Token budget alert: {budget.name}\n"
           f"Used: {used:,} / {budget.monthly_token_limit:,} tokens ({pct*100:.0f}%)\n"
           f"Hard cap: {'Yes - requests will be blocked at 100%' if budget.hard_cap else 'No - advisory only'}")
    log = logging.getLogger(__name__)

    # Slack
    if SLACK_WEBHOOK:
        try:
            resp = requests.post(SLACK_WEBHOOK, json={"text": f":warning: {msg}"}, timeout=5)
            # A 4xx/5xx reply means the alert was not delivered; surface it.
            resp.raise_for_status()
        except requests.RequestException:
            log.exception("Slack budget alert failed for %s", budget.id)

    # Email
    if budget.owner_email and GMAIL_ADDRESS:
        try:
            send_email(budget.owner_email, f"AI Token Budget Alert: {budget.name}", msg)
        except (smtplib.SMTPException, OSError):
            log.exception("Email budget alert failed for %s", budget.id)
def send_email(to: str, subject: str, body: str, timeout: float = 10.0):
    """Send a plain-text email through Gmail's SMTP server.

    Args:
        to: Recipient address.
        subject: Subject line.
        body: Plain-text message body.
        timeout: Seconds to wait on the SMTP connection before giving up.
            Without a timeout an unreachable mail server could block the
            caller indefinitely.

    Raises:
        smtplib.SMTPException: on protocol or authentication failure.
        OSError: on connection failure or timeout.
    """
    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = GMAIL_ADDRESS
    msg["To"] = to
    with smtplib.SMTP("smtp.gmail.com", 587, timeout=timeout) as smtp:
        smtp.starttls()
        smtp.login(GMAIL_ADDRESS, GMAIL_PASSWORD)
        smtp.send_message(msg)
Step 5: Build a Budget Status Report
Generate a snapshot of all budgets — useful in a daily Slack brief or ops dashboard.
def budget_status_report() -> list[dict]:
    """Build one status row per configured budget, highest usage first.

    Each row carries the budget name, tokens used, limit, usage percentage
    rounded to one decimal, a status label, and the hard-cap flag. The
    status is "EXCEEDED" at or above 100% of the limit, "WARNING" at or
    above the budget's alert threshold, and "OK" otherwise.
    """

    def _row(budget) -> dict:
        consumed = get_usage(budget.id)
        fraction = consumed / budget.monthly_token_limit
        if fraction >= 1.0:
            label = "EXCEEDED"
        elif fraction >= budget.alert_threshold:
            label = "WARNING"
        else:
            label = "OK"
        return {
            "name": budget.name,
            "used": consumed,
            "limit": budget.monthly_token_limit,
            "pct": round(fraction * 100, 1),
            "status": label,
            "hard_cap": budget.hard_cap,
        }

    rows = [_row(budget) for budget in BUDGETS]
    rows.sort(key=lambda entry: entry["pct"], reverse=True)
    return rows
# Print report: one line per budget with a 20-character usage bar
# (one "#" per 5% used).
for row in budget_status_report():
    # Advisory budgets can run past 100%; clamp so the bar never grows
    # beyond its 20-character width.
    filled = max(0, min(20, int(row["pct"] / 5)))
    bar = "#" * filled + "-" * (20 - filled)
    print(f"{row['name']:<25} [{bar}] {row['pct']}% {row['status']}")
Step 6: Auto-Reset Budgets Monthly
Run this at midnight on the first of each month. Redis keys auto-expire, but this gives you a clean audit trail.
import sqlite3
from datetime import date
def archive_and_reset_budgets(month: str | None = None):
    """Copy one month's token counters from Redis into SQLite.

    Intended to run just after a month ends. By default it archives the
    month that has just finished; archiving the current month at 00:00 on
    the 1st would only record the new month's near-empty counters. No
    explicit reset is performed: counters are keyed by month, so the new
    month starts from its own fresh key.

    Args:
        month: Month to archive as ``"YYYY-MM"``. Defaults to the calendar
            month before the current UTC date.
    """
    if month is None:
        now = datetime.now(timezone.utc)
        if now.month == 1:
            year, prev = now.year - 1, 12
        else:
            year, prev = now.year, now.month - 1
        month = f"{year:04d}-{prev:02d}"

    # Read the counters for the requested month directly; the key layout
    # mirrors month_key(): budget:<id>:<YYYY-MM>.
    rows = []
    for b in BUDGETS:
        raw = r.get(f"budget:{b.id}:{month}")
        used = int(raw) if raw else 0
        rows.append((month, b.id, b.name, used, b.monthly_token_limit))

    conn = sqlite3.connect("token_budgets.db")
    try:
        # "with conn" commits on success and rolls back on error, so a
        # failed run never leaves a partially archived month.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS budget_history (
                    month TEXT, budget_id TEXT, budget_name TEXT,
                    tokens_used INTEGER, limit_tokens INTEGER,
                    recorded_at TEXT
                )
            """)
            conn.executemany("""
                INSERT INTO budget_history VALUES (?,?,?,?,?,datetime('now'))
            """, rows)
    finally:
        # Release the database handle even if the insert failed.
        conn.close()
    print(f"Archived {month} budget data for {len(BUDGETS)} budgets")
# crontab: 0 0 1 * * python /scripts/reset_budgets.py
What to Build Next
- Add per-user token tracking within each team's budget so you can see who the heavy users are
- Build a projected end-of-month forecast based on daily burn rate to catch overruns before they happen
- Expose budget status in a lightweight internal dashboard so teams can self-serve their usage data
Related Reading
- How to Build a Multi-Model AI Router - route to cheaper models automatically when budgets are running low
- How to Build Automatic Model Failover Systems - budget exhaustion is one valid trigger for failing over to a cheaper provider
- How to Build AI Request Throttling Systems - throttling and budget management work as a pair to enforce spend limits
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment