How to Build Token Budget Management Systems

Set per-team and per-project AI token budgets with automatic alerts.

Jay Banlasan

The AI Systems Guy

One team at a client's company ran an experimental AI workflow on a Friday afternoon and burned through their entire month's token allocation by Saturday morning. No alert fired. No one noticed until the invoice came. Building an AI token budget management system stops this from happening. Every team gets a budget, every call counts against it, and when a team is running low the system tells them before it becomes a problem.

Token budgets do more than prevent cost overruns. They force teams to think carefully about which AI tasks are worth running. When tokens have a visible ceiling, people stop automating low-value work just because they can.

What You Need Before Starting

Python 3.10+
Redis for real-time counters (pip install redis)
SQLite for historical records
Your AI provider's SDK

Step 1: Define the Budget Schema

Budgets live at three levels: global (total spend cap), team (per department), and project (per workflow).

import json
from dataclasses import dataclass, asdict
from typing import Optional

@dataclass
class TokenBudget:
    """A monthly token allowance for one scope (global, a team, or a project).

    Raises:
        ValueError: if ``monthly_token_limit`` or ``alert_threshold`` is not
            positive. Usage percentages are computed by dividing by the
            limit, so a zero limit would otherwise surface later as a
            ZeroDivisionError in the middle of a request or report.
    """

    id: str                    # e.g. "team:marketing" or "project:lead-enrichment"
    name: str                  # human-readable label shown in alerts and reports
    monthly_token_limit: int   # total tokens (input + output)
    alert_threshold: float     # 0.8 = alert at 80%
    hard_cap: bool             # True = block requests at limit, False = alert only
    owner_email: Optional[str] = None  # alert recipient; None = no email alert
    notes: Optional[str] = None        # free-form text; not read by the code shown here

    def __post_init__(self) -> None:
        # Fail at definition time rather than on the first request.
        if self.monthly_token_limit <= 0:
            raise ValueError(
                f"monthly_token_limit must be positive, got {self.monthly_token_limit!r}"
            )
        if self.alert_threshold <= 0:
            raise ValueError(
                f"alert_threshold must be positive, got {self.alert_threshold!r}"
            )

# Example budgets: two advisory team budgets, one hard-capped project budget,
# and a hard-capped global ceiling.
# NOTE(review): the owner_email values below look like redacted placeholders
# rather than deliverable addresses — replace them before enabling email alerts.
BUDGETS = [
    TokenBudget(
        id="team:marketing",
        name="Marketing Team",
        monthly_token_limit=5_000_000,
        alert_threshold=0.8,
        hard_cap=False,
        owner_email="[email protected]",
    ),
    TokenBudget(
        id="team:engineering",
        name="Engineering Team",
        monthly_token_limit=20_000_000,
        alert_threshold=0.9,
        hard_cap=False,
        owner_email="[email protected]",
    ),
    TokenBudget(
        id="project:nightly",
        name="Nightly Enrichment",
        monthly_token_limit=10_000_000,
        alert_threshold=0.85,
        hard_cap=True,
        owner_email=None,
    ),
    TokenBudget(
        id="global",
        name="Global Cap",
        monthly_token_limit=50_000_000,
        alert_threshold=0.9,
        hard_cap=True,
        owner_email="[email protected]",
    ),
]

The hard_cap flag is important. Some budgets should be advisory (alert but don't block). Others, like nightly batch jobs, should stop cold when they hit the limit to prevent surprise overruns.

Step 2: Build Redis-Based Real-Time Counters

Redis gives you atomic increments with near-zero latency. This is what makes per-request budget enforcement practical.

import redis
from datetime import datetime

# Shared Redis client for all budget counters; talks to database 1 on localhost.
r = redis.Redis(host="localhost", port=6379, db=1)

def month_key(budget_id: str) -> str:
    """Return the Redis key for ``budget_id`` in the current UTC month.

    The key has the form ``budget:<budget_id>:<YYYY-MM>``, so each calendar
    month gets its own key.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # formats to exactly the same YYYY-MM string.
    month = datetime.now(timezone.utc).strftime("%Y-%m")
    return f"budget:{budget_id}:{month}"

def add_tokens(budget_id: str, tokens: int) -> int:
    """Add ``tokens`` to this month's counter for ``budget_id``.

    The increment and the TTL refresh are queued on a single pipeline so they
    are sent to Redis together, rather than as two independent commands where
    a failure in between would leave a counter that never expires.

    Args:
        budget_id: Budget whose monthly counter is incremented.
        tokens: Number of tokens to add.

    Returns:
        The counter's value after the increment.
    """
    key = month_key(budget_id)
    # 60 days comfortably outlives the month the key belongs to. The TTL is
    # re-applied on every write, so the key expires 60 days after its last
    # increment.
    ttl_seconds = 60 * 24 * 3600
    with r.pipeline() as pipe:
        pipe.incrby(key, tokens)
        pipe.expire(key, ttl_seconds)
        new_total, _ = pipe.execute()
    return new_total

def get_usage(budget_id: str) -> int:
    """Return the tokens counted so far this month for ``budget_id``.

    A missing (or empty) counter is reported as 0.
    """
    stored = r.get(month_key(budget_id))
    if not stored:
        return 0
    return int(stored)

def reset_budget(budget_id: str):
    """Manually clear this month's usage for ``budget_id`` — use sparingly.

    The month's "alert already sent" marker is removed as well. Otherwise the
    counter would restart from zero while the marker kept suppressing the
    alert, and the budget could fill up again without anyone being notified.
    """
    usage_key = month_key(budget_id)
    # Resolves to "budget:alert_sent:<id>:<YYYY-MM>", the key the alert
    # helpers use for their once-per-month marker.
    alert_key = month_key(f"alert_sent:{budget_id}")
    r.delete(usage_key, alert_key)

Step 3: Build the Budget Enforcement Layer

This wraps every AI call and enforces limits before and after.

import anthropic
import os

# Shared Anthropic client, created once at import time and used by
# ai_call_with_budget() for every request.
# NOTE(review): no API key is passed here, so the SDK presumably picks it up
# from the environment — confirm before deploying.
_ai = anthropic.Anthropic()

class BudgetExceededError(Exception):
    """Raised when a hard-capped budget has no tokens left for the month."""

def get_budget_by_id(budget_id: str) -> TokenBudget | None:
    """Look up a configured budget by id; return None when nothing matches."""
    for candidate in BUDGETS:
        if candidate.id == budget_id:
            return candidate
    return None

def ai_call_with_budget(prompt: str, budget_id: str,
                        model: str = "claude-haiku-3") -> str:
    """Send ``prompt`` to the model and charge the tokens to ``budget_id``.

    Both the named budget and the "global" budget (when one is configured)
    are checked before the call, charged after it, and then evaluated against
    their alert thresholds.

    Args:
        prompt: Text sent as the single user message.
        budget_id: Id of a budget defined in ``BUDGETS``.
        model: Model identifier passed through to the SDK.
            NOTE(review): verify the default against the provider's current
            model list.

    Returns:
        The text of the first content block of the response.

    Raises:
        ValueError: ``budget_id`` does not match any configured budget.
        BudgetExceededError: the named budget or the global budget is
            hard-capped and already at or over its monthly limit.
    """
    budget = get_budget_by_id(budget_id)
    if not budget:
        raise ValueError(f"Unknown budget: {budget_id}")

    # The global cap applies to every call. When the caller charges "global"
    # directly it is already in the list, so it is neither checked nor
    # counted twice.
    enforced = [budget]
    if budget_id != "global":
        global_budget = get_budget_by_id("global")
        if global_budget:
            enforced.append(global_budget)

    # Pre-flight check: refuse before spending anything if a hard cap is hit.
    for b in enforced:
        current = get_usage(b.id)
        if b.hard_cap and current >= b.monthly_token_limit:
            raise BudgetExceededError(
                f"Budget '{b.name}' is exhausted "
                f"({current:,} / {b.monthly_token_limit:,} tokens)"
            )

    # Make the call
    response = _ai.messages.create(
        model=model,
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    )

    used = response.usage.input_tokens + response.usage.output_tokens

    # Update counters for this budget AND global. The global counter is kept
    # even when no global budget is configured.
    totals = {budget_id: add_tokens(budget_id, used)}
    if budget_id != "global":
        totals["global"] = add_tokens("global", used)

    # Check alert thresholds after the call, for every enforced budget.
    for b in enforced:
        new_total = totals[b.id]
        limit = b.monthly_token_limit
        # A non-positive limit is treated as fully used instead of dividing
        # by zero after the tokens have already been spent.
        new_pct = new_total / limit if limit > 0 else 1.0
        if new_pct >= b.alert_threshold:
            send_budget_alert(b, new_total, new_pct)

    return response.content[0].text

Step 4: Build the Alert System

Alerts fire once when the threshold is crossed, not on every subsequent request.

import requests, smtplib
from email.mime.text import MIMEText

def alert_already_sent(budget_id: str) -> bool:
    """Return True if this month's alert for ``budget_id`` is already recorded."""
    # month_key() turns this into "budget:alert_sent:<id>:<YYYY-MM>", the same
    # key the marker is written under, without repeating the date formatting.
    key = month_key(f"alert_sent:{budget_id}")
    # EXISTS reports how many of the given keys exist; convert the count so
    # the function really returns a bool as annotated.
    return bool(r.exists(key))

def mark_alert_sent(budget_id: str):
    """Record that this month's alert for ``budget_id`` has been sent."""
    # month_key() turns this into "budget:alert_sent:<id>:<YYYY-MM>", the same
    # key the existence check reads, without repeating the date formatting.
    key = month_key(f"alert_sent:{budget_id}")
    # 35 days is longer than any month, so the marker outlives the month it
    # is keyed on and then cleans itself up.
    r.setex(key, 35 * 24 * 3600, "1")

# Alert delivery settings, read once from the environment at import time.
# Each one is None when its variable is unset.
SLACK_WEBHOOK = os.environ.get("SLACK_WEBHOOK_URL")
GMAIL_ADDRESS = os.environ.get("GMAIL_ADDRESS")
GMAIL_PASSWORD = os.environ.get("GMAIL_APP_PASSWORD")

def send_budget_alert(budget: TokenBudget, used: int, pct: float):
    """Notify Slack and/or the budget owner that ``budget`` crossed its threshold.

    At most one alert is sent per budget per month: the call is a no-op once
    the month's marker exists. Each channel is attempted independently and
    delivery failures are logged instead of raised, because this runs after
    the AI call has already been paid for — an unreachable webhook must not
    discard the response or stop the email from going out.

    Args:
        budget: The budget that crossed its alert threshold.
        used: Tokens consumed so far this month.
        pct: ``used`` as a fraction of the monthly limit (0.8 = 80%).
    """
    import logging

    if alert_already_sent(budget.id):
        return

    # Mark before sending so overlapping requests do not each send an alert.
    mark_alert_sent(budget.id)

    log = logging.getLogger(__name__)

    msg = (f"Token budget alert: {budget.name}\n"
           f"Used: {used:,} / {budget.monthly_token_limit:,} tokens ({pct*100:.0f}%)\n"
           f"Hard cap: {'Yes - requests will be blocked at 100%' if budget.hard_cap else 'No - advisory only'}")

    # Slack
    if SLACK_WEBHOOK:
        try:
            resp = requests.post(SLACK_WEBHOOK, json={"text": f":warning: {msg}"}, timeout=5)
            # Surface HTTP error statuses instead of silently ignoring them.
            resp.raise_for_status()
        except requests.RequestException:
            log.exception("Slack budget alert failed for %s", budget.id)

    # Email
    if budget.owner_email and GMAIL_ADDRESS:
        try:
            send_email(budget.owner_email, f"AI Token Budget Alert: {budget.name}", msg)
        except (smtplib.SMTPException, OSError):
            log.exception("Email budget alert failed for %s", budget.id)

def send_email(to: str, subject: str, body: str):
    """Send a plain-text email from ``GMAIL_ADDRESS`` through Gmail's SMTP server.

    Args:
        to: Recipient address.
        subject: Subject line.
        body: Plain-text message body.

    Raises:
        smtplib.SMTPException: the server rejected the login or the message.
        OSError: the connection could not be established or timed out.
    """
    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"]    = GMAIL_ADDRESS
    msg["To"]      = to
    # Bound the connection, as the Slack call does, so a stalled mail server
    # cannot hang the request that triggered the alert.
    with smtplib.SMTP("smtp.gmail.com", 587, timeout=10) as smtp:
        smtp.starttls()
        smtp.login(GMAIL_ADDRESS, GMAIL_PASSWORD)
        smtp.send_message(msg)

Step 5: Build a Budget Status Report

Generate a snapshot of all budgets — useful in a daily Slack brief or ops dashboard.

def budget_status_report() -> list[dict]:
    """Build one status row per configured budget, most-consumed first.

    Each row holds the budget's name, tokens used, limit, percentage used
    (rounded to one decimal), a status of "OK", "WARNING" or "EXCEEDED", and
    whether the budget is hard-capped.
    """
    def _row(budget) -> dict:
        # Summarise a single budget's current usage.
        consumed = get_usage(budget.id)
        fraction = consumed / budget.monthly_token_limit
        if fraction >= 1.0:
            label = "EXCEEDED"
        elif fraction >= budget.alert_threshold:
            label = "WARNING"
        else:
            label = "OK"
        return {
            "name":        budget.name,
            "used":        consumed,
            "limit":       budget.monthly_token_limit,
            "pct":         round(fraction * 100, 1),
            "status":      label,
            "hard_cap":    budget.hard_cap
        }

    rows = [_row(budget) for budget in BUDGETS]
    rows.sort(key=lambda entry: entry["pct"], reverse=True)
    return rows

# Print report: one line per budget with a 20-cell usage bar (5% per cell).
for row in budget_status_report():
    # Clamp the filled part so a budget above 100% shows a full bar instead
    # of overflowing the 20-cell width.
    filled = max(0, min(20, int(row["pct"] / 5)))
    bar = "#" * filled + "-" * (20 - filled)
    print(f"{row['name']:<25} [{bar}] {row['pct']}% {row['status']}")

Step 6: Auto-Reset Budgets Monthly

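Run this at midnight on the first of each month. Because the Redis counters are keyed by month, a new month starts from zero without any explicit reset and old keys expire on their own; this job records usage in SQLite so you keep a clean audit trail.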

import sqlite3
from datetime import date

def archive_and_reset_budgets(month: str | None = None):
    """Copy one month's per-budget usage from Redis into SQLite.

    The job is meant to run just after a month rolls over, so by default it
    archives the *previous* UTC month. Reading the current month at that
    point would record the counters that have only just started. No explicit
    reset is performed: the counters are keyed by month, so the new month
    already starts from zero.

    Args:
        month: Month to archive as ``"YYYY-MM"``. Defaults to the month
            before the current UTC month.
    """
    from contextlib import closing
    from datetime import timedelta, timezone

    if month is None:
        # Step back one day from the first of this month to land in the
        # previous month, whatever its length.
        first_of_this_month = datetime.now(timezone.utc).replace(day=1)
        month = (first_of_this_month - timedelta(days=1)).strftime("%Y-%m")

    rows = []
    for b in BUDGETS:
        # Same "budget:<id>:<YYYY-MM>" layout that month_key() produces, but
        # for the month being archived rather than the current one.
        raw = r.get(f"budget:{b.id}:{month}")
        used = int(raw) if raw else 0
        rows.append((month, b.id, b.name, used, b.monthly_token_limit))

    # closing() releases the connection even if a statement fails; the inner
    # ``with conn`` commits on success and rolls back on error.
    with closing(sqlite3.connect("token_budgets.db")) as conn:
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS budget_history (
                    month TEXT, budget_id TEXT, budget_name TEXT,
                    tokens_used INTEGER, limit_tokens INTEGER,
                    recorded_at TEXT
                )
            """)
            conn.executemany("""
                INSERT INTO budget_history VALUES (?,?,?,?,?,datetime('now'))
            """, rows)

    print(f"Archived {month} budget data for {len(BUDGETS)} budgets")

# crontab: 0 0 1 * * python /scripts/reset_budgets.py

What to Build Next

Add per-user token tracking within each team's budget so you can see who the heavy users are
Build a projected end-of-month forecast based on daily burn rate to catch overruns before they happen
Expose budget status in a lightweight internal dashboard so teams can self-serve their usage data

How to Build Token Budget Management Systems

What You Need Before Starting

Step 1: Define the Budget Schema

Step 2: Build Redis-Based Real-Time Counters

Step 3: Build the Budget Enforcement Layer

Step 4: Build the Alert System

Step 5: Build a Budget Status Report

Step 6: Auto-Reset Budgets Monthly

What to Build Next

Related Reading

Related Systems

How to Build a Multi-Model AI Router

How to Build Automatic Model Failover Systems

How to Build AI Request Throttling Systems