How to Optimize Token Usage to Cut AI Costs
Reduce AI API costs by 40-60% with smart token management strategies.
Jay Banlasan
The AI Systems Guy
Reducing AI API token costs by 40-60% through optimization is achievable on almost every production system I have audited. The pattern is always the same: system prompts with unnecessary padding, full documents sent when a summary would work, expensive models used for tasks a cheaper model handles equally well, and no caching at all. These are not edge cases. They are the default state of most AI integrations built by people who are focused on making it work, not making it cheap.
Cost optimization matters most when you are paying for AI calls you did not design. A system that runs 1,000 AI calls per day at $0.02 each costs about $600/month. Cutting 50% of those tokens brings that to $300. Both systems do the same job.
What You Need Before Starting
- A running AI integration with at least a few weeks of usage to analyze
- Access to your API usage dashboard (Anthropic, OpenAI, or your provider)
- Python 3.10+ with `anthropic` and `tiktoken` installed (pip install anthropic tiktoken)
- Your current average tokens per call as a baseline
Step 1: Audit Your Current Token Usage
Measure before optimizing. Get your actual per-call token counts.
import anthropic
import tiktoken
# Shared API client used by every helper below
# (presumably picks up ANTHROPIC_API_KEY from the environment — confirm).
client = anthropic.Anthropic()
def count_tokens_estimate(text: str, model: str = "gpt-4") -> int:
    """Approximate the token count of *text* with tiktoken.

    Claude uses a different tokenizer, so treat the result as an
    estimate (roughly within a 10% margin of the true count).
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unrecognized model name: fall back to the generic cl100k_base encoding.
        encoder = tiktoken.get_encoding("cl100k_base")
    token_ids = encoder.encode(text)
    return len(token_ids)
# USD per 1M tokens as (input_rate, output_rate), keyed by model.
# Extend this table as you add models to the audit.
_PRICING_PER_MILLION = {
    "claude-haiku-4-5": (0.25, 1.25),
    "claude-sonnet-4-5": (3.00, 15.00),
}

def audit_prompt(system_prompt: str, typical_user_message: str, model: str = "claude-haiku-4-5") -> dict:
    """Measure real token usage and per-call cost for one representative prompt.

    Makes a single live API call so the token counts come from the
    provider, not from an estimate. Returns estimated and actual token
    counts plus projected monthly cost at 1k and 10k calls.
    """
    system_tokens = count_tokens_estimate(system_prompt)
    user_tokens = count_tokens_estimate(typical_user_message)
    # Make a real call to get actual token counts
    response = client.messages.create(
        model=model,
        max_tokens=300,
        system=system_prompt,
        messages=[{"role": "user", "content": typical_user_message}]
    )
    actual_input = response.usage.input_tokens
    actual_output = response.usage.output_tokens
    # Fix: the original hard-coded Haiku pricing no matter which `model`
    # was audited. Look the rate up, falling back to Haiku pricing for
    # unknown models (matches the original numbers for the default).
    input_cost_per_million, output_cost_per_million = _PRICING_PER_MILLION.get(
        model, (0.25, 1.25)
    )
    call_cost = (actual_input * input_cost_per_million + actual_output * output_cost_per_million) / 1_000_000
    return {
        "system_prompt_tokens": system_tokens,
        "user_message_tokens": user_tokens,
        "actual_input_tokens": actual_input,
        "actual_output_tokens": actual_output,
        "cost_per_call_usd": call_cost,
        "monthly_cost_1k_calls": call_cost * 1000,
        "monthly_cost_10k_calls": call_cost * 10000,
    }
# Audit your most expensive prompt
# NOTE: YOUR_SYSTEM_PROMPT is a placeholder — bind it to your real system
# prompt string before running, or this raises NameError.
audit = audit_prompt(
    system_prompt=YOUR_SYSTEM_PROMPT,
    typical_user_message="Sample typical user message here"
)
print(f"Input tokens: {audit['actual_input_tokens']} | Output: {audit['actual_output_tokens']}")
print(f"Cost per call: ${audit['cost_per_call_usd']:.5f}")
print(f"Monthly @ 10k calls: ${audit['monthly_cost_10k_calls']:.2f}")
Step 2: Trim System Prompts
System prompts are charged on every single call. Every unnecessary word is paid for 10,000 times.
# Before: Verbose system prompt (312 tokens)
VERBOSE_SYSTEM = """
You are an incredibly helpful and knowledgeable assistant who specializes in providing
exceptional customer service and support. You have been designed to assist users with
a wide variety of questions and concerns. When responding to users, please make sure
to be polite, professional, and thorough in your responses. Always try to provide
complete and accurate information. If you are unsure about something, please let the
user know rather than providing potentially incorrect information. Remember to maintain
a positive and friendly tone throughout all interactions. Your goal is to ensure that
every user has a wonderful experience and leaves satisfied with the help they received.
When answering questions:
- Be thorough and comprehensive
- Use clear and easy to understand language
- Provide examples when helpful
- Ask clarifying questions if needed
- Always be respectful and professional
"""
# After: Concise system prompt (47 tokens). Same contract in far fewer
# tokens — system prompts are billed on every call, so savings compound.
CONCISE_SYSTEM = """Customer support agent. Direct, helpful, honest.
If unsure, say so. No filler phrases."""
def compare_prompt_efficiency(original: str, optimized: str, test_message: str):
    """Print token counts and side-by-side responses for two system prompts.

    Makes two real API calls with the same user message so you can
    eyeball whether the shorter prompt degrades answer quality before
    switching over.
    """
    original_tokens = count_tokens_estimate(original)
    optimized_tokens = count_tokens_estimate(optimized)
    # Fix: guard against an empty original prompt, which previously
    # raised ZeroDivisionError.
    if original_tokens:
        savings_pct = (1 - optimized_tokens / original_tokens) * 100
    else:
        savings_pct = 0.0
    # Test that quality is maintained
    original_response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=200,
        system=original,
        messages=[{"role": "user", "content": test_message}]
    )
    optimized_response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=200,
        system=optimized,
        messages=[{"role": "user", "content": test_message}]
    )
    print(f"Original: {original_tokens} tokens")
    print(f"Optimized: {optimized_tokens} tokens ({savings_pct:.0f}% reduction)")
    print(f"\nOriginal response:\n{original_response.content[0].text}")
    print(f"\nOptimized response:\n{optimized_response.content[0].text}")
Step 3: Implement Prompt Caching
For system prompts over 1024 tokens, Claude's prompt caching reduces cost dramatically on repeated calls.
def call_with_cache(
    system_prompt: str,
    user_message: str,
    model: str = "claude-haiku-4-5"
) -> dict:
    """Send one message with the system prompt marked as cacheable.

    Returns the response text plus the usage counters needed to tell
    whether the prompt cache was actually hit on this call.
    """
    cacheable_system = [
        {
            "type": "text",
            "text": system_prompt,
            # Ask the API to cache this prompt segment for later calls.
            "cache_control": {"type": "ephemeral"}
        }
    ]
    response = client.messages.create(
        model=model,
        max_tokens=500,
        system=cacheable_system,
        messages=[{"role": "user", "content": user_message}]
    )
    usage = response.usage
    # A hit means the usage object reports a positive cache-read count.
    has_read_counter = hasattr(usage, 'cache_read_input_tokens')
    cache_hit = has_read_counter and usage.cache_read_input_tokens > 0
    return {
        "response": response.content[0].text,
        "cache_hit": cache_hit,
        "input_tokens": usage.input_tokens,
        "cache_read_tokens": getattr(usage, 'cache_read_input_tokens', 0),
        "cache_creation_tokens": getattr(usage, 'cache_creation_input_tokens', 0),
    }
# Prompt caching pricing: cache reads cost ~10% of normal input token price
# On a 2000-token system prompt called 10k times/month:
# Without cache: 2000 * 10000 * $0.25/1M = $5.00/month
# With cache (90% hit rate): $0.50 full-price misses + $0.45 cache reads = $0.95/month
# Savings: ~$4.05/month just on system prompt tokens
Step 4: Route Tasks to Cheaper Models
Not every task needs your most capable model. Build a router.
import re

# Cues for cheap-tier work: classification, extraction, binary answers,
# translation. Compiled once at import time instead of on every call.
_SIMPLE_TASK_PATTERNS = [
    re.compile(r'\b(classify|categorize|label|tag)\b'),
    re.compile(r'\b(extract|pull|get) the\b'),
    re.compile(r'\b(yes or no|true or false)\b'),
    re.compile(r'\btranslate\b'),
]
# Cues for premium-tier work: analysis, strategy, long-form drafting.
_COMPLEX_TASK_PATTERNS = [
    re.compile(r'\b(analyze|assess|evaluate|compare)\b'),
    re.compile(r'\b(strategy|recommendation|plan)\b'),
    re.compile(r'\b(write|draft|generate).{0,20}(email|report|proposal)\b'),
    re.compile(r'\b(explain why|what is the best|how should)\b'),
]

def classify_task_complexity(user_message: str) -> str:
    """Classify task complexity to route to appropriate model tier.

    Returns "complex" or "simple". Complex cues win over simple cues;
    when no keyword matches, messages over 50 words are treated as
    complex as a rough proxy.

    Fixes vs. original: `import re` ran mid-function on every call and
    the pattern lists were rebuilt (and recompiled by re's cache lookup)
    per invocation; both are now hoisted to module scope.
    """
    message_lower = user_message.lower()
    if any(p.search(message_lower) for p in _COMPLEX_TASK_PATTERNS):
        return "complex"
    if any(p.search(message_lower) for p in _SIMPLE_TASK_PATTERNS):
        return "simple"
    # Default: check length as proxy for complexity
    return "complex" if len(user_message.split()) > 50 else "simple"
# Model tier per complexity class; inline prices are input/output per 1M tokens.
MODEL_BY_COMPLEXITY = {
    "simple": "claude-haiku-4-5",  # $0.25/$1.25 per 1M tokens
    "complex": "claude-sonnet-4-5",  # $3/$15 per 1M tokens
}
def routed_call(system_prompt: str, user_message: str) -> str:
    """Answer with the cheapest model tier suited to the message's complexity."""
    tier = classify_task_complexity(user_message)
    chosen_model = MODEL_BY_COMPLEXITY[tier]
    reply = client.messages.create(
        model=chosen_model,
        max_tokens=500,
        system=system_prompt,
        messages=[{"role": "user", "content": user_message}],
    )
    return reply.content[0].text
Step 5: Truncate Long Inputs Intelligently
Sending 5,000-word documents when only 500 words are relevant wastes tokens.
def _truncate_by_words(text: str, current_tokens: int, max_tokens: int, suffix: str) -> str:
    """Keep a word-count prefix proportional to the token budget, plus *suffix*.

    The 0.9 factor leaves headroom because the word ratio only
    approximates the token ratio.
    """
    words = text.split()
    ratio = max_tokens / current_tokens
    keep_words = int(len(words) * ratio * 0.9)
    return ' '.join(words[:keep_words]) + suffix

def smart_truncate(text: str, max_tokens: int = 1000, preserve_structure: bool = True) -> str:
    """Shrink *text* toward an estimated *max_tokens* budget.

    With preserve_structure=True, keeps the first ~40% and last ~20% of
    paragraphs and flags the omitted middle. NOTE: that path bounds the
    paragraph count, not the token count, so unusually long paragraphs
    can still exceed the budget. With preserve_structure=False (or
    fewer than four paragraphs), truncates proportionally by word count.
    """
    current_tokens = count_tokens_estimate(text)
    if current_tokens <= max_tokens:
        return text
    if not preserve_structure:
        return _truncate_by_words(text, current_tokens, max_tokens, " [truncated]")
    # Split into paragraphs; very short documents fall back to word truncation.
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    if len(paragraphs) <= 3:
        return _truncate_by_words(text, current_tokens, max_tokens, "\n[Truncated for length]")
    # Keep first 40%, skip middle, keep last 20%
    keep_first = max(1, int(len(paragraphs) * 0.4))
    keep_last = max(1, int(len(paragraphs) * 0.2))
    skipped = len(paragraphs) - keep_first - keep_last
    kept_paragraphs = (
        paragraphs[:keep_first] +
        [f"[... {skipped} paragraphs omitted ...]"] +
        paragraphs[-keep_last:]
    )
    return '\n\n'.join(kept_paragraphs)
def summarize_before_analysis(long_text: str, analysis_task: str) -> str:
    """Two-pass analysis: condense the document, then analyze the summary.

    Two small calls are cheaper than one call carrying the full document.
    """
    summary_request = (
        f"Summarize this in 200 words, preserving key facts and figures:\n\n{long_text}"
    )
    summary_response = client.messages.create(
        model="claude-haiku-4-5",  # cheap model handles the condensation pass
        max_tokens=300,
        messages=[{"role": "user", "content": summary_request}],
    )
    condensed = summary_response.content[0].text
    # Second pass analyzes only the summary (a better model could slot in here).
    analysis_request = f"Based on this summary:\n{condensed}\n\nTask: {analysis_task}"
    analysis_response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=400,
        messages=[{"role": "user", "content": analysis_request}],
    )
    return analysis_response.content[0].text
Step 6: Add Response Caching for Identical Inputs
Identical inputs should never hit the API twice.
import hashlib
import sqlite3
import json
from datetime import datetime, timedelta
def get_cache_key(system_prompt: str, user_message: str, model: str) -> str:
    """Deterministic cache key for a (model, system prompt, user message) triple."""
    payload = "|".join((model, system_prompt, user_message))
    return hashlib.sha256(payload.encode()).hexdigest()
def init_response_cache(db_path: str = "response_cache.db"):
    """Create the response_cache table if it does not already exist.

    Safe to call repeatedly (CREATE TABLE IF NOT EXISTS). Fix vs.
    original: the connection is now closed in a finally block, so it no
    longer leaks if the DDL statement raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS response_cache (
                cache_key TEXT PRIMARY KEY,
                response TEXT,
                model TEXT,
                cached_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                hit_count INTEGER DEFAULT 0
            )
        """)
        conn.commit()
    finally:
        conn.close()
def cached_call(
    system_prompt: str,
    user_message: str,
    model: str = "claude-haiku-4-5",
    cache_ttl_hours: int = 24,
    db_path: str = "response_cache.db"
) -> tuple[str, bool]:
    """Return (response_text, cache_hit) for an exact-match response cache.

    Looks the request up by SHA-256 key first; a fresh-enough row is
    returned without touching the API. On a miss or an expired entry,
    makes one real API call and stores the result.

    Fix vs. original: both sqlite connections are now closed in finally
    blocks, so they no longer leak when a query or timestamp parse raises.
    """
    cache_key = get_cache_key(system_prompt, user_message, model)
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT response, cached_at FROM response_cache
            WHERE cache_key = ?
        """, (cache_key,))
        row = cur.fetchone()
        if row:
            # sqlite's CURRENT_TIMESTAMP is naive UTC text, so compare
            # against naive utcnow() (NOTE: utcnow is deprecated in 3.12+;
            # keep naive-vs-naive consistency if migrating).
            cached_at = datetime.fromisoformat(row[1])
            if datetime.utcnow() - cached_at < timedelta(hours=cache_ttl_hours):
                # Cache hit - update hit count
                conn.execute("UPDATE response_cache SET hit_count = hit_count + 1 WHERE cache_key = ?", (cache_key,))
                conn.commit()
                return row[0], True  # response, cache_hit=True
    finally:
        conn.close()
    # Cache miss (or expired entry) - pay for one real API call
    response = client.messages.create(
        model=model,
        max_tokens=500,
        system=system_prompt,
        messages=[{"role": "user", "content": user_message}]
    )
    result = response.content[0].text
    # Store in cache. OR REPLACE also resets hit_count and cached_at
    # when refreshing an expired row.
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "INSERT OR REPLACE INTO response_cache (cache_key, response, model) VALUES (?, ?, ?)",
            (cache_key, result, model)
        )
        conn.commit()
    finally:
        conn.close()
    return result, False  # response, cache_hit=False
def get_cache_stats(db_path: str = "response_cache.db") -> dict:
    """Summarize cache effectiveness: entry count, total hits, mean hits/entry.

    SUM/AVG return NULL on an empty table, so zeros are substituted.
    Fix vs. original: the connection is closed in a finally block, so it
    no longer leaks if the aggregate query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute("SELECT COUNT(*), SUM(hit_count), AVG(hit_count) FROM response_cache")
        total_entries, total_hits, avg_hits = cur.fetchone()
    finally:
        conn.close()
    return {
        "cached_entries": total_entries or 0,
        "total_cache_hits": int(total_hits or 0),
        "avg_hits_per_entry": round(avg_hits or 0, 1)
    }
What to Build Next
- Build a token cost dashboard that tracks spend per prompt, per model, and per day automatically
- Add a streaming-based max_tokens detector that stops generation early when the response is complete
- Implement semantic deduplication to cache semantically similar (not just identical) inputs
Related Reading
- Token Optimization for Cost Control - token optimization cost control ai
- Your Team Size Is Your Weakness - reduce team size with ai
- The API as a Business Tool - api as business tool
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment