Systems Library / AI Model Setup / How to Optimize Token Usage to Cut AI Costs
AI Model Setup advanced

How to Optimize Token Usage to Cut AI Costs

Reduce AI API costs by 40-60% with smart token management strategies.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Reducing AI API token costs by 40-60% through optimization is achievable on almost every production system I have audited. The pattern is always the same: system prompts with unnecessary padding, full documents sent when a summary would work, expensive models used for tasks a cheaper model handles equally well, and no caching at all. These are not edge cases. They are the default state of most AI integrations built by people who are focused on making it work, not making it cheap.

Cost optimization matters most when you are paying for AI calls you did not design. A system that runs 10,000 AI calls per day at $0.02 each costs roughly $6,000/month. Cutting 50% of those tokens brings that to about $3,000. Both systems do the same job.

What You Need Before Starting

Step 1: Audit Your Current Token Usage

Measure before optimizing. Get your actual per-call token counts.

import anthropic
import tiktoken

client = anthropic.Anthropic()

def count_tokens_estimate(text: str, model: str = "gpt-4") -> int:
    """Estimate token count using tiktoken (works for Claude with ~10% margin)."""
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def audit_prompt(system_prompt: str, typical_user_message: str, model: str = "claude-haiku-4-5") -> dict:
    system_tokens = count_tokens_estimate(system_prompt)
    user_tokens = count_tokens_estimate(typical_user_message)

    # Make a real call to get actual token counts
    response = client.messages.create(
        model=model,
        max_tokens=300,
        system=system_prompt,
        messages=[{"role": "user", "content": typical_user_message}]
    )

    actual_input = response.usage.input_tokens
    actual_output = response.usage.output_tokens

    # Cost calculation (Claude Haiku pricing)
    input_cost_per_million = 0.25
    output_cost_per_million = 1.25
    call_cost = (actual_input * input_cost_per_million + actual_output * output_cost_per_million) / 1_000_000

    return {
        "system_prompt_tokens": system_tokens,
        "user_message_tokens": user_tokens,
        "actual_input_tokens": actual_input,
        "actual_output_tokens": actual_output,
        "cost_per_call_usd": call_cost,
        "monthly_cost_1k_calls": call_cost * 1000,
        "monthly_cost_10k_calls": call_cost * 10000,
    }

# Audit your most expensive prompt
audit = audit_prompt(
    system_prompt=YOUR_SYSTEM_PROMPT,
    typical_user_message="Sample typical user message here"
)
print(f"Input tokens: {audit['actual_input_tokens']} | Output: {audit['actual_output_tokens']}")
print(f"Cost per call: ${audit['cost_per_call_usd']:.5f}")
print(f"Monthly @ 10k calls: ${audit['monthly_cost_10k_calls']:.2f}")

Step 2: Trim System Prompts

System prompts are charged on every single call. Every unnecessary word is paid for 10,000 times.

# Before: Verbose system prompt (312 tokens)
VERBOSE_SYSTEM = """
You are an incredibly helpful and knowledgeable assistant who specializes in providing
exceptional customer service and support. You have been designed to assist users with
a wide variety of questions and concerns. When responding to users, please make sure
to be polite, professional, and thorough in your responses. Always try to provide
complete and accurate information. If you are unsure about something, please let the
user know rather than providing potentially incorrect information. Remember to maintain
a positive and friendly tone throughout all interactions. Your goal is to ensure that
every user has a wonderful experience and leaves satisfied with the help they received.

When answering questions:
- Be thorough and comprehensive
- Use clear and easy to understand language
- Provide examples when helpful
- Ask clarifying questions if needed
- Always be respectful and professional
"""

# After: Concise system prompt (47 tokens)
CONCISE_SYSTEM = """Customer support agent. Direct, helpful, honest.
If unsure, say so. No filler phrases."""

def compare_prompt_efficiency(original: str, optimized: str, test_message: str):
    original_tokens = count_tokens_estimate(original)
    optimized_tokens = count_tokens_estimate(optimized)
    savings_pct = (1 - optimized_tokens / original_tokens) * 100

    # Test that quality is maintained
    original_response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=200,
        system=original,
        messages=[{"role": "user", "content": test_message}]
    )

    optimized_response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=200,
        system=optimized,
        messages=[{"role": "user", "content": test_message}]
    )

    print(f"Original: {original_tokens} tokens")
    print(f"Optimized: {optimized_tokens} tokens ({savings_pct:.0f}% reduction)")
    print(f"\nOriginal response:\n{original_response.content[0].text}")
    print(f"\nOptimized response:\n{optimized_response.content[0].text}")

Step 3: Implement Prompt Caching

For system prompts over 1024 tokens, Claude's prompt caching reduces cost dramatically on repeated calls.

def call_with_cache(
    system_prompt: str,
    user_message: str,
    model: str = "claude-haiku-4-5"
) -> dict:
    response = client.messages.create(
        model=model,
        max_tokens=500,
        system=[
            {
                "type": "text",
                "text": system_prompt,
                "cache_control": {"type": "ephemeral"}  # Cache this content
            }
        ],
        messages=[{"role": "user", "content": user_message}]
    )

    usage = response.usage
    cache_hit = hasattr(usage, 'cache_read_input_tokens') and usage.cache_read_input_tokens > 0

    return {
        "response": response.content[0].text,
        "cache_hit": cache_hit,
        "input_tokens": usage.input_tokens,
        "cache_read_tokens": getattr(usage, 'cache_read_input_tokens', 0),
        "cache_creation_tokens": getattr(usage, 'cache_creation_input_tokens', 0),
    }

# Prompt caching pricing: cache reads cost ~10% of normal input token price
# On a 2000-token system prompt called 10k times/month:
# Without cache: 2000 * 10000 * $0.25/1M = $5.00/month
# With cache (90% hit rate): $0.50 for the 10% uncached calls + $0.45 for cache reads = $0.95/month
# Savings: ~$4.05/month just on system prompt tokens

Step 4: Route Tasks to Cheaper Models

Not every task needs your most capable model. Build a router.

def classify_task_complexity(user_message: str) -> str:
    """Classify task complexity to route to appropriate model tier."""
    # Simple tasks: short inputs, clear classification, extraction
    simple_patterns = [
        r'\b(classify|categorize|label|tag)\b',
        r'\b(extract|pull|get) the\b',
        r'\b(yes or no|true or false)\b',
        r'\btranslate\b',
    ]

    # Complex tasks: analysis, strategy, long-form generation
    complex_patterns = [
        r'\b(analyze|assess|evaluate|compare)\b',
        r'\b(strategy|recommendation|plan)\b',
        r'\b(write|draft|generate).{0,20}(email|report|proposal)\b',
        r'\b(explain why|what is the best|how should)\b',
    ]

    import re
    message_lower = user_message.lower()

    for pattern in complex_patterns:
        if re.search(pattern, message_lower):
            return "complex"

    for pattern in simple_patterns:
        if re.search(pattern, message_lower):
            return "simple"

    # Default: check length as proxy for complexity
    return "complex" if len(user_message.split()) > 50 else "simple"

MODEL_BY_COMPLEXITY = {
    "simple": "claude-haiku-4-5",    # $0.25/$1.25 per 1M tokens
    "complex": "claude-sonnet-4-5",  # $3/$15 per 1M tokens
}

def routed_call(system_prompt: str, user_message: str) -> str:
    complexity = classify_task_complexity(user_message)
    model = MODEL_BY_COMPLEXITY[complexity]

    response = client.messages.create(
        model=model,
        max_tokens=500,
        system=system_prompt,
        messages=[{"role": "user", "content": user_message}]
    )

    return response.content[0].text

Step 5: Truncate Long Inputs Intelligently

Sending 5,000-word documents when only 500 words are relevant wastes tokens.

def smart_truncate(text: str, max_tokens: int = 1000, preserve_structure: bool = True) -> str:
    current_tokens = count_tokens_estimate(text)

    if current_tokens <= max_tokens:
        return text

    if preserve_structure:
        # Split into paragraphs, keep first and last, sample middle
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        if len(paragraphs) <= 3:
            # Just truncate by words
            words = text.split()
            ratio = max_tokens / current_tokens
            keep_words = int(len(words) * ratio * 0.9)
            return ' '.join(words[:keep_words]) + "\n[Truncated for length]"

        # Keep first 40%, skip middle, keep last 20%
        keep_first = max(1, int(len(paragraphs) * 0.4))
        keep_last = max(1, int(len(paragraphs) * 0.2))
        skipped = len(paragraphs) - keep_first - keep_last

        kept_paragraphs = (
            paragraphs[:keep_first] +
            [f"[... {skipped} paragraphs omitted ...]"] +
            paragraphs[-keep_last:]
        )
        return '\n\n'.join(kept_paragraphs)
    else:
        words = text.split()
        ratio = max_tokens / current_tokens
        keep_words = int(len(words) * ratio * 0.9)
        return ' '.join(words[:keep_words]) + " [truncated]"

def summarize_before_analysis(long_text: str, analysis_task: str) -> str:
    """Summarize a document first, then analyze the summary - cheaper than analyzing full doc."""
    summary = client.messages.create(
        model="claude-haiku-4-5",  # Use cheap model for summarization
        max_tokens=300,
        messages=[{
            "role": "user",
            "content": f"Summarize this in 200 words, preserving key facts and figures:\n\n{long_text}"
        }]
    )

    summary_text = summary.content[0].text

    # Now analyze the summary with a potentially better model
    analysis = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=400,
        messages=[{
            "role": "user",
            "content": f"Based on this summary:\n{summary_text}\n\nTask: {analysis_task}"
        }]
    )

    return analysis.content[0].text

Step 6: Add Response Caching for Identical Inputs

Identical inputs should never hit the API twice.

import hashlib
import sqlite3
import json
from datetime import datetime, timedelta

def get_cache_key(system_prompt: str, user_message: str, model: str) -> str:
    content = f"{model}|{system_prompt}|{user_message}"
    return hashlib.sha256(content.encode()).hexdigest()

def init_response_cache(db_path: str = "response_cache.db"):
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS response_cache (
            cache_key TEXT PRIMARY KEY,
            response TEXT,
            model TEXT,
            cached_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            hit_count INTEGER DEFAULT 0
        )
    """)
    conn.commit()
    conn.close()

def cached_call(
    system_prompt: str,
    user_message: str,
    model: str = "claude-haiku-4-5",
    cache_ttl_hours: int = 24,
    db_path: str = "response_cache.db"
) -> tuple[str, bool]:
    cache_key = get_cache_key(system_prompt, user_message, model)

    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute("""
        SELECT response, cached_at FROM response_cache
        WHERE cache_key = ?
    """, (cache_key,))
    row = cur.fetchone()

    if row:
        cached_at = datetime.fromisoformat(row[1])
        if datetime.utcnow() - cached_at < timedelta(hours=cache_ttl_hours):
            # Cache hit - update hit count
            conn.execute("UPDATE response_cache SET hit_count = hit_count + 1 WHERE cache_key = ?", (cache_key,))
            conn.commit()
            conn.close()
            return row[0], True  # response, cache_hit=True

    conn.close()

    # Cache miss - call the API
    response = client.messages.create(
        model=model,
        max_tokens=500,
        system=system_prompt,
        messages=[{"role": "user", "content": user_message}]
    )
    result = response.content[0].text

    # Store in cache
    conn = sqlite3.connect(db_path)
    conn.execute(
        "INSERT OR REPLACE INTO response_cache (cache_key, response, model) VALUES (?, ?, ?)",
        (cache_key, result, model)
    )
    conn.commit()
    conn.close()

    return result, False  # response, cache_hit=False

def get_cache_stats(db_path: str = "response_cache.db") -> dict:
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(*), SUM(hit_count), AVG(hit_count) FROM response_cache")
    total_entries, total_hits, avg_hits = cur.fetchone()
    conn.close()

    return {
        "cached_entries": total_entries or 0,
        "total_cache_hits": int(total_hits or 0),
        "avg_hits_per_entry": round(avg_hits or 0, 1)
    }

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems