How to Create AI Model Comparison Benchmarks
Build automated benchmarks to compare AI model quality for your use case.
Jay Banlasan
The AI Systems Guy
Generic AI benchmarks like MMLU tell you which model is smarter in the abstract. They do not tell you which model is better at writing your specific type of email or classifying your specific support tickets. Building your own AI model comparison benchmark is how you make model selection decisions based on your actual use case instead of someone else's test suite. I run these before recommending a model switch to any client.
A good benchmark answers three questions: which model produces the best output for this task, which is fastest, and which is cheapest per quality point. Sometimes a $0.001-per-call model beats a $0.015-per-call model on your specific task. You will not know without testing.
What You Need Before Starting
- Python 3.10+ with the `anthropic` and `openai` packages (pip install anthropic openai)
- API keys for the models you want to compare (Anthropic, OpenAI, Groq, etc.)
- A set of 10-30 representative inputs for your task
- A clear scoring rubric for what makes an output good
Step 1: Define Your Test Cases
Test cases are real examples from your use case. Do not use synthetic examples; they have different distributions than real-world inputs.
# test_cases.py
# Representative real-world inputs for the benchmark. Each case pairs a raw
# form submission with human-readable quality criteria; the LLM judge in
# score_response() rates model outputs against "expected_qualities".
TEST_CASES = [
    {
        "id": "tc_001",
        "category": "form_submission",
        # Raw form fields; run_benchmark() joins them into the user message.
        "input": {
            "name": "Sarah Chen",
            "company": "TechCorp",
            "message": "Interested in automating our customer support. We have 500 tickets per day."
        },
        # Plain-language criteria — written for the judge model, not for code.
        "expected_qualities": [
            "references their volume (500 tickets)",
            "mentions a specific next step",
            "asks a qualifying question",
            "under 100 words"
        ]
    },
    {
        "id": "tc_002",
        "category": "form_submission",
        "input": {
            "name": "Marcus Williams",
            "company": "Startup Inc",
            "message": "How much does it cost?"
        },
        # Pricing-question case: tests discipline, not just helpfulness.
        "expected_qualities": [
            "does not give a price immediately",
            "asks about their situation first",
            "warm and non-pushy tone"
        ]
    },
    {
        "id": "tc_003",
        "category": "form_submission",
        "input": {
            "name": "Janet Park",
            "company": "Enterprise Co",
            "message": "We had a bad experience with your competitor. Looking to switch."
        },
        # Emotionally loaded case: tests tact around competitor mentions.
        "expected_qualities": [
            "acknowledges their experience without badmouthing competitor",
            "offers a specific differentiator",
            "moves toward a call or demo"
        ]
    }
]
Include edge cases in your test set: short inputs, long inputs, ambiguous requests, emotional language.
Step 2: Build Model Adapters
Wrap each model's API in a common interface so your benchmarking code stays clean.
import anthropic
import openai
import time
from dataclasses import dataclass
@dataclass
class ModelResponse:
    """Normalized result of one model call, independent of provider API shape."""
    model_id: str       # registry key, e.g. "claude-haiku-4-5"
    output: str         # generated text
    latency_ms: float   # round-trip time of the API call in milliseconds
    input_tokens: int   # prompt tokens as reported by the provider
    output_tokens: int  # completion tokens as reported by the provider
    cost_usd: float     # computed locally from MODEL_PRICING, not from the API
# Token costs per 1M tokens (update these with current pricing)
MODEL_PRICING = {
"claude-haiku-4-5": {"input": 0.25, "output": 1.25},
"claude-sonnet-4-5": {"input": 3.00, "output": 15.00},
"gpt-4o-mini": {"input": 0.15, "output": 0.60},
"gpt-4o": {"input": 5.00, "output": 15.00},
}
def calculate_cost(model_id: str, input_tokens: int, output_tokens: int) -> float:
pricing = MODEL_PRICING.get(model_id, {"input": 0, "output": 0})
return (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
def call_claude(model_id: str, system_prompt: str, user_message: str) -> ModelResponse:
    """Call an Anthropic model and return a normalized ModelResponse.

    Fix: latency is measured with time.perf_counter(), which is monotonic.
    time.time() is wall-clock and can jump (NTP sync, DST, manual changes),
    corrupting latency numbers mid-benchmark.
    """
    client = anthropic.Anthropic()
    start = time.perf_counter()
    response = client.messages.create(
        model=model_id,
        max_tokens=500,
        system=system_prompt,  # Anthropic takes the system prompt as a top-level param
        messages=[{"role": "user", "content": user_message}]
    )
    latency_ms = (time.perf_counter() - start) * 1000
    input_tokens = response.usage.input_tokens
    output_tokens = response.usage.output_tokens
    return ModelResponse(
        model_id=model_id,
        output=response.content[0].text,
        latency_ms=latency_ms,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cost_usd=calculate_cost(model_id, input_tokens, output_tokens)
    )
def call_openai(model_id: str, system_prompt: str, user_message: str) -> ModelResponse:
    """Call an OpenAI chat model and return a normalized ModelResponse.

    Fix: latency is measured with time.perf_counter(), which is monotonic.
    time.time() is wall-clock and can jump (NTP sync, DST, manual changes),
    corrupting latency numbers mid-benchmark.
    """
    client = openai.OpenAI()
    start = time.perf_counter()
    response = client.chat.completions.create(
        model=model_id,
        max_tokens=500,
        messages=[
            # OpenAI takes the system prompt as a message, unlike Anthropic.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )
    latency_ms = (time.perf_counter() - start) * 1000
    input_tokens = response.usage.prompt_tokens
    output_tokens = response.usage.completion_tokens
    return ModelResponse(
        model_id=model_id,
        output=response.choices[0].message.content,
        latency_ms=latency_ms,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cost_usd=calculate_cost(model_id, input_tokens, output_tokens)
    )
# Registry mapping model IDs to their provider-specific caller.
# To benchmark a new model, add an entry here AND a pricing entry in
# MODEL_PRICING above.
MODEL_CALLERS = {
    "claude-haiku-4-5": call_claude,
    "claude-sonnet-4-5": call_claude,
    "gpt-4o-mini": call_openai,
    "gpt-4o": call_openai,
}
def call_model(model_id: str, system_prompt: str, user_message: str) -> ModelResponse:
    """Dispatch a request to the provider caller registered for model_id.

    Raises ValueError when model_id has no entry in MODEL_CALLERS.
    """
    try:
        caller = MODEL_CALLERS[model_id]
    except KeyError:
        raise ValueError(f"No caller registered for model: {model_id}") from None
    return caller(model_id, system_prompt, user_message)
Step 3: Build an Automated Scorer
Use a second Claude call as the judge. This is called LLM-as-judge and it handles quality dimensions that rule-based checks cannot.
def score_response(
    test_case: dict,
    model_response: ModelResponse,
    judge_model: str = "claude-haiku-4-5"
) -> dict:
    """Grade a model's output with a second model (LLM-as-judge pattern).

    Returns the judge's parsed JSON verdict with "criteria_scores",
    "overall", and "reasoning" keys. Falls back to a neutral overall score
    of 5 when the judge's reply cannot be parsed as JSON.
    """
    import json
    import re

    expected = "\n".join(f"- {q}" for q in test_case["expected_qualities"])
    judge_prompt = f"""Rate this AI response on a scale of 1-10 for each quality criterion.
Original input: {test_case['input']}
Response to evaluate:
{model_response.output}
Quality criteria:
{expected}
Score each criterion 1-10. Then give an overall score 1-10.
Return JSON only:
{{"criteria_scores": {{"criterion": score}}, "overall": 8, "reasoning": "one sentence"}}"""

    judge_reply = anthropic.Anthropic().messages.create(
        model=judge_model,
        max_tokens=300,
        messages=[{"role": "user", "content": judge_prompt}]
    ).content[0].text

    # Tolerate prose around the JSON: grab the outermost {...} span.
    found = re.search(r'\{.*\}', judge_reply, re.DOTALL)
    if found is not None:
        try:
            return json.loads(found.group())
        except json.JSONDecodeError:
            pass
    return {"overall": 5, "reasoning": "Failed to parse scores", "criteria_scores": {}}
Step 4: Run the Full Benchmark
import json
from datetime import datetime
def run_benchmark(
    models: list,
    test_cases: list,
    system_prompt: str,
    output_file: str = "benchmark_results.json"
) -> dict:
    """Run every model against every test case and persist scored results.

    For each (test case, model) pair: call the model, score the output with
    the LLM judge, and record quality/latency/cost. A failure on one pair is
    recorded as an "error" entry instead of aborting the whole run.
    Per-model summary stats are computed at the end and the full payload is
    written to output_file as JSON.

    Fix: timestamp uses timezone-aware datetime.now(timezone.utc) —
    datetime.utcnow() returns a naive datetime and is deprecated since 3.12.
    """
    from datetime import timezone  # local import keeps this fix self-contained

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "models": models,
        "test_case_count": len(test_cases),
        "by_model": {m: [] for m in models},
        "summary": {}
    }
    for test_case in test_cases:
        print(f"\nTest case: {test_case['id']}")
        # Build user message from the raw input fields, one "key: value" per line.
        user_message = "\n".join([f"{k}: {v}" for k, v in test_case["input"].items()])
        for model_id in models:
            print(f" Running {model_id}...", end=" ", flush=True)
            try:
                response = call_model(model_id, system_prompt, user_message)
                scores = score_response(test_case, response)
                result = {
                    "test_case_id": test_case["id"],
                    "output": response.output,
                    "latency_ms": round(response.latency_ms, 1),
                    "cost_usd": round(response.cost_usd, 6),
                    "input_tokens": response.input_tokens,
                    "output_tokens": response.output_tokens,
                    "quality_score": scores.get("overall", 0),
                    "criteria_scores": scores.get("criteria_scores", {}),
                    "reasoning": scores.get("reasoning", "")
                }
                results["by_model"][model_id].append(result)
                print(f"score={result['quality_score']}/10 | latency={result['latency_ms']:.0f}ms | cost=${result['cost_usd']:.5f}")
            except Exception as e:
                # Record the failure but keep benchmarking the remaining models.
                print(f"ERROR: {e}")
                results["by_model"][model_id].append({"test_case_id": test_case["id"], "error": str(e)})
    # Compute per-model summary stats over the successful results only.
    for model_id in models:
        model_results = [r for r in results["by_model"][model_id] if "error" not in r]
        if model_results:
            total_quality = sum(r["quality_score"] for r in model_results)
            total_cost = sum(r["cost_usd"] for r in model_results)
            results["summary"][model_id] = {
                "avg_quality": round(total_quality / len(model_results), 2),
                "avg_latency_ms": round(sum(r["latency_ms"] for r in model_results) / len(model_results), 1),
                "total_cost_usd": round(total_cost, 4),
                # Guard: all-zero quality scores would divide by zero.
                "cost_per_quality_point": round(total_cost / total_quality, 6) if total_quality > 0 else 0
            }
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    return results
Step 5: Print a Summary Report
def print_summary(results: dict):
    """Print a ranked comparison table plus best-quality and best-value picks.

    Expects the dict produced by run_benchmark().

    Fix: guards against an empty summary (e.g. every model call errored).
    Without it, sorted_models[0] raises IndexError and min() raises
    ValueError on an empty sequence.
    """
    summary = results.get("summary", {})
    if not summary:
        print("\nNo successful results to summarize.")
        return
    print("\n" + "="*60)
    print("BENCHMARK SUMMARY")
    print("="*60)
    print(f"{'Model':<25} {'Quality':>8} {'Latency':>10} {'Cost':>10} {'$/Quality':>12}")
    print("-"*60)
    # Sort by quality score descending
    sorted_models = sorted(
        summary.items(),
        key=lambda x: x[1]["avg_quality"],
        reverse=True
    )
    for model_id, stats in sorted_models:
        print(f"{model_id:<25} {stats['avg_quality']:>7.1f}/10 {stats['avg_latency_ms']:>8.0f}ms ${stats['total_cost_usd']:>8.4f} ${stats['cost_per_quality_point']:>10.6f}")
    print("="*60)
    best_quality = sorted_models[0]
    print(f"Best quality: {best_quality[0]} ({best_quality[1]['avg_quality']}/10)")
    best_value = min(summary.items(), key=lambda x: x[1]["cost_per_quality_point"])
    print(f"Best value: {best_value[0]} (${best_value[1]['cost_per_quality_point']:.6f}/quality point)")
# Run it: smoke-test the benchmark end to end from the command line.
if __name__ == "__main__":
    from test_cases import TEST_CASES

    # The same system prompt goes to every model so outputs are comparable.
    SYSTEM_PROMPT = "You write short, personalized follow-up emails for B2B SaaS. Under 80 words. End with a question."
    # Each ID must appear in both MODEL_CALLERS and MODEL_PRICING.
    MODELS_TO_TEST = ["claude-haiku-4-5", "claude-sonnet-4-5", "gpt-4o-mini"]
    # [:5] keeps the run cheap; drop the slice for the full suite.
    results = run_benchmark(MODELS_TO_TEST, TEST_CASES[:5], SYSTEM_PROMPT)
    print_summary(results)
What to Build Next
- Add a latency percentile analysis (p50, p95, p99) because average latency hides tail latency problems
- Build a cost projection calculator that extrapolates benchmark costs to your monthly volume
- Schedule monthly benchmark runs to detect quality drift as models get updated
Related Reading
- The Operator Model - ai operator model business
- The AI Maturity Model - ai maturity model business
- The Model Does Not Matter - ai model does not matter business
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment