How to Implement AI A/B Testing for Prompts
Run controlled experiments to find the best-performing prompts for each task.
Jay Banlasan
The AI Systems Guy
Changing a prompt without testing it is guessing. I have seen teams spend weeks debating which version of a system prompt is better when they could have had a definitive answer in 48 hours with a simple A/B test. Prompt A/B testing runs two prompt variants against the same inputs, scores the outputs, and tells you which wins with statistical evidence.
This is essential for any AI system that handles meaningful volume. A 10% quality improvement on a bot handling 1,000 conversations per day is 100 better conversations every single day. The test setup takes an afternoon. The improvement compounds indefinitely.
What You Need Before Starting
- Python 3.9+
- OpenAI API key
- A test set of 30-50 representative inputs
- A scoring function or judge model for the outputs
- scipy for statistical significance testing
Step 1: Install Dependencies
pip install openai scipy
Step 2: Define Your Prompt Variants
Be disciplined. Test one variable at a time. If you change the tone AND the structure AND the instruction, you will not know which change drove the result.
# Variant A: the current production prompt, used as the baseline.
VARIANT_A = {
"name": "variant_a_baseline",
"system_prompt": """You are a helpful customer support agent.
Answer questions clearly and completely.
Be professional and friendly."""
}
# Variant B: same role, but an answer-first, bullet-pointed, direct style.
VARIANT_B = {
"name": "variant_b_direct",
"system_prompt": """You are a customer support specialist.
Lead with the answer. One sentence max before the solution.
Use bullet points for steps. Be direct, not corporate."""
}
# Good: testing one bundled change (tone + format) against the baseline
# Bad: also varying model, temperature, and max_tokens in the same test
Step 3: Build the Test Runner
Run both variants against identical inputs and collect outputs.
import openai
import random
from datetime import datetime
# NOTE(review): avoid hard-coding secrets — in a real deployment read the
# key from the OPENAI_API_KEY environment variable instead.
client = openai.OpenAI(api_key="YOUR_API_KEY")
def run_variant(
    system_prompt: str,
    user_message: str,
    model: str = "gpt-4o-mini",
    temperature: float = 0.3
) -> str:
    """Send one user message through the given system prompt and return the reply text."""
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=chat,
        temperature=temperature,
    )
    return completion.choices[0].message.content
def run_ab_test(
    variant_a: dict,
    variant_b: dict,
    test_inputs: list[str],
    model: str = "gpt-4o-mini",
    shuffle: bool = True
) -> list[dict]:
    """Run both prompt variants against every test input and collect paired outputs.

    Shuffling (on by default) randomizes input order without mutating the
    caller's list. Each record holds the input, both outputs, variant names,
    and a timestamp.
    """
    if shuffle:
        # Work on a copy so the caller's list order is untouched.
        test_inputs = list(test_inputs)
        random.shuffle(test_inputs)
    collected: list[dict] = []
    for i, user_input in enumerate(test_inputs):
        print(f"Running test {i+1}/{len(test_inputs)}...")
        collected.append({
            "input": user_input,
            "output_a": run_variant(variant_a["system_prompt"], user_input, model),
            "output_b": run_variant(variant_b["system_prompt"], user_input, model),
            "variant_a_name": variant_a["name"],
            "variant_b_name": variant_b["name"],
            "timestamp": datetime.now().isoformat(),
        })
    return collected
Step 4: Build the Judge Scorer
Use a judge model to score both outputs per input.
import json
# Rubric shown to the judge model. It must return strict JSON so the scores
# can be parsed programmatically (enforced with response_format downstream).
JUDGE_PROMPT = """You are evaluating two AI responses to the same user query.
Score each response 1-10 on:
- Accuracy: Does it correctly address the question?
- Clarity: Is it easy to understand?
- Actionability: Does the user know what to do next?
- Conciseness: No unnecessary words?
Return ONLY JSON: {
"response_a": {"accuracy": N, "clarity": N, "actionability": N, "conciseness": N, "overall": N},
"response_b": {"accuracy": N, "clarity": N, "actionability": N, "conciseness": N, "overall": N},
"winner": "a" or "b" or "tie",
"reason": "one sentence"
}"""
# Separate client instance for the judge; here it reuses the same key.
judge_client = openai.OpenAI(api_key="YOUR_API_KEY")
def judge_pair(user_input: str, output_a: str, output_b: str) -> dict:
    """Ask the judge model to score one A/B response pair; returns the parsed JSON verdict."""
    comparison = f"User query: {user_input}\n\nResponse A:\n{output_a}\n\nResponse B:\n{output_b}"
    reply = judge_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": JUDGE_PROMPT},
            {"role": "user", "content": comparison},
        ],
        # Deterministic scoring + forced JSON output for reliable parsing.
        temperature=0,
        response_format={"type": "json_object"},
    )
    return json.loads(reply.choices[0].message.content)
def score_ab_results(ab_results: list[dict]) -> list[dict]:
    """Attach the judge's verdict to every A/B result pair under a "scores" key."""
    out: list[dict] = []
    for i, result in enumerate(ab_results):
        print(f"Judging pair {i+1}/{len(ab_results)}...")
        verdict = judge_pair(
            result["input"],
            result["output_a"],
            result["output_b"],
        )
        out.append(dict(result, scores=verdict))
    return out
Step 5: Calculate Statistical Significance
Numbers alone are not enough. Check that the difference is real and not random variation.
from scipy import stats
import numpy as np
def analyze_results(scored_results: list[dict]) -> dict:
    """Compare the two variants' judge scores and test for statistical significance.

    Args:
        scored_results: Output of score_ab_results(); each item carries a
            "scores" dict with per-variant "overall" scores and a "winner".

    Returns:
        Summary dict: sample size, per-variant averages, win/tie counts,
        paired t-test p-value (None if undefined), significance flag,
        winner label, and percentage improvement of the better variant.

    Raises:
        ValueError: If scored_results is empty (nothing to analyze).
    """
    if not scored_results:
        raise ValueError("scored_results is empty; nothing to analyze")
    scores_a = [r["scores"]["response_a"]["overall"] for r in scored_results]
    scores_b = [r["scores"]["response_b"]["overall"] for r in scored_results]
    wins_a = sum(1 for r in scored_results if r["scores"]["winner"] == "a")
    wins_b = sum(1 for r in scored_results if r["scores"]["winner"] == "b")
    ties = sum(1 for r in scored_results if r["scores"]["winner"] == "tie")
    avg_a = float(np.mean(scores_a))
    avg_b = float(np.mean(scores_b))
    # Paired t-test (same inputs, different prompts).
    t_stat, p_value = stats.ttest_rel(scores_a, scores_b)
    p_value = float(p_value)
    # ttest_rel yields NaN when the per-pair differences have zero variance
    # (e.g. identical score lists); NaN must count as "no evidence", since
    # `nan < 0.05` is silently False but `round(nan)` would leak NaN into
    # reports and the results database.
    p_defined = not np.isnan(p_value)
    significant = p_defined and p_value < 0.05
    if significant:
        winner = "a" if avg_a > avg_b else "b"
    elif wins_a > wins_b * 1.2:
        winner = "a (trending, not significant)"
    elif wins_b > wins_a * 1.2:
        winner = "b (trending, not significant)"
    else:
        winner = "no clear winner"
    # Guard the ratio against a zero average (division by zero).
    low, high = min(avg_a, avg_b), max(avg_a, avg_b)
    improvement = (high / low - 1) * 100 if low > 0 else 0.0
    return {
        "n": len(scored_results),
        "variant_a_avg": round(avg_a, 3),
        "variant_b_avg": round(avg_b, 3),
        "wins_a": wins_a,
        "wins_b": wins_b,
        "ties": ties,
        "p_value": round(p_value, 4) if p_defined else None,
        "statistically_significant": significant,
        "winner": winner,
        "improvement_pct": round(improvement, 1),
    }
Step 6: Save and Report Results
Export results for the record and to share with the team.
import sqlite3
import csv
def save_test_results(test_name: str, variant_a: dict, variant_b: dict, scored_results: list[dict], analysis: dict):
    """Persist one A/B test run to the local SQLite history and print a summary.

    Args:
        test_name: Human-readable label for this test run.
        variant_a: Variant dict for A; only its "name" is stored.
        variant_b: Variant dict for B; only its "name" is stored.
        scored_results: Judged pairs (kept in the signature for callers that
            also export them; not written to the DB here).
        analysis: Output of analyze_results().
    """
    conn = sqlite3.connect("ab_tests.db")
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS tests (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                test_name TEXT,
                run_date TEXT,
                variant_a_name TEXT,
                variant_b_name TEXT,
                n_samples INTEGER,
                avg_a REAL,
                avg_b REAL,
                winner TEXT,
                p_value REAL,
                significant INTEGER,
                analysis_json TEXT
            )
        """)
        # Name the columns explicitly: a positional "VALUES (NULL, ...)"
        # insert silently breaks the moment a column is added to the table.
        conn.execute(
            """INSERT INTO tests (
                test_name, run_date, variant_a_name, variant_b_name,
                n_samples, avg_a, avg_b, winner, p_value, significant,
                analysis_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (
                test_name, datetime.now().isoformat(),
                variant_a["name"], variant_b["name"],
                analysis["n"], analysis["variant_a_avg"], analysis["variant_b_avg"],
                analysis["winner"], analysis["p_value"],
                1 if analysis["statistically_significant"] else 0,
                json.dumps(analysis),
            ),
        )
        conn.commit()
    finally:
        # Close even if the insert fails so the connection doesn't leak.
        conn.close()
    print(f"\nTest '{test_name}' saved.")
    print(f"Winner: {analysis['winner']} | Improvement: {analysis['improvement_pct']}%")
    print(f"P-value: {analysis['p_value']} | Significant: {analysis['statistically_significant']}")
# Run a full test end to end: generate outputs, judge them, analyze, save.
TEST_INPUTS = [
"How do I cancel my subscription?",
"I was charged twice this month.",
"What is your refund policy?",
"How long does shipping take?",
"Can I change my order after placing it?",
]
# NOTE(review): 5 inputs is demo-sized; use the recommended 30-50 inputs
# before trusting the significance result.
results = run_ab_test(VARIANT_A, VARIANT_B, TEST_INPUTS)
scored = score_ab_results(results)
analysis = analyze_results(scored)
save_test_results("support-prompt-test-1", VARIANT_A, VARIANT_B, scored, analysis)
What to Build Next
- Build a multi-armed bandit system that serves the winning variant more often during the test itself, rather than waiting until the test completes
- Add a regression suite that runs your full set of historical A/B test inputs against any new prompt before you promote it to production
- Track prompt version history so you can always roll back to the last known-good version if a new prompt underperforms in production
Related Reading
- How to Write System Prompts That Control AI Behavior - write better variants to test with engineering principles, not guesses
- How to Build AI Guardrails for Safe Outputs - test guardrail prompts the same way you test production prompts
- How to Use AI for Automated Data Extraction - use extraction to pull structured metrics from your judge output for cleaner analysis
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment