Systems Library / AI Model Setup / How to Implement AI A/B Testing for Prompts
AI Model Setup advanced

How to Implement AI A/B Testing for Prompts

Run controlled experiments to find the best-performing prompts for each task.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Changing a prompt without testing it is guessing. I have seen teams spend weeks debating which version of a system prompt is better when they could have had a definitive answer in 48 hours with a simple A/B test. Prompt A/B testing runs two prompt variants against the same inputs, scores the outputs, and tells you which wins with statistical evidence.

This is essential for any AI system that handles meaningful volume. A 10% quality improvement on a bot handling 1,000 conversations per day is 100 better conversations every single day. The test setup takes an afternoon. The improvement compounds indefinitely.

What You Need Before Starting

Step 1: Install Dependencies

pip install openai scipy

Step 2: Define Your Prompt Variants

Be disciplined. Test one variable at a time. If you change the tone AND the structure AND the instruction, you will not know which change drove the result.

# The two prompt variants under test. Only the system prompt wording differs;
# keep every other knob (model, temperature, max_tokens) fixed across variants.
VARIANT_A = {
    "name": "variant_a_baseline",
    "system_prompt": """You are a helpful customer support agent.
Answer questions clearly and completely.
Be professional and friendly."""
}

VARIANT_B = {
    "name": "variant_b_direct",
    "system_prompt": """You are a customer support specialist. 
Lead with the answer. One sentence max before the solution.
Use bullet points for steps. Be direct, not corporate."""
}

# Good: testing one bundled change (tone + answer format) between the variants
# Bad: testing this + model + temperature + max_tokens simultaneously

Step 3: Build the Test Runner

Run both variants against identical inputs and collect outputs.

import openai
import random
from datetime import datetime

# NOTE(review): avoid hard-coding API keys in source; prefer reading from an
# environment variable (e.g. os.environ["OPENAI_API_KEY"]). Placeholder kept as-is.
client = openai.OpenAI(api_key="YOUR_API_KEY")

def run_variant(
    system_prompt: str,
    user_message: str,
    model: str = "gpt-4o-mini",
    temperature: float = 0.3
) -> str:
    """Send one user message through a prompt variant and return the reply text."""
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=chat,
        temperature=temperature,
    )
    return completion.choices[0].message.content

def run_ab_test(
    variant_a: dict,
    variant_b: dict,
    test_inputs: list[str],
    model: str = "gpt-4o-mini",
    shuffle: bool = True
) -> list[dict]:
    """Run both prompt variants over identical inputs and collect paired outputs.

    Each input is sent once to variant A and once to variant B so the pair can
    be judged later. With shuffle=True a copy of the inputs is run in random
    order; the caller's list is never mutated.
    """
    inputs = test_inputs
    if shuffle:
        inputs = test_inputs.copy()
        random.shuffle(inputs)

    total = len(inputs)
    paired = []

    for idx, query in enumerate(inputs, start=1):
        print(f"Running test {idx}/{total}...")

        answer_a = run_variant(variant_a["system_prompt"], query, model)
        answer_b = run_variant(variant_b["system_prompt"], query, model)

        paired.append({
            "input": query,
            "output_a": answer_a,
            "output_b": answer_b,
            "variant_a_name": variant_a["name"],
            "variant_b_name": variant_b["name"],
            "timestamp": datetime.now().isoformat(),
        })

    return paired

Step 4: Build the Judge Scorer

Use a judge model to score both outputs per input.

import json

# Rubric handed to the judge model. judge_pair parses the reply with
# json.loads, so the prompt demands a JSON-only response in this exact shape.
JUDGE_PROMPT = """You are evaluating two AI responses to the same user query.
Score each response 1-10 on:
- Accuracy: Does it correctly address the question?
- Clarity: Is it easy to understand?
- Actionability: Does the user know what to do next?
- Conciseness: No unnecessary words?

Return ONLY JSON: {
  "response_a": {"accuracy": N, "clarity": N, "actionability": N, "conciseness": N, "overall": N},
  "response_b": {"accuracy": N, "clarity": N, "actionability": N, "conciseness": N, "overall": N},
  "winner": "a" or "b" or "tie",
  "reason": "one sentence"
}"""

# Separate client for judging; judge_pair calls it with model "gpt-4o".
# NOTE(review): same hard-coded key caveat as above -- prefer an env variable.
judge_client = openai.OpenAI(api_key="YOUR_API_KEY")

def judge_pair(user_input: str, output_a: str, output_b: str) -> dict:
    """Score one A/B output pair with the judge model; returns the parsed JSON verdict."""
    comparison = (
        f"User query: {user_input}\n\n"
        f"Response A:\n{output_a}\n\n"
        f"Response B:\n{output_b}"
    )
    verdict = judge_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": JUDGE_PROMPT},
            {"role": "user", "content": comparison},
        ],
        temperature=0,  # keep judging as deterministic as the API allows
        response_format={"type": "json_object"},
    )
    return json.loads(verdict.choices[0].message.content)

def score_ab_results(ab_results: list[dict]) -> list[dict]:
    """Attach a judge verdict to every A/B result pair and return the enriched list."""
    total = len(ab_results)
    judged = []
    for idx, pair in enumerate(ab_results, start=1):
        print(f"Judging pair {idx}/{total}...")
        verdict = judge_pair(pair["input"], pair["output_a"], pair["output_b"])
        judged.append({**pair, "scores": verdict})
    return judged

Step 5: Calculate Statistical Significance

Numbers alone are not enough. Check that the difference is real and not random variation.

from scipy import stats
import numpy as np

def analyze_results(scored_results: list[dict]) -> dict:
    scores_a = [r["scores"]["response_a"]["overall"] for r in scored_results]
    scores_b = [r["scores"]["response_b"]["overall"] for r in scored_results]

    wins_a = sum(1 for r in scored_results if r["scores"]["winner"] == "a")
    wins_b = sum(1 for r in scored_results if r["scores"]["winner"] == "b")
    ties = sum(1 for r in scored_results if r["scores"]["winner"] == "tie")

    avg_a = np.mean(scores_a)
    avg_b = np.mean(scores_b)

    # Paired t-test (same inputs, different prompts)
    t_stat, p_value = stats.ttest_rel(scores_a, scores_b)
    significant = p_value < 0.05

    winner = None
    if significant:
        winner = "a" if avg_a > avg_b else "b"
    elif wins_a > wins_b * 1.2:
        winner = "a (trending, not significant)"
    elif wins_b > wins_a * 1.2:
        winner = "b (trending, not significant)"
    else:
        winner = "no clear winner"

    return {
        "n": len(scored_results),
        "variant_a_avg": round(avg_a, 3),
        "variant_b_avg": round(avg_b, 3),
        "wins_a": wins_a,
        "wins_b": wins_b,
        "ties": ties,
        "p_value": round(p_value, 4),
        "statistically_significant": significant,
        "winner": winner,
        "improvement_pct": round((max(avg_a, avg_b) / min(avg_a, avg_b) - 1) * 100, 1)
    }

Step 6: Save and Report Results

Export results for the record and to share with the team.

import sqlite3
import csv

def save_test_results(
    test_name: str,
    variant_a: dict,
    variant_b: dict,
    scored_results: list[dict],
    analysis: dict,
    db_path: str = "ab_tests.db",
):
    """Persist one A/B test run to SQLite and print a short summary.

    Args:
        test_name: Human-readable label for this run.
        variant_a: Variant dict for A (only "name" is stored).
        variant_b: Variant dict for B (only "name" is stored).
        scored_results: Judged pairs (accepted for the record; not yet
            persisted row-by-row -- only the aggregate analysis is stored).
        analysis: Output of analyze_results.
        db_path: SQLite file to write to. Defaults to the original
            hard-coded "ab_tests.db" so existing callers are unaffected.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS tests (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                test_name TEXT,
                run_date TEXT,
                variant_a_name TEXT,
                variant_b_name TEXT,
                n_samples INTEGER,
                avg_a REAL,
                avg_b REAL,
                winner TEXT,
                p_value REAL,
                significant INTEGER,
                analysis_json TEXT
            )
        """)
        # Name the columns explicitly so the insert keeps working if the
        # schema ever grows new columns.
        conn.execute(
            """INSERT INTO tests (
                test_name, run_date, variant_a_name, variant_b_name,
                n_samples, avg_a, avg_b, winner, p_value, significant,
                analysis_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (
                test_name, datetime.now().isoformat(),
                variant_a["name"], variant_b["name"],
                analysis["n"], analysis["variant_a_avg"], analysis["variant_b_avg"],
                analysis["winner"], analysis["p_value"],
                1 if analysis["statistically_significant"] else 0,
                json.dumps(analysis),
            ),
        )
        conn.commit()
    finally:
        # Always release the connection, even when the insert raises.
        conn.close()
    print(f"\nTest '{test_name}' saved.")
    print(f"Winner: {analysis['winner']} | Improvement: {analysis['improvement_pct']}%")
    print(f"P-value: {analysis['p_value']} | Significant: {analysis['statistically_significant']}")

# Run a full test end-to-end: generate paired outputs, judge them,
# analyze the scores, and persist the verdict to SQLite.
TEST_INPUTS = [
    "How do I cancel my subscription?",
    "I was charged twice this month.",
    "What is your refund policy?",
    "How long does shipping take?",
    "Can I change my order after placing it?",
]

# NOTE(review): five inputs is a smoke test -- a paired t-test on n=5 will
# rarely reach significance; use a larger input set for a real decision.
results = run_ab_test(VARIANT_A, VARIANT_B, TEST_INPUTS)
scored = score_ab_results(results)
analysis = analyze_results(scored)
save_test_results("support-prompt-test-1", VARIANT_A, VARIANT_B, scored, analysis)

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems