How to Build AI Quality Scoring Pipelines
Automatically score AI output quality to route low-quality results for re-processing.
Jay Banlasan
The AI Systems Guy
A content pipeline I built was generating product descriptions and shipping them straight to a CMS. Fast, cheap, zero humans in the loop. Then a client noticed three descriptions that were factually wrong, two that were incomplete, and one that was just a list of the input keywords pasted together. The pipeline had no quality gate. Building an automated AI output quality scoring system fixed that. Now outputs that fail the quality bar get flagged, re-processed, or escalated for human review.
The goal is not perfection on every pass. It's catching the bottom 5-10% of outputs that are clearly broken before they reach production. That alone removes most of the damage.
What You Need Before Starting
- Python 3.10+
- anthropic SDK
- A defined quality bar for your specific use case (length, structure, factual requirements)
- Sample outputs to calibrate scoring thresholds
Step 1: Define Your Quality Dimensions
Quality means different things for different task types. Start by listing what "bad" looks like for your use case.
from dataclasses import dataclass
from typing import Optional
@dataclass
class QualitySpec:
    """Declarative quality bar for one task type.

    The rule-based scorer uses the length window and the required/forbidden
    lists; the AI judge pass runs only when use_ai_judge is True.
    """
    task_type: str                 # key identifying the kind of output (e.g. "lead_summary")
    min_length_chars: int          # minimum character count
    max_length_chars: int          # maximum before it's probably bloated
    required_elements: list[str]   # must contain these strings or patterns
    forbidden_patterns: list[str]  # must NOT contain these
    use_ai_judge: bool             # run a second AI pass to evaluate quality
    pass_threshold: float          # 0.0-1.0, minimum score to pass
# Per-task quality specs, keyed by task_type. Tune the thresholds against
# your own sample outputs before trusting the gate in production.
SPECS = {
    "product_description": QualitySpec(
        task_type="product_description",
        min_length_chars=150,
        max_length_chars=600,
        required_elements=[],  # checked via AI judge
        forbidden_patterns=[
            # refusal boilerplate and placeholder text
            "as an ai", "i cannot", "i'm sorry",
            "placeholder", "lorem ipsum", "[insert"
        ],
        use_ai_judge=True,
        pass_threshold=0.7
    ),
    "lead_summary": QualitySpec(
        task_type="lead_summary",
        min_length_chars=80,
        max_length_chars=400,
        required_elements=[],
        forbidden_patterns=["as an ai", "i'm unable", "i don't have"],
        use_ai_judge=True,
        pass_threshold=0.75
    ),
    # Short single-line outputs: rule checks suffice, no AI judge pass.
    "email_subject": QualitySpec(
        task_type="email_subject",
        min_length_chars=10,
        max_length_chars=70,
        required_elements=[],
        forbidden_patterns=["subject:", "re:", "fwd:"],
        use_ai_judge=False,
        pass_threshold=0.6
    ),
}
Step 2: Build the Rule-Based Scorer
Fast and cheap. Runs first. Catches the obvious failures without burning API tokens.
import re
@dataclass
class QualityResult:
    """Outcome of scoring a single output against a QualitySpec."""
    passed: bool      # True when score met the threshold and no forbidden pattern hit
    score: float      # 0.0 to 1.0
    flags: list[str]  # what failed
    details: dict     # extra metrics, e.g. char_count / sentence_count
def rule_based_score(output: str, spec: QualitySpec) -> QualityResult:
    """Score an output with cheap, deterministic checks (no API calls).

    Starts from 1.0 and subtracts a fixed penalty per violation: length
    window, forbidden/required substrings, and repeated sentences. Any
    forbidden-pattern hit fails the result regardless of the final score.
    """
    violations: list[str] = []
    score = 1.0

    # Length window: short outputs are usually truncations or refusals,
    # very long ones are usually bloated.
    char_count = len(output.strip())
    if char_count < spec.min_length_chars:
        violations.append(f"too_short:{char_count}chars<{spec.min_length_chars}")
        score -= 0.4
    if char_count > spec.max_length_chars:
        violations.append(f"too_long:{char_count}chars>{spec.max_length_chars}")
        score -= 0.15

    # Case-insensitive substring checks.
    lowered = output.lower()
    for bad in spec.forbidden_patterns:
        if bad in lowered:
            violations.append(f"forbidden:'{bad}'")
            score -= 0.3
    for needed in spec.required_elements:
        if needed.lower() not in lowered:
            violations.append(f"missing:'{needed}'")
            score -= 0.2

    # Duplicate long sentences are a classic degenerate-generation signal.
    parts = [p.strip() for p in re.split(r'[.!?]+', output)]
    kept = [p.lower() for p in parts if len(p) > 20]
    if len(set(kept)) != len(kept):
        violations.append("repeated_sentences")
        score -= 0.25

    score = max(0.0, min(1.0, score))
    hard_fail = any("forbidden" in v for v in violations)
    return QualityResult(
        passed=score >= spec.pass_threshold and not hard_fail,
        score=round(score, 3),
        flags=violations,
        details={"char_count": char_count, "sentence_count": len(kept)},
    )
Step 3: Build the AI Judge
For outputs that pass rule-based checks but need deeper evaluation, run a second AI pass. Use a cheaper model for judging.
import anthropic
import json
# Module-level Anthropic client shared by the judge and generation calls.
_client = anthropic.Anthropic()
def ai_judge_score(output: str, task_type: str, original_input: str = "",
                   model: str = "claude-3-5-haiku-latest") -> float:
    """Ask a cheap judge model to rate *output* on a 0.0-1.0 scale.

    Args:
        output: The generated content to evaluate (truncated to 2000 chars).
        task_type: Label describing what the content is supposed to be.
        original_input: The prompt/input that produced the output (optional,
            truncated to 500 chars for the judge).
        model: Judge model id. Default corrected — the previous hard-coded
            "claude-haiku-3" is not a valid Anthropic model id.

    Returns:
        A score clamped to [0.0, 1.0]; 0.5 (neutral) on any judge failure so
        an outage or a malformed reply never blocks the pipeline.
    """
    judge_prompt = f"""You are a quality evaluator for AI-generated content.
Task type: {task_type}
Original input: {original_input[:500] if original_input else 'Not provided'}
Content to evaluate:
---
{output[:2000]}
---
Score this content from 0.0 to 1.0 based on:
- Relevance: Does it address the task? (0-0.3)
- Completeness: Is it substantively complete? (0-0.3)
- Quality: Is it well-written, specific, non-generic? (0-0.2)
- Safety: No harmful, incorrect, or placeholder content? (0-0.2)
Return ONLY a JSON object: {{"score": 0.00, "reasoning": "one sentence"}}"""
    try:
        response = _client.messages.create(
            model=model,
            max_tokens=100,
            messages=[{"role": "user", "content": judge_prompt}]
        )
        raw = response.content[0].text.strip()
        # Models sometimes wrap the JSON in markdown fences or prose;
        # pull out the first {...} span before parsing.
        match = re.search(r"\{.*\}", raw, re.DOTALL)
        data = json.loads(match.group(0) if match else raw)
        # Clamp in case the judge returns an out-of-range number.
        return max(0.0, min(1.0, float(data.get("score", 0.5))))
    except Exception:
        # Deliberate best-effort: any failure (API error, bad JSON,
        # non-numeric score) degrades to a neutral score instead of raising.
        return 0.5
Step 4: Build the Full Scoring Pipeline
Combine rule-based and AI judging into one function.
def score_output(output: str, task_type: str,
                 original_input: str = "") -> QualityResult:
    """Run the full scoring pipeline for one output.

    Rule-based checks run first; when the spec enables it, an AI-judge
    score is blended in (40% rules, 60% judge) before the pass/fail call.

    Raises:
        ValueError: if *task_type* has no entry in SPECS.
    """
    if task_type not in SPECS:
        raise ValueError(f"Unknown task type: {task_type}")
    spec = SPECS[task_type]

    rule_result = rule_based_score(output, spec)

    # Hard failures are conclusive — don't spend tokens on the AI judge.
    hit_forbidden = any("forbidden" in flag for flag in rule_result.flags)
    if rule_result.score < 0.3 or hit_forbidden:
        return rule_result

    if not spec.use_ai_judge:
        return rule_result

    # Borderline or passing: blend in a second opinion from the judge.
    judge = ai_judge_score(output, task_type, original_input)
    combined = round(rule_result.score * 0.4 + judge * 0.6, 3)
    rule_result.score = combined
    rule_result.passed = combined >= spec.pass_threshold
    rule_result.details["ai_judge_score"] = judge
    return rule_result
Step 5: Build the Retry Loop
When quality fails, retry up to N times before escalating.
def generate_with_quality_gate(prompt: str, task_type: str,
                               max_retries: int = 2,
                               model: str = "claude-3-5-haiku-latest") -> dict:
    """Generate content and retry automatically when it fails the quality gate.

    Args:
        prompt: The generation prompt.
        task_type: Key into SPECS selecting the quality spec.
        max_retries: Extra attempts after the first (total calls = max_retries + 1).
        model: Anthropic model id. Default corrected — the previous
            "claude-haiku-3" is not a valid model id, so every call failed.

    Returns:
        dict with keys "output", "quality", "attempts", and "status";
        status is "passed", or "needs_review" when all attempts failed
        (the best-scoring attempt is returned for human review).
    """
    attempts: list[dict] = []
    current_prompt = prompt  # augmented with failure feedback between retries
    for attempt in range(max_retries + 1):
        response = _client.messages.create(
            model=model, max_tokens=1024,
            messages=[{"role": "user", "content": current_prompt}]
        )
        output = response.content[0].text
        quality = score_output(output, task_type, original_input=current_prompt)
        attempts.append({"output": output, "quality": quality, "attempt": attempt + 1})
        if quality.passed:
            return {
                "output": output,
                "quality": quality,
                "attempts": attempt + 1,
                "status": "passed"
            }
        if attempt < max_retries:
            # Feed the top failure flags back so the retry can target them.
            issues = ", ".join(quality.flags[:3])
            current_prompt += f"\n\nPrevious attempt had issues: {issues}. Please fix these."
    # Every attempt failed — surface the best one, flagged for human review.
    best = max(attempts, key=lambda a: a["quality"].score)
    return {
        "output": best["output"],
        "quality": best["quality"],
        "attempts": max_retries + 1,
        "status": "needs_review"
    }
Step 6: Log Quality Data for Continuous Improvement
Track quality scores over time. This tells you which prompts, models, and task types are underperforming.
import sqlite3
from contextlib import closing
from datetime import datetime, timezone
def log_quality_event(task_type: str, model: str, score: float,
                      passed: bool, flags: list, attempts: int,
                      db_path: str = "quality_log.db") -> None:
    """Persist one quality-scoring event to the local SQLite log.

    Args:
        task_type: Key from SPECS identifying what was generated.
        model: Model id used for generation.
        score: Final (possibly blended) quality score, 0.0-1.0.
        passed: Whether the output cleared its pass threshold.
        flags: Rule-based failure flags (stored as JSON text).
        attempts: How many generation attempts were made.
        db_path: SQLite file to write to (defaults to the original path).
    """
    # closing() releases the connection even if an execute raises (the
    # original leaked it); the inner `conn` context commits on success
    # and rolls back on error.
    with closing(sqlite3.connect(db_path)) as conn, conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS quality_events (
                ts TEXT, task_type TEXT, model TEXT, score REAL,
                passed INTEGER, flags TEXT, attempts INTEGER
            )
        """)
        conn.execute(
            "INSERT INTO quality_events VALUES (?,?,?,?,?,?,?)",
            (
                # timezone-aware replacement for deprecated datetime.utcnow()
                datetime.now(timezone.utc).isoformat(),
                task_type, model, score, int(passed),
                json.dumps(flags), attempts,
            ),
        )
def quality_report(days: int = 7, db_path: str = "quality_log.db") -> list:
    """Aggregate quality stats per (task_type, model) over the last *days* days.

    Args:
        days: Look-back window in days.
        db_path: SQLite file to read (defaults to the original path).

    Returns:
        List of dicts sorted by pass rate ascending (worst performers first):
        avg_score (0-1), pass_rate (percent), avg_attempts, and total count.
    """
    query = """
        SELECT task_type, model,
               AVG(score) AS avg_score,
               SUM(CASE WHEN passed THEN 1 ELSE 0 END) * 1.0 / COUNT(*) AS pass_rate,
               AVG(attempts) AS avg_attempts, COUNT(*) AS total
        FROM quality_events
        WHERE ts >= datetime('now', ?)
        GROUP BY task_type, model
        ORDER BY pass_rate
    """
    # closing() releases the connection even when the query raises
    # (the original leaked it on error).
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(query, (f"-{days} days",)).fetchall()
    return [
        {
            "task": task, "model": model,
            "avg_score": round(avg_score, 3),
            "pass_rate": round(pass_rate * 100, 1),  # as a percentage
            "avg_attempts": round(avg_attempts, 1),
            "total": total,
        }
        for task, model, avg_score, pass_rate, avg_attempts, total in rows
    ]
What to Build Next
- Build a human review queue UI for outputs flagged as needs_review so your team can approve or reject them
- Use quality score data to A/B test prompt variants and measure which version produces higher pass rates
- Add task-specific rubrics for specialized domains like legal copy, medical descriptions, or technical documentation
Related Reading
- How to Build a Multi-Model AI Router - route to a higher-capability model when quality scores are consistently low
- How to Build Automatic Model Failover Systems - combine with failover to retry failures on a different provider
- How to Build AI Request Throttling Systems - quality retries multiply your request volume; throttle to stay within limits
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment