How to Build AI Guardrails for Safe Outputs
Implement content filters and safety checks for production AI applications.
Jay Banlasan
The AI Systems Guy
AI guardrails for safe output in business applications are the difference between a tool your legal team approves and one they shut down. Guardrails are code-level checks that run before output reaches users: blocking prohibited content, catching PII leaks, enforcing topic scope, and flagging responses that claim things the model cannot know. I build guardrails into every client-facing AI deployment. They have caught real problems including a customer support bot that started inventing pricing, a lead qualifier that disclosed confidential client names, and a content generator that produced regulatory violations.
Guardrails are not a replacement for good prompts. They are a safety net for when prompts fail, which they will.
What You Need Before Starting
- Python 3.10+ with `anthropic` and `re` (standard library)
- A running AI function you want to protect
- A clear list of what your AI should never output
Step 1: Define Your Guardrail Categories
Guardrails fall into four categories. Identify which ones apply to your use case before writing code.
# Catalog of guardrail categories a deployment may need. Keys are the short
# identifiers used as GuardrailResult.category values; values describe what
# each category is meant to block.
GUARDRAIL_CATEGORIES = {
    "pii": "Personal identifying info: SSN, full credit card numbers, passwords",
    "scope": "Responses outside the defined task scope",
    "false_claims": "Claims the model cannot verify (prices, dates, availability)",
    "tone": "Hostile, discriminatory, or inappropriate language",
    "competitor": "Unprompted mentions of competitor products",
    "legal": "Regulatory violations, unlicensed advice, prohibited promises",
    "confidential": "Internal business data, client names, financial details",
}
# For your specific deployment, select which apply:
ACTIVE_GUARDRAILS = ["pii", "scope", "false_claims", "tone"]
Step 2: Build Rule-Based Guardrails
Fast pattern matching catches obvious violations at near-zero cost.
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class GuardrailResult:
    """Outcome of a single guardrail check.

    ``passed`` is True when no violation was found; the remaining fields
    are only populated on failure.
    """
    passed: bool
    # Violation category key (e.g. "pii", "tone"); None when passed.
    category: Optional[str] = None
    # Human-readable explanation of the violation; None when passed.
    reason: Optional[str] = None
    # Excerpt of the offending text, kept for logging/audit; None when passed.
    flagged_text: Optional[str] = None
# PII patterns compiled once at import time instead of on every call;
# re.search is then the only per-call cost.
_PII_PATTERNS = {
    "ssn": re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
    "credit_card": re.compile(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'),
    "password_hint": re.compile(r'(?i)(password|passcode|pin)\s*(?:is|:)\s*\S+'),
    "full_api_key": re.compile(r'\b[A-Za-z0-9_-]{32,}\b'),  # Long random strings
}


def check_pii(text: str) -> GuardrailResult:
    """Scan *text* for personally identifying information.

    Returns a failed GuardrailResult (category "pii") for the first pattern
    that matches, truncating the match to 50 chars so the PII itself is not
    fully reproduced in logs; otherwise a passing result.
    """
    for name, pattern in _PII_PATTERNS.items():
        match = pattern.search(text)
        if match:
            return GuardrailResult(
                passed=False,
                category="pii",
                reason=f"PII detected: {name}",
                flagged_text=match.group()[:50],
            )
    return GuardrailResult(passed=True)
# Default claim patterns the model has no basis to assert (prices,
# guarantees, account actions). Compiled once at import time.
_DEFAULT_PROHIBITED_CLAIMS = [
    re.compile(r'(?i)the price is \$[\d,]+'),
    re.compile(r'(?i)guaranteed to'),
    re.compile(r'(?i)100% (certain|sure|guaranteed)'),
    re.compile(r'(?i)your account (will|is) (be )?immediately'),
    re.compile(r'(?i)I can (confirm|guarantee|promise)'),
]


def check_false_certainty(text: str, prohibited_claims: Optional[list] = None) -> GuardrailResult:
    """Flag claims of certainty the model cannot actually verify.

    ``prohibited_claims`` may supply deployment-specific regex patterns
    (strings or compiled patterns); when omitted, the module defaults are
    used. Fails with category "false_claims" on the first match.
    """
    patterns = prohibited_claims or _DEFAULT_PROHIBITED_CLAIMS
    for pattern in patterns:
        # re.search accepts both strings and pre-compiled patterns.
        match = re.search(pattern, text)
        if match:
            return GuardrailResult(
                passed=False,
                category="false_claims",
                reason="Response makes claims the model cannot verify",
                flagged_text=match.group(),
            )
    return GuardrailResult(passed=True)
def check_scope(text: str, out_of_scope_topics: list) -> GuardrailResult:
    """Fail when *text* touches any configured out-of-scope topic.

    Each entry in ``out_of_scope_topics`` is applied as a case-insensitive
    regex; the first hit fails the check with category "scope".
    """
    hit = next(
        (topic for topic in out_of_scope_topics
         if re.search(topic, text, re.IGNORECASE)),
        None,
    )
    if hit is None:
        return GuardrailResult(passed=True)
    return GuardrailResult(
        passed=False,
        category="scope",
        reason=f"Out-of-scope topic: {hit}",
        flagged_text=hit,
    )
def check_tone(text: str) -> GuardrailResult:
    """Detect hostile or abusive phrasing in *text* (category "tone")."""
    for expr in (
        r'(?i)\byou (are|\'re) (stupid|an idiot|dumb|incompetent)',
        r'(?i)\bshut up\b',
        r'(?i)\bi (hate|despise)\b',
    ):
        found = re.search(expr, text)
        if found is not None:
            return GuardrailResult(
                passed=False,
                category="tone",
                reason="Hostile or inappropriate tone detected",
                flagged_text=found.group(),
            )
    return GuardrailResult(passed=True)
Step 3: Build an LLM-Based Semantic Guardrail
Rule-based checks miss subtle violations. Use a second, fast model call for semantic checks.
import anthropic
import json

# Dedicated client for guardrail calls, created once at import time so every
# semantic check reuses the same client instance.
secondary_client = anthropic.Anthropic()
def semantic_guardrail(
    ai_output: str,
    original_request: str,
    rules: list,
    model: str = "claude-haiku-4-5"
) -> GuardrailResult:
    """Run an LLM-based semantic check of *ai_output* against *rules*.

    Uses a second, cheap model call to catch violations that rule-based
    regexes miss. Fails with category "semantic" and the checker's reported
    violations joined into one reason string. If the checker's reply cannot
    be parsed as JSON, the guardrail fails OPEN (returns passed=True) with a
    warning, so a broken checker never blocks all traffic.
    """
    rules_text = "\n".join([f"- {rule}" for rule in rules])
    check_prompt = f"""Review this AI response against the safety rules.
Original request: {original_request}
AI response to check:
{ai_output}
Safety rules:
{rules_text}
Return JSON only:
{{"passes": true/false, "violations": ["list of specific violations found"], "severity": "none|low|high"}}"""
    response = secondary_client.messages.create(
        model=model,
        max_tokens=200,
        messages=[{"role": "user", "content": check_prompt}]
    )
    raw = response.content[0].text.strip()
    # Models sometimes wrap the JSON in markdown fences or surrounding prose
    # despite the "JSON only" instruction; extract the outermost {...} span
    # before parsing instead of failing open on the wrapper.
    start, end = raw.find("{"), raw.rfind("}")
    if start != -1 and end > start:
        raw = raw[start:end + 1]
    try:
        result = json.loads(raw)
    except json.JSONDecodeError:
        # If semantic check fails to parse, fail open (let it through with a log)
        print("Warning: Semantic guardrail parse failed, failing open")
        return GuardrailResult(passed=True)
    if not result.get("passes", True):
        return GuardrailResult(
            passed=False,
            category="semantic",
            reason="; ".join(result.get("violations", [])),
            flagged_text=None
        )
    return GuardrailResult(passed=True)
Step 4: Build the Guardrail Pipeline
Chain all checks together into a single pass/fail function.
from typing import Callable
class GuardrailPipeline:
def __init__(self):
self.checks: list[Callable] = []
self.semantic_rules: list[str] = []
self.out_of_scope_topics: list[str] = []
self.use_semantic_check = False
def add_pii_check(self):
self.checks.append(lambda text, req: check_pii(text))
return self
def add_false_certainty_check(self, prohibited_patterns: list = None):
self.checks.append(lambda text, req: check_false_certainty(text, prohibited_patterns))
return self
def add_scope_check(self, prohibited_topics: list):
self.out_of_scope_topics = prohibited_topics
self.checks.append(lambda text, req: check_scope(text, prohibited_topics))
return self
def add_tone_check(self):
self.checks.append(lambda text, req: check_tone(text))
return self
def add_semantic_check(self, rules: list):
self.semantic_rules = rules
self.use_semantic_check = True
return self
def run(self, ai_output: str, original_request: str = "") -> list[GuardrailResult]:
failures = []
# Run fast rule-based checks first
for check_fn in self.checks:
result = check_fn(ai_output, original_request)
if not result.passed:
failures.append(result)
# Only run semantic check if rule-based passed (saves cost)
if not failures and self.use_semantic_check and self.semantic_rules:
result = semantic_guardrail(ai_output, original_request, self.semantic_rules)
if not result.passed:
failures.append(result)
return failures # Empty list = all passed
def build_customer_support_guardrails() -> GuardrailPipeline:
    """Preconfigured pipeline for a customer-support bot.

    Enables PII, tone, timeline-promise, scope, and semantic checks.
    """
    pipeline = GuardrailPipeline()
    pipeline.add_pii_check()
    pipeline.add_tone_check()
    pipeline.add_false_certainty_check([
        r'(?i)your (refund|credit) will be processed in',
        r'(?i)this (issue|bug) will be fixed (by|on)',
        r'(?i)I can guarantee',
    ])
    pipeline.add_scope_check([
        r'competitor product',
        r'our internal systems',
        r'other customers',
    ])
    pipeline.add_semantic_check([
        "Do not reveal competitor information or internal business data",
        "Do not make promises about specific timelines or outcomes",
        "Do not share details about other customers",
    ])
    return pipeline
Step 5: Wrap Your AI Function With Guardrails
def safe_ai_response(
    ai_fn: Callable,
    guardrails: GuardrailPipeline,
    original_request: str,
    fallback_message: str = "I'm unable to provide a response to that. Please contact support directly.",
    **ai_kwargs
) -> dict:
    """Call *ai_fn* and gate its output through *guardrails*.

    Returns a dict with the user-facing "response" (the raw output when all
    checks pass, otherwise *fallback_message*), a "safe" flag, the list of
    "violations", and — on failure only — the suppressed "original" text
    retained for audit.
    """
    raw_response = ai_fn(**ai_kwargs)
    failures = guardrails.run(raw_response, original_request)
    if failures:
        violation_log = [
            {"category": f.category, "reason": f.reason, "flagged": f.flagged_text}
            for f in failures
        ]
        # Surface the violation for review; the raw text never reaches the user.
        print(f"GUARDRAIL VIOLATION: {violation_log}")
        return {
            "response": fallback_message,
            "safe": False,
            "violations": violation_log,
            "original": raw_response  # Kept for audit, not shown to user
        }
    return {
        "response": raw_response,
        "safe": True,
        "violations": []
    }
Step 6: Log and Alert on Violations
import sqlite3
import json
from datetime import datetime
def init_violation_log(db_path: str = "guardrail_violations.db"):
    """Create the SQLite ``violations`` table if it does not exist.

    Idempotent, so it is safe to call at every startup. The connection is
    closed even if table creation fails (the original leaked it on error).
    """
    conn = sqlite3.connect(db_path)
    try:
        with conn:  # commits on success, rolls back on error
            conn.execute("""
                CREATE TABLE IF NOT EXISTS violations (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    request_preview TEXT,
                    response_preview TEXT,
                    categories TEXT,
                    reasons TEXT,
                    logged_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
    finally:
        conn.close()
def log_violation(
    request: str,
    response: str,
    violations: list,
    db_path: str = "guardrail_violations.db"
):
    """Persist one guardrail violation for later review.

    *violations* is the list of dicts produced by the pipeline (each with
    "category" and "reason" keys). Request/response are truncated to 200
    chars so full conversations never land in the log. The connection is
    closed even if the insert fails (the original leaked it on error).
    """
    conn = sqlite3.connect(db_path)
    try:
        with conn:  # commits on success, rolls back on error
            conn.execute(
                "INSERT INTO violations (request_preview, response_preview, categories, reasons) VALUES (?, ?, ?, ?)",
                (
                    request[:200],
                    response[:200],
                    json.dumps([v["category"] for v in violations]),
                    json.dumps([v["reason"] for v in violations])
                )
            )
    finally:
        conn.close()
def get_violation_summary(hours: int = 24, db_path: str = "guardrail_violations.db") -> dict:
    """Summarize violations logged in the last *hours* hours.

    Returns the total count, a per-category breakdown, and the window size.
    The connection is closed even if the query fails (the original leaked
    it on error).
    """
    from collections import Counter

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            "SELECT categories FROM violations WHERE logged_at > datetime('now', ?)",
            (f'-{hours} hours',),
        )
        rows = cur.fetchall()
    finally:
        conn.close()
    # Each row stores a JSON-encoded list of category keys; flatten and count.
    category_counts = Counter(
        cat for (raw,) in rows for cat in json.loads(raw)
    )
    return {
        "total_violations": len(rows),
        "by_category": dict(category_counts),
        "hours": hours
    }
What to Build Next
- Add a human review queue for high-severity violations instead of just using fallback messages
- Build a guardrail coverage report that shows what percentage of your traffic goes through each check
- Implement adaptive guardrails that tighten or relax based on user trust tier
Related Reading
- Building AI Guardrails for Business Use
- Input, Process, Output: The Universal AI Framework
- AI for Content Creation at Scale
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment