Systems Library / AI Model Setup / How to Build AI Guardrails for Safe Outputs
AI Model Setup advanced

How to Build AI Guardrails for Safe Outputs

Implement content filters and safety checks for production AI applications.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

AI guardrails for safe output in business applications are the difference between a tool your legal team approves and one they shut down. Guardrails are code-level checks that run before output reaches users: blocking prohibited content, catching PII leaks, enforcing topic scope, and flagging responses that claim things the model cannot know. I build guardrails into every client-facing AI deployment. They have caught real problems including a customer support bot that started inventing pricing, a lead qualifier that disclosed confidential client names, and a content generator that produced regulatory violations.

Guardrails are not a replacement for good prompts. They are a safety net for when prompts fail, which they will.

What You Need Before Starting

A working Python environment, the Anthropic SDK installed (`pip install anthropic`) with an API key configured, and SQLite (bundled with Python) for violation logging. You should also have a clear written definition of what your AI application is and is not allowed to say.

Step 1: Define Your Guardrail Categories

Guardrails fall into seven broad categories. Identify which ones apply to your use case before writing code.

# Catalogue of every guardrail category the checks below can enforce.
# Keys are the short identifiers used throughout the pipeline; values
# describe what each category is meant to catch.
GUARDRAIL_CATEGORIES = {
    "pii": "Personal identifying info: SSN, full credit card numbers, passwords",
    "scope": "Responses outside the defined task scope",
    "false_claims": "Claims the model cannot verify (prices, dates, availability)",
    "tone": "Hostile, discriminatory, or inappropriate language",
    "competitor": "Unprompted mentions of competitor products",
    "legal": "Regulatory violations, unlicensed advice, prohibited promises",
    "confidential": "Internal business data, client names, financial details",
}

# For your specific deployment, select which apply:
ACTIVE_GUARDRAILS = ["pii", "scope", "false_claims", "tone"]

Step 2: Build Rule-Based Guardrails

Fast pattern matching catches obvious violations at near-zero cost.

import re
from dataclasses import dataclass
from typing import Optional

@dataclass
class GuardrailResult:
    """Outcome of a single guardrail check.

    A passing result carries only passed=True; a failing result also records
    which category fired, why, and optionally the offending text snippet.
    """
    passed: bool
    # Guardrail category that failed, e.g. "pii" or "scope" (None when passed).
    category: Optional[str] = None
    # Human-readable explanation of the failure (None when passed).
    reason: Optional[str] = None
    # Excerpt of the text that triggered the check (may be truncated).
    flagged_text: Optional[str] = None

def check_pii(text: str) -> GuardrailResult:
    """Scan *text* for personally identifying information via regex.

    Checks are applied in order and the first match wins; the flagged
    excerpt is truncated to 50 characters to avoid re-leaking the PII
    in logs.
    """
    pii_rules = (
        ("ssn", r'\b\d{3}-\d{2}-\d{4}\b'),
        ("credit_card", r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'),
        ("password_hint", r'(?i)(password|passcode|pin)\s*(?:is|:)\s*\S+'),
        ("full_api_key", r'\b[A-Za-z0-9_-]{32,}\b'),  # Long random strings
    )

    for label, regex in pii_rules:
        hit = re.search(regex, text)
        if hit is None:
            continue
        return GuardrailResult(
            passed=False,
            category="pii",
            reason=f"PII detected: {label}",
            flagged_text=hit.group()[:50],
        )

    return GuardrailResult(passed=True)

def check_false_certainty(text: str, prohibited_claims: Optional[list] = None) -> GuardrailResult:
    """Flag responses that assert facts the model cannot actually verify.

    Args:
        text: The AI output to check.
        prohibited_claims: Optional list of regex patterns overriding the
            defaults. Pass an explicit empty list to disable the check.

    Returns:
        A failing GuardrailResult (category "false_claims") on the first
        matching pattern, otherwise a passing result.
    """
    default_prohibited = [
        r'(?i)the price is \$[\d,]+',
        r'(?i)guaranteed to',
        r'(?i)100% (certain|sure|guaranteed)',
        r'(?i)your account (will|is) (be )?immediately',
        r'(?i)I can (confirm|guarantee|promise)',
    ]

    # Fall back to the defaults only when the argument was omitted. The
    # previous `prohibited_claims or default_prohibited` silently replaced an
    # explicitly-passed empty list with the defaults.
    patterns = default_prohibited if prohibited_claims is None else prohibited_claims

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return GuardrailResult(
                passed=False,
                category="false_claims",
                reason="Response makes claims the model cannot verify",
                flagged_text=match.group()
            )

    return GuardrailResult(passed=True)

def check_scope(text: str, out_of_scope_topics: list) -> GuardrailResult:
    """Fail when *text* mentions any blocklisted topic pattern (case-insensitive)."""
    offending = next(
        (t for t in out_of_scope_topics if re.search(t, text, re.IGNORECASE)),
        None,
    )
    if offending is None:
        return GuardrailResult(passed=True)
    return GuardrailResult(
        passed=False,
        category="scope",
        reason=f"Out-of-scope topic: {offending}",
        flagged_text=offending,
    )

def check_tone(text: str) -> GuardrailResult:
    """Detect openly hostile or abusive phrasing in the response."""
    hostile_rules = (
        r'(?i)\byou (are|\'re) (stupid|an idiot|dumb|incompetent)',
        r'(?i)\bshut up\b',
        r'(?i)\bi (hate|despise)\b',
    )

    for regex in hostile_rules:
        found = re.search(regex, text)
        if not found:
            continue
        return GuardrailResult(
            passed=False,
            category="tone",
            reason="Hostile or inappropriate tone detected",
            flagged_text=found.group(),
        )

    return GuardrailResult(passed=True)

Step 3: Build an LLM-Based Semantic Guardrail

Rule-based checks miss subtle violations. Use a second, fast model call for semantic checks.

import anthropic
import json

# Dedicated client for the fast secondary "judge" model used by
# semantic_guardrail(); credentials presumably come from the standard
# Anthropic SDK environment configuration — confirm in deployment.
secondary_client = anthropic.Anthropic()

def semantic_guardrail(
    ai_output: str,
    original_request: str,
    rules: list,
    model: str = "claude-haiku-4-5"
) -> GuardrailResult:
    """Run an LLM-as-judge check of *ai_output* against free-text safety rules.

    Sends the original request, the candidate response, and the rule list to a
    fast secondary model and expects a strict-JSON verdict back.  On any parse
    failure the check fails OPEN (the response is allowed through) so a flaky
    judge model cannot take the product down; the event is logged for review.

    Args:
        ai_output: The response being vetted.
        original_request: The user request that produced it.
        rules: Plain-English safety rules for the judge to apply.
        model: Judge model ID; should be cheap and fast.

    Returns:
        GuardrailResult with category "semantic" on reported violations.
    """
    rules_text = "\n".join([f"- {rule}" for rule in rules])

    check_prompt = f"""Review this AI response against the safety rules.

Original request: {original_request}

AI response to check:
{ai_output}

Safety rules:
{rules_text}

Return JSON only:
{{"passes": true/false, "violations": ["list of specific violations found"], "severity": "none|low|high"}}"""

    response = secondary_client.messages.create(
        model=model,
        max_tokens=200,
        messages=[{"role": "user", "content": check_prompt}]
    )

    raw = response.content[0].text.strip()
    # Judge models sometimes wrap the JSON in markdown fences despite the
    # "JSON only" instruction; strip them so a cosmetic wrapper doesn't force
    # the fail-open path below.
    if raw.startswith("```"):
        raw = raw.strip("`").strip()
        if raw.lower().startswith("json"):
            raw = raw[4:].lstrip()

    try:
        result = json.loads(raw)
    except json.JSONDecodeError:
        # If semantic check fails to parse, fail open (let it through with a log)
        print("Warning: Semantic guardrail parse failed, failing open")
        return GuardrailResult(passed=True)

    if result.get("passes", True):
        return GuardrailResult(passed=True)

    return GuardrailResult(
        passed=False,
        category="semantic",
        reason="; ".join(result.get("violations", [])),
        flagged_text=None
    )

Step 4: Build the Guardrail Pipeline

Chain all checks together into a single pass/fail function.

from typing import Callable

class GuardrailPipeline:
    """Composable chain of output checks: cheap regex rules first, with an
    optional LLM semantic pass afterwards.

    Every ``add_*`` method returns ``self`` so pipelines can be built fluently.
    """

    def __init__(self):
        self.checks: list[Callable] = []          # rule-based check callables
        self.semantic_rules: list[str] = []       # free-text rules for the judge
        self.out_of_scope_topics: list[str] = []  # regexes for the scope check
        self.use_semantic_check = False

    def add_pii_check(self):
        """Enable regex-based PII detection."""
        self.checks.append(lambda text, req: check_pii(text))
        return self

    def add_false_certainty_check(self, prohibited_patterns: list = None):
        """Enable unverifiable-claim detection, optionally with custom patterns."""
        self.checks.append(lambda text, req: check_false_certainty(text, prohibited_patterns))
        return self

    def add_scope_check(self, prohibited_topics: list):
        """Enable out-of-scope topic detection for the given regex list."""
        self.out_of_scope_topics = prohibited_topics
        self.checks.append(lambda text, req: check_scope(text, prohibited_topics))
        return self

    def add_tone_check(self):
        """Enable hostile-tone detection."""
        self.checks.append(lambda text, req: check_tone(text))
        return self

    def add_semantic_check(self, rules: list):
        """Enable the LLM judge pass with the given plain-English rules."""
        self.semantic_rules = rules
        self.use_semantic_check = True
        return self

    def run(self, ai_output: str, original_request: str = "") -> list[GuardrailResult]:
        """Run all configured checks; return the failures (empty list == safe)."""
        # Cheap rule-based checks first; collect every failure, not just the first.
        failures = [
            outcome
            for check in self.checks
            if not (outcome := check(ai_output, original_request)).passed
        ]

        # The paid semantic pass only runs when every rule-based check passed.
        if not failures and self.use_semantic_check and self.semantic_rules:
            verdict = semantic_guardrail(ai_output, original_request, self.semantic_rules)
            if not verdict.passed:
                failures.append(verdict)

        return failures

def build_customer_support_guardrails() -> GuardrailPipeline:
    """Preconfigured pipeline for a customer-support assistant.

    Enables PII, tone, timeline/guarantee claim, and scope checks, plus an
    LLM semantic pass covering competitor data, promises, and other-customer
    details.
    """
    claim_patterns = [
        r'(?i)your (refund|credit) will be processed in',
        r'(?i)this (issue|bug) will be fixed (by|on)',
        r'(?i)I can guarantee',
    ]
    scope_patterns = [
        r'competitor product',
        r'our internal systems',
        r'other customers',
    ]
    judge_rules = [
        "Do not reveal competitor information or internal business data",
        "Do not make promises about specific timelines or outcomes",
        "Do not share details about other customers",
    ]

    pipeline = GuardrailPipeline()
    pipeline.add_pii_check()
    pipeline.add_tone_check()
    pipeline.add_false_certainty_check(claim_patterns)
    pipeline.add_scope_check(scope_patterns)
    pipeline.add_semantic_check(judge_rules)
    return pipeline

Step 5: Wrap Your AI Function With Guardrails

def safe_ai_response(
    ai_fn: Callable,
    guardrails: GuardrailPipeline,
    original_request: str,
    fallback_message: str = "I'm unable to provide a response to that. Please contact support directly.",
    **ai_kwargs
) -> dict:
    """Generate a response via *ai_fn* and gate it through *guardrails*.

    Args:
        ai_fn: Zero-or-more-kwarg callable that returns the raw AI response.
        guardrails: Pipeline whose run() returns a list of failures.
        original_request: User request, forwarded to the guardrails.
        fallback_message: Shown to the user when any check fails.
        **ai_kwargs: Passed straight through to ai_fn.

    Returns:
        A dict with "response", "safe", and "violations"; when unsafe it also
        carries "original" (the blocked raw output) for auditing.
    """
    candidate = ai_fn(**ai_kwargs)

    violations = guardrails.run(candidate, original_request)

    if violations:
        # Record the violation details for human review before swapping in
        # the fallback text.
        audit_entries = [
            {"category": v.category, "reason": v.reason, "flagged": v.flagged_text}
            for v in violations
        ]
        print(f"GUARDRAIL VIOLATION: {audit_entries}")
        return {
            "response": fallback_message,
            "safe": False,
            "violations": audit_entries,
            "original": candidate  # Kept for audit, not shown to user
        }

    return {
        "response": candidate,
        "safe": True,
        "violations": []
    }

Step 6: Log and Alert on Violations

import sqlite3
import json
from datetime import datetime

def init_violation_log(db_path: str = "guardrail_violations.db"):
    """Create the violations table in the SQLite log if it does not exist.

    Safe to call repeatedly (CREATE TABLE IF NOT EXISTS).  The connection is
    now always closed — even if table creation raises — instead of leaking on
    the error path.

    Args:
        db_path: Path to the SQLite database file (created on first use).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Connection-as-context-manager commits on success, rolls back on error.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS violations (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    request_preview TEXT,
                    response_preview TEXT,
                    categories TEXT,
                    reasons TEXT,
                    logged_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
    finally:
        conn.close()

def log_violation(
    request: str,
    response: str,
    violations: list,
    db_path: str = "guardrail_violations.db"
):
    """Persist one guardrail failure to the SQLite log for later review.

    Only 200-character previews of the request/response are stored; the
    categories and reasons are JSON-encoded lists pulled from the violation
    dicts.  The connection is always closed, even when the insert fails.

    Args:
        request: Original user request (truncated to 200 chars).
        response: Blocked AI response (truncated to 200 chars).
        violations: List of dicts each carrying "category" and "reason".
        db_path: Path to the SQLite database (table created by init_violation_log).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Commit on success, roll back on error, and always release the handle.
        with conn:
            conn.execute(
                "INSERT INTO violations (request_preview, response_preview, categories, reasons) VALUES (?, ?, ?, ?)",
                (
                    request[:200],
                    response[:200],
                    json.dumps([v["category"] for v in violations]),
                    json.dumps([v["reason"] for v in violations])
                )
            )
    finally:
        conn.close()

def get_violation_summary(hours: int = 24, db_path: str = "guardrail_violations.db") -> dict:
    """Aggregate logged violations from the last *hours* hours.

    Note: logged_at is stored via SQLite CURRENT_TIMESTAMP (UTC) and compared
    against datetime('now') (also UTC), so the time window is consistent.

    Args:
        hours: Size of the lookback window.
        db_path: Path to the SQLite violation log.

    Returns:
        {"total_violations": int, "by_category": {category: count}, "hours": hours}
    """
    from collections import Counter  # local import kept to match the file's style

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            "SELECT categories FROM violations WHERE logged_at > datetime('now', ?)",
            (f'-{hours} hours',),
        )
        rows = cur.fetchall()
    finally:
        # Close even if the query raises (e.g. table missing).
        conn.close()

    all_categories = []
    for (categories_json,) in rows:
        all_categories.extend(json.loads(categories_json))

    return {
        "total_violations": len(rows),
        "by_category": dict(Counter(all_categories)),
        "hours": hours
    }

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems