Systems Library / AI Model Setup / How to Implement AI Content Moderation
AI Model Setup advanced

How to Implement AI Content Moderation

Automatically moderate user-generated content using AI classification.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Every platform that accepts user-generated content needs moderation. Without it, your community turns toxic fast and your brand takes the hit. Hiring human moderators is expensive and slow. AI moderation that catches 95% of violations instantly, at any scale, with a human review queue for edge cases, is how you run a clean platform without a headcount problem.

I have built moderation systems for online communities, comment sections, and user review platforms. The key is layering OpenAI's built-in moderation endpoint with a custom classifier tuned to your specific community rules. The built-in API catches the obvious violations. Your custom layer catches the context-specific ones.

What You Need Before Starting

You need an OpenAI API key and a Python environment with the openai package installed. The examples below also use SQLite (built into Python) for the human review queue.

Step 1: Start with OpenAI's Moderation Endpoint

The moderation endpoint is free and catches the major violation categories with high accuracy. Always run this first.

import openai
import os

# Prefer the key from the environment; the placeholder fallback is only so the
# snippet runs as pasted. Never commit a real API key in source.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

def run_openai_moderation(content: str) -> dict:
    """Run *content* through OpenAI's moderation endpoint.

    Returns a dict with:
      - "flagged": the API's overall boolean verdict
      - "categories": per-category boolean flags
      - "scores": per-category scores, rounded to 4 decimal places
    """
    # The boolean flags and the scores expose the same attribute names, so
    # keep one tuple instead of duplicating the list per mapping.
    categories = (
        "harassment", "harassment_threatening", "hate",
        "hate_threatening", "self_harm", "self_harm_intent",
        "sexual", "sexual_minors", "violence", "violence_graphic",
    )

    response = client.moderations.create(input=content)
    result = response.results[0]

    return {
        "flagged": result.flagged,
        "categories": {
            cat: getattr(result.categories, cat) for cat in categories
        },
        "scores": {
            cat: round(getattr(result.category_scores, cat), 4)
            for cat in categories
        },
    }

Step 2: Build a Custom Policy Classifier

OpenAI's endpoint does not know your specific community rules. A custom classifier handles spam, off-topic content, competitor mentions, and platform-specific violations.

# System prompt for the custom policy layer. This exact text is sent to the
# model as the system message; edit the rule list to match your community.
COMMUNITY_POLICY = """
You are a content moderator for a professional marketing community. 
Classify the content against these specific rules:

VIOLATIONS (return the category name):
- spam: Repeated content, excessive self-promotion, or affiliate links
- off_topic: Not related to marketing, advertising, or business growth
- competitor_mention: Mentions specific competing platforms or services negatively
- personal_attack: Attacks a specific person's character rather than their ideas
- misinformation: Makes false claims about marketing tactics or statistics
- solicitation: Direct sales pitches or unsolicited service offers

CLEAN: Content that follows community rules

Return JSON: {"clean": true/false, "category": "category_name_or_null", "confidence": 0.0-1.0, "reason": "brief reason"}
Only return the JSON, nothing else."""

def run_custom_moderation(content: str, model: str = "gpt-4o-mini") -> dict:
    """Classify *content* against COMMUNITY_POLICY using a chat model.

    Returns the parsed JSON verdict the prompt asks for: keys "clean",
    "category", "confidence", "reason". response_format forces syntactically
    valid JSON; temperature=0 keeps verdicts as repeatable as the model allows.

    Raises json.JSONDecodeError if the model returns non-JSON anyway.
    """
    import json  # hoisted to the top of the function, not buried after the call

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": COMMUNITY_POLICY},
            {"role": "user", "content": f"Content to moderate:\n\n{content}"},
        ],
        temperature=0,
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)

Step 3: Build the Combined Moderation Pipeline

Run both checks and combine results with a clear action decision.

from enum import Enum
from dataclasses import dataclass

class ModerationAction(str, Enum):
    """Final decision for a submission; str-mixed so values serialize cleanly."""

    APPROVE = "approve"  # Publish immediately
    BLOCK = "block"  # Reject outright
    REVIEW = "review"  # Send to human review queue

@dataclass
class ModerationResult:
    """Outcome of the moderation pipeline for one piece of content."""

    # Decision: approve / block / review.
    action: ModerationAction
    # Human-readable explanation of the decision.
    reason: str
    # 0.0-1.0 score backing the decision.
    confidence: float
    # True when the OpenAI moderation endpoint flagged the content.
    openai_flagged: bool
    # Violating category name, if one was identified.
    custom_category: str | None
    # Per-category scores from the OpenAI layer (may be empty, e.g. for
    # empty-content blocks).
    raw_scores: dict

def moderate_content(content: str) -> ModerationResult:
    """Run *content* through both moderation layers and decide an action.

    Pipeline:
      1. Reject empty content outright.
      2. OpenAI moderation endpoint (free, fast): hard-block high-risk
         categories above threshold, queue any other flag for human review.
      3. Custom policy classifier: block confident violations, queue
         uncertain ones.
    """
    # Skip empty content
    if not content.strip():
        return ModerationResult(
            action=ModerationAction.BLOCK,
            reason="Empty content",
            confidence=1.0,
            openai_flagged=False,
            custom_category=None,
            raw_scores={},
        )

    # Layer 1: OpenAI moderation (free, fast)
    openai_result = run_openai_moderation(content)
    scores = openai_result["scores"]

    # Hard block on OpenAI violations above threshold
    high_risk_categories = ["sexual_minors", "hate_threatening", "violence_graphic", "self_harm_intent"]
    for cat in high_risk_categories:
        if scores.get(cat, 0) > 0.5:
            return ModerationResult(
                action=ModerationAction.BLOCK,
                reason=f"High-risk violation: {cat}",
                confidence=scores[cat],
                openai_flagged=True,
                custom_category=cat,
                raw_scores=scores,
            )

    if openai_result["flagged"]:
        # Flagged but not high-risk: send to human review
        top_category = max(scores, key=scores.get)
        return ModerationResult(
            action=ModerationAction.REVIEW,
            reason=f"OpenAI flagged: {top_category}",
            confidence=scores[top_category],
            openai_flagged=True,
            custom_category=top_category,
            raw_scores=scores,
        )

    # Layer 2: Custom policy check (only if OpenAI didn't flag)
    custom_result = run_custom_moderation(content)

    # The classifier's JSON is model output and may omit keys despite the
    # prompt; use defensive .get defaults so a malformed response degrades to
    # human review instead of raising KeyError mid-request.
    if not custom_result.get("clean", False):
        category = custom_result.get("category")
        llm_confidence = custom_result.get("confidence", 0.0)
        if llm_confidence >= 0.85:
            return ModerationResult(
                action=ModerationAction.BLOCK,
                reason=f"Policy violation: {category}",
                confidence=llm_confidence,
                openai_flagged=False,
                custom_category=category,
                raw_scores=scores,
            )
        else:
            return ModerationResult(
                action=ModerationAction.REVIEW,
                reason=f"Uncertain policy violation: {category} ({custom_result.get('reason', '')})",
                confidence=llm_confidence,
                openai_flagged=False,
                custom_category=category,
                raw_scores=scores,
            )

    return ModerationResult(
        action=ModerationAction.APPROVE,
        reason="Passed all checks",
        # Confidence is the headroom below the highest OpenAI category score.
        confidence=1.0 - max(scores.values()),
        openai_flagged=False,
        custom_category=None,
        raw_scores=scores,
    )

Step 4: Build the Human Review Queue

Items that land in REVIEW need to be visible to moderators.

import sqlite3
from datetime import datetime

def init_review_queue():
    """Create the review_queue table in moderation.db (idempotent)."""
    conn = sqlite3.connect("moderation.db")
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS review_queue (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                content TEXT,
                author_id TEXT,
                reason TEXT,
                confidence REAL,
                category TEXT,
                submitted_at TEXT,
                reviewed_at TEXT,
                reviewer_decision TEXT,
                reviewer_id TEXT
            )
        """)
        conn.commit()
    finally:
        # Close even if the DDL fails so we never leak the connection.
        conn.close()

def queue_for_review(content: str, author_id: str, result: ModerationResult) -> int:
    """Insert a REVIEW item into the queue and return its row id."""
    conn = sqlite3.connect("moderation.db")
    try:
        cursor = conn.execute(
            """INSERT INTO review_queue (content, author_id, reason, confidence, category, submitted_at)
            VALUES (?, ?, ?, ?, ?, ?)""",
            (content, author_id, result.reason, result.confidence, result.custom_category, datetime.now().isoformat())
        )
        row_id = cursor.lastrowid
        conn.commit()
        return row_id
    finally:
        # Close on every path; the original leaked the handle on insert failure.
        conn.close()

def process_submission(content: str, author_id: str) -> dict:
    """Moderate one submission and report its publication status."""
    verdict = moderate_content(content)

    # Anything the pipeline is unsure about goes to the human queue.
    if verdict.action == ModerationAction.REVIEW:
        return {
            "published": False,
            "queued": True,
            "queue_id": queue_for_review(content, author_id, verdict),
        }

    blocked = verdict.action == ModerationAction.BLOCK
    return {
        "published": verdict.action == ModerationAction.APPROVE,
        "blocked": blocked,
        "reason": verdict.reason if blocked else None,
    }

# Make sure the review table exists as soon as this module is loaded.
init_review_queue()

Step 5: Add Moderation Analytics

Track what is being flagged so you can tune your policy and spot bad actors.

def get_moderation_stats(days: int = 7) -> dict:
    """Summarize review-queue volume by category over the last *days* days.

    Returns {"period_days", "violations_by_category", "total_reviewed"};
    NULL categories are bucketed under "unknown".
    """
    conn = sqlite3.connect("moderation.db")
    try:
        # submitted_at holds datetime.now().isoformat(): LOCAL time with a
        # "T" separator. SQLite's datetime('now') is UTC with a space
        # separator, so a raw string comparison is wrong on two counts.
        # Normalize the stored value with datetime(submitted_at) and shift
        # the cutoff to local time so both sides match.
        rows = conn.execute(
            """SELECT category, COUNT(*) as count
            FROM review_queue
            WHERE datetime(submitted_at) > datetime('now', ?, 'localtime')
            GROUP BY category ORDER BY count DESC""",
            (f"-{days} days",),
        ).fetchall()
    finally:
        conn.close()

    return {
        "period_days": days,
        "violations_by_category": {r[0] or "unknown": r[1] for r in rows},
        "total_reviewed": sum(r[1] for r in rows),
    }

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems