Systems Library / Marketing Automation / How to Create an AI-Powered Content Audit System
Marketing Automation content marketing

How to Create an AI-Powered Content Audit System

Audit your entire content library with AI to find optimization opportunities.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Most content libraries are a mess of old articles that are cannibalizing each other, duplicating topics, and sitting at page three of Google with no plan to move them. This AI content audit system crawls your entire content library, categorizes every piece by health status, and surfaces a prioritized action list. Keep, consolidate, refresh, or delete. That is the output.

Running a full content audit used to take a week. This system does the analysis overnight. You spend your time executing on the plan, not building it.

What You Need Before Starting

Step 1: Pull All Content URLs and Metadata

import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

def crawl_content_library(sitemap_url: str, content_paths: list = None) -> list:
    """Fetch all content URLs from the sitemap and extract per-page metadata.

    Args:
        sitemap_url: Full URL of the XML sitemap (e.g. https://site.com/sitemap.xml).
        content_paths: URL substrings that identify content pages; defaults to
            common blog/article path prefixes.

    Returns:
        A list of metadata dicts (one per page that fetched successfully);
        empty list when the sitemap itself cannot be fetched or parsed.
    """
    if content_paths is None:
        content_paths = ["/blog/", "/articles/", "/systems/", "/guides/", "/posts/"]

    try:
        response = requests.get(sitemap_url, timeout=15)
        # Without this, a 404/500 sitemap parses as empty soup and the audit
        # silently reports zero URLs instead of surfacing the real error.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "xml")
        all_urls = [loc.text for loc in soup.find_all("loc")]
    except Exception as e:
        print(f"Sitemap error: {e}")
        return []

    # Keep only URLs that look like content pages (substring match).
    content_urls = [u for u in all_urls if any(p in u for p in content_paths)]

    print(f"Found {len(content_urls)} content URLs")

    pages = []
    for url in content_urls:
        meta = extract_page_metadata(url)
        if meta:  # pages that failed to fetch/parse return None and are skipped
            pages.append(meta)

    return pages

def extract_page_metadata(url: str) -> dict:
    """Fetch one page and pull SEO-relevant metadata.

    Args:
        url: Absolute URL of the content page.

    Returns:
        A dict of on-page signals (h1, title, meta description, word count,
        H2 count, Last-Modified header, slug), or None when the request or
        parse fails so the caller can skip the page.
    """
    try:
        response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        # Treat 4xx/5xx as failures; otherwise an error page would be audited
        # as if it were real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        h1 = soup.find("h1")
        title_tag = soup.find("title")
        meta_desc = soup.find("meta", attrs={"name": "description"})

        # NOTE: get_text() includes nav/footer/script text, so this word count
        # is an over-estimate of the article body -- fine for relative scoring.
        word_count = len(soup.get_text().split())
        h2_count = len(soup.find_all("h2"))

        last_modified = response.headers.get("Last-Modified", "")

        return {
            "url": url,
            "h1": h1.get_text(strip=True) if h1 else "",
            "page_title": title_tag.get_text(strip=True) if title_tag else "",
            "meta_description": meta_desc.get("content", "") if meta_desc else "",
            "word_count": word_count,
            "h2_count": h2_count,
            "last_modified": last_modified,
            "slug": url.rstrip("/").split("/")[-1]
        }
    except Exception as e:
        # Best-effort crawl: report the failure instead of swallowing it
        # silently, then let the caller skip this page.
        print(f"Skipping {url}: {e}")
        return None

Step 2: Enrich with Traffic Data

from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import RunReportRequest, DateRange, Dimension, Metric
from google.oauth2 import service_account

def get_traffic_data(property_id: str) -> dict:
    """Pull 90 days of GA4 engagement metrics, keyed by page path.

    Args:
        property_id: Numeric GA4 property ID.

    Returns:
        Mapping of pagePath -> dict with pageviews_90d, avg_duration_sec,
        bounce_rate, and new_users_90d.
    """
    creds = service_account.Credentials.from_service_account_file(
        os.getenv("GOOGLE_CREDENTIALS_PATH"),
        scopes=["https://www.googleapis.com/auth/analytics.readonly"]
    )
    ga_client = BetaAnalyticsDataClient(credentials=creds)

    report = ga_client.run_report(RunReportRequest(
        property=f"properties/{property_id}",
        date_ranges=[DateRange(start_date="90daysAgo", end_date="today")],
        dimensions=[Dimension(name="pagePath")],
        metrics=[
            Metric(name="screenPageViews"),
            Metric(name="averageSessionDuration"),
            Metric(name="bounceRate"),
            Metric(name="newUsers")
        ],
        limit=500
    ))

    # One entry per row; the single dimension value is the page path.
    return {
        row.dimension_values[0].value: {
            "pageviews_90d": int(row.metric_values[0].value),
            "avg_duration_sec": float(row.metric_values[1].value),
            "bounce_rate": float(row.metric_values[2].value),
            "new_users_90d": int(row.metric_values[3].value)
        }
        for row in report.rows
    }

def merge_traffic_data(pages: list, traffic: dict) -> list:
    """Attach GA4 metrics to each crawled page, mutating the dicts in place.

    Pages with no matching GA4 path get zeroed metrics, except bounce_rate
    which defaults to 1.0 (worst case) so unknown pages score conservatively.
    """
    metric_defaults = (
        ("pageviews_90d", 0),
        ("avg_duration_sec", 0),
        ("bounce_rate", 1.0),
        ("new_users_90d", 0),
    )

    for entry in pages:
        # Drop the scheme and host ("https://site.com") to get the GA4 pagePath.
        path = "/" + "/".join(entry["url"].split("/")[3:])
        stats = traffic.get(path, {})

        for key, fallback in metric_defaults:
            entry[key] = stats.get(key, fallback)

    return pages

Step 3: AI Health Scoring for Each Page

import anthropic
import json

client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

def score_content_health(page: dict, similar_pages: list = None) -> dict:
    """Ask the model to audit one page and return its JSON verdict.

    Args:
        page: Metadata dict produced by the crawl + traffic-merge steps.
        similar_pages: Optional list of page dicts that may cannibalize this
            one; only the first three are included in the prompt.

    Returns:
        The model's audit JSON merged with the original page fields
        (page fields win on any key collision).
    """
    cannibal_note = ""
    if similar_pages:
        bullet_lines = [f"- {p['h1']} ({p['url']})" for p in similar_pages[:3]]
        cannibal_note = (
            "\nSIMILAR PAGES ON YOUR SITE (possible cannibalization):\n"
            + "\n".join(bullet_lines)
        )

    prompt = f"""Audit this content page and recommend an action.

PAGE DATA:
URL: {page['url']}
Title/H1: {page['h1']}
Meta Description: {page['meta_description'][:150]}
Word Count: {page['word_count']}
H2 Sections: {page['h2_count']}
Last Modified: {page['last_modified'] or 'Unknown'}

PERFORMANCE (last 90 days):
Pageviews: {page['pageviews_90d']}
Avg Session Duration: {page['avg_duration_sec']:.0f} seconds
Bounce Rate: {page['bounce_rate']:.0%}
New Users: {page['new_users_90d']}
{cannibal_note}

Audit this page and return JSON:
{{
  "health_status": "healthy/needs_refresh/consolidate/delete",
  "health_score": 0,
  "action": "keep/refresh/consolidate/delete/redirect",
  "urgency": "immediate/next_quarter/low",
  "reasoning": "2 sentences on why this action is recommended",
  "specific_issues": [],
  "refresh_effort": "none/low/medium/high",
  "cannibalization_risk": "none/low/medium/high",
  "seo_opportunity": "none/low/medium/high"
}}

Health score: 1-100 based on traffic, engagement, content quality signals, and freshness.
Delete candidates: under 200 words, under 100 pageviews in 90 days, duplicate coverage.
Consolidate candidates: near-duplicate topics, both low traffic, one could absorb the other.
Refresh candidates: decent traffic but high bounce, or clear traffic decay signal."""

    reply = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=600,
        messages=[{"role": "user", "content": prompt}]
    )

    payload = reply.content[0].text.strip()
    # Strip a markdown code fence if the model wrapped its JSON in one.
    if payload.startswith("```"):
        payload = payload.split("```")[1]
        if payload.startswith("json"):
            payload = payload[4:]

    verdict = json.loads(payload)
    verdict.update(page)
    return verdict

Step 4: Generate the Full Audit Report

import csv
from datetime import datetime

def run_full_audit(sitemap_url: str, ga_property_id: str, output_path: str = "content-audit.csv"):
    """Run the end-to-end audit: crawl, enrich with GA4, AI-score, write CSV.

    Args:
        sitemap_url: XML sitemap URL of the site to audit.
        ga_property_id: GA4 property ID for traffic enrichment.
        output_path: Destination CSV file for the prioritized action list.

    Returns:
        List of scored page dicts, sorted worst health first.
    """
    print("Step 1: Crawling content library...")
    pages = crawl_content_library(sitemap_url)

    print("Step 2: Fetching traffic data...")
    traffic = get_traffic_data(ga_property_id)
    pages = merge_traffic_data(pages, traffic)

    print("Step 3: Scoring content health...")
    audited = []
    for i, page in enumerate(pages):
        print(f"  Auditing {i+1}/{len(pages)}: {page['slug']}")

        # Crude cannibalization heuristic: other pages whose H1 contains any
        # of this page's first three H1 words (substring match, so short
        # common words will over-match). `is not` avoids O(n) dict comparisons.
        head_words = page['h1'].lower().split()[:3]
        similar = [p for p in pages if p is not page
                   and any(w in p['h1'].lower() for w in head_words)][:3]

        try:
            audited.append(score_content_health(page, similar))
        except Exception as e:
            # One failed API call or malformed JSON response must not lose
            # the entire overnight run -- log the miss and keep going.
            print(f"  Scoring failed for {page['url']}: {e}")

    # Worst health first so the report leads with the most urgent pages.
    audited.sort(key=lambda x: x.get("health_score", 50))

    fieldnames = [
        "url", "h1", "word_count", "pageviews_90d", "new_users_90d",
        "bounce_rate", "avg_duration_sec", "health_status", "health_score",
        "action", "urgency", "reasoning", "specific_issues",
        "cannibalization_risk", "seo_opportunity", "refresh_effort"
    ]

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        # extrasaction="ignore" drops page/AI fields not in the column list.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(audited)

    # Summary counts by recommended action.
    deletes = len([p for p in audited if p.get("action") == "delete"])
    refreshes = len([p for p in audited if p.get("action") == "refresh"])
    consolidates = len([p for p in audited if p.get("action") == "consolidate"])
    keeps = len([p for p in audited if p.get("action") == "keep"])

    print(f"\nAUDIT COMPLETE: {len(audited)} pages")
    print(f"  Keep: {keeps}")
    print(f"  Refresh: {refreshes}")
    print(f"  Consolidate: {consolidates}")
    print(f"  Delete: {deletes}")
    print(f"  Report saved to {output_path}")

    return audited

if __name__ == "__main__":
    # Entry point: audits the configured site and writes a date-stamped CSV.
    # Requires GA4_PROPERTY_ID (and the credentials/API keys used by the
    # helper functions) to be set in the environment / .env file.
    results = run_full_audit(
        sitemap_url="https://yoursite.com/sitemap.xml",
        ga_property_id=os.getenv("GA4_PROPERTY_ID"),
        output_path=f"content-audit-{datetime.now().strftime('%Y-%m-%d')}.csv"
    )

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems