How to Create an AI-Powered Content Audit System
Audit your entire content library with AI to find optimization opportunities.
Jay Banlasan
The AI Systems Guy
Most content libraries are a mess of old articles that are cannibalizing each other, duplicating topics, and sitting at page three of Google with no plan to move them. This AI content audit system crawls your entire content library, categorizes every piece by health status, and surfaces a prioritized action list. Keep, consolidate, refresh, or delete. That is the output.
Running a full content audit used to take a week. This system does the analysis overnight. You spend your time executing on the plan, not building it.
What You Need Before Starting
- Python 3.10 or higher
- Anthropic API key
- SerpAPI key
- Google Analytics 4 API access
- Your sitemap URL
pip install anthropic requests beautifulsoup4 lxml google-analytics-data python-dotenv
Step 1: Pull All Content URLs and Metadata
import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Read API keys and config (ANTHROPIC_API_KEY, GOOGLE_CREDENTIALS_PATH,
# GA4_PROPERTY_ID) from a local .env file into the process environment.
load_dotenv()
def crawl_content_library(sitemap_url: str, content_paths: list = None) -> list:
    """Fetch all content URLs from the sitemap and collect per-page metadata.

    Args:
        sitemap_url: Absolute URL of the XML sitemap.
        content_paths: URL path fragments that identify content pages.
            Defaults to common blog/article prefixes.

    Returns:
        A list of page-metadata dicts (see extract_page_metadata); an
        empty list if the sitemap cannot be fetched or parsed.
    """
    if content_paths is None:
        content_paths = ["/blog/", "/articles/", "/systems/", "/guides/", "/posts/"]
    try:
        response = requests.get(sitemap_url, timeout=15)
        # Fail fast on 4xx/5xx: without this, an error page parses as an
        # empty sitemap and the audit silently reports zero URLs.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "xml")  # "xml" parser requires lxml
        all_urls = [loc.text for loc in soup.find_all("loc")]
    except Exception as e:
        print(f"Sitemap error: {e}")
        return []
    content_urls = [u for u in all_urls if any(p in u for p in content_paths)]
    print(f"Found {len(content_urls)} content URLs")
    pages = []
    for url in content_urls:
        meta = extract_page_metadata(url)
        if meta:
            pages.append(meta)
    return pages
def extract_page_metadata(url: str) -> dict:
    """Fetch one page and extract the on-page signals used by the audit.

    Args:
        url: Absolute URL of the content page.

    Returns:
        A dict with the URL, H1, title, meta description, rough word
        count, H2 count, Last-Modified header, and slug — or None if the
        page cannot be fetched or returns an error status.
    """
    try:
        response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        # Skip error pages (404s, 500s): their markup is not real content
        # and would pollute the audit with bogus metadata.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        h1 = soup.find("h1")
        title_tag = soup.find("title")
        meta_desc = soup.find("meta", attrs={"name": "description"})
        # Rough count over the full rendered text (includes nav/footer chrome).
        word_count = len(soup.get_text().split())
        h2_count = len(soup.find_all("h2"))
        last_modified = response.headers.get("Last-Modified", "")
        return {
            "url": url,
            "h1": h1.get_text(strip=True) if h1 else "",
            "page_title": title_tag.get_text(strip=True) if title_tag else "",
            "meta_description": meta_desc.get("content", "") if meta_desc else "",
            "word_count": word_count,
            "h2_count": h2_count,
            "last_modified": last_modified,
            "slug": url.rstrip("/").split("/")[-1]
        }
    except Exception:
        # Best-effort crawl: one unreachable page should not abort the run.
        return None
Step 2: Enrich with Traffic Data
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import RunReportRequest, DateRange, Dimension, Metric
from google.oauth2 import service_account
def get_traffic_data(property_id: str, limit: int = 500) -> dict:
    """Pull 90-day GA4 engagement metrics for every page path.

    Args:
        property_id: Numeric GA4 property ID.
        limit: Maximum number of page paths to fetch (default 500,
            previously hard-coded; raise for larger content libraries).

    Returns:
        Mapping of GA4 pagePath -> dict with pageviews_90d,
        avg_duration_sec, bounce_rate, and new_users_90d.
    """
    # Service-account auth; credentials file path comes from the .env file.
    credentials = service_account.Credentials.from_service_account_file(
        os.getenv("GOOGLE_CREDENTIALS_PATH"),
        scopes=["https://www.googleapis.com/auth/analytics.readonly"]
    )
    ga_client = BetaAnalyticsDataClient(credentials=credentials)
    request = RunReportRequest(
        property=f"properties/{property_id}",
        date_ranges=[DateRange(start_date="90daysAgo", end_date="today")],
        dimensions=[Dimension(name="pagePath")],
        metrics=[
            Metric(name="screenPageViews"),
            Metric(name="averageSessionDuration"),
            Metric(name="bounceRate"),
            Metric(name="newUsers")
        ],
        limit=limit
    )
    response = ga_client.run_report(request)
    traffic = {}
    for row in response.rows:
        path = row.dimension_values[0].value
        # metric_values order matches the metrics list in the request above.
        traffic[path] = {
            "pageviews_90d": int(row.metric_values[0].value),
            "avg_duration_sec": float(row.metric_values[1].value),
            "bounce_rate": float(row.metric_values[2].value),
            "new_users_90d": int(row.metric_values[3].value)
        }
    return traffic
def merge_traffic_data(pages: list, traffic: dict) -> list:
    """Attach GA4 metrics to each crawled page, mutating pages in place.

    Pages with no matching GA4 row get zeroed metrics and a worst-case
    bounce rate of 1.0 so they sort as low performers in the audit.

    Args:
        pages: Page dicts from crawl_content_library (each has a "url").
        traffic: pagePath -> metrics mapping from get_traffic_data.

    Returns:
        The same pages list, enriched.
    """
    from urllib.parse import urlparse

    for page in pages:
        # GA4 pagePath is the URL path without scheme/host or query string;
        # urlparse handles this correctly where manual splitting would keep
        # any ?query suffix attached to the last segment.
        path = urlparse(page["url"]).path
        t = traffic.get(path, {})
        page["pageviews_90d"] = t.get("pageviews_90d", 0)
        page["avg_duration_sec"] = t.get("avg_duration_sec", 0)
        page["bounce_rate"] = t.get("bounce_rate", 1.0)
        page["new_users_90d"] = t.get("new_users_90d", 0)
    return pages
Step 3: AI Health Scoring for Each Page
import anthropic
import json
# Single shared Anthropic client; reads ANTHROPIC_API_KEY from the
# environment populated by load_dotenv() above.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
def score_content_health(page: dict, similar_pages: list | None = None) -> dict:
    """Ask Claude to audit one page and recommend keep/refresh/consolidate/delete.

    Args:
        page: Metadata dict from extract_page_metadata, enriched with GA4
            metrics by merge_traffic_data.
        similar_pages: Optional pages with overlapping titles, shown to the
            model as possible cannibalization (only the first 3 are used).

    Returns:
        The model's audit JSON merged with the original page dict (page
        fields win on any key collision).

    Raises:
        ValueError: If the model response contains no JSON object.
        json.JSONDecodeError: If the extracted span is not valid JSON.
    """
    similar_str = ""
    if similar_pages:
        similar_str = "\nSIMILAR PAGES ON YOUR SITE (possible cannibalization):\n"
        similar_str += "\n".join(f"- {p['h1']} ({p['url']})" for p in similar_pages[:3])
    prompt = f"""Audit this content page and recommend an action.
PAGE DATA:
URL: {page['url']}
Title/H1: {page['h1']}
Meta Description: {page['meta_description'][:150]}
Word Count: {page['word_count']}
H2 Sections: {page['h2_count']}
Last Modified: {page['last_modified'] or 'Unknown'}
PERFORMANCE (last 90 days):
Pageviews: {page['pageviews_90d']}
Avg Session Duration: {page['avg_duration_sec']:.0f} seconds
Bounce Rate: {page['bounce_rate']:.0%}
New Users: {page['new_users_90d']}
{similar_str}
Audit this page and return JSON:
{{
"health_status": "healthy/needs_refresh/consolidate/delete",
"health_score": 0,
"action": "keep/refresh/consolidate/delete/redirect",
"urgency": "immediate/next_quarter/low",
"reasoning": "2 sentences on why this action is recommended",
"specific_issues": [],
"refresh_effort": "none/low/medium/high",
"cannibalization_risk": "none/low/medium/high",
"seo_opportunity": "none/low/medium/high"
}}
Health score: 1-100 based on traffic, engagement, content quality signals, and freshness.
Delete candidates: under 200 words, under 100 pageviews in 90 days, duplicate coverage.
Consolidate candidates: near-duplicate topics, both low traffic, one could absorb the other.
Refresh candidates: decent traffic but high bounce, or clear traffic decay signal."""
    message = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=600,
        messages=[{"role": "user", "content": prompt}]
    )
    raw = message.content[0].text.strip()
    # Models sometimes wrap the JSON in markdown fences or add preamble
    # text; extracting the outermost {...} span is more robust than
    # splitting on ``` fences and slicing off a "json" prefix.
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end == -1:
        raise ValueError(f"No JSON object in model response: {raw[:200]}")
    result = json.loads(raw[start:end + 1])
    # Keep the crawl/traffic fields alongside the AI verdict for the CSV.
    result.update(page)
    return result
Step 4: Generate the Full Audit Report
import csv
from datetime import datetime
def _find_similar_pages(page: dict, pages: list, max_results: int = 3) -> list:
    """Return up to max_results pages whose H1 shares a significant word
    with this page's H1.

    Matching is whole-word on words longer than 3 characters, so generic
    tokens like "how", "to", or "an" don't flag every page on the site
    as a cannibalization risk (a substring check on the first few title
    words would).
    """
    words = {w for w in page["h1"].lower().split() if len(w) > 3}
    similar = []
    for other in pages:
        if other is page:
            continue
        other_words = {w for w in other["h1"].lower().split() if len(w) > 3}
        if words & other_words:
            similar.append(other)
            if len(similar) == max_results:
                break
    return similar

def run_full_audit(sitemap_url: str, ga_property_id: str, output_path: str = "content-audit.csv"):
    """Run the end-to-end audit: crawl, enrich with GA4 data, AI-score
    each page, and write a prioritized CSV report.

    Args:
        sitemap_url: XML sitemap URL of the site to audit.
        ga_property_id: Numeric GA4 property ID.
        output_path: Destination CSV path.

    Returns:
        List of audited page dicts, sorted by ascending health score
        (worst first, so the report leads with the urgent fixes).
    """
    print("Step 1: Crawling content library...")
    pages = crawl_content_library(sitemap_url)
    print("Step 2: Fetching traffic data...")
    traffic = get_traffic_data(ga_property_id)
    pages = merge_traffic_data(pages, traffic)
    print("Step 3: Scoring content health...")
    audited = []
    for i, page in enumerate(pages):
        print(f"  Auditing {i+1}/{len(pages)}: {page['slug']}")
        similar = _find_similar_pages(page, pages)
        scored = score_content_health(page, similar)
        audited.append(scored)
    # Pages the model didn't score default to a mid score of 50.
    audited.sort(key=lambda x: x.get("health_score", 50))
    fieldnames = [
        "url", "h1", "word_count", "pageviews_90d", "new_users_90d",
        "bounce_rate", "avg_duration_sec", "health_status", "health_score",
        "action", "urgency", "reasoning", "specific_issues",
        "cannibalization_risk", "seo_opportunity", "refresh_effort"
    ]
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        # extrasaction="ignore" silently drops any extra keys the model returns.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(audited)
    deletes = len([p for p in audited if p.get("action") == "delete"])
    refreshes = len([p for p in audited if p.get("action") == "refresh"])
    consolidates = len([p for p in audited if p.get("action") == "consolidate"])
    keeps = len([p for p in audited if p.get("action") == "keep"])
    print(f"\nAUDIT COMPLETE: {len(audited)} pages")
    print(f"  Keep: {keeps}")
    print(f"  Refresh: {refreshes}")
    print(f"  Consolidate: {consolidates}")
    print(f"  Delete: {deletes}")
    print(f"  Report saved to {output_path}")
    return audited
if __name__ == "__main__":
    # Entry point: GA4_PROPERTY_ID comes from the .env file; the output
    # CSV is date-stamped so repeated runs don't overwrite earlier audits.
    results = run_full_audit(
        sitemap_url="https://yoursite.com/sitemap.xml",
        ga_property_id=os.getenv("GA4_PROPERTY_ID"),
        output_path=f"content-audit-{datetime.now().strftime('%Y-%m-%d')}.csv"
    )
What to Build Next
- Schedule quarterly audits automatically and diff the results against the previous quarter to see how your content health is trending
- Build a consolidation execution tool that redirects deleted pages, merges content, and updates internal links in one batch operation
- Add a content ROI calculator that shows the revenue impact of each audit action based on historical traffic-to-lead conversion rates
Related Reading
- How to Build an AI Blog Post Generator - Replace deleted or thin content with well-structured new articles
- How to Create Automated Content Performance Reports - Track the traffic impact of your audit actions over time
- How to Build an AI Script Writer for Video Content - Convert your highest-performing audited articles into video content
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment