How to Build a Content Gap Analysis System
Identify content opportunities your competitors cover that you don't.
Jay Banlasan
The AI Systems Guy
Most content teams publish based on what they feel like writing, not based on where the actual gaps are. This AI-powered content gap analysis system scrapes your competitors' content indexes, compares them against your own, and surfaces the topics they rank for that you are missing entirely. You stop guessing and start targeting gaps with actual evidence.
The ROI is direct. Every content gap you fill is a keyword cluster you claim that your competitors already proved has demand. You are not taking a risk on a topic. You are entering a market that is already validated.
What You Need Before Starting
- Python 3.10 or higher
- Anthropic API key
- SerpAPI key (optional — only needed for the search-volume extension described at the end)
- List of 3-5 competitor domains
- Your own sitemap URL or a list of your published URLs
pip install anthropic requests beautifulsoup4 lxml python-dotenv
Step 1: Pull Your Own Published URLs
import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Pull ANTHROPIC_API_KEY (and any other secrets) from a local .env file.
load_dotenv()
def get_sitemap_urls(sitemap_url: str) -> list:
    """Extract all URLs from an XML sitemap.

    Args:
        sitemap_url: Full sitemap URL, e.g. "https://yoursite.com/sitemap.xml".

    Returns:
        Every <loc> URL found in the sitemap, or an empty list if the
        fetch or parse fails (the error is printed, not raised).
    """
    try:
        response = requests.get(sitemap_url, timeout=10)
        # Without this, a 404/500 error page would silently parse to
        # "Found 0 URLs" instead of reporting the failure.
        response.raise_for_status()
        # NOTE: the "xml" parser requires lxml to be installed.
        soup = BeautifulSoup(response.content, "xml")
        urls = [loc.text for loc in soup.find_all("loc")]
        print(f"Found {len(urls)} URLs in sitemap")
        return urls
    except Exception as e:
        print(f"Sitemap error: {e}")
        return []
def filter_content_urls(urls: list, content_paths: list = ["/blog/", "/articles/", "/systems/", "/guides/"]) -> list:
"""Keep only content URLs, skip product/utility pages."""
return [url for url in urls if any(path in url for path in content_paths)]
Step 2: Get Competitor Content
Pull content from competitor sitemaps or crawl their blog indexes:
def get_competitor_content(domain: str, max_pages: int = 100) -> list:
    """Pull content URLs from a competitor's sitemap.

    Tries common sitemap locations in order and stops at the first one
    that responds with HTTP 200.

    Args:
        domain: Bare competitor domain, e.g. "competitor1.com".
        max_pages: Cap on how many content URLs to keep.

    Returns:
        Up to max_pages content URLs, or [] if no sitemap was reachable.
    """
    pages = []
    sitemap_attempts = [
        f"https://{domain}/sitemap.xml",
        f"https://{domain}/sitemap_index.xml",
        f"https://{domain}/blog/sitemap.xml"
    ]
    for sitemap_url in sitemap_attempts:
        try:
            response = requests.get(sitemap_url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "xml")
                urls = [loc.text for loc in soup.find_all("loc")]
                # Keep only URLs that look like articles, not product/utility pages.
                content_urls = [u for u in urls if any(p in u for p in ["/blog/", "/articles/", "/post/", "/guide/"])]
                pages = content_urls[:max_pages]
                print(f"{domain}: Found {len(pages)} content URLs")
                break
        except requests.RequestException:
            # Narrowed from a bare `except:` so Ctrl-C and genuine bugs
            # are no longer silently swallowed; a failed fetch just moves
            # on to the next candidate sitemap URL.
            continue
    return pages
def extract_page_topics(urls: list, domain: str) -> list:
    """Extract topic signals from URLs and page titles.

    Fetches up to 50 pages and reads the <h1> (or <title>) as the topic
    title; if the fetch or parse fails, falls back to a hint derived from
    the URL slug so every URL still yields a topic entry.

    Args:
        urls: Content page URLs for one competitor.
        domain: The competitor domain, recorded on each topic dict.

    Returns:
        A list of dicts with keys: url, title, slug_hint, domain.
    """
    topics = []
    for url in urls[:50]:
        # The URL slug is a cheap topic hint even when the page fetch fails.
        slug = url.rstrip("/").split("/")[-1]
        topic_hint = slug.replace("-", " ").replace("_", " ")
        try:
            response = requests.get(url, timeout=8, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(response.content, "html.parser")
            title_tag = soup.find("h1") or soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else topic_hint
            topics.append({
                "url": url,
                "title": title,
                "slug_hint": topic_hint,
                "domain": domain
            })
        except Exception:
            # Narrowed from a bare `except:` (which also caught Ctrl-C);
            # on any fetch/parse error, fall back to the slug-derived title.
            topics.append({
                "url": url,
                "title": topic_hint,
                "slug_hint": topic_hint,
                "domain": domain
            })
    return topics
Step 3: Compare and Find Gaps with Claude
import anthropic
import json
# Single shared API client; reads ANTHROPIC_API_KEY from the environment
# (loaded earlier by load_dotenv()).
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
def identify_content_gaps(
    your_content: list,
    competitor_content: list,
    your_niche: str,
    your_audience: str
) -> list:
    """Ask Claude to compare your content against competitors' and list gaps.

    Args:
        your_content: Your published content — either topic dicts with a
            "title" key, or plain URL/title strings (both are accepted).
        competitor_content: Topic dicts from extract_page_topics; each needs
            "domain" and "title" keys.
        your_niche: Short description of your content niche.
        your_audience: Who the content is written for.

    Returns:
        A list of gap dicts parsed from the model's JSON response.

    Raises:
        json.JSONDecodeError: If the model response is not valid JSON.
    """
    # Accept both dicts and bare strings. The original `c.get(...)` call
    # crashed with AttributeError when given URL strings — which is exactly
    # what filter_content_urls returns.
    your_titles = [
        (c.get("title") or str(c)) if isinstance(c, dict) else str(c)
        for c in your_content[:100]
    ]
    comp_sample = competitor_content[:80]
    your_str = "\n".join(f"- {t}" for t in your_titles)
    comp_str = "\n".join(f"- [{c['domain']}] {c['title']}" for c in comp_sample)
    prompt = f"""You are a content strategist. Identify content gaps based on this analysis.
YOUR NICHE: {your_niche}
YOUR AUDIENCE: {your_audience}
YOUR PUBLISHED CONTENT (sample):
{your_str}
COMPETITOR CONTENT (sample):
{comp_str}
Identify 15 content gap opportunities: topics competitors cover that you do not.
For each gap, provide:
1. topic: The content topic or keyword cluster
2. gap_type: "missing_entirely" / "covered_shallowly" / "different_angle_available"
3. competitor_evidence: Which competitor(s) cover this
4. audience_value: Why your audience needs this content
5. difficulty: "easy" / "medium" / "hard" (based on depth required)
6. suggested_title: A strong title for an article on this topic
7. priority: "high" / "medium" / "low"
Prioritize topics that:
- Multiple competitors cover (proven demand)
- Align closely with your audience's primary pain points
- Your existing content creates a natural lead-in for
Return as a JSON array."""
    message = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=3000,
        messages=[{"role": "user", "content": prompt}]
    )
    raw = message.content[0].text.strip()
    # Strip a Markdown code fence (```json ... ```) if the model added one.
    if raw.startswith("```"):
        raw = raw.split("```")[1]
        raw = raw.removeprefix("json")
    return json.loads(raw)
Step 4: Generate a Prioritized Action Plan
def generate_gap_report(gaps: list, output_path: str = "content-gap-report.md"):
    """Write a Markdown report of content gaps, grouped by priority.

    Args:
        gaps: Gap dicts as returned by identify_content_gaps.
        output_path: Destination path for the Markdown report.
    """
    # Bucket gaps by priority; anything with an unknown priority is still
    # counted in the total but gets no section of its own.
    buckets = {"high": [], "medium": [], "low": []}
    for gap in gaps:
        level = gap.get("priority")
        if level in buckets:
            buckets[level].append(gap)

    parts = ["# Content Gap Analysis Report\n\n"]
    parts.append(f"Total gaps identified: {len(gaps)}\n")
    parts.append(
        f"High priority: {len(buckets['high'])} | Medium: {len(buckets['medium'])} | Low: {len(buckets['low'])}\n\n"
    )
    parts.append("---\n\n")

    sections = (
        ("High Priority", buckets["high"]),
        ("Medium Priority", buckets["medium"]),
        ("Low Priority", buckets["low"]),
    )
    for label, section_gaps in sections:
        if not section_gaps:
            continue
        parts.append(f"## {label} Gaps\n\n")
        # Numbering restarts at 1 within each priority section.
        for rank, gap in enumerate(section_gaps, 1):
            parts.append(f"### {rank}. {gap.get('suggested_title', gap.get('topic', ''))}\n\n")
            parts.append(f"**Topic:** {gap.get('topic', '')}\n")
            parts.append(f"**Gap Type:** {gap.get('gap_type', '')}\n")
            parts.append(f"**Competitors Covering This:** {gap.get('competitor_evidence', '')}\n")
            parts.append(f"**Audience Value:** {gap.get('audience_value', '')}\n")
            parts.append(f"**Difficulty:** {gap.get('difficulty', '')}\n\n")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"Gap report saved to {output_path}")
if __name__ == "__main__":
    # 1. Inventory your own published content (replace with your real sitemap URL).
    my_urls = get_sitemap_urls("https://yoursite.com/sitemap.xml")
    my_content = filter_content_urls(my_urls)
    # 2. Collect topic data from each competitor (replace the placeholder domains).
    all_competitor_content = []
    for domain in ["competitor1.com", "competitor2.com", "competitor3.com"]:
        urls = get_competitor_content(domain)
        topics = extract_page_topics(urls, domain)
        all_competitor_content.extend(topics)
    # 3. Ask Claude to surface the gaps, then write the Markdown report.
    gaps = identify_content_gaps(
        your_content=my_content,
        competitor_content=all_competitor_content,
        your_niche="AI tools for marketing operations",
        your_audience="Marketing managers and agency owners"
    )
    generate_gap_report(gaps)
What to Build Next
- Set this up to run monthly and diff the results against last month's report to track whether you are closing gaps over time
- Connect the high-priority gaps directly to your content brief generator to auto-create briefs for the top 5 opportunities
- Add search volume data from SerpAPI to rank gaps by traffic potential, not just coverage frequency
Related Reading
- How to Build an AI Blog Post Generator - Fill your identified gaps with full articles automatically
- How to Create Automated Content Performance Reports - Track organic traffic growth as you close content gaps
- How to Build an AI Script Writer for Video Content - Cover content gaps with video content where competitors are text-only
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment