Systems Library / Marketing Automation / How to Build a Content Gap Analysis System
Marketing Automation content marketing

How to Build a Content Gap Analysis System

Identify content opportunities your competitors cover that you don't.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Most content teams publish based on what they feel like writing, not based on where the actual gaps are. This AI-powered content gap analysis system scrapes your competitors' content indexes, compares them against your own, and surfaces the topics they rank for that you are missing entirely. You stop guessing and start targeting gaps with actual evidence.

The ROI is direct. Every content gap you fill is a keyword cluster you claim that your competitors already proved has demand. You are not taking a risk on a topic. You are entering a market that is already validated.

What You Need Before Starting

Step 1: Pull Your Own Published URLs

import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

def get_sitemap_urls(sitemap_url: str) -> list:
    """Extract all URLs from an XML sitemap.

    Args:
        sitemap_url: Full URL of the sitemap, e.g. "https://site.com/sitemap.xml".

    Returns:
        List of URL strings taken from <loc> elements, or [] on any
        fetch/parse error (best-effort: errors are printed, not raised).
    """
    try:
        response = requests.get(sitemap_url, timeout=10)
        # BUG FIX: the original parsed error pages too — a 404/500 body has no
        # <loc> tags, so it silently printed "Found 0 URLs" with no diagnostic.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "xml")
        urls = [loc.text for loc in soup.find_all("loc")]
        print(f"Found {len(urls)} URLs in sitemap")
        return urls
    except Exception as e:
        # Deliberate best-effort: report and return [] rather than crash the run.
        print(f"Sitemap error: {e}")
        return []

def filter_content_urls(urls: list, content_paths: tuple = ("/blog/", "/articles/", "/systems/", "/guides/")) -> list:
    """Keep only content URLs, skip product/utility pages.

    Args:
        urls: Candidate URLs (typically from get_sitemap_urls).
        content_paths: Path fragments that mark a URL as content. The default
            is a tuple rather than a list — a mutable default argument is a
            Python anti-pattern (shared across calls).

    Returns:
        URLs containing any of the given path fragments, in original order.
    """
    return [url for url in urls if any(path in url for path in content_paths)]

Step 2: Get Competitor Content

Pull content from competitor sitemaps or crawl their blog indexes:

def get_competitor_content(domain: str, max_pages: int = 100) -> list:
    """Try to pull content URLs from a competitor's sitemap.

    Tries common sitemap locations in order and stops at the first one that
    answers with HTTP 200.

    Args:
        domain: Bare domain, e.g. "competitor1.com".
        max_pages: Cap on the number of content URLs returned.

    Returns:
        Up to max_pages content URLs, or [] if no sitemap was reachable.
    """
    pages = []

    sitemap_attempts = [
        f"https://{domain}/sitemap.xml",
        f"https://{domain}/sitemap_index.xml",
        f"https://{domain}/blog/sitemap.xml"
    ]

    for sitemap_url in sitemap_attempts:
        try:
            response = requests.get(sitemap_url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "xml")
                urls = [loc.text for loc in soup.find_all("loc")]
                content_urls = [u for u in urls if any(p in u for p in ("/blog/", "/articles/", "/post/", "/guide/"))]
                pages = content_urls[:max_pages]
                print(f"{domain}: Found {len(pages)} content URLs")
                break
        except requests.RequestException:
            # BUG FIX: the original bare `except:` swallowed everything,
            # including KeyboardInterrupt/SystemExit; catch network errors only
            # and move on to the next candidate sitemap location.
            continue

    return pages

def extract_page_topics(urls: list, domain: str) -> list:
    """Extract topic signals from URLs and page titles.

    Fetches up to 50 pages. If a fetch fails, the page is still represented
    using a topic hint derived from its URL slug.

    Args:
        urls: Page URLs to sample (only the first 50 are fetched).
        domain: Domain label attached to every topic record.

    Returns:
        List of dicts with "url", "title", "slug_hint", and "domain" keys.
    """
    topics = []
    # Hoisted loop invariant; some sites block the default requests user agent.
    headers = {"User-Agent": "Mozilla/5.0"}

    for url in urls[:50]:
        # Derive a human-readable hint from the last path segment.
        slug = url.rstrip("/").split("/")[-1]
        topic_hint = slug.replace("-", " ").replace("_", " ")

        try:
            response = requests.get(url, timeout=8, headers=headers)
            soup = BeautifulSoup(response.content, "html.parser")

            # Prefer the on-page H1, fall back to <title>, then the slug hint.
            title_tag = soup.find("h1") or soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else topic_hint

            topics.append({
                "url": url,
                "title": title,
                "slug_hint": topic_hint,
                "domain": domain
            })
        except requests.RequestException:
            # BUG FIX: the original bare `except:` swallowed everything,
            # including KeyboardInterrupt; catch only network errors and keep
            # the slug-based fallback record.
            topics.append({
                "url": url,
                "title": topic_hint,
                "slug_hint": topic_hint,
                "domain": domain
            })

    return topics

Step 3: Compare and Find Gaps with Claude

import anthropic
import json

# Module-level Anthropic client; reads ANTHROPIC_API_KEY from the environment
# (populated by the load_dotenv() call earlier in the file).
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

def identify_content_gaps(
    your_content: list,
    competitor_content: list,
    your_niche: str,
    your_audience: str
) -> list:
    """Ask Claude to compare your content against competitors' and list gaps.

    Args:
        your_content: Your published content — dicts with a "title" key OR
            plain URL/title strings (the __main__ script passes URL strings).
        competitor_content: Dicts with "domain" and "title" keys, as produced
            by extract_page_topics().
        your_niche: Short description of your niche.
        your_audience: Short description of your target audience.

    Returns:
        List of gap dicts parsed from the model's JSON response.

    Raises:
        json.JSONDecodeError: If the model response is not valid JSON.
    """
    # BUG FIX: the original did `c.get("title", "") or c` for every item, which
    # raises AttributeError when c is a plain string — exactly what the
    # __main__ script passes (filter_content_urls returns URL strings).
    your_titles = [
        c.get("title", "") if isinstance(c, dict) else c
        for c in your_content[:100]
    ]
    comp_sample = competitor_content[:80]

    your_str = "\n".join(f"- {t}" for t in your_titles)
    comp_str = "\n".join(f"- [{c['domain']}] {c['title']}" for c in comp_sample)

    prompt = f"""You are a content strategist. Identify content gaps based on this analysis.

YOUR NICHE: {your_niche}
YOUR AUDIENCE: {your_audience}

YOUR PUBLISHED CONTENT (sample):
{your_str}

COMPETITOR CONTENT (sample):
{comp_str}

Identify 15 content gap opportunities: topics competitors cover that you do not.

For each gap, provide:
1. topic: The content topic or keyword cluster
2. gap_type: "missing_entirely" / "covered_shallowly" / "different_angle_available"
3. competitor_evidence: Which competitor(s) cover this
4. audience_value: Why your audience needs this content
5. difficulty: "easy" / "medium" / "hard" (based on depth required)
6. suggested_title: A strong title for an article on this topic
7. priority: "high" / "medium" / "low"

Prioritize topics that:
- Multiple competitors cover (proven demand)
- Align closely with your audience's primary pain points
- Your existing content creates a natural lead-in for

Return as a JSON array."""

    message = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=3000,
        messages=[{"role": "user", "content": prompt}]
    )

    # Strip an optional ```json ... ``` fence around the model's answer.
    # split("```")[1] also discards a trailing fence if one is present.
    raw = message.content[0].text.strip()
    if raw.startswith("```"):
        raw = raw.split("```")[1]
        if raw.startswith("json"):
            raw = raw[4:]

    return json.loads(raw)

Step 4: Generate a Prioritized Action Plan

def generate_gap_report(gaps: list, output_path: str = "content-gap-report.md"):
    """Write a markdown report of content gaps, grouped by priority tier.

    Args:
        gaps: Gap dicts as returned by identify_content_gaps(). Items whose
            "priority" is not "high"/"medium"/"low" are omitted from the
            per-tier sections (but still counted in the total).
        output_path: Destination markdown file.
    """
    # Bucket gaps by priority; unknown priorities fall through silently.
    tiers = {"high": [], "medium": [], "low": []}
    for gap in gaps:
        bucket = tiers.get(gap.get("priority"))
        if bucket is not None:
            bucket.append(gap)

    lines = [
        "# Content Gap Analysis Report\n\n",
        f"Total gaps identified: {len(gaps)}\n",
        f"High priority: {len(tiers['high'])}  |  Medium: {len(tiers['medium'])}  |  Low: {len(tiers['low'])}\n\n",
        "---\n\n",
    ]

    sections = (
        ("High Priority", tiers["high"]),
        ("Medium Priority", tiers["medium"]),
        ("Low Priority", tiers["low"]),
    )
    for label, bucket in sections:
        if not bucket:
            continue
        lines.append(f"## {label} Gaps\n\n")
        for idx, gap in enumerate(bucket, 1):
            lines.append(f"### {idx}. {gap.get('suggested_title', gap.get('topic', ''))}\n\n")
            lines.append(f"**Topic:** {gap.get('topic', '')}\n")
            lines.append(f"**Gap Type:** {gap.get('gap_type', '')}\n")
            lines.append(f"**Competitors Covering This:** {gap.get('competitor_evidence', '')}\n")
            lines.append(f"**Audience Value:** {gap.get('audience_value', '')}\n")
            lines.append(f"**Difficulty:** {gap.get('difficulty', '')}\n\n")

    with open(output_path, "w", encoding="utf-8") as report:
        report.writelines(lines)

    print(f"Gap report saved to {output_path}")

if __name__ == "__main__":
    # 1. Collect our own published content URLs from the sitemap.
    own_content = filter_content_urls(get_sitemap_urls("https://yoursite.com/sitemap.xml"))

    # 2. Gather topic signals from each competitor's content index.
    competitor_topics = []
    for comp_domain in ("competitor1.com", "competitor2.com", "competitor3.com"):
        comp_urls = get_competitor_content(comp_domain)
        competitor_topics.extend(extract_page_topics(comp_urls, comp_domain))

    # 3. Ask Claude to surface the gaps, then write the prioritized report.
    found_gaps = identify_content_gaps(
        your_content=own_content,
        competitor_content=competitor_topics,
        your_niche="AI tools for marketing operations",
        your_audience="Marketing managers and agency owners"
    )
    generate_gap_report(found_gaps)

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems