Systems Library / AI Model Setup / How to Use AI for Automated Summarization
AI Model Setup advanced

How to Use AI for Automated Summarization

Build systems that summarize long documents, meetings, and reports automatically.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Summarization is one of the highest-ROI AI applications in business operations. The average knowledge worker spends 28% of their week reading and processing information. Automated summarization cuts that by more than half for routine documents. I have set up pipelines that process meeting transcripts, legal documents, research reports, and news feeds, and deliver clean summaries before the team even sits down.

The mistake most people make is treating summarization as a single prompt. Long documents require a map-reduce approach: chunk the document, summarize each chunk, then summarize the summaries. This produces better results than feeding a 50-page document and hoping for the best.

What You Need Before Starting: Python 3.9 or newer, an OpenAI API key, and the two packages installed in Step 1.

Step 1: Install Dependencies

pip install openai pdfplumber

Step 2: Build Document Readers for Common Formats

Handle the document types you actually deal with.

import pdfplumber
from pathlib import Path

def read_pdf(path: str) -> str:
    """Extract text from every page of a PDF, joining pages with blank lines."""
    with pdfplumber.open(path) as pdf:
        # Pages with no extractable text yield None; substitute "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n\n".join(page_texts).strip()

def read_text_file(path: str) -> str:
    """Read a UTF-8 text file and strip leading/trailing whitespace."""
    content = Path(path).read_text(encoding="utf-8")
    return content.strip()

def read_document(source) -> str:
    """Return document text from a file path or from a raw-text string.

    Strings that do not name an existing path are treated as the document
    text itself. Supported file types: .pdf, .txt, .md, .vtt, .srt.

    Raises:
        ValueError: if the path has an unsupported extension.
    """
    # Raw text short-circuit: a string that isn't an existing file IS the document.
    if isinstance(source, str) and not Path(source).exists():
        return source

    path = Path(source)
    readers = {
        ".pdf": read_pdf,
        ".txt": read_text_file,
        ".md": read_text_file,
        ".vtt": read_text_file,
        ".srt": read_text_file,
    }
    reader = readers.get(path.suffix.lower())
    if reader is None:
        raise ValueError(f"Unsupported file type: {path.suffix}")
    return reader(str(path))

Step 3: Build the Chunk Summarizer

Summarize each chunk in sequence — passing the previous chunk's summary forward as context — before combining.

import os

import openai

# Never hard-code API keys in source. Read from the environment; the literal
# fallback keeps the snippet runnable as a placeholder, exactly as before.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

def chunk_document(text: str, chunk_size: int = 3000) -> list[str]:
    """Split *text* into chunks of at most ~chunk_size words on paragraph boundaries.

    A paragraph longer than chunk_size becomes its own oversized chunk rather
    than being split mid-paragraph.

    Args:
        text: Document text; paragraphs are separated by blank lines.
        chunk_size: Soft limit in words (not characters) per chunk.

    Returns:
        List of chunk strings. Empty or whitespace-only input returns []
        (fix: the old version returned [""] for empty text, so callers'
        `if not chunks` guards never fired).
    """
    chunks: list[str] = []
    current_chunk: list[str] = []
    current_length = 0

    for para in text.split("\n\n"):
        # Skip blank paragraphs so runs of blank lines don't inflate chunks.
        if not para.strip():
            continue
        para_length = len(para.split())
        if current_chunk and current_length + para_length > chunk_size:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = [para]
            current_length = para_length
        else:
            current_chunk.append(para)
            current_length += para_length

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    return chunks

def summarize_chunk(chunk: str, context: str = "", doc_type: str = "document") -> str:
    """Summarize one section of a document with gpt-4o-mini.

    Args:
        chunk: The section text to summarize.
        context: Optional summary of earlier sections, prepended to the prompt.
        doc_type: Label inserted into the system prompt (e.g. "report").

    Returns:
        The model's summary text for this chunk.
    """
    system = f"""Summarize this section of a {doc_type}. 
Be concise and factual. Preserve: key decisions, action items, numbers, names, dates, and conclusions.
Do not include filler or meta-commentary about the document."""

    if context:
        payload = f"Context from earlier in the document:\n{context}\n\n---\n\nCurrent section:\n{chunk}"
    else:
        payload = chunk

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": payload},
        ],
        temperature=0,
        max_tokens=500,
    )
    return completion.choices[0].message.content

Step 4: Build the Map-Reduce Summarizer

Chain chunk summaries into a final cohesive summary.

def summarize_document(
    source,
    doc_type: str = "document",
    output_format: str = "paragraphs",
    preserve_action_items: bool = True
) -> dict:
    """Summarize a document using a map-reduce pipeline.

    Args:
        source: File path (.pdf/.txt/.md/.vtt/.srt) or raw document text.
        doc_type: Label used in prompts (e.g. "contract", "meeting").
        output_format: "paragraphs", "bullets", "tldr", or "exec";
            unknown values fall back to "paragraphs".
        preserve_action_items: Also extract action items, decisions, deadlines.

    Returns:
        dict with "summary", "chunks_processed", "word_count", and (when
        non-empty) "method" — "direct" for single-chunk docs, "map-reduce"
        for longer ones.
    """
    text = read_document(source)
    chunks = chunk_document(text)

    if not chunks:
        return {"summary": "", "chunks_processed": 0, "word_count": 0}

    format_instructions = {
        "paragraphs": "Write 2-4 cohesive paragraphs.",
        "bullets": "Write as bullet points. Each bullet is one key point.",
        "tldr": "Write a single paragraph of 3-5 sentences.",
        "exec": "Write an executive summary with: Bottom Line, Key Points (3-5 bullets), and Recommended Actions."
    }.get(output_format, "Write 2-4 cohesive paragraphs.")

    action_note = "\n\nAlso extract all action items, decisions, and deadlines into a separate list." if preserve_action_items else ""

    # Single chunk: one direct call. Fix: the old direct path called
    # summarize_chunk and silently ignored output_format and
    # preserve_action_items; now both are honored here too.
    if len(chunks) == 1:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": f"You are summarizing a {doc_type}. {format_instructions}{action_note}"
                },
                {"role": "user", "content": chunks[0]}
            ],
            temperature=0.2,
            max_tokens=1000
        )
        return {
            "summary": response.choices[0].message.content,
            "chunks_processed": 1,
            "word_count": len(text.split()),
            "method": "direct"
        }

    # Map: summarize each chunk, passing the previous chunk's summary
    # forward as rolling context so later sections stay coherent.
    print(f"Summarizing {len(chunks)} chunks...")
    chunk_summaries = []
    context_window = ""

    for i, chunk in enumerate(chunks):
        chunk_summary = summarize_chunk(chunk, context=context_window, doc_type=doc_type)
        chunk_summaries.append(chunk_summary)
        context_window = chunk_summary
        print(f"  Chunk {i+1}/{len(chunks)} done.")

    # Reduce: synthesize the section summaries into one final summary.
    all_summaries = "\n\n".join(chunk_summaries)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"You are synthesizing summaries of sections from a {doc_type} into one final summary. {format_instructions}{action_note}"
            },
            {
                "role": "user",
                "content": f"Synthesize these section summaries:\n\n{all_summaries}"
            }
        ],
        temperature=0.2,
        max_tokens=1000
    )

    return {
        "summary": response.choices[0].message.content,
        "chunks_processed": len(chunks),
        "word_count": len(text.split()),
        "method": "map-reduce"
    }

Step 5: Build a Meeting Transcript Summarizer

Meeting transcripts have special structure. Extract the decisions, not just the discussion.

# System prompt used by summarize_meeting for the final summary pass.
# Defines the five labeled sections the model must produce, each as a
# header with bullet points.
MEETING_SUMMARY_TEMPLATE = """
You are summarizing a meeting transcript. Extract:

1. ATTENDEES: Who was present
2. DECISIONS: What was decided (each decision on its own line)
3. ACTION ITEMS: Who is doing what by when (Name: Task - Due date)
4. OPEN QUESTIONS: Issues discussed but not resolved
5. KEY CONTEXT: 2-3 sentences of background needed to understand the decisions

Format each section with the header and bullet points. Be specific. Include names and dates where mentioned."""

def summarize_meeting(transcript: str) -> dict:
    """Summarize a meeting transcript per MEETING_SUMMARY_TEMPLATE.

    Short transcripts (one chunk) are summarized in a single call. Long
    transcripts get a per-chunk extraction pass first, then a final
    compilation pass that deduplicates across chunks.

    Args:
        transcript: Full meeting transcript text.

    Returns:
        dict with "summary" (the formatted summary) and "chunks"
        (number of transcript chunks processed).
    """
    chunks = chunk_document(transcript, chunk_size=4000)

    if len(chunks) == 1:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": MEETING_SUMMARY_TEMPLATE},
                {"role": "user", "content": transcript}
            ],
            temperature=0.1
        )
        return {"summary": response.choices[0].message.content, "chunks": 1}

    # For long meetings, extract key data per chunk first.
    # Fix: the old extraction prompt asked only for decisions, action items,
    # and open questions, so the final template's ATTENDEES and KEY CONTEXT
    # sections could never be filled for long meetings; extract them too.
    chunk_data = []
    for chunk in chunks:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Extract all attendees, decisions, action items, open questions, and key background context from this transcript section. Be specific."},
                {"role": "user", "content": chunk}
            ],
            temperature=0
        )
        chunk_data.append(response.choices[0].message.content)

    combined = "\n\n---\n\n".join(chunk_data)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": MEETING_SUMMARY_TEMPLATE},
            {"role": "user", "content": f"Compile this into a final meeting summary, removing duplicates:\n\n{combined}"}
        ],
        temperature=0.1
    )

    return {"summary": response.choices[0].message.content, "chunks": len(chunks)}

Step 6: Schedule Automatic Summaries for Incoming Documents

Watch a folder and auto-summarize any new files.

import time
import sqlite3
from pathlib import Path
from datetime import datetime

# Folder polled for new documents to summarize; created on import if missing.
WATCH_FOLDER = Path("./incoming_docs")
WATCH_FOLDER.mkdir(exist_ok=True)
# Folder where generated summary .txt files are written; created if missing.
OUTPUT_FOLDER = Path("./summaries")
OUTPUT_FOLDER.mkdir(exist_ok=True)

def init_processed_log():
    """Create the processed-files log table in summaries.db if it is missing."""
    conn = sqlite3.connect("summaries.db")
    try:
        # PRIMARY KEY on filename lets mark_processed upsert per file.
        conn.execute("CREATE TABLE IF NOT EXISTS processed (filename TEXT PRIMARY KEY, processed_at TEXT)")
        conn.commit()
    finally:
        # Fix: close even when the DDL raises (old code leaked the connection).
        conn.close()

def is_processed(filename: str) -> bool:
    """Return True if *filename* is already recorded in the processed log.

    Requires the table created by init_processed_log; raises
    sqlite3.OperationalError if it does not exist.
    """
    conn = sqlite3.connect("summaries.db")
    try:
        row = conn.execute("SELECT 1 FROM processed WHERE filename = ?", (filename,)).fetchone()
    finally:
        # Fix: close even when the query raises (old code leaked the connection).
        conn.close()
    return row is not None

def mark_processed(filename: str):
    """Record *filename* as processed with the current timestamp (upsert)."""
    conn = sqlite3.connect("summaries.db")
    try:
        conn.execute("INSERT OR REPLACE INTO processed VALUES (?, ?)", (filename, datetime.now().isoformat()))
        conn.commit()
    finally:
        # Fix: close even when the insert raises (old code leaked the connection).
        conn.close()

def watch_and_summarize(poll_interval: int = 30):
    """Poll WATCH_FOLDER forever, summarizing each new .pdf/.txt/.md file once.

    Summaries are written to OUTPUT_FOLDER as <stem>_summary.txt and each
    handled file is recorded in the SQLite log. Runs until interrupted.

    Args:
        poll_interval: Seconds to sleep between folder scans.
    """
    init_processed_log()
    print(f"Watching {WATCH_FOLDER} for new documents...")

    supported = {".pdf", ".txt", ".md"}
    while True:
        fresh = (
            doc for doc in WATCH_FOLDER.glob("*")
            if doc.suffix.lower() in supported and not is_processed(doc.name)
        )
        for doc in fresh:
            print(f"New file detected: {doc.name}")
            try:
                result = summarize_document(str(doc))
                target = OUTPUT_FOLDER / f"{doc.stem}_summary.txt"
                target.write_text(result["summary"], encoding="utf-8")
                mark_processed(doc.name)
                print(f"Summary saved: {target.name}")
            except Exception as e:
                # Keep the watcher alive; a bad file shouldn't stop the loop.
                print(f"Error processing {doc.name}: {e}")

        time.sleep(poll_interval)

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems