How to Use AI for Automated Summarization
Build systems that summarize long documents, meetings, and reports automatically.
Jay Banlasan
The AI Systems Guy
Summarization is one of the highest-ROI AI applications in business operations. The average knowledge worker spends 28% of their week reading and processing information. Automated summarization cuts that by more than half for routine documents. I have set up pipelines that process meeting transcripts, legal documents, research reports, and news feeds, and deliver clean summaries before the team even sits down.
The mistake most people make is treating summarization as a single prompt. Long documents require a map-reduce approach: chunk the document, summarize each chunk, then summarize the summaries. This produces better results than feeding a 50-page document and hoping for the best.
What You Need Before Starting
- Python 3.9+
- OpenAI API key
- pdfplumber for PDF documents
- Source documents to test against
Step 1: Install Dependencies
pip install openai pdfplumber
Step 2: Build Document Readers for Common Formats
Handle the document types you actually deal with.
import pdfplumber
from pathlib import Path
def read_pdf(path: str) -> str:
    """Extract text from every page of a PDF, with pages separated by blank lines.

    Pages with no extractable text contribute an empty string rather than None.
    """
    pages: list[str] = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            pages.append(page.extract_text() or "")
    return "\n\n".join(pages).strip()
def read_text_file(path: str) -> str:
    """Read a UTF-8 text file and return its content with surrounding whitespace stripped."""
    content = Path(path).read_text(encoding="utf-8")
    return content.strip()
def read_document(source) -> str:
    """Return plain text for *source*.

    *source* may be raw text (returned as-is), or a path to a .pdf or a
    plain-text file (.txt, .md, .vtt, .srt).

    Raises:
        ValueError: if *source* is a path with an unsupported extension.
    """
    if isinstance(source, str):
        # Path.exists() can itself raise for strings that are not valid
        # paths (e.g. OSError ENAMETOOLONG for long raw text, ValueError
        # for embedded NULs) — treat those as literal text, not paths.
        try:
            is_existing_path = Path(source).exists()
        except (OSError, ValueError):
            is_existing_path = False
        if not is_existing_path:
            return source
    path = Path(source)
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        return read_pdf(str(path))
    if suffix in {".txt", ".md", ".vtt", ".srt"}:
        return read_text_file(str(path))
    raise ValueError(f"Unsupported file type: {path.suffix}")
Step 3: Build the Chunk Summarizer
Summarize each chunk independently before combining.
import openai
# NOTE(review): hard-coded API key placeholder — in real deployments read the
# key from the OPENAI_API_KEY environment variable instead of committing it.
client = openai.OpenAI(api_key="YOUR_API_KEY")
def chunk_document(text: str, chunk_size: int = 3000) -> list[str]:
    """Split *text* into chunks of at most ~chunk_size words.

    Splits only on paragraph (blank-line) boundaries, so a single paragraph
    longer than chunk_size still becomes one chunk.
    """
    chunks: list[str] = []
    buffer: list[str] = []
    buffered_words = 0
    for paragraph in text.split("\n\n"):
        word_count = len(paragraph.split())
        if buffer and buffered_words + word_count > chunk_size:
            # Adding this paragraph would overflow the chunk: flush and restart.
            chunks.append("\n\n".join(buffer))
            buffer = [paragraph]
            buffered_words = word_count
        else:
            buffer.append(paragraph)
            buffered_words += word_count
    if buffer:
        chunks.append("\n\n".join(buffer))
    return chunks
def summarize_chunk(chunk: str, context: str = "", doc_type: str = "document") -> str:
    """Summarize one chunk of a document with gpt-4o-mini.

    If *context* is given (typically the summary of the previous chunk),
    it is prepended to the user message so the model keeps continuity.
    """
    system = f"""Summarize this section of a {doc_type}.
Be concise and factual. Preserve: key decisions, action items, numbers, names, dates, and conclusions.
Do not include filler or meta-commentary about the document."""
    if context:
        payload = f"Context from earlier in the document:\n{context}\n\n---\n\nCurrent section:\n{chunk}"
    else:
        payload = chunk
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": payload},
        ],
        temperature=0,
        max_tokens=500,
    )
    return response.choices[0].message.content
Step 4: Build the Map-Reduce Summarizer
Chain chunk summaries into a final cohesive summary.
def summarize_document(
    source,
    doc_type: str = "document",
    output_format: str = "paragraphs",
    preserve_action_items: bool = True
) -> dict:
    """Map-reduce summarizer for arbitrary documents.

    Reads *source* (raw text or a file path), chunks it on paragraph
    boundaries, summarizes each chunk, then synthesizes the chunk summaries
    into one final summary.

    Returns a dict with keys: summary, chunks_processed, word_count, and
    (when there was any text) method ("direct" or "map-reduce").
    """
    text = read_document(source)
    chunks = chunk_document(text)
    if not chunks:
        return {"summary": "", "chunks_processed": 0, "word_count": 0}
    total_words = len(text.split())
    # Short document: a single direct pass, no reduction needed.
    if len(chunks) == 1:
        return {
            "summary": summarize_chunk(chunks[0], doc_type=doc_type),
            "chunks_processed": 1,
            "word_count": total_words,
            "method": "direct"
        }
    # Map phase: summarize each chunk, feeding the previous chunk's
    # summary forward as context.
    print(f"Summarizing {len(chunks)} chunks...")
    section_summaries: list[str] = []
    rolling_context = ""
    for index, section in enumerate(chunks, start=1):
        section_summary = summarize_chunk(section, context=rolling_context, doc_type=doc_type)
        section_summaries.append(section_summary)
        rolling_context = section_summary
        print(f" Chunk {index}/{len(chunks)} done.")
    # Reduce phase: synthesize the section summaries into one output.
    joined_summaries = "\n\n".join(section_summaries)
    format_map = {
        "paragraphs": "Write 2-4 cohesive paragraphs.",
        "bullets": "Write as bullet points. Each bullet is one key point.",
        "tldr": "Write a single paragraph of 3-5 sentences.",
        "exec": "Write an executive summary with: Bottom Line, Key Points (3-5 bullets), and Recommended Actions."
    }
    format_instructions = format_map.get(output_format, "Write 2-4 cohesive paragraphs.")
    if preserve_action_items:
        action_note = "\n\nAlso extract all action items, decisions, and deadlines into a separate list."
    else:
        action_note = ""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"You are synthesizing summaries of sections from a {doc_type} into one final summary. {format_instructions}{action_note}"
            },
            {
                "role": "user",
                "content": f"Synthesize these section summaries:\n\n{joined_summaries}"
            }
        ],
        temperature=0.2,
        max_tokens=1000
    )
    return {
        "summary": response.choices[0].message.content,
        "chunks_processed": len(chunks),
        "word_count": total_words,
        "method": "map-reduce"
    }
Step 5: Build a Meeting Transcript Summarizer
Meeting transcripts have special structure. Extract the decisions, not just the discussion.
# System prompt for meeting summaries: forces a fixed five-section structure
# (attendees / decisions / action items / open questions / context) so the
# output is scannable and action items are never buried in narrative.
MEETING_SUMMARY_TEMPLATE = """
You are summarizing a meeting transcript. Extract:
1. ATTENDEES: Who was present
2. DECISIONS: What was decided (each decision on its own line)
3. ACTION ITEMS: Who is doing what by when (Name: Task - Due date)
4. OPEN QUESTIONS: Issues discussed but not resolved
5. KEY CONTEXT: 2-3 sentences of background needed to understand the decisions
Format each section with the header and bullet points. Be specific. Include names and dates where mentioned."""
def summarize_meeting(transcript: str) -> dict:
    """Summarize a meeting transcript into the MEETING_SUMMARY_TEMPLATE structure.

    Short transcripts get one structured pass; long ones are processed
    per-chunk (extracting decisions/actions/questions) and then compiled.
    Returns a dict with keys: summary, chunks.
    """
    chunks = chunk_document(transcript, chunk_size=4000)
    if len(chunks) == 1:
        # Fits in one request: structured summary in a single pass.
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": MEETING_SUMMARY_TEMPLATE},
                {"role": "user", "content": transcript},
            ],
            temperature=0.1,
        )
        return {"summary": response.choices[0].message.content, "chunks": 1}
    # Long meeting: pull the key facts out of each section first (cheap model),
    # then compile them into the final structured summary (strong model).
    extracted_sections: list[str] = []
    for section in chunks:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Extract all decisions, action items, and open questions from this transcript section. Be specific."},
                {"role": "user", "content": section},
            ],
            temperature=0,
        )
        extracted_sections.append(response.choices[0].message.content)
    combined = "\n\n---\n\n".join(extracted_sections)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": MEETING_SUMMARY_TEMPLATE},
            {"role": "user", "content": f"Compile this into a final meeting summary, removing duplicates:\n\n{combined}"},
        ],
        temperature=0.1,
    )
    return {"summary": response.choices[0].message.content, "chunks": len(chunks)}
Step 6: Schedule Automatic Summaries for Incoming Documents
Watch a folder and auto-summarize any new files.
import time
import sqlite3
from pathlib import Path
from datetime import datetime
# Folder polled for new documents; created at import time if missing.
WATCH_FOLDER = Path("./incoming_docs")
WATCH_FOLDER.mkdir(exist_ok=True)
# Destination for generated "<stem>_summary.txt" files.
OUTPUT_FOLDER = Path("./summaries")
OUTPUT_FOLDER.mkdir(exist_ok=True)
def init_processed_log(db_path: str = "summaries.db") -> None:
    """Create the processed-files log table if it does not already exist.

    Args:
        db_path: SQLite database file (defaults to the tutorial's summaries.db).
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("CREATE TABLE IF NOT EXISTS processed (filename TEXT PRIMARY KEY, processed_at TEXT)")
        conn.commit()
    finally:
        # Close even if the DDL fails — the original leaked the connection on error.
        conn.close()
def is_processed(filename: str, db_path: str = "summaries.db") -> bool:
    """Return True if *filename* has already been logged as processed.

    Args:
        filename: bare file name used as the primary key.
        db_path: SQLite database file (defaults to the tutorial's summaries.db).
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute("SELECT 1 FROM processed WHERE filename = ?", (filename,)).fetchone()
    finally:
        # Close even if the query fails — the original leaked the connection on error.
        conn.close()
    return row is not None
def mark_processed(filename: str, db_path: str = "summaries.db") -> None:
    """Record *filename* as processed with the current timestamp.

    Uses INSERT OR REPLACE so re-marking a file just refreshes its timestamp.

    Args:
        filename: bare file name used as the primary key.
        db_path: SQLite database file (defaults to the tutorial's summaries.db).
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("INSERT OR REPLACE INTO processed VALUES (?, ?)", (filename, datetime.now().isoformat()))
        conn.commit()
    finally:
        # Close even if the insert fails — the original leaked the connection on error.
        conn.close()
def watch_and_summarize(poll_interval: int = 30):
    """Poll WATCH_FOLDER forever and summarize each new supported file once.

    Summaries are written to OUTPUT_FOLDER as "<stem>_summary.txt"; processed
    file names are logged in SQLite so files are never summarized twice.
    Runs until interrupted.
    """
    init_processed_log()
    print(f"Watching {WATCH_FOLDER} for new documents...")
    supported = {".pdf", ".txt", ".md"}
    while True:
        for candidate in WATCH_FOLDER.glob("*"):
            if candidate.suffix.lower() not in supported:
                continue
            if is_processed(candidate.name):
                continue
            print(f"New file detected: {candidate.name}")
            try:
                result = summarize_document(str(candidate))
                target = OUTPUT_FOLDER / f"{candidate.stem}_summary.txt"
                target.write_text(result["summary"], encoding="utf-8")
                mark_processed(candidate.name)
                print(f"Summary saved: {target.name}")
            except Exception as e:
                # Keep the watcher alive — one bad file must not stop the loop.
                print(f"Error processing {candidate.name}: {e}")
        time.sleep(poll_interval)
What to Build Next
- Add a summary digest that emails you every morning with one-paragraph summaries of all documents processed in the last 24 hours
- Build a comparative summarizer that reads two versions of a document and outputs only what changed between them
- Connect the meeting summarizer to your task management system so action items are automatically created as tasks when a meeting transcript is processed
Related Reading
- How to Write System Prompts That Control AI Behavior - the summarization prompt defines what gets preserved and what gets cut
- How to Build AI Guardrails for Safe Outputs - ensure summarized content does not introduce inaccuracies or hallucinated details
- How to Use AI for Automated Data Extraction - pair extraction with summarization to both structure and condense document content
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment