How to Implement Smart Context Window Management
Maximize AI output quality by intelligently managing context window limits.
Jay Banlasan
The AI Systems Guy
I was processing legal contracts with an AI summarizer and hitting context limit errors every time a document exceeded 8,000 words. My first fix was dumb: truncate the document. The summaries were missing the critical clauses that appeared in the last third. Proper context window management changed that. The system now processes 50-page documents, keeps the most important content in context, and produces summaries that don't miss anything material.
Context window management is not just about fitting text inside a limit. It's about deciding which text matters most when you can't fit everything. That decision logic is what separates a system that works from one that loses information at the worst moments.
What You Need Before Starting
- Python 3.10+
- anthropic SDK (pip install anthropic)
- tiktoken for token counting (pip install tiktoken)
- Documents or conversation histories you need to process
Step 1: Build Accurate Token Counters
Token counts vary by model. Estimate first, then confirm with the SDK's built-in counting.
import tiktoken
import anthropic
# For OpenAI models
def count_tokens_openai(text: str, model: str = "gpt-4o") -> int:
    """Count tokens in *text* using the tiktoken encoding for *model*."""
    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(text)
    return len(token_ids)
# For Anthropic models (use their count_tokens API for accuracy)
_client = anthropic.Anthropic()
def count_tokens_anthropic(messages: list[dict], model: str = "claude-haiku-3") -> int:
    """Return the exact input-token count via Anthropic's count_tokens API."""
    counted = _client.messages.count_tokens(model=model, messages=messages)
    return counted.input_tokens
# Quick estimate (1 token ≈ 4 chars — close enough for planning)
def estimate_tokens(text: str) -> int:
    """Cheap planning estimate: roughly one token per four characters."""
    char_count = len(text)
    return char_count // 4
# Published context-window sizes per model (prompt + completion tokens).
MODEL_CONTEXT_LIMITS = {
    "claude-haiku-3": 200_000,
    "claude-opus-4-5": 200_000,
    "gpt-4o": 128_000,
    "gpt-4o-mini": 128_000,
    "gemini-1.5-pro": 1_000_000,
}


def safe_limit(model: str, reserve_for_output: int = 4096) -> int:
    """Usable input budget for *model*: context limit minus output headroom.

    Unknown models fall back to a conservative 8192-token window.
    """
    window = MODEL_CONTEXT_LIMITS.get(model, 8192)
    return window - reserve_for_output
Always reserve tokens for the output. A 200K context model still needs headroom to write its response.
Step 2: Build a Chunking Strategy
For documents that exceed context limits, chunk them intelligently. Sentence boundaries beat character counts.
import re
def chunk_by_sentences(text: str, max_tokens: int = 4000,
                       overlap_sentences: int = 2) -> list[str]:
    """Split text into ~max_tokens chunks on sentence boundaries.

    The last ``overlap_sentences`` sentences of each chunk are repeated at
    the start of the next chunk so context carries across chunk borders.

    Args:
        text: Source text to split.
        max_tokens: Approximate token budget per chunk (uses the same
            1 token ~= 4 chars heuristic as estimate_tokens()).
        overlap_sentences: Number of trailing sentences to carry forward;
            0 means no overlap.

    Returns:
        List of chunk strings; [] for empty or whitespace-only input.

    Fixes over the original:
      - Empty/whitespace input previously returned [""] instead of [].
      - overlap_sentences=0 previously kept the ENTIRE chunk as overlap
        (lst[-0:] is the whole list), so chunks grew without bound.
    """
    stripped = text.strip()
    if not stripped:
        return []

    def _tokens(s: str) -> int:
        # Same ~4 chars/token heuristic as estimate_tokens().
        return len(s) // 4

    sentences = re.split(r'(?<=[.!?])\s+', stripped)
    chunks: list[str] = []
    current_chunk: list[str] = []
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = _tokens(sentence)
        if current_tokens + sentence_tokens > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            # Keep the last N sentences as overlap for continuity.
            # Guard N=0: [-0:] would slice the whole list, not none of it.
            current_chunk = current_chunk[-overlap_sentences:] if overlap_sentences > 0 else []
            current_tokens = sum(_tokens(s) for s in current_chunk)
        current_chunk.append(sentence)
        current_tokens += sentence_tokens
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def chunk_by_paragraphs(text: str, max_tokens: int = 6000) -> list[str]:
    """Greedily pack blank-line-separated paragraphs into ~max_tokens chunks.

    A single paragraph larger than the budget still becomes its own
    (oversized) chunk; paragraphs are never split internally.
    """
    paragraphs = [part.strip() for part in text.split("\n\n") if part.strip()]
    chunks: list[str] = []
    bucket: list[str] = []
    bucket_tokens = 0
    for paragraph in paragraphs:
        paragraph_tokens = len(paragraph) // 4  # same heuristic as estimate_tokens()
        if bucket and bucket_tokens + paragraph_tokens > max_tokens:
            chunks.append("\n\n".join(bucket))
            bucket, bucket_tokens = [], 0
        bucket.append(paragraph)
        bucket_tokens += paragraph_tokens
    if bucket:
        chunks.append("\n\n".join(bucket))
    return chunks
Step 3: Build a Map-Reduce Processor for Long Documents
Map: summarize each chunk independently. Reduce: combine summaries into a final output.
def summarize_chunk(chunk: str, context: str = "", model: str = "claude-haiku-3") -> str:
    """Summarize one document section, optionally primed with prior context."""
    system = (
        "You are a precise document analyst. Extract the key facts, "
        "decisions, and obligations. Be dense and specific."
    )
    # Only prepend the context line when there is actual carried context.
    context_line = f"Previous context: {context}" if context else ""
    prompt = f"{context_line}\n\nDocument section:\n{chunk}\n\nKey points from this section:"
    reply = _client.messages.create(
        model=model,
        max_tokens=512,
        system=system,
        messages=[{"role": "user", "content": prompt}],
    )
    return reply.content[0].text
def process_long_document(document: str, final_task: str,
                          model: str = "claude-haiku-3") -> str:
    """Map-reduce a document too large for one context window.

    Map: summarize each paragraph chunk, carrying the previous summary
    forward as context for continuity. Reduce: merge all partial summaries
    into one final response for *final_task*.
    """
    sections = chunk_by_paragraphs(document, max_tokens=6000)
    print(f"Split into {len(sections)} chunks")

    # Map phase.
    partials: list[str] = []
    carry = ""
    for index, section in enumerate(sections):
        print(f"Processing chunk {index + 1}/{len(sections)}...")
        partial = summarize_chunk(section, context=carry, model=model)
        partials.append(partial)
        carry = partial  # thread continuity into the next chunk

    # Reduce phase.
    combined = "\n\n---\n\n".join(partials)
    reduce_prompt = f"""Here are summaries of each section of a document:
{combined}
Task: {final_task}
Produce a complete, accurate response drawing from all sections above."""
    reply = _client.messages.create(
        model=model,
        max_tokens=2048,
        messages=[{"role": "user", "content": reduce_prompt}],
    )
    return reply.content[0].text
Step 4: Implement Conversation History Trimming
For chatbots and agents, conversation history grows and eventually breaks the context limit. Trim intelligently.
from typing import Literal
def trim_conversation_history(
    messages: list[dict],
    max_tokens: int,
    strategy: Literal["sliding_window", "compress_middle", "keep_first_last"] = "sliding_window"
) -> list[dict]:
    """Trim chat history so it fits a token budget.

    Strategies:
      - "sliding_window": keep the most recent messages that fit, always
        preserving system messages.
      - "compress_middle": keep the first 2 and last 4 non-system messages
        and collapse everything in between into one placeholder message
        carrying truncated snippets of the omitted turns.
      - "keep_first_last": keep the first 2 exchanges (4 messages) and the
        last 3 (6 messages) with an explicit trim marker between them.

    Bug fix: "compress_middle" was declared in the Literal but previously
    fell through to the no-trim fallback, silently returning an untrimmed
    history that could still exceed the context limit.

    Assumes every message's "content" is a plain string (not content blocks).
    """
    def _est(text: str) -> int:
        # Same ~4 chars/token heuristic as estimate_tokens().
        return len(text) // 4

    system = [m for m in messages if m["role"] == "system"]
    conversation = [m for m in messages if m["role"] != "system"]

    if strategy == "sliding_window":
        trimmed: list[dict] = []
        total = sum(_est(m["content"]) for m in system)
        # Walk backwards from the newest message until the budget is spent.
        for msg in reversed(conversation):
            msg_tokens = _est(msg["content"])
            if total + msg_tokens > max_tokens:
                break
            trimmed.insert(0, msg)
            total += msg_tokens
        return system + trimmed

    if strategy == "compress_middle":
        if len(conversation) <= 6:
            return messages  # nothing worth compressing
        middle = conversation[2:-4]
        # 80-char snippets keep the placeholder bounded; a real system would
        # replace this with an LLM-generated summary of the middle turns.
        snippets = " | ".join(m["content"][:80] for m in middle)
        placeholder = {
            "role": "user",
            "content": f"[{len(middle)} earlier messages compressed: {snippets}]",
        }
        return system + conversation[:2] + [placeholder] + conversation[-4:]

    if strategy == "keep_first_last":
        # Keep first 2 exchanges (sets context) and last 3 (recent context).
        if len(conversation) <= 10:
            return messages
        marker = {"role": "user", "content": "[Earlier conversation trimmed]"}
        return system + conversation[:4] + [marker] + conversation[-6:]

    return messages  # unknown strategy: no trim (backward-compatible fallback)
Step 5: Build an Adaptive Chunker that Respects Document Structure
For structured documents (reports, contracts, transcripts), preserve section boundaries.
def smart_chunk_structured(text: str, max_tokens: int = 8000) -> list[dict]:
    """Chunk a structured document on section headings, with metadata.

    Detects Markdown (#/##/###) headings or ALL-CAPS lines as section
    boundaries. Each returned dict has "section" (the heading) and
    "content" keys. Oversized sections are sub-chunked by paragraph;
    documents with no detectable structure fall back to plain paragraph
    chunking under the section name "Document".

    Bug fix: any text appearing BEFORE the first detected heading was
    previously dropped entirely — silent content loss. It is now emitted
    as a leading {"section": "Preamble"} chunk.
    """
    def _est(s: str) -> int:
        # Same ~4 chars/token heuristic as estimate_tokens().
        return len(s) // 4

    # Markdown-style headers or ALL-CAPS lines (at least 6 chars).
    section_pattern = re.compile(r'^(#{1,3} .+|[A-Z][A-Z\s]{5,}:?)$', re.MULTILINE)
    splits = [(m.start(), m.group()) for m in section_pattern.finditer(text)]
    if not splits:
        # No structure found — fall back to paragraph chunking.
        return [{"section": "Document", "content": c}
                for c in chunk_by_paragraphs(text, max_tokens)]

    chunks: list[dict] = []

    # Emit content that precedes the first heading instead of losing it.
    preamble = text[:splits[0][0]].strip()
    if preamble:
        chunks.append({"section": "Preamble", "content": preamble})

    for i, (start, heading) in enumerate(splits):
        end = splits[i + 1][0] if i + 1 < len(splits) else len(text)
        section_text = text[start:end].strip()
        if _est(section_text) > max_tokens:
            # Section is too long — sub-chunk it by paragraph.
            for j, sub in enumerate(chunk_by_paragraphs(section_text, max_tokens)):
                chunks.append({"section": f"{heading} (part {j+1})", "content": sub})
        else:
            chunks.append({"section": heading, "content": section_text})
    return chunks
Step 6: Choose the Right Strategy per Task
def process_document(document: str, task: str, model: str = "claude-haiku-3") -> str:
    """Route a document to the cheapest strategy that fits its size."""
    doc_tokens = estimate_tokens(document)
    budget = safe_limit(model)

    if doc_tokens <= budget:
        # Fits whole — send as-is.
        reply = _client.messages.create(
            model=model, max_tokens=2048,
            messages=[{"role": "user", "content": f"{task}\n\n{document}"}],
        )
        return reply.content[0].text

    if doc_tokens <= budget * 3:
        # Moderately long — map-reduce directly.
        return process_long_document(document, task, model)

    # Very long — structured chunking first, then map-reduce over summaries.
    sections = smart_chunk_structured(document, max_tokens=8000)
    section_summaries = [summarize_chunk(s["content"], model=model) for s in sections]
    return process_long_document("\n\n".join(section_summaries), task, model)
What to Build Next
- Add semantic importance scoring to prioritize which chunks contain the most critical information when trimming is necessary
- Build a conversation memory system that compresses old exchanges into dense summaries before trimming
- Implement streaming responses to start displaying output before full processing completes on long documents
Related Reading
- How to Build a Multi-Model AI Router - route long documents to high-context models automatically
- How to Build Automatic Model Failover Systems - fall back to a larger context window model when the primary can't fit the input
- How to Build AI Request Throttling Systems - long documents generate many tokens; throttle to stay within rate limits
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment