How to Use AI for Automated Data Extraction
Extract structured data from unstructured documents using AI parsing.
Jay Banlasan
The AI Systems Guy
Manual data entry from PDFs, emails, and contracts is one of the most expensive time sinks in any operations team. I replaced a 3-hour-per-day manual extraction task for a client with an AI pipeline that runs in under 2 minutes. The system reads incoming documents, pulls structured fields, validates the data, and pushes it directly to a spreadsheet. Nobody touches a keyboard.
The approach works on any unstructured document: invoices, contracts, intake forms, support emails, meeting transcripts, or research reports. If a human can read it and pull data from it, an AI can do it faster.
What You Need Before Starting
- Python 3.9+
- OpenAI API key
- pydantic, plus pypdf2 or pdfplumber for PDF reading
- Sample documents to test against (5-10 minimum before trusting in production)
Step 1: Install Dependencies
pip install openai pydantic pdfplumber openpyxl
pdfplumber handles most PDF types better than PyPDF2, including tables and columns.
Step 2: Build a Document Reader
Normalize inputs to plain text before sending to the AI. Handle PDF, plain text, and raw string inputs.
import pdfplumber
from pathlib import Path
def read_document(source) -> str:
"""Accept file path or raw string. Return clean text."""
if isinstance(source, str) and not Path(source).exists():
return source # Already text
path = Path(source)
if path.suffix.lower() == ".pdf":
with pdfplumber.open(path) as pdf:
pages = [page.extract_text() or "" for page in pdf.pages]
return "\n\n".join(pages).strip()
if path.suffix.lower() in {".txt", ".md"}:
return path.read_text(encoding="utf-8").strip()
raise ValueError(f"Unsupported file type: {path.suffix}")
Step 3: Define Extraction Schemas
Use Pydantic to define exactly what fields you want. Different document types need different schemas.
from pydantic import BaseModel
from typing import Optional, List
from datetime import date
class InvoiceData(BaseModel):
    """Structured fields extracted from an invoice.

    Optional fields carry an explicit ``= None`` default: in Pydantic v2
    an ``Optional[...]`` annotation alone is still a *required* field, so
    validation would fail whenever the model omits a key, and the
    confidence scorer would over-count required fields.
    """

    invoice_number: str
    vendor_name: str
    vendor_email: Optional[str] = None
    invoice_date: str
    due_date: Optional[str] = None
    line_items: List[str]
    subtotal: Optional[float] = None
    tax: Optional[float] = None
    total_amount: float
    currency: str
    payment_terms: Optional[str] = None
class ContractParty(BaseModel):
    """One party to a contract."""

    name: str
    role: str  # "client", "vendor", "contractor", etc.
    # Explicit None defaults: Pydantic v2 treats Optional-without-default
    # as a required field.
    email: Optional[str] = None
    address: Optional[str] = None
class ContractData(BaseModel):
    """Structured fields extracted from a contract.

    Optional fields default to None (Pydantic v2 requires an explicit
    default for a field to actually be optional).
    """

    contract_type: str
    effective_date: str
    expiration_date: Optional[str] = None
    parties: List[ContractParty]
    key_obligations: List[str]
    payment_amount: Optional[float] = None
    payment_frequency: Optional[str] = None
    termination_clause: Optional[str] = None
    governing_law: Optional[str] = None
class SupportEmailData(BaseModel):
    """Structured fields extracted from a customer support email.

    Optional fields default to None (Pydantic v2 requires an explicit
    default for a field to actually be optional).
    """

    customer_name: Optional[str] = None
    customer_email: Optional[str] = None
    issue_category: str  # "billing", "technical", "general", "refund"
    urgency: str  # "low", "medium", "high", "critical"
    issue_summary: str
    product_mentioned: Optional[str] = None
    order_number: Optional[str] = None
    requested_action: str
Step 4: Build the Extraction Engine
Send document text with schema instructions to the model and validate the response.
import openai
import json
from pydantic import BaseModel, ValidationError
from typing import Type, TypeVar
# Generic type variable so extract_from_document can return the caller's
# concrete schema class rather than a bare BaseModel.
T = TypeVar("T", bound=BaseModel)
# NOTE(review): avoid hardcoding the key in source — load it from an
# environment variable (e.g. os.environ["OPENAI_API_KEY"]) before shipping.
client = openai.OpenAI(api_key="YOUR_API_KEY")
def extract_from_document(
    document_text: str,
    schema: Type[T],
    doc_type: str = "document",
    model: str = "gpt-4o"
) -> T:
    """Send document text to the model and validate the reply against *schema*.

    Args:
        document_text: Plain text of the document (see read_document).
        schema: Pydantic model class describing the fields to extract.
        doc_type: Human-readable label used in the prompt and error messages.
        model: Chat-completions model name.

    Returns:
        A validated instance of *schema*.

    Raises:
        ValueError: when the model reply is not valid JSON or fails schema
            validation; chained to the underlying error.
    """
    schema_json = json.dumps(schema.model_json_schema(), indent=2)
    system_prompt = f"""You are a precise data extraction engine.
Extract information from the provided {doc_type} and return ONLY a valid JSON object.
Match the schema exactly. Use null for missing optional fields.
Do not include markdown, explanations, or extra text.
Required JSON schema:
{schema_json}"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": document_text}
        ],
        temperature=0,  # deterministic extraction
        response_format={"type": "json_object"}
    )
    # content can be None (e.g. a refusal); fall back to "" so raw[:500]
    # in the error path cannot itself raise a TypeError.
    raw = response.choices[0].message.content or ""
    try:
        data = json.loads(raw)
        # model_validate (vs schema(**data)) raises ValidationError even
        # when the payload is not a dict (e.g. a bare JSON list), so the
        # except clause below always catches malformed replies.
        return schema.model_validate(data)
    except (json.JSONDecodeError, ValidationError) as e:
        # Chain the original error so the root cause stays in the traceback.
        raise ValueError(
            f"Extraction failed for {doc_type}: {e}\nRaw output: {raw[:500]}"
        ) from e
Step 5: Add Confidence Scoring
Not all extractions are equally reliable. Add a confidence check to flag low-quality results for human review.
from pydantic import BaseModel as PydanticBase
from typing import Generic
def score_extraction_confidence(extracted: "BaseModel", doc_text: str) -> float:
    """Rate 0.0-1.0 how confident we are in the extraction.

    A pure completeness heuristic: 70% weight on the fraction of required
    fields that are non-None, 30% on the fraction of all fields filled.

    Args:
        extracted: The validated Pydantic model instance.
        doc_text: Original document text — currently unused; kept in the
            signature for future content-based checks.

    Returns:
        Confidence score rounded to two decimals.
    """
    # Read model_fields from the class: instance access is deprecated
    # in Pydantic >= 2.11.
    fields = type(extracted).model_fields
    required = [name for name, info in fields.items() if info.is_required()]
    filled_required = sum(
        1 for name in required if getattr(extracted, name) is not None
    )
    filled_total = sum(
        1 for name in fields if getattr(extracted, name) is not None
    )
    required_score = filled_required / len(required) if required else 1.0
    optional_score = filled_total / len(fields) if fields else 1.0
    return round((required_score * 0.7) + (optional_score * 0.3), 2)
def extract_with_confidence(source, schema: Type[T], doc_type: str) -> dict:
    """Read one document, extract structured data, and score the result.

    Returns a dict with the validated model ("data"), a 0-1 confidence
    score, a needs_review flag (set when confidence drops below 0.8),
    and the length of the normalized document text.
    """
    document_text = read_document(source)
    parsed = extract_from_document(document_text, schema, doc_type)
    score = score_extraction_confidence(parsed, document_text)
    return {
        "data": parsed,
        "confidence": score,
        "needs_review": score < 0.8,
        "document_length": len(document_text),
    }
Step 6: Process a Batch of Documents
Run extraction across a folder of documents and export results.
import openpyxl
from pathlib import Path
from datetime import datetime
def process_document_batch(
    folder_path: str,
    schema: Type[T],
    doc_type: str,
    output_path: str = "extracted_data.xlsx"
) -> dict:
    """Extract structured data from every supported document in a folder.

    Globs *.pdf, *.txt, and *.md files (all the types read_document
    supports), extracts each via extract_with_confidence, and writes the
    successful rows to an Excel workbook at *output_path*.

    Returns:
        Summary dict: counts of processed files and errors, how many rows
        need human review, and per-file error details.
    """
    folder = Path(folder_path)
    # Include *.md (read_document supports it) and sort so processing
    # order is deterministic across platforms and filesystems.
    files = sorted(
        list(folder.glob("*.pdf"))
        + list(folder.glob("*.txt"))
        + list(folder.glob("*.md"))
    )
    results = []
    errors = []
    for file in files:
        print(f"Processing {file.name}...")
        try:
            result = extract_with_confidence(str(file), schema, doc_type)
            results.append({
                "file": file.name,
                "confidence": result["confidence"],
                "needs_review": result["needs_review"],
                **result["data"].model_dump()
            })
        except Exception as e:
            # Deliberate best-effort batch: record the failure and keep going
            # so one bad document never aborts the whole run.
            errors.append({"file": file.name, "error": str(e)})
            print(f" ERROR: {e}")
    # Write successful extractions to Excel.
    if results:
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = "Extracted Data"
        # All rows share a schema, so the first row's keys are the headers.
        headers = list(results[0].keys())
        ws.append(headers)
        for row in results:
            ws.append([str(row.get(h, "")) for h in headers])
        wb.save(output_path)
        print(f"Saved {len(results)} records to {output_path}")
    return {
        "processed": len(results),
        "errors": len(errors),
        "needs_review": sum(1 for r in results if r["needs_review"]),
        "error_details": errors
    }
# Run it
# Example invocation: extract every invoice in ./invoices with the
# InvoiceData schema and write the results to extracted_data.xlsx.
# NOTE(review): consider wrapping this in `if __name__ == "__main__":`
# so importing the module doesn't immediately trigger a batch run.
summary = process_document_batch(
    folder_path="./invoices",
    schema=InvoiceData,
    doc_type="invoice"
)
print(summary)
What to Build Next
- Build a document classifier that detects document type automatically before routing to the correct schema, so you can drop any document into a single intake pipeline
- Add a human review UI that presents low-confidence extractions for correction, then feeds the corrections back to improve your prompts
- Connect the pipeline to a webhook so documents emailed to a specific address get extracted and pushed to your CRM within minutes of arrival
Related Reading
- How to Write System Prompts That Control AI Behavior - schema compliance depends heavily on system prompt design
- How to Build AI Guardrails for Safe Outputs - validation layers protect downstream systems from bad extractions
- How to Build Persona-Based AI Assistants - combine extraction with a persona for document intake chatbots
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment