Systems Library / AI Model Setup / How to Use AI for Automated Data Extraction
AI Model Setup — Advanced

How to Use AI for Automated Data Extraction

Extract structured data from unstructured documents using AI parsing.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Manual data entry from PDFs, emails, and contracts is one of the most expensive time sinks in any operations team. I replaced a 3-hour-per-day manual extraction task for a client with an AI pipeline that runs in under 2 minutes. The system reads incoming documents, pulls structured fields, validates the data, and pushes it directly to a spreadsheet. Nobody touches a keyboard.

The approach works on any unstructured document: invoices, contracts, intake forms, support emails, meeting transcripts, or research reports. If a human can read it and pull data from it, an AI can do it faster.

What You Need Before Starting: a working Python environment, an OpenAI API key, and a folder of sample documents (PDFs or text files) to process.

Step 1: Install Dependencies

pip install openai pydantic pdfplumber openpyxl

pdfplumber handles most PDF types better than PyPDF2, including tables and columns.

Step 2: Build a Document Reader

Normalize inputs to plain text before sending to the AI. Handle PDF, plain text, and raw string inputs.

import pdfplumber
from pathlib import Path

def read_document(source) -> str:
    """Normalize a document source to plain text.

    Accepts either a path (str or Path) to a .pdf/.txt/.md file, or a
    raw text string. Raw text is returned unchanged; files are read and
    stripped of surrounding whitespace.

    Raises:
        ValueError: for an existing file with an unsupported extension.
    """
    if isinstance(source, str):
        # Real paths never contain a newline or NUL byte; skip the
        # filesystem probe for obvious raw text so arbitrary document
        # content is never handed to the OS as a path.
        if "\n" in source or "\x00" in source or not Path(source).exists():
            return source  # Already text

    path = Path(source)
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        with pdfplumber.open(path) as pdf:
            # extract_text() returns None for image-only pages.
            pages = [page.extract_text() or "" for page in pdf.pages]
        return "\n\n".join(pages).strip()

    if suffix in {".txt", ".md"}:
        return path.read_text(encoding="utf-8").strip()

    raise ValueError(f"Unsupported file type: {path.suffix}")

Step 3: Define Extraction Schemas

Use Pydantic to define exactly what fields you want. Different document types need different schemas.

from pydantic import BaseModel
from typing import Optional, List
from datetime import date

class InvoiceData(BaseModel):
    """Structured fields extracted from a single invoice."""

    invoice_number: str
    vendor_name: str
    # Optional fields default to None so validation still succeeds when
    # the model omits a key entirely instead of returning an explicit
    # null (pydantic v2 treats Optional-without-default as required).
    vendor_email: Optional[str] = None
    invoice_date: str
    due_date: Optional[str] = None
    line_items: List[str]
    subtotal: Optional[float] = None
    tax: Optional[float] = None
    total_amount: float
    currency: str
    payment_terms: Optional[str] = None

class ContractParty(BaseModel):
    """One named party to a contract."""

    name: str
    role: str  # "client", "vendor", "contractor", etc.
    # Defaults keep validation from failing when the model omits a key
    # instead of returning an explicit null.
    email: Optional[str] = None
    address: Optional[str] = None

class ContractData(BaseModel):
    """Key terms extracted from a contract."""

    contract_type: str
    effective_date: str
    # Optional fields default to None so a missing key validates the
    # same as an explicit null from the model.
    expiration_date: Optional[str] = None
    parties: List[ContractParty]
    key_obligations: List[str]
    payment_amount: Optional[float] = None
    payment_frequency: Optional[str] = None
    termination_clause: Optional[str] = None
    governing_law: Optional[str] = None
class SupportEmailData(BaseModel):
    """Triage fields extracted from a customer support email."""

    # Optional fields default to None so a missing key validates the
    # same as an explicit null from the model.
    customer_name: Optional[str] = None
    customer_email: Optional[str] = None
    issue_category: str  # "billing", "technical", "general", "refund"
    urgency: str         # "low", "medium", "high", "critical"
    issue_summary: str
    product_mentioned: Optional[str] = None
    order_number: Optional[str] = None
    requested_action: str

Step 4: Build the Extraction Engine

Send document text with schema instructions to the model and validate the response.

import openai
import json
from pydantic import BaseModel, ValidationError
from typing import Type, TypeVar

# Generic type var so extract_from_document can declare that it returns
# an instance of the exact schema class it was given.
T = TypeVar("T", bound=BaseModel)
# NOTE(review): hardcoded placeholder key — in production load it from an
# environment variable (e.g. OPENAI_API_KEY) instead of source code.
client = openai.OpenAI(api_key="YOUR_API_KEY")

def extract_from_document(
    document_text: str,
    schema: Type[T],
    doc_type: str = "document",
    model: str = "gpt-4o"
) -> T:
    """Extract structured data from document text into a Pydantic model.

    Sends the text plus the schema's JSON Schema to the chat model with
    temperature 0 and JSON-mode output, then validates the reply against
    ``schema``.

    Raises:
        ValueError: if the model reply is not valid JSON, is not a JSON
            object, or fails schema validation.
    """
    schema_json = json.dumps(schema.model_json_schema(), indent=2)

    system_prompt = f"""You are a precise data extraction engine. 
Extract information from the provided {doc_type} and return ONLY a valid JSON object.
Match the schema exactly. Use null for missing optional fields.
Do not include markdown, explanations, or extra text.

Required JSON schema:
{schema_json}"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": document_text}
        ],
        temperature=0,  # deterministic extraction
        response_format={"type": "json_object"}
    )

    # content can be None (e.g. refusals/filtered output); coerce to ""
    # so the failure surfaces as a clean ValueError below, not a
    # TypeError from json.loads(None) or raw[:500].
    raw = response.choices[0].message.content or ""

    try:
        data = json.loads(raw)
        # model_validate (rather than schema(**data)) also raises a
        # catchable ValidationError when the model returns a non-object
        # JSON payload, and handles nested models like ContractParty.
        return schema.model_validate(data)
    except (json.JSONDecodeError, ValidationError) as e:
        raise ValueError(
            f"Extraction failed for {doc_type}: {e}\nRaw output: {raw[:500]}"
        ) from e

Step 5: Add Confidence Scoring

Not all extractions are equally reliable. Add a confidence check to flag low-quality results for human review.

from pydantic import BaseModel as PydanticBase
from typing import Generic

def score_extraction_confidence(extracted: BaseModel, doc_text: str) -> float:
    """Rate 0.0-1.0 how confident we are in the extraction.

    Weighted completeness score: coverage of required fields counts for
    70%, coverage of all fields for 30%. ``doc_text`` is accepted for
    API symmetry but does not currently influence the score.
    """
    fields = extracted.model_fields
    required = [name for name, info in fields.items() if info.is_required()]

    def _coverage(names):
        # Fraction of the listed fields holding a non-None value;
        # an empty field list counts as fully covered.
        if not names:
            return 1.0
        hits = sum(getattr(extracted, name) is not None for name in names)
        return hits / len(names)

    weighted = (_coverage(required) * 0.7) + (_coverage(list(fields)) * 0.3)
    return round(weighted, 2)

def extract_with_confidence(source, schema: Type[T], doc_type: str) -> dict:
    """Run the full pipeline on one document and attach a confidence score.

    Returns a dict holding the validated model ("data"), a 0-1
    confidence score, a review flag for scores below 0.8, and the
    length of the normalized document text.
    """
    text = read_document(source)
    parsed = extract_from_document(text, schema, doc_type)
    score = score_extraction_confidence(parsed, text)

    review_threshold = 0.8  # below this, route to a human reviewer
    return {
        "data": parsed,
        "confidence": score,
        "needs_review": score < review_threshold,
        "document_length": len(text),
    }

Step 6: Process a Batch of Documents

Run extraction across a folder of documents and export results.

import openpyxl
from pathlib import Path
from datetime import datetime

def process_document_batch(
    folder_path: str,
    schema: Type[T],
    doc_type: str,
    output_path: str = "extracted_data.xlsx"
) -> dict:
    """Extract structured data from every supported document in a folder.

    Processes .pdf, .txt, and .md files (everything read_document
    accepts), writes successful extractions to an Excel workbook at
    ``output_path``, and returns a summary dict with counts plus
    per-file error details. Individual file failures are recorded, not
    raised, so one bad document cannot abort the batch.
    """
    folder = Path(folder_path)
    # Sort for a deterministic processing order across platforms.
    # *.md is included because read_document already supports it.
    patterns = ("*.pdf", "*.txt", "*.md")
    files = sorted(f for pattern in patterns for f in folder.glob(pattern))

    results = []
    errors = []

    for file in files:
        print(f"Processing {file.name}...")
        try:
            result = extract_with_confidence(str(file), schema, doc_type)
            results.append({
                "file": file.name,
                "confidence": result["confidence"],
                "needs_review": result["needs_review"],
                **result["data"].model_dump()
            })
        except Exception as e:  # best-effort batch: record and continue
            errors.append({"file": file.name, "error": str(e)})
            print(f"  ERROR: {e}")

    # Write to Excel
    if results:
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = "Extracted Data"
        headers = list(results[0].keys())
        ws.append(headers)
        for row in results:
            # Leave cells blank for missing/None values instead of
            # writing the literal string "None".
            ws.append(["" if row.get(h) is None else str(row[h]) for h in headers])
        wb.save(output_path)
        print(f"Saved {len(results)} records to {output_path}")

    return {
        "processed": len(results),
        "errors": len(errors),
        "needs_review": sum(1 for r in results if r["needs_review"]),
        "error_details": errors
    }

# Run it — guarded so importing this module doesn't fire a full (paid)
# batch extraction as a side effect.
if __name__ == "__main__":
    summary = process_document_batch(
        folder_path="./invoices",
        schema=InvoiceData,
        doc_type="invoice"
    )
    print(summary)

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems