Systems Library / Finance Automation / How to Build an AI Receipt Scanner and Data Extractor
Finance Automation accounting reporting

How to Build an AI Receipt Scanner and Data Extractor

Scan and extract data from receipts automatically for accounting.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Receipts pile up in shoeboxes and email inboxes. I built an AI receipt scanner that reads receipt images, pulls out the vendor, date, amount, tax, and category, then stores everything in a structured format for accounting. No more manual data entry from crumpled paper.

Point a camera. Get structured data.

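What You Need Before Starting

Python 3 with the anthropic package installed, an Anthropic API key, and a folder of receipt images (JPG or PNG). Storage and export use sqlite3 and csv from the standard library.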

Step 1: Scan Receipts with AI Vision

import anthropic
import base64
import json

# Shared API client; scan_receipt() sends its vision requests through it.
client = anthropic.Anthropic()

def scan_receipt(image_path):
    """Send a receipt image to the vision model and return the extracted fields.

    Args:
        image_path: Path to a JPEG or PNG receipt image.

    Returns:
        The JSON object parsed from the model's reply. The prompt asks for
        the keys vendor, date, subtotal, tax, total, items, payment_method
        and category, with null for anything not visible on the receipt.

    Raises:
        OSError: If the image file cannot be opened or read.
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")

    # Match case-insensitively and accept ".jpeg" too, so files such as
    # "IMG_001.JPG" or "scan.jpeg" are not mislabelled as PNG.
    is_jpeg = str(image_path).lower().endswith((".jpg", ".jpeg"))
    media_type = "image/jpeg" if is_jpeg else "image/png"

    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=500,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": """Extract from this receipt and return JSON only:
{"vendor": "", "date": "YYYY-MM-DD", "subtotal": 0.00, "tax": 0.00, "total": 0.00, "items": [{"description": "", "amount": 0.00}], "payment_method": "", "category": ""}
If a field is not visible, use null."""}
            ]
        }]
    )

    reply = message.content[0].text.strip()
    # Keep only the outermost {...} span so a reply wrapped in a markdown
    # code fence or surrounded by a stray sentence still parses.
    start = reply.find("{")
    end = reply.rfind("}")
    if start != -1 and end > start:
        reply = reply[start:end + 1]
    return json.loads(reply)

Step 2: Store Extracted Data

import sqlite3
from datetime import datetime

def init_receipt_db(db_path="receipts.db"):
    """Open the receipts database and make sure the receipts table exists.

    Args:
        db_path: SQLite database file to open (created if it does not exist).

    Returns:
        The open sqlite3 connection.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS receipts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            vendor TEXT,
            receipt_date TEXT,
            subtotal REAL,
            tax REAL,
            total REAL,
            category TEXT,
            payment_method TEXT,
            image_path TEXT,
            items_json TEXT,
            extracted_at TEXT
        )
    """
    connection = sqlite3.connect(db_path)
    connection.execute(create_table_sql)
    connection.commit()
    return connection

def store_receipt(conn, data, image_path):
    """Insert one extracted receipt into the receipts table and commit.

    Args:
        conn: Open sqlite3 connection that already has the receipts table.
        data: Dict of extracted fields (vendor, date, subtotal, tax, total,
            category, payment_method, items). Absent keys fall back to 0 for
            amounts, "" for category/payment_method and [] for items.
        image_path: Path of the source image, stored alongside the fields.
    """
    insert_sql = "INSERT INTO receipts (vendor, receipt_date, subtotal, tax, total, category, payment_method, image_path, items_json, extracted_at) VALUES (?,?,?,?,?,?,?,?,?,?)"

    # Line items are kept as a JSON string in a single column.
    items_json = json.dumps(data.get("items", []))

    row = (
        data.get("vendor"),
        data.get("date"),
        data.get("subtotal", 0),
        data.get("tax", 0),
        data.get("total", 0),
        data.get("category", ""),
        data.get("payment_method", ""),
        image_path,
        items_json,
        datetime.now().isoformat(),
    )
    conn.execute(insert_sql, row)
    conn.commit()

Step 3: Batch Process a Folder

from pathlib import Path

def process_receipt_folder(folder_path, conn):
    """Scan and store every receipt image found directly inside a folder.

    Files whose suffix is not .jpg, .jpeg or .png (case-insensitive) are
    skipped. A failure on one file is recorded and does not stop the batch.

    Args:
        folder_path: Directory containing the receipt images.
        conn: Open database connection passed through to store_receipt.

    Returns:
        Dict with "processed" (count of stored receipts) and "errors"
        (list of {"file", "error"} dicts, one per failed file).
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    errors = []
    processed = 0

    for image_file in Path(folder_path).glob("*"):
        if image_file.suffix.lower() in image_suffixes:
            try:
                data = scan_receipt(str(image_file))
                store_receipt(conn, data, str(image_file))
                processed += 1
                print(f"Processed: {image_file.name} - {data.get('vendor', 'Unknown')} ${data.get('total', 0)}")
            except Exception as exc:
                # Keep going: one unreadable receipt should not abort the run.
                errors.append({"file": image_file.name, "error": str(exc)})

    print(f"Processed {processed} receipts. {len(errors)} errors.")
    summary = {"processed": processed, "errors": errors}
    return summary

Step 4: Validate Extracted Data

def validate_receipt(data):
    """Check an extracted receipt dict for missing or inconsistent fields.

    Args:
        data: Dict of extracted fields; reads vendor, date, total, subtotal
            and tax. Values may be None when a field was not visible.

    Returns:
        List of human-readable issue strings; empty when nothing is wrong.
    """
    def _is_number(value):
        # bool is a subclass of int; a JSON true/false is not an amount.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    issues = []
    if not data.get("vendor"):
        issues.append("Missing vendor name")
    if not data.get("date"):
        issues.append("Missing date")

    total = data.get("total")
    if not _is_number(total) or total <= 0:
        issues.append("Invalid total amount")

    subtotal = data.get("subtotal")
    tax = data.get("tax")
    # Cross-check only when all three amounts are real numbers: a null or
    # absent total is already reported above and must not crash the
    # arithmetic. A tax of 0 is a legitimate value (tax-exempt purchase) and
    # still gets checked; a zero/absent subtotal carries no information.
    if _is_number(total) and _is_number(subtotal) and subtotal and _is_number(tax):
        expected = round(subtotal + tax, 2)
        # Allow two cents of slack for rounding on the printed receipt.
        if abs(expected - total) > 0.02:
            issues.append(f"Total mismatch: subtotal+tax={expected}, total={total}")
    return issues

Step 5: Export for Accounting

import csv

def export_receipts(conn, month, output_path):
    """Write one month's receipts to a CSV file for accounting.

    Args:
        conn: Open sqlite3 connection containing the receipts table.
        month: Month to export as a "YYYY-MM" string; it is compared against
            strftime('%Y-%m', receipt_date).
        output_path: Destination CSV file; overwritten if it exists.
    """
    rows = conn.execute("""
        SELECT vendor, receipt_date, total, tax, category
        FROM receipts WHERE strftime('%Y-%m', receipt_date) = ?
        ORDER BY receipt_date
    """, (month,)).fetchall()

    # Explicit UTF-8 so vendor names with non-ASCII characters are written
    # the same way on every platform instead of depending on the locale.
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Vendor", "Date", "Total", "Tax", "Category"])
        writer.writerows(rows)

    print(f"Exported {len(rows)} receipts for {month}")

What to Build Next

Add duplicate detection that compares new receipts against existing ones by vendor, date, and amount. Duplicate receipts are a common bookkeeping error. Catching them automatically saves correction time later.

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems