How to Build an AI Receipt Scanner and Data Extractor
Scan and extract data from receipts automatically for accounting.
Jay Banlasan
The AI Systems Guy
Receipts pile up in shoeboxes and email inboxes. I built an AI receipt scanner and data extractor that reads receipt images, pulls out the vendor, date, amount, tax, and category, then stores everything in a structured format for accounting. No more manual data entry from crumpled paper.
Point a camera. Get structured data.
What You Need Before Starting
- Python 3.8+
- An AI API key with vision capabilities (Claude or GPT-4V); the code below uses Claude through the `anthropic` Python package
- A receipt storage directory
- SQLite for extracted data
Step 1: Scan Receipts with AI Vision
import anthropic
import base64
import json
# Shared API client used by scan_receipt() for every request.
# NOTE(review): no key is passed here, so the SDK presumably reads it from
# the environment (ANTHROPIC_API_KEY) — confirm before deploying.
client = anthropic.Anthropic()
def scan_receipt(image_path):
    """Send a receipt image to the vision model and return the extracted fields.

    Args:
        image_path: Path to the receipt image, as a string. Files ending in
            ".jpg" or ".jpeg" (any letter case) are sent as JPEG; every
            other file is sent as PNG.

    Returns:
        The value parsed from the model's JSON reply. The prompt asks for an
        object with the keys vendor, date, subtotal, tax, total, items,
        payment_method and category, using null for fields that are not
        visible on the receipt.

    Raises:
        OSError: If the image file cannot be opened or read.
        json.JSONDecodeError: If the reply does not contain valid JSON.
        Errors raised by the API client are not caught here.
    """
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")

    # Match case-insensitively and accept ".jpeg" as well as ".jpg";
    # otherwise such JPEG files would be labelled as PNG.
    is_jpeg = image_path.lower().endswith((".jpg", ".jpeg"))
    media_type = "image/jpeg" if is_jpeg else "image/png"

    prompt = (
        "Extract from this receipt and return JSON only:\n"
        '{"vendor": "", "date": "YYYY-MM-DD", "subtotal": 0.00, "tax": 0.00, "total": 0.00, "items": [{"description": "", "amount": 0.00}], "payment_method": "", "category": ""}\n'
        "If a field is not visible, use null."
    )

    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=500,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_data,
                    },
                },
                {"type": "text", "text": prompt},
            ],
        }],
    )

    reply = message.content[0].text
    # The reply may be wrapped in a Markdown code fence or carry a short
    # preamble despite the "JSON only" instruction; keep just the outermost
    # {...} object. If no braces are found, the raw text is parsed so the
    # caller still gets a JSONDecodeError.
    start = reply.find("{")
    end = reply.rfind("}")
    if start != -1 and end > start:
        reply = reply[start:end + 1]
    return json.loads(reply)
Step 2: Store Extracted Data
import sqlite3
from datetime import datetime
def init_receipt_db(db_path="receipts.db"):
    """Open the receipts database and make sure the ``receipts`` table exists.

    Args:
        db_path: Location of the SQLite database; defaults to "receipts.db".

    Returns:
        The open sqlite3 connection, ready for store_receipt-style inserts.
    """
    schema = (
        "\n"
        "CREATE TABLE IF NOT EXISTS receipts (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "vendor TEXT,\n"
        "receipt_date TEXT,\n"
        "subtotal REAL,\n"
        "tax REAL,\n"
        "total REAL,\n"
        "category TEXT,\n"
        "payment_method TEXT,\n"
        "image_path TEXT,\n"
        "items_json TEXT,\n"
        "extracted_at TEXT\n"
        ")\n"
    )
    db = sqlite3.connect(db_path)
    # IF NOT EXISTS makes this safe to run against an existing database.
    db.execute(schema)
    db.commit()
    return db
def store_receipt(conn, data, image_path):
    """Insert one extracted receipt into the ``receipts`` table and commit.

    Args:
        conn: Open sqlite3 connection to a database containing the
            ``receipts`` table.
        data: Dict of extracted fields; the keys read are vendor, date,
            subtotal, tax, total, category, payment_method and items.
        image_path: Path of the source image, stored with the row.

    Missing keys and explicit ``None`` values are treated the same way:
    amounts fall back to 0, category and payment_method to "", items to an
    empty list, while vendor and date are stored as NULL.
    """
    def field(key, default=None):
        # dict.get(key, default) only applies the default when the key is
        # absent; the extractor is told to emit null for unreadable fields,
        # so a present-but-None value must fall back as well.
        value = data.get(key)
        return default if value is None else value

    conn.execute(
        "INSERT INTO receipts (vendor, receipt_date, subtotal, tax, total, category, payment_method, image_path, items_json, extracted_at) VALUES (?,?,?,?,?,?,?,?,?,?)",
        (
            field("vendor"),
            field("date"),
            field("subtotal", 0),
            field("tax", 0),
            field("total", 0),
            field("category", ""),
            field("payment_method", ""),
            image_path,
            # Always store a JSON array so readers can iterate the result.
            json.dumps(field("items", [])),
            datetime.now().isoformat(),
        ),
    )
    conn.commit()
Step 3: Batch Process a Folder
from pathlib import Path
def process_receipt_folder(folder_path, conn):
    """Scan and store every JPG/JPEG/PNG file found directly in a folder.

    Args:
        folder_path: Directory to look in (not searched recursively).
        conn: Database connection handed to store_receipt for each receipt.

    Returns:
        A dict with "processed" (number of receipts stored) and "errors"
        (a list of {"file", "error"} dicts, one per file that failed).
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    done = 0
    failures = []

    candidates = (
        entry
        for entry in Path(folder_path).glob("*")
        if entry.suffix.lower() in image_suffixes
    )
    for image_file in candidates:
        # One bad receipt must not abort the batch: record it and move on.
        try:
            data = scan_receipt(str(image_file))
            store_receipt(conn, data, str(image_file))
            done += 1
            print(f"Processed: {image_file.name} - {data.get('vendor', 'Unknown')} ${data.get('total', 0)}")
        except Exception as exc:
            failures.append({"file": image_file.name, "error": str(exc)})

    print(f"Processed {done} receipts. {len(failures)} errors.")
    return {"processed": done, "errors": failures}
Step 4: Validate Extracted Data
def validate_receipt(data):
    """Check an extracted receipt for missing or inconsistent fields.

    Args:
        data: Dict of extracted fields; the keys read are vendor, date,
            total, subtotal and tax. Missing keys and None values are
            treated alike.

    Returns:
        A list of human-readable issue strings; empty when nothing is wrong.
    """
    issues = []
    if not data.get("vendor"):
        issues.append("Missing vendor name")
    if not data.get("date"):
        issues.append("Missing date")

    total = data.get("total")
    if not total or total <= 0:
        issues.append("Invalid total amount")

    subtotal = data.get("subtotal")
    tax = data.get("tax")
    # Compare against None rather than truthiness so a legitimate zero tax
    # (or zero subtotal) still gets checked. The check is skipped when the
    # total is absent: that case is already reported above and there is
    # nothing to compare against.
    if subtotal is not None and tax is not None and total is not None:
        expected = round(subtotal + tax, 2)
        # Allow two cents of slack for rounding on the printed receipt.
        if abs(expected - total) > 0.02:
            issues.append(f"Total mismatch: subtotal+tax={expected}, total={total}")
    return issues
Step 5: Export for Accounting
import csv
def export_receipts(conn, month, output_path):
    """Write one month's receipts to a CSV file, ordered by receipt date.

    Args:
        conn: Open sqlite3 connection to the receipts database.
        month: Month to export, compared against
            strftime('%Y-%m', receipt_date), e.g. "2024-03".
        output_path: Destination CSV file; overwritten if it exists.
    """
    query = (
        "\n"
        "SELECT vendor, receipt_date, total, tax, category\n"
        "FROM receipts WHERE strftime('%Y-%m', receipt_date) = ?\n"
        "ORDER BY receipt_date\n"
    )
    records = conn.execute(query, (month,)).fetchall()

    header = ["Vendor", "Date", "Total", "Tax", "Category"]
    # newline="" lets the csv module control line endings itself.
    with open(output_path, "w", newline="") as out:
        csv.writer(out).writerows([header, *records])

    print(f"Exported {len(records)} receipts for {month}")
What to Build Next
Add duplicate detection that compares new receipts against existing ones by vendor, date, and amount. Duplicate receipts are a common bookkeeping error. Catching them automatically saves correction time later.
Related Reading
- Building Your First Automation: A Complete Guide - automation fundamentals
- Input, Process, Output: The Universal AI Framework - AI processing patterns
- Cost of Manual vs Cost of Automated - the cost of manual receipt entry
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment