How to Build an Incident Response Automation System
Automate incident detection, triage, and response workflows.
Jay Banlasan
The AI Systems Guy
When something breaks in production, every minute counts. I built an incident response automation system that detects issues, creates an incident record, notifies the right team, and tracks resolution steps. Automated triage means faster response times.
The system handles the coordination so your team focuses on the fix.
What You Need Before Starting
- Python 3.8+
- A monitoring tool (Datadog, PagerDuty, or custom health checks)
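- Slack for incident channels (the `notify_responder` function in Step 4 is a stub that only prints; swap in a Slack API call to deliver real alerts)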
- SQLite for incident tracking
Step 1: Define Severity Levels and Routing
# Routing policy keyed by severity name.
#   response_time_minutes - target time to respond, in minutes
#   notify                - recipients handed to the notifier, in order
#   create_channel        - whether a dedicated incident channel is wanted
#   page                  - whether the responder should be paged
SEVERITY_CONFIG = {
    "critical": {
        "response_time_minutes": 5,
        "notify": ["on-call-engineer", "engineering-manager", "cto"],
        "create_channel": True,
        "page": True,
    },
    "high": {
        "response_time_minutes": 15,
        "notify": ["on-call-engineer", "engineering-manager"],
        "create_channel": True,
        "page": False,
    },
    "medium": {
        "response_time_minutes": 60,
        "notify": ["on-call-engineer"],
        "create_channel": False,
        "page": False,
    },
    "low": {
        "response_time_minutes": 240,
        "notify": ["engineering-team-channel"],
        "create_channel": False,
        "page": False,
    },
}
Step 2: Create the Incident Tracker
import sqlite3
from datetime import datetime
def init_incident_db(db_path="incidents.db"):
    """Open the SQLite database at ``db_path`` and make sure the schema exists.

    Creates the ``incidents`` and ``incident_timeline`` tables if they are
    missing, commits, and returns the open connection. The caller owns the
    connection and is responsible for closing it.
    """
    # One row per incident; ``status`` defaults to 'detected'.
    incidents_ddl = (
        "\n"
        "CREATE TABLE IF NOT EXISTS incidents (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "title TEXT,\n"
        "description TEXT,\n"
        "severity TEXT,\n"
        "status TEXT DEFAULT 'detected',\n"
        "detected_at TEXT,\n"
        "acknowledged_at TEXT,\n"
        "resolved_at TEXT,\n"
        "assigned_to TEXT,\n"
        "slack_channel TEXT,\n"
        "root_cause TEXT\n"
        ")\n"
    )
    # Append-only log of events, linked to an incident by ``incident_id``.
    timeline_ddl = (
        "\n"
        "CREATE TABLE IF NOT EXISTS incident_timeline (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "incident_id INTEGER,\n"
        "event TEXT,\n"
        "actor TEXT,\n"
        "timestamp TEXT\n"
        ")\n"
    )

    connection = sqlite3.connect(db_path)
    for ddl in (incidents_ddl, timeline_ddl):
        connection.execute(ddl)
    connection.commit()
    return connection
Step 3: Create an Incident Automatically
def create_incident(conn, title, description, severity):
    """Record a new incident and notify everyone routed for its severity.

    Inserts a row into ``incidents`` plus an "Incident detected" entry in
    ``incident_timeline``, then calls ``notify_responder`` once for each
    recipient in ``SEVERITY_CONFIG[severity]["notify"]``. Only the ``notify``
    list is read from the config; the other config fields are not acted on
    here.

    Args:
        conn: open sqlite3 connection with the incident tables created.
        title: short incident title.
        description: longer free-text description.
        severity: a key of ``SEVERITY_CONFIG``.

    Returns:
        The id of the newly inserted incident row.

    Raises:
        KeyError: if ``severity`` is not a key of ``SEVERITY_CONFIG``. The
            lookup happens before anything is written.
    """
    config = SEVERITY_CONFIG[severity]

    # Take the clock once so the incident row and its first timeline entry
    # carry exactly the same detection time.
    detected_at = datetime.now().isoformat()

    # The connection context manager commits on success and rolls back on
    # error, so the incident and its timeline entry are written together or
    # not at all.
    with conn:
        cursor = conn.execute(
            "INSERT INTO incidents (title, description, severity, detected_at) VALUES (?,?,?,?)",
            (title, description, severity, detected_at),
        )
        incident_id = cursor.lastrowid
        conn.execute(
            "INSERT INTO incident_timeline (incident_id, event, actor, timestamp) VALUES (?,?,?,?)",
            (incident_id, "Incident detected", "system", detected_at),
        )

    # Notify only after the record is committed.
    for person in config["notify"]:
        notify_responder(person, incident_id, title, severity)
    return incident_id
Step 4: Track Incident Lifecycle
def acknowledge_incident(conn, incident_id, responder):
    """Mark an incident as acknowledged and assign it to ``responder``.

    Sets ``status='acknowledged'``, ``acknowledged_at`` and ``assigned_to`` on
    the incident row and appends an "Incident acknowledged" timeline entry
    attributed to ``responder``.

    Args:
        conn: open sqlite3 connection with the incident tables created.
        incident_id: id of the incident to acknowledge.
        responder: name recorded as assignee and as the timeline actor.

    Raises:
        ValueError: if no incident with ``incident_id`` exists. Nothing is
            written in that case.
    """
    # Single clock read so the row and the timeline entry agree.
    acknowledged_at = datetime.now().isoformat()

    # Commit both statements together; roll back if anything fails.
    with conn:
        cursor = conn.execute(
            "UPDATE incidents SET status='acknowledged', acknowledged_at=?, assigned_to=? WHERE id=?",
            (acknowledged_at, responder, incident_id),
        )
        # An UPDATE that matched no row means the id is unknown; refuse to
        # log a timeline entry for an incident that does not exist.
        if cursor.rowcount == 0:
            raise ValueError(f"No incident with id {incident_id!r}")
        conn.execute(
            "INSERT INTO incident_timeline (incident_id, event, actor, timestamp) VALUES (?,?,?,?)",
            (incident_id, "Incident acknowledged", responder, acknowledged_at),
        )
def resolve_incident(conn, incident_id, responder, root_cause):
    """Mark an incident as resolved and record its root cause.

    Sets ``status='resolved'``, ``resolved_at`` and ``root_cause`` on the
    incident row and appends a ``"Resolved: <root_cause>"`` timeline entry.
    ``responder`` is stored only as the timeline actor; the incident's
    ``assigned_to`` column is left untouched.

    Args:
        conn: open sqlite3 connection with the incident tables created.
        incident_id: id of the incident to resolve.
        responder: name recorded as the timeline actor.
        root_cause: free-text root cause stored on the incident.

    Raises:
        ValueError: if no incident with ``incident_id`` exists. Nothing is
            written in that case.
    """
    # Single clock read so the row and the timeline entry agree.
    resolved_at = datetime.now().isoformat()

    # Commit both statements together; roll back if anything fails.
    with conn:
        cursor = conn.execute(
            "UPDATE incidents SET status='resolved', resolved_at=?, root_cause=? WHERE id=?",
            (resolved_at, root_cause, incident_id),
        )
        # No matching row means the id is unknown; do not leave an orphan
        # timeline entry behind.
        if cursor.rowcount == 0:
            raise ValueError(f"No incident with id {incident_id!r}")
        conn.execute(
            "INSERT INTO incident_timeline (incident_id, event, actor, timestamp) VALUES (?,?,?,?)",
            (incident_id, f"Resolved: {root_cause}", responder, resolved_at),
        )
def notify_responder(person, incident_id, title, severity):
    """Emit a one-line alert for ``person`` about the given incident.

    This implementation only prints the alert to standard output; the
    severity is shown upper-cased inside square brackets.
    """
    severity_tag = severity.upper()
    alert_line = f"ALERT [{severity_tag}] to {person}: #{incident_id} - {title}"
    print(alert_line)
Step 5: Generate Incident Reports
def _minutes_between(start_iso, end_iso):
    """Return minutes from ``start_iso`` to ``end_iso``, or None if either is missing."""
    if not start_iso or not end_iso:
        return None
    delta = datetime.fromisoformat(end_iso) - datetime.fromisoformat(start_iso)
    return delta.total_seconds() / 60


def incident_report(conn, incident_id):
    """Build a Markdown report for one incident.

    The report has a title line, a severity/status line, optional
    "Time to Acknowledge", "Time to Resolve" and "Root Cause" lines (each
    emitted only when the underlying value is present), and a timeline of all
    ``incident_timeline`` entries for the incident in timestamp order.

    Args:
        conn: open sqlite3 connection with the incident tables created.
        incident_id: id of the incident to report on.

    Returns:
        The report as a single string.

    Raises:
        ValueError: if no incident with ``incident_id`` exists.
    """
    # Name the columns explicitly instead of SELECT * so the code does not
    # depend on the table's physical column order.
    inc = conn.execute(
        "SELECT id, title, severity, status, detected_at, acknowledged_at, "
        "resolved_at, root_cause FROM incidents WHERE id=?",
        (incident_id,),
    ).fetchone()
    if inc is None:
        raise ValueError(f"No incident with id {incident_id!r}")
    (inc_id, title, severity, status,
     detected_at, acknowledged_at, resolved_at, root_cause) = inc

    # ``id`` breaks ties between entries that share a timestamp so the
    # timeline order is deterministic.
    timeline = conn.execute(
        "SELECT event, actor, timestamp FROM incident_timeline "
        "WHERE incident_id=? ORDER BY timestamp, id",
        (incident_id,),
    ).fetchall()

    minutes_to_ack = _minutes_between(detected_at, acknowledged_at)
    minutes_to_resolve = _minutes_between(detected_at, resolved_at)

    parts = [
        f"# Incident #{inc_id}: {title}\n\n",
        f"Severity: {severity} | Status: {status}\n",
    ]
    # Compare against None, not truthiness: a duration that rounds to zero
    # minutes is still worth reporting.
    if minutes_to_ack is not None:
        parts.append(f"Time to Acknowledge: {round(minutes_to_ack)}min\n")
    if minutes_to_resolve is not None:
        parts.append(f"Time to Resolve: {round(minutes_to_resolve)}min\n")
    if root_cause:
        parts.append(f"Root Cause: {root_cause}\n\n")
    parts.append("## Timeline\n\n")
    parts.extend(f"- {ts}: {event} ({actor})\n" for event, actor, ts in timeline)
    return "".join(parts)
What to Build Next
Add postmortem generation that pulls the incident timeline and uses AI to draft a root cause analysis with action items. The incident is over when the fix is deployed. The learning is captured in the postmortem.
Related Reading
- Building Your First Automation: A Complete Guide - automation fundamentals
- The Feedback Loop That Powers Everything - incident feedback loops
- Identifying Your Biggest Bottleneck - finding system weak points
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment