Incident Response
Incident Management
Automated incident detection with PagerDuty integration, intelligent escalation policies, and a blameless postmortem culture, achieving an 8-minute mean time to acknowledge (MTTA).
8 min MTTA
32 min MTTR
12 Incidents/Month
99.8% SLA Compliance
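The intelligent escalation policies referenced above pair each severity with tiered on-call targets and acknowledgement timeouts. The sketch below is illustrative only; the tier names, delays, and targets are assumptions, not the production PagerDuty configuration.

# incident_management/escalation.py (illustrative sketch; tiers and
# timeouts are assumptions, not the production policy)
from dataclasses import dataclass
from typing import List

@dataclass
class EscalationLevel:
    delay_minutes: int   # wait this long before escalating to this level
    targets: List[str]   # who gets paged at this level

@dataclass
class EscalationPolicy:
    name: str
    levels: List[EscalationLevel]
    repeat: int = 1      # how many times to loop through the levels

# Hypothetical policy: primary on-call first, then secondary, then the
# engineering manager if nobody acknowledges within the MTTA target.
CRITICAL_POLICY = EscalationPolicy(
    name="critical-24x7",
    levels=[
        EscalationLevel(delay_minutes=0, targets=["oncall-primary"]),
        EscalationLevel(delay_minutes=5, targets=["oncall-secondary"]),
        EscalationLevel(delay_minutes=15, targets=["eng-manager"]),
    ],
    repeat=2,
)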
Alert Configuration
Multi-channel alerting with intelligent deduplication and noise reduction, achieving a 95% actionable alert rate.
# incident_management/alerting.py
from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum
import hashlib
import time
class AlertSeverity(Enum):
    CRITICAL = "critical"  # Page immediately
    HIGH = "high"          # Page within 5 min
    MEDIUM = "medium"      # Slack notification
    LOW = "low"            # Ticket creation
    INFO = "info"          # Dashboard only

class AlertChannel(Enum):
    PAGERDUTY = "pagerduty"
    SLACK = "slack"
    EMAIL = "email"
    SMS = "sms"
    PHONE = "phone"
@dataclass
class AlertRule:
    name: str
    condition: str
    severity: AlertSeverity
    channels: List[AlertChannel]
    dedup_key: Optional[str] = None
    cooldown_minutes: int = 5
    runbook_url: Optional[str] = None
# Production alert rules
ALERT_RULES = [
    AlertRule(
        name="API Error Rate Critical",
        condition="error_rate > 5% for 2 minutes",
        severity=AlertSeverity.CRITICAL,
        channels=[AlertChannel.PAGERDUTY, AlertChannel.SLACK],
        dedup_key="api_error_rate",
        runbook_url="https://runbooks.justkalm.com/api-errors",
    ),
    AlertRule(
        name="API Latency P99 High",
        condition="p99_latency > 2000ms for 3 minutes",
        severity=AlertSeverity.HIGH,
        channels=[AlertChannel.PAGERDUTY, AlertChannel.SLACK],
        dedup_key="api_latency_p99",
        runbook_url="https://runbooks.justkalm.com/latency",
    ),
    AlertRule(
        name="Database Connection Pool Exhausted",
        condition="db_connections > 90% for 1 minute",
        severity=AlertSeverity.CRITICAL,
        channels=[AlertChannel.PAGERDUTY, AlertChannel.SLACK],
        dedup_key="db_connections",
        runbook_url="https://runbooks.justkalm.com/database",
    ),
    AlertRule(
        name="ML Model Accuracy Degraded",
        condition="model_accuracy < 92% for 10 minutes",
        severity=AlertSeverity.HIGH,
        channels=[AlertChannel.SLACK, AlertChannel.EMAIL],
        dedup_key="ml_accuracy",
        cooldown_minutes=30,
        runbook_url="https://runbooks.justkalm.com/ml-accuracy",
    ),
    AlertRule(
        name="Disk Usage Critical",
        condition="disk_usage > 85% on any node",
        severity=AlertSeverity.HIGH,
        channels=[AlertChannel.PAGERDUTY, AlertChannel.SLACK],
        dedup_key="disk_usage",
        runbook_url="https://runbooks.justkalm.com/disk",
    ),
]
@dataclass
class Alert:
    """Record of a currently open alert, kept for deduplication."""
    rule: AlertRule
    incident_id: str
    occurrences: int = 1

class AlertManager:
    """
    Manages alert routing, deduplication,
    and noise reduction.
    """
    def __init__(self, pagerduty_client, slack_client):
        self.pagerduty = pagerduty_client
        self.slack = slack_client
        self.active_alerts: Dict[str, Alert] = {}
        self.cooldowns: Dict[str, float] = {}

    async def process_alert(
        self,
        rule: AlertRule,
        context: Dict
    ) -> Optional[str]:
        """
        Process incoming alert with dedup
        and routing logic.
        """
        dedup_key = self._compute_dedup_key(rule, context)

        # Check cooldown
        if self._in_cooldown(dedup_key, rule.cooldown_minutes):
            return None

        # Check for existing active alert
        if dedup_key in self.active_alerts:
            return self._update_existing_alert(dedup_key, context)

        # Create new incident
        incident_id = await self._create_incident(rule, context)

        # Set cooldown
        self._set_cooldown(dedup_key)
        return incident_id

    def _compute_dedup_key(self, rule: AlertRule, context: Dict) -> str:
        # Use the rule's explicit key when set; otherwise hash the rule
        # name plus the affected service so repeat firings collapse.
        if rule.dedup_key:
            return rule.dedup_key
        raw = f"{rule.name}:{context.get('service', 'unknown')}"
        return hashlib.sha256(raw.encode()).hexdigest()[:16]

    def _in_cooldown(self, dedup_key: str, cooldown_minutes: int) -> bool:
        last_fired = self.cooldowns.get(dedup_key)
        return (
            last_fired is not None
            and time.monotonic() - last_fired < cooldown_minutes * 60
        )

    def _set_cooldown(self, dedup_key: str) -> None:
        self.cooldowns[dedup_key] = time.monotonic()

    def _update_existing_alert(self, dedup_key: str, context: Dict) -> str:
        # Known alert re-fired: count the occurrence instead of paging
        # again, and hand back the existing incident id.
        alert = self.active_alerts[dedup_key]
        alert.occurrences += 1
        return alert.incident_id

    async def _create_incident(self, rule: AlertRule, context: Dict) -> str:
        # Route to the configured channels; the injected clients are thin
        # async wrappers expected to expose trigger()/post() methods.
        incident_id = ""
        if AlertChannel.PAGERDUTY in rule.channels:
            incident_id = await self.pagerduty.trigger(
                summary=rule.name,
                severity=rule.severity.value,
                details={**context, "runbook": rule.runbook_url},
            )
        if AlertChannel.SLACK in rule.channels:
            await self.slack.post(
                f"[{rule.severity.value.upper()}] {rule.name} "
                f"(runbook: {rule.runbook_url})"
            )
        self.active_alerts[self._compute_dedup_key(rule, context)] = Alert(
            rule=rule, incident_id=incident_id
        )
        return incident_id
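A minimal usage sketch of the alerting module above. FakePagerDuty and FakeSlack are stand-ins for whatever thin async clients wrap the real APIs; they exist only to show how a metric snapshot flows through process_alert and how the cooldown suppresses an immediate re-fire.

# Illustrative only: the fake clients below are not the real SDKs; they
# just satisfy the interface AlertManager expects.
import asyncio

class FakePagerDuty:
    async def trigger(self, summary, severity, details):
        print(f"PAGE [{severity}] {summary}")
        return "INC-0001"

class FakeSlack:
    async def post(self, message):
        print(f"SLACK {message}")

async def main():
    manager = AlertManager(FakePagerDuty(), FakeSlack())
    rule = ALERT_RULES[0]  # "API Error Rate Critical"
    context = {"service": "api", "error_rate": "7.2%"}

    incident_id = await manager.process_alert(rule, context)
    print("created:", incident_id)

    # An immediate re-fire is suppressed by the cooldown window.
    assert await manager.process_alert(rule, context) is None

asyncio.run(main())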
Active Incidents
INC-2847 (HIGH): Elevated latency in ML scoring service
Started: 12 min ago • Assignee: @sarah • Status: Investigating
Recently Resolved
INC-2846: Cache cluster failover
Resolved in 18 min • 2 hrs ago
INC-2845: Database connection spike
Resolved in 8 min • 6 hrs ago
INC-2844: CDN origin timeout
Resolved in 25 min • 1 day ago
Incident Lifecycle
Detected: 0 min
Acknowledged: 2 min
Investigating: 5 min
Mitigating: 15 min
Resolved: 32 min
Postmortem: +3 days
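The lifecycle above maps naturally onto a small state machine in which each stage can only advance to the next. The sketch below is illustrative; the transition table and function names are assumptions rather than part of the incident tooling.

# incident_management/lifecycle.py (illustrative; stage names follow the
# timeline above, the transition table itself is an assumption)
from enum import Enum

class IncidentStage(Enum):
    DETECTED = "detected"
    ACKNOWLEDGED = "acknowledged"
    INVESTIGATING = "investigating"
    MITIGATING = "mitigating"
    RESOLVED = "resolved"
    POSTMORTEM = "postmortem"

# Each stage may only advance to the next one; postmortem is terminal.
ALLOWED_TRANSITIONS = {
    IncidentStage.DETECTED: {IncidentStage.ACKNOWLEDGED},
    IncidentStage.ACKNOWLEDGED: {IncidentStage.INVESTIGATING},
    IncidentStage.INVESTIGATING: {IncidentStage.MITIGATING},
    IncidentStage.MITIGATING: {IncidentStage.RESOLVED},
    IncidentStage.RESOLVED: {IncidentStage.POSTMORTEM},
    IncidentStage.POSTMORTEM: set(),
}

def advance(current: IncidentStage, target: IncidentStage) -> IncidentStage:
    """Move an incident to the next stage, rejecting skipped steps."""
    if target not in ALLOWED_TRANSITIONS[current]:
        raise ValueError(f"cannot move from {current.value} to {target.value}")
    return target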
Resolve Incidents Faster
Every minute matters. Our incident management ensures rapid response and continuous improvement.
8 min MTTA • 32 min MTTR • 99.8% SLA