Alerting
Learn how to implement comprehensive alerting systems for your applications. This guide covers alert configuration, thresholds, notification channels, and alert management.
Prerequisites
- Understanding of monitoring concepts
- Experience with metrics collection
- Knowledge of notification systems
- Familiarity with incident management
Alerting Overview

Visual representation of the alerting system workflow and components.
Alert Configuration
Set up the core alerting system:
// Core alerting configuration: severity levels, alert categories, and the
// factory method that creates, persists, and announces a new alert.
const alertSystem = {
  // Severity levels, ordered by priority (1 is the most urgent).
  // `responseTime` is expressed in seconds.
  levels: {
    critical: {
      priority: 1,
      color: '#FF0000',
      responseTime: 15 * 60, // 15 minutes
      channels: ['email', 'sms', 'slack', 'pagerduty'],
    },
    high: {
      priority: 2,
      color: '#FFA500',
      responseTime: 60 * 60, // 1 hour
      channels: ['email', 'slack', 'pagerduty'],
    },
    medium: {
      priority: 3,
      color: '#FFFF00',
      responseTime: 4 * 60 * 60, // 4 hours
      channels: ['email', 'slack'],
    },
    low: {
      priority: 4,
      color: '#00FF00',
      responseTime: 24 * 60 * 60, // 24 hours
      channels: ['email'],
    },
  },

  // Alert categories and the metrics each one watches.
  types: {
    performance: {
      category: 'system',
      metrics: ['cpu', 'memory', 'disk', 'latency'],
    },
    availability: {
      category: 'system',
      metrics: ['uptime', 'error_rate', 'success_rate'],
    },
    security: {
      category: 'security',
      metrics: ['failed_logins', 'suspicious_activity', 'data_access'],
    },
    business: {
      category: 'business',
      metrics: ['conversion_rate', 'revenue', 'user_activity'],
    },
  },

  /**
   * Builds a new alert in the 'open' state, stores it, then sends
   * notifications for it (in that order).
   *
   * `generateId`, `storeAlert` and `notifyAlert` are called on this object
   * but are not defined in this snippet; they must be provided elsewhere.
   *
   * @param {object} data - Supplies `level`, `type`, `source`, `message`
   *   and `details` for the alert; the values are copied as-is.
   * @returns {Promise<object>} The alert record that was stored and notified.
   */
  async createAlert(data) {
    const id = this.generateId();
    const timestamp = new Date();
    const { level, type, source, message, details } = data;
    const record = {
      id,
      timestamp,
      level,
      type,
      source,
      message,
      details,
      status: 'open',
    };

    await this.storeAlert(record);
    await this.notifyAlert(record);
    return record;
  },
};
Threshold Configuration
Define alert thresholds and conditions:
// Alert thresholds configuration
//
// Each metric lists the limit for every severity plus `duration`, the window
// (in seconds) that is passed to the history lookup in `conditions.evaluate`.
// By default a metric breaches a limit when its value is greater than or equal
// to it. A metric that is unhealthy when it DROPS sets `direction: 'below'`.
const alertThresholds = {
  // System metrics
  system: {
    cpu: {
      critical: 95,
      high: 85,
      medium: 75,
      duration: 300 // 5 minutes
    },
    memory: {
      critical: 95,
      high: 90,
      medium: 80,
      duration: 300
    },
    disk: {
      critical: 95,
      high: 90,
      medium: 80,
      duration: 0 // immediate
    },
    latency: {
      critical: 1000, // ms
      high: 500,
      medium: 200,
      duration: 300
    }
  },
  // Application metrics
  application: {
    error_rate: {
      critical: 10, // percentage
      high: 5,
      medium: 1,
      duration: 300
    },
    response_time: {
      critical: 2000, // ms
      high: 1000,
      medium: 500,
      duration: 300
    },
    success_rate: {
      critical: 90, // percentage (below this)
      high: 95,
      medium: 98,
      duration: 300,
      // Lower is worse for this metric, so limits are breached from below.
      direction: 'below'
    }
  },
  // Custom alert conditions
  conditions: {
    /**
     * Maps a metric reading to a severity level.
     *
     * `getThresholdForMetric`, `getMetricHistory` and `isSustained` are called
     * on this object but are not defined in this snippet; they must be
     * provided elsewhere.
     *
     * @param {string} metric - Name of the metric being evaluated.
     * @param {number} value - Current reading of the metric.
     * @returns {Promise<'critical'|'high'|'medium'|null>} The most severe
     *   level whose limit is breached, or null when there is no threshold for
     *   the metric, the condition is not sustained, or no limit is breached.
     */
    async evaluate(metric, value) {
      const threshold = this.getThresholdForMetric(metric);
      if (!threshold) return null;

      // Check whether the condition held for the configured duration.
      const history = await this.getMetricHistory(metric, threshold.duration);
      if (!this.isSustained(history, threshold)) return null;

      // Comparing every metric with `>=` would flag a healthy 99% success
      // rate as critical, so the comparison follows the threshold's direction.
      const isBreached = threshold.direction === 'below'
        ? (limit) => value < limit
        : (limit) => value >= limit;

      // Most severe first, so the first match is the level to report.
      for (const level of ['critical', 'high', 'medium']) {
        if (isBreached(threshold[level])) return level;
      }
      return null;
    }
  }
};
Notification Channels
Configure alert notification channels:
// Notification channels configuration
//
// Every channel exposes `send(alert)`. The service clients (`emailService`,
// `slackClient`, `pagerdutyClient`) and the lookup helpers called through
// `this` (`getRecipients`, `getTemplate`, `getChannel`, `getServiceId`) are
// not defined in this snippet and must be supplied by the surrounding code.
const notificationChannels = {
  email: {
    /**
     * Sends the alert by email, using the recipients and template resolved
     * for this alert.
     *
     * @param {object} alert - Alert with at least `level` and `message`.
     * @returns {Promise<*>} Whatever `emailService.send` resolves to.
     */
    async send(alert) {
      const recipients = await this.getRecipients(alert);
      const template = this.getTemplate(alert);
      const message = {
        to: recipients,
        subject: `[${alert.level.toUpperCase()}] ${alert.message}`,
        html: template.html,
        text: template.text
      };
      return await emailService.send(message);
    }
  },
  slack: {
    /**
     * Posts the alert to the Slack channel resolved for it. `text` is sent
     * alongside the blocks as a plain-text summary.
     *
     * @param {object} alert - Alert to post.
     * @returns {Promise<*>} Whatever `slackClient.chat.postMessage` resolves to.
     */
    async send(alert) {
      const channel = await this.getChannel(alert);
      const blocks = this.formatSlackMessage(alert);
      return await slackClient.chat.postMessage({
        channel,
        blocks,
        text: `[${alert.level.toUpperCase()}] ${alert.message}`
      });
    },
    /**
     * Builds the Block Kit payload for an alert: header, summary fields,
     * details (when present) and action buttons.
     *
     * @param {object} alert - Alert with `id`, `level`, `type`, `source`,
     *   `message`, `timestamp` (a Date, or any value `new Date()` accepts)
     *   and optional `details`.
     * @returns {object[]} Array of Slack blocks.
     * @throws {RangeError} If `alert.timestamp` is not a valid date.
     */
    formatSlackMessage(alert) {
      // Block Kit rejects the whole message when a text field is too long,
      // so every interpolated value is clipped to its documented limit.
      const HEADER_TEXT_LIMIT = 150;
      const FIELD_TEXT_LIMIT = 2000;
      const SECTION_TEXT_LIMIT = 3000;
      const clip = (text, limit) =>
        text.length > limit ? `${text.slice(0, limit - 1)}…` : text;
      const field = (text) => ({
        type: 'mrkdwn',
        text: clip(text, FIELD_TEXT_LIMIT)
      });

      // An alert read back from storage may carry its timestamp as a string
      // or a number; normalise it before formatting.
      const isoTime = new Date(alert.timestamp).toISOString();

      const blocks = [
        {
          type: 'header',
          text: {
            type: 'plain_text',
            text: clip(`Alert: ${alert.message}`, HEADER_TEXT_LIMIT)
          }
        },
        {
          type: 'section',
          fields: [
            field(`*Level:* ${alert.level}`),
            field(`*Time:* ${isoTime}`),
            field(`*Source:* ${alert.source}`),
            field(`*Type:* ${alert.type}`)
          ]
        }
      ];

      // JSON.stringify yields undefined when there are no details; skip the
      // section instead of printing the literal text "undefined".
      const detailsJson = JSON.stringify(alert.details, null, 2);
      if (detailsJson !== undefined) {
        blocks.push({
          type: 'section',
          text: {
            type: 'mrkdwn',
            text: clip(`*Details:*\n${detailsJson}`, SECTION_TEXT_LIMIT)
          }
        });
      }

      blocks.push({
        type: 'actions',
        elements: [
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'View Details'
            },
            url: `https://dashboard.example.com/alerts/${alert.id}`
          },
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'Acknowledge'
            },
            value: `acknowledge_${alert.id}`
          }
        ]
      });

      return blocks;
    }
  },
  pagerduty: {
    /**
     * Opens a PagerDuty incident for the alert on the service resolved for it.
     *
     * @param {object} alert - Alert with `message`, `level` and `details`.
     * @returns {Promise<*>} Whatever `pagerdutyClient.incidents.create`
     *   resolves to.
     */
    async send(alert) {
      const serviceId = await this.getServiceId(alert);
      return await pagerdutyClient.incidents.create({
        incident: {
          type: 'incident',
          title: alert.message,
          service: {
            id: serviceId,
            type: 'service_reference'
          },
          urgency: this.mapLevelToUrgency(alert.level),
          body: {
            type: 'incident_body',
            details: JSON.stringify(alert.details)
          }
        }
      });
    },
    /**
     * Translates an alert level into an incident urgency.
     *
     * @param {string} level - Alert level.
     * @returns {'high'|'low'} 'high' for critical and high alerts, 'low' for
     *   everything else, including unknown levels.
     */
    mapLevelToUrgency(level) {
      const map = {
        critical: 'high',
        high: 'high',
        medium: 'low',
        low: 'low'
      };
      // Own-property check: a plain `map[level]` lookup would return an
      // inherited member (e.g. for 'constructor') instead of the fallback.
      return Object.prototype.hasOwnProperty.call(map, level) ? map[level] : 'low';
    }
  }
};
Alert Management
Implement alert lifecycle management:
// Alert management system
//
// `getAlert`, `updateAlert`, `notifyStatusChange`, `notifyEscalation` and
// `getNextEscalationLevel` are called on this object but are not defined in
// this snippet; they must be provided elsewhere.
const alertManager = {
  // Alert lifecycle states. `next` lists the statuses an alert may move to
  // from that state.
  states: {
    open: {
      next: ['acknowledged', 'resolved', 'closed']
    },
    acknowledged: {
      next: ['investigating', 'resolved', 'closed']
    },
    investigating: {
      next: ['resolved', 'closed']
    },
    resolved: {
      next: ['reopened', 'closed']
    },
    reopened: {
      next: ['acknowledged', 'investigating', 'resolved', 'closed']
    },
    closed: {
      next: ['reopened']
    }
  },
  /**
   * Moves an alert to a new lifecycle status, appends the transition to its
   * status history, persists the change and sends a status-change
   * notification.
   *
   * @param {string} alertId - Identifier of the alert to update.
   * @param {string} newStatus - Target status; must be listed in `next` for
   *   the alert's current status.
   * @param {*} user - Recorded as `updatedBy` and in the history entry.
   * @param {string} [comment] - Optional note stored with the history entry.
   * @returns {Promise<object>} The alert merged with the applied update.
   * @throws {Error} If the alert does not exist, its current status is not a
   *   known state, or the transition is not allowed.
   */
  async updateAlertStatus(alertId, newStatus, user, comment) {
    const alert = await this.getAlert(alertId);
    if (!alert) {
      throw new Error(`Alert ${alertId} not found`);
    }

    // Guard the lookup: an alert whose stored status is missing or not one of
    // the known states would otherwise fail with an opaque TypeError.
    const hasKnownState = Object.prototype.hasOwnProperty.call(this.states, alert.status);
    if (!hasKnownState) {
      throw new Error(`Alert ${alertId} has unknown status: ${alert.status}`);
    }
    if (!this.states[alert.status].next.includes(newStatus)) {
      throw new Error(`Cannot transition from ${alert.status} to ${newStatus}`);
    }

    // Capture the instant once so `updatedAt` and the history entry agree.
    const now = new Date();
    const update = {
      status: newStatus,
      updatedAt: now,
      updatedBy: user,
      statusHistory: [
        ...(alert.statusHistory || []),
        {
          from: alert.status,
          to: newStatus,
          // Separate Date instance: mutating one must not change the other.
          timestamp: new Date(now.getTime()),
          user,
          comment
        }
      ]
    };
    await this.updateAlert(alertId, update);
    await this.notifyStatusChange(alert, newStatus, user, comment);
    return { ...alert, ...update };
  },
  /**
   * Raises an alert to its next escalation level, appends the change to its
   * escalation history, persists it and sends an escalation notification.
   *
   * @param {string} alertId - Identifier of the alert to escalate.
   * @param {string} reason - Why the alert is being escalated.
   * @returns {Promise<object>} The alert merged with the applied update, or
   *   the alert unchanged when `getNextEscalationLevel` returns its current
   *   level.
   * @throws {Error} If the alert does not exist.
   */
  async escalateAlert(alertId, reason) {
    const alert = await this.getAlert(alertId);
    if (!alert) {
      throw new Error(`Alert ${alertId} not found`);
    }
    // Determine escalation level
    const currentLevel = alert.level;
    const newLevel = this.getNextEscalationLevel(currentLevel);
    if (newLevel === currentLevel) {
      return alert; // Already at highest level
    }
    const update = {
      level: newLevel,
      escalationHistory: [
        ...(alert.escalationHistory || []),
        {
          from: currentLevel,
          to: newLevel,
          timestamp: new Date(),
          reason
        }
      ]
    };
    await this.updateAlert(alertId, update);
    await this.notifyEscalation(alert, newLevel, reason);
    return { ...alert, ...update };
  }
};
Best Practices
Alert Design
Best practices for alert configuration:
- Define clear thresholds
- Minimize alert noise
- Use appropriate severity
- Include actionable context
Notification Strategy
Effective notification approach:
- Route by severity
- Use multiple channels
- Implement escalation
- Provide clear context
Alert Management
Managing the alert lifecycle:
- Track alert status
- Document resolution
- Analyze patterns
- Continuous improvement
Alert Examples
High CPU Usage Detected
Source: web-server-01
Metric: CPU Usage
Value: 98.5% (Threshold: 95%)
Duration: 5 minutes
API Error Rate Increased
Source: payment-api
Metric: Error Rate
Value: 8.2% (Threshold: 5%)
Duration: 10 minutes
Database Connection Pool Near Limit
Source: db-cluster-main
Metric: Connection Pool Usage
Value: 82% (Threshold: 80%)
Duration: 15 minutes
Common Issues
Alert Noise
Common alert noise problems:
- Too many alerts
- False positives
- Alert storms
- Duplicate notifications
Notification Issues
Notification-related challenges:
- Delivery failures
- Alert fatigue
- Missing context
- Escalation failures