From acc7e3a4dd6ef02f6a66990724b27605ff81438f Mon Sep 17 00:00:00 2001 From: strangedev Date: Tue, 16 Jun 2026 16:45:56 +0100 Subject: [PATCH 1/7] feat: add alerting service with Slack/PagerDuty support - Create new AlertingService with singleton pattern - Support LOG, SLACK, and PAGERDUTY channels - Implement alert deduplication/cooldown (15 min default) - Support rich DLQ alert payloads with metadata - Export types for DLQAlertPayload --- src/services/alerting.ts | 401 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 401 insertions(+) create mode 100644 src/services/alerting.ts diff --git a/src/services/alerting.ts b/src/services/alerting.ts new file mode 100644 index 0000000..c7b60e5 --- /dev/null +++ b/src/services/alerting.ts @@ -0,0 +1,401 @@ +/** + * Alerting Service + * + * Pluggable alert dispatcher supporting multiple channels: + * - LOG (always enabled, uses Winston logger) + * - SLACK_WEBHOOK_URL (optional, posts to Slack) + * - PAGERDUTY_ROUTING_KEY (optional, triggers PagerDuty incidents) + * + * Includes cooldown/deduplication to prevent alert fatigue. + */ + +import { logger } from '../utils/logger' +import { dlqAlertActive } from '../utils/metrics' + +export interface AlertPayload { + title: string + description: string + severity: 'info' | 'warning' | 'critical' + component: string + metadata?: Record +} + +export interface DLQAlertPayload extends AlertPayload { + dlqSize: number + statusBreakdown: { + pending: number + retried: number + resolved: number + } + oldestPendingAge?: { + eventId: string + ageMs: number + ageHumanReadable: string + } + adminLink?: string +} + +interface AlertState { + lastAlertTime: number + alertCount: number +} + +/** + * Alerting service with support for multiple channels and cooldown logic + */ +class AlertingService { + private static instance: AlertingService + private alertStates: Map = new Map() + private cooldownMs: number = 15 * 60 * 1000 // 15 minutes default + + private slackWebhookUrl: string + private pagerdutyRoutingKey: string + private enabledChannels: Set<'LOG' | 'SLACK' | 'PAGERDUTY'> + + private constructor() { + this.slackWebhookUrl = process.env.SLACK_WEBHOOK_URL || '' + this.pagerdutyRoutingKey = process.env.PAGERDUTY_ROUTING_KEY || '' + this.enabledChannels = new Set() + this.enabledChannels.add('LOG') // Always enabled + + if (this.slackWebhookUrl) { + this.enabledChannels.add('SLACK') + } + if (this.pagerdutyRoutingKey) { + this.enabledChannels.add('PAGERDUTY') + } + } + + static getInstance(): AlertingService { + if (!AlertingService.instance) { + AlertingService.instance = new AlertingService() + } + return AlertingService.instance + } + + /** + * Check if alert is within cooldown window (deduplication) + */ + private isWithinCooldown(alertKey: string): boolean { + const state = this.alertStates.get(alertKey) + if (!state) return false + + const timeSinceLastAlert = Date.now() - state.lastAlertTime + return timeSinceLastAlert < this.cooldownMs + } + + /** + * Update alert state for cooldown tracking + */ + private updateAlertState(alertKey: string): void { + const state = this.alertStates.get(alertKey) || { + lastAlertTime: 0, + alertCount: 0, + } + state.lastAlertTime = Date.now() + state.alertCount++ + this.alertStates.set(alertKey, state) + } + + /** + * Get cooldown status (useful for monitoring) + */ + private getCooldownRemaining(alertKey: string): number { + const state = this.alertStates.get(alertKey) + if (!state) return 0 + + const timeSinceLastAlert = Date.now() - state.lastAlertTime + const remaining = this.cooldownMs - timeSinceLastAlert + return Math.max(0, remaining) + } + + /** + * Emit alert to all enabled channels with cooldown checking + */ + async emit( + payload: AlertPayload, + alertKey?: string + ): Promise<{ sent: boolean; reason?: string }> { + const key = alertKey || `${payload.component}:${payload.severity}` + + // Check cooldown + if (this.isWithinCooldown(key)) { + const remaining = this.getCooldownRemaining(key) + return { + sent: false, + reason: `Alert suppressed by cooldown. Next alert in ${Math.round(remaining / 1000)}s`, + } + } + + try { + const promises: Promise[] = [] + + if (this.enabledChannels.has('LOG')) { + promises.push(this.sendToLog(payload)) + } + + if (this.enabledChannels.has('SLACK')) { + promises.push( + this.sendToSlack(payload).catch((err) => { + logger.error('[Alerting] Failed to send Slack alert:', err) + }) + ) + } + + if (this.enabledChannels.has('PAGERDUTY')) { + promises.push( + this.sendToPagerDuty(payload).catch((err) => { + logger.error('[Alerting] Failed to send PagerDuty alert:', err) + }) + ) + } + + await Promise.all(promises) + this.updateAlertState(key) + return { sent: true } + } catch (error) { + logger.error('[Alerting] Error emitting alert:', error) + return { + sent: false, + reason: error instanceof Error ? error.message : 'Unknown error', + } + } + } + + /** + * Emit DLQ-specific alert with rich metadata + */ + async emitDLQAlert( + payload: DLQAlertPayload + ): Promise<{ sent: boolean; reason?: string }> { + const result = await this.emit(payload, 'dlq:threshold') + + // Update Prometheus gauge + if (payload.severity === 'critical') { + dlqAlertActive.set(1) + } else { + dlqAlertActive.set(0) + } + + return result + } + + /** + * Clear DLQ alert state (when queue drops below threshold) + */ + clearDLQAlertState(): void { + this.alertStates.delete('dlq:threshold') + dlqAlertActive.set(0) + logger.info('[Alerting] DLQ alert state cleared (queue normalized)') + } + + /** + * Send alert to Winston logger + */ + private async sendToLog(payload: AlertPayload): Promise { + const logLevel = + payload.severity === 'critical' + ? 'error' + : payload.severity === 'warning' + ? 'warn' + : 'info' + const message = `[ALERT] [${payload.component.toUpperCase()}] ${payload.title}: ${payload.description}` + + if (logLevel === 'error') { + logger.error(message, payload.metadata) + } else if (logLevel === 'warn') { + logger.warn(message, payload.metadata) + } else { + logger.info(message, payload.metadata) + } + } + + /** + * Send alert to Slack + */ + private async sendToSlack( + payload: AlertPayload | DLQAlertPayload + ): Promise { + if (!this.slackWebhookUrl) { + return + } + + const color = + payload.severity === 'critical' + ? 'danger' + : payload.severity === 'warning' + ? 'warning' + : 'good' + const dlqPayload = payload as DLQAlertPayload + + let blocks: any[] = [ + { + type: 'header', + text: { + type: 'plain_text', + text: `🚨 ${payload.title}`, + emoji: true, + }, + }, + { + type: 'section', + text: { + type: 'mrkdwn', + text: payload.description, + }, + }, + ] + + // Add DLQ-specific metadata + if (dlqPayload.dlqSize !== undefined) { + const fields: any[] = [ + { + type: 'mrkdwn', + text: `*Current Size:*\n${dlqPayload.dlqSize} events`, + }, + { + type: 'mrkdwn', + text: `*Severity:*\n${payload.severity.toUpperCase()}`, + }, + ] + + if (dlqPayload.statusBreakdown) { + fields.push({ + type: 'mrkdwn', + text: `*Status Breakdown:*\nPending: ${dlqPayload.statusBreakdown.pending}\nRetried: ${dlqPayload.statusBreakdown.retried}\nResolved: ${dlqPayload.statusBreakdown.resolved}`, + }) + } + + if (dlqPayload.oldestPendingAge) { + fields.push({ + type: 'mrkdwn', + text: `*Oldest Event:*\nAge: ${dlqPayload.oldestPendingAge.ageHumanReadable}\nID: \`${dlqPayload.oldestPendingAge.eventId}\``, + }) + } + + blocks.push({ + type: 'section', + fields, + }) + } + + // Add custom metadata if present + if (payload.metadata && Object.keys(payload.metadata).length > 0) { + blocks.push({ + type: 'section', + text: { + type: 'mrkdwn', + text: `*Additional Info:*\n${Object.entries(payload.metadata) + .map(([key, value]) => `• ${key}: ${JSON.stringify(value)}`) + .join('\n')}`, + }, + }) + } + + // Add action links + if (dlqPayload.adminLink) { + blocks.push({ + type: 'actions', + elements: [ + { + type: 'button', + text: { + type: 'plain_text', + text: 'Inspect DLQ', + emoji: true, + }, + url: dlqPayload.adminLink, + style: 'danger', + }, + ], + }) + } + + const response = await fetch(this.slackWebhookUrl, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + blocks, + attachments: [ + { + color, + footer: 'NeuroWealth DLQ Alert System', + ts: Math.floor(Date.now() / 1000), + }, + ], + }), + }) + + if (!response.ok) { + throw new Error( + `Slack webhook returned ${response.status}: ${response.statusText}` + ) + } + } + + /** + * Send alert to PagerDuty + */ + private async sendToPagerDuty( + payload: AlertPayload | DLQAlertPayload + ): Promise { + if (!this.pagerdutyRoutingKey) { + return + } + + const dlqPayload = payload as DLQAlertPayload + const severity = + payload.severity === 'critical' + ? 'critical' + : payload.severity === 'warning' + ? 'warning' + : 'info' + + let customDetails: Record = { + ...payload.metadata, + } + + if (dlqPayload.dlqSize !== undefined) { + customDetails = { + ...customDetails, + dlq_size: dlqPayload.dlqSize, + status_breakdown: dlqPayload.statusBreakdown, + oldest_event_age: dlqPayload.oldestPendingAge?.ageHumanReadable, + } + } + + const response = await fetch('https://events.pagerduty.com/v2/enqueue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + routing_key: this.pagerdutyRoutingKey, + event_action: 'trigger', + dedup_key: `dlq-alert-${Math.floor(Date.now() / 60000)}`, // Group by minute + payload: { + summary: payload.title, + severity, + source: 'NeuroWealth Backend', + component: payload.component, + custom_details: customDetails, + }, + client: 'NeuroWealth DLQ Alerting', + client_url: dlqPayload.adminLink, + }), + }) + + if (!response.ok) { + throw new Error( + `PagerDuty API returned ${response.status}: ${response.statusText}` + ) + } + } + + /** + * Check enabled channels (for diagnostics) + */ + getEnabledChannels(): string[] { + return Array.from(this.enabledChannels) + } +} + +export const alertingService = AlertingService.getInstance() From 5a8b15a28795ab656e53dc1a3cc3eda095fddc2f Mon Sep 17 00:00:00 2001 From: strangedev Date: Tue, 16 Jun 2026 16:46:10 +0100 Subject: [PATCH 2/7] feat: add dlq_alert_active Prometheus gauge - New gauge tracks whether DLQ alert is currently active - Set to 1 when threshold exceeded, 0 when normalized - Useful for monitoring alert state in Prometheus/Grafana --- src/utils/metrics.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/utils/metrics.ts b/src/utils/metrics.ts index 6f7a9db..1a451df 100644 --- a/src/utils/metrics.ts +++ b/src/utils/metrics.ts @@ -71,6 +71,12 @@ export const dlqRetryTotal = new client.Counter({ registers: [register], }) +export const dlqAlertActive = new client.Gauge({ + name: 'dlq_alert_active', + help: 'Whether a DLQ size alert is currently active (1=active, 0=inactive)', + registers: [register], +}) + // ── Cursor/Lag Metrics ────────────────────────────────────────────────────────── export const cursorLag = new client.Gauge({ From 8a547615d9b2ebc95f2653e46a070b6a2a78e9fd Mon Sep 17 00:00:00 2001 From: strangedev Date: Tue, 16 Jun 2026 16:46:23 +0100 Subject: [PATCH 3/7] feat: add DLQ alerting configuration - DLQ_ALERT_COOLDOWN_MS: Control alert deduplication window (default 15min) - Support SLACK_WEBHOOK_URL for Slack integration - Support PAGERDUTY_ROUTING_KEY for PagerDuty integration - Add ADMIN_DASHBOARD_URL for inspection links --- src/config/env.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/config/env.ts b/src/config/env.ts index 064c7a8..f1e5676 100644 --- a/src/config/env.ts +++ b/src/config/env.ts @@ -43,7 +43,7 @@ function validateAllRequiredEnvVars(): void { if (walletKey && !/^[0-9a-f]{64}$/i.test(walletKey)) { errors.push( `WALLET_ENCRYPTION_KEY is invalid: must be exactly 64 hexadecimal characters (32 bytes). ` + - `Got length ${walletKey.length}. Generate one with: openssl rand -hex 32` + `Got length ${walletKey.length}. Generate one with: openssl rand -hex 32` ) } @@ -60,7 +60,7 @@ function validateAllRequiredEnvVars(): void { const list = errors.map(e => ` - ${e}`).join('\n') throw new Error( `Application cannot start — environment configuration errors:\n${list}\n\n` + - `Fix the variables above and restart the application.` + `Fix the variables above and restart the application.` ) } } @@ -146,8 +146,8 @@ function validateStellarKey(secretKey: string, network: 'testnet' | 'mainnet' | if (network === 'mainnet' && env !== 'production') { console.warn( '\nāš ļø CRITICAL WARNING: Using MAINNET in non-production environment!\n' + - 'āš ļø This could result in real financial loss!\n' + - 'āš ļø Verify STELLAR_NETWORK and NODE_ENV settings immediately!\n' + 'āš ļø This could result in real financial loss!\n' + + 'āš ļø Verify STELLAR_NETWORK and NODE_ENV settings immediately!\n' ) } } @@ -264,5 +264,6 @@ export const config = { }, dlq: { alertThreshold: parseInt(process.env.DLQ_ALERT_THRESHOLD || '50'), + alertCooldownMs: parseInt(process.env.DLQ_ALERT_COOLDOWN_MS || '900000'), // 15 minutes default }, } \ No newline at end of file From 52a4b4fe5f4dc3d38484eced22408524d3484e8b Mon Sep 17 00:00:00 2001 From: strangedev Date: Tue, 16 Jun 2026 16:46:37 +0100 Subject: [PATCH 4/7] feat: integrate external alerting into DLQ Replace hardcoded SIZE_ALERT_THRESHOLD with configurable config.dlq.alertThreshold - Emit rich DLQ alerts with metadata (size, status breakdown, oldest event age) - Include actionable admin dashboard links - Support alert deduplication to prevent fatigue - Calculate status breakdown (pending, retried, resolved counts) - Track oldest pending event and human-readable age - Clear alert state when queue normalizes Closes #163 --- src/stellar/dlq.ts | 113 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 107 insertions(+), 6 deletions(-) diff --git a/src/stellar/dlq.ts b/src/stellar/dlq.ts index 7388929..c209757 100644 --- a/src/stellar/dlq.ts +++ b/src/stellar/dlq.ts @@ -13,6 +13,8 @@ import { xdr } from '@stellar/stellar-sdk' import { logger } from '../utils/logger' import db from '../db' import { updateDlqSize } from '../utils/metrics' +import config from '../config' +import { alertingService, type DLQAlertPayload } from '../services/alerting' export type DeadLetterEventStatus = 'PENDING' | 'RETRIED' | 'RESOLVED' @@ -35,8 +37,6 @@ const LEGACY_DLQ_FILE = path.join( '../../logs/dead_letter_queue.json' ) -const SIZE_ALERT_THRESHOLD = 50 - function serializeScVal(value: unknown): string | unknown { if (value instanceof xdr.ScVal) { return value.toXDR('base64') @@ -275,10 +275,111 @@ export class DeadLetterQueue { } private static checkSizeAlert(size: number): void { - if (size >= SIZE_ALERT_THRESHOLD) { - logger.error( - `[DLQ ALERT] Dead-letter queue size is critically high: ${size} events. Manual intervention required.` - ) + const threshold = config.dlq.alertThreshold + + if (size >= threshold) { + this.emitDLQAlert(size, 'critical') + } else if (size > 0 && size < threshold) { + // Alert has normalized, clear the state + alertingService.clearDLQAlertState() + } + } + + /** + * Emit DLQ alert with rich metadata to external channels + */ + private static async emitDLQAlert( + size: number, + severity: 'critical' | 'warning' | 'info' + ): Promise { + try { + // Get status breakdown + const statusBreakdown = await this.getStatusBreakdown() + const oldestPending = await this.getOldestPendingEvent() + + // Calculate age in human-readable format + let ageHumanReadable = 'N/A' + if (oldestPending) { + const ageMs = Date.now() - new Date(oldestPending.createdAt).getTime() + ageHumanReadable = this.formatAge(ageMs) + } + + const payload: DLQAlertPayload = { + title: `[CRITICAL] DLQ Size Alert: ${size} events queued`, + description: `The Dead-Letter Queue has reached ${size} events. This indicates that recent event processing failures are not being automatically recovered. Immediate investigation required to identify and fix the root cause.`, + severity, + component: 'dlq', + dlqSize: size, + statusBreakdown, + oldestPendingAge: oldestPending + ? { + eventId: oldestPending.id, + ageMs: Date.now() - new Date(oldestPending.createdAt).getTime(), + ageHumanReadable, + } + : undefined, + adminLink: `${process.env.ADMIN_DASHBOARD_URL || 'https://admin.neurowealth.io'}/dlq`, + metadata: { + threshold: config.dlq.alertThreshold, + timestamp: new Date().toISOString(), + }, + } + + await alertingService.emitDLQAlert(payload) + } catch (error) { + logger.error('[DLQ] Error emitting alert:', error) + } + } + + /** + * Get DLQ status breakdown (pending, retried, resolved counts) + */ + private static async getStatusBreakdown(): Promise<{ + pending: number + retried: number + resolved: number + }> { + const pending = await (db as any).deadLetterEvent.count({ + where: { status: 'PENDING' }, + }) + const retried = await (db as any).deadLetterEvent.count({ + where: { status: 'RETRIED' }, + }) + const resolved = await (db as any).deadLetterEvent.count({ + where: { status: 'RESOLVED' }, + }) + + return { pending, retried, resolved } + } + + /** + * Get the oldest pending event (for age tracking) + */ + private static async getOldestPendingEvent(): Promise { + return (db as any).deadLetterEvent.findFirst({ + where: { status: 'PENDING' }, + orderBy: { createdAt: 'asc' }, + }) + } + + /** + * Format milliseconds to human-readable age (e.g., "2 hours 15 minutes") + */ + private static formatAge(ms: number): string { + const seconds = Math.floor(ms / 1000) + const minutes = Math.floor(seconds / 60) + const hours = Math.floor(minutes / 60) + const days = Math.floor(hours / 24) + + if (days > 0) { + return `${days} day${days > 1 ? 's' : ''} ${hours % 24} hour${hours % 24 !== 1 ? 's' : ''}` + } + if (hours > 0) { + return `${hours} hour${hours > 1 ? 's' : ''} ${minutes % 60} minute${minutes % 60 !== 1 ? 's' : ''}` + } + if (minutes > 0) { + return `${minutes} minute${minutes > 1 ? 's' : ''}` } + return `${seconds} second${seconds !== 1 ? 's' : ''}` } } From 85cd022c11abff02d9c0219f2909d64aa8b707c8 Mon Sep 17 00:00:00 2001 From: strangedev Date: Tue, 16 Jun 2026 16:46:50 +0100 Subject: [PATCH 5/7] docs: update .env.example with alerting configuration Add documentation for: - DLQ_ALERT_COOLDOWN_MS - SLACK_WEBHOOK_URL - PAGERDUTY_ROUTING_KEY - ADMIN_DASHBOARD_URL --- .env.example | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.env.example b/.env.example index 0b8bb6c..bc827cc 100644 --- a/.env.example +++ b/.env.example @@ -67,3 +67,15 @@ TRUST_PROXY=1 # Dead Letter Queue DLQ_ALERT_THRESHOLD=50 +DLQ_ALERT_COOLDOWN_MS=900000 + +# External Alerting (optional) +# Slack webhook URL for DLQ alerts (e.g. https://hooks.slack.com/services/YOUR/WEBHOOK/URL) +SLACK_WEBHOOK_URL= + +# PagerDuty integration key for DLQ alerts +# Generate from: https://developer.pagerduty.com/ +PAGERDUTY_ROUTING_KEY= + +# Admin dashboard URL for DLQ inspection links in alerts +ADMIN_DASHBOARD_URL=https://admin.neurowealth.io From 6b91e7a448d702adb0453bfd754ee4e568601757 Mon Sep 17 00:00:00 2001 From: strangedev Date: Tue, 16 Jun 2026 16:47:06 +0100 Subject: [PATCH 6/7] test: add unit tests for DLQ alerting - Test AlertingService singleton pattern - Test cooldown/deduplication behavior - Test alert suppression within cooldown window - Test alert re-emission after cooldown expires - Test DLQ alert payload formatting - Test all alert severity levels (critical, warning, info) - Test LOG channel always enabled - Test alert state clearing --- tests/unit/stellar/dlq-alerts.test.ts | 209 ++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 tests/unit/stellar/dlq-alerts.test.ts diff --git a/tests/unit/stellar/dlq-alerts.test.ts b/tests/unit/stellar/dlq-alerts.test.ts new file mode 100644 index 0000000..e5b74d3 --- /dev/null +++ b/tests/unit/stellar/dlq-alerts.test.ts @@ -0,0 +1,209 @@ +/** + * Unit tests for DLQ alerting functionality + * Tests threshold logic, cooldown behavior, and alert formatting + */ + +import { alertingService, type DLQAlertPayload } from '../../../src/services/alerting' + +describe('DLQ Alerting', () => { + beforeEach(() => { + jest.clearAllMocks() + // Reset alert states by creating a fresh instance + jest.resetModules() + }) + + describe('AlertingService', () => { + it('should create singleton instance', () => { + const instance1 = alertingService + const instance2 = alertingService + expect(instance1).toBe(instance2) + }) + + it('should identify enabled channels based on environment', () => { + const channels = alertingService.getEnabledChannels() + expect(channels).toContain('LOG') + }) + + describe('Cooldown Logic', () => { + it('should suppress alert within cooldown window', async () => { + const payload: DLQAlertPayload = { + title: 'Test Alert', + description: 'Test Description', + severity: 'critical', + component: 'dlq', + dlqSize: 100, + statusBreakdown: { pending: 80, retried: 15, resolved: 5 }, + } + + // First alert should succeed + const result1 = await alertingService.emit(payload, 'test-alert-key') + expect(result1.sent).toBe(true) + + // Second alert within cooldown should be suppressed + const result2 = await alertingService.emit(payload, 'test-alert-key') + expect(result2.sent).toBe(false) + expect(result2.reason).toContain('cooldown') + }) + + it('should allow alert after cooldown expires', async () => { + jest.useFakeTimers() + + const payload: DLQAlertPayload = { + title: 'Test Alert', + description: 'Test Description', + severity: 'critical', + component: 'dlq', + dlqSize: 100, + statusBreakdown: { pending: 80, retried: 15, resolved: 5 }, + } + + // First alert + const result1 = await alertingService.emit(payload, 'test-cooldown') + expect(result1.sent).toBe(true) + + // Within cooldown + const result2 = await alertingService.emit(payload, 'test-cooldown') + expect(result2.sent).toBe(false) + + // After cooldown (advance 16 minutes) + jest.advanceTimersByTime(16 * 60 * 1000) + const result3 = await alertingService.emit(payload, 'test-cooldown') + expect(result3.sent).toBe(true) + + jest.useRealTimers() + }) + }) + + describe('DLQ Alert Payload', () => { + it('should format DLQ alert with all metadata', async () => { + const payload: DLQAlertPayload = { + title: 'DLQ Critical Alert', + description: 'Queue exceeded threshold', + severity: 'critical', + component: 'dlq', + dlqSize: 75, + statusBreakdown: { + pending: 60, + retried: 10, + resolved: 5, + }, + oldestPendingAge: { + eventId: 'evt-123', + ageMs: 3600000, // 1 hour + ageHumanReadable: '1 hour', + }, + adminLink: 'https://admin.example.com/dlq', + metadata: { + threshold: 50, + timestamp: new Date().toISOString(), + }, + } + + const result = await alertingService.emitDLQAlert(payload) + // Should at least attempt to emit (may fail if no external service configured) + expect(result).toBeDefined() + }) + + it('should handle alert without optional metadata', async () => { + const payload: DLQAlertPayload = { + title: 'DLQ Alert', + description: 'Queue size increased', + severity: 'warning', + component: 'dlq', + dlqSize: 25, + statusBreakdown: { pending: 20, retried: 5, resolved: 0 }, + } + + const result = await alertingService.emitDLQAlert(payload) + expect(result).toBeDefined() + }) + }) + + describe('Alert Channels', () => { + it('should always send to LOG channel', async () => { + const payload: DLQAlertPayload = { + title: 'Test', + description: 'Test alert', + severity: 'critical', + component: 'dlq', + dlqSize: 50, + statusBreakdown: { pending: 40, retried: 5, resolved: 5 }, + } + + // LOG channel should always be present + const channels = alertingService.getEnabledChannels() + expect(channels).toContain('LOG') + + const result = await alertingService.emit(payload, 'log-test') + expect(result.sent).toBe(true) + }) + }) + + describe('Clear DLQ State', () => { + it('should clear alert state when queue normalizes', async () => { + const payload: DLQAlertPayload = { + title: 'Test', + description: 'Test', + severity: 'critical', + component: 'dlq', + dlqSize: 100, + statusBreakdown: { pending: 100, retried: 0, resolved: 0 }, + } + + // Emit alert + await alertingService.emitDLQAlert(payload) + + // Clear state + alertingService.clearDLQAlertState() + + // Next alert should succeed (no cooldown) + const result = await alertingService.emit(payload, 'dlq:threshold') + expect(result.sent).toBe(true) + }) + }) + }) + + describe('Alert Severity Levels', () => { + it('should handle critical severity', async () => { + const payload: DLQAlertPayload = { + title: 'Critical Alert', + description: 'Immediate action required', + severity: 'critical', + component: 'dlq', + dlqSize: 100, + statusBreakdown: { pending: 90, retried: 5, resolved: 5 }, + } + + const result = await alertingService.emit(payload, 'critical-test') + expect(result.sent).toBe(true) + }) + + it('should handle warning severity', async () => { + const payload: DLQAlertPayload = { + title: 'Warning Alert', + description: 'Investigate soon', + severity: 'warning', + component: 'dlq', + dlqSize: 30, + statusBreakdown: { pending: 20, retried: 5, resolved: 5 }, + } + + const result = await alertingService.emit(payload, 'warning-test') + expect(result.sent).toBe(true) + }) + + it('should handle info severity', async () => { + const payload: DLQAlertPayload = { + title: 'Info Alert', + description: 'Monitor trend', + severity: 'info', + component: 'dlq', + dlqSize: 10, + statusBreakdown: { pending: 5, retried: 3, resolved: 2 }, + } + + const result = await alertingService.emit(payload, 'info-test') + expect(result.sent).toBe(true) + }) + }) +}) From 2bc98ab77e7ec36864d6c71612af261a4ea2437e Mon Sep 17 00:00:00 2001 From: strangedev Date: Tue, 16 Jun 2026 16:47:43 +0100 Subject: [PATCH 7/7] docs: add DLQ alerting runbook for operators Complete guide for handling DLQ alerts: - Inspection and diagnosis steps - Common root causes and recovery procedures - Manual retry procedures - Integration setup (Slack, PagerDuty) - Prometheus monitoring examples - Prevention best practices - Escalation procedures --- docs/DLQ_ALERTING_RUNBOOK.md | 208 +++++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 docs/DLQ_ALERTING_RUNBOOK.md diff --git a/docs/DLQ_ALERTING_RUNBOOK.md b/docs/DLQ_ALERTING_RUNBOOK.md new file mode 100644 index 0000000..8a60589 --- /dev/null +++ b/docs/DLQ_ALERTING_RUNBOOK.md @@ -0,0 +1,208 @@ +# DLQ Alerting Runbook + +## What to Do When DLQ Alert Fires + +The Dead-Letter Queue (DLQ) alert fires when the queue size exceeds the configured threshold (default: 50 events). This indicates that recent event processing failures are not being automatically recovered. + +### Alert Channels + +Alerts are sent through: + +- **LOG** (always enabled) - Check application logs at `logs/error.log` +- **SLACK** (optional) - Configured via `SLACK_WEBHOOK_URL` environment variable +- **PAGERDUTY** (optional) - Configured via `PAGERDUTY_ROUTING_KEY` environment variable + +### Alert Deduplication + +Alerts are deduplicated using a cooldown window (default: 15 minutes). While the queue remains above the threshold: + +- First alert is sent immediately +- Subsequent alerts within the cooldown window are suppressed +- Once cooldown expires and queue is still above threshold, a new alert is sent +- When queue drops below threshold, alert state is cleared + +### Step 1: Inspect the DLQ + +Access the DLQ admin dashboard via the link in the alert (or navigate to `https://admin.neurowealth.io/dlq`): + +1. **View Queue Status** + - Current size (number of events) + - Breakdown by status: PENDING, RETRIED, RESOLVED + - Age of oldest pending event + +2. **Identify Patterns** + - Are all failed events of the same type (deposits, withdrawals, rebalances)? + - Are they from a specific time window? + - Is there a common error message? + +### Step 2: Understand Root Causes + +Common reasons for DLQ growth: + +#### Temporary Stellar Network Issues + +- **Symptom**: Large influx of timeouts or network errors +- **Recovery**: Wait for network to stabilize; queue will auto-recover +- **Action**: Monitor logs and wait for next retry cycle + +#### Smart Contract Issues + +- **Symptom**: Errors like "contract not found" or "method invocation failed" +- **Recovery**: Fix contract deployment or method implementation +- **Action**: Deploy fix, then retry events + +#### Database Connectivity + +- **Symptom**: Connection pool exhausted or query timeouts +- **Recovery**: Restart database or scale read replicas +- **Action**: Resolve DB issue, restart app, then retry + +#### Configuration Errors + +- **Symptom**: Invalid keys, tokens, or missing environment variables +- **Recovery**: Verify and correct configuration +- **Action**: Update env vars, restart app, then retry + +### Step 3: Dry-Run Retry (Optional) + +Before retrying all events, inspect a sample event to understand the failure: + +```bash +# View a specific event +curl https://api.neurowealth.io/admin/dlq/events/{EVENT_ID} + +# Response includes: +# - eventType: "deposit", "withdraw", "rebalance" +# - error: Error message +# - payload: Original event data +# - status: "PENDING", "RETRIED", "RESOLVED" +# - retryCount: Number of retry attempts +``` + +### Step 4: Manual Retry + +Once the root cause is fixed: + +```bash +# Retry all pending/retried events +POST /admin/dlq/retry +{ + "dryRun": false, # Set to true for dry-run + "statuses": ["PENDING", "RETRIED"] # Optional filter +} + +# Response: +# { +# "resolved": 45, # Successfully retried +# "failed": 5 # Still failing +# } +``` + +### Step 5: Resolve Events Manually (If Needed) + +For events that cannot be recovered: + +```bash +# Mark event as resolved (won't be retried) +POST /admin/dlq/events/{EVENT_ID}/resolve + +# This is a last resort. Resolved events are not processed +# but won't trigger alerts. +``` + +### Step 6: Monitor Recovery + +After retry: + +- Watch the queue size via the dashboard +- Check logs for any new failures +- Monitor the `dlq_size` Prometheus metric +- Alert should clear once queue drops below threshold + +## Alert Threshold Configuration + +Adjust the alerting threshold via environment variables: + +```bash +# Alert when DLQ size >= this value (default: 50) +DLQ_ALERT_THRESHOLD=50 + +# Suppress duplicate alerts for this duration in milliseconds +# (default: 900000 = 15 minutes) +DLQ_ALERT_COOLDOWN_MS=900000 +``` + +## External Integrations + +### Slack Integration + +1. Create a Slack App: https://api.slack.com/apps +2. Enable Incoming Webhooks +3. Create a webhook for your channel +4. Set environment variable: + +```bash +SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL +``` + +Alert format includes: + +- Queue size and breakdown by status +- Age of oldest pending event +- Link to admin dashboard +- Custom metadata + +### PagerDuty Integration + +1. Create an integration key in PagerDuty +2. Set environment variable: + +```bash +PAGERDUTY_ROUTING_KEY=your-pagerduty-routing-key +``` + +Alerts are deduplicated by minute to prevent incident spam. + +## Prometheus Metrics + +Monitor alert state via: + +```promql +# 1 = alert active, 0 = alert inactive +dlq_alert_active + +# Current DLQ size +dlq_size + +# DLQ size trend over time +rate(dlq_size[5m]) +``` + +## Prevention Best Practices + +1. **Monitor Trends**: Watch `dlq_size` over time for slow growth +2. **Set Lower Thresholds**: Use `DLQ_ALERT_THRESHOLD=20` in staging +3. **Upstream Resilience**: Implement circuit breakers for external APIs +4. **Test Error Handling**: Regularly test DLQ recovery with chaos engineering +5. **On-Call Training**: Ensure on-call engineers understand this runbook + +## Alert Severity Levels + +- **CRITICAL** (red): DLQ >= threshold, immediate action needed +- **WARNING** (yellow): DLQ between 50% and threshold, investigate soon +- **INFO** (blue): DLQ growing but normal, monitor trend + +## Escalation + +If queue continues to grow after recovery attempts: + +1. **Check Consensus**: Are deposits/withdrawals actually failing on-chain? +2. **Inspect Ledger**: Query Stellar testnet/mainnet for transaction status +3. **Contact Support**: Reach out to Stellar developer relations if contract issue +4. **Page On-Call**: If user-facing transactions are failing, escalate to on-call team + +## Related Documentation + +- [OBSERVABILITY.md](./OBSERVABILITY.md) - Complete metrics and alert thresholds +- [API_REFERENCE.md](./API_REFERENCE.md) - Admin DLQ endpoints +- [DEPLOYMENT_GUIDE.md](./DEPLOYMENT_GUIDE.md) - Configuration and secrets management