From acc7e3a4dd6ef02f6a66990724b27605ff81438f Mon Sep 17 00:00:00 2001
From: strangedev <strngecloud.iv@gmail.com>
Date: Tue, 16 Jun 2026 16:45:56 +0100
Subject: [PATCH 1/7] feat: add alerting service with Slack/PagerDuty support

- Create new AlertingService with singleton pattern
- Support LOG, SLACK, and PAGERDUTY channels
- Implement alert deduplication/cooldown (15 min default)
- Support rich DLQ alert payloads with metadata
- Export types for DLQAlertPayload
---
 src/services/alerting.ts | 401 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 401 insertions(+)
 create mode 100644 src/services/alerting.ts

diff --git a/src/services/alerting.ts b/src/services/alerting.ts
new file mode 100644
index 0000000..c7b60e5
--- /dev/null
+++ b/src/services/alerting.ts
@@ -0,0 +1,401 @@
+/**
+ * Alerting Service
+ *
+ * Pluggable alert dispatcher supporting multiple channels:
+ * - LOG (always enabled, uses Winston logger)
+ * - SLACK_WEBHOOK_URL (optional, posts to Slack)
+ * - PAGERDUTY_ROUTING_KEY (optional, triggers PagerDuty incidents)
+ *
+ * Includes cooldown/deduplication to prevent alert fatigue.
+ */
+
+import { logger } from '../utils/logger'
+import { dlqAlertActive } from '../utils/metrics'
+
+export interface AlertPayload {
+  title: string
+  description: string
+  severity: 'info' | 'warning' | 'critical'
+  component: string
+  metadata?: Record<string, any>
+}
+
+export interface DLQAlertPayload extends AlertPayload {
+  dlqSize: number
+  statusBreakdown: {
+    pending: number
+    retried: number
+    resolved: number
+  }
+  oldestPendingAge?: {
+    eventId: string
+    ageMs: number
+    ageHumanReadable: string
+  }
+  adminLink?: string
+}
+
+interface AlertState {
+  lastAlertTime: number
+  alertCount: number
+}
+
+/**
+ * Alerting service with support for multiple channels and cooldown logic
+ */
+class AlertingService {
+  private static instance: AlertingService
+  private alertStates: Map<string, AlertState> = new Map()
+  private cooldownMs: number = 15 * 60 * 1000 // 15 minutes default
+
+  private slackWebhookUrl: string
+  private pagerdutyRoutingKey: string
+  private enabledChannels: Set<'LOG' | 'SLACK' | 'PAGERDUTY'>
+
+  private constructor() {
+    this.slackWebhookUrl = process.env.SLACK_WEBHOOK_URL || ''
+    this.pagerdutyRoutingKey = process.env.PAGERDUTY_ROUTING_KEY || ''
+    this.enabledChannels = new Set()
+    this.enabledChannels.add('LOG') // Always enabled
+
+    if (this.slackWebhookUrl) {
+      this.enabledChannels.add('SLACK')
+    }
+    if (this.pagerdutyRoutingKey) {
+      this.enabledChannels.add('PAGERDUTY')
+    }
+  }
+
+  static getInstance(): AlertingService {
+    if (!AlertingService.instance) {
+      AlertingService.instance = new AlertingService()
+    }
+    return AlertingService.instance
+  }
+
+  /**
+   * Check if alert is within cooldown window (deduplication)
+   */
+  private isWithinCooldown(alertKey: string): boolean {
+    const state = this.alertStates.get(alertKey)
+    if (!state) return false
+
+    const timeSinceLastAlert = Date.now() - state.lastAlertTime
+    return timeSinceLastAlert < this.cooldownMs
+  }
+
+  /**
+   * Update alert state for cooldown tracking
+   */
+  private updateAlertState(alertKey: string): void {
+    const state = this.alertStates.get(alertKey) || {
+      lastAlertTime: 0,
+      alertCount: 0,
+    }
+    state.lastAlertTime = Date.now()
+    state.alertCount++
+    this.alertStates.set(alertKey, state)
+  }
+
+  /**
+   * Get cooldown status (useful for monitoring)
+   */
+  private getCooldownRemaining(alertKey: string): number {
+    const state = this.alertStates.get(alertKey)
+    if (!state) return 0
+
+    const timeSinceLastAlert = Date.now() - state.lastAlertTime
+    const remaining = this.cooldownMs - timeSinceLastAlert
+    return Math.max(0, remaining)
+  }
+
+  /**
+   * Emit alert to all enabled channels with cooldown checking
+   */
+  async emit(
+    payload: AlertPayload,
+    alertKey?: string
+  ): Promise<{ sent: boolean; reason?: string }> {
+    const key = alertKey || `${payload.component}:${payload.severity}`
+
+    // Check cooldown
+    if (this.isWithinCooldown(key)) {
+      const remaining = this.getCooldownRemaining(key)
+      return {
+        sent: false,
+        reason: `Alert suppressed by cooldown. Next alert in ${Math.round(remaining / 1000)}s`,
+      }
+    }
+
+    try {
+      const promises: Promise<void>[] = []
+
+      if (this.enabledChannels.has('LOG')) {
+        promises.push(this.sendToLog(payload))
+      }
+
+      if (this.enabledChannels.has('SLACK')) {
+        promises.push(
+          this.sendToSlack(payload).catch((err) => {
+            logger.error('[Alerting] Failed to send Slack alert:', err)
+          })
+        )
+      }
+
+      if (this.enabledChannels.has('PAGERDUTY')) {
+        promises.push(
+          this.sendToPagerDuty(payload).catch((err) => {
+            logger.error('[Alerting] Failed to send PagerDuty alert:', err)
+          })
+        )
+      }
+
+      await Promise.all(promises)
+      this.updateAlertState(key)
+      return { sent: true }
+    } catch (error) {
+      logger.error('[Alerting] Error emitting alert:', error)
+      return {
+        sent: false,
+        reason: error instanceof Error ? error.message : 'Unknown error',
+      }
+    }
+  }
+
+  /**
+   * Emit DLQ-specific alert with rich metadata
+   */
+  async emitDLQAlert(
+    payload: DLQAlertPayload
+  ): Promise<{ sent: boolean; reason?: string }> {
+    const result = await this.emit(payload, 'dlq:threshold')
+
+    // Update Prometheus gauge
+    if (payload.severity === 'critical') {
+      dlqAlertActive.set(1)
+    } else {
+      dlqAlertActive.set(0)
+    }
+
+    return result
+  }
+
+  /**
+   * Clear DLQ alert state (when queue drops below threshold)
+   */
+  clearDLQAlertState(): void {
+    this.alertStates.delete('dlq:threshold')
+    dlqAlertActive.set(0)
+    logger.info('[Alerting] DLQ alert state cleared (queue normalized)')
+  }
+
+  /**
+   * Send alert to Winston logger
+   */
+  private async sendToLog(payload: AlertPayload): Promise<void> {
+    const logLevel =
+      payload.severity === 'critical'
+        ? 'error'
+        : payload.severity === 'warning'
+          ? 'warn'
+          : 'info'
+    const message = `[ALERT] [${payload.component.toUpperCase()}] ${payload.title}: ${payload.description}`
+
+    if (logLevel === 'error') {
+      logger.error(message, payload.metadata)
+    } else if (logLevel === 'warn') {
+      logger.warn(message, payload.metadata)
+    } else {
+      logger.info(message, payload.metadata)
+    }
+  }
+
+  /**
+   * Send alert to Slack
+   */
+  private async sendToSlack(
+    payload: AlertPayload | DLQAlertPayload
+  ): Promise<void> {
+    if (!this.slackWebhookUrl) {
+      return
+    }
+
+    const color =
+      payload.severity === 'critical'
+        ? 'danger'
+        : payload.severity === 'warning'
+          ? 'warning'
+          : 'good'
+    const dlqPayload = payload as DLQAlertPayload
+
+    let blocks: any[] = [
+      {
+        type: 'header',
+        text: {
+          type: 'plain_text',
+          text: `🚨 ${payload.title}`,
+          emoji: true,
+        },
+      },
+      {
+        type: 'section',
+        text: {
+          type: 'mrkdwn',
+          text: payload.description,
+        },
+      },
+    ]
+
+    // Add DLQ-specific metadata
+    if (dlqPayload.dlqSize !== undefined) {
+      const fields: any[] = [
+        {
+          type: 'mrkdwn',
+          text: `*Current Size:*\n${dlqPayload.dlqSize} events`,
+        },
+        {
+          type: 'mrkdwn',
+          text: `*Severity:*\n${payload.severity.toUpperCase()}`,
+        },
+      ]
+
+      if (dlqPayload.statusBreakdown) {
+        fields.push({
+          type: 'mrkdwn',
+          text: `*Status Breakdown:*\nPending: ${dlqPayload.statusBreakdown.pending}\nRetried: ${dlqPayload.statusBreakdown.retried}\nResolved: ${dlqPayload.statusBreakdown.resolved}`,
+        })
+      }
+
+      if (dlqPayload.oldestPendingAge) {
+        fields.push({
+          type: 'mrkdwn',
+          text: `*Oldest Event:*\nAge: ${dlqPayload.oldestPendingAge.ageHumanReadable}\nID: \`${dlqPayload.oldestPendingAge.eventId}\``,
+        })
+      }
+
+      blocks.push({
+        type: 'section',
+        fields,
+      })
+    }
+
+    // Add custom metadata if present
+    if (payload.metadata && Object.keys(payload.metadata).length > 0) {
+      blocks.push({
+        type: 'section',
+        text: {
+          type: 'mrkdwn',
+          text: `*Additional Info:*\n${Object.entries(payload.metadata)
+            .map(([key, value]) => `• ${key}: ${JSON.stringify(value)}`)
+            .join('\n')}`,
+        },
+      })
+    }
+
+    // Add action links
+    if (dlqPayload.adminLink) {
+      blocks.push({
+        type: 'actions',
+        elements: [
+          {
+            type: 'button',
+            text: {
+              type: 'plain_text',
+              text: 'Inspect DLQ',
+              emoji: true,
+            },
+            url: dlqPayload.adminLink,
+            style: 'danger',
+          },
+        ],
+      })
+    }
+
+    const response = await fetch(this.slackWebhookUrl, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        blocks,
+        attachments: [
+          {
+            color,
+            footer: 'NeuroWealth DLQ Alert System',
+            ts: Math.floor(Date.now() / 1000),
+          },
+        ],
+      }),
+    })
+
+    if (!response.ok) {
+      throw new Error(
+        `Slack webhook returned ${response.status}: ${response.statusText}`
+      )
+    }
+  }
+
+  /**
+   * Send alert to PagerDuty
+   */
+  private async sendToPagerDuty(
+    payload: AlertPayload | DLQAlertPayload
+  ): Promise<void> {
+    if (!this.pagerdutyRoutingKey) {
+      return
+    }
+
+    const dlqPayload = payload as DLQAlertPayload
+    const severity =
+      payload.severity === 'critical'
+        ? 'critical'
+        : payload.severity === 'warning'
+          ? 'warning'
+          : 'info'
+
+    let customDetails: Record<string, any> = {
+      ...payload.metadata,
+    }
+
+    if (dlqPayload.dlqSize !== undefined) {
+      customDetails = {
+        ...customDetails,
+        dlq_size: dlqPayload.dlqSize,
+        status_breakdown: dlqPayload.statusBreakdown,
+        oldest_event_age: dlqPayload.oldestPendingAge?.ageHumanReadable,
+      }
+    }
+
+    const response = await fetch('https://events.pagerduty.com/v2/enqueue', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        routing_key: this.pagerdutyRoutingKey,
+        event_action: 'trigger',
+        dedup_key: `dlq-alert-${Math.floor(Date.now() / 60000)}`, // Group by minute
+        payload: {
+          summary: payload.title,
+          severity,
+          source: 'NeuroWealth Backend',
+          component: payload.component,
+          custom_details: customDetails,
+        },
+        client: 'NeuroWealth DLQ Alerting',
+        client_url: dlqPayload.adminLink,
+      }),
+    })
+
+    if (!response.ok) {
+      throw new Error(
+        `PagerDuty API returned ${response.status}: ${response.statusText}`
+      )
+    }
+  }
+
+  /**
+   * Check enabled channels (for diagnostics)
+   */
+  getEnabledChannels(): string[] {
+    return Array.from(this.enabledChannels)
+  }
+}
+
+export const alertingService = AlertingService.getInstance()

From 5a8b15a28795ab656e53dc1a3cc3eda095fddc2f Mon Sep 17 00:00:00 2001
From: strangedev <strngecloud.iv@gmail.com>
Date: Tue, 16 Jun 2026 16:46:10 +0100
Subject: [PATCH 2/7] feat: add dlq_alert_active Prometheus gauge

- New gauge tracks whether DLQ alert is currently active
- Set to 1 when threshold exceeded, 0 when normalized
- Useful for monitoring alert state in Prometheus/Grafana
---
 src/utils/metrics.ts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/utils/metrics.ts b/src/utils/metrics.ts
index 6f7a9db..1a451df 100644
--- a/src/utils/metrics.ts
+++ b/src/utils/metrics.ts
@@ -71,6 +71,12 @@ export const dlqRetryTotal = new client.Counter({
   registers: [register],
 })
 
+export const dlqAlertActive = new client.Gauge({
+  name: 'dlq_alert_active',
+  help: 'Whether a DLQ size alert is currently active (1=active, 0=inactive)',
+  registers: [register],
+})
+
 // ── Cursor/Lag Metrics ──────────────────────────────────────────────────────────
 
 export const cursorLag = new client.Gauge({

From 8a547615d9b2ebc95f2653e46a070b6a2a78e9fd Mon Sep 17 00:00:00 2001
From: strangedev <strngecloud.iv@gmail.com>
Date: Tue, 16 Jun 2026 16:46:23 +0100
Subject: [PATCH 3/7] feat: add DLQ alerting configuration

- DLQ_ALERT_COOLDOWN_MS: Control alert deduplication window (default 15min)
- Support SLACK_WEBHOOK_URL for Slack integration
- Support PAGERDUTY_ROUTING_KEY for PagerDuty integration
- Add ADMIN_DASHBOARD_URL for inspection links
---
 src/config/env.ts | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/config/env.ts b/src/config/env.ts
index 064c7a8..f1e5676 100644
--- a/src/config/env.ts
+++ b/src/config/env.ts
@@ -43,7 +43,7 @@ function validateAllRequiredEnvVars(): void {
   if (walletKey && !/^[0-9a-f]{64}$/i.test(walletKey)) {
     errors.push(
       `WALLET_ENCRYPTION_KEY is invalid: must be exactly 64 hexadecimal characters (32 bytes). ` +
-        `Got length ${walletKey.length}. Generate one with: openssl rand -hex 32`
+      `Got length ${walletKey.length}. Generate one with: openssl rand -hex 32`
     )
   }
 
@@ -60,7 +60,7 @@ function validateAllRequiredEnvVars(): void {
     const list = errors.map(e => `  - ${e}`).join('\n')
     throw new Error(
       `Application cannot start — environment configuration errors:\n${list}\n\n` +
-        `Fix the variables above and restart the application.`
+      `Fix the variables above and restart the application.`
     )
   }
 }
@@ -146,8 +146,8 @@ function validateStellarKey(secretKey: string, network: 'testnet' | 'mainnet' |
   if (network === 'mainnet' && env !== 'production') {
     console.warn(
       '\n⚠️  CRITICAL WARNING: Using MAINNET in non-production environment!\n' +
-        '⚠️  This could result in real financial loss!\n' +
-        '⚠️  Verify STELLAR_NETWORK and NODE_ENV settings immediately!\n'
+      '⚠️  This could result in real financial loss!\n' +
+      '⚠️  Verify STELLAR_NETWORK and NODE_ENV settings immediately!\n'
     )
   }
 }
@@ -264,5 +264,6 @@ export const config = {
   },
   dlq: {
     alertThreshold: parseInt(process.env.DLQ_ALERT_THRESHOLD || '50'),
+    alertCooldownMs: parseInt(process.env.DLQ_ALERT_COOLDOWN_MS || '900000'), // 15 minutes default
   },
 }
\ No newline at end of file

From 52a4b4fe5f4dc3d38484eced22408524d3484e8b Mon Sep 17 00:00:00 2001
From: strangedev <strngecloud.iv@gmail.com>
Date: Tue, 16 Jun 2026 16:46:37 +0100
Subject: [PATCH 4/7] feat: integrate external alerting into DLQ

Replace hardcoded SIZE_ALERT_THRESHOLD with configurable config.dlq.alertThreshold
- Emit rich DLQ alerts with metadata (size, status breakdown, oldest event age)
- Include actionable admin dashboard links
- Support alert deduplication to prevent fatigue
- Calculate status breakdown (pending, retried, resolved counts)
- Track oldest pending event and human-readable age
- Clear alert state when queue normalizes

Closes #163
---
 src/stellar/dlq.ts | 113 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 107 insertions(+), 6 deletions(-)

diff --git a/src/stellar/dlq.ts b/src/stellar/dlq.ts
index 7388929..c209757 100644
--- a/src/stellar/dlq.ts
+++ b/src/stellar/dlq.ts
@@ -13,6 +13,8 @@ import { xdr } from '@stellar/stellar-sdk'
 import { logger } from '../utils/logger'
 import db from '../db'
 import { updateDlqSize } from '../utils/metrics'
+import config from '../config'
+import { alertingService, type DLQAlertPayload } from '../services/alerting'
 
 export type DeadLetterEventStatus = 'PENDING' | 'RETRIED' | 'RESOLVED'
 
@@ -35,8 +37,6 @@ const LEGACY_DLQ_FILE = path.join(
   '../../logs/dead_letter_queue.json'
 )
 
-const SIZE_ALERT_THRESHOLD = 50
-
 function serializeScVal(value: unknown): string | unknown {
   if (value instanceof xdr.ScVal) {
     return value.toXDR('base64')
@@ -275,10 +275,111 @@ export class DeadLetterQueue {
   }
 
   private static checkSizeAlert(size: number): void {
-    if (size >= SIZE_ALERT_THRESHOLD) {
-      logger.error(
-        `[DLQ ALERT] Dead-letter queue size is critically high: ${size} events. Manual intervention required.`
-      )
+    const threshold = config.dlq.alertThreshold
+
+    if (size >= threshold) {
+      this.emitDLQAlert(size, 'critical')
+    } else if (size > 0 && size < threshold) {
+      // Alert has normalized, clear the state
+      alertingService.clearDLQAlertState()
+    }
+  }
+
+  /**
+   * Emit DLQ alert with rich metadata to external channels
+   */
+  private static async emitDLQAlert(
+    size: number,
+    severity: 'critical' | 'warning' | 'info'
+  ): Promise<void> {
+    try {
+      // Get status breakdown
+      const statusBreakdown = await this.getStatusBreakdown()
+      const oldestPending = await this.getOldestPendingEvent()
+
+      // Calculate age in human-readable format
+      let ageHumanReadable = 'N/A'
+      if (oldestPending) {
+        const ageMs = Date.now() - new Date(oldestPending.createdAt).getTime()
+        ageHumanReadable = this.formatAge(ageMs)
+      }
+
+      const payload: DLQAlertPayload = {
+        title: `[CRITICAL] DLQ Size Alert: ${size} events queued`,
+        description: `The Dead-Letter Queue has reached ${size} events. This indicates that recent event processing failures are not being automatically recovered. Immediate investigation required to identify and fix the root cause.`,
+        severity,
+        component: 'dlq',
+        dlqSize: size,
+        statusBreakdown,
+        oldestPendingAge: oldestPending
+          ? {
+              eventId: oldestPending.id,
+              ageMs: Date.now() - new Date(oldestPending.createdAt).getTime(),
+              ageHumanReadable,
+            }
+          : undefined,
+        adminLink: `${process.env.ADMIN_DASHBOARD_URL || 'https://admin.neurowealth.io'}/dlq`,
+        metadata: {
+          threshold: config.dlq.alertThreshold,
+          timestamp: new Date().toISOString(),
+        },
+      }
+
+      await alertingService.emitDLQAlert(payload)
+    } catch (error) {
+      logger.error('[DLQ] Error emitting alert:', error)
+    }
+  }
+
+  /**
+   * Get DLQ status breakdown (pending, retried, resolved counts)
+   */
+  private static async getStatusBreakdown(): Promise<{
+    pending: number
+    retried: number
+    resolved: number
+  }> {
+    const pending = await (db as any).deadLetterEvent.count({
+      where: { status: 'PENDING' },
+    })
+    const retried = await (db as any).deadLetterEvent.count({
+      where: { status: 'RETRIED' },
+    })
+    const resolved = await (db as any).deadLetterEvent.count({
+      where: { status: 'RESOLVED' },
+    })
+
+    return { pending, retried, resolved }
+  }
+
+  /**
+   * Get the oldest pending event (for age tracking)
+   */
+  private static async getOldestPendingEvent(): Promise<PrismaDeadLetterRow | null> {
+    return (db as any).deadLetterEvent.findFirst({
+      where: { status: 'PENDING' },
+      orderBy: { createdAt: 'asc' },
+    })
+  }
+
+  /**
+   * Format milliseconds to human-readable age (e.g., "2 hours 15 minutes")
+   */
+  private static formatAge(ms: number): string {
+    const seconds = Math.floor(ms / 1000)
+    const minutes = Math.floor(seconds / 60)
+    const hours = Math.floor(minutes / 60)
+    const days = Math.floor(hours / 24)
+
+    if (days > 0) {
+      return `${days} day${days > 1 ? 's' : ''} ${hours % 24} hour${hours % 24 !== 1 ? 's' : ''}`
+    }
+    if (hours > 0) {
+      return `${hours} hour${hours > 1 ? 's' : ''} ${minutes % 60} minute${minutes % 60 !== 1 ? 's' : ''}`
+    }
+    if (minutes > 0) {
+      return `${minutes} minute${minutes > 1 ? 's' : ''}`
     }
+    return `${seconds} second${seconds !== 1 ? 's' : ''}`
   }
 }

From 85cd022c11abff02d9c0219f2909d64aa8b707c8 Mon Sep 17 00:00:00 2001
From: strangedev <strngecloud.iv@gmail.com>
Date: Tue, 16 Jun 2026 16:46:50 +0100
Subject: [PATCH 5/7] docs: update .env.example with alerting configuration

Add documentation for:
- DLQ_ALERT_COOLDOWN_MS
- SLACK_WEBHOOK_URL
- PAGERDUTY_ROUTING_KEY
- ADMIN_DASHBOARD_URL
---
 .env.example | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.env.example b/.env.example
index 0b8bb6c..bc827cc 100644
--- a/.env.example
+++ b/.env.example
@@ -67,3 +67,15 @@ TRUST_PROXY=1
 
 # Dead Letter Queue
 DLQ_ALERT_THRESHOLD=50
+DLQ_ALERT_COOLDOWN_MS=900000
+
+# External Alerting (optional)
+# Slack webhook URL for DLQ alerts (e.g. https://hooks.slack.com/services/YOUR/WEBHOOK/URL)
+SLACK_WEBHOOK_URL=
+
+# PagerDuty integration key for DLQ alerts
+# Generate from: https://developer.pagerduty.com/
+PAGERDUTY_ROUTING_KEY=
+
+# Admin dashboard URL for DLQ inspection links in alerts
+ADMIN_DASHBOARD_URL=https://admin.neurowealth.io

From 6b91e7a448d702adb0453bfd754ee4e568601757 Mon Sep 17 00:00:00 2001
From: strangedev <strngecloud.iv@gmail.com>
Date: Tue, 16 Jun 2026 16:47:06 +0100
Subject: [PATCH 6/7] test: add unit tests for DLQ alerting

- Test AlertingService singleton pattern
- Test cooldown/deduplication behavior
- Test alert suppression within cooldown window
- Test alert re-emission after cooldown expires
- Test DLQ alert payload formatting
- Test all alert severity levels (critical, warning, info)
- Test LOG channel always enabled
- Test alert state clearing
---
 tests/unit/stellar/dlq-alerts.test.ts | 209 ++++++++++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100644 tests/unit/stellar/dlq-alerts.test.ts

diff --git a/tests/unit/stellar/dlq-alerts.test.ts b/tests/unit/stellar/dlq-alerts.test.ts
new file mode 100644
index 0000000..e5b74d3
--- /dev/null
+++ b/tests/unit/stellar/dlq-alerts.test.ts
@@ -0,0 +1,209 @@
+/**
+ * Unit tests for DLQ alerting functionality
+ * Tests threshold logic, cooldown behavior, and alert formatting
+ */
+
+import { alertingService, type DLQAlertPayload } from '../../../src/services/alerting'
+
+describe('DLQ Alerting', () => {
+    beforeEach(() => {
+        jest.clearAllMocks()
+        // Reset alert states by creating a fresh instance
+        jest.resetModules()
+    })
+
+    describe('AlertingService', () => {
+        it('should create singleton instance', () => {
+            const instance1 = alertingService
+            const instance2 = alertingService
+            expect(instance1).toBe(instance2)
+        })
+
+        it('should identify enabled channels based on environment', () => {
+            const channels = alertingService.getEnabledChannels()
+            expect(channels).toContain('LOG')
+        })
+
+        describe('Cooldown Logic', () => {
+            it('should suppress alert within cooldown window', async () => {
+                const payload: DLQAlertPayload = {
+                    title: 'Test Alert',
+                    description: 'Test Description',
+                    severity: 'critical',
+                    component: 'dlq',
+                    dlqSize: 100,
+                    statusBreakdown: { pending: 80, retried: 15, resolved: 5 },
+                }
+
+                // First alert should succeed
+                const result1 = await alertingService.emit(payload, 'test-alert-key')
+                expect(result1.sent).toBe(true)
+
+                // Second alert within cooldown should be suppressed
+                const result2 = await alertingService.emit(payload, 'test-alert-key')
+                expect(result2.sent).toBe(false)
+                expect(result2.reason).toContain('cooldown')
+            })
+
+            it('should allow alert after cooldown expires', async () => {
+                jest.useFakeTimers()
+
+                const payload: DLQAlertPayload = {
+                    title: 'Test Alert',
+                    description: 'Test Description',
+                    severity: 'critical',
+                    component: 'dlq',
+                    dlqSize: 100,
+                    statusBreakdown: { pending: 80, retried: 15, resolved: 5 },
+                }
+
+                // First alert
+                const result1 = await alertingService.emit(payload, 'test-cooldown')
+                expect(result1.sent).toBe(true)
+
+                // Within cooldown
+                const result2 = await alertingService.emit(payload, 'test-cooldown')
+                expect(result2.sent).toBe(false)
+
+                // After cooldown (advance 16 minutes)
+                jest.advanceTimersByTime(16 * 60 * 1000)
+                const result3 = await alertingService.emit(payload, 'test-cooldown')
+                expect(result3.sent).toBe(true)
+
+                jest.useRealTimers()
+            })
+        })
+
+        describe('DLQ Alert Payload', () => {
+            it('should format DLQ alert with all metadata', async () => {
+                const payload: DLQAlertPayload = {
+                    title: 'DLQ Critical Alert',
+                    description: 'Queue exceeded threshold',
+                    severity: 'critical',
+                    component: 'dlq',
+                    dlqSize: 75,
+                    statusBreakdown: {
+                        pending: 60,
+                        retried: 10,
+                        resolved: 5,
+                    },
+                    oldestPendingAge: {
+                        eventId: 'evt-123',
+                        ageMs: 3600000, // 1 hour
+                        ageHumanReadable: '1 hour',
+                    },
+                    adminLink: 'https://admin.example.com/dlq',
+                    metadata: {
+                        threshold: 50,
+                        timestamp: new Date().toISOString(),
+                    },
+                }
+
+                const result = await alertingService.emitDLQAlert(payload)
+                // Should at least attempt to emit (may fail if no external service configured)
+                expect(result).toBeDefined()
+            })
+
+            it('should handle alert without optional metadata', async () => {
+                const payload: DLQAlertPayload = {
+                    title: 'DLQ Alert',
+                    description: 'Queue size increased',
+                    severity: 'warning',
+                    component: 'dlq',
+                    dlqSize: 25,
+                    statusBreakdown: { pending: 20, retried: 5, resolved: 0 },
+                }
+
+                const result = await alertingService.emitDLQAlert(payload)
+                expect(result).toBeDefined()
+            })
+        })
+
+        describe('Alert Channels', () => {
+            it('should always send to LOG channel', async () => {
+                const payload: DLQAlertPayload = {
+                    title: 'Test',
+                    description: 'Test alert',
+                    severity: 'critical',
+                    component: 'dlq',
+                    dlqSize: 50,
+                    statusBreakdown: { pending: 40, retried: 5, resolved: 5 },
+                }
+
+                // LOG channel should always be present
+                const channels = alertingService.getEnabledChannels()
+                expect(channels).toContain('LOG')
+
+                const result = await alertingService.emit(payload, 'log-test')
+                expect(result.sent).toBe(true)
+            })
+        })
+
+        describe('Clear DLQ State', () => {
+            it('should clear alert state when queue normalizes', async () => {
+                const payload: DLQAlertPayload = {
+                    title: 'Test',
+                    description: 'Test',
+                    severity: 'critical',
+                    component: 'dlq',
+                    dlqSize: 100,
+                    statusBreakdown: { pending: 100, retried: 0, resolved: 0 },
+                }
+
+                // Emit alert
+                await alertingService.emitDLQAlert(payload)
+
+                // Clear state
+                alertingService.clearDLQAlertState()
+
+                // Next alert should succeed (no cooldown)
+                const result = await alertingService.emit(payload, 'dlq:threshold')
+                expect(result.sent).toBe(true)
+            })
+        })
+    })
+
+    describe('Alert Severity Levels', () => {
+        it('should handle critical severity', async () => {
+            const payload: DLQAlertPayload = {
+                title: 'Critical Alert',
+                description: 'Immediate action required',
+                severity: 'critical',
+                component: 'dlq',
+                dlqSize: 100,
+                statusBreakdown: { pending: 90, retried: 5, resolved: 5 },
+            }
+
+            const result = await alertingService.emit(payload, 'critical-test')
+            expect(result.sent).toBe(true)
+        })
+
+        it('should handle warning severity', async () => {
+            const payload: DLQAlertPayload = {
+                title: 'Warning Alert',
+                description: 'Investigate soon',
+                severity: 'warning',
+                component: 'dlq',
+                dlqSize: 30,
+                statusBreakdown: { pending: 20, retried: 5, resolved: 5 },
+            }
+
+            const result = await alertingService.emit(payload, 'warning-test')
+            expect(result.sent).toBe(true)
+        })
+
+        it('should handle info severity', async () => {
+            const payload: DLQAlertPayload = {
+                title: 'Info Alert',
+                description: 'Monitor trend',
+                severity: 'info',
+                component: 'dlq',
+                dlqSize: 10,
+                statusBreakdown: { pending: 5, retried: 3, resolved: 2 },
+            }
+
+            const result = await alertingService.emit(payload, 'info-test')
+            expect(result.sent).toBe(true)
+        })
+    })
+})

From 2bc98ab77e7ec36864d6c71612af261a4ea2437e Mon Sep 17 00:00:00 2001
From: strangedev <strngecloud.iv@gmail.com>
Date: Tue, 16 Jun 2026 16:47:43 +0100
Subject: [PATCH 7/7] docs: add DLQ alerting runbook for operators

Complete guide for handling DLQ alerts:
- Inspection and diagnosis steps
- Common root causes and recovery procedures
- Manual retry procedures
- Integration setup (Slack, PagerDuty)
- Prometheus monitoring examples
- Prevention best practices
- Escalation procedures
---
 docs/DLQ_ALERTING_RUNBOOK.md | 208 +++++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 docs/DLQ_ALERTING_RUNBOOK.md

diff --git a/docs/DLQ_ALERTING_RUNBOOK.md b/docs/DLQ_ALERTING_RUNBOOK.md
new file mode 100644
index 0000000..8a60589
--- /dev/null
+++ b/docs/DLQ_ALERTING_RUNBOOK.md
@@ -0,0 +1,208 @@
+# DLQ Alerting Runbook
+
+## What to Do When DLQ Alert Fires
+
+The Dead-Letter Queue (DLQ) alert fires when the queue size exceeds the configured threshold (default: 50 events). This indicates that recent event processing failures are not being automatically recovered.
+
+### Alert Channels
+
+Alerts are sent through:
+
+- **LOG** (always enabled) - Check application logs at `logs/error.log`
+- **SLACK** (optional) - Configured via `SLACK_WEBHOOK_URL` environment variable
+- **PAGERDUTY** (optional) - Configured via `PAGERDUTY_ROUTING_KEY` environment variable
+
+### Alert Deduplication
+
+Alerts are deduplicated using a cooldown window (default: 15 minutes). While the queue remains above the threshold:
+
+- First alert is sent immediately
+- Subsequent alerts within the cooldown window are suppressed
+- Once cooldown expires and queue is still above threshold, a new alert is sent
+- When queue drops below threshold, alert state is cleared
+
+### Step 1: Inspect the DLQ
+
+Access the DLQ admin dashboard via the link in the alert (or navigate to `https://admin.neurowealth.io/dlq`):
+
+1. **View Queue Status**
+   - Current size (number of events)
+   - Breakdown by status: PENDING, RETRIED, RESOLVED
+   - Age of oldest pending event
+
+2. **Identify Patterns**
+   - Are all failed events of the same type (deposits, withdrawals, rebalances)?
+   - Are they from a specific time window?
+   - Is there a common error message?
+
+### Step 2: Understand Root Causes
+
+Common reasons for DLQ growth:
+
+#### Temporary Stellar Network Issues
+
+- **Symptom**: Large influx of timeouts or network errors
+- **Recovery**: Wait for network to stabilize; queue will auto-recover
+- **Action**: Monitor logs and wait for next retry cycle
+
+#### Smart Contract Issues
+
+- **Symptom**: Errors like "contract not found" or "method invocation failed"
+- **Recovery**: Fix contract deployment or method implementation
+- **Action**: Deploy fix, then retry events
+
+#### Database Connectivity
+
+- **Symptom**: Connection pool exhausted or query timeouts
+- **Recovery**: Restart database or scale read replicas
+- **Action**: Resolve DB issue, restart app, then retry
+
+#### Configuration Errors
+
+- **Symptom**: Invalid keys, tokens, or missing environment variables
+- **Recovery**: Verify and correct configuration
+- **Action**: Update env vars, restart app, then retry
+
+### Step 3: Dry-Run Retry (Optional)
+
+Before retrying all events, inspect a sample event to understand the failure:
+
+```bash
+# View a specific event
+curl https://api.neurowealth.io/admin/dlq/events/{EVENT_ID}
+
+# Response includes:
+# - eventType: "deposit", "withdraw", "rebalance"
+# - error: Error message
+# - payload: Original event data
+# - status: "PENDING", "RETRIED", "RESOLVED"
+# - retryCount: Number of retry attempts
+```
+
+### Step 4: Manual Retry
+
+Once the root cause is fixed:
+
+```bash
+# Retry all pending/retried events
+POST /admin/dlq/retry
+{
+  "dryRun": false,  # Set to true for dry-run
+  "statuses": ["PENDING", "RETRIED"]  # Optional filter
+}
+
+# Response:
+# {
+#   "resolved": 45,  # Successfully retried
+#   "failed": 5      # Still failing
+# }
+```
+
+### Step 5: Resolve Events Manually (If Needed)
+
+For events that cannot be recovered:
+
+```bash
+# Mark event as resolved (won't be retried)
+POST /admin/dlq/events/{EVENT_ID}/resolve
+
+# This is a last resort. Resolved events are not processed
+# but won't trigger alerts.
+```
+
+### Step 6: Monitor Recovery
+
+After retry:
+
+- Watch the queue size via the dashboard
+- Check logs for any new failures
+- Monitor the `dlq_size` Prometheus metric
+- Alert should clear once queue drops below threshold
+
+## Alert Threshold Configuration
+
+Adjust the alerting threshold via environment variables:
+
+```bash
+# Alert when DLQ size >= this value (default: 50)
+DLQ_ALERT_THRESHOLD=50
+
+# Suppress duplicate alerts for this duration in milliseconds
+# (default: 900000 = 15 minutes)
+DLQ_ALERT_COOLDOWN_MS=900000
+```
+
+## External Integrations
+
+### Slack Integration
+
+1. Create a Slack App: https://api.slack.com/apps
+2. Enable Incoming Webhooks
+3. Create a webhook for your channel
+4. Set environment variable:
+
+```bash
+SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL
+```
+
+Alert format includes:
+
+- Queue size and breakdown by status
+- Age of oldest pending event
+- Link to admin dashboard
+- Custom metadata
+
+### PagerDuty Integration
+
+1. Create an integration key in PagerDuty
+2. Set environment variable:
+
+```bash
+PAGERDUTY_ROUTING_KEY=your-pagerduty-routing-key
+```
+
+Alerts are deduplicated by minute to prevent incident spam.
+
+## Prometheus Metrics
+
+Monitor alert state via:
+
+```promql
+# 1 = alert active, 0 = alert inactive
+dlq_alert_active
+
+# Current DLQ size
+dlq_size
+
+# DLQ size trend over time
+rate(dlq_size[5m])
+```
+
+## Prevention Best Practices
+
+1. **Monitor Trends**: Watch `dlq_size` over time for slow growth
+2. **Set Lower Thresholds**: Use `DLQ_ALERT_THRESHOLD=20` in staging
+3. **Upstream Resilience**: Implement circuit breakers for external APIs
+4. **Test Error Handling**: Regularly test DLQ recovery with chaos engineering
+5. **On-Call Training**: Ensure on-call engineers understand this runbook
+
+## Alert Severity Levels
+
+- **CRITICAL** (red): DLQ >= threshold, immediate action needed
+- **WARNING** (yellow): DLQ between 50% and threshold, investigate soon
+- **INFO** (blue): DLQ growing but normal, monitor trend
+
+## Escalation
+
+If queue continues to grow after recovery attempts:
+
+1. **Check Consensus**: Are deposits/withdrawals actually failing on-chain?
+2. **Inspect Ledger**: Query Stellar testnet/mainnet for transaction status
+3. **Contact Support**: Reach out to Stellar developer relations if contract issue
+4. **Page On-Call**: If user-facing transactions are failing, escalate to on-call team
+
+## Related Documentation
+
+- [OBSERVABILITY.md](./OBSERVABILITY.md) - Complete metrics and alert thresholds
+- [API_REFERENCE.md](./API_REFERENCE.md) - Admin DLQ endpoints
+- [DEPLOYMENT_GUIDE.md](./DEPLOYMENT_GUIDE.md) - Configuration and secrets management