Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ out/
*.yml
!docker/prometheus.yml
!docker/grafana/**/*.yml
!docker/alerting/**/*.yml
.editorconfig
68 changes: 68 additions & 0 deletions docker/alerting/alert.rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
groups:
- name: linkiving-core-alerts
rules:

# ── Availability ───────────────────────────────────────
# The instance is not being scraped (application down or network cut off).
- alert: InstanceDown
  # Matches every target of the linkiving-core job whose `up` series is 0.
  expr: 'up{job="linkiving-core"} == 0'
  # The condition must hold for one minute before the alert fires.
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: '인스턴스 다운: {{ $labels.instance }}'
    description: '{{ $labels.instance }} 가 1분 이상 스크레이프되지 않습니다.'

# ── HTTP 5xx error rate ────────────────────────────────
# Share of 5xx responses among all requests stays above 5%
# (5-minute rates, sustained for 5 minutes).
- alert: HighServerErrorRate
  # Numerator: per-second rate of requests whose status matches 5xx.
  # Denominator: per-second rate of all requests.
  expr: |
    sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
      / sum(rate(http_server_requests_seconds_count[5m]))
      > 0.05
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: '서버 5xx 에러율 높음'
    description: '최근 5분간 5xx 에러율이 5%를 초과했습니다 (현재 {{ $value | humanizePercentage }}).'

# ── Response latency (p95) ─────────────────────────────
# 95th-percentile response time is above 1 second.
- alert: HighLatencyP95
  # Buckets are summed by `le` only, so the quantile is computed over all
  # series of the histogram together rather than per label set.
  expr: |
    histogram_quantile(
      0.95,
      sum by (le) (rate(http_server_requests_seconds_bucket[5m]))
    ) > 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: '응답 지연(p95) 높음'
    description: 'p95 응답시간이 1초를 초과했습니다 (현재 {{ $value }}s).'

# ── External AI call failure rate ──────────────────────
# Per client: share of AI calls with result="failure" stays above 20%
# (5-minute rates, sustained for 5 minutes).
- alert: HighAiCallFailureRate
  # Both sides are grouped by `client`, so one alert is produced per client.
  expr: |
    sum by (client) (rate(ai_client_calls_total{result="failure"}[5m]))
      / sum by (client) (rate(ai_client_calls_total[5m]))
      > 0.2
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'AI 호출 실패율 높음: {{ $labels.client }}'
    # Kept double-quoted because the text itself contains single quotes.
    description: "AI 클라이언트 '{{ $labels.client }}' 실패율이 5분간 20%를 초과했습니다 (현재 {{ $value | humanizePercentage }})."

# ── Async task final failure ───────────────────────────
# Notify immediately when a task fails for good after its retries are
# exhausted (a signal that manual recovery may be needed).
- alert: AsyncTaskFinalFailure
  # Two branches, joined with `or`:
  #   1. increase(...) > 0 — the counter grew within the last 10 minutes.
  #   2. The series exists now with a value above 0 but had no sample
  #      10 minutes ago, i.e. it is a newly created series.
  # Branch 2 is needed because increase() only measures growth between
  # samples inside the window: a series whose very first sample is already 1
  # yields an increase of 0, so the first failure for a new task/action label
  # set would otherwise never fire this alert.
  # NOTE(review): this matters if the application creates the counter lazily
  # on the first failure — confirm against the metrics code.
  # Trade-off: after a scrape gap longer than the lookback window, branch 2
  # can re-fire for series that already held a non-zero value.
  expr: |
    increase(async_task_failures_total[10m]) > 0
    or
    (
      async_task_failures_total > 0
      unless
      async_task_failures_total offset 10m
    )
  # No hold period: a single final failure is enough to alert.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: '비동기 작업 최종 실패: {{ $labels.task }}'
    # Kept double-quoted because the text itself contains single quotes.
    description: "task '{{ $labels.task }}'{{ if $labels.action }} (action {{ $labels.action }}){{ end }} 에서 재시도 소진 후 최종 실패가 발생했습니다. 수동 복구가 필요할 수 있습니다."
30 changes: 30 additions & 0 deletions docker/alerting/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Alertmanager configuration

# Settings shared by the whole Alertmanager configuration.
global:
  resolve_timeout: '5m'

# Root route: every alert goes to the single `default` receiver.
route:
  receiver: default
  group_by:
    - alertname
  # Wait before sending the batched alerts of a group.
  group_wait: 30s
  # Re-send interval when new alerts are added to an existing group.
  group_interval: 5m
  # Interval for repeating a notification for the same alert.
  repeat_interval: 4h

# Notification targets. Only the `default` receiver referenced by `route` is
# defined, and it has no integrations configured.
receivers:
- name: 'default'
  # The delivery channel is not decided yet, so this receiver is empty
  # (alerts are only visible in the Alertmanager UI on :9093).
  # Once a channel is chosen, fill in one of the examples below.
  #
  # NOTE(review): Alertmanager does not expand ${VAR} placeholders in this
  # file — a literal '${...}' would be used as the value. Prefer the *_file
  # options (mount the secret as a file) or render this file before start.
  # Confirm against the docs for the pinned Alertmanager version.

  # ── Discord example ─────────────────────────
  # NOTE(review): webhook_configs posts Alertmanager's own JSON payload, not
  # a Slack-formatted one, so pointing it at Discord's ".../slack" endpoint
  # is not expected to work. Use the native discord_configs receiver instead.
  # discord_configs:
  #   - webhook_url_file: '/run/secrets/discord_webhook_url'
  #     send_resolved: true

  # ── Email example ───────────────────────────
  # email_configs:
  #   - to: 'team@example.com'
  #     from: 'alert@example.com'
  #     smarthost: 'smtp.example.com:587'
  #     auth_username: 'alert@example.com'
  #     auth_password_file: '/run/secrets/smtp_password'
  #     send_resolved: true
21 changes: 21 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ services:
container_name: prometheus
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./alerting/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
Expand Down Expand Up @@ -102,6 +103,26 @@ services:
max-size: "10m"
max-file: "3"

# Alertmanager service: receives alerts and serves its configuration from the
# bind-mounted file below. No host port is published in this definition.
alertmanager:
  image: 'prom/alertmanager:v0.27.0'
  container_name: alertmanager
  restart: unless-stopped
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
  volumes:
    # Configuration is mounted read-only from the repository.
    - "./alerting/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
  networks:
    - app-network
  deploy:
    resources:
      limits:
        memory: 128M
  logging:
    driver: json-file
    options:
      # Quoted so both values stay strings.
      max-size: '10m'
      max-file: '3'

volumes:
prometheus_data:
grafana_data:
Expand Down
Loading
Loading