From 11f1928bc3d539da041bd6797752ea79e0066b06 Mon Sep 17 00:00:00 2001 From: Jansoon Date: Sun, 21 Jun 2026 20:52:29 +0900 Subject: [PATCH 1/5] =?UTF-8?q?feat:=20AI=20=ED=98=B8=EC=B6=9C=20=EA=B2=B0?= =?UTF-8?q?=EA=B3=BC=20=EB=A9=94=ED=8A=B8=EB=A6=AD=20=EC=B6=94=EA=B0=80(ai?= =?UTF-8?q?=5Fclient=5Fcalls)(#239)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - answer/title/summary/link-sync 클라이언트에 success/empty/failure - 카운터를 client·operation·result 태그로 집계. 앱 시작 시 0으로 미리 등록. - RagAnswerClient 빈 응답 시 IndexOutOfBounds 발생하던 버그도 함께 수정. --- .../domain/chat/ai/RagAnswerClient.java | 32 ++++++++++ .../domain/chat/ai/RagTitleClient.java | 25 ++++++++ .../domain/link/ai/RagLinkSyncClient.java | 58 +++++++++++++++++-- .../domain/link/ai/RagSummaryClient.java | 36 ++++++++++++ .../domain/chat/ai/RagAnswerClientTest.java | 44 ++++++++++++-- .../domain/chat/ai/RagTitleClientTest.java | 27 ++++++++- .../domain/link/ai/RagLinkSyncClientTest.java | 39 ++++++++++++- .../domain/link/ai/RagSummaryClientTest.java | 26 ++++++++- 8 files changed, 268 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClient.java b/src/main/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClient.java index 673392ac..6fe928c8 100644 --- a/src/main/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClient.java +++ b/src/main/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClient.java @@ -8,6 +8,9 @@ import com.sofa.linkiving.domain.chat.dto.request.RagAnswerReq; import com.sofa.linkiving.domain.chat.dto.response.RagAnswerRes; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -18,15 +21,44 @@ public class RagAnswerClient implements AnswerClient { private final RagAnswerFeign ragAnswerFeign; + private final MeterRegistry meterRegistry; + + private Counter 
successCounter; + private Counter emptyCounter; + private Counter failureCounter; + + @PostConstruct + private void initCounters() { + this.successCounter = buildCounter("success"); + this.emptyCounter = buildCounter("empty"); + this.failureCounter = buildCounter("failure"); + } + + private Counter buildCounter(String result) { + return Counter.builder("ai.client.calls") + .tag("client", "answer") + .tag("result", result) + .register(meterRegistry); + } @Override public RagAnswerRes generateAnswer(RagAnswerReq request) { try { List ragAnswerRes = ragAnswerFeign.generateAnswer(request); + + if (ragAnswerRes == null || ragAnswerRes.isEmpty()) { + log.warn("RagAnswerClient generateAnswer empty response"); + emptyCounter.increment(); + return null; + } + log.info("RagAnswerClient generateAnswer ragAnswerRes={}", ragAnswerRes); + successCounter.increment(); return ragAnswerRes.get(0); + } catch (Exception e) { log.error("RagAnswerClient generateAnswer error", e); + failureCounter.increment(); return null; } } diff --git a/src/main/java/com/sofa/linkiving/domain/chat/ai/RagTitleClient.java b/src/main/java/com/sofa/linkiving/domain/chat/ai/RagTitleClient.java index e6992d3c..f7214d91 100644 --- a/src/main/java/com/sofa/linkiving/domain/chat/ai/RagTitleClient.java +++ b/src/main/java/com/sofa/linkiving/domain/chat/ai/RagTitleClient.java @@ -8,6 +8,9 @@ import com.sofa.linkiving.domain.chat.dto.request.TitleGenerateReq; import com.sofa.linkiving.domain.chat.dto.response.TitleGenerateRes; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -19,6 +22,25 @@ public class RagTitleClient implements TitleClient { private static final int MAX_TITLE_LENGTH = 100; private final RagTitleFeign ragTitleFeign; + private final MeterRegistry meterRegistry; + + private Counter successCounter; + private Counter emptyCounter; + 
private Counter failureCounter; + + @PostConstruct + private void initCounters() { + this.successCounter = buildCounter("success"); + this.emptyCounter = buildCounter("empty"); + this.failureCounter = buildCounter("failure"); + } + + private Counter buildCounter(String result) { + return Counter.builder("ai.client.calls") + .tag("client", "title") + .tag("result", result) + .register(meterRegistry); + } @Override public String generateTitle(String firstChat) { @@ -26,13 +48,16 @@ public String generateTitle(String firstChat) { List response = ragTitleFeign.generateTitle(new TitleGenerateReq(firstChat)); if (response == null || response.isEmpty()) { + emptyCounter.increment(); return truncateTitle(firstChat); } + successCounter.increment(); return response.get(0).title(); } catch (Exception e) { log.error("AI 서버 통신 실패. 기본 제목으로 대체합니다. error={}", e.getMessage()); + failureCounter.increment(); return truncateTitle(firstChat); } } diff --git a/src/main/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClient.java b/src/main/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClient.java index e6ed013f..90a8cc7b 100644 --- a/src/main/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClient.java +++ b/src/main/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClient.java @@ -6,6 +6,9 @@ import com.sofa.linkiving.domain.link.dto.request.LinkSyncDeleteReq; import com.sofa.linkiving.domain.link.dto.request.LinkSyncUpdateReq; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -16,22 +19,65 @@ public class RagLinkSyncClient implements LinkSyncClient { private final LinkSyncFeign linkSyncFeign; + private final MeterRegistry meterRegistry; + private Counter createSuccess; + private Counter createFailure; + private Counter updateSuccess; + private Counter updateFailure; + private Counter deleteSuccess; + private 
Counter deleteFailure; + + @PostConstruct + private void initCounters() { + this.createSuccess = buildCounter("create", "success"); + this.createFailure = buildCounter("create", "failure"); + this.updateSuccess = buildCounter("update", "success"); + this.updateFailure = buildCounter("update", "failure"); + this.deleteSuccess = buildCounter("delete", "success"); + this.deleteFailure = buildCounter("delete", "failure"); + } + + private Counter buildCounter(String operation, String result) { + return Counter.builder("ai.client.calls") + .tag("client", "link-sync") + .tag("operation", operation) + .tag("result", result) + .register(meterRegistry); + } @Override public void syncCreate(LinkSyncUpdateReq req) { - linkSyncFeign.syncUpdate(req); - log.info("AI 서버 동기화 완료 (CREATE) - linkId: {}", req.linkId()); + try { + linkSyncFeign.syncUpdate(req); + createSuccess.increment(); + log.info("AI 서버 동기화 완료 (CREATE) - linkId: {}", req.linkId()); + } catch (Exception e) { + createFailure.increment(); + throw e; + } } @Override public void syncUpdate(LinkSyncUpdateReq req) { - linkSyncFeign.syncUpdate(req); - log.info("AI 서버 동기화 완료 (UPDATE) - linkId: {}", req.linkId()); + try { + linkSyncFeign.syncUpdate(req); + updateSuccess.increment(); + log.info("AI 서버 동기화 완료 (UPDATE) - linkId: {}", req.linkId()); + } catch (Exception e) { + updateFailure.increment(); + throw e; + } } @Override public void syncDelete(Long linkId) { - linkSyncFeign.syncDelete(new LinkSyncDeleteReq(linkId)); - log.info("AI 서버 동기화 완료 (DELETE) - linkId: {}", linkId); + try { + linkSyncFeign.syncDelete(new LinkSyncDeleteReq(linkId)); + deleteSuccess.increment(); + log.info("AI 서버 동기화 완료 (DELETE) - linkId: {}", linkId); + } catch (Exception e) { + deleteFailure.increment(); + throw e; + } } } diff --git a/src/main/java/com/sofa/linkiving/domain/link/ai/RagSummaryClient.java b/src/main/java/com/sofa/linkiving/domain/link/ai/RagSummaryClient.java index 4ebb24ae..752dc4fa 100644 --- 
a/src/main/java/com/sofa/linkiving/domain/link/ai/RagSummaryClient.java +++ b/src/main/java/com/sofa/linkiving/domain/link/ai/RagSummaryClient.java @@ -10,6 +10,9 @@ import com.sofa.linkiving.domain.link.dto.response.RagInitialSummaryRes; import com.sofa.linkiving.domain.link.dto.response.RagRegenerateSummaryRes; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -20,6 +23,31 @@ public class RagSummaryClient implements SummaryClient { private final RagSummaryFeign ragSummaryFeign; + private final MeterRegistry meterRegistry; + private Counter initialSuccess; + private Counter initialEmpty; + private Counter initialFailure; + private Counter regenerateSuccess; + private Counter regenerateEmpty; + private Counter regenerateFailure; + + @PostConstruct + private void initCounters() { + this.initialSuccess = buildCounter("initial", "success"); + this.initialEmpty = buildCounter("initial", "empty"); + this.initialFailure = buildCounter("initial", "failure"); + this.regenerateSuccess = buildCounter("regenerate", "success"); + this.regenerateEmpty = buildCounter("regenerate", "empty"); + this.regenerateFailure = buildCounter("regenerate", "failure"); + } + + private Counter buildCounter(String operation, String result) { + return Counter.builder("ai.client.calls") + .tag("client", "summary") + .tag("operation", operation) + .tag("result", result) + .register(meterRegistry); + } @Override public RagInitialSummaryRes initialSummary(Long linkId, Long userId, String title, String url, String memo) { @@ -29,13 +57,17 @@ public RagInitialSummaryRes initialSummary(Long linkId, Long userId, String titl if (response != null && !response.isEmpty()) { log.info("[AI Server] Initial Summary Requested Success. 
LinkId: {}", linkId); + initialSuccess.increment(); return response.get(0); } + + initialEmpty.increment(); return null; } catch (Exception e) { log.error("[AI Server Error] Failed to request initial summary for LinkId: {}. Error: {}", linkId, e.getMessage()); + initialFailure.increment(); return null; } } @@ -48,13 +80,17 @@ public RagRegenerateSummaryRes regenerateSummary(Long linkId, Long userId, Strin if (response != null && !response.isEmpty()) { log.info("[AI Server] Regenerate Summary Success. LinkId: {}", linkId); + regenerateSuccess.increment(); return response.get(0); } + + regenerateEmpty.increment(); return null; } catch (Exception e) { log.error("[AI Server Error] Failed to regenerate summary for LinkId: {}. Error: {}", linkId, e.getMessage()); + regenerateFailure.increment(); return null; } } diff --git a/src/test/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClientTest.java b/src/test/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClientTest.java index 1f08fc58..24413f65 100644 --- a/src/test/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClientTest.java +++ b/src/test/java/com/sofa/linkiving/domain/chat/ai/RagAnswerClientTest.java @@ -4,30 +4,46 @@ import static org.mockito.ArgumentMatchers.*; import static org.mockito.BDDMockito.*; +import java.util.Collections; import java.util.List; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.test.util.ReflectionTestUtils; import com.sofa.linkiving.domain.chat.dto.request.RagAnswerReq; import com.sofa.linkiving.domain.chat.dto.response.RagAnswerRes; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; + @ExtendWith(MockitoExtension.class) @DisplayName("RagAnswerClient 단위 테스트") class RagAnswerClientTest { - @InjectMocks - private 
RagAnswerClient ragAnswerClient; @Mock private RagAnswerFeign ragAnswerFeign; + private RagAnswerClient ragAnswerClient; + private SimpleMeterRegistry meterRegistry; + + @BeforeEach + void setUp() { + meterRegistry = new SimpleMeterRegistry(); + ragAnswerClient = new RagAnswerClient(ragAnswerFeign, meterRegistry); + ReflectionTestUtils.invokeMethod(ragAnswerClient, "initCounters"); + } + + private double counterCount(String result) { + return meterRegistry.counter("ai.client.calls", "client", "answer", "result", result).count(); + } + @Test - @DisplayName("generateAnswer: Feign 응답이 정상일 경우 리스트의 첫 번째 요소를 반환한다") + @DisplayName("Feign 응답이 정상일 경우 리스트의 첫 번째 요소를 반환한다") void shouldReturnFirstElement_WhenGenerateAnswerSuccess() { // given RagAnswerReq req = mock(RagAnswerReq.class); @@ -40,10 +56,11 @@ void shouldReturnFirstElement_WhenGenerateAnswerSuccess() { // then assertThat(actualRes).isEqualTo(expectedRes); + assertThat(counterCount("success")).isEqualTo(1.0); } @Test - @DisplayName("generateAnswer: Feign 요청 중 예외가 발생하면 예외를 잡고 null을 반환한다") + @DisplayName("Feign 요청 중 예외가 발생하면 예외를 잡고 null을 반환한다") void shouldCatchExceptionAndReturnNull_WhenGenerateAnswerThrowsException() { // given RagAnswerReq req = mock(RagAnswerReq.class); @@ -55,5 +72,22 @@ void shouldCatchExceptionAndReturnNull_WhenGenerateAnswerThrowsException() { // then assertThat(actualRes).isNull(); + assertThat(counterCount("failure")).isEqualTo(1.0); + } + + @Test + @DisplayName("Feign 응답이 비어있으면 null 을 반환하고 empty 로 집계한다") + void shouldReturnNullAndCountEmpty_WhenResponseIsEmpty() { + // given + RagAnswerReq req = mock(RagAnswerReq.class); + given(ragAnswerFeign.generateAnswer(any(RagAnswerReq.class))) + .willReturn(Collections.emptyList()); + + // when + RagAnswerRes actualRes = ragAnswerClient.generateAnswer(req); + + // then + assertThat(actualRes).isNull(); + assertThat(counterCount("empty")).isEqualTo(1.0); } } diff --git a/src/test/java/com/sofa/linkiving/domain/chat/ai/RagTitleClientTest.java 
b/src/test/java/com/sofa/linkiving/domain/chat/ai/RagTitleClientTest.java index 61f1b0f2..cc1cfa63 100644 --- a/src/test/java/com/sofa/linkiving/domain/chat/ai/RagTitleClientTest.java +++ b/src/test/java/com/sofa/linkiving/domain/chat/ai/RagTitleClientTest.java @@ -1,30 +1,47 @@ package com.sofa.linkiving.domain.chat.ai; import static org.assertj.core.api.Assertions.*; +import static org.mockito.ArgumentMatchers.*; import static org.mockito.BDDMockito.*; import java.util.Collections; import java.util.List; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.test.util.ReflectionTestUtils; import com.sofa.linkiving.domain.chat.dto.request.TitleGenerateReq; import com.sofa.linkiving.domain.chat.dto.response.TitleGenerateRes; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; + @ExtendWith(MockitoExtension.class) -public class RagTitleClientTest { +@DisplayName("RagTitleClient 단위 테스트") +class RagTitleClientTest { - @InjectMocks private RagTitleClient ragTitleClient; @Mock private RagTitleFeign ragTitleFeign; + private SimpleMeterRegistry meterRegistry; + + @BeforeEach + void setUp() { + meterRegistry = new SimpleMeterRegistry(); + ragTitleClient = new RagTitleClient(ragTitleFeign, meterRegistry); + ReflectionTestUtils.invokeMethod(ragTitleClient, "initCounters"); + } + + private double counterCount(String result) { + return meterRegistry.counter("ai.client.calls", "client", "title", "result", result).count(); + } + @Test @DisplayName("AI 서버 통신 성공 시 생성된 제목을 반환한다") void shouldReturnGeneratedTitleWhenApiCallSucceeds() { @@ -44,6 +61,7 @@ void shouldReturnGeneratedTitleWhenApiCallSucceeds() { // then assertThat(result).isEqualTo(generatedTitle); 
verify(ragTitleFeign).generateTitle(any(TitleGenerateReq.class)); + assertThat(counterCount("success")).isEqualTo(1.0); } @Test @@ -60,6 +78,7 @@ void shouldReturnFirstChatWhenResponseIsEmpty() { // then assertThat(result).isEqualTo(firstChat); + assertThat(counterCount("empty")).isEqualTo(1.0); } @Test @@ -76,6 +95,7 @@ void shouldReturnFirstChatWhenResponseIsNull() { // then assertThat(result).isEqualTo(firstChat); + assertThat(counterCount("empty")).isEqualTo(1.0); } @Test @@ -92,5 +112,6 @@ void shouldReturnFirstChatWhenExceptionOccurs() { // then assertThat(result).isEqualTo(firstChat); + assertThat(counterCount("failure")).isEqualTo(1.0); } } diff --git a/src/test/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClientTest.java b/src/test/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClientTest.java index f694c10c..e3bb2f2f 100644 --- a/src/test/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClientTest.java +++ b/src/test/java/com/sofa/linkiving/domain/link/ai/RagLinkSyncClientTest.java @@ -3,27 +3,43 @@ import static org.assertj.core.api.Assertions.*; import static org.mockito.Mockito.*; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.ArgumentCaptor; -import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.test.util.ReflectionTestUtils; import com.sofa.linkiving.domain.link.dto.request.LinkSyncDeleteReq; import com.sofa.linkiving.domain.link.dto.request.LinkSyncUpdateReq; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; + @ExtendWith(MockitoExtension.class) -@DisplayName("LinkSyncClient 단위 테스트") +@DisplayName("RagLinkSyncClient 단위 테스트") class RagLinkSyncClientTest { - @InjectMocks private RagLinkSyncClient ragLinkSyncClient; @Mock private LinkSyncFeign linkSyncFeign; + private SimpleMeterRegistry meterRegistry; + + 
@BeforeEach + void setUp() { + meterRegistry = new SimpleMeterRegistry(); + ragLinkSyncClient = new RagLinkSyncClient(linkSyncFeign, meterRegistry); + ReflectionTestUtils.invokeMethod(ragLinkSyncClient, "initCounters"); + } + + private double callCount(String operation, String result) { + return meterRegistry.counter("ai.client.calls", + "client", "link-sync", "operation", operation, "result", result).count(); + } + @Test @DisplayName("CREATE 동기화 시 Feign Client의 syncUpdate를 호출한다") void shouldCallSyncUpdateOnCreate() { @@ -35,6 +51,7 @@ void shouldCallSyncUpdateOnCreate() { // then verify(linkSyncFeign, times(1)).syncUpdate(req); + assertThat(callCount("create", "success")).isEqualTo(1.0); } @Test @@ -48,6 +65,7 @@ void shouldCallSyncUpdateOnUpdate() { // then verify(linkSyncFeign, times(1)).syncUpdate(req); + assertThat(callCount("update", "success")).isEqualTo(1.0); } @Test @@ -65,5 +83,20 @@ void shouldCallSyncDeleteOnDelete() { LinkSyncDeleteReq capturedReq = captor.getValue(); assertThat(capturedReq.linkId()).isEqualTo(linkId); + assertThat(callCount("delete", "success")).isEqualTo(1.0); + } + + @Test + @DisplayName("동기화 실패 시 failure 카운터를 올리고 예외를 다시 던진다") + void shouldCountFailureAndRethrow_WhenFeignThrows() { + // given + LinkSyncUpdateReq req = mock(LinkSyncUpdateReq.class); + doThrow(new RuntimeException("AI Server Error")).when(linkSyncFeign).syncUpdate(req); + + // when & then : 예외를 삼키지 않고 그대로 던져야 (재시도/복구 로직 보존) + assertThatThrownBy(() -> ragLinkSyncClient.syncCreate(req)) + .isInstanceOf(RuntimeException.class); + + assertThat(callCount("create", "failure")).isEqualTo(1.0); } } diff --git a/src/test/java/com/sofa/linkiving/domain/link/ai/RagSummaryClientTest.java b/src/test/java/com/sofa/linkiving/domain/link/ai/RagSummaryClientTest.java index 71f9d22a..124ac3c4 100644 --- a/src/test/java/com/sofa/linkiving/domain/link/ai/RagSummaryClientTest.java +++ b/src/test/java/com/sofa/linkiving/domain/link/ai/RagSummaryClientTest.java @@ -7,28 +7,45 @@ import 
java.util.Collections; import java.util.List; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.test.util.ReflectionTestUtils; import com.sofa.linkiving.domain.link.dto.request.RagInitialSummaryReq; import com.sofa.linkiving.domain.link.dto.request.RagRegenerateSummaryReq; import com.sofa.linkiving.domain.link.dto.response.RagInitialSummaryRes; import com.sofa.linkiving.domain.link.dto.response.RagRegenerateSummaryRes; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; + @ExtendWith(MockitoExtension.class) @DisplayName("RagSummaryClient 단위 테스트") public class RagSummaryClientTest { - @InjectMocks private RagSummaryClient ragSummaryClient; @Mock private RagSummaryFeign ragSummaryFeign; + private SimpleMeterRegistry meterRegistry; + + @BeforeEach + void setUp() { + meterRegistry = new SimpleMeterRegistry(); + ragSummaryClient = new RagSummaryClient(ragSummaryFeign, meterRegistry); + // @PostConstruct 는 단위 테스트에서 자동 호출되지 않으므로 카운터를 수동 등록 + ReflectionTestUtils.invokeMethod(ragSummaryClient, "initCounters"); + } + + private double counterCount(String operation, String result) { + return meterRegistry.counter("ai.client.calls", + "client", "summary", "operation", operation, "result", result).count(); + } + @Test @DisplayName("최초 요약 요청 성공 시 응답 객체를 반환한다") void shouldReturnInitialSummaryResWhenSuccess() { @@ -53,6 +70,7 @@ void shouldReturnInitialSummaryResWhenSuccess() { assertThat(result).isEqualTo(expectedRes); verify(ragSummaryFeign, times(1)).requestInitialSummary(any(RagInitialSummaryReq.class)); + assertThat(counterCount("initial", "success")).isEqualTo(1.0); } @Test @@ -67,6 +85,7 @@ void shouldReturnNullWhenInitialSummaryResponseIsEmpty() { // then assertThat(result).isNull(); + 
assertThat(counterCount("initial", "empty")).isEqualTo(1.0); } @Test @@ -81,6 +100,7 @@ void shouldReturnNullWhenInitialSummaryThrowsException() { // then assertThat(result).isNull(); + assertThat(counterCount("initial", "failure")).isEqualTo(1.0); } @Test @@ -106,6 +126,7 @@ void shouldReturnRegenerateSummaryResWhenSuccess() { assertThat(result).isEqualTo(expectedRes); verify(ragSummaryFeign, times(1)).requestRegenerateSummary(any(RagRegenerateSummaryReq.class)); + assertThat(counterCount("regenerate", "success")).isEqualTo(1.0); } @Test @@ -120,5 +141,6 @@ void shouldReturnNullWhenRegenerateSummaryThrowsException() { // then assertThat(result).isNull(); + assertThat(counterCount("regenerate", "failure")).isEqualTo(1.0); } } From a7521d1972d42cc2d022b1d2377bc009ae71a785 Mon Sep 17 00:00:00 2001 From: Jansoon Date: Sun, 21 Jun 2026 20:53:46 +0900 Subject: [PATCH 2/5] =?UTF-8?q?feat:=20=EB=B9=84=EB=8F=99=EA=B8=B0=20?= =?UTF-8?q?=EC=9E=91=EC=97=85=20=EC=B5=9C=EC=A2=85=20=EC=8B=A4=ED=8C=A8=20?= =?UTF-8?q?=EB=A9=94=ED=8A=B8=EB=A6=AD=20=EC=B6=94=EA=B0=80=20(async=5Ftas?= =?UTF-8?q?k=5Ffailures)=20(#239)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 재시도 소진 후 @Recover/catch 지점(link-sync, summary-enqueue, summary-generate)에 최종 실패 카운터를 task별로 집계. 0으로 미리 등록. 
--- .../domain/link/event/LinkEventListener.java | 14 ++++++++++++ .../link/event/LinkSyncEventListener.java | 22 +++++++++++++++++++ .../domain/link/worker/SummaryWorker.java | 11 +++++++++- .../link/event/LinkEventListenerTest.java | 11 +++++++++- .../link/event/LinkSyncEventListenerTest.java | 20 +++++++++++------ .../domain/link/worker/SummaryWorkerTest.java | 7 +++++- 6 files changed, 75 insertions(+), 10 deletions(-) diff --git a/src/main/java/com/sofa/linkiving/domain/link/event/LinkEventListener.java b/src/main/java/com/sofa/linkiving/domain/link/event/LinkEventListener.java index a3fe630b..858dfb99 100644 --- a/src/main/java/com/sofa/linkiving/domain/link/event/LinkEventListener.java +++ b/src/main/java/com/sofa/linkiving/domain/link/event/LinkEventListener.java @@ -14,6 +14,9 @@ import com.sofa.linkiving.domain.link.facade.SummaryWorkerFacade; import com.sofa.linkiving.domain.link.worker.SummaryQueue; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -30,6 +33,15 @@ public class LinkEventListener { private final ApplicationEventPublisher eventPublisher; private final SummaryWorkerFacade summaryWorkerFacade; private final ObjectProvider selfProvider; + private final MeterRegistry meterRegistry; + private Counter enqueueFailureCounter; + + @PostConstruct + private void initCounters() { + this.enqueueFailureCounter = Counter.builder("async.task.failures") + .tag("task", "summary-enqueue") + .register(meterRegistry); + } /** * 트랜잭션 커밋 후 비동기로 큐 적재 실행 @@ -66,6 +78,8 @@ public void addToQueueWithRetry(LinkCreatedEvent event) { public void recover(Exception exception, LinkCreatedEvent event) { log.error("Final failure to queue link after retries - linkId: {}", event.linkId(), exception); + enqueueFailureCounter.increment(); + summaryWorkerFacade.updateSummaryStatus(event.linkId(), 
SummaryStatus.FAILED); eventPublisher.publishEvent(new SummaryStatusEvent( diff --git a/src/main/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListener.java b/src/main/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListener.java index 4a273b98..f4c56957 100644 --- a/src/main/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListener.java +++ b/src/main/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListener.java @@ -1,5 +1,8 @@ package com.sofa.linkiving.domain.link.event; +import java.util.EnumMap; +import java.util.Map; + import org.springframework.retry.annotation.Backoff; import org.springframework.retry.annotation.Recover; import org.springframework.retry.annotation.Retryable; @@ -9,7 +12,11 @@ import org.springframework.transaction.event.TransactionalEventListener; import com.sofa.linkiving.domain.link.ai.LinkSyncClient; +import com.sofa.linkiving.domain.link.enums.SyncAction; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -19,6 +26,19 @@ public class LinkSyncEventListener { private final LinkSyncClient linkSyncClient; + private final MeterRegistry meterRegistry; + + private final Map failureCounters = new EnumMap<>(SyncAction.class); + + @PostConstruct + private void initCounters() { + for (SyncAction action : SyncAction.values()) { + failureCounters.put(action, Counter.builder("async.task.failures") + .tag("task", "link-sync") + .tag("action", action.name()) + .register(meterRegistry)); + } + } @Async @Retryable( @@ -41,5 +61,7 @@ public void handleLinkSyncEvent(LinkSyncEvent event) { public void recover(Exception exception, LinkSyncEvent event) { log.error("[CRITICAL] AI 서버 동기화 최종 실패. 
수동 복구 필요 - action: {}, linkId: {}", event.action(), event.req().linkId(), exception); + + failureCounters.get(event.action()).increment(); } } diff --git a/src/main/java/com/sofa/linkiving/domain/link/worker/SummaryWorker.java b/src/main/java/com/sofa/linkiving/domain/link/worker/SummaryWorker.java index af9d2cb4..b878322b 100644 --- a/src/main/java/com/sofa/linkiving/domain/link/worker/SummaryWorker.java +++ b/src/main/java/com/sofa/linkiving/domain/link/worker/SummaryWorker.java @@ -20,6 +20,8 @@ import com.sofa.linkiving.domain.link.event.SummaryStatusEvent; import com.sofa.linkiving.domain.link.facade.SummaryWorkerFacade; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; import jakarta.annotation.PostConstruct; import jakarta.annotation.PreDestroy; import lombok.RequiredArgsConstructor; @@ -37,12 +39,17 @@ public class SummaryWorker { private final SummaryClient summaryClient; private final ApplicationEventPublisher eventPublisher; private final ObjectProvider selfProvider; - + private final MeterRegistry meterRegistry; + private Counter generateFailureCounter; private volatile boolean running = true; private Thread workerThread; @PostConstruct public void startWorker() { + this.generateFailureCounter = Counter.builder("async.task.failures") + .tag("task", "summary-generate") + .register(meterRegistry); + workerThread = new Thread(() -> { log.info("Summary worker thread started"); while (running) { @@ -111,6 +118,8 @@ private void processQueue() throws InterruptedException { } catch (Exception e) { log.error("Failed to generate summary for linkId: {}", linkId, e); + generateFailureCounter.increment(); + try { Link linkToFail = summaryWorkerFacade.getLinkWithMember(linkId); summaryWorkerFacade.updateSummaryStatus(linkToFail.getId(), SummaryStatus.FAILED); diff --git a/src/test/java/com/sofa/linkiving/domain/link/event/LinkEventListenerTest.java 
b/src/test/java/com/sofa/linkiving/domain/link/event/LinkEventListenerTest.java index 42a096b3..fd184a7b 100644 --- a/src/test/java/com/sofa/linkiving/domain/link/event/LinkEventListenerTest.java +++ b/src/test/java/com/sofa/linkiving/domain/link/event/LinkEventListenerTest.java @@ -25,6 +25,9 @@ import com.sofa.linkiving.domain.link.facade.SummaryWorkerFacade; import com.sofa.linkiving.domain.link.worker.SummaryQueue; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; + @ExtendWith(SpringExtension.class) @ContextConfiguration(classes = LinkEventListenerTest.RetryTestConfig.class) @DisplayName("LinkEventListener 재시도(Retry) 및 복구(Recover) 단위 테스트") @@ -182,12 +185,18 @@ public SummaryWorkerFacade summaryWorkerFacade() { return mock(SummaryWorkerFacade.class); } + @Bean + public MeterRegistry meterRegistry() { + return new SimpleMeterRegistry(); + } + @Bean public LinkEventListener linkEventListener(SummaryQueue summaryQueue, ApplicationEventPublisher eventPublisher, SummaryWorkerFacade summaryWorkerFacade, ObjectProvider selfProvider) { - return new LinkEventListener(summaryQueue, eventPublisher, summaryWorkerFacade, selfProvider); + return new LinkEventListener(summaryQueue, eventPublisher, summaryWorkerFacade, selfProvider, + meterRegistry()); } } } diff --git a/src/test/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListenerTest.java b/src/test/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListenerTest.java index 0d3abcd6..cd24bd56 100644 --- a/src/test/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListenerTest.java +++ b/src/test/java/com/sofa/linkiving/domain/link/event/LinkSyncEventListenerTest.java @@ -16,10 +16,13 @@ import org.springframework.test.context.ContextConfiguration; import org.springframework.test.context.junit.jupiter.SpringExtension; -import com.sofa.linkiving.domain.link.ai.RagLinkSyncClient; +import com.sofa.linkiving.domain.link.ai.LinkSyncClient; 
import com.sofa.linkiving.domain.link.dto.request.LinkSyncUpdateReq; import com.sofa.linkiving.domain.link.enums.SyncAction; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; + @ExtendWith(SpringExtension.class) @ContextConfiguration(classes = LinkSyncEventListenerTest.RetryTestConfig.class) @DisplayName("LinkSyncEventListener 재시도(Retry) 및 복구(Recover) 단위 테스트") @@ -27,9 +30,8 @@ class LinkSyncEventListenerTest { @Autowired private LinkSyncEventListener linkSyncEventListener; - @Autowired - private RagLinkSyncClient linkSyncClient; + private LinkSyncClient linkSyncClient; @BeforeEach void setUp() { @@ -123,15 +125,19 @@ void shouldNotThrowError_WhenSucceedsWithin3Times() { @EnableRetry @EnableAspectJAutoProxy(proxyTargetClass = true) static class RetryTestConfig { + @Bean + public LinkSyncClient linkSyncClient() { + return mock(LinkSyncClient.class); + } @Bean - public RagLinkSyncClient linkSyncClient() { - return mock(RagLinkSyncClient.class); + public MeterRegistry meterRegistry() { + return new SimpleMeterRegistry(); } @Bean - public LinkSyncEventListener linkSyncEventListener(RagLinkSyncClient linkSyncClient) { - return new LinkSyncEventListener(linkSyncClient); + public LinkSyncEventListener linkSyncEventListener(LinkSyncClient linkSyncClient, MeterRegistry meterRegistry) { + return new LinkSyncEventListener(linkSyncClient, meterRegistry); } } } diff --git a/src/test/java/com/sofa/linkiving/domain/link/worker/SummaryWorkerTest.java b/src/test/java/com/sofa/linkiving/domain/link/worker/SummaryWorkerTest.java index ea78f8cc..93a01b1b 100644 --- a/src/test/java/com/sofa/linkiving/domain/link/worker/SummaryWorkerTest.java +++ b/src/test/java/com/sofa/linkiving/domain/link/worker/SummaryWorkerTest.java @@ -29,6 +29,9 @@ import com.sofa.linkiving.domain.link.facade.SummaryWorkerFacade; import com.sofa.linkiving.domain.member.entity.Member; +import io.micrometer.core.instrument.MeterRegistry; +import 
io.micrometer.core.instrument.simple.SimpleMeterRegistry; + @ExtendWith(MockitoExtension.class) @DisplayName("SummaryWorker 단위 테스트") class SummaryWorkerTest { @@ -44,15 +47,17 @@ class SummaryWorkerTest { @Mock private ObjectProvider selfProvider; + private MeterRegistry meterRegistry; private SummaryWorker summaryWorker; private Link mockLink; private Member mockMember; @BeforeEach void setUp() { + meterRegistry = new SimpleMeterRegistry(); SummaryWorkerProperties properties = new SummaryWorkerProperties(Duration.ofMillis(10)); summaryWorker = new SummaryWorker(summaryQueue, properties, summaryWorkerFacade, summaryClient, eventPublisher, - selfProvider); + selfProvider, meterRegistry); mockLink = mock(Link.class); mockMember = mock(Member.class); From 1bbddc8d82d1ef7ea109aa1dedc2a00c8075cbc0 Mon Sep 17 00:00:00 2001 From: Jansoon Date: Sun, 21 Jun 2026 20:54:29 +0900 Subject: [PATCH 3/5] =?UTF-8?q?chore:=20actuator=20=EB=A9=94=ED=8A=B8?= =?UTF-8?q?=EB=A6=AD=C2=B7probe=20=EC=84=A4=EC=A0=95=20=EB=B3=B4=EA=B0=95?= =?UTF-8?q?=20(#239)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - http.server.requests 히스토그램(p95/p99) 활성화, - tomcat·hikaricp·jvm 메트릭 노출 - readiness/liveness probe 활성화. 
--- .../sofa/linkiving/security/auth/config/SecurityConstants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/sofa/linkiving/security/auth/config/SecurityConstants.java b/src/main/java/com/sofa/linkiving/security/auth/config/SecurityConstants.java index da1cfba3..bcf7b55d 100644 --- a/src/main/java/com/sofa/linkiving/security/auth/config/SecurityConstants.java +++ b/src/main/java/com/sofa/linkiving/security/auth/config/SecurityConstants.java @@ -6,7 +6,7 @@ public abstract class SecurityConstants { "/v3/api-docs/**", "/swagger-ui/**", "/swagger-resources", "/swagger-resources/**", /* actuator */ - "/actuator/health", + "/actuator/health/**", "/actuator/prometheus", /* health check */ From 8539c6fd435f771cf9475acf3eed0b24da8e4334 Mon Sep 17 00:00:00 2001 From: Jansoon Date: Sun, 21 Jun 2026 20:56:11 +0900 Subject: [PATCH 4/5] =?UTF-8?q?feat:=20AI/=EB=B9=84=EB=8F=99=EA=B8=B0=20?= =?UTF-8?q?=EC=A0=84=EC=9A=A9=20Grafana=20=EB=8C=80=EC=8B=9C=EB=B3=B4?= =?UTF-8?q?=EB=93=9C=20=EC=B6=94=EA=B0=80=20(#239)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ai_client_calls·async_task_failures 메트릭을 패널로 시각화 (AI 실패율·호출량·비동기 최종 실패). - 프로비저닝 폴더에 추가하여 자동 로드. 
--- .../linkiving-ai-async-overview.json | 349 ++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 docker/grafana/dashboards/linkiving-ai-async-overview.json diff --git a/docker/grafana/dashboards/linkiving-ai-async-overview.json b/docker/grafana/dashboards/linkiving-ai-async-overview.json new file mode 100644 index 00000000..6cd96d94 --- /dev/null +++ b/docker/grafana/dashboards/linkiving-ai-async-overview.json @@ -0,0 +1,349 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "ai", + "async", + "linkiving" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Linkiving AI / Async Overview", + "uid": "linkiving-ai-async-overview", + "version": 1, + "panels": [ + { + "id": 1, + "title": "AI 호출 실패율 (client별)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2 + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull", + "max" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (client) (rate(ai_client_calls_total{result=\"failure\"}[5m])) / clamp_min(sum by (client) (rate(ai_client_calls_total[5m])), 1e-9)", + "legendFormat": 
"{{client}}", + "refId": "A" + } + ] + }, + { + "id": 2, + "title": "AI 호출량 (client / result별, req/s)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "bars", + "fillOpacity": 60, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (client, result) (rate(ai_client_calls_total[5m]))", + "legendFormat": "{{client}} / {{result}}", + "refId": "A" + } + ] + }, + { + "id": 3, + "title": "AI 호출 결과 누적 (result별)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*failure.*" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*empty.*" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "orange" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (result) (ai_client_calls_total)", + "legendFormat": "{{result}}", + "refId": "A" + } + 
] + }, + { + "id": 4, + "title": "비동기 작업 최종 실패 (task별, 1h 증가)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "bars", + "fillOpacity": 70, + "lineWidth": 1 + }, + "color": { + "mode": "fixed", + "fixedColor": "red" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull", + "sum" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (task) (increase(async_task_failures_total[1h]))", + "legendFormat": "{{task}}", + "refId": "A" + } + ] + }, + { + "id": 5, + "title": "비동기 최종 실패 누적 (task / action별)", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 16 + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "colorMode": "background", + "graphMode": "none", + "textMode": "value_and_name", + "orientation": "horizontal" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (task, action) (async_task_failures_total)", + "legendFormat": "{{task}} {{action}}", + "refId": "A" + } + ] + } + ] +} From b331c627b11c81547611c0a070ac387c6cb7d290 Mon Sep 17 00:00:00 2001 From: Jansoon Date: Sun, 21 Jun 2026 20:57:03 +0900 Subject: [PATCH 5/5] =?UTF-8?q?chore:=20Prometheus=20=EC=95=8C=EB=A6=BC=20?= 
=?UTF-8?q?=EB=A3=B0=20=EB=B0=8F=20Alertmanager=20=EA=B5=AC=EC=84=B1=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80=20(#239)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 5xx 에러율·p95 지연·AI 실패율·비동기 최종 실패·인스턴스 다운 룰 정의. - alerting/ 폴더로 룰·alertmanager 설정 분리 --- .github/workflows/backend-cd.yml | 11 ++++++ .gitignore | 3 ++ deploy.sh | 4 +- docker/alerting/alert.rules.yml | 68 ++++++++++++++++++++++++++++++++ docker/alerting/alertmanager.yml | 29 ++++++++++++++ docker/docker-compose.yml | 23 +++++++++++ docker/prometheus.yml | 11 +++++- 7 files changed, 146 insertions(+), 3 deletions(-) create mode 100644 docker/alerting/alert.rules.yml create mode 100644 docker/alerting/alertmanager.yml diff --git a/.github/workflows/backend-cd.yml b/.github/workflows/backend-cd.yml index e8036729..37ab8573 100644 --- a/.github/workflows/backend-cd.yml +++ b/.github/workflows/backend-cd.yml @@ -84,6 +84,17 @@ jobs: echo "GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" >> ./docker/.env shell: bash + - name: 🗂️ Alertmanager 알림 채널 설정 + shell: bash + env: + SMTP_PASSWORD: ${{ secrets.SMTP_PASSWORD }} + DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} + run: | + printf '%s' "$SMTP_PASSWORD" > ./docker/alerting/smtp_password + chmod 600 ./docker/alerting/smtp_password + printf '%s' "$DISCORD_WEBHOOK_URL" > ./docker/alerting/discord_webhook + chmod 600 ./docker/alerting/discord_webhook + - name: ✨ 배포 스크립트 실행 run: | chmod +x deploy.sh diff --git a/.gitignore b/.gitignore index aac9bcd6..95629722 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,7 @@ out/ *.yml !docker/prometheus.yml !docker/grafana/**/*.yml +!docker/alerting/**/*.yml .editorconfig +docker/alerting/smtp_password +docker/alerting/discord_webhook diff --git a/deploy.sh b/deploy.sh index 12287fa5..d54125e6 100644 --- a/deploy.sh +++ b/deploy.sh @@ -34,8 +34,8 @@ echo "✅ 새로운 이미지가 성공적으로 pull되었습니다."
# Prometheus & Grafana 실행 (설정 변경 시 자동 반영) echo "모니터링 서비스 시작 중..." -${COMPOSE} up -d prometheus grafana -echo "✅ Prometheus & Grafana가 시작되었습니다." +${COMPOSE} up -d prometheus grafana alertmanager +echo "✅ Prometheus & Grafana & Alertmanager가 시작되었습니다." echo "사용하지 않는 이미지 정리 중..." sudo docker image prune -f diff --git a/docker/alerting/alert.rules.yml b/docker/alerting/alert.rules.yml new file mode 100644 index 00000000..314505f4 --- /dev/null +++ b/docker/alerting/alert.rules.yml @@ -0,0 +1,68 @@ +groups: + - name: linkiving-core-alerts + rules: + + # ── 가용성 ───────────────────────────────────────────── + # 인스턴스가 스크레이프되지 않음 (앱 다운/네트워크 단절) + - alert: InstanceDown + expr: up{job="linkiving-core"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "인스턴스 다운: {{ $labels.instance }}" + description: "{{ $labels.instance }} 가 1분 이상 스크레이프되지 않습니다." + + # ── HTTP 5xx 에러율 ──────────────────────────────────── + # 전체 요청 중 5xx 비율이 5분간 5% 초과 + - alert: HighServerErrorRate + expr: | + sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) + / sum(rate(http_server_requests_seconds_count[5m])) + > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: "서버 5xx 에러율 높음" + description: "최근 5분간 5xx 에러율이 5%를 초과했습니다 (현재 {{ $value | humanizePercentage }})." + + # ── 응답 지연 (p95) ──────────────────────────────────── + # p95 응답시간이 1초 초과 + - alert: HighLatencyP95 + expr: | + histogram_quantile( + 0.95, + sum by (le) (rate(http_server_requests_seconds_bucket[5m])) + ) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "응답 지연(p95) 높음" + description: "p95 응답시간이 1초를 초과했습니다 (현재 {{ $value }}s)." 
+ + # ── 외부 AI 호출 실패율 ──────────────────────────────── + # AI 호출 중 failure 비율이 5분간 20% 초과 (클라이언트별) + - alert: HighAiCallFailureRate + expr: | + sum by (client) (rate(ai_client_calls_total{result="failure"}[5m])) + / sum by (client) (rate(ai_client_calls_total[5m])) + > 0.2 + for: 5m + labels: + severity: warning + annotations: + summary: "AI 호출 실패율 높음: {{ $labels.client }}" + description: "AI 클라이언트 '{{ $labels.client }}' 실패율이 5분간 20%를 초과했습니다 (현재 {{ $value | humanizePercentage }})." + + # ── 비동기 작업 최종 실패 ────────────────────────────── + # 재시도 소진 후 최종 실패가 발생하면 즉시 알림 (수동 복구 필요 신호) + - alert: AsyncTaskFinalFailure + expr: increase(async_task_failures_total[10m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "비동기 작업 최종 실패: {{ $labels.task }}" + description: "task '{{ $labels.task }}'{{ if $labels.action }} (action {{ $labels.action }}){{ end }} 에서 재시도 소진 후 최종 실패가 발생했습니다. 수동 복구가 필요할 수 있습니다." diff --git a/docker/alerting/alertmanager.yml b/docker/alerting/alertmanager.yml new file mode 100644 index 00000000..867dc317 --- /dev/null +++ b/docker/alerting/alertmanager.yml @@ -0,0 +1,29 @@ +# Alertmanager 설정 + +global: + resolve_timeout: 5m + +route: + receiver: 'all' + group_by: [ 'alertname' ] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + +receivers: + - name: 'all' + # ── 이메일 ── + email_configs: + - to: 'linkivingsofa@gmail.com' + from: 'linkivingsofa@gmail.com' + smarthost: 'smtp.gmail.com:587' + auth_username: 'linkivingsofa@gmail.com' + auth_password_file: '/etc/alertmanager/smtp_password' + require_tls: true + send_resolved: true + # ── 디스코드 ── + discord_configs: + - webhook_url_file: '/etc/alertmanager/discord_webhook' + send_resolved: true + title: '{{ .Status }}: {{ .CommonLabels.alertname }}' + message: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 0a6c4e82..93c1fbbb 100644 --- a/docker/docker-compose.yml 
+++ b/docker/docker-compose.yml @@ -55,6 +55,7 @@ services: container_name: prometheus volumes: - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./alerting/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro - prometheus_data:/prometheus command: - '--config.file=/etc/prometheus/prometheus.yml' @@ -102,6 +103,28 @@ services: max-size: "10m" max-file: "3" + alertmanager: + image: prom/alertmanager:v0.28.0 + container_name: alertmanager + volumes: + - ./alerting/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + - ./alerting/smtp_password:/etc/alertmanager/smtp_password:ro + - ./alerting/discord_webhook:/etc/alertmanager/discord_webhook:ro + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + restart: unless-stopped + networks: + - app-network + deploy: + resources: + limits: + memory: 128M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + volumes: prometheus_data: grafana_data: diff --git a/docker/prometheus.yml b/docker/prometheus.yml index 86d2b091..f617b2b8 100644 --- a/docker/prometheus.yml +++ b/docker/prometheus.yml @@ -2,6 +2,15 @@ global: scrape_interval: 30s evaluation_interval: 30s +rule_files: + - '/etc/prometheus/alert.rules.yml' + +alerting: + alertmanagers: + - static_configs: + - targets: + - 'alertmanager:9093' + scrape_configs: - job_name: 'linkiving-core' metrics_path: '/actuator/prometheus' @@ -10,4 +19,4 @@ scrape_configs: - 'blue:8080' - 'green:8080' labels: - application: 'linkiving-core' \ No newline at end of file + application: 'linkiving-core'