diff --git a/backend/fraud-service/internal/fraud/checker.go b/backend/fraud-service/internal/fraud/checker.go
index ff40f592..0b86d91a 100644
--- a/backend/fraud-service/internal/fraud/checker.go
+++ b/backend/fraud-service/internal/fraud/checker.go
@@ -17,7 +17,32 @@ type Checker struct {
 func (c *Checker) CheckTransaction(ctx context.Context, req *fraudv1.TransactionRequest) (*fraudv1.FraudCheckResponse, error) {
 	start := time.Now()
 	resp := evaluate(req)
-	fanOutExternalChecks(ctx, req)
+	results := fanOutExternalChecks(ctx, req)
+
+	allFailed := true
+	for _, r := range results {
+		if r.OK() {
+			allFailed = false
+			break
+		}
+	}
+
+	// Medium-risk transactions that pass rule-based scoring still need
+	// external provider verification. Reject them when all providers are down.
+	if allFailed && resp.GetApproved() && resp.GetRiskScore() > 0.3 {
+		// Apply the 0.3 penalty but cap the result at 1.0: the risk-score
+		// histogram is documented and bucketed for the 0.0-1.0 range only.
+		risk := resp.GetRiskScore() + 0.3
+		if risk > 1.0 {
+			risk = 1.0
+		}
+		resp = &fraudv1.FraudCheckResponse{
+			Approved:  false,
+			RiskScore: risk,
+			Reason:    "external-verification-unavailable",
+		}
+	}
+
 	metrics.Observe(resp.GetApproved(), resp.GetReason(), time.Since(start), float64(resp.GetRiskScore()))
 	return resp, nil
 }
diff --git a/backend/fraud-service/internal/fraud/external.go b/backend/fraud-service/internal/fraud/external.go
index 620f554b..bedc85da 100644
--- a/backend/fraud-service/internal/fraud/external.go
+++ b/backend/fraud-service/internal/fraud/external.go
@@ -14,6 +14,7 @@ import (
 	"time"
 
 	fraudv1 "github.com/speedscale/microsvc/fraud-service/gen/fraud/v1"
+	"github.com/speedscale/microsvc/fraud-service/internal/metrics"
 )
 
 func envOrDefault(key, def string) string {
@@ -25,32 +26,44 @@ func envOrDefault(key, def string) string {
 
 var externalClient = &http.Client{Timeout: 5 * time.Second}
 
-func fanOutExternalChecks(ctx context.Context, req *fraudv1.TransactionRequest) {
+type ExternalResult struct {
+	Provider string
+	Status   int
+	Err      error
+}
+
+func (r ExternalResult) OK() bool {
+	return r.Err == nil && r.Status >= 200 && r.Status < 300
+}
+
+func 
fanOutExternalChecks(ctx context.Context, req *fraudv1.TransactionRequest) []ExternalResult { ctx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() + results := make([]ExternalResult, 3) var wg sync.WaitGroup wg.Add(3) go func() { defer wg.Done() - callStripeRadar(ctx, req) + results[0] = callStripeRadar(ctx, req) }() go func() { defer wg.Done() - callSiftScience(ctx, req) + results[1] = callSiftScience(ctx, req) }() go func() { defer wg.Done() - callMaxMind(ctx, req) + results[2] = callMaxMind(ctx, req) }() wg.Wait() + return results } -func callStripeRadar(ctx context.Context, req *fraudv1.TransactionRequest) { +func callStripeRadar(ctx context.Context, req *fraudv1.TransactionRequest) ExternalResult { apiKey := envOrDefault("STRIPE_API_KEY", "sk_test_fake_key_for_demo") form := url.Values{} @@ -62,36 +75,41 @@ func callStripeRadar(ctx context.Context, req *fraudv1.TransactionRequest) { strings.NewReader(form.Encode())) if err != nil { log.Printf("stripe: build request: %v", err) - return + return ExternalResult{Provider: "stripe", Err: err} } httpReq.Header.Set("Authorization", "Bearer "+apiKey) httpReq.Header.Set("Content-Type", "application/x-www-form-urlencoded") + start := time.Now() resp, err := externalClient.Do(httpReq) + dur := time.Since(start) if err != nil { log.Printf("stripe: %v", err) - return + metrics.ObserveExternal("stripe", 0, dur) + return ExternalResult{Provider: "stripe", Err: err} } resp.Body.Close() log.Printf("stripe: status %d", resp.StatusCode) + metrics.ObserveExternal("stripe", resp.StatusCode, dur) + return ExternalResult{Provider: "stripe", Status: resp.StatusCode} } -func callSiftScience(ctx context.Context, req *fraudv1.TransactionRequest) { +func callSiftScience(ctx context.Context, req *fraudv1.TransactionRequest) ExternalResult { apiKey := envOrDefault("SIFT_API_KEY", "fake_sift_key_for_demo") body := map[string]interface{}{ - "$api_key": apiKey, - "$type": "$transaction", - "$amount": int64(req.GetAmount() * 
1e6), - "$user_id": req.GetUserId(), - "$currency_code": "USD", - "$transaction_id": fmt.Sprintf("%s-%d", req.GetAccountId(), time.Now().UnixMilli()), + "$api_key": apiKey, + "$type": "$transaction", + "$amount": int64(req.GetAmount() * 1e6), + "$user_id": req.GetUserId(), + "$currency_code": "USD", + "$transaction_id": fmt.Sprintf("%s-%d", req.GetAccountId(), time.Now().UnixMilli()), } payload, err := json.Marshal(body) if err != nil { log.Printf("sift: marshal: %v", err) - return + return ExternalResult{Provider: "sift", Err: err} } httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, @@ -99,20 +117,25 @@ func callSiftScience(ctx context.Context, req *fraudv1.TransactionRequest) { bytes.NewReader(payload)) if err != nil { log.Printf("sift: build request: %v", err) - return + return ExternalResult{Provider: "sift", Err: err} } httpReq.Header.Set("Content-Type", "application/json") + start := time.Now() resp, err := externalClient.Do(httpReq) + dur := time.Since(start) if err != nil { log.Printf("sift: %v", err) - return + metrics.ObserveExternal("sift", 0, dur) + return ExternalResult{Provider: "sift", Err: err} } resp.Body.Close() log.Printf("sift: status %d", resp.StatusCode) + metrics.ObserveExternal("sift", resp.StatusCode, dur) + return ExternalResult{Provider: "sift", Status: resp.StatusCode} } -func callMaxMind(ctx context.Context, req *fraudv1.TransactionRequest) { +func callMaxMind(ctx context.Context, req *fraudv1.TransactionRequest) ExternalResult { accountID := envOrDefault("MAXMIND_ACCOUNT_ID", "000000") licenseKey := envOrDefault("MAXMIND_LICENSE_KEY", "fake_maxmind_key_for_demo") @@ -128,7 +151,7 @@ func callMaxMind(ctx context.Context, req *fraudv1.TransactionRequest) { payload, err := json.Marshal(body) if err != nil { log.Printf("maxmind: marshal: %v", err) - return + return ExternalResult{Provider: "maxmind", Err: err} } httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, @@ -136,16 +159,21 @@ func callMaxMind(ctx 
context.Context, req *fraudv1.TransactionRequest) { bytes.NewReader(payload)) if err != nil { log.Printf("maxmind: build request: %v", err) - return + return ExternalResult{Provider: "maxmind", Err: err} } httpReq.Header.Set("Content-Type", "application/json") httpReq.SetBasicAuth(accountID, licenseKey) + start := time.Now() resp, err := externalClient.Do(httpReq) + dur := time.Since(start) if err != nil { log.Printf("maxmind: %v", err) - return + metrics.ObserveExternal("maxmind", 0, dur) + return ExternalResult{Provider: "maxmind", Err: err} } resp.Body.Close() log.Printf("maxmind: status %d", resp.StatusCode) + metrics.ObserveExternal("maxmind", resp.StatusCode, dur) + return ExternalResult{Provider: "maxmind", Status: resp.StatusCode} } diff --git a/backend/fraud-service/internal/metrics/metrics.go b/backend/fraud-service/internal/metrics/metrics.go index 29c17e30..724d54d3 100644 --- a/backend/fraud-service/internal/metrics/metrics.go +++ b/backend/fraud-service/internal/metrics/metrics.go @@ -28,6 +28,17 @@ var ( Help: "Distribution of computed risk scores (0.0-1.0).", Buckets: []float64{0, 0.1, 0.25, 0.5, 0.7, 0.9, 1.0}, }) + + ExternalRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "fraud_external_requests_total", + Help: "Outbound fraud-check API calls by provider and HTTP status.", + }, []string{"provider", "status"}) + + ExternalRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "fraud_external_request_duration_seconds", + Help: "Latency of outbound fraud-check API calls.", + Buckets: prometheus.DefBuckets, + }, []string{"provider"}) ) // Observe records the outcome of a single fraud check. @@ -36,3 +47,9 @@ func Observe(approved bool, reason string, dur time.Duration, risk float64) { CheckDuration.Observe(dur.Seconds()) RiskScore.Observe(risk) } + +// ObserveExternal records the outcome of an outbound API call. 
+func ObserveExternal(provider string, status int, dur time.Duration) {
+	ExternalRequestsTotal.WithLabelValues(provider, strconv.Itoa(status)).Inc()
+	ExternalRequestDuration.WithLabelValues(provider).Observe(dur.Seconds())
+}
diff --git a/backend/transactions-service/src/main/java/com/banking/transactionsservice/config/ErrorSpikeFilter.java b/backend/transactions-service/src/main/java/com/banking/transactionsservice/config/ErrorSpikeFilter.java
new file mode 100644
index 00000000..56e98993
--- /dev/null
+++ b/backend/transactions-service/src/main/java/com/banking/transactionsservice/config/ErrorSpikeFilter.java
@@ -0,0 +1,80 @@
+package com.banking.transactionsservice.config;
+
+import jakarta.servlet.FilterChain;
+import jakarta.servlet.ServletException;
+import jakarta.servlet.http.HttpServletRequest;
+import jakarta.servlet.http.HttpServletResponse;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.core.Ordered;
+import org.springframework.core.annotation.Order;
+import org.springframework.stereotype.Component;
+import org.springframework.web.filter.OncePerRequestFilter;
+
+import java.io.IOException;
+import java.time.LocalDateTime;
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * Fault-injection filter that answers a random share of requests with 503 during
+ * configured minute windows of each hour; all other requests pass through untouched.
+ *
+ * <p>NOTE(review): {@code matchIfMissing = true} keeps the filter active unless
+ * {@code error-spike.enabled} is explicitly set to a value other than {@code true}.
+ */
+@Component
+@Order(Ordered.HIGHEST_PRECEDENCE + 10)
+@ConditionalOnProperty(name = "error-spike.enabled", havingValue = "true", matchIfMissing = true)
+public class ErrorSpikeFilter extends OncePerRequestFilter {
+
+    /** Probability that a request arriving inside a spike window is rejected. */
+    @Value("${error-spike.probability:0.30}")
+    private double probability;
+
+    /** Comma-separated minutes of the hour at which a spike window opens. */
+    @Value("${error-spike.minute-marks:10,40}")
+    private String minuteMarks;
+
+    /** Length of every spike window, in minutes. */
+    @Value("${error-spike.duration-minutes:2}")
+    private int durationMinutes;
+
+    /** Sends 503 when inside a spike window and the random draw is below {@code probability}. */
+    @Override
+    protected void doFilterInternal(HttpServletRequest request, HttpServletResponse response,
+                                    FilterChain chain) throws ServletException, IOException {
+        if (isSpikeWindow() && ThreadLocalRandom.current().nextDouble() < probability) {
+            response.sendError(HttpServletResponse.SC_SERVICE_UNAVAILABLE, "service temporarily degraded");
+            return;
+        }
+        chain.doFilter(request, response);
+    }
+
+    /** Exempts request URIs that start with {@code /actuator} from fault injection. */
+    @Override
+    protected boolean shouldNotFilter(HttpServletRequest request) {
+        String path = request.getRequestURI();
+        return path.startsWith("/actuator");
+    }
+
+    /**
+     * Reports whether the current local minute falls inside any configured window.
+     * Windows wrap across the top of the hour, and blank entries in the mark list are ignored.
+     */
+    private boolean isSpikeWindow() {
+        int minute = LocalDateTime.now().getMinute();
+        for (String mark : minuteMarks.split(",")) {
+            String token = mark.trim();
+            if (token.isEmpty()) {
+                // An empty property or stray comma must not throw NumberFormatException on every request.
+                continue;
+            }
+            int m = Integer.parseInt(token);
+            // Minutes elapsed since the mark, modulo 60, so 59 + 2 still covers minute 0.
+            if (Math.floorMod(minute - m, 60) < durationMinutes) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/kubernetes/observability/dashboards/banking-app-errors.json b/kubernetes/observability/dashboards/banking-app-errors.json
index 07d25312..3b8350b0 100644
--- a/kubernetes/observability/dashboards/banking-app-errors.json
+++ b/kubernetes/observability/dashboards/banking-app-errors.json
@@ -197,11 +197,57 @@
       },
       "options": {"showHeader": true, "cellHeight": "sm", "footer": {"show": false}}
     },
+    {
+      "type": "timeseries",
+      "title": "Third-party API Health (fraud-service)",
+      "description": "Outbound calls from fraud-service to Stripe, Sift, and MaxMind. 
Non-2xx responses indicate provider issues or bad credentials.", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 21}, + "datasource": {"type": "prometheus", "uid": "prometheus-app"}, + "targets": [ + {"expr": "sum by (provider, status) (rate(fraud_external_requests_total{status!~\"2..\"}[1m]))", "legendFormat": "{{provider}} {{status}}", "refId": "A"} + ], + "fieldConfig": { + "defaults": { + "custom": {"drawStyle": "bars", "fillOpacity": 80, "lineWidth": 0, "stacking": {"mode": "normal"}}, + "unit": "reqps", + "decimals": 2 + }, + "overrides": [ + {"matcher": {"id": "byRegexp", "options": "stripe"}, "properties": [{"id": "color", "value": {"fixedColor": "purple", "mode": "fixed"}}]}, + {"matcher": {"id": "byRegexp", "options": "sift"}, "properties": [{"id": "color", "value": {"fixedColor": "orange", "mode": "fixed"}}]}, + {"matcher": {"id": "byRegexp", "options": "maxmind"}, "properties": [{"id": "color", "value": {"fixedColor": "red", "mode": "fixed"}}]} + ] + }, + "options": {"tooltip": {"mode": "multi"}} + }, + { + "type": "stat", + "title": "Fraud Check Rejections", + "description": "Transactions rejected by fraud-service, by reason", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 21}, + "datasource": {"type": "prometheus", "uid": "prometheus-app"}, + "targets": [ + {"expr": "sum by (reason) (increase(fraud_checks_total{approved=\"false\"}[$__range]))", "legendFormat": "{{reason}}", "refId": "A"} + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "decimals": 0, + "thresholds": {"mode": "absolute", "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 10}, + {"color": "red", "value": 50} + ]}, + "color": {"mode": "thresholds"} + } + }, + "options": {"graphMode": "area", "textMode": "auto", "reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value"} + }, { "type": "table", "title": "Recent Traces", "description": "Click a trace ID to view spans in Jaeger", - "gridPos": {"h": 10, "w": 12, "x": 0, "y": 21}, + "gridPos": {"h": 
10, "w": 12, "x": 0, "y": 29}, "datasource": {"type": "jaeger", "uid": "jaeger"}, "targets": [ {"refId": "A", "datasource": {"type": "jaeger", "uid": "jaeger"}, "queryType": "search", "service": "api-gateway", "operation": "", "tags": "", "minDuration": "", "maxDuration": "", "limit": 20} @@ -221,11 +267,31 @@ ] } }, + { + "type": "logs", + "title": "Application Logs — Errors", + "description": "ERROR and WARN log lines from banking-app services (via Promtail → Loki)", + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 29}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + {"expr": "{namespace=\"banking-app\"} |~ \"(?i)(ERROR|WARN|Exception|panic|fatal)\"", "refId": "A"} + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": false, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, { "type": "logs", "title": "eBPF Traffic — Error Requests", "description": "Speedscale eBPF-captured requests returning errors. 
Full payload inspection requires Speedscale / proxymock.", - "gridPos": {"h": 10, "w": 12, "x": 12, "y": 21}, + "gridPos": {"h": 10, "w": 12, "x": 0, "y": 39}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ {"expr": "{exporter=\"OTLP\"} | json | body_l7protocol = \"http\" | body_status >= 400 | line_format \"{{.body_service}} {{.body_command}} {{.body_location}} → {{.body_status}} ({{.body_duration}}ms)\"", "refId": "A"} diff --git a/kubernetes/observability/kustomization.yaml b/kubernetes/observability/kustomization.yaml index 2463b21b..6b98b732 100644 --- a/kubernetes/observability/kustomization.yaml +++ b/kubernetes/observability/kustomization.yaml @@ -13,6 +13,7 @@ resources: - otel-collector.yaml - jaeger-deployment.yaml - loki.yaml + - promtail.yaml - grafana.yaml generatorOptions: diff --git a/kubernetes/observability/promtail.yaml b/kubernetes/observability/promtail.yaml new file mode 100644 index 00000000..8e00149a --- /dev/null +++ b/kubernetes/observability/promtail.yaml @@ -0,0 +1,124 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: promtail-config + namespace: observability +data: + promtail.yaml: | + server: + http_listen_port: 3101 + grpc_listen_port: 0 + + positions: + filename: /tmp/positions.yaml + + clients: + - url: http://loki:3100/loki/api/v1/push + + scrape_configs: + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only collect from banking-app namespace + - source_labels: [__meta_kubernetes_namespace] + regex: banking-app + action: keep + # Drop simulation-client logs (too noisy) + - source_labels: [__meta_kubernetes_pod_label_app] + regex: simulation-client + action: drop + # Standard k8s metadata labels + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: app + - source_labels: [__meta_kubernetes_container_name] + 
target_label: container
+          # Tail this container's log files; $1 is the joined "<pod uid>/<container>" value
+          - source_labels:
+              - __meta_kubernetes_pod_uid
+              - __meta_kubernetes_container_name
+            target_label: __path__
+            separator: /
+            replacement: /var/log/pods/*$1/*.log
+        pipeline_stages:
+          - cri: {}
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: promtail
+  namespace: observability
+  labels:
+    app: promtail
+spec:
+  selector:
+    matchLabels:
+      app: promtail
+  template:
+    metadata:
+      labels:
+        app: promtail
+    spec:
+      serviceAccountName: promtail
+      containers:
+        - name: promtail
+          image: grafana/promtail:2.9.8
+          args:
+            - -config.file=/etc/promtail/promtail.yaml
+          ports:
+            - containerPort: 3101
+              name: http
+          volumeMounts:
+            - name: config
+              mountPath: /etc/promtail
+            - name: pods
+              mountPath: /var/log/pods
+              readOnly: true
+          resources:
+            requests:
+              cpu: 50m
+              memory: 64Mi
+            limits:
+              memory: 128Mi
+      tolerations:
+        - effect: NoSchedule
+          operator: Exists
+      volumes:
+        - name: config
+          configMap:
+            name: promtail-config
+        - name: pods
+          hostPath:
+            path: /var/log/pods
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: promtail
+  namespace: observability
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: promtail
+rules:
+  - apiGroups: [""]
+    resources: [nodes, nodes/proxy, services, endpoints, pods]
+    verbs: [get, watch, list]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: promtail
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: promtail
+subjects:
+  - kind: ServiceAccount
+    name: promtail
+    namespace: observability