|
10 | 10 |
|
11 | 11 | import sentry_sdk |
12 | 12 | from celery.utils.log import get_task_logger |
| 13 | +from config.django.base import DJANGO_FINDINGS_BATCH_SIZE |
13 | 14 | from config.env import env |
14 | 15 | from config.settings.celery import CELERY_DEADLOCK_ATTEMPTS |
15 | 16 | from django.db import IntegrityError, OperationalError |
16 | | -from django.db.models import Case, Count, IntegerField, Max, Min, Prefetch, Q, Sum, When |
| 17 | +from django.db.models import ( |
| 18 | + Case, |
| 19 | + Count, |
| 20 | + Exists, |
| 21 | + IntegerField, |
| 22 | + Max, |
| 23 | + Min, |
| 24 | + OuterRef, |
| 25 | + Prefetch, |
| 26 | + Q, |
| 27 | + Sum, |
| 28 | + When, |
| 29 | +) |
17 | 30 | from django.utils import timezone as django_timezone |
18 | 31 | from tasks.jobs.queries import ( |
19 | 32 | COMPLIANCE_UPSERT_PROVIDER_SCORE_SQL, |
20 | 33 | COMPLIANCE_UPSERT_TENANT_SUMMARY_SQL, |
21 | 34 | ) |
22 | | -from tasks.utils import CustomEncoder |
| 35 | +from tasks.utils import CustomEncoder, batched |
23 | 36 |
|
24 | 37 | from api.compliance import PROWLER_COMPLIANCE_OVERVIEW_TEMPLATE |
25 | 38 | from api.constants import SEVERITY_ORDER |
@@ -2069,3 +2082,169 @@ def aggregate_finding_group_summaries(tenant_id: str, scan_id: str): |
2069 | 2082 | "created": created_count, |
2070 | 2083 | "updated": updated_count, |
2071 | 2084 | } |
| 2085 | + |
| 2086 | + |
def reset_ephemeral_resource_findings_count(tenant_id: str, scan_id: str) -> dict:
    """Zero ``failed_findings_count`` on resources a completed full-scope scan never saw.

    Resources that exist in the database for the scan's provider but were not
    touched by this scan are treated as ephemeral. Their historical findings
    are preserved; only the denormalized counter that drives the Resources
    page sort is reset, so stale resources stop ranking at the top.

    Args:
        tenant_id: Tenant that owns the scan; scopes every RLS transaction.
        scan_id: The scan to evaluate.

    Returns:
        dict: ``{"status": "skipped", "reason": ...}`` when a guard fires, or
        ``{"status": "completed", "scan_id", "provider_id", "reset"}`` with the
        number of resources zeroed.

    Guards (each returns a "skipped" result):
        * The scan does not exist or is not in COMPLETED state.
        * The scan ran with any scoping filter in scanner_args (partial scope).
        * A newer completed scan exists for the provider.
        * The scan reports resources but persisted no ResourceScanSummary rows.

    Query design (must scale to 500k+ resources per provider):
        Phase 1 collects ephemeral IDs with one anti-join read. The outer
        filter ``(tenant_id, provider_id, failed_findings_count > 0)`` uses
        ``resources_tenant_provider_idx``; the correlated ``NOT EXISTS``
        subquery hits the implicit unique index
        ``(tenant_id, scan_id, resource_id)`` on ``ResourceScanSummary``.
        ``NOT EXISTS`` (vs ``NOT IN``) is null-safe and lets the planner
        choose a hash anti-join or an indexed nested-loop anti-join, and
        ``.iterator(chunk_size=...)`` bypasses the queryset cache so memory
        stays bounded while streaming UUIDs.

        Phase 2 UPDATEs in fixed-size batches keyed on the primary key. A
        single large UPDATE would hold row-exclusive locks for seconds and
        create a WAL spike; ~1k-row batches keep each lock window around
        50ms, bound WAL chunks, and let other writers proceed in between.
        Repeating ``failed_findings_count__gt=0`` in the UPDATE keeps it
        idempotent under concurrent scans and skips no-op row rewrites.

        All reads deliberately hit the primary DB, not the replica:
        ``ResourceScanSummary`` rows were written by the same scan chain that
        triggered this task, so replica lag could falsely classify scanned
        resources as ephemeral.

    Scope detection (``Scan.is_full_scope()``) derives the set of scoping
    scanner_args from ``prowler.lib.scan.scan.Scan.__init__`` via
    introspection, so the API can never drift from the SDK's filter contract.
    Imported scans are likewise rejected by trigger — they may only cover a
    partial slice of resources.
    """
    with rls_transaction(tenant_id):
        scan = Scan.objects.filter(tenant_id=tenant_id, id=scan_id).first()

        # Guard: unknown scan id.
        if scan is None:
            logger.warning(f"Scan {scan_id} not found")
            return {"status": "skipped", "reason": "scan not found"}

        # Guard: only act on finished scans — anything else has an
        # incomplete ResourceScanSummary set.
        if scan.state != StateChoices.COMPLETED:
            logger.info(f"Scan {scan_id} not completed; skipping ephemeral reset")
            return {"status": "skipped", "reason": "scan not completed"}

        # Guard: a scoped scan only visits a subset of resources, so absence
        # from its summaries proves nothing.
        if not scan.is_full_scope():
            logger.info(
                f"Scan {scan_id} ran with scoping filters; skipping ephemeral reset"
            )
            return {"status": "skipped", "reason": "partial scan scope"}

    # Race protection: if a newer completed scan exists for this provider, our
    # ResourceScanSummary set is stale relative to the resources' current
    # failed_findings_count values (the newer scan already refreshed them).
    # Wiping based on the older scan would zero counts the newer scan just
    # set. Skip and let the newer scan's reset task do the work; if this task
    # sat in the queue, that is the correct outcome.
    # `completed_at__isnull=False` matters: Postgres sorts NULL first under
    # DESC, so a sibling COMPLETED scan with a missing completed_at would
    # otherwise rank as "newest" and wrongly make us skip.
    with rls_transaction(tenant_id):
        newest_completed_scan_id = (
            Scan.objects.filter(
                tenant_id=tenant_id,
                provider_id=scan.provider_id,
                state=StateChoices.COMPLETED,
                completed_at__isnull=False,
            )
            .order_by("-completed_at", "-inserted_at")
            .values_list("id", flat=True)
            .first()
        )
        if scan.id != newest_completed_scan_id:
            logger.info(
                f"Scan {scan_id} is not the latest completed scan for provider "
                f"{scan.provider_id}; skipping ephemeral reset"
            )
            return {"status": "skipped", "reason": "newer scan exists"}

    # Defensive gate: ResourceScanSummary rows come from perform_prowler_scan
    # via best-effort bulk_create. If those writes silently failed while the
    # scan still reported resources, the anti-join below would flag EVERY
    # resource for this provider as ephemeral and zero its count. Bail loudly
    # instead of wiping valid data.
    with rls_transaction(tenant_id):
        summary_rows_exist = ResourceScanSummary.objects.filter(
            tenant_id=tenant_id, scan_id=scan_id
        ).exists()
        if scan.unique_resource_count > 0 and not summary_rows_exist:
            logger.error(
                f"Scan {scan_id} reports {scan.unique_resource_count} unique "
                f"resources but no ResourceScanSummary rows are persisted; "
                f"skipping ephemeral reset to avoid wiping valid counts"
            )
            return {"status": "skipped", "reason": "summaries missing"}

    # Phase 1: stream the ephemeral resource IDs off the primary DB (replica
    # lag could return a partial summary set and misclassify live resources —
    # same rationale as update_provider_compliance_scores in this module).
    # Materializing the ID list, instead of feeding the iterator straight into
    # the batched UPDATEs, is intentional: each UPDATE then runs in its own
    # short rls_transaction rather than one long transaction accumulating row
    # locks. At 500k UUIDs peak memory is ~40 MB — fine for a Celery worker
    # and a better trade-off than a multi-second write-lock window blocking
    # concurrent scans.
    with rls_transaction(tenant_id):
        scanned_this_run = ResourceScanSummary.objects.filter(
            tenant_id=tenant_id,
            scan_id=scan_id,
            resource_id=OuterRef("pk"),
        )
        candidates = Resource.objects.filter(
            tenant_id=tenant_id,
            provider_id=scan.provider_id,
            failed_findings_count__gt=0,
        ).filter(~Exists(scanned_this_run))
        stale_ids = list(
            candidates.values_list("id", flat=True).iterator(
                chunk_size=DJANGO_FINDINGS_BATCH_SIZE
            )
        )

    if not stale_ids:
        logger.info(f"No ephemeral resources for scan {scan_id}")
        return {
            "status": "completed",
            "scan_id": str(scan_id),
            "provider_id": str(scan.provider_id),
            "reset": 0,
        }

    # Phase 2: zero the counters in primary-key batches, one short
    # transaction per batch.
    reset_total = 0
    for chunk, _ in batched(stale_ids, DJANGO_FINDINGS_BATCH_SIZE):
        # batched() always yields a trailing tuple that is empty when the
        # input length divides evenly by the batch size; skip it so we never
        # issue a no-op UPDATE ... WHERE id IN ().
        if not chunk:
            continue
        with rls_transaction(tenant_id):
            reset_total += Resource.objects.filter(
                tenant_id=tenant_id,
                id__in=chunk,
                failed_findings_count__gt=0,
            ).update(failed_findings_count=0)

    logger.info(
        f"Ephemeral resource reset for scan {scan_id}: "
        f"{reset_total} resources zeroed for provider {scan.provider_id}"
    )

    return {
        "status": "completed",
        "scan_id": str(scan_id),
        "provider_id": str(scan.provider_id),
        "reset": reset_total,
    }
0 commit comments