Skip to content

Commit 85d38b5

Browse files
authored
feat(scans): Reset resource failed findings to 0 for ephemeral resources (#10929)
1 parent 59dcdb8 commit 85d38b5

5 files changed

Lines changed: 588 additions & 3 deletions

File tree

api/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
All notable changes to the **Prowler API** are documented in this file.
44

5+
## [1.27.0] (Prowler UNRELEASED)
6+
7+
### 🚀 Added
8+
9+
- New `scan-reset-ephemeral-resources` post-scan task zeroes `failed_findings_count` for resources missing from the latest full-scope scan, keeping ephemeral resources from polluting the Resources page sort [(#10929)](https://github.com/prowler-cloud/prowler/pull/10929)
10+
11+
---
12+
513
## [1.26.1] (Prowler v5.25.1)
614

715
### 🐞 Fixed

api/src/backend/api/models.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,10 +595,40 @@ class Scan(RowLevelSecurityProtectedModel):
595595
objects = ActiveProviderManager()
596596
all_objects = models.Manager()
597597

598+
_SCOPING_SCANNER_ARG_KEYS_CACHE: tuple[str, ...] | None = None

@classmethod
def get_scoping_scanner_arg_keys(cls) -> tuple[str, ...]:
    """Return the scanner_args keys that mark a scan as scoped.

    The key set is introspected from ``prowler.lib.scan.scan.Scan.__init__``
    so the API mirrors exactly the filters the SDK accepts and cannot drift
    from its contract. The result is memoized on the class: the SDK signature
    is fixed for the lifetime of the process, so one introspection suffices.
    """
    cached = cls._SCOPING_SCANNER_ARG_KEYS_CACHE
    if cached is None:
        import inspect

        from prowler.lib.scan.scan import Scan as ProwlerScan

        signature = inspect.signature(ProwlerScan.__init__)
        cached = tuple(
            param_name
            for param_name in signature.parameters
            if param_name not in ("self", "provider")
        )
        cls._SCOPING_SCANNER_ARG_KEYS_CACHE = cached
    return cached
618+
598619
class TriggerChoices(models.TextChoices):
    # Stored value / human-readable label pairs for Scan.trigger.
    SCHEDULED = "scheduled", _("Scheduled")
    MANUAL = "manual", _("Manual")

# Trigger values for scans that ran the SDK end-to-end. Imported scans (or
# any future trigger) are intentionally NOT in this set — they may carry
# only a partial slice of resources, so post-scan logic that depends on a
# full-scope sweep (e.g. resetting ephemeral resource findings) must skip
# them by default.
LIVE_SCAN_TRIGGERS = frozenset(
    (TriggerChoices.SCHEDULED.value, TriggerChoices.MANUAL.value)
)
631+
602632
id = models.UUIDField(primary_key=True, default=uuid7, editable=False)
603633
name = models.CharField(
604634
blank=True, null=True, max_length=100, validators=[MinLengthValidator(3)]
@@ -681,6 +711,24 @@ class Meta(RowLevelSecurityProtectedModel.Meta):
681711
class JSONAPIMeta:
682712
resource_name = "scans"
683713

714+
def is_full_scope(self) -> bool:
    """Return True if this scan ran with no scoping filters at all.

    Gates post-scan operations (such as resetting the failed_findings_count
    of resources missing from the scan) that are only safe when the scan
    covered every check, service, and category. Imported scans are NOT
    full-scope by definition — they may carry only a partial slice of
    resources, so the ``trigger`` check rejects them before scanner_args is
    even consulted.
    """
    if self.trigger not in self.LIVE_SCAN_TRIGGERS:
        return False
    # A missing scanner_args (None) means no filters were supplied at all.
    scanner_args = self.scanner_args or {}
    # Truthiness matters: an explicitly empty filter (e.g. []) is still
    # "no filter", matching the original key-by-key loop.
    return not any(
        scanner_args.get(key) for key in self.get_scoping_scanner_arg_keys()
    )
731+
684732

685733
class AttackPathsScan(RowLevelSecurityProtectedModel):
686734
objects = ActiveProviderManager()

api/src/backend/tasks/jobs/scan.py

Lines changed: 181 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,29 @@
1010

1111
import sentry_sdk
1212
from celery.utils.log import get_task_logger
13+
from config.django.base import DJANGO_FINDINGS_BATCH_SIZE
1314
from config.env import env
1415
from config.settings.celery import CELERY_DEADLOCK_ATTEMPTS
1516
from django.db import IntegrityError, OperationalError
16-
from django.db.models import Case, Count, IntegerField, Max, Min, Prefetch, Q, Sum, When
17+
from django.db.models import (
18+
Case,
19+
Count,
20+
Exists,
21+
IntegerField,
22+
Max,
23+
Min,
24+
OuterRef,
25+
Prefetch,
26+
Q,
27+
Sum,
28+
When,
29+
)
1730
from django.utils import timezone as django_timezone
1831
from tasks.jobs.queries import (
1932
COMPLIANCE_UPSERT_PROVIDER_SCORE_SQL,
2033
COMPLIANCE_UPSERT_TENANT_SUMMARY_SQL,
2134
)
22-
from tasks.utils import CustomEncoder
35+
from tasks.utils import CustomEncoder, batched
2336

2437
from api.compliance import PROWLER_COMPLIANCE_OVERVIEW_TEMPLATE
2538
from api.constants import SEVERITY_ORDER
@@ -2069,3 +2082,169 @@ def aggregate_finding_group_summaries(tenant_id: str, scan_id: str):
20692082
"created": created_count,
20702083
"updated": updated_count,
20712084
}
2085+
2086+
2087+
def reset_ephemeral_resource_findings_count(tenant_id: str, scan_id: str) -> dict:
    """Zero failed_findings_count for resources missing from a completed full-scope scan.

    Resources that exist in the database for the scan's provider but were not
    touched by this scan are treated as ephemeral. We keep their historical
    findings, but reset the denormalized counter that drives the Resources page
    sort so they stop ranking at the top.

    Args:
        tenant_id: Tenant whose RLS context every query runs under.
        scan_id: The completed scan that defines the "touched" resource set.

    Returns:
        A status dict: ``{"status": "skipped", "reason": ...}`` for any no-op
        path, or ``{"status": "completed", "scan_id", "provider_id", "reset"}``
        with the number of resources zeroed.

    Skipped (no-op) when:
    - The scan is not in COMPLETED state.
    - The scan ran with any scoping filter in scanner_args (partial scope).

    Query design (must scale to 500k+ resources per provider):
        Phase 1 — collect ephemeral IDs with one anti-join read.
            Outer filter ``(tenant_id, provider_id, failed_findings_count > 0)``
            uses ``resources_tenant_provider_idx``. The correlated
            ``NOT EXISTS`` subquery hits the implicit unique index
            ``(tenant_id, scan_id, resource_id)`` on ``ResourceScanSummary``.
            ``NOT EXISTS`` (vs ``NOT IN``) is null-safe and lets the planner
            choose between hash anti-join and indexed nested-loop anti-join.
            ``.iterator(chunk_size=...)`` skips the queryset cache so memory
            stays bounded while streaming UUIDs.
        Phase 2 — UPDATE in fixed-size batches.
            One large UPDATE would hold row-exclusive locks for seconds and
            create a WAL spike. Batched UPDATEs by ``id__in`` (~1k rows each)
            hit the primary key, keep each lock window ~50ms, bound WAL chunks,
            and let other writers proceed between batches.
            ``failed_findings_count__gt=0`` in the UPDATE is idempotent under
            concurrent scans and skips no-op rewrites.
    Reads use the primary DB, not the replica: ``ResourceScanSummary`` rows
    were written by the same scan task that triggered this one, so replica
    lag could falsely classify scanned resources as ephemeral.

    Scope detection (``Scan.is_full_scope()``) derives the set of scoping
    scanner_args from ``prowler.lib.scan.scan.Scan.__init__`` via
    introspection, so the API can never drift from the SDK's filter
    contract. Imported scans are also rejected by trigger — they may only
    cover a partial slice of resources.
    """
    with rls_transaction(tenant_id):
        scan = Scan.objects.filter(tenant_id=tenant_id, id=scan_id).first()

    if scan is None:
        logger.warning(f"Scan {scan_id} not found")
        return {"status": "skipped", "reason": "scan not found"}

    if scan.state != StateChoices.COMPLETED:
        logger.info(f"Scan {scan_id} not completed; skipping ephemeral reset")
        return {"status": "skipped", "reason": "scan not completed"}

    if not scan.is_full_scope():
        logger.info(
            f"Scan {scan_id} ran with scoping filters; skipping ephemeral reset"
        )
        return {"status": "skipped", "reason": "partial scan scope"}

    # Race protection: if a newer completed full-scope scan exists for this
    # provider, our ResourceScanSummary set is stale relative to the resources'
    # current failed_findings_count values (which the newer scan already
    # refreshed). Wiping based on the older scan would zero counts the newer
    # scan just set. Skip and let the newer scan's reset task do the work; if
    # this task was delayed in the queue, that's the correct outcome.
    # `completed_at__isnull=False` is required: Postgres orders NULL first in
    # DESC, so a sibling COMPLETED scan with a missing completed_at would sort
    # as "newest" and incorrectly cause us to skip.
    # NOTE(review): this query selects the latest COMPLETED scan regardless of
    # scope — a newer *partial* scan will also cause a skip here despite the
    # variable name. Presumably conservative by design; confirm intended.
    with rls_transaction(tenant_id):
        latest_full_scope_scan_id = (
            Scan.objects.filter(
                tenant_id=tenant_id,
                provider_id=scan.provider_id,
                state=StateChoices.COMPLETED,
                completed_at__isnull=False,
            )
            .order_by("-completed_at", "-inserted_at")
            .values_list("id", flat=True)
            .first()
        )
    if latest_full_scope_scan_id != scan.id:
        logger.info(
            f"Scan {scan_id} is not the latest completed scan for provider "
            f"{scan.provider_id}; skipping ephemeral reset"
        )
        return {"status": "skipped", "reason": "newer scan exists"}

    # Defensive gate: ResourceScanSummary rows are written by perform_prowler_scan
    # via best-effort bulk_create. If those writes failed silently (or the scan
    # genuinely produced resources but no summaries were persisted), the
    # ~Exists(in_scan) anti-join below would classify EVERY resource for this
    # provider as ephemeral and zero their counts. Bail loudly instead.
    with rls_transaction(tenant_id):
        summaries_present = ResourceScanSummary.objects.filter(
            tenant_id=tenant_id, scan_id=scan_id
        ).exists()
    if scan.unique_resource_count > 0 and not summaries_present:
        logger.error(
            f"Scan {scan_id} reports {scan.unique_resource_count} unique "
            f"resources but no ResourceScanSummary rows are persisted; "
            f"skipping ephemeral reset to avoid wiping valid counts"
        )
        return {"status": "skipped", "reason": "summaries missing"}

    # Stays on the primary DB intentionally. ResourceScanSummary rows are
    # written by perform_prowler_scan in the same chain that triggered this
    # task, so replica lag could return an empty/partial summary set; a stale
    # read here would classify every Resource as ephemeral and wipe valid
    # failed_findings_count values on the primary. Same rationale as
    # update_provider_compliance_scores below in this module.
    # Materializing the ID list (rather than streaming the iterator into
    # batched UPDATEs) is intentional: it lets the UPDATEs run in their own
    # short rls_transactions instead of one long transaction holding row locks
    # on every batch. At 500k UUIDs the peak memory is ~40 MB — acceptable for
    # a Celery worker — and is the better trade-off versus a multi-second
    # write-lock window blocking concurrent scans.
    with rls_transaction(tenant_id):
        in_scan = ResourceScanSummary.objects.filter(
            tenant_id=tenant_id,
            scan_id=scan_id,
            resource_id=OuterRef("pk"),
        )
        ephemeral_ids = list(
            Resource.objects.filter(
                tenant_id=tenant_id,
                provider_id=scan.provider_id,
                failed_findings_count__gt=0,
            )
            .filter(~Exists(in_scan))
            .values_list("id", flat=True)
            .iterator(chunk_size=DJANGO_FINDINGS_BATCH_SIZE)
        )

    if not ephemeral_ids:
        logger.info(f"No ephemeral resources for scan {scan_id}")
        return {
            "status": "completed",
            "scan_id": str(scan_id),
            "provider_id": str(scan.provider_id),
            "reset": 0,
        }

    total_updated = 0
    # NOTE(review): assumes tasks.utils.batched yields (batch, flag) pairs —
    # the flag is discarded here; confirm against its definition.
    for batch, _ in batched(ephemeral_ids, DJANGO_FINDINGS_BATCH_SIZE):
        # batched() always yields a final tuple, which is empty when the input
        # length is an exact multiple of the batch size. Skip it so we don't
        # issue a no-op UPDATE ... WHERE id IN ().
        if not batch:
            continue
        with rls_transaction(tenant_id):
            total_updated += Resource.objects.filter(
                tenant_id=tenant_id,
                id__in=batch,
                failed_findings_count__gt=0,
            ).update(failed_findings_count=0)

    logger.info(
        f"Ephemeral resource reset for scan {scan_id}: "
        f"{total_updated} resources zeroed for provider {scan.provider_id}"
    )

    return {
        "status": "completed",
        "scan_id": str(scan_id),
        "provider_id": str(scan.provider_id),
        "reset": total_updated,
    }

api/src/backend/tasks/tasks.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
aggregate_findings,
5959
create_compliance_requirements,
6060
perform_prowler_scan,
61+
reset_ephemeral_resource_findings_count,
6162
update_provider_compliance_scores,
6263
)
6364
from tasks.utils import (
@@ -77,6 +78,7 @@
7778
from prowler.lib.outputs.compliance.generic.generic import GenericCompliance
7879
from prowler.lib.outputs.finding import Finding as FindingOutput
7980

81+
8082
logger = get_task_logger(__name__)
8183

8284

@@ -158,6 +160,13 @@ def _perform_scan_complete_tasks(tenant_id: str, scan_id: str, provider_id: str)
158160
generate_outputs_task.si(
159161
scan_id=scan_id, provider_id=provider_id, tenant_id=tenant_id
160162
),
163+
# post-scan task — runs in the parallel group so a
164+
# failure cannot cascade into reports or integrations. Its only
165+
# prerequisite is that perform_prowler_scan has committed
166+
# ResourceScanSummary, which is true by the time this chain fires.
167+
reset_ephemeral_resource_findings_count_task.si(
168+
tenant_id=tenant_id, scan_id=scan_id
169+
),
161170
),
162171
group(
163172
# Use optimized task that generates both reports with shared queries
@@ -393,7 +402,8 @@ class AttackPathsScanRLSTask(RLSTask):
393402
SDK initialization, or Neo4j configuration errors during setup).
394403
"""
395404

396-
def on_failure(self, exc, task_id, args, kwargs, _einfo):
405+
def on_failure(self, exc, task_id, args, kwargs, _einfo): # noqa: ARG002
406+
del args # Required by Celery's Task.on_failure signature; not used.
397407
tenant_id = kwargs.get("tenant_id")
398408
scan_id = kwargs.get("scan_id")
399409

@@ -790,6 +800,32 @@ def aggregate_daily_severity_task(tenant_id: str, scan_id: str):
790800
return aggregate_daily_severity(tenant_id=tenant_id, scan_id=scan_id)
791801

792802

803+
@shared_task(name="scan-reset-ephemeral-resources", queue="overview")
@handle_provider_deletion
def reset_ephemeral_resource_findings_count_task(tenant_id: str, scan_id: str):
    """Reset failed_findings_count for resources missing from a completed full-scope scan.

    Any exception is caught and reported as a "failed" status payload rather
    than propagating: this task lives inside the post-scan group, and Celery
    surfaces group-member exceptions to the next chain step — an uncaught
    crash here would therefore stall compliance reports and integrations.
    Because the reset only tunes the Resources page sort (purely cosmetic),
    a bad run is logged and absorbed instead of cascading.
    """
    try:
        return reset_ephemeral_resource_findings_count(
            tenant_id=tenant_id, scan_id=scan_id
        )
    except Exception as error:  # noqa: BLE001 — intentionally broad
        logger.exception(
            f"reset_ephemeral_resource_findings_count failed for scan {scan_id}: {error}"
        )
        return {
            "status": "failed",
            "scan_id": str(scan_id),
            "reason": str(error),
        }
827+
828+
793829
@shared_task(base=RLSTask, name="scan-finding-group-summaries", queue="overview")
794830
@set_tenant(keep_tenant=True)
795831
@handle_provider_deletion

0 commit comments

Comments
 (0)