From ea8b4096712226b9cd9f0afa14476319e509eef3 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Fri, 5 Jun 2026 01:26:07 -0400 Subject: [PATCH 1/8] feat(eng): add Notify-GitHubOnBuildResult.ps1 PowerShell helper that files / updates / closes a ci-broken GitHub issue on microsoft/aspire for a given branch. Two modes: - Failure: GETs open ci-broken issues, filters by a hidden HTML-comment marker (), and either creates a new issue (with a managed failures-table region in the body) or appends a row to the existing one and posts a follow-up @-mention comment. - Success: closes the matching open issue with a green-build comment. Drives the gh CLI throughout, authenticated via $env:GH_TOKEN that the caller sets after minting an aspire-repo-bot installation token. List uses the strongly-consistent /issues endpoint, not /search/issues, so near-simultaneous failures don't each file a duplicate; a post-create re-list catches the rare race past that window and closes ours as a duplicate of the older. Always exits 0. Warnings surface via task.logissue + task.complete result=SucceededWithIssues so a silent regression (bot loses permission, label deleted, gh API shape changes) goes yellow rather than green. -DryRun logs the would-be gh calls without mutating GitHub. In dry-run the script skips token mint entirely so the wrapper can validate pipeline plumbing without resolving aspire-repo-bot credentials. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../scripts/Notify-GitHubOnBuildResult.ps1 | 449 ++++++++++++++++++ 1 file changed, 449 insertions(+) create mode 100644 eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 diff --git a/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 new file mode 100644 index 00000000000..ed8591d93a1 --- /dev/null +++ b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 @@ -0,0 +1,449 @@ +# Files or updates a GitHub issue on microsoft/aspire when the internal AzDO +# build fails on a publishing branch, and closes the issue when the next build +# of the same branch goes green. +# +# Invoked from the notify_failure and notify_success stages of azure-pipelines.yml. +# Both the failure path (`-Mode Failure`) and the success path (`-Mode Success`) +# live in this single script so the search-and-dedupe logic stays in one place. +# +# Auth: mints an aspire-repo-bot GitHub App installation access token via the +# shared Get-AspireBotInstallationToken.ps1 helper, exports it as GH_TOKEN for +# the gh CLI, and also marks it as a secret AzDO variable so any incidental +# log echo gets redacted. +# +# Dedupe strategy: +# - Issues are identified by a hidden HTML-comment marker in the body: +# +# - One open issue per branch at a time. +# - We use GET /repos/.../issues?labels=ci-broken&state=open (strongly +# consistent) and filter locally on the marker. The /search/issues +# endpoint is intentionally avoided because its 1-2 min eventual +# consistency causes near-simultaneous failed builds to each see +# "0 hits" and each file a duplicate. +# - On create, we re-list to detect the rare race where two builds raced +# past the consistency window simultaneously, and close ours as a +# duplicate of the older one. +# +# Failures table: +# - The issue body contains a managed markdown table inside a fenced +# region delimited by / +# . Each subsequent failure on the same +# branch appends a row (build, commit, failed stages, timestamp). +# A follow-up comment is also posted so @-mentions still fire — body +# edits don't generate notifications. +# - Visible rows are capped (FailuresTableMaxRows); older rows are +# summarized as "_N earlier failures omitted_" and remain accessible +# via the per-failure comments. +# - Only content between the markers is rewritten; any human-added +# prose elsewhere in the body is preserved across updates. +# +# Safety: this script ALWAYS exits 0. A flaky notification path must not +# turn an otherwise-correct build red. All API errors are logged via +# Write-Warning and swallowed. + +[CmdletBinding()] +param( + [Parameter(Mandatory = $true)][ValidateSet('Failure', 'Success')][string]$Mode, + # Optional at binding time so dry-run can run without aspire-repo-bot + # credentials. Live mode enforces presence in the main body — Mandatory + # would run before the dry-run gate. + [Parameter()][string]$AppId, + [Parameter()][string]$PrivateKeyPem, + [Parameter()][string]$Owner = 'microsoft', + [Parameter()][string]$Repo = 'aspire', + [Parameter(Mandatory = $true)][string]$Branch, + [Parameter(Mandatory = $true)][string]$BuildId, + [Parameter(Mandatory = $true)][string]$BuildNumber, + [Parameter(Mandatory = $true)][string]$BuildUrl, + [Parameter(Mandatory = $true)][string]$CommitSha, + [Parameter()][string]$FailedStages = '', + [Parameter()][switch]$DryRun +) + +$ErrorActionPreference = 'Stop' + +# Labels applied to every filed issue. ci-broken is the existing aspire-wide +# label whose description is literally "Internal ADO pipeline is failing". +$Script:IssueLabels = @('area-engineering-systems', 'ci-broken', 'blocking-clean-ci') + +# Assignees notified on issue creation. Also @-mentioned in each new-failure +# comment because edits to issue bodies do not re-fire notifications, but +# comment @-mentions do. +$Script:Assignees = @('joperezr', 'radical') +$Script:MentionLine = 'cc @joperezr @radical' + +# Managed region in the issue body. Update-FailuresTableInBody only touches +# content between these markers, so any human-added prose elsewhere in the +# body is preserved across updates. +$Script:FailuresTableBeginMarker = '' +$Script:FailuresTableEndMarker = '' +$Script:FailuresTableHeader = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" +# Cap visible rows in the issue body. Older rows are collapsed into a +# "_N earlier failures omitted_" line; full per-failure history is still +# preserved in the issue's comments. +$Script:FailuresTableMaxRows = 50 + +# See Exit-NotifyScript for the SucceededWithIssues rationale. +$Script:HasWarnings = $false + +function Write-Step { + param([string]$Message) + Write-Host "[$Mode] $Message" +} + +# Logs to console AND, when running inside an AzDO pipeline, emits the +# logissue logging command so the warning surfaces in the task's Issues +# tab + build summary. task.logissue alone does NOT change the task +# result — Exit-NotifyScript does that via task.complete at exit time. +# See https://learn.microsoft.com/azure/devops/pipelines/scripts/logging-commands#logissue-log-an-error-or-warning +function Write-NotifyWarning { + param([string]$Message) + Write-Warning $Message + if ($env:TF_BUILD -eq 'True') { + Write-Host "##vso[task.logissue type=warning]$Message" + } + $Script:HasWarnings = $true +} + +# Single exit point. If anything called Write-NotifyWarning, emit +# task.complete to flip the AzDO task result to SucceededWithIssues so +# the silent-dead-feature class of bug (e.g., bot lost issues:write, +# label deleted, API shape changed) becomes visible in build status +# rather than buried in task logs no one reads on green builds. +function Exit-NotifyScript { + if ($Script:HasWarnings -and $env:TF_BUILD -eq 'True') { + Write-Host "##vso[task.complete result=SucceededWithIssues;]done" + } + exit 0 +} + +# Defense in depth: pipeline gate is the primary filter. Pipeline +# trigger's `main*` wildcard means we must match `main` exactly. +function Test-NotifiableBranch { + param([string]$Name) + return ($Name -eq 'main') -or ($Name -like 'release/*') +} + +# Thin wrapper around the gh CLI. Throws on non-zero exit so the surrounding +# try/catch + always-exit-0 contract picks up errors. stderr is folded into the +# same stream so any failure message is visible in the thrown exception. +# Auth: gh reads GH_TOKEN from the process environment (set once in the main +# body after the bot token is minted); no token plumbing through call sites. +function Invoke-Gh { + [CmdletBinding()] + param( + [Parameter(Mandatory = $true)][string[]]$ArgList, + [string]$StdinBody + ) + if ($PSBoundParameters.ContainsKey('StdinBody')) { + $output = $StdinBody | & gh @ArgList 2>&1 + } + else { + $output = & gh @ArgList 2>&1 + } + if ($LASTEXITCODE -ne 0) { + throw "gh $($ArgList -join ' ') failed (exit $LASTEXITCODE): $output" + } + return $output +} + +# Lists open issues with the ci-broken label and filters locally on the +# branch-specific marker. Returns an array (possibly empty) sorted by issue +# number ascending — so [0] is always the oldest open issue for this branch. +# `gh issue list` returns issues only (excludes PRs) and handles pagination +# internally up to --limit; realistic ci-broken issue count is < 10. +function Get-OpenBrokenIssuesForBranch { + param([string]$Marker) + + $json = Invoke-Gh -ArgList @( + 'issue', 'list', + '--repo', "$Owner/$Repo", + '--label', 'ci-broken', + '--state', 'open', + '--limit', '1000', + '--json', 'number,body,url' + ) + + $all = @($json | ConvertFrom-Json) + $matched = @($all | Where-Object { $_.body -and $_.body.Contains($Marker) }) + return @($matched | Sort-Object -Property number) +} + +function New-FailureTableRow { + param([int]$Index) + $shortSha = if ($CommitSha.Length -ge 7) { $CommitSha.Substring(0, 7) } else { $CommitSha } + $commitLink = "[``$shortSha``](https://github.com/$Owner/$Repo/commit/$CommitSha)" + # Escape pipes in user-supplied content so they don't break the markdown + # table. FailedStages currently comes from a fixed enumeration in the + # pipeline, but a future caller could pass arbitrary text. + $stages = if ([string]::IsNullOrWhiteSpace($FailedStages)) { '—' } else { $FailedStages -replace '\|', '\|' } + $when = (Get-Date).ToUniversalTime().ToString('yyyy-MM-dd HH:mm') + return "| $Index | $when | [$BuildNumber]($BuildUrl) | $commitLink | $stages |" +} + +# Initial failures table section embedded in a freshly-filed issue body. +# Contains the header plus the first failure row (index 1). +function New-InitialFailuresTableSection { + $row = New-FailureTableRow -Index 1 + return @" +$($Script:FailuresTableBeginMarker) +$($Script:FailuresTableHeader) +$row +$($Script:FailuresTableEndMarker) +"@ +} + +# Splices a new failure row into the managed region of an existing issue body. +# - Locates the begin/end markers; if missing (issue body was hand-edited or +# pre-dates this feature), returns the body unchanged with a warning. +# - Counts existing visible rows + any "N earlier omitted" tally to determine +# the next failure index. +# - When the visible-row count exceeds FailuresTableMaxRows, drops oldest +# rows and rolls them into the omitted tally. +function Update-FailuresTableInBody { + param([string]$Body) + + $beginIdx = $Body.IndexOf($Script:FailuresTableBeginMarker) + $endIdx = $Body.IndexOf($Script:FailuresTableEndMarker) + if ($beginIdx -lt 0 -or $endIdx -lt 0 -or $endIdx -lt $beginIdx) { + Write-NotifyWarning "Could not locate failures-table markers in issue body; skipping body update." + return $Body + } + + $contentStart = $beginIdx + $Script:FailuresTableBeginMarker.Length + $managedContent = $Body.Substring($contentStart, $endIdx - $contentStart) + + # Data rows look like: | 3 | 2026-06-04 22:34 | [...](...) | ... | ... | + $lines = $managedContent -split "`r?`n" + $dataRows = @($lines | Where-Object { $_ -match '^\|\s*\d+\s*\|' }) + + $omittedCount = 0 + foreach ($line in $lines) { + if ($line -match '^_(\d+) earlier failures omitted') { + $omittedCount = [int]$Matches[1] + break + } + } + + $nextIndex = $dataRows.Count + $omittedCount + 1 + $newRow = New-FailureTableRow -Index $nextIndex + $allRows = @($dataRows) + @($newRow) + + if ($allRows.Count -gt $Script:FailuresTableMaxRows) { + $extraToDrop = $allRows.Count - $Script:FailuresTableMaxRows + $omittedCount += $extraToDrop + $allRows = $allRows | Select-Object -Skip $extraToDrop + } + + $omittedLine = '' + if ($omittedCount -gt 0) { + $omittedLine = "_$omittedCount earlier failures omitted; see issue comments._`n`n" + } + + $newContent = "`n" + $omittedLine + $Script:FailuresTableHeader + "`n" + ($allRows -join "`n") + "`n" + + return $Body.Substring(0, $contentStart) + $newContent + $Body.Substring($endIdx) +} + +function New-IssueBody { + param([string]$Marker) + + $failuresTable = New-InitialFailuresTableSection + + return @" +$Marker + +The internal Azure DevOps build for ``microsoft-aspire`` (definition 1602) +is failing on ``$Branch``. + +$failuresTable + +This issue is updated with a new row in the table above on each subsequent +failure of the same branch, and closed automatically when the next build of +``$Branch`` succeeds. See [docs/ci/internal-build-failure-notifications.md](https://github.com/$Owner/$Repo/blob/main/docs/ci/internal-build-failure-notifications.md). + +$Script:MentionLine +"@ +} + +function New-FailureFollowupCommentBody { + return @" +Another failure on ``$Branch`` — see the failures table in the issue body for full history. + +- **Build:** [$BuildNumber]($BuildUrl) +- **Commit:** ``$CommitSha`` + +$Script:MentionLine +"@ +} + +function New-SuccessCommentBody { + return @" +✅ Build is green again on ``$Branch``: + +- **Build:** [$BuildNumber]($BuildUrl) +- **Commit:** ``$CommitSha`` + +Closing. +"@ +} + +function Invoke-FailureMode { + param([string]$Marker) + + if ($DryRun) { + Write-Step "DRY-RUN: would run 'gh issue list --repo $Owner/$Repo --label ci-broken --state open' and filter by marker '$Marker'" + Write-Step "DRY-RUN: assuming no existing issue; would run: gh issue create --repo $Owner/$Repo --title `"Internal build broken on $Branch`" --label $($Script:IssueLabels -join ',') --assignee $($Script:Assignees -join ',')" + Write-Step "DRY-RUN: issue body would contain:" + $body = New-IssueBody -Marker $Marker + $body -split "`n" | ForEach-Object { Write-Step "DRY-RUN: | $_" } + Write-Step "DRY-RUN: would then re-list and close as duplicate if not oldest (race handler)" + Write-Step "DRY-RUN: if an existing issue had been found, would run 'gh issue edit' to update the failures table and 'gh issue comment' for the follow-up with @-mentions" + return + } + + $existing = Get-OpenBrokenIssuesForBranch -Marker $Marker + + if ($existing.Count -eq 0) { + Write-Step "No existing open issue for branch '$Branch'. Creating one." + + $issueBody = New-IssueBody -Marker $Marker + # `gh issue create` accepts --label and --assignee multiple times. + # Build flag pairs rather than relying on comma-separation, which is + # more brittle if a label/assignee ever contains a comma. + $labelFlags = @($Script:IssueLabels | ForEach-Object { '--label'; $_ }) + $assigneeFlags = @($Script:Assignees | ForEach-Object { '--assignee'; $_ }) + + $createArgs = @( + 'issue', 'create', + '--repo', "$Owner/$Repo", + '--title', "Internal build broken on $Branch", + '--body-file', '-' + ) + $labelFlags + $assigneeFlags + + # gh issue create prints the new issue's URL as the last non-empty + # line of stdout. Parse the number out for the race-handler compare. + $createOutput = Invoke-Gh -ArgList $createArgs -StdinBody $issueBody + $createdUrl = (@($createOutput) | Where-Object { -not [string]::IsNullOrWhiteSpace($_) } | Select-Object -Last 1).ToString().Trim() + if ($createdUrl -notmatch '/issues/(\d+)$') { + throw "Could not parse issue number from gh output: '$createdUrl'" + } + $createdNumber = [int]$Matches[1] + Write-Step "Created issue #${createdNumber}: $createdUrl" + + # Post-create race handler: re-list and close as duplicate if we + # aren't the oldest. See file header "Dedupe strategy". + $recheck = Get-OpenBrokenIssuesForBranch -Marker $Marker + $oldest = $recheck | Where-Object { $_.number -ne $createdNumber } | Select-Object -First 1 + if ($null -ne $oldest -and $oldest.number -lt $createdNumber) { + Write-Step "Race detected: older open issue #$($oldest.number) found. Closing our just-created #${createdNumber} as duplicate." + $dupBody = "Duplicate of #$($oldest.number). Two near-simultaneous failed builds raced past the dedupe window; auto-closing this one." + Invoke-Gh -ArgList @('issue', 'comment', "$createdNumber", '--repo', "$Owner/$Repo", '--body-file', '-') -StdinBody $dupBody | Out-Null + # `gh issue close --reason` accepts "completed" or "not planned" + # (with a space) — gh maps the latter to the API's not_planned. + Invoke-Gh -ArgList @('issue', 'close', "$createdNumber", '--repo', "$Owner/$Repo", '--reason', 'not planned') | Out-Null + } + return + } + + $target = $existing[0] + if ($existing.Count -gt 1) { + Write-NotifyWarning "Found $($existing.Count) open ci-broken issues for branch '$Branch' (numbers: $($existing.number -join ', ')). Updating the oldest (#$($target.number)) and leaving the rest for human cleanup." + } + else { + Write-Step "Found existing open issue #$($target.number) for branch '$Branch'. Updating failures table and appending comment." + } + + # Append a row to the failures table in the issue body. Race: two + # near-simultaneous failures doing list->modify->edit may drop one row; + # accepted because the follow-up comment still fires and per-failure + # history is preserved in the issue comments. The table is the at-a-glance + # summary, not the system of record. + $currentBody = $target.body + if ($null -ne $currentBody) { + $newBody = Update-FailuresTableInBody -Body $currentBody + if ($newBody -ne $currentBody) { + Invoke-Gh -ArgList @('issue', 'edit', "$($target.number)", '--repo', "$Owner/$Repo", '--body-file', '-') -StdinBody $newBody | Out-Null + Write-Step "Updated failures table in #$($target.number)." + } + } + + $commentBody = New-FailureFollowupCommentBody + Invoke-Gh -ArgList @('issue', 'comment', "$($target.number)", '--repo', "$Owner/$Repo", '--body-file', '-') -StdinBody $commentBody | Out-Null + Write-Step "Appended failure comment to #$($target.number): $($target.url)" +} + +function Invoke-SuccessMode { + param([string]$Marker) + + if ($DryRun) { + Write-Step "DRY-RUN: would run 'gh issue list --repo $Owner/$Repo --label ci-broken --state open' and filter by marker '$Marker'" + Write-Step "DRY-RUN: for each matching open issue, would run 'gh issue comment' with green-build body and 'gh issue close --reason completed'" + return + } + + $existing = Get-OpenBrokenIssuesForBranch -Marker $Marker + if ($existing.Count -eq 0) { + Write-Step "No open ci-broken issue for branch '$Branch'. Nothing to close." + return + } + + foreach ($issue in $existing) { + Write-Step "Closing issue #$($issue.number) ($($issue.url)) with green-build comment." + + $commentBody = New-SuccessCommentBody + Invoke-Gh -ArgList @('issue', 'comment', "$($issue.number)", '--repo', "$Owner/$Repo", '--body-file', '-') -StdinBody $commentBody | Out-Null + Invoke-Gh -ArgList @('issue', 'close', "$($issue.number)", '--repo', "$Owner/$Repo", '--reason', 'completed') | Out-Null + Write-Step "Closed #$($issue.number)." + } +} + +# === main === + +try { + Write-Step "Starting. Branch='$Branch' BuildId=$BuildId BuildNumber=$BuildNumber DryRun=$DryRun" + + if (-not (Test-NotifiableBranch -Name $Branch)) { + Write-NotifyWarning "Branch '$Branch' is not in the notifiable set (main, release/*). No action taken." + Exit-NotifyScript + } + + $marker = "" + + if (-not $DryRun) { + # Live mode enforces -AppId/-PrivateKeyPem presence here (see param comment). + if ([string]::IsNullOrWhiteSpace($AppId) -or [string]::IsNullOrWhiteSpace($PrivateKeyPem)) { + Write-NotifyWarning "Live mode requires -AppId and -PrivateKeyPem; aborting." + Exit-NotifyScript + } + $tokenScript = Join-Path $PSScriptRoot 'Get-AspireBotInstallationToken.ps1' + $token = & $tokenScript -AppId $AppId -PrivateKeyPem $PrivateKeyPem -Owner $Owner -Repo $Repo + if ([string]::IsNullOrWhiteSpace($token)) { + Write-NotifyWarning "Failed to mint installation token. Skipping notification." + Exit-NotifyScript + } + # Mark the token as a secret pipeline variable so any incidental + # log echo in this script or downstream tasks gets redacted by AzDO. + # Get-AspireBotInstallationToken.ps1 prints the token to stdout for + # caller capture; this is the AzDO-side hardening. + Write-Host "##vso[task.setvariable variable=__notifyGhToken;issecret=true]$token" + # gh reads its auth token from GH_TOKEN (process env var, not + # persisted to gh's config file). Set here so every subsequent + # `gh` call in this process authenticates as aspire-repo-bot. + $env:GH_TOKEN = $token + } + + switch ($Mode) { + 'Failure' { Invoke-FailureMode -Marker $marker } + 'Success' { Invoke-SuccessMode -Marker $marker } + } + + Write-Step "Done." + Exit-NotifyScript +} +catch { + # Never break the build. (See Exit-NotifyScript.) + Write-NotifyWarning "Notification failed: $($_.Exception.Message)" + Write-Warning $_.ScriptStackTrace + Exit-NotifyScript +} From ededbfc86e1bd050ffc18bc61b0a776df75be12f Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Fri, 5 Jun 2026 01:26:25 -0400 Subject: [PATCH 2/8] ci(notify): wire notify_failure / notify_success stages on internal pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two stages to azure-pipelines.yml that gate on the upstream build_sign_native / build / prepare_installers stage results and run Notify-GitHubOnBuildResult.ps1 on 1es-ubuntu-2204: - notify_failure: fires when at least one upstream stage Failed. Composes a comma-separated -FailedStages list from dependencies..result so the filed / updated issue body identifies which stage broke. - notify_success: fires when all three upstream stages Succeeded / SucceededWithIssues (prepare_installers may legitimately Skip on stable GA release builds). Closes the open ci-broken issue for the branch. Both stages mint the aspire-repo-bot installation token via Get-AspireBotInstallationToken.ps1 and export it as GH_TOKEN so the gh CLI invocations in the script authenticate as the bot. Adds _IsNotificationBranch in common-variables.yml — exact match on refs/heads/main (NOT startsWith, the pipeline trigger's `main*` wildcard would otherwise sweep in branches like main-something) plus startsWith refs/heads/release/. Excludes internal/release/* so internal branch names don't leak into the public tracker. Aspire-Release-Secrets variable group is imported at pipeline scope with the same non-PR + main/release/* gate, so manual feature-branch and PR runs don't pay the variable-group auth check at queue time. A notifyOnFailureDryRun queue-time parameter logs would-be gh calls without mutating GitHub; applies to both stages so a green-build dry-run can't accidentally close real open issues. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- eng/pipelines/azure-pipelines.yml | 219 +++++++++++++++++++++++++++++ eng/pipelines/common-variables.yml | 10 ++ 2 files changed, 229 insertions(+) diff --git a/eng/pipelines/azure-pipelines.yml b/eng/pipelines/azure-pipelines.yml index cc20d6986b2..63aa8c39024 100644 --- a/eng/pipelines/azure-pipelines.yml +++ b/eng/pipelines/azure-pipelines.yml @@ -21,6 +21,15 @@ parameters: - stable - staging - daily + # When true, the notify_failure and notify_success stages run in dry-run + # mode: they log the GitHub REST calls they would make to file/update/close + # the ci-broken issue, but do not actually mutate anything on + # microsoft/aspire. Use this when validating pipeline plumbing changes + # without spamming issues. + - name: notifyOnFailureDryRun + displayName: 'Notify on failure: dry-run (log REST calls, do not mutate)' + type: boolean + default: false trigger: batch: true @@ -64,6 +73,13 @@ variables: - template: /eng/pipelines/common-variables.yml@self - template: /eng/common/templates-official/variables/pool-providers.yml@self + # aspire-bot-app-id + aspire-bot-private-key for notify_failure / notify_success. + # Gate mirrors the notify stage conditions: non-PR + main/release/* only. + # A manual run on a feature branch never needs these, so don't pay the + # variable-group auth check at queue time. + - ${{ if and(notin(variables['Build.Reason'], 'PullRequest'), or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/release/'))) }}: + - group: Aspire-Release-Secrets + - name: _BuildConfig value: Release - name: Build.Arcade.ArtifactsPath @@ -577,3 +593,206 @@ extends: Write-Host "Resolved macOS npm validation RID: $rid" Write-Host "##vso[task.setvariable variable=NpmValidationRid]$rid" displayName: 🟣Resolve native macOS RID + + # ---------------------------------------------------------------- + # On every non-PR build of main / release/*, file or update a GitHub + # issue on microsoft/aspire when the build fails, and close any open + # ci-broken issue for the branch when the build succeeds. + # + # Split into TWO stages so we can gate at the STAGE level — only + # stage conditions can reference dependencies..result; from a + # JOB condition in a different stage, the only available form is + # stageDependencies...result (no stage-aggregate form), + # which would force us to enumerate specific job names from upstream + # stages. + # + # See docs/ci/internal-build-failure-notifications.md for the full + # contract (labels, marker, dedupe behavior, dry-run). + # ---------------------------------------------------------------- + - stage: notify_failure + displayName: Notify Build Failure + dependsOn: + - build_sign_native + - build + - prepare_installers + # Files an issue when at least one upstream stage Failed. + # Canceled stage results match neither 'Failed' nor 'Succeeded', + # so operator/timeout cancellations produce no notification. + condition: | + and( + ne(variables['Build.Reason'], 'PullRequest'), + eq(variables['_IsNotificationBranch'], 'true'), + or( + in(dependencies.build.result, 'Failed'), + in(dependencies.build_sign_native.result, 'Failed'), + in(dependencies.prepare_installers.result, 'Failed') + ) + ) + variables: + # Stage results, exposed as runtime variables so the pwsh step below + # can compose a "failed stages" list for the GitHub notification. + - name: BuildStageResult + value: $[ dependencies.build.result ] + - name: BuildSignNativeStageResult + value: $[ dependencies.build_sign_native.result ] + - name: PrepareInstallersStageResult + value: $[ dependencies.prepare_installers.result ] + # Drives the -DryRun:$value pattern in the pwsh wrapper below. + # Values are literal 'true'/'false' so the wrapper can parse with + # `-eq 'true'` and avoid PowerShell's array-splat positional + # binding — passing '-DryRun' through @extraArgs binds the string + # to the first non-mandatory positional string parameter (-Owner) + # instead of engaging the switch, leaving DryRun=False and + # producing a live token mint against '-DryRun/aspire'. + - ${{ if eq(parameters.notifyOnFailureDryRun, true) }}: + - name: NotifyDryRunFlag + value: 'true' + - ${{ else }}: + - name: NotifyDryRunFlag + value: 'false' + jobs: + - template: /eng/common/templates-official/jobs/jobs.yml@self + parameters: + enableMicrobuild: false + enablePublishUsingPipelines: false + enablePublishBuildAssets: false + # No build output to publish; mirrors prepare_installers. + enablePublishBuildArtifacts: false + enableTelemetry: true + workspace: + clean: all + jobs: + - job: NotifyOnFailure + displayName: 'File / update ci-broken issue' + timeoutInMinutes: 15 + pool: + name: NetCore1ESPool-Internal + image: 1es-ubuntu-2204 + os: linux + templateContext: + mb: + publish: + enabled: false + steps: + - checkout: self + displayName: 'Checkout repo (for notify script)' + fetchDepth: 1 + - pwsh: | + $ErrorActionPreference = 'Stop' + # Use env vars instead of $(Build.*) macros: a branch with a + # quote in it would break out of the PowerShell literal before + # the script's try/catch (and "always exit 0" invariant) + # could absorb it. + $branch = $env:BUILD_SOURCEBRANCH -replace '^refs/heads/', '' + $buildUrl = "$($env:SYSTEM_TEAMFOUNDATIONCOLLECTIONURI)$($env:SYSTEM_TEAMPROJECT)/_build/results?buildId=$($env:BUILD_BUILDID)" + + # Pass DryRun as a typed -DryRun:$bool. Do NOT conditionally + # splat '-DryRun' through @extraArgs — array splatting is + # positional (about_Splatting), so the literal string would + # bind to the first non-mandatory positional string parameter + # (-Owner) instead of engaging the switch. + $dryRun = '$(NotifyDryRunFlag)' -eq 'true' + + # Compose the failed-stages list from the upstream stage + # results exposed via stage-scoped variables above. + $failed = @() + if ('$(BuildStageResult)' -eq 'Failed') { $failed += 'build' } + if ('$(BuildSignNativeStageResult)' -eq 'Failed') { $failed += 'build_sign_native' } + if ('$(PrepareInstallersStageResult)' -eq 'Failed') { $failed += 'prepare_installers' } + $failedStages = $failed -join ', ' + + & "$env:BUILD_SOURCESDIRECTORY/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1" ` + -Mode Failure ` + -AppId "$env:ASPIRE_BOT_APP_ID" ` + -PrivateKeyPem "$env:ASPIRE_BOT_PRIVATE_KEY" ` + -Branch $branch ` + -BuildId $env:BUILD_BUILDID ` + -BuildNumber $env:BUILD_BUILDNUMBER ` + -BuildUrl $buildUrl ` + -CommitSha $env:BUILD_SOURCEVERSION ` + -FailedStages $failedStages ` + -DryRun:$dryRun + displayName: 'Run notify script (Failure mode)' + # Resolve aspire-repo-bot secrets only in live mode; dry-run + # skips the token mint and accepts empty -AppId/-PrivateKeyPem. + ${{ if ne(parameters.notifyOnFailureDryRun, true) }}: + env: + ASPIRE_BOT_APP_ID: $(aspire-bot-app-id) + ASPIRE_BOT_PRIVATE_KEY: $(aspire-bot-private-key) + + - stage: notify_success + displayName: Notify Build Success + dependsOn: + - build_sign_native + - build + - prepare_installers + # Only fires when all three upstream stages are Succeeded / + # SucceededWithIssues — except prepare_installers may legitimately + # Skip on stable GA release builds (its own condition excludes + # stable channel builds), which is not a failure. Canceled stage + # results don't match either branch and produce no notification. + condition: | + and( + ne(variables['Build.Reason'], 'PullRequest'), + eq(variables['_IsNotificationBranch'], 'true'), + in(dependencies.build.result, 'Succeeded', 'SucceededWithIssues'), + in(dependencies.build_sign_native.result, 'Succeeded', 'SucceededWithIssues'), + in(dependencies.prepare_installers.result, 'Succeeded', 'SucceededWithIssues', 'Skipped') + ) + variables: + - ${{ if eq(parameters.notifyOnFailureDryRun, true) }}: + - name: NotifyDryRunFlag + value: 'true' + - ${{ else }}: + - name: NotifyDryRunFlag + value: 'false' + jobs: + - template: /eng/common/templates-official/jobs/jobs.yml@self + parameters: + enableMicrobuild: false + enablePublishUsingPipelines: false + enablePublishBuildAssets: false + enablePublishBuildArtifacts: false + enableTelemetry: true + workspace: + clean: all + jobs: + - job: CloseOnSuccess + displayName: 'Close ci-broken issue' + timeoutInMinutes: 15 + pool: + name: NetCore1ESPool-Internal + image: 1es-ubuntu-2204 + os: linux + templateContext: + mb: + publish: + enabled: false + steps: + - checkout: self + displayName: 'Checkout repo (for notify script)' + fetchDepth: 1 + - pwsh: | + $ErrorActionPreference = 'Stop' + $branch = $env:BUILD_SOURCEBRANCH -replace '^refs/heads/', '' + $buildUrl = "$($env:SYSTEM_TEAMFOUNDATIONCOLLECTIONURI)$($env:SYSTEM_TEAMPROJECT)/_build/results?buildId=$($env:BUILD_BUILDID)" + + # See -DryRun:$dryRun comment in the Failure-mode step above. + $dryRun = '$(NotifyDryRunFlag)' -eq 'true' + + & "$env:BUILD_SOURCESDIRECTORY/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1" ` + -Mode Success ` + -AppId "$env:ASPIRE_BOT_APP_ID" ` + -PrivateKeyPem "$env:ASPIRE_BOT_PRIVATE_KEY" ` + -Branch $branch ` + -BuildId $env:BUILD_BUILDID ` + -BuildNumber $env:BUILD_BUILDNUMBER ` + -BuildUrl $buildUrl ` + -CommitSha $env:BUILD_SOURCEVERSION ` + -DryRun:$dryRun + displayName: 'Run notify script (Success mode)' + # See env-block comment on the Failure-mode step above. + ${{ if ne(parameters.notifyOnFailureDryRun, true) }}: + env: + ASPIRE_BOT_APP_ID: $(aspire-bot-app-id) + ASPIRE_BOT_PRIVATE_KEY: $(aspire-bot-private-key) diff --git a/eng/pipelines/common-variables.yml b/eng/pipelines/common-variables.yml index 7d2c847a391..dfa7d3b20f7 100644 --- a/eng/pipelines/common-variables.yml +++ b/eng/pipelines/common-variables.yml @@ -58,3 +58,13 @@ variables: # repositories." Consumed by publish-winget.yml to guard upstream submission. - name: _IsProductionBranch value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/release/'), startsWith(variables['Build.SourceBranch'], 'refs/heads/internal/release/')) }} + + # Branches where notify_failure / notify_success file or close GitHub + # issues. Excludes internal/release/* so internal branch names don't + # leak into the public microsoft/aspire tracker. + # + # IMPORTANT: exact match on refs/heads/main, not startsWith — the + # pipeline trigger's `main*` wildcard would otherwise sweep in + # branches like main-something. + - name: _IsNotificationBranch + value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/release/')) }} From 7870f74f8a2eaddf46d136448e17eb42a0505955 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Fri, 5 Jun 2026 01:26:31 -0400 Subject: [PATCH 3/8] docs(ci): document internal-build-failure notification flow Adds docs/ci/internal-build-failure-notifications.md describing the contract (labels, marker syntax, failures-table region, dedupe behavior, dry-run, manual cleanup) and pointer from eng/pipelines/README.md so anyone reading the pipeline docs lands on the notification system. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../internal-build-failure-notifications.md | 164 ++++++++++++++++++ eng/pipelines/README.md | 6 + 2 files changed, 170 insertions(+) create mode 100644 docs/ci/internal-build-failure-notifications.md diff --git a/docs/ci/internal-build-failure-notifications.md b/docs/ci/internal-build-failure-notifications.md new file mode 100644 index 00000000000..1c74ec5affc --- /dev/null +++ b/docs/ci/internal-build-failure-notifications.md @@ -0,0 +1,164 @@ +# Internal build failure notifications + +The internal Azure DevOps pipeline (`microsoft-aspire`, definition 1602, +defined in [`eng/pipelines/azure-pipelines.yml`](../../eng/pipelines/azure-pipelines.yml)) +files a GitHub issue on [microsoft/aspire](https://github.com/microsoft/aspire/issues) +when it breaks on a publishing branch, and closes that issue when the next +build of the same branch goes green. + +This document describes the contract so future maintainers can reason about +the behavior without re-reading the pipeline YAML. + +## What gets notified + +Two stages run at the end of every non-PR internal build: + +- `notify_failure` — files or updates a GitHub issue when at least one + of `build_sign_native` / `build` / `prepare_installers` ends with + `Failed`. +- `notify_success` — closes any open `ci-broken` issue for the branch + when all three upstream stages end with `Succeeded` or + `SucceededWithIssues` (`prepare_installers` may also legitimately + `Skip` on stable GA release builds — that is accepted as success). + +Both stages gate on the branch being either: + +- `refs/heads/main` (exact — the trigger uses the wildcard `main*` so an + exact match here is load-bearing to avoid sweeping in branches like + `main-something`), or +- `refs/heads/release/*`. + +`internal/release/*` is deliberately excluded so internal branch names +don't leak into the public issue tracker. Pull-request builds are also +excluded. + +The two stages must be at the stage level (not as two jobs in a single +stage) because cross-stage dependency results can only be referenced +from a stage condition via `dependencies..result`; from a job +condition the only available form is +`stageDependencies...result`, which has no stage-aggregate +equivalent. + +## What gets filed + +When the `notify_failure` stage fires, it creates (or appends a comment to) +a single GitHub issue per affected branch: + +- **Title:** `Internal build broken on ` +- **Labels:** `area-engineering-systems`, `ci-broken`, `blocking-clean-ci` +- **Assignees:** `joperezr`, `radical` +- **Body marker:** the first line is a hidden HTML comment + `` used for dedup. + +Only one open issue per branch exists at a time. + +The body contains a managed markdown table inside a fenced region +delimited by `` / +``. Each row records one failure: index, +UTC timestamp, build link, commit SHA (linked to the GitHub commit), +and the comma-separated list of failed upstream stages (`build`, +`build_sign_native`, `prepare_installers`). + +On each subsequent failure the script: + +1. **Updates the issue body** to append a new row to the table. Only + content between the fenced markers is rewritten — any human-added + prose elsewhere in the body is preserved. +2. **Posts a follow-up comment** containing the new build link, commit + SHA, and `cc @joperezr @radical`. The comment is what fires + notifications — body edits don't. + +Visible rows in the table are capped (currently 50). Older rows are +collapsed into a `_N earlier failures omitted_` summary line; the full +per-failure history remains in the issue's comments. + +`Canceled` stage results (operator cancellation, 1ES timeouts) intentionally +do not file an issue — the stage condition uses explicit `in(..., 'Failed')` +checks which exclude `Canceled`. + +## What gets closed + +The `notify_success` stage lists open `ci-broken` issues, filters by the +branch marker, and for each match posts a "build is green again" comment +and closes the issue with `state_reason: completed`. + +## Dedup and race handling + +Issue lookup uses `GET /repos/microsoft/aspire/issues?labels=ci-broken&state=open` +(strongly consistent) plus a local body-marker filter. The Search API is +intentionally avoided because its 1–2 minute eventual-consistency window +would cause near-simultaneous failed builds to each see "0 hits" and file +duplicate issues. + +After the create-issue path, the script re-lists immediately. If our +just-created issue is not the oldest carrying the marker, it closes itself +as a duplicate of the older issue. This self-heals the rare race when two +builds fail within seconds of each other. + +## Auth + +The script mints an installation access token for the **aspire-repo-bot** +GitHub App via [`Get-AspireBotInstallationToken.ps1`](../../eng/pipelines/scripts/Get-AspireBotInstallationToken.ps1) +(the same helper used by the release pipeline's +`dispatch-release-github-tasks.ps1`). The token is immediately marked as a +secret AzDO pipeline variable so any incidental log echo is redacted. + +The App's `aspire-bot-app-id` and `aspire-bot-private-key` secrets come +from the `Aspire-Release-Secrets` variable group, imported at pipeline +scope in `eng/pipelines/azure-pipelines.yml` and gated on non-PR builds +of `refs/heads/main` or `refs/heads/release/*` — the same condition the +notify stages use. Manual runs on feature branches and PR builds skip +the import entirely. + +**Prerequisite**: the aspire-repo-bot install on microsoft/aspire must have +`issues:write` permission. If missing, the script will 403 on every call +(but never break the build — see below). + +## Disabling for a single run + +Queue the pipeline manually and set `Notify on failure: dry-run` to true. +In dry-run mode, both stages log the REST calls they *would* make without +mutating anything on GitHub. This applies to both the failure and success +paths — a green-build dry-run will not accidentally close real open +issues. + +Dry-run mode is fully decoupled from the aspire-repo-bot credentials: +the wrapper omits the `ASPIRE_BOT_APP_ID` / `ASPIRE_BOT_PRIVATE_KEY` env +block and the script's `-AppId` / `-PrivateKeyPem` parameters are +non-mandatory, so a dry-run validation works without Aspire-Release-Secrets +variable group access and never mints a token. + +## Why this never breaks the build + +[`Notify-GitHubOnBuildResult.ps1`](../../eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1) +wraps the entire body in `try`/`catch` and always exits 0. Any GitHub API +error, network blip, or 401/403 from a missing App permission produces a +`Write-Warning` in the job log but leaves the build result unchanged. A +flaky notification path must never turn an otherwise-correct build red. + +However, a silently-skipped notification is its own failure mode — operators +need to see when the notification path itself broke (e.g., revoked App +permission, GitHub API shape change, deleted label). The catch block emits +AzDO logging commands so failures are visible without breaking the build: + +- `##vso[task.logissue type=warning]` surfaces the warning in the build + summary, in 1ES dashboards, and on the badge. +- `##vso[task.complete result=SucceededWithIssues;]` bumps the job result + to `SucceededWithIssues`, which renders as a yellow badge instead of + green. Notifications and dashboards can filter on this. + +A build that finishes "green-but-yellow" means the upstream build itself +succeeded, but the notify stage's call to GitHub failed for some reason — +worth investigating, but does not block anything that depends on the build. + +## Manually filing or closing + +If you need to file or close a `ci-broken` issue by hand (e.g. during +recovery), use the existing label and add the marker `` +as the first line of the body. The script's next run will treat it as the +canonical open issue and append/close accordingly. + +For the failures table to grow, also include the fenced region +(`` / ``) +somewhere in the body. If the markers are absent the script logs a +warning, leaves the body alone, and still posts the follow-up comment. diff --git a/eng/pipelines/README.md b/eng/pipelines/README.md index 2cdb0911344..9ab738eedef 100644 --- a/eng/pipelines/README.md +++ b/eng/pipelines/README.md @@ -40,3 +40,9 @@ This pipeline: ## Template Structure The public pipelines (`azure-pipelines-public.yml` and `azdo-tests.yml`) use a shared template (`templates/public-pipeline-template.yml`) to avoid code duplication while maintaining the same functionality. + +## Build-result notifications + +`azure-pipelines.yml` files a GitHub issue on microsoft/aspire when the +internal build breaks on `main` or `release/*`, and closes it when the +next build is green. See [docs/ci/internal-build-failure-notifications.md](../../docs/ci/internal-build-failure-notifications.md). From d62bd59abe6f88f5ccd8835335094ab4bba699e4 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Fri, 5 Jun 2026 01:37:29 -0400 Subject: [PATCH 4/8] =?UTF-8?q?fix(ci-notify):=20use=20task.setsecret=20fo?= =?UTF-8?q?r=20token;=20correct=20REST=E2=86=92gh=20wording?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Notify-GitHubOnBuildResult.ps1: register the GitHub App installation token via `##vso[task.setsecret]` instead of `##vso[task.setvariable ...;issecret=true]`. The only purpose was log masking, but task.setvariable also persists the value as a job-scoped variable that other tasks could accidentally reference via $(__notifyGhToken). task.setsecret gives us the masking without the persistence. - azure-pipelines.yml + docs: dry-run mode logs the `gh` CLI commands it would run, not GitHub REST calls. Fix the parameter displayName, the parameter comment, and the corresponding docs paragraph. - docs: Azure Pipelines stage results use `Skipped`, not `Skip` (the YAML stage condition correctly checks for 'Skipped'). Fix the prose. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/ci/internal-build-failure-notifications.md | 11 ++++++----- eng/pipelines/azure-pipelines.yml | 4 ++-- .../scripts/Notify-GitHubOnBuildResult.ps1 | 14 +++++++++----- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/docs/ci/internal-build-failure-notifications.md b/docs/ci/internal-build-failure-notifications.md index 1c74ec5affc..8aea7789694 100644 --- a/docs/ci/internal-build-failure-notifications.md +++ b/docs/ci/internal-build-failure-notifications.md @@ -19,7 +19,8 @@ Two stages run at the end of every non-PR internal build: - `notify_success` — closes any open `ci-broken` issue for the branch when all three upstream stages end with `Succeeded` or `SucceededWithIssues` (`prepare_installers` may also legitimately - `Skip` on stable GA release builds — that is accepted as success). + end with `Skipped` on stable GA release builds — that is accepted + as success). Both stages gate on the branch being either: @@ -117,10 +118,10 @@ the import entirely. ## Disabling for a single run Queue the pipeline manually and set `Notify on failure: dry-run` to true. -In dry-run mode, both stages log the REST calls they *would* make without -mutating anything on GitHub. This applies to both the failure and success -paths — a green-build dry-run will not accidentally close real open -issues. +In dry-run mode, both stages log the `gh` CLI commands they *would* run +without mutating anything on GitHub. This applies to both the failure +and success paths — a green-build dry-run will not accidentally close +real open issues. Dry-run mode is fully decoupled from the aspire-repo-bot credentials: the wrapper omits the `ASPIRE_BOT_APP_ID` / `ASPIRE_BOT_PRIVATE_KEY` env diff --git a/eng/pipelines/azure-pipelines.yml b/eng/pipelines/azure-pipelines.yml index 63aa8c39024..7043ebb1e5f 100644 --- a/eng/pipelines/azure-pipelines.yml +++ b/eng/pipelines/azure-pipelines.yml @@ -22,12 +22,12 @@ parameters: - staging - daily # When true, the notify_failure and notify_success stages run in dry-run - # mode: they log the GitHub REST calls they would make to file/update/close + # mode: they log the `gh` CLI commands they would run to file/update/close # the ci-broken issue, but do not actually mutate anything on # microsoft/aspire. Use this when validating pipeline plumbing changes # without spamming issues. - name: notifyOnFailureDryRun - displayName: 'Notify on failure: dry-run (log REST calls, do not mutate)' + displayName: 'Notify on failure: dry-run (log gh commands, do not mutate)' type: boolean default: false diff --git a/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 index ed8591d93a1..30a10fae890 100644 --- a/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 +++ b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 @@ -422,11 +422,15 @@ try { Write-NotifyWarning "Failed to mint installation token. Skipping notification." Exit-NotifyScript } - # Mark the token as a secret pipeline variable so any incidental - # log echo in this script or downstream tasks gets redacted by AzDO. - # Get-AspireBotInstallationToken.ps1 prints the token to stdout for - # caller capture; this is the AzDO-side hardening. - Write-Host "##vso[task.setvariable variable=__notifyGhToken;issecret=true]$token" + # Register the token with AzDO so any incidental log echo in this + # script or downstream tasks gets redacted. Using task.setsecret + # rather than task.setvariable;issecret=true so the token isn't + # also persisted as a job-scoped variable that other tasks could + # accidentally reference via $(__notifyGhToken) — we only need + # the log-masking effect. The token is consumed by `gh` through + # the GH_TOKEN process env var set below. + # https://learn.microsoft.com/azure/devops/pipelines/scripts/logging-commands#setsecret-register-a-value-as-a-secret + Write-Host "##vso[task.setsecret]$token" # gh reads its auth token from GH_TOKEN (process env var, not # persisted to gh's config file). Set here so every subsequent # `gh` call in this process authenticates as aspire-repo-bot. From 089eef1fa18cd38678bd75c73bfef35db5cd15e8 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Tue, 9 Jun 2026 13:52:51 -0400 Subject: [PATCH 5/8] test(ci): add unit tests for Notify-GitHubOnBuildResult.ps1 The internal-build-failure notification script had no automated coverage, despite the established tests/Infrastructure.Tests/PowerShellScripts/ pattern and the script's highly testable pure helpers. Add NotifyGitHubOnBuildResultTests covering: - Test-NotifiableBranch: main (exact), release/*, and the negatives that matter (main-something, mainline, internal/release/*, feature/*) - New-FailureTableRow: pipe-escaping, short-SHA shortening with full-SHA link, sub-7-char SHA fallback, em-dash for empty FailedStages - Update-FailuresTableInBody: append + surrounding-prose preservation, max-rows rollover into the "earlier failures omitted" summary, omitted-count carry into the next index, and the missing-markers warning path (body left unchanged) - -DryRun (Failure + Success): exits 0 and launches zero gh processes, verified with a recording fake gh placed ahead of the real one on PATH The helpers are exercised by dot-sourcing the shipped script unmodified with a non-notifiable branch, so its main routine bails before any token mint or gh call and leaves the functions in scope. 'exit' inside a dot-sourced script only unwinds that script, not the test harness, so no testability hook in the production script is needed. Repo-root lookup reuses the shared worktree-aware TestUtils.FindRepoRoot() rather than adding another private Aspire.slnx walk. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Infrastructure.Tests.csproj | 1 + .../NotifyGitHubOnBuildResultTests.cs | 305 ++++++++++++++++++ 2 files changed, 306 insertions(+) create mode 100644 tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs diff --git a/tests/Infrastructure.Tests/Infrastructure.Tests.csproj b/tests/Infrastructure.Tests/Infrastructure.Tests.csproj index 49f855a65b2..bad4bcd2ab4 100644 --- a/tests/Infrastructure.Tests/Infrastructure.Tests.csproj +++ b/tests/Infrastructure.Tests/Infrastructure.Tests.csproj @@ -29,6 +29,7 @@ + diff --git a/tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs b/tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs new file mode 100644 index 00000000000..b2adfcba50e --- /dev/null +++ b/tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs @@ -0,0 +1,305 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Text.RegularExpressions; +using Aspire.TestUtilities; +using Aspire.Templates.Tests; +using Xunit; + +namespace Infrastructure.Tests; + +/// +/// Tests for eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1. +/// +/// The pure helper functions (branch classification, failures-table editing, +/// row formatting) are exercised by dot-sourcing the script unmodified and +/// invoking the functions directly. Dot-sourcing with a non-notifiable branch +/// makes the script's main routine bail out before any token mint or gh call, +/// so the helpers are left defined in scope with no side effects. ('exit' inside +/// a dot-sourced script only unwinds that script, not the test harness.) +/// +/// The -DryRun contract is exercised end-to-end against the real script with a +/// recording fake 'gh' on PATH, asserting that no gh process is ever launched. +/// +public sealed class NotifyGitHubOnBuildResultTests : IDisposable +{ + private readonly TestTempDirectory _tempDir = new(); + private readonly string _scriptPath; + private readonly ITestOutputHelper _output; + + public NotifyGitHubOnBuildResultTests(ITestOutputHelper output) + { + _output = output; + _scriptPath = Path.Combine( + TestUtils.FindRepoRoot()?.FullName ?? throw new InvalidOperationException("Could not find repository root"), + "eng", "pipelines", "scripts", "Notify-GitHubOnBuildResult.ps1"); + } + + public void Dispose() => _tempDir.Dispose(); + + [Theory] + [InlineData("main", true)] + [InlineData("release/9.0", true)] + [InlineData("release/10.1.2", true)] + [InlineData("main-something", false)] // pipeline trigger uses a main* wildcard; must match 'main' exactly + [InlineData("mainline", false)] + [InlineData("internal/release/9.0", false)] + [InlineData("feature/anything", false)] + [RequiresTools(["pwsh"])] + public async Task TestNotifiableBranch_ClassifiesBranches(string branch, bool expected) + { + var (_, outContent) = await RunHarnessAsync($"$__result = Test-NotifiableBranch -Name '{branch}'"); + + Assert.Equal(expected ? "True" : "False", outContent.Trim()); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task NewFailureTableRow_EscapesPipesInFailedStages() + { + var (_, row) = await RunHarnessAsync(""" + $CommitSha = 'abcdef1234567' + $FailedStages = 'build | test' + $BuildNumber = 'B9' + $BuildUrl = 'https://example.test/9' + $__result = New-FailureTableRow -Index 3 + """); + + // A raw pipe would split the markdown table cell; it must be backslash-escaped. + Assert.Contains(@"build \| test", row); + Assert.DoesNotContain("build | test", row); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task NewFailureTableRow_ShortensCommitShaToSevenChars() + { + var (_, row) = await RunHarnessAsync(""" + $CommitSha = 'abcdef1234567' + $FailedStages = 'build' + $BuildNumber = 'B9' + $BuildUrl = 'https://example.test/9' + $__result = New-FailureTableRow -Index 3 + """); + + // Displayed sha is shortened to 7 chars, but the commit link uses the full sha. + Assert.Contains("[`abcdef1`]", row); + Assert.Contains("/commit/abcdef1234567)", row); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task NewFailureTableRow_HandlesShaShorterThanSevenChars() + { + var (_, row) = await RunHarnessAsync(""" + $CommitSha = 'abc' + $FailedStages = 'build' + $BuildNumber = 'B9' + $BuildUrl = 'https://example.test/9' + $__result = New-FailureTableRow -Index 3 + """); + + // Substring(0, 7) would throw on a sub-7-char sha; the script must fall back to the full value. + Assert.Contains("[`abc`]", row); + Assert.Contains("/commit/abc)", row); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task NewFailureTableRow_RendersEmDashForEmptyFailedStages() + { + var (_, row) = await RunHarnessAsync(""" + $CommitSha = 'abcdef1234567' + $FailedStages = '' + $BuildNumber = 'B9' + $BuildUrl = 'https://example.test/9' + $__result = New-FailureTableRow -Index 3 + """); + + Assert.Contains("| \u2014 |", row); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task UpdateFailuresTableInBody_AppendsRowAndPreservesSurroundingProse() + { + var (_, body) = await RunHarnessAsync(""" + $begin = '' + $end = '' + $hdr = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" + $existing = "INTRO PROSE`n$begin`n$hdr`n| 1 | 2026-01-01 00:00 | [B1](u) | [``abc``](c) | x |`n$end`nTRAILING PROSE" + $__result = Update-FailuresTableInBody -Body $existing + """); + + Assert.Equal(2, CountDataRows(body)); + Assert.Contains("| 2 |", body); + // Human-authored prose outside the managed region must survive the edit. + Assert.Contains("INTRO PROSE", body); + Assert.Contains("TRAILING PROSE", body); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task UpdateFailuresTableInBody_RollsOverOldestRowsBeyondCap() + { + var (_, body) = await RunHarnessAsync(""" + $begin = '' + $end = '' + $hdr = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" + $rows = (1..50 | ForEach-Object { "| $_ | 2026-01-01 00:00 | [B](u) | [``abc``](c) | x |" }) -join "`n" + $existing = "$begin`n$hdr`n$rows`n$end" + $__result = Update-FailuresTableInBody -Body $existing + """); + + // FailuresTableMaxRows = 50: the 51st row drops the oldest, which is summarized as omitted. + Assert.Equal(50, CountDataRows(body)); + Assert.Matches(@"_\d+ earlier failures omitted", body); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task UpdateFailuresTableInBody_AccountsForExistingOmittedCountInNextIndex() + { + var (_, body) = await RunHarnessAsync(""" + $begin = '' + $end = '' + $hdr = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" + $omitted = '_3 earlier failures omitted; see issue comments._' + $existing = "$begin`n$omitted`n`n$hdr`n| 4 | 2026-01-01 00:00 | [B](u) | [``abc``](c) | x |`n| 5 | 2026-01-01 00:00 | [B](u) | [``abc``](c) | x |`n$end" + $__result = Update-FailuresTableInBody -Body $existing + """); + + // nextIndex = visibleRows(2) + omitted(3) + 1 = 6 + Assert.Contains("| 6 |", body); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task UpdateFailuresTableInBody_LeavesBodyUnchangedAndWarnsWhenMarkersMissing() + { + var (result, body) = await RunHarnessAsync(""" + $existing = 'a hand-edited body with no managed markers' + $__result = Update-FailuresTableInBody -Body $existing + """); + + Assert.Equal("a hand-edited body with no managed markers", body); + Assert.Contains("Could not locate failures-table markers", result.Output); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task DryRunFailureMode_LaunchesNoGhProcess() + { + var result = await RunRealScriptDryRunAsync("Failure"); + + result.EnsureExitCode(0); + Assert.Contains("DRY-RUN", result.Output); + Assert.Contains("gh issue create", result.Output); + AssertNoGhInvoked(); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task DryRunSuccessMode_LaunchesNoGhProcess() + { + var result = await RunRealScriptDryRunAsync("Success"); + + result.EnsureExitCode(0); + Assert.Contains("DRY-RUN", result.Output); + Assert.Contains("gh issue close --reason completed", result.Output); + AssertNoGhInvoked(); + } + + private async Task<(CommandResult Result, string OutContent)> RunHarnessAsync(string body) + { + var harnessPath = Path.Combine(_tempDir.Path, $"harness-{Guid.NewGuid():N}.ps1"); + var outPath = Path.Combine(_tempDir.Path, $"out-{Guid.NewGuid():N}.txt"); + + File.WriteAllText(harnessPath, HarnessTemplate.Replace("# __BODY__", body)); + + using var cmd = new PowerShellCommand(harnessPath, _output) + .WithTimeout(TimeSpan.FromMinutes(2)); + + var result = await cmd.ExecuteAsync( + "-ScriptPath", $"\"{_scriptPath}\"", + "-Out", $"\"{outPath}\""); + + result.EnsureExitCode(0, "notify harness failed"); + + var outContent = File.Exists(outPath) ? File.ReadAllText(outPath) : string.Empty; + return (result, outContent); + } + + private async Task RunRealScriptDryRunAsync(string mode) + { + var fakeBin = Path.Combine(_tempDir.Path, $"fakebin-{Guid.NewGuid():N}"); + Directory.CreateDirectory(fakeBin); + var ghCallLog = Path.Combine(_tempDir.Path, $"gh-calls-{Guid.NewGuid():N}.log"); + WriteFakeGh(fakeBin); + + var pathWithFakeGh = fakeBin + Path.PathSeparator + Environment.GetEnvironmentVariable("PATH"); + + using var cmd = new PowerShellCommand(_scriptPath, _output) + .WithTimeout(TimeSpan.FromMinutes(2)) + .WithEnvironmentVariable("PATH", pathWithFakeGh) + .WithEnvironmentVariable("GH_CALL_LOG", ghCallLog); + + _lastGhCallLog = ghCallLog; + + return await cmd.ExecuteAsync( + "-Mode", mode, + "-Branch", "main", + "-BuildId", "1", + "-BuildNumber", "B1", + "-BuildUrl", "https://example.test/1", + "-CommitSha", "deadbeefcafef00d", + "-DryRun"); + } + + private string? _lastGhCallLog; + + private void AssertNoGhInvoked() + { + // The recording fake gh appends to GH_CALL_LOG on every invocation, so the + // file existing at all means dry-run launched gh — which it must never do. + if (_lastGhCallLog is not null && File.Exists(_lastGhCallLog)) + { + Assert.Fail($"Dry-run must not launch gh, but the fake gh recorded calls:\n{File.ReadAllText(_lastGhCallLog)}"); + } + } + + private static void WriteFakeGh(string dir) + { + if (OperatingSystem.IsWindows()) + { + File.WriteAllText( + Path.Combine(dir, "gh.cmd"), + "@echo off\r\n>>\"%GH_CALL_LOG%\" echo %*\r\nexit /b 0\r\n"); + } + else + { + var ghPath = Path.Combine(dir, "gh"); + File.WriteAllText(ghPath, "#!/bin/sh\nprintf '%s\\n' \"$*\" >> \"$GH_CALL_LOG\"\nexit 0\n"); + File.SetUnixFileMode( + ghPath, + UnixFileMode.UserRead | UnixFileMode.UserWrite | UnixFileMode.UserExecute | + UnixFileMode.GroupRead | UnixFileMode.GroupExecute | + UnixFileMode.OtherRead | UnixFileMode.OtherExecute); + } + } + + private static int CountDataRows(string body) + => Regex.Matches(body, @"(?m)^\|\s*\d+\s*\|").Count; + + // Dot-sources the shipped script with a non-notifiable branch (so main bails before any + // gh/token work) and runs the per-test body, which sets $__result. The "# __BODY__" token + // is replaced with the test body; the script's $ and {} are left verbatim. + private const string HarnessTemplate = """ + param([string]$ScriptPath, [string]$Out) + $ErrorActionPreference = 'Stop' + . "$ScriptPath" -Mode Failure -Branch 'ci-test-harness-not-notifiable' -BuildId '1' -BuildNumber 'B1' -BuildUrl 'https://example.test/1' -CommitSha 'deadbeefcafef00d' -DryRun *> $null + $Owner = 'microsoft'; $Repo = 'aspire' + # __BODY__ + [System.IO.File]::WriteAllText($Out, [string]$__result) + """; +} From 719f84373f5fa3931843c5ccf11e192b9b41bb8f Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Tue, 9 Jun 2026 14:45:53 -0400 Subject: [PATCH 6/8] fix(ci-notify): cover all build stages so failures aren't missed The internal pipeline restructure (#17760) added build_extension, template_tests, and assemble (publish) stages. The notify stages only watched build / build_sign_native / prepare_installers, so: - a failure isolated to a new stage (e.g. assemble) filed NO issue, and - notify_success could falsely CLOSE an open ci-broken issue, because a dependency failure surfaces downstream as Skipped, not Failed. Coverage fix: - both notify stages now dependsOn and check all six build stages - notify_success allows Skipped only for prepare_installers (defensive; on notifiable branches it runs whenever build_sign_native succeeded) - FailedStages list and docs extended to the full stage set Also addresses review feedback on the notifier script: - probe for gh up front (Test-GhAvailable) so a missing CLI warns instead of silently swallowing every API call - anchor the dedupe-marker match to start-of-line (Test-IssueBodyMatchesMarker) so it can't match the marker pasted mid-prose into an unrelated issue - drop the in-body failures table; per-failure comments are the history. Re-home the failed-stages list into the issue body and the follow-up comment - cross-reference the three places the notifiable-branch policy lives - correct the task.setsecret wording in the docs - tests updated to cover the new helpers The YAML coverage change cannot run on GitHub PRs; validate with an AzDO dry-run (notifyOnFailureDryRun=true) before merge. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../internal-build-failure-notifications.md | 72 +++---- eng/pipelines/azure-pipelines.yml | 62 ++++-- .../scripts/Notify-GitHubOnBuildResult.ps1 | 190 +++++++----------- .../NotifyGitHubOnBuildResultTests.cs | 181 +++++++++-------- 4 files changed, 258 insertions(+), 247 deletions(-) diff --git a/docs/ci/internal-build-failure-notifications.md b/docs/ci/internal-build-failure-notifications.md index 8aea7789694..16623be11fa 100644 --- a/docs/ci/internal-build-failure-notifications.md +++ b/docs/ci/internal-build-failure-notifications.md @@ -11,16 +11,30 @@ the behavior without re-reading the pipeline YAML. ## What gets notified -Two stages run at the end of every non-PR internal build: +Two stages run at the end of every non-PR internal build, after every +other stage: - `notify_failure` — files or updates a GitHub issue when at least one - of `build_sign_native` / `build` / `prepare_installers` ends with - `Failed`. + build stage ends with `Failed`. It depends on **every** non-notify + stage — `build_sign_native`, `build_extension`, `build`, + `template_tests`, `assemble`, and `prepare_installers` — so a break + anywhere (including the publish stage `assemble`) files an issue. - `notify_success` — closes any open `ci-broken` issue for the branch - when all three upstream stages end with `Succeeded` or - `SucceededWithIssues` (`prepare_installers` may also legitimately - end with `Skipped` on stable GA release builds — that is accepted - as success). + when all of those stages end with `Succeeded` or `SucceededWithIssues` + (`prepare_installers` may also end with `Skipped`; see below). + +A stage whose dependency failed is reported as `Skipped` (not `Failed`) +by Azure Pipelines, so each stage must be watched directly — a single +downstream stage cannot be relied on to roll failures up. This is why +both conditions enumerate the full stage set rather than gating on one +terminal stage. + +`prepare_installers` is allowed to end with `Skipped` in the success +condition only as a defensive measure. On a notifiable branch +(`main` / `release/*`) it runs whenever `build_sign_native` succeeded, so +a `Skipped` there implies `build_sign_native` did not succeed — a failure +path already covered by `notify_failure`. (Its own condition only skips on +the *stable* channel for **non**-notifiable branches.) Both stages gate on the branch being either: @@ -53,25 +67,16 @@ a single GitHub issue per affected branch: Only one open issue per branch exists at a time. -The body contains a managed markdown table inside a fenced region -delimited by `` / -``. Each row records one failure: index, -UTC timestamp, build link, commit SHA (linked to the GitHub commit), -and the comma-separated list of failed upstream stages (`build`, -`build_sign_native`, `prepare_installers`). +The body records the first failure's build link, commit SHA, and the +comma-separated list of failed stages (any of `build_sign_native`, +`build_extension`, `build`, `template_tests`, `assemble`, +`prepare_installers`). It is written once at creation and is **not** +rewritten afterwards. -On each subsequent failure the script: - -1. **Updates the issue body** to append a new row to the table. Only - content between the fenced markers is rewritten — any human-added - prose elsewhere in the body is preserved. -2. **Posts a follow-up comment** containing the new build link, commit - SHA, and `cc @joperezr @radical`. The comment is what fires - notifications — body edits don't. - -Visible rows in the table are capped (currently 50). Older rows are -collapsed into a `_N earlier failures omitted_` summary line; the full -per-failure history remains in the issue's comments. +On each subsequent failure the script **posts a follow-up comment** with +that build's link, commit SHA, failed stages, and `cc @joperezr @radical`. +The comments are the per-failure history — and the comment is what fires +notifications, since editing the issue body would not. `Canceled` stage results (operator cancellation, 1ES timeouts) intentionally do not file an issue — the stage condition uses explicit `in(..., 'Failed')` @@ -101,8 +106,10 @@ builds fail within seconds of each other. The script mints an installation access token for the **aspire-repo-bot** GitHub App via [`Get-AspireBotInstallationToken.ps1`](../../eng/pipelines/scripts/Get-AspireBotInstallationToken.ps1) (the same helper used by the release pipeline's -`dispatch-release-github-tasks.ps1`). The token is immediately marked as a -secret AzDO pipeline variable so any incidental log echo is redacted. +`dispatch-release-github-tasks.ps1`). The token is immediately registered +as a secret with the agent via `##vso[task.setsecret]` so any incidental +log echo is redacted; it is consumed by `gh` through the `GH_TOKEN` process +environment variable and is not persisted as a pipeline variable. The App's `aspire-bot-app-id` and `aspire-bot-private-key` secrets come from the `Aspire-Release-Secrets` variable group, imported at pipeline @@ -156,10 +163,7 @@ worth investigating, but does not block anything that depends on the build. If you need to file or close a `ci-broken` issue by hand (e.g. during recovery), use the existing label and add the marker `` -as the first line of the body. The script's next run will treat it as the -canonical open issue and append/close accordingly. - -For the failures table to grow, also include the fenced region -(`` / ``) -somewhere in the body. If the markers are absent the script logs a -warning, leaves the body alone, and still posts the follow-up comment. +as the first line of the body. The marker must start a line — the script +matches it anchored to the start of a line so the text can't accidentally +match if pasted mid-prose into an unrelated issue. The script's next run +will treat the issue as the canonical open one and append/close accordingly. diff --git a/eng/pipelines/azure-pipelines.yml b/eng/pipelines/azure-pipelines.yml index 9526e058404..38f1803386d 100644 --- a/eng/pipelines/azure-pipelines.yml +++ b/eng/pipelines/azure-pipelines.yml @@ -912,28 +912,45 @@ extends: displayName: Notify Build Failure dependsOn: - build_sign_native + - build_extension - build + - template_tests + - assemble - prepare_installers - # Files an issue when at least one upstream stage Failed. - # Canceled stage results match neither 'Failed' nor 'Succeeded', - # so operator/timeout cancellations produce no notification. + # Files an issue when at least one watched stage Failed. We depend on + # EVERY non-notify stage so a break anywhere — including the publish + # stage `assemble`, the VSIX build `build_extension`, or `template_tests` + # — files an issue. A stage whose dependency failed is reported as + # 'Skipped' (not 'Failed'), so each stage must be watched directly + # rather than relying on one downstream stage to roll failures up. + # Canceled stage results match neither 'Failed' nor 'Succeeded', so + # operator/timeout cancellations produce no notification. condition: | and( ne(variables['Build.Reason'], 'PullRequest'), eq(variables['_IsNotificationBranch'], 'true'), or( - in(dependencies.build.result, 'Failed'), in(dependencies.build_sign_native.result, 'Failed'), + in(dependencies.build_extension.result, 'Failed'), + in(dependencies.build.result, 'Failed'), + in(dependencies.template_tests.result, 'Failed'), + in(dependencies.assemble.result, 'Failed'), in(dependencies.prepare_installers.result, 'Failed') ) ) variables: # Stage results, exposed as runtime variables so the pwsh step below # can compose a "failed stages" list for the GitHub notification. - - name: BuildStageResult - value: $[ dependencies.build.result ] - name: BuildSignNativeStageResult value: $[ dependencies.build_sign_native.result ] + - name: BuildExtensionStageResult + value: $[ dependencies.build_extension.result ] + - name: BuildStageResult + value: $[ dependencies.build.result ] + - name: TemplateTestsStageResult + value: $[ dependencies.template_tests.result ] + - name: AssembleStageResult + value: $[ dependencies.assemble.result ] - name: PrepareInstallersStageResult value: $[ dependencies.prepare_installers.result ] # Drives the -DryRun:$value pattern in the pwsh wrapper below. @@ -995,8 +1012,11 @@ extends: # Compose the failed-stages list from the upstream stage # results exposed via stage-scoped variables above. $failed = @() - if ('$(BuildStageResult)' -eq 'Failed') { $failed += 'build' } if ('$(BuildSignNativeStageResult)' -eq 'Failed') { $failed += 'build_sign_native' } + if ('$(BuildExtensionStageResult)' -eq 'Failed') { $failed += 'build_extension' } + if ('$(BuildStageResult)' -eq 'Failed') { $failed += 'build' } + if ('$(TemplateTestsStageResult)' -eq 'Failed') { $failed += 'template_tests' } + if ('$(AssembleStageResult)' -eq 'Failed') { $failed += 'assemble' } if ('$(PrepareInstallersStageResult)' -eq 'Failed') { $failed += 'prepare_installers' } $failedStages = $failed -join ', ' @@ -1023,19 +1043,35 @@ extends: displayName: Notify Build Success dependsOn: - build_sign_native + - build_extension - build + - template_tests + - assemble - prepare_installers - # Only fires when all three upstream stages are Succeeded / - # SucceededWithIssues — except prepare_installers may legitimately - # Skip on stable GA release builds (its own condition excludes - # stable channel builds), which is not a failure. Canceled stage - # results don't match either branch and produce no notification. + # Only closes the issue when EVERY watched stage is Succeeded / + # SucceededWithIssues (the same stage set as notify_failure). Requiring + # the full set means a build that fixes one stage but breaks another + # (e.g. green build/build_sign_native but a failed `assemble`) does NOT + # falsely close the open ci-broken issue. + # + # prepare_installers also permits 'Skipped': on a notifiable branch + # (main / release/*) it runs whenever build_sign_native succeeded, so a + # Skip there implies build_sign_native did not succeed — a failure path + # already covered by notify_failure. (The stable-channel skip in its own + # condition only takes effect on non-notifiable branches.) Allowing + # 'Skipped' is therefore defensive and cannot cause a false close. + # + # Canceled stage results don't match either branch and produce no + # notification. condition: | and( ne(variables['Build.Reason'], 'PullRequest'), eq(variables['_IsNotificationBranch'], 'true'), - in(dependencies.build.result, 'Succeeded', 'SucceededWithIssues'), in(dependencies.build_sign_native.result, 'Succeeded', 'SucceededWithIssues'), + in(dependencies.build_extension.result, 'Succeeded', 'SucceededWithIssues'), + in(dependencies.build.result, 'Succeeded', 'SucceededWithIssues'), + in(dependencies.template_tests.result, 'Succeeded', 'SucceededWithIssues'), + in(dependencies.assemble.result, 'Succeeded', 'SucceededWithIssues'), in(dependencies.prepare_installers.result, 'Succeeded', 'SucceededWithIssues', 'Skipped') ) variables: diff --git a/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 index 30a10fae890..a8eb3d3408b 100644 --- a/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 +++ b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 @@ -8,8 +8,8 @@ # # Auth: mints an aspire-repo-bot GitHub App installation access token via the # shared Get-AspireBotInstallationToken.ps1 helper, exports it as GH_TOKEN for -# the gh CLI, and also marks it as a secret AzDO variable so any incidental -# log echo gets redacted. +# the gh CLI, and registers it as a secret with the agent (task.setsecret) so +# any incidental log echo gets redacted. # # Dedupe strategy: # - Issues are identified by a hidden HTML-comment marker in the body: @@ -24,18 +24,13 @@ # past the consistency window simultaneously, and close ours as a # duplicate of the older one. # -# Failures table: -# - The issue body contains a managed markdown table inside a fenced -# region delimited by / -# . Each subsequent failure on the same -# branch appends a row (build, commit, failed stages, timestamp). -# A follow-up comment is also posted so @-mentions still fire — body -# edits don't generate notifications. -# - Visible rows are capped (FailuresTableMaxRows); older rows are -# summarized as "_N earlier failures omitted_" and remain accessible -# via the per-failure comments. -# - Only content between the markers is rewritten; any human-added -# prose elsewhere in the body is preserved across updates. +# Per-failure history: +# - The issue body is written once at creation (build, commit, and the +# failed stages for the first failure). Each subsequent failure on the +# same branch posts a comment with that build's details and @-mentions — +# editing the body would not re-fire notifications, but comments do. +# - The comments are the per-failure history; the body is not rewritten +# after creation. # # Safety: this script ALWAYS exits 0. A flaky notification path must not # turn an otherwise-correct build red. All API errors are logged via @@ -72,17 +67,6 @@ $Script:IssueLabels = @('area-engineering-systems', 'ci-broken', 'blocking-clean $Script:Assignees = @('joperezr', 'radical') $Script:MentionLine = 'cc @joperezr @radical' -# Managed region in the issue body. Update-FailuresTableInBody only touches -# content between these markers, so any human-added prose elsewhere in the -# body is preserved across updates. -$Script:FailuresTableBeginMarker = '' -$Script:FailuresTableEndMarker = '' -$Script:FailuresTableHeader = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" -# Cap visible rows in the issue body. Older rows are collapsed into a -# "_N earlier failures omitted_" line; full per-failure history is still -# preserved in the issue's comments. -$Script:FailuresTableMaxRows = 50 - # See Exit-NotifyScript for the SucceededWithIssues rationale. $Script:HasWarnings = $false @@ -119,6 +103,13 @@ function Exit-NotifyScript { # Defense in depth: pipeline gate is the primary filter. Pipeline # trigger's `main*` wildcard means we must match `main` exactly. +# +# IMPORTANT: the notifiable-branch policy is defined in THREE places that +# must stay in sync if the policy ever changes: +# 1. _IsNotificationBranch in eng/pipelines/common-variables.yml +# 2. the notify_failure / notify_success stage `condition:` blocks in +# eng/pipelines/azure-pipelines.yml +# 3. this function function Test-NotifiableBranch { param([string]$Name) return ($Name -eq 'main') -or ($Name -like 'release/*') @@ -147,6 +138,33 @@ function Invoke-Gh { return $output } +# Probe for the gh CLI up front. If it's missing from the image, every +# Invoke-Gh call would throw into the top-level catch and we'd silently stop +# filing/closing issues (silent-dead-feature). The release pipeline +# pin-installs gh for the same reason. Returns $true when `gh --version` runs +# and exits 0. +function Test-GhAvailable { + try { + & gh --version *> $null + return ($LASTEXITCODE -eq 0) + } + catch { + return $false + } +} + +# Matches the dedupe marker only at the START of a line. Every issue body this +# script files puts the marker on its own first line, so anchoring avoids a +# false match if the marker text is ever pasted mid-prose into an unrelated +# issue (which success-mode would otherwise comment on and close). +function Test-IssueBodyMatchesMarker { + param([string]$Body, [string]$Marker) + if ([string]::IsNullOrEmpty($Body)) { + return $false + } + return [regex]::IsMatch($Body, '(?m)^' + [regex]::Escape($Marker)) +} + # Lists open issues with the ci-broken label and filters locally on the # branch-specific marker. Returns an array (possibly empty) sorted by issue # number ascending — so [0] is always the oldest open issue for this branch. @@ -165,90 +183,24 @@ function Get-OpenBrokenIssuesForBranch { ) $all = @($json | ConvertFrom-Json) - $matched = @($all | Where-Object { $_.body -and $_.body.Contains($Marker) }) + $matched = @($all | Where-Object { Test-IssueBodyMatchesMarker -Body $_.body -Marker $Marker }) return @($matched | Sort-Object -Property number) } -function New-FailureTableRow { - param([int]$Index) - $shortSha = if ($CommitSha.Length -ge 7) { $CommitSha.Substring(0, 7) } else { $CommitSha } - $commitLink = "[``$shortSha``](https://github.com/$Owner/$Repo/commit/$CommitSha)" - # Escape pipes in user-supplied content so they don't break the markdown - # table. FailedStages currently comes from a fixed enumeration in the - # pipeline, but a future caller could pass arbitrary text. - $stages = if ([string]::IsNullOrWhiteSpace($FailedStages)) { '—' } else { $FailedStages -replace '\|', '\|' } - $when = (Get-Date).ToUniversalTime().ToString('yyyy-MM-dd HH:mm') - return "| $Index | $when | [$BuildNumber]($BuildUrl) | $commitLink | $stages |" -} - -# Initial failures table section embedded in a freshly-filed issue body. -# Contains the header plus the first failure row (index 1). -function New-InitialFailuresTableSection { - $row = New-FailureTableRow -Index 1 - return @" -$($Script:FailuresTableBeginMarker) -$($Script:FailuresTableHeader) -$row -$($Script:FailuresTableEndMarker) -"@ -} - -# Splices a new failure row into the managed region of an existing issue body. -# - Locates the begin/end markers; if missing (issue body was hand-edited or -# pre-dates this feature), returns the body unchanged with a warning. -# - Counts existing visible rows + any "N earlier omitted" tally to determine -# the next failure index. -# - When the visible-row count exceeds FailuresTableMaxRows, drops oldest -# rows and rolls them into the omitted tally. -function Update-FailuresTableInBody { - param([string]$Body) - - $beginIdx = $Body.IndexOf($Script:FailuresTableBeginMarker) - $endIdx = $Body.IndexOf($Script:FailuresTableEndMarker) - if ($beginIdx -lt 0 -or $endIdx -lt 0 -or $endIdx -lt $beginIdx) { - Write-NotifyWarning "Could not locate failures-table markers in issue body; skipping body update." - return $Body - } - - $contentStart = $beginIdx + $Script:FailuresTableBeginMarker.Length - $managedContent = $Body.Substring($contentStart, $endIdx - $contentStart) - - # Data rows look like: | 3 | 2026-06-04 22:34 | [...](...) | ... | ... | - $lines = $managedContent -split "`r?`n" - $dataRows = @($lines | Where-Object { $_ -match '^\|\s*\d+\s*\|' }) - - $omittedCount = 0 - foreach ($line in $lines) { - if ($line -match '^_(\d+) earlier failures omitted') { - $omittedCount = [int]$Matches[1] - break - } +# Renders the failed-stages list for the issue body / comment. Em-dash when +# empty. The pipeline passes a comma-separated list of stage names (no pipe +# characters), so no markdown escaping is needed. +function Format-FailedStages { + if ([string]::IsNullOrWhiteSpace($FailedStages)) { + return '—' } - - $nextIndex = $dataRows.Count + $omittedCount + 1 - $newRow = New-FailureTableRow -Index $nextIndex - $allRows = @($dataRows) + @($newRow) - - if ($allRows.Count -gt $Script:FailuresTableMaxRows) { - $extraToDrop = $allRows.Count - $Script:FailuresTableMaxRows - $omittedCount += $extraToDrop - $allRows = $allRows | Select-Object -Skip $extraToDrop - } - - $omittedLine = '' - if ($omittedCount -gt 0) { - $omittedLine = "_$omittedCount earlier failures omitted; see issue comments._`n`n" - } - - $newContent = "`n" + $omittedLine + $Script:FailuresTableHeader + "`n" + ($allRows -join "`n") + "`n" - - return $Body.Substring(0, $contentStart) + $newContent + $Body.Substring($endIdx) + return $FailedStages } function New-IssueBody { param([string]$Marker) - $failuresTable = New-InitialFailuresTableSection + $stages = Format-FailedStages return @" $Marker @@ -256,22 +208,26 @@ $Marker The internal Azure DevOps build for ``microsoft-aspire`` (definition 1602) is failing on ``$Branch``. -$failuresTable +- **Build:** [$BuildNumber]($BuildUrl) +- **Commit:** ``$CommitSha`` +- **Failed stages:** $stages -This issue is updated with a new row in the table above on each subsequent -failure of the same branch, and closed automatically when the next build of -``$Branch`` succeeds. See [docs/ci/internal-build-failure-notifications.md](https://github.com/$Owner/$Repo/blob/main/docs/ci/internal-build-failure-notifications.md). +Each subsequent failure on the same branch adds a comment with that build's +details; the comments are the per-failure history. This issue is closed +automatically when the next build of ``$Branch`` succeeds. See [docs/ci/internal-build-failure-notifications.md](https://github.com/$Owner/$Repo/blob/main/docs/ci/internal-build-failure-notifications.md). $Script:MentionLine "@ } function New-FailureFollowupCommentBody { + $stages = Format-FailedStages return @" -Another failure on ``$Branch`` — see the failures table in the issue body for full history. +Another failure on ``$Branch``. - **Build:** [$BuildNumber]($BuildUrl) - **Commit:** ``$CommitSha`` +- **Failed stages:** $stages $Script:MentionLine "@ @@ -298,7 +254,7 @@ function Invoke-FailureMode { $body = New-IssueBody -Marker $Marker $body -split "`n" | ForEach-Object { Write-Step "DRY-RUN: | $_" } Write-Step "DRY-RUN: would then re-list and close as duplicate if not oldest (race handler)" - Write-Step "DRY-RUN: if an existing issue had been found, would run 'gh issue edit' to update the failures table and 'gh issue comment' for the follow-up with @-mentions" + Write-Step "DRY-RUN: if an existing issue had been found, would run 'gh issue comment' to append a follow-up failure comment with @-mentions" return } @@ -351,21 +307,7 @@ function Invoke-FailureMode { Write-NotifyWarning "Found $($existing.Count) open ci-broken issues for branch '$Branch' (numbers: $($existing.number -join ', ')). Updating the oldest (#$($target.number)) and leaving the rest for human cleanup." } else { - Write-Step "Found existing open issue #$($target.number) for branch '$Branch'. Updating failures table and appending comment." - } - - # Append a row to the failures table in the issue body. Race: two - # near-simultaneous failures doing list->modify->edit may drop one row; - # accepted because the follow-up comment still fires and per-failure - # history is preserved in the issue comments. The table is the at-a-glance - # summary, not the system of record. - $currentBody = $target.body - if ($null -ne $currentBody) { - $newBody = Update-FailuresTableInBody -Body $currentBody - if ($newBody -ne $currentBody) { - Invoke-Gh -ArgList @('issue', 'edit', "$($target.number)", '--repo', "$Owner/$Repo", '--body-file', '-') -StdinBody $newBody | Out-Null - Write-Step "Updated failures table in #$($target.number)." - } + Write-Step "Found existing open issue #$($target.number) for branch '$Branch'. Appending failure comment." } $commentBody = New-FailureFollowupCommentBody @@ -411,6 +353,12 @@ try { $marker = "" if (-not $DryRun) { + # Bail loudly if gh is missing rather than minting a token we can't use + # and then failing every API call into the catch (see Test-GhAvailable). + if (-not (Test-GhAvailable)) { + Write-NotifyWarning "gh CLI is not available on this image; cannot file/update/close issues. Skipping notification." + Exit-NotifyScript + } # Live mode enforces -AppId/-PrivateKeyPem presence here (see param comment). if ([string]::IsNullOrWhiteSpace($AppId) -or [string]::IsNullOrWhiteSpace($PrivateKeyPem)) { Write-NotifyWarning "Live mode requires -AppId and -PrivateKeyPem; aborting." diff --git a/tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs b/tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs index b2adfcba50e..41fcab02aa3 100644 --- a/tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs +++ b/tests/Infrastructure.Tests/PowerShellScripts/NotifyGitHubOnBuildResultTests.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Text.RegularExpressions; using Aspire.TestUtilities; using Aspire.Templates.Tests; using Xunit; @@ -55,135 +54,139 @@ public async Task TestNotifiableBranch_ClassifiesBranches(string branch, bool ex [Fact] [RequiresTools(["pwsh"])] - public async Task NewFailureTableRow_EscapesPipesInFailedStages() + public async Task GhAvailable_ReturnsTrue_WhenGhExitsZero() { - var (_, row) = await RunHarnessAsync(""" - $CommitSha = 'abcdef1234567' - $FailedStages = 'build | test' - $BuildNumber = 'B9' - $BuildUrl = 'https://example.test/9' - $__result = New-FailureTableRow -Index 3 + var bin = MakeFakeGhBin(exitCode: 0); + var (_, r) = await RunHarnessAsync($""" + $env:PATH = '{bin}' + $__result = Test-GhAvailable """); - // A raw pipe would split the markdown table cell; it must be backslash-escaped. - Assert.Contains(@"build \| test", row); - Assert.DoesNotContain("build | test", row); + Assert.Equal("True", r.Trim()); } [Fact] [RequiresTools(["pwsh"])] - public async Task NewFailureTableRow_ShortensCommitShaToSevenChars() + public async Task GhAvailable_ReturnsFalse_WhenGhExitsNonZero() { - var (_, row) = await RunHarnessAsync(""" - $CommitSha = 'abcdef1234567' - $FailedStages = 'build' - $BuildNumber = 'B9' - $BuildUrl = 'https://example.test/9' - $__result = New-FailureTableRow -Index 3 + var bin = MakeFakeGhBin(exitCode: 3); + var (_, r) = await RunHarnessAsync($""" + $env:PATH = '{bin}' + $__result = Test-GhAvailable """); - // Displayed sha is shortened to 7 chars, but the commit link uses the full sha. - Assert.Contains("[`abcdef1`]", row); - Assert.Contains("/commit/abcdef1234567)", row); + Assert.Equal("False", r.Trim()); } [Fact] [RequiresTools(["pwsh"])] - public async Task NewFailureTableRow_HandlesShaShorterThanSevenChars() + public async Task GhAvailable_ReturnsFalse_WhenGhMissingFromPath() { - var (_, row) = await RunHarnessAsync(""" - $CommitSha = 'abc' - $FailedStages = 'build' - $BuildNumber = 'B9' - $BuildUrl = 'https://example.test/9' - $__result = New-FailureTableRow -Index 3 + // PATH points at an empty dir, so `gh` cannot be resolved; the probe must + // report unavailable rather than let the resolution error escape. + var empty = Path.Combine(_tempDir.Path, $"empty-{Guid.NewGuid():N}"); + Directory.CreateDirectory(empty); + var (_, r) = await RunHarnessAsync($""" + $env:PATH = '{empty}' + $__result = Test-GhAvailable """); - // Substring(0, 7) would throw on a sub-7-char sha; the script must fall back to the full value. - Assert.Contains("[`abc`]", row); - Assert.Contains("/commit/abc)", row); + Assert.Equal("False", r.Trim()); } [Fact] [RequiresTools(["pwsh"])] - public async Task NewFailureTableRow_RendersEmDashForEmptyFailedStages() + public async Task IssueBodyMatchesMarker_True_WhenMarkerAtStartOfLine() { - var (_, row) = await RunHarnessAsync(""" - $CommitSha = 'abcdef1234567' - $FailedStages = '' - $BuildNumber = 'B9' - $BuildUrl = 'https://example.test/9' - $__result = New-FailureTableRow -Index 3 + var (_, r) = await RunHarnessAsync(""" + $marker = '' + $body = "$marker`n`nThe build is failing." + $__result = Test-IssueBodyMatchesMarker -Body $body -Marker $marker """); - Assert.Contains("| \u2014 |", row); + Assert.Equal("True", r.Trim()); } [Fact] [RequiresTools(["pwsh"])] - public async Task UpdateFailuresTableInBody_AppendsRowAndPreservesSurroundingProse() + public async Task IssueBodyMatchesMarker_False_WhenMarkerOnlyMidProse() { - var (_, body) = await RunHarnessAsync(""" - $begin = '' - $end = '' - $hdr = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" - $existing = "INTRO PROSE`n$begin`n$hdr`n| 1 | 2026-01-01 00:00 | [B1](u) | [``abc``](c) | x |`n$end`nTRAILING PROSE" - $__result = Update-FailuresTableInBody -Body $existing + // The marker pasted inside a sentence (not at line start) must NOT match, + // so success-mode never comments on / closes an unrelated issue. + var (_, r) = await RunHarnessAsync(""" + $marker = '' + $body = "see the tracking issue mentioning $marker inline somewhere" + $__result = Test-IssueBodyMatchesMarker -Body $body -Marker $marker """); - Assert.Equal(2, CountDataRows(body)); - Assert.Contains("| 2 |", body); - // Human-authored prose outside the managed region must survive the edit. - Assert.Contains("INTRO PROSE", body); - Assert.Contains("TRAILING PROSE", body); + Assert.Equal("False", r.Trim()); } [Fact] [RequiresTools(["pwsh"])] - public async Task UpdateFailuresTableInBody_RollsOverOldestRowsBeyondCap() + public async Task IssueBodyMatchesMarker_False_WhenMarkerAbsentOrBodyEmpty() { - var (_, body) = await RunHarnessAsync(""" - $begin = '' - $end = '' - $hdr = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" - $rows = (1..50 | ForEach-Object { "| $_ | 2026-01-01 00:00 | [B](u) | [``abc``](c) | x |" }) -join "`n" - $existing = "$begin`n$hdr`n$rows`n$end" - $__result = Update-FailuresTableInBody -Body $existing + var (_, r) = await RunHarnessAsync(""" + $marker = '' + $a = Test-IssueBodyMatchesMarker -Body 'no marker here' -Marker $marker + $b = Test-IssueBodyMatchesMarker -Body '' -Marker $marker + $__result = "$a,$b" """); - // FailuresTableMaxRows = 50: the 51st row drops the oldest, which is summarized as omitted. - Assert.Equal(50, CountDataRows(body)); - Assert.Matches(@"_\d+ earlier failures omitted", body); + Assert.Equal("False,False", r.Trim()); } [Fact] [RequiresTools(["pwsh"])] - public async Task UpdateFailuresTableInBody_AccountsForExistingOmittedCountInNextIndex() + public async Task NewIssueBody_ContainsMarkerBuildCommitAndFailedStages() { var (_, body) = await RunHarnessAsync(""" - $begin = '' - $end = '' - $hdr = "| # | When (UTC) | Build | Commit | Failed stages |`n|---|------------|-------|--------|---------------|" - $omitted = '_3 earlier failures omitted; see issue comments._' - $existing = "$begin`n$omitted`n`n$hdr`n| 4 | 2026-01-01 00:00 | [B](u) | [``abc``](c) | x |`n| 5 | 2026-01-01 00:00 | [B](u) | [``abc``](c) | x |`n$end" - $__result = Update-FailuresTableInBody -Body $existing + $Branch = 'main' + $BuildNumber = 'B9' + $BuildUrl = 'https://example.test/9' + $CommitSha = 'abcdef1234567' + $FailedStages = 'build, assemble' + $__result = New-IssueBody -Marker '' """); - // nextIndex = visibleRows(2) + omitted(3) + 1 = 6 - Assert.Contains("| 6 |", body); + Assert.StartsWith("", body); + Assert.Contains("[B9](https://example.test/9)", body); + Assert.Contains("abcdef1234567", body); + Assert.Contains("**Failed stages:** build, assemble", body); + Assert.Contains("cc @joperezr @radical", body); + // The in-body failures table was removed; no managed-region markers remain. + Assert.DoesNotContain("ci-broken-failures", body); } [Fact] [RequiresTools(["pwsh"])] - public async Task UpdateFailuresTableInBody_LeavesBodyUnchangedAndWarnsWhenMarkersMissing() + public async Task NewFailureFollowupComment_ContainsBuildCommitAndFailedStages() { - var (result, body) = await RunHarnessAsync(""" - $existing = 'a hand-edited body with no managed markers' - $__result = Update-FailuresTableInBody -Body $existing + var (_, comment) = await RunHarnessAsync(""" + $Branch = 'main' + $BuildNumber = 'B42' + $BuildUrl = 'https://example.test/42' + $CommitSha = 'deadbeefcafef00d' + $FailedStages = 'template_tests' + $__result = New-FailureFollowupCommentBody """); - Assert.Equal("a hand-edited body with no managed markers", body); - Assert.Contains("Could not locate failures-table markers", result.Output); + Assert.Contains("[B42](https://example.test/42)", comment); + Assert.Contains("deadbeefcafef00d", comment); + Assert.Contains("**Failed stages:** template_tests", comment); + Assert.Contains("cc @joperezr @radical", comment); + } + + [Fact] + [RequiresTools(["pwsh"])] + public async Task FormatFailedStages_RendersEmDashWhenEmpty() + { + var (_, r) = await RunHarnessAsync(""" + $FailedStages = '' + $__result = Format-FailedStages + """); + + Assert.Equal("\u2014", r.Trim()); } [Fact] @@ -288,8 +291,28 @@ private static void WriteFakeGh(string dir) } } - private static int CountDataRows(string body) - => Regex.Matches(body, @"(?m)^\|\s*\d+\s*\|").Count; + private string MakeFakeGhBin(int exitCode) + { + var dir = Path.Combine(_tempDir.Path, $"ghbin-{Guid.NewGuid():N}"); + Directory.CreateDirectory(dir); + if (OperatingSystem.IsWindows()) + { + File.WriteAllText( + Path.Combine(dir, "gh.cmd"), + $"@echo off\r\necho gh version 0.0\r\nexit /b {exitCode}\r\n"); + } + else + { + var ghPath = Path.Combine(dir, "gh"); + File.WriteAllText(ghPath, $"#!/bin/sh\necho 'gh version 0.0'\nexit {exitCode}\n"); + File.SetUnixFileMode( + ghPath, + UnixFileMode.UserRead | UnixFileMode.UserWrite | UnixFileMode.UserExecute | + UnixFileMode.GroupRead | UnixFileMode.GroupExecute | + UnixFileMode.OtherRead | UnixFileMode.OtherExecute); + } + return dir; + } // Dot-sources the shipped script with a non-notifiable branch (so main bails before any // gh/token work) and runs the per-test body, which sets $__result. The "# __BODY__" token From 66136561d4a76440b780008bd1bc7a981d61aaed Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Tue, 9 Jun 2026 14:54:49 -0400 Subject: [PATCH 7/8] refactor(ci-notify): extract shared notify stage job into a template notify_failure and notify_success were ~50 lines of duplicated job scaffolding (the jobs.yml@self wrapper, pool, checkout, and the pwsh notifier call) that could silently drift apart. Extract that job into eng/pipelines/templates/notify-build-result.yml, parameterized by {mode, jobName, jobDisplayName, dryRun}. Each stage keeps only what must differ and live at stage scope: dependsOn, the condition, and the six *StageResult variables. Also drops the per-stage NotifyDryRunFlag variable indirection in favor of parsing the queue-time parameter directly in the template ($dryRun = [bool]::Parse('${{ parameters.dryRun }}')), keeping the typed -DryRun:$dryRun call. The template follows the existing double-nest convention (stage jobs: -> repo template -> jobs.yml@self) already used by the build_extension stage. This is internal-pipeline YAML that does not run on GitHub PRs; validate with an AzDO dry-run (notifyOnFailureDryRun=true) before merge. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- eng/pipelines/azure-pipelines.yml | 166 +++--------------- .../templates/notify-build-result.yml | 101 +++++++++++ 2 files changed, 128 insertions(+), 139 deletions(-) create mode 100644 eng/pipelines/templates/notify-build-result.yml diff --git a/eng/pipelines/azure-pipelines.yml b/eng/pipelines/azure-pipelines.yml index 38f1803386d..f158497ebbd 100644 --- a/eng/pipelines/azure-pipelines.yml +++ b/eng/pipelines/azure-pipelines.yml @@ -939,8 +939,8 @@ extends: ) ) variables: - # Stage results, exposed as runtime variables so the pwsh step below - # can compose a "failed stages" list for the GitHub notification. + # Stage results exposed as runtime variables so the shared notify + # template can compose a "failed stages" list for the notification. - name: BuildSignNativeStageResult value: $[ dependencies.build_sign_native.result ] - name: BuildExtensionStageResult @@ -953,91 +953,13 @@ extends: value: $[ dependencies.assemble.result ] - name: PrepareInstallersStageResult value: $[ dependencies.prepare_installers.result ] - # Drives the -DryRun:$value pattern in the pwsh wrapper below. - # Values are literal 'true'/'false' so the wrapper can parse with - # `-eq 'true'` and avoid PowerShell's array-splat positional - # binding — passing '-DryRun' through @extraArgs binds the string - # to the first non-mandatory positional string parameter (-Owner) - # instead of engaging the switch, leaving DryRun=False and - # producing a live token mint against '-DryRun/aspire'. - - ${{ if eq(parameters.notifyOnFailureDryRun, true) }}: - - name: NotifyDryRunFlag - value: 'true' - - ${{ else }}: - - name: NotifyDryRunFlag - value: 'false' jobs: - - template: /eng/common/templates-official/jobs/jobs.yml@self + - template: /eng/pipelines/templates/notify-build-result.yml@self parameters: - enableMicrobuild: false - enablePublishUsingPipelines: false - enablePublishBuildAssets: false - # No build output to publish; mirrors prepare_installers. - enablePublishBuildArtifacts: false - enableTelemetry: true - workspace: - clean: all - jobs: - - job: NotifyOnFailure - displayName: 'File / update ci-broken issue' - timeoutInMinutes: 15 - pool: - name: NetCore1ESPool-Internal - image: 1es-ubuntu-2204 - os: linux - templateContext: - mb: - publish: - enabled: false - steps: - - checkout: self - displayName: 'Checkout repo (for notify script)' - fetchDepth: 1 - - pwsh: | - $ErrorActionPreference = 'Stop' - # Use env vars instead of $(Build.*) macros: a branch with a - # quote in it would break out of the PowerShell literal before - # the script's try/catch (and "always exit 0" invariant) - # could absorb it. - $branch = $env:BUILD_SOURCEBRANCH -replace '^refs/heads/', '' - $buildUrl = "$($env:SYSTEM_TEAMFOUNDATIONCOLLECTIONURI)$($env:SYSTEM_TEAMPROJECT)/_build/results?buildId=$($env:BUILD_BUILDID)" - - # Pass DryRun as a typed -DryRun:$bool. Do NOT conditionally - # splat '-DryRun' through @extraArgs — array splatting is - # positional (about_Splatting), so the literal string would - # bind to the first non-mandatory positional string parameter - # (-Owner) instead of engaging the switch. - $dryRun = '$(NotifyDryRunFlag)' -eq 'true' - - # Compose the failed-stages list from the upstream stage - # results exposed via stage-scoped variables above. - $failed = @() - if ('$(BuildSignNativeStageResult)' -eq 'Failed') { $failed += 'build_sign_native' } - if ('$(BuildExtensionStageResult)' -eq 'Failed') { $failed += 'build_extension' } - if ('$(BuildStageResult)' -eq 'Failed') { $failed += 'build' } - if ('$(TemplateTestsStageResult)' -eq 'Failed') { $failed += 'template_tests' } - if ('$(AssembleStageResult)' -eq 'Failed') { $failed += 'assemble' } - if ('$(PrepareInstallersStageResult)' -eq 'Failed') { $failed += 'prepare_installers' } - $failedStages = $failed -join ', ' - - & "$env:BUILD_SOURCESDIRECTORY/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1" ` - -Mode Failure ` - -AppId "$env:ASPIRE_BOT_APP_ID" ` - -PrivateKeyPem "$env:ASPIRE_BOT_PRIVATE_KEY" ` - -Branch $branch ` - -BuildId $env:BUILD_BUILDID ` - -BuildNumber $env:BUILD_BUILDNUMBER ` - -BuildUrl $buildUrl ` - -CommitSha $env:BUILD_SOURCEVERSION ` - -FailedStages $failedStages ` - -DryRun:$dryRun - displayName: 'Run notify script (Failure mode)' - # Resolve aspire-repo-bot secrets only in live mode; dry-run - # skips the token mint and accepts empty -AppId/-PrivateKeyPem. - ${{ if ne(parameters.notifyOnFailureDryRun, true) }}: - env: - ASPIRE_BOT_APP_ID: $(aspire-bot-app-id) - ASPIRE_BOT_PRIVATE_KEY: $(aspire-bot-private-key) + mode: Failure + jobName: NotifyOnFailure + jobDisplayName: 'File / update ci-broken issue' + dryRun: ${{ parameters.notifyOnFailureDryRun }} - stage: notify_success displayName: Notify Build Success @@ -1075,59 +997,25 @@ extends: in(dependencies.prepare_installers.result, 'Succeeded', 'SucceededWithIssues', 'Skipped') ) variables: - - ${{ if eq(parameters.notifyOnFailureDryRun, true) }}: - - name: NotifyDryRunFlag - value: 'true' - - ${{ else }}: - - name: NotifyDryRunFlag - value: 'false' + # Stage results exposed as runtime variables so the shared notify + # template can compose a "failed stages" list. In Success mode none + # are 'Failed', so the list is empty and the notifier ignores it. + - name: BuildSignNativeStageResult + value: $[ dependencies.build_sign_native.result ] + - name: BuildExtensionStageResult + value: $[ dependencies.build_extension.result ] + - name: BuildStageResult + value: $[ dependencies.build.result ] + - name: TemplateTestsStageResult + value: $[ dependencies.template_tests.result ] + - name: AssembleStageResult + value: $[ dependencies.assemble.result ] + - name: PrepareInstallersStageResult + value: $[ dependencies.prepare_installers.result ] jobs: - - template: /eng/common/templates-official/jobs/jobs.yml@self + - template: /eng/pipelines/templates/notify-build-result.yml@self parameters: - enableMicrobuild: false - enablePublishUsingPipelines: false - enablePublishBuildAssets: false - enablePublishBuildArtifacts: false - enableTelemetry: true - workspace: - clean: all - jobs: - - job: CloseOnSuccess - displayName: 'Close ci-broken issue' - timeoutInMinutes: 15 - pool: - name: NetCore1ESPool-Internal - image: 1es-ubuntu-2204 - os: linux - templateContext: - mb: - publish: - enabled: false - steps: - - checkout: self - displayName: 'Checkout repo (for notify script)' - fetchDepth: 1 - - pwsh: | - $ErrorActionPreference = 'Stop' - $branch = $env:BUILD_SOURCEBRANCH -replace '^refs/heads/', '' - $buildUrl = "$($env:SYSTEM_TEAMFOUNDATIONCOLLECTIONURI)$($env:SYSTEM_TEAMPROJECT)/_build/results?buildId=$($env:BUILD_BUILDID)" - - # See -DryRun:$dryRun comment in the Failure-mode step above. - $dryRun = '$(NotifyDryRunFlag)' -eq 'true' - - & "$env:BUILD_SOURCESDIRECTORY/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1" ` - -Mode Success ` - -AppId "$env:ASPIRE_BOT_APP_ID" ` - -PrivateKeyPem "$env:ASPIRE_BOT_PRIVATE_KEY" ` - -Branch $branch ` - -BuildId $env:BUILD_BUILDID ` - -BuildNumber $env:BUILD_BUILDNUMBER ` - -BuildUrl $buildUrl ` - -CommitSha $env:BUILD_SOURCEVERSION ` - -DryRun:$dryRun - displayName: 'Run notify script (Success mode)' - # See env-block comment on the Failure-mode step above. - ${{ if ne(parameters.notifyOnFailureDryRun, true) }}: - env: - ASPIRE_BOT_APP_ID: $(aspire-bot-app-id) - ASPIRE_BOT_PRIVATE_KEY: $(aspire-bot-private-key) + mode: Success + jobName: CloseOnSuccess + jobDisplayName: 'Close ci-broken issue' + dryRun: ${{ parameters.notifyOnFailureDryRun }} diff --git a/eng/pipelines/templates/notify-build-result.yml b/eng/pipelines/templates/notify-build-result.yml new file mode 100644 index 00000000000..03364ce138f --- /dev/null +++ b/eng/pipelines/templates/notify-build-result.yml @@ -0,0 +1,101 @@ +# Job template shared by the notify_failure and notify_success stages in +# azure-pipelines.yml. Both stages run the same notifier script with a +# different -Mode and job name; only the per-stage dependsOn / condition +# (which must reference dependencies..result at the stage level) and +# the *StageResult variables stay in the stage definitions. +# +# See docs/ci/internal-build-failure-notifications.md for the full contract. + +parameters: + - name: mode + type: string + values: + - Failure + - Success + - name: jobName + type: string + - name: jobDisplayName + type: string + # Queue-time dry-run flag, forwarded from the pipeline's + # notifyOnFailureDryRun parameter. In dry-run the notifier logs the gh + # commands it would run and mints no token, so the aspire-repo-bot secret + # env block is omitted entirely. + - name: dryRun + type: boolean + default: false + +jobs: +- template: /eng/common/templates-official/jobs/jobs.yml@self + parameters: + enableMicrobuild: false + enablePublishUsingPipelines: false + enablePublishBuildAssets: false + # No build output to publish; mirrors prepare_installers. + enablePublishBuildArtifacts: false + enableTelemetry: true + workspace: + clean: all + jobs: + - job: ${{ parameters.jobName }} + displayName: ${{ parameters.jobDisplayName }} + timeoutInMinutes: 15 + pool: + name: NetCore1ESPool-Internal + image: 1es-ubuntu-2204 + os: linux + templateContext: + mb: + publish: + enabled: false + steps: + - checkout: self + displayName: 'Checkout repo (for notify script)' + fetchDepth: 1 + - pwsh: | + $ErrorActionPreference = 'Stop' + # Use env vars instead of $(Build.*) macros: a branch with a + # quote in it would break out of the PowerShell literal before + # the script's try/catch (and "always exit 0" invariant) could + # absorb it. + $branch = $env:BUILD_SOURCEBRANCH -replace '^refs/heads/', '' + $buildUrl = "$($env:SYSTEM_TEAMFOUNDATIONCOLLECTIONURI)$($env:SYSTEM_TEAMPROJECT)/_build/results?buildId=$($env:BUILD_BUILDID)" + + # The dry-run flag is interpolated from the queue-time parameter at + # template-expansion time, then parsed to a real bool and passed as + # a typed -DryRun:$bool. Do NOT splat a literal '-DryRun' string — + # array splatting is positional (about_Splatting), so the string + # would bind to the first non-mandatory positional string parameter + # (-Owner) instead of engaging the switch. + $dryRun = [bool]::Parse('${{ parameters.dryRun }}') + + # Compose the failed-stages list from the stage-result variables the + # stage exposes (see each stage's `variables:` block). In Success + # mode none are 'Failed', so the list is empty and the notifier + # ignores it. + $failed = @() + if ('$(BuildSignNativeStageResult)' -eq 'Failed') { $failed += 'build_sign_native' } + if ('$(BuildExtensionStageResult)' -eq 'Failed') { $failed += 'build_extension' } + if ('$(BuildStageResult)' -eq 'Failed') { $failed += 'build' } + if ('$(TemplateTestsStageResult)' -eq 'Failed') { $failed += 'template_tests' } + if ('$(AssembleStageResult)' -eq 'Failed') { $failed += 'assemble' } + if ('$(PrepareInstallersStageResult)' -eq 'Failed') { $failed += 'prepare_installers' } + $failedStages = $failed -join ', ' + + & "$env:BUILD_SOURCESDIRECTORY/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1" ` + -Mode ${{ parameters.mode }} ` + -AppId "$env:ASPIRE_BOT_APP_ID" ` + -PrivateKeyPem "$env:ASPIRE_BOT_PRIVATE_KEY" ` + -Branch $branch ` + -BuildId $env:BUILD_BUILDID ` + -BuildNumber $env:BUILD_BUILDNUMBER ` + -BuildUrl $buildUrl ` + -CommitSha $env:BUILD_SOURCEVERSION ` + -FailedStages $failedStages ` + -DryRun:$dryRun + displayName: 'Run notify script (${{ parameters.mode }} mode)' + # Resolve aspire-repo-bot secrets only in live mode; dry-run skips the + # token mint and accepts empty -AppId/-PrivateKeyPem. + ${{ if ne(parameters.dryRun, true) }}: + env: + ASPIRE_BOT_APP_ID: $(aspire-bot-app-id) + ASPIRE_BOT_PRIVATE_KEY: $(aspire-bot-private-key) From ccda4d4a0a43fb6447f516f7f0793851b9031a30 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Tue, 9 Jun 2026 15:33:36 -0400 Subject: [PATCH 8/8] refactor(ci-notify): drop post-create duplicate-issue race handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first-failure path created the issue, then re-listed and self-closed if it wasn't the oldest carrying the marker — guarding against two builds of the same branch filing duplicates within the gh-issue-list consistency window. Builds are rolling, so that race is vanishingly rare, and the guard cost an extra `gh issue list` round-trip on every first failure. Drop it: on the rare double-file, the existing "found N open issues, update the oldest, leave the rest for human cleanup" path already degrades gracefully and a human closes the duplicate. Addresses review feedback on PR #17920. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../internal-build-failure-notifications.md | 8 ++--- .../scripts/Notify-GitHubOnBuildResult.ps1 | 29 ++++--------------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/docs/ci/internal-build-failure-notifications.md b/docs/ci/internal-build-failure-notifications.md index 16623be11fa..2e793a28503 100644 --- a/docs/ci/internal-build-failure-notifications.md +++ b/docs/ci/internal-build-failure-notifications.md @@ -96,10 +96,10 @@ intentionally avoided because its 1–2 minute eventual-consistency window would cause near-simultaneous failed builds to each see "0 hits" and file duplicate issues. -After the create-issue path, the script re-lists immediately. If our -just-created issue is not the oldest carrying the marker, it closes itself -as a duplicate of the older issue. This self-heals the rare race when two -builds fail within seconds of each other. +Two builds of the same branch failing within that window can still briefly +create two issues. Because builds are rolling this is rare, and the cost of +auto-deduping it (an extra `gh issue list` round-trip on every first-failure) +isn't worth it — the duplicate is left for a human to close. ## Auth diff --git a/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 index a8eb3d3408b..e0f46e5a037 100644 --- a/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 +++ b/eng/pipelines/scripts/Notify-GitHubOnBuildResult.ps1 @@ -20,9 +20,10 @@ # endpoint is intentionally avoided because its 1-2 min eventual # consistency causes near-simultaneous failed builds to each see # "0 hits" and each file a duplicate. -# - On create, we re-list to detect the rare race where two builds raced -# past the consistency window simultaneously, and close ours as a -# duplicate of the older one. +# - Two builds of the same branch failing within the same window can still +# briefly create two issues. Builds are rolling so this is rare; the +# duplicate is left for a human to close rather than auto-deduped, which +# avoids an extra `gh issue list` round-trip on every first-failure. # # Per-failure history: # - The issue body is written once at creation (build, commit, and the @@ -253,7 +254,6 @@ function Invoke-FailureMode { Write-Step "DRY-RUN: issue body would contain:" $body = New-IssueBody -Marker $Marker $body -split "`n" | ForEach-Object { Write-Step "DRY-RUN: | $_" } - Write-Step "DRY-RUN: would then re-list and close as duplicate if not oldest (race handler)" Write-Step "DRY-RUN: if an existing issue had been found, would run 'gh issue comment' to append a follow-up failure comment with @-mentions" return } @@ -278,27 +278,10 @@ function Invoke-FailureMode { ) + $labelFlags + $assigneeFlags # gh issue create prints the new issue's URL as the last non-empty - # line of stdout. Parse the number out for the race-handler compare. + # line of stdout. $createOutput = Invoke-Gh -ArgList $createArgs -StdinBody $issueBody $createdUrl = (@($createOutput) | Where-Object { -not [string]::IsNullOrWhiteSpace($_) } | Select-Object -Last 1).ToString().Trim() - if ($createdUrl -notmatch '/issues/(\d+)$') { - throw "Could not parse issue number from gh output: '$createdUrl'" - } - $createdNumber = [int]$Matches[1] - Write-Step "Created issue #${createdNumber}: $createdUrl" - - # Post-create race handler: re-list and close as duplicate if we - # aren't the oldest. See file header "Dedupe strategy". - $recheck = Get-OpenBrokenIssuesForBranch -Marker $Marker - $oldest = $recheck | Where-Object { $_.number -ne $createdNumber } | Select-Object -First 1 - if ($null -ne $oldest -and $oldest.number -lt $createdNumber) { - Write-Step "Race detected: older open issue #$($oldest.number) found. Closing our just-created #${createdNumber} as duplicate." - $dupBody = "Duplicate of #$($oldest.number). Two near-simultaneous failed builds raced past the dedupe window; auto-closing this one." - Invoke-Gh -ArgList @('issue', 'comment', "$createdNumber", '--repo', "$Owner/$Repo", '--body-file', '-') -StdinBody $dupBody | Out-Null - # `gh issue close --reason` accepts "completed" or "not planned" - # (with a space) — gh maps the latter to the API's not_planned. - Invoke-Gh -ArgList @('issue', 'close', "$createdNumber", '--repo', "$Owner/$Repo", '--reason', 'not planned') | Out-Null - } + Write-Step "Created issue: $createdUrl" return }