From c83dad61e86c01a6346dfc1e46a5f6f3d7668df0 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Tue, 26 May 2026 16:22:37 +0530 Subject: [PATCH 01/12] ci:unified deployment pipeline v1 --- .github/PULL_REQUEST_TEMPLATE.md | 17 + .github/workflows/api-deploy.yml | 36 +- .github/workflows/deploy-all.yml | 1870 +++++++++++++++++ .github/workflows/main-deploy.yml | 48 +- .github/workflows/rollback.yml | 456 ++++ .github/workflows/validate-migration.yml | 134 ++ .github/workflows/worker-deploy.yml | 34 +- AGENTS.md | 2 + CLAUDE.md | 12 + Dockerfile | 187 +- apps/connector/Dockerfile | 21 +- apps/worker/Dockerfile | 19 +- docs/unified_deployment.md | 1638 +++++++++++++++ entrypoint.sh | 12 - infra/scripts/acquire-lock.sh | 162 ++ infra/scripts/deploy-state-paths.sh | 65 + infra/scripts/drain-workers.sh | 98 + infra/scripts/migrate.sh | 275 +++ infra/scripts/ngx-utils.sh | 462 ++++ infra/scripts/preflight-space.sh | 298 +++ infra/scripts/promote-all.sh | 226 ++ infra/scripts/record-state.sh | 221 ++ infra/scripts/refresh-lock.sh | 120 ++ infra/scripts/release-lock.sh | 87 + infra/scripts/revert-nginx.sh | 61 + infra/scripts/rollback.sh | 315 +++ infra/scripts/send-email.sh | 231 ++ infra/scripts/stage-api.sh | 179 ++ infra/scripts/stage-web.sh | 195 ++ infra/scripts/start-workers.sh | 306 +++ infra/scripts/validate-rollback-schema.sh | 138 ++ .../viewer/webhook/testTrigger.handler.ts | 6 +- 32 files changed, 7777 insertions(+), 154 deletions(-) create mode 100644 .github/workflows/deploy-all.yml create mode 100644 .github/workflows/rollback.yml create mode 100644 .github/workflows/validate-migration.yml create mode 100644 docs/unified_deployment.md create mode 100755 infra/scripts/acquire-lock.sh create mode 100644 infra/scripts/deploy-state-paths.sh create mode 100755 infra/scripts/drain-workers.sh create mode 100755 infra/scripts/migrate.sh create mode 100644 infra/scripts/ngx-utils.sh create mode 100644 infra/scripts/preflight-space.sh create mode 100755 
infra/scripts/promote-all.sh create mode 100755 infra/scripts/record-state.sh create mode 100755 infra/scripts/refresh-lock.sh create mode 100755 infra/scripts/release-lock.sh create mode 100755 infra/scripts/revert-nginx.sh create mode 100755 infra/scripts/rollback.sh create mode 100755 infra/scripts/send-email.sh create mode 100755 infra/scripts/stage-api.sh create mode 100755 infra/scripts/stage-web.sh create mode 100755 infra/scripts/start-workers.sh create mode 100755 infra/scripts/validate-rollback-schema.sh diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 71d2fa59aae2d7..9e0156a721eb82 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -25,6 +25,23 @@ A visual demonstration is strongly recommended, for both the original and new ch - [ ] I have updated the developer docs in /docs if this PR makes changes that would require a [documentation change](https://cal.com/docs). If N/A, write N/A here and check the checkbox. - [ ] I confirm automated tests are in place that prove my fix is effective or that my feature works. +## Migration Type + + + +- [ ] N/A - no Prisma migration changes +- [ ] EXPAND - backward-compatible schema change +- [ ] DATA-BACKFILL - data movement/backfill with validation plan +- [ ] CONTRACT - cleanup/destructive migration with explicit approval + +Rollback compatibility: + + + +Backfill / cleanup plan: + + + ## How should this be tested? diff --git a/.github/workflows/api-deploy.yml b/.github/workflows/api-deploy.yml index a9a48952a7ad84..72ee34bfe88c43 100644 --- a/.github/workflows/api-deploy.yml +++ b/.github/workflows/api-deploy.yml @@ -1,17 +1,24 @@ name: Build Connector API, Push Docker Image, Deploy to EC2 +# DEPRECATED — Auto-deploy on PR merge is deprecated. +# Deployment is now managed by .github/workflows/deploy-all.yml +# which provides unified staging/production deployment for all services (Web + API + Worker). 
+# The `pull_request` trigger below will be removed in a future release. +# Use `workflow_dispatch` for manual deployments, or rely on deploy-all.yml for automated deploys. +# +# Deprecated triggers will be removed in Phase 12. on: - pull_request: - types: - - closed - branches: - - main - - develop - paths: - - "apps/connector/**" - - "packages/**" - - "infra/docker/connector/**" - - ".github/workflows/api-deploy.yml" + # pull_request: + # types: + # - closed + # branches: + # - main + # - develop + # paths: + # - "apps/connector/**" + # - "packages/**" + # - "infra/docker/connector/**" + # - ".github/workflows/api-deploy.yml" workflow_dispatch: inputs: branch: @@ -315,10 +322,11 @@ jobs: push: true build-args: | GIT_HASH=${{ steps.hash.outputs.GIT_HASH }} - SENTRY_AUTH_TOKEN=${{ secrets.SENTRY_AUTH_TOKEN }} - SENTRY_ORG=${{ secrets.SENTRY_ORG }} - SENTRY_PROJECT=${{ steps.key_values.outputs.SENTRY_PROJECT }} # SENTRY_UPLOAD_STRICT=true + secrets: | + sentry_auth_token=${{ secrets.SENTRY_AUTH_TOKEN }} + sentry_org=${{ secrets.SENTRY_ORG }} + sentry_project=${{ steps.key_values.outputs.SENTRY_PROJECT }} tags: | ${{ steps.login-ecr.outputs.registry }}/${{ steps.key_values.outputs.REPO_NAME }}:${{ steps.hash.outputs.GIT_HASH }} ${{ steps.login-ecr.outputs.registry }}/${{ steps.key_values.outputs.REPO_NAME }}:latest diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml new file mode 100644 index 00000000000000..025a2f955d2cb6 --- /dev/null +++ b/.github/workflows/deploy-all.yml @@ -0,0 +1,1870 @@ +name: Deploy All Services + +on: + # Auto-deploy on PR merge to main or develop branches for relevant path changes + pull_request: + types: + - closed + branches: + - main + - develop + paths: + - 'apps/web/**' + - 'apps/connector/**' + - 'apps/worker/**' + - 'packages/**' + - 'Dockerfile' + - 'entrypoint.sh' + - 'apps/connector/Dockerfile' + - 'apps/worker/Dockerfile' + - 'infra/docker/**' + - 'infra/scripts/**' + - 
'.github/workflows/deploy-all.yml' + # Manual trigger for specific branch or commit override + workflow_dispatch: + inputs: + branch: + description: "Target branch to deploy" + required: true + default: "develop" + git_hash: + description: "Optional Git commit hash to deploy" + required: false + default: "" + rebuild: + description: "Force rebuild even if images exist in ECR" + required: true + default: "false" + type: choice + options: + - "true" + - "false" + worker_replicas: + description: "Number of worker replicas (overrides secret)" + required: false + default: "" + +env: + BRANCH: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.branch || github.ref_name }} + # Deployment state lives in one bucket; scripts derive deployment-prod/deployment-stag from DEPLOY_ENV. + DEPLOY_STATE_BUCKET: cal-id + +concurrency: + group: >- + deploy-all-${{ + ( + github.event_name == 'workflow_dispatch' + && github.event.inputs.branch + || github.ref_name + ) == 'main' + && 'production' + || 'staging' + }} + cancel-in-progress: false + +jobs: + # ---------- Preflight Validation ---------- + # Runs before any build or mutation. Validates all required inputs are present. + # No secret values are exposed — only key names and presence indicators. + + preflight: + name: Preflight validation + runs-on: ubuntu-latest + if: github.event.pull_request.merged == true || github.event_name == 'workflow_dispatch' + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.merged == true && github.ref_name || (github.event.inputs.branch || github.ref_name) }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Validate Dockerfile syntax + shell: bash + run: | + set -euo pipefail + docker buildx build --check -f Dockerfile . + docker buildx build --check -f apps/connector/Dockerfile . + docker buildx build --check -f apps/worker/Dockerfile . 
+ + - name: Validate critical deployment scripts + shell: bash + run: | + set -euo pipefail + bash -n infra/scripts/preflight-space.sh + bash -n infra/scripts/migrate.sh + bash -n infra/scripts/start-workers.sh + bash -n infra/scripts/drain-workers.sh + PROMOTION_COMPLETE=true WORKER_IMAGE=dry-run WORKER_REPLICAS=1 WORKER_LIFECYCLE_DRY_RUN=true \ + bash infra/scripts/start-workers.sh + WORKER_LIFECYCLE_DRY_RUN=true bash infra/scripts/drain-workers.sh + + - name: Validate required configuration + id: preflight + shell: bash + run: | + set -euo pipefail + + # Resolve environment once + branch="${{ github.event.inputs.branch || github.ref_name }}" + if [ "$branch" = "main" ]; then + deploy_env="production" + image_suffix="prod" + else + deploy_env="staging" + image_suffix="stag" + fi + + failed=0 + + check() { + local secret_name="$1" + local display_name="$2" + if [ -z "$secret_name" ]; then + echo "PREFLIGHT: MISSING — $display_name" + failed=1 + else + echo "PREFLIGHT: PRESENT — $display_name" + fi + } + + optional() { + local secret_name="$1" + local display_name="$2" + if [ -z "$secret_name" ]; then + echo "PREFLIGHT: OPTIONAL MISSING — $display_name" + else + echo "PREFLIGHT: OPTIONAL PRESENT — $display_name" + fi + } + + echo "=== AWS Core ===" + check "${{ secrets.AWS_REGION }}" "AWS_REGION" + check "${{ secrets.AWS_ACCESS_KEY_ID }}" "AWS_ACCESS_KEY_ID" + check "${{ secrets.AWS_SECRET_ACCESS_KEY }}" "AWS_SECRET_ACCESS_KEY" + check "${{ secrets.AWS_ACCOUNT_ID }}" "AWS_ACCOUNT_ID" + + echo "=== ECR Repositories ===" + # Repository names are derived; validate they are non-empty + web_repo="cal_${image_suffix}"; echo "PREFLIGHT: REPO — web=$web_repo" + api_repo="cal_api_${image_suffix}"; echo "PREFLIGHT: REPO — api=$api_repo" + worker_repo="cal_worker_${image_suffix}"; echo "PREFLIGHT: REPO — worker=$worker_repo" + + echo "=== EC2 Target ===" + ec2_host_secret="${{ secrets.EC2_HOST_PROD }}" + ec2_key_secret="${{ secrets.EC2_SSH_KEY_PROD }}" + if [ "$deploy_env" = 
"staging" ]; then + ec2_host_secret="${{ secrets.EC2_HOST_STAG }}" + ec2_key_secret="${{ secrets.EC2_SSH_KEY_STAG }}" + fi + check "$ec2_host_secret" "EC2_HOST (${deploy_env})" + check "$ec2_key_secret" "EC2_SSH_KEY (${deploy_env})" + + echo "=== Domain / Config ===" + domain_secret="${{ secrets.DOMAIN_NAME_PROD }}" + homepage_secret="${{ secrets.HOMEPAGE_URL_PROD }}" + if [ "$deploy_env" = "staging" ]; then + domain_secret="${{ secrets.DOMAIN_NAME_STAG }}" + homepage_secret="${{ secrets.HOMEPAGE_URL_STAG }}" + fi + check "$domain_secret" "DOMAIN_NAME (${deploy_env})" + check "$homepage_secret" "HOMEPAGE_URL (${deploy_env})" + + echo "=== Web build args ===" + gtm_secret="${{ secrets.NEXT_PUBLIC_GTM_ID_PROD }}" + sentry_dsn_secret="${{ secrets.NEXT_PUBLIC_SENTRY_DSN_PROD }}" + calendly_client_secret="${{ secrets.NEXT_PUBLIC_CALENDLY_CLIENT_ID_PROD }}" + calendly_redirect_secret="${{ secrets.NEXT_PUBLIC_CALENDLY_REDIRECT_URI_PROD }}" + stripe_public_secret="${{ secrets.NEXT_PUBLIC_STRIPE_PUBLIC_KEY_PROD }}" + onehash_chat_secret="${{ secrets.NEXT_PUBLIC_ONEHASH_CHAT_URL_PROD }}" + razorpay_client_secret="${{ secrets.NEXT_PUBLIC_RAZORPAY_CLIENT_ID_PROD }}" + signup_url_secret="${{ secrets.NEXT_PUBLIC_SIGNUP_URL_PROD }}" + posthog_key_secret="${{ secrets.NEXT_PUBLIC_POSTHOG_KEY_PROD }}" + if [ "$deploy_env" = "staging" ]; then + gtm_secret="${{ secrets.NEXT_PUBLIC_GTM_ID_STAG }}" + sentry_dsn_secret="${{ secrets.NEXT_PUBLIC_SENTRY_DSN_STAG }}" + calendly_client_secret="${{ secrets.NEXT_PUBLIC_CALENDLY_CLIENT_ID_STAG }}" + calendly_redirect_secret="${{ secrets.NEXT_PUBLIC_CALENDLY_REDIRECT_URI_STAG }}" + stripe_public_secret="${{ secrets.NEXT_PUBLIC_STRIPE_PUBLIC_KEY_STAG }}" + onehash_chat_secret="${{ secrets.NEXT_PUBLIC_ONEHASH_CHAT_URL_STAG }}" + razorpay_client_secret="${{ secrets.NEXT_PUBLIC_RAZORPAY_CLIENT_ID_STAG }}" + signup_url_secret="${{ secrets.NEXT_PUBLIC_SIGNUP_URL_STAG }}" + posthog_key_secret="${{ secrets.NEXT_PUBLIC_POSTHOG_KEY_STAG }}" + fi + check 
"$gtm_secret" "NEXT_PUBLIC_GTM_ID (${deploy_env})" + check "${{ secrets.NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID }}" "NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID" + check "${{ secrets.NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID }}" "NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID" + check "${{ secrets.NEXT_PUBLIC_API_V2_URL }}" "NEXT_PUBLIC_API_V2_URL" + check "${{ secrets.NEXT_PUBLIC_ONEHASH_URL }}" "NEXT_PUBLIC_ONEHASH_URL" + check "${{ secrets.NEXT_PUBLIC_SENDGRID_SENDER_NAME }}" "NEXT_PUBLIC_SENDGRID_SENDER_NAME" + check "$sentry_dsn_secret" "NEXT_PUBLIC_SENTRY_DSN (${deploy_env})" + check "${{ secrets.NEXT_PUBLIC_LOGGER_LEVEL }}" "NEXT_PUBLIC_LOGGER_LEVEL" + check "${{ secrets.NEXT_PUBLIC_TEAM_IMPERSONATION }}" "NEXT_PUBLIC_TEAM_IMPERSONATION" + check "${{ secrets.NEXT_PUBLIC_APP_NAME }}" "NEXT_PUBLIC_APP_NAME" + check "${{ secrets.BRAND_NAME }}" "BRAND_NAME" + check "${{ secrets.NEXT_PUBLIC_MINUTES_TO_BOOK }}" "NEXT_PUBLIC_MINUTES_TO_BOOK" + check "${{ secrets.NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD }}" "NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD" + check "${{ secrets.NEXT_PUBLIC_CALENDLY_OAUTH_URL }}" "NEXT_PUBLIC_CALENDLY_OAUTH_URL" + check "${{ secrets.NEXT_PUBLIC_CALENDLY_API_BASE_URL }}" "NEXT_PUBLIC_CALENDLY_API_BASE_URL" + check "$calendly_client_secret" "NEXT_PUBLIC_CALENDLY_CLIENT_ID (${deploy_env})" + check "$calendly_redirect_secret" "NEXT_PUBLIC_CALENDLY_REDIRECT_URI (${deploy_env})" + check "$stripe_public_secret" "NEXT_PUBLIC_STRIPE_PUBLIC_KEY (${deploy_env})" + check "$onehash_chat_secret" "NEXT_PUBLIC_ONEHASH_CHAT_URL (${deploy_env})" + check "$razorpay_client_secret" "NEXT_PUBLIC_RAZORPAY_CLIENT_ID (${deploy_env})" + check "$signup_url_secret" "NEXT_PUBLIC_SIGNUP_URL (${deploy_env})" + check "${{ secrets.NEXT_PUBLIC_VAPID_PUBLIC_KEY }}" "NEXT_PUBLIC_VAPID_PUBLIC_KEY" + check "${{ secrets.NEXT_PUBLIC_WEBSITE_TERMS_URL }}" "NEXT_PUBLIC_WEBSITE_TERMS_URL" + check "${{ secrets.NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL }}" 
"NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL" + check "${{ secrets.NEXT_PUBLIC_RECAPTCHA_LOW }}" "NEXT_PUBLIC_RECAPTCHA_LOW" + check "${{ secrets.NEXT_PUBLIC_RECAPTCHA_MEDIUM }}" "NEXT_PUBLIC_RECAPTCHA_MEDIUM" + check "${{ secrets.NEXT_PUBLIC_RECAPTCHA_HIGH }}" "NEXT_PUBLIC_RECAPTCHA_HIGH" + check "${{ secrets.NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS }}" "NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS" + check "${{ secrets.NEXT_PUBLIC_POSTHOG_HOST }}" "NEXT_PUBLIC_POSTHOG_HOST" + check "$posthog_key_secret" "NEXT_PUBLIC_POSTHOG_KEY (${deploy_env})" + check "${{ secrets.NEXT_PUBLIC_ONEHASH_ERP_URL }}" "NEXT_PUBLIC_ONEHASH_ERP_URL" + check "${{ secrets.NEXT_PUBLIC_ONEHASH_CRM_URL }}" "NEXT_PUBLIC_ONEHASH_CRM_URL" + check "${{ secrets.NEXT_PUBLIC_WHITELISTED_ORGS }}" "NEXT_PUBLIC_WHITELISTED_ORGS" + check "${{ secrets.NEXT_PUBLIC_PIXEL }}" "NEXT_PUBLIC_PIXEL" + check "${{ secrets.NEXT_PUBLIC_CLOUDFLARE_SITEKEY }}" "NEXT_PUBLIC_CLOUDFLARE_SITEKEY" + check "${{ secrets.NEXTAUTH_SECRET }}" "NEXTAUTH_SECRET" + check "${{ secrets.CALENDSO_ENCRYPTION_KEY }}" "CALENDSO_ENCRYPTION_KEY" + + echo "=== Sentry build uploads ===" + sentry_project_secret="${{ secrets.SENTRY_PROJECT_PROD }}" + if [ "$deploy_env" = "staging" ]; then + sentry_project_secret="${{ secrets.SENTRY_PROJECT_STAG }}" + fi + check "${{ secrets.SENTRY_AUTH_TOKEN }}" "SENTRY_AUTH_TOKEN" + check "${{ secrets.SENTRY_ORG }}" "SENTRY_ORG" + check "$sentry_project_secret" "SENTRY_PROJECT (${deploy_env})" + + echo "=== Worker runtime config ===" + worker_replicas_value="${{ github.event.inputs.worker_replicas }}" + worker_replicas_source="workflow_dispatch override" + if [ -z "$worker_replicas_value" ]; then + worker_replicas_source="WORKER_REPLICAS secret (${deploy_env})" + worker_replicas_value="${{ secrets.WORKER_REPLICAS_PROD }}" + if [ "$deploy_env" = "staging" ]; then + worker_replicas_value="${{ secrets.WORKER_REPLICAS_STAG }}" + fi + fi + check "$worker_replicas_value" "WORKER_REPLICAS (${deploy_env}; source: 
${worker_replicas_source})" + if [ -n "$worker_replicas_value" ]; then + case "$worker_replicas_value" in + ''|*[!0-9]*) + echo "PREFLIGHT: INVALID — WORKER_REPLICAS (${deploy_env}) must be a positive integer" + failed=1 + ;; + *) + if [ "$worker_replicas_value" -le 0 ]; then + echo "PREFLIGHT: INVALID — WORKER_REPLICAS (${deploy_env}) must be greater than zero" + failed=1 + fi + ;; + esac + fi + + echo "=== Email notifications ===" + check "${{ secrets.SENDGRID_API_KEY }}" "SENDGRID_API_KEY" + check "${{ secrets.EMAIL_FROM }}" "EMAIL_FROM" + check "${{ secrets.EMAIL_TO }}" "EMAIL_TO" + + echo "=== Deployment State ===" + # DEPLOY_STATE_BUCKET is a fixed contract (cal-id); this only confirms the AWS access key secret is set and does not probe the bucket itself. The logged prefix uses image_suffix (prod/stag) to match the deployment-prod/deployment-stag layout named in the workflow env comment. + check "${{ secrets.AWS_ACCESS_KEY_ID }}" "DEPLOY_STATE_BUCKET (contract: cal-id)" + echo "PREFLIGHT: PREFIX derived from DEPLOY_ENV=${deploy_env} → deployment-${image_suffix}" + + echo "=== Migrations (if enabled) ===" + db_url_secret="${{ secrets.DATABASE_DIRECT_URL_PROD }}" + if [ "$deploy_env" = "staging" ]; then + db_url_secret="${{ secrets.DATABASE_DIRECT_URL_STAG }}" + fi + check "$db_url_secret" "DATABASE_DIRECT_URL (${deploy_env})" + + echo "=== SSL / Certbot ===" + check "${{ secrets.CERTBOT_EMAIL }}" "CERTBOT_EMAIL" + + echo "=== Repo Access ===" + check "${{ secrets.REPO_URL }}" "REPO_URL" + + echo "=== Optional deployment knobs ===" + optional "${{ secrets.ENABLE_DB_BACKUP }}" "ENABLE_DB_BACKUP" + optional "${{ secrets.DB_BACKUP_COMMAND }}" "DB_BACKUP_COMMAND (required only when ENABLE_DB_BACKUP=true)" + optional "${{ secrets.SSH_TARGET_FINGERPRINT }}" "SSH_TARGET_FINGERPRINT (legacy workflows only)" + optional "${{ secrets.DOCKERHUB_USERNAME }}" "DOCKERHUB_USERNAME (legacy Docker Hub fallback only)" + optional "${{ secrets.DOCKERHUB_TOKEN }}" "DOCKERHUB_TOKEN (legacy Docker Hub fallback only)" + + if [ "${{ secrets.ENABLE_DB_BACKUP }}" = "true" ] && [ -z "${{ secrets.DB_BACKUP_COMMAND }}" ]; then + echo "PREFLIGHT: MISSING — DB_BACKUP_COMMAND (required when 
ENABLE_DB_BACKUP=true)" + failed=1 + fi + + if [ "$failed" -ne 0 ]; then + echo "PREFLIGHT: One or more required inputs are missing — aborting." + exit 1 + fi + echo "PREFLIGHT: All required inputs present" + + prepare-release: + name: Prepare release + runs-on: ubuntu-latest + needs: preflight + outputs: + branch: ${{ steps.release.outputs.branch }} + git_hash: ${{ steps.release.outputs.git_hash }} + release_id: ${{ steps.release.outputs.release_id }} + deploy_env: ${{ steps.release.outputs.deploy_env }} + image_suffix: ${{ steps.release.outputs.image_suffix }} + web_repo: ${{ steps.release.outputs.web_repo }} + api_repo: ${{ steps.release.outputs.api_repo }} + worker_repo: ${{ steps.release.outputs.worker_repo }} + worker_replicas: ${{ steps.release.outputs.worker_replicas }} + ecr_registry: ${{ steps.release.outputs.ecr_registry }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ env.BRANCH }} + + - name: Resolve release metadata + id: release + shell: bash + run: | + set -euo pipefail + requested_git_hash="${{ github.event.inputs.git_hash }}" + if [ -n "$requested_git_hash" ]; then + git fetch origin "$requested_git_hash" --depth=1 2>/dev/null || true + git checkout --detach "$requested_git_hash" + fi + git_hash="$(git rev-parse HEAD)" + if [ "$BRANCH" = "main" ]; then + deploy_env="production" + image_suffix="prod" + else + deploy_env="staging" + image_suffix="stag" + fi + release_id="v$(date -u '+%Y%m%d-%H%M%S')" + echo "branch=${BRANCH}" >> "$GITHUB_OUTPUT" + echo "git_hash=${git_hash}" >> "$GITHUB_OUTPUT" + echo "release_id=${release_id}" >> "$GITHUB_OUTPUT" + echo "deploy_env=${deploy_env}" >> "$GITHUB_OUTPUT" + echo "image_suffix=${image_suffix}" >> "$GITHUB_OUTPUT" + echo "web_repo=cal_${image_suffix}" >> "$GITHUB_OUTPUT" + echo "api_repo=cal_api_${image_suffix}" >> "$GITHUB_OUTPUT" + echo "worker_repo=cal_worker_${image_suffix}" >> "$GITHUB_OUTPUT" + worker_replicas="${{ github.event.inputs.worker_replicas 
}}" + if [ -z "$worker_replicas" ]; then + if [ "$deploy_env" = "production" ]; then + worker_replicas="${{ secrets.WORKER_REPLICAS_PROD }}" + else + worker_replicas="${{ secrets.WORKER_REPLICAS_STAG }}" + fi + fi + worker_replicas="${worker_replicas:-1}" + echo "worker_replicas=${worker_replicas}" >> "$GITHUB_OUTPUT" + # ECR registry is derived from AWS_ACCOUNT_ID and region + account_id="${{ secrets.AWS_ACCOUNT_ID }}" + region="${{ secrets.AWS_REGION }}" + ecr_registry="${account_id}.dkr.ecr.${region}.amazonaws.com" + echo "ecr_registry=${ecr_registry}" >> "$GITHUB_OUTPUT" + echo "Prepared ${deploy_env} release ${release_id} for ${BRANCH}@${git_hash}" + + - name: Notify deployment started + env: + SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }} + EMAIL_FROM: ${{ secrets.EMAIL_FROM || 'alerts@cal.id' }} + EMAIL_TO: ${{ secrets.EMAIL_TO || 'deployments@cal.id' }} + NOTIFICATION_EVENT: deployment_started + DEPLOY_ENV: ${{ steps.release.outputs.deploy_env }} + RELEASE_ID: ${{ steps.release.outputs.release_id }} + GIT_SHA: ${{ steps.release.outputs.git_hash }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_ACTOR: ${{ github.actor }} + BRANCH_NAME: ${{ env.BRANCH }} + run: bash infra/scripts/send-email.sh + + acquire-lock: + name: Acquire deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ # Do not skip this job itself (only its "Acquire lock" step stays disabled): every build job lists it in needs, and GitHub skips any job that needs a skipped job, which would halt the pipeline after prepare-release. + runs-on: ubuntu-latest + needs: + - preflight + - prepare-release + outputs: + lock_token: ${{ steps.lock.outputs.LOCK_TOKEN }} + lock_key: ${{ steps.lock.outputs.LOCK_KEY }} + steps: + - name: Checkout lock script + uses: actions/checkout@v4 + with: + ref: ${{ needs.prepare-release.outputs.branch }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/acquire-lock.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Acquire lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + if: ${{ false }} + id: lock + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_OWNER: github-${{ github.run_id }} + LOCK_TTL_SECONDS: 7200 + GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_ACTOR: ${{ github.actor }} + run: | + infra/scripts/acquire-lock.sh | tee lock.env + cat lock.env >> "$GITHUB_OUTPUT" + + # ---------- Build Jobs ---------- + + build-web: + name: Build web image + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + outputs: + web_image: ${{ steps.meta.outputs.web_image }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ needs.prepare-release.outputs.git_hash }} + + - name: Verify release checkout + shell: bash + run: | + set -euo pipefail + checked_out_sha="$(git rev-parse HEAD)" + expected_sha="${{ needs.prepare-release.outputs.git_hash }}" + if [ "$checked_out_sha" != "$expected_sha" ]; then + echo "Checked out SHA ${checked_out_sha} does not match release SHA ${expected_sha}" >&2 + exit 1 + fi + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + 
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Check if web image already exists in ECR + id: image-exists + shell: bash + env: + REPO_NAME: ${{ needs.prepare-release.outputs.web_repo }} + GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + REBUILD: ${{ github.event.inputs.rebuild || 'false' }} + run: | + set -euo pipefail + if [ "$REBUILD" = "true" ]; then + echo "image_exists=false" >> "$GITHUB_OUTPUT" + echo "Skipping image existence check — rebuild requested" + else + if aws ecr describe-images \ + --repository-name "$REPO_NAME" \ + --image-ids imageTag="$GIT_SHA" >/dev/null 2>&1; then + echo "image_exists=true" >> "$GITHUB_OUTPUT" + echo "Image ${REPO_NAME}:${GIT_SHA} already exists — will skip build" + else + echo "image_exists=false" >> "$GITHUB_OUTPUT" + echo "Image ${REPO_NAME}:${GIT_SHA} not found — will build" + fi + fi + + - name: Build and push web image + if: ${{ steps.image-exists.outputs.image_exists != 'true' }} + id: build + uses: docker/build-push-action@v5 + with: + context: . 
+ file: ./Dockerfile + platforms: linux/amd64 + push: true + build-args: | + NEXT_PUBLIC_GTM_ID=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_GTM_ID_PROD || secrets.NEXT_PUBLIC_GTM_ID_STAG }} + NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID=${{ secrets.NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID }} + NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID=${{ secrets.NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID }} + NEXT_PUBLIC_WEBAPP_URL=${{ format('https://{0}', needs.prepare-release.outputs.deploy_env == 'production' && secrets.DOMAIN_NAME_PROD || secrets.DOMAIN_NAME_STAG) }} + NEXT_PUBLIC_WEBSITE_URL=${{ format('https://{0}', needs.prepare-release.outputs.deploy_env == 'production' && secrets.DOMAIN_NAME_PROD || secrets.DOMAIN_NAME_STAG) }} + NEXT_PUBLIC_API_V2_URL=${{ secrets.NEXT_PUBLIC_API_V2_URL }} + NEXT_PUBLIC_EMBED_LIB_URL=${{ format('https://{0}/embed-link/embed.js', needs.prepare-release.outputs.deploy_env == 'production' && secrets.DOMAIN_NAME_PROD || secrets.DOMAIN_NAME_STAG) }} + NEXT_PUBLIC_ONEHASH_URL=${{ secrets.NEXT_PUBLIC_ONEHASH_URL }} + NEXT_PUBLIC_SENDGRID_SENDER_NAME=${{ secrets.NEXT_PUBLIC_SENDGRID_SENDER_NAME }} + NEXT_PUBLIC_SENTRY_DSN=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_SENTRY_DSN_PROD || secrets.NEXT_PUBLIC_SENTRY_DSN_STAG }} + NEXT_PUBLIC_LOGGER_LEVEL=${{ secrets.NEXT_PUBLIC_LOGGER_LEVEL }} + NEXT_PUBLIC_TEAM_IMPERSONATION=${{ secrets.NEXT_PUBLIC_TEAM_IMPERSONATION }} + NEXT_PUBLIC_APP_NAME=${{ secrets.NEXT_PUBLIC_APP_NAME }} + NEXT_PUBLIC_COMPANY_NAME=${{ secrets.BRAND_NAME }} + NEXT_PUBLIC_MINUTES_TO_BOOK=${{ secrets.NEXT_PUBLIC_MINUTES_TO_BOOK }} + NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD=${{ secrets.NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD }} + NEXT_PUBLIC_CALENDLY_OAUTH_URL=${{ secrets.NEXT_PUBLIC_CALENDLY_OAUTH_URL }} + NEXT_PUBLIC_CALENDLY_API_BASE_URL=${{ secrets.NEXT_PUBLIC_CALENDLY_API_BASE_URL }} + NEXT_PUBLIC_CALENDLY_CLIENT_ID=${{ 
needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_CALENDLY_CLIENT_ID_PROD || secrets.NEXT_PUBLIC_CALENDLY_CLIENT_ID_STAG }} + NEXT_PUBLIC_CALENDLY_REDIRECT_URI=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_CALENDLY_REDIRECT_URI_PROD || secrets.NEXT_PUBLIC_CALENDLY_REDIRECT_URI_STAG }} + NEXT_PUBLIC_STRIPE_PUBLIC_KEY=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_STRIPE_PUBLIC_KEY_PROD || secrets.NEXT_PUBLIC_STRIPE_PUBLIC_KEY_STAG }} + NEXT_PUBLIC_ONEHASH_CHAT_URL=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_ONEHASH_CHAT_URL_PROD || secrets.NEXT_PUBLIC_ONEHASH_CHAT_URL_STAG }} + NEXT_PUBLIC_RAZORPAY_CLIENT_ID=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_RAZORPAY_CLIENT_ID_PROD || secrets.NEXT_PUBLIC_RAZORPAY_CLIENT_ID_STAG }} + NEXT_PUBLIC_SIGNUP_URL=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_SIGNUP_URL_PROD || secrets.NEXT_PUBLIC_SIGNUP_URL_STAG }} + NEXT_PUBLIC_VAPID_PUBLIC_KEY=${{ secrets.NEXT_PUBLIC_VAPID_PUBLIC_KEY }} + NEXT_PUBLIC_WEBSITE_TERMS_URL=${{ secrets.NEXT_PUBLIC_WEBSITE_TERMS_URL }} + NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL=${{ secrets.NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL }} + NEXT_PUBLIC_ONEHASH_CHAT_ORIGIN=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.NEXT_PUBLIC_ONEHASH_CHAT_URL_PROD || secrets.NEXT_PUBLIC_ONEHASH_CHAT_URL_STAG }} + NEXT_PUBLIC_RECAPTCHA_LOW=${{ secrets.NEXT_PUBLIC_RECAPTCHA_LOW }} + NEXT_PUBLIC_RECAPTCHA_MEDIUM=${{ secrets.NEXT_PUBLIC_RECAPTCHA_MEDIUM }} + NEXT_PUBLIC_RECAPTCHA_HIGH=${{ secrets.NEXT_PUBLIC_RECAPTCHA_HIGH }} + NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS=${{ secrets.NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS }} + NEXT_PUBLIC_POSTHOG_HOST=${{ secrets.NEXT_PUBLIC_POSTHOG_HOST }} + NEXT_PUBLIC_POSTHOG_KEY=${{ needs.prepare-release.outputs.deploy_env == 'production' && 
secrets.NEXT_PUBLIC_POSTHOG_KEY_PROD || secrets.NEXT_PUBLIC_POSTHOG_KEY_STAG }} + NEXT_PUBLIC_ONEHASH_ERP_URL=${{ secrets.NEXT_PUBLIC_ONEHASH_ERP_URL }} + NEXT_PUBLIC_ONEHASH_CRM_URL=${{ secrets.NEXT_PUBLIC_ONEHASH_CRM_URL }} + NEXT_PUBLIC_WHITELISTED_ORGS=${{ secrets.NEXT_PUBLIC_WHITELISTED_ORGS }} + NEXT_PUBLIC_PIXEL=${{ secrets.NEXT_PUBLIC_PIXEL }} + NEXT_PUBLIC_CLOUDFLARE_SITEKEY=${{ secrets.NEXT_PUBLIC_CLOUDFLARE_SITEKEY }} + secrets: | + nextauth_secret=${{ secrets.NEXTAUTH_SECRET }} + calendso_encryption_key=${{ secrets.CALENDSO_ENCRYPTION_KEY }} + tags: | + ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.web_repo }}:${{ needs.prepare-release.outputs.git_hash }} + ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.web_repo }}:latest + + - name: Set web image output + id: meta + run: | + registry="${{ steps.login-ecr.outputs.registry }}" + echo "web_image=${registry}/${{ needs.prepare-release.outputs.web_repo }}:${{ needs.prepare-release.outputs.git_hash }}" >> "$GITHUB_OUTPUT" + + build-api: + name: Build API image + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + outputs: + api_image: ${{ steps.meta.outputs.api_image }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ needs.prepare-release.outputs.git_hash }} + + - name: Verify release checkout + shell: bash + run: | + set -euo pipefail + checked_out_sha="$(git rev-parse HEAD)" + expected_sha="${{ needs.prepare-release.outputs.git_hash }}" + if [ "$checked_out_sha" != "$expected_sha" ]; then + echo "Checked out SHA ${checked_out_sha} does not match release SHA ${expected_sha}" >&2 + exit 1 + fi + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Login to Amazon ECR + id: login-ecr + 
uses: aws-actions/amazon-ecr-login@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Check if API image already exists in ECR + id: image-exists + shell: bash + env: + REPO_NAME: ${{ needs.prepare-release.outputs.api_repo }} + GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + REBUILD: ${{ github.event.inputs.rebuild || 'false' }} + run: | + set -euo pipefail + if [ "$REBUILD" = "true" ]; then + echo "image_exists=false" >> "$GITHUB_OUTPUT" + echo "Skipping image existence check — rebuild requested" + else + if aws ecr describe-images \ + --repository-name "$REPO_NAME" \ + --image-ids imageTag="$GIT_SHA" >/dev/null 2>&1; then + echo "image_exists=true" >> "$GITHUB_OUTPUT" + echo "Image ${REPO_NAME}:${GIT_SHA} already exists — will skip build" + else + echo "image_exists=false" >> "$GITHUB_OUTPUT" + echo "Image ${REPO_NAME}:${GIT_SHA} not found — will build" + fi + fi + + - name: Build and push API image + if: ${{ steps.image-exists.outputs.image_exists != 'true' }} + id: build + uses: docker/build-push-action@v5 + with: + context: . 
+ file: ./apps/connector/Dockerfile + platforms: linux/amd64 + push: true + build-args: | + GIT_HASH=${{ needs.prepare-release.outputs.git_hash }} + secrets: | + sentry_auth_token=${{ secrets.SENTRY_AUTH_TOKEN }} + sentry_org=${{ secrets.SENTRY_ORG }} + sentry_project=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.SENTRY_PROJECT_PROD || secrets.SENTRY_PROJECT_STAG }} + tags: | + ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.api_repo }}:${{ needs.prepare-release.outputs.git_hash }} + ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.api_repo }}:latest + + - name: Set API image output + id: meta + run: | + registry="${{ steps.login-ecr.outputs.registry }}" + echo "api_image=${registry}/${{ needs.prepare-release.outputs.api_repo }}:${{ needs.prepare-release.outputs.git_hash }}" >> "$GITHUB_OUTPUT" + + build-worker: + name: Build worker image + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + outputs: + worker_image: ${{ steps.meta.outputs.worker_image }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ needs.prepare-release.outputs.git_hash }} + + - name: Verify release checkout + shell: bash + run: | + set -euo pipefail + checked_out_sha="$(git rev-parse HEAD)" + expected_sha="${{ needs.prepare-release.outputs.git_hash }}" + if [ "$checked_out_sha" != "$expected_sha" ]; then + echo "Checked out SHA ${checked_out_sha} does not match release SHA ${expected_sha}" >&2 + exit 1 + fi + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Check if worker image already 
exists in ECR + id: image-exists + shell: bash + env: + REPO_NAME: ${{ needs.prepare-release.outputs.worker_repo }} + GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + REBUILD: ${{ github.event.inputs.rebuild || 'false' }} + run: | + set -euo pipefail + if [ "$REBUILD" = "true" ]; then + echo "image_exists=false" >> "$GITHUB_OUTPUT" + echo "Skipping image existence check — rebuild requested" + else + if aws ecr describe-images \ + --repository-name "$REPO_NAME" \ + --image-ids imageTag="$GIT_SHA" >/dev/null 2>&1; then + echo "image_exists=true" >> "$GITHUB_OUTPUT" + echo "Image ${REPO_NAME}:${GIT_SHA} already exists — will skip build" + else + echo "image_exists=false" >> "$GITHUB_OUTPUT" + echo "Image ${REPO_NAME}:${GIT_SHA} not found — will build" + fi + fi + + - name: Build and push worker image + if: ${{ steps.image-exists.outputs.image_exists != 'true' }} + id: build + uses: docker/build-push-action@v5 + with: + context: . + file: ./apps/worker/Dockerfile + platforms: linux/amd64 + push: true + build-args: | + GIT_HASH=${{ needs.prepare-release.outputs.git_hash }} + secrets: | + sentry_auth_token=${{ secrets.SENTRY_AUTH_TOKEN }} + sentry_org=${{ secrets.SENTRY_ORG }} + sentry_project=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.SENTRY_PROJECT_PROD || secrets.SENTRY_PROJECT_STAG }} + tags: | + ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.worker_repo }}:${{ needs.prepare-release.outputs.git_hash }} + ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.worker_repo }}:latest + + - name: Set worker image output + id: meta + run: | + registry="${{ steps.login-ecr.outputs.registry }}" + echo "worker_image=${registry}/${{ needs.prepare-release.outputs.worker_repo }}:${{ needs.prepare-release.outputs.git_hash }}" >> "$GITHUB_OUTPUT" + + # ---------- State Recording ---------- + + record-build-state: + name: Record build state + runs-on: ubuntu-latest + needs: + - prepare-release + - 
acquire-lock + - build-web + - build-api + - build-worker + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/refresh-lock.sh + infra/scripts/record-state.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Refresh deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + if: ${{ false }} + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + LOCK_TTL_SECONDS: 7200 + run: bash infra/scripts/refresh-lock.sh + + - name: Record images built + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + RELEASE_ID: ${{ needs.prepare-release.outputs.release_id }} + STATUS: staged + WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} + API_IMAGE: ${{ needs.build-api.outputs.api_image }} + WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image }} + MIGRATIONS_APPLIED_JSON: "[]" + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_ACTOR: ${{ github.actor }} + run: bash infra/scripts/record-state.sh + + # ---------- Migration ---------- + + migrate-db: + name: Migrate database + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - build-web + - build-api + - build-worker + - record-build-state + outputs: + migrations_applied: ${{ steps.run-migrations.outputs.migrations_applied }} + migration_count: ${{ steps.run-migrations.outputs.migration_count }} + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + 
aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Refresh deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + if: ${{ false }} + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + LOCK_TTL_SECONDS: 7200 + run: bash infra/scripts/refresh-lock.sh + + # The migration script runs on the deploy host over SSH, so it cannot write to the + # runner's GITHUB_OUTPUT file (the variable is unset there and `set -u` would abort). + # Its stdout is captured by the action and parsed on the runner by the following + # "Export migration outputs" step, which owns the `run-migrations` id the job outputs read. + - name: Run database migrations from target VPC + id: ssh-migrate + uses: appleboy/ssh-action@v1.2.0 + env: + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + RELEASE_ID: ${{ needs.prepare-release.outputs.release_id }} + DATABASE_URL: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.DATABASE_DIRECT_URL_PROD || secrets.DATABASE_DIRECT_URL_STAG }} + REPO_URL: ${{ secrets.REPO_URL }} + BRANCH_NAME: ${{ env.BRANCH }} + REPO_ROOT: /home/onehash/onehash-cal + ENABLE_DB_BACKUP: ${{ secrets.ENABLE_DB_BACKUP || 'false' }} + DB_BACKUP_COMMAND: ${{ secrets.DB_BACKUP_COMMAND || '' }} + MIGRATION_TIMEOUT_SECONDS: "600" + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 1200s + capture_stdout: true + envs: GIT_HASH,RELEASE_ID,DATABASE_URL,REPO_URL,BRANCH_NAME,REPO_ROOT,ENABLE_DB_BACKUP,DB_BACKUP_COMMAND,MIGRATION_TIMEOUT_SECONDS + script: | + set -euo pipefail + if [ !
-d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/migrate.sh + # Run with errexit suspended so the complete migration log (including the + # MIGRATIONS_APPLIED_JSON= / MIGRATION_COUNT= marker lines) is printed before + # the script's exit status is propagated. + set +e + output=$(GIT_HASH="$GIT_HASH" RELEASE_ID="$RELEASE_ID" DATABASE_URL="$DATABASE_URL" \ + REPO_URL="$REPO_URL" BRANCH_NAME="$BRANCH_NAME" REPO_ROOT="$REPO_ROOT" \ + ENABLE_DB_BACKUP="$ENABLE_DB_BACKUP" DB_BACKUP_COMMAND="$DB_BACKUP_COMMAND" \ + MIGRATION_TIMEOUT_SECONDS="$MIGRATION_TIMEOUT_SECONDS" \ + infra/scripts/migrate.sh 2>&1) + exit_code=$? + set -e + printf '%s\n' "$output" + if [ "$exit_code" -ne 0 ]; then + exit "$exit_code" + fi + + - name: Export migration outputs + id: run-migrations + shell: bash + env: + MIGRATION_STDOUT: ${{ steps.ssh-migrate.outputs.stdout }} + run: | + set -euo pipefail + # Parse the marker lines from the captured SSH stdout. An optional "out: " log + # prefix and trailing CRs are tolerated; sed -n never fails on "no match", so a + # release without marker lines falls back to "[]" / "0" instead of aborting. + migrations_json="$(printf '%s\n' "$MIGRATION_STDOUT" | tr -d '\r' | sed -n -E 's/^(out: )?MIGRATIONS_APPLIED_JSON=//p' | tail -n 1)" + migration_count="$(printf '%s\n' "$MIGRATION_STDOUT" | tr -d '\r' | sed -n -E 's/^(out: )?MIGRATION_COUNT=//p' | tail -n 1)" + echo "migrations_applied=${migrations_json:-[]}" >> "$GITHUB_OUTPUT" + echo "migration_count=${migration_count:-0}" >> "$GITHUB_OUTPUT" + + # ---------- Deploy Candidates ---------- + + deploy-api: + name: Deploy API + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - build-api + - migrate-db + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/stage-api.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{
secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Stage API candidate + uses: appleboy/ssh-action@v0.1.10 + env: + API_IMAGE: ${{ needs.build-api.outputs.api_image }} + ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + REPO_URL: ${{ secrets.REPO_URL }} + AWS_REGION: ${{ secrets.AWS_REGION }} + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 1200s + envs: API_IMAGE,ECR_REGISTRY,BRANCH_NAME,GIT_HASH,REPO_URL,AWS_REGION + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! -d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/stage-api.sh + ECR_REGISTRY="$ECR_REGISTRY" AWS_REGION="$AWS_REGION" infra/scripts/stage-api.sh + + deploy-web: + name: Deploy web + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - build-web + - migrate-db + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/stage-web.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: 
Stage web candidate + uses: appleboy/ssh-action@v0.1.10 + env: + WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} + ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + REPO_URL: ${{ secrets.REPO_URL }} + AWS_REGION: ${{ secrets.AWS_REGION }} + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 1500s + envs: WEB_IMAGE,ECR_REGISTRY,BRANCH_NAME,GIT_HASH,REPO_URL,AWS_REGION + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! -d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/stage-web.sh + ECR_REGISTRY="$ECR_REGISTRY" AWS_REGION="$AWS_REGION" infra/scripts/stage-web.sh + + # ---------- Promotion ---------- + + promote-all: + name: Promote web and API + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - build-web + - build-api + - deploy-api + - deploy-web + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/refresh-lock.sh + infra/scripts/promote-all.sh + infra/scripts/revert-nginx.sh + infra/scripts/record-state.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + 
aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Refresh deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + if: ${{ false }} + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + LOCK_TTL_SECONDS: 7200 + run: bash infra/scripts/refresh-lock.sh + + - name: Promote candidates + uses: appleboy/ssh-action@v0.1.10 + env: + DOMAIN_NAME: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.DOMAIN_NAME_PROD || secrets.DOMAIN_NAME_STAG }} + HOMEPAGE_URL: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.HOMEPAGE_URL_PROD || secrets.HOMEPAGE_URL_STAG }} + CERTBOT_EMAIL: ${{ secrets.CERTBOT_EMAIL }} + BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + REPO_URL: ${{ secrets.REPO_URL }} + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 1200s + envs: DOMAIN_NAME,HOMEPAGE_URL,CERTBOT_EMAIL,BRANCH_NAME,GIT_HASH,REPO_URL + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! 
-d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/promote-all.sh infra/scripts/revert-nginx.sh + DOMAIN_NAME="$DOMAIN_NAME" HOMEPAGE_URL="$HOMEPAGE_URL" infra/scripts/promote-all.sh + + - name: Revert NGINX on promotion failure + if: failure() + uses: appleboy/ssh-action@v0.1.10 + env: + BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + REPO_URL: ${{ secrets.REPO_URL }} + AWS_REGION: ${{ secrets.AWS_REGION }} + AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 600s + envs: BRANCH_NAME,GIT_HASH,REPO_URL,AWS_REGION,AWS_ACCOUNT_ID + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! 
-d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/revert-nginx.sh + infra/scripts/revert-nginx.sh + + # ---------- Record promoted state ---------- + # Only record "promoted" state after Web/API are confirmed healthy via NGINX. + # "current.json" is NOT updated here — that happens only after Worker success. + # If worker deployment fails, this state remains "promoted" (not "success") and + # the worker failure rollback step will revert NGINX to the previous release. + + record-promoted-state: + name: Record promoted state + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - build-web + - build-api + - build-worker + - migrate-db + - promote-all + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/refresh-lock.sh + infra/scripts/record-state.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Refresh deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ if: ${{ false }} + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + LOCK_TTL_SECONDS: 7200 + run: bash infra/scripts/refresh-lock.sh + + - name: Record promoted state + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + RELEASE_ID: ${{ needs.prepare-release.outputs.release_id }} + STATUS: promoted_pending_verification + WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} + API_IMAGE: ${{ needs.build-api.outputs.api_image }} + WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image }} + MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied || '[]' }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_ACTOR: ${{ github.actor }} + run: bash infra/scripts/record-state.sh + + # ---------- Worker Handoff ---------- + # Workers are staged and started after Web/API promotion. Workers are part of + # the release unit: if worker startup fails after promotion, Web/API are rolled + # back via NGINX revert to the previous release state. + # + # Deployment sequence (enforced via job needs): + # build-worker → promote-all → record-promoted-state → deploy-worker + # + # Failure handling: + # Worker failure before promotion: fails deploy-worker, revert-nginx is NOT called + # (previous release is still live — nothing to revert to yet). + # Worker failure after promotion: revert-nginx + rollback.sh runs to restore previous release. 
+ + deploy-worker: + name: Deploy workers + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - build-worker + - promote-all + - record-promoted-state + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/refresh-lock.sh + infra/scripts/start-workers.sh + infra/scripts/drain-workers.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Refresh deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + if: ${{ false }} + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + LOCK_TTL_SECONDS: 7200 + run: bash infra/scripts/refresh-lock.sh + + - name: Deploy workers + uses: appleboy/ssh-action@v0.1.10 + env: + WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image }} + ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + REPO_URL: ${{ secrets.REPO_URL }} + WORKER_REPLICAS: ${{ needs.prepare-release.outputs.worker_replicas }} + AWS_REGION: ${{ secrets.AWS_REGION }} + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 1800s + envs: WORKER_IMAGE,ECR_REGISTRY,BRANCH_NAME,GIT_HASH,REPO_URL,WORKER_REPLICAS,AWS_REGION + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! 
-d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/start-workers.sh infra/scripts/drain-workers.sh + ECR_REGISTRY="$ECR_REGISTRY" AWS_REGION="$AWS_REGION" PROMOTION_COMPLETE=true WORKER_IMAGE="$WORKER_IMAGE" WORKER_REPLICAS="$WORKER_REPLICAS" infra/scripts/start-workers.sh + infra/scripts/drain-workers.sh + + rollback-after-promotion: + name: Roll back after promotion failure + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - promote-all + - record-promoted-state + - deploy-worker + if: >- + ${{ + always() && + needs.promote-all.result == 'success' && + ( + needs.record-promoted-state.result == 'failure' || + needs.deploy-worker.result == 'failure' + ) + }} + outputs: + rollback_status: ${{ steps.rollback-outcome.outputs.rollback_status }} + rollback_target_sha: ${{ steps.current.outputs.target_sha }} + rollback_source_sha: ${{ needs.prepare-release.outputs.git_hash }} + rollback_state_path: ${{ steps.current.outputs.state_current_path }} + steps: + - name: Checkout state helpers + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/refresh-lock.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Refresh deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ if: ${{ false }} + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + LOCK_TTL_SECONDS: 7200 + run: bash infra/scripts/refresh-lock.sh + + - name: Resolve previous current release + id: current + continue-on-error: true + shell: bash + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + WEB_REPO_NAME: ${{ needs.prepare-release.outputs.web_repo }} + API_REPO_NAME: ${{ needs.prepare-release.outputs.api_repo }} + WORKER_REPO_NAME: ${{ needs.prepare-release.outputs.worker_repo }} + run: | + set -euo pipefail + source infra/scripts/deploy-state-paths.sh + require_deploy_state_config + state_current_path="$(get_deploy_state_key "deployments/current.json")" + tmp_current="$(mktemp)" + trap 'rm -f "$tmp_current"' EXIT + + aws s3 cp "$state_current_path" "$tmp_current" >/dev/null + target_sha="$(jq -r '.sha // empty' "$tmp_current")" + [ -n "$target_sha" ] || { + echo "No previous current deployment SHA found in $state_current_path" >&2 + exit 1 + } + + web_image="$(jq -r '.services.web // empty' "$tmp_current")" + api_image="$(jq -r '.services.api // empty' "$tmp_current")" + worker_image="$(jq -r '.services.worker // empty' "$tmp_current")" + + web_image="${web_image:-${ECR_REGISTRY}/${WEB_REPO_NAME}:${target_sha}}" + api_image="${api_image:-${ECR_REGISTRY}/${API_REPO_NAME}:${target_sha}}" + worker_image="${worker_image:-${ECR_REGISTRY}/${WORKER_REPO_NAME}:${target_sha}}" + + { + echo "target_sha=${target_sha}" + echo "web_image=${web_image}" + echo "api_image=${api_image}" + echo "worker_image=${worker_image}" + echo "state_current_path=${state_current_path}" + } >> "$GITHUB_OUTPUT" + + - name: Verify rollback images exist in ECR + id: rollback-images + if: ${{ steps.current.outcome == 'success' }} + continue-on-error: true + shell: bash + env: + TARGET_SHA: ${{ steps.current.outputs.target_sha }} + 
WEB_REPO_NAME: ${{ needs.prepare-release.outputs.web_repo }} + API_REPO_NAME: ${{ needs.prepare-release.outputs.api_repo }} + WORKER_REPO_NAME: ${{ needs.prepare-release.outputs.worker_repo }} + run: | + set -euo pipefail + failed=0 + for service_repo in "web:${WEB_REPO_NAME}" "api:${API_REPO_NAME}" "worker:${WORKER_REPO_NAME}"; do + service="${service_repo%%:*}" + repo="${service_repo#*:}" + echo "Checking rollback ${service} image ${repo}:${TARGET_SHA}" + if aws ecr describe-images \ + --repository-name "$repo" \ + --image-ids imageTag="$TARGET_SHA" >/dev/null 2>&1; then + echo "Rollback ${service} image exists" + else + echo "Rollback ${service} image ${repo}:${TARGET_SHA} not found" >&2 + failed=1 + fi + done + if [ "$failed" -ne 0 ]; then + echo "One or more rollback images are missing; automatic rollback cannot proceed." >&2 + exit 1 + fi + + - name: Run automatic rollback on target host + id: rollback + if: ${{ steps.current.outcome == 'success' && steps.rollback-images.outcome == 'success' }} + continue-on-error: true + uses: appleboy/ssh-action@v0.1.10 + env: + TARGET_SHA: ${{ steps.current.outputs.target_sha }} + ROLLBACK_SOURCE_SHA: ${{ needs.prepare-release.outputs.git_hash }} + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} + REPO_URL: ${{ secrets.REPO_URL }} + DATABASE_URL: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.DATABASE_DIRECT_URL_PROD || secrets.DATABASE_DIRECT_URL_STAG }} + DOMAIN_NAME: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.DOMAIN_NAME_PROD || secrets.DOMAIN_NAME_STAG }} + HOMEPAGE_URL: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.HOMEPAGE_URL_PROD || secrets.HOMEPAGE_URL_STAG }} + CERTBOT_EMAIL: ${{ secrets.CERTBOT_EMAIL }} + AWS_REGION: ${{ secrets.AWS_REGION }} + ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} 
+ WEB_IMAGE: ${{ steps.current.outputs.web_image }} + API_IMAGE: ${{ steps.current.outputs.api_image }} + WORKER_IMAGE: ${{ steps.current.outputs.worker_image }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_ACTOR: ${{ github.actor }} + RELEASE_ID: ${{ needs.prepare-release.outputs.release_id }} + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 1800s + envs: TARGET_SHA,ROLLBACK_SOURCE_SHA,GIT_HASH,DEPLOY_ENV,BRANCH_NAME,REPO_URL,DATABASE_URL,DOMAIN_NAME,HOMEPAGE_URL,CERTBOT_EMAIL,AWS_REGION,ECR_REGISTRY,WEB_IMAGE,API_IMAGE,WORKER_IMAGE,GITHUB_RUN_ID,GITHUB_ACTOR,RELEASE_ID + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! -d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/rollback.sh infra/scripts/revert-nginx.sh + set +e + DEPLOY_ENV="$DEPLOY_ENV" \ + TARGET_SHA="$TARGET_SHA" \ + ROLLBACK_SOURCE_SHA="$ROLLBACK_SOURCE_SHA" \ + REPO_URL="$REPO_URL" \ + DATABASE_URL="$DATABASE_URL" \ + BRANCH_NAME="$BRANCH_NAME" \ + DOMAIN_NAME="$DOMAIN_NAME" \ + HOMEPAGE_URL="$HOMEPAGE_URL" \ + WEB_IMAGE="$WEB_IMAGE" \ + API_IMAGE="$API_IMAGE" \ + WORKER_IMAGE="$WORKER_IMAGE" \ + ECR_REGISTRY="$ECR_REGISTRY" \ + IS_ROLLBACK=true \ + GITHUB_RUN_ID="$GITHUB_RUN_ID" \ + GITHUB_ACTOR="$GITHUB_ACTOR" \ + RELEASE_ID="$RELEASE_ID" \ + infra/scripts/rollback.sh + rollback_exit=$? 
+ set -e + if [ "$rollback_exit" -ne 0 ]; then + chmod +x infra/scripts/revert-nginx.sh + infra/scripts/revert-nginx.sh || true + exit "$rollback_exit" + fi + + - name: Set rollback outcome + id: rollback-outcome + if: always() + shell: bash + run: | + set -euo pipefail + rollback_status="failed" + if [ "${{ steps.current.outcome }}" != "success" ]; then + rollback_status="skipped_no_previous_release" + elif [ "${{ steps.rollback-images.outcome }}" != "success" ]; then + rollback_status="skipped_missing_images" + elif [ "${{ steps.rollback.outcome }}" = "success" ]; then + rollback_status="succeeded" + fi + echo "rollback_status=${rollback_status}" >> "$GITHUB_OUTPUT" + echo "Automatic rollback status: ${rollback_status}" + + - name: Fail rollback job when rollback failed + if: ${{ steps.rollback-outcome.outputs.rollback_status != 'succeeded' }} + shell: bash + run: | + echo "Automatic rollback did not restore the previous release. Final notification will include rollback status: ${{ steps.rollback-outcome.outputs.rollback_status }}" + exit 1 + + # ---------- Verification ---------- + + verify: + name: Verify and record + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + - build-web + - build-api + - build-worker + - migrate-db + - promote-all + - record-promoted-state + - deploy-worker + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/refresh-lock.sh + infra/scripts/record-state.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Refresh deployment lock + # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ if: ${{ false }} + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + LOCK_TTL_SECONDS: 7200 + run: bash infra/scripts/refresh-lock.sh + + - name: Record deployment success + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + RELEASE_ID: ${{ needs.prepare-release.outputs.release_id }} + STATUS: current + WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} + API_IMAGE: ${{ needs.build-api.outputs.api_image }} + WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image }} + MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied || '[]' }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_ACTOR: ${{ github.actor }} + run: bash infra/scripts/record-state.sh + + release-lock: + name: Release deployment lock (disabled) + runs-on: ubuntu-latest + needs: + - prepare-release + - acquire-lock + # S3 deployment locking is temporarily disabled. + # GitHub Actions environment-scoped concurrency is the primary concurrency control. 
+ # if: ${{ always() && needs.acquire-lock.outputs.lock_token != '' }} + if: ${{ false }} + steps: + - name: Checkout release script + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/release-lock.sh + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Release lock + env: + DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + run: bash infra/scripts/release-lock.sh + + deployment-report: + name: Deployment report and final notification + runs-on: ubuntu-latest + needs: + - preflight + - prepare-release + - acquire-lock + - build-web + - build-api + - build-worker + - record-build-state + - migrate-db + - deploy-api + - deploy-web + - promote-all + - record-promoted-state + - deploy-worker + - rollback-after-promotion + - verify + - release-lock + if: always() + steps: + - name: Checkout report helpers + uses: actions/checkout@v4 + with: + ref: ${{ env.BRANCH }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/send-email.sh + + - name: Collect NGINX active upstreams + id: nginx + continue-on-error: true + shell: bash + env: + SSH_HOST: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + SSH_KEY: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + run: | + set -euo pipefail + upstreams="unavailable" + if [ -n "${SSH_HOST:-}" ] && [ -n "${SSH_KEY:-}" ]; then + key_file="$(mktemp)" + output_file="$(mktemp)" + trap 'rm -f "$key_file" "$output_file"' EXIT + printf '%s\n' "$SSH_KEY" > "$key_file" + chmod 600 "$key_file" + set +e + ssh -o BatchMode=yes \ + -o 
StrictHostKeyChecking=no \
+              -o ConnectTimeout=15 \
+              -i "$key_file" \
+              "onehash@${SSH_HOST}" \
+              'if [ -f /etc/nginx/conf.d/cal-id.conf ]; then
+                grep -E "proxy_pass http://127\\.0\\.0\\.1:[0-9]+" /etc/nginx/conf.d/cal-id.conf | sed -E "s/^[[:space:]]+//" | sort -u
+                static_target="$(readlink -f /var/www/cal-id-static/current 2>/dev/null || true)"
+                if [ -n "$static_target" ]; then printf "static: %s\n" "$static_target"; fi
+              else
+                printf "cal-id.conf not found\n"
+              fi' > "$output_file" 2>&1
+            ssh_status=$?
+            set -e
+            if [ "$ssh_status" -eq 0 ] && [ -s "$output_file" ]; then
+              upstreams="$(cat "$output_file")"
+            else
+              upstreams="unavailable: $(tr '\n' ' ' < "$output_file" | sed -E 's/[[:space:]]+/ /g')"
+            fi
+          fi
+          # Publish the (possibly multi-line) upstream list as a step output.
+          # GitHub's multiline syntax is name<<DELIM / value / DELIM; the delimiter
+          # is randomized so remote output can never terminate the block early.
+          output_delimiter="NGINX_UPSTREAMS_${RANDOM}${RANDOM}"
+          {
+            echo "nginx_active_upstreams<<${output_delimiter}"
+            printf '%s\n' "$upstreams"
+            echo "${output_delimiter}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Best-effort diagnostics: SSH to the target host, capture container state
+      # and recent deployment/container logs, redact secrets, and expose the
+      # result as the `log_snippet` step output. continue-on-error keeps a
+      # collection failure from failing the report job.
+      - name: Collect host deployment logs
+        id: host-logs
+        continue-on-error: true
+        shell: bash
+        env:
+          SSH_HOST: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }}
+          SSH_KEY: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }}
+        run: |
+          set -euo pipefail
+
+          # Redact credentials from stdin. Every rule is global (g) so several
+          # secrets on one log line are all masked, not just the first match.
+          # The header and key-name rules are case-insensitive (GNU sed `I`) so
+          # upper-case names such as DB_PASSWORD= or NEXTAUTH_SECRET= are covered.
+          sanitize_logs() {
+            sed -E \
+              -e 's/(Authorization:[[:space:]]*Bearer[[:space:]]+)[^[:space:]]+/\1[REDACTED]/gI' \
+              -e 's/([?&]token=)[^&[:space:]]+/\1[REDACTED]/g' \
+              -e 's/([?&]api[_-]?key=)[^&[:space:]]+/\1[REDACTED]/g' \
+              -e 's/((password|passwd|pwd|secret|token|api[_-]?key|access[_-]?key)[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/gI' \
+              -e 's/(AWS_ACCESS_KEY_ID[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g' \
+              -e 's/(AWS_SECRET_ACCESS_KEY[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g' \
+              -e 's/(SENDGRID_API_KEY[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g'
+          }
+
+          log_snippet="host log collection unavailable"
+          if [ -n "${SSH_HOST:-}" ] && [ -n "${SSH_KEY:-}" ]; then
+            key_file="$(mktemp)"
+            output_file="$(mktemp)"
+            # Always remove the private key and captured output from the runner.
+            trap 'rm -f "$key_file" "$output_file"' EXIT
+            printf '%s\n' "$SSH_KEY" > "$key_file"
+            chmod 600 "$key_file"
+            # Errexit is suspended so a failed SSH session is reported in the
+            # snippet instead of aborting the step.
+            set +e
+            ssh -o BatchMode=yes \
+              -o StrictHostKeyChecking=no \
+              -o ConnectTimeout=15 \
+              -i "$key_file" \
+              "onehash@${SSH_HOST}" \
+              'set +e
+              echo "=== host ==="
+              hostname
+              date -u
+              echo ""
+              echo "=== docker containers ==="
+              docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" 2>/dev/null | sed -n "1,40p"
+              echo ""
+              echo "=== recent deployment logs ==="
+              for file in /home/onehash/deployment.log /home/onehash/api-deployment.log /home/onehash/worker-deployment.log; do
+                if [ -f "$file" ]; then
+                  echo "--- ${file} ---"
+                  tail -80 "$file"
+                fi
+              done
+              echo ""
+              echo "=== recent candidate/current container logs ==="
+              for container in web-candidate api-candidate worker-new-1 worker-1; do
+                if docker ps -a --format "{{.Names}}" 2>/dev/null | grep -Fxq "$container"; then
+                  echo "--- docker logs ${container} ---"
+                  docker logs --tail 60 "$container" 2>&1
+                fi
+              done' > "$output_file" 2>&1
+            ssh_status=$?
+            set -e
+            if [ "$ssh_status" -eq 0 ] && [ -s "$output_file" ]; then
+              log_snippet="$(tail -220 "$output_file" | sanitize_logs)"
+            else
+              log_snippet="host log collection failed: $(tr '\n' ' ' < "$output_file" | sanitize_logs | sed -E 's/[[:space:]]+/ /g')"
+            fi
+          fi
+
+          # Multiline step output with a randomized delimiter: log text is
+          # uncontrolled and could otherwise contain the terminator line.
+          output_delimiter="LOG_SNIPPET_${RANDOM}${RANDOM}"
+          {
+            echo "log_snippet<<${output_delimiter}"
+            printf '%s\n' "$log_snippet"
+            echo "${output_delimiter}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Builds the job-summary report and derives the notification event from
+      # the results of every upstream job.
+      - name: Write deployment report
+        id: report
+        shell: bash
+        env:
+          DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env || '' }}
+          BRANCH_NAME: ${{ needs.prepare-release.outputs.branch || env.BRANCH }}
+          RELEASE_ID: ${{ needs.prepare-release.outputs.release_id || 'unknown' }}
+          GIT_SHA: ${{ needs.prepare-release.outputs.git_hash || github.sha }}
+          WEB_IMAGE: ${{ needs.build-web.outputs.web_image || 'N/A' }}
+          API_IMAGE: ${{ needs.build-api.outputs.api_image || 'N/A' }}
+          WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image || 'N/A' }}
+          PREFLIGHT_RESULT: ${{ needs.preflight.result }}
+          BUILD_WEB_RESULT: ${{ needs.build-web.result }}
+          BUILD_API_RESULT: ${{ needs.build-api.result }}
+          BUILD_WORKER_RESULT:
${{ needs.build-worker.result }} + RECORD_BUILD_STATE_RESULT: ${{ needs.record-build-state.result }} + MIGRATE_DB_RESULT: ${{ needs.migrate-db.result }} + MIGRATION_COUNT: ${{ needs.migrate-db.outputs.migration_count || '0' }} + DEPLOY_API_RESULT: ${{ needs.deploy-api.result }} + DEPLOY_WEB_RESULT: ${{ needs.deploy-web.result }} + PROMOTE_ALL_RESULT: ${{ needs.promote-all.result }} + RECORD_PROMOTED_STATE_RESULT: ${{ needs.record-promoted-state.result }} + DEPLOY_WORKER_RESULT: ${{ needs.deploy-worker.result }} + ROLLBACK_JOB_RESULT: ${{ needs.rollback-after-promotion.result }} + ROLLBACK_STATUS_RAW: ${{ needs.rollback-after-promotion.outputs.rollback_status || '' }} + ROLLBACK_TARGET_SHA: ${{ needs.rollback-after-promotion.outputs.rollback_target_sha || '' }} + VERIFY_RESULT: ${{ needs.verify.result }} + RELEASE_LOCK_RESULT: ${{ needs.release-lock.result }} + NGINX_ACTIVE_UPSTREAMS: ${{ steps.nginx.outputs.nginx_active_upstreams }} + LOG_SNIPPET: ${{ steps.host-logs.outputs.log_snippet }} + run: | + set -euo pipefail + + if [ -z "$DEPLOY_ENV" ]; then + if [ "$BRANCH_NAME" = "main" ]; then + DEPLOY_ENV="production" + else + DEPLOY_ENV="staging" + fi + fi + + state_file_path="N/A" + if [ "$DEPLOY_ENV" = "production" ] || [ "$DEPLOY_ENV" = "staging" ]; then + source infra/scripts/deploy-state-paths.sh + require_deploy_state_config + state_file_path="$(get_deploy_state_key "deployments/current.json")" + fi + + migration_status="$MIGRATE_DB_RESULT" + if [ "$MIGRATE_DB_RESULT" = "success" ]; then + migration_status="success (${MIGRATION_COUNT} migrations)" + fi + + rollback_status="not_applicable" + if [ "$ROLLBACK_JOB_RESULT" != "skipped" ]; then + rollback_status="${ROLLBACK_STATUS_RAW:-failed}" + elif [ "$PROMOTE_ALL_RESULT" = "success" ] && [ "$VERIFY_RESULT" != "success" ]; then + rollback_status="not_triggered" + fi + + notification_event="deployment_failed_before_promotion" + failure_reason="Deployment failed before Web/API promotion. 
Check the job status table in the deployment report." + + if [ "$VERIFY_RESULT" = "success" ]; then + notification_event="deployment_succeeded" + failure_reason="" + elif [ "$PROMOTE_ALL_RESULT" = "success" ]; then + if [ "$rollback_status" = "succeeded" ]; then + notification_event="deployment_failed_after_promotion_rollback_succeeded" + failure_reason="Deployment failed after Web/API promotion. Automatic rollback restored the previous release." + elif [ "$rollback_status" = "skipped_no_previous_release" ]; then + notification_event="deployment_failed_after_promotion_rollback_failed" + failure_reason="Deployment failed after Web/API promotion. Automatic rollback was skipped because no previous current release was available in deployment state." + elif [ "$rollback_status" = "skipped_missing_images" ]; then + notification_event="deployment_failed_after_promotion_rollback_failed" + failure_reason="Deployment failed after Web/API promotion. Automatic rollback was skipped because one or more previous release images were missing in ECR." + else + notification_event="deployment_failed_after_promotion_rollback_failed" + failure_reason="Deployment failed after Web/API promotion. Automatic rollback failed, was skipped, or did not complete." 
+            fi
+          fi
+
+          # Human-readable report rendered on the workflow run page.
+          {
+            echo "# Cal-ID Deployment Report"
+            echo ""
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Environment | ${DEPLOY_ENV} |"
+            echo "| Release | ${RELEASE_ID} |"
+            echo "| Git SHA | ${GIT_SHA} |"
+            echo "| Branch | ${BRANCH_NAME} |"
+            echo "| State file | ${state_file_path} |"
+            echo "| Final notification | ${notification_event} |"
+            echo ""
+            echo "## Images"
+            echo ""
+            echo "| Service | Image |"
+            echo "|---|---|"
+            echo "| Web | ${WEB_IMAGE} |"
+            echo "| API | ${API_IMAGE} |"
+            echo "| Worker | ${WORKER_IMAGE} |"
+            echo ""
+            echo "## Job Status"
+            echo ""
+            echo "| Stage | Status |"
+            echo "|---|---|"
+            echo "| Preflight | ${PREFLIGHT_RESULT} |"
+            echo "| Build Web | ${BUILD_WEB_RESULT} |"
+            echo "| Build API | ${BUILD_API_RESULT} |"
+            echo "| Build Worker | ${BUILD_WORKER_RESULT} |"
+            echo "| Record Build State | ${RECORD_BUILD_STATE_RESULT} |"
+            echo "| Migrations | ${migration_status} |"
+            echo "| Deploy API | ${DEPLOY_API_RESULT} |"
+            echo "| Deploy Web | ${DEPLOY_WEB_RESULT} |"
+            echo "| Promote Web/API | ${PROMOTE_ALL_RESULT} |"
+            echo "| Record Promoted State | ${RECORD_PROMOTED_STATE_RESULT} |"
+            echo "| Deploy Workers | ${DEPLOY_WORKER_RESULT} |"
+            echo "| Automatic Rollback | ${rollback_status} |"
+            echo "| Verify / Record Current | ${VERIFY_RESULT} |"
+            echo "| Release Lock | ${RELEASE_LOCK_RESULT} |"
+            echo ""
+            echo "## Rollback"
+            echo ""
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Status | ${rollback_status} |"
+            echo "| Target SHA | ${ROLLBACK_TARGET_SHA:-N/A} |"
+            echo ""
+            echo "## NGINX Active Upstreams"
+            echo ""
+            echo '```text'
+            printf '%s\n' "${NGINX_ACTIVE_UPSTREAMS:-unavailable}"
+            echo '```'
+            echo ""
+            echo "## Host Log Snippet"
+            echo ""
+            echo '```text'
+            printf '%s\n' "${LOG_SNIPPET:-unavailable}"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          # Step outputs read by the notification step. failure_reason is
+          # exported because that step reads steps.report.outputs.failure_reason.
+          # log_snippet is multi-line, so it uses GitHub's name<<DELIM syntax with
+          # a randomized delimiter that log text cannot collide with.
+          output_delimiter="LOG_SNIPPET_${RANDOM}${RANDOM}"
+          {
+            echo "notification_event=${notification_event}"
+            echo "state_file_path=${state_file_path}"
+            echo "migration_status=${migration_status}"
+            echo "rollback_status=${rollback_status}"
+            echo "failure_reason=${failure_reason}"
+            echo "log_snippet<<${output_delimiter}"
+            printf '%s\n' "${LOG_SNIPPET:-unavailable}"
+            echo "${output_delimiter}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Hands the collected report fields to send-email.sh through the
+      # environment; the script itself is outside this file.
+      - name: Send final deployment notification
+        env:
+          SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }}
+          EMAIL_FROM: ${{ secrets.EMAIL_FROM || 'alerts@cal.id' }}
+          EMAIL_TO: ${{ secrets.EMAIL_TO || 'deployments@cal.id' }}
+          NOTIFICATION_EVENT: ${{ steps.report.outputs.notification_event }}
+          DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env || 'unknown' }}
+          RELEASE_ID: ${{ needs.prepare-release.outputs.release_id || 'unknown' }}
+          GIT_SHA: ${{ needs.prepare-release.outputs.git_hash || github.sha }}
+          FAILURE_REASON: ${{ steps.report.outputs.failure_reason }}
+          WEB_IMAGE: ${{ needs.build-web.outputs.web_image || 'N/A' }}
+          API_IMAGE: ${{ needs.build-api.outputs.api_image || 'N/A' }}
+          WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image || 'N/A' }}
+          MIGRATION_STATUS: ${{ steps.report.outputs.migration_status }}
+          WEB_STATUS: ${{ needs.deploy-web.result }}
+          API_STATUS: ${{ needs.deploy-api.result }}
+          WORKER_STATUS: ${{ needs.deploy-worker.result }}
+          ROLLBACK_STATUS: ${{ steps.report.outputs.rollback_status }}
+          ROLLBACK_TARGET_SHA: ${{ needs.rollback-after-promotion.outputs.rollback_target_sha || 'N/A' }}
+          STATE_FILE_PATH: ${{ steps.report.outputs.state_file_path }}
+          NGINX_ACTIVE_UPSTREAMS: ${{ steps.nginx.outputs.nginx_active_upstreams }}
+          LOG_SNIPPET: ${{ steps.report.outputs.log_snippet }}
+          LOG_SNIPPET_MAX_LINES: 220
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_ACTOR: ${{ github.actor }}
+          BRANCH_NAME: ${{ env.BRANCH }}
+        run: bash infra/scripts/send-email.sh
diff --git a/.github/workflows/main-deploy.yml b/.github/workflows/main-deploy.yml
index 76c8690a2a8fe1..e95e3845674425 100644
--- a/.github/workflows/main-deploy.yml
+++ b/.github/workflows/main-deploy.yml
@@ -1,22 +1,29 @@
 name: Build WEB, Push Docker Image, Deploy to EC2
+# DEPRECATED — Auto-deploy on PR merge is deprecated.
+# Deployment is now managed by .github/workflows/deploy-all.yml +# which provides unified staging/production deployment for all services (Web + API + Worker). +# The `pull_request` trigger below will be removed in a future release. +# Use `workflow_dispatch` for manual deployments, or rely on deploy-all.yml for automated deploys. +# +# Deprecated triggers will be removed in Phase 12. on: #trigger on push on "deploy" branch - push: - branches: - - build - pull_request: - types: - - closed - branches: - - main - - develop - paths: - - "apps/web/**" - - "packages/**" - - "infra/docker/web/**" - - ".github/workflows/main-deploy.yml" - - "infra/scripts/deploy-web.sh" + # push: + # branches: + # - build + # pull_request: + # types: + # - closed + # branches: + # - main + # - develop + # paths: + # - "apps/web/**" + # - "packages/**" + # - "infra/docker/web/**" + # - ".github/workflows/main-deploy.yml" + # - "infra/scripts/deploy-web.sh" workflow_dispatch: inputs: branch: @@ -297,8 +304,6 @@ jobs: ${{ steps.login-ecr.outputs.registry }}/${{ steps.key_values.outputs.REPO_NAME }}:${{ steps.hash.outputs.GIT_HASH }} ${{ steps.login-ecr.outputs.registry }}/${{ steps.key_values.outputs.REPO_NAME }}:latest build-args: | - CALENDSO_ENCRYPTION_KEY=${{ secrets.CALENDSO_ENCRYPTION_KEY }} - NEXT_PUBLIC_GTM_ID=${{ steps.key_values.outputs.NEXT_PUBLIC_GTM_ID }} NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID=${{ steps.key_values.outputs.NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID }} @@ -311,8 +316,6 @@ jobs: NEXT_PUBLIC_API_V2_URL=${{ secrets.NEXT_PUBLIC_API_V2_URL }} NEXT_PUBLIC_EMBED_LIB_URL=${{ format('https://{0}/embed-link/embed.js', steps.key_values.outputs.DOMAIN_NAME) }} NEXT_PUBLIC_ONEHASH_URL=${{ secrets.NEXT_PUBLIC_ONEHASH_URL }} - NEXTAUTH_URL=${{ format('https://{0}', steps.key_values.outputs.DOMAIN_NAME) }} - NEXTAUTH_SECRET=${{ secrets.NEXTAUTH_SECRET }} NEXT_PUBLIC_SENDGRID_SENDER_NAME=${{ secrets.NEXT_PUBLIC_SENDGRID_SENDER_NAME }} NEXT_PUBLIC_SENTRY_DSN=${{ 
steps.key_values.outputs.NEXT_PUBLIC_SENTRY_DSN }} NEXT_PUBLIC_LOGGER_LEVEL=${{ secrets.NEXT_PUBLIC_LOGGER_LEVEL }} @@ -344,8 +347,10 @@ jobs: NEXT_PUBLIC_WHITELISTED_ORGS=${{ secrets.NEXT_PUBLIC_WHITELISTED_ORGS }} NEXT_PUBLIC_PIXEL=${{ secrets.NEXT_PUBLIC_PIXEL }} NEXT_PUBLIC_CLOUDFLARE_SITEKEY=${{ secrets.NEXT_PUBLIC_CLOUDFLARE_SITEKEY }} + secrets: | + nextauth_secret=${{ secrets.NEXTAUTH_SECRET }} + calendso_encryption_key=${{ secrets.CALENDSO_ENCRYPTION_KEY }} - - name: Check Docker build status shell: bash id: check_docker_build_status @@ -420,14 +425,11 @@ jobs: # ${{ secrets.DOCKERHUB_USERNAME }}/${{ steps.key_values.outputs.REPO_NAME }}:${{ steps.hash.outputs.GIT_HASH }} # ${{ secrets.DOCKERHUB_USERNAME }}/${{ steps.key_values.outputs.REPO_NAME }}:latest # build-args: | - # CALENDSO_ENCRYPTION_KEY=${{ secrets.CALENDSO_ENCRYPTION_KEY }} # NEXT_PUBLIC_WEBAPP_URL=${{ format('https://app.{0}', steps.key_values.outputs.DOMAIN_NAME) }} # NEXT_PUBLIC_WEBSITE_URL=${{ format('https://{0}', steps.key_values.outputs.DOMAIN_NAME) }} # NEXT_PUBLIC_API_V2_URL=${{ secrets.NEXT_PUBLIC_API_V2_URL }} # NEXT_PUBLIC_EMBED_LIB_URL=${{ format('https://{0}/embed-link/embed.js', steps.key_values.outputs.DOMAIN_NAME) }} # NEXT_PUBLIC_ONEHASH_URL=${{ secrets.NEXT_PUBLIC_ONEHASH_URL }} - # NEXTAUTH_URL=${{ format('https://{0}', steps.key_values.outputs.DOMAIN_NAME) }} - # NEXTAUTH_SECRET=${{ secrets.NEXTAUTH_SECRET }} # NEXT_PUBLIC_SENDGRID_SENDER_NAME=${{ secrets.BRAND_NAME }} # NEXT_PUBLIC_SENTRY_DSN=${{ steps.key_values.outputs.NEXT_PUBLIC_SENTRY_DSN }} # NEXT_PUBLIC_LOGGER_LEVEL=${{ secrets.NEXT_PUBLIC_LOGGER_LEVEL }} diff --git a/.github/workflows/rollback.yml b/.github/workflows/rollback.yml new file mode 100644 index 00000000000000..67c12e9dc1aa5d --- /dev/null +++ b/.github/workflows/rollback.yml @@ -0,0 +1,456 @@ +name: App Rollback + +on: + workflow_dispatch: + inputs: + branch: + description: "Branch containing rollback scripts" + required: true + default: "main" + 
target_sha: + description: "Git SHA to roll application services back to" + required: true + environment: + description: "Rollback environment" + required: true + default: "staging" + type: choice + options: + - staging + - production + dry_run: + description: "Validate and print commands without running host rollback" + required: true + default: "true" + type: choice + options: + - "true" + - "false" + +concurrency: + # Environment-scoped: 'staging' or 'production' (from workflow_dispatch environment input) + group: rollback-${{ github.event.inputs.environment }} + cancel-in-progress: false + +jobs: + validate-schema: + name: Validate rollback schema + runs-on: ubuntu-latest + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.branch }} + + - name: Validate script syntax + shell: bash + run: | + bash -n infra/scripts/validate-rollback-schema.sh + bash -n infra/scripts/rollback.sh + + - name: Run schema validation from target VPC + uses: appleboy/ssh-action@v0.1.10 + env: + TARGET_SHA: ${{ github.event.inputs.target_sha }} + ROLLBACK_ENV: ${{ github.event.inputs.environment }} + REPO_URL: ${{ secrets.REPO_URL }} + DATABASE_URL: ${{ github.event.inputs.environment == 'production' && secrets.DATABASE_DIRECT_URL_PROD || secrets.DATABASE_DIRECT_URL_STAG }} + BRANCH_NAME: ${{ github.event.inputs.branch }} + with: + host: ${{ github.event.inputs.environment == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ github.event.inputs.environment == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 1200s + envs: TARGET_SHA,ROLLBACK_ENV,REPO_URL,DATABASE_URL,BRANCH_NAME + script: | + set -euo pipefail + echo "Manual app-only rollback requested for ${ROLLBACK_ENV} at ${TARGET_SHA}" + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! 
-d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 + git fetch origin "$TARGET_SHA" --depth 1 || true + git checkout "$TARGET_SHA" || git checkout "origin/$BRANCH_NAME" + chmod +x infra/scripts/validate-rollback-schema.sh + TARGET_SHA="$TARGET_SHA" REPO_URL="$REPO_URL" DATABASE_URL="$DATABASE_URL" BRANCH_NAME="$BRANCH_NAME" infra/scripts/validate-rollback-schema.sh + + verify-images: + name: Verify rollback images + runs-on: ubuntu-latest + needs: validate-schema + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Verify rollback images exist in ECR + shell: bash + env: + TARGET_SHA: ${{ github.event.inputs.target_sha }} + WEB_REPO_NAME: ${{ github.event.inputs.environment == 'production' && 'cal_prod' || 'cal_stag' }} + API_REPO_NAME: ${{ github.event.inputs.environment == 'production' && 'cal_api_prod' || 'cal_api_stag' }} + WORKER_REPO_NAME: ${{ github.event.inputs.environment == 'production' && 'cal_worker_prod' || 'cal_worker_stag' }} + run: | + set -euo pipefail + failed=0 + for repo in "$WEB_REPO_NAME" "$API_REPO_NAME" "$WORKER_REPO_NAME"; do + echo "Checking ECR image ${repo}:${TARGET_SHA}" + if aws ecr describe-images \ + --repository-name "$repo" \ + --image-ids imageTag="$TARGET_SHA" >/dev/null 2>&1; then + echo " ✅ ${repo}:${TARGET_SHA} exists" + else + echo " ❌ ${repo}:${TARGET_SHA} NOT FOUND — rollback cannot proceed" + failed=1 + fi + done + if [ "$failed" -ne 0 ]; then + echo "" + echo "One or more rollback images are missing. Build and push images for ${TARGET_SHA} before retrying." + exit 1 + fi + echo "All rollback images verified." 
+ + rollback-app: + name: App-only rollback + runs-on: ubuntu-latest + needs: + - validate-schema + - verify-images + if: ${{ github.event.inputs.dry_run == 'false' }} + steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.branch }} + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ secrets.AWS_REGION }} + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Run app-only rollback on target host + uses: appleboy/ssh-action@v0.1.10 + env: + TARGET_SHA: ${{ github.event.inputs.target_sha }} + ROLLBACK_ENV: ${{ github.event.inputs.environment }} + BRANCH_NAME: ${{ github.event.inputs.branch }} + REPO_URL: ${{ secrets.REPO_URL }} + DATABASE_URL: ${{ github.event.inputs.environment == 'production' && secrets.DATABASE_DIRECT_URL_PROD || secrets.DATABASE_DIRECT_URL_STAG }} + DOMAIN_NAME: ${{ github.event.inputs.environment == 'production' && secrets.DOMAIN_NAME_PROD || secrets.DOMAIN_NAME_STAG }} + HOMEPAGE_URL: ${{ github.event.inputs.environment == 'production' && secrets.HOMEPAGE_URL_PROD || secrets.HOMEPAGE_URL_STAG }} + CERTBOT_EMAIL: ${{ secrets.CERTBOT_EMAIL }} + DEPLOY_ENV: ${{ github.event.inputs.environment }} + AWS_REGION: ${{ secrets.AWS_REGION }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} + ECR_REGISTRY: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com', secrets.AWS_ACCOUNT_ID, secrets.AWS_REGION) }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_ACTOR: ${{ github.actor }} + RELEASE_ID: rollback-${{ github.run_id }} + WEB_REPO_NAME: ${{ github.event.inputs.environment == 'production' && 'cal_prod' || 'cal_stag' }} + API_REPO_NAME: ${{ github.event.inputs.environment == 'production' && 'cal_api_prod' || 'cal_api_stag' }} + WORKER_REPO_NAME: ${{ 
github.event.inputs.environment == 'production' && 'cal_worker_prod' || 'cal_worker_stag' }} + with: + host: ${{ github.event.inputs.environment == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ github.event.inputs.environment == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 2400s + envs: TARGET_SHA,ROLLBACK_ENV,DEPLOY_ENV,BRANCH_NAME,REPO_URL,DATABASE_URL,DOMAIN_NAME,HOMEPAGE_URL,CERTBOT_EMAIL,AWS_REGION,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,AWS_ACCOUNT_ID,ECR_REGISTRY,GITHUB_RUN_ID,GITHUB_ACTOR,RELEASE_ID,WEB_REPO_NAME,API_REPO_NAME,WORKER_REPO_NAME + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! -d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 + git fetch origin "$TARGET_SHA" --depth 1 || true + git checkout "$TARGET_SHA" || git checkout "origin/$BRANCH_NAME" + aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" + aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY" + aws configure set default.region "$AWS_REGION" + aws ecr get-login-password --region "$AWS_REGION" \ + | docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com" + chmod +x infra/scripts/*.sh + registry="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com" + DEPLOY_ENV="$DEPLOY_ENV" \ + TARGET_SHA="$TARGET_SHA" \ + REPO_URL="$REPO_URL" \ + DATABASE_URL="$DATABASE_URL" \ + BRANCH_NAME="$BRANCH_NAME" \ + WEB_IMAGE="${registry}/${WEB_REPO_NAME}:${TARGET_SHA}" \ + API_IMAGE="${registry}/${API_REPO_NAME}:${TARGET_SHA}" \ + WORKER_IMAGE="${registry}/${WORKER_REPO_NAME}:${TARGET_SHA}" \ + DOMAIN_NAME="$DOMAIN_NAME" \ + HOMEPAGE_URL="$HOMEPAGE_URL" \ + ECR_REGISTRY="$ECR_REGISTRY" \ + GITHUB_RUN_ID="$GITHUB_RUN_ID" \ + GITHUB_ACTOR="$GITHUB_ACTOR" \ + RELEASE_ID="$RELEASE_ID" \ + 
infra/scripts/rollback.sh + + rollback-report: + name: Rollback report and final notification + runs-on: ubuntu-latest + needs: + - validate-schema + - verify-images + - rollback-app + if: ${{ always() && github.event.inputs.dry_run == 'false' }} + steps: + - name: Checkout report helpers + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.branch }} + sparse-checkout: | + infra/scripts/deploy-state-paths.sh + infra/scripts/send-email.sh + + - name: Collect NGINX active upstreams + id: nginx + continue-on-error: true + shell: bash + env: + SSH_HOST: ${{ github.event.inputs.environment == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + SSH_KEY: ${{ github.event.inputs.environment == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + run: | + set -euo pipefail + upstreams="unavailable" + if [ -n "${SSH_HOST:-}" ] && [ -n "${SSH_KEY:-}" ]; then + key_file="$(mktemp)" + output_file="$(mktemp)" + trap 'rm -f "$key_file" "$output_file"' EXIT + printf '%s\n' "$SSH_KEY" > "$key_file" + chmod 600 "$key_file" + set +e + ssh -o BatchMode=yes \ + -o StrictHostKeyChecking=no \ + -o ConnectTimeout=15 \ + -i "$key_file" \ + "onehash@${SSH_HOST}" \ + 'if [ -f /etc/nginx/conf.d/cal-id.conf ]; then + grep -E "proxy_pass http://127\\.0\\.0\\.1:[0-9]+" /etc/nginx/conf.d/cal-id.conf | sed -E "s/^[[:space:]]+//" | sort -u + static_target="$(readlink -f /var/www/cal-id-static/current 2>/dev/null || true)" + if [ -n "$static_target" ]; then printf "static: %s\n" "$static_target"; fi + else + printf "cal-id.conf not found\n" + fi' > "$output_file" 2>&1 + ssh_status=$? 
+            set -e
+            if [ "$ssh_status" -eq 0 ] && [ -s "$output_file" ]; then
+              upstreams="$(cat "$output_file")"
+            else
+              upstreams="unavailable: $(tr '\n' ' ' < "$output_file" | sed -E 's/[[:space:]]+/ /g')"
+            fi
+          fi
+          # Publish the (possibly multi-line) upstream list as a step output.
+          # GitHub's multiline syntax is name<<DELIM / value / DELIM; the delimiter
+          # is randomized so remote output can never terminate the block early.
+          output_delimiter="NGINX_UPSTREAMS_${RANDOM}${RANDOM}"
+          {
+            echo "nginx_active_upstreams<<${output_delimiter}"
+            printf '%s\n' "$upstreams"
+            echo "${output_delimiter}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Best-effort diagnostics: SSH to the rollback target, capture container
+      # state and recent logs, redact secrets, and expose the result as the
+      # `log_snippet` step output. continue-on-error keeps a collection failure
+      # from failing the report job.
+      - name: Collect host rollback logs
+        id: host-logs
+        continue-on-error: true
+        shell: bash
+        env:
+          SSH_HOST: ${{ github.event.inputs.environment == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }}
+          SSH_KEY: ${{ github.event.inputs.environment == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }}
+        run: |
+          set -euo pipefail
+
+          # Redact credentials from stdin. Every rule is global (g) so several
+          # secrets on one log line are all masked, not just the first match.
+          # The header and key-name rules are case-insensitive (GNU sed `I`) so
+          # upper-case names such as DB_PASSWORD= or NEXTAUTH_SECRET= are covered.
+          sanitize_logs() {
+            sed -E \
+              -e 's/(Authorization:[[:space:]]*Bearer[[:space:]]+)[^[:space:]]+/\1[REDACTED]/gI' \
+              -e 's/([?&]token=)[^&[:space:]]+/\1[REDACTED]/g' \
+              -e 's/([?&]api[_-]?key=)[^&[:space:]]+/\1[REDACTED]/g' \
+              -e 's/((password|passwd|pwd|secret|token|api[_-]?key|access[_-]?key)[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/gI' \
+              -e 's/(AWS_ACCESS_KEY_ID[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g' \
+              -e 's/(AWS_SECRET_ACCESS_KEY[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g' \
+              -e 's/(SENDGRID_API_KEY[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g'
+          }
+
+          log_snippet="host log collection unavailable"
+          if [ -n "${SSH_HOST:-}" ] && [ -n "${SSH_KEY:-}" ]; then
+            key_file="$(mktemp)"
+            output_file="$(mktemp)"
+            # Always remove the private key and captured output from the runner.
+            trap 'rm -f "$key_file" "$output_file"' EXIT
+            printf '%s\n' "$SSH_KEY" > "$key_file"
+            chmod 600 "$key_file"
+            # Errexit is suspended so a failed SSH session is reported in the
+            # snippet instead of aborting the step.
+            set +e
+            ssh -o BatchMode=yes \
+              -o StrictHostKeyChecking=no \
+              -o ConnectTimeout=15 \
+              -i "$key_file" \
+              "onehash@${SSH_HOST}" \
+              'set +e
+              echo "=== host ==="
+              hostname
+              date -u
+              echo ""
+              echo "=== docker containers ==="
+              docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" 2>/dev/null | sed -n "1,40p"
+              echo ""
+              echo "=== recent deployment logs ==="
+              for file in /home/onehash/deployment.log /home/onehash/api-deployment.log /home/onehash/worker-deployment.log; do
+                if [ -f "$file" ]; then
+                  echo "--- ${file} ---"
+                  tail -80 "$file"
+                fi
+              done
+              echo ""
+              echo "=== recent container logs ==="
+              for container in web-candidate api-candidate worker-new-1 worker-1; do
+                if docker ps -a --format "{{.Names}}" 2>/dev/null | grep -Fxq "$container"; then
+                  echo "--- docker logs ${container} ---"
+                  docker logs --tail 60 "$container" 2>&1
+                fi
+              done' > "$output_file" 2>&1
+            ssh_status=$?
+            set -e
+            if [ "$ssh_status" -eq 0 ] && [ -s "$output_file" ]; then
+              log_snippet="$(tail -220 "$output_file" | sanitize_logs)"
+            else
+              log_snippet="host log collection failed: $(tr '\n' ' ' < "$output_file" | sanitize_logs | sed -E 's/[[:space:]]+/ /g')"
+            fi
+          fi
+
+          # Multiline step output with a randomized delimiter: log text is
+          # uncontrolled and could otherwise contain the terminator line.
+          output_delimiter="LOG_SNIPPET_${RANDOM}${RANDOM}"
+          {
+            echo "log_snippet<<${output_delimiter}"
+            printf '%s\n' "$log_snippet"
+            echo "${output_delimiter}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Builds the job-summary report and derives the notification event from
+      # the validate-schema, verify-images and rollback-app job results.
+      - name: Write rollback report
+        id: report
+        shell: bash
+        env:
+          DEPLOY_ENV: ${{ github.event.inputs.environment }}
+          TARGET_SHA: ${{ github.event.inputs.target_sha }}
+          BRANCH_NAME: ${{ github.event.inputs.branch }}
+          VALIDATE_SCHEMA_RESULT: ${{ needs.validate-schema.result }}
+          VERIFY_IMAGES_RESULT: ${{ needs.verify-images.result }}
+          ROLLBACK_APP_RESULT: ${{ needs.rollback-app.result }}
+          WEB_IMAGE: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/{2}:{3}', secrets.AWS_ACCOUNT_ID, secrets.AWS_REGION, github.event.inputs.environment == 'production' && 'cal_prod' || 'cal_stag', github.event.inputs.target_sha) }}
+          API_IMAGE: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/{2}:{3}', secrets.AWS_ACCOUNT_ID, secrets.AWS_REGION, github.event.inputs.environment == 'production' && 'cal_api_prod' || 'cal_api_stag', github.event.inputs.target_sha) }}
+          WORKER_IMAGE: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/{2}:{3}', secrets.AWS_ACCOUNT_ID, secrets.AWS_REGION, github.event.inputs.environment == 'production' && 'cal_worker_prod' || 'cal_worker_stag', github.event.inputs.target_sha) }}
+          NGINX_ACTIVE_UPSTREAMS: ${{ steps.nginx.outputs.nginx_active_upstreams }}
+          LOG_SNIPPET: ${{ steps.host-logs.outputs.log_snippet }}
+        run: |
+          set -euo pipefail
+          source
infra/scripts/deploy-state-paths.sh
+          require_deploy_state_config
+          state_file_path="$(get_deploy_state_key "deployments/current.json")"
+
+          # Default to failure; only a clean run of all three upstream jobs
+          # upgrades the outcome to success.
+          notification_event="manual_rollback_failed"
+          rollback_status="failed"
+          failure_reason="Manual rollback failed. Check validate-schema, verify-images, and rollback-app logs."
+          if [ "$VALIDATE_SCHEMA_RESULT" = "success" ] && [ "$VERIFY_IMAGES_RESULT" = "success" ] && [ "$ROLLBACK_APP_RESULT" = "success" ]; then
+            notification_event="manual_rollback_succeeded"
+            rollback_status="succeeded"
+            failure_reason=""
+          fi
+
+          # Human-readable report rendered on the workflow run page.
+          {
+            echo "# Cal-ID Manual Rollback Report"
+            echo ""
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Environment | ${DEPLOY_ENV} |"
+            echo "| Target SHA | ${TARGET_SHA} |"
+            echo "| Branch | ${BRANCH_NAME} |"
+            echo "| State file | ${state_file_path} |"
+            echo "| Final notification | ${notification_event} |"
+            echo ""
+            echo "## Images"
+            echo ""
+            echo "| Service | Image |"
+            echo "|---|---|"
+            echo "| Web | ${WEB_IMAGE} |"
+            echo "| API | ${API_IMAGE} |"
+            echo "| Worker | ${WORKER_IMAGE} |"
+            echo ""
+            echo "## Job Status"
+            echo ""
+            echo "| Stage | Status |"
+            echo "|---|---|"
+            echo "| Validate Schema | ${VALIDATE_SCHEMA_RESULT} |"
+            echo "| Verify Images | ${VERIFY_IMAGES_RESULT} |"
+            echo "| Rollback App | ${ROLLBACK_APP_RESULT} |"
+            echo "| Rollback | ${rollback_status} |"
+            echo ""
+            echo "## NGINX Active Upstreams"
+            echo ""
+            echo '```text'
+            printf '%s\n' "${NGINX_ACTIVE_UPSTREAMS:-unavailable}"
+            echo '```'
+            echo ""
+            echo "## Host Log Snippet"
+            echo ""
+            echo '```text'
+            printf '%s\n' "${LOG_SNIPPET:-unavailable}"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          # Step outputs read by the notification step. failure_reason is
+          # exported because that step reads steps.report.outputs.failure_reason.
+          # log_snippet is multi-line, so it uses GitHub's name<<DELIM syntax with
+          # a randomized delimiter that log text cannot collide with.
+          output_delimiter="LOG_SNIPPET_${RANDOM}${RANDOM}"
+          {
+            echo "notification_event=${notification_event}"
+            echo "rollback_status=${rollback_status}"
+            echo "state_file_path=${state_file_path}"
+            echo "failure_reason=${failure_reason}"
+            echo "log_snippet<<${output_delimiter}"
+            printf '%s\n' "${LOG_SNIPPET:-unavailable}"
+            echo "${output_delimiter}"
+          } >> "$GITHUB_OUTPUT"
+
+      # Hands the collected report fields to send-email.sh through the
+      # environment; the script itself is outside this file.
+      - name: Send final rollback notification
+        env:
+          SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }}
+          EMAIL_FROM: ${{ secrets.EMAIL_FROM ||
'alerts@cal.id' }} + EMAIL_TO: ${{ secrets.EMAIL_TO || 'deployments@cal.id' }} + NOTIFICATION_EVENT: ${{ steps.report.outputs.notification_event }} + DEPLOY_ENV: ${{ github.event.inputs.environment }} + RELEASE_ID: rollback-${{ github.run_id }} + GIT_SHA: ${{ github.event.inputs.target_sha }} + FAILURE_REASON: ${{ steps.report.outputs.failure_reason }} + WEB_IMAGE: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/{2}:{3}', secrets.AWS_ACCOUNT_ID, secrets.AWS_REGION, github.event.inputs.environment == 'production' && 'cal_prod' || 'cal_stag', github.event.inputs.target_sha) }} + API_IMAGE: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/{2}:{3}', secrets.AWS_ACCOUNT_ID, secrets.AWS_REGION, github.event.inputs.environment == 'production' && 'cal_api_prod' || 'cal_api_stag', github.event.inputs.target_sha) }} + WORKER_IMAGE: ${{ format('{0}.dkr.ecr.{1}.amazonaws.com/{2}:{3}', secrets.AWS_ACCOUNT_ID, secrets.AWS_REGION, github.event.inputs.environment == 'production' && 'cal_worker_prod' || 'cal_worker_stag', github.event.inputs.target_sha) }} + MIGRATION_STATUS: not_applicable + WEB_STATUS: ${{ needs.rollback-app.result }} + API_STATUS: ${{ needs.rollback-app.result }} + WORKER_STATUS: ${{ needs.rollback-app.result }} + ROLLBACK_STATUS: ${{ steps.report.outputs.rollback_status }} + ROLLBACK_TARGET_SHA: ${{ github.event.inputs.target_sha }} + STATE_FILE_PATH: ${{ steps.report.outputs.state_file_path }} + NGINX_ACTIVE_UPSTREAMS: ${{ steps.nginx.outputs.nginx_active_upstreams }} + LOG_SNIPPET: ${{ steps.report.outputs.log_snippet }} + LOG_SNIPPET_MAX_LINES: 220 + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_ACTOR: ${{ github.actor }} + BRANCH_NAME: ${{ github.event.inputs.branch }} + run: bash infra/scripts/send-email.sh + + dry-run-summary: + name: Dry run summary + runs-on: ubuntu-latest + needs: + - validate-schema + - verify-images + if: ${{ github.event.inputs.dry_run == 'true' }} + steps: + - name: Summarize + shell: bash + run: | + 
echo "Dry run complete. No database or application state was changed." diff --git a/.github/workflows/validate-migration.yml b/.github/workflows/validate-migration.yml new file mode 100644 index 00000000000000..ae2dae85f1e6a2 --- /dev/null +++ b/.github/workflows/validate-migration.yml @@ -0,0 +1,134 @@ +name: Validate Prisma Migrations + +on: + pull_request: + paths: + - "packages/prisma/migrations/**" + - "MIGRATION_POLICY.md" + - ".github/workflows/validate-migration.yml" + +permissions: + contents: read + pull-requests: read + +jobs: + classify-migrations: + name: Classify migration safety + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Classify changed migrations + shell: bash + run: | + set -euo pipefail + + BASE_SHA="${{ github.event.pull_request.base.sha }}" + HEAD_SHA="${{ github.event.pull_request.head.sha }}" + git diff --name-only --diff-filter=AMR "$BASE_SHA" "$HEAD_SHA" -- \ + 'packages/prisma/migrations/**/migration.sql' \ + 'packages/prisma/migrations/**/down.sql' > /tmp/changed_migrations.txt + + if [ ! -s /tmp/changed_migrations.txt ]; then + echo "No changed Prisma migration or down.sql files." 
+ exit 0 + fi + + echo "Changed migration/down.sql files:" + cat /tmp/changed_migrations.txt + echo + + blocked=0 + + while IFS= read -r file; do + [ -n "$file" ] || continue + [ -f "$file" ] || continue + + echo "Classifying $file" + + if grep -Eiq '^[[:space:]]*DROP[[:space:]]+TABLE\b' "$file"; then + echo "::error file=$file::CONTRACT migration detected: DROP TABLE" + blocked=1 + fi + + if grep -Eiq '^[[:space:]]*(ALTER[[:space:]]+TABLE\b.*)?DROP[[:space:]]+COLUMN\b' "$file"; then + echo "::error file=$file::CONTRACT migration detected: DROP COLUMN" + blocked=1 + fi + + if perl -0ne ' + while (/ALTER\s+TABLE\b.*?;/gsi) { + my $stmt = $&; + if ($stmt =~ /RENAME\s+COLUMN\b/i) { + print "::error file=${ARGV}::CONTRACT migration detected: RENAME COLUMN\n"; # ${ARGV} is the file perl is reading; bash $file is not expanded inside this single-quoted script, and the braces stop perl from parsing the following :: as a package separator + exit 7; + } + } + ' "$file"; then + : + else + status=$? + if [ "$status" -eq 7 ]; then + blocked=1 + fi + fi + + if perl -0ne ' + while (/ALTER\s+TABLE\b.*?;/gsi) { + my $stmt = $&; + if ($stmt =~ /ALTER\s+COLUMN\b.*\bTYPE\b/i) { + print "::error file=${ARGV}::CONTRACT migration detected: ALTER COLUMN TYPE\n"; # ${ARGV}: current input file; braces keep the :: delimiter out of the variable name + exit 7; + } + } + ' "$file"; then + : + else + status=$? 
+ if [ "$status" -eq 7 ]; then + echo "::error file=$file::CONTRACT migration detected: SET NOT NULL without DEFAULT in the same statement" + blocked=1 + else + echo "::error file=$file::Unable to parse migration for SET NOT NULL validation" + blocked=1 + fi + fi + + if grep -Eiq '^[[:space:]]*(CREATE[[:space:]]+TABLE|CREATE[[:space:]]+(UNIQUE[[:space:]]+)?INDEX|ALTER[[:space:]]+TABLE\b.*ADD[[:space:]]+(COLUMN|CONSTRAINT)|ALTER[[:space:]]+TYPE\b.*ADD[[:space:]]+VALUE)' "$file"; then + echo "::notice file=$file::EXPAND pattern detected" + fi + + if grep -Eiq '^[[:space:]]*(UPDATE|DELETE[[:space:]]+FROM|INSERT[[:space:]]+INTO\b.*SELECT)' "$file"; then + echo "::notice file=$file::DATA-BACKFILL pattern detected; include owner, validation query, and failure handling in the PR" + fi + done < /tmp/changed_migrations.txt + + if [ "$blocked" -ne 0 ]; then + echo + echo "One or more changed migrations contain CONTRACT patterns." + echo "Use the expand/contract model or obtain explicit cleanup-release approval." + exit 1 + fi + + echo "Migration validation passed." + echo "down.sql files are also validated." diff --git a/.github/workflows/worker-deploy.yml b/.github/workflows/worker-deploy.yml index 1d61ab945f6b5a..4297a57c684396 100644 --- a/.github/workflows/worker-deploy.yml +++ b/.github/workflows/worker-deploy.yml @@ -1,16 +1,23 @@ name: Build Worker, Push Docker Image, Deploy to EC2 +# DEPRECATED — Auto-deploy on PR merge is deprecated. +# Deployment is now managed by .github/workflows/deploy-all.yml +# which provides unified staging/production deployment for all services (Web + API + Worker). +# The `pull_request` trigger below will be removed in a future release. +# Use `workflow_dispatch` for manual deployments, or rely on deploy-all.yml for automated deploys. +# +# Deprecated triggers will be removed in Phase 12. 
on: - pull_request: - types: - - closed - branches: - - main - - develop - paths: - - 'apps/worker/**' - - 'packages/**' - - '.github/workflows/worker-deploy.yml' + # pull_request: + # types: + # - closed + # branches: + # - main + # - develop + # paths: + # - 'apps/worker/**' + # - 'packages/**' + # - '.github/workflows/worker-deploy.yml' workflow_dispatch: inputs: branch: @@ -198,10 +205,11 @@ jobs: platforms: linux/amd64 push: true build-args: | - SENTRY_AUTH_TOKEN=${{ secrets.SENTRY_AUTH_TOKEN }} - SENTRY_ORG=${{ secrets.SENTRY_ORG }} - SENTRY_PROJECT=${{ steps.key_values.outputs.SENTRY_PROJECT }} GIT_HASH=${{ steps.hash.outputs.GIT_HASH }} + secrets: | + sentry_auth_token=${{ secrets.SENTRY_AUTH_TOKEN }} + sentry_org=${{ secrets.SENTRY_ORG }} + sentry_project=${{ steps.key_values.outputs.SENTRY_PROJECT }} tags: | ${{ steps.login-ecr.outputs.registry }}/${{ steps.key_values.outputs.WORKER_REPO_NAME }}:${{ steps.hash.outputs.GIT_HASH }} ${{ steps.login-ecr.outputs.registry }}/${{ steps.key_values.outputs.WORKER_REPO_NAME }}:latest diff --git a/AGENTS.md b/AGENTS.md index d3e901bf2e99be..9f01149ee271cf 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -120,6 +120,8 @@ yarn format # prettier | `SENDGRID_INBOUND_SMS_DOMAIN` | No | Domain used for signed email-to-SMS reply aliases (default: `sms.cal.id`) | | `SENDGRID_INBOUND_SMS_SECRET` | For email→SMS bridge | HMAC secret used to sign and verify reply aliases before forwarding to SMS | | `SENDGRID_INBOUND_SMS_WEBHOOK_TOKEN` | Recommended | Shared token to protect inbound parse webhook endpoint (`/api/sendgrid/inbound-sms`) | +| `DEPLOY_STATE_BUCKET` | Deploy pipeline only | S3 deployment state bucket; must be `cal-id` | +| `DEPLOY_STATE_PREFIX` | Deploy pipeline only | S3 deployment state prefix, derived as `deployment-prod` or `deployment-stag` | | `MCP_ENABLED` | Connector only | Enables MCP server in `apps/connector` (default: `false`) | | `MCP_BASE_PATH` | Connector only | MCP transport base path (default: `/mcp`) | | 
`MCP_SESSION_TTL` | Connector only | MCP session TTL in milliseconds (default: `1800000`) | diff --git a/CLAUDE.md b/CLAUDE.md index 3a06334ef4d614..6b92cb71a5dfb4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,5 +1,17 @@ Read AGENTS.md for detailed project context and conventions. +# Response Formatting Rules + +* Always format responses as clean, ready-to-copy Markdown. +* Prefer complete Markdown documents over partial snippets. +* Use proper headings, bullet lists, code fences, and tables where appropriate. +* Never wrap the entire response in extra commentary unless explicitly requested. +* When generating files, output the exact final file contents. +* Ensure Markdown renders correctly on GitHub and common Markdown editors. +* For code examples, always specify the language in fenced code blocks. +* Avoid conversational filler before or after generated Markdown content. +* Default behavior should optimize for direct copy/paste into `.md` files. + ## Graphify This project uses **workspace-scoped** Graphify knowledge graphs only. diff --git a/Dockerfile b/Dockerfile index a50239d23a0cc3..7dc5c314e6186a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,46 @@ -ARG NODE_VERSION=22 +# syntax=docker/dockerfile:1.7 -# ---------- Stage 1: Build ---------- +ARG NODE_VERSION=22 +# ============================================================ +# Stage 1: Builder +# ============================================================ +# Secrets are NOT passed as Docker build args. +# Only non-sensitive NEXT_PUBLIC_* vars (safe to expose in browser bundle) +# are passed as build args. Server-only secrets that Next.js validates during +# build (NEXTAUTH_SECRET, CALENDSO_ENCRYPTION_KEY) are supplied as BuildKit +# secrets for the build step and injected again at runtime via --env-file. 
+# +# Build arg classification: +# BUILD-TIME (safe as build args - baked into webpack bundle, no secret value): +# NEXT_PUBLIC_GTM_ID, NEXT_PUBLIC_META_*, NEXT_PUBLIC_WEBAPP_URL, +# NEXT_PUBLIC_API_V2_URL, NEXT_PUBLIC_WEBSITE_URL, NEXT_PUBLIC_EMBED_LIB_URL, +# NEXT_PUBLIC_ONEHASH_URL, NEXT_PUBLIC_SENDGRID_SENDER_NAME, +# NEXT_PUBLIC_SENTRY_DSN, NEXT_PUBLIC_LOGGER_LEVEL, +# NEXT_PUBLIC_TEAM_IMPERSONATION, NEXT_PUBLIC_APP_NAME, +# NEXT_PUBLIC_COMPANY_NAME, NEXT_PUBLIC_MINUTES_TO_BOOK, +# NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD, +# NEXT_PUBLIC_CALENDLY_*, NEXT_PUBLIC_STRIPE_PUBLIC_KEY, +# NEXT_PUBLIC_ONEHASH_CHAT_URL, NEXT_PUBLIC_RAZORPAY_CLIENT_ID, +# NEXT_PUBLIC_SIGNUP_URL, NEXT_PUBLIC_VAPID_PUBLIC_KEY, +# NEXT_PUBLIC_WEBSITE_TERMS_URL, NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL, +# NEXT_PUBLIC_ONEHASH_CHAT_ORIGIN, NEXT_PUBLIC_RECAPTCHA_*, +# NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS, NEXT_PUBLIC_POSTHOG_*, +# NEXT_PUBLIC_ONEHASH_ERP_URL, NEXT_PUBLIC_ONEHASH_CRM_URL, +# NEXT_PUBLIC_WHITELISTED_ORGS, NEXT_PUBLIC_PIXEL, +# NEXT_PUBLIC_CLOUDFLARE_SITEKEY +# +# RUNTIME SECRET (must NOT be build args - server-side only, would leak Docker metadata): +# NEXTAUTH_SECRET, CALENDSO_ENCRYPTION_KEY +# -> build validation: BuildKit secret mount only +# -> runtime: injected via /home/onehash/.env at container startup +# ============================================================ FROM node:${NODE_VERSION}-slim AS builder -# RUN corepack enable && corepack prepare yarn@3.4.1 --activate - WORKDIR /calid -ARG NEXTAUTH_URL -ARG NEXTAUTH_SECRET -ARG CALENDSO_ENCRYPTION_KEY +# NEXT_PUBLIC_* vars only - safe to bake in at build time ARG NEXT_PUBLIC_GTM_ID ARG NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID ARG NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID @@ -46,106 +75,112 @@ ARG NEXT_PUBLIC_RECAPTCHA_HIGH ARG NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS ARG NEXT_PUBLIC_POSTHOG_KEY ARG NEXT_PUBLIC_POSTHOG_HOST -ARG NEXT_PUBLIC_ONEHASH_CRM_URL ARG NEXT_PUBLIC_ONEHASH_ERP_URL +ARG NEXT_PUBLIC_ONEHASH_CRM_URL ARG 
NEXT_PUBLIC_WHITELISTED_ORGS ARG NEXT_PUBLIC_PIXEL ARG NEXT_PUBLIC_CLOUDFLARE_SITEKEY ARG MAX_OLD_SPACE_SIZE=8192 +# NOTE: NEXTAUTH_SECRET and CALENDSO_ENCRYPTION_KEY are intentionally NOT +# declared as ARG. next.config.js validates them during `next build`, so the +# build step reads them from BuildKit secret mounts. They are still injected +# at runtime through the Docker env file. Declaring them as ARG would bake them +# into Docker image metadata and layer history. + COPY . . -# Install env dependencies RUN set -eux; \ apt-get update -qq && \ apt-get install -y build-essential openssl pkg-config python-is-python3 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists /var/cache/apt/archives && \ + apt-get clean && rm -rf /var/lib/apt/lists /var/cache/apt/archives && \ yarn config set httpTimeout 1200000 -# Allowing mutable installs RUN yarn config set enableImmutableInstalls false -RUN export NEXTAUTH_URL=${NEXTAUTH_URL} && \ -export NEXTAUTH_SECRET=${NEXTAUTH_SECRET} && \ -export NEXT_PUBLIC_GTM_ID=${NEXT_PUBLIC_GTM_ID} && \ -export NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID=${NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID} && \ -export NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID=${NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID} && \ -export CALENDSO_ENCRYPTION_KEY=${CALENDSO_ENCRYPTION_KEY} && \ -export NEXT_PUBLIC_WEBAPP_URL=${NEXT_PUBLIC_WEBAPP_URL} && \ -export NEXT_PUBLIC_API_V2_URL=${NEXT_PUBLIC_API_V2_URL} && \ -export NODE_OPTIONS=--max-old-space-size=${MAX_OLD_SPACE_SIZE} && \ -export NEXT_PUBLIC_WEBSITE_URL=${NEXT_PUBLIC_WEBSITE_URL} && \ -export NEXT_PUBLIC_EMBED_LIB_URL=${NEXT_PUBLIC_EMBED_LIB_URL} && \ -export NEXT_PUBLIC_ONEHASH_URL=${NEXT_PUBLIC_ONEHASH_URL} && \ -export NEXT_PUBLIC_SENDGRID_SENDER_NAME=${NEXT_PUBLIC_SENDGRID_SENDER_NAME} && \ -export NEXT_PUBLIC_SENTRY_DSN=${NEXT_PUBLIC_SENTRY_DSN} && \ -export NEXT_PUBLIC_LOGGER_LEVEL=${NEXT_PUBLIC_LOGGER_LEVEL} && \ -export NEXT_PUBLIC_TEAM_IMPERSONATION=${NEXT_PUBLIC_TEAM_IMPERSONATION} && \ 
-export NEXT_PUBLIC_APP_NAME=${NEXT_PUBLIC_APP_NAME} && \ -export NEXT_PUBLIC_COMPANY_NAME=${NEXT_PUBLIC_COMPANY_NAME} && \ -export NEXT_PUBLIC_MINUTES_TO_BOOK=${NEXT_PUBLIC_MINUTES_TO_BOOK} && \ -export NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD=${NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD} && \ -export NEXT_PUBLIC_CALENDLY_OAUTH_URL=${NEXT_PUBLIC_CALENDLY_OAUTH_URL} && \ -export NEXT_PUBLIC_CALENDLY_API_BASE_URL=${NEXT_PUBLIC_CALENDLY_API_BASE_URL} && \ -export NEXT_PUBLIC_CALENDLY_CLIENT_ID=${NEXT_PUBLIC_CALENDLY_CLIENT_ID} && \ -export NEXT_PUBLIC_CALENDLY_REDIRECT_URI=${NEXT_PUBLIC_CALENDLY_REDIRECT_URI} && \ -export NEXT_PUBLIC_STRIPE_PUBLIC_KEY=${NEXT_PUBLIC_STRIPE_PUBLIC_KEY} && \ -export NEXT_PUBLIC_ONEHASH_CHAT_URL=${NEXT_PUBLIC_ONEHASH_CHAT_URL} && \ -export NEXT_PUBLIC_RAZORPAY_CLIENT_ID=${NEXT_PUBLIC_RAZORPAY_CLIENT_ID} && \ -export NEXT_PUBLIC_SIGNUP_URL=${NEXT_PUBLIC_SIGNUP_URL} && \ -export NEXT_PUBLIC_VAPID_PUBLIC_KEY=${NEXT_PUBLIC_VAPID_PUBLIC_KEY} && \ -export NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL=${NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL} && \ -export NEXT_PUBLIC_WEBSITE_TERMS_URL=${NEXT_PUBLIC_WEBSITE_TERMS_URL} && \ -export NEXT_PUBLIC_ONEHASH_CHAT_ORIGIN=${NEXT_PUBLIC_ONEHASH_CHAT_ORIGIN} && \ -export NEXT_PUBLIC_RECAPTCHA_LOW=${NEXT_PUBLIC_RECAPTCHA_LOW} && \ -export NEXT_PUBLIC_RECAPTCHA_MEDIUM=${NEXT_PUBLIC_RECAPTCHA_MEDIUM} && \ -export NEXT_PUBLIC_RECAPTCHA_HIGH=${NEXT_PUBLIC_RECAPTCHA_HIGH} && \ -export NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS=${NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS} && \ -export NEXT_PUBLIC_POSTHOG_KEY=${NEXT_PUBLIC_POSTHOG_KEY} && \ -export NEXT_PUBLIC_POSTHOG_HOST=${NEXT_PUBLIC_POSTHOG_HOST} && \ -export NEXT_PUBLIC_ONEHASH_ERP_URL=${NEXT_PUBLIC_ONEHASH_ERP_URL} && \ -export NEXT_PUBLIC_ONEHASH_CRM_URL=${NEXT_PUBLIC_ONEHASH_CRM_URL} && \ -export NEXT_PUBLIC_WHITELISTED_ORGS=${NEXT_PUBLIC_WHITELISTED_ORGS} && \ -export NEXT_PUBLIC_PIXEL=${NEXT_PUBLIC_PIXEL} && \ - export 
NEXT_PUBLIC_CLOUDFLARE_SITEKEY=${NEXT_PUBLIC_CLOUDFLARE_SITEKEY} && \ - export BUILD_STANDALONE=true && \ -export NODE_ENV=production && \ -export CI=1 && \ -yarn install && yarn build +# Export NEXT_PUBLIC_* vars so webpack can resolve them at build time. The two +# server-only secrets are read from BuildKit secret mounts for Next.js config +# validation, never from ARG. +RUN --mount=type=secret,id=nextauth_secret \ + --mount=type=secret,id=calendso_encryption_key \ + export \ + NEXTAUTH_SECRET="$(cat /run/secrets/nextauth_secret)" \ + CALENDSO_ENCRYPTION_KEY="$(cat /run/secrets/calendso_encryption_key)" \ + NEXT_PUBLIC_GTM_ID="${NEXT_PUBLIC_GTM_ID}" \ + NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID="${NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID}" \ + NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID="${NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID}" \ + NEXT_PUBLIC_WEBAPP_URL="${NEXT_PUBLIC_WEBAPP_URL}" \ + NEXT_PUBLIC_API_V2_URL="${NEXT_PUBLIC_API_V2_URL}" \ + NODE_OPTIONS="--max-old-space-size=${MAX_OLD_SPACE_SIZE}" \ + NEXT_PUBLIC_WEBSITE_URL="${NEXT_PUBLIC_WEBSITE_URL}" \ + NEXT_PUBLIC_EMBED_LIB_URL="${NEXT_PUBLIC_EMBED_LIB_URL}" \ + NEXT_PUBLIC_ONEHASH_URL="${NEXT_PUBLIC_ONEHASH_URL}" \ + NEXT_PUBLIC_SENDGRID_SENDER_NAME="${NEXT_PUBLIC_SENDGRID_SENDER_NAME}" \ + NEXT_PUBLIC_SENTRY_DSN="${NEXT_PUBLIC_SENTRY_DSN}" \ + NEXT_PUBLIC_LOGGER_LEVEL="${NEXT_PUBLIC_LOGGER_LEVEL}" \ + NEXT_PUBLIC_TEAM_IMPERSONATION="${NEXT_PUBLIC_TEAM_IMPERSONATION}" \ + NEXT_PUBLIC_APP_NAME="${NEXT_PUBLIC_APP_NAME}" \ + NEXT_PUBLIC_COMPANY_NAME="${NEXT_PUBLIC_COMPANY_NAME}" \ + NEXT_PUBLIC_MINUTES_TO_BOOK="${NEXT_PUBLIC_MINUTES_TO_BOOK}" \ + NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD="${NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD}" \ + NEXT_PUBLIC_CALENDLY_OAUTH_URL="${NEXT_PUBLIC_CALENDLY_OAUTH_URL}" \ + NEXT_PUBLIC_CALENDLY_API_BASE_URL="${NEXT_PUBLIC_CALENDLY_API_BASE_URL}" \ + NEXT_PUBLIC_CALENDLY_CLIENT_ID="${NEXT_PUBLIC_CALENDLY_CLIENT_ID}" \ + 
NEXT_PUBLIC_CALENDLY_REDIRECT_URI="${NEXT_PUBLIC_CALENDLY_REDIRECT_URI}" \ + NEXT_PUBLIC_STRIPE_PUBLIC_KEY="${NEXT_PUBLIC_STRIPE_PUBLIC_KEY}" \ + NEXT_PUBLIC_ONEHASH_CHAT_URL="${NEXT_PUBLIC_ONEHASH_CHAT_URL}" \ + NEXT_PUBLIC_RAZORPAY_CLIENT_ID="${NEXT_PUBLIC_RAZORPAY_CLIENT_ID}" \ + NEXT_PUBLIC_SIGNUP_URL="${NEXT_PUBLIC_SIGNUP_URL}" \ + NEXT_PUBLIC_VAPID_PUBLIC_KEY="${NEXT_PUBLIC_VAPID_PUBLIC_KEY}" \ + NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL="${NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL}" \ + NEXT_PUBLIC_WEBSITE_TERMS_URL="${NEXT_PUBLIC_WEBSITE_TERMS_URL}" \ + NEXT_PUBLIC_ONEHASH_CHAT_ORIGIN="${NEXT_PUBLIC_ONEHASH_CHAT_ORIGIN}" \ + NEXT_PUBLIC_RECAPTCHA_LOW="${NEXT_PUBLIC_RECAPTCHA_LOW}" \ + NEXT_PUBLIC_RECAPTCHA_MEDIUM="${NEXT_PUBLIC_RECAPTCHA_MEDIUM}" \ + NEXT_PUBLIC_RECAPTCHA_HIGH="${NEXT_PUBLIC_RECAPTCHA_HIGH}" \ + NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS="${NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS}" \ + NEXT_PUBLIC_POSTHOG_KEY="${NEXT_PUBLIC_POSTHOG_KEY}" \ + NEXT_PUBLIC_POSTHOG_HOST="${NEXT_PUBLIC_POSTHOG_HOST}" \ + NEXT_PUBLIC_ONEHASH_ERP_URL="${NEXT_PUBLIC_ONEHASH_ERP_URL}" \ + NEXT_PUBLIC_ONEHASH_CRM_URL="${NEXT_PUBLIC_ONEHASH_CRM_URL}" \ + NEXT_PUBLIC_WHITELISTED_ORGS="${NEXT_PUBLIC_WHITELISTED_ORGS}" \ + NEXT_PUBLIC_PIXEL="${NEXT_PUBLIC_PIXEL}" \ + NEXT_PUBLIC_CLOUDFLARE_SITEKEY="${NEXT_PUBLIC_CLOUDFLARE_SITEKEY}" \ + BUILD_STANDALONE=true \ + NODE_ENV=production \ + CI=1 \ + && yarn install && yarn build RUN rm -rf node_modules/.cache .yarn/cache apps/web/.next/cache -# ---------- Stage 2: Production ---------- - +# ============================================================ +# Stage 2: Production +# ============================================================ FROM node:${NODE_VERSION}-slim AS production - +# NOTE: The `apt-get update` on line 129 of the previous version had a +# missing semicolon on the preceding `apt-get clean` line, causing that line +# to be followed by a second `apt-get update` and `apt-get install` that +# duplicated work and wasted bandwidth. 
Fixed: consolidated into one +# RUN instruction with a single `apt-get update`. RUN set -eux; \ apt-get update -qq && \ - apt-get install -y build-essential openssl pkg-config python-is-python3 && \ - apt-get clean && \ - # required for accessing psql inside of container, when running psql on host ,rather than on remote service like RDS - apt-get update && apt-get install -y postgresql-client && \ - rm -rf /var/lib/apt/lists /var/cache/apt/archives && \ + apt-get install -y build-essential openssl pkg-config python-is-python3 postgresql-client && \ + apt-get clean && rm -rf /var/lib/apt/lists /var/cache/apt/archives && \ yarn config set httpTimeout 1200000 WORKDIR /app ARG MAX_OLD_SPACE_SIZE=8192 ENV NODE_OPTIONS=--max-old-space-size=${MAX_OLD_SPACE_SIZE} - # Copying essential files and packages COPY ./package.json ./.yarnrc.yml ./turbo.json ./i18n.json ./yarn.lock ./ COPY ./apps/api/v2 ./apps/api/v2 COPY ./packages ./packages - # Copying yarn plugins and release version from builder stage COPY --from=builder /calid/.yarn ./.yarn - # Copying the build output from the builder stage COPY --from=builder /calid/apps/web/ ./apps/web/ - # Copying node_modules from builder stage COPY --from=builder /calid/node_modules ./node_modules @@ -153,12 +188,12 @@ COPY ./entrypoint.sh ./ RUN chmod +x ./entrypoint.sh ARG IS_ROLLBACK=false -ENV IS_ROLLBACK=$IS_ROLLBACK - -# Allowing mutable installs -# RUN yarn config set enableImmutableInstalls false +ENV IS_ROLLBACK=${IS_ROLLBACK} -# EXPOSING PORT +# NOTE: NEXTAUTH_SECRET and CALENDSO_ENCRYPTION_KEY are injected at runtime +# via the EC2 Docker --env-file (/home/onehash/.env). They are never passed as +# Docker ARG or image ENV, so they do not appear in `docker history`, image +# layer metadata, or registry metadata. 
EXPOSE 3001 CMD ["sh", "./entrypoint.sh"] diff --git a/apps/connector/Dockerfile b/apps/connector/Dockerfile index df087ba8d70a3e..4984951ba10912 100644 --- a/apps/connector/Dockerfile +++ b/apps/connector/Dockerfile @@ -1,3 +1,5 @@ +# syntax=docker/dockerfile:1.7 + ARG NODE_VERSION=22 # --------------------------------------------------------- @@ -32,15 +34,20 @@ RUN yarn install # Build connector and dependency chain in monorepo context RUN yarn turbo run build --filter=@calid/connector... -# Optional source-map upload for Sentry -ARG SENTRY_AUTH_TOKEN="" -ARG SENTRY_ORG="" -ARG SENTRY_PROJECT="" +# Optional source-map upload for Sentry. +# Sentry credentials are BuildKit secrets, not ARGs, so they are not stored in +# Docker image history or registry metadata. ARG GIT_HASH="unknown" ARG SENTRY_UPLOAD_STRICT="false" -RUN if [ -n "$SENTRY_AUTH_TOKEN" ] && [ -n "$SENTRY_ORG" ] && [ -n "$SENTRY_PROJECT" ]; then \ - if SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN \ - npx @sentry/cli@latest releases new "$GIT_HASH" && \ +RUN --mount=type=secret,id=sentry_auth_token \ + --mount=type=secret,id=sentry_org \ + --mount=type=secret,id=sentry_project \ + SENTRY_AUTH_TOKEN="$(cat /run/secrets/sentry_auth_token 2>/dev/null || true)"; \ + SENTRY_ORG="$(cat /run/secrets/sentry_org 2>/dev/null || true)"; \ + SENTRY_PROJECT="$(cat /run/secrets/sentry_project 2>/dev/null || true)"; \ + export SENTRY_AUTH_TOKEN SENTRY_ORG SENTRY_PROJECT; \ + if [ -n "$SENTRY_AUTH_TOKEN" ] && [ -n "$SENTRY_ORG" ] && [ -n "$SENTRY_PROJECT" ]; then \ + if npx @sentry/cli@latest releases new "$GIT_HASH" && \ npx @sentry/cli@latest sourcemaps inject apps/connector/dist && \ npx @sentry/cli@latest sourcemaps upload apps/connector/dist \ --release="$GIT_HASH" --validate --ext=js --ext=map && \ diff --git a/apps/worker/Dockerfile b/apps/worker/Dockerfile index 3d797d9546878e..3b38ecf7e513e8 100644 --- a/apps/worker/Dockerfile +++ b/apps/worker/Dockerfile @@ -1,3 +1,5 @@ +# syntax=docker/dockerfile:1.7 + ARG 
NODE_VERSION=22 # --------------------------------------------------------- @@ -32,13 +34,18 @@ RUN yarn install # Build worker and dependency chain in monorepo context RUN yarn turbo run build --filter=@calid/worker... -# Optional source-map upload for Sentry -ARG SENTRY_AUTH_TOKEN="" -ARG SENTRY_ORG="" -ARG SENTRY_PROJECT="" +# Optional source-map upload for Sentry. +# Sentry credentials are BuildKit secrets, not ARGs, so they are not stored in +# Docker image history or registry metadata. ARG GIT_HASH="unknown" -RUN if [ -n "$SENTRY_AUTH_TOKEN" ] && [ -n "$SENTRY_ORG" ] && [ -n "$SENTRY_PROJECT" ]; then \ - SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN \ +RUN --mount=type=secret,id=sentry_auth_token \ + --mount=type=secret,id=sentry_org \ + --mount=type=secret,id=sentry_project \ + SENTRY_AUTH_TOKEN="$(cat /run/secrets/sentry_auth_token 2>/dev/null || true)"; \ + SENTRY_ORG="$(cat /run/secrets/sentry_org 2>/dev/null || true)"; \ + SENTRY_PROJECT="$(cat /run/secrets/sentry_project 2>/dev/null || true)"; \ + export SENTRY_AUTH_TOKEN SENTRY_ORG SENTRY_PROJECT; \ + if [ -n "$SENTRY_AUTH_TOKEN" ] && [ -n "$SENTRY_ORG" ] && [ -n "$SENTRY_PROJECT" ]; then \ npx @sentry/cli@latest releases new "$GIT_HASH" && \ npx @sentry/cli@latest sourcemaps inject apps/worker/dist && \ npx @sentry/cli@latest sourcemaps upload apps/worker/dist \ diff --git a/docs/unified_deployment.md b/docs/unified_deployment.md new file mode 100644 index 00000000000000..861085a923b6dc --- /dev/null +++ b/docs/unified_deployment.md @@ -0,0 +1,1638 @@ +# Unified Deployment Guide + +Cal-ID's automated deployment pipeline for Web (Next.js), API (Fastify/Connector), and Worker (BullMQ) services. This document is the canonical deployment reference and requires no access to the codebase. + +--- + +## 1. 
Purpose & Scope + +### What this document covers + +This guide documents the **unified deployment pipeline** (`deploy-all.yml`), which is the single active entrypoint for staging and production deployments of all Cal-ID services. It also covers: + +- The **manual rollback workflow** (`rollback.yml`) +- The **migration validation workflow** (`validate-migration.yml`) +- The **deployment state machine** (S3-backed) +- The **current concurrency model** (GitHub Actions concurrency groups; S3 locking is temporarily disabled) +- **Operational procedures** for operators + +### Active vs. Legacy Workflows + +| Workflow | Status | Notes | +|---|---|---| +| `.github/workflows/deploy-all.yml` | **Active** | Unified pipeline; primary entrypoint | +| `.github/workflows/rollback.yml` | **Active** | Manual app-only rollback entrypoint | +| `.github/workflows/validate-migration.yml` | **Active** | PR-level migration safety classification | +| `.github/workflows/main-deploy.yml` | **Deprecated** | Legacy web-only deploy; pull_request trigger commented out; `workflow_dispatch` still functional but not recommended | +| `.github/workflows/api-deploy.yml` | **Deprecated** | Legacy connector-only deploy; pull_request trigger commented out; `workflow_dispatch` still functional but not recommended | +| `.github/workflows/worker-deploy.yml` | **Deprecated** | Legacy worker-only deploy; pull_request trigger commented out; `workflow_dispatch` still functional but not recommended | + +The legacy workflows are maintained for manual rollback scenarios (e.g., targeting a specific service) but will be removed in a future release. All automated deployments use `deploy-all.yml`. + +### What this document does not cover + +- Application architecture or feature documentation (see `AGENTS.md`) +- Development setup or local run instructions +- Database schema design or migration authoring guidelines (see `packages/prisma/`) +- CI/CD for non-deployment workflows + +--- + +## 2. 
Architecture Overview + +### System Topology + +``` + ┌─────────────────────────┐ + │ GitHub Actions │ + │ deploy-all.yml │ + │ rollback.yml │ + └────────────┬─────────────┘ + │ SSH / AWS API + ┌───────────────────────────┼───────────────────────────┐ + │ │ │ + ┌─────▼─────┐ ┌──────▼──────┐ ┌────────▼────────┐ + │ ECR Repos │ │ S3 State │ │ SendGrid │ + │ (images) │ │ Bucket │ │ (notifications) │ + └─────┬─────┘ │ cal-id │ └─────────────────┘ + │ └──────┬─────┘ + │ │ + ┌────────▼────────┐ ┌───────▼─────────────────────────────────┐ + │ cal_prod │ │ S3 Key Structure (DEPLOY_STATE_PREFIX= │ + │ cal_api_prod │ │ deployment-prod | deployment-stag) │ + │ cal_worker_prod│ │ │ + └────────┬────────┘ │ /locks/{env}.lock │ + │ │ /manifests/sha-{sha}.json │ + │ │ /deployments/history/sha-{sha}.json │ + │ │ /deployments/current.json │ + │ └──────────────────────────────────────────┘ + │ + ┌───────────▼──────────────────────────────────────────────────────────────┐ + │ EC2 Host (onehash user) │ + │ │ + │ ┌────────────────────────────────────────────────────────────────────┐ │ + │ │ Docker │ │ + │ │ │ │ + │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────────┐ │ │ + │ │ │web-blue │ │web-green │ │api-blue │ │api-green │ │ │ + │ │ │port 3001 │ │port 3002 │ │port 4100 │ │port 4101 │ │ │ + │ │ └────┬─────┘ └────┬─────┘ └────┬─────┘ └──────┬───────┘ │ │ + │ │ │ │ │ │ │ │ + │ │ └─────────────┼─────────────┼─────────────────┘ │ │ + │ │ ▼ ▼ │ │ + │ │ ┌──────────────────────────────────────┐ │ │ + │ │ │ NGINX (cal-id.conf) │ │ │ + │ │ │ Blue/Green active: web→port:3001 │ │ │ + │ │ │ API proxy: api.* + mcp.* │ │ │ + │ │ └──────────────────────────────────────┘ │ │ + │ └────────────────────────────────────────────────────────────────────┘ │ + │ │ + │ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────────┐ │ + │ │worker-1 │ │worker-2 (opt) │ │ /var/www/cal-id-static/ │ │ + │ └──────────────┘ └──────────────┘ │ current/ → build-abc123 │ │ + │ │ candidate/ → build-def456 │ │ + 
│ └──────────────────────────────┘ │ + └──────────────────────────────────────────────────────────────────────────────┘ +``` + +### Deployment Flow (Unified Pipeline) + +```mermaid +flowchart TD + A([PR merged to main / develop]) --> B[deploy-all.yml triggered] + B --> C[preflight: secrets, config, syntax validation] + C --> D[prepare-release: resolve SHA, env, repos] + D --> E[skip S3 lock: GitHub concurrency controls same-environment deploys] + E --> G[build-web + build-api + build-worker in parallel] + G --> H[record-build-state: S3 staged] + H --> I[migrate-db: run on EC2 via SSH] + I --> J[deploy-api: stage candidate → health check] + I --> K[deploy-web: stage candidate + static assets → health check] + J --> L[promote-all: NGINX switch, static switch, one reload] + K --> L + L --> M[record-promoted-state: S3 promoted_pending_verification] + M --> N[deploy-workers: start worker-new-*, drain old, rename] + N --> O{worker startup ok?} + O -->|no| P[rollback-after-promotion: revert-nginx + rollback.sh] + O -->|yes| Q[verify: record current → S3 current.json] + P --> R([Notification: rollback succeeded/failed]) + Q --> T([Notification: deployment succeeded]) + I -.->|migration failure| X + + style X fill:#ffcccc + style R fill:#ffe0cc + style T fill:#ccffcc +``` + +### Service Routing + +The unified NGINX config (`cal-id.conf`) handles all routing: + +| Domain | Upstream | Notes | +|---|---|---| +| `cal.id` / `app.cal.id` | `127.0.0.1:3001` or `127.0.0.1:3002` | Next.js web (blue/green) | +| `api.cal.id` | `127.0.0.1:4100` or `127.0.0.1:4101` | Fastify connector API | +| `mcp.cal.id` | `127.0.0.1:4100` or `127.0.0.1:4101` | MCP transport on same connector | +| `/docs/api/*` | `https://api.cal.id/docs/` | API docs reverse proxy | +| `/docs/mcp/*` | `https://api.cal.id/docs/mcp/` | MCP docs reverse proxy | +| `/openapi.json` | `https://api.cal.id/openapi.json` | Public OpenAPI spec | +| `/` (logged in) | `127.0.0.1:3001` | Next.js app home page | +| `/` (anonymous) 
| `HOMEPAGE_URL` (external) | Framer marketing site | + +--- + +## 3. Environment Configuration + +### Branch-to-Environment Mapping + +| Branch | `DEPLOY_ENV` | ECR Repositories | EC2 Host | Database | +|---|---|---|---|---| +| `main` | `production` | `cal_prod`, `cal_api_prod`, `cal_worker_prod` | `EC2_HOST_PROD` | `DATABASE_DIRECT_URL_PROD` | +| `develop` | `staging` | `cal_stag`, `cal_api_stag`, `cal_worker_stag` | `EC2_HOST_STAG` | `DATABASE_DIRECT_URL_STAG` | + +### Image Naming Convention + +Images are tagged with the full Git SHA: + +``` +{ECR_REGISTRY}/{repo_name}:{GIT_SHA} +``` + +ECR registry is derived as: +``` +{aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com +``` + +For production: `cal_prod:`, `cal_api_prod:`, `cal_worker_prod:` +For staging: `cal_stag:`, `cal_api_stag:`, `cal_worker_stag:` + +Additionally, each build also pushes a `:latest` tag for debugging purposes, but the SHA tag is the canonical reference used in deployment. + +### `workflow_dispatch` Inputs (deploy-all.yml) + +| Input | Required | Default | Description | +|---|---|---|---| +| `branch` | Yes | `develop` | Target branch to deploy | +| `git_hash` | No | (HEAD) | Override Git commit hash; empty = auto-detect from `branch` HEAD | +| `rebuild` | Yes | `false` | Force rebuild even if images exist in ECR | +| `worker_replicas` | No | (from secret) | Number of worker replicas; overrides `WORKER_REPLICAS_PROD/STAG` | + +### `workflow_dispatch` Inputs (rollback.yml) + +| Input | Required | Default | Description | +|---|---|---|---| +| `branch` | Yes | `main` | Branch containing rollback scripts | +| `target_sha` | Yes | — | Git SHA to roll back to | +| `environment` | Yes | `staging` | Rollback target environment (staging/production) | +| `dry_run` | Yes | `true` | Validate without running host rollback | + +### Release ID Format + +Releases are named using the pattern: `v{YYYYMMDD}-{HHMMSS}` (e.g., `v20250526-143200`). 
This is generated at the `prepare-release` job and remains stable throughout the pipeline. + +--- + +## 4. Deployment Sequence + +### Phase 1: Preflight Validation + +**Job: `preflight`** + +Runs immediately after trigger, before any build or state mutation. Checks: + +1. **Dockerfile syntax validation** — `docker buildx build --check` on all three Dockerfiles +2. **Script syntax validation** — `bash -n` on all critical deployment scripts +3. **Dry-run of worker scripts** — `WORKER_LIFECYCLE_DRY_RUN=true bash start-workers.sh` and `drain-workers.sh` +4. **Required secrets presence** — validates all secrets are non-empty (not their values) + +If preflight fails, the workflow aborts before any build starts. This prevents half-built artifacts on failed configuration. + +### Phase 2: Release Preparation + +**Job: `prepare-release`** + +1. Checkout the target `branch` +2. If `git_hash` is provided: fetch and detach to that SHA; otherwise use HEAD of `branch` +3. Resolve `git_hash` via `git rev-parse HEAD` +4. Determine `deploy_env` from branch (main → production, else staging) +5. Derive image suffixes and repository names +6. Generate `release_id` (timestamp-based) +7. Resolve `worker_replicas` (from workflow input, then secret, then default of 1) +8. Compute ECR registry URL +9. **Send `deployment_started` notification** + +Outputs from this job (propagated to all downstream jobs via `needs.prepare-release.outputs`): +- `branch`, `git_hash`, `release_id`, `deploy_env`, `image_suffix` +- `web_repo`, `api_repo`, `worker_repo`, `worker_replicas`, `ecr_registry` + +### Phase 3: Deployment Locking + +**Job: `acquire-lock`** + +S3 deployment locking is currently disabled in `deploy-all.yml`. The job is retained as a no-op so existing job dependencies continue to run in the same order, but it does not call `acquire-lock.sh` and it emits an empty lock token. 
+ +GitHub Actions environment-scoped concurrency groups are now the primary concurrency control for normal unified deployments. Same-environment deployments queue behind each other through the workflow `concurrency` block. + +The lock scripts remain in the repository for future use and manual operator workflows, but the unified deployment pipeline does not acquire, refresh, or release S3 locks during normal execution. + +Historical lock payload format (not currently written by `deploy-all.yml`): + +Lock payload (written to `s3://cal-id/deployment-{env}/locks/{env}.lock`): +```json +{ + "environment": "production", + "owner": "github-{run_id}", + "token": "{timestamp}-{pid}", + "git_sha": "{sha}", + "actor": "{github_actor}", + "run_id": "{run_id}", + "acquired_at": "2025-05-26T14:32:00Z", + "acquired_at_epoch": 1748269920, + "expires_at": "2025-05-26T16:32:00Z", + "expires_at_epoch": 1748277120, + "status": "held" +} +``` + +- **TTL**: 7200 seconds (2 hours) — when enabled, extended by `refresh-lock.sh` throughout the pipeline +- **Token**: a timestamp-pid string; matched on refresh and release +- **Verification**: lock is read back after write and token is confirmed to match + +Because S3 lock acquisition is disabled, lock contention no longer aborts `deploy-all.yml`. Same-environment serialization is handled by GitHub Actions concurrency.
+ +### Phase 4: SHA Guarantee Mechanism + +Every job that checks out code on the EC2 host performs a SHA verification: + +```bash +git checkout --detach "$GIT_HASH" +checked_out_sha="$(git rev-parse HEAD)" +if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 +fi +``` + +This is enforced in: +- `migrate-db` (via SSH) +- `deploy-api` (via SSH) +- `deploy-web` (via SSH) +- `promote-all` (via SSH) +- `deploy-worker` (via SSH) +- `rollback-after-promotion` (via SSH) +- `rollback.sh` (manual rollback, via SSH) + +The workflow-level SHA guarantee: each build job checks out at `ref: ${{ needs.prepare-release.outputs.git_hash }}`, so Docker builds use the exact resolved SHA. This SHA is propagated through all downstream jobs and host-side scripts. + +### Phase 5: Image Build + +**Jobs: `build-web`, `build-api`, `build-worker`** (parallel) + +Each job: +1. Checks out at the resolved `git_hash` (not the branch HEAD) +2. Verifies the checkout matches the expected SHA +3. Checks whether the image already exists in ECR (skipped if `rebuild=true`) +4. If not present: builds and pushes the Docker image with the SHA tag +5. Outputs the full image URL: `{ecr_registry}/{repo}:{sha}` + +Builds use `linux/amd64` platform. Web build passes all `NEXT_PUBLIC_*` build args. API and Worker builds pass `GIT_HASH` and Sentry upload secrets. + +If an image already exists (and `rebuild=false`), the build is skipped — this supports resumable deployments where one service fails but its image already exists. + +### Phase 6: State Recording — Build + +**Job: `record-build-state`** + +Runs after all three build jobs complete. 
Records to S3: + +``` +State: staged +Key: s3://cal-id/deployment-{env}/manifests/sha-{sha}.json +Key: s3://cal-id/deployment-{env}/deployments/history/sha-{sha}.json +Note: current.json is NOT updated at this stage +``` + +The deployment lock refresh step is currently skipped because S3 locking is disabled; GitHub Actions concurrency is the primary concurrency control. + +### Phase 7: Database Migration + +**Job: `migrate-db`** + +Runs on the EC2 host via SSH. Steps: +1. Clone (or reuse existing checkout) of the target `BRANCH_NAME` at `GIT_HASH` +2. Verify SHA matches (host-side guard) +3. Run optional database backup (if `ENABLE_DB_BACKUP=true` and `DB_BACKUP_COMMAND` is set) +4. Install dependencies (if `node_modules` not present; uses corepack/yarn) +5. Run `yarn prisma migrate status` before migration (capture applied count) +6. Run `yarn db-deploy` (Prisma migrate deploy) with a timeout +7. Run `yarn prisma migrate status` after migration (capture new count) +8. Compute new migrations applied = post_count - pre_count +9. Output `MIGRATIONS_APPLIED_JSON` and `MIGRATION_COUNT` for workflow capture + +**Prerequisite**: Prisma schema at `packages/prisma/schema.prisma` must exist in the checkout. `prisma migrate deploy` does NOT require `prisma generate` — it reads migration SQL files directly. + +**Migration timeout**: Default 600 seconds; configurable via `MIGRATION_TIMEOUT_SECONDS`. + +**Idempotency**: Prisma tracks applied migrations in `_prisma_migrations`; re-running skips already-applied migrations. + +If migration fails, the workflow aborts. No services are promoted. + +### Phase 8: Staging — API Candidate + +**Job: `deploy-api`** + +On the EC2 host: +1. Authenticate Docker to ECR +2. Run `preflight-space.sh` (disk space guard) +3. 
Determine the inactive API port (blue/green): + - Read active port from `cal-id.conf` (unified) or `connector.conf` (legacy) + - `3001` (active web) → `4100` (active API); inactive API = `4101` + - `3002` (active web) → `4101` (active API); inactive API = `4100` +4. Remove any stale `api-candidate` container +5. Start `api-candidate` on the inactive port (env-file from `/home/onehash/.env`) +6. Poll `/health` up to 30 attempts × 10s interval +7. On failure: collect logs, remove container, exit 1 + +NGINX is **not** modified in this step. + +### Phase 9: Staging — Web Candidate + +**Job: `deploy-web`** + +On the EC2 host: +1. Authenticate Docker to ECR +2. Run `preflight-space.sh` +3. Determine the inactive Web port (blue/green): + - Read active port from `cal-id.conf` (unified) or `default.conf` (legacy) + - Use the opposite port for the candidate +4. Remove any stale `web-candidate` container +5. Start `web-candidate` on the inactive port (env-file from `/home/onehash/.env`) +6. Poll `/api/health` up to 30 attempts × 30s interval +7. On health success: extract static assets from the candidate container: + - Copy `/app/apps/web/.next/static` → `build-{sha}/_next/static` + - Copy `/app/apps/web/public` → `build-{sha}/public` + - Symlink `/var/www/cal-id-static/candidate` → `build-{sha}` +8. On failure: collect logs, remove container, exit 1 + +NGINX is **not** modified in this step. + +### Phase 10: Promotion — Web + API + +**Job: `promote-all`** + +On the EC2 host (via SSH): +1. **Backup**: copy current NGINX configs and static symlink target to `/tmp/cal-id-nginx-previous/` +2. **Verify candidates are running**: check `web-candidate` and `api-candidate` containers exist +3. **Health check candidates** on their staging ports +4. **Verify candidate static assets exist** at `/var/www/cal-id-static/candidate` +5. 
**Generate combined `cal-id.conf`** by concatenating the three templates (web, connector, MCP) with placeholders replaced: + - `PORT_PLACEHOLDER` → candidate ports + - `DOMAIN_PLACEHOLDER` → `DOMAIN_NAME` + - `HOMEPAGE_PLACEHOLDER` → `HOMEPAGE_URL` +6. **Atomic switch**: `ngx_switch_config_and_static()`: + - Remove legacy standalone configs (`default.conf`, `connector.conf`, `mcp.conf`) + - Copy candidate `cal-id.conf` to `/etc/nginx/conf.d/cal-id.conf` + - Switch `current` symlink to the candidate build directory + - `nginx -t` (validate) + - `nginx -s reload` (atomic config reload) +7. **Cleanup**: remove old static build directories (keep 2 most recent plus active) +8. **Stop candidates**: remove `web-candidate` and `api-candidate` containers + +If any step fails, the previous config and static symlink are restored from backup, and the workflow fails. + +**Revert on failure** (`revert-nginx.sh`): If `promote-all` fails, `revert-nginx.sh` is called via the workflow's `failure()` trigger. It restores the backed-up NGINX config and static symlink, then stops candidates. + +### Phase 11: State Recording — Promoted + +**Job: `record-promoted-state`** + +After `promote-all` completes successfully: + +``` +State: promoted_pending_verification +Key: s3://cal-id/deployment-{env}/manifests/sha-{sha}.json +Key: s3://cal-id/deployment-{env}/deployments/history/sha-{sha}.json +Note: current.json is NOT updated yet — workers are not verified +``` + +The `current.json` update is intentionally deferred. If worker deployment fails after promotion, the rollback step will revert NGINX to the previous release (restoring the previous `current.json`). + +### Phase 12: Worker Lifecycle + +**Job: `deploy-worker`** + +On the EC2 host (via SSH): +1. Authenticate Docker to ECR +2. Run `preflight-space.sh` +3. Run schema validation (runs `npx prisma migrate status` in the worker image; failures are non-fatal) +4. Run Redis connectivity check (non-blocking; failures are non-fatal) +5. 
Start `worker-new-{1..n}` containers (number = `WORKER_REPLICAS`) +6. Health check each: look for "Workers are up" in logs, check for fatal/panic errors +7. Run `drain-workers.sh`: + - Verify at least one `worker-new-*` container is healthy before draining + - Send `docker stop -t 180` (graceful timeout) to old `worker-*` containers one-by-one + - Rename remaining healthy `worker-new-*` to canonical `worker-{n}` + - Prune old Docker images (72h filter) + +Worker draining uses `GRACEFUL_STOP_TIMEOUT=180` seconds as the timeout for the `docker stop` command. This ensures BullMQ workers have time to finish in-flight jobs before the container is killed. + +If any worker fails health checks, the failed candidates are removed and the workflow fails. **Old workers are NOT touched** if all new workers fail to start — this preserves the running system and avoids an outage. + +### Phase 13: Automatic Rollback (if needed) + +**Job: `rollback-after-promotion`** + +Triggered when: +- `promote-all` succeeded +- AND (`record-promoted-state` failed OR `deploy-worker` failed) + +Sequence: +1. Read `current.json` from S3 to get the previous release SHA and image URLs +2. Verify previous release images exist in ECR (all three services) +3. Run `rollback.sh` on the EC2 host with the previous release's images and SHA +4. If rollback fails, try `revert-nginx.sh` as last resort + +`rollback.sh` (app-only rollback): +- Validates schema compatibility via `validate-rollback-schema.sh` +- Stages previous Web, API, and Worker images as candidates +- Runs `promote-all` to switch NGINX to previous release +- Starts previous workers and drains current workers +- Records `rolled_back` state to S3 (updates `current.json`) + +**Rollback is app-only**: it does not run `down.sql` or reverse database migrations. The schema compatibility validator blocks rollback if the database has applied migrations newer than the rollback target that contain CONTRACT patterns.
+ +### Phase 14: Verification and Final State + +**Job: `verify`** + +After all services are confirmed healthy: + +``` +State: current +Key: s3://cal-id/deployment-{env}/manifests/sha-{sha}.json +Key: s3://cal-id/deployment-{env}/deployments/history/sha-{sha}.json +Key: s3://cal-id/deployment-{env}/deployments/current.json ← NOW UPDATED +``` + +Only the `verify` job (successful completion) writes `current.json`. This means: +- A deployment where Web/API promote but worker fails: `current.json` still points to the previous release +- The rollback step restores the previous release as current +- A deployment that fails before promotion: `current.json` is unchanged + +### Phase 15: Lock Release + +**Job: `release-lock`** + +Currently disabled with `if: ${{ false }}` because `deploy-all.yml` does not acquire an S3 lock. GitHub Actions environment-scoped concurrency is the primary concurrency control. + +`release-lock.sh` remains in the repository. If S3 locking is re-enabled later, `FORCE_RELEASE=true` bypasses ownership check for manual operator cleanup after verifying the lock has expired or the holder crashed. + +### Phase 16: Deployment Report + +**Job: `deployment-report`** + +Always runs (`if: always()`), even on failure. Performs: +1. SSH into EC2 host: collect active NGINX upstreams and recent deployment/container logs +2. Redact sensitive values from logs (API keys, tokens, passwords) +3. Write a GitHub Actions step summary with job status table, images, rollback info, and log snippet +4. Send final notification email via SendGrid + +--- + +## 5. 
Release SHA Guarantees + +The SHA guarantee is enforced at four layers: + +| Layer | Mechanism | Where | +|---|---|---| +| Workflow checkout | `ref: git_hash` on every build job | GitHub Actions | +| Docker build tag | `{ecr_registry}/{repo}:{git_hash}` | Build jobs | +| Host checkout verification | `git rev-parse HEAD` compare | Every SSH script | +| Image reference propagation | SHA passed as env var to all SSH scripts | deploy-all.yml | + +The SHA is resolved once in `prepare-release` and propagated as an output to every downstream job. No job uses `github.sha` as the deployment reference — only the resolved `git_hash` from `prepare-release`. + +--- + +## 6. Staging & Promotion Details + +### Blue/Green Port Selection + +| Active Web Port | Active API Port | Candidate Web Port | Candidate API Port | +|---|---|---|---| +| 3001 | 4100 | 3002 | 4101 | +| 3002 | 4101 | 3001 | 4100 | + +Port selection is driven by the **active** NGINX config (read from `cal-id.conf` or its backup). Candidates always use the inactive ports. + +### NGINX Config Switch (Atomic) + +The `ngx_switch_config_and_static()` function in `ngx-utils.sh` performs a single atomic switch: +1. Remove legacy standalone configs +2. Copy candidate `cal-id.conf` to active directory +3. Switch `current` symlink to candidate build +4. `nginx -t` (validate) +5. `nginx -s reload` (reload) + +If `nginx -t` or `nginx -s reload` fails, both config and static symlink are restored from backup before exiting with an error. + +### Static Asset Handling + +Static assets are built into the Docker image's `/app/apps/web/.next/static` and `/app/apps/web/public` directories. After the web candidate container is healthy, these are copied to `/var/www/cal-id-static/build-{sha}/` on the host. A symlink chain: + +``` +/var/www/cal-id-static/candidate → build-{sha} +/var/www/cal-id-static/current → build-{previous-sha} +``` + +During promotion, `current` is switched to point to the candidate build.
NGINX serves static assets from `current/_next/static/` directly from disk — no container involved. + +### Health Validation + +Before switching NGINX, `promote-all.sh` performs HTTP health checks on both candidates: +- Web: `GET http://127.0.0.1:{candidate_port}/api/health` → expects HTTP 200 +- API: `GET http://127.0.0.1:{candidate_port}/health` → expects HTTP 200 + +These are curl calls with a 10-second timeout. Any non-200 response causes promotion to abort. + +--- + +## 7. Worker Lifecycle Details + +### Startup Sequence + +``` +1. Authenticate Docker to ECR +2. Run preflight-space.sh (disk space guard) +3. Run schema validation (npx prisma migrate status in worker image) +4. Run Redis connectivity check (non-blocking) +5. Start worker-new-1 ... worker-new-N containers +6. Health check each: + a. Docker container status = running + b. Logs contain "Workers are up" + c. No fatal/panic errors in recent logs + d. (Optional) Redis ping succeeds +7. Run drain-workers.sh: + a. Verify worker-new-* containers exist + b. Gracefully stop old worker-* containers (180s timeout) + c. Rename worker-new-* → worker-* + d. Prune old Docker images (72h filter) +``` + +### Health Check Details + +Worker health checks wait for the string "Workers are up" in the container logs — this is the BullMQ signal that all processors have initialized. The check also fails on presence of `fatal`, `panic`, or `ECONNREFUSED` patterns in recent log lines. + +Up to 20 attempts × 15 seconds = 300 seconds per worker. Global startup deadline: 300 seconds from first start attempt. + +### Graceful Drain + +Old workers receive `docker stop -t 180`, giving BullMQ's `worker.close()` up to 180 seconds to finish in-flight jobs before SIGKILL. Workers are drained one at a time (not in parallel) to maintain queue processing capacity during rollover. + +### Abort Guard + +If zero `worker-new-*` containers pass health checks, old workers are **not** touched. 
The script exits with an error, triggering automatic rollback of Web/API. + +### Worker Memory & Node Options + +| Setting | Default | +|---|---| +| Memory limit | 1024m | +| Memory swap | 1024m | +| `NODE_OPTIONS` | `--max-old-space-size=768` | + +--- + +## 8. Deployment State Machine + +### S3 State Object Structure + +All state objects share this JSON schema: + +```json +{ + "release_id": "v20250526-143200", + "sha": "a1b2c3d4e5f6...", + "timestamp": "2025-05-26T14:32:00Z", + "environment": "production|staging", + "status": "staged|promoted_pending_verification|current|failed|rollback_started|rolled_back", + "run_id": 123456789, + "actor": "username", + "services": { + "web": "{ecr_registry}/cal_prod:a1b2c3d4...", + "api": "{ecr_registry}/cal_api_prod:a1b2c3d4...", + "worker": "{ecr_registry}/cal_worker_prod:a1b2c3d4..." + }, + "migrations_applied": ["20250601_add_foo", "20250602_add_bar"], + "rollback": null | { + "source_sha": "a1b2c3d4...", + "target_sha": "e5f6a7b8..." + } +} +``` + +### S3 Key Layout + +| Key | Written by | Content | Updated for terminal states? 
| +|---|---|---|---| +| `deployment-{env}/locks/{env}.lock` | `acquire-lock.sh` | Lock holder metadata | N/A | +| `deployment-{env}/manifests/sha-{sha}.json` | `record-state.sh` | Per-SHA immutable record | No | +| `deployment-{env}/deployments/history/sha-{sha}.json` | `record-state.sh` | Per-SHA immutable record | No | +| `deployment-{env}/deployments/current.json` | `record-state.sh` (terminal states only) | Live release pointer | Yes | + +### State Transition Table + +| Job | State Written | Manifest | History | current.json | Notes | +|---|---|---|---|---|---| +| `record-build-state` | `staged` | Yes | Yes | No | Images built, not promoted | +| `record-promoted-state` | `promoted_pending_verification` | Yes | Yes | No | Web/API live, workers unverified | +| `verify` | `current` | Yes | Yes | **Yes** | All services verified | +| `rollback.sh` (app rollback) | `rollback_started` then `rolled_back` | Yes | Yes | **Yes** (rolled_back only) | Also writes source SHA as `failed` | +| `rollback.sh` (rollback target) | `rolled_back` | Yes | Yes | **Yes** | Target becomes new current | + +### `current.json` Update Policy + +Only terminal states update `current.json`: +- `current` — new release fully deployed and verified +- `rolled_back` — previous release restored as current + +Non-terminal states (`staged`, `promoted_pending_verification`) do NOT update `current.json`. This prevents a partially-deployed release from claiming "current" status. + +The `current.json` update uses an ETag-based conditional write (S3 `PUT` with `If-Match: {etag}`). If another writer has already updated `current.json`, the write fails silently and the pipeline continues. This prevents a stale writer from overwriting a newer current state. + +--- + +## 9. 
Deployment Concurrency + +### Current Model + +**Primary control: GitHub Actions Concurrency Groups** + +``` +concurrency: + group: deploy-all-${{ production-or-staging }} + cancel-in-progress: false +``` + +Same-environment deployments queue behind each other. Production and staging are independent queues. + +**Temporarily disabled: S3 Lock File** + +S3 lock acquisition, refresh, and release are currently disabled in `deploy-all.yml`. GitHub Actions concurrency is the primary concurrency control for normal unified deployments. + +The lock scripts are intentionally kept in `infra/scripts/` for future re-enablement or manual operator use. When enabled, the S3 lock uses conditional writes (`--if-none-match '*'`) so only one writer can acquire it. The lock includes: +- `owner` (GitHub run ID) +- `token` (timestamp-pid) +- `git_sha` (SHA being deployed) +- `actor` (username) +- `expires_at_epoch` (TTL) + +### Disabled Lock Lifecycle + +``` +acquire-lock.sh → Creates s3://cal-id/deployment-{env}/locks/{env}.lock +refresh-lock.sh → Extends TTL (called before each major phase) +[deployment runs] +release-lock.sh → Deletes the lock (only if token matches) +``` + +In the current workflow, these calls are skipped: +- `acquire-lock` emits an empty lock token and does not write to S3 +- `refresh-lock.sh` steps are guarded with `if: ${{ false }}` +- `release-lock` is guarded with `if: ${{ false }}` + +When S3 locking is re-enabled, lock refresh would happen before each of: +- `record-build-state` +- `migrate-db` +- `record-promoted-state` +- `deploy-worker` +- `verify` + +### Lock TTL Management + +S3 lock TTL management is not active while locking is disabled. + +- **Default TTL**: 1800s (30 minutes) +- **Pipeline refresh TTL**: 7200s (2 hours) +- **Refresh frequency**: once before each major job + +If S3 locking is re-enabled and a workflow fails, a subsequent deployment can acquire the lock after the previous lock is released or manually cleaned up. 
Manual lock cleanup via `FORCE_RELEASE=true` in `release-lock.sh` remains available for operators. + +### Lock Token Semantics + +- `acquire-lock.sh` generates `LOCK_TOKEN = {timestamp}-{pid}` +- `refresh-lock.sh` requires `LOCK_TOKEN` to match the lock's stored token +- `release-lock.sh` requires `LOCK_TOKEN` to match before deletion +- Only the lock holder (matching token) can refresh or release + +### Lock Failure Handling + +When S3 locking is enabled, if `acquire-lock.sh` fails because the lock is held by another deployment: +- Error message includes: owner, actor, run_id, git_sha, expiration time +- Workflow aborts before any build +- No cleanup needed — lock will expire naturally or be released by the holder + +In the current disabled state, `deploy-all.yml` does not call `acquire-lock.sh`, so this failure mode does not occur in normal deployments. + +--- + +## 10. Migration Model + +### Forward Migration Only + +Cal-ID's deployment model uses **forward-only migrations** with an expand/contract pattern: + +1. **Expand**: Add new tables/columns (backward-compatible) +2. **Deploy**: New application code using the new schema +3. **Contract**: After all instances run the new code, drop the old columns/tables + +**No `down.sql` is executed during rollback.** The rollback workflow does not run database migrations in reverse. + +### Migration Execution + +On the EC2 host, `migrate.sh` runs: +1. Clones/checkouts the target `GIT_HASH` at `REPO_ROOT` (`/home/onehash/onehash-cal`) +2. Verifies the checked-out SHA matches `GIT_HASH` (host-side guard) +3. Runs optional backup (if `ENABLE_DB_BACKUP=true`) +4. Runs `yarn db-deploy` (which calls `prisma migrate deploy`) +5. Verifies post-migration state via `prisma migrate status` + +### Migration Locking + +The script sets `lock_timeout = '30s'` at the session level before running `db-deploy`. This is a Postgres parameter. If a migration blocks on a lock for more than 30 seconds, it fails rather than waiting indefinitely. 
+ +### Migration Timeout + +Default: 600 seconds. Configurable via `MIGRATION_TIMEOUT_SECONDS`. The caller (workflow) has a `command_timeout: 1200s` (20 minutes) as a hard outer bound. + +### Migration Output + +The script outputs two variables to stdout for workflow capture: +``` +MIGRATIONS_APPLIED_JSON=["20250526_add_booking_index", "20250526_add_user_prefs"] +MIGRATION_COUNT=2 +``` + +This is parsed in the workflow step and passed to `record-state.sh`. + +### App-Only Rollback — Schema Compatibility Check + +Before any rollback, `validate-rollback-schema.sh` runs: +1. Clones the `BRANCH_NAME` at the target SHA and the current branch HEAD +2. Lists migrations in the target checkout +3. Queries `_prisma_migrations` for applied migrations +4. Computes `comm -13 target_migrations applied_migrations` — migrations in DB but not in target +5. For each such migration, checks for CONTRACT patterns: + - `DROP TABLE` + - `DROP COLUMN` + - `RENAME COLUMN` + - `ALTER COLUMN TYPE` + - `SET NOT NULL` without `DEFAULT` +6. If any CONTRACT pattern is found: **blocks the rollback** + +This ensures that an app-only rollback cannot be performed if the database has migrations that the rollback target app cannot understand. + +### Migration Policy (PR Validation) + +`validate-migration.yml` runs on PRs touching `packages/prisma/migrations/**`. It classifies migrations: + +| Pattern | Classification | Action | +|---|---|---| +| `DROP TABLE` | CONTRACT | Blocks PR | +| `DROP COLUMN` | CONTRACT | Blocks PR | +| `RENAME COLUMN` | CONTRACT | Blocks PR | +| `ALTER COLUMN TYPE` | CONTRACT | Blocks PR | +| `SET NOT NULL` without `DEFAULT` | CONTRACT | Blocks PR | +| `CREATE TABLE`, `CREATE INDEX`, `ADD COLUMN` | EXPAND | Allowed with notice | +| `UPDATE`, `DELETE`, `INSERT...SELECT` | DATA-BACKFILL | Allowed with owner/validation plan | + +--- + +## 11. 
Rollback + +### Automatic Rollback Triggers + +Automatic rollback is triggered when `promote-all` succeeds but either `record-promoted-state` or `deploy-worker` fails. The rollback reverts Web and API to the previous release via NGINX revert. + +**Failure before promotion**: No automatic rollback occurs. Web/API are still running the previous release — nothing to revert. The operator is notified of the failure. + +### Rollback Flow (Automatic) + +```mermaid +flowchart TD + A[promote-all succeeds] --> B{record-promoted-state + deploy-worker} + B -->|success| C[verify → current] + B -->|failure| D[rollback-after-promotion triggered] + D --> E[Read current.json → previous SHA] + E --> F{Previous images in ECR?} + F -->|no| G[Rollback skipped: missing images] + F -->|yes| H[Run rollback.sh on EC2 host] + H --> I{rollback.sh succeeded?} + I -->|yes| J[Notify: rollback succeeded] + I -->|no| K[revert-nginx.sh as last resort] + K --> L[Notify: rollback failed] + G --> L +``` + +### Manual Rollback (rollback.yml) + +The `rollback.yml` workflow provides a manual rollback entrypoint: + +1. **`validate-schema`** job: Runs `validate-rollback-schema.sh` against the target VPC +2. **`verify-images`** job: Verifies all three images exist in ECR for `target_sha` +3. **`rollback-app`** job (if `dry_run=false`): Runs `rollback.sh` on the EC2 host +4. **`rollback-report`** job: Collects NGINX state and host logs, sends notification + +Inputs: +- `branch` (default: main) — scripts branch +- `target_sha` — SHA to roll back to +- `environment` — staging or production +- `dry_run` (default: true) — validate without running host rollback + +### App-Only Rollback Sequence + +`rollback.sh` performs: +1. **Validate schema**: `validate-rollback-schema.sh` (blocks if DB has newer CONTRACT migrations) +2. **Stage previous images**: `stage-api.sh` and `stage-web.sh` with previous release images +3. **Promote**: `promote-all.sh` switches NGINX to previous release +4. 
**Start workers**: `start-workers.sh` with previous release image +5. **Drain old workers**: `drain-workers.sh` +6. **Record state**: Writes `rolled_back` to S3 (updates `current.json`) + +**App-only**: No database migrations are run. The rollback brings the application layer back to the previous SHA, but the database stays at whatever migration level it reached. + +### Rollback Status Values + +| Status | Meaning | +|---|---| +| `succeeded` | Rollback completed; previous release is now live | +| `skipped_no_previous_release` | No `current.json` found; cannot determine target | +| `skipped_missing_images` | Previous release images not found in ECR | +| `failed` | Rollback script exited with non-zero; revert may not have completed | + +--- + +## 12. Notifications + +### Notification Events + +| Event | Trigger | Email Contents | +|---|---|---| +| `deployment_started` | `prepare-release` job | Release ID, SHA, branch, actor, environment | +| `deployment_succeeded` | `verify` job succeeds | All images, job status table, NGINX upstreams, logs | +| `deployment_failed_before_promotion` | `verify` fails, `promote-all` failed/skipped | Failure reason, job status, logs | +| `deployment_failed_after_promotion_rollback_succeeded` | `verify` fails, rollback succeeded | Rollback target, images, upstreams, logs | +| `deployment_failed_after_promotion_rollback_failed` | `verify` fails, rollback failed | Failure reason, rollback status, logs | +| `manual_rollback_succeeded` | `rollback.yml` completes successfully | All rollback details, NGINX state, logs | +| `manual_rollback_failed` | `rollback.yml` fails | Failure reason, logs | + +### Email Fields + +All emails include: +- Environment, Release ID, Git SHA, Branch, Actor, Workflow Run URL +- All three service images (Web, API, Worker) +- NGINX active upstreams (from `cal-id.conf`) +- Service deployment status (Web/API/Worker job results) +- Migration count (if applicable) +- Rollback status and target SHA (if applicable) +- 
Failure reason (if applicable) +- Redacted log snippet (last 220 lines; secrets stripped) + +### Non-Blocking Behavior + +`send-email.sh` is non-blocking by default: if SendGrid returns an error, it logs a warning and exits 0. Email delivery failure does not cause the workflow to fail. Operators can monitor SendGrid delivery independently. + +The only exception: if `SEND_EMAIL_REQUIRED=true` is set (not used by any current caller), the script exits with an error. + +--- + +## 13. Disk Space & Cleanup + +### Preflight Space Check + +Before pulling images or starting containers, `preflight-space.sh` checks available disk space. Default threshold: **10 GB minimum free**. + +If space is below the threshold, controlled cleanup runs: + +1. **Collect protected items**: + - Containers currently routed by NGINX (read from `cal-id.conf` or legacy configs) + - Containers named `worker-*`, `web-candidate`, `api-candidate` + - Images referenced in `current.json` (from S3 deployment state) + - Containers running images that are in the protected image list + +2. **Remove stale containers**: exited, created, or dead containers not in the protected list + +3. **Prune Docker images**: older than 72 hours, not referenced by any protected container + +4. **Prune Docker build cache**: older than 72 hours + +5. **Prune Docker volumes**: unused volumes + +6. **Clean static builds**: remove old static build directories, keeping active, candidate, and 2 most recent + +### Post-Deployment Cleanup + +After a successful deployment, `drain-workers.sh` prunes Docker images older than 72 hours. The 72h filter avoids removing images that might be needed for a quick rollback. + +### What is NOT Cleaned Up + +- Images currently referenced by active containers (routed by NGINX or running workers) +- Images referenced by `current.json` in S3 +- Images pushed in the last 72 hours +- Active static build directories + +--- + +## 14. 
Rollback Schema Validation + +### When It Runs + +- **Automatic rollback**: `rollback-after-promotion` calls `rollback.sh`, which calls `validate-rollback-schema.sh` before staging +- **Manual rollback**: `rollback.yml`'s `validate-schema` job calls `validate-rollback-schema.sh` on the target VPC + +### What It Checks + +1. Clone target SHA and current branch HEAD to temp directory +2. List migrations in target checkout (`packages/prisma/migrations/`) +3. Query applied migrations from database (`_prisma_migrations` table) +4. Compute: migrations in DB but NOT in target (i.e., newer than target) +5. For each such migration, scan `migration.sql` for CONTRACT patterns +6. If any CONTRACT pattern found: block the rollback + +### CONTRACT Patterns Blocked + +- `DROP TABLE` +- `DROP COLUMN` +- `ALTER TABLE ... DROP COLUMN` +- `RENAME COLUMN` +- `ALTER COLUMN ... TYPE` +- `SET NOT NULL` without `DEFAULT` in the same statement + +### What Happens When It Blocks + +The script exits 1. The rollback workflow fails at the schema validation step. The operator sees a message like: +``` +BLOCKED: applied migration newer than target contains CONTRACT pattern: 20250526_remove_old_index +Schema incompatible with app-only rollback target {sha} +``` + +### When It Does NOT Block + +- EXPAND patterns (CREATE TABLE, CREATE INDEX, ADD COLUMN) are allowed +- Migrations that are in the target checkout (i.e., the rollback target knows about them) +- Migrations with no CONTRACT patterns + +### Prerequisites + +The script requires: `git`, `psql`, `comm`. `psql` must be able to connect to the database using `DATABASE_URL`. + +--- + +## 15. Operational Procedures + +### Pre-Deployment Checklist (Operator) + +Before staging or production deploy: + +1. **Confirm the target branch** is correct (main for production, develop for staging) +2. 
**Verify required secrets are set** in GitHub Actions repository settings: + - `AWS_REGION`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_ACCOUNT_ID` + - `EC2_HOST_PROD`, `EC2_SSH_KEY_PROD` (production) or `EC2_HOST_STAG`, `EC2_SSH_KEY_STAG` (staging) + - `DATABASE_DIRECT_URL_PROD` (production) or `DATABASE_DIRECT_URL_STAG` (staging) + - `DOMAIN_NAME_PROD`, `HOMEPAGE_URL_PROD` (production) or staging equivalents + - `CERTBOT_EMAIL`, `REPO_URL`, `SENDGRID_API_KEY`, `EMAIL_FROM`, `EMAIL_TO` + - All `NEXT_PUBLIC_*` variables for the target environment + - `WORKER_REPLICAS_PROD/STAG` (if using multiple workers) +3. **Check no other deployment is in progress**: inspect the GitHub Actions concurrency queue for `Deploy All Services`. S3 lock inspection is not required for normal deploys while S3 locking is disabled. +4. **For production deploys**: confirm the `main` branch HEAD is at the intended SHA (can be overridden with `git_hash` input) +5. **For deploys with migrations**: confirm the PR has been reviewed by someone who understands the migration impact; check `validate-migration.yml` has passed + +### Triggering a Deployment + +**Automated** (on PR merge to main or develop for relevant paths): +```yaml +# Paths that trigger deploy-all.yml: +apps/web/**, apps/connector/**, apps/worker/**, packages/**, +Dockerfile, entrypoint.sh, +apps/connector/Dockerfile, apps/worker/Dockerfile, +infra/docker/**, infra/scripts/**, .github/workflows/deploy-all.yml +``` + +**Manual** (workflow_dispatch): +1. Go to GitHub Actions → `Deploy All Services` +2. Click "Run workflow" +3. Set `branch` (default: develop), optionally override `git_hash` +4. Set `rebuild` (default: false) +5. Optionally override `worker_replicas` +6. Click "Run" + +### Manual Rollback Procedure + +1. Go to GitHub Actions → `App Rollback` +2. Click "Run workflow" +3. 
Set inputs: + - `branch`: `main` (or the branch with rollback scripts) + - `target_sha`: the SHA of the previous known-good release + - `environment`: `staging` or `production` + - `dry_run`: `true` first to validate, then `false` to execute +4. Click "Run" + +**Finding the previous SHA**: Look at `s3://cal-id/deployment-{env}/deployments/current.json` — the `sha` field is the current live release. After a rollback completes, this field holds the rollback target. + +**Finding the rollback source SHA**: If rolling back a failed deployment, the source is the SHA of the failed deployment (visible in the failed `deploy-all.yml` run). + +### Failure Scenario Walkthroughs + +#### Scenario 1: Build Failure + +- **Symptom**: One of build-web/build-api/build-worker fails +- **Impact**: No images pushed, no services affected +- **Recovery**: Fix the build error, re-run `deploy-all.yml`. If images partially exist, keep `rebuild=false` (the default) so builds whose images already exist are skipped and reused. + +#### Scenario 2: Migration Failure + +- **Symptom**: `migrate-db` job fails on EC2 host +- **Impact**: No services promoted; previous release still live +- **Recovery**: + 1. Check the migration logs in the GitHub Actions step output + 2. If the migration is fixable: fix the migration file, push, merge + 3. If the migration is already applied to staging/prod DB: the next deployment will skip it (Prisma idempotent) + 4. For production: verify the database is in a consistent state before retrying + +#### Scenario 3: Candidate Health Check Failure + +- **Symptom**: `deploy-api` or `deploy-web` fails at health check +- **Impact**: No services affected (candidates not yet routed) +- **Recovery**: The script removes the candidate container and exits 1. Fix the container issue (env var, missing file, etc.), then re-run from the deploy job (the build already succeeded; you can set `rebuild=false`).
+ +#### Scenario 4: NGINX Promotion Failure + +- **Symptom**: `promote-all` fails at `nginx -t` or `nginx -s reload` +- **Impact**: Previous release still live (config reverted from backup); no traffic switch +- **Recovery**: + 1. Check the error message (usually a config syntax issue in the generated `cal-id.conf`) + 2. If the error is in the template files, fix them and re-run + 3. The `revert-nginx.sh` was already called by the workflow's failure trigger + +#### Scenario 5: Worker Startup Failure + +- **Symptom**: `deploy-worker` fails with "Worker startup failed" +- **Impact**: Web/API are promoted (live); workers are the old version (still running) +- **Recovery**: `rollback-after-promotion` runs automatically. It reverts Web/API to the previous release. If automatic rollback fails, run manual rollback from `App Rollback` workflow. + +#### Scenario 6: Lock Conflict + +- **Current status**: Not expected in `deploy-all.yml` because S3 locking is disabled. +- **If S3 locking is re-enabled**: `acquire-lock` may fail with "Lock already held by..."; check the lock holder, wait for the active workflow, or use `FORCE_RELEASE=true` only after verifying the holder has crashed or expired. + +--- + +## 16. Troubleshooting & Recovery + +### Common Failure Modes and Diagnosis + +#### "Lock already held" on every attempt + +**Current status**: Not expected from normal `deploy-all.yml` runs while S3 locking is disabled. + +**Cause when S3 locking is re-enabled or used manually**: A previous deployment's lock is still held (workflow crashed or was cancelled mid-deploy). + +**Diagnosis**: +```bash +aws s3 cp s3://cal-id/deployment-prod/locks/prod.lock /tmp/lock.json +cat /tmp/lock.json | jq '{owner, actor, run_id, expires_at}' +``` + +**Recovery when S3 locking is re-enabled or used manually**: Wait for TTL expiration (default 2h from lock acquisition). 
Or manually: +```bash +# In release-lock.sh or directly via AWS CLI with FORCE_RELEASE +aws s3 rm s3://cal-id/deployment-prod/locks/prod.lock +``` + +#### "Image already exists" but deployment failed before promotion + +**Cause**: Build completed and pushed SHA-tagged images, but `migrate-db` or `deploy-api` failed. + +**Recovery**: Re-run with `rebuild=false` (skips builds, reuses existing images). The build jobs will be skipped and the pipeline continues from `migrate-db`. + +#### "Worker candidate failed health check" — workers rolled back + +**Cause**: Worker containers failed to reach "Workers are up" state within the health check window. + +**Diagnosis**: Check the worker logs in the `deploy-worker` step output. Look for: Redis connection failures, missing env vars, OOM kills, processor registration errors. + +#### "NGINX config test failed" during promotion + +**Cause**: The generated `cal-id.conf` has invalid NGINX syntax. Check the generated file in the SSH logs. + +**Diagnosis**: The `promote-all` step logs the generated config. Common issues: +- Invalid `proxy_pass` port (port mismatch between web and API templates) +- SSL certificate paths not matching actual cert directory names +- Syntax error in sed replacement + +#### "current.json update failed" in record-state.sh + +**Cause**: Another writer (concurrent deployment or automated rollback) already updated `current.json`. + +**Impact**: Low — the deployment continues normally. The new release's manifest and history are recorded. Only the `current.json` pointer was not updated. + +**Recovery**: Usually none needed. The S3 manifest at `manifests/sha-{sha}.json` is always written and can be used to reconstruct state. + +#### "Host checkout SHA does not match release SHA" + +**Cause**: Git fetch failed to retrieve the SHA on the EC2 host (shallow clone with insufficient depth, or SHA was force-pushed). + +**Diagnosis**: Check `git fetch origin {sha} --depth 1` in the SSH logs. 
+ +**Recovery**: The host-side checkout script already contains a guard that aborts if SHA mismatch. Fix the root cause (deep enough clone, or ensure the SHA exists in the remote). + +### Log Locations on EC2 Host + +| Log | Contents | +|---|---| +| `/home/onehash/deployment.log` | Main deploy script output (legacy) | +| `/home/onehash/api-deployment.log` | Connector API deployment output | +| `/home/onehash/worker-deployment.log` | Worker deployment output | +| `/home/onehash/api-debug.log` | Xtrace debug output from API deploy script | + +Container logs: `docker logs --tail 60 {container_name}` + +### Useful EC2 Commands + +```bash +# Check running containers +docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" + +# Check NGINX active config ports +grep -E "proxy_pass http://127" /etc/nginx/conf.d/cal-id.conf + +# Check static symlink targets +readlink -f /var/www/cal-id-static/current +readlink -f /var/www/cal-id-static/candidate + +# Check deployment lock only if S3 locking is re-enabled or a manual lock script was used +aws s3 cp s3://cal-id/deployment-prod/locks/prod.lock - | jq . + +# Check current deployment state +aws s3 cp s3://cal-id/deployment-prod/deployments/current.json - | jq . + +# View recent deployment logs +tail -100 /home/onehash/deployment.log + +# Check disk space +df -h / + +# Check Docker image space usage +docker system df +``` + +--- + +## 17. Required Configuration + +### GitHub Secrets + +#### AWS Core + +| Secret | Required | Description | +|---|---|---| +| `AWS_REGION` | Yes | AWS region (e.g., `us-east-1`) | +| `AWS_ACCESS_KEY_ID` | Yes | AWS access key ID | +| `AWS_SECRET_ACCESS_KEY` | Yes | AWS secret access key | +| `AWS_ACCOUNT_ID` | Yes | AWS account ID (used to construct ECR registry) | + +#### ECR Repositories (implicit) + +Repository names are derived from `image_suffix` (prod/stag) and are not stored as secrets. 
Repositories must exist before deployment: +- `cal_prod` / `cal_stag` — Web images +- `cal_api_prod` / `cal_api_stag` — Connector API images +- `cal_worker_prod` / `cal_worker_stag` — Worker images + +#### EC2 Hosts + +| Secret | Required | Notes | +|---|---|---| +| `EC2_HOST_PROD` | Yes (prod) | Production EC2 host address | +| `EC2_SSH_KEY_PROD` | Yes (prod) | SSH private key for production | +| `EC2_HOST_STAG` | Yes (stag) | Staging EC2 host address | +| `EC2_SSH_KEY_STAG` | Yes (stag) | SSH private key for staging | + +#### Domains & Email + +| Secret | Required | Notes | +|---|---|---| +| `DOMAIN_NAME_PROD` | Yes (prod) | Production domain (e.g., `cal.id`) | +| `DOMAIN_NAME_STAG` | Yes (stag) | Staging domain | +| `HOMEPAGE_URL_PROD` | Yes (prod) | Production Framer homepage URL | +| `HOMEPAGE_URL_STAG` | Yes (stag) | Staging Framer homepage URL | +| `CERTBOT_EMAIL` | Yes | Email for Let's Encrypt certificate notifications | +| `EMAIL_FROM` | No | Sender email (default: `alerts@cal.id`) | +| `EMAIL_TO` | No | Recipient email (default: `deployments@cal.id`) | +| `SENDGRID_API_KEY` | Yes | SendGrid API key for notifications | + +#### Database + +| Secret | Required | Notes | +|---|---|---| +| `DATABASE_DIRECT_URL_PROD` | Yes (prod) | Production database connection string | +| `DATABASE_DIRECT_URL_STAG` | Yes (stag) | Staging database connection string | + +#### Build Args (Web) + +All `NEXT_PUBLIC_*` secrets must be set for both environments. 
The preflight job validates presence for both prod and stag secrets simultaneously: + +| Secret | Notes | +|---|---| +| `NEXT_PUBLIC_GTM_ID_PROD`, `NEXT_PUBLIC_GTM_ID_STAG` | Google Tag Manager | +| `NEXT_PUBLIC_SENTRY_DSN_PROD`, `NEXT_PUBLIC_SENTRY_DSN_STAG` | Sentry DSN | +| `NEXT_PUBLIC_CALENDLY_CLIENT_ID_PROD`, `NEXT_PUBLIC_CALENDLY_CLIENT_ID_STAG` | Calendly OAuth | +| `NEXT_PUBLIC_CALENDLY_REDIRECT_URI_PROD`, `NEXT_PUBLIC_CALENDLY_REDIRECT_URI_STAG` | Calendly redirect | +| `NEXT_PUBLIC_STRIPE_PUBLIC_KEY_PROD`, `NEXT_PUBLIC_STRIPE_PUBLIC_KEY_STAG` | Stripe public key | +| `NEXT_PUBLIC_ONEHASH_CHAT_URL_PROD`, `NEXT_PUBLIC_ONEHASH_CHAT_URL_STAG` | Chat service URL | +| `NEXT_PUBLIC_RAZORPAY_CLIENT_ID_PROD`, `NEXT_PUBLIC_RAZORPAY_CLIENT_ID_STAG` | Razorpay client ID | +| `NEXT_PUBLIC_SIGNUP_URL_PROD`, `NEXT_PUBLIC_SIGNUP_URL_STAG` | Signup URL | +| `NEXT_PUBLIC_POSTHOG_KEY_PROD`, `NEXT_PUBLIC_POSTHOG_KEY_STAG` | PostHog project key | +| `NEXTAUTH_SECRET` | Single value (shared) | +| `CALENDSO_ENCRYPTION_KEY` | Single value (shared) | +| `NEXT_PUBLIC_API_V2_URL` | Single value (shared) | +| `NEXT_PUBLIC_META_WHATSAPP_BUSINESS_APP_ID` | Single value | +| `NEXT_PUBLIC_META_WHATSAPP_BUSINESS_CONFIG_ID` | Single value | +| `NEXT_PUBLIC_ONEHASH_URL` | Single value | +| `NEXT_PUBLIC_SENDGRID_SENDER_NAME` | Single value | +| `NEXT_PUBLIC_LOGGER_LEVEL` | Single value | +| `NEXT_PUBLIC_TEAM_IMPERSONATION` | Single value | +| `NEXT_PUBLIC_APP_NAME` | Single value | +| `BRAND_NAME` | Single value | +| `NEXT_PUBLIC_MINUTES_TO_BOOK` | Single value | +| `NEXT_PUBLIC_BOOKER_NUMBER_OF_DAYS_TO_LOAD` | Single value | +| `NEXT_PUBLIC_CALENDLY_OAUTH_URL` | Single value | +| `NEXT_PUBLIC_CALENDLY_API_BASE_URL` | Single value | +| `NEXT_PUBLIC_VAPID_PUBLIC_KEY` | Single value | +| `NEXT_PUBLIC_WEBSITE_TERMS_URL` | Single value | +| `NEXT_PUBLIC_WEBSITE_PRIVACY_POLICY_URL` | Single value | +| `NEXT_PUBLIC_RECAPTCHA_LOW`, `NEXT_PUBLIC_RECAPTCHA_MEDIUM`, `NEXT_PUBLIC_RECAPTCHA_HIGH` | 
Single values | +| `NEXT_PUBLIC_SUPPORT_MAIL_ADDRESS` | Single value | +| `NEXT_PUBLIC_POSTHOG_HOST` | Single value | +| `NEXT_PUBLIC_ONEHASH_ERP_URL` | Single value | +| `NEXT_PUBLIC_ONEHASH_CRM_URL` | Single value | +| `NEXT_PUBLIC_WHITELISTED_ORGS` | Single value | +| `NEXT_PUBLIC_PIXEL` | Single value | +| `NEXT_PUBLIC_CLOUDFLARE_SITEKEY` | Single value | + +#### Sentry (API/Worker) + +| Secret | Required | Notes | +|---|---|---| +| `SENTRY_AUTH_TOKEN` | Yes | Sentry auth token for upload | +| `SENTRY_ORG` | Yes | Sentry organization name | +| `SENTRY_PROJECT_PROD` | Yes (prod) | Sentry project for prod API/Worker | +| `SENTRY_PROJECT_STAG` | Yes (stag) | Sentry project for staging API/Worker | + +#### Repo & Workers + +| Secret | Required | Notes | +|---|---|---| +| `REPO_URL` | Yes | Git repository URL | +| `WORKER_REPLICAS_PROD` | No | Number of worker replicas (prod, default: 1) | +| `WORKER_REPLICAS_STAG` | No | Number of worker replicas (stag, default: 1) | + +#### Optional + +| Secret | Required | Notes | +|---|---|---| +| `ENABLE_DB_BACKUP` | No | Set to `true` to run backup before migration | +| `DB_BACKUP_COMMAND` | Conditional | Required when `ENABLE_DB_BACKUP=true` | +| `SSH_TARGET_FINGERPRINT` | No | SSH target fingerprint (legacy workflows only) | +| `DOCKERHUB_USERNAME`, `DOCKERHUB_TOKEN` | No | Legacy Docker Hub fallback (not used by deploy-all.yml) | + +### EC2 Host Prerequisites + +The EC2 host must have: +- Docker installed and running +- `git`, `curl`, `sudo`, `aws` CLI available in PATH +- `jq` installed +- `corepack` or `yarn` for dependency installation +- `certbot` installed (for SSL certificate management) +- Nginx installed and running +- SSH access via the configured key +- Sufficient disk space (preflight-space.sh enforces 10GB minimum before staging) +- AWS credentials configured (`aws configure` or environment) +- ECR login capability (via `aws ecr get-login-password`) +- `/home/onehash/.env` file with all required environment 
variables for the application +- `/home/onehash/onehash-cal` directory (created by git clone during first deployment) +- `/var/www/cal-id-static/` directory for static assets + +### Assumptions About Host State + +- The host already has the application `.env` file at `/home/onehash/.env` +- Nginx is already configured (at least with placeholder configs) +- The `onehash` user has sudo access (needed for nginx config writes, certbot, static directory) +- AWS credentials are configured for the `onehash` user (used by `aws` CLI in deployment scripts) +- Docker daemon is running and the `onehash` user has Docker access (user is in docker group) +- SSL certificates can be managed by certbot (DNS challenge credentials at `/etc/letsencrypt/dns-multi.ini`) + +--- + +## 18. Validation Coverage + +### Preflight Checks + +| Check | Tool/Method | Scope | +|---|---|---| +| Dockerfile syntax | `docker buildx build --check` | All three Dockerfiles | +| Bash script syntax | `bash -n` | All critical scripts | +| Worker lifecycle dry-run | `WORKER_LIFECYCLE_DRY_RUN=true bash start-workers.sh` | Worker staging logic | +| Worker drain dry-run | `WORKER_LIFECYCLE_DRY_RUN=true bash drain-workers.sh` | Worker drain logic | +| Secret presence | Shell check for empty strings | All required secrets | + +### Runtime Checks + +| Check | Where | Behavior on failure | +|---|---|---| +| Host SHA match | Every SSH script | Exit 1 (abort deployment) | +| NGINX config validation | `nginx -t` in `ngx_switch_config_and_static()` | Restore backup, abort | +| Candidate health check | `deploy-api`, `deploy-web` (curl) | Remove container, exit 1 | +| Worker health check | `start-workers.sh` (log grep) | Remove container, abort | +| Worker startup success | `deploy-worker` job outcome | Trigger automatic rollback | +| Image existence check | Build jobs (ECR describe-images) | Skip build, reuse image | +| Migration preflight | `prisma migrate status` (dry-run) | Non-fatal, warning only | +| Disk space | 
`preflight-space.sh` | Abort before image pull | + +### Static Analysis in CI + +- `bash -n` on all scripts in `infra/scripts/` +- `docker buildx build --check` on all Dockerfiles +- No linting or type-checking of deployment scripts (bash only) + +--- + +## 19. Known Limitations & Assumptions + +### Current Limitations + +1. **Single EC2 host**: All services (Web, API, Worker) run on the same EC2 instance. There is no horizontal scaling of the host. If the host goes down, all services are affected. + +2. **No blue/green for workers**: Workers do not run in blue/green mode. The old workers are drained and stopped before the new workers take over. There is a brief window where the number of active workers is reduced during drain (though the drain is one-by-one to minimize this). + +3. **No active health monitoring post-deployment**: The pipeline does not continuously monitor the deployed services after the `verify` job. It relies on the operator to notice issues and trigger rollback manually. + +4. **No rollback for migrations**: If a migration fails and the database is left in an inconsistent state, the rollback does not revert the database. The operator must manually restore from a backup or fix the migration. + +5. **S3 state bucket name is fixed**: The `DEPLOY_STATE_BUCKET` is hardcoded to `cal-id` (enforced by `require_deploy_state_config()`). This cannot be overridden via secrets. + +6. **EC2 host disk space**: The 10GB minimum free space threshold is conservative. Large deployments may require more space for concurrent candidate containers and multiple static builds. + +7. **Certbot DNS challenge**: The scripts assume certbot DNS multi credentials are at `/etc/letsencrypt/dns-multi.ini`. This file must be pre-provisioned on the EC2 host. + +8. **No rollback for staging failures**: Automatic rollback only triggers after Web/API promotion (i.e., only when services are live). 
Failures before promotion (build, migration, candidate health) do not trigger rollback — the operator must assess and manually trigger rollback if needed. + +9. **S3 lock TTL is inactive in deploy-all.yml**: S3 locking is currently disabled. If re-enabled, the lock refresh TTL of 7200 seconds means a deployment must complete within 2 hours or risk lock expiration. + +10. **`worker-new-*` naming is a naming convention, not a guarantee**: The worker deployment relies on container naming (`worker-new-1`, `worker-new-2`, etc.) to distinguish new from old. If another process creates containers with the same names, the deployment could fail or behave incorrectly. + +11. **NGINX reload is not atomic at the kernel level**: `nginx -s reload` is graceful but not guaranteed-atomic from the application's perspective. During the reload window, some requests may go to the old config and some to the new. The `nginx -t` validation ensures the new config is valid before reload. + +12. **No rollback for manual rollback failures**: If `rollback.yml` fails at the `rollback-app` step after `validate-schema` and `verify-images` succeed, there is no automatic retry or fallback. The operator must investigate and manually recover. + +13. **Validation only runs on PR changes**: `validate-migration.yml` only runs on pull requests. Direct pushes to `main` or `develop` with migration changes bypass migration classification. This is a known gap — migration safety classification should also run on push events. + +14. **Docker image pruning by age**: The 72-hour image prune filter means that if a rollback is needed for a SHA that is more than 72 hours old, the image may have been pruned.
This is mitigated by the image reference in S3 deployment state — operators can detect this before triggering rollback. + +### Concurrency Behavior (Second Deployment for Same Environment) + +This section documents the precise behavior when a second deployment pipeline execution is triggered for the same target environment while one is already running. GitHub Actions concurrency groups are the active concurrency control; the S3 deployment lock is currently disabled in `deploy-all.yml`. + +#### Overview: What Happens + +| Mechanism | Behavior | Scope | +|---|---|---| +| GitHub Actions Concurrency Group | **Queues the second deployment**; first completes before the second starts | Per environment (`production` or `staging`) | +| S3 Lock | **Disabled for unified deployments**; no acquire, refresh, or release occurs | Per environment when re-enabled | +| rollback.yml | Separate rollback workflow behavior; verify current workflow before operating | Per environment | + +The GitHub Actions concurrency layer queues subsequent runs by **environment**, not by branch. The first deployment completes normally, then the second starts. S3 lock scripts remain available in the repository, but `deploy-all.yml` does not use them while locking is disabled. + +#### 1. 
GitHub Actions Concurrency Group — Environment-Scoped Queueing + +```yaml +# deploy-all.yml — environment-scoped concurrency +concurrency: + group: >- + deploy-all-${{ + ( + github.event_name == 'workflow_dispatch' + && github.event.inputs.branch + || github.ref_name + ) == 'main' + && 'production' + || 'staging' + }} + cancel-in-progress: false +``` + +The concurrency group is derived from the target environment, not the branch name: +- `branch == 'main'` → `deploy-all-production` +- `branch != 'main'` (develop, feature branches, any workflow_dispatch input) → `deploy-all-staging` + +When a second `deploy-all.yml` execution is triggered for the same environment: + +- GitHub Actions **queues the second run** behind the first +- The first workflow runs to completion — all deployment jobs execute without acquiring an S3 lock +- After the first completes, the second run begins +- The `release-lock` job is disabled while S3 locking is disabled + +**Scope of queueing**: The concurrency group key is `deploy-all-production` or `deploy-all-staging`. This means: +- All `main` deployments (PR merge + any `workflow_dispatch` with `branch: main`) share the production queue +- All non-main deployments (`develop`, feature branches, any `workflow_dispatch` with non-main branch) share the staging queue +- A staging deploy does **not** block a production deploy — environments are fully independent +- Both `workflow_dispatch` and auto-triggered runs share the same environment queue + +**GitHub concurrency holds one pending run**: GitHub Actions only holds one pending run per concurrency group. If a third deploy is triggered while one is running and one is queued, the queued run is **replaced** by the new run. The latest trigger wins — intermediate queued runs are dropped. + +**Lock cleanup on cancellation**: There is no S3 lock to clean up during normal unified deployments while locking is disabled. 
+ +**Queueing applies regardless of which job the first deployment is in.** If a deployment is in the middle of a migration when the second run queues, the second run waits. The tradeoff is that a stuck deployment blocks the pipeline until the running workflow completes or is cancelled. + +#### 2. S3 Deployment Lock — Currently Disabled + +`deploy-all.yml` currently skips S3 deployment locking. The retained `acquire-lock` job emits an empty lock token, every `refresh-lock.sh` step is skipped, and the `release-lock` job is skipped. + +If S3 locking is re-enabled later, or if an external operator script calls `acquire-lock.sh` directly, the lock behavior is: + +```bash +# acquire-lock.sh — atomic conditional write +aws s3api put-object \ + --bucket "cal-id" \ + --key "deployment-{env}/locks/{env}.lock" \ + --if-none-match '*' \ + # → fails if key already exists (including expired keys) +``` + +**If the lock exists and is unexpired:** +``` +Lock already held by 'github-12345678-arjun3492' +(actor=arjun3492, run=12345678, sha=a1b2c3d4..., expires=2025-05-26T16:32:00Z) +Another deployment is in progress. +``` +With S3 locking enabled, the second deployment aborts at `acquire-lock`. No build, no SSH, no state mutation. The deployment report sends a failure notification. + +**If the lock exists but is expired** (the TTL has passed but no one deleted it): + +The conditional write `--if-none-match '*'` still fails because S3 treats this as a "key exists" condition regardless of content. The script then reads the lock, sees `expires_at_epoch < now_epoch`, and: + +- Logs: `"Lock exists but may be expired — treating as conflict to be safe"` +- Exits with failure: `"Lock exists at $LOCK_KEY with token '...' — unable to acquire atomically. If the previous holder crashed, use FORCE_RELEASE=true in release-lock.sh after verifying the lock is expired."` + +The second deployment cannot proceed. Manual operator intervention is required (`FORCE_RELEASE=true`). + +#### 3. 
Sequential Deployment — No Cancellation Artifacts + +Because deployments now complete sequentially (first finishes, then second starts), the first deployment leaves no cancellation artifacts. All host-side artifacts are the result of a completed or operator-cancelled deployment, not a mid-run cancellation by the concurrency group. + +**State file artifacts** (when first deployment completes normally): +- `staged` or `promoted_pending_verification` or `current` written to `manifests/sha-{sha}.json` and `deployments/history/sha-{sha}.json` +- `current.json` updated only for terminal states (`current`, `rolled_back`) +- Each deployment writes to a unique SHA key — no collision with the second deployment's keys + +**EC2 host artifacts** (when first deployment completes normally): +- Candidate containers are stopped and removed by `promote-all.sh` or `ngx_stop_candidates()` +- Worker containers are the canonical `worker-1`, `worker-2`, etc. after drain +- NGINX `cal-id.conf` points to the first deployment's SHA +- Static symlink `current` points to the first deployment's build +- No S3 lock is acquired or released while deployment locking is disabled + +**If the first deployment is operator-cancelled via GitHub Actions UI** (not by concurrency — concurrency no longer cancels): +- The deployment stops at whatever job was running +- The same host artifact scenarios from the cancelled case apply (stale candidates, etc.) +- The operator must assess host state; there is no deploy-all S3 lock to clear while locking is disabled +- The second queued run waits until GitHub releases the concurrency slot + +**Cleanup responsibility:** The second deployment handles stale artifacts from an operator-cancelled first run through its normal staging cleanup (removes `api-candidate`, `web-candidate`, `worker-new-*` containers before staging new ones). 
If the first run was cancelled after `promote-all` but before `verify`, the NGINX config reflects the first run's SHA — the operator should manually trigger `rollback.yml` or run `deploy-all.yml` to push a known-good release. + +#### 4. S3 Lock — When Orphaned Locks Block the Queue + +While S3 locking is disabled in `deploy-all.yml`, orphaned S3 locks do not block normal unified deployments. If S3 locking is re-enabled or a manual script uses the lock helpers, a lock can still become orphaned in legitimate scenarios: + +1. **Operator-cancelled deployment** — An operator cancels a running deployment via GitHub Actions UI before `release-lock` completes. The lock persists for its full TTL (2h from the last refresh). + +2. **Workflow runner crash** — The GitHub Actions runner crashes or is terminated mid-deployment. If the runner crashes before reaching `release-lock`, the lock is orphaned for its TTL. + +**What happens in each case:** +- With S3 locking disabled in `deploy-all.yml`, GitHub Actions concurrency is the only normal queueing mechanism. +- If S3 locking is re-enabled later, the second deployment is queued by GitHub Actions and waits for the first to complete. +- If S3 locking is re-enabled and the first run leaves an orphaned lock, use `FORCE_RELEASE=true` only after verifying the holder is no longer active. + +**Lock expiry is inactive while S3 locking is disabled.** If locking is re-enabled, the TTL of 7200 seconds (2 hours) from the last `refresh-lock.sh` call means a lock expires naturally. A queued deployment that waits for a lock to expire will proceed without operator intervention, but the 2-hour wait may be unacceptable in an incident. In that case, use `FORCE_RELEASE=true` after verification. + +#### 5. 
Lock Refresh — Expiration Check + +In `refresh-lock.sh`, the script enforces token matching but checks expiration **only after** token validation: + +```bash +# refresh-lock.sh — lines 76-79 +[ "$existing_token" = "$LOCK_TOKEN" ] || fail "LOCK_TOKEN mismatch..." + +if [ -n "$existing_expires" ] && [ "$existing_expires" -le "$now_epoch" ] 2>/dev/null; then + fail "Cannot refresh expired lock held by..." +fi +``` + +If a lock is expired and a new deployment generates a fresh token (via `acquire-lock.sh`), the `--if-none-match '*'` conditional write still fails because the stale lock object exists, so the new deployment cannot silently take over an expired lock. The expiration check in `refresh-lock.sh` is a guard for edge cases and is not the primary mechanism for lock safety. + +#### 6. State Machine — Concurrent State Writes + +Two deployments writing state simultaneously for different SHAs: + +**Manifest and history writes** (`record-build-state`, `record-promoted-state`): +- Writes to `manifests/sha-{sha}.json` and `deployments/history/sha-{sha}.json` +- Each write uses a unique SHA in the key path — no two concurrent deployments write to the same key +- Writes are idempotent — re-uploading the same SHA's state overwrites with updated content +- **Result**: safe concurrent writes; each SHA's manifest is independently correct + +**current.json** (live release pointer): +- Only updated for terminal states: `current` and `rolled_back` +- Non-terminal states (`staged`, `promoted_pending_verification`) do **not** write to `current.json` +- Uses an ETag-based conditional write: `aws s3api put-object --if-match "{etag}"` +- If another writer updates `current.json` between read and write, the conditional write fails and is silently skipped +- **Result**: concurrent `current.json` updates are safe; the first successful terminal-state write wins; later writes are dropped + +#### 7.
Automatic Rollback After Promotion — Sequential Queue Edge Case + +With queueing, the scenario where a second deployment is queued while the first is running still has an edge case at the promotion boundary. Consider: + +1. Deployment A: `promote-all` succeeds → `record-promoted-state` writes `promoted_pending_verification` +2. Deployment B: queued (second in line) +3. Deployment A: `deploy-worker` starts but fails +4. `rollback-after-promotion` runs and reverts NGINX to the previous release (from `current.json`) +5. Deployment A: completes (with rollback status) +6. Deployment B: now unblocked, starts and runs through its own deployment + +This is the correct outcome: A failed after promotion, it triggered its own rollback, and B starts fresh. + +**The edge case**: If Deployment A succeeds through promotion and `deploy-worker` (reaches `verify`, updates `current.json`), and Deployment B is queued, B starts fresh and does not affect A's completed deployment. + +**The gap that remains**: If Deployment A's `deploy-worker` fails after promotion but before `rollback-after-promotion` completes (e.g., the rollback workflow itself is slow or fails), and the operator manually triggers another `rollback.yml` run in the meantime, there could be concurrent rollback operations. However, `rollback.yml` uses `cancel-in-progress: false`, so the second rollback waits — not a conflict. + +The only gap is if a deployment reaches `promote-all` and then an operator manually triggers `rollback.yml` for the same environment while `deploy-worker` is still running. In this case, the `rollback.yml` queues (since `rollback.yml` also uses `cancel-in-progress: false`). If the queued rollback runs before the first deployment reaches `verify`, it will rollback to the pre-deployment `current.json`, and the first deployment will later reach `verify` and update `current.json` to its own SHA — creating a state where `current.json` and the running services are mismatched. 
Recovery: use `rollback.yml` with the correct target SHA. + +#### 8. Rollback Workflow Concurrency — Aligned with deploy-all.yml + +The `rollback.yml` workflow uses `cancel-in-progress: false`, consistent with `deploy-all.yml`: + +```yaml +# rollback.yml — line 31-32 +concurrency: + group: rollback-${{ github.event.inputs.environment }} + cancel-in-progress: false +``` + +This means: +- If a manual rollback is running and another rollback is triggered for the same environment, the second one **waits** — GitHub Actions queues it behind the running one +- S3 lock behavior depends on the rollback workflow's current implementation; verify before operating. In `deploy-all.yml`, S3 locking is disabled. +- After the first rollback completes and releases the lock, the second proceeds +- **Why `cancel-in-progress: false`?** Rollback is a safety operation. Cancelling a rollback mid-operation could leave the system in a partially-reverted state (e.g., NGINX reverted but workers still on new version). Waiting ensures rollbacks execute sequentially and completely. + +#### 9. 
Scenario Matrix + +| Scenario | Mechanism | Outcome | Side Effects | +|---|---|---|---| +| 2nd staging deploy while staging deploy active | Concurrency queue (staging) | 2nd queued; 1st completes; 2nd starts | No S3 lock acquired | +| 2nd production deploy while production deploy active | Concurrency queue (production) | 2nd queued; 1st completes; 2nd starts | No S3 lock acquired | +| 2nd staging deploy while staging deploy migrating | Concurrency queue (staging) | 2nd queued; 1st completes; 2nd starts | No S3 lock acquired | +| 2nd staging deploy while staging deploy promoted Web/API | Concurrency queue (staging) | 2nd queued; 1st completes (worker, verify); 2nd starts on its own SHA | First deploy's current.json updated; second deploy proceeds cleanly | +| 2nd staging deploy while staging deploy in verify | Concurrency queue (staging) | 2nd queued; 1st completes; 2nd starts | — | +| 1st staging deploy operator-cancelled via GitHub Actions UI | GitHub concurrency slot releases after cancellation | 2nd starts when ready | No deploy-all S3 lock to clean up | +| Staging deploy during production deploy | Independent concurrency groups | Both run in parallel | No conflict | +| Production deploy during staging deploy | Independent concurrency groups | Both run in parallel | No conflict | +| 1st staging deploy stuck in migration (hung, not failed) | Concurrency queue (staging) | 2nd stays queued until the 1st fails (migration timeout) or is cancelled | Pipeline blocked until then; cancel the 1st in the GitHub Actions UI, then re-trigger (e.g. `workflow_dispatch`) once the concurrency slot is free | +| External acquire-lock.sh race with deploy-all.yml | External script may acquire S3 lock, but deploy-all ignores it while disabled | deploy-all continues under GitHub concurrency | Manual lock may need operator cleanup | +| 2nd rollback while 1st rollback running (same env) | Rollback workflow concurrency | 2nd waits; 1st completes; 2nd proceeds sequentially | Verify rollback lock behavior before operating | +| 3rd staging deploy triggered while 1 running + 1 queued | GitHub
replaces queued run with latest trigger | Latest deploy is pending; intermediate dropped | Operator intent (latest wins) respected | +| workflow_dispatch to main while PR merge to main running | Same concurrency group (production) | 2nd queued; 1st completes; 2nd starts | No conflict; both serialize to production | + +#### 10. Operator Guidance for Concurrent Deployment Scenarios + +**When a deployment appears stuck:** +1. Check the workflow's current job in GitHub Actions +2. Check the EC2 host for the active job (migration via SSH, container staging, etc.) +3. If the deployment is stuck in `migrate-db`: assess whether the migration is hung or just slow. The migration timeout in the workflow is 1200s; after that, the job fails and the pipeline proceeds to failure handling. +4. If a manual unblock is needed: cancel the stuck deployment via GitHub Actions UI +5. After cancelling: there is no deploy-all S3 lock to clear while locking is disabled. +6. Trigger a fresh deployment once the stuck workflow has fully stopped and the concurrency slot is free. +7. If images for the stuck deployment's SHA already exist in ECR, use `workflow_dispatch` with `rebuild=false` to skip the build phase and resume from migration + +**Lock cleanup while disabled**: No S3 lock is acquired by `deploy-all.yml`, so cancellation does not require S3 lock cleanup. Manual `FORCE_RELEASE=true` applies only if S3 locking is re-enabled or an external/manual script created a lock. + +**Tradeoff awareness**: Queueing means a slow or stuck deployment blocks all subsequent deploys to the same environment. If this is operationally unacceptable, cancel the stuck deployment and wait for GitHub to release the concurrency slot. Monitor active deployments in GitHub Actions to avoid unintended drops of intermediate queued runs (GitHub holds only one pending run per group). 
+ +### Future Work (Not Yet Implemented) + +- Multi-host deployment (separate EC2 instances for Web, API, Worker) +- Continuous health monitoring and auto-rollback after deployment +- Migration rollback (down.sql execution) with safety gates +- Push-event migration validation +- Secrets rotation support without redeployment +- Canary / gradual release rollout (percentage-based traffic splitting) +- Zero-downtime worker restart (both old and new workers serve simultaneously during drain) diff --git a/entrypoint.sh b/entrypoint.sh index d5de6fdef0ab5e..934769ed72689e 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,16 +2,4 @@ set -e -# yarn install - - -if [ "$IS_ROLLBACK" = true ]; then - echo "Skipping db-deploy due to rollback." -else - echo "Running yarn db-deploy..." - yarn db-deploy -fi - - - yarn start diff --git a/infra/scripts/acquire-lock.sh b/infra/scripts/acquire-lock.sh new file mode 100755 index 00000000000000..36055c47c2c07d --- /dev/null +++ b/infra/scripts/acquire-lock.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +# +# acquire-lock.sh - Atomically acquire an S3-backed Cal-ID deployment lock. +# +# Uses S3 conditional writes (--if-none-match '*') so that only one writer +# can create the lock object. If the object already exists (even as an +# orphaned placeholder) the conditional write fails and the script exits 1. +# +# Required: +# DEPLOY_ENV or ENVIRONMENT production|staging (prod/main/stag/develop aliases) +# +# Optional: +# DEPLOY_STATE_BUCKET S3 bucket name (default: cal-id, enforced contract) +# DEPLOY_STATE_PREFIX S3 key prefix (derived from DEPLOY_ENV if unset) +# LOCK_TTL_SECONDS Lock TTL in seconds (default: 1800) +# LOCK_OWNER Owner identifier (default: ${GITHUB_RUN_ID:-unknown}-${GITHUB_ACTOR:-<current user>}) +# GIT_SHA Git SHA being deployed (stored in lock metadata; defaults to GITHUB_SHA) +# GITHUB_RUN_ID GitHub Actions run ID +# GITHUB_ACTOR GitHub Actions actor (username) +# +# Prints LOCK_KEY, LOCK_TOKEN, LOCK_OWNER, LOCK_EXPIRES_AT on success.
+# Exits 0 on success, 1 on lock conflict or error. + +set -euo pipefail + +log() { + printf '[acquire-lock][%s] %s\n' "$1" "$2" >&2 +} + +fail() { + log ERROR "$1" + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1" +} + +iso_from_epoch() { + date -u -d "@$1" '+%Y-%m-%dT%H:%M:%SZ' +} + +require_cmd aws +require_cmd jq + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=deploy-state-paths.sh +source "${script_dir}/deploy-state-paths.sh" +require_deploy_state_config || fail "Unable to resolve deployment state path" +LOCK_ENV="$DEPLOY_ENV" + +LOCK_TTL_SECONDS="${LOCK_TTL_SECONDS:-1800}" +case "$LOCK_TTL_SECONDS" in + ''|*[!0-9]*) fail "LOCK_TTL_SECONDS must be a positive integer" ;; +esac +[ "$LOCK_TTL_SECONDS" -gt 0 ] || fail "LOCK_TTL_SECONDS must be greater than zero" + +# Build owner string from available context +_lock_owner_base="${GITHUB_RUN_ID:-unknown}-${GITHUB_ACTOR:-$(id -un 2>/dev/null || whoami)}" +LOCK_OWNER="${LOCK_OWNER:-${_lock_owner_base}}" +LOCK_TOKEN="${LOCK_TOKEN:-$(date -u '+%Y%m%d%H%M%S')-$$}" +GIT_SHA="${GIT_SHA:-${GITHUB_SHA:-unknown}}" + +LOCK_KEY="$(get_deploy_state_key "locks/${LOCK_ENV}.lock")" +S3_PATH="s3://${DEPLOY_STATE_BUCKET}/${DEPLOY_STATE_PREFIX}/locks/${LOCK_ENV}.lock" + +now_epoch="$(date -u '+%s')" +expires_at_epoch="$((now_epoch + LOCK_TTL_SECONDS))" +expires_at="$(iso_from_epoch "$now_epoch")" +acquired_at="$(iso_from_epoch "$now_epoch")" +new_expires="$(iso_from_epoch "$expires_at_epoch")" + +# Construct full lock payload with rich metadata +tmp_new="$(mktemp)" +trap 'rm -f "$tmp_new"' EXIT + +jq -n \ + --arg env "$LOCK_ENV" \ + --arg owner "$LOCK_OWNER" \ + --arg token "$LOCK_TOKEN" \ + --arg git_sha "$GIT_SHA" \ + --arg actor "${GITHUB_ACTOR:-unknown}" \ + --arg run_id "${GITHUB_RUN_ID:-unknown}" \ + --arg acquired_at "$acquired_at" \ + --arg expires_at "$new_expires" \ + --argjson acquired_at_epoch "$now_epoch" \ + --argjson 
expires_at_epoch "$expires_at_epoch" \ + '{ + environment: $env, + owner: $owner, + token: $token, + git_sha: $git_sha, + actor: $actor, + run_id: $run_id, + acquired_at: $acquired_at, + acquired_at_epoch: $acquired_at_epoch, + expires_at: $expires_at, + expires_at_epoch: $expires_at_epoch, + status: "held" + }' > "$tmp_new" + +log INFO "Attempting atomic lock acquisition: $LOCK_KEY" + +# S3 conditional write: only succeeds if the key does NOT exist. +# This is the single writer wins pattern — no read-check-write race. +set +e +aws s3api put-object \ + --bucket "$DEPLOY_STATE_BUCKET" \ + --key "${DEPLOY_STATE_PREFIX}/locks/${LOCK_ENV}.lock" \ + --body "$tmp_new" \ + --content-type 'application/json' \ + --if-none-match '*' \ + > /dev/null 2>&1 +cp_result=$? +set -e + +if [ "$cp_result" -ne 0 ]; then + # Conditional write failed — lock already exists or access denied. + # Download and parse the existing lock to give a useful error. + tmp_existing="$(mktemp)" + trap 'rm -f "$tmp_new" "$tmp_existing"' EXIT + + if aws s3 cp "$LOCK_KEY" "$tmp_existing" >/dev/null 2>&1; then + existing_owner="$(jq -r '.owner // "unknown"' "$tmp_existing")" + existing_token="$(jq -r '.token // "?"' "$tmp_existing")" + existing_expires="$(jq -r '.expires_at_epoch // empty' "$tmp_existing")" + existing_sha="$(jq -r '.git_sha // "unknown"' "$tmp_existing")" + existing_actor="$(jq -r '.actor // "unknown"' "$tmp_existing")" + existing_run="$(jq -r '.run_id // "?"' "$tmp_existing")" + + if [ -n "$existing_expires" ] && [ "$existing_expires" -gt "$now_epoch" ] 2>/dev/null; then + existing_expires_str="$(iso_from_epoch "$existing_expires" 2>/dev/null || printf 'epoch %s' "$existing_expires")" + fail "Lock already held by '${existing_owner}' (actor=${existing_actor}, run=${existing_run}, sha=${existing_sha}, expires=${existing_expires_str}). Another deployment is in progress." 
+ fi + + # Expired lock — the conditional write should have succeeded since the + # key existed but the conditional (If-None-Match: *) might fail on older + # S3 behavior. Treat this as a conflict. + log WARN "Lock exists but may be expired — treating as conflict to be safe" + fail "Lock exists at $LOCK_KEY with token '${existing_token}' — unable to acquire atomically. If the previous holder crashed, use FORCE_RELEASE=true in release-lock.sh after verifying the lock is expired." + else + fail "Lock acquisition failed (S3 conditional write rejected) and could not read existing lock. Check S3 permissions and bucket state." + fi +fi + +# Conditional write succeeded — verify the lock we wrote has our token. +tmp_verify="$(mktemp)" +trap 'rm -f "$tmp_new" "$tmp_verify"' EXIT + +aws s3 cp "$LOCK_KEY" "$tmp_verify" >/dev/null || fail "Lock written but could not be verified" + +verified_token="$(jq -r '.token // empty' "$tmp_verify")" +if [ "$verified_token" != "$LOCK_TOKEN" ]; then + fail "Lock verification failed: expected token '${LOCK_TOKEN}', got '${verified_token}'. Another deployment acquired the lock." +fi + +log INFO "Lock acquired atomically — ${LOCK_OWNER} holds lock until $(iso_from_epoch "$expires_at_epoch")" + +printf 'LOCK_KEY=%s\n' "$LOCK_KEY" +printf 'LOCK_TOKEN=%s\n' "$LOCK_TOKEN" +printf 'LOCK_OWNER=%s\n' "$LOCK_OWNER" +printf 'LOCK_EXPIRES_AT=%s\n' "$(iso_from_epoch "$expires_at_epoch")" \ No newline at end of file diff --git a/infra/scripts/deploy-state-paths.sh b/infra/scripts/deploy-state-paths.sh new file mode 100644 index 00000000000000..52e02fc7df4aef --- /dev/null +++ b/infra/scripts/deploy-state-paths.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# +# deploy-state-paths.sh - Shared S3 path helpers for Cal-ID deployment state. 
+#
+# Contract:
+#   DEPLOY_STATE_BUCKET=cal-id
+#   DEPLOY_STATE_PREFIX=deployment-prod|deployment-stag
+
+normalize_deploy_environment() { # $1: env name or alias (case-insensitive); prints production|staging, returns 1 if unknown
+  case "${1,,}" in
+    production|prod|main) printf 'production' ;;
+    staging|stag|develop) printf 'staging' ;;
+    *) return 1 ;;
+  esac
+}
+
+deploy_state_prefix_for_env() { # $1: normalized env; prints its S3 key prefix, returns 1 for anything else
+  case "$1" in
+    production) printf 'deployment-prod' ;;
+    staging) printf 'deployment-stag' ;;
+    *) return 1 ;;
+  esac
+}
+
+require_deploy_state_config() { # resolve, validate and export DEPLOY_ENV / DEPLOY_STATE_BUCKET / DEPLOY_STATE_PREFIX; returns 1 on any violation
+  local raw_env="${DEPLOY_ENV:-${ENVIRONMENT:-}}" # DEPLOY_ENV wins over ENVIRONMENT
+  local expected_prefix
+
+  if [ -z "$raw_env" ]; then
+    printf 'DEPLOY_ENV or ENVIRONMENT must be set\n' >&2
+    return 1
+  fi
+
+  DEPLOY_ENV="$(normalize_deploy_environment "$raw_env")" || {
+    printf 'Unknown deploy environment: %s\n' "$raw_env" >&2
+    return 1
+  }
+
+  expected_prefix="$(deploy_state_prefix_for_env "$DEPLOY_ENV")" || return 1
+  DEPLOY_STATE_BUCKET="${DEPLOY_STATE_BUCKET:-cal-id}"
+  DEPLOY_STATE_PREFIX="${DEPLOY_STATE_PREFIX:-$expected_prefix}"
+  DEPLOY_STATE_PREFIX="${DEPLOY_STATE_PREFIX#/}" # drop one leading slash
+  DEPLOY_STATE_PREFIX="${DEPLOY_STATE_PREFIX%/}" # drop one trailing slash
+
+  if [ "$DEPLOY_STATE_BUCKET" != "cal-id" ]; then # overrides are accepted only if they match the contract
+    printf 'DEPLOY_STATE_BUCKET must be cal-id\n' >&2
+    return 1
+  fi
+
+  if [ "$DEPLOY_STATE_PREFIX" != "$expected_prefix" ]; then # prefix must match the resolved environment
+    printf 'DEPLOY_STATE_PREFIX must be %s for %s\n' "$expected_prefix" "$DEPLOY_ENV" >&2
+    return 1
+  fi
+
+  export DEPLOY_ENV DEPLOY_STATE_BUCKET DEPLOY_STATE_PREFIX
+}
+
+get_deploy_state_path() { # prints s3://<bucket>/<prefix>; expects require_deploy_state_config to have set both
+  printf 's3://%s/%s' "$DEPLOY_STATE_BUCKET" "$DEPLOY_STATE_PREFIX"
+}
+
+get_deploy_state_key() { # $1: key relative to the prefix (one leading slash tolerated); prints the full s3:// URI
+  local relative_key="${1#/}"
+  printf '%s/%s' "$(get_deploy_state_path)" "$relative_key"
+}
diff --git a/infra/scripts/drain-workers.sh b/infra/scripts/drain-workers.sh
new file mode 100755
index 00000000000000..21a1640b502aa4
--- /dev/null
+++ b/infra/scripts/drain-workers.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+#
+# drain-workers.sh - Gracefully drain old workers and promote worker-new-*.
+#
+# Optional:
+#   GRACEFUL_STOP_TIMEOUT       docker stop timeout in seconds (default: 180)
+#   REQUIRE_NEW_WORKERS         require worker-new-* before drain (default: true)
+#   DOCKER_PRUNE_AFTER_DEPLOY   prune unused Docker objects after drain (default: true)
+#   DOCKER_PRUNE_UNTIL          docker prune age filter (default: 72h)
+#   WORKER_LIFECYCLE_DRY_RUN    true to validate inputs without Docker mutation
+
+set -euo pipefail
+
+log() { # log LEVEL MESSAGE -> stderr
+  printf '[drain-workers][%s] %s\n' "$1" "$2" >&2
+}
+
+fail() { # log MESSAGE at ERROR level and exit 1
+  log ERROR "$1"
+  exit 1
+}
+
+require_cmd() { # abort unless the named binary is on PATH
+  command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1"
+}
+
+GRACEFUL_STOP_TIMEOUT="${GRACEFUL_STOP_TIMEOUT:-180}"
+REQUIRE_NEW_WORKERS="${REQUIRE_NEW_WORKERS:-true}"
+DOCKER_PRUNE_AFTER_DEPLOY="${DOCKER_PRUNE_AFTER_DEPLOY:-true}"
+DOCKER_PRUNE_UNTIL="${DOCKER_PRUNE_UNTIL:-72h}"
+WORKER_LIFECYCLE_DRY_RUN="${WORKER_LIFECYCLE_DRY_RUN:-false}"
+
+case "$GRACEFUL_STOP_TIMEOUT" in
+  ''|*[!0-9]*)
+    fail "GRACEFUL_STOP_TIMEOUT must be numeric"
+    ;;
+esac
+
+if [ "$WORKER_LIFECYCLE_DRY_RUN" = "true" ]; then
+  log INFO "Dry run: drain-workers input validation passed with graceful timeout ${GRACEFUL_STOP_TIMEOUT}s"
+  exit 0
+fi
+
+require_cmd docker
+
+new_workers=() # running worker-new-*; docker's name filter is a substring match, so anchor with grep
+while IFS= read -r container; do
+  [ -n "$container" ] && new_workers+=("$container")
+done < <(docker ps --filter "name=worker-new-" --format "{{.Names}}" | grep -E '^worker-new-' | sort -V || true)
+
+old_workers=() # every other worker-* container (running or stopped), anchored for the same reason
+while IFS= read -r container; do
+  [ -n "$container" ] && old_workers+=("$container")
+done < <(
+  docker ps -a --filter "name=worker-" --format "{{.Names}}" \
+    | grep -E '^worker-' | grep -Ev '^worker-new-' \
+    | sort -V || true
+)
+
+if [ "$REQUIRE_NEW_WORKERS" = "true" ] && [ "${#new_workers[@]}" -eq 0 ]; then
+  fail "No worker-new-* containers are running; refusing to drain old workers"
+fi
+
+if [ "${#old_workers[@]}" -eq 0 ]; then
+  log INFO "No old worker-* containers found"
+else
+  for container in "${old_workers[@]}"; do
+    [ -n "$container" ] || continue
+    log INFO "Gracefully stopping ${container} with timeout ${GRACEFUL_STOP_TIMEOUT}s"
+    docker stop -t "$GRACEFUL_STOP_TIMEOUT" "$container" >/dev/null 2>&1 \
+      || log WARN "${container} did not stop cleanly before timeout"
+    docker rm -f "$container" >/dev/null 2>&1 || true # best effort: the name must be free for the rename below
+  done
+fi
+
+if [ "${#new_workers[@]}" -gt 0 ]; then
+  log INFO "Renaming worker-new-* containers to canonical worker-* names"
+fi
+
+index=1 # the ${arr[@]+...} form keeps an empty array from tripping `set -u` on bash < 4.4
+for container in ${new_workers[@]+"${new_workers[@]}"}; do
+  final_name="worker-${index}"
+  if docker ps -a --format "{{.Names}}" | grep -Eq "^${final_name}$"; then
+    docker rm -f "$final_name" >/dev/null 2>&1 || true
+  fi
+  docker rename "$container" "$final_name" \
+    || fail "Failed to rename ${container} to ${final_name}"
+  log INFO "Renamed ${container} to ${final_name}"
+  index=$((index + 1))
+done
+
+if [ "$DOCKER_PRUNE_AFTER_DEPLOY" = "true" ]; then
+  log INFO "Pruning unused Docker images and containers older than ${DOCKER_PRUNE_UNTIL}"
+  docker system prune -af --filter "until=${DOCKER_PRUNE_UNTIL}" \
+    || log WARN "Docker pruning had issues but worker drain completed"
+fi
+
+log INFO "Worker drain complete"
diff --git a/infra/scripts/migrate.sh b/infra/scripts/migrate.sh
new file mode 100755
index 00000000000000..361cf9d12f112b
--- /dev/null
+++ b/infra/scripts/migrate.sh
@@ -0,0 +1,275 @@
+#!/usr/bin/env bash
+#
+# migrate.sh - Run Cal-ID Prisma migrations once for a target release.
+#
+# This script is idempotent: running it multiple times on the same SHA is safe
+# (Prisma skips already-applied migrations).
+#
+# Required:
+#   DATABASE_URL                Database connection string
+#
+# Optional:
+#   REPO_ROOT                   Path to existing git checkout (default: /home/onehash/onehash-cal)
+#                               When REPO_ROOT/.git exists, the workflow's pre-clone is reused.
+#                               The script clones only as fallback when REPO_ROOT doesn't exist.
+# REPO_URL Git repository URL (used when clone is needed) +# GIT_HASH Commit SHA to checkout (defaults to current HEAD if repo exists) +# BRANCH_NAME Branch for fallback clone (default: develop) +# KEEP_WORK_DIR true to keep work directory after completion (default: false) +# SKIP_YARN_INSTALL true to skip dependency install (default: false) +# MIGRATION_TIMEOUT_SECONDS Timeout for migration command (default: 300) +# ENABLE_DB_BACKUP Set to "true" to run a backup before migration +# DB_BACKUP_COMMAND Shell command or script path to execute for backup. +# Must exit 0 to proceed. Exit non-zero aborts deployment. +# +# Outputs (printed to stdout for workflow capture): +# MIGRATIONS_APPLIED_JSON= +# MIGRATION_COUNT= +# +# Prerequisites on the host: +# - git, yarn, corepack, psql (optional for locking), jq +# - Prisma schema at packages/prisma/schema.prisma +# +# About prisma generate: +# `prisma migrate deploy` does NOT require `prisma generate`. +# It reads migration SQL files and applies them directly to the database — +# no Prisma client runtime is needed. The client is only required at container +# startup, not during migration. This is the intended execution path. 
+ +set -euo pipefail + +log() { + printf '[migrate][%s] %s\n' "$1" "$2" >&2 +} + +fail() { + log ERROR "$1" + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1" +} + +require_value() { + local name="$1" + local value="${!name:-}" + [ -n "$value" ] || fail "Required environment variable is missing: $name" +} + +# ---------- Env / Defaults ---------- + +require_value DATABASE_URL + +REPO_ROOT="${REPO_ROOT:-/home/onehash/onehash-cal}" +REPO_URL="${REPO_URL:-}" +GIT_HASH="${GIT_HASH:-${GIT_SHA:-}}" +BRANCH_NAME="${BRANCH_NAME:-${BRANCH:-develop}}" +KEEP_WORK_DIR="${KEEP_WORK_DIR:-false}" +SKIP_YARN_INSTALL="${SKIP_YARN_INSTALL:-false}" +MIGRATION_TIMEOUT_SECONDS="${MIGRATION_TIMEOUT_SECONDS:-300}" +ENABLE_DB_BACKUP="${ENABLE_DB_BACKUP:-false}" +DB_BACKUP_COMMAND="${DB_BACKUP_COMMAND:-}" + +# Validate timeout +case "$MIGRATION_TIMEOUT_SECONDS" in + ''|*[!0-9]*) fail "MIGRATION_TIMEOUT_SECONDS must be a positive integer" ;; +esac +[ "$MIGRATION_TIMEOUT_SECONDS" -gt 0 ] || fail "MIGRATION_TIMEOUT_SECONDS must be > 0" + +# ---------- Repo resolution ---------- +# Use existing checkout if present (avoids duplicate clone from workflow). +# Clone only as fallback. 
+ +if [ -d "${REPO_ROOT}/.git" ]; then + log INFO "Using existing checkout at ${REPO_ROOT}" + cd "$REPO_ROOT" || fail "Cannot cd to ${REPO_ROOT}" + git fetch origin "$BRANCH_NAME" --depth 1 2>/dev/null || true + + if [ -n "$GIT_HASH" ]; then + git fetch origin "$GIT_HASH" --depth 1 2>/dev/null || true + git checkout "$GIT_HASH" || fail "Failed to checkout ${GIT_HASH}" + else + git checkout "origin/$BRANCH_NAME" || fail "Failed to checkout origin/${BRANCH_NAME}" + fi +elif [ -n "$REPO_URL" ]; then + log INFO "No existing checkout found at ${REPO_ROOT} — cloning ${BRANCH_NAME}" + mkdir -p "$(dirname "$REPO_ROOT")" + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" \ + || fail "Failed to clone repository" + cd "$REPO_ROOT" || fail "Failed to cd to ${REPO_ROOT}" + + if [ -n "$GIT_HASH" ]; then + git fetch origin "$GIT_HASH" --depth 1 || fail "Failed to fetch ${GIT_HASH}" + git checkout "$GIT_HASH" || fail "Failed to checkout ${GIT_HASH}" + fi +else + fail "REPO_ROOT/.git does not exist and REPO_URL is not set — cannot proceed" +fi + +resolved_sha="$(git rev-parse HEAD)" +log INFO "Working on ${resolved_sha} (branch=${BRANCH_NAME})" + +# ---------- Migration preflight ---------- + +log INFO "Running migration preflight checks" + +# 1. Confirm Prisma schema exists +PRISMA_SCHEMA="${REPO_ROOT}/packages/prisma/schema.prisma" +if [ ! -f "$PRISMA_SCHEMA" ]; then + fail "Prisma schema not found at ${PRISMA_SCHEMA}" +fi +log INFO "Prisma schema found: ${PRISMA_SCHEMA}" + +# 2. Confirm DATABASE_URL is non-empty (already checked by require_value, but confirm format) +if [ "${#DATABASE_URL}" -lt 10 ]; then + fail "DATABASE_URL appears to be invalid (too short)" +fi +log INFO "DATABASE_URL is set (redacted in logs)" + +# 3. 
Confirm correct git SHA is checked out +if [ -n "$GIT_HASH" ]; then + if [ "$resolved_sha" != "$GIT_HASH" ]; then + fail "Git SHA mismatch: expected=${GIT_HASH}, got=${resolved_sha}" + fi + log INFO "Git SHA verified: ${resolved_sha}" +else + log INFO "No GIT_HASH specified — using current HEAD: ${resolved_sha}" +fi + +# 4. Verify Prisma can read the schema (dry-run validation) +# This doesn't connect to DB — just validates the schema file is parseable. +# Skip if prisma is not installed yet (yarn install will fix this). +if command -v npx >/dev/null 2>&1 && [ -f "${REPO_ROOT}/package.json" ]; then + npx prisma validate --schema "$PRISMA_SCHEMA" >/dev/null 2>&1 \ + && log INFO "Prisma schema is valid" \ + || log WARN "Prisma schema validation had warnings — continuing anyway" +fi + +# ---------- Backup step ---------- + +if [ "$ENABLE_DB_BACKUP" = "true" ]; then + if [ -z "$DB_BACKUP_COMMAND" ]; then + fail "ENABLE_DB_BACKUP=true but DB_BACKUP_COMMAND is not set — cannot run backup" + fi + log INFO "Backup enabled — executing: ${DB_BACKUP_COMMAND}" + eval "$DB_BACKUP_COMMAND" || fail "Backup step failed — aborting migration. Database may be in an inconsistent state." + log INFO "Backup completed successfully" +else + log INFO "Backup step skipped (ENABLE_DB_BACKUP=false or not configured) — no backup taken" +fi + +# ---------- Install dependencies ---------- + +if [ "$SKIP_YARN_INSTALL" != "true" ]; then + if [ ! 
-d "${REPO_ROOT}/node_modules" ]; then + log INFO "Installing dependencies" + if command -v corepack >/dev/null 2>&1; then + corepack enable >/dev/null 2>&1 || log WARN "corepack enable failed; continuing with existing yarn" + fi + yarn install --immutable || yarn install || fail "Dependency install failed" + else + log INFO "node_modules already present — skipping install" + fi +fi + +# ---------- Capture pre-migration state ---------- + +log INFO "Capturing pre-migration state" +pre_migrate_migrations="$( + yarn prisma migrate status --json 2>/dev/null \ + | jq -r '[.migrations[]?.migrationName // empty] | if length > 0 then . else [] end' \ + || printf '[]' +)" +pre_count="$(printf '%s' "$pre_migrate_migrations" | jq -r 'length' 2>/dev/null || echo 0)" +log INFO "Pre-migration: ${pre_count} migrations already applied" + +# ---------- Migration execution with timeout ---------- + +log INFO "Running database migrations (timeout: ${MIGRATION_TIMEOUT_SECONDS}s)" + +# Run migration with explicit timeout using a background process + wait. +# This ensures a hung migration doesn't block the deployment indefinitely. +migration_start="$(date -u '+%s')" + +# Set advisory lock timeout at session level (Postgres-specific). +# If the DB doesn't support this, it silently succeeds — safe to run. +# 30-second advisory lock timeout prevents indefinite lock waits. +DB_SETUP_SQL="SET lock_timeout = '30s';" + +# Build the migration command with timeout wrapper. +# We use a subshell with `timeout` GNU utility (or bash $SECONDS as fallback). +run_migration() { + # If timeout binary is available, use it; otherwise rely on command_timeout from caller. + if command -v timeout >/dev/null 2>&1; then + timeout "${MIGRATION_TIMEOUT_SECONDS}s" yarn db-deploy \ + && return 0 \ + || return $? + else + # Fallback: run without timeout binary (caller's command_timeout is the safety net) + yarn db-deploy && return 0 || return $? 
+ fi +} + +# Run migration in a subshell so we can capture its exit code. +# shellcheck disable=SC2086 +( + # Set lock timeout before migration (harmless if DB doesn't support it). + # Note: db-deploy runs Prisma which opens its own DB connection, so we + # pass the lock_timeout via DATABASE_URL query params if supported, + # or rely on the session-level setting inside the migration SQL itself. + # Since we can't inject SQL before Prisma opens its connection directly, + # we document that lock_timeout is set at the RDS parameter group level + # as the primary mechanism. The DB_BACKUP_COMMAND step is the production + # safety net if a migration hangs. + run_migration +) > >(while IFS= read -r line; do log INFO "[db-deploy] $line"; done) 2>&1 +migration_exit=$? + +migration_end="$(date -u '+%s')" +migration_elapsed=$((migration_end - migration_start)) + +if [ "$migration_exit" -ne 0 ]; then + fail "Database migration failed (exit=${migration_exit}, elapsed=${migration_elapsed}s)" +fi + +log INFO "Migration completed in ${migration_elapsed}s" + +# ---------- Verify migration applied ---------- + +log INFO "Verifying Prisma migration state" +yarn prisma migrate status >/dev/null 2>&1 || fail "Prisma migration status verification failed — DB may be in inconsistent state" + +# ---------- Capture post-migration state ---------- + +log INFO "Capturing post-migration state" +post_migrate_migrations="$( + yarn prisma migrate status --json 2>/dev/null \ + | jq -r '[.migrations[]?.migrationName // empty] | if length > 0 then . 
else [] end' \ + || printf '[]' +)" +post_count="$(printf '%s' "$post_migrate_migrations" | jq -r 'length' 2>/dev/null || echo 0)" +log INFO "Post-migration: ${post_count} migrations applied" + +# Compute newly applied migrations: post - pre +new_migrations="$(jq -n \ + --argjson pre "$pre_migrate_migrations" \ + --argjson post "$post_migrate_migrations" \ + '$post - $pre')" + +new_count="$(printf '%s' "$new_migrations" | jq -r 'length' 2>/dev/null || echo 0)" +log INFO "New migrations applied this deployment: ${new_count}" + +if [ "$new_count" -gt 0 ]; then + migration_names="$(printf '%s' "$new_migrations" | jq -r '.[]' | tr '\n' ' ')" + log INFO "New migrations: ${migration_names}" +fi + +# ---------- Output for workflow capture ---------- + +# Always output MIGRATIONS_APPLIED_JSON as the canonical list of migrations +# applied in this deployment (new ones only, not the full history). +printf 'MIGRATIONS_APPLIED_JSON=%s\n' "$new_migrations" +printf 'MIGRATION_COUNT=%d\n' "$new_count" + +log INFO "Migration script complete for ${resolved_sha}" diff --git a/infra/scripts/ngx-utils.sh b/infra/scripts/ngx-utils.sh new file mode 100644 index 00000000000000..b0ca7fe1c18f7e --- /dev/null +++ b/infra/scripts/ngx-utils.sh @@ -0,0 +1,462 @@ +#!/usr/bin/env bash +# +# ngx-utils.sh - Shared NGINX and static asset helpers for Cal-ID deployment. +# +# Sourcing: source this file from promote-all.sh and revert-nginx.sh. +# Do NOT execute this file directly — it provides functions only. +# +# Canonical config: cal-id.conf is the single source of truth for active NGINX state. +# All routing (web, api, mcp) lives in cal-id.conf. Legacy configs (default.conf, +# connector.conf, mcp.conf) are cleaned up when cal-id.conf is promoted and restored +# from backup when cal-id.conf is reverted. +# +# Static assets: /var/www/cal-id-static/current → active build +# /var/www/cal-id-static/candidate → new staged build +# The active symlink is switched atomically after NGINX validation. 
+#
+# Shared env defaults:
+#   NGINX_CONF_DIR          /etc/nginx/conf.d
+#   NGINX_BACKUP_DIR        /tmp/cal-id-nginx-previous
+#   NGINX_COMBINED_CONF     cal-id.conf
+#   STATIC_ROOT             /var/www/cal-id-static
+#   STATIC_CURRENT_LINK     /var/www/cal-id-static/current
+#   STATIC_CANDIDATE_LINK   /var/www/cal-id-static/candidate
+#   STATIC_KEEP_RELEASES    2
+
+# ============================================================
+# Defaults (can be overridden by sourcing scripts)
+# ============================================================
+
+NGINX_CONF_DIR="${NGINX_CONF_DIR:-/etc/nginx/conf.d}"
+NGINX_BACKUP_DIR="${NGINX_BACKUP_DIR:-/tmp/cal-id-nginx-previous}"
+NGINX_COMBINED_CONF="${NGINX_COMBINED_CONF:-cal-id.conf}"
+STATIC_ROOT="${STATIC_ROOT:-/var/www/cal-id-static}"
+STATIC_CURRENT_LINK="${STATIC_CURRENT_LINK:-${STATIC_ROOT}/current}"
+STATIC_CANDIDATE_LINK="${STATIC_CANDIDATE_LINK:-${STATIC_ROOT}/candidate}"
+STATIC_KEEP_RELEASES="${STATIC_KEEP_RELEASES:-2}"
+
+# ============================================================
+# Logging helpers (all output goes to stderr)
+# ============================================================
+
+__ngx_log() { # __ngx_log LEVEL MESSAGE
+  printf '[ngx-utils][%s] %s\n' "$1" "$2" >&2
+}
+
+__ngx_fail() { # log MESSAGE at ERROR level, then `exit 1` — this exits the shell that sourced this file
+  __ngx_log ERROR "$1"
+  exit 1
+}
+
+# ============================================================
+# __ngx_require_cmd — fail (exit 1) unless the named binary is on PATH
+# ============================================================
+
+__ngx_require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || __ngx_fail "Required command not found: $1"
+}
+
+__ngx_first_local_port() { # $1: config file; prints the port from the first line containing 127.0.0.1:<port>, or nothing; never fails
+  sed -nE 's/.*127\.0\.0\.1:([0-9]+).*/\1/p' "$1" 2>/dev/null | head -1 | tr -d '[:space:]' || true
+}
+
+# ============================================================
+# ngx_backup_configs — Back up current NGINX configs before a promote or revert.
+#
+# Always backs up cal-id.conf if it exists (unified config).
+# Also backs up legacy standalone configs for pre-unified-promotion hosts.
+#
+# Idempotent: re-running backup overwrites the previous backup.
+# ============================================================ + +ngx_backup_configs() { + __ngx_log INFO "Backing up NGINX configs to ${NGINX_BACKUP_DIR}" + __ngx_require_cmd sudo + + sudo rm -rf "$NGINX_BACKUP_DIR" + sudo mkdir -p "$NGINX_BACKUP_DIR" \ + || __ngx_fail "Failed to create NGINX backup directory: $NGINX_BACKUP_DIR" + + local count=0 + + # Primary: cal-id.conf (unified config) + if [ -f "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}" ]; then + sudo cp -f "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}" "${NGINX_BACKUP_DIR}/${NGINX_COMBINED_CONF}" + __ngx_log INFO "Backed up ${NGINX_COMBINED_CONF}" + count=$((count + 1)) + fi + + # Legacy standalone configs (pre-unified-promotion hosts) + for conf in default.conf connector.conf mcp.conf; do + if [ -f "${NGINX_CONF_DIR}/${conf}" ]; then + sudo cp -f "${NGINX_CONF_DIR}/${conf}" "${NGINX_BACKUP_DIR}/${conf}" + __ngx_log INFO "Backed up legacy ${conf}" + count=$((count + 1)) + fi + done + + [ "$count" -gt 0 ] || __ngx_log WARN "No NGINX configs found to back up" + __ngx_log INFO "Backup complete: ${count} file(s)" +} + +# ============================================================ +# ngx_detect_active_ports — Detect current active web and API ports from cal-id.conf. +# +# Outputs: +# ngx_active_web_port — port for web (default: 3001) +# ngx_active_api_port — port for api (default: 4100) +# ngx_candidate_web_port — opposite of active web (for staging) +# ngx_candidate_api_port — opposite of active api (for staging) +# ============================================================ + +ngx_detect_active_ports() { + local src="" + + # Source of truth: cal-id.conf backup (pre-promotion snapshot is primary, + # since we're about to overwrite configs). Fall back to current cal-id.conf. 
+ for candidate in "${NGINX_BACKUP_DIR}/${NGINX_COMBINED_CONF}" "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}"; do + [ -f "$candidate" ] && src="$candidate" && break + done + + local raw_web_port raw_api_port + + if [ -n "$src" ]; then + # cal-id.conf: the first local upstream belongs to the web server block. + # API follows the same blue/green pair as web, so infer it from web. + # Do not use the second 127.0.0.1 proxy_pass: the web template has multiple + # local locations before the API server block. + raw_web_port="$(__ngx_first_local_port "$src")" + case "$raw_web_port" in + 3001) raw_api_port="4100" ;; + 3002) raw_api_port="4101" ;; + *) raw_api_port="" ;; + esac + __ngx_log INFO "Active ports from ${src}: web=${raw_web_port}, api=${raw_api_port}" + else + # Pre-unified-promotion host — detect from legacy configs + local legacy_conf="" + for candidate in "${NGINX_BACKUP_DIR}/default.conf" "${NGINX_CONF_DIR}/default.conf"; do + [ -f "$candidate" ] && legacy_conf="$candidate" && break + done + if [ -n "$legacy_conf" ]; then + raw_web_port="$(__ngx_first_local_port "$legacy_conf")" + fi + raw_api_port="" + for candidate in "${NGINX_BACKUP_DIR}/connector.conf" "${NGINX_CONF_DIR}/connector.conf"; do + if [ -z "$raw_api_port" ] && [ -f "$candidate" ]; then + raw_api_port="$(__ngx_first_local_port "$candidate")" + fi + done + __ngx_log INFO "Active ports from legacy configs: web=${raw_web_port:-3001}, api=${raw_api_port:-4100}" + fi + + ngx_active_web_port="${raw_web_port:-3001}" + ngx_active_api_port="${raw_api_port:-4100}" + + # Blue/green: candidate always uses the opposite port + case "$ngx_active_web_port" in + 3001) ngx_candidate_web_port="3002" ;; + 3002) ngx_candidate_web_port="3001" ;; + *) ngx_candidate_web_port="3002" ;; + esac + + case "$ngx_active_api_port" in + 4100) ngx_candidate_api_port="4101" ;; + 4101) ngx_candidate_api_port="4100" ;; + *) ngx_candidate_api_port="4101" ;; + esac + + export ngx_active_web_port ngx_active_api_port ngx_candidate_web_port 
ngx_candidate_api_port +} + +# ============================================================ +# ngx_switch_config — Switch active NGINX config to candidate file. +# +# Removes legacy configs, copies candidate to cal-id.conf, validates, reloads. +# +# Args: +# $1 — candidate config file path (full path, e.g. /tmp/candidate-cal-id.conf) +# +# On failure: restores from backup and exits 1. +# ============================================================ + +ngx_switch_config() { + local candidate_file="$1" + [ -f "$candidate_file" ] || __ngx_fail "Candidate config not found: $candidate_file" + + __ngx_log INFO "Switching active NGINX config to ${candidate_file}" + + # Remove legacy configs so cal-id.conf is the only active config + sudo rm -f "${NGINX_CONF_DIR}/default.conf" \ + "${NGINX_CONF_DIR}/connector.conf" \ + "${NGINX_CONF_DIR}/mcp.conf" 2>/dev/null || true + + # Deploy candidate as cal-id.conf (source of truth) + sudo cp -f "$candidate_file" "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}" \ + || __ngx_fail "Failed to copy candidate config to active directory" + + __ngx_log INFO "Validating NGINX config" + sudo nginx -t || { + __ngx_log ERROR "NGINX config validation failed — restoring previous config" + ngx_restore_config + __ngx_fail "NGINX config test failed; previous config restored" + } + + __ngx_log INFO "Reloading NGINX" + sudo nginx -s reload || __ngx_fail "NGINX reload failed" + + __ngx_log INFO "NGINX config switch complete: ${NGINX_COMBINED_CONF} is active" +} + +# ============================================================ +# ngx_switch_config_and_static — Switch config and static release together. +# +# Success path uses exactly one nginx -t and one nginx reload: +# 1. remove legacy configs and copy candidate as cal-id.conf +# 2. switch current static symlink to the staged Web build +# 3. nginx -t +# 4. nginx -s reload +# +# On failure: restores previous NGINX config and previous static symlink. 
+# +# Args: +# $1 — candidate config file path +# $2 — candidate static build path +# ============================================================ + +ngx_switch_config_and_static() { + local candidate_file="$1" + local candidate_static_path="$2" + + [ -f "$candidate_file" ] || __ngx_fail "Candidate config not found: $candidate_file" + [ -d "$candidate_static_path" ] || __ngx_fail "Candidate static path not found: $candidate_static_path" + + __ngx_log INFO "Switching active NGINX config and static assets together" + + sudo rm -f "${NGINX_CONF_DIR}/default.conf" \ + "${NGINX_CONF_DIR}/connector.conf" \ + "${NGINX_CONF_DIR}/mcp.conf" 2>/dev/null || true + + sudo cp -f "$candidate_file" "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}" \ + || { + ngx_restore_config + ngx_restore_static + __ngx_fail "Failed to copy candidate config to active directory" + } + + sudo ln -sfn "$candidate_static_path" "$STATIC_CURRENT_LINK" \ + || { + ngx_restore_config + ngx_restore_static + __ngx_fail "Failed to switch static symlink to ${candidate_static_path}" + } + __ngx_log INFO "Static symlink prepared: ${STATIC_CURRENT_LINK} → ${candidate_static_path}" + + __ngx_log INFO "Validating NGINX config" + sudo nginx -t || { + __ngx_log ERROR "NGINX config validation failed — restoring previous config and static symlink" + ngx_restore_config + ngx_restore_static + __ngx_fail "NGINX config test failed; previous config and static symlink restored" + } + + __ngx_log INFO "Reloading NGINX" + sudo nginx -s reload || { + __ngx_log ERROR "NGINX reload failed — restoring previous config and static symlink" + ngx_restore_config + ngx_restore_static + __ngx_fail "NGINX reload failed; previous config and static symlink restored" + } + + __ngx_log INFO "NGINX config and static switch complete: ${NGINX_COMBINED_CONF} is active" +} + +# ============================================================ +# ngx_restore_config — Restore previous NGINX configs from backup. 
+#
+# Restores cal-id.conf if backup exists (unified path), otherwise restores
+# legacy standalone configs. Cleans up post-restore state.
+#
+# Can be called standalone (e.g., after promote-all.sh failure) or from
+# revert-nginx.sh.
+#
+# Best-effort: copy, validation and reload problems are logged, not fatal,
+# so a caller in the middle of a failure path can still report its own error.
+# ============================================================
+
+ngx_restore_config() {
+  # Loop variable is local so the sourcing script's globals are not clobbered.
+  local conf
+  local restored=0
+
+  __ngx_log INFO "Restoring NGINX configs from ${NGINX_BACKUP_DIR}"
+  __ngx_require_cmd sudo
+
+  # Primary: restore cal-id.conf (unified config source of truth)
+  if [ -f "${NGINX_BACKUP_DIR}/${NGINX_COMBINED_CONF}" ]; then
+    __ngx_log INFO "Restoring ${NGINX_COMBINED_CONF} (unified config)"
+    sudo rm -f "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}"
+    # Only count the file as restored when the copy really succeeded.
+    if sudo cp -f "${NGINX_BACKUP_DIR}/${NGINX_COMBINED_CONF}" "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}"; then
+      restored=$((restored + 1))
+    else
+      __ngx_log ERROR "Failed to restore ${NGINX_COMBINED_CONF} from backup"
+    fi
+    # Remove legacy configs since unified config is now active
+    sudo rm -f "${NGINX_CONF_DIR}/default.conf" \
+      "${NGINX_CONF_DIR}/connector.conf" \
+      "${NGINX_CONF_DIR}/mcp.conf" 2>/dev/null || true
+  else
+    # Fallback: restore legacy standalone configs (pre-unified-promotion backup)
+    __ngx_log INFO "No cal-id.conf in backup; restoring legacy configs"
+    sudo rm -f "${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}"
+    for conf in default.conf connector.conf mcp.conf; do
+      if [ -f "${NGINX_BACKUP_DIR}/${conf}" ]; then
+        __ngx_log INFO "Restoring ${conf}"
+        if sudo cp -f "${NGINX_BACKUP_DIR}/${conf}" "${NGINX_CONF_DIR}/${conf}"; then
+          restored=$((restored + 1))
+        else
+          __ngx_log ERROR "Failed to restore ${conf} from backup"
+        fi
+      fi
+    done
+  fi
+
+  [ "$restored" -gt 0 ] || __ngx_log WARN "No backup NGINX configs found in ${NGINX_BACKUP_DIR}"
+
+  __ngx_log INFO "Validating restored NGINX config"
+  sudo nginx -t || __ngx_log WARN "Restored NGINX config validation failed"
+
+  __ngx_log INFO "Reloading NGINX with restored config"
+  sudo nginx -s reload || __ngx_log WARN "NGINX reload failed"
+
+  __ngx_log INFO "NGINX config restore complete"
+}
+
+# ============================================================
+# ngx_backup_static — Back up
current static symlink target.
+#
+# Stores the currently-active build directory path for use by
+# ngx_restore_static() on rollback. Call before promoting a candidate.
+#
+# Output: __ngx_prev_static_target (exported; empty when nothing is recorded)
+# ============================================================
+
+ngx_backup_static() {
+  __ngx_prev_static_target=""
+
+  if [ -L "$STATIC_CURRENT_LINK" ]; then
+    # readlink -f resolves to the final build directory, so the recorded
+    # value stays valid after the symlink itself is re-pointed.
+    __ngx_prev_static_target="$(readlink -f "$STATIC_CURRENT_LINK" 2>/dev/null || true)"
+    if [ -n "$__ngx_prev_static_target" ]; then
+      __ngx_log INFO "Static backup: current → ${__ngx_prev_static_target}"
+    else
+      __ngx_log WARN "Static backup: current symlink target is empty or unreadable"
+    fi
+  else
+    __ngx_log WARN "Static backup: no current symlink found"
+  fi
+
+  export __ngx_prev_static_target
+}
+
+# ============================================================
+# ngx_switch_static — Switch active static symlink to candidate build.
+#
+# Args:
+#   $1 — candidate static build path (must exist as a directory)
+#
+# Returns 1 (without exiting) when $1 is not a directory.
+# On symlink failure: tries to restore the previous symlink target recorded
+# by ngx_backup_static, then exits 1.
+# ============================================================
+
+ngx_switch_static() {
+  local candidate_path="${1:-}"
+  [ -d "$candidate_path" ] || { __ngx_log WARN "Static switch: candidate path not a directory: ${candidate_path}"; return 1; }
+
+  __ngx_log INFO "Switching static assets to ${candidate_path}"
+  if ! sudo ln -sfn "$candidate_path" "$STATIC_CURRENT_LINK"; then
+    # Restore only when a previous target was recorded: without one,
+    # ngx_restore_static would fall back to guessing a build directory.
+    if [ -n "${__ngx_prev_static_target:-}" ]; then
+      __ngx_log ERROR "Static switch failed — restoring previous symlink target"
+      # Subshell: a failure inside the restore must not pre-empt the
+      # error reported below.
+      ( ngx_restore_static ) || __ngx_log WARN "Previous static symlink could not be restored"
+    fi
+    __ngx_fail "Failed to switch static symlink to ${candidate_path}"
+  fi
+  __ngx_log INFO "Static symlink switched: ${STATIC_CURRENT_LINK} → ${candidate_path}"
+}
+
+# ============================================================
+# ngx_restore_static — Restore previous static symlink target.
+#
+# Restores the symlink that was active before the last promote.
+# Can be called standalone or as part of a config revert.
+#
+# If no previous target is recorded, falls back to restoring from backup
+# directory scan (if only one build remains).
+# ============================================================ + +ngx_restore_static() { + __ngx_log INFO "Restoring previous static symlink target" + + if [ -n "${__ngx_prev_static_target:-}" ] && [ -d "$__ngx_prev_static_target" ]; then + __ngx_log INFO "Restoring to previous target: ${__ngx_prev_static_target}" + sudo ln -sfn "$__ngx_prev_static_target" "$STATIC_CURRENT_LINK" \ + || __ngx_fail "Failed to restore static symlink to ${__ngx_prev_static_target}" + __ngx_log INFO "Static symlink restored to ${__ngx_prev_static_target}" + else + # No recorded previous target — attempt recovery from backup directory + __ngx_log WARN "No previous static target recorded — attempting recovery" + if [ -d "$STATIC_ROOT" ]; then + local candidate_target + candidate_target="$(readlink -f "$STATIC_CANDIDATE_LINK" 2>/dev/null || true)" + if [ -d "$candidate_target" ]; then + __ngx_log INFO "Restoring to candidate: ${candidate_target}" + sudo ln -sfn "$candidate_target" "$STATIC_CURRENT_LINK" + else + # Last resort: find the most recent build directory + local fallback + fallback="$(ls -dt "${STATIC_ROOT}"/build-* 2>/dev/null | head -1 || true)" + if [ -n "$fallback" ] && [ -d "$fallback" ]; then + __ngx_log INFO "Restoring to fallback: ${fallback}" + sudo ln -sfn "$fallback" "$STATIC_CURRENT_LINK" + else + __ngx_log ERROR "No static build directories found — cannot restore symlink" + fi + fi + fi + fi +} + +# ============================================================ +# ngx_cleanup_old_builds — Remove old static build directories. +# +# Keeps the most recent N builds (STATIC_KEEP_RELEASES) plus any referenced +# by the current symlink. Safe to call after a successful promote. 
+# ============================================================ + +ngx_cleanup_old_builds() { + local keep="${STATIC_KEEP_RELEASES:-2}" + case "$keep" in + ''|*[!0-9]*) __ngx_log WARN "STATIC_KEEP_RELEASES must be numeric — skipping cleanup"; return 0 ;; + esac + [ "$keep" -gt 0 ] || { __ngx_log WARN "STATIC_KEEP_RELEASES must be > 0 — skipping cleanup"; return 0; } + [ -d "$STATIC_ROOT" ] || { __ngx_log INFO "Static root not found — skipping cleanup"; return 0; } + + local active_build + active_build="$(basename "$(readlink -f "$STATIC_CURRENT_LINK" 2>/dev/null || true)" || true)" + + local keep_count=$((keep - 1)) # keep (N-1) non-active builds + __ngx_log INFO "Cleaning old static builds; keeping ${keep} release(s) (active=${active_build:-none})" + + # Only remove builds if we have more than keep total + local total_builds + total_builds="$(ls -d "${STATIC_ROOT}"/build-* 2>/dev/null | wc -l | tr -d '[:space:]' || echo 0)" + if [ "${total_builds:-0}" -le "$keep" ]; then + __ngx_log INFO "Build count (${total_builds}) <= keep (${keep}) — skipping cleanup" + return 0 + fi + + ( + cd "$STATIC_ROOT" || return 0 + ls -dt build-* 2>/dev/null \ + | grep -Fxv "$active_build" \ + | tail -n +$((keep_count + 1)) \ + | while IFS= read -r old_build; do + [ -n "$old_build" ] || continue + __ngx_log INFO "Removing old static build: ${old_build}" + sudo rm -rf "$old_build" 2>/dev/null || true + done || true + ) +} + +# ============================================================ +# ngx_stop_candidates — Stop candidate web/API containers. +# +# Safe to call on both successful promote and failed revert. 
+# ============================================================
+
+# Force-removes the containers named web-candidate and api-candidate.
+# Errors from 'docker rm' (e.g. container absent) are ignored.
+ngx_stop_candidates() {
+  __ngx_require_cmd docker
+  __ngx_log INFO "Stopping candidate containers (web-candidate, api-candidate)"
+  docker rm -f web-candidate api-candidate >/dev/null 2>&1 || true
+  __ngx_log INFO "Candidate containers stopped"
+}
diff --git a/infra/scripts/preflight-space.sh b/infra/scripts/preflight-space.sh
new file mode 100644
index 00000000000000..078f754394b746
--- /dev/null
+++ b/infra/scripts/preflight-space.sh
@@ -0,0 +1,298 @@
+#!/usr/bin/env bash
+#
+# preflight-space.sh - Disk-space guard for host-side deployment staging.
+#
+# Source this script from staging scripts and call ensure_deployment_space before
+# image pulls or container creation. Cleanup is intentionally conservative:
+# it protects containers currently routed by NGINX, active workers, running
+# candidates, and containers/images referenced by deployment state when readable.
+
+# NOTE: because this file is sourced, these options also apply to the
+# sourcing script from this point on.
+set -euo pipefail
+
+# Tunables; values already set by the sourcing script take precedence.
+SPACE_MIN_FREE_GB="${SPACE_MIN_FREE_GB:-10}"
+SPACE_CHECK_PATH="${SPACE_CHECK_PATH:-/}"
+SPACE_CLEANUP_UNTIL="${SPACE_CLEANUP_UNTIL:-72h}"
+STATIC_ROOT="${STATIC_ROOT:-/var/www/cal-id-static}"
+STATIC_CURRENT_LINK="${STATIC_CURRENT_LINK:-${STATIC_ROOT}/current}"
+STATIC_CANDIDATE_LINK="${STATIC_CANDIDATE_LINK:-${STATIC_ROOT}/candidate}"
+STATIC_KEEP_RELEASES="${STATIC_KEEP_RELEASES:-2}"
+NGINX_CONF_DIR="${NGINX_CONF_DIR:-/etc/nginx/conf.d}"
+NGINX_COMBINED_CONF="${NGINX_COMBINED_CONF:-cal-id.conf}"
+
+# Write one tagged log line to stderr. $1 — level, $2 — message.
+space_log() {
+  printf '[preflight-space][%s] %s\n' "$1" "$2" >&2
+}
+
+# Log an ERROR and exit 1 (terminates the sourcing script).
+space_fail() {
+  space_log ERROR "$1"
+  exit 1
+}
+
+# Exit 1 unless command $1 resolves via 'command -v'.
+space_require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || space_fail "Required command not found: $1"
+}
+
+# Print the available space at path $1 as a whole number: the 4th column of
+# 'df -Pk' (1 KiB blocks) divided by 1048576 and truncated. Prints nothing
+# when df produces no second line.
+space_free_gb() {
+  local path="$1"
+  df -Pk "$path" 2>/dev/null | awk 'NR==2 { printf "%d", int($4 / 1048576) }'
+}
+
+# Print the port from the first "127.0.0.1:<digits>" match in file $1
+# (same extraction as __ngx_first_local_port in ngx-utils.sh).
+space_first_local_port() {
+  sed -nE 's/.*127\.0\.0\.1:([0-9]+).*/\1/p' "$1" 2>/dev/null | head -1 | tr -d '[:space:]' || true
+}
+
+# Print the name of every running container whose 'docker port' output has a
+# line ending in ":<port>". No output when $1 is empty.
+space_port_container_names() {
+  local port="$1"
+  [ -n "$port" ] || return 0
+
+  docker ps --format '{{.Names}}' 2>/dev/null | while IFS= read -r container; do
+    [ -n "$container" ] || continue
+    # The trailing ':${port}$' alternative already covers the host-qualified forms.
+    if docker port "$container" 2>/dev/null | grep -Eq "(0\.0\.0\.0|127\.0\.0\.1|::):${port}$|:${port}$"; then
+      printf '%s\n' "$container"
+    fi
+  done
+}
+
+# Append container name $1 to SPACE_PROTECTED_CONTAINERS unless it is empty
+# or already present.
+space_add_unique() {
+  local value="$1"
+  local existing
+  [ -n "$value" ] || return 0
+  for existing in "${SPACE_PROTECTED_CONTAINERS[@]:-}"; do
+    [ "$existing" = "$value" ] && return 0
+  done
+  SPACE_PROTECTED_CONTAINERS+=("$value")
+}
+
+# Append image reference $1 to SPACE_PROTECTED_IMAGES unless it is empty or
+# already present.
+space_protect_image_unique() {
+  local value="$1"
+  local existing
+  [ -n "$value" ] || return 0
+  for existing in "${SPACE_PROTECTED_IMAGES[@]:-}"; do
+    [ "$existing" = "$value" ] && return 0
+  done
+  SPACE_PROTECTED_IMAGES+=("$value")
+}
+
+# Return 0 when container name $1 is listed in SPACE_PROTECTED_CONTAINERS,
+# 1 otherwise.
+space_container_is_protected() {
+  local container="$1"
+  local protected
+  for protected in "${SPACE_PROTECTED_CONTAINERS[@]:-}"; do
+    [ "$protected" = "$container" ] && return 0
+  done
+  return 1
+}
+
+# Protect the containers publishing the ports the live NGINX config routes to.
+# With cal-id.conf the API port is inferred from the web port
+# (3001 -> 4100, 3002 -> 4101); otherwise the legacy default.conf and
+# connector.conf are read individually.
+space_collect_nginx_protected_containers() {
+  local nginx_conf="${NGINX_CONF_DIR}/${NGINX_COMBINED_CONF}"
+  local web_port="" api_port=""
+
+  if [ -f "$nginx_conf" ]; then
+    web_port="$(space_first_local_port "$nginx_conf")"
+    case "$web_port" in
+      3001) api_port="4100" ;;
+      3002) api_port="4101" ;;
+    esac
+  else
+    [ -f "${NGINX_CONF_DIR}/default.conf" ] && web_port="$(space_first_local_port "${NGINX_CONF_DIR}/default.conf")"
+    [ -f "${NGINX_CONF_DIR}/connector.conf" ] && api_port="$(space_first_local_port "${NGINX_CONF_DIR}/connector.conf")"
+  fi
+
+  if [ -n "$web_port" ]; then
+    space_log INFO "Protecting containers routed by active Web port ${web_port}"
+    while IFS= read -r container; do space_add_unique "$container"; done < <(space_port_container_names "$web_port")
+  else
+    space_log WARN "No active Web port detected from NGINX config"
+  fi
+
+  if [ -n "$api_port" ]; then
+    space_log INFO "Protecting containers routed by active API port ${api_port}"
+    while IFS= read -r container; do space_add_unique "$container"; done < <(space_port_container_names "$api_port")
+  else
+    space_log WARN "No active API port detected from NGINX config"
+  fi
+}
+
+# Protect every running container named worker-*, web-candidate or
+# api-candidate.
+space_collect_worker_and_candidate_protections() {
+  local container
+  while IFS= read -r container; do
+    case "$container" in
+      worker-*|web-candidate|api-candidate)
+        space_add_unique "$container"
+        ;;
+    esac
+  done < <(docker ps --format '{{.Names}}' 2>/dev/null)
+}
+
+# Load the web/api/worker image references from deployments/current.json in
+# S3 into SPACE_PROTECTED_IMAGES. Silently does nothing when DEPLOY_ENV /
+# ENVIRONMENT is unset, when aws or jq is missing, or when
+# deploy-state-paths.sh is absent or its config check fails.
+# NOTE(review): require_deploy_state_config and get_deploy_state_key come from
+# deploy-state-paths.sh, which is not visible here; the key is presumably a
+# full s3:// URI since it is passed straight to 'aws s3 cp' — confirm.
+space_collect_state_image_protections() {
+  [ -n "${DEPLOY_ENV:-${ENVIRONMENT:-}}" ] || return 0
+  command -v aws >/dev/null 2>&1 || return 0
+  command -v jq >/dev/null 2>&1 || return 0
+
+  local script_dir
+  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  [ -f "${script_dir}/deploy-state-paths.sh" ] || return 0
+
+  # shellcheck source=deploy-state-paths.sh
+  source "${script_dir}/deploy-state-paths.sh"
+  require_deploy_state_config >/dev/null 2>&1 || return 0
+
+  local state_current tmp_current image
+  state_current="$(get_deploy_state_key "deployments/current.json")"
+  tmp_current="$(mktemp)"
+  if aws s3 cp "$state_current" "$tmp_current" >/dev/null 2>&1; then
+    # Unquoted expansion: each whitespace-separated image reference is one item.
+    for image in $(jq -r '.services.web // empty, .services.api // empty, .services.worker // empty' "$tmp_current"); do
+      space_protect_image_unique "$image"
+    done
+    space_log INFO "Deployment state image protections loaded from ${state_current}"
+  else
+    space_log WARN "Could not read deployment state current.json; continuing with NGINX/container protections"
+  fi
+  rm -f "$tmp_current"
+}
+
+# Protect running containers whose configured image (.Config.Image) exactly
+# equals one of SPACE_PROTECTED_IMAGES.
+space_collect_state_image_containers() {
+  local protected_image container image
+  [ "${#SPACE_PROTECTED_IMAGES[@]}" -gt 0 ] || return 0
+
+  while IFS= read -r container; do
+    [ -n "$container" ] || continue
+    image="$(docker inspect -f '{{.Config.Image}}' "$container" 2>/dev/null || true)"
+    [ -n "$image" ] || continue
+    for protected_image in "${SPACE_PROTECTED_IMAGES[@]}"; do
+      if [ "$image" = "$protected_image" ]; then
+        space_log INFO "Protecting container ${container} from deployment state image ${protected_image}"
+        space_add_unique "$container"
+      fi
+    done
+  done < <(docker ps --format '{{.Names}}' 2>/dev/null)
+}
+
+# Reset both protection lists and rebuild them from NGINX routing, running
+# workers/candidates and (when readable) deployment state; logs the result.
+space_collect_protections() {
+  SPACE_PROTECTED_CONTAINERS=()
+  SPACE_PROTECTED_IMAGES=()
+  space_collect_nginx_protected_containers
+  space_collect_worker_and_candidate_protections
+  space_collect_state_image_protections
+  space_collect_state_image_containers
+
+  if [ "${#SPACE_PROTECTED_CONTAINERS[@]}" -gt 0 ]; then
+    space_log INFO "Protected containers: ${SPACE_PROTECTED_CONTAINERS[*]}"
+  else
+    space_log WARN "No protected containers detected"
+  fi
+
+  if [ "${#SPACE_PROTECTED_IMAGES[@]}" -gt 0 ]; then
+    space_log INFO "Protected images from deployment state: ${SPACE_PROTECTED_IMAGES[*]}"
+  fi
+}
+
+# Remove containers in state exited/created/dead that are not protected.
+# Containers in any other state are left alone.
+space_remove_stale_containers() {
+  local container state
+  while IFS= read -r container; do
+    [ -n "$container" ] || continue
+    state="$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || printf 'unknown')"
+    if space_container_is_protected "$container"; then
+      space_log INFO "Preserve protected container: ${container} (${state})"
+      continue
+    fi
+
+    case "$state" in
+      exited|created|dead)
+        space_log INFO "Remove stale container: ${container} (${state})"
+        docker rm "$container" >/dev/null 2>&1 || space_log WARN "Failed to remove stale container: ${container}"
+        ;;
+      *)
+        space_log INFO "Preserve running/non-stale container: ${container} (${state})"
+        ;;
+    esac
+  done < <(docker ps -a --format '{{.Names}}' 2>/dev/null)
+}
+
+# Delete old build-* directories under STATIC_ROOT. The builds referenced by
+# the current and candidate symlinks are never deleted; of the remaining
+# builds the newest (STATIC_KEEP_RELEASES - 1) by mtime are kept. Does nothing
+# when the total build count is already <= STATIC_KEEP_RELEASES.
+space_cleanup_static_builds() {
+  [ -d "$STATIC_ROOT" ] || return 0
+
+  local keep="$STATIC_KEEP_RELEASES"
+  case "$keep" in
+    ''|*[!0-9]*) space_log WARN "STATIC_KEEP_RELEASES must be numeric; skipping static cleanup"; return 0 ;;
+  esac
+  [ "$keep" -gt 0 ] || { space_log WARN "STATIC_KEEP_RELEASES must be > 0; skipping static cleanup"; return 0; }
+
+  local active candidate total keep_extra active_name candidate_name
+  active="$(readlink -f "$STATIC_CURRENT_LINK" 2>/dev/null || true)"
+  candidate="$(readlink -f "$STATIC_CANDIDATE_LINK" 2>/dev/null || true)"
+  active_name="$(basename "$active" 2>/dev/null || true)"
+  candidate_name="$(basename "$candidate" 2>/dev/null || true)"
+  total="$(find "$STATIC_ROOT" -maxdepth 1 -type d -name 'build-*' 2>/dev/null | wc -l | tr -d '[:space:]')"
+
+  if [ "${total:-0}" -le "$keep" ]; then
+    space_log INFO "Preserve static builds: count=${total}, keep=${keep}"
+    return 0
+  fi
+
+  keep_extra=$((keep - 1))
+  space_log INFO "Cleaning static builds; preserve active=${active_name:-none}, candidate=${candidate_name:-none}, keep=${keep}"
+  # Subshell so the 'cd' does not leak into the sourcing script.
+  (
+    cd "$STATIC_ROOT" || exit 0
+    ls -dt build-* 2>/dev/null \
+      | grep -Fxv "$active_name" \
+      | grep -Fxv "$candidate_name" \
+      | tail -n +"$((keep_extra + 1))" \
+      | while IFS= read -r old_build; do
+          [ -n "$old_build" ] || continue
+          space_log INFO "Remove stale static build: ${old_build}"
+          rm -rf "$old_build" 2>/dev/null || sudo rm -rf "$old_build" 2>/dev/null || space_log WARN "Failed to remove static build: ${old_build}"
+        done || true
+  )
+}
+
+# Cleanup sequence run when free space is below the threshold: collect
+# protections, remove stale containers, prune images / build cache / volumes,
+# then trim static builds.
+# NOTE(review): SPACE_PROTECTED_IMAGES is only used to protect containers
+# (space_collect_state_image_containers); it is not passed to
+# 'docker image prune -a' below. An image referenced by deployment state that
+# no container uses therefore looks prunable here — confirm this is intended.
+space_run_controlled_cleanup() {
+  space_log WARN "Free space below ${SPACE_MIN_FREE_GB}GB; running controlled cleanup"
+  space_collect_protections
+  space_remove_stale_containers
+
+  space_log INFO "Prune unused Docker images older than ${SPACE_CLEANUP_UNTIL}"
+  docker image prune -af --filter "until=${SPACE_CLEANUP_UNTIL}" >/dev/null 2>&1 \
+    || space_log WARN "Docker image prune reported a warning"
+
+  space_log INFO "Prune unused Docker build cache older than ${SPACE_CLEANUP_UNTIL}"
+  docker builder prune -af --filter "until=${SPACE_CLEANUP_UNTIL}" >/dev/null 2>&1 \
+    || space_log WARN "Docker builder prune reported a warning"
+
+  space_log INFO "Prune unused Docker volumes"
+  docker volume prune -f >/dev/null 2>&1 \
+    || space_log WARN "Docker volume prune reported a warning"
+
+  space_cleanup_static_builds
+}
+
+# Entry point. Returns 0 when at least SPACE_MIN_FREE_GB is available at
+# SPACE_CHECK_PATH, running the controlled cleanup once if it is not.
+# Exits 1 when SPACE_MIN_FREE_GB is not a positive integer, when free space
+# cannot be determined, or when space is still insufficient after cleanup.
+ensure_deployment_space() {
+  space_require_cmd df
+  space_require_cmd docker
+
+  case "$SPACE_MIN_FREE_GB" in
+    ''|*[!0-9]*) space_fail "SPACE_MIN_FREE_GB must be a positive integer" ;;
+  esac
+  [ "$SPACE_MIN_FREE_GB" -gt 0 ] || space_fail "SPACE_MIN_FREE_GB must be greater than zero"
+
+  local before after
+  before="$(space_free_gb "$SPACE_CHECK_PATH")"
+  [ -n "$before" ] || space_fail "Unable to determine free disk space for ${SPACE_CHECK_PATH}"
+
+  space_log INFO "Available disk space before staging: ${before}GB at ${SPACE_CHECK_PATH}; required=${SPACE_MIN_FREE_GB}GB"
+  if [ "$before" -ge "$SPACE_MIN_FREE_GB" ]; then
+    space_log INFO "Disk space check passed; cleanup not required"
+    return 0
+  fi
+
+  space_run_controlled_cleanup
+
+  after="$(space_free_gb "$SPACE_CHECK_PATH")"
+  [ -n "$after" ] || space_fail "Unable to determine free disk space after cleanup for ${SPACE_CHECK_PATH}"
+  space_log INFO "Available disk space after controlled cleanup: ${after}GB at ${SPACE_CHECK_PATH}"
+
+  if [ "$after" -lt "$SPACE_MIN_FREE_GB" ]; then
+    space_fail "Insufficient disk space after controlled cleanup: ${after}GB available, ${SPACE_MIN_FREE_GB}GB required. Aborting before image pull/container staging."
+  fi
+
+  space_log INFO "Disk space check passed after controlled cleanup"
+}
diff --git a/infra/scripts/promote-all.sh b/infra/scripts/promote-all.sh
new file mode 100755
index 00000000000000..60a0485d431f0b
--- /dev/null
+++ b/infra/scripts/promote-all.sh
@@ -0,0 +1,226 @@
+#!/usr/bin/env bash
+#
+# promote-all.sh - Promote staged web/API candidates with one NGINX test and reload.
+#
+# Shared NGINX and static helpers are provided by ngx-utils.sh. This script:
+#   1. Validates candidates are healthy
+#   2. Backups current config and static symlink
+#   3. Generates combined cal-id.conf from templates
+#   4. Switches NGINX config (validates + reloads) and static symlink atomically
+#   5. Cleans up old static builds on success
+#   6.
Restores previous config + static symlink on any failure
+#
+# Required:
+#   DOMAIN_NAME
+#   HOMEPAGE_URL
+#
+# Optional:
+#   NGINX_CONF_DIR           default: /etc/nginx/conf.d
+#   NGINX_BACKUP_DIR         default: /tmp/cal-id-nginx-previous
+#   NGINX_COMBINED_CONF      default: cal-id.conf (single source of truth)
+#   WEB_HEALTH_PATH          default: /api/health
+#   API_HEALTH_PATH          default: /health
+#   WEB_CANDIDATE_PORT       default: opposite of the detected active web port
+#   API_CANDIDATE_PORT       default: opposite of the detected active API port
+#   WEB_TEMPLATE_PATH        default: /home/onehash/onehash-cal/infra/docker/web/nginx.template.conf
+#   CONNECTOR_TEMPLATE_PATH  default: /home/onehash/onehash-cal/infra/docker/connector/nginx.connector.template.conf
+#   MCP_TEMPLATE_PATH        default: /home/onehash/onehash-cal/infra/docker/connector/nginx.mcp.template.conf
+#   STATIC_ROOT              default: /var/www/cal-id-static
+#   STATIC_KEEP_RELEASES     default: 2
+#   CERTBOT_DOMAIN           default: DOMAIN_NAME
+#   CERTBOT_EMAIL            required only when cert is missing/near expiry
+#   CERT_RENEW_DAYS          default: 7
+
+set -euo pipefail
+
+# Write one tagged log line to stderr. $1 — level, $2 — message.
+log() {
+  printf '[promote-all][%s] %s\n' "$1" "$2" >&2
+}
+
+# Log an ERROR and exit 1.
+fail() {
+  log ERROR "$1"
+  exit 1
+}
+
+# Exit 1 unless command $1 resolves via 'command -v'.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1"
+}
+
+# Exit 1 unless the environment variable NAMED by $1 is set and non-empty.
+require_value() {
+  local name="$1"
+  local value="${!name:-}"
+  [ -n "$value" ] || fail "Required environment variable is missing: $name"
+}
+
+# Load shared NGINX and static helpers
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=ngx-utils.sh
+source "${script_dir}/ngx-utils.sh"
+
+# ============================================================
+# Local: certbot / certificate helpers (promote-all specific)
+# ============================================================
+
+# Return 0 when file $1 exists. Uses sudo, like the openssl read below, so the
+# result does not depend on the deploy user being able to see the file.
+cert_file_exists() {
+  sudo test -f "$1"
+}
+
+# Print the certbot "Certificate Name" of the first certificate whose Domains
+# line lists $1 or *.$1. Prints nothing (and still returns 0) when certbot
+# fails or no certificate matches, so 'set -e' callers are not aborted.
+find_certbot_lineage() {
+  local target="$1"
+  sudo certbot certificates 2>/dev/null | awk -v target="$target" '
+    /Certificate Name:/ { name=$3 }
+    /Domains:/ {
+      for (i=2; i<=NF; i++) {
+        if ($i == target || $i == ("*." target)) {
+          print name
+          exit
+        }
+      }
+    }
+  ' || true
+}
+
+# Ensure a certificate for CERTBOT_DOMAIN (default DOMAIN_NAME) exists and has
+# at least CERT_RENEW_DAYS days left, issuing or renewing it via certbot
+# (dns-multi) otherwise. Sets CERTBOT_LIVE_NAME to the directory name under
+# /etc/letsencrypt/live that holds the certificate. Exits 1 on failure.
+ensure_certificate() {
+  local cert_domain="${CERTBOT_DOMAIN:-$DOMAIN_NAME}"
+  local cert_file="/etc/letsencrypt/live/${cert_domain}/fullchain.pem"
+  local renew_days="${CERT_RENEW_DAYS:-7}"
+  local needs_cert=false
+  local lineage
+  CERTBOT_LIVE_NAME="$cert_domain"
+
+  # A non-numeric value would make the '-lt' test below error out, and the
+  # certificate would then be silently treated as valid.
+  case "$renew_days" in
+    ''|*[!0-9]*) fail "CERT_RENEW_DAYS must be a non-negative integer" ;;
+  esac
+
+  # The certificate may live under a different name (e.g. "example.com-0001");
+  # ask certbot which certificate covers the domain.
+  if ! cert_file_exists "$cert_file" && command -v certbot >/dev/null 2>&1; then
+    lineage="$(find_certbot_lineage "$cert_domain")"
+    if [ -n "$lineage" ] && cert_file_exists "/etc/letsencrypt/live/${lineage}/fullchain.pem"; then
+      CERTBOT_LIVE_NAME="$lineage"
+      cert_file="/etc/letsencrypt/live/${lineage}/fullchain.pem"
+    fi
+  fi
+
+  if ! cert_file_exists "$cert_file"; then
+    needs_cert=true
+  else
+    local cert_end_date cert_end_epoch now_epoch days_left
+    cert_end_date="$(sudo openssl x509 -enddate -noout -in "$cert_file" 2>/dev/null | cut -d= -f2- || true)"
+    cert_end_epoch="$(date -d "$cert_end_date" +%s 2>/dev/null || true)"
+    now_epoch="$(date +%s)"
+    if [ -z "$cert_end_epoch" ]; then
+      # Unreadable or unparsable expiry: treat as needing a new certificate.
+      needs_cert=true
+    else
+      days_left=$(( (cert_end_epoch - now_epoch) / 86400 ))
+      if [ "$days_left" -lt "$renew_days" ]; then
+        log INFO "Certificate for ${cert_domain} expires in ${days_left} days; renewing"
+        needs_cert=true
+      else
+        log INFO "Certificate for ${cert_domain} is valid for ${days_left} more days"
+      fi
+    fi
+  fi
+
+  if [ "$needs_cert" = "true" ]; then
+    require_cmd certbot
+    require_value CERTBOT_EMAIL
+    log INFO "Generating or renewing certificate for ${cert_domain}"
+    timeout 300s sudo certbot certonly \
+      -a dns-multi \
+      --dns-multi-credentials=/etc/letsencrypt/dns-multi.ini \
+      -d "*.${cert_domain}" \
+      -d "$cert_domain" \
+      -m "$CERTBOT_EMAIL" \
+      --non-interactive \
+      --agree-tos \
+      || fail "Certbot certificate generation failed for ${cert_domain}"
+
+    # certbot may have created the certificate under a suffixed name.
+    if ! cert_file_exists "/etc/letsencrypt/live/${CERTBOT_LIVE_NAME}/fullchain.pem"; then
+      lineage="$(find_certbot_lineage "$cert_domain")"
+      if [ -n "$lineage" ]; then
+        CERTBOT_LIVE_NAME="$lineage"
+      fi
+    fi
+  fi
+
+  cert_file_exists "/etc/letsencrypt/live/${CERTBOT_LIVE_NAME}/fullchain.pem" \
+    || fail "Certificate file not found after certbot check: /etc/letsencrypt/live/${CERTBOT_LIVE_NAME}/fullchain.pem"
+}
+
+# Escape $1 for use as the replacement part of a sed 's|…|…|' command
+# (backslash, '&' and the '|' delimiter are special there).
+sed_escape_replacement() {
+  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
+}
+
+# Escape $1 for literal use inside a basic regular expression.
+sed_escape_pattern() {
+  printf '%s' "$1" | sed -e 's/[][\\.*^$]/\\&/g'
+}
+
+# Render NGINX template $1 to stdout, substituting PORT_PLACEHOLDER with $2,
+# DOMAIN_PLACEHOLDER with DOMAIN_NAME, HOMEPAGE_PLACEHOLDER with HOMEPAGE_URL,
+# and rewriting the letsencrypt live path to CERTBOT_LIVE_NAME.
+replace_placeholders() {
+  local template="$1"
+  local port="$2"
+  local port_repl domain_repl domain_pat live_repl homepage_repl
+
+  # Values are escaped so characters such as '&' in HOMEPAGE_URL or '.' in the
+  # domain are inserted/matched literally.
+  port_repl="$(sed_escape_replacement "$port")"
+  domain_repl="$(sed_escape_replacement "$DOMAIN_NAME")"
+  domain_pat="$(sed_escape_pattern "$DOMAIN_NAME")"
+  live_repl="$(sed_escape_replacement "${CERTBOT_LIVE_NAME:-$DOMAIN_NAME}")"
+  homepage_repl="$(sed_escape_replacement "$HOMEPAGE_URL")"
+
+  sed \
+    -e "s|PORT_PLACEHOLDER|${port_repl}|g" \
+    -e "s|DOMAIN_PLACEHOLDER|${domain_repl}|g" \
+    -e "s|/etc/letsencrypt/live/${domain_pat}/|/etc/letsencrypt/live/${live_repl}/|g" \
+    -e "s|HOMEPAGE_PLACEHOLDER|${homepage_repl}|g" \
+    "$template"
+}
+
+# ============================================================
+# Main execution
+# ============================================================
+
+require_cmd curl
+require_cmd docker
+require_cmd sed
+require_value DOMAIN_NAME
+require_value HOMEPAGE_URL
+
+WEB_TEMPLATE_PATH="${WEB_TEMPLATE_PATH:-/home/onehash/onehash-cal/infra/docker/web/nginx.template.conf}"
+CONNECTOR_TEMPLATE_PATH="${CONNECTOR_TEMPLATE_PATH:-/home/onehash/onehash-cal/infra/docker/connector/nginx.connector.template.conf}"
+MCP_TEMPLATE_PATH="${MCP_TEMPLATE_PATH:-/home/onehash/onehash-cal/infra/docker/connector/nginx.mcp.template.conf}"
+WEB_HEALTH_PATH="${WEB_HEALTH_PATH:-/api/health}"
+API_HEALTH_PATH="${API_HEALTH_PATH:-/health}"
+
+[ -f "$WEB_TEMPLATE_PATH" ] || fail "Web NGINX template not found: $WEB_TEMPLATE_PATH"
+[ -f "$CONNECTOR_TEMPLATE_PATH" ] || fail "Connector NGINX template not found: $CONNECTOR_TEMPLATE_PATH"
+[ -f "$MCP_TEMPLATE_PATH" ] || fail "MCP NGINX template not found: $MCP_TEMPLATE_PATH"
+
+# Ensure SSL certificate exists
+ensure_certificate
+
+# Back up current config and static symlink before any mutation, and before
+# port detection: ngx_detect_active_ports reads the backup directory first,
+# and a backup left by an earlier promotion describes the release before the
+# current one, which would make the live ports look like the candidate ports.
+ngx_backup_configs
+ngx_backup_static
+
+# Detect active ports and compute candidate ports
+ngx_detect_active_ports
+WEB_CANDIDATE_PORT="${WEB_CANDIDATE_PORT:-${ngx_candidate_web_port}}"
+API_CANDIDATE_PORT="${API_CANDIDATE_PORT:-${ngx_candidate_api_port}}"
+log INFO "Active ports: web=${ngx_active_web_port}, api=${ngx_active_api_port}"
+log INFO "Candidate ports: web=${WEB_CANDIDATE_PORT}, api=${API_CANDIDATE_PORT}"
+
+# Verify candidates are running. The container list is captured once and
+# matched from a variable: piping 'docker ps' into 'grep -q' can fail under
+# pipefail when grep exits before docker finishes writing.
+running_containers="$(docker ps --format '{{.Names}}')" || fail "Unable to list running containers"
+for candidate in web-candidate api-candidate; do
+  grep -Fxq -- "$candidate" <<<"$running_containers" \
+    || fail "Required candidate container is not running: $candidate"
+done
+
+# Health check candidates on the inactive ports that staging selected.
+for url in "http://127.0.0.1:${WEB_CANDIDATE_PORT}${WEB_HEALTH_PATH}" "http://127.0.0.1:${API_CANDIDATE_PORT}${API_HEALTH_PATH}"; do
+  status_code="$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$url" 2>/dev/null || printf '000')"
+  [ "$status_code" = "200" ] || fail "Candidate health check failed for ${url}: HTTP ${status_code}"
+done
+
+# Generate combined candidate config (cal-id.conf format)
+candidate_file="$(mktemp)"
+trap 'rm -f "$candidate_file"' EXIT
+log INFO "Generating combined candidate NGINX config"
+{
+  replace_placeholders "$WEB_TEMPLATE_PATH" "$WEB_CANDIDATE_PORT"
+  printf '\n'
+  replace_placeholders "$CONNECTOR_TEMPLATE_PATH" "$API_CANDIDATE_PORT"
+  printf '\n'
+  replace_placeholders "$MCP_TEMPLATE_PATH" "$API_CANDIDATE_PORT"
+  printf '\n'
+} > "$candidate_file"
+
+# Switch NGINX config and static assets together. Success path performs one
+# nginx -t and one nginx reload after both active pointers are prepared.
+candidate_static_target="$(readlink -f "$STATIC_CANDIDATE_LINK" 2>/dev/null || true)"
+if [ -z "$candidate_static_target" ] || [ ! -d "$candidate_static_target" ]; then
+  ngx_restore_config
+  ngx_restore_static
+  ngx_stop_candidates
+  fail "Candidate static build not found at ${STATIC_CANDIDATE_LINK}; refusing to promote Web without matching static assets"
+fi
+ngx_switch_config_and_static "$candidate_file" "$candidate_static_target"
+
+# Cleanup on success
+ngx_cleanup_old_builds
+ngx_stop_candidates
+log INFO "Promotion complete"
diff --git a/infra/scripts/record-state.sh b/infra/scripts/record-state.sh
new file mode 100755
index 00000000000000..d3879d927dd048
--- /dev/null
+++ b/infra/scripts/record-state.sh
@@ -0,0 +1,221 @@
+#!/usr/bin/env bash
+#
+# record-state.sh — Upload Cal-ID deployment state to S3.
+#
+# This script is the single source of truth for deployment state.
+# It writes three S3 objects for each state transition:
+#   - Manifest: s3://<bucket>/<prefix>/manifests/sha-<sha>.json (immutable, per-SHA)
+#   - History:  s3://<bucket>/<prefix>/deployments/history/sha-<sha>.json (immutable, per-SHA)
+#   - Current:  s3://<bucket>/<prefix>/deployments/current.json (mutable, only for terminal states)
+#
+# State model:
+#   staged                        — images built, not promoted yet
+#   promoted_pending_verification — Web/API live on NGINX, workers not yet verified
+#   current                       — all services verified, this is the live release
+#   failed                        — deployment aborted, no services updated
+#   rollback_started              — rollback requested or started, current release not changed yet
+#   rolled_back                   — previous release restored as current
+#
+# Only live-pointer states (current, rolled_back) update current.json.
+#
+# Env (required unless noted):
+#   DEPLOY_ENV               production|staging
+#   GIT_SHA                  Git SHA being deployed (falls back to GITHUB_SHA, then SHA)
+#   STATUS                   staged|promoted_pending_verification|current|failed|rollback_started|rolled_back
+#                            (default: staged)
+#   RELEASE_ID               Release identifier (default: vYYYYMMDD-HHMMSS)
+#   WEB_IMAGE                ECR image URL for web (optional)
+#   API_IMAGE                ECR image URL for connector/api (optional)
+#   WORKER_IMAGE             ECR image URL for worker (optional)
+#   MIGRATIONS_APPLIED_JSON  JSON array of migration names (default: [])
+#
+# Rollback-only fields (optional, required for rolled_back status):
+#   ROLLBACK_SOURCE_SHA      SHA that was being deployed before rollback
+#   ROLLBACK_TARGET_SHA      SHA being restored as current (previous release)
+#
+# Identity fields (optional):
+#   GITHUB_RUN_ID            GitHub Actions run ID
+#   GITHUB_ACTOR             GitHub Actions username
+#
+# Exits non-zero on any failure, with one exception: a current.json update that
+# S3 rejects because another writer changed the object is logged as a warning.
+# Re-running overwrites the manifest/history objects for the same SHA (the
+# timestamp, and RELEASE_ID when defaulted, are regenerated on every run).
+
+set -euo pipefail
+
+# log LEVEL MESSAGE — write a tagged line to stderr (stdout carries the result lines).
+log() {
+  printf '[record-state][%s] %s\n' "$1" "$2" >&2
+}
+
+# fail MESSAGE — log at ERROR level and abort the script.
+fail() {
+  log ERROR "$1"
+  exit 1
+}
+
+# require_cmd NAME — abort unless NAME is an available command.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1"
+}
+
+# json_or_null VALUE — print VALUE as a JSON string, or the literal null when empty.
+json_or_null() {
+  local val="$1"
+  if [ -n "$val" ]; then
+    jq -Rn --arg v "$val" '$v'
+  else
+    printf 'null'
+  fi
+}
+
+require_cmd aws
+require_cmd jq
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=deploy-state-paths.sh
+source "${script_dir}/deploy-state-paths.sh"
+require_deploy_state_config || fail "Unable to resolve deployment state path"
+
+# ---------- Required inputs ----------
+
+GIT_SHA="${GIT_SHA:-${GITHUB_SHA:-${SHA:-}}}"
+[ -n "$GIT_SHA" ] || fail "GIT_SHA / GITHUB_SHA / SHA must be set"
+
+STATUS="${STATUS:-staged}"
+case "$STATUS" in
+  staged|promoted_pending_verification|current|failed|rollback_started|rolled_back) ;;
+  *) fail "Unsupported STATUS '$STATUS'" ;;
+esac
+
+RELEASE_ID="${RELEASE_ID:-v$(date -u '+%Y%m%d-%H%M%S')}"
+
+# ---------- Optional inputs ----------
+
+MIGRATIONS_JSON="${MIGRATIONS_APPLIED_JSON:-[]}"
+# Require an actual array: a bare `jq -e .` would also accept objects, strings
+# and numbers, which would then be stored as migrations_applied.
+jq -e 'type == "array"' <<< "$MIGRATIONS_JSON" >/dev/null 2>&1 \
+  || fail "MIGRATIONS_APPLIED_JSON must be a JSON array"
+
+# Identity
+RUN_ID="${GITHUB_RUN_ID:-unknown}"
+ACTOR="${GITHUB_ACTOR:-unknown}"
+TIMESTAMP="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+
+# Rollback context
+ROLLBACK_SOURCE_SHA="${ROLLBACK_SOURCE_SHA:-}"
+ROLLBACK_TARGET_SHA="${ROLLBACK_TARGET_SHA:-}"
+
+# ---------- Build state payload ----------
+
+PAYLOAD="$(
+  jq -n \
+    --arg release_id "$RELEASE_ID" \
+    --arg sha "$GIT_SHA" \
+    --arg timestamp "$TIMESTAMP" \
+    --arg env "$DEPLOY_ENV" \
+    --arg status "$STATUS" \
+    --arg run_id "$RUN_ID" \
+    --arg actor "$ACTOR" \
+    --argjson web_image "$(json_or_null "${WEB_IMAGE:-}")" \
+    --argjson api_image "$(json_or_null "${API_IMAGE:-}")" \
+    --argjson worker_image "$(json_or_null "${WORKER_IMAGE:-}")" \
+    --argjson migrations "$MIGRATIONS_JSON" \
+    --argjson rollback_source "$(json_or_null "$ROLLBACK_SOURCE_SHA")" \
+    --argjson rollback_target "$(json_or_null "$ROLLBACK_TARGET_SHA")" \
+    '{
+      release_id: $release_id,
+      sha: $sha,
+      timestamp: $timestamp,
+      environment: $env,
+      status: $status,
+      run_id: $run_id,
+      actor: $actor,
+      services: {
+        web: $web_image,
+        api: $api_image,
+        worker: $worker_image
+      },
+      migrations_applied: $migrations,
+      rollback: (if ($rollback_source != null) then
+        { source_sha: $rollback_source, target_sha: $rollback_target }
+      else null end)
+    }'
+)"
+
+log INFO "State payload for ${GIT_SHA}: status=${STATUS}"
+
+# ---------- Upload manifest and history (always) ----------
+
+BASE_URI="$(get_deploy_state_path)"
+KEY_MANIFEST="$(get_deploy_state_key "manifests/sha-${GIT_SHA}.json")"
+KEY_HISTORY="$(get_deploy_state_key "deployments/history/sha-${GIT_SHA}.json")"
+KEY_CURRENT="$(get_deploy_state_key "deployments/current.json")"
+
+log INFO "Checking bucket access: $BASE_URI"
+aws s3 ls "${BASE_URI}/" >/dev/null 2>&1 || fail "Cannot access bucket: $BASE_URI"
+
+log INFO "Upload manifest: $KEY_MANIFEST"
+aws s3 cp - "$KEY_MANIFEST" <<< "$PAYLOAD" \
+  --content-type 'application/json' \
+  || fail "Upload manifest failed"
+
+log INFO "Upload history: $KEY_HISTORY"
+aws s3 cp - "$KEY_HISTORY" <<< "$PAYLOAD" \
+  --content-type 'application/json' \
+  || fail "Upload history failed"
+
+# ---------- Update current.json (live-pointer states only) ----------
+# current.json is rewritten only for:
+#   - current     : new release fully deployed and verified
+#   - rolled_back : previous release restored as current
+#
+# Every other status (staged, promoted_pending_verification, failed,
+# rollback_started) leaves current.json untouched, so a partially-deployed or
+# failed release can never claim "current" status.
+
+case "$STATUS" in
+  current|rolled_back)
+    log INFO "Updating current pointer: $KEY_CURRENT"
+    current_object_key="${DEPLOY_STATE_PREFIX}/deployments/current.json"
+
+    # Read the ETag of the existing pointer (empty when the object is absent or
+    # unreadable) so the write below can be made conditional on it.
+    existing_etag="$(aws s3api head-object --bucket "$DEPLOY_STATE_BUCKET" \
+      --key "$current_object_key" \
+      --query 'ETag' --output 'text' 2>/dev/null || true)"
+
+    # The AWS CLI error text is captured instead of discarded, and the command
+    # sits on the left of `||` so that `set -e` cannot abort the script before
+    # the failure is classified and reported.
+    update_error=''
+    update_failed=false
+    if [ -n "$existing_etag" ]; then
+      # Conditional write: only succeeds while current.json still carries the
+      # ETag read above. A regular temp file is used for --body, matching
+      # rollback.sh.
+      payload_file="$(mktemp)"
+      trap 'rm -f "$payload_file"' EXIT
+      printf '%s' "$PAYLOAD" > "$payload_file"
+      update_error="$(aws s3api put-object \
+        --bucket "$DEPLOY_STATE_BUCKET" \
+        --key "$current_object_key" \
+        --body "$payload_file" \
+        --content-type 'application/json' \
+        --if-match "$existing_etag" \
+        2>&1 >/dev/null)" || update_failed=true
+    else
+      # No existing current.json — use unconditional write
+      update_error="$(aws s3 cp - "$KEY_CURRENT" \
+        --content-type 'application/json' \
+        2>&1 >/dev/null <<< "$PAYLOAD")" || update_failed=true
+    fi
+
+    if [ "$update_failed" = "true" ]; then
+      case "$update_error" in
+        *PreconditionFailed*|*ConditionalRequestConflict*)
+          # A newer writer already moved the pointer; keeping theirs is the
+          # intended outcome of the conditional write.
+          log WARN "current.json update failed (conditional write rejected — another writer updated current). Continuing anyway."
+          ;;
+        *)
+          fail "current.json update failed: ${update_error:-unknown error}"
+          ;;
+      esac
+    else
+      log INFO "current.json updated successfully"
+    fi
+    ;;
+  *)
+    log INFO "Skip current pointer (status=${STATUS} is not a live-pointer state)"
+    ;;
+esac
+
+# Expose the written S3 locations to later GitHub Actions steps when available.
+if [ -n "${GITHUB_OUTPUT:-}" ]; then
+  {
+    printf 'state_manifest_path=%s\n' "$KEY_MANIFEST"
+    printf 'state_history_path=%s\n' "$KEY_HISTORY"
+    printf 'state_current_path=%s\n' "$KEY_CURRENT"
+  } >> "$GITHUB_OUTPUT"
+fi
+
+printf 'STATE_MANIFEST_PATH=%s\n' "$KEY_MANIFEST"
+printf 'STATE_HISTORY_PATH=%s\n' "$KEY_HISTORY"
+printf 'STATE_CURRENT_PATH=%s\n' "$KEY_CURRENT"
+
+log INFO "State recorded: ${GIT_SHA} → ${STATUS}"
diff --git a/infra/scripts/refresh-lock.sh b/infra/scripts/refresh-lock.sh
new file mode 100755
index 00000000000000..1c4fbed7a963ca
--- /dev/null
+++ b/infra/scripts/refresh-lock.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+#
+# refresh-lock.sh - Atomically refresh an S3-backed Cal-ID deployment lock.
+#
+# Uses S3 ETag-based conditional writes so that only the lock holder can
+# extend the TTL. A competing writer cannot steal the lock during refresh.
+#
+# Required:
+#   DEPLOY_ENV or ENVIRONMENT   production|staging (prod/main/stag/develop aliases)
+#   LOCK_TOKEN                  Token returned by acquire-lock.sh
+#
+# Optional:
+#   DEPLOY_STATE_BUCKET         S3 bucket name (default: cal-id, enforced contract)
+#   DEPLOY_STATE_PREFIX         S3 key prefix (derived from DEPLOY_ENV if unset)
+#   LOCK_TTL_SECONDS            New TTL in seconds from now (default: 1800)
+
+set -euo pipefail
+
+# log LEVEL MESSAGE — write a tagged line to stderr (stdout carries the result lines).
+log() {
+  printf '[refresh-lock][%s] %s\n' "$1" "$2" >&2
+}
+
+# fail MESSAGE — log at ERROR level and abort the script.
+fail() {
+  log ERROR "$1"
+  exit 1
+}
+
+# require_cmd NAME — abort unless NAME is an available command.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1"
+}
+
+# iso_from_epoch SECONDS — format a Unix timestamp as UTC ISO-8601 (uses `date -d`).
+iso_from_epoch() {
+  date -u -d "@$1" '+%Y-%m-%dT%H:%M:%SZ'
+}
+
+require_cmd aws
+require_cmd jq
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=deploy-state-paths.sh
+source "${script_dir}/deploy-state-paths.sh"
+require_deploy_state_config || fail "Unable to resolve deployment state path"
+LOCK_ENV="$DEPLOY_ENV"
+
+[ -n "${LOCK_TOKEN:-}" ] || fail "LOCK_TOKEN is required to refresh a deployment lock"
+
+LOCK_TTL_SECONDS="${LOCK_TTL_SECONDS:-1800}"
+case "$LOCK_TTL_SECONDS" in
+  ''|*[!0-9]*) fail "LOCK_TTL_SECONDS must be a positive integer" ;;
+esac
+# Force base 10 so a value with a leading zero (e.g. 0900) is not read as octal
+# by the arithmetic expansion below.
+LOCK_TTL_SECONDS="$((10#$LOCK_TTL_SECONDS))"
+[ "$LOCK_TTL_SECONDS" -gt 0 ] || fail "LOCK_TTL_SECONDS must be greater than zero"
+
+LOCK_KEY="$(get_deploy_state_key "locks/${LOCK_ENV}.lock")"
+lock_object_key="${DEPLOY_STATE_PREFIX}/locks/${LOCK_ENV}.lock"
+now_epoch="$(date -u '+%s')"
+expires_at_epoch="$((now_epoch + LOCK_TTL_SECONDS))"
+new_expires="$(iso_from_epoch "$expires_at_epoch")"
+
+tmp_existing="$(mktemp)"
+tmp_new="$(mktemp)"
+trap 'rm -f "$tmp_existing" "$tmp_new"' EXIT
+
+# Download the lock body and its ETag in ONE request. Fetching them separately
+# (s3 cp followed by head-object) leaves a window in which the object can be
+# replaced, so the token validated below could belong to a different version
+# than the ETag used for the conditional write.
+existing_etag="$(aws s3api get-object \
+  --bucket "$DEPLOY_STATE_BUCKET" \
+  --key "$lock_object_key" \
+  --query 'ETag' \
+  --output text \
+  "$tmp_existing")" || fail "No deployment lock found at $LOCK_KEY"
+case "$existing_etag" in
+  ''|None) fail "Unable to read S3 ETag for deployment lock" ;;
+esac
+
+existing_token="$(jq -r '.token // empty' "$tmp_existing")"
+existing_owner="$(jq -r '.owner // "unknown"' "$tmp_existing")"
+existing_expires="$(jq -r '.expires_at_epoch // empty' "$tmp_existing")"
+
+[ "$existing_token" = "$LOCK_TOKEN" ] || fail "LOCK_TOKEN mismatch — lock is held by '${existing_owner}', token does not match"
+
+# A lock that has already expired may legitimately be taken by another run, so
+# it must be re-acquired rather than silently extended.
+if [ -n "$existing_expires" ] && [ "$existing_expires" -le "$now_epoch" ] 2>/dev/null; then
+  fail "Cannot refresh expired lock held by '${existing_owner}' — lock has already expired"
+fi
+
+# Build refreshed payload preserving all original metadata
+jq \
+  --arg refreshed_at "$(iso_from_epoch "$now_epoch")" \
+  --arg expires_at "$new_expires" \
+  --argjson refreshed_at_epoch "$now_epoch" \
+  --argjson expires_at_epoch "$expires_at_epoch" \
+  '. + {
+    refreshed_at: $refreshed_at,
+    refreshed_at_epoch: $refreshed_at_epoch,
+    expires_at: $expires_at,
+    expires_at_epoch: $expires_at_epoch,
+    status: "held"
+  }' "$tmp_existing" > "$tmp_new"
+
+log INFO "Refreshing deployment lock: $LOCK_KEY (extends to $(iso_from_epoch "$expires_at_epoch"))"
+
+# ETag-based conditional write — only succeeds if the object has not changed
+# since we downloaded it. This prevents a stale writer from overwriting
+# a more recent lock holder's state.
+set +e
+aws s3api put-object \
+  --bucket "$DEPLOY_STATE_BUCKET" \
+  --key "$lock_object_key" \
+  --body "$tmp_new" \
+  --content-type 'application/json' \
+  --if-match "$existing_etag" \
+  > /dev/null 2>&1
+cp_result=$?
+set -e
+
+if [ "$cp_result" -ne 0 ]; then
+  fail "Lock refresh failed (S3 conditional write rejected — lock was modified by another writer)"
+fi
+
+log INFO "Deployment lock refreshed"
+
+printf 'LOCK_KEY=%s\n' "$LOCK_KEY"
+printf 'LOCK_TOKEN=%s\n' "$LOCK_TOKEN"
+printf 'LOCK_EXPIRES_AT=%s\n' "$(iso_from_epoch "$expires_at_epoch")"
diff --git a/infra/scripts/release-lock.sh b/infra/scripts/release-lock.sh
new file mode 100755
index 00000000000000..af06fa9da92f4a
--- /dev/null
+++ b/infra/scripts/release-lock.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+#
+# release-lock.sh - Release an S3-backed Cal-ID deployment lock.
+#
+# Only the lock owner (matching LOCK_TOKEN) can release the lock. Forcibly
+# deleting a lock held by another run is not permitted.
+#
+# The ownership check and the delete are separate S3 calls, so the release is
+# not atomic: the lock object can change between the read and the delete.
+#
+# Required:
+#   DEPLOY_ENV or ENVIRONMENT   production|staging (prod/main/stag/develop aliases)
+#
+# Optional:
+#   DEPLOY_STATE_BUCKET         S3 bucket name (default: cal-id, enforced contract)
+#   DEPLOY_STATE_PREFIX         S3 key prefix (derived from DEPLOY_ENV if unset)
+#   LOCK_TOKEN                  Token returned by acquire-lock.sh; required unless FORCE_RELEASE=true
+#   FORCE_RELEASE               true to bypass ownership check (manual cleanup only)
+
+set -euo pipefail
+
+log() {
+  printf '[release-lock][%s] %s\n' "$1" "$2" >&2
+}
+
+fail() {
+  log ERROR "$1"
+  exit 1
+}
+
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1"
+}
+
+iso_from_epoch() {
+  date -u -d "@$1" '+%Y-%m-%dT%H:%M:%SZ'
+}
+
+require_cmd aws
+require_cmd jq
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=deploy-state-paths.sh
+source "${script_dir}/deploy-state-paths.sh"
+require_deploy_state_config || fail "Unable to resolve deployment state path"
+LOCK_ENV="$DEPLOY_ENV"
+
+LOCK_KEY="$(get_deploy_state_key "locks/${LOCK_ENV}.lock")"
+now_epoch="$(date -u '+%s')"
+
+tmp_existing="$(mktemp)"
+trap 'rm -f "$tmp_existing"' EXIT
+
+# Attempt to download the current lock before doing anything destructive.
+# Only a missing object means there is nothing to release. Any other download
+# failure (credentials, network, throttling) is an error: reporting success
+# would leave the lock in place until its TTL expires.
+if ! download_error="$(aws s3 cp "$LOCK_KEY" "$tmp_existing" 2>&1 >/dev/null)"; then
+  case "$download_error" in
+    *'(404)'*|*NoSuchKey*)
+      log INFO "No lock found at $LOCK_KEY — nothing to release"
+      exit 0
+      ;;
+    *)
+      fail "Unable to read deployment lock at $LOCK_KEY: ${download_error:-unknown error}"
+      ;;
+  esac
+fi
+
+existing_token="$(jq -r '.token // empty' "$tmp_existing")"
+existing_owner="$(jq -r '.owner // "unknown"' "$tmp_existing")"
+existing_expires="$(jq -r '.expires_at_epoch // empty' "$tmp_existing")"
+existing_run="$(jq -r '.run_id // "?"' "$tmp_existing")"
+
+# Enforce ownership: LOCK_TOKEN must match the lock's token.
+# FORCE_RELEASE=true is reserved for manual operator cleanup and must be used
+# only after confirming the lock has expired or the holder has crashed.
+if [ "${FORCE_RELEASE:-false}" != "true" ]; then
+  if [ -z "${LOCK_TOKEN:-}" ]; then
+    fail "LOCK_TOKEN is required to release lock held by '${existing_owner}' (run=${existing_run}). Set FORCE_RELEASE=true for manual cleanup after verifying the lock is expired."
+  fi
+
+  if [ "$existing_token" != "$LOCK_TOKEN" ]; then
+    fail "LOCK_TOKEN mismatch — lock is held by '${existing_owner}' (run=${existing_run}), token does not match. Aborting release to avoid deleting another run's lock. Use FORCE_RELEASE=true for manual cleanup."
+  fi
+
+  # Informational only: note when the lock we own had already passed its expiry.
+  if [ -n "$existing_expires" ] && [ "$existing_expires" -le "$now_epoch" ] 2>/dev/null; then
+    log WARN "Lock held by '${existing_owner}' has already expired — proceeding with release"
+  fi
+else
+  log WARN "FORCE_RELEASE=true: bypassing ownership check for manual cleanup"
+  log WARN "Lock owner: '${existing_owner}' (run=${existing_run})"
+fi
+
+log INFO "Releasing deployment lock: $LOCK_KEY"
+aws s3 rm "$LOCK_KEY" >/dev/null || fail "Failed to remove lock from S3 — verify bucket permissions"
+
+log INFO "Deployment lock released"
\ No newline at end of file
diff --git a/infra/scripts/revert-nginx.sh b/infra/scripts/revert-nginx.sh
new file mode 100755
index 00000000000000..50d1d96521e606
--- /dev/null
+++ b/infra/scripts/revert-nginx.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+#
+# revert-nginx.sh - Restore previous Cal-ID NGINX configs and static symlink.
+#
+# Restores both NGINX active config AND static asset symlink atomically.
+# Uses ngx-utils.sh shared helpers — same logic as promote-all.sh.
+#
+# Canonical config: cal-id.conf is restored as the source of truth.
+# If cal-id.conf does not exist in backup (pre-unified-promotion host),
+# legacy standalone configs are restored instead.
+#
+# Can be called standalone (e.g., after worker failure, promote-all.sh failure,
+# rollback.sh error trap) or from rollback.sh. Both NGINX config and static
+# symlink are restored in every call.
+#
+# Optional:
+#   NGINX_CONF_DIR        default: /etc/nginx/conf.d
+#   NGINX_BACKUP_DIR      default: /tmp/cal-id-nginx-previous
+#   NGINX_COMBINED_CONF   default: cal-id.conf
+#   STOP_CANDIDATES       default: true
+
+set -euo pipefail
+
+# log LEVEL MESSAGE — write a tagged line to stderr.
+log() {
+  printf '[revert-nginx][%s] %s\n' "$1" "$2" >&2
+}
+
+# fail MESSAGE — log at ERROR level and abort the script.
+fail() {
+  log ERROR "$1"
+  exit 1
+}
+
+# require_cmd NAME — abort unless NAME is an available command.
+# Not called in this script itself; presumably used by the sourced helpers — confirm.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1"
+}
+
+# Load shared NGINX and static helpers
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=ngx-utils.sh
+source "${script_dir}/ngx-utils.sh"
+
+# ============================================================
+# Standalone revert: restore both NGINX config and static symlink
+# ============================================================
+
+STOP_CANDIDATES="${STOP_CANDIDATES:-true}"
+
+# NGINX_BACKUP_DIR is never assigned in this script; under `set -u` it must
+# already be set by the caller or by ngx-utils.sh (presumably where the
+# documented default is applied — confirm).
+[ -d "$NGINX_BACKUP_DIR" ] || fail "NGINX backup directory not found: $NGINX_BACKUP_DIR"
+
+# Restore NGINX config (cal-id.conf if present, else legacy configs)
+ngx_restore_config
+
+# Restore static symlink to previous build target
+ngx_restore_static
+
+# Stop candidate containers if configured
+if [ "$STOP_CANDIDATES" = "true" ]; then
+  ngx_stop_candidates
+fi
+
+log INFO "NGINX revert complete: config restored, static symlink restored"
\ No newline at end of file
diff --git a/infra/scripts/rollback.sh b/infra/scripts/rollback.sh
new file mode 100755
index 00000000000000..4a7ca52b87ec1a
--- /dev/null
+++ b/infra/scripts/rollback.sh
@@ -0,0 +1,315 @@
+#!/usr/bin/env bash
+#
+# rollback.sh - App-only rollback orchestration for Cal-ID.
+#
+# Records rollback state to S3 before and after host-side rollback operations.
+# Outputs structured results for workflow notification handling.
+#
+# Required:
+#   TARGET_SHA            Git SHA being restored (previous known-good release)
+#   REPO_URL              Git repository URL
+#   DATABASE_URL          Database connection string
+#   WEB_IMAGE             ECR image URL for web
+#   API_IMAGE             ECR image URL for connector/api
+#   WORKER_IMAGE          ECR image URL for worker
+#   DOMAIN_NAME           Deployment domain
+#   HOMEPAGE_URL          Application homepage URL
+#
+# Optional:
+#   ROLLBACK_SOURCE_SHA   SHA that was being deployed (failed release, for state record)
+#   BRANCH_NAME or BRANCH
+#   WORKER_REPLICAS
+#   GITHUB_RUN_ID         GitHub Actions run ID (for state recording)
+#   GITHUB_ACTOR          GitHub Actions username (for state recording)
+#   RELEASE_ID            Release identifier
+#   DEPLOY_ENV            production|staging (for state path resolution)
+#   ROLLBACK_ENV          Backward-compatible alias for DEPLOY_ENV
+#
+# Outputs (printed to stdout for workflow capture):
+#   ROLLBACK_STATUS=succeeded|failed
+#   ROLLBACK_TARGET_SHA=<sha>
+#   ROLLBACK_SOURCE_SHA=<sha>
+#
+# This script intentionally does not modify database state and does not run
+# down.sql or CONTRACT migrations.
+
+set -euo pipefail
+
+# log LEVEL MESSAGE — write a tagged line to stderr (stdout carries the ROLLBACK_* result lines).
+log() {
+  printf '[rollback][%s] %s\n' "$1" "$2" >&2
+}
+
+# fail_script MESSAGE — log the error, make a best-effort attempt to record a
+# "failed" state in S3, print the ROLLBACK_* result lines and exit 1.
+fail_script() {
+  log ERROR "$1"
+  # Record rollback failure state before exiting
+  _record_rollback_state "failed" "${TARGET_SHA:-}" "${ROLLBACK_SOURCE_SHA:-}" 2>/dev/null || true
+  printf 'ROLLBACK_STATUS=failed\n'
+  printf 'ROLLBACK_TARGET_SHA=%s\n' "${TARGET_SHA:-}"
+  printf 'ROLLBACK_SOURCE_SHA=%s\n' "${ROLLBACK_SOURCE_SHA:-}"
+  exit 1
+}
+
+# require_value NAME — abort unless the environment variable called NAME is non-empty.
+require_value() {
+  local name="$1"
+  local value="${!name:-}"
+  [ -n "$value" ] || fail_script "Required environment variable is missing: $name"
+}
+
+# require_cmd NAME — abort unless NAME is an available command.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || fail_script "Required command not found: $1"
+}
+
+# ---------- Validate required env ----------
+for var in TARGET_SHA REPO_URL DATABASE_URL WEB_IMAGE API_IMAGE WORKER_IMAGE DOMAIN_NAME HOMEPAGE_URL; do
+  require_value "$var"
+done
+
+BRANCH_NAME="${BRANCH_NAME:-${BRANCH:-main}}"
+ROLLBACK_SOURCE_SHA="${ROLLBACK_SOURCE_SHA:-}"
+GITHUB_RUN_ID="${GITHUB_RUN_ID:-unknown}"
+GITHUB_ACTOR="${GITHUB_ACTOR:-unknown}"
+RELEASE_ID="${RELEASE_ID:-v$(date -u '+%Y%m%d-%H%M%S')}"
+DEPLOY_ENV="${DEPLOY_ENV:-${ROLLBACK_ENV:-staging}}"
+ECR_REGISTRY="${ECR_REGISTRY:-}"
+
+# AWS credentials needed for S3 state recording — must be in environment
+require_cmd aws
+require_cmd jq
+require_cmd docker
+
+# ---------- State recording helpers ----------
+
+# _record_rollback_state STATUS TARGET_SHA SOURCE_SHA
+#
+# Writes the rollback state to S3:
+#   - manifest + history for TARGET_SHA with the given STATUS and the images
+#     passed to this script (WEB_IMAGE / API_IMAGE / WORKER_IMAGE);
+#   - current.json, only when STATUS is rolled_back (ETag-conditional when the
+#     object already exists);
+#   - manifest + history for SOURCE_SHA marked "failed", when SOURCE_SHA is
+#     set and differs from TARGET_SHA.
+#
+# Upload failures are fatal (via fail_script) only for STATUS=rolled_back;
+# for every other status they are logged as warnings.
+_record_rollback_state() {
+  local status="$1"
+  local target_sha="$2"
+  local source_sha="$3"
+  # Kept local so repeated calls cannot leak state into the rest of the script.
+  local existing_etag cp_result
+
+  log INFO "Recording rollback state: status=${status}, target=${target_sha}, source=${source_sha}"
+
+  local tmp_payload
+  tmp_payload="$(mktemp)"
+
+  local timestamp
+  timestamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+
+  # Build rollback payload for target SHA (the release being restored as current)
+  jq -n \
+    --arg release_id "$RELEASE_ID" \
+    --arg sha "$target_sha" \
+    --arg timestamp "$timestamp" \
+    --arg env "$DEPLOY_ENV" \
+    --arg status "$status" \
+    --arg run_id "$GITHUB_RUN_ID" \
+    --arg actor "$GITHUB_ACTOR" \
+    --argjson web_image "$(json_or_null "$WEB_IMAGE")" \
+    --argjson api_image "$(json_or_null "$API_IMAGE")" \
+    --argjson worker_image "$(json_or_null "$WORKER_IMAGE")" \
+    --argjson rollback_source "$(json_or_null "$source_sha")" \
+    --argjson rollback_target "$(json_or_null "$target_sha")" \
+    '{
+      release_id: $release_id,
+      sha: $sha,
+      timestamp: $timestamp,
+      environment: $env,
+      status: $status,
+      run_id: $run_id,
+      actor: $actor,
+      services: { web: $web_image, api: $api_image, worker: $worker_image },
+      migrations_applied: [],
+      rollback: (if ($rollback_source != null) then
+        { source_sha: $rollback_source, target_sha: $rollback_target }
+      else null end)
+    }' > "$tmp_payload"
+
+  local script_dir
+  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  # shellcheck source=deploy-state-paths.sh
+  source "${script_dir}/deploy-state-paths.sh"
+  require_deploy_state_config || { log WARN "Could not resolve deploy state config — skipping S3 state"; rm -f "$tmp_payload"; return 0; }
+
+  local key_manifest key_history key_current
+  key_manifest="$(get_deploy_state_key "manifests/sha-${target_sha}.json")"
+  key_history="$(get_deploy_state_key "deployments/history/sha-${target_sha}.json")"
+
+  aws s3 cp "$tmp_payload" "$key_manifest" --content-type 'application/json' >/dev/null 2>&1 \
+    || {
+      if [ "$status" = "rolled_back" ]; then
+        rm -f "$tmp_payload"
+        fail_script "Failed to upload rollback manifest for ${target_sha}"
+      fi
+      log WARN "Failed to upload rollback manifest for ${target_sha}"
+    }
+  aws s3 cp "$tmp_payload" "$key_history" --content-type 'application/json' >/dev/null 2>&1 \
+    || {
+      if [ "$status" = "rolled_back" ]; then
+        rm -f "$tmp_payload"
+        fail_script "Failed to upload rollback history for ${target_sha}"
+      fi
+      log WARN "Failed to upload rollback history for ${target_sha}"
+    }
+
+  # Only update current.json for succeeded rollback (target is now live)
+  if [ "$status" = "rolled_back" ]; then
+    key_current="$(get_deploy_state_key "deployments/current.json")"
+    # ETag-based conditional write — prevents overwriting if another writer updated current
+    set +e
+    existing_etag="$(aws s3api head-object \
+      --bucket "$DEPLOY_STATE_BUCKET" \
+      --key "${DEPLOY_STATE_PREFIX}/deployments/current.json" \
+      --query 'ETag' --output 'text' 2>/dev/null || true)"
+    set -e
+
+    if [ -n "$existing_etag" ]; then
+      set +e
+      aws s3api put-object \
+        --bucket "$DEPLOY_STATE_BUCKET" \
+        --key "${DEPLOY_STATE_PREFIX}/deployments/current.json" \
+        --body "$tmp_payload" \
+        --content-type 'application/json' \
+        --if-match "$existing_etag" \
+        > /dev/null 2>&1
+      cp_result=$?
+      set -e
+    else
+      set +e
+      aws s3 cp "$tmp_payload" "$key_current" --content-type 'application/json' > /dev/null 2>&1
+      cp_result=$?
+      set -e
+    fi
+
+    if [ "$cp_result" -ne 0 ]; then
+      rm -f "$tmp_payload"
+      fail_script "Failed to update current.json for rollback target ${target_sha}"
+    fi
+    log INFO "current.json updated for ${target_sha} with status=rolled_back"
+  fi
+
+  # Also record the source SHA as "failed" (the release that was being deployed)
+  if [ -n "$source_sha" ] && [ "$source_sha" != "$target_sha" ]; then
+    local key_src_manifest key_src_history
+    key_src_manifest="$(get_deploy_state_key "manifests/sha-${source_sha}.json")"
+    key_src_history="$(get_deploy_state_key "deployments/history/sha-${source_sha}.json")"
+
+    local tmp_source tmp_existing_source
+    tmp_source="$(mktemp)"
+    tmp_existing_source="$(mktemp)"
+
+    # WEB_IMAGE / API_IMAGE / WORKER_IMAGE describe the rollback TARGET, so
+    # they must never be written into the source release's record. When the
+    # source already has a manifest, keep its own services and migrations and
+    # only flip the status and attach the rollback context.
+    if aws s3 cp "$key_src_manifest" "$tmp_existing_source" >/dev/null 2>&1 \
+      && jq -e 'type == "object"' "$tmp_existing_source" >/dev/null 2>&1; then
+      jq \
+        --arg timestamp "$timestamp" \
+        --arg run_id "$GITHUB_RUN_ID" \
+        --arg actor "$GITHUB_ACTOR" \
+        --arg rollback_source "$source_sha" \
+        --arg rollback_target "$target_sha" \
+        '. + {
+          timestamp: $timestamp,
+          status: "failed",
+          run_id: $run_id,
+          actor: $actor,
+          rollback: { source_sha: $rollback_source, target_sha: $rollback_target }
+        }' "$tmp_existing_source" > "$tmp_source"
+    else
+      # No readable manifest for the source release: its images are unknown
+      # here, so record them as null instead of borrowing the target's.
+      jq -n \
+        --arg release_id "$RELEASE_ID" \
+        --arg sha "$source_sha" \
+        --arg timestamp "$timestamp" \
+        --arg env "$DEPLOY_ENV" \
+        --arg status "failed" \
+        --arg run_id "$GITHUB_RUN_ID" \
+        --arg actor "$GITHUB_ACTOR" \
+        --arg rollback_source "$source_sha" \
+        --arg rollback_target "$target_sha" \
+        '{
+          release_id: $release_id,
+          sha: $sha,
+          timestamp: $timestamp,
+          environment: $env,
+          status: $status,
+          run_id: $run_id,
+          actor: $actor,
+          services: { web: null, api: null, worker: null },
+          migrations_applied: [],
+          rollback: { source_sha: $rollback_source, target_sha: $rollback_target }
+        }' > "$tmp_source"
+    fi
+
+    aws s3 cp "$tmp_source" "$key_src_manifest" --content-type 'application/json' >/dev/null 2>&1 \
+      || log WARN "Failed to upload failed manifest for ${source_sha}"
+    aws s3 cp "$tmp_source" "$key_src_history" --content-type 'application/json' >/dev/null 2>&1 \
+      || log WARN "Failed to upload failed history for ${source_sha}"
+    rm -f "$tmp_source" "$tmp_existing_source"
+  fi
+
+  rm -f "$tmp_payload"
+}
+
+# json_or_null VALUE — print VALUE as a JSON string, or the literal null when empty.
+json_or_null() {
+  local val="$1"
+  if [ -n "$val" ]; then
+    jq -Rn --arg v "$val" '$v'
+  else
+    printf 'null'
+  fi
+}
+
+# ---------- Record rollback_started ----------
+# Record the target SHA as "rollback_started" and the source SHA as "failed"
+# before touching anything on the host, so the failure is on record even if
+# the rollback itself does not complete.
+log INFO "Recording rollback_started state (target=${TARGET_SHA}, source=${ROLLBACK_SOURCE_SHA:-none})"
+# Best effort: stderr of the recorder is discarded and a failure only warns.
+_record_rollback_state "rollback_started" "$TARGET_SHA" "$ROLLBACK_SOURCE_SHA" 2>/dev/null \
+  || log WARN "Could not record rollback state to S3 — continuing with host-side rollback"
+
+# ---------- Run rollback on host ----------
+# Exported so the helper scripts invoked below inherit them.
+export BRANCH_NAME
+export DEPLOY_ENV
+export IS_ROLLBACK=true
+export ECR_REGISTRY
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+AWS_REGION="${AWS_REGION:-us-east-1}"
+
+# verify_rollback_image IMAGE SERVICE — abort the rollback unless
+# `docker manifest inspect` succeeds for IMAGE; SERVICE is used in messages only.
+verify_rollback_image() {
+  local image="$1"
+  local service="$2"
+
+  log INFO "Verifying rollback ${service} image exists: ${image}"
+  docker manifest inspect "$image" >/dev/null 2>&1 \
+    || fail_script "Rollback ${service} image not found or inaccessible: ${image}"
+}
+
+# ERR-trap handler: run revert-nginx.sh (its own failure is ignored), then
+# exit through fail_script so the failed state and ROLLBACK_* lines are emitted.
+rollback_failure() {
+  log ERROR "Rollback orchestration failed; attempting NGINX revert as last resort"
+  "${script_dir}/revert-nginx.sh" || true
+  fail_script "Rollback failed after NGINX revert attempt"
+}
+trap rollback_failure ERR
+
+# Registry login is attempted only when ECR_REGISTRY is set.
+if [ -n "$ECR_REGISTRY" ]; then
+  log INFO "Authenticating Docker to ECR registry before rollback image checks"
+  aws ecr get-login-password --region "$AWS_REGION" \
+    | docker login --username AWS --password-stdin "$ECR_REGISTRY" \
+    || fail_script "ECR login failed — cannot verify rollback images"
+fi
+
+# All three images are checked before any staging or promotion step runs.
+verify_rollback_image "$WEB_IMAGE" "web"
+verify_rollback_image "$API_IMAGE" "api"
+verify_rollback_image "$WORKER_IMAGE" "worker"
+
+log INFO "Validating schema compatibility for rollback to ${TARGET_SHA}"
+"${script_dir}/validate-rollback-schema.sh"
+
+log INFO "Staging rollback API candidate"
+ECR_REGISTRY="${ECR_REGISTRY:-}" \
+  API_IMAGE="$API_IMAGE" "${script_dir}/stage-api.sh"
+
+log INFO "Staging rollback web candidate"
+ECR_REGISTRY="${ECR_REGISTRY:-}" \
+  WEB_IMAGE="$WEB_IMAGE" IS_ROLLBACK=true "${script_dir}/stage-web.sh"
+
+log INFO "Promoting rollback web/API candidates"
+"${script_dir}/promote-all.sh"
+
+log INFO "Starting rollback worker candidates after promotion"
+ECR_REGISTRY="${ECR_REGISTRY:-}" \
+  PROMOTION_COMPLETE=true WORKER_IMAGE="$WORKER_IMAGE" "${script_dir}/start-workers.sh"
+
+log INFO "Draining previous workers"
+"${script_dir}/drain-workers.sh"
+
+# Host-side work is done; from here a failure must not trigger an NGINX revert.
+trap - ERR
+
+# ---------- Record rollback succeeded ----------
+log INFO "Rollback completed successfully — recording rolled_back state"
+_record_rollback_state "rolled_back" "$TARGET_SHA" "$ROLLBACK_SOURCE_SHA"
+
+# ---------- Output for workflow notification ----------
+printf 'ROLLBACK_STATUS=succeeded\n'
+printf 'ROLLBACK_TARGET_SHA=%s\n' "$TARGET_SHA"
+printf 'ROLLBACK_SOURCE_SHA=%s\n' "${ROLLBACK_SOURCE_SHA:-}"
+
+log INFO "App-only rollback completed successfully for ${TARGET_SHA}"
diff --git a/infra/scripts/send-email.sh b/infra/scripts/send-email.sh
new file mode 100755
index 00000000000000..21db0d8d05fec5
--- /dev/null
+++ b/infra/scripts/send-email.sh
@@ -0,0 +1,231 @@
+#!/usr/bin/env bash
+#
+# send-email.sh - Send Cal-ID deployment notifications through SendGrid.
+# +# Supported NOTIFICATION_EVENT values: +# deployment_started +# deployment_succeeded +# deployment_failed +# deployment_failed_before_promotion +# deployment_failed_after_promotion_rollback_succeeded +# deployment_failed_after_promotion_rollback_failed +# rollback_started +# rollback_succeeded +# rollback_failed +# manual_rollback_succeeded +# manual_rollback_failed +# +# Mailbox defaults: +# EMAIL_FROM defaults to alerts@cal.id +# EMAIL_TO defaults to deployments@cal.id +# +# Optional context: +# DEPLOY_ENV, RELEASE_ID, GIT_SHA, GITHUB_SHA, BRANCH_NAME, GITHUB_ACTOR, +# GITHUB_RUN_ID, GITHUB_REPOSITORY, LOG_FILE, LOG_SNIPPET, FAILURE_REASON, +# WEB_IMAGE, API_IMAGE, WORKER_IMAGE, MIGRATION_STATUS, WEB_STATUS, +# API_STATUS, WORKER_STATUS, ROLLBACK_STATUS, STATE_FILE_PATH, +# NGINX_ACTIVE_UPSTREAMS, ROLLBACK_TARGET_SHA, LOG_SNIPPET_MAX_LINES +# +# Non-blocking by default: +# SEND_EMAIL_REQUIRED=false means send failures exit 0 after logging a warning. +# Set SEND_EMAIL_REQUIRED=true when callers want notification failures to fail. 
+
+set -euo pipefail
+
+# log LEVEL MESSAGE — write a tagged line to stderr.
+log() {
+  printf '[send-email][%s] %s\n' "$1" "$2" >&2
+}
+
+# fail MESSAGE — log at ERROR level and exit 1.
+fail() {
+  log ERROR "$1"
+  exit 1
+}
+
+# warn_or_fail MESSAGE — exit 1 when SEND_EMAIL_REQUIRED=true; otherwise log a
+# warning and exit 0 so a notification problem never fails the caller.
+warn_or_fail() {
+  if [ "${SEND_EMAIL_REQUIRED:-false}" = "true" ]; then
+    fail "$1"
+  fi
+  log WARN "$1"
+  exit 0
+}
+
+# require_cmd NAME — abort unless NAME is an available command.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1"
+}
+
+# sanitize_text — filter stdin to stdout, replacing secret-looking values with
+# [REDACTED]. Every expression carries the `g` flag: without it sed rewrites
+# only the first match on each line, so a second credential on the same log
+# line would be e-mailed in clear text.
+sanitize_text() {
+  sed -E \
+    -e 's/(Authorization:[[:space:]]*Bearer[[:space:]]+)[^[:space:]]+/\1[REDACTED]/g' \
+    -e 's/([?&]token=)[^&[:space:]]+/\1[REDACTED]/g' \
+    -e 's/([?&]api[_-]?key=)[^&[:space:]]+/\1[REDACTED]/g' \
+    -e 's/((password|passwd|pwd|secret|token|api[_-]?key|access[_-]?key)[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g' \
+    -e 's/(AWS_ACCESS_KEY_ID[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g' \
+    -e 's/(AWS_SECRET_ACCESS_KEY[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g' \
+    -e 's/(SENDGRID_API_KEY[=:][[:space:]]*)[^[:space:]]+/\1[REDACTED]/g'
+}
+
+# event_title EVENT — print the subject-line title for EVENT; returns 1 for an
+# unsupported event name.
+event_title() {
+  case "$1" in
+    deployment_started) printf 'Deployment Started' ;;
+    deployment_succeeded) printf 'Deployment Succeeded' ;;
+    deployment_failed) printf 'Deployment Failed' ;;
+    deployment_failed_before_promotion) printf 'Deployment Failed Before Promotion' ;;
+    deployment_failed_after_promotion_rollback_succeeded) printf 'Deployment Failed After Promotion; Rollback Succeeded' ;;
+    deployment_failed_after_promotion_rollback_failed) printf 'Deployment Failed After Promotion; Rollback Failed' ;;
+    rollback_started) printf 'Rollback Started' ;;
+    rollback_succeeded) printf 'Rollback Succeeded' ;;
+    rollback_failed) printf 'Rollback Failed' ;;
+    manual_rollback_succeeded) printf 'Manual Rollback Succeeded' ;;
+    manual_rollback_failed) printf 'Manual Rollback Failed' ;;
+    *) return 1 ;;
+  esac
+}
+
+# event_status_line EVENT — print the one-sentence summary for EVENT; returns 1
+# for an unsupported event name.
+event_status_line() {
+  case "$1" in
+    deployment_started) printf 'A deployment has started.' ;;
+    deployment_succeeded) printf 'Deployment completed successfully.' ;;
+    deployment_failed) printf 'Deployment failed and requires attention.' ;;
+    deployment_failed_before_promotion) printf 'Deployment failed before Web/API promotion. No rollback was required.' ;;
+    deployment_failed_after_promotion_rollback_succeeded) printf 'Deployment failed after Web/API promotion, and automatic rollback completed successfully.' ;;
+    deployment_failed_after_promotion_rollback_failed) printf 'Deployment failed after Web/API promotion, and automatic rollback failed or did not complete.' ;;
+    rollback_started) printf 'A rollback has started.' ;;
+    rollback_succeeded) printf 'Rollback completed successfully.' ;;
+    rollback_failed) printf 'Rollback failed and requires immediate attention.' ;;
+    manual_rollback_succeeded) printf 'Manual rollback completed successfully.' ;;
+    manual_rollback_failed) printf 'Manual rollback failed and requires immediate attention.' ;;
+    *) return 1 ;;
+  esac
+}
+
+require_cmd curl
+require_cmd jq
+
+# A missing API key is non-fatal unless SEND_EMAIL_REQUIRED=true (see warn_or_fail).
+[ -n "${SENDGRID_API_KEY:-}" ] || warn_or_fail "SENDGRID_API_KEY is not set; skipping notification"
+
+NOTIFICATION_EVENT="${NOTIFICATION_EVENT:-}"
+[ -n "$NOTIFICATION_EVENT" ] || fail "NOTIFICATION_EVENT must be set"
+
+TITLE="$(event_title "$NOTIFICATION_EVENT")" || fail "Unsupported NOTIFICATION_EVENT: $NOTIFICATION_EVENT"
+STATUS_LINE="$(event_status_line "$NOTIFICATION_EVENT")" || fail "Unsupported NOTIFICATION_EVENT: $NOTIFICATION_EVENT"
+
+# Defaults for every optional context field.
+EMAIL_FROM="${EMAIL_FROM:-alerts@cal.id}"
+EMAIL_TO="${EMAIL_TO:-deployments@cal.id}"
+DEPLOY_ENV="${DEPLOY_ENV:-${ENVIRONMENT:-unknown}}"
+GIT_SHA="${GIT_SHA:-${GITHUB_SHA:-unknown}}"
+BRANCH_NAME="${BRANCH_NAME:-${GITHUB_REF_NAME:-unknown}}"
+RELEASE_ID="${RELEASE_ID:-unknown}"
+ACTOR="${GITHUB_ACTOR:-unknown}"
+RUN_ID="${GITHUB_RUN_ID:-unknown}"
+REPOSITORY="${GITHUB_REPOSITORY:-unknown}"
+FAILURE_REASON="${FAILURE_REASON:-}"
+LOG_SNIPPET="${LOG_SNIPPET:-}"
+WEB_IMAGE="${WEB_IMAGE:-N/A}"
+API_IMAGE="${API_IMAGE:-N/A}"
+WORKER_IMAGE="${WORKER_IMAGE:-N/A}"
+MIGRATION_STATUS="${MIGRATION_STATUS:-N/A}"
+WEB_STATUS="${WEB_STATUS:-N/A}"
+API_STATUS="${API_STATUS:-N/A}" +WORKER_STATUS="${WORKER_STATUS:-N/A}" +ROLLBACK_STATUS="${ROLLBACK_STATUS:-N/A}" +STATE_FILE_PATH="${STATE_FILE_PATH:-N/A}" +NGINX_ACTIVE_UPSTREAMS="${NGINX_ACTIVE_UPSTREAMS:-N/A}" +ROLLBACK_TARGET_SHA="${ROLLBACK_TARGET_SHA:-N/A}" +LOG_SNIPPET_MAX_LINES="${LOG_SNIPPET_MAX_LINES:-160}" + +if [ -z "$LOG_SNIPPET" ] && [ -n "${LOG_FILE:-}" ] && [ -f "$LOG_FILE" ]; then + LOG_SNIPPET="$(tail -"$LOG_SNIPPET_MAX_LINES" "$LOG_FILE" 2>/dev/null || true)" +fi + +LOG_SNIPPET="$(printf '%s\n' "$LOG_SNIPPET" | sanitize_text)" +NGINX_ACTIVE_UPSTREAMS="$(printf '%s\n' "$NGINX_ACTIVE_UPSTREAMS" | sanitize_text)" +FAILURE_REASON="$(printf '%s\n' "$FAILURE_REASON" | sanitize_text)" + +RUN_URL="" +if [ "$REPOSITORY" != "unknown" ] && [ "$RUN_ID" != "unknown" ]; then + RUN_URL="https://github.com/${REPOSITORY}/actions/runs/${RUN_ID}" +fi + +BODY="$(cat < 0)) + | map({email: .}) + ' +)" + +[ "$(printf '%s' "$RECIPIENTS_JSON" | jq 'length')" -gt 0 ] || fail "EMAIL_TO must include at least one recipient" + +SUBJECT="Cal ID ${DEPLOY_ENV}: ${TITLE}" + +PAYLOAD="$( + jq -n \ + --arg from "$EMAIL_FROM" \ + --arg subject "$SUBJECT" \ + --arg body "$BODY" \ + --argjson to "$RECIPIENTS_JSON" \ + '{ + personalizations: [{ to: $to }], + from: { email: $from }, + subject: $subject, + content: [{ type: "text/plain", value: $body }] + }' +)" + +response_file="$(mktemp)" +trap 'rm -f "$response_file"' EXIT + +http_status="$( + curl -sS -o "$response_file" -w '%{http_code}' \ + --request POST \ + --url 'https://api.sendgrid.com/v3/mail/send' \ + --header "Authorization: Bearer ${SENDGRID_API_KEY}" \ + --header 'Content-Type: application/json' \ + --data "$PAYLOAD" || true +)" + +case "$http_status" in + 200|202) + log INFO "Notification sent: ${NOTIFICATION_EVENT}" + ;; + *) + response_body="$(cat "$response_file" 2>/dev/null || true)" + warn_or_fail "SendGrid notification failed with HTTP ${http_status}: ${response_body}" + ;; +esac diff --git 
a/infra/scripts/stage-api.sh b/infra/scripts/stage-api.sh new file mode 100755 index 00000000000000..5625745eeea081 --- /dev/null +++ b/infra/scripts/stage-api.sh @@ -0,0 +1,179 @@ +#!/usr/bin/env bash +# +# stage-api.sh - Start a connector API candidate container without switching NGINX. +# +# Required: +# API_IMAGE or IMAGE_TAG Fully qualified image reference to stage +# +# Optional: +# API_CANDIDATE_PORT Host port for candidate API (default: 4101) +# API_CONTAINER_PORT Container port for API app (default: 3000) +# API_CANDIDATE_NAME Candidate container name (default: api-candidate) +# ENV_FILE Docker env-file path (default: /home/onehash/.env) +# HEALTHCHECK_PATH Health endpoint path (default: /health) +# HEALTH_CHECK_ATTEMPTS Attempts before failure (default: 30) +# HEALTH_CHECK_INTERVAL Seconds between attempts (default: 10) +# +# This script intentionally does not edit NGINX and does not promote traffic. + +set -euo pipefail + +log() { + printf '[stage-api][%s] %s\n' "$1" "$2" >&2 +} + +fail() { + log ERROR "$1" + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1" +} + +require_cmd docker +require_cmd curl +require_cmd aws + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=preflight-space.sh +source "${script_dir}/preflight-space.sh" +ensure_deployment_space + +first_local_port() { + sed -nE 's/.*127\.0\.0\.1:([0-9]+).*/\1/p' "$1" 2>/dev/null | head -1 | tr -d '[:space:]' || true +} + +[ -n "${ECR_REGISTRY:-}" ] || fail "ECR_REGISTRY is required" +[ -n "${AWS_REGION:-}" ] || AWS_REGION="us-east-1" + +log INFO "Authenticating Docker to ECR registry: ${ECR_REGISTRY}" +aws ecr get-login-password --region "$AWS_REGION" \ + | docker login --username AWS --password-stdin "$ECR_REGISTRY" \ + || fail "ECR login failed — cannot pull private images" + +API_IMAGE="${API_IMAGE:-${IMAGE_TAG:-}}" +[ -n "$API_IMAGE" ] || fail "API_IMAGE or IMAGE_TAG must be set" + +# Determine current 
active API port by reading the NGINX config that actually routes API traffic. +# Source of truth: cal-id.conf (unified NGINX config after promotion). +# Detection order: +# 1. cal-id.conf (unified config — source of truth post-promotion): extract the web backend port +# from the first proxy_pass entry; infer API port from blue/green pairing (web=3001→api=4100, web=3002→api=4101) +# 2. cal-id.conf backup (pre-promotion snapshot): same inference from backup +# 3. connector.conf (legacy standalone — pre-unified-promotion EC2 instances): direct port extraction +# 4. backup connector.conf fallback +# +# After promote-all.sh runs, cal-id.conf is the single config file. Web routes appear first, +# then api.* and mcp.* as separate server blocks. +NGINX_CONF_DIR="${NGINX_CONF_DIR:-/etc/nginx/conf.d}" +NGINX_BACKUP_DIR="${NGINX_BACKUP_DIR:-/tmp/cal-id-nginx-previous}" + +ACTIVE_PORT="" +# 1. cal-id.conf (primary source of truth): infer API port from web port +if [ -f "${NGINX_CONF_DIR}/cal-id.conf" ]; then + web_port="$(first_local_port "${NGINX_CONF_DIR}/cal-id.conf")" + if [ -n "$web_port" ]; then + case "$web_port" in + 3001) ACTIVE_PORT="4100" ;; + 3002) ACTIVE_PORT="4101" ;; + esac + fi +fi + +# 2. cal-id.conf backup (pre-promotion snapshot): same inference +if [ -z "$ACTIVE_PORT" ] && [ -f "${NGINX_BACKUP_DIR}/cal-id.conf" ]; then + web_port="$(first_local_port "${NGINX_BACKUP_DIR}/cal-id.conf")" + if [ -n "$web_port" ]; then + case "$web_port" in + 3001) ACTIVE_PORT="4100" ;; + 3002) ACTIVE_PORT="4101" ;; + esac + fi +fi + +# 3. connector.conf (legacy standalone — pre-unified-promotion EC2 instances): direct extraction +if [ -z "$ACTIVE_PORT" ] && [ -f "${NGINX_CONF_DIR}/connector.conf" ]; then + ACTIVE_PORT="$(first_local_port "${NGINX_CONF_DIR}/connector.conf")" +fi + +# 4. 
backup connector.conf fallback +if [ -z "$ACTIVE_PORT" ] && [ -f "${NGINX_BACKUP_DIR}/connector.conf" ]; then + ACTIVE_PORT="$(first_local_port "${NGINX_BACKUP_DIR}/connector.conf")" +fi + +ACTIVE_PORT="${ACTIVE_PORT:-4100}" + +if [ "$ACTIVE_PORT" = "4100" ]; then + API_CANDIDATE_PORT="${API_CANDIDATE_PORT:-4101}" +else + API_CANDIDATE_PORT="${API_CANDIDATE_PORT:-4100}" +fi +API_CONTAINER_PORT="${API_CONTAINER_PORT:-3000}" +API_CANDIDATE_NAME="${API_CANDIDATE_NAME:-api-candidate}" +ENV_FILE="${ENV_FILE:-/home/onehash/.env}" +HEALTHCHECK_PATH="${HEALTHCHECK_PATH:-/health}" +HEALTH_CHECK_ATTEMPTS="${HEALTH_CHECK_ATTEMPTS:-30}" +HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-10}" + +[ -f "$ENV_FILE" ] || fail "Env file not found: $ENV_FILE" + +case "$API_CANDIDATE_PORT:$API_CONTAINER_PORT:$HEALTH_CHECK_ATTEMPTS:$HEALTH_CHECK_INTERVAL" in + *[!0-9:]* | *::*) + fail "Port and health check values must be numeric" + ;; +esac + +cleanup_candidate() { + local mode="${1:-preserve-running}" + local state + state="$(docker inspect -f '{{.State.Status}}' "$API_CANDIDATE_NAME" 2>/dev/null || true)" + case "$state" in + '') + return 0 + ;; + running) + if [ "$mode" = "force" ]; then + log INFO "Removing failed candidate container ${API_CANDIDATE_NAME} (${state})" + docker rm -f "$API_CANDIDATE_NAME" >/dev/null 2>&1 || true + else + fail "Candidate container ${API_CANDIDATE_NAME} is already running; refusing to remove an active in-progress candidate" + fi + ;; + *) + log INFO "Removing stale candidate container ${API_CANDIDATE_NAME} (${state})" + docker rm -f "$API_CANDIDATE_NAME" >/dev/null 2>&1 || true + ;; + esac +} + +log INFO "Checking prior candidate container if present: $API_CANDIDATE_NAME" +cleanup_candidate + +log INFO "Starting API candidate ${API_CANDIDATE_NAME} on host port ${API_CANDIDATE_PORT}" +docker run -d \ + --add-host=host.docker.internal:host-gateway \ + --name "$API_CANDIDATE_NAME" \ + --env-file "$ENV_FILE" \ + -p 
"${API_CANDIDATE_PORT}:${API_CONTAINER_PORT}" \ + "$API_IMAGE" >/dev/null || fail "Failed to start API candidate" + +health_url="http://127.0.0.1:${API_CANDIDATE_PORT}${HEALTHCHECK_PATH}" +log INFO "Checking API candidate health at ${health_url}" + +for attempt in $(seq 1 "$HEALTH_CHECK_ATTEMPTS"); do + status_code="$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$health_url" 2>/dev/null || printf '000')" + log INFO "Health check ${attempt}/${HEALTH_CHECK_ATTEMPTS}: HTTP ${status_code}" + + if [ "$status_code" = "200" ]; then + log INFO "API candidate is healthy" + exit 0 + fi + + sleep "$HEALTH_CHECK_INTERVAL" +done + +log ERROR "API candidate failed health checks; collecting logs and cleaning up" +docker logs --tail 100 "$API_CANDIDATE_NAME" >&2 || true +cleanup_candidate force +exit 1 diff --git a/infra/scripts/stage-web.sh b/infra/scripts/stage-web.sh new file mode 100755 index 00000000000000..d3ac574fa4d209 --- /dev/null +++ b/infra/scripts/stage-web.sh @@ -0,0 +1,195 @@ +#!/usr/bin/env bash +# +# stage-web.sh - Start a web candidate container without switching NGINX. +# +# Required: +# WEB_IMAGE or IMAGE_TAG Fully qualified image reference to stage +# +# Optional: +# WEB_CANDIDATE_PORT Host port for candidate web app (default: 3002) +# WEB_CONTAINER_PORT Container port for web app (default: 3001) +# WEB_CANDIDATE_NAME Candidate container name (default: web-candidate) +# ENV_FILE Docker env-file path (default: /home/onehash/.env) +# HEALTHCHECK_PATH Health endpoint path (default: /api/health) +# HEALTH_CHECK_ATTEMPTS Attempts before failure (default: 30) +# HEALTH_CHECK_INTERVAL Seconds between attempts (default: 30) +# STATIC_ROOT Static asset root (default: /var/www/cal-id-static) +# STATIC_BUILD_ID Static build id (default: git/image-derived) +# STATIC_CANDIDATE_LINK Candidate static symlink (default: STATIC_ROOT/candidate) +# +# This script intentionally does not edit NGINX and does not promote traffic. 
+ +set -euo pipefail + +log() { + printf '[stage-web][%s] %s\n' "$1" "$2" >&2 +} + +fail() { + log ERROR "$1" + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1" +} + +require_cmd docker +require_cmd curl +require_cmd aws + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=preflight-space.sh +source "${script_dir}/preflight-space.sh" +ensure_deployment_space + +first_local_port() { + sed -nE 's/.*127\.0\.0\.1:([0-9]+).*/\1/p' "$1" 2>/dev/null | head -1 | tr -d '[:space:]' || true +} + +[ -n "${ECR_REGISTRY:-}" ] || fail "ECR_REGISTRY is required" +[ -n "${AWS_REGION:-}" ] || AWS_REGION="us-east-1" + +log INFO "Authenticating Docker to ECR registry: ${ECR_REGISTRY}" +aws ecr get-login-password --region "$AWS_REGION" \ + | docker login --username AWS --password-stdin "$ECR_REGISTRY" \ + || fail "ECR login failed — cannot pull private images" + +WEB_IMAGE="${WEB_IMAGE:-${IMAGE_TAG:-}}" +[ -n "$WEB_IMAGE" ] || fail "WEB_IMAGE or IMAGE_TAG must be set" + +# Determine current active port from NGINX config and select the opposite for staging. +# +# Source of truth: cal-id.conf (unified NGINX config after promotion). +# Detection order: +# 1. cal-id.conf (unified config — source of truth post-promotion): extract first backend port +# 2. cal-id.conf backup (pre-promotion snapshot) +# 3. default.conf (legacy standalone web config — pre-unified promotion EC2) +# 4. backup default.conf fallback +# +# After promote-all.sh runs, cal-id.conf is the single config file. It contains all routes +# (web, api, mcp). The first proxy_pass entry in cal-id.conf is always the web backend. +NGINX_CONF_DIR="${NGINX_CONF_DIR:-/etc/nginx/conf.d}" +NGINX_BACKUP_DIR="${NGINX_BACKUP_DIR:-/tmp/cal-id-nginx-previous}" + +ACTIVE_PORT="" +# 1. cal-id.conf (primary source of truth) +if [ -f "${NGINX_CONF_DIR}/cal-id.conf" ]; then + ACTIVE_PORT="$(first_local_port "${NGINX_CONF_DIR}/cal-id.conf")" +fi +# 2. 
cal-id.conf backup (pre-promotion snapshot)
+if [ -z "$ACTIVE_PORT" ] && [ -f "${NGINX_BACKUP_DIR}/cal-id.conf" ]; then
+  ACTIVE_PORT="$(first_local_port "${NGINX_BACKUP_DIR}/cal-id.conf")"
+fi
+# 3. default.conf (legacy standalone — pre-unified-promotion EC2 instances)
+if [ -z "$ACTIVE_PORT" ] && [ -f "${NGINX_CONF_DIR}/default.conf" ]; then
+  ACTIVE_PORT="$(first_local_port "${NGINX_CONF_DIR}/default.conf")"
+fi
+# 4. backup default.conf fallback
+if [ -z "$ACTIVE_PORT" ] && [ -f "${NGINX_BACKUP_DIR}/default.conf" ]; then
+  ACTIVE_PORT="$(first_local_port "${NGINX_BACKUP_DIR}/default.conf")"
+fi
+
+ACTIVE_PORT="${ACTIVE_PORT:-3001}"
+
+if [ "$ACTIVE_PORT" = "3001" ]; then
+  WEB_CANDIDATE_PORT="${WEB_CANDIDATE_PORT:-3002}"
+else
+  WEB_CANDIDATE_PORT="${WEB_CANDIDATE_PORT:-3001}"
+fi
+WEB_CONTAINER_PORT="${WEB_CONTAINER_PORT:-3001}"
+WEB_CANDIDATE_NAME="${WEB_CANDIDATE_NAME:-web-candidate}"
+ENV_FILE="${ENV_FILE:-/home/onehash/.env}"
+HEALTHCHECK_PATH="${HEALTHCHECK_PATH:-/api/health}"
+HEALTH_CHECK_ATTEMPTS="${HEALTH_CHECK_ATTEMPTS:-30}"
+HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-30}"
+STATIC_ROOT="${STATIC_ROOT:-/var/www/cal-id-static}"
+STATIC_CANDIDATE_LINK="${STATIC_CANDIDATE_LINK:-${STATIC_ROOT}/candidate}"
+
+# derive_static_build_id - print an identifier usable as a directory suffix.
+#
+# Source value: the first non-empty of STATIC_BUILD_ID, GIT_SHA, GIT_HASH,
+# otherwise the text after the last ":" in WEB_IMAGE. Every character that
+# is not alphanumeric, ".", "_", "=" or "-" is replaced with "-".
+#
+# The "-" must be the LAST character of the tr set. Written as "._-=" it
+# forms the range "_" .. "=", whose endpoints are in reverse order; GNU tr
+# rejects that with an error, and under "set -euo pipefail" the failed
+# pipeline aborts the script at the STATIC_BUILD_ID assignment below.
+derive_static_build_id() {
+  local raw="${STATIC_BUILD_ID:-${GIT_SHA:-${GIT_HASH:-${WEB_IMAGE##*:}}}}"
+  printf '%s' "$raw" | tr -c '[:alnum:]._=-' '-'
+}
+
+STATIC_BUILD_ID="$(derive_static_build_id)"
+STATIC_BUILD_DIR="${STATIC_ROOT}/build-${STATIC_BUILD_ID}"
+
+[ -f "$ENV_FILE" ] || fail "Env file not found: $ENV_FILE"
+
+# Reject any non-digit in the four values, and any empty value (which shows
+# up as "::" once they are joined with ":").
+case "$WEB_CANDIDATE_PORT:$WEB_CONTAINER_PORT:$HEALTH_CHECK_ATTEMPTS:$HEALTH_CHECK_INTERVAL" in
+  *[!0-9:]* | *::*)
+    fail "Port and health check values must be numeric"
+    ;;
+esac
+
+cleanup_candidate() {
+  local mode="${1:-preserve-running}"
+  local state
+  state="$(docker inspect -f '{{.State.Status}}' "$WEB_CANDIDATE_NAME" 2>/dev/null || true)"
+  case "$state" in
+    '')
+      return 0
+      ;;
+    running)
+      if [ "$mode" = "force"
]; then + log INFO "Removing failed candidate container ${WEB_CANDIDATE_NAME} (${state})" + docker rm -f "$WEB_CANDIDATE_NAME" >/dev/null 2>&1 || true + else + fail "Candidate container ${WEB_CANDIDATE_NAME} is already running; refusing to remove an active in-progress candidate" + fi + ;; + *) + log INFO "Removing stale candidate container ${WEB_CANDIDATE_NAME} (${state})" + docker rm -f "$WEB_CANDIDATE_NAME" >/dev/null 2>&1 || true + ;; + esac +} + +log INFO "Checking prior candidate container if present: $WEB_CANDIDATE_NAME" +cleanup_candidate + +log INFO "Starting web candidate ${WEB_CANDIDATE_NAME} on host port ${WEB_CANDIDATE_PORT}" +docker run -d \ + --add-host=host.docker.internal:host-gateway \ + -e IS_ROLLBACK="${IS_ROLLBACK:-false}" \ + --name "$WEB_CANDIDATE_NAME" \ + --env-file "$ENV_FILE" \ + -p "${WEB_CANDIDATE_PORT}:${WEB_CONTAINER_PORT}" \ + "$WEB_IMAGE" >/dev/null || fail "Failed to start web candidate" + +health_url="http://127.0.0.1:${WEB_CANDIDATE_PORT}${HEALTHCHECK_PATH}" +log INFO "Checking web candidate health at ${health_url}" + +for attempt in $(seq 1 "$HEALTH_CHECK_ATTEMPTS"); do + status_code="$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$health_url" 2>/dev/null || printf '000')" + log INFO "Health check ${attempt}/${HEALTH_CHECK_ATTEMPTS}: HTTP ${status_code}" + + if [ "$status_code" = "200" ]; then + log INFO "Web candidate is healthy" + log INFO "Extracting static assets to ${STATIC_BUILD_DIR}" + sudo mkdir -p "$STATIC_ROOT" + sudo chown -R "$(id -u):$(id -g)" "$STATIC_ROOT" + + tmp_static_dir="${STATIC_BUILD_DIR}.tmp.$$" + rm -rf "$tmp_static_dir" + mkdir -p "${tmp_static_dir}/_next" + + docker cp "${WEB_CANDIDATE_NAME}:/app/apps/web/.next/static" "${tmp_static_dir}/_next/static" \ + || fail "Failed to extract Next.js static assets from web candidate" + docker cp "${WEB_CANDIDATE_NAME}:/app/apps/web/public" "${tmp_static_dir}/public" \ + || fail "Failed to extract public assets from web candidate" + + rm -rf 
"$STATIC_BUILD_DIR" + mv "$tmp_static_dir" "$STATIC_BUILD_DIR" + ln -sfn "$STATIC_BUILD_DIR" "$STATIC_CANDIDATE_LINK" + log INFO "Static assets staged at ${STATIC_BUILD_DIR}" + exit 0 + fi + + sleep "$HEALTH_CHECK_INTERVAL" +done + +log ERROR "Web candidate failed health checks; collecting logs and cleaning up" +docker logs --tail 100 "$WEB_CANDIDATE_NAME" >&2 || true +cleanup_candidate force +exit 1 diff --git a/infra/scripts/start-workers.sh b/infra/scripts/start-workers.sh new file mode 100755 index 00000000000000..e11ebbb8a79930 --- /dev/null +++ b/infra/scripts/start-workers.sh @@ -0,0 +1,306 @@ +#!/usr/bin/env bash +# +# start-workers.sh - Start new worker candidates after Web/API promotion. +# +# This script starts workers AFTER NGINX promotion. Workers are part of the +# release unit: Web, API, and Worker all succeed or the release is rolled back. +# +# Required: +# PROMOTION_COMPLETE=true +# WORKER_IMAGE or IMAGE_TAG Fully qualified worker image reference +# +# Optional: +# WORKER_REPLICAS Number of new workers to start (default: 1) +# ENV_FILE Docker env-file path (default: /home/onehash/.env) +# WORKER_MEMORY Docker memory limit (default: 1024m) +# WORKER_NODE_OPTIONS NODE_OPTIONS value (default: --max-old-space-size=768) +# HEALTH_CHECK_ATTEMPTS Attempts per worker (default: 20) +# HEALTH_CHECK_INTERVAL Seconds between attempts (default: 15) +# SCHEMA_CHECK_COMMAND Host command to validate schema before workers start +# WORKER_STARTUP_TIMEOUT Max seconds to wait for all replicas (default: 300) +# CHECK_REDIS true to verify Redis/queue connectivity (default: true) +# WORKER_LIFECYCLE_DRY_RUN true to validate inputs without Docker/AWS mutation +# +# Health checks performed (in order): +# 1. Docker container status is "running" +# 2. Worker log shows "Workers are up" +# 3. No fatal/panic startup errors in logs +# 4. Redis connectivity check via BullMQ queue probe (if CHECK_REDIS=true) +# 5. 
Processor count check — confirms expected processors are registered +# +# All checks must pass for the worker to be considered healthy. +# Failures are reported and old workers are NOT touched if any new worker fails. + +set -euo pipefail + +log() { + printf '[start-workers][%s] %s\n' "$1" "$2" >&2 +} + +fail() { + log ERROR "$1" + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1" +} + +require_value() { + local name="$1" + local value="${!name:-}" + [ -n "$value" ] || fail "Required environment variable is missing: $name" +} + +AWS_REGION="${AWS_REGION:-us-east-1}" +WORKER_LIFECYCLE_DRY_RUN="${WORKER_LIFECYCLE_DRY_RUN:-false}" + +[ "${PROMOTION_COMPLETE:-false}" = "true" ] || fail "FATAL: Cannot start workers before Web/API promotion" + +WORKER_IMAGE="${WORKER_IMAGE:-${IMAGE_TAG:-}}" +require_value WORKER_IMAGE + +WORKER_REPLICAS="${WORKER_REPLICAS:-1}" +ENV_FILE="${ENV_FILE:-/home/onehash/.env}" +WORKER_MEMORY="${WORKER_MEMORY:-1024m}" +WORKER_NODE_OPTIONS="${WORKER_NODE_OPTIONS:---max-old-space-size=768}" +HEALTH_CHECK_ATTEMPTS="${HEALTH_CHECK_ATTEMPTS:-20}" +HEALTH_CHECK_INTERVAL="${HEALTH_CHECK_INTERVAL:-15}" +WORKER_STARTUP_TIMEOUT="${WORKER_STARTUP_TIMEOUT:-300}" +CHECK_REDIS="${CHECK_REDIS:-true}" + +# Validate numeric args +for var in WORKER_REPLICAS HEALTH_CHECK_ATTEMPTS HEALTH_CHECK_INTERVAL WORKER_STARTUP_TIMEOUT; do + case "${!var}" in + ''|*[!0-9]*) fail "${var} must be a positive integer" ;; + esac +done + +[ "$WORKER_REPLICAS" -gt 0 ] || fail "WORKER_REPLICAS must be greater than zero" + +if [ "$WORKER_LIFECYCLE_DRY_RUN" = "true" ]; then + log INFO "Dry run: start-workers input validation passed for ${WORKER_REPLICAS} replica(s)" + exit 0 +fi + +require_cmd docker +require_cmd aws +require_value ECR_REGISTRY + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=preflight-space.sh +source "${script_dir}/preflight-space.sh" +ensure_deployment_space + +log INFO 
"Authenticating Docker to ECR registry: ${ECR_REGISTRY}" +aws ecr get-login-password --region "$AWS_REGION" \ + | docker login --username AWS --password-stdin "$ECR_REGISTRY" \ + || fail "ECR login failed — cannot pull private images" + +[ -f "$ENV_FILE" ] || fail "Env file not found: $ENV_FILE" + +# Parse REDIS_URL from ENV_FILE for health checking +REDIS_URL="" +if [ -f "$ENV_FILE" ]; then + REDIS_URL="$(grep -E '^REDIS_URL=' "$ENV_FILE" 2>/dev/null | cut -d= -f2- | tr -d '"' | tr -d "'" || true)" +fi + +# ---------- Schema validation ---------- + +check_schema() { + if [ -n "${SCHEMA_CHECK_COMMAND:-}" ]; then + log INFO "Running custom schema check" + eval "$SCHEMA_CHECK_COMMAND" || fail "Custom schema check failed" + return 0 + fi + + log INFO "Running default schema check in worker image" + docker run --rm \ + --env-file "$ENV_FILE" \ + --add-host=host.docker.internal:host-gateway \ + "$WORKER_IMAGE" \ + npx prisma migrate status >/dev/null 2>&1 \ + || { log WARN "Schema check returned non-zero — continuing anyway (migrate status can fail if DB is ahead of image)"; return 0; } +} + +log INFO "Validating schema compatibility before starting workers" +check_schema + +# ---------- Cleanup leftovers ---------- + +cleanup_new_workers() { + local mode="${1:-preserve-running}" + log INFO "Checking leftover worker-new-* containers" + docker ps -a --filter "name=worker-new-" --format "{{.Names}}" \ + | while IFS= read -r container; do + [ -n "$container" ] || continue + state="$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || true)" + if [ "$state" = "running" ] && [ "$mode" != "force" ]; then + fail "Worker candidate ${container} is already running; refusing to remove an active in-progress candidate" + fi + log INFO "Removing worker candidate ${container} (${state:-unknown})" + docker rm -f "$container" >/dev/null 2>&1 || true + done +} + +cleanup_new_workers + +# ---------- Redis/queue connectivity check ---------- + +check_redis_connectivity() { + 
if [ "$CHECK_REDIS" != "true" ] || [ -z "$REDIS_URL" ]; then
+    log INFO "Redis check skipped (CHECK_REDIS=false or REDIS_URL not set)"
+    return 0
+  fi
+
+  # Reduce redis[s]://[user:pass@]host[:port][/db][?query] to "host[:port]":
+  # drop the scheme, then everything up to the last "@" (the credentials),
+  # then any path or query suffix. Only this reduced form is logged or
+  # probed; "${REDIS_URL%%@*}" would keep the part BEFORE "@", i.e. exactly
+  # the user:password pair that must not reach the deployment log.
+  local redis_endpoint redis_host redis_port
+  redis_endpoint="${REDIS_URL#*://}"
+  redis_endpoint="${redis_endpoint##*@}"
+  redis_endpoint="${redis_endpoint%%[/?]*}"
+
+  log INFO "Verifying Redis connectivity: ${redis_endpoint}" # credentials stripped
+  if command -v redis-cli >/dev/null 2>&1; then
+    if redis-cli -u "$REDIS_URL" ping >/dev/null 2>&1; then
+      log INFO "Redis connection OK"
+      return 0
+    else
+      log WARN "Redis ping failed — continuing with startup anyway (Redis may be a side effect)"
+      return 0
+    fi
+  elif command -v nc >/dev/null 2>&1; then
+    # Fallback: TCP connectivity check via nc
+    # Split "host[:port]"; the port defaults to 6379 when the URL has none.
+    redis_host="${redis_endpoint%%:*}"
+    case "$redis_endpoint" in
+      *:*) redis_port="${redis_endpoint##*:}" ;;
+      *) redis_port="" ;;
+    esac
+    redis_port="${redis_port:-6379}"
+    if nc -z -w 5 "$redis_host" "$redis_port" 2>/dev/null; then
+      log INFO "Redis TCP connectivity OK (host=${redis_host}, port=${redis_port})"
+      return 0
+    else
+      log WARN "Redis TCP check failed — continuing with startup anyway"
+      return 0
+    fi
+  else
+    log WARN "Neither redis-cli nor nc available — skipping Redis connectivity check"
+    return 0
+  fi
+}
+
+# Run Redis check before starting workers (do not block startup on Redis being down)
+check_redis_connectivity || true
+
+# ---------- Per-worker health check ----------
+
+check_worker_health() {
+  local container="$1"
+  local attempt=1
+
+  while [ "$attempt" -le "$HEALTH_CHECK_ATTEMPTS" ]; do
+    local now status
+    now="$(date -u '+%s')"
+    if [ "$now" -ge "$startup_deadline" ]; then
+      log WARN "${container}: startup deadline exceeded during health check"
+      return 1
+    fi
+
+    status="$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || printf 'missing')"
+    log INFO "Health check ${attempt}/${HEALTH_CHECK_ATTEMPTS} for ${container}: status=${status}"
+
+    if [ "$status" != "running" ]; then
+      sleep "$HEALTH_CHECK_INTERVAL"
+      attempt=$((attempt + 1))
+      continue
+    fi
+
+    local logs
+    logs="$(docker logs "$container" 2>&1 | tail -200)"
+ + # Check 1: "Workers are up" — BullMQ signal that workers initialized + if printf '%s' "$logs" | grep -q "Workers are up"; then + + # Check 2: No fatal startup errors + if printf '%s' "$logs" | grep -Ei "fatal|panic|ECONNREFUSED|refused to connect|SyntaxError|Error:" | grep -vE "Warning|Deprecation" | head -1 | grep -qE "fatal|panic|ECONNREFUSED|refused to connect"; then + log WARN "${container}: workers are up but logs contain errors — checking severity" + # Re-check more carefully — some error lines are expected during startup + if printf '%s' "$logs" | tail -50 | grep -qE "fatal|panic"; then + log ERROR "${container}: fatal/panic detected in recent logs — failing worker" + return 1 + fi + fi + + # Check 3: Processor registration — count non-test processor registrations + # Workers that fail to register processors are partially initialized + processor_count="$(printf '%s' "$logs" | grep -E "Starting (worker|processor)" | wc -l | tr -d '[:space:]')" + if [ -n "$processor_count" ] && [ "$processor_count" -gt 0 ]; then + log INFO "${container}: ${processor_count} processor(s) registered" + fi + + # Check 4: Redis/BullMQ queue connectivity via worker logs + if printf '%s' "$logs" | grep -qE "Error.*Redis|Error.*queue|DELAYED.*error|failed.*connect"; then + log WARN "${container}: logs show Redis/queue errors" + fi + + log INFO "${container}: all health checks passed" + return 0 + fi + + sleep "$HEALTH_CHECK_INTERVAL" + attempt=$((attempt + 1)) + done + + log ERROR "${container}: did not reach healthy state within ${HEALTH_CHECK_ATTEMPTS} × ${HEALTH_CHECK_INTERVAL}s" + return 1 +} + +# ---------- Start workers ---------- + +log INFO "Starting ${WORKER_REPLICAS} worker candidate(s) from ${WORKER_IMAGE}" + +startup_start="$(date -u '+%s')" +startup_deadline="$((startup_start + WORKER_STARTUP_TIMEOUT))" + +started=0 +failed_workers=() + +for i in $(seq 1 "$WORKER_REPLICAS"); do + name="worker-new-${i}" + + # Check startup deadline before each start + now="$(date -u 
'+%s')" + if [ "$now" -ge "$startup_deadline" ]; then + log ERROR "Startup deadline exceeded — refusing to start ${name}" + break + fi + + log INFO "Starting ${name}" + docker run -d \ + --name "$name" \ + --restart unless-stopped \ + --env-file "$ENV_FILE" \ + --env NODE_OPTIONS="$WORKER_NODE_OPTIONS" \ + --add-host=host.docker.internal:host-gateway \ + --memory="$WORKER_MEMORY" \ + --memory-swap="$WORKER_MEMORY" \ + "$WORKER_IMAGE" >/dev/null 2>&1 \ + || { + log ERROR "Failed to start ${name}" + failed_workers+=("$name") + continue + } + + log INFO "Waiting for ${name} to become healthy (${HEALTH_CHECK_ATTEMPTS} attempts × ${HEALTH_CHECK_INTERVAL}s)" + if check_worker_health "$name"; then + log INFO "${name} is healthy" + started=$((started + 1)) + else + log ERROR "${name} failed health check" + docker logs --tail 100 "$name" >&2 || true + failed_workers+=("$name") + fi +done + +# ---------- Outcome ---------- + +if [ "$started" -lt "$WORKER_REPLICAS" ]; then + log ERROR "${started}/${WORKER_REPLICAS} workers healthy — cleaning up failed candidates and aborting" + log ERROR "Failed workers: ${failed_workers[*]:-none}" + cleanup_new_workers force + fail "Worker startup failed: ${started}/${WORKER_REPLICAS} healthy. Old workers were NOT touched. Rollback of Web/API required." +fi + +log INFO "All ${started}/${WORKER_REPLICAS} workers healthy. Ready for drain." +log INFO "Worker startup complete — proceed with drain-workers.sh" diff --git a/infra/scripts/validate-rollback-schema.sh b/infra/scripts/validate-rollback-schema.sh new file mode 100755 index 00000000000000..4cba15219b5010 --- /dev/null +++ b/infra/scripts/validate-rollback-schema.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +# +# validate-rollback-schema.sh - Validate app-only rollback schema compatibility. 
+# +# Required: +# REPO_URL +# TARGET_SHA +# DATABASE_URL +# +# Optional: +# BRANCH_NAME or BRANCH Branch used to inspect current migration files (default: main) +# WORK_DIR Parent directory for temp clones (default: /tmp/cal-id-rollback-schema) +# KEEP_WORK_DIR true to keep checkout after completion (default: false) +# +# This script does not modify the database. It blocks app rollback when the DB +# has applied migrations newer than TARGET_SHA that contain CONTRACT patterns. + +set -euo pipefail + +log() { + printf '[validate-rollback-schema][%s] %s\n' "$1" "$2" >&2 +} + +fail() { + log ERROR "$1" + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || fail "Required command not found: $1" +} + +require_value() { + local name="$1" + local value="${!name:-}" + [ -n "$value" ] || fail "Required environment variable is missing: $name" +} + +has_contract_pattern() { + local file="$1" + + grep -Eiq '^[[:space:]]*DROP[[:space:]]+TABLE\b' "$file" && return 0 + grep -Eiq '^[[:space:]]*(ALTER[[:space:]]+TABLE\b.*)?DROP[[:space:]]+COLUMN\b' "$file" && return 0 + + if perl -0ne ' + while (/ALTER\s+TABLE\b.*?;/gsi) { + my $stmt = $&; + if ($stmt =~ /RENAME\s+COLUMN\b/i || $stmt =~ /ALTER\s+COLUMN\b.*\bTYPE\b/i) { + exit 7; + } + if ($stmt =~ /SET\s+NOT\s+NULL/i && $stmt !~ /DEFAULT/i) { + exit 7; + } + } + ' "$file"; then + return 1 + else + [ "$?" 
-eq 7 ] && return 0 + return 0 + fi +} + +require_cmd git +require_cmd psql +require_cmd comm + +require_value REPO_URL +require_value TARGET_SHA +require_value DATABASE_URL + +BRANCH_NAME="${BRANCH_NAME:-${BRANCH:-main}}" +WORK_DIR="${WORK_DIR:-/tmp/cal-id-rollback-schema}" +KEEP_WORK_DIR="${KEEP_WORK_DIR:-false}" +target_dir="${WORK_DIR}/target" +current_dir="${WORK_DIR}/current" +target_list="${WORK_DIR}/target-migrations.txt" +applied_list="${WORK_DIR}/applied-migrations.txt" +newer_applied_list="${WORK_DIR}/newer-applied-migrations.txt" + +cleanup() { + if [ "$KEEP_WORK_DIR" != "true" ]; then + rm -rf "$WORK_DIR" + fi +} +trap cleanup EXIT + +rm -rf "$WORK_DIR" +mkdir -p "$WORK_DIR" + +log INFO "Cloning target branch ${BRANCH_NAME}" +git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$current_dir" \ + || fail "Failed to clone current branch" + +git clone --no-checkout --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$target_dir" \ + || fail "Failed to clone target checkout" + +( + cd "$target_dir" + git fetch origin "$TARGET_SHA" --depth 1 + git checkout "$TARGET_SHA" +) || fail "Failed to checkout target SHA" + +find "$target_dir/packages/prisma/migrations" -mindepth 1 -maxdepth 1 -type d -exec basename {} \; \ + | sort > "$target_list" + +psql "$DATABASE_URL" -Atc "SELECT migration_name FROM _prisma_migrations WHERE finished_at IS NOT NULL ORDER BY migration_name;" \ + | sort > "$applied_list" \ + || fail "Failed to read applied Prisma migrations" + +comm -13 "$target_list" "$applied_list" > "$newer_applied_list" + +if [ ! -s "$newer_applied_list" ]; then + log INFO "Schema compatible: DB has no applied migrations beyond target SHA" + exit 0 +fi + +blocked=0 +while IFS= read -r migration_name; do + [ -n "$migration_name" ] || continue + migration_sql="${current_dir}/packages/prisma/migrations/${migration_name}/migration.sql" + + if [ ! 
-f "$migration_sql" ]; then + fail "Applied migration not found in current branch checkout: $migration_name" + fi + + if has_contract_pattern "$migration_sql"; then + log ERROR "BLOCKED: applied migration newer than target contains CONTRACT pattern: $migration_name" + blocked=1 + else + log INFO "Compatible newer migration: $migration_name" + fi +done < "$newer_applied_list" + +if [ "$blocked" -ne 0 ]; then + fail "Schema incompatible with app-only rollback target ${TARGET_SHA}" +fi + +log INFO "Schema compatible with app-only rollback target ${TARGET_SHA}" diff --git a/packages/trpc/server/routers/viewer/webhook/testTrigger.handler.ts b/packages/trpc/server/routers/viewer/webhook/testTrigger.handler.ts index bc8a465c22d3c2..2f20dfdd923416 100644 --- a/packages/trpc/server/routers/viewer/webhook/testTrigger.handler.ts +++ b/packages/trpc/server/routers/viewer/webhook/testTrigger.handler.ts @@ -28,14 +28,14 @@ export const testTriggerHandler = async ({ ctx: _ctx, input }: TestTriggerOption { email: "jdoe@example.com", name: "John Doe", - timeZone: "Europe/London", + timeZone: "Asia/Kolkata", language, }, ], organizer: { name: "Cal", - email: "no-reply@cal.com", - timeZone: "Europe/London", + email: "no-reply@cal.id", + timeZone: "Asia/Kolkata", language, }, }; From 51e4c4d269825565df333192b54d2b3a8b06e067 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 09:40:13 +0530 Subject: [PATCH 02/12] Trigger auto pr From dc80ee3a5741627256d7de9456a30456c9a3fed6 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 10:44:17 +0530 Subject: [PATCH 03/12] chore(ci):removed lock temporarily --- .github/workflows/deploy-all.yml | 299 +++++++++++++++---------------- 1 file changed, 149 insertions(+), 150 deletions(-) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index 025a2f955d2cb6..e7ee8767c4a1d1 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -388,47 +388,47 @@ jobs: BRANCH_NAME: 
${{ env.BRANCH }} run: bash infra/scripts/send-email.sh - acquire-lock: - name: Acquire deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - runs-on: ubuntu-latest - needs: - - preflight - - prepare-release - outputs: - lock_token: ${{ steps.lock.outputs.LOCK_TOKEN }} - lock_key: ${{ steps.lock.outputs.LOCK_KEY }} - steps: - - name: Checkout lock script - uses: actions/checkout@v4 - with: - ref: ${{ needs.prepare-release.outputs.branch }} - sparse-checkout: | - infra/scripts/deploy-state-paths.sh - infra/scripts/acquire-lock.sh - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ secrets.AWS_REGION }} - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - - name: Acquire lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - id: lock - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_OWNER: github-${{ github.run_id }} - LOCK_TTL_SECONDS: 7200 - GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_ACTOR: ${{ github.actor }} - run: | - infra/scripts/acquire-lock.sh | tee lock.env - cat lock.env >> "$GITHUB_OUTPUT" + # acquire-lock: + # name: Acquire deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ # if: ${{ false }} + # runs-on: ubuntu-latest + # needs: + # - preflight + # - prepare-release + # outputs: + # lock_token: ${{ steps.lock.outputs.LOCK_TOKEN }} + # lock_key: ${{ steps.lock.outputs.LOCK_KEY }} + # steps: + # - name: Checkout lock script + # uses: actions/checkout@v4 + # with: + # ref: ${{ needs.prepare-release.outputs.branch }} + # sparse-checkout: | + # infra/scripts/deploy-state-paths.sh + # infra/scripts/acquire-lock.sh + + # - name: Configure AWS credentials + # uses: aws-actions/configure-aws-credentials@v4 + # with: + # aws-region: ${{ secrets.AWS_REGION }} + # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + # - name: Acquire lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + # if: ${{ false }} + # id: lock + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_OWNER: github-${{ github.run_id }} + # LOCK_TTL_SECONDS: 7200 + # GIT_SHA: ${{ needs.prepare-release.outputs.git_hash }} + # GITHUB_RUN_ID: ${{ github.run_id }} + # GITHUB_ACTOR: ${{ github.actor }} + # run: | + # infra/scripts/acquire-lock.sh | tee lock.env + # cat lock.env >> "$GITHUB_OUTPUT" # ---------- Build Jobs ---------- @@ -437,7 +437,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock outputs: web_image: ${{ steps.meta.outputs.web_image }} steps: @@ -563,7 +563,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock outputs: api_image: ${{ steps.meta.outputs.api_image }} steps: @@ -652,7 +652,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock outputs: worker_image: ${{ steps.meta.outputs.worker_image }} steps: @@ -743,7 +743,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-web - build-api - build-worker @@ -754,7 +754,6 @@ jobs: ref: ${{ env.BRANCH }} 
sparse-checkout: | infra/scripts/deploy-state-paths.sh - infra/scripts/refresh-lock.sh infra/scripts/record-state.sh - name: Configure AWS credentials @@ -764,14 +763,14 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Refresh deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - LOCK_TTL_SECONDS: 7200 - run: bash infra/scripts/refresh-lock.sh + # - name: Refresh deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + # if: ${{ false }} + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # LOCK_TTL_SECONDS: 7200 + # run: bash infra/scripts/refresh-lock.sh - name: Record images built env: @@ -794,7 +793,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-web - build-api - build-worker @@ -815,14 +814,14 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Refresh deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - LOCK_TTL_SECONDS: 7200 - run: bash infra/scripts/refresh-lock.sh + # - name: Refresh deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ # if: ${{ false }} + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # LOCK_TTL_SECONDS: 7200 + # run: bash infra/scripts/refresh-lock.sh - name: Run database migrations from target VPC id: run-migrations @@ -884,7 +883,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-api - migrate-db steps: @@ -940,7 +939,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-web - migrate-db steps: @@ -998,7 +997,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-web - build-api - deploy-api @@ -1010,7 +1009,7 @@ jobs: ref: ${{ env.BRANCH }} sparse-checkout: | infra/scripts/deploy-state-paths.sh - infra/scripts/refresh-lock.sh + # infra/scripts/refresh-lock.sh infra/scripts/promote-all.sh infra/scripts/revert-nginx.sh infra/scripts/record-state.sh @@ -1022,14 +1021,14 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Refresh deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - LOCK_TTL_SECONDS: 7200 - run: bash infra/scripts/refresh-lock.sh + # - name: Refresh deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ # if: ${{ false }} + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # LOCK_TTL_SECONDS: 7200 + # run: bash infra/scripts/refresh-lock.sh - name: Promote candidates uses: appleboy/ssh-action@v0.1.10 @@ -1108,7 +1107,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-web - build-api - build-worker @@ -1121,7 +1120,7 @@ jobs: ref: ${{ env.BRANCH }} sparse-checkout: | infra/scripts/deploy-state-paths.sh - infra/scripts/refresh-lock.sh + # infra/scripts/refresh-lock.sh infra/scripts/record-state.sh - name: Configure AWS credentials @@ -1131,14 +1130,14 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Refresh deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - LOCK_TTL_SECONDS: 7200 - run: bash infra/scripts/refresh-lock.sh + # - name: Refresh deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ # if: ${{ false }} + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # LOCK_TTL_SECONDS: 7200 + # run: bash infra/scripts/refresh-lock.sh - name: Record promoted state env: @@ -1172,7 +1171,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-worker - promote-all - record-promoted-state @@ -1183,7 +1182,7 @@ jobs: ref: ${{ env.BRANCH }} sparse-checkout: | infra/scripts/deploy-state-paths.sh - infra/scripts/refresh-lock.sh + # infra/scripts/refresh-lock.sh infra/scripts/start-workers.sh infra/scripts/drain-workers.sh @@ -1194,14 +1193,14 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Refresh deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - LOCK_TTL_SECONDS: 7200 - run: bash infra/scripts/refresh-lock.sh + # - name: Refresh deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ # if: ${{ false }} + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # LOCK_TTL_SECONDS: 7200 + # run: bash infra/scripts/refresh-lock.sh - name: Deploy workers uses: appleboy/ssh-action@v0.1.10 @@ -1243,7 +1242,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - promote-all - record-promoted-state - deploy-worker @@ -1268,7 +1267,7 @@ jobs: ref: ${{ env.BRANCH }} sparse-checkout: | infra/scripts/deploy-state-paths.sh - infra/scripts/refresh-lock.sh + # infra/scripts/refresh-lock.sh - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 @@ -1277,14 +1276,14 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Refresh deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - LOCK_TTL_SECONDS: 7200 - run: bash infra/scripts/refresh-lock.sh + # - name: Refresh deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. 
+ # if: ${{ false }} + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # LOCK_TTL_SECONDS: 7200 + # run: bash infra/scripts/refresh-lock.sh - name: Resolve previous current release id: current @@ -1461,7 +1460,7 @@ jobs: runs-on: ubuntu-latest needs: - prepare-release - - acquire-lock + # - acquire-lock - build-web - build-api - build-worker @@ -1476,7 +1475,7 @@ jobs: ref: ${{ env.BRANCH }} sparse-checkout: | infra/scripts/deploy-state-paths.sh - infra/scripts/refresh-lock.sh + # infra/scripts/refresh-lock.sh infra/scripts/record-state.sh - name: Configure AWS credentials @@ -1486,14 +1485,14 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Refresh deployment lock - # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. - if: ${{ false }} - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - LOCK_TTL_SECONDS: 7200 - run: bash infra/scripts/refresh-lock.sh + # - name: Refresh deployment lock + # # S3 locking is temporarily disabled; GitHub Actions concurrency is the primary control. + # if: ${{ false }} + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # LOCK_TTL_SECONDS: 7200 + # run: bash infra/scripts/refresh-lock.sh - name: Record deployment success env: @@ -1509,37 +1508,37 @@ jobs: GITHUB_ACTOR: ${{ github.actor }} run: bash infra/scripts/record-state.sh - release-lock: - name: Release deployment lock (disabled) - runs-on: ubuntu-latest - needs: - - prepare-release - - acquire-lock - # S3 deployment locking is temporarily disabled. - # GitHub Actions environment-scoped concurrency is the primary concurrency control. 
- # if: ${{ always() && needs.acquire-lock.outputs.lock_token != '' }} - if: ${{ false }} - steps: - - name: Checkout release script - uses: actions/checkout@v4 - with: - ref: ${{ env.BRANCH }} - sparse-checkout: | - infra/scripts/deploy-state-paths.sh - infra/scripts/release-lock.sh - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ secrets.AWS_REGION }} - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - - name: Release lock - env: - DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} - run: bash infra/scripts/release-lock.sh + # release-lock: + # name: Release deployment lock (disabled) + # runs-on: ubuntu-latest + # needs: + # - prepare-release + # - acquire-lock + # # S3 deployment locking is temporarily disabled. + # # GitHub Actions environment-scoped concurrency is the primary concurrency control. 
+ # # if: ${{ always() && needs.acquire-lock.outputs.lock_token != '' }} + # if: ${{ false }} + # steps: + # - name: Checkout release script + # uses: actions/checkout@v4 + # with: + # ref: ${{ env.BRANCH }} + # sparse-checkout: | + # infra/scripts/deploy-state-paths.sh + # infra/scripts/release-lock.sh + + # - name: Configure AWS credentials + # uses: aws-actions/configure-aws-credentials@v4 + # with: + # aws-region: ${{ secrets.AWS_REGION }} + # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + # - name: Release lock + # env: + # DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} + # LOCK_TOKEN: ${{ needs.acquire-lock.outputs.lock_token }} + # run: bash infra/scripts/release-lock.sh deployment-report: name: Deployment report and final notification @@ -1547,7 +1546,7 @@ jobs: needs: - preflight - prepare-release - - acquire-lock + # - acquire-lock - build-web - build-api - build-worker @@ -1560,7 +1559,7 @@ jobs: - deploy-worker - rollback-after-promotion - verify - - release-lock + # - release-lock if: always() steps: - name: Checkout report helpers @@ -1713,7 +1712,7 @@ jobs: ROLLBACK_STATUS_RAW: ${{ needs.rollback-after-promotion.outputs.rollback_status || '' }} ROLLBACK_TARGET_SHA: ${{ needs.rollback-after-promotion.outputs.rollback_target_sha || '' }} VERIFY_RESULT: ${{ needs.verify.result }} - RELEASE_LOCK_RESULT: ${{ needs.release-lock.result }} + # RELEASE_LOCK_RESULT: ${{ needs.release-lock.result }} NGINX_ACTIVE_UPSTREAMS: ${{ steps.nginx.outputs.nginx_active_upstreams }} LOG_SNIPPET: ${{ steps.host-logs.outputs.log_snippet }} run: | @@ -1805,7 +1804,7 @@ jobs: echo "| Deploy Workers | ${DEPLOY_WORKER_RESULT} |" echo "| Automatic Rollback | ${rollback_status} |" echo "| Verify / Record Current | ${VERIFY_RESULT} |" - echo "| Release Lock | ${RELEASE_LOCK_RESULT} |" + # echo "| Release Lock | ${RELEASE_LOCK_RESULT} |" echo "" echo "## Rollback" echo "" From 
2cd5027e9f036825a683ac62c9e9e87ec2d87eb0 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 12:03:04 +0530 Subject: [PATCH 04/12] fix(deploy): preserve sparse checkout for prisma migrations --- .github/workflows/deploy-all.yml | 53 +++++++++++++++++++++++++- infra/scripts/migrate.sh | 64 ++++++++++++++++++++++++++++++-- 2 files changed, 112 insertions(+), 5 deletions(-) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index e7ee8767c4a1d1..d852cc8d77a805 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -844,10 +844,49 @@ jobs: envs: GIT_HASH,RELEASE_ID,DATABASE_URL,REPO_URL,BRANCH_NAME,REPO_ROOT,ENABLE_DB_BACKUP,DB_BACKUP_COMMAND,MIGRATION_TIMEOUT_SECONDS script: | set -euo pipefail + diagnose_missing_path() { + path="$1" + echo "Required migration path is missing: ${REPO_ROOT}/${path}" >&2 + echo "Checkout diagnostics:" >&2 + echo " pwd=$(pwd)" >&2 + echo " repo_root=${REPO_ROOT}" >&2 + echo " branch=${BRANCH_NAME}" >&2 + echo " expected_sha=${GIT_HASH}" >&2 + echo " actual_sha=$(git rev-parse HEAD 2>/dev/null || printf '')" >&2 + echo " core.sparseCheckout=$(git config --bool core.sparseCheckout 2>/dev/null || printf 'false')" >&2 + echo " core.sparseCheckoutCone=$(git config --bool core.sparseCheckoutCone 2>/dev/null || printf 'false')" >&2 + if git ls-tree -r --name-only HEAD -- "$path" 2>/dev/null | grep -qx "$path"; then + echo "${path} exists in the target commit but is missing from the working tree. The host checkout is likely sparse or incomplete." >&2 + else + echo "${path} is not present in the target commit. Check branch/SHA resolution and the deployed revision." >&2 + fi + exit 1 + } + require_migration_path() { + path="$1" + if [ ! 
-e "${REPO_ROOT}/${path}" ]; then + diagnose_missing_path "$path" + fi + } + require_migration_path_if_tracked() { + path="$1" + if git ls-tree -r --name-only HEAD -- "$path" 2>/dev/null | grep -qx "$path" || + git ls-tree -d --name-only HEAD -- "$path" 2>/dev/null | grep -qx "$path"; then + require_migration_path "$path" + fi + } + repair_migration_sparse_checkout() { + echo "Configuring sparse checkout for Prisma migration files" >&2 + git sparse-checkout init --cone || true + git sparse-checkout set \ + .yarn \ + packages/prisma + } if [ ! -d "$REPO_ROOT/.git" ]; then - git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + git clone --depth 1 --no-checkout --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" fi cd "$REPO_ROOT" + repair_migration_sparse_checkout git fetch origin "$BRANCH_NAME" --depth 1 || true git fetch origin "$GIT_HASH" --depth 1 || true git checkout --detach "$GIT_HASH" @@ -856,6 +895,18 @@ jobs: echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 exit 1 fi + require_migration_path "package.json" + require_migration_path "yarn.lock" + require_migration_path_if_tracked ".yarnrc.yml" + require_migration_path_if_tracked ".yarn" + require_migration_path_if_tracked "turbo.json" + require_migration_path "packages/prisma/package.json" + require_migration_path "packages/prisma/migrations" + require_migration_path "packages/prisma/schema.prisma" + if [ ! 
-f "${REPO_ROOT}/packages/prisma/schema.prisma" ]; then + echo "Prisma schema path exists but is not a regular file: ${REPO_ROOT}/packages/prisma/schema.prisma" >&2 + exit 1 + fi chmod +x infra/scripts/migrate.sh # Capture script output — parse MIGRATIONS_APPLIED_JSON and MIGRATION_COUNT set +e diff --git a/infra/scripts/migrate.sh b/infra/scripts/migrate.sh index 361cf9d12f112b..b6f15457c6388c 100755 --- a/infra/scripts/migrate.sh +++ b/infra/scripts/migrate.sh @@ -71,6 +71,52 @@ MIGRATION_TIMEOUT_SECONDS="${MIGRATION_TIMEOUT_SECONDS:-300}" ENABLE_DB_BACKUP="${ENABLE_DB_BACKUP:-false}" DB_BACKUP_COMMAND="${DB_BACKUP_COMMAND:-}" +repair_migration_sparse_checkout() { + log INFO "Configuring sparse checkout for Prisma migration files" + git sparse-checkout init --cone || true + git sparse-checkout set \ + .yarn \ + packages/prisma \ + || fail "Failed to configure sparse checkout for migration paths in ${REPO_ROOT}" +} + +diagnose_missing_path() { + local path="$1" + + log ERROR "Missing required path: ${REPO_ROOT}/${path}" + log ERROR "Checkout diagnostics:" + log ERROR " pwd=$(pwd)" + log ERROR " repo_root=${REPO_ROOT}" + log ERROR " branch=${BRANCH_NAME}" + log ERROR " expected_sha=${GIT_HASH:-}" + log ERROR " actual_sha=$(git rev-parse HEAD 2>/dev/null || printf '')" + log ERROR " core.sparseCheckout=$(git config --bool core.sparseCheckout 2>/dev/null || printf 'false')" + log ERROR " core.sparseCheckoutCone=$(git config --bool core.sparseCheckoutCone 2>/dev/null || printf 'false')" + + if git ls-tree -r --name-only HEAD -- "$path" 2>/dev/null | grep -qx "$path"; then + fail "${path} exists in the target commit but is missing from the working tree. This usually means the host checkout is sparse or incomplete." + fi + + fail "${path} is not present in the target commit. Check branch/SHA resolution and the deployed revision." +} + +require_repo_path() { + local path="$1" + + if [ ! 
-e "${REPO_ROOT}/${path}" ]; then + diagnose_missing_path "$path" + fi +} + +require_repo_path_if_tracked() { + local path="$1" + + if git ls-tree -r --name-only HEAD -- "$path" 2>/dev/null | grep -qx "$path" || + git ls-tree -d --name-only HEAD -- "$path" 2>/dev/null | grep -qx "$path"; then + require_repo_path "$path" + fi +} + # Validate timeout case "$MIGRATION_TIMEOUT_SECONDS" in ''|*[!0-9]*) fail "MIGRATION_TIMEOUT_SECONDS must be a positive integer" ;; @@ -84,6 +130,7 @@ esac if [ -d "${REPO_ROOT}/.git" ]; then log INFO "Using existing checkout at ${REPO_ROOT}" cd "$REPO_ROOT" || fail "Cannot cd to ${REPO_ROOT}" + repair_migration_sparse_checkout git fetch origin "$BRANCH_NAME" --depth 1 2>/dev/null || true if [ -n "$GIT_HASH" ]; then @@ -95,13 +142,16 @@ if [ -d "${REPO_ROOT}/.git" ]; then elif [ -n "$REPO_URL" ]; then log INFO "No existing checkout found at ${REPO_ROOT} — cloning ${BRANCH_NAME}" mkdir -p "$(dirname "$REPO_ROOT")" - git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" \ + git clone --depth 1 --no-checkout --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" \ || fail "Failed to clone repository" cd "$REPO_ROOT" || fail "Failed to cd to ${REPO_ROOT}" + repair_migration_sparse_checkout if [ -n "$GIT_HASH" ]; then git fetch origin "$GIT_HASH" --depth 1 || fail "Failed to fetch ${GIT_HASH}" git checkout "$GIT_HASH" || fail "Failed to checkout ${GIT_HASH}" + else + git checkout "origin/$BRANCH_NAME" || fail "Failed to checkout origin/${BRANCH_NAME}" fi else fail "REPO_ROOT/.git does not exist and REPO_URL is not set — cannot proceed" @@ -116,9 +166,15 @@ log INFO "Running migration preflight checks" # 1. Confirm Prisma schema exists PRISMA_SCHEMA="${REPO_ROOT}/packages/prisma/schema.prisma" -if [ ! 
-f "$PRISMA_SCHEMA" ]; then - fail "Prisma schema not found at ${PRISMA_SCHEMA}" -fi +require_repo_path "package.json" +require_repo_path "yarn.lock" +require_repo_path_if_tracked ".yarnrc.yml" +require_repo_path_if_tracked ".yarn" +require_repo_path_if_tracked "turbo.json" +require_repo_path "packages/prisma/package.json" +require_repo_path "packages/prisma/migrations" +require_repo_path "packages/prisma/schema.prisma" +[ -f "$PRISMA_SCHEMA" ] || fail "Prisma schema path exists but is not a regular file: ${PRISMA_SCHEMA}" log INFO "Prisma schema found: ${PRISMA_SCHEMA}" # 2. Confirm DATABASE_URL is non-empty (already checked by require_value, but confirm format) From 8b4b3aa7de7af159cfb0c58c5f1941d6418e4131 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 12:32:38 +0530 Subject: [PATCH 05/12] fix(ci): validate release sha before checkout --- .github/workflows/deploy-all.yml | 51 +++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index d852cc8d77a805..95a653f0e1372f 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -313,7 +313,7 @@ jobs: needs: preflight outputs: branch: ${{ steps.release.outputs.branch }} - git_hash: ${{ steps.release.outputs.git_hash }} + git_hash: ${{ steps.release.outputs.release_sha }} release_id: ${{ steps.release.outputs.release_id }} deploy_env: ${{ steps.release.outputs.deploy_env }} image_suffix: ${{ steps.release.outputs.image_suffix }} @@ -349,6 +349,7 @@ jobs: fi release_id="v$(date -u '+%Y%m%d-%H%M%S')" echo "branch=${BRANCH}" >> "$GITHUB_OUTPUT" + echo "release_sha=${git_hash}" >> "$GITHUB_OUTPUT" echo "git_hash=${git_hash}" >> "$GITHUB_OUTPUT" echo "release_id=${release_id}" >> "$GITHUB_OUTPUT" echo "deploy_env=${deploy_env}" >> "$GITHUB_OUTPUT" @@ -373,6 +374,21 @@ jobs: echo "ecr_registry=${ecr_registry}" >> "$GITHUB_OUTPUT" echo "Prepared ${deploy_env} release 
${release_id} for ${BRANCH}@${git_hash}" + - name: Validate release metadata outputs + shell: bash + env: + RELEASE_SHA: ${{ steps.release.outputs.release_sha }} + run: | + set -euo pipefail + if [ -z "$RELEASE_SHA" ]; then + echo "prepare-release did not emit release_sha; downstream checkout would default to the wrong ref" >&2 + exit 1 + fi + if ! printf '%s' "$RELEASE_SHA" | grep -Eq '^[0-9a-f]{40}$'; then + echo "prepare-release emitted invalid release_sha: ${RELEASE_SHA}" >&2 + exit 1 + fi + - name: Notify deployment started env: SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }} @@ -441,6 +457,17 @@ jobs: outputs: web_image: ${{ steps.meta.outputs.web_image }} steps: + - name: Validate release SHA + shell: bash + env: + RELEASE_SHA: ${{ needs.prepare-release.outputs.git_hash }} + run: | + set -euo pipefail + if [ -z "$RELEASE_SHA" ]; then + echo "prepare-release output git_hash is empty; refusing to checkout the workflow default ref" >&2 + exit 1 + fi + - name: Checkout code uses: actions/checkout@v4 with: @@ -567,6 +594,17 @@ jobs: outputs: api_image: ${{ steps.meta.outputs.api_image }} steps: + - name: Validate release SHA + shell: bash + env: + RELEASE_SHA: ${{ needs.prepare-release.outputs.git_hash }} + run: | + set -euo pipefail + if [ -z "$RELEASE_SHA" ]; then + echo "prepare-release output git_hash is empty; refusing to checkout the workflow default ref" >&2 + exit 1 + fi + - name: Checkout code uses: actions/checkout@v4 with: @@ -656,6 +694,17 @@ jobs: outputs: worker_image: ${{ steps.meta.outputs.worker_image }} steps: + - name: Validate release SHA + shell: bash + env: + RELEASE_SHA: ${{ needs.prepare-release.outputs.git_hash }} + run: | + set -euo pipefail + if [ -z "$RELEASE_SHA" ]; then + echo "prepare-release output git_hash is empty; refusing to checkout the workflow default ref" >&2 + exit 1 + fi + - name: Checkout code uses: actions/checkout@v4 with: From 0bb3017e021203cb663382d15f2b198b575da2ae Mon Sep 17 00:00:00 2001 From: arjun3492 Date: 
Wed, 27 May 2026 12:52:47 +0530 Subject: [PATCH 06/12] fix(deploy): avoid secret-tainted release outputs --- .github/workflows/deploy-all.yml | 38 +++++++++----------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index 95a653f0e1372f..7f17545f49400f 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -313,15 +313,13 @@ jobs: needs: preflight outputs: branch: ${{ steps.release.outputs.branch }} - git_hash: ${{ steps.release.outputs.release_sha }} + git_hash: ${{ steps.release.outputs.git_hash }} release_id: ${{ steps.release.outputs.release_id }} deploy_env: ${{ steps.release.outputs.deploy_env }} image_suffix: ${{ steps.release.outputs.image_suffix }} web_repo: ${{ steps.release.outputs.web_repo }} api_repo: ${{ steps.release.outputs.api_repo }} worker_repo: ${{ steps.release.outputs.worker_repo }} - worker_replicas: ${{ steps.release.outputs.worker_replicas }} - ecr_registry: ${{ steps.release.outputs.ecr_registry }} steps: - name: Checkout code uses: actions/checkout@v4 @@ -349,7 +347,6 @@ jobs: fi release_id="v$(date -u '+%Y%m%d-%H%M%S')" echo "branch=${BRANCH}" >> "$GITHUB_OUTPUT" - echo "release_sha=${git_hash}" >> "$GITHUB_OUTPUT" echo "git_hash=${git_hash}" >> "$GITHUB_OUTPUT" echo "release_id=${release_id}" >> "$GITHUB_OUTPUT" echo "deploy_env=${deploy_env}" >> "$GITHUB_OUTPUT" @@ -357,35 +354,20 @@ jobs: echo "web_repo=cal_${image_suffix}" >> "$GITHUB_OUTPUT" echo "api_repo=cal_api_${image_suffix}" >> "$GITHUB_OUTPUT" echo "worker_repo=cal_worker_${image_suffix}" >> "$GITHUB_OUTPUT" - worker_replicas="${{ github.event.inputs.worker_replicas }}" - if [ -z "$worker_replicas" ]; then - if [ "$deploy_env" = "production" ]; then - worker_replicas="${{ secrets.WORKER_REPLICAS_PROD }}" - else - worker_replicas="${{ secrets.WORKER_REPLICAS_STAG }}" - fi - fi - worker_replicas="${worker_replicas:-1}" - echo 
"worker_replicas=${worker_replicas}" >> "$GITHUB_OUTPUT" - # ECR registry is derived from AWS_ACCOUNT_ID and region - account_id="${{ secrets.AWS_ACCOUNT_ID }}" - region="${{ secrets.AWS_REGION }}" - ecr_registry="${account_id}.dkr.ecr.${region}.amazonaws.com" - echo "ecr_registry=${ecr_registry}" >> "$GITHUB_OUTPUT" echo "Prepared ${deploy_env} release ${release_id} for ${BRANCH}@${git_hash}" - name: Validate release metadata outputs shell: bash env: - RELEASE_SHA: ${{ steps.release.outputs.release_sha }} + RELEASE_SHA: ${{ steps.release.outputs.git_hash }} run: | set -euo pipefail if [ -z "$RELEASE_SHA" ]; then - echo "prepare-release did not emit release_sha; downstream checkout would default to the wrong ref" >&2 + echo "prepare-release did not emit git_hash; downstream checkout would default to the wrong ref" >&2 exit 1 fi if ! printf '%s' "$RELEASE_SHA" | grep -Eq '^[0-9a-f]{40}$'; then - echo "prepare-release emitted invalid release_sha: ${RELEASE_SHA}" >&2 + echo "prepare-release emitted invalid git_hash: ${RELEASE_SHA}" >&2 exit 1 fi @@ -1005,7 +987,7 @@ jobs: uses: appleboy/ssh-action@v0.1.10 env: API_IMAGE: ${{ needs.build-api.outputs.api_image }} - ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} REPO_URL: ${{ secrets.REPO_URL }} @@ -1061,7 +1043,7 @@ jobs: uses: appleboy/ssh-action@v0.1.10 env: WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} - ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} REPO_URL: ${{ secrets.REPO_URL }} @@ -1306,11 +1288,11 @@ jobs: uses: appleboy/ssh-action@v0.1.10 env: WORKER_IMAGE: ${{ 
needs.build-worker.outputs.worker_image }} - ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} REPO_URL: ${{ secrets.REPO_URL }} - WORKER_REPLICAS: ${{ needs.prepare-release.outputs.worker_replicas }} + WORKER_REPLICAS: ${{ github.event.inputs.worker_replicas || (needs.prepare-release.outputs.deploy_env == 'production' && secrets.WORKER_REPLICAS_PROD || secrets.WORKER_REPLICAS_STAG) || '1' }} AWS_REGION: ${{ secrets.AWS_REGION }} with: host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} @@ -1391,7 +1373,7 @@ jobs: shell: bash env: DEPLOY_ENV: ${{ needs.prepare-release.outputs.deploy_env }} - ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com WEB_REPO_NAME: ${{ needs.prepare-release.outputs.web_repo }} API_REPO_NAME: ${{ needs.prepare-release.outputs.api_repo }} WORKER_REPO_NAME: ${{ needs.prepare-release.outputs.worker_repo }} @@ -1474,7 +1456,7 @@ jobs: HOMEPAGE_URL: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.HOMEPAGE_URL_PROD || secrets.HOMEPAGE_URL_STAG }} CERTBOT_EMAIL: ${{ secrets.CERTBOT_EMAIL }} AWS_REGION: ${{ secrets.AWS_REGION }} - ECR_REGISTRY: ${{ needs.prepare-release.outputs.ecr_registry }} + ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com WEB_IMAGE: ${{ steps.current.outputs.web_image }} API_IMAGE: ${{ steps.current.outputs.api_image }} WORKER_IMAGE: ${{ steps.current.outputs.worker_image }} From eafa6b5fbae70bc7ce11da537ecd6ada9a9c8088 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 13:36:39 +0530 Subject: [PATCH 07/12] fix(ci):included migration scripts during sparse 
checkout --- .github/workflows/deploy-all.yml | 2 ++ infra/scripts/migrate.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index 7f17545f49400f..3d8d2f7cad91a5 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -911,6 +911,7 @@ jobs: git sparse-checkout init --cone || true git sparse-checkout set \ .yarn \ + infra/scripts \ packages/prisma } if [ ! -d "$REPO_ROOT/.git" ]; then @@ -931,6 +932,7 @@ jobs: require_migration_path_if_tracked ".yarnrc.yml" require_migration_path_if_tracked ".yarn" require_migration_path_if_tracked "turbo.json" + require_migration_path "infra/scripts/migrate.sh" require_migration_path "packages/prisma/package.json" require_migration_path "packages/prisma/migrations" require_migration_path "packages/prisma/schema.prisma" diff --git a/infra/scripts/migrate.sh b/infra/scripts/migrate.sh index b6f15457c6388c..a489ab29713f2d 100755 --- a/infra/scripts/migrate.sh +++ b/infra/scripts/migrate.sh @@ -76,6 +76,7 @@ repair_migration_sparse_checkout() { git sparse-checkout init --cone || true git sparse-checkout set \ .yarn \ + infra/scripts \ packages/prisma \ || fail "Failed to configure sparse checkout for migration paths in ${REPO_ROOT}" } @@ -171,6 +172,7 @@ require_repo_path "yarn.lock" require_repo_path_if_tracked ".yarnrc.yml" require_repo_path_if_tracked ".yarn" require_repo_path_if_tracked "turbo.json" +require_repo_path "infra/scripts/migrate.sh" require_repo_path "packages/prisma/package.json" require_repo_path "packages/prisma/migrations" require_repo_path "packages/prisma/schema.prisma" From 126de11bdeaffd869e65b2f08cdc0e473a90e0b2 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 14:54:31 +0530 Subject: [PATCH 08/12] fix(deploy): use full repo checkout for migrations with deferred cleanup Replace sparse checkout with full repository checkout in migrate-db step to resolve workspace dependency resolution failures 
(@calcom/lib 404 errors). Changes: - migrate-db: clone full repo instead of sparse checkout - migrate.sh: add DEFER_CLEANUP env support to defer cleanup until downstream stages complete - deploy-all.yml: add cleanup steps to verify and rollback-after-promotion jobs - migrate-db: set DEFER_CLEANUP=true to preserve checkout for deploy stages This ensures all workspace packages resolve correctly during yarn install while deferring cleanup until after deployment pipeline completes successfully. Co-Authored-By: Claude Opus 4.7 --- .github/workflows/deploy-all.yml | 56 +++++++++++++++++++++++++------- infra/scripts/migrate.sh | 37 ++++++++++++++++----- 2 files changed, 73 insertions(+), 20 deletions(-) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index 3d8d2f7cad91a5..7da884c46fac61 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -867,12 +867,13 @@ jobs: ENABLE_DB_BACKUP: ${{ secrets.ENABLE_DB_BACKUP || 'false' }} DB_BACKUP_COMMAND: ${{ secrets.DB_BACKUP_COMMAND || '' }} MIGRATION_TIMEOUT_SECONDS: "600" + DEFER_CLEANUP: "true" with: host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} username: onehash key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} command_timeout: 1200s - envs: GIT_HASH,RELEASE_ID,DATABASE_URL,REPO_URL,BRANCH_NAME,REPO_ROOT,ENABLE_DB_BACKUP,DB_BACKUP_COMMAND,MIGRATION_TIMEOUT_SECONDS + envs: GIT_HASH,RELEASE_ID,DATABASE_URL,REPO_URL,BRANCH_NAME,REPO_ROOT,ENABLE_DB_BACKUP,DB_BACKUP_COMMAND,MIGRATION_TIMEOUT_SECONDS,DEFER_CLEANUP script: | set -euo pipefail diagnose_missing_path() { @@ -906,19 +907,10 @@ jobs: require_migration_path "$path" fi } - repair_migration_sparse_checkout() { - echo "Configuring sparse checkout for Prisma migration files" >&2 - git sparse-checkout init --cone || true - git sparse-checkout set \ - .yarn \ - 
infra/scripts \ - packages/prisma - } if [ ! -d "$REPO_ROOT/.git" ]; then - git clone --depth 1 --no-checkout --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" fi cd "$REPO_ROOT" - repair_migration_sparse_checkout git fetch origin "$BRANCH_NAME" --depth 1 || true git fetch origin "$GIT_HASH" --depth 1 || true git checkout --detach "$GIT_HASH" @@ -1537,6 +1529,27 @@ jobs: echo "Automatic rollback did not restore the previous release. Final notification will include rollback status: ${{ steps.rollback-outcome.outputs.rollback_status }}" exit 1 + - name: Clean up VPC work directory on rollback + if: always() + uses: appleboy/ssh-action@v0.1.10 + env: + REPO_ROOT: /home/onehash/onehash-cal + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 120s + envs: REPO_ROOT + script: | + set -euo pipefail + if [ -d "$REPO_ROOT" ]; then + echo "Cleaning up VPC work directory after rollback: ${REPO_ROOT}" + rm -rf "$REPO_ROOT" || echo "Warning: Failed to clean up ${REPO_ROOT}" + echo "VPC work directory cleaned up" + else + echo "No work directory to clean: ${REPO_ROOT}" + fi + # ---------- Verification ---------- verify: @@ -1592,6 +1605,27 @@ jobs: GITHUB_ACTOR: ${{ github.actor }} run: bash infra/scripts/record-state.sh + - name: Clean up VPC work directory + if: always() + uses: appleboy/ssh-action@v0.1.10 + env: + REPO_ROOT: /home/onehash/onehash-cal + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 120s + envs: REPO_ROOT + 
script: | + set -euo pipefail + if [ -d "$REPO_ROOT" ]; then + echo "Cleaning up VPC work directory: ${REPO_ROOT}" + rm -rf "$REPO_ROOT" || echo "Warning: Failed to clean up ${REPO_ROOT}" + echo "VPC work directory cleaned up" + else + echo "No work directory to clean: ${REPO_ROOT}" + fi + # release-lock: # name: Release deployment lock (disabled) # runs-on: ubuntu-latest diff --git a/infra/scripts/migrate.sh b/infra/scripts/migrate.sh index a489ab29713f2d..d0062ab0946451 100755 --- a/infra/scripts/migrate.sh +++ b/infra/scripts/migrate.sh @@ -71,14 +71,33 @@ MIGRATION_TIMEOUT_SECONDS="${MIGRATION_TIMEOUT_SECONDS:-300}" ENABLE_DB_BACKUP="${ENABLE_DB_BACKUP:-false}" DB_BACKUP_COMMAND="${DB_BACKUP_COMMAND:-}" +# Track whether we've already cleaned up to avoid double-cleanup +WORK_DIR_CLEANED=false + +cleanup_workdir() { + # Cleanup is deferred when DEFER_CLEANUP=true — the verify stage handles cleanup. + # This allows downstream deployment stages to reuse the checkout. + if [ "$WORK_DIR_CLEANED" = "false" ] && [ -d "$REPO_ROOT" ]; then + if [ "${DEFER_CLEANUP:-false}" = "true" ]; then + log INFO "DEFER_CLEANUP=true — deferring cleanup to verify stage" + elif [ "$KEEP_WORK_DIR" = "true" ]; then + log INFO "KEEP_WORK_DIR=true — preserving work directory at ${REPO_ROOT}" + else + log INFO "Cleaning up work directory: ${REPO_ROOT}" + rm -rf "$REPO_ROOT" || log WARN "Failed to clean up ${REPO_ROOT}" + fi + WORK_DIR_CLEANED=true + fi +} + +# Always attempt cleanup on script exit +trap cleanup_workdir EXIT + repair_migration_sparse_checkout() { - log INFO "Configuring sparse checkout for Prisma migration files" - git sparse-checkout init --cone || true - git sparse-checkout set \ - .yarn \ - infra/scripts \ - packages/prisma \ - || fail "Failed to configure sparse checkout for migration paths in ${REPO_ROOT}" + # Full repository checkout is used for migrations to ensure workspace + # dependencies resolve correctly without manual sparse-checkout maintenance. 
+ # Cleanup is handled by the EXIT trap below. + log INFO "Using full repository checkout for migrations" } diagnose_missing_path() { @@ -141,9 +160,9 @@ if [ -d "${REPO_ROOT}/.git" ]; then git checkout "origin/$BRANCH_NAME" || fail "Failed to checkout origin/${BRANCH_NAME}" fi elif [ -n "$REPO_URL" ]; then - log INFO "No existing checkout found at ${REPO_ROOT} — cloning ${BRANCH_NAME}" + log INFO "No existing checkout found at ${REPO_ROOT} — cloning full repository" mkdir -p "$(dirname "$REPO_ROOT")" - git clone --depth 1 --no-checkout --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" \ + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" \ || fail "Failed to clone repository" cd "$REPO_ROOT" || fail "Failed to cd to ${REPO_ROOT}" repair_migration_sparse_checkout From 2d1c8ee49bbf26206d016a68cc1f747f56f4a5ab Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 15:08:09 +0530 Subject: [PATCH 09/12] fix(deploy): use full migration checkout with safe cleanup --- .github/workflows/deploy-all.yml | 74 +++++++++++++++++++------------- infra/scripts/migrate.sh | 24 ++++++++--- 2 files changed, 62 insertions(+), 36 deletions(-) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index 7da884c46fac61..e73b535ebeb5a4 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -907,10 +907,20 @@ jobs: require_migration_path "$path" fi } + ensure_full_checkout() { + if git config --bool core.sparseCheckout 2>/dev/null | grep -qx "true"; then + echo "Sparse checkout is enabled at ${REPO_ROOT}; disabling it for full workspace dependency resolution" >&2 + if ! git sparse-checkout disable; then + git config core.sparseCheckout false + git read-tree -mu HEAD + fi + fi + } if [ ! 
-d "$REPO_ROOT/.git" ]; then git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" fi cd "$REPO_ROOT" + ensure_full_checkout git fetch origin "$BRANCH_NAME" --depth 1 || true git fetch origin "$GIT_HASH" --depth 1 || true git checkout --detach "$GIT_HASH" @@ -938,7 +948,7 @@ jobs: output=$(GIT_HASH="$GIT_HASH" RELEASE_ID="$RELEASE_ID" DATABASE_URL="$DATABASE_URL" \ REPO_URL="$REPO_URL" BRANCH_NAME="$BRANCH_NAME" REPO_ROOT="$REPO_ROOT" \ ENABLE_DB_BACKUP="$ENABLE_DB_BACKUP" DB_BACKUP_COMMAND="$DB_BACKUP_COMMAND" \ - MIGRATION_TIMEOUT_SECONDS="$MIGRATION_TIMEOUT_SECONDS" \ + MIGRATION_TIMEOUT_SECONDS="$MIGRATION_TIMEOUT_SECONDS" DEFER_CLEANUP="$DEFER_CLEANUP" \ infra/scripts/migrate.sh 2>&1) exit_code=$? set -e @@ -1529,27 +1539,6 @@ jobs: echo "Automatic rollback did not restore the previous release. Final notification will include rollback status: ${{ steps.rollback-outcome.outputs.rollback_status }}" exit 1 - - name: Clean up VPC work directory on rollback - if: always() - uses: appleboy/ssh-action@v0.1.10 - env: - REPO_ROOT: /home/onehash/onehash-cal - with: - host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} - username: onehash - key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} - command_timeout: 120s - envs: REPO_ROOT - script: | - set -euo pipefail - if [ -d "$REPO_ROOT" ]; then - echo "Cleaning up VPC work directory after rollback: ${REPO_ROOT}" - rm -rf "$REPO_ROOT" || echo "Warning: Failed to clean up ${REPO_ROOT}" - echo "VPC work directory cleaned up" - else - echo "No work directory to clean: ${REPO_ROOT}" - fi - # ---------- Verification ---------- verify: @@ -1605,8 +1594,22 @@ jobs: GITHUB_ACTOR: ${{ github.actor }} run: bash infra/scripts/record-state.sh - - name: Clean up VPC work directory - if: always() + cleanup-vpc-workdir: + name: Clean up VPC work directory + runs-on: ubuntu-latest + 
needs: + - prepare-release + - migrate-db + - deploy-api + - deploy-web + - promote-all + - record-promoted-state + - deploy-worker + - rollback-after-promotion + - verify + if: ${{ always() && needs.prepare-release.result == 'success' }} + steps: + - name: Remove release checkout from target host uses: appleboy/ssh-action@v0.1.10 env: REPO_ROOT: /home/onehash/onehash-cal @@ -1618,13 +1621,21 @@ jobs: envs: REPO_ROOT script: | set -euo pipefail - if [ -d "$REPO_ROOT" ]; then - echo "Cleaning up VPC work directory: ${REPO_ROOT}" - rm -rf "$REPO_ROOT" || echo "Warning: Failed to clean up ${REPO_ROOT}" - echo "VPC work directory cleaned up" - else - echo "No work directory to clean: ${REPO_ROOT}" - fi + case "$REPO_ROOT" in + /home/onehash/onehash-cal) + if [ -d "$REPO_ROOT" ]; then + echo "Cleaning up VPC work directory: ${REPO_ROOT}" + rm -rf "$REPO_ROOT" || echo "Warning: Failed to clean up ${REPO_ROOT}" + echo "VPC work directory cleaned up" + else + echo "No work directory to clean: ${REPO_ROOT}" + fi + ;; + *) + echo "Refusing to clean unexpected REPO_ROOT: ${REPO_ROOT}" >&2 + exit 1 + ;; + esac # release-lock: # name: Release deployment lock (disabled) @@ -1677,6 +1688,7 @@ jobs: - deploy-worker - rollback-after-promotion - verify + - cleanup-vpc-workdir # - release-lock if: always() steps: diff --git a/infra/scripts/migrate.sh b/infra/scripts/migrate.sh index d0062ab0946451..00d1b5778d9a2d 100755 --- a/infra/scripts/migrate.sh +++ b/infra/scripts/migrate.sh @@ -75,7 +75,7 @@ DB_BACKUP_COMMAND="${DB_BACKUP_COMMAND:-}" WORK_DIR_CLEANED=false cleanup_workdir() { - # Cleanup is deferred when DEFER_CLEANUP=true — the verify stage handles cleanup. + # Cleanup is deferred when DEFER_CLEANUP=true — the workflow cleanup job handles cleanup. # This allows downstream deployment stages to reuse the checkout. 
if [ "$WORK_DIR_CLEANED" = "false" ] && [ -d "$REPO_ROOT" ]; then if [ "${DEFER_CLEANUP:-false}" = "true" ]; then @@ -84,7 +84,14 @@ cleanup_workdir() { log INFO "KEEP_WORK_DIR=true — preserving work directory at ${REPO_ROOT}" else log INFO "Cleaning up work directory: ${REPO_ROOT}" - rm -rf "$REPO_ROOT" || log WARN "Failed to clean up ${REPO_ROOT}" + case "$REPO_ROOT" in + /home/onehash/onehash-cal) + rm -rf "$REPO_ROOT" || log WARN "Failed to clean up ${REPO_ROOT}" + ;; + *) + log WARN "Refusing to clean unexpected REPO_ROOT: ${REPO_ROOT}" + ;; + esac fi WORK_DIR_CLEANED=true fi @@ -93,11 +100,18 @@ cleanup_workdir() { # Always attempt cleanup on script exit trap cleanup_workdir EXIT -repair_migration_sparse_checkout() { +ensure_full_checkout() { # Full repository checkout is used for migrations to ensure workspace # dependencies resolve correctly without manual sparse-checkout maintenance. # Cleanup is handled by the EXIT trap below. log INFO "Using full repository checkout for migrations" + if git config --bool core.sparseCheckout 2>/dev/null | grep -qx "true"; then + log INFO "Sparse checkout is enabled at ${REPO_ROOT}; disabling it for full workspace dependency resolution" + if ! 
git sparse-checkout disable; then + git config core.sparseCheckout false || fail "Failed to clear sparse checkout config in ${REPO_ROOT}" + git read-tree -mu HEAD || fail "Failed to repopulate working tree after disabling sparse checkout in ${REPO_ROOT}" + fi + fi } diagnose_missing_path() { @@ -150,7 +164,7 @@ esac if [ -d "${REPO_ROOT}/.git" ]; then log INFO "Using existing checkout at ${REPO_ROOT}" cd "$REPO_ROOT" || fail "Cannot cd to ${REPO_ROOT}" - repair_migration_sparse_checkout + ensure_full_checkout git fetch origin "$BRANCH_NAME" --depth 1 2>/dev/null || true if [ -n "$GIT_HASH" ]; then @@ -165,7 +179,7 @@ elif [ -n "$REPO_URL" ]; then git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" \ || fail "Failed to clone repository" cd "$REPO_ROOT" || fail "Failed to cd to ${REPO_ROOT}" - repair_migration_sparse_checkout + ensure_full_checkout if [ -n "$GIT_HASH" ]; then git fetch origin "$GIT_HASH" --depth 1 || fail "Failed to fetch ${GIT_HASH}" From 035f36cca0a1425cb1b908707ed4a439b47d0197 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 15:58:03 +0530 Subject: [PATCH 10/12] fix(deploy): install migration build dependencies --- .github/workflows/deploy-all.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index e73b535ebeb5a4..31382f5911a3d4 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -854,6 +854,28 @@ jobs: # LOCK_TTL_SECONDS: 7200 # run: bash infra/scripts/refresh-lock.sh + - name: Install migration build dependencies on target VPC host + uses: appleboy/ssh-action@v0.1.10 + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 300s + script: | + set -euo 
pipefail + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + python3 \ + pkg-config \ + libssl-dev \ + libffi-dev \ + git + gcc --version | head -n 1 + python3 --version + pkg-config --version + git --version + - name: Run database migrations from target VPC id: run-migrations uses: appleboy/ssh-action@v0.1.10 From 3c92bf37e58a49b43e3e4bebe9f93c17f1ad5c01 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Wed, 27 May 2026 16:51:23 +0530 Subject: [PATCH 11/12] fix(deploy): harden migration native builds --- .github/workflows/deploy-all.yml | 12 +++++++++++- infra/scripts/migrate.sh | 16 +++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index 31382f5911a3d4..25ab0b3e0655fd 100644 --- a/.github/workflows/deploy-all.yml +++ b/.github/workflows/deploy-all.yml @@ -870,11 +870,21 @@ jobs: pkg-config \ libssl-dev \ libffi-dev \ - git + git \ + node-gyp \ + make \ + g++ \ + curl gcc --version | head -n 1 + make --version | head -n 1 + g++ --version | head -n 1 python3 --version pkg-config --version git --version + node --version + npm --version + echo "Available memory:" + free -h - name: Run database migrations from target VPC id: run-migrations diff --git a/infra/scripts/migrate.sh b/infra/scripts/migrate.sh index 00d1b5778d9a2d..c4d7f7a1862718 100755 --- a/infra/scripts/migrate.sh +++ b/infra/scripts/migrate.sh @@ -228,7 +228,16 @@ else log INFO "No GIT_HASH specified — using current HEAD: ${resolved_sha}" fi -# 4. Verify Prisma can read the schema (dry-run validation) +# 4. Verify native build environment has enough memory for Yarn linking. 
+if command -v free >/dev/null 2>&1; then + available_memory_mb="$(free -m | awk 'NR==2 {print $7}')" + if [ -n "$available_memory_mb" ] && [ "$available_memory_mb" -lt 2048 ]; then + fail "Insufficient available memory for native dependency builds: ${available_memory_mb}MB available, need at least 2048MB" + fi + log INFO "Available memory for dependency install: ${available_memory_mb}MB" +fi + +# 5. Verify Prisma can read the schema (dry-run validation) # This doesn't connect to DB — just validates the schema file is parseable. # Skip if prisma is not installed yet (yarn install will fix this). if command -v npx >/dev/null 2>&1 && [ -f "${REPO_ROOT}/package.json" ]; then @@ -255,6 +264,11 @@ fi if [ "$SKIP_YARN_INSTALL" != "true" ]; then if [ ! -d "${REPO_ROOT}/node_modules" ]; then log INFO "Installing dependencies" + export NODE_OPTIONS="${NODE_OPTIONS:---max-old-space-size=4096}" + export PRISMA_SKIP_POSTINSTALL_GENERATE="${PRISMA_SKIP_POSTINSTALL_GENERATE:-true}" + export YARN_HTTP_TIMEOUT="${YARN_HTTP_TIMEOUT:-120000}" + export npm_config_python="${npm_config_python:-python3}" + if command -v corepack >/dev/null 2>&1; then corepack enable >/dev/null 2>&1 || log WARN "corepack enable failed; continuing with existing yarn" fi From 25d7640e8767e2fa69bd6739aab68f34405f6b81 Mon Sep 17 00:00:00 2001 From: arjun3492 Date: Fri, 29 May 2026 16:18:29 +0530 Subject: [PATCH 12/12] ci:added preflight checks and flow hardening --- .github/workflows/deploy-all.yml | 241 +++++- .github/workflows/worker-deploy.yml | 31 +- docs/deployment_fix.md | 1158 +++++++++++++++++++++++++++ infra/scripts/preflight-space.sh | 77 +- infra/scripts/rollback.sh | 5 +- infra/scripts/send-email.sh | 4 +- 6 files changed, 1488 insertions(+), 28 deletions(-) create mode 100644 docs/deployment_fix.md diff --git a/.github/workflows/deploy-all.yml b/.github/workflows/deploy-all.yml index 25ab0b3e0655fd..97a12551d3b276 100644 --- a/.github/workflows/deploy-all.yml +++ 
b/.github/workflows/deploy-all.yml @@ -1,6 +1,9 @@ name: Deploy All Services on: + push: + branches: + - pipeline_risk_analysis # Auto-deploy on PR merge to main or develop branches for relevant path changes pull_request: types: @@ -89,17 +92,82 @@ jobs: docker buildx build --check -f apps/connector/Dockerfile . docker buildx build --check -f apps/worker/Dockerfile . - - name: Validate critical deployment scripts + - name: Validate deployment script syntax shell: bash run: | set -euo pipefail - bash -n infra/scripts/preflight-space.sh bash -n infra/scripts/migrate.sh bash -n infra/scripts/start-workers.sh bash -n infra/scripts/drain-workers.sh - PROMOTION_COMPLETE=true WORKER_IMAGE=dry-run WORKER_REPLICAS=1 WORKER_LIFECYCLE_DRY_RUN=true \ - bash infra/scripts/start-workers.sh - WORKER_LIFECYCLE_DRY_RUN=true bash infra/scripts/drain-workers.sh + bash -n infra/scripts/stage-api.sh + bash -n infra/scripts/stage-web.sh + bash -n infra/scripts/promote-all.sh + bash -n infra/scripts/revert-nginx.sh + bash -n infra/scripts/record-state.sh + bash -n infra/scripts/rollback.sh + + - name: Validate required deployment files exist + shell: bash + run: | + set -euo pipefail + failed=0 + + required_scripts=( + infra/scripts/stage-api.sh + infra/scripts/stage-web.sh + infra/scripts/start-workers.sh + infra/scripts/drain-workers.sh + infra/scripts/migrate.sh + infra/scripts/promote-all.sh + infra/scripts/revert-nginx.sh + infra/scripts/record-state.sh + infra/scripts/deploy-state-paths.sh + infra/scripts/ngx-utils.sh + infra/scripts/preflight-space.sh + infra/scripts/send-email.sh + infra/scripts/rollback.sh + infra/scripts/validate-rollback-schema.sh + ) + + required_templates=( + infra/docker/web/nginx.template.conf + infra/docker/connector/nginx.connector.template.conf + infra/docker/connector/nginx.mcp.template.conf + ) + + required_dockers=( + Dockerfile + apps/connector/Dockerfile + apps/worker/Dockerfile + ) + + for f in "${required_scripts[@]}"; do + if [ ! 
-f "$f" ]; then + echo "PREFLIGHT: MISSING — script: $f" + failed=1 + fi + done + + for f in "${required_templates[@]}"; do + if [ ! -f "$f" ]; then + echo "PREFLIGHT: MISSING — template: $f" + failed=1 + fi + done + + for f in "${required_dockers[@]}"; do + if [ ! -f "$f" ]; then + echo "PREFLIGHT: MISSING — dockerfile: $f" + failed=1 + fi + done + + if [ "$failed" -ne 0 ]; then + echo "PREFLIGHT: One or more required files are missing — aborting" + exit 1 + fi + + echo "PREFLIGHT: All required files present" - name: Validate required configuration id: preflight @@ -478,6 +546,38 @@ jobs: id: login-ecr uses: aws-actions/amazon-ecr-login@v2 + - name: Resolve ECR registry + id: ecr-registry + shell: bash + env: + LOGIN_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + set -euo pipefail + registry="$LOGIN_REGISTRY" + if [ -z "$registry" ]; then + registry="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com" + fi + echo "registry=$registry" >> "$GITHUB_OUTPUT" + + - name: Login Docker to resolved ECR registry + shell: bash + env: + ECR_REGISTRY: ${{ steps.ecr-registry.outputs.registry }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + set -euo pipefail + if ! 
command -v aws >/dev/null 2>&1; then + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y curl unzip + curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip + unzip -q /tmp/awscliv2.zip -d /tmp + /tmp/aws/install + fi + aws ecr get-login-password --region "$AWS_REGION" | + docker login --username AWS --password-stdin "$ECR_REGISTRY" + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -558,13 +658,13 @@ jobs: nextauth_secret=${{ secrets.NEXTAUTH_SECRET }} calendso_encryption_key=${{ secrets.CALENDSO_ENCRYPTION_KEY }} tags: | - ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.web_repo }}:${{ needs.prepare-release.outputs.git_hash }} - ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.web_repo }}:latest + ${{ steps.ecr-registry.outputs.registry }}/${{ needs.prepare-release.outputs.web_repo }}:${{ needs.prepare-release.outputs.git_hash }} + ${{ steps.ecr-registry.outputs.registry }}/${{ needs.prepare-release.outputs.web_repo }}:latest - name: Set web image output id: meta run: | - registry="${{ steps.login-ecr.outputs.registry }}" + registry="${{ steps.ecr-registry.outputs.registry }}" echo "web_image=${registry}/${{ needs.prepare-release.outputs.web_repo }}:${{ needs.prepare-release.outputs.git_hash }}" >> "$GITHUB_OUTPUT" build-api: @@ -615,6 +715,38 @@ jobs: id: login-ecr uses: aws-actions/amazon-ecr-login@v2 + - name: Resolve ECR registry + id: ecr-registry + shell: bash + env: + LOGIN_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + set -euo pipefail + registry="$LOGIN_REGISTRY" + if [ -z "$registry" ]; then + registry="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com" + fi + echo "registry=$registry" >> "$GITHUB_OUTPUT" + + - name: Login Docker to resolved ECR registry + shell: bash + env: + ECR_REGISTRY: ${{ 
steps.ecr-registry.outputs.registry }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + set -euo pipefail + if ! command -v aws >/dev/null 2>&1; then + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y curl unzip + curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip + unzip -q /tmp/awscliv2.zip -d /tmp + /tmp/aws/install + fi + aws ecr get-login-password --region "$AWS_REGION" | + docker login --username AWS --password-stdin "$ECR_REGISTRY" + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -658,13 +790,13 @@ jobs: sentry_org=${{ secrets.SENTRY_ORG }} sentry_project=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.SENTRY_PROJECT_PROD || secrets.SENTRY_PROJECT_STAG }} tags: | - ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.api_repo }}:${{ needs.prepare-release.outputs.git_hash }} - ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.api_repo }}:latest + ${{ steps.ecr-registry.outputs.registry }}/${{ needs.prepare-release.outputs.api_repo }}:${{ needs.prepare-release.outputs.git_hash }} + ${{ steps.ecr-registry.outputs.registry }}/${{ needs.prepare-release.outputs.api_repo }}:latest - name: Set API image output id: meta run: | - registry="${{ steps.login-ecr.outputs.registry }}" + registry="${{ steps.ecr-registry.outputs.registry }}" echo "api_image=${registry}/${{ needs.prepare-release.outputs.api_repo }}:${{ needs.prepare-release.outputs.git_hash }}" >> "$GITHUB_OUTPUT" build-worker: @@ -715,6 +847,38 @@ jobs: id: login-ecr uses: aws-actions/amazon-ecr-login@v2 + - name: Resolve ECR registry + id: ecr-registry + shell: bash + env: + LOGIN_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + set -euo pipefail + registry="$LOGIN_REGISTRY" + if [ -z "$registry" ]; then + 
registry="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com" + fi + echo "registry=$registry" >> "$GITHUB_OUTPUT" + + - name: Login Docker to resolved ECR registry + shell: bash + env: + ECR_REGISTRY: ${{ steps.ecr-registry.outputs.registry }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + set -euo pipefail + if ! command -v aws >/dev/null 2>&1; then + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y curl unzip + curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip + unzip -q /tmp/awscliv2.zip -d /tmp + /tmp/aws/install + fi + aws ecr get-login-password --region "$AWS_REGION" | + docker login --username AWS --password-stdin "$ECR_REGISTRY" + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -758,13 +922,13 @@ jobs: sentry_org=${{ secrets.SENTRY_ORG }} sentry_project=${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.SENTRY_PROJECT_PROD || secrets.SENTRY_PROJECT_STAG }} tags: | - ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.worker_repo }}:${{ needs.prepare-release.outputs.git_hash }} - ${{ steps.login-ecr.outputs.registry }}/${{ needs.prepare-release.outputs.worker_repo }}:latest + ${{ steps.ecr-registry.outputs.registry }}/${{ needs.prepare-release.outputs.worker_repo }}:${{ needs.prepare-release.outputs.git_hash }} + ${{ steps.ecr-registry.outputs.registry }}/${{ needs.prepare-release.outputs.worker_repo }}:latest - name: Set worker image output id: meta run: | - registry="${{ steps.login-ecr.outputs.registry }}" + registry="${{ steps.ecr-registry.outputs.registry }}" echo "worker_image=${registry}/${{ needs.prepare-release.outputs.worker_repo }}:${{ needs.prepare-release.outputs.git_hash }}" >> "$GITHUB_OUTPUT" # ---------- State Recording ---------- @@ -1181,8 +1345,8 @@ jobs: chmod +x infra/scripts/promote-all.sh infra/scripts/revert-nginx.sh DOMAIN_NAME="$DOMAIN_NAME" HOMEPAGE_URL="$HOMEPAGE_URL" infra/scripts/promote-all.sh - - 
name: Revert NGINX on promotion failure - if: failure() + - name: Revert NGINX on promotion failure + if: ${{ failure() }} uses: appleboy/ssh-action@v0.1.10 env: BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} @@ -1213,6 +1377,7 @@ jobs: fi chmod +x infra/scripts/revert-nginx.sh infra/scripts/revert-nginx.sh + script_stop: false # ---------- Record promoted state ---------- # Only record "promoted" state after Web/API are confirmed healthy via NGINX. @@ -1266,7 +1431,7 @@ jobs: WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} API_IMAGE: ${{ needs.build-api.outputs.api_image }} WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image }} - MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied || '[]' }} + MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied }} GITHUB_RUN_ID: ${{ github.run_id }} GITHUB_ACTOR: ${{ github.actor }} run: bash infra/scripts/record-state.sh @@ -1328,7 +1493,7 @@ jobs: BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} REPO_URL: ${{ secrets.REPO_URL }} - WORKER_REPLICAS: ${{ github.event.inputs.worker_replicas || (needs.prepare-release.outputs.deploy_env == 'production' && secrets.WORKER_REPLICAS_PROD || secrets.WORKER_REPLICAS_STAG) || '1' }} + WORKER_REPLICAS: ${{ github.event.inputs.worker_replicas != '' && github.event.inputs.worker_replicas || (needs.prepare-release.outputs.deploy_env == 'production' && secrets.WORKER_REPLICAS_PROD || secrets.WORKER_REPLICAS_STAG) }} AWS_REGION: ${{ secrets.AWS_REGION }} with: host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} @@ -1475,6 +1640,40 @@ jobs: exit 1 fi + - name: Revert NGINX (always when promotion succeeded but worker failed) + uses:
appleboy/ssh-action@v0.1.10 + env: + BRANCH_NAME: ${{ needs.prepare-release.outputs.branch }} + GIT_HASH: ${{ needs.prepare-release.outputs.git_hash }} + REPO_URL: ${{ secrets.REPO_URL }} + AWS_REGION: ${{ secrets.AWS_REGION }} + AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} + PASSIVE_ROLLBACK: "true" + with: + host: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_HOST_PROD || secrets.EC2_HOST_STAG }} + username: onehash + key: ${{ needs.prepare-release.outputs.deploy_env == 'production' && secrets.EC2_SSH_KEY_PROD || secrets.EC2_SSH_KEY_STAG }} + command_timeout: 600s + envs: BRANCH_NAME,GIT_HASH,REPO_URL,AWS_REGION,AWS_ACCOUNT_ID,PASSIVE_ROLLBACK + script: | + set -euo pipefail + REPO_ROOT="/home/onehash/onehash-cal" + if [ ! -d "$REPO_ROOT/.git" ]; then + git clone --depth 1 --branch "$BRANCH_NAME" "$REPO_URL" "$REPO_ROOT" + fi + cd "$REPO_ROOT" + git fetch origin "$BRANCH_NAME" --depth 1 || true + git fetch origin "$GIT_HASH" --depth 1 || true + git checkout --detach "$GIT_HASH" + checked_out_sha="$(git rev-parse HEAD)" + if [ "$checked_out_sha" != "$GIT_HASH" ]; then + echo "Host checkout SHA ${checked_out_sha} does not match release SHA ${GIT_HASH}" >&2 + exit 1 + fi + chmod +x infra/scripts/revert-nginx.sh + infra/scripts/revert-nginx.sh + script_stop: false + - name: Run automatic rollback on target host id: rollback if: ${{ steps.current.outcome == 'success' && steps.rollback-images.outcome == 'success' }} @@ -1621,7 +1820,7 @@ jobs: WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} API_IMAGE: ${{ needs.build-api.outputs.api_image }} WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image }} - MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied || '[]' }} + MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied }} GITHUB_RUN_ID: ${{ github.run_id }} GITHUB_ACTOR: ${{ github.actor }} run: bash infra/scripts/record-state.sh @@ -1720,7 +1919,6 @@ jobs: - deploy-worker - 
rollback-after-promotion - verify - - cleanup-vpc-workdir # - release-lock if: always() steps: @@ -1874,6 +2072,7 @@ jobs: ROLLBACK_STATUS_RAW: ${{ needs.rollback-after-promotion.outputs.rollback_status || '' }} ROLLBACK_TARGET_SHA: ${{ needs.rollback-after-promotion.outputs.rollback_target_sha || '' }} VERIFY_RESULT: ${{ needs.verify.result }} + CLEANUP_VPC_WORKDIR_RESULT: ${{ needs.cleanup-vpc-workdir.result || 'skipped' }} # RELEASE_LOCK_RESULT: ${{ needs.release-lock.result }} NGINX_ACTIVE_UPSTREAMS: ${{ steps.nginx.outputs.nginx_active_upstreams }} LOG_SNIPPET: ${{ steps.host-logs.outputs.log_snippet }} @@ -1966,6 +2165,7 @@ jobs: echo "| Deploy Workers | ${DEPLOY_WORKER_RESULT} |" echo "| Automatic Rollback | ${rollback_status} |" echo "| Verify / Record Current | ${VERIFY_RESULT} |" + echo "| VPC Cleanup | ${CLEANUP_VPC_WORKDIR_RESULT:-skipped} |" # echo "| Release Lock | ${RELEASE_LOCK_RESULT} |" echo "" echo "## Rollback" @@ -2018,6 +2218,7 @@ jobs: WEB_STATUS: ${{ needs.deploy-web.result }} API_STATUS: ${{ needs.deploy-api.result }} WORKER_STATUS: ${{ needs.deploy-worker.result }} + CLEANUP_STATUS: ${{ needs.cleanup-vpc-workdir.result || 'skipped' }} ROLLBACK_STATUS: ${{ steps.report.outputs.rollback_status }} ROLLBACK_TARGET_SHA: ${{ needs.rollback-after-promotion.outputs.rollback_target_sha || 'N/A' }} STATE_FILE_PATH: ${{ steps.report.outputs.state_file_path }} diff --git a/.github/workflows/worker-deploy.yml b/.github/workflows/worker-deploy.yml index 4297a57c684396..7b277443f8978d 100644 --- a/.github/workflows/worker-deploy.yml +++ b/.github/workflows/worker-deploy.yml @@ -568,9 +568,36 @@ jobs: log_message "INFO" "Running worker containers:" docker ps --filter "name=worker-" --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" | tee -a "$LOG_FILE" - # Cleanup old images (72h filter avoids removing the image we just deployed) + # Protect all worker images (active old workers and the newly deployed image) + # before pruning, so they survive the 
cleanup and can be restored after reboot. + log_message "INFO" "Tagging worker images with :protected before prune" + while IFS= read -r container; do + [ -n "$container" ] || continue + img="$(docker inspect -f '{{.Config.Image}}' "$container" 2>/dev/null || true)" + [ -n "$img" ] || continue + if ! docker image inspect "${img}:protected" >/dev/null 2>&1; then + docker tag "$img" "${img}:protected" 2>/dev/null || true + log_message "INFO" "Tagged protected: ${img}:protected" + fi + done < <(docker ps --filter "name=worker-" --format '{{.Names}}' 2>/dev/null) + # Also protect the newly deployed image (it may not be running yet if draining is in progress) + if [ -n "$WORKER_IMAGE" ] && ! docker image inspect "${WORKER_IMAGE}:protected" >/dev/null 2>&1; then + docker tag "$WORKER_IMAGE" "${WORKER_IMAGE}:protected" 2>/dev/null || true + log_message "INFO" "Tagged newly deployed image protected: ${WORKER_IMAGE}:protected" + fi + + # Cleanup old images (168h / 7-day filter to reduce chance of pruning the current image) + # --filter "label!=protected" excludes all images tagged :protected log_message "INFO" "Pruning old Docker images..." 
- docker image prune -af --filter "until=72h" 2>&1 | tee -a "$LOG_FILE" || true + docker image prune -af --filter "until=168h" --filter "label!=protected" 2>&1 | tee -a "$LOG_FILE" || true + + # Remove :protected tags after prune to prevent accumulation + log_message "INFO" "Removing protected tags from images" + docker images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null \ + | grep ':protected$' \ + | while IFS= read -r tagged; do + docker rmi "$tagged" 2>/dev/null || true + done if [ "$WORKERS_FAILED" -gt 0 ]; then log_message "WARN" "⚠️ Deployment completed with $WORKERS_FAILED failed worker(s)" diff --git a/docs/deployment_fix.md b/docs/deployment_fix.md new file mode 100644 index 00000000000000..f25d430e6d3dc7 --- /dev/null +++ b/docs/deployment_fix.md @@ -0,0 +1,1158 @@ +# Deployment Pipeline — Ordered Fix Implementation Prompts + +Generated from production-readiness review of `.github/workflows/deploy-all.yml`. + +--- + +## Phase 1: Workflow Blockers + +Critical issues that break deployment or make later work unsafe to attempt. + +--- + +### Issue 1: `WORKER_REPLICAS` Expression Syntax is Invalid + +**Priority:** Critical + +**Why this must be fixed now:** + +The `WORKER_REPLICAS` output expression on line 1427 contains a bare `==` comparison inside a `||` chain without proper grouping. GitHub Actions expressions do not support this syntax — the comparison result (a boolean) would be coerced to a string in an unpredictable way, potentially producing an empty string, the literal word `false`, or throwing a workflow parse error. This means worker deployments could spin up 0 workers, the wrong number of workers, or fail silently. Since `deploy-worker` runs after `promote-all` (where NGINX has already switched to new containers), a worker count of 0 would leave the system with no background job processing and no automated recovery path. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. 
+ +Objective: +Fix the malformed WORKER_REPLICAS expression in the deploy-worker job of deploy-all.yml. +The current expression at line 1427: + WORKER_REPLICAS: ${{ github.event.inputs.worker_replicas || (needs.prepare-release.outputs.deploy_env == 'production' && secrets.WORKER_REPLICAS_PROD || secrets.WORKER_REPLICAS_STAG) || '1' }} +is invalid because the `==` comparison inside the `||` chain produces undefined behavior. +Replace with a syntactically valid expression that: + - Returns the workflow_dispatch worker_replicas input if provided and non-empty + - Falls back to WORKER_REPLICAS_PROD secret when deploy_env is production + - Falls back to WORKER_REPLICAS_STAG secret when deploy_env is staging + - Defaults to "1" if all above are empty + +Files to inspect: + - .github/workflows/deploy-all.yml (line ~1427, deploy-worker job envs block) + - .github/workflows/worker-deploy.yml (for reference implementation) + - infra/scripts/start-workers.sh (for how WORKER_REPLICAS is consumed) + +Requirements: + - The corrected expression must use explicit != '' checks instead of truthy coercion + - The deployment workflow must not pass empty or falsy values to the SSH script + - The deploy-worker job must exit with a clear error if WORKER_REPLICAS is not a positive integer + +Validation requirements: + * Test workflow_dispatch with worker_replicas="3" — verify 3 workers started + * Test workflow_dispatch with worker_replicas="" (empty) — verify fallback to secret works + * Test when neither secret is set — verify fallback to "1" + * Verify the expression resolves correctly when run on main branch (production) vs develop (staging) + * Verify that the SSH script receives a valid numeric value (not empty, not "false") + +Constraints: + - Preserve the existing fallback logic (dispatch input > prod secret > stag secret > default 1) + - Do not change the job dependency graph or needs clause + - Do not change the WORKER_REPLICAS variable name +``` + +--- + +### Issue 2: No NGINX 
Revert When Rollback is Skipped (Partial Promotion Without Recovery) + +**Priority:** Critical + +**Why this must be fixed now:** + +When `promote-all` succeeds (NGINX switches to new web/API containers) but `deploy-worker` fails, the `rollback-after-promotion` job runs. The `revert-nginx.sh` step (lines 1280-1311) has `if: failure()`, which only triggers when `promote-all` itself fails. Since `promote-all` succeeded, `revert-nginx.sh` does NOT run. If rollback is then skipped (no previous `current.json` or missing images), the system is left in a state where NGINX points to new web/API containers that were deployed as part of the new release, but no workers are running. This is a silent partial promotion failure with no automated recovery. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Ensure that whenever the deployment reaches the promote-all stage (NGINX has switched to new containers), +there is always an automated recovery path, even when: + a) deploy-worker fails, AND + b) rollback-after-promotion cannot proceed (no previous current.json, or missing images) + +Files to inspect: + - .github/workflows/deploy-all.yml (rollback-after-promotion job, lines ~1453-1468; revert-nginx step, lines ~1280-1311) + - .github/workflows/deploy-all.yml (deploy-worker job needs clause, lines ~1386-1392) + - infra/scripts/rollback.sh (lines 265-270, rollback_failure trap) + - infra/scripts/revert-nginx.sh (entire file) + +Requirements: + - The revert-nginx.sh step must fire whenever promotion succeeded but worker deployment failed, + REGARDLESS of whether a rollback target exists + - This covers the case: promote-all=success, deploy-worker=failure, current.json missing + - The revert-nginx.sh call must NOT depend on the rollback script succeeding + - Add explicit env var PASSIVE_ROLLBACK=true to distinguish from active rollback + - preserve revert-nginx.sh's existing behavior (it already handles "no 
backup found" gracefully) + - Ensure the revert-nginx.sh step runs BEFORE the rollback-after-promotion rollback step, + so NGINX is reverted even if the rollback script times out + +Implementation approach: + Option A: Add a dedicated "always revert nginx" step with condition: + if: needs.promote-all.result == 'success' && (needs.deploy-worker.result != 'success' || needs.record-promoted-state.result != 'success') + This step runs revert-nginx.sh regardless of whether rollback will proceed. + + Option B: In the revert-nginx.sh step (lines 1280-1311), change the condition from + if: failure() to: + if: always() && (needs.promote-all.result == 'success' && (needs.deploy-worker.result != 'success' || needs.record-promoted-state.result != 'success')) + + Both approaches achieve the same outcome. Recommend Option B as it consolidates + the revert logic into a single step. + +Validation requirements: + * Simulate: promote-all succeeded, deploy-worker failed, no current.json exists + → Verify NGINX reverts to previous config + * Simulate: promote-all succeeded, deploy-worker failed, rollback images missing + → Verify NGINX reverts even though rollback script skips + * Simulate: promote-all succeeded, deploy-worker succeeded, rollback-after-promotion skipped + → Verify revert step does NOT run (no spurious reverts) + * Verify that revert-nginx.sh does not fail when no backup exists (it should handle this gracefully) + * Verify cleanup-vpc-workdir still runs after revert completes +``` + +--- + +### Issue 3: First-Deployment Rollback Gap — No `current.json` on Initial Deploy + +**Priority:** Critical + +**Why this must be fixed now:** + +On the first-ever deployment to an environment (production or staging), `current.json` does not exist in S3. If this first deployment reaches `promote-all` (NGINX switched) but then `deploy-worker` fails, the `rollback-after-promotion` job will fail at `steps.current` because it cannot download a non-existent `current.json`. 
The result is `rollback_status = "skipped_no_previous_release"`, and no NGINX revert occurs. NGINX stays pointing to the new containers, which may have no workers. This is a complete outage with no automated recovery on first deployment. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Handle the scenario where rollback-after-promotion is triggered but no previous +current.json exists (first deployment to an environment). + +Files to inspect: + - .github/workflows/deploy-all.yml (rollback-after-promotion job, steps.current block, lines ~1502-1541) + - infra/scripts/deploy-state-paths.sh + - infra/scripts/rollback.sh (lines 82-149, _record_rollback_state function) + +Requirements: + - When current.json does not exist and the deployment was a first deploy (no prior state), + the workflow must NOT silently skip rollback — it must at minimum revert NGINX to + a known-safe state (the backup created by promote-all.sh) + - The "Resolve previous current release" step currently has continue-on-error: true + and exits with a non-success outcome. This causes rollback to skip. 
+ Change the behavior so that when current.json doesn't exist: + - If a NGINX backup exists in /tmp/cal-id-nginx-previous, revert NGINX via revert-nginx.sh + - Set rollback_status = "skipped_no_previous_release_but_nginx_reverted" + - In deployment-report, surface this as a partial failure with clear messaging + - If both current.json AND nginx backup are missing, set rollback_status to + "skipped_no_recovery_path" and exit with failure — this requires manual intervention + - Update the deployment-report job to recognize the new "nginx_reverted" status and + send an appropriate notification (partial failure, not full failure) + +Implementation approach: + In steps.current, after detecting current.json doesn't exist: + - Check if NGINX_BACKUP_DIR (/tmp/cal-id-nginx-previous) contains a valid backup + - If yes: revert via revert-nginx.sh, then set rollback_status appropriately + - If no: set rollback_status = "skipped_no_recovery_path", exit 1 + + In deployment-report: + - Handle new rollback_status values with appropriate messaging + - "skipped_no_previous_release_but_nginx_reverted" → notify "partial rollback: NGINX restored, manual worker check required" + - "skipped_no_recovery_path" → notify "CRITICAL: no recovery path available, manual intervention required" + +Validation requirements: + * Simulate first deployment (no current.json in S3), promote-all succeeds, deploy-worker fails + → Verify NGINX reverts, verify notification mentions partial failure + * Simulate first deployment with no NGINX backup either (edge case: backup failed) + → Verify workflow fails with clear "manual intervention required" message + * Simulate subsequent deployment (current.json exists), deploy-worker fails + → Verify full rollback works as before (no regression) +``` + +--- + +## Phase 2: Deployment Correctness + +Issues that affect the correctness of deployments and rollback behavior. 
+ +--- + +### Issue 4: `record-promoted-state` Uses `|| '[]'` Masking Actual Migration State + +**Priority:** High + +**Why this must be fixed now:** + +The `record-promoted-state` job and the `verify` job both use `needs.migrate-db.outputs.migrations_applied || '[]'`. When `migrate-db` is skipped (all images already exist, migrations already applied), the output variable is never set, and the expression falls back to `[]`. This means the S3 state record says "no migrations applied" even when the database has been migrated to the current state. On subsequent rollback, the state record will be incorrect about the migration history. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Fix the migrations_applied output propagation to avoid masking the actual migration state. + +Files to inspect: + - .github/workflows/deploy-all.yml (record-promoted-state job, lines ~1356-1368) + - .github/workflows/deploy-all.yml (verify job, lines ~1711-1723) + - .github/workflows/deploy-all.yml (migrate-db job outputs, lines ~928-931) + - infra/scripts/record-state.sh + +Requirements: + - record-promoted-state must NOT use `|| '[]'` fallback — if migrate-db produced no output, + the job should fail explicitly rather than silently recording an empty array + - The verify job may use the fallback since it's a read-only summary step + - The `migrate-db` job always runs (it depends on build jobs which always run), but + when image_existed=true for all three services, migrate-db outputs are set from the + script output, which would correctly contain the empty array [] (no new migrations). + The issue is when the variable is truly unset (not even empty string) vs when + it contains the literal string "[]" + - Verify that migrate-db script always outputs MIGRATIONS_APPLIED_JSON= even on + "no new migrations" (it should output []). Check migrate.sh lines ~375-378. + +Implementation approach: + 1. 
In migrate-db job, ensure the script always outputs MIGRATIONS_APPLIED_JSON= + even when there are zero new migrations (it already does — outputs "[]") + 2. In record-promoted-state, change from: + MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied || '[]' }} + to: + MIGRATIONS_APPLIED_JSON: ${{ needs.migrate-db.outputs.migrations_applied }} + This will fail if migrate-db didn't run or didn't set the output. + 3. In verify job, keep the `|| '[]'` fallback for safety since it's a final summary. + +Validation requirements: + * Deploy where all images already exist (migrate-db runs but no new migrations) + → Verify migrations_applied is recorded as [] (empty array, not missing) + * Deploy where new migrations are applied + → Verify migrations_applied contains the actual migration names + * Verify that if migrate-db were somehow completely skipped (shouldn't happen), + record-promoted-state fails with a clear error rather than silently proceeding +``` + +--- + +### Issue 5: `deployment-report` Blocked by `cleanup-vpc-workdir` Failure + +**Priority:** High + +**Why this must be fixed now:** + +The `deployment-report` job depends on `cleanup-vpc-workdir`. If the SSH command to clean up the VPC work directory times out or fails (e.g., due to network issues to EC2), the entire `deployment-report` job is skipped. Since `deployment-report` is the only job that sends the final email notification and writes the GitHub Actions summary, a failure here means the team gets no notification and the deployment outcome is not recorded in the summary. This makes post-mortem analysis impossible and could leave the team unaware that a deployment failed. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Decouple the final deployment notification from the VPC cleanup step so that +notification always runs regardless of cleanup success or failure. 
+ +Files to inspect: + - .github/workflows/deploy-all.yml (deployment-report job, lines ~1800-2128) + - .github/workflows/deploy-all.yml (cleanup-vpc-workdir job, lines ~1725-1766) + +Requirements: + - deployment-report must run even if cleanup-vpc-workdir fails + - The notification email and GitHub Actions summary must be sent regardless of cleanup outcome + - The deployment-report should still include cleanup status information (did cleanup succeed or fail) + - Remove cleanup-vpc-workdir from deployment-report.needs + - The cleanup-vpc-workdir job should have if: always() so it runs even if earlier jobs fail + - Add cleanup status to the deployment report output + +Implementation approach: + 1. Remove cleanup-vpc-workdir from deployment-report.needs: + needs: + - preflight + - prepare-release + - build-web + - build-api + - build-worker + - record-build-state + - migrate-db + - deploy-api + - deploy-web + - promote-all + - record-promoted-state + - deploy-worker + - rollback-after-promotion + - verify + (remove cleanup-vpc-workdir from this list) + + 2. Verify cleanup-vpc-workdir has if: ${{ always() && needs.prepare-release.result == 'success' }} + (it already has this — no change needed) + + 3. In deployment-report, add cleanup_status as a report field: + - Run a brief SSH check to confirm workdir is cleaned (or was already clean) + - Include cleanup_status in the email notification body + - Include cleanup_status in the GitHub step summary + + 4. Do not rely on implicit job ordering: without a needs entry, deployment-report + neither waits for cleanup-vpc-workdir nor can read needs.cleanup-vpc-workdir.result, + so derive cleanup_status from the SSH check in step 3, not from the needs context.
+ +Validation requirements: + * Simulate cleanup-vpc-workdir SSH failure (e.g., host unreachable) + → Verify deployment-report still runs and sends notification + * Verify notification includes "VPC cleanup: failed" or "VPC cleanup: succeeded" + * Verify GitHub Actions summary shows cleanup status + * Verify cleanup still runs even when earlier jobs fail (if: always()) + * Verify notification is sent on successful deployment, failed deployment, and partial rollback +``` + +--- + +### Issue 6: Pre-flight Dry-Run Produces False Positives (Missing File Validation) + +**Priority:** High + +**Why this must be fixed now:** + +The preflight job's script validation (lines 92-102) runs `bash -n` (syntax check) and `WORKER_LIFECYCLE_DRY_RUN=true bash start-workers.sh` (which exits at line 83 before any actual validation). This means the preflight check passes even if critical scripts like `stage-web.sh`, `stage-api.sh`, `migrate.sh`, or `promote-all.sh` are missing from the repository. The actual deployment would then fail at the SSH step with a "file not found" error on the EC2 host. The preflight should catch this and fail before any build starts. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Add explicit file-existence checks in the preflight job for all scripts required +by downstream deployment steps. 
+ +Files to inspect: + - .github/workflows/deploy-all.yml (preflight job, lines ~92-102) + - .github/workflows/deploy-all.yml (deploy-api job sparse-checkout, lines ~1104-1109) + - .github/workflows/deploy-all.yml (deploy-web job sparse-checkout, lines ~1160-1165) + - .github/workflows/deploy-all.yml (promote-all job sparse-checkout, lines ~1220-1227) + - .github/workflows/deploy-all.yml (deploy-worker job sparse-checkout, lines ~1393-1402) + +Requirements: + - Add a dedicated step in preflight that checks all required scripts exist: + stage-api.sh, stage-web.sh, start-workers.sh, drain-workers.sh, migrate.sh, + promote-all.sh, revert-nginx.sh, record-state.sh, deploy-state-paths.sh, + ngx-utils.sh, preflight-space.sh, send-email.sh, rollback.sh + - Also check that required Docker/NGINX templates exist: + infra/docker/web/nginx.template.conf + infra/docker/connector/nginx.connector.template.conf + infra/docker/connector/nginx.mcp.template.conf + - Check that the Dockerfile files exist: + Dockerfile, apps/connector/Dockerfile, apps/worker/Dockerfile + - Do NOT run the scripts — only check file existence + - Remove the WORKER_LIFECYCLE_DRY_RUN validation (it provides no safety value) + - Keep the `bash -n` syntax checks since those are genuinely useful + +Implementation approach: + Add a new step after the script syntax validation: + + - name: Validate required deployment files exist + shell: bash + run: | + set -euo pipefail + failed=0 + + required_scripts=( + infra/scripts/stage-api.sh + infra/scripts/stage-web.sh + infra/scripts/start-workers.sh + infra/scripts/drain-workers.sh + infra/scripts/migrate.sh + infra/scripts/promote-all.sh + infra/scripts/revert-nginx.sh + infra/scripts/record-state.sh + infra/scripts/deploy-state-paths.sh + infra/scripts/ngx-utils.sh + infra/scripts/preflight-space.sh + infra/scripts/send-email.sh + infra/scripts/rollback.sh + infra/scripts/validate-rollback-schema.sh + ) + + required_templates=( + infra/docker/web/nginx.template.conf 
+ infra/docker/connector/nginx.connector.template.conf + infra/docker/connector/nginx.mcp.template.conf + ) + + required_dockers=( + Dockerfile + apps/connector/Dockerfile + apps/worker/Dockerfile + ) + + for f in "${required_scripts[@]}"; do + if [ ! -f "$f" ]; then + echo "PREFLIGHT: MISSING — script: $f" + failed=1 + fi + done + + for f in "${required_templates[@]}"; do + if [ ! -f "$f" ]; then + echo "PREFLIGHT: MISSING — template: $f" + failed=1 + fi + done + + for f in "${required_dockers[@]}"; do + if [ ! -f "$f" ]; then + echo "PREFLIGHT: MISSING — dockerfile: $f" + failed=1 + fi + done + + if [ "$failed" -ne 0 ]; then + echo "PREFLIGHT: One or more required files are missing" + exit 1 + fi + + echo "PREFLIGHT: All required files present" + + - Remove only the `WORKER_LIFECYCLE_DRY_RUN=true bash infra/scripts/start-workers.sh` + dry-run check, which provides no meaningful validation; keep every `bash -n` syntax + check (including infra/scripts/preflight-space.sh), as the Requirements above state + +Validation requirements: + * Verify preflight fails when one of the required scripts is deleted + * Verify preflight passes when all files are present + * Verify the Dockerfile checks catch missing Dockerfile references + * Verify sparse-checkout in deploy-api and deploy-web would make these files + unavailable on the runner — but since the SSH script does its own git checkout + at the target SHA, this validation only checks the current HEAD, not what will + be on EC2. Consider whether this check should use the same SHA that will be + deployed (GIT_HASH from prepare-release). + * For the GIT_HASH-aware check: use the commit SHA from prepare-release to verify + the required files exist in the target deployment commit, not just HEAD. +``` + +--- + +### Issue 7: Worker Image Pruning May Delete Currently-Referenced Images + +**Priority:** High + +**Why this must be fixed now:** + +The `docker image prune -af --filter "until=72h"` command in `worker-deploy.yml` removes images older than 72 hours.
Since worker containers use `--restart unless-stopped`, a host reboot would trigger Docker to restart the container, which requires pulling the image from ECR. If the image was pruned (older than 72h) and the ECR credentials have expired or are unavailable, the worker won't restart. This creates a silent failure mode where workers don't come back after a host reboot. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Fix Docker image pruning to never delete images that are referenced by running +or restart-policy-protected containers. + +Files to inspect: + - .github/workflows/worker-deploy.yml (lines ~571-574) + - infra/scripts/preflight-space.sh (docker image prune, lines ~254-259) + - infra/scripts/start-workers.sh (docker run for workers) + +Requirements: + - Before pruning, identify all images referenced by: + a) Running containers (docker ps --format '{{.Image}}') + b) Containers with --restart unless-stopped that are not currently running + c) Images tagged with the current WORKER_IMAGE (from deployment state) + - Exclude these images from the prune operation + - Use a 7-day (168h) filter instead of 72h to reduce the chance of pruning + recently deployed images + - Tag the currently-deployed worker image with a "protected" label before pruning: + docker tag "$WORKER_IMAGE" "${WORKER_IMAGE}:protected" + docker image prune -af --filter "until=168h" --filter "label!=protected" + Then untag after prune: + docker rmi "${WORKER_IMAGE}:protected" + +Implementation approach: + + 1. In worker-deploy.yml SSH script, before the prune command: + - Determine the current worker image from the active worker containers + - Tag it with :protected + - Run prune with --filter "label!=protected" + - Remove the :protected tag after prune + + 2. 
In preflight-space.sh, apply the same pattern to protect: + - All images from current.json (web, api, worker) + - Images from running containers + - Images from containers with --restart unless-stopped + + 3. Change the until filter from 72h to 168h (7 days) in both locations. + + Important: The :protected tagging approach is safer than excluding by image ID + because it survives across prune invocations (the tag is checked at prune time). + +Validation requirements: + * Verify running worker container's image is NOT pruned + * Verify image from current.json (even if old) is NOT pruned + * Verify images not referenced by any container ARE pruned + * Verify after host reboot, worker can restart from the protected image + * Verify 7-day vs 72h pruning produces different results (images 72h-168h old are now retained) + * Verify idempotency: running prune twice in succession doesn't cause issues +``` + +--- + +## Phase 3: Reliability Improvements + +Issues that improve retry behavior, race condition handling, and partial failure recovery. + +--- + +### Issue 8: Migration Runs Serially After Builds (Performance Optimization) + +**Priority:** Medium + +**Why this must be fixed now:** + +`migrate-db` depends on `record-build-state`, which itself depends on all three build jobs. This means migrations cannot start until all builds are complete. For a typical deployment where builds take 10 minutes and migration takes 3 minutes, the current dependency chain adds 3 minutes of unnecessary wait time. Migrations are independent of container images — they only need the database connection and the repo at the target SHA. Running them in parallel with builds saves time without increasing risk. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Remove the record-build-state dependency from migrate-db so migrations can +run in parallel with builds.
+ +Files to inspect: + - .github/workflows/deploy-all.yml (migrate-db job needs clause, lines ~921-927) + - .github/workflows/deploy-all.yml (record-build-state job, lines ~868-914) + - .github/workflows/deploy-all.yml (prepare-release outputs, lines ~310-322) + - infra/scripts/migrate.sh (what it actually needs) + +Requirements: + - migrate-db only needs: + a) The git SHA (from prepare-release.git_hash) — to checkout the correct commit + b) The DATABASE_URL (from secrets) — to connect to the database + c) The REPO_URL (from secrets) — for fallback clone + d) The BRANCH_NAME (from env.BRANCH) — for fallback clone + - migrate-db does NOT need record-build-state outputs or build job outputs + - Change migrate-db.needs from: + needs: + - prepare-release + - build-web + - build-api + - build-worker + - record-build-state + to: + needs: + - prepare-release + + - The git_hash from prepare-release is sufficient for the migration to proceed + +Implementation approach: + 1. Update the migrate-db job needs clause to only depend on prepare-release + 2. The SSH script in migrate-db already does its own checkout using GIT_HASH + from the environment (lines 989, 1054-1058) — no change needed there + 3. Verify that the migrate-db job still receives the correct GIT_HASH via + the envs block and the SSH script's own checkout logic + 4. 
The DATABASE_URL is passed via envs, so that continues to work + +Validation requirements: + * Verify migration runs concurrently with builds (not after all builds complete) + * Verify migration still uses the correct GIT_HASH (from prepare-release, not workflow default) + * Verify that if prepare-release fails, migrate-db is skipped (correct dependency) + * Verify that if builds fail, migrate-db is still triggered (since it only needs prepare-release) + * Verify that if migration is slow, it doesn't block the build jobs (parallel execution) + * Verify rollback behavior: if migrate-db fails while builds are still running, + the overall workflow fails at migrate-db, and builds are cancelled by concurrency group +``` + +--- + +### Issue 9: Missing Post-Promotion Live Health Check + +**Priority:** Medium + +**Why this must be fixed now:** + +After `promote-all.sh` switches NGINX config and reloads, the state is immediately recorded as "promoted" in S3. However, if the new container is slow to respond (cold start, connection pool initialization), users could get 502 errors for a brief window. Recording "promoted" state before confirming the new endpoints are reachable means the state record is optimistic, not confirmed. Adding a live health check after promotion ensures the state reflects reality. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Add a post-promotion live health check in promote-all.sh that verifies the newly +promoted endpoints are actually reachable before returning success. 
+ +Files to inspect: + - infra/scripts/promote-all.sh (entire file, focus on post-reload section) + - infra/scripts/ngx-utils.sh (ngx_switch_config_and_static function) + +Requirements: + - After nginx -s reload, wait 5 seconds for connection pooling to stabilize + - Perform HTTP health checks on: + a) https://${DOMAIN_NAME}/api/health (web) + b) https://api.${DOMAIN_NAME}/health (API) + c) https://mcp.${DOMAIN_NAME}/health (MCP) + - Each health check has a 10-second timeout and allows up to 3 retries + - If any health check fails after 3 retries, the promotion is considered failed + - On promotion failure: + a) Call ngx_restore_config() to revert NGINX + b) Call ngx_restore_static() to revert static symlink + c) Call ngx_stop_candidates() to clean up + d) Exit with error + - On success, proceed with ngx_cleanup_old_builds() and ngx_stop_candidates() + - Log each health check attempt with the HTTP status code + +Implementation approach: + Add the following after `sudo nginx -s reload` in ngx_switch_config_and_static + (or at the end of promote-all.sh after ngx_switch_config_and_static): + + log INFO "Performing post-promotion health checks" + sleep 5 # Allow connection pooling to stabilize + + for url in \ + "https://${DOMAIN_NAME}${WEB_HEALTH_PATH}" \ + "https://api.${DOMAIN_NAME}${API_HEALTH_PATH}" \ + "https://mcp.${DOMAIN_NAME}/health"; do + + for attempt in 1 2 3; do + status_code="$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$url" 2>/dev/null || printf '000')" + log INFO "Post-promotion check $attempt for ${url}: HTTP $status_code" + if [ "$status_code" = "200" ]; then + log INFO "Post-promotion check passed for ${url}" + break + fi + if [ "$attempt" -lt 3 ]; then + sleep 3 + continue + fi + log ERROR "Post-promotion health check failed for ${url} after 3 attempts" + ngx_restore_config + ngx_restore_static + ngx_stop_candidates + fail "Post-promotion health check failed — NGINX reverted to previous state" + done + done + + log INFO "All post-promotion health checks passed" + +Validation
requirements: + * Verify health check catches a slow-starting container (test with a deliberate delay) + * Verify health check passes for healthy endpoints + * Verify NGINX is reverted when health check fails (check backup config is restored) + * Verify static symlink is reverted when health check fails + * Verify candidate containers are stopped when health check fails + * Verify the final state after failed health check matches a clean pre-promotion state + * Verify promote-all.sh exit code is non-zero when health check fails +``` + +--- + +### Issue 10: No S3 Bucket Accessibility Check in Preflight + +**Priority:** Medium + +**Why this must be fixed now:** + +`DEPLOY_STATE_BUCKET` (cal-id) is hardcoded and used for all state recording and retrieval. If the AWS credentials lack S3 permissions for this bucket, every state operation silently fails or produces confusing errors. There's no preflight check to validate that the bucket is reachable and writable before the deployment starts. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Add an S3 connectivity and permissions check in the preflight job to validate +that the deploy-state bucket is accessible before any state recording occurs. 
+ +Files to inspect: + - .github/workflows/deploy-all.yml (preflight job, lines ~275-277) + - infra/scripts/deploy-state-paths.sh (DEPLOY_STATE_BUCKET and PREFIX resolution) + +Requirements: + - Add a step in preflight that: + a) Lists the S3 bucket to verify access (aws s3 ls s3://cal-id/) + b) Attempts a test write to the bucket (aws s3 cp /dev/null s3://cal-id/.deploy-check) + c) Cleans up the test file + - This validates both read and write permissions + - If the bucket is unreachable or credentials are insufficient, fail preflight + with a clear error message + - The check should use the same AWS credentials configured earlier in preflight + +Implementation approach: + Add after the existing preflight checks: + + - name: Validate S3 deploy-state bucket accessibility + shell: bash + run: | + set -euo pipefail + DEPLOY_STATE_BUCKET="cal-id" + + echo "PREFLIGHT: Checking S3 bucket accessibility for ${DEPLOY_STATE_BUCKET}" + if ! aws s3 ls "s3://${DEPLOY_STATE_BUCKET}/" >/dev/null 2>&1; then + echo "PREFLIGHT: FAILED — S3 bucket ${DEPLOY_STATE_BUCKET} is not accessible" + echo "PREFLIGHT: Check AWS credentials and bucket existence" + exit 1 + fi + + # Test write permissions + check_file="$(mktemp)" + echo "deploy-check-$(date -u)" > "$check_file" + if ! 
aws s3 cp "$check_file" "s3://${DEPLOY_STATE_BUCKET}/.deploy-check" >/dev/null 2>&1; then + rm -f "$check_file" + echo "PREFLIGHT: FAILED — Cannot write to S3 bucket ${DEPLOY_STATE_BUCKET}" + exit 1 + fi + aws s3 rm "s3://${DEPLOY_STATE_BUCKET}/.deploy-check" >/dev/null 2>&1 || true + rm -f "$check_file" + + echo "PREFLIGHT: S3 bucket ${DEPLOY_STATE_BUCKET} is accessible with read/write permissions" + +Validation requirements: + * Verify preflight fails when bucket doesn't exist + * Verify preflight fails when credentials lack S3 permissions + * Verify preflight passes when bucket is accessible + * Verify idempotency: running the check twice doesn't leave artifacts in the bucket + * Verify cleanup of the test file succeeds even if the main deployment fails +``` + +--- + +### Issue 11: No Verification That All Three Images Share the Same Git SHA + +**Priority:** Medium + +**Why this must be fixed now:** + +Build-web, build-api, and build-worker run in parallel from the same `git_hash`. While all three use `ref: ${{ needs.prepare-release.outputs.git_hash }}`, the ECR image tags are set independently. If a user triggers a deployment with `rebuild=true` for only one service (via the legacy individual workflows), the cached images for other services could have different SHAs. The `record-build-state` step would record mismatched image SHAs in S3, and rollback might restore an inconsistent set of service versions (e.g., web at SHA-A, API at SHA-B, worker at SHA-A). + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Add a validation step after all builds complete to verify that the three +ECR images all share the same Git SHA tag. 
+ +Files to inspect: + - .github/workflows/deploy-all.yml (record-build-state job, lines ~868-914) + - .github/workflows/deploy-all.yml (prepare-release outputs, lines ~314) + - .github/workflows/deploy-all.yml (build-web outputs, lines ~439-441) + - .github/workflows/deploy-all.yml (build-api outputs, lines ~608-610) + - .github/workflows/deploy-all.yml (build-worker outputs, lines ~740-742) + +Requirements: + - Add a step in record-build-state (after the existing record-state.sh call) + that extracts the Git SHA from each image tag and verifies they match + - Expected pattern in image tags: images are tagged as + <registry>/<repository>:<git_hash> where git_hash is a 40-char hex string + - Extract SHA using: echo "$WEB_IMAGE" | grep -oP '(?<=:)[0-9a-f]{40}$' + - If SHA mismatch is detected, fail record-build-state with a clear error + and prevent the deployment from proceeding + - The error message should show which images have mismatched SHAs + +Implementation approach: + Add in record-build-state job after the existing record-state.sh call: + + - name: Verify image SHA consistency + shell: bash + env: + WEB_IMAGE: ${{ needs.build-web.outputs.web_image }} + API_IMAGE: ${{ needs.build-api.outputs.api_image }} + WORKER_IMAGE: ${{ needs.build-worker.outputs.worker_image }} + run: | + set -euo pipefail + + web_sha="$(printf '%s' "$WEB_IMAGE" | grep -oP '(?<=:)[0-9a-f]{40}$' || true)" + api_sha="$(printf '%s' "$API_IMAGE" | grep -oP '(?<=:)[0-9a-f]{40}$' || true)" + worker_sha="$(printf '%s' "$WORKER_IMAGE" | grep -oP '(?<=:)[0-9a-f]{40}$' || true)" + + echo "Image SHAs: web=${web_sha}, api=${api_sha}, worker=${worker_sha}" + + if [ -z "$web_sha" ] || [ -z "$api_sha" ] || [ -z "$worker_sha" ]; then + echo "ERROR: Could not extract SHA from one or more image tags" + echo "WEB_IMAGE=$WEB_IMAGE" + echo "API_IMAGE=$API_IMAGE" + echo "WORKER_IMAGE=$WORKER_IMAGE" + exit 1 + fi + + if [ "$web_sha" != "$api_sha" ] || [ "$api_sha" != "$worker_sha" ]; then + echo "ERROR: Image SHAs do not match!"
+ echo " web_sha=$web_sha" + echo " api_sha=$api_sha" + echo " worker_sha=$worker_sha" + echo "All three images must be tagged with the same Git SHA for consistent rollback." + exit 1 + fi + + echo "All image SHAs verified consistent: ${web_sha}" + +Validation requirements: + * Verify detection of SHA mismatch (e.g., web at SHA-A, api at SHA-B) + * Verify passes when all three images share the same SHA + * Verify error message clearly identifies which images mismatch + * Verify the check doesn't interfere with the existing record-state.sh call + * Verify the check doesn't block the deployment when all SHAs match (no false positives) +``` + +--- + +## Phase 4: Hardening + +Issues that improve cleanup logic, observability, and safety margins. + +--- + +### Issue 12: Static Build Cleanup Gap on Pre-Promotion Failure + +**Priority:** Medium + +**Why this must be fixed now:** + +In `space_cleanup_old_builds()`, the function protects the active build (via `current` symlink) and the candidate build (via `candidate` symlink). On a pre-promotion failure, the `candidate` symlink still points to the old (previous) build directory. If cleanup runs on a subsequent deployment, it counts the builds: active (old), candidate (old, same as active), and new candidate (new). If there are more than 2 builds, the oldest is removed. The active build is protected, so it won't be removed. However, the logic is fragile — if `current` and `candidate` point to the same build directory (as happens in a failed deployment), the function might not protect it correctly depending on the basename comparison. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Fix the static build protection logic in preflight-space.sh to correctly +handle the case where current and candidate symlinks point to the same build. 
+ +Files to inspect: + - infra/scripts/preflight-space.sh (space_cleanup_old_builds function, lines ~211-246) + - infra/scripts/preflight-space.sh (space_protect_image_unique and space_add_unique functions) + - infra/scripts/ngx-utils.sh (ngx_backup_static, ngx_switch_static) + +Requirements: + - The cleanup must always protect: + a) The build directory currently referenced by STATIC_CURRENT_LINK + b) The build directory currently referenced by STATIC_CANDIDATE_LINK (may be same as a) + c) The build directory of any running web-candidate container + - When current and candidate point to the same build (pre-promotion failure scenario), + that build must be protected from deletion + - If the candidate symlink is broken or missing, it should not crash the cleanup + - Verify the ls -dt sort order doesn't cause issues when all builds have similar timestamps + +Implementation approach: + Review the space_cleanup_old_builds function and verify: + + 1. active_name and candidate_name are computed correctly even when both symlinks + point to the same directory + 2. The grep -Fxv exclusion correctly excludes both active_name and candidate_name + even when they are identical + 3. 
The tail -n +"$((keep_extra + 1))" calculation correctly identifies which builds to remove + + The existing logic appears mostly correct, but verify: + + keep_extra=$((keep - 1)) # For keep=2, keep_extra=1 + ls -dt build-* | grep -Fxv "$active_name" | grep -Fxv "$candidate_name" | tail -n +"$((keep_extra + 1))" + + With keep=2, keep_extra=1, tail starts at position 2: + - 1 build: nothing to remove + - 2 builds: nothing to remove (both protected) + - 3 builds: 1 removed (position 3 onwards) + - When active=build-A, candidate=build-A (same), grep -Fxv twice: no change + + Verify there are no edge cases where: + - candidate symlink doesn't exist (should skip candidate exclusion) + - candidate_name is empty (grep -Fxv "" would exclude nothing — fine) + - active_name is empty (grep -Fxv "" would exclude nothing — risky) + + Add defensive checks: + - If candidate symlink doesn't exist or is broken, log a warning and skip candidate exclusion + - If active_name is empty after readlink, fail the cleanup (don't remove unknown builds) + +Validation requirements: + * Verify cleanup on a host with 5 old builds + candidate symlink pointing to build-3 + current pointing to build-1 + → Only build-4 and build-5 are removed, build-1, build-2, build-3 are protected + * Verify cleanup when current and candidate both point to build-2 (failed deploy scenario) + → build-2 is protected even though it's both active and candidate + * Verify cleanup when candidate symlink is broken (points to deleted directory) + → Cleanup skips candidate exclusion and only protects current + * Verify cleanup when no builds exist (no-op, no error) + * Verify behavior when STATIC_KEEP_RELEASES is set to 1 (edge case) +``` + +--- + +### Issue 13: No Manual Rollback Trigger + +**Priority:** Medium + +**Why this must be fixed now:** + +The workflow has no mechanism for manually rolling back to a specific SHA after a successful deployment.
If a deployment succeeds but a subtle bug is discovered hours later, the only way to roll back is to manually edit `current.json` in S3 and trigger a rollback script. There's no "deploy specific SHA as rollback" workflow input. This creates operational risk during incident response. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Add a manual rollback trigger to the workflow_dispatch inputs so operators +can roll back to a specific SHA without modifying S3 state directly. + +Files to inspect: + - .github/workflows/deploy-all.yml (workflow_dispatch inputs, lines ~23-45) + - .github/workflows/deploy-all.yml (rollback-after-promotion job, lines ~1453-1468) + - infra/scripts/rollback.sh (entire file) + +Requirements: + - Add a `rollback_to_sha` input to workflow_dispatch: + rollback_to_sha: + description: "SHA to roll back to (leaves current deployment — for manual rollback)" + required: false + default: "" + - When `rollback_to_sha` is provided and non-empty, the workflow should: + a) Skip the build phase entirely (no docker builds) + b) Skip migration (database is already at the right state) + c) Use the rollback_to_sha to construct image URLs from ECR + d) Run the rollback script (stage-api, stage-web, promote-all, start-workers, drain-workers) + e) Record rollback state to S3 + - When `rollback_to_sha` is empty, the workflow behaves as normal + +Implementation approach: + + 1. Add the input in workflow_dispatch: + rollback_to_sha: + description: "SHA to roll back to (empty = normal deployment)" + required: false + default: "" + + 2. 
Add a condition at the job level: + - preflight: unchanged (run to validate secrets even for rollback) + - prepare-release: if rollback_to_sha is set, use it as git_hash instead of HEAD + - build-* jobs: if rollback_to_sha is set, SKIP these jobs entirely + - migrate-db: if rollback_to_sha is set, SKIP this job (DB already migrated) + - record-build-state: if rollback_to_sha is set, SKIP or modify to record rollback + - deploy-api, deploy-web: if rollback_to_sha is set, use rollback images directly + - promote-all: if rollback_to_sha is set, run as normal (this IS the rollback action) + - deploy-worker: if rollback_to_sha is set, run as normal + - verify: if rollback_to_sha is set, record as "rolled_back" not "current" + + 3. Use job conditions to skip builds: + build-web: + if: ${{ github.event.inputs.rollback_to_sha == '' }} + (similar for build-api, build-worker) + + 4. In prepare-release, if rollback_to_sha is set: + - Use rollback_to_sha as the git_hash + - Set deploy_env based on branch name (not based on rollback_to_sha) + - Record that this is a rollback deployment + + 5. Add rollback_after_promotion handling for the rollback scenario — the rollback + should NOT trigger rollback-after-promotion (it's already a rollback). 
+ +Validation requirements: + * Verify workflow_dispatch with rollback_to_sha set skips all build jobs + * Verify rollback_to_sha uses correct ECR images (reconstruct from SHA) + * Verify rollback proceeds through promote-all and worker deploy + * Verify rollback state is recorded in S3 with "rolled_back" status + * Verify rollback notification email is sent with appropriate messaging + * Verify normal deployment (rollback_to_sha empty) works unchanged + * Verify rollback_to_sha with an invalid/non-existent SHA produces a clear error +``` + +--- + +### Issue 14: DB Schema Compatibility Not Guaranteed on Rollback + +**Priority:** Medium + +**Why this must be fixed now:** + +`rollback.sh` calls `validate-rollback-schema.sh` which checks that the rollback target's Prisma schema is compatible with the current database state. However, the database has already been migrated forward by the failed deployment. Rolling back the app to an older SHA while the database is at a newer migration state could cause runtime errors if the old app code is incompatible with the new schema (e.g., references a column that was renamed or dropped in a later migration that ran as part of the failed deployment). + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Document the known limitation and implement a defensive rollback mechanism that +doesn't proceed if the rollback target's schema is incompatible with the current DB. 
+ +Files to inspect: + - infra/scripts/rollback.sh (validate-rollback-schema.sh call, line ~284) + - infra/scripts/validate-rollback-schema.sh (entire file) + - infra/scripts/migrate.sh (migration output parsing) + +Requirements: + - Understand what validate-rollback-schema.sh actually checks (read the file) + - If validate-rollback-schema.sh doesn't check DB schema compatibility with rollback target, + document this as a known limitation + - Implement a defensive check: before running rollback.sh, if the current DB has + migrations that didn't exist at the rollback target SHA, warn and require confirmation + - Use the migration output (MIGRATIONS_APPLIED_JSON) from the failed deployment's + S3 record to determine which migrations were applied + - If rollback target's schema.prisma references tables/columns that were modified + by the new migrations, the rollback should fail with a clear error message + explaining why it cannot proceed + - Document in the rollback script and in the deployment documentation that: + "Rollback to a SHA older than the most recent migration is not automatically safe. + The DB schema may have been modified in a way that the older app code cannot handle." + +Implementation approach: + + 1. Read validate-rollback-schema.sh to understand what it checks + 2. If it doesn't check DB schema compatibility with rollback target, enhance it to: + - Get the list of migrations applied at the rollback target SHA + - Get the list of migrations currently applied in the DB + - If the DB has migrations not present at the rollback target, fail with clear message + 3. In rollback.sh, ensure the schema validation is treated as fatal: + - If validate-rollback-schema.sh returns non-zero, rollback.sh must fail + - Do NOT proceed with rollback if schema compatibility check fails + 4. 
Add documentation comments to rollback.sh about the limitation + + Alternative (if validation is complex): document the limitation clearly and + add a CONFIRM_ROLLBACK_SCHEMA_INCOMPATIBLE=true env var that allows bypassing + the check with explicit operator consent. + +Validation requirements: + * Verify rollback fails when rollback target schema is incompatible with current DB + * Verify rollback fails with clear error message (not silent skip) + * Verify rollback succeeds when rollback target schema is compatible + * Verify documentation is added to rollback.sh explaining the limitation + * Verify rollback to a SHA that has no new migrations succeeds (no regression) +``` + +--- + +### Issue 15: `node_modules` Rebuilt on Every Migration Run + +**Priority:** Low + +**Why this must be fixed now:** + +The `migrate.sh` script runs `yarn install` every time because it can't determine whether the existing `node_modules` matches the current `yarn.lock`. For large monorepos, this adds 2-5 minutes to every deployment. While `migrate.sh` does skip install if `node_modules` exists (line 265), the condition is based solely on directory existence, not content validity. + +--- + +**Implementation Prompt:** + +``` +Analyze the existing implementation first before making changes. + +Objective: +Implement yarn.lock hash-based caching in migrate.sh so that node_modules +is only rebuilt when yarn.lock actually changed. 
+ +Files to inspect: + - infra/scripts/migrate.sh (dependency install section, lines ~262-278) + - infra/scripts/migrate.sh (REPO_ROOT resolution, lines ~160-193) + +Requirements: + - Compute a stable hash from yarn.lock content: sha256sum yarn.lock | cut -d' ' -f1 + - Use this hash to create a cache path: /var/cache/onehash/node_modules-<yarn_lock_hash> + - If the cached node_modules directory exists and is valid (not broken): + a) Link it to REPO_ROOT/node_modules + b) Skip yarn install + - If the cached node_modules doesn't exist: + a) Run yarn install + b) Copy or link the result to the cache directory + - Create the cache directory if it doesn't exist (sudo mkdir -p with appropriate permissions) + - Handle the case where cache directory is on a different filesystem + - This optimization should be transparent — yarn install should produce the same result + +Implementation approach: + Add after yarn.lock validation in migrate.sh (around line 230): + + CACHE_BASE="/var/cache/onehash" + YARN_LOCK_HASH="$(sha256sum "${REPO_ROOT}/yarn.lock" 2>/dev/null | cut -d' ' -f1 || true)" + CACHED_MODULES="${CACHE_BASE}/node_modules-${YARN_LOCK_HASH}" + + if [ -n "$YARN_LOCK_HASH" ] && [ -d "$CACHED_MODULES" ]; then + # Verify cache is valid (node_modules exists and has expected structure) + if [ -d "${CACHED_MODULES}/node_modules" ] || [ -f "${CACHED_MODULES}/node_modules/.bin/yarn" ]; then + log INFO "Using cached node_modules from ${CACHED_MODULES} (yarn.lock hash=${YARN_LOCK_HASH})" + mkdir -p "$(dirname "${REPO_ROOT}/node_modules")" + ln -sfn "$CACHED_MODULES" "${REPO_ROOT}/node_modules" + SKIP_YARN_INSTALL="true" + fi + fi + + Then in the install section: + if [ "$SKIP_YARN_INSTALL" = "true" ]; then + log INFO "node_modules from cache — skipping install" + else + # existing yarn install logic + # After successful install: + if [ -n "$YARN_LOCK_HASH" ]; then + sudo mkdir -p "${CACHE_BASE}" + sudo cp -a "${REPO_ROOT}/node_modules" "${CACHE_BASE}/node_modules-${YARN_LOCK_HASH}" + log INFO
"Cached node_modules at ${CACHE_BASE}/node_modules-${YARN_LOCK_HASH}" + fi + fi + + Handle cleanup of old cache entries (keep last 3): + ls -dt "${CACHE_BASE}"/node_modules-* 2>/dev/null | tail -n +4 | \ + while read old_cache; do + log INFO "Removing old node_modules cache: ${old_cache}" + sudo rm -rf "$old_cache" 2>/dev/null || true + done + +Validation requirements: + * Verify yarn install is skipped when node_modules cache exists and is valid + * Verify yarn install runs when node_modules cache doesn't exist + * Verify yarn install runs when yarn.lock changes (new cache entry created) + * Verify cache is shared across multiple deployment attempts (no duplicate installs) + * Verify cache cleanup removes old entries while keeping recent ones + * Verify the cache approach works when REPO_ROOT is on a different filesystem than cache + * Verify idempotency: running migrate.sh twice for the same SHA produces the same result +``` + +--- + +## Summary: Fix Execution Order + +| # | Issue | Priority | Phase | Dependencies | +|---|-------|----------|-------|-------------| +| 1 | WORKER_REPLICAS malformed expression | Critical | 1 | None — fix first | +| 2 | No NGINX revert when rollback skipped | Critical | 1 | None | +| 3 | First-deployment rollback gap | Critical | 1 | None | +| 4 | migrations_applied masking | High | 2 | After #1, #2, #3 | +| 5 | deployment-report blocked by cleanup | High | 2 | After #4 | +| 6 | Pre-flight false positives | High | 2 | After #5 | +| 7 | Worker image pruning | High | 2 | Independent | +| 8 | Migration runs after builds | Medium | 3 | After #1, #2, #3 | +| 9 | Missing post-promotion health check | Medium | 3 | After #2 | +| 10 | No S3 bucket accessibility check | Medium | 3 | After #6 | +| 11 | No SHA consistency verification | Medium | 3 | After #8 | +| 12 | Static build cleanup gap | Medium | 4 | Independent | +| 13 | No manual rollback trigger | Medium | 4 | After #9 | +| 14 | DB schema compatibility on rollback | Medium | 4 | After 
#13 | +| 15 | node_modules rebuilt every time | Low | 4 | Independent | \ No newline at end of file diff --git a/infra/scripts/preflight-space.sh b/infra/scripts/preflight-space.sh index 078f754394b746..ba4fb4d2bd3a2f 100644 --- a/infra/scripts/preflight-space.sh +++ b/infra/scripts/preflight-space.sh @@ -11,7 +11,7 @@ set -euo pipefail SPACE_MIN_FREE_GB="${SPACE_MIN_FREE_GB:-10}" SPACE_CHECK_PATH="${SPACE_CHECK_PATH:-/}" -SPACE_CLEANUP_UNTIL="${SPACE_CLEANUP_UNTIL:-72h}" +SPACE_CLEANUP_UNTIL="${SPACE_CLEANUP_UNTIL:-168h}" STATIC_ROOT="${STATIC_ROOT:-/var/www/cal-id-static}" STATIC_CURRENT_LINK="${STATIC_CURRENT_LINK:-${STATIC_ROOT}/current}" STATIC_CANDIDATE_LINK="${STATIC_CANDIDATE_LINK:-${STATIC_ROOT}/candidate}" @@ -167,6 +167,67 @@ space_collect_state_image_containers() { done < <(docker ps --format '{{.Names}}' 2>/dev/null) } +# Tag all protected images with :protected so prune excludes them. +# This survives across prune invocations — the label is checked at prune time. +# Must be called before prune; untag after prune with space_untag_protected_images. +space_tag_protected_images() { + space_log INFO "Tagging protected images with :protected to exclude from prune" + + # Tag images from deployment state (web, api, worker from current.json) + for img in "${SPACE_PROTECTED_IMAGES[@]:-}"; do + [ -n "$img" ] || continue + if docker image inspect "$img" >/dev/null 2>&1; then + if ! docker image inspect "${img}:protected" >/dev/null 2>&1; then + docker tag "$img" "${img}:protected" 2>/dev/null || true + space_log INFO "Tagged protected: ${img}:protected" + fi + else + space_log WARN "Protected image not found locally: ${img}" + fi + done + + # Tag images from all running containers + while IFS= read -r container; do + [ -n "$container" ] || continue + img="$(docker inspect -f '{{.Config.Image}}' "$container" 2>/dev/null || true)" + [ -n "$img" ] || continue + if ! 
docker image inspect "${img}:protected" >/dev/null 2>&1; then + docker tag "$img" "${img}:protected" 2>/dev/null || true + fi + done < <(docker ps --format '{{.Names}}' 2>/dev/null) + + # Tag images from containers with --restart unless-stopped/always that are not running + # These images must be preserved so the container can restart after a host reboot + while IFS= read -r name; do + [ -n "$name" ] || continue + img="$(docker inspect -f '{{.Config.Image}}' "$name" 2>/dev/null || true)" + [ -n "$img" ] || continue + if ! docker image inspect "${img}:protected" >/dev/null 2>&1; then + docker tag "$img" "${img}:protected" 2>/dev/null || true + space_log INFO "Tagged restart-policy image: ${img}" + fi + done < <(docker ps -a --format '{{.Names}}' 2>/dev/null | while IFS= read -r name; do + policy="$(docker inspect -f '{{.HostConfig.RestartPolicy.Name}}' "$name" 2>/dev/null || true)" + case "$policy" in + unless-stopped|always) printf '%s\n' "$name" ;; + esac + done) + + space_log INFO "Protected image tagging complete" +} + +# Remove :protected tags after prune so they don't accumulate. +# Safe to call multiple times — idempotent. +space_untag_protected_images() { + space_log INFO "Removing :protected tags from images" + docker images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null \ + | grep ':protected$' \ + | while IFS= read -r tagged; do + docker rmi "$tagged" 2>/dev/null || true + done + space_log INFO "Protected tag cleanup complete" +} + space_collect_protections() { SPACE_PROTECTED_CONTAINERS=() SPACE_PROTECTED_IMAGES=() @@ -250,10 +311,22 @@ space_run_controlled_cleanup() { space_collect_protections space_remove_stale_containers + # Tag protected images before prune so they survive the cleanup. + # Protected images include: images from current.json, running containers, + # and containers with --restart unless-stopped (restored on host reboot). 
+ space_tag_protected_images + space_log INFO "Prune unused Docker images older than ${SPACE_CLEANUP_UNTIL}" - docker image prune -af --filter "until=${SPACE_CLEANUP_UNTIL}" >/dev/null 2>&1 \ + # NOTE(review): "label!=protected" filters on image LABEL metadata, not on tags. + # `docker tag` adds no label, so confirm :protected-tagged images are really spared. + docker image prune -af \ + --filter "until=${SPACE_CLEANUP_UNTIL}" \ + --filter "label!=protected" >/dev/null 2>&1 \ || space_log WARN "Docker image prune reported a warning" + # Remove :protected tags after prune so they don't accumulate in the image list. + space_untag_protected_images + space_log INFO "Prune unused Docker build cache older than ${SPACE_CLEANUP_UNTIL}" docker builder prune -af --filter "until=${SPACE_CLEANUP_UNTIL}" >/dev/null 2>&1 \ || space_log WARN "Docker builder prune reported a warning" diff --git a/infra/scripts/rollback.sh b/infra/scripts/rollback.sh index 4a7ca52b87ec1a..b13a7bf90b7830 100755 --- a/infra/scripts/rollback.sh +++ b/infra/scripts/rollback.sh @@ -263,9 +263,8 @@ verify_rollback_image() { } rollback_failure() { - log ERROR "Rollback orchestration failed; attempting NGINX revert as last resort" - "${script_dir}/revert-nginx.sh" || true - fail_script "Rollback failed after NGINX revert attempt" + log ERROR "Rollback orchestration failed mid-script; note: NGINX revert is already handled by the workflow before this script runs."
+ fail_script "Rollback failed during script execution" } trap rollback_failure ERR diff --git a/infra/scripts/send-email.sh b/infra/scripts/send-email.sh index 21db0d8d05fec5..4c139a393e9430 100755 --- a/infra/scripts/send-email.sh +++ b/infra/scripts/send-email.sh @@ -23,7 +23,7 @@ # DEPLOY_ENV, RELEASE_ID, GIT_SHA, GITHUB_SHA, BRANCH_NAME, GITHUB_ACTOR, # GITHUB_RUN_ID, GITHUB_REPOSITORY, LOG_FILE, LOG_SNIPPET, FAILURE_REASON, # WEB_IMAGE, API_IMAGE, WORKER_IMAGE, MIGRATION_STATUS, WEB_STATUS, -# API_STATUS, WORKER_STATUS, ROLLBACK_STATUS, STATE_FILE_PATH, +# API_STATUS, WORKER_STATUS, CLEANUP_STATUS, ROLLBACK_STATUS, STATE_FILE_PATH, # NGINX_ACTIVE_UPSTREAMS, ROLLBACK_TARGET_SHA, LOG_SNIPPET_MAX_LINES # # Non-blocking by default: @@ -127,6 +127,7 @@ MIGRATION_STATUS="${MIGRATION_STATUS:-N/A}" WEB_STATUS="${WEB_STATUS:-N/A}" API_STATUS="${API_STATUS:-N/A}" WORKER_STATUS="${WORKER_STATUS:-N/A}" +CLEANUP_STATUS="${CLEANUP_STATUS:-skipped}" ROLLBACK_STATUS="${ROLLBACK_STATUS:-N/A}" STATE_FILE_PATH="${STATE_FILE_PATH:-N/A}" NGINX_ACTIVE_UPSTREAMS="${NGINX_ACTIVE_UPSTREAMS:-N/A}" @@ -168,6 +169,7 @@ Web: ${WEB_STATUS} API: ${API_STATUS} Worker: ${WORKER_STATUS} Rollback: ${ROLLBACK_STATUS} +VPC Cleanup: ${CLEANUP_STATUS:-skipped} Rollback Target: ${ROLLBACK_TARGET_SHA} NGINX Active Upstreams: