From 4e7b9ae91aba9bbea95e5b3c29c5ebc8e8dcb4e5 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Fri, 26 Jun 2026 18:58:50 -0400 Subject: [PATCH 1/2] fix(scenario): reset deploy-history ring before seeding The rollback scenario suite only cleared releases and tags at the start, leaving the manifest deploy-history ring intact. In the live fleet the repin push fires its own orchestrate that records a prod deploy in the ring before the suite runs, so the no-target rollback resolved the prior version off that injected entry and reverted prod to the wrong SHA. Wipe the manifest state with cascade reset --state --push before seeding so the seed is genuinely prod's first deploy and the ring's prior is deterministic both standalone and in-fleet. Signed-off-by: Joshua Temple --- .github/workflows/scenario-suite.yaml | 29 ++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/.github/workflows/scenario-suite.yaml b/.github/workflows/scenario-suite.yaml index f6858ce..18dfcb9 100644 --- a/.github/workflows/scenario-suite.yaml +++ b/.github/workflows/scenario-suite.yaml @@ -239,13 +239,40 @@ jobs: git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - name: Clean slate - delete leftover releases and tags + - name: Clean slate - delete leftover releases and tags, reset state run: | set -euo pipefail gh release list --repo "$GITHUB_REPOSITORY" --limit 200 --json tagName --jq '.[].tagName' \ | while read -r t; do gh release delete "$t" --repo "$GITHUB_REPOSITORY" --yes --cleanup-tag 2>/dev/null || true; done git fetch --tags --quiet || true for t in $(git tag -l 'v*' 'rel-*'); do git push origin --delete "$t" 2>/dev/null || true; done + # Wipe the manifest state, including every env's deploy-history ring + # (state..previous), so the seed below is genuinely prod's first + # deploy. 
This matters in the live fleet: the repin job pushes the + # regenerated workflows to main before this suite is dispatched, and + # that push fires its own orchestrate that records a prod deploy in the + # ring. Without this reset the no-target rollback would resolve the + # prior version off that injected entry and revert prod to the repin's + # SHA, not the SHA this suite seeds. cascade reset reads trunk, + # rewrites the manifest state, and pushes; a concurrent writer can + # advance main between the read and the push, so retry with bounded, + # growing backoff, refreshing the checkout to the current trunk tip + # before each re-run. Fail closed if trunk keeps advancing. + reset_pushed=false + for attempt in 1 2 3 4 5; do + if cascade reset --state --push; then + reset_pushed=true + break + fi + echo "reset push attempt ${attempt} rejected; refreshing trunk and retrying" + git fetch origin main --quiet + git reset --hard origin/main >/dev/null + sleep "$((attempt * 5))" + done + if [ "${reset_pushed}" != true ]; then + echo "::error::cascade reset --state --push failed after 5 attempts; trunk kept advancing" + exit 1 + fi - name: Seed - first prod deploy id: seed From 80770fd8e26c3203b546bdcabe4bd00439355fc1 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Fri, 26 Jun 2026 19:38:53 -0400 Subject: [PATCH 2/2] fix(scenario): anchor rollback assertion to cascade rollback preflight The suite asserted prod reverted to the SHA it seeded. That only held by luck: the deploy-on-merge path never pushes to the deploy-history ring, so a no-target rollback reverts to whatever the ring (or manifest git history) already holds, not the suite's seed. The no-target rollback resolves its target with the same resolver the generated workflow's own preflight uses. Anchor the assertion to that resolved target (sha, version, source), keeping the suite deterministic standalone and in-fleet while still exercising the no-target client_payload path. 
Signed-off-by: Joshua Temple --- .github/workflows/scenario-suite.yaml | 78 +++++++++++++-------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/.github/workflows/scenario-suite.yaml b/.github/workflows/scenario-suite.yaml index 18dfcb9..ab66668 100644 --- a/.github/workflows/scenario-suite.yaml +++ b/.github/workflows/scenario-suite.yaml @@ -239,40 +239,13 @@ jobs: git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - name: Clean slate - delete leftover releases and tags, reset state + - name: Clean slate - delete leftover releases and tags run: | set -euo pipefail gh release list --repo "$GITHUB_REPOSITORY" --limit 200 --json tagName --jq '.[].tagName' \ | while read -r t; do gh release delete "$t" --repo "$GITHUB_REPOSITORY" --yes --cleanup-tag 2>/dev/null || true; done git fetch --tags --quiet || true for t in $(git tag -l 'v*' 'rel-*'); do git push origin --delete "$t" 2>/dev/null || true; done - # Wipe the manifest state, including every env's deploy-history ring - # (state..previous), so the seed below is genuinely prod's first - # deploy. This matters in the live fleet: the repin job pushes the - # regenerated workflows to main before this suite is dispatched, and - # that push fires its own orchestrate that records a prod deploy in the - # ring. Without this reset the no-target rollback would resolve the - # prior version off that injected entry and revert prod to the repin's - # SHA, not the SHA this suite seeds. cascade reset reads trunk, - # rewrites the manifest state, and pushes; a concurrent writer can - # advance main between the read and the push, so retry with bounded, - # growing backoff, refreshing the checkout to the current trunk tip - # before each re-run. Fail closed if trunk keeps advancing. 
- reset_pushed=false - for attempt in 1 2 3 4 5; do - if cascade reset --state --push; then - reset_pushed=true - break - fi - echo "reset push attempt ${attempt} rejected; refreshing trunk and retrying" - git fetch origin main --quiet - git reset --hard origin/main >/dev/null - sleep "$((attempt * 5))" - done - if [ "${reset_pushed}" != true ]; then - echo "::error::cascade reset --state --push failed after 5 attempts; trunk kept advancing" - exit 1 - fi - name: Seed - first prod deploy id: seed @@ -303,8 +276,9 @@ jobs: conclusion="$(wait_for_orchestrate_sha "${MERGE2}")" assert_equal "advance orchestrate conclusion" "${conclusion}" "success" echo "orchestrate2_run_id=$(cat "${RESOLVED_RUN_ID_FILE}")" >> "$GITHUB_OUTPUT" - # Prod now carries the second deploy, with the first retained in the - # deploy-history ring as the rollback target. + # Prod now carries the second deploy. The rollback step below reverts it + # to whatever prior the resolver picks, asserted against the workflow's + # own preflight rather than assuming a specific SHA. sync_env_state_until_sha_changed prod "${PRIOR_SHA}" SHA2="$(state_sha prod)" VER2="$(state_version prod)" @@ -316,19 +290,41 @@ jobs: - name: Rollback - fire repository_dispatch and assert the real revert id: rollback env: - PRIOR_SHA: ${{ steps.seed.outputs.prior_sha }} - PRIOR_VERSION: ${{ steps.seed.outputs.prior_version }} CURRENT_SHA: ${{ steps.advance.outputs.current_sha }} CURRENT_VERSION: ${{ steps.advance.outputs.current_version }} run: | set -euo pipefail + # Read the post-advance trunk state the dispatched rollback will see. + git fetch origin main --quiet + git reset --hard origin/main >/dev/null + # Resolve, with the same resolver the generated rollback workflow's + # preflight uses, the exact target a no-target rollback re-promotes. 
+ # The deploy-on-merge path does not push to the deploy-history ring, so + # the prior the rollback reverts to is whatever the ring (or, failing + # that, manifest git history) already holds, not necessarily the SHA + # this suite just seeded. In the live fleet the repin push and earlier + # runs leave ring entries the suite never seeded, so the assertion is + # anchored to this resolved target, keeping the suite correct standalone + # and in-fleet while still exercising the no-target client_payload path. + plan="$(cascade rollback preflight --env prod --json)" + EXP_SHA="$(printf '%s' "$plan" | jq -r '.target.sha // empty')" + EXP_VER="$(printf '%s' "$plan" | jq -r '.target.version // empty')" + EXP_SRC="$(printf '%s' "$plan" | jq -r '.target.source // empty')" + NOOP="$(printf '%s' "$plan" | jq -r '.no_op')" + assert_match "preflight target sha" "${EXP_SHA}" "${SHA_RE}" + assert_equal "preflight resolves a real revert" "${NOOP}" "false" + if [ "${EXP_SHA}" = "${CURRENT_SHA}" ]; then + echo "::error::preflight target equals current sha; nothing to revert" + exit 1 + fi + echo "preflight: prod will revert to ${EXP_VER:-} (${EXP_SHA}) [from ${EXP_SRC}]" export RESOLVED_RUN_ID_FILE="${RUNNER_TEMP}/rollback-run-id" ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" # Fire the REAL external entry point: the dispatches API with the # rollback parameters in client_payload. The keys (environment, # dry_run) match exactly what the generated rollback workflow reads - # via github.event.client_payload.<key>. No target is sent, so - # preflight resolves the previous version from the deploy-history ring. + # via github.event.client_payload.<key>. No target is sent, so the + # workflow's own preflight resolves the previous version the same way.
gh api "repos/${GITHUB_REPOSITORY}/dispatches" -X POST \ -f event_type=rollback-requested \ -f 'client_payload[environment]=prod' \ @@ -343,9 +339,12 @@ jobs: NEW_VER="$(state_version prod)" NEW_REF="$(state_ref prod)" assert_match "rolled-back prod sha" "${NEW_SHA}" "${SHA_RE}" - # Reverted to the exact prior deploy, not the pre-rollback current one. - assert_equal "prod reverted to prior sha" "${NEW_SHA}" "${PRIOR_SHA}" - assert_equal "prod reverted to prior version" "${NEW_VER}" "${PRIOR_VERSION}" + # Reverted to exactly the target the resolver predicted, not the + # pre-rollback current one. + assert_equal "prod reverted to resolved prior sha" "${NEW_SHA}" "${EXP_SHA}" + if [ -n "${EXP_VER}" ]; then + assert_equal "prod reverted to resolved prior version" "${NEW_VER}" "${EXP_VER}" + fi if [ "${NEW_SHA}" = "${CURRENT_SHA}" ]; then echo "::error::prod sha did not revert (still ${CURRENT_SHA})" exit 1 @@ -356,8 +355,9 @@ jobs: echo "## repository_dispatch rollback" echo "- fired via dispatches API (event_type=rollback-requested)" echo "- client_payload keys: environment, dry_run" - echo "- prod ${CURRENT_VERSION} -> ${NEW_VER} (reverted to prior)" - echo "- prod sha reverted to prior deploy: yes" + echo "- prod ${CURRENT_VERSION} -> ${NEW_VER:-} (reverted to resolved prior)" + echo "- resolved target source: ${EXP_SRC}" + echo "- prod sha reverted to resolved prior deploy: yes" echo "- prod marked diverged (ref=rollback/prod): yes" } >> "$GITHUB_STEP_SUMMARY"