diff --git a/.github/workflows/harness-ci.yml b/.github/workflows/harness-ci.yml index 82606ac..5a07c1f 100644 --- a/.github/workflows/harness-ci.yml +++ b/.github/workflows/harness-ci.yml @@ -59,9 +59,33 @@ name: harness CI (no LLM) # TEST_P256_VERIFIER_ADDRESS_HEIMA per test-environment refresh. # TEST_K11_VERIFIER_ADDRESS_HEIMA # +# Additional secrets for the optional path-conditional auto-deploy of the +# test broker EC2 (issue #101 — see docs/ci-setup.md §7): +# +# OIDC_AWS_ROLE_ARN_DEPLOY IAM role assumed by deploy-test-broker. Trust +# policy: federated on GitHub Actions OIDC, +# conditioned on repo:litentry/agentKeys:*. +# Inline policy: ssm:SendCommand on +# document/AWS-RunShellScript + +# one EC2 instance ARN (= TEST_BROKER_INSTANCE_ID). +# Provisioned by scripts/provision-ci-deploy-role.sh. +# SEPARATE from TEST_OIDC_AWS_ROLE_ARN by design: +# e2e role exercises the workload (sts:AssumeRole +# on data roles, S3 verify), deploy role drives +# the broker re-deploy on EC2. Separation of +# duties — a compromise of one doesn't grant +# the other's capability. +# TEST_BROKER_INSTANCE_ID EC2 instance ID (i-xxxxxxxxxxxxxxxxx) hosting +# test-broker.${ZONE}. Pinned in the deploy role's +# inline SSM policy so a leaked session cred +# cannot SendCommand on any other EC2. +# # Gating: until TEST_OIDC_AWS_ROLE_ARN is set, the workflow's preflight # job surfaces a ::warning:: skip and exits clean — safe to merge before -# the operator activates the test infra. +# the operator activates the test infra. The auto-deploy gate is a +# distinct check (OIDC_AWS_ROLE_ARN_DEPLOY + TEST_BROKER_INSTANCE_ID +# both present) so harness validation can be activated without +# auto-deploy (auto-deploy itself still requires the harness gate). # # WebAuthn: never invoked.
harness/v2-stage1-demo.sh defaults to # WEBAUTHN_MODE=0 (line 131), v2-stage2-demo.sh accepts --stub, neither @@ -90,14 +114,27 @@ on: default: "all" type: choice options: ["1", "2", "3", "all"] + force_deploy_broker: + description: "Force deploy-test-broker even if no broker paths changed (dry-run validation)" + required: false + default: "false" + type: choice + options: ["false", "true"] concurrency: group: harness-ci-${{ github.ref }} cancel-in-progress: true permissions: - id-token: write # GitHub Actions OIDC → assume TEST_OIDC_AWS_ROLE_ARN + id-token: write # GitHub Actions OIDC → assume TEST_OIDC_AWS_ROLE_ARN + # (and OIDC_AWS_ROLE_ARN_DEPLOY for deploy-test-broker) contents: read + pull-requests: read # dorny/paths-filter@v3 on pull_request events queries + # the GitHub REST API (/repos/.../pulls/N/files) to list + # changed paths. Without this, the API returns + # 'Bad credentials' and the detect-changes job fails. + # Required only on PR triggers; workflow_dispatch + + # push triggers don't need it (no PR to query). jobs: rust-checks: @@ -126,6 +163,44 @@ jobs: # map — same convention as the existing @claude review workflow. - run: cargo test --workspace -- --test-threads=1 + detect-changes: + # Issue #101: path-conditional triggers for auto-deploy of the test broker. + # Computes `broker_changed` so deploy-test-broker can skip when a PR only + # touches docs/harness/test infra — saves ~3 min cargo rebuild + ssm wait + # per CI run, and avoids touching the test EC2 from PRs that don't need to. + # + # Path-filter false-negative caveats (see issue #101 "Trade-offs"): + # - workspace-shared crates (agentkeys-types, agentkeys-signer-protocol) + # ripple into the broker → listed in the filter conservatively. + # - Cargo.lock changes → also listed (a transitive dep bump can affect + # broker behavior at runtime). 
+ name: detect changed paths (broker / contracts) + runs-on: ubuntu-latest + outputs: + broker_changed: ${{ steps.f.outputs.broker }} + steps: + - uses: actions/checkout@v4 + with: + # paths-filter needs the merge-base to diff against; default fetch + # is shallow. fetch-depth=0 ⇒ full history (cheap on a small repo). + fetch-depth: 0 + - uses: dorny/paths-filter@v3 + id: f + with: + filters: | + broker: + - 'crates/agentkeys-broker-server/**' + - 'crates/agentkeys-worker-*/**' + - 'crates/agentkeys-signer-protocol/**' + - 'crates/agentkeys-types/**' + - 'crates/agentkeys-core/**' + - 'scripts/setup-broker-host.sh' + - 'scripts/setup-broker-host.sh.d/**' + - 'scripts/broker.env' + - 'scripts/broker.test.env' + - 'Cargo.toml' + - 'Cargo.lock' + preflight: # Gate the harness jobs on the test infra credentials being present. # Until the operator sets TEST_OIDC_AWS_ROLE_ARN, the harness jobs @@ -135,6 +210,7 @@ jobs: needs: rust-checks outputs: should_run: ${{ steps.gate.outputs.should_run }} + deploy_ready: ${{ steps.gate.outputs.deploy_ready }} steps: - id: gate run: | @@ -145,11 +221,281 @@ jobs: echo "should_run=false" >> "$GITHUB_OUTPUT" echo "::warning::TEST_OIDC_AWS_ROLE_ARN unset — harness E2E skipped. See workflow header for operator setup." fi + # deploy_ready: both deploy-side secrets must be present. Independent + # of should_run so an operator can opt INTO harness validation + # without enabling auto-deploy (e.g. while still vetting the deploy + # role's blast radius). + if [ -n "${{ secrets.OIDC_AWS_ROLE_ARN_DEPLOY }}" ] && [ -n "${{ secrets.TEST_BROKER_INSTANCE_ID }}" ]; then + echo "deploy_ready=true" >> "$GITHUB_OUTPUT" + echo "deploy secrets present; auto-deploy eligible" + else + echo "deploy_ready=false" >> "$GITHUB_OUTPUT" + echo "::notice::OIDC_AWS_ROLE_ARN_DEPLOY or TEST_BROKER_INSTANCE_ID unset — auto-deploy skipped. See docs/ci-setup.md §7." 
+ fi + + deploy-test-broker: + # Issue #101: drives `setup-broker-host.sh --test --yes` on the test broker + # EC2 via AWS SSM whenever a PR/push changes broker-affecting paths. + # + # Why deploy BEFORE harness-e2e (vs the issue's `needs: harness-e2e`): + # the failure mode this fixes is "harness scripts at version B vs broker + # binary at version A → spurious pass or confusing failure". Deploying + # first means harness-e2e validates the SAME revision the PR proposes — + # so a broker bug introduced by the PR is caught in the same PR, not + # leaked to whoever pushes next. Trade-off: a broker bug that crashes on + # startup will fail the deploy and skip harness-e2e (which is also the + # right signal — there's nothing to test). + # + # Concurrency: cross-PR races on the test EC2 are possible (PR-A deploys + # version A, PR-B deploys version B mid-flight, PR-A's harness sees B). + # Mitigation deferred to the followup PR — first cut accepts the race + # since concurrent broker-touching PRs are rare and the test EC2 is + # disposable. To add later: `concurrency: group: test-broker-deploy` + # with `cancel-in-progress: false` so deploys queue. 
+ name: deploy broker to test EC2 (path-conditional) + needs: [preflight, detect-changes] + if: | + needs.preflight.outputs.should_run == 'true' && + needs.preflight.outputs.deploy_ready == 'true' && + (needs.detect-changes.outputs.broker_changed == 'true' || + (github.event_name == 'workflow_dispatch' && inputs.force_deploy_broker == 'true')) + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials via OIDC (deploy role) + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.OIDC_AWS_ROLE_ARN_DEPLOY }} + aws-region: ${{ secrets.TEST_AWS_REGION || 'us-east-1' }} + # Session name shows up in CloudTrail — distinct from the e2e + # role's session-name pattern so the deploy invocations are + # filterable separately. + role-session-name: gh-deploy-${{ github.run_id }} + + - name: Sanity-check the test broker EC2 is SSM-managed + # Fail fast with a clear remediation path. Three failure modes are + # distinguished: + # - AccessDenied → deploy role lacks ssm:DescribeInstanceInformation. + # Operator re-runs provision-ci-deploy-role.sh on their laptop; + # the inline policy is idempotently refreshed to include it. + # - Empty/None → instance genuinely not registered (no agent, no + # profile, wrong region). Operator SSH-debugs or re-runs + # setup-broker-host.sh which auto-installs amazon-ssm-agent. + # - Other state → unexpected; fail loud with the value for triage. 
+ env: + REGION: ${{ secrets.TEST_AWS_REGION || 'us-east-1' }} + INSTANCE_ID: ${{ secrets.TEST_BROKER_INSTANCE_ID }} + run: | + set -euo pipefail + stderr_file=$(mktemp) + state=$(aws ssm describe-instance-information \ + --region "$REGION" \ + --filters "Key=InstanceIds,Values=$INSTANCE_ID" \ + --query 'InstanceInformationList[0].PingStatus' \ + --output text 2>"$stderr_file" || echo "") + if grep -q "AccessDenied" "$stderr_file"; then + echo "::error::Deploy role lacks ssm:DescribeInstanceInformation." + echo "::error::Fix: re-run scripts/provision-ci-deploy-role.sh on the operator laptop —" + echo "::error::the inline policy is now refreshed with the missing perm (idempotent)." + rm -f "$stderr_file" + exit 1 + fi + rm -f "$stderr_file" + [ -z "$state" ] && state="None" + case "$state" in + Online) + echo "::notice::SSM agent online on $INSTANCE_ID" + ;; + None) + echo "::error::$INSTANCE_ID is not SSM-managed (state=$state)." + echo "::error::SSH into the broker EC2 and run scripts/setup-broker-host.sh --test --yes —" + echo "::error::it auto-installs amazon-ssm-agent. See docs/ci-setup.md §7.1." + exit 1 + ;; + *) + echo "::error::SSM agent state = $state on $INSTANCE_ID (expected Online)" + exit 1 + ;; + esac + + - name: Compute deploy ref (PR head or push branch) + # GitHub provides GITHUB_HEAD_REF for PRs (source branch) and + # GITHUB_REF_NAME for push events. Falling through to "evm" as a + # safety net for manual workflow_dispatch on the default branch. + # The test EC2 fetches + checks out this ref before re-running + # setup-broker-host.sh, so the deployed binary matches the PR. + run: | + set -euo pipefail + ref="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME:-evm}}" + if [ -z "$ref" ]; then + echo "::error::could not derive a ref to deploy" + exit 1 + fi + # Refuse refs that contain shell metacharacters (defense-in-depth + # — GitHub already validates branch names, but the value is + # interpolated into a remote shell snippet below). 
+ if printf '%s' "$ref" | grep -qE '[^A-Za-z0-9._/-]'; then + echo "::error::ref '$ref' contains unsupported characters" + exit 1 + fi + echo "DEPLOY_REF=$ref" >> "$GITHUB_ENV" + echo "::notice::will deploy ref: $ref" + + - name: SendCommand — fetch + checkout + setup-broker-host.sh --test --yes + env: + REGION: ${{ secrets.TEST_AWS_REGION || 'us-east-1' }} + INSTANCE_ID: ${{ secrets.TEST_BROKER_INSTANCE_ID }} + # Operator-pinnable override; the auto-discover loop below covers the + # common candidates when this isn't set. + REPO_DIR_OVERRIDE: ${{ secrets.TEST_BROKER_REPO_DIR }} + run: | + set -euo pipefail + # Compose the remote shell script. `$DEPLOY_REF` is interpolated by + # the runner's shell (GHA env block makes it visible here); the + # remote SSM-driven shell sees the literal branch name. The remote + # shell runs as root (SSM-default on Ubuntu AMIs); git ops use + # `sudo -u ` so the working tree stays owned by whoever + # originally cloned it (typically ubuntu, sometimes agentkeys / root). + # + # Repo location auto-discovery: try TEST_BROKER_REPO_DIR override + # first, then common candidates. Fail fast with a clear remediation + # path if no candidate has the repo. Avoids the 'cd: can\'t cd to + # /home/ubuntu/agentKeys' failure mode when the operator cloned to + # a non-default path. + read -r -d '' deploy_script <&2 + echo "candidates tried: \$REPO_DIR_OVERRIDE /home/ubuntu/agentKeys /home/agentkey/agentKeys /opt/agentkeys /srv/agentkeys /root/agentKeys etc." >&2 + echo "Fix: pin the path via the TEST_BROKER_REPO_DIR repo secret." 
>&2 + exit 2 + fi + echo "using repo at \$REPO_DIR" + REPO_OWNER=\$(stat -c '%U' "\$REPO_DIR") + echo "tree is owned by \$REPO_OWNER" + cd "\$REPO_DIR" + sudo -u "\$REPO_OWNER" git fetch --prune origin + sudo -u "\$REPO_OWNER" git checkout "$DEPLOY_REF" || sudo -u "\$REPO_OWNER" git checkout "origin/$DEPLOY_REF" + sudo -u "\$REPO_OWNER" git pull --ff-only origin "$DEPLOY_REF" 2>/dev/null || true + bash scripts/setup-broker-host.sh --test --yes --non-interactive + EOF + + # jq --arg passes the multi-line script outside of shell parameter + # expansion (no modifier bugs per CLAUDE.md heredoc-trap rule). + params=$(jq -n --arg script "$deploy_script" '{ + commands: [$script], + executionTimeout: ["900"] + }') + + cmd_id=$(aws ssm send-command \ + --region "$REGION" \ + --instance-ids "$INSTANCE_ID" \ + --document-name "AWS-RunShellScript" \ + --comment "gh-ci deploy ${GITHUB_RUN_ID} ref=${DEPLOY_REF}" \ + --parameters "$params" \ + --query 'Command.CommandId' \ + --output text) + echo "SSM_COMMAND_ID=$cmd_id" >> "$GITHUB_ENV" + echo "::notice::SSM SendCommand queued: $cmd_id" + + - name: Poll SSM command until completion + env: + REGION: ${{ secrets.TEST_AWS_REGION || 'us-east-1' }} + INSTANCE_ID: ${{ secrets.TEST_BROKER_INSTANCE_ID }} + run: | + set -euo pipefail + # Poll every 10s for up to 15 min. The command runs setup-broker-host.sh + # which rebuilds + restarts broker/signer/4 workers; cold cargo cache + # can be ~10min, warm ~3min. 
+ for i in $(seq 1 90); do + sleep 10 + status=$(aws ssm get-command-invocation \ + --region "$REGION" \ + --command-id "$SSM_COMMAND_ID" \ + --instance-id "$INSTANCE_ID" \ + --query 'Status' \ + --output text 2>/dev/null || echo "Pending") + echo "iter=$i status=$status" + case "$status" in + Success) + aws ssm get-command-invocation \ + --region "$REGION" \ + --command-id "$SSM_COMMAND_ID" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text | tail -200 + echo "::notice::deploy ok (ssm command $SSM_COMMAND_ID)" + exit 0 + ;; + Failed|Cancelled|TimedOut) + echo "::error::SSM command terminal status: $status" + aws ssm get-command-invocation \ + --region "$REGION" \ + --command-id "$SSM_COMMAND_ID" \ + --instance-id "$INSTANCE_ID" \ + --query '{stdout:StandardOutputContent,stderr:StandardErrorContent}' \ + --output json + exit 1 + ;; + Pending|InProgress|Delayed) + continue + ;; + *) + echo "::warning::unexpected status: $status" + ;; + esac + done + echo "::error::SSM command $SSM_COMMAND_ID did not complete within 15min" + exit 1 harness-e2e: name: harness/v2-stage*-demo.sh on Heima mainnet (test deployer) - needs: preflight - if: needs.preflight.outputs.should_run == 'true' + needs: [preflight, deploy-test-broker] + # Codex adversarial review (PR #102) confirmed: the harness's chain-mutating + # scripts (heima-fund-account.sh + heima-agent-create.sh) share ONE Heima + # test deployer wallet. The outer `concurrency: harness-ci-${{ github.ref }}` + # only cancels in-flight runs on the SAME ref — concurrent runs on DIFFERENT + # refs (PR branch + manual dispatch, two PRs, etc.) share the deployer and + # collide on nonce in the Heima mempool, surfacing as + # `replacement transaction underpriced`. + # + # This second concurrency group, scoped to the deployer (not the ref), + # serializes harness-e2e runs globally. 
`cancel-in-progress: false` queues + # subsequent runs instead of cancelling them — so a long-running harness + # doesn't lose work to a newer push. + concurrency: + group: heima-test-deployer-nonce + cancel-in-progress: false + # Run when: + # - preflight gates green (test infra is set up) + # - AND either: + # (a) deploy-test-broker succeeded (PR re-deployed the broker + # to test EC2, validating fresh broker code), OR + # (b) deploy-test-broker was skipped (no broker paths changed + # OR deploy_ready=false — the EC2's existing binary still + # covers the harness contract). + # always() forces evaluation even when the upstream `if:` skips + # deploy-test-broker (GHA treats `needs:` deps with skipped jobs as + # failing the implicit `success()` filter without always()). + if: | + always() && + needs.preflight.outputs.should_run == 'true' && + (needs.deploy-test-broker.result == 'success' || + needs.deploy-test-broker.result == 'skipped') runs-on: ubuntu-latest timeout-minutes: 60 diff --git a/README.md b/README.md index 8807068..75d499e 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,20 @@ Credential broker for AI agents. A master (human) delegates scoped, revocable ac Status: pre-v0. Stage 5 in progress (see `harness/progress.json`). -## What it does +Architecture, language choices, trust boundaries: [`docs/arch.md`](docs/arch.md). + +--- + +## 👤 For humans + +### What it does - **Master CLI** (`agentkeys`) — runs on your laptop; owns a session key in the OS keychain; approves pair/recover/scope-change requests. - **Sandbox daemon** (`agentkeys-daemon`) — runs inside the agent sandbox; brokers credential reads over MCP + a Unix socket; never exposes raw keys to the agent. - **Provisioner** (`agentkeys-provisioner` + `provisioner-scripts`) — Rust orchestrator drives TypeScript/Playwright scrapers to sign up for services and hand the resulting API key back through the trust boundary. 
- **Mock backend** (`agentkeys-mock-server`) — v0-only; mirrors the Heima parachain API so we can build end-to-end before the chain integration lands. -Architecture, language choices, trust boundaries: [`docs/arch.md`](docs/arch.md). - -## Workspace layout +### Workspace layout ``` crates/ @@ -31,7 +35,7 @@ harness/ stage-gated build harness + progress ~80% Rust, 100% of the security-critical path in Rust. TypeScript is confined to browser automation and (post-MVP) the Web GUI frontend. -## Build & test +### Build & test ``` cargo build @@ -50,12 +54,56 @@ cargo test -p agentkeys-daemon -p agentkeys-mcp cargo test -p agentkeys-provisioner ``` -## Development +### First-machine setup + +Fresh laptop? Start with [`docs/dev-setup.md`](docs/dev-setup.md) — it walks you through rustup, jj, Node, AWS CLI, browser, and runs the workspace smoke tests. -Staged build plan in [`docs/spec/plans/development-stages.md`](docs/spec/plans/development-stages.md). Each stage has a `harness/stage-N-done.sh` gate that must exit 0 before the stage is marked complete. Contributor workflow: [`CLAUDE.md`](CLAUDE.md). +### Inner-loop dev -Version control uses [jj (Jujutsu)](https://github.com/jj-vcs/jj), not raw git. +Iterating on the broker, signer, mock-server, or operator-side scripts? [`docs/spec/broker-and-operator-dev-guide.md`](docs/spec/broker-and-operator-dev-guide.md) covers the local edit-build-test loop: which process to run on which port, how to point harness scripts at `localhost`, how to use `harness/v2-stage*-demo.sh` for resumable step-by-step testing. -## License +### License Dual-licensed under **MIT OR Apache-2.0**, at your choice. + +--- + +## 🤖 For AI coding agents + +**You must read these before making any change.** They override defaults from your training data and cover the project-specific guardrails. 
+ +| Read | Why | +|---|---| +| [`CLAUDE.md`](CLAUDE.md) | Project-specific rules: docs layout, /create-pr workflow in worktrees, terminology-source-of-truth, branch push policy, idempotent-remote-setup invariants, runbook-fix-fold-back policy. **Read first, every session.** | +| [`docs/arch.md`](docs/arch.md) | Single source of truth for component inventory (K1–K11), trust boundaries, HDKD actor tree, per-actor binding ceremonies. When the per-doc detail outgrows arch.md, link outward — never duplicate. | +| [`docs/spec/plans/development-stages.md`](docs/spec/plans/development-stages.md) | The 8-stage build plan. Each stage has a `harness/stage-N-done.sh` gate; never self-grade — run the gate. | +| [`docs/spec/plans/execution-plan.md`](docs/spec/plans/execution-plan.md) | Orchestration runbook (ralph, team, ultraqa workflows). | +| [`docs/spec/broker-and-operator-dev-guide.md`](docs/spec/broker-and-operator-dev-guide.md) | Inner edit-build-test loop for broker + operator-side code. Use this before suggesting changes to the broker's run-time behavior. | + +### Hard rules (from CLAUDE.md) + +These are non-negotiable. Violating them produces broken PRs / corrupted state. + +- **Use `jj` (Jujutsu), never raw `git`.** Common mappings in CLAUDE.md. The one exception: inside a Claude Code `.claude/worktrees/<name>/` worktree, the initial commit must use `git` (jj can't colocate in a git-worktree); then `cd` to the main repo and push via `jj git push`. Never include `Co-Authored-By:` lines in those commits. +- **Branch `evm` pushes immediately.** On `evm`, push after every `jj describe` — the remote broker host pulls from `origin/evm` to redeploy. "I'll push at the end" silently breaks deploys. +- **Diagnose before edit.** Reproduce the failure locally first; isolate the layer (shell / client / doc / broker code / network). If the cause is local to the operator's shell, respond with the one-line fix — don't edit the repo.
+- **Land the fix everywhere.** Once a local repro proves a fix is correct, land it the same turn — search the repo for every affected file, commit, push to `origin/evm`. Don't stop at "verified locally" or "fixed one file." +- **Runbook fix fold-back.** When an operator hits a runbook failure, two things land in the same turn: (1) the targeted fix, (2) a revision to the runbook so the next operator doesn't hit the same trap. +- **No hardcoded values.** Use env var + default, CLI flag + default, or a config file. If you must hardcode temporarily, log it in [`hardcoded.md`](hardcoded.md) with file:line + reason + what would unblock dynamic. +- **Idempotent remote setup.** Every script that mutates remote state (AWS / Heima / CI / VM / DNS) must exit 0 on re-run without re-applying. Pre-check with `get-*` before mutating; log `ok | skip | fail `. +- **Plan completion is all-or-nothing.** When implementing a plan, every numbered step must be done — or the PR summary's "What did NOT land" section must explicitly list what was skipped and why. +- **Terminology source of truth.** Never invent a new name for a concept arch.md already names. If you find divergence, fix it in the same commit or document the alias in arch.md's "Canonical names" section. + +### Per-session protocol + +1. `jj log --limit 10 && cat harness/progress.json && bash harness/init.sh $(jq -r .current_stage harness/progress.json)` +2. Read the stage contract for the current stage in `docs/spec/plans/development-stages.md`. +3. Pick the HIGHEST-PRIORITY incomplete deliverable from `harness/features.json`. +4. Implement ONE deliverable, run `cargo test -p `, `jj describe`, update `harness/features.json`, `jj new`. 
+ +### Single entry points + +Don't reach for ad-hoc `systemctl`, `scp`, or `forge script` — these are wrapped: + +- **Remote broker host** (binary upgrades, systemd, nginx, env tweaks): `bash scripts/setup-broker-host.sh` +- **Heima chain bring-up** (deploy, binding ceremonies, scope grants, K11 enroll, audit-row append, worker smoke): `bash scripts/setup-heima.sh` diff --git a/docs/ci-setup.md b/docs/ci-setup.md index 005d77b..0e270bc 100644 --- a/docs/ci-setup.md +++ b/docs/ci-setup.md @@ -365,6 +365,122 @@ gh workflow run harness-ci.yml --repo litentry/agentKeys --field stage=3 When the workflow passes against the test stack, CI is live. Every subsequent push to a PR triggers it; you're done. +### 7. (Optional) Wire auto-deploy of the test broker (issue [#101](https://github.com/litentry/agentKeys/issues/101)) + +Without this step, the workflow validates against the **already-deployed** test broker. If a PR changes broker code (`crates/agentkeys-broker-server/**`, `crates/agentkeys-worker-*/**`, `crates/agentkeys-signer-protocol/**`, `scripts/setup-broker-host.sh*`, or any workspace-shared crate the broker links against), the test broker binary silently drifts from the PR's source tree — the harness then exercises *old* broker code against *new* harness scripts, producing either spurious passes or confusing failures. + +Step 7 wires a second OIDC role (`github-actions-agentkeys-deploy`) plus two new GitHub secrets. When activated, the workflow's `detect-changes` job sees broker-affecting paths in the diff, the `deploy-test-broker` job assumes that role, and `aws ssm send-command` drives `setup-broker-host.sh --test --yes` on the test EC2 — re-deploying the broker so `harness-e2e` validates the PR's actual code. The deploy job is **gated three ways**: + +1. `paths-filter` boolean (no broker code changed → skip). +2. Both deploy secrets present (`OIDC_AWS_ROLE_ARN_DEPLOY` + `TEST_BROKER_INSTANCE_ID`). +3. 
`preflight.outputs.should_run == 'true'` (test infra fully wired). + +If any gate fails, the deploy job is **skipped, not failed** — `harness-e2e` still runs against the existing broker binary. So this step is fully opt-in; partial activation is safe. + +#### 7.1 Run the provisioning script + +```bash +awsp agentkeys-admin +# Look up the test broker EC2 instance ID (one-shot — pin it once): +TEST_BROKER_INSTANCE_ID=$(aws ec2 describe-instances \ + --region "$REGION" \ + --filters "Name=ip-address,Values=$(curl -sS "https://dns.google/resolve?name=$BROKER_HOST&type=A" | jq -r '.Answer[0].data')" \ + --query 'Reservations[0].Instances[0].InstanceId' --output text) +echo "$TEST_BROKER_INSTANCE_ID" # → i-xxxxxxxxxxxxxxxxx + +# Idempotent provisioning — safe to re-run. Use --fix-ssm on the FIRST run +# so the script auto-attaches AmazonSSMManagedInstanceCore to the broker EC2's +# instance profile if it's missing (a fresh EC2 commonly lacks this policy). +bash scripts/provision-ci-deploy-role.sh \ + --test-broker-instance-id "$TEST_BROKER_INSTANCE_ID" \ + --env-file scripts/operator-workstation.test.env \ + --fix-ssm +``` + +The script: + +- Creates / refreshes the `github-actions-agentkeys-deploy` IAM role with a federated trust policy on the GitHub Actions OIDC provider, scoped to `repo:litentry/agentKeys:*` (any branch in this repo can trigger; the workflow's path filter + preflight gate further restrict when the role is actually used). +- Attaches an inline policy `agentkeys-ci-deploy-ssm` with: + - `ssm:SendCommand` on `document/AWS-RunShellScript` + the one instance ARN (so even if the role's session creds leaked, the worst a third party can do is re-run setup-broker-host.sh on the test EC2 — a destructive op there is `terraform apply`-style: idempotent, recoverable, and contained to the test environment). + - `ssm:GetCommandInvocation` / `ssm:ListCommandInvocations` / `ssm:DescribeInstanceInformation` for status polling + the workflow's pre-deploy sanity check. 
+ - `ec2:DescribeInstances` scoped to the one instance ID, for the workflow's pre-deploy sanity check. + +> Already provisioned the role before `ssm:DescribeInstanceInformation` was added to the policy template? Re-run the provisioning script. `put-role-policy` is idempotent — it overwrites the inline policy with the current source-of-truth shape, picking up any added permissions. +- Verifies the test EC2 is registered with SSM (`PingStatus = Online`). With `--fix-ssm`, auto-remediates the common "instance profile is missing AmazonSSMManagedInstanceCore" case by attaching the policy and polling for up to 3 min for the SSM agent to refresh its creds. Without `--fix-ssm`, just reports the failure with manual fix instructions. + +**SSM remediation modes (what `--fix-ssm` covers, what it doesn't):** + +| Failure | What `--fix-ssm` does | What it CAN'T fix automatically | +|---|---|---| +| Instance profile missing `AmazonSSMManagedInstanceCore` | Attaches the policy, polls for Online | (handled) | +| Policy already attached, agent process running with stale creds | Polls until agent refreshes (~1-3 min typical) | If poll times out: SSH + `sudo systemctl restart amazon-ssm-agent`, OR `aws ec2 reboot-instances …` | +| Instance has NO instance profile at all | Creates a dedicated `agentkeys-test-broker-ssm` role + instance profile (EC2 trust + `AmazonSSMManagedInstanceCore`) and associates it with the EC2. IMDS surfaces the new creds within ~30s. Safe because the broker's app-layer AWS access uses static creds from `broker.env`, not IMDS — adding IMDS-served creds can only ADD capability for the SSM agent, not displace anything. 
| (handled) | +| SSM Agent not installed (no `amazon-ssm-agent` unit) | Reports state; can't reach the box to install (operator's laptop has no SSH-into-EC2 capability from the provision script) | Re-run `bash scripts/setup-broker-host.sh --test --yes` on the EC2 — it now installs `amazon-ssm-agent` (snap preferred, .deb fallback) as part of broker bootstrap. One-shot manual recovery if you don't want to re-run the full setup: `ssh test-broker 'sudo snap install amazon-ssm-agent --classic && sudo systemctl enable --now snap.amazon-ssm-agent.amazon-ssm-agent.service'` | +| Private VPC subnet without an SSM VPC endpoint | Reports state | Operator wires the VPC endpoint (unlikely for a public-IP broker, but possible) | + +Re-running the script after any of the operator-side fixes is safe (idempotent — every step is `get-*` pre-checked before any mutation). + +#### 7.2 Set the two new repo secrets + +```bash +# Print the deploy role ARN you just provisioned (script also prints this): +role_arn=$(aws iam get-role --role-name github-actions-agentkeys-deploy \ + --query 'Role.Arn' --output text) + +gh secret set OIDC_AWS_ROLE_ARN_DEPLOY --repo litentry/agentKeys --body "$role_arn" +gh secret set TEST_BROKER_INSTANCE_ID --repo litentry/agentKeys --body "$TEST_BROKER_INSTANCE_ID" +``` + +| Secret | Purpose | +|---|---| +| `OIDC_AWS_ROLE_ARN_DEPLOY` | ARN of `github-actions-agentkeys-deploy` — assumed by the `deploy-test-broker` job via GitHub Actions OIDC. | +| `TEST_BROKER_INSTANCE_ID` | EC2 instance ID (`i-…`) hosting `test-broker.${ZONE}`. The deploy role's inline policy is scoped to *this single instance*. | +| `TEST_BROKER_REPO_DIR` | **Optional.** Absolute path of the agentKeys git checkout on the EC2 (e.g. `/home/ubuntu/agentKeys`). 
The deploy workflow auto-discovers across common candidates (`/home/ubuntu/agentKeys`, `/home/ubuntu/agentkeys`, `/opt/agentkeys`, `/srv/agentkeys`, `/root/agentKeys`), so this only needs to be set when the operator cloned to a non-standard path and the workflow's auto-discover step prints `could not locate the agentKeys checkout`. | + +#### 7.3 Dry-run validate + +Trigger the workflow manually with `force_deploy_broker=true` so the deploy fires regardless of whether the latest commit touched broker paths. + +**Pre-merge — `--ref` is required.** `gh workflow run` reads the workflow definition from the *default branch* (`main`) unless you tell it otherwise. Since the `force_deploy_broker` input lives on the PR branch, dispatching without `--ref` fails with `HTTP 422: Unexpected inputs provided: ["force_deploy_broker"]`. Pass `--ref` so GHA reads the workflow YAML (and its inputs) from the PR branch instead: + +```bash +gh workflow run harness-ci.yml --repo litentry/agentKeys \ + --ref claude/adoring-bell-1b9ca8 \ + --field stage=1 \ + --field force_deploy_broker=true +``` + +Replace `claude/adoring-bell-1b9ca8` with your actual PR branch name (`git rev-parse --abbrev-ref HEAD` if you're on it locally). + +**Post-merge — `--ref` is optional.** Once this PR is on `main`, dispatching without `--ref` will work because the input is part of the default-branch workflow definition. (The `--ref` form still works and lets you target any branch.) + +Then in the run logs: + +- `deploy-test-broker` should show `SSM agent online on i-…` (sanity check passed). +- The `SendCommand` step prints the command ID; the next step polls until `Success`. +- On success: the tail of `StandardOutputContent` shows `setup-broker-host.sh` finishing cleanly (`ok systemd unit … active`, `ok nginx running`, etc.). +- On failure: stdout + stderr are dumped to the GHA log. 
The most common cause is `git checkout` failing on the EC2 because the source tree doesn't have the PR branch fetched — fix by ssh-ing into the box and running `sudo -u ubuntu git fetch --prune origin` once. + +#### 7.4 Disable / disarm + +Remove either secret to disarm — the workflow's `preflight.outputs.deploy_ready` will flip to `false` and the deploy job silently skips: + +```bash +gh secret delete OIDC_AWS_ROLE_ARN_DEPLOY --repo litentry/agentKeys +# or +gh secret delete TEST_BROKER_INSTANCE_ID --repo litentry/agentKeys +``` + +The IAM role can stay provisioned indefinitely — without the secret it can't be assumed by GHA, and the inline SSM perms are scoped to one instance. + +#### Out of scope for issue #101 + +Per [issue #101](https://github.com/litentry/agentKeys/issues/101) "Out of scope": + +- **Prod broker auto-deploy** — never. The prod broker EC2 stays manual via `bash scripts/setup-broker-host.sh --upgrade` from the operator laptop, per CLAUDE.md "Remote broker host (single entry point)". +- **Auto-deploy of test Heima EVM contracts** — deferred to a follow-up PR (issue #101 rollout plan step 7). Contract redeploys mint new addresses and require the `SECRETS_REWRITE_PAT` token to update six `TEST_*_ADDRESS_HEIMA` secrets — more risk than the broker deploy, so it ships separately. +- **Mainnet prod contract redeploy** — never automatic. Manual via `bash scripts/setup-heima.sh` only. + ## What the workflow does on every run 1. Restores submodules + Rust toolchain + Foundry + cargo cache. diff --git a/docs/spec/broker-and-operator-dev-guide.md b/docs/spec/broker-and-operator-dev-guide.md new file mode 100644 index 0000000..88dcae8 --- /dev/null +++ b/docs/spec/broker-and-operator-dev-guide.md @@ -0,0 +1,336 @@ +# Broker + Local Operator Dev Guide + +**Audience:** developers iterating on the broker, the workers, or the operator-side scripts (`harness/`, `scripts/heima-*.sh`). 
+**Scope:** the inner edit-build-test loop — running the broker stack on your laptop, exercising it with operator scripts, and knowing which knob to turn when something breaks. + +This guide is **not** the environment bootstrap doc (see [`docs/dev-setup.md`](../dev-setup.md)) or the deploy-to-real-host runbook (see [`docs/operator-runbook-stage7.md`](../operator-runbook-stage7.md)). Read those first if you have a fresh machine or you're standing up a new broker EC2. + +--- + +## 1. The local stack at a glance + +The deployed broker host runs several cooperating processes on one EC2. For local dev you run the same processes (listed in the table below) on `localhost`, on the same ports, with the same env contract. Same code path — only the env values change. + +| Process | Default port | Crate | Purpose | Local-dev role | +|---|---|---|---|---| +| `agentkeys-mock-server` | `:8090` | `agentkeys-mock-server` | v0 backend; mirrors the Heima parachain extrinsic surface | Stand-in for the chain RPC + the legacy session-validation backend | +| `agentkeys-broker-server` | `:8091` | `agentkeys-broker-server` | The credential broker — auth, cap-mint, OIDC issuer | The component you're most often editing | +| `agentkeys-signer` (dev_key_service) | `:8092` | `agentkeys-broker-server` (same binary, different listener) | EVM keypair derivation from `omni_account` via HKDF | Stub for the future TEE signer (see [`signer-protocol.md`](./signer-protocol.md)) | +| `agentkeys-worker-audit` | `:9092` | `agentkeys-worker-audit` | Merkle-root batching for credential audit | Only matters if you're touching audit code | +| `agentkeys-worker-email` | `:9093` | `agentkeys-worker-email` | Inbound email handler (SES → cap-mint trigger) | Only matters for email-link auth | +| `agentkeys-worker-creds` | `:9094` | `agentkeys-worker-creds` | Credential store — STS + S3 PrincipalTag-scoped | The data plane the cap-mint flow leads to | +| `agentkeys-worker-memory` | `:9095` | `agentkeys-worker-memory` | Memory store — STS + S3 (per-actor
isolation) | Symmetric with creds | + +In the deployed stack `nginx` fronts the broker + signer + 4 workers on `:443` with public hostnames. Locally you talk to the ports directly — no nginx, no TLS. + +--- + +## 2. First-time local-stack bring-up + +After [`docs/dev-setup.md`](../dev-setup.md) §1–§2 (rust, jj, node, `cargo build --workspace --release`), generate the broker's two ES256 keypairs once: + +```bash +mkdir -p ~/.agentkeys/broker +cargo run -q --release -p agentkeys-broker-server -- keygen --purpose oidc --out ~/.agentkeys/broker/oidc-keypair.json +cargo run -q --release -p agentkeys-broker-server -- keygen --purpose session --out ~/.agentkeys/broker/session-keypair.json +chmod 600 ~/.agentkeys/broker/{oidc,session}-keypair.json +``` + +These are the only persistent local state the broker needs. Treat them like any other dev secret — kept under `~/.agentkeys/`, gitignored at the home-directory level, never copied off your laptop. Regenerating them invalidates every previously-derived wallet that depended on the matching session pubkey, so don't `rm` them mid-session. + +--- + +## 3. Inner loop A — edit broker code + +The broker reads its config from env vars and the two keypair files. Source a dev env file once per shell, then iterate with `cargo run`. + +### 3.1 The dev env + +Create `scripts/broker.dev.env` (gitignored — copy + edit from `scripts/broker.env`): + +```bash +# Local-dev broker env — everything points at localhost. 
+ACCOUNT_ID=000000000000 # placeholder; AWS calls go to mock backend +BROKER_DATA_ROLE_ARN=arn:aws:iam::000000000000:role/dev # never assumed in local dev +BROKER_AWS_REGION=us-east-1 # any region; not actually hit +BROKER_OIDC_ISSUER=http://127.0.0.1:8091 # matches --bind/--port below +BROKER_OIDC_KEYPAIR_PATH=$HOME/.agentkeys/broker/oidc-keypair.json +BROKER_SESSION_KEYPAIR_PATH=$HOME/.agentkeys/broker/session-keypair.json +BROKER_AUTH_METHODS=wallet_sig,email_link +BROKER_AUDIT_ANCHORS=sqlite # sqlite store; never writes to chain +BROKER_EMAIL_SENDER=stub # in-memory; no SES, no AWS creds needed +BROKER_EMAIL_FROM_ADDRESS=dev@localhost +BROKER_BACKEND_URL=http://127.0.0.1:8090 # points at the local mock-server below + +# dev_key_service signer (issue #74 step 1b) +DEV_KEY_SERVICE_MASTER_SECRET=local-dev-secret-32-bytes-min-length-please +``` + +Three lines matter most for local dev: + +- `BROKER_EMAIL_SENDER=stub` — skips SES; magic-link tokens land in an in-process `Vec` that you read back via the test harness or a `curl`-driven `/v1/auth/email/list-pending` endpoint (broker test feature). +- `BROKER_AUDIT_ANCHORS=sqlite` — every audit row lands in a local SQLite file; nothing hits the chain. Set to `evm_testnet` ONLY when you've built with `--features audit-evm` AND you actually want to test the on-chain anchor path (Phase C, not shipped as of PR #102). +- `BROKER_BACKEND_URL` — the broker calls a "backend" for legacy session validation (the v0 mock-server, or a real chain backend in v0.2+). In local dev this points at `agentkeys-mock-server :8090` started in §3.3 below. + +### 3.2 Build the broker with the right features + +`cargo run` defaults to debug + workspace default features. The broker MUST be built with `--features auth-email-link` if `BROKER_AUTH_METHODS` includes `email_link` (which the dev env above does) — otherwise the broker boot-fails with `BROKER_AUTH_METHODS="email_link": unknown or feature-gated-out auth method`. 
+ +```bash +# Iteration build (~10s warm, ~3min cold): +cargo build -p agentkeys-broker-server --features auth-email-link + +# Or release for cycle-accurate testing (~30s warm, ~5min cold): +cargo build --release -p agentkeys-broker-server --features auth-email-link +``` + +Cargo footgun (per [`scripts/setup-broker-host.sh:547`](../../scripts/setup-broker-host.sh)): never combine `-p agentkeys-broker-server -p agentkeys-mock-server --features auth-email-link` — cargo silently drops the feature flag. Always build the two binaries in separate `cargo build` invocations. + +### 3.3 Run the three foreground processes + +Three terminals. Source the dev env in each; pass `--bind 127.0.0.1 --port <port>`: + +```bash +# Terminal 1 — mock-server (v0 backend the broker talks to) +set -a; source scripts/broker.dev.env; set +a +cargo run --release -p agentkeys-mock-server -- --bind 127.0.0.1 --port 8090 + +# Terminal 2 — broker (your usual edit target) +set -a; source scripts/broker.dev.env; set +a +RUST_LOG=info,agentkeys_broker_server=debug \ + cargo run --release -p agentkeys-broker-server --features auth-email-link -- \ + --bind 127.0.0.1 --port 8091 + +# Terminal 3 — signer (dev_key_service; serves /dev/derive-address + /dev/sign-*) +set -a; source scripts/broker.dev.env; set +a +cargo run --release -p agentkeys-broker-server -- \ + --bind 127.0.0.1 --port 8092 --signer-only +``` + +The signer is the SAME binary as the broker (`agentkeys-broker-server`) with `--signer-only` — it serves only `/dev/*` + `/healthz` and shares the keypair files with the broker process on `:8091`. + +Skip workers (`agentkeys-worker-{audit,email,creds,memory}` on `:9092-:9095`) until you're editing them — the broker's hot path doesn't require them for most flows. + +### 3.4 Sanity check + +```bash +curl -s http://127.0.0.1:8091/healthz # → "ok" +curl -s http://127.0.0.1:8091/.well-known/openid-configuration | jq . # OIDC discovery doc +curl -s http://127.0.0.1:8091/.well-known/jwks.json | jq . # broker's JWKS +``` + +If healthz returns `ok` but the JWKS is empty, the keypair files aren't being read — check the paths in your dev env. If the broker boot-fails with `BROKER_AUTH_METHODS=email_link: unknown`, you forgot `--features auth-email-link` on the cargo build. + +### 3.5 Hot-reload loop + +There's no `cargo watch` in the workspace, but the dev loop is fast enough without it: + +1. Edit Rust in `crates/agentkeys-broker-server/src/...`. +2. `Ctrl-C` Terminal 2's broker. +3. Re-run the `cargo run -p agentkeys-broker-server ...` command from §3.3 (shell history is your friend). +4. The first re-run rebuilds the broker (~10s incremental); subsequent runs reuse the artifact.
+ +For a tighter loop while editing a single module, write a unit test next to the module and use `cargo test -p agentkeys-broker-server <test_name>` — typically <2s per iteration. + +--- + +## 4. Inner loop B — edit operator scripts + +The operator-side scripts (`harness/v2-stage{1,2,3}-demo.sh`, `scripts/heima-*.sh`, `scripts/agentkeys-*-demo.sh`) are the dev loop for the *operator workflow*: cap-mint, identity bootstrap, scope grants, S3 isolation tests. They run on your laptop and call the broker (local or remote) via plain HTTP + `cast` + `aws`. + +### 4.1 Point the operator env at the local broker + +Create `scripts/operator-workstation.dev.env` (gitignored — copy + edit from `scripts/operator-workstation.env`): + +```bash +# Local-dev operator env — points the harness scripts at localhost +ACCOUNT_ID=000000000000 +REGION=us-east-1 +BROKER_HOST=127.0.0.1:8091 +OIDC_ISSUER=http://127.0.0.1:8091 +AGENTKEYS_SIGNER_URL=http://127.0.0.1:8092 +BACKEND_URL=http://127.0.0.1:8090 + +# Local-stack workers (skip these until you wire them up — broker hot path doesn't need them) +AGENTKEYS_WORKER_AUDIT_URL=http://127.0.0.1:9092 +AGENTKEYS_WORKER_EMAIL_URL=http://127.0.0.1:9093 +AGENTKEYS_WORKER_CRED_URL=http://127.0.0.1:9094 +AGENTKEYS_WORKER_MEMORY_URL=http://127.0.0.1:9095 + +# Local chain backbone — pick ONE based on what you're testing: +# anvil — fully local (forge anvil running on 127.0.0.1:8545); fastest +# heima-paseo — Heima testnet; real chain, no real money +# heima — Heima mainnet (production); use with care +AGENTKEYS_CHAIN=anvil +``` + +### 4.2 Run the canonical inner-loop demo + +[`harness/v2-stage1-demo.sh`](../../harness/v2-stage1-demo.sh) is the end-to-end exerciser most operator edits land against. It's a 13-step script: install CLI → email-link init → identity bootstrap → S3 envelope smoke test → chain bring-up → device register → agent create → scope grant → K11 enroll → cap-mint roundtrip.
+ +```bash +set -a; source scripts/operator-workstation.dev.env; set +a + +# Full demo against local stack: +bash harness/v2-stage1-demo.sh --chain anvil + +# Re-run just one step you're iterating on: +bash harness/v2-stage1-demo.sh --only-step 7 + +# Skip the slow bits (CLI build, chain deploy, S3 provisioning): +bash harness/v2-stage1-demo.sh --skip-build --skip-deploy --skip-provision + +# Stop after a specific step (useful when bisecting a regression): +bash harness/v2-stage1-demo.sh --to-step 5 +``` + +The `--from-step N` / `--to-step N` / `--only-step N` triad is the inner-loop primitive — every step prints `[step N/M]` to stderr, every step is idempotent. If step 7 fails after a script edit, fix the script, re-run with `--from-step 7`, you keep the work from steps 1–6. + +### 4.3 Anvil for fully-local chain dev + +When you don't want to talk to Heima at all, run [foundry](https://book.getfoundry.sh/anvil/) anvil locally: + +```bash +# Terminal 4 — local EVM (anvil) on :8545 +anvil --chain-id 31337 --port 8545 +``` + +Then `AGENTKEYS_CHAIN=anvil` in your operator env makes every `cast send` hit anvil instead of Heima. The deployer wallet is whichever anvil-prefunded key you point at via `HEIMA_DEPLOYER_KEY` / `HEIMA_DEPLOYER_KEY_FILE`. Anvil's mempool is single-tenant — none of the [PR #102 nonce-contention issues](./plans/issue-101-ci-auto-deploy.md) bite locally. + +### 4.4 Editing `setup-broker-host.sh` + +`scripts/setup-broker-host.sh` is the canonical "single entry point" for the broker EC2 (per CLAUDE.md "Remote broker host (single entry point)" policy). When you change it, the unit-test is to dry-run it on a throwaway VM, but the practical inner loop is: + +1. Edit the script. +2. `bash -n scripts/setup-broker-host.sh` — syntax check. +3. SSH into the test broker EC2 (`bash scripts/ssh-broker.sh`), `cd ~/agentKeys`, `git pull`, `bash scripts/setup-broker-host.sh --test --yes` — exercise the full path. +4. 
**Or** push to your PR branch and let the [CI auto-deploy](#5-inner-loop-c--ci-auto-deploy-issue-101) (PR #102) drive it on the test EC2. + +Step 4 is usually faster — no SSH, you get fresh logs in the GHA run, and the harness validates the deploy end-to-end. + +--- + +## 5. Inner loop C — CI auto-deploy (issue #101) + +Per [PR #102](https://github.com/litentry/agentKeys/pull/102), pushing broker-affecting changes to a PR branch auto-deploys to the test EC2 via SSM and runs the full harness against the freshly-deployed broker. You see broker bugs in your own PR, not the next operator's. + +What counts as "broker-affecting" — the path-filter list in [`.github/workflows/harness-ci.yml`](../../.github/workflows/harness-ci.yml): + +``` +crates/agentkeys-broker-server/** +crates/agentkeys-worker-*/** +crates/agentkeys-signer-protocol/** +crates/agentkeys-types/** +crates/agentkeys-core/** +scripts/setup-broker-host.sh +scripts/setup-broker-host.sh.d/** +scripts/broker.env +scripts/broker.test.env +Cargo.toml +Cargo.lock +``` + +A PR that touches none of these paths skips the deploy and leaves the test EC2 untouched. Auto-deploy is also opt-in (gated on `OIDC_AWS_ROLE_ARN_DEPLOY` + `TEST_BROKER_INSTANCE_ID` repo secrets — see [`docs/ci-setup.md`](../ci-setup.md) §7). + +To dry-run the deploy without a broker code change, dispatch manually with the override: + +```bash +gh workflow run harness-ci.yml --repo litentry/agentKeys \ + --ref <your-pr-branch> \ + --field stage=1 \ + --field force_deploy_broker=true +``` + +--- + +## 6. Config-file map — which file controls what + +Three files, three audiences. The "is the broker reading the right thing" debug usually comes down to which one you sourced.
+ +| File | Where it lives | Who reads it | Local-dev override | +|---|---|---|---| +| [`scripts/broker.env`](../../scripts/broker.env) | **Broker host** (EC2 or your laptop's broker process) | `agentkeys-broker-server` (every entry has a matching constant in `crates/agentkeys-broker-server/src/env.rs`) | `scripts/broker.dev.env` (gitignored, copied from `broker.env`, swap hosts to `127.0.0.1`) | +| [`scripts/operator-workstation.env`](../../scripts/operator-workstation.env) | **Operator laptop** | Every `harness/` + `scripts/heima-*.sh` script | `scripts/operator-workstation.dev.env` (gitignored, swap hosts to `127.0.0.1:809x`) | +| [`scripts/broker.test.env`](../../scripts/broker.test.env) | **Test broker host** (CI auto-deploy target) | `agentkeys-broker-server` running on the test EC2 | Same shape as `broker.env`; CI workflow materializes per-run values into this on the runner | + +Mixing them on the wrong host is the most common config bug. The broker host should NEVER source `operator-workstation.env` — that file has AWS admin tooling vars (BUCKET, OIDC_PROVIDER_ARN) that don't exist as broker-server env vars and would silently shadow what the broker actually reads. + +--- + +## 7. Debugging cheatsheet + +### 7.1 Logs + +The broker uses `tracing_subscriber` with `EnvFilter` ([`crates/agentkeys-broker-server/src/main.rs:73`](../../crates/agentkeys-broker-server/src/main.rs)). Control via `RUST_LOG`: + +```bash +# Default — only INFO and above +cargo run -p agentkeys-broker-server -- ... + +# Verbose for the broker, quiet for everything else +RUST_LOG=info,agentkeys_broker_server=debug cargo run -p agentkeys-broker-server -- ... + +# Trace-level for one specific module +RUST_LOG=info,agentkeys_broker_server::handlers::cap=trace cargo run -p agentkeys-broker-server -- ... 
+``` + +On the deployed broker, logs go to systemd journal: + +```bash +ssh broker journalctl -u agentkeys-broker --since '5 min ago' -f +ssh broker journalctl -u agentkeys-signer --since '5 min ago' -f +``` + +### 7.2 Port collisions + +If `cargo run` errors with `Address already in use`, find the stuck process: + +```bash +lsof -nP -iTCP:8091 -sTCP:LISTEN # broker +lsof -nP -iTCP:8090 -sTCP:LISTEN # mock-server +lsof -nP -iTCP:8092 -sTCP:LISTEN # signer +``` + +Kill by PID (the only `kill -9` you should reach for during dev) or by name: `pkill -f agentkeys-broker-server`. + +### 7.3 The broker boots, then immediately exits + +Common shapes: + +| Symptom | Cause | Fix | +|---|---|---| +| `BROKER_AUTH_METHODS="email_link": unknown or feature-gated-out auth method` | Built without `--features auth-email-link` | Re-build with the feature; see §3.2 | +| `failed to read OIDC keypair: No such file` | `BROKER_OIDC_KEYPAIR_PATH` doesn't exist | Re-run the `keygen` from §2 | +| `BROKER_BACKEND_URL=http://127.0.0.1:8090: connection refused` | Mock-server isn't running on `:8090` | Start it (Terminal 1 in §3.3) | +| Broker logs are silent | `RUST_LOG` unset and the default filter is too quiet for what you want | Add `RUST_LOG=debug` to your `cargo run` command | +| `SES GetEmailIdentity: AccessDenied` | `BROKER_EMAIL_SENDER=ses` but no AWS creds in the shell | Set `BROKER_EMAIL_SENDER=stub` for local dev | + +### 7.4 The harness fails at a specific step + +Re-run with `--from-step N` to keep prior progress, OR `--only-step N` to test one step in isolation. Every step is idempotent — re-running a passed step is a no-op. If `--only-step 7` fails the same way as the full run, the bug is in that step's script; if it passes, the bug is cross-step state that the previous steps mutated. + +--- + +## 8. Chain profile selection + +`AGENTKEYS_CHAIN` controls which RPC + which contract addresses every harness script talks to. 
Default in `v2-stage1-demo.sh` is `heima-paseo`; common alternates: + +| Profile | RPC | When to use | Cost | +|---|---|---|---| +| `anvil` | `http://127.0.0.1:8545` | Fully local; fastest iteration; no real-world side effects | Free | +| `heima-paseo` | Heima testnet | Real-chain semantics without real-money cost; default for `v2-stage1-demo.sh` | Testnet HEI (free from faucet) | +| `heima` | Heima mainnet | The canonical chain; matches what CI's harness-e2e runs against | Real HEI — small per-run cost | + +Switch with `--chain` on any harness script. Contract addresses for `heima` and `heima-paseo` live in [`scripts/operator-workstation.env`](../../scripts/operator-workstation.env); add `anvil` ones by running `bash scripts/setup-heima.sh --chain anvil --from-step 4 --to-step 8` after starting your local anvil. + +--- + +## 9. Related docs + +- [`docs/arch.md`](../arch.md) — single source of truth for component inventory + trust boundaries. +- [`docs/dev-setup.md`](../dev-setup.md) — first-time machine bootstrap (rust, jj, node, AWS CLI, browser). +- [`docs/operator-runbook-stage7.md`](../operator-runbook-stage7.md) — deploy-to-real-EC2 walkthrough (manual; not for local dev). +- [`docs/ci-setup.md`](../ci-setup.md) — no-LLM CI + auto-deploy of test broker (issue #101 / PR #102). +- [`docs/spec/signer-protocol.md`](./signer-protocol.md) — wire contract for the signer (TEE swap-in target). +- [`docs/spec/credential-backend-interface.md`](./credential-backend-interface.md) — the `CredentialBackend` trait; what the broker's storage plug-ins must implement. +- [`docs/spec/plans/development-stages.md`](./plans/development-stages.md) — the staged build plan + harness gates. 
diff --git a/scripts/heima-agent-create.sh b/scripts/heima-agent-create.sh index b8c1859..4848b60 100755 --- a/scripts/heima-agent-create.sh +++ b/scripts/heima-agent-create.sh @@ -200,13 +200,27 @@ if [ "$DRY_RUN" = "1" ]; then exit 0 fi +# Resolve PENDING nonce for the master wallet — same protection as the +# heima-fund-account.sh fix in PR #102. If the prior run's registerAgentDevice +# tx is still in the mempool, the default `latest` nonce derivation collides. +PENDING_NONCE=$(cast nonce "$MASTER_ADDR" --rpc-url "$RPC_HTTP" --block pending 2>/dev/null || echo "") +if [ -n "$PENDING_NONCE" ]; then + log "pending nonce for master = $PENDING_NONCE" + CAST_ARGS+=(--nonce "$PENDING_NONCE") +fi + log "Submitting registerAgentDevice tx via cast send …" set +e CAST_OUT=$(cast "${CAST_ARGS[@]}" 2>&1) CAST_RC=$? set -e if [ "$CAST_RC" != "0" ]; then - echo " cast send FAILED (exit $CAST_RC). Output:" >&2 + if printf '%s\n' "$CAST_OUT" | grep -qi "replacement transaction underpriced"; then + echo " cast send FAILED: prior tx with same nonce is pending in Heima mempool." >&2 + echo " Wait ~1 minute and re-run. Output:" >&2 + else + echo " cast send FAILED (exit $CAST_RC). Output:" >&2 + fi echo "$CAST_OUT" >&2 exit 1 fi diff --git a/scripts/heima-fund-account.sh b/scripts/heima-fund-account.sh index 55fd01a..aaa102d 100755 --- a/scripts/heima-fund-account.sh +++ b/scripts/heima-fund-account.sh @@ -125,15 +125,38 @@ if [ "$DRY_RUN" = "1" ]; then exit 0 fi +# Resolve PENDING nonce (defends against the race where a prior run's funding +# tx is still in the mempool — cast's default `latest` nonce derivation would +# collide with the stuck pending tx, surfacing as +# `replacement transaction underpriced`. PR #102 / codex adversarial review.) 
+log "Resolving pending nonce for $DEPLOYER_ADDR" +PENDING_NONCE=$(cast nonce "$DEPLOYER_ADDR" --rpc-url "$RPC_HTTP" --block pending 2>/dev/null || echo "") +if [ -z "$PENDING_NONCE" ]; then + warn "could not resolve pending nonce — proceeding without explicit --nonce (cast will use latest)" + NONCE_ARGS=() +else + ok "pending nonce = $PENDING_NONCE" + NONCE_ARGS=(--nonce "$PENDING_NONCE") +fi + log "Submitting transfer via cast send …" set +e SEND_OUT=$(cast send "$TO_ADDR" --value "$AMOUNT_WEI" \ --rpc-url "$RPC_HTTP" --chain-id "$LIVE_CHAIN_ID" \ + "${NONCE_ARGS[@]}" \ --private-key "$DEPLOYER_KEY" 2>&1) SEND_RC=$? set -e if [ "$SEND_RC" != "0" ]; then - echo " cast send FAILED (exit $SEND_RC). Output:" >&2 + # Surface the underpriced-replacement case with a specific remediation — + # the broader workflow-level concurrency lock SHOULD prevent this from + # firing for parallel runs, but a stuck mempool tx still trips it. + if printf '%s\n' "$SEND_OUT" | grep -qi "replacement transaction underpriced"; then + echo " cast send FAILED: prior tx with same nonce is pending in Heima mempool." >&2 + echo " Wait ~1 minute for it to confirm or drop, then re-run. Output:" >&2 + else + echo " cast send FAILED (exit $SEND_RC). Output:" >&2 + fi echo "$SEND_OUT" >&2 exit 1 fi diff --git a/scripts/provision-ci-deploy-role.sh b/scripts/provision-ci-deploy-role.sh new file mode 100755 index 0000000..66b3475 --- /dev/null +++ b/scripts/provision-ci-deploy-role.sh @@ -0,0 +1,564 @@ +#!/usr/bin/env bash +# scripts/provision-ci-deploy-role.sh — idempotent creation of the +# `github-actions-agentkeys-deploy` IAM role that lets the no-LLM CI +# workflow drive `setup-broker-host.sh --test --yes` on the test broker +# EC2 via AWS Systems Manager (SSM). 
+# +# Per arch.md trust posture (issue #101): the role is reachable ONLY +# via GitHub Actions OIDC from the `litentry/agentKeys` repo, and its +# inline policy is scoped to: +# - `ssm:SendCommand` on document/AWS-RunShellScript + the ONE test +# broker instance ARN — so even if the role were stolen, the worst +# it can do is queue a shell command on that single EC2. +# - `ssm:GetCommandInvocation` + `ssm:ListCommandInvocations` for +# status polling (no resource scope, read-only). +# - `ec2:DescribeInstances` so the workflow can sanity-check the +# instance is reachable before sending the command. +# +# Why a separate role from `github-actions-agentkeys-e2e`: +# - The e2e role's perms (sts:AssumeRole on test data roles + S3 +# verify) are read/write into the test environment AS the workload. +# - The deploy role's perms (ssm:SendCommand on the broker EC2) are +# control-plane: it tells the EC2 to re-deploy the broker binary. +# - Separation of duties: a compromise of CI's e2e creds cannot +# trigger a broker re-deploy, and vice versa. +# +# Out of scope (stays manual per CLAUDE.md "Remote broker host (single +# entry point)" + "Idempotent remote-setup rule (CLOUD)"): +# - The PROD broker EC2 (broker.litentry.org) — no auto-deploy ever. +# - The Heima EVM PROD contract redeploy — never automatic. +# +# Required env (sourced from $ENV_FILE): +# - ACCOUNT_ID +# - REGION +# Required CLI flags: +# - --test-broker-instance-id i-xxxxxxxxx (the EC2 hosting the test broker) +# Optional CLI flags: +# - --repo litentry/agentKeys (default; pinned in OIDC sub condition) +# - --role-name github-actions-agentkeys-deploy (default) +# - --env-file scripts/operator-workstation.test.env (default) +# - --fix-ssm Auto-attach AmazonSSMManagedInstanceCore to the broker EC2's +# instance profile role if the SSM agent is offline, then poll +# for up to 3 min waiting for the agent to refresh creds. 
+# Safe to pass on every run (idempotent: aws iam attach-role-policy +# no-ops on re-attach, and the auto-attach is gated on PingStatus +# != Online so a healthy EC2 is untouched). +# - --dry-run (print planned changes; no AWS calls that mutate state) +# +# Required AWS profile: agentkeys-admin (the script checks caller ARN). +# +# Outcomes per step (matches the idempotent-remote-setup rule shape): +# - `ok proceeding` → mutation applied +# - `skip ` → no-op (e.g. role already present + trust matches) +# - `fail ` → hard error, exit non-zero + +set -euo pipefail + +# ─── CLI parse ──────────────────────────────────────────────────────────────── +DRY_RUN=0 +FIX_SSM=0 +TEST_BROKER_INSTANCE_ID="" +REPO_SLUG="litentry/agentKeys" +ROLE_NAME="github-actions-agentkeys-deploy" +SSM_POLICY_NAME="agentkeys-ci-deploy-ssm" +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +ENV_FILE="${ENV_FILE:-$REPO_ROOT/scripts/operator-workstation.test.env}" + +while [ $# -gt 0 ]; do + case "$1" in + --test-broker-instance-id) TEST_BROKER_INSTANCE_ID="$2"; shift 2 ;; + --repo) REPO_SLUG="$2"; shift 2 ;; + --role-name) ROLE_NAME="$2"; shift 2 ;; + --env-file) ENV_FILE="$2"; shift 2 ;; + --fix-ssm) FIX_SSM=1; shift ;; + --dry-run) DRY_RUN=1; shift ;; + --help|-h) + sed -n '2,/^set -euo/p' "$0" | sed 's/^# \{0,1\}//' | sed '$d'; exit 0 ;; + *) echo "unknown flag: $1 (try --help)" >&2; exit 2 ;; + esac +done + +# ─── Logging primitives (mirrors provision-vault-role.sh) ───────────────────── +if [ -t 2 ]; then + C_HEAD='\033[1;36m'; C_OK='\033[1;32m'; C_SKIP='\033[1;33m' + C_WARN='\033[1;33m'; C_ERR='\033[1;31m'; C_RESET='\033[0m' +else + C_HEAD=''; C_OK=''; C_SKIP=''; C_WARN=''; C_ERR=''; C_RESET='' +fi +log() { printf "${C_HEAD}==>${C_RESET} %s\n" "$*" >&2; } +ok() { printf " ${C_OK}ok${C_RESET} %s\n" "$*" >&2; } +skip() { printf " ${C_SKIP}skip${C_RESET} %s\n" "$*" >&2; } +warn() { printf " ${C_WARN}warn${C_RESET} %s\n" "$*" >&2; } +die() { printf " ${C_ERR}fail${C_RESET} %s\n" "$*" >&2; exit 1; 
} + +# ─── Preconditions ──────────────────────────────────────────────────────────── +[ -f "$ENV_FILE" ] || die "missing $ENV_FILE (pass --env-file to override)" +set -a; . "$ENV_FILE"; set +a + +ACCOUNT_ID="${ACCOUNT_ID:?ACCOUNT_ID required in $ENV_FILE}" +REGION="${REGION:?REGION required in $ENV_FILE}" + +[ -n "$TEST_BROKER_INSTANCE_ID" ] \ + || die "missing --test-broker-instance-id (look up via: aws ec2 describe-instances --region $REGION --filters 'Name=tag:Name,Values=agentkeys-test-broker' --query 'Reservations[0].Instances[0].InstanceId')" + +[[ "$TEST_BROKER_INSTANCE_ID" =~ ^i-[0-9a-f]{8,17}$ ]] \ + || die "instance ID shape invalid: $TEST_BROKER_INSTANCE_ID (expected i-<8-17 hex chars>)" + +[[ "$REPO_SLUG" =~ ^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$ ]] \ + || die "repo slug shape invalid: $REPO_SLUG (expected owner/repo)" + +command -v jq >/dev/null || die "jq not found in PATH (brew install jq)" +command -v aws >/dev/null || die "aws CLI not found in PATH" + +# Caller identity must be agentkeys-admin (matches the rest of the provision-* +# scripts; lowercase compare because the live IAM user is `agentKeys-admin`). 
+caller_arn=$(aws sts get-caller-identity --query Arn --output text 2>&1) \ + || die "aws sts get-caller-identity failed: $caller_arn" +arn_lc=$(printf '%s' "$caller_arn" | tr '[:upper:]' '[:lower:]') +case "$arn_lc" in + *":user/agentkeys-admin"*) ok "caller is admin: $caller_arn" ;; + *) die "caller is $caller_arn — needs agentkeys-admin (try: awsp agentkeys-admin)" ;; +esac + +# ─── Step 1: ensure the GitHub Actions OIDC provider exists in the account ─── +log "OIDC provider: token.actions.githubusercontent.com" +gha_provider_arn="arn:aws:iam::${ACCOUNT_ID}:oidc-provider/token.actions.githubusercontent.com" +if aws iam get-open-id-connect-provider --open-id-connect-provider-arn "$gha_provider_arn" >/dev/null 2>&1; then + skip "GHA OIDC provider already registered" +else + if [ "$DRY_RUN" = "1" ]; then + log "DRY RUN — would create-open-id-connect-provider for token.actions.githubusercontent.com" + else + # Thumbprint per GitHub's published cert (matches docs/ci-setup.md §4 note). + # If the cert chain rolls, this needs a refresh; AWS rejects mismatches. + aws iam create-open-id-connect-provider \ + --url https://token.actions.githubusercontent.com \ + --client-id-list sts.amazonaws.com \ + --thumbprint-list 6938fd4d98bab03faadb97b34396831e3780aea1 \ + >/dev/null \ + || die "create-open-id-connect-provider failed" + ok "GHA OIDC provider registered" + fi +fi + +# ─── Step 2: trust policy ───────────────────────────────────────────────────── +# Federated on the GHA OIDC provider, scoped to the litentry/agentKeys repo. +# `StringLike` on `sub` lets PR branches AND `refs/heads/*` push events +# trigger; the workflow itself is the second gate (path filter + concurrency). +# +# To tighten further later (e.g. main-branch-only deploys), change the StringLike +# pattern to `repo:litentry/agentKeys:ref:refs/heads/evm` or similar. 
+trust_policy=$(jq -n \
+  --arg provider "$gha_provider_arn" \
+  --arg sub_pattern "repo:${REPO_SLUG}:*" \
+  '{
+    Version: "2012-10-17",
+    Statement: [{
+      Effect: "Allow",
+      Principal: { Federated: $provider },
+      Action: "sts:AssumeRoleWithWebIdentity",
+      Condition: {
+        StringEquals: {
+          "token.actions.githubusercontent.com:aud": "sts.amazonaws.com"
+        },
+        StringLike: {
+          "token.actions.githubusercontent.com:sub": $sub_pattern
+        }
+      }
+    }]
+  }')
+
+# ─── Step 3: role existence ──────────────────────────────────────────────────
+log "Role existence: $ROLE_NAME"
+if aws iam get-role --role-name "$ROLE_NAME" >/dev/null 2>&1; then
+  skip "role already exists"
+  if [ "$DRY_RUN" = "1" ]; then
+    log "DRY RUN — would update-assume-role-policy with: $trust_policy"
+  else
+    log "Refreshing trust policy (idempotent; sub pattern: repo:${REPO_SLUG}:*)"
+    aws iam update-assume-role-policy \
+      --role-name "$ROLE_NAME" \
+      --policy-document "$trust_policy" \
+      || die "update-assume-role-policy failed"
+    ok "trust policy refreshed"
+  fi
+else
+  if [ "$DRY_RUN" = "1" ]; then
+    log "DRY RUN — would create-role $ROLE_NAME with trust: $trust_policy"
+  else
+    log "Creating role $ROLE_NAME"
+    # IAM CreateRole --description allows only printable ASCII + Latin-1
+    # (regex [\t\n\r\x20-\x7e\xa1-\xff]*). Em-dash / en-dash / arrows trip
+    # "Value at 'description' failed to satisfy constraint" at AWS-call time.
+    # Keep this string ASCII-only.
+    aws iam create-role \
+      --role-name "$ROLE_NAME" \
+      --assume-role-policy-document "$trust_policy" \
+      --description "CI deploy role - drives setup-broker-host.sh on the test EC2 via SSM (issue #101)" \
+      >/dev/null \
+      || die "create-role failed"
+    ok "role created"
+  fi
+fi
+
+# ─── Step 4: inline SSM policy ───────────────────────────────────────────────
+# Narrow on purpose: SendCommand is limited to the document + the ONE instance
+# ARN (nothing in prod, nothing on other EC2s). ec2:DescribeInstances cannot be
+# scoped per instance (no resource-level permissions), so it is pinned by region.
+instance_arn="arn:aws:ec2:${REGION}:${ACCOUNT_ID}:instance/${TEST_BROKER_INSTANCE_ID}"
+ssm_document_arn="arn:aws:ssm:${REGION}::document/AWS-RunShellScript"
+
+inline_policy=$(jq -n \
+  --arg doc_arn "$ssm_document_arn" \
+  --arg inst_arn "$instance_arn" \
+  --arg region "$REGION" \
+  '{
+    Version: "2012-10-17",
+    Statement: [
+      {
+        Sid: "SendShellCommandToTestBrokerOnly",
+        Effect: "Allow",
+        Action: "ssm:SendCommand",
+        Resource: [$doc_arn, $inst_arn]
+      },
+      {
+        Sid: "PollCommandStatus",
+        Effect: "Allow",
+        Action: [
+          "ssm:GetCommandInvocation",
+          "ssm:ListCommandInvocations",
+          "ssm:DescribeInstanceInformation"
+        ],
+        Resource: "*"
+      },
+      {
+        Sid: "DescribeInstancesInBrokerRegion",
+        Effect: "Allow",
+        Action: "ec2:DescribeInstances",
+        Resource: "*",
+        Condition: {
+          StringEquals: {
+            "ec2:Region": [$region]
+          }
+        }
+      }
+    ]
+  }')
+
+log "Inline policy: $SSM_POLICY_NAME"
+if [ "$DRY_RUN" = "1" ]; then
+  log "DRY RUN — would put-role-policy: $inline_policy"
+else
+  aws iam put-role-policy \
+    --role-name "$ROLE_NAME" \
+    --policy-name "$SSM_POLICY_NAME" \
+    --policy-document "$inline_policy" \
+    || die "put-role-policy failed"
+  ok "inline policy applied ($(echo "$inline_policy" | jq '.Statement | length') statements; SendCommand scoped to $TEST_BROKER_INSTANCE_ID)"
+fi
+
+# ─── Step 5: verify the test broker EC2 is SSM-managed ───────────────────────
+# If the instance lacks AmazonSSMManagedInstanceCore (via its instance profile)
+# OR the SSM Agent isn't running, SendCommand will queue the command and time
+# out without delivering it. Fail fast here with a clear remediation path.
+#
+# With --fix-ssm, the script attempts auto-remediation:
+#   - Looks up the EC2's instance profile via DescribeInstances
+#   - Extracts the role name behind the profile
+#   - Attaches AmazonSSMManagedInstanceCore (idempotent: AWS no-ops on re-attach)
+#   - Re-polls PingStatus for up to 3 min waiting for the agent to refresh creds
+#   - If still offline after 3 min: tells operator to reboot or restart the agent
+#
+# The auto-attach is safe because the operator is already running as
+# agentkeys-admin (verified above) — they HAVE iam:AttachRolePolicy. Without
+# --fix-ssm the script just reports + exits (no IAM mutation, no surprises).
+# create_and_associate_ssm_profile: creates the dedicated SSM-only instance
+# profile + role and associates it with the EC2 instance. Used when the EC2 has
+# NO profile attached at all — common on test brokers spun up by setup-cloud.sh --test (the
+# broker process authenticates via static creds in /etc/agentkeys/broker.env,
+# so the EC2 was never given an instance profile).
+#
+# Why this is safe to add to an already-running broker:
+#   - The broker's app-layer AWS calls use AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY
+#     from broker.env explicitly; the static creds take precedence over IMDS.
+#   - Adding an IMDS-served instance profile cannot reduce capability — it only
+#     ADDS a credential source for processes that don't already have static creds
+#     (which on the broker EC2 = the SSM agent and not much else).
+#
+# Names:
+#   - Role:    agentkeys-test-broker-ssm
+#   - Profile: agentkeys-test-broker-ssm (same — conventional)
+#
+# Idempotent: every step is get-* pre-checked. Safe to call repeatedly.
+SSM_INSTANCE_ROLE_NAME="agentkeys-test-broker-ssm"
+SSM_INSTANCE_PROFILE_NAME="agentkeys-test-broker-ssm"
+
+create_and_associate_ssm_profile() {  # $1 = EC2 instance ID; returns 1 at the first failed step, 0 otherwise
+  local instance_id="$1"
+  local policy_arn="arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+
+  # ── Role ──
+  if aws iam get-role --role-name "$SSM_INSTANCE_ROLE_NAME" >/dev/null 2>&1; then
+    skip "role $SSM_INSTANCE_ROLE_NAME already exists"
+  else
+    log "Creating role $SSM_INSTANCE_ROLE_NAME (EC2 trust)"
+    local ec2_trust
+    ec2_trust=$(jq -n '{
+      Version: "2012-10-17",
+      Statement: [{
+        Effect: "Allow",
+        Principal: { Service: "ec2.amazonaws.com" },
+        Action: "sts:AssumeRole"
+      }]
+    }')
+    aws iam create-role \
+      --role-name "$SSM_INSTANCE_ROLE_NAME" \
+      --assume-role-policy-document "$ec2_trust" \
+      --description "Lets the test broker EC2 register with AWS SSM (issue #101)" \
+      >/dev/null \
+      || { warn "create-role failed"; return 1; }
+    ok "role $SSM_INSTANCE_ROLE_NAME created"
+  fi
+
+  # ── Managed policy attach (idempotent — AWS no-ops on re-attach) ──
+  local already_attached
+  already_attached=$(aws iam list-attached-role-policies \
+    --role-name "$SSM_INSTANCE_ROLE_NAME" \
+    --query "AttachedPolicies[?PolicyArn=='$policy_arn'].PolicyArn" \
+    --output text 2>/dev/null || echo "")
+  if [ -n "$already_attached" ]; then
+    skip "AmazonSSMManagedInstanceCore already attached to $SSM_INSTANCE_ROLE_NAME"
+  else
+    aws iam attach-role-policy \
+      --role-name "$SSM_INSTANCE_ROLE_NAME" \
+      --policy-arn "$policy_arn" \
+      || { warn "attach-role-policy failed"; return 1; }
+    ok "AmazonSSMManagedInstanceCore attached to $SSM_INSTANCE_ROLE_NAME"
+  fi
+
+  # ── Instance profile ──
+  if aws iam get-instance-profile --instance-profile-name "$SSM_INSTANCE_PROFILE_NAME" >/dev/null 2>&1; then
+    skip "instance profile $SSM_INSTANCE_PROFILE_NAME already exists"
+  else
+    log "Creating instance profile $SSM_INSTANCE_PROFILE_NAME"
+    aws iam create-instance-profile \
+      --instance-profile-name "$SSM_INSTANCE_PROFILE_NAME" \
+      >/dev/null \
+      || { warn "create-instance-profile failed"; return 1; }
+    ok "instance profile $SSM_INSTANCE_PROFILE_NAME created"
+  fi
+
+  # ── Add role to profile ──
+  local profile_role
+  profile_role=$(aws iam get-instance-profile \
+    --instance-profile-name "$SSM_INSTANCE_PROFILE_NAME" \
+    --query 'InstanceProfile.Roles[0].RoleName' \
+    --output text 2>/dev/null || echo "None")
+  if [ "$profile_role" = "$SSM_INSTANCE_ROLE_NAME" ]; then
+    skip "role already added to instance profile"
+  else
+    if [ "$profile_role" != "None" ] && [ -n "$profile_role" ]; then
+      warn "instance profile $SSM_INSTANCE_PROFILE_NAME currently holds role $profile_role (expected $SSM_INSTANCE_ROLE_NAME)"
+      warn "Refusing to swap — operator should reconcile manually."
+      return 1
+    fi
+    aws iam add-role-to-instance-profile \
+      --instance-profile-name "$SSM_INSTANCE_PROFILE_NAME" \
+      --role-name "$SSM_INSTANCE_ROLE_NAME" \
+      || { warn "add-role-to-instance-profile failed"; return 1; }
+    ok "added $SSM_INSTANCE_ROLE_NAME to instance profile"
+    # IAM is eventually consistent — newly-attached role may not show up in
+    # the EC2 associate API for a few seconds. Brief sleep here is the
+    # documented pattern (AWS docs: "may take up to 30s to propagate").
+    log "Waiting 15s for IAM eventual consistency"
+    sleep 15
+  fi
+
+  # ── Associate profile with EC2 ──
+  local current_profile_arn
+  current_profile_arn=$(aws ec2 describe-iam-instance-profile-associations \
+    --region "$REGION" \
+    --filters "Name=instance-id,Values=$instance_id" \
+    --query 'IamInstanceProfileAssociations[?State==`associated` || State==`associating`].IamInstanceProfile.Arn' \
+    --output text 2>/dev/null || echo "")
+  if [ -n "$current_profile_arn" ] && [ "$current_profile_arn" != "None" ]; then
+    skip "instance already has profile associated: $current_profile_arn"
+  else
+    log "Associating $SSM_INSTANCE_PROFILE_NAME with $instance_id"
+    aws ec2 associate-iam-instance-profile \
+      --region "$REGION" \
+      --instance-id "$instance_id" \
+      --iam-instance-profile "Name=$SSM_INSTANCE_PROFILE_NAME" \
+      >/dev/null \
+      || { warn "associate-iam-instance-profile failed"; return 1; }
+    ok "profile associated; EC2 IMDS will surface new creds within ~30s"
+  fi
+
+  return 0
+}
+
+attach_ssm_managed_policy_if_missing() {
+  # $1 = EC2 instance ID. Returns 0 if policy was attached or already present; non-zero on hard error (incl. a failed instance lookup).
+  local instance_id="$1"
+  local profile_arn role_name policy_arn already_attached
+
+  policy_arn="arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+
+  profile_arn=$(aws ec2 describe-instances \
+    --region "$REGION" \
+    --instance-ids "$instance_id" \
+    --query 'Reservations[0].Instances[0].IamInstanceProfile.Arn' \
+    --output text) || { warn "describe-instances failed for $instance_id — leaving IAM untouched"; return 1; }
+
+  if [ -z "$profile_arn" ] || [ "$profile_arn" = "None" ] || [ "$profile_arn" = "null" ]; then
+    log "instance $instance_id has NO IAM instance profile — creating + associating one"
+    create_and_associate_ssm_profile "$instance_id" || return 1
+    return 0
+  fi
+
+  # Profile ARN shape: arn:aws:iam::ACCT:instance-profile/NAME — keep only the last path component.
+  local profile_name="${profile_arn##*/}"
+  log "instance profile: $profile_name"
+
+  role_name=$(aws iam get-instance-profile \
+    --instance-profile-name "$profile_name" \
+    --query 'InstanceProfile.Roles[0].RoleName' \
+    --output text 2>/dev/null || echo "None")
+
+  if [ -z "$role_name" ] || [ "$role_name" = "None" ]; then
+    warn "instance profile $profile_name has no role attached — auto-remediation is blocked."
+    return 1
+  fi
+  log "role behind profile: $role_name"
+
+  already_attached=$(aws iam list-attached-role-policies \
+    --role-name "$role_name" \
+    --query "AttachedPolicies[?PolicyArn=='$policy_arn'].PolicyArn" \
+    --output text 2>/dev/null || echo "")
+
+  if [ -n "$already_attached" ]; then
+    ok "AmazonSSMManagedInstanceCore already attached to $role_name"
+    return 0
+  fi
+
+  log "Attaching AmazonSSMManagedInstanceCore to $role_name"
+  aws iam attach-role-policy \
+    --role-name "$role_name" \
+    --policy-arn "$policy_arn" \
+    || { warn "attach-role-policy failed"; return 1; }
+  ok "AmazonSSMManagedInstanceCore attached to $role_name"
+  return 0
+}
+
+poll_ssm_online() {  # $1 = instance ID, $2 = max polls (10s apart); prints the last PingStatus, returns 0 only when Online
+  local instance_id="$1" max_iters="$2" state
+  for _ in $(seq 1 "$max_iters"); do
+    state=$(aws ssm describe-instance-information \
+      --region "$REGION" \
+      --filters "Key=InstanceIds,Values=$instance_id" \
+      --query 'InstanceInformationList[0].PingStatus' \
+      --output text 2>/dev/null || echo "None")
+    case "$state" in
+      Online) printf '%s' "$state"; return 0 ;;
+    esac
+    sleep 10
+  done
+  printf '%s' "${state:-None}"
+  return 1
+}
+
+log "Verify SSM agent reachable: $TEST_BROKER_INSTANCE_ID"
+if [ "$DRY_RUN" = "1" ]; then
+  log "DRY RUN — would query ssm describe-instance-information for $TEST_BROKER_INSTANCE_ID"
+else
+  # Capture stderr separately so AccessDenied doesn't get silently mapped to
+  # "None" (instance-not-registered). They're distinct failure modes:
+  #   - AccessDenied → caller (agentkeys-admin) lacks ssm:DescribeInstanceInformation.
+  #     Fix the caller's IAM, not the EC2.
+  #   - Empty/None → instance genuinely not registered with SSM. Remediate the EC2.
+  ssm_stderr=$(mktemp "${TMPDIR:-/tmp}/ssm-describe.XXXXXX")  # Xs must be trailing: BSD/macOS mktemp does not substitute Xs placed before a suffix
+  ssm_state=$(aws ssm describe-instance-information \
+    --region "$REGION" \
+    --filters "Key=InstanceIds,Values=$TEST_BROKER_INSTANCE_ID" \
+    --query 'InstanceInformationList[0].PingStatus' \
+    --output text 2>"$ssm_stderr" || echo "")
+  if grep -q "AccessDenied" "$ssm_stderr"; then
+    rm -f "$ssm_stderr"
+    die "caller lacks ssm:DescribeInstanceInformation. This is the upstream
+of every 'PingStatus=None' loop — without read perms, the script cannot tell
+'instance not registered with SSM' from 'I have no permission to look'. Fix
+by attaching AmazonSSMReadOnlyAccess to the admin group ONCE:
+  aws iam attach-group-policy \\
+    --group-name AgentKeyAdmin \\
+    --policy-arn arn:aws:iam::aws:policy/AmazonSSMReadOnlyAccess
+Then re-run this script."
+  fi
+  # Empty state = no record found (genuinely not registered).
+  [ -z "$ssm_state" ] && ssm_state="None"
+  rm -f "$ssm_stderr"
+
+  case "$ssm_state" in
+    Online)
+      ok "SSM agent online — workflow can SendCommand"
+      ;;
+    ConnectionLost|Inactive|None|"")
+      if [ "$FIX_SSM" = "1" ]; then
+        log "Auto-remediating (--fix-ssm): attach AmazonSSMManagedInstanceCore + poll"
+        if attach_ssm_managed_policy_if_missing "$TEST_BROKER_INSTANCE_ID"; then
+          log "Polling SSM PingStatus for up to 3 min (agent refresh window)"
+          final_state=$(poll_ssm_online "$TEST_BROKER_INSTANCE_ID" 18) || true
+          if [ "$final_state" = "Online" ]; then
+            ok "SSM agent now online"
+          else
+            warn "SSM agent still $final_state after 3 min — policy attached, but the"
+            warn "agent process hasn't picked up the refreshed creds. Pick ONE:"
+            warn "  a) SSH and bounce the agent:"
+            warn "       ssh test-broker 'sudo systemctl restart amazon-ssm-agent'"
+            warn "  b) Reboot the EC2 (heavier):"
+            warn "       aws ec2 reboot-instances --instance-ids $TEST_BROKER_INSTANCE_ID --region $REGION"
+            warn "Then re-run this script (no flags) to confirm Online."
+            exit 1
+          fi
+        else
+          exit 1
+        fi
+      else
+        die "$TEST_BROKER_INSTANCE_ID is not registered with SSM (state=$ssm_state). Re-run with --fix-ssm
+to attempt auto-remediation (attaches AmazonSSMManagedInstanceCore to the
+EC2's instance profile role, then polls until the SSM agent refreshes).
+Or remediate manually:
+  1. EC2 instance profile is missing AmazonSSMManagedInstanceCore. Fix:
+       aws ec2 describe-instances --region $REGION --instance-ids $TEST_BROKER_INSTANCE_ID \\
+         --query 'Reservations[0].Instances[0].IamInstanceProfile.Arn'
+     Then attach the policy to the role behind that instance profile:
+       aws iam attach-role-policy --role-name <ROLE_NAME> \\
+         --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
+     Reboot the EC2 (or restart amazon-ssm-agent) to pick up new perms.
+  2. SSM Agent not installed/running. Fix (Ubuntu 22.04+ ships it):
+       ssh test-broker 'sudo systemctl enable --now amazon-ssm-agent'
+  3. Instance is in a private VPC subnet without an SSM VPC endpoint.
+     (Unlikely for a public-IP broker, but worth a glance at the routing.)"
+      fi
+      ;;
+    *)
+      warn "SSM agent state = $ssm_state (unexpected) — proceed with caution"
+      ;;
+  esac
+fi
+
+# ─── Final: print the ARN so the operator can paste it into the GHA secret ──
+role_arn=$(aws iam get-role --role-name "$ROLE_NAME" --query 'Role.Arn' --output text 2>/dev/null || echo "?")
+ok "deploy role ready: $role_arn"
+cat <<EOF >&2
+
+Next:
+  # 1. Set the two GitHub secrets (idempotent — overwrites existing values):
+  gh secret set OIDC_AWS_ROLE_ARN_DEPLOY --repo $REPO_SLUG --body "$role_arn"
+  gh secret set TEST_BROKER_INSTANCE_ID --repo $REPO_SLUG --body "$TEST_BROKER_INSTANCE_ID"
+
+  # 2. Trigger a workflow_dispatch with force_deploy_broker=true to dry-run the
+  #    deploy path on the test EC2 (see docs/ci-setup.md §7).
+
+EOF
+
+echo "$role_arn"
diff --git a/scripts/setup-broker-host.sh b/scripts/setup-broker-host.sh
index 44b471e..166d01c 100755
--- a/scripts/setup-broker-host.sh
+++ b/scripts/setup-broker-host.sh
@@ -21,6 +21,13 @@
 
 set -euo pipefail
 
+# AWS SSM-driven invocations (harness-ci.yml deploy-test-broker, issue #101)
+# don't export HOME on the remote shell. Under set -u that hits 'HOME: unbound
+# variable' at the rustup `source "$HOME/.cargo/env"` line. Resolve HOME from
+# /etc/passwd if missing so the script is callable from both interactive ssh
+# sessions and SSM SendCommand.
+export HOME="${HOME:-$(getent passwd "$(id -u)" | cut -d: -f6)}"
+
 REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
 
 # ─── Defaults ─────────────────────────────────────────────────────────────────
@@ -790,6 +797,67 @@ EOF
   sudo systemctl reload ssh 2>/dev/null || sudo systemctl reload sshd 2>/dev/null || warn "sshd reload failed — restart manually"
 fi
 
+# ─── AWS SSM Agent (idempotent install) ───────────────────────────────────────
+# Required by harness-ci.yml deploy-test-broker job (issue #101): the GitHub
+# Actions workflow drives `setup-broker-host.sh --test --yes` on the EC2 via
+# `aws ssm send-command`. That path needs amazon-ssm-agent installed AND
+# active here.
+#
+# Some Ubuntu AMIs (including some Canonical / Multipass-derived images
+# downstream of the AWS Marketplace base) ship without amazon-ssm-agent.
+# When that's the case, `systemctl restart amazon-ssm-agent` errors with
+# "Unit amazon-ssm-agent.service not found" — the failure mode the operator
+# hit on 2026-05-23. Fold the install into broker-host bootstrap so every
+# new test broker is SSM-ready out of the box.
+#
+# Two install paths, in priority order:
+#   1) snap (AWS-blessed on Ubuntu 22.04+; service: snap.amazon-ssm-agent.amazon-ssm-agent.service)
+#   2) deb fallback (older / non-snap images; service: amazon-ssm-agent.service)
+#
+# ssm_unit_active (below) succeeds when EITHER unit is active, so subsequent
+# `setup-broker-host.sh --upgrade` re-runs skip the install.
+ssm_unit_active() {
+  systemctl is-active snap.amazon-ssm-agent.amazon-ssm-agent.service >/dev/null 2>&1 \
+    || systemctl is-active amazon-ssm-agent.service >/dev/null 2>&1
+}
+
+if ssm_unit_active; then
+  log "amazon-ssm-agent already active — skipping install"
+else
+  log "Installing amazon-ssm-agent (required for CI auto-deploy per issue #101)"
+  if command -v snap >/dev/null 2>&1; then
+    # snap install is idempotent — re-running on an already-installed agent
+    # exits 0 with a "snap already installed" message.
+    sudo snap install amazon-ssm-agent --classic >/dev/null \
+      || warn "snap install amazon-ssm-agent failed — falling back to deb"
+    sudo systemctl enable --now snap.amazon-ssm-agent.amazon-ssm-agent.service \
+      >/dev/null 2>&1 || true
+  fi
+
+  if ! ssm_unit_active; then
+    # Snap path didn't take — fall back to the AWS .deb for this host's dpkg arch (debian_amd64 / debian_arm64; amd64 if dpkg can't say).
+    REGION_FOR_SSM="${REGION:-us-east-1}"
+    SSM_DEB_URL="https://s3.${REGION_FOR_SSM}.amazonaws.com/amazon-ssm-${REGION_FOR_SSM}/latest/debian_$(dpkg --print-architecture 2>/dev/null || echo amd64)/amazon-ssm-agent.deb"
+    SSM_TMP_DEB=$(mktemp /tmp/amazon-ssm-agent.XXXXXX.deb)
+    if curl -sSfL "$SSM_DEB_URL" -o "$SSM_TMP_DEB"; then
+      sudo dpkg -i "$SSM_TMP_DEB" >/dev/null \
+        || warn "dpkg install amazon-ssm-agent.deb failed"
+      sudo systemctl enable --now amazon-ssm-agent.service \
+        >/dev/null 2>&1 || warn "amazon-ssm-agent enable/start failed"
+    else
+      warn "could not download amazon-ssm-agent.deb from $SSM_DEB_URL"
+    fi
+    rm -f "$SSM_TMP_DEB"
+  fi
+
+  if ssm_unit_active; then
+    log "amazon-ssm-agent installed and active"
+  else
+    warn "amazon-ssm-agent install did not produce an active unit — CI auto-deploy will fail until this is resolved"
+    warn "Manual recovery: sudo snap install amazon-ssm-agent --classic && sudo systemctl enable --now snap.amazon-ssm-agent.amazon-ssm-agent.service"
+  fi
+fi
+
 if [[ "$CRED_MODE" == "profile" ]]; then
   sudo install -d -m 0700 -o agentkeys -g agentkeys /var/lib/agentkeys/.aws
   if [[ ! -f /var/lib/agentkeys/.aws/credentials ]]; then