diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0553153..674f4d1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,18 +6,43 @@ on: pull_request: branches: [main] +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + env: MIX_ENV: test + ELIXIR_VERSION: "1.19" + OTP_VERSION: "28" jobs: - test: - name: Test (Elixir ${{ matrix.elixir }} / OTP ${{ matrix.otp }}) + changes: + name: Detect changes runs-on: ubuntu-latest + outputs: + durable: ${{ steps.filter.outputs.durable }} + dashboard: ${{ steps.filter.outputs.dashboard }} + ci: ${{ steps.filter.outputs.ci }} + steps: + - uses: actions/checkout@v6 - strategy: - matrix: - elixir: ["1.19"] - otp: ["28"] + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + durable: + - 'durable/**' + dashboard: + - 'durable/**' + - 'durable_dashboard/**' + ci: + - '.github/workflows/**' + + durable: + name: Durable + needs: changes + if: needs.changes.outputs.durable == 'true' || needs.changes.outputs.ci == 'true' || github.event_name == 'push' + runs-on: ubuntu-latest services: postgres: @@ -33,28 +58,32 @@ jobs: --health-timeout=5s --health-retries=5 + defaults: + run: + working-directory: durable + steps: - uses: actions/checkout@v6 - name: Set up Elixir uses: erlef/setup-beam@v1 with: - elixir-version: ${{ matrix.elixir }} - otp-version: ${{ matrix.otp }} + elixir-version: ${{ env.ELIXIR_VERSION }} + otp-version: ${{ env.OTP_VERSION }} - name: Restore dependencies cache uses: actions/cache@v4 with: - path: deps - key: ${{ runner.os }}-mix-${{ hashFiles('**/mix.lock') }} - restore-keys: ${{ runner.os }}-mix- + path: durable/deps + key: ${{ runner.os }}-durable-mix-${{ hashFiles('durable/mix.lock') }} + restore-keys: ${{ runner.os }}-durable-mix- - name: Restore build cache - uses: actions/cache@v5 + uses: actions/cache@v4 with: - path: _build - key: ${{ runner.os }}-build-${{ matrix.otp }}-${{ 
matrix.elixir }}-${{ hashFiles('**/mix.lock') }} - restore-keys: ${{ runner.os }}-build-${{ matrix.otp }}-${{ matrix.elixir }}- + path: durable/_build + key: ${{ runner.os }}-durable-build-${{ env.OTP_VERSION }}-${{ env.ELIXIR_VERSION }}-${{ hashFiles('durable/mix.lock') }} + restore-keys: ${{ runner.os }}-durable-build-${{ env.OTP_VERSION }}-${{ env.ELIXIR_VERSION }}- - name: Install dependencies run: mix deps.get @@ -65,31 +94,86 @@ jobs: - name: Compile run: mix compile --warnings-as-errors + - name: Credo + run: mix credo --strict + - name: Run tests run: mix test - credo: - name: Credo + dashboard: + name: Dashboard + needs: changes + if: needs.changes.outputs.dashboard == 'true' || needs.changes.outputs.ci == 'true' || github.event_name == 'push' runs-on: ubuntu-latest + defaults: + run: + working-directory: durable_dashboard + steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Elixir uses: erlef/setup-beam@v1 with: - elixir-version: "1.19" - otp-version: "28" + elixir-version: ${{ env.ELIXIR_VERSION }} + otp-version: ${{ env.OTP_VERSION }} + + - name: Set up pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + + - name: Set up Node + uses: actions/setup-node@v4 + with: + node-version: "20" + cache: pnpm + cache-dependency-path: durable_dashboard/assets/pnpm-lock.yaml - name: Restore dependencies cache uses: actions/cache@v4 with: - path: deps - key: ${{ runner.os }}-mix-${{ hashFiles('**/mix.lock') }} - restore-keys: ${{ runner.os }}-mix- + path: durable_dashboard/deps + key: ${{ runner.os }}-dashboard-mix-${{ hashFiles('durable_dashboard/mix.lock', 'durable/mix.lock') }} + restore-keys: ${{ runner.os }}-dashboard-mix- - - name: Install dependencies + - name: Restore build cache + uses: actions/cache@v4 + with: + path: durable_dashboard/_build + key: ${{ runner.os }}-dashboard-build-${{ env.OTP_VERSION }}-${{ env.ELIXIR_VERSION }}-${{ hashFiles('durable_dashboard/mix.lock', 'durable/mix.lock') }} + restore-keys: ${{ 
runner.os }}-dashboard-build-${{ env.OTP_VERSION }}-${{ env.ELIXIR_VERSION }}- + + - name: Install Elixir dependencies run: mix deps.get - - name: Run Credo - run: mix credo --strict + - name: Install JS dependencies + working-directory: durable_dashboard/assets + run: pnpm install --frozen-lockfile + + - name: Check formatting + run: mix format --check-formatted + + - name: Build assets + working-directory: durable_dashboard/assets + run: pnpm build + + - name: Compile + run: mix compile --warnings-as-errors + + - name: Run tests + run: mix test + + ci-status: + name: CI status + needs: [changes, durable, dashboard] + if: always() + runs-on: ubuntu-latest + steps: + - name: Verify all required jobs passed + run: | + set -e + [[ "${{ needs.changes.result }}" == "success" ]] || exit 1 + [[ "${{ needs.durable.result }}" =~ ^(success|skipped)$ ]] || exit 1 + [[ "${{ needs.dashboard.result }}" =~ ^(success|skipped)$ ]] || exit 1 diff --git a/.gitignore b/.gitignore index 7111dd6..2d75449 100644 --- a/.gitignore +++ b/.gitignore @@ -1,24 +1,10 @@ -# The directory Mix will write compiled artifacts to. -/_build/ - -# If you run "mix test --cover", coverage assets end up here. -/cover/ - -# The directory Mix downloads your dependencies sources to. -/deps/ - -# Where third-party dependencies like ExDoc output generated docs. -/doc/ +# Editor / IDE +.elixir_ls/ -# Temporary files, for example, from tests. -/tmp/ - -# If the VM crashes, it generates a dump, let's ignore it too. +# Crash dumps erl_crash.dump -# Also ignore archive artifacts (built via "mix archive.build"). -*.ez - -# Ignore package tarball (built via "mix hex.build"). 
-durable-*.tar - +# Workspace-level generated docs (per-project ignores live in each subdir) +/doc/ +/_build/ +/deps/ diff --git a/CLAUDE.md b/CLAUDE.md index 61be1c3..b973bae 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,6 +4,14 @@ Durable workflow engine for Elixir - provides resumable, reliable workflows with Durable is an **embeddable library** - users add it to their supervision tree and provide their own Ecto repo. +## Dashboard UI work + +Before touching any file under `durable_dashboard/`, read +`durable_dashboard/DESIGN.md`. It codifies the design language (tokens, +typography, spacing, motion, status semantics, component primitives, +composition patterns). New visual decisions are made there first, then +applied in code. + ## Quick Reference ```bash diff --git a/README.md b/README.md index 78098db..04fc96f 100644 --- a/README.md +++ b/README.md @@ -1,531 +1,65 @@ -# Durable +# Durable workspace -[![Build Status](https://github.com/wavezync/durable/actions/workflows/ci.yml/badge.svg)](https://github.com/wavezync/durable/actions/workflows/ci.yml) -[![Hex.pm](https://img.shields.io/hexpm/v/durable.svg)](https://hex.pm/packages/durable) +This is a path-dep monorepo (pnpm-workspaces style) containing the Durable +workflow engine and its surrounding packages. Each subdirectory is an +independent Mix project with its own `mix.exs`, `_build/`, `deps/`, and Hex +publishing pipeline. -A durable, resumable workflow engine for Elixir. Similar to Temporal/Inngest. +## Packages -## Features +| Path | Package | Description | +| --- | --- | --- | +| [`durable/`](durable/) | `:durable` | Core workflow engine — resumable, reliable workflows with PostgreSQL persistence. | +| [`durable_dashboard/`](durable_dashboard/) | `:durable_dashboard` | LiveView-first web dashboard for monitoring and managing Durable workflows. | +| [`examples/phoenix_demo/`](examples/phoenix_demo/) | (unpublished) | Reference Phoenix app wiring up `:durable` and `:durable_dashboard`. 
| -- **Pipeline Model** - Context flows from step to step, simple and explicit -- **Resumability** - Sleep, wait for events, wait for human input -- **Branching** - Pattern-matched conditional flow control -- **Parallel** - Run steps concurrently with result collection -- **Compensations** - Saga pattern with automatic rollback -- **Cron Scheduling** - Recurring workflows with cron expressions -- **Reliability** - Automatic retries with exponential/linear/constant backoff -- **Orchestration** - Parent/child workflow composition -- **Persistence** - PostgreSQL-backed execution state +## Workspace commands -## Installation +A thin root `mix.exs` fans out common commands to each published package: -```elixir -def deps do - [{:durable, "~> 0.0.0-alpha"}] -end -``` - -## Quick Start - -### 1. Create Migration - -```elixir -defmodule MyApp.Repo.Migrations.AddDurable do - use Ecto.Migration - def up, do: Durable.Migration.up() - def down, do: Durable.Migration.down() -end -``` - -### 2. Add to Supervision Tree - -```elixir -children = [ - MyApp.Repo, - {Durable, repo: MyApp.Repo, queues: %{default: [concurrency: 10]}} -] -``` - -### 3. 
Define & Run - -```elixir -defmodule MyApp.OrderWorkflow do - use Durable - use Durable.Helpers - - workflow "process_order", timeout: hours(2) do - # First step receives workflow input - step :validate, fn input -> - {:ok, %{ - order_id: input["id"], - items: input["items"], - customer_id: input["customer_id"] - }} - end - - # Each step receives previous step's output as context - step :calculate_total, fn ctx -> - total = ctx.items |> Enum.map(& &1["price"]) |> Enum.sum() - {:ok, assign(ctx, :total, total)} - end - - step :charge_payment, [retry: [max_attempts: 3, backoff: :exponential]], fn ctx -> - {:ok, charge} = PaymentService.charge(ctx.order_id, ctx.total) - {:ok, assign(ctx, :charge_id, charge.id)} - end - - step :send_confirmation, fn ctx -> - EmailService.send_confirmation(ctx.order_id) - {:ok, ctx} - end - end -end - -# Start it -{:ok, id} = Durable.start(MyApp.OrderWorkflow, %{"id" => "order_123", "items" => items}) -``` - -## Examples - -### Approval Workflow - -Wait for human approval with timeout fallback. - -```elixir -defmodule MyApp.ExpenseApproval do - use Durable - use Durable.Helpers - use Durable.Wait - - workflow "expense_approval" do - step :request_approval, fn ctx -> - result = wait_for_approval("manager", - prompt: "Approve $#{ctx["amount"]} expense?", - timeout: days(3), - timeout_value: :auto_rejected - ) - {:ok, assign(ctx, :decision, result)} - end - - branch on: fn ctx -> ctx.decision end do - :approved -> - step :process, fn ctx -> - Expenses.reimburse(ctx["employee_id"], ctx["amount"]) - {:ok, assign(ctx, :status, :reimbursed)} - end - - _ -> - step :notify_rejection, fn ctx -> - Mailer.send_rejection(ctx["employee_id"]) - {:ok, assign(ctx, :status, :rejected)} - end - end - end -end - -# Approve externally -Durable.provide_input(workflow_id, "manager", :approved) -``` - -### Parallel Data Fetch - -Fetch data concurrently, then combine results. 
- -```elixir -defmodule MyApp.DashboardBuilder do - use Durable - use Durable.Helpers - - workflow "build_dashboard" do - step :init, fn input -> - {:ok, %{user_id: input["user_id"]}} - end - - # Parallel steps produce results in __results__ map - parallel do - step :user, fn ctx -> - {:ok, %{user: Users.get(ctx.user_id)}} - end - - step :orders, fn ctx -> - {:ok, %{orders: Orders.recent(ctx.user_id)}} - end - - step :notifications, fn ctx -> - {:ok, %{notifs: Notifications.unread(ctx.user_id)}} - end - end - - # Access results from __results__ map - step :render, fn ctx -> - results = ctx[:__results__] - - # Results are tagged tuples: ["ok", data] or ["error", reason] - user = case results["user"] do - ["ok", data] -> data.user - _ -> nil - end - - orders = case results["orders"] do - ["ok", data] -> data.orders - _ -> [] - end - - notifs = case results["notifications"] do - ["ok", data] -> data.notifs - _ -> [] - end - - dashboard = Dashboard.build(user, orders, notifs) - {:ok, assign(ctx, :dashboard, dashboard)} - end - end -end - -# Or use into: to transform results directly -defmodule MyApp.DashboardBuilderWithInto do - use Durable - use Durable.Helpers - - workflow "build_dashboard_v2" do - step :init, fn input -> - {:ok, %{user_id: input["user_id"]}} - end - - parallel into: fn ctx, results -> - # results contains tuples: %{user: {:ok, data}, orders: {:ok, data}, ...} - case {results[:user], results[:orders], results[:notifications]} do - {{:ok, user_data}, {:ok, orders_data}, {:ok, notifs_data}} -> - {:ok, Map.merge(ctx, %{ - user: user_data.user, - orders: orders_data.orders, - notifs: notifs_data.notifs - })} - - _ -> - {:error, "Failed to fetch dashboard data"} - end - end do - step :user, fn ctx -> {:ok, %{user: Users.get(ctx.user_id)}} end - step :orders, fn ctx -> {:ok, %{orders: Orders.recent(ctx.user_id)}} end - step :notifications, fn ctx -> {:ok, %{notifs: Notifications.unread(ctx.user_id)}} end - end - - step :render, fn ctx -> - dashboard = 
Dashboard.build(ctx.user, ctx.orders, ctx.notifs) - {:ok, assign(ctx, :dashboard, dashboard)} - end - end -end -``` - -### Batch Processing - -Process items with controlled concurrency using `Task.async_stream`. - -```elixir -defmodule MyApp.BulkEmailer do - use Durable - use Durable.Helpers - - workflow "send_campaign" do - step :load, fn input -> - recipients = Subscribers.active(input["campaign_id"]) - {:ok, %{campaign_id: input["campaign_id"], recipients: recipients}} - end - - step :send_emails, fn ctx -> - results = - ctx.recipients - |> Task.async_stream( - fn recipient -> - case Mailer.send_campaign(recipient, ctx.campaign_id) do - :ok -> {:ok, recipient} - {:error, reason} -> {:error, {recipient, reason}} - end - end, - max_concurrency: 10, - timeout: :infinity - ) - |> Enum.map(fn {:ok, r} -> r end) - - sent = for {:ok, _} <- results, do: 1 - failed = for {:error, _} <- results, do: 1 - - {:ok, ctx - |> assign(:sent_count, length(sent)) - |> assign(:failed_count, length(failed))} - end - end -end -``` - -### Trip Booking (Saga) - -Book multiple services with automatic rollback on failure. - -```elixir -defmodule MyApp.TripBooking do - use Durable - use Durable.Helpers - - workflow "book_trip" do - step :book_flight, [compensate: :cancel_flight], fn ctx -> - booking = Flights.book(ctx["flight"]) - {:ok, assign(ctx, :flight, booking)} - end - - step :book_hotel, [compensate: :cancel_hotel], fn ctx -> - booking = Hotels.book(ctx["hotel"]) - {:ok, assign(ctx, :hotel, booking)} - end - - step :charge, fn ctx -> - total = ctx.flight.price + ctx.hotel.price - Payments.charge(ctx["card"], total) - {:ok, assign(ctx, :charged, true)} - end - - compensate :cancel_flight, fn ctx -> - Flights.cancel(ctx.flight.id) - {:ok, ctx} - end - - compensate :cancel_hotel, fn ctx -> - Hotels.cancel(ctx.hotel.id) - {:ok, ctx} - end - end -end -``` - -### Scheduled Reports - -Run daily at 9am. 
- -```elixir -defmodule MyApp.DailyReport do - use Durable - use Durable.Helpers - use Durable.Scheduler.DSL - - @schedule cron: "0 9 * * *", timezone: "America/New_York" - workflow "daily_sales_report" do - step :generate, fn _input -> - report = Reports.sales_summary(Date.utc_today()) - {:ok, %{report: report}} - end - - step :distribute, fn ctx -> - Mailer.send_report(ctx.report, to: "team@company.com") - Slack.post_summary(ctx.report, channel: "#sales") - {:ok, ctx} - end - end -end - -# Register in supervision tree -{Durable, repo: MyApp.Repo, scheduled_modules: [MyApp.DailyReport]} -``` - -### Delayed & Scheduled Execution - -Sleep, schedule for specific times, and wait for events. - -```elixir -defmodule MyApp.TrialReminder do - use Durable - use Durable.Helpers - use Durable.Wait - - workflow "trial_reminder" do - step :welcome, fn ctx -> - Mailer.send_welcome(ctx["user_id"]) - {:ok, %{user_id: ctx["user_id"], trial_started_at: ctx["trial_started_at"]}} - end - - step :wait_3_days, fn ctx -> - sleep(days(3)) - {:ok, ctx} - end - - step :check_in, fn ctx -> - Mailer.send_tips(ctx.user_id) - {:ok, ctx} - end - - step :wait_until_trial_ends, fn ctx -> - trial_end = DateTime.add(ctx.trial_started_at, 14, :day) - schedule_at(trial_end) - {:ok, ctx} - end - - step :convert_or_remind, fn ctx -> - if Subscriptions.active?(ctx.user_id) do - {:ok, assign(ctx, :converted, true)} - else - Mailer.send_upgrade_reminder(ctx.user_id) - {:ok, assign(ctx, :converted, false)} - end - end - end -end -``` - -### Event-Driven Workflow - -Wait for external webhook events. 
- -```elixir -defmodule MyApp.PaymentFlow do - use Durable - use Durable.Helpers - use Durable.Wait - - workflow "payment_flow" do - step :create_invoice, fn ctx -> - invoice = Invoices.create(ctx["order_id"], ctx["amount"]) - {:ok, %{order_id: ctx["order_id"], invoice_id: invoice.id}} - end - - step :await_payment, fn ctx -> - {event, _payload} = wait_for_any(["payment.success", "payment.failed"], - timeout: days(7), - timeout_value: {"payment.expired", nil} - ) - {:ok, assign(ctx, :result, event)} - end - - branch on: fn ctx -> ctx.result end do - "payment.success" -> - step :fulfill, fn ctx -> - Orders.fulfill(ctx.order_id) - {:ok, assign(ctx, :status, :fulfilled)} - end - - _ -> - step :cancel, fn ctx -> - Orders.cancel(ctx.order_id) - {:ok, assign(ctx, :status, :cancelled)} - end - end - end -end - -# Webhook handler sends event -Durable.send_event(workflow_id, "payment.success", %{transaction_id: "txn_123"}) -``` - -## Reference - -### Helper Functions - -```elixir -use Durable.Helpers - -assign(ctx, :key, value) # Set a value -assign(ctx, %{a: 1, b: 2}) # Merge multiple values -update(ctx, :key, default, fn old -> new end) -append(ctx, :list, item) # Append to list -increment(ctx, :count) # Increment by 1 -increment(ctx, :count, 5) # Increment by 5 -``` - -### Time Helpers - -```elixir -seconds(30) # 30_000 ms -minutes(5) # 300_000 ms -hours(2) # 7_200_000 ms -days(7) # 604_800_000 ms +```bash +mix setup # mix deps.get in durable/, then durable_dashboard/ +mix compile # compile both +mix test # run both test suites +mix format # format both projects +mix precommit # run each project's precommit alias ``` -### Orchestration +`examples/phoenix_demo` is intentionally outside the fan-out — it's an +integration sample, not a published package, and uses its own DB on port +`53412`. Run it directly via `cd examples/phoenix_demo && mix ...`. 
-```elixir -use Durable.Orchestration +## Working in a single package -# Synchronous: call child and wait for result -case call_workflow(MyApp.PaymentWorkflow, %{"amount" => 100}, timeout: hours(1)) do - {:ok, result} -> {:ok, assign(data, :payment, result)} - {:error, reason} -> {:error, reason} -end +Each package is a normal Mix project. From inside any subdirectory, the usual +commands work: `mix deps.get`, `mix compile`, `mix test`, `mix format`, +`mix credo --strict` (in `durable/`), `mix hex.publish`, etc. -# Fire-and-forget: start child and continue -{:ok, child_id} = start_workflow(MyApp.EmailWorkflow, %{"to" => email}, ref: :welcome) - -# call_workflow also works inside parallel blocks (executed inline) -parallel do - step :payment, fn data -> - case call_workflow(MyApp.PaymentWorkflow, %{"amount" => data.total}, ref: :pay) do - {:ok, result} -> {:ok, assign(data, :payment, result)} - {:error, reason} -> {:error, reason} - end - end - - step :shipping, fn data -> - case call_workflow(MyApp.ShippingWorkflow, %{"id" => data.order_id}, ref: :ship) do - {:ok, result} -> {:ok, assign(data, :shipping, result)} - {:error, reason} -> {:error, reason} - end - end -end -``` +## Cross-package references -### API +Packages link to each other via `path:` deps in dev. To publish, swap the +`path:` line for the Hex version: ```elixir -Durable.start(Module, input) -Durable.start(Module, input, queue: :priority, scheduled_at: datetime) -Durable.get_execution(id) -Durable.list_executions(workflow: Module, status: :running) -Durable.cancel(id, "reason") -Durable.send_event(id, "event", payload) -Durable.provide_input(id, "input_name", data) -Durable.list_children(parent_id) +# durable_dashboard/mix.exs +{:durable, path: "../durable"} # dev +{:durable, "~> 0.1"} # release ``` -## Mix Tasks +## Database -Durable includes mix tasks for managing workflows from the command line. 
+A shared `docker-compose.yml` at the workspace root brings up the Postgres +instance the core durable test suite expects (port `54321`): ```bash -# Show queue status and workflow summary -mix durable.status - -# List workflow executions (with filters) -mix durable.list # all executions -mix durable.list --status running # filter by status -mix durable.list --workflow MyApp.OrderWorkflow # filter by workflow -mix durable.list --limit 20 --format json # limit results, JSON output - -# Start a workflow -mix durable.run MyApp.OrderWorkflow # no input -mix durable.run MyApp.OrderWorkflow --input '{"id": 123}' # with JSON input -mix durable.run MyApp.OrderWorkflow --queue high_priority # specific queue - -# Cancel a workflow -mix durable.cancel -mix durable.cancel --reason "no longer needed" - -# Clean up old executions -mix durable.cleanup --older-than 30d # completed/failed older than 30 days -mix durable.cleanup --older-than 7d --status completed # only completed, older than 7 days -mix durable.cleanup --older-than 24h --dry-run # preview what would be deleted +docker compose up -d ``` -## Guides - -- [Branching](guides/branching.md) - Conditional flow control -- [Parallel](guides/parallel.md) - Concurrent execution -- [Compensations](guides/compensations.md) - Saga pattern -- [Waiting](guides/waiting.md) - Sleep, events, human input -- [Orchestration](guides/orchestration.md) - Parent/child workflow composition - -## Coming Soon - -- Phoenix LiveView dashboard +The Phoenix demo runs against its own Postgres on port `53412`; see +[`examples/phoenix_demo/docker-compose.yml`](examples/phoenix_demo/docker-compose.yml). -## License +## Layout reference -MIT +This layout is modeled on [`elixir-nx/nx`](https://github.com/elixir-nx/nx), +which uses the same path-dep + thin-coordinator pattern across `nx/`, +`exla/`, and `torchx/`. 
diff --git a/docs/bug-reports/2026-04-12-parallel-context-and-serialization.md b/docs/bug-reports/2026-04-12-parallel-context-and-serialization.md new file mode 100644 index 0000000..94b1ef7 --- /dev/null +++ b/docs/bug-reports/2026-04-12-parallel-context-and-serialization.md @@ -0,0 +1,364 @@ +# Bug Report: Parallel step context loss, tuple serialization, and zombie workflows + +**Date:** 2026-04-12 +**Severity:** High — multiple classes of bugs cause silent data corruption, cryptic errors, and unrecoverable stuck workflows +**Reproduction:** `examples/phoenix_demo` — onboarding workflow with parallel provisioning block + +## Summary + +Four related issues were discovered while building complex workflows for the `examples/phoenix_demo` app. They compound on each other: bug #1 triggers bug #3, which masks the root cause and leaves workflows in the zombie state described in bug #4. Bug #2 is triggered independently when users construct `parallel into:` callbacks. + +Individually these are library footguns. Together they make parallel blocks dangerous to use without deep knowledge of the executor internals. + +--- + +## Bug 1 — Parallel steps don't inherit parent workflow context + +**Severity:** High (silent data loss) +**Status:** ✅ Fixed (2026-04-13) — `create_parallel_child/5` now copies the parent's accumulated context into the child execution, and `execute_parallel_step/3` merges it into the pipeline data before invoking `StepRunner.execute/4`. Context is a snapshot taken at spawn time — concurrent siblings still can't see each other's writes, which is intentional. +**File:** `lib/durable/executor.ex:785-825` (`execute_parallel_step/3`) + +### Observed behaviour + +```elixir +step :register_employee, fn data -> + put_context(:employee_name, data["name"]) # Sets context on parent + {:ok, %{name: data["name"]}} +end + +parallel into: fn _ctx, results -> {:ok, results} end do + step :setup_email, fn _data -> + name = get_context(:employee_name) # ← Returns nil! 
+ email = String.downcase(name) <> "@company.com" # ← Crashes: String.downcase(nil) + {:ok, %{email: email}} + end +end +``` + +`get_context/1` inside a parallel step returns `nil` for keys that were set by previous non-parallel steps. The developer expects the context to flow through the pipeline — instead, parallel steps see an empty context. + +### Root cause + +Parallel steps execute as **separate child workflow executions** (separate DB rows, separate processes). In `execute_parallel_step/3`: + +```elixir +# lib/durable/executor.ex:801 +data = atomize_keys(execution.input) # ← Only the workflow's INITIAL input +case StepRunner.execute(step_def, data, execution.id, config) do +``` + +And in `StepRunner.execute_with_retry/6` (line 56): +```elixir +Process.put(:durable_context, data) # ← data here = parent's initial input, not accumulated context +``` + +The child execution is created with just the parent's initial input as its context — any `put_context/2` calls made by prior steps are lost. + +### Proposed fix + +When creating child executions for a parallel block, copy the parent's `execution.context` into the child's `context` field. 
In `execute_parallel_step/3`, load that context into the process dictionary before the step runs: + +```elixir +# Merge parent context when spawning parallel children (at spawn time) +parent_context = parent_exec.context || %{} +child_context = Map.merge(parent_context, %{"__parallel_step" => step_name_str}) + +# In execute_parallel_step/3 before StepRunner.execute: +merged_data = Map.merge(atomize_keys(execution.context), atomize_keys(execution.input)) +Process.put(:durable_context, merged_data) +``` + +### Test to add + +```elixir +test "parallel steps can read context set by earlier steps" do + {:ok, id} = Durable.start(ParallelContextWorkflow, %{"seed" => "hello"}, inline: true) + execution = Durable.Query.get_execution(id) + assert execution.status == :completed + assert execution.context["seen_in_parallel"] == "hello" # set by prior step, read inside parallel +end +``` + +--- + +## Bug 2 — Raw tuples in `parallel into:` results break JSON storage + +**Severity:** High (workflow fails after parallel completes, may corrupt state) +**Status:** ✅ Fixed (2026-04-12) — `apply_parallel_into/3` now pre-serializes results via the existing `serialize_parallel_results/1` before calling the user's `into:` callback, so user code never sees raw tagged tuples. 
+**File:** `lib/durable/executor.ex:980` (`apply_parallel_into/3`) + +### Observed behaviour + +```elixir +parallel into: fn _ctx, results -> + # results = %{setup_email: {:ok, %{...}}, setup_dev_tools: {:ok, %{...}}} + {:ok, %{provisioning: "complete", results: results}} # ← crashes when saved +end +``` + +The save to PostgreSQL fails with: +``` +** (Protocol.UndefinedError) protocol Jason.Encoder not implemented for Tuple +Got value: {:ok, %{"domain" => "company.com", "email" => "..."}} +``` + +### Root cause + +When there is **no** `into:` function, the library calls `serialize_parallel_results/1` (line 994) which correctly unwraps tagged tuples: + +```elixir +defp serialize_parallel_results(results) do + Map.new(results, fn {key, value} -> + serialized_value = + case value do + {:ok, data} -> ["ok", data] + {:error, reason} -> ["error", serialize_error_reason(reason)] + other -> other + end + {serialized_key, serialized_value} + end) +end +``` + +But when there **is** an `into:` function, raw tuples are passed straight to user code (line 980): + +```elixir +defp apply_parallel_into(into_fn, base_ctx, results) when is_function(into_fn, 2) do + into_fn.(base_ctx, results) # ← results still contains tuples +``` + +If the user's callback returns those tuples (which is a natural thing to do — "just pass results through"), storage crashes later. + +### Proposed fix + +Pre-serialize results before passing to the user's callback. The callback should never receive raw Erlang tuples: + +```elixir +defp apply_parallel_into(into_fn, base_ctx, results) when is_function(into_fn, 2) do + safe_results = serialize_parallel_results(results) + into_fn.(base_ctx, safe_results) +rescue + ... +end +``` + +This is the same serialization the library already applies for the no-`into:` case. Users would then see `%{setup_email: ["ok", %{...}]}` — obviously not a tuple — and handle it with documented helpers (e.g. `Durable.parallel_ok?/1`). 
+ +### Test to add + +```elixir +test "parallel into: callback receives JSON-safe results" do + workflow = build_workflow_with_into(fn _ctx, results -> + # User returns results directly — must not crash storage + {:ok, %{results: results}} + end) + {:ok, id} = Durable.start(workflow, %{}, inline: true) + execution = Durable.Query.get_execution(id) + assert execution.status == :completed +end +``` + +--- + +## Bug 3 — Secondary encoding error hides the root cause + +**Severity:** Medium (breaks debuggability) +**Status:** ✅ Fixed (2026-04-12) — `mark_failed/3` now runs errors through `sanitize_for_json/1` (recursive tuple/pid/function stringification) before persisting, with a try/rescue fallback that stores a minimal diagnostic if even the sanitized payload fails to save. Workflows can no longer get stuck because the error couldn't be recorded. +**File:** `lib/durable/executor.ex:1146` (`mark_failed/3`) + +### Observed behaviour + +When a step crashed (bug #1), the executor built an error map including stacktrace frames. Some of those frames referenced tuple patterns like `{:ok, ...}` as literal data. Saving that error record to PostgreSQL then crashed with a `Protocol.UndefinedError` — which was the error reported to the developer. The original crash from bug #1 (a string function called on `nil`, e.g. `String.downcase(nil)`) was never seen. + +### Root cause + +`mark_failed/3` does `Repo.update` with the raw error map: +```elixir +execution +|> WorkflowExecution.status_changeset(:failed, %{ + error: error, # ← could contain anything + ... +}) +|> Repo.update(config) # ← crashes if error contains tuple/pid/function +``` + +There is no sanitization before the error is persisted as JSONB. + +### Proposed fix + +Add a `sanitize_for_json/1` helper that recursively walks a term and replaces unencodable values with their `inspect/1` string form. Call it in `mark_failed/3` before the update. If the save still fails, fall back to storing a minimal `%{type: "unrecorded_error", message: Exception.message(e)}`, where `e` is the exception raised by the failed save. 
+ +```elixir +defp mark_failed(config, execution, error) do + safe_error = sanitize_for_json(error) + + try do + execution + |> WorkflowExecution.status_changeset(:failed, %{ + error: safe_error, + completed_at: DateTime.utc_now() + }) + |> Ecto.Changeset.change(locked_by: nil, locked_at: nil) + |> Repo.update(config) + rescue + e -> + fallback = %{type: "unrecorded_error", message: Exception.message(e)} + # ... retry with fallback + end + ... +end + +defp sanitize_for_json(term) when is_tuple(term), + do: term |> Tuple.to_list() |> sanitize_for_json() +defp sanitize_for_json(term) when is_map(term), + do: Map.new(term, fn {k, v} -> {sanitize_key(k), sanitize_for_json(v)} end) +defp sanitize_for_json(term) when is_list(term), + do: Enum.map(term, &sanitize_for_json/1) +defp sanitize_for_json(term) when is_atom(term) or is_binary(term) or is_number(term), + do: term +defp sanitize_for_json(term), do: inspect(term) +``` + +### Test to add + +```elixir +test "mark_failed survives unencodable error values" do + error_with_tuple = %{ + type: "crash", + details: {:some, "tuple"}, # Jason.Encoder chokes on this + extra: [pid: self()] + } + + {:ok, exec} = insert_running_workflow() + assert {:error, _} = Executor.mark_failed(config, exec, error_with_tuple) + + reloaded = Repo.get(config, WorkflowExecution, exec.id) + assert reloaded.status == :failed + assert reloaded.error["type"] == "crash" # preserved + assert is_binary(reloaded.error["details"]) # tuple stringified +end +``` + +--- + +## Bug 4 — Zombie workflows in `:waiting` with nothing to wait on + +**Severity:** Medium (creates unrecoverable stuck state) +**Status:** ✅ Fixed (2026-04-13) — Added optional adapter callback `recover_zombie_workflows/2`. The Postgres implementation detects workflows in `:waiting` with no pending inputs/events whose last update is past the stale lock timeout, and marks them `:failed` with `error.type == "zombie_detected"`. 
`StaleJobRecovery.do_recovery/1` calls it after the regular stale-lock sweep, so the periodic recovery (default 60s) automatically surfaces stuck workflows. Emits `[:durable, :queue, :zombie_recovered]` telemetry. + +### Observed behaviour + +After bug #1 + bug #3, a workflow was left with: +- `status = :waiting` +- `locked_by` pointing to a dead worker +- `locked_at` timestamp from hours ago +- No rows in `pending_inputs` or `pending_events` +- `error = NULL` (because the error save itself crashed) + +From the outside, the workflow looks like it is waiting for user input — but nothing can unblock it. The dashboard displays it as `Waiting` indefinitely. The stale-lock recovery eventually clears the lock but doesn't transition the status. + +### Root cause + +The workflow state machine assumes every `:waiting` status corresponds to a `PendingInput` or `PendingEvent` row. There is no integrity check that verifies the invariant. + +### Proposed fix + +Two parts: + +**A. Prevent creation of zombies:** When `mark_failed/3` (or any terminal handler) crashes, the recovery code must still transition the workflow out of `:waiting` / `:running`. Bug #3's fix covers most of this. + +**B. Detect existing zombies:** Extend the stale-job recovery (`lib/durable/queue/stale_job_recovery.ex`) to detect: + +``` +workflow.status = :waiting +AND ( + NOT EXISTS (SELECT 1 FROM pending_inputs + WHERE workflow_id = workflow.id AND status = :pending) + AND NOT EXISTS (SELECT 1 FROM pending_events + WHERE workflow_id = workflow.id AND status = :pending) +) +AND workflow.locked_at < now() - interval '1 stale_lock_timeout' +``` + +Transition these to `:failed` with a diagnostic error: +```elixir +%{ + type: "zombie_detected", + message: "Workflow was in :waiting status with no pending inputs or events; likely crashed during state transition.", + current_step: workflow.current_step, + locked_at: workflow.locked_at +} +``` + +And emit a PubSub event so dashboards can highlight it. 
+ +### Test to add + +```elixir +test "zombie workflow recovery fails stuck :waiting workflows with no pending inputs" do + {:ok, exec} = create_workflow(status: :waiting, locked_at: long_ago()) + # No pending inputs/events inserted + + :ok = StaleJobRecovery.run(config) + + reloaded = Repo.get(config, WorkflowExecution, exec.id) + assert reloaded.status == :failed + assert reloaded.error["type"] == "zombie_detected" +end +``` + +--- + +## Cross-cutting: Developer-facing diagnostics + +Even after fixing these bugs, similar issues will surface in the future. Two small investments pay off: + +### Add compile-time warning when `get_context/1` is called inside a parallel step body + +The DSL macro knows when it's inside a `parallel` block. If `get_context/1` is seen in the AST of a `step` inside `parallel`, emit a compile warning: + +``` +warning: get_context/1 inside a parallel step only sees context copied at spawn time; + mutations made by other parallel siblings are not visible. Pass needed values via + the step's `data` argument instead. +``` + +### Log the full error chain + +When the executor catches and wraps errors, include the original exception as `original_error` so the developer sees both layers: + +```elixir +rescue + e -> + {:error, %{ + type: "parallel_into_error", + message: Exception.message(e), + original_error: inspect(e), # ← new + stacktrace: Exception.format_stacktrace(__STACKTRACE__) + }} +end +``` + +--- + +## Repro steps (end-to-end) + +1. Check out `feat/dashboard` at commit with `examples/phoenix_demo/lib/phoenix_demo/workflows/onboarding_workflow.ex` pre-fix. +2. `cd examples/phoenix_demo && mix seed_workflows` +3. Open `/dashboard`, find the "employee_onboarding" workflow, submit the equipment form. +4. Observe: status flips to `waiting` at step `parallel_NNNN` and stays there. +5. 
Inspect with psql: + ``` + SELECT status, current_step, locked_by, error + FROM durable.workflow_executions + WHERE id = '...'; + ``` + → `waiting`, `parallel_NNNN`, non-null stale lock, NULL error. +6. Find the original crash in server logs: `Protocol.UndefinedError ... not implemented for Tuple ... {:error, %{"message" => "no function clause matching in String.replace/4"}}`. + +## Files to patch (priority order) + +1. `lib/durable/executor.ex` — bugs #1, #2, #3 (parallel context, into pre-serialization, error sanitization) +2. `lib/durable/queue/stale_job_recovery.ex` — bug #4 (zombie detector) +3. `lib/durable/dsl/step.ex` — compile-time warning for `get_context` in parallel steps +4. `test/durable/executor_test.exs` — regression coverage for all four bugs diff --git a/docs/bug-reports/2026-04-13-fix-plan.md b/docs/bug-reports/2026-04-13-fix-plan.md new file mode 100644 index 0000000..ff9523f --- /dev/null +++ b/docs/bug-reports/2026-04-13-fix-plan.md @@ -0,0 +1,689 @@ +# Fix & Test Plan — 2026-04-13 Audit Follow-up + +**Scope:** Fix the 16 issues identified in `2026-04-13-follow-up-audit.md` AND build an aggressive testing strategy that proactively surfaces new ones. + +--- + +# Part 1 — Fixes, grouped into 8 atomic PRs + +Each group is independently deployable. Ordered by severity/blast-radius. + +--- + +## PR 1 — Context persistence + timeout_value sanitization (CRITICAL) + +Addresses **C-1, C-2, H-6**. These are the user-visible "the API doesn't do what it says" bugs. + +### 1.1 Rewrite `save_data_as_context/3` to persist all process-dict writes + +**File:** `lib/durable/executor.ex` (line 1117) + +```elixir +defp save_data_as_context(config, execution, data) do + # Persist everything the step accumulated: data returned from the body + # PLUS every put_context/2 write made during execution. Step return wins + # on key collision because the return is the step's explicit contract. 
+ process_ctx = Process.get(:durable_context, %{}) + + merged = + process_ctx + |> Map.merge(atomize_keys(data)) + |> sanitize_for_json() + + execution |> Ecto.Changeset.change(context: merged) |> Repo.update(config) +end +``` + +**Fixes H-6 automatically:** decision `{:goto, target, new_data}` also routes through `save_data_as_context/3`, so prior context will now persist into the target step. + +Delete the now-unused `merge_orchestration_context/1` and `orchestration_key?/1` (or leave as thin aliases). + +### 1.2 Sanitize `serialize_timeout_value/1` + +**File:** `lib/durable/executor.ex` (around line 1640) + +```elixir +defp serialize_timeout_value(value) do + %{"__value__" => sanitize_for_json(value)} +end +``` + +### 1.3 Update `deserialize_timeout_value/1` contract + +Users now receive sanitized shapes (`["error", :timeout]` instead of `{:error, :timeout}`). Document the shape change. Option: keep a `deserialize` helper that best-effort rehydrates tagged-tuple shapes: + +```elixir +defp deserialize_timeout_value(%{"__value__" => ["ok", v]}), do: {:ok, v} +defp deserialize_timeout_value(%{"__value__" => ["error", v]}), do: {:error, v} +defp deserialize_timeout_value(%{"__value__" => v}), do: v +``` + +This preserves the round-trip ergonomic for the 80% case. 
+ +### 1.4 Tests (must add) + +- `test/durable/context_test.exs`: + - `put_context/2 writes persist to the next sequential step without being in the return map` + - `put_context/2 writes persist across decision {:goto, ...}` + - `put_context/2 writes survive a wait_for_input / resume cycle` + - Step return value wins on key collision with prior put_context +- `test/durable/wait_test.exs`: + - `wait_for_event with timeout_value: {:error, :timeout} round-trips correctly` + - `wait_for_input with timeout_value: {deep, nested, {tuples}} does not crash` + - `wait_for_any with map-valued timeout_value preserves shape` + +--- + +## PR 2 — Serialization sweep at all write boundaries + +Addresses **H-1, H-2, H-3, H-4, H-7**. Pure defensive hardening — apply `Executor.sanitize_for_json/1` at every remaining write path. + +### 2.1 `Durable.Wait.provide_input/4` + +**File:** `lib/durable/wait.ex:547` + +Before persisting user `data`: +```elixir +def provide_input(workflow_id, input_name, data, opts \\ []) do + safe_data = Executor.sanitize_for_json(data) + + with {:ok, pending} <- find_pending_input(config, workflow_id, input_name), + {:ok, completed} <- complete_pending_input(config, pending, safe_data), + {:ok, _} <- Executor.resume_workflow(workflow_id, %{input_name => safe_data}, opts) do + ... +``` + +### 2.2 `Durable.Wait.send_event/4` + +**File:** `lib/durable/wait.ex:575` — same pattern on `payload`. + +### 2.3 `StepRunner.fail_step_execution` + +**File:** `lib/durable/executor/step_runner.ex:292` + +```elixir +defp fail_step_execution(config, step_exec, error, logs, duration_ms) do + safe_error = Executor.sanitize_for_json(error) + ... +end +``` + +### 2.4 `StepRunner.serialize_output` — replace shallow logic with `sanitize_for_json` + +**File:** `lib/durable/executor/step_runner.ex:322` + +Current implementation handles top-level tuples only. 
Replace with: + +```elixir +defp serialize_output(data), do: Executor.sanitize_for_json(data) +``` + +Remove the now-redundant custom tuple-handling code. + +### 2.5 `Executor.resume_workflow/3` argument sanitization + +**File:** `lib/durable/executor.ex:160` + +```elixir +def resume_workflow(workflow_id, additional_context \\ %{}, opts \\ []) do + safe_context = sanitize_for_json(additional_context) + # use safe_context below +``` + +### 2.6 Tests + +- For each of the 5 write boundaries: add a test that pushes a tuple-heavy payload through the API and reloads from the DB to verify the write succeeded AND values can be JSON-encoded. +- Parametric test: generate a list of pathological values (tuples, PIDs, functions, deeply-nested maps with tuples at leaves) and push each through every user-facing write API. Verify no crashes. + +--- + +## PR 3 — Retry state continuity + +Addresses **H-5**. Retries should see accumulated state from prior attempts. + +### 3.1 Preserve process-dict state across retry attempts + +**File:** `lib/durable/executor/step_runner.ex:50-106` + +Change the top-of-function `Process.put(:durable_context, data)` to a merge: + +```elixir +defp execute_with_retry(step, data, workflow_id, attempt, max_attempts, config) do + Context.set_current_step(step.name) + + # On retry, keep prior put_context writes from the previous attempt. + # First attempt: existing context is empty/orchestration-only, so merge is a no-op. + prior_ctx = Process.get(:durable_context, %{}) + Process.put(:durable_context, Map.merge(prior_ctx, atomize_keys(data))) + ... 
+``` + +### 3.2 Persist context between retry attempts (bonus) + +Even better: after a failed attempt that's going to retry, snapshot the process dict to `execution.context` so a worker crash doesn't lose accumulated state: + +```elixir +# In the retry branch of execute_with_retry: +{:ok, exec} = save_data_as_context(config, execution, data) +Process.sleep(backoff_ms) +execute_with_retry(step, data, workflow_id, attempt + 1, max_attempts, config) +``` + +### 3.3 Tests + +- `test/durable/retry_test.exs` (new file): + - `put_context writes survive across retry attempts` + - `retry count accumulated via put_context is visible to successful final attempt` + - `retry preserves arbitrary state (nested maps, lists)` + - `a worker crash between attempts resumes with prior put_context state intact` (needs crash-injection infrastructure; see Part 2) + +--- + +## PR 4 — Transactional state transitions + +Addresses **M-2, M-3, M-5**. Close the gaps where a DB blip between two updates can orphan a workflow. + +### 4.1 PendingInput timeout + workflow resume as one atomic op + +**File:** `lib/durable/wait/timeout_worker.ex:120+` + +Replace the two separate updates with `Ecto.Multi`: + +```elixir +Ecto.Multi.new() +|> Ecto.Multi.update(:input, PendingInput.timeout_changeset(input)) +|> Ecto.Multi.update(:workflow, WorkflowExecution.resume_changeset(workflow, timeout_context)) +|> Repo.transaction(config) +``` + +Add `WorkflowExecution.resume_changeset/2` to the schema module (sets status back to `:pending` + merges context). + +### 4.2 Child completion + parent resume atomically + +**File:** `lib/durable/executor.ex:1339-1356` + +Same pattern: wrap `PendingEvent` update + parent `resume_workflow` in an `Ecto.Multi`. 
+ +### 4.3 Ack retry with backoff + telemetry + +**File:** `lib/durable/queue/adapters/postgres.ex:64` and `lib/durable/queue/poller.ex:289` + +Wrap `ack/2` in a retry helper: + +```elixir +def ack(config, job_id, retries \\ 3) do + case do_ack(config, job_id) do + :ok -> :ok + {:error, reason} when retries > 0 -> + Process.sleep(:rand.uniform(100) * (4 - retries)) + ack(config, job_id, retries - 1) + {:error, reason} = err -> + :telemetry.execute([:durable, :queue, :ack_failed], %{count: 1}, + %{job_id: job_id, reason: reason}) + err + end +end +``` + +### 4.4 Tests + +- `test/durable/wait/timeout_worker_test.exs`: force the workflow update to fail (via Mox or a DB-level constraint) and verify the PendingInput is NOT marked `:timeout` (transactional rollback). +- `test/durable/executor/orchestration_test.exs`: force the parent `resume_workflow` to fail and verify `PendingEvent` is NOT marked `:received`. +- `test/durable/queue/adapters/postgres_test.exs`: `ack/2` retries transient failures; `:ack_failed` telemetry fires after exhaustion. + +--- + +## PR 5 — Extended zombie detection + +Addresses **M-1**. `:compensating` can also zombie. + +### 5.1 Broaden `recover_zombie_workflows/2` + +**File:** `lib/durable/queue/adapters/postgres.ex` + +Extend the existing zombie query to also catch `:compensating` workflows with no active compensation step running: + +```elixir +zombie_query = + from(w in WorkflowExecution, + as: :workflow, + where: w.status in [:waiting, :compensating], + where: w.updated_at < ^cutoff, + ... + where: + w.status != :compensating or + not exists( + from(s in StepExecution, + where: s.workflow_id == parent_as(:workflow).id + and s.status == :running + ) + ), + ... 
+``` + +### 5.2 Tests + +- New tests in `test/durable/queue/adapters/postgres_test.exs`: + - `:compensating workflow with no running compensation step gets failed` + - `:compensating workflow with an in-flight compensation step is preserved` + +--- + +## PR 6 — Ergonomic improvements + +Addresses **M-4, M-6**. + +### 6.1 WaitGroup per-event timeout status + +**File:** `lib/durable/wait/timeout_worker.ex:218-265` + +When resuming a timed-out `wait_for_all`/`wait_for_any`, include per-event status: + +```elixir +resume_context = %{ + event_name => %{ + status: :timeout, # or :received + results: per_event_map, # %{event_name => %{status:, value:}} + partial: received_count < expected_count + } +} +``` + +Log a warning if a late event arrives after the group is `:timeout`. + +### 6.2 LogCapture warns on stringified metadata + +**File:** `lib/durable/log_capture.ex:261-269` + +```elixir +defp serialize_value(value) when not is_safe_json(value) do + Logger.warning("[Durable.LogCapture] non-JSON-safe metadata stringified: #{inspect(value)}") + inspect(value) +end +``` + +(Rate-limit the warning so a hot loop doesn't spam.) + +### 6.3 Tests + +- Test late-arriving events are handled gracefully +- Test the warning fires once per step when bad metadata is seen + +--- + +## PR 7 — Scheduler resilience + +Addresses **L-1**. + +### 7.1 `ScheduledWorkflow` fields + +Add migration: +```elixir +add :last_error, :text +add :last_error_at, :utc_datetime_usec +add :consecutive_failures, :integer, default: 0 +add :auto_disabled_at, :utc_datetime_usec +``` + +### 7.2 `scheduler.ex` logic + +On module-load failure: +```elixir +case parse_module(schedule.workflow_module) do + {:error, reason} -> + increment_failure_count(schedule, reason) + if schedule.consecutive_failures >= 5, do: auto_disable(schedule) + {:ok, module} -> ... 
+end +``` + +### 7.3 Tests + +- `test/durable/scheduler_test.exs`: + - Failure increments counter, updates `last_error_at` + - 5 consecutive failures auto-disables the schedule + - A successful run after a failure resets the counter + +--- + +## PR 8 — Documentation + cancel cascade + +Addresses **L-2, L-3, L-4, L-5**. Pure docs + one defensive log. + +### 8.1 Orphan completion warning + +**File:** `lib/durable/executor.ex` (child → parent notification path) + +```elixir +if parent.status != :waiting do + Logger.warning( + "[Durable] child completion arrived for non-waiting parent — " <> + "parent_id=#{parent.id} parent_status=#{parent.status} " <> + "result=#{inspect(result, limit: 200)}" + ) +end +``` + +### 8.2 Doc updates + +- `lib/durable/orchestration.ex`: document that `call_workflow` returns the child's full context (not just explicit outputs). Offer a workaround pattern. +- `lib/durable/wait.ex`: document that step bodies containing `wait_for_*` re-execute from the top on resume — side effects before the wait must be idempotent. +- `lib/durable/dsl/step.ex`: compile-time note for future `each/foreach` macro — iterations must be isolated. + +--- + +# Part 2 — Aggressive testing strategy + +The current test suite has **332 tests** that exercise happy paths and a handful of named error scenarios. It does not systematically stress the execution engine. Below is a plan to add **aggressive, property-based, and chaos-style tests** that proactively surface bugs like the 4 + 16 we already found. + +The goal: make it hard to introduce a new bug in this class without a red test. 
+ +--- + +## Test strategy A — Property-based serialization fuzzing + +### Tool + +Add `stream_data` to `mix.exs`: +```elixir +{:stream_data, "~> 1.0", only: [:test, :dev]} +``` + +### A.1 A generator for "pathological user data" + +**File:** `test/support/generators.ex` (new) + +```elixir +defmodule Durable.Test.Generators do + import StreamData + + @doc "Generates any Elixir term, including JSON-hostile shapes." + def any_term do + sized(fn size -> + frequency([ + {5, atom(:alphanumeric)}, + {5, integer()}, + {5, binary()}, + {3, one_of([constant(nil), boolean(), float()])}, + {2, map_of(atom(:alphanumeric), any_term(), max_length: size)}, + {2, list_of(any_term(), max_length: size)}, + # JSON-hostile leaves: + {1, tuple({atom(:alphanumeric), any_term()})}, # tagged tuples + {1, reference_gen()}, + {1, pid_gen()}, + {1, function_gen()} + ]) + end) + end + + def reference_gen, do: constant(make_ref()) + def pid_gen, do: constant(self()) + def function_gen, do: constant(&Enum.map/2) +end +``` + +### A.2 Property tests for every JSONB write boundary + +**File:** `test/durable/sanitization_property_test.exs` (new) + +```elixir +use ExUnitProperties + +property "sanitize_for_json/1 output is always Jason-encodable" do + check all term <- Durable.Test.Generators.any_term(), max_runs: 500 do + sanitized = Durable.Executor.sanitize_for_json(term) + assert {:ok, _} = Jason.encode(sanitized) + end +end + +property "save_data_as_context survives any step return value" do + check all return_value <- map_of(atom(:alphanumeric), any_term()), + max_runs: 200 do + {:ok, exec} = start_workflow_returning(return_value) + assert exec.status in [:completed, :waiting] + assert {:ok, _} = Jason.encode(exec.context) + end +end + +property "provide_input accepts any user-supplied data" do + check all data <- any_term(), max_runs: 200 do + {:ok, wf} = start_workflow_waiting_for_input() + assert :ok = Durable.provide_input(wf.id, "in", data) + end +end + +# Repeat for: send_event, 
wait_for_* timeout_value, parallel into: return value, +# step return value, decision data, compensation error, retry error +``` + +### Why this matters + +Every time we add a new JSONB write path, CI will catch it if it doesn't sanitize. + +--- + +## Test strategy B — State-machine exhaustiveness + +### B.1 Encode the legal state machine as a spec + +**File:** `test/support/state_machine.ex` (new) + +```elixir +defmodule Durable.Test.StateMachine do + @workflow_statuses ~w(pending running waiting completed failed cancelled + compensating compensated compensation_failed)a + + @legal_transitions %{ + pending: [:running, :cancelled], + running: [:waiting, :completed, :failed, :compensating, :cancelled], + waiting: [:pending, :failed, :cancelled], # pending = resumed + compensating: [:compensated, :compensation_failed, :failed], + completed: [], # terminal + failed: [:compensating, :compensated], + cancelled: [], # terminal + compensated: [], + compensation_failed: [] + } + + def legal?(from, to), do: to in Map.get(@legal_transitions, from, []) +end +``` + +### B.2 Property test: every transition observed in a test fixture is legal + +For every workflow in every test, collect the sequence of status transitions and assert each one is in `legal_transitions`. This surfaces accidentally-permissive code paths. 
+ +```elixir +property "every workflow only transitions through legal states" do + check all workflow_module <- member_of([SimpleWorkflow, BranchedWorkflow, ...]), + max_runs: 50 do + transitions = capture_transitions(workflow_module, %{}) + for {from, to} <- Enum.zip(transitions, tl(transitions)) do + assert Durable.Test.StateMachine.legal?(from, to), + "illegal transition: #{from} -> #{to}" + end + end +end +``` + +### B.3 Invariant tests + +Check invariants that should hold at all times: + +```elixir +test "invariant: locked_by nil implies locked_at nil" do + run_random_workflows(50) + Repo.all(WorkflowExecution) + |> Enum.each(fn w -> + assert is_nil(w.locked_by) == is_nil(w.locked_at) + end) +end + +test "invariant: :waiting implies pending_input or pending_event exists (after stale timeout)" do + run_random_workflows(50) + # ... wait past the stale timeout ... + :ok = StaleJobRecovery.recover_now(Durable) + + # Now no :waiting workflow should exist without a pending input/event. + stuck = Repo.all(from w in WorkflowExecution, where: w.status == :waiting) + for w <- stuck do + inputs = Repo.exists?(from p in PendingInput, where: p.workflow_id == ^w.id and p.status == :pending) + events = Repo.exists?(from e in PendingEvent, where: e.workflow_id == ^w.id and e.status == :pending) + assert inputs or events, "zombie survived recovery: #{w.id}" + end +end + +test "invariant: every step_execution has an existing workflow_id" do + ... +end + +test "invariant: completed_at implies status in terminal set" do + ... 
+end +``` + +--- + +## Test strategy C — Chaos / crash injection + +### C.1 Crash hooks in the executor + +Add an opt-in mechanism for tests to inject failures: + +**File:** `lib/durable/executor.ex` (add private function) + +```elixir +if Mix.env() == :test do + defp maybe_inject_fault(tag) do + case Process.get({:durable_fault, tag}) do + nil -> :ok + :raise -> raise "injected fault at #{tag}" + :throw -> throw(:injected) + :exit -> exit(:injected) + end + end +else + defp maybe_inject_fault(_), do: :ok +end +``` + +Sprinkle `maybe_inject_fault(:after_save_context)`, `:before_mark_completed`, `:before_ack`, etc. at every state transition point. + +### C.2 Chaos test suite + +**File:** `test/durable/chaos_test.exs` (new) + +```elixir +for tag <- [:after_save_context, :before_mark_completed, :before_ack, + :between_parallel_spawn, :during_compensation, ...] do + test "workflow recovers from fault at #{tag}" do + Process.put({:durable_fault, unquote(tag)}, :raise) + + {:ok, id} = Durable.start(SimpleWorkflow, %{}) + + # Let the stale-lock recovery kick in + :ok = StaleJobRecovery.recover_now(Durable, stale_lock_timeout: 1) + Process.delete({:durable_fault, unquote(tag)}) + + # Wait for completion + wait_for_terminal_state(id, 5_000) + + exec = Durable.Query.get_execution(id) + assert exec.status in [:completed, :failed] + refute is_nil(exec.error) and exec.status == :failed # error recorded + end +end +``` + +This catches the class of bug where a crash mid-transition leaves the system in an unrecoverable state. + +--- + +## Test strategy D — Concurrency / race tests + +### D.1 Parallel spawn race + +Start 100 workflows simultaneously with parallel blocks. Verify all reach terminal states within 30 seconds. Verify no duplicate child executions. Verify no stale locks. + +### D.2 Ack-vs-recovery race + +Simulate a worker that finishes a job but `ack` fails. Verify stale-lock recovery picks it up. Verify the job doesn't execute twice (requires idempotency key). 
+ +### D.3 Event-vs-timeout race + +`wait_for_event` with 1s timeout. In parallel: fire the event at t=1000ms. Verify exactly one of `{:received, :timeout}` wins and the other is logged as late. + +### D.4 Cancel-vs-completion race + +Start a workflow, wait for it to be `:running`, issue `cancel/1` and let the step complete in parallel. Verify the final status is deterministic (likely `:cancelled` wins, but verify no orphaned step executions). + +### D.5 Tool + +Use `use ExUnit.Case, async: false` plus `Task.async_stream` over N workflows. Assert aggregate invariants. + +--- + +## Test strategy E — Continuous exercise + +### E.1 Long-running soak test + +**File:** `test/durable/soak_test.exs` (opt-in via `@moduletag :soak`) + +Run 10,000 random workflows over 5 minutes. Assert: +- No zombies remain at the end +- All workflows reach a terminal state +- No stale locks older than the timeout +- No orphan step executions +- Memory and DB row count stay bounded + +Not run in CI; run nightly or before releases. + +### E.2 Phoenix demo as an E2E integration + +The existing `mix seed_workflows` task runs a representative mix of workflows. Wrap it in a test that asserts: +- Every seeded workflow either completes, fails cleanly, or correctly awaits input +- The dashboard's `get_metrics` doesn't crash on any realistic payload + +--- + +## Test strategy F — Regression gate + +### F.1 Add each bug report's reproduction as a test + +Every item in the bug-report docs gets a named test. Name format: `bug_NNNN_<slug>_test.exs`. This is already partially done for the 4 original bugs; extend to the 16 new ones as their fixes ship. + +### F.2 Credo + dialyzer gates + +Already running `credo --strict` in CI. Add `mix dialyzer` as a gate — it catches the class of type-mismatch that produces tuple-in-a-map issues at compile time.
+ +--- + +# Priority & sequencing + +| PR | Fixes | Risk | Effort | +|----|-------|------|--------| +| 1 | C-1, C-2, H-6 | High (semantics change to `save_data_as_context`) | 1-2 days | +| 2 | H-1, H-2, H-3, H-4, H-7 | Low (pure defensive) | 0.5 day | +| 3 | H-5 | Medium | 0.5 day | +| 4 | M-2, M-3, M-5 | Medium (Ecto.Multi refactors) | 1 day | +| 5 | M-1 | Low | 0.25 day | +| 6 | M-4, M-6 | Low | 0.5 day | +| 7 | L-1 | Low (migration) | 0.5 day | +| 8 | L-2, L-3, L-4, L-5 | Trivial | 0.25 day | + +| Test strategy | When | +|---|---| +| A — Property fuzzing | Ship with PR 2 | +| B — State machine | Ship with PR 5 | +| C — Chaos injection | Ship with PR 4 | +| D — Concurrency | Ship as a separate PR after PR 4 | +| E — Soak | Nightly / pre-release, not CI-blocking | +| F — Regression gate | Ongoing discipline; each PR adds named tests | + +Total estimated effort for fixes: **~1 week**. Total for tests: **~3-4 days** additional. Test strategy A alone would have caught C-2, H-1, H-2 before they made it to production. + +# Summary + +The recurring pattern in these bugs is **user data crossing an unvalidated persistence boundary**. The fix template is: sanitize + transact + recover. The testing strategy enforces that pattern with property tests, state-machine invariants, and crash injection. + +With this plan fully executed, a new bug of the same class should be impossible to land without a CI failure. diff --git a/docs/bug-reports/2026-04-13-follow-up-audit.md b/docs/bug-reports/2026-04-13-follow-up-audit.md new file mode 100644 index 0000000..062e427 --- /dev/null +++ b/docs/bug-reports/2026-04-13-follow-up-audit.md @@ -0,0 +1,210 @@ +# Follow-up Audit: Serialization, State-Machine, and Context Footguns + +**Date:** 2026-04-13 +**Scope:** Systematic sweep after fixing the 4 bugs in `2026-04-12-parallel-context-and-serialization.md`. Three audit axes: (A) serialization write-paths, (B) state-machine dead-ends, (C) context-flow surprises.
+ +## TL;DR + +**16 additional issues found**, clustered around the same 4 root causes (silent serialization crashes, state dead-ends, unexpected context loss, missing recovery paths). **2 are critical enough to warrant immediate fixes.** Full severity-ordered list below. + +--- + +## 🔴 CRITICAL — must fix next + +### C-1. `put_context/2` silently loses data between steps + +**Status:** ✅ Fixed (2026-04-13). `save_data_as_context/3` now merges the full process-dict into the persisted context; step return wins on collision. `handle_decision_result/8` also passes the merged `exec.context` forward instead of the goto's raw new_data, so context survives across `{:goto, ...}` boundaries (closes H-6 too). + +**Files:** `lib/durable/executor.ex:1117-1141` (`save_data_as_context/3`), `lib/durable/context.ex` + +`put_context(:foo, "bar")` in one step is **not** visible to the next step unless the user ALSO returns `:foo` in their step output: + +```elixir +step :s1, fn data -> + put_context(:charge_id, "ch_1") # ← silently discarded at save time + {:ok, %{completed: true}} # ← only this map is persisted +end + +step :s2, fn data -> + get_context(:charge_id) # ← returns nil! +end +``` + +`save_data_as_context/3` only merges **orchestration keys** (`__child:*`, `__fire_forget:*`) from the process dict. Every other user-written key is dropped. + +This is the single biggest usability bug in the library. It makes the `put_context` / `get_context` API a footgun: it *looks* like it persists state, but only works if the user also threads the data through return values — at which point you might as well just use `data[:foo]`. + +**Evidence it's already hurting the docs:** `examples/phoenix_demo/lib/phoenix_demo/workflows/document_workflow.ex` uses `put_context` heavily, and "works" only because the user also returned those same keys in `{:ok, %{filename: …, path: …, …}}`. A developer copying the pattern without including the keys in the return would silently lose state. 
+ +**Fix:** In `save_data_as_context/3`, merge ALL user keys from the process dict (not just orchestration keys) into `data` before persisting: + +```elixir +defp save_data_as_context(config, execution, data) do + process_ctx = Process.get(:durable_context, %{}) + merged = + process_ctx + |> Map.merge(atomize_keys(data)) # step return wins over prior writes + |> sanitize_for_json() + + execution |> Ecto.Changeset.change(context: merged) |> Repo.update(config) +end +``` + +The step's return value takes precedence over `put_context` writes on key collision, which matches user intuition. + +### C-2. `:timeout_value` in `wait_for_*` crashes JSONB storage + +**Status:** ✅ Fixed (2026-04-13). `serialize_timeout_value/1` now routes both the `is_map(value)` and the catch-all `__value__` paths through `sanitize_for_json/1`. Tuples become lists, PIDs/refs become inspect strings, atoms still go through the `__atom__` round-trip helper. + +**Files:** `lib/durable/executor.ex:1520, 1551, 1590, 1607` (`serialize_timeout_value/1`) + +The entire `wait_for_event / wait_for_input / wait_for_any / wait_for_all / call_workflow` family accepts a `:timeout_value` option. Users naturally pass idiomatic Elixir tuples: + +```elixir +wait_for_approval("manager_ok", timeout: hours(3), timeout_value: {:error, :timeout}) +wait_for_input("feedback", timeout_value: {:ok, :auto_accepted}) +``` + +`serialize_timeout_value/1` wraps the value in `%{"__value__" => value}` but doesn't run `sanitize_for_json/1` on it. The tuple leaks straight into `PendingInput.timeout_value` / `PendingEvent.timeout_value` (both `:map` fields) and crashes `Repo.insert` at setup time — before the workflow even enters `:waiting`. 
+ +**Fix:** Pipe through the sanitizer: + +```elixir +defp serialize_timeout_value(value), do: %{"__value__" => sanitize_for_json(value)} +``` + +On the deserialize side (`wait/timeout_worker.ex:128-131`), the inverse isn't needed — tuples become lists via the sanitizer, and user code in `wait_for_*` returns values already pattern-matchable as `["error", :timeout]` (or we change the contract to always return sanitized shapes). + +--- + +## 🟠 HIGH — fix in next batch + +### H-1. `provide_input/3` writes user-supplied data without sanitization +**Status:** ✅ Fixed (2026-04-13) in PR 2. `Durable.Wait.provide_input/4` now sanitizes `data` once at the API boundary; both the `PendingInput.response` write and the downstream `resume_workflow/3` call see a JSONB-safe payload. +**File:** `lib/durable/wait.ex:547` + +### H-2. `send_event/3` writes user payload without sanitization +**Status:** ✅ Fixed (2026-04-13) in PR 2. Same pattern as H-1 applied to `Durable.Wait.send_event/4`. +**File:** `lib/durable/wait.ex:575` + +### H-3. `StepRunner.fail_step_execution` doesn't sanitize the error +**Status:** ✅ Fixed (2026-04-13) in PR 2. `fail_step_execution/5` now runs the error map through `Executor.sanitize_for_json/1` before the changeset. +**File:** `lib/durable/executor/step_runner.ex:292` + +### H-4. `StepRunner.serialize_output` is shallow +**Status:** ✅ Fixed (2026-04-13) in PR 2. The custom case-clause logic is replaced with a recursive call to `Executor.sanitize_for_json/1`, with non-map outputs wrapped under `:value`. Nested tuples in step outputs are now handled at any depth. +**File:** `lib/durable/executor/step_runner.ex:322` + +### H-5. Retries reset `put_context` state +**Status:** ✅ Fixed (2026-04-13) in PR 3. `StepRunner.execute_with_retry/6` now merges the existing process dict instead of overwriting on retry attempts, so prior `put_context` writes survive into subsequent attempts. 
+**File:** `lib/durable/executor/step_runner.ex:50-106` +On retry, `Process.put(:durable_context, data)` (line 56) overwrites any `put_context` writes from the failed attempt. Users expect retry-aware state like: + +```elixir +step :flaky, retry: [max_attempts: 3], fn data -> + count = (get_context(:attempt) || 0) + 1 + put_context(:attempt, count) # ← resets to 1 every retry + ... +end +``` + +**Fix:** At the top of `execute_with_retry/6`, merge `data` with the existing process dict instead of overwriting. Combined with C-1's fix, this gives retries genuine state continuity. + +### H-6. Decision `{:goto, step, new_data}` loses prior context +**Status:** ✅ Fixed (2026-04-13) as part of PR 1. `handle_decision_result/8` now passes the merged `exec.context` (after `save_data_as_context/3`) to the next step instead of the goto's raw `new_data`. Prior `put_context` writes survive the goto. +**File:** `lib/durable/executor.ex:378+` (`handle_decision_result`) +If prior steps did `put_context(:foo, 1)` and a decision returns `{:goto, :later, %{bar: 2}}`, step `:later` sees only `%{bar: 2}` — `foo` is gone. Same root cause as C-1. The fix for C-1 handles this automatically. + +### H-7. `resume_workflow/3` doesn't sanitize `additional_context` +**Status:** ✅ Fixed (2026-04-13) in PR 2. `Executor.resume_workflow/3` sanitizes `additional_context` before merging it into `execution.context`. This is the safety net for any future caller that bypasses the API boundary sanitization. +**File:** `lib/durable/executor.ex:160-171` + +--- + +## 🟡 MEDIUM — address after serialization sweep + +### M-1. No zombie detector for `:compensating` state +**Status:** ✅ Fixed (2026-04-13) in PR 5. `recover_zombie_workflows/2` now also catches `:compensating` workflows past the stale timeout with no running compensation `StepExecution`. Tests assert healthy compensating workflows with a live step are preserved. 
+**File:** `lib/durable/executor.ex:1455-1470` +The zombie detector we built only covers `:waiting`. If a compensation handler crashes mid-rollback (or the save after compensation fails), the workflow stays `:compensating` forever. **Fix:** Extend `recover_zombie_workflows/2` to also detect `:compensating` workflows older than the stale timeout with no active compensation step. + +### M-2. PendingInput timeout can orphan a workflow +**Status:** ✅ Fixed (2026-04-13) in PR 4. `handle_input_timeout/2` now uses `Ecto.Multi` via new helpers `atomic_resume_after_timeout/4` and `atomic_cancel_after_timeout/4`. The PendingInput transition and the parent workflow's `:pending` flip happen in one transaction; a DB blip between the two rolls back both. +**File:** `lib/durable/wait/timeout_worker.ex:120+` +`TimeoutWorker` marks the input `:timeout` and then calls `resume_workflow/3`. If the resume fails (DB blip, crash between the two updates), the input is `:timeout` but the workflow is still `:waiting` — and now the zombie detector won't fire (it requires no-pending-inputs). **Fix:** Wrap the two updates in a transaction, OR set the workflow to `:pending` as part of the same update statement using `Ecto.Multi`. + +### M-3. Child completion with failed parent notification +**Status:** ✅ Fixed (2026-04-13) in PR 4. `notify_orchestration_parent/4` now calls `atomic_fulfill_event_and_resume_parent/4`, which wraps the PendingEvent update and the parent workflow's context merge + `:pending` flip in a single `Ecto.Multi`. Orphan child completions (parent already cancelled/completed) are tolerated with a no-op and logged. +**File:** `lib/durable/executor.ex:1339-1356` +Child updates `PendingEvent` to `:received`, then calls `resume_workflow/3` on parent. If resume fails, parent stays `:waiting` with a consumed event. **Fix:** Use `Ecto.Multi` to couple both updates, OR make resume retry with backoff on transient failures. + +### M-4. 
`WaitGroup` partial completion ambiguity +**Status:** ✅ Fixed (2026-04-13) in PR 6. The resume context now includes `:__wait_group_status__ => %{event_name => %{"status" => "received"|"timeout", "value" => ...}}`. The whole timeout + parent-resume is also wrapped in an `Ecto.Multi` so partial/lost events can't desync. +**File:** `lib/durable/wait/timeout_worker.ex:218-265` (`handle_wait_group_timeout`) +On `wait_for_all` timeout with 3 of 5 events received, the resume payload can't distinguish "4th timed out" from "5th still arriving." Late-arriving events after timeout are silently dropped. **Fix:** Include per-event status in the resume payload; log a warning when a received event finds its group already `:timeout`. + +### M-5. `ack` failure causes silent re-execution +**Status:** ✅ Fixed (2026-04-13) in PR 4. `Durable.Queue.Adapters.Postgres.ack/2` now retries transient failures up to 3 times with randomized backoff and fires `[:durable, :queue, :ack_failed]` telemetry on exhaustion. A previously-silent `Repo.update` error would have resulted in stale-recovery re-executing the job 5 minutes later; now operators see telemetry before that window. +**File:** `lib/durable/queue/adapters/postgres.ex:64` +If the ack `Repo.update` fails after a workflow succeeds, the lock persists until the 300s stale-lock recovery kicks in — which then re-enqueues the already-done workflow. No idempotency key means it runs again. **Fix:** Retry ack with exponential backoff; add an `ack_failed` counter in telemetry so operators see silent re-executions. + +### M-6. `LogCapture` silently drops unencodable metadata +**Status:** ✅ Fixed (2026-04-13) in PR 6. `serialize_value/1` now emits a rate-limited (once per step) warning when metadata is stringified via `inspect/1`, pointing devs at the non-JSON-safe value. +**File:** `lib/durable/log_capture.ex:261-269` +`serialize_value/1` converts tuples/PIDs/funs to `inspect/1` strings. 
No crash, but the debugging signal is degraded — devs see `"#PID<0.123.0>"` instead of the structured data they logged. **Fix:** Emit a one-time warning per step when metadata was stringified, so devs know to structure their logs. + +--- + +## 🟢 LOW / DOCS + +### L-1. Scheduler silently fails on module reload +**Status:** ✅ Fixed (2026-04-13) in PR 7. Migration `V20260413000000AddSchedulerResilience` adds `last_error`, `last_error_at`, `consecutive_failures`, `auto_disabled_at` fields to `scheduled_workflows`. New `ScheduledWorkflow.failure_changeset/3` and `success_changeset/3` track state. `Scheduler.execute_schedule/2` advances `next_run_at` on module load failure and auto-disables after 5 consecutive failures. +**File:** `lib/durable/scheduler.ex:162+` +If a `ScheduledWorkflow` references a module that no longer exists (hot code reload removed it), the scheduler logs and returns without updating `next_run_at`. On the next poll it finds the same schedule, logs again, returns again — a noisy loop with no actual work. **Fix:** Add `:last_error` + `:last_error_at` fields to `ScheduledWorkflow`; increment a failure counter; auto-disable after N consecutive failures. + +### L-2. `cancel_workflow/1` doesn't synchronously cancel running children +**Status:** ✅ Fixed (2026-04-13) in PR 4 / PR 8. `notify_orchestration_parent/4` now emits a `Logger.warning` when a child completion arrives for a non-waiting parent, making orphan completions visible. Full cascade cancellation semantics are documented as future work. +**File:** `lib/durable/executor.ex:1389+` +Parent cancellation marks children `:cancelled` in `[:pending, :running, :waiting]` states, but if a child is mid-step-execution on another worker, the step runs to completion. Then child completion tries to notify parent, finds parent `:cancelled`, silently returns `:ok` — result is lost. 
**Fix:** Log a warning when a completion notification finds a non-waiting parent; consider a dead-letter queue for these orphan completions. + +### L-3. `call_workflow` exposes child's full context to parent +**Status:** ✅ Fixed (2026-04-13) in PR 8. `@doc` on `Durable.Orchestration.call_workflow/3` now explicitly documents that the success payload is the child's full context; shows a pattern for filtering to explicit outputs. +**File:** `lib/durable/orchestration.ex:226-230` +`build_result_payload` returns `%{result: child.context}` — the entire child context becomes a key in the parent's data flow. This is a minor info-leak surprise (parent sees all child state, not just outputs). **Fix:** Document; consider adding a `returns:` option to filter what propagates. + +### L-4. `wait_for_input` resumption re-runs the whole step body +**Status:** ✅ Fixed (2026-04-13) in PR 8. `@doc` on `Durable.Wait.wait_for_event/2` now includes a "Resumption semantics" section explaining that the step body re-executes from the top and that side effects before the wait must be idempotent. Shows both the anti-pattern and the fix (split into two steps). +**File:** Step bodies contain `wait_for_input` as a thrown marker +When resumed, the step body re-executes from the top. Side effects before `wait_for_input` run twice. Users often don't realize this. **Fix:** Document loudly; `wait_for_*` primitives are resumption barriers, and anything before them must be idempotent. + +### L-5. No `each/foreach` macro yet — design it with isolation in mind +**Status:** ✅ Noted (2026-04-13) in PR 8. `Durable.DSL.Step` module docstring now includes a "Future work" section calling out that iteration primitives must be designed with context isolation per iteration to avoid `put_context` races across concurrent iterations. +**Files:** `lib/durable/dsl/*.ex` +No iteration primitive exists today. 
If/when added: design iterations with isolated process dict contexts (like parallel) to avoid race conditions between iterations. + +--- + +## Recommended fix order + +1. **Immediate:** C-1 and C-2 (critical data loss and instant crashes) +2. **Serialization sweep (single PR):** H-1 through H-7 — apply `sanitize_for_json/1` at all write boundaries, plus retry/goto context-merging +3. **Transactional safety (separate PR):** M-2 and M-3 via `Ecto.Multi`; M-5 via ack retry with backoff +4. **Extended zombie detection (separate PR):** M-1 +5. **Ergonomic improvements (doc + small tweaks):** M-4, M-6, all L-* + +## Testing strategy + +Each fix should add a regression test in the existing test file: +- C-1 → new tests in `test/durable/context_test.exs` for cross-step persistence +- C-2 → new test in `test/durable/wait_test.exs` for tuple `timeout_value` +- H-1/H-2 → tests in `test/durable/wait_test.exs` +- H-3/H-4 → tests in `test/durable/step_runner_test.exs` +- H-5 → new test in `test/durable/retry_test.exs` for context continuity across retries +- H-6 → new test in `test/durable/decision_test.exs` +- M-* → integration tests simulating crashes between state transitions + +## Risk assessment + +The **biggest exposure is C-1**. Right now the library's documented `put_context`/`get_context` API doesn't do what it says it does. Every workflow that uses it works only because users also return the keys they care about. A new user following the documentation pattern without that workaround will silently lose state. + +The **biggest live crash risk is C-2**. It triggers on very idiomatic usage (`timeout_value: {:error, :timeout}`). Until fixed, this is a footgun waiting to catch anyone trying the `wait_for_*` family with typical Elixir conventions. 
diff --git a/.credo.exs b/durable/.credo.exs similarity index 100% rename from .credo.exs rename to durable/.credo.exs diff --git a/.formatter.exs b/durable/.formatter.exs similarity index 100% rename from .formatter.exs rename to durable/.formatter.exs diff --git a/durable/.gitignore b/durable/.gitignore new file mode 100644 index 0000000..e949aa8 --- /dev/null +++ b/durable/.gitignore @@ -0,0 +1,27 @@ +# The directory Mix will write compiled artifacts to. +/_build/ + +# If you run "mix test --cover", coverage assets end up here. +/cover/ + +# The directory Mix downloads your dependencies sources to. +/deps/ + +# Where third-party dependencies like ExDoc output generated docs. +/doc/ + +# Temporary files, for example, from tests. +/tmp/ + +# If the VM crashes, it generates a dump, let's ignore it too. +erl_crash.dump + +# Also ignore archive artifacts (built via "mix archive.build"). +*.ez + +# Ignore package tarball (built via "mix hex.build"). +durable-*.tar + +# JS package manager +node_modules/ +package-lock.json diff --git a/durable/LICENSE b/durable/LICENSE new file mode 100644 index 0000000..c927461 --- /dev/null +++ b/durable/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 WaveZync + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/durable/README.md b/durable/README.md new file mode 100644 index 0000000..54b87e1 --- /dev/null +++ b/durable/README.md @@ -0,0 +1,542 @@ +# Durable + +[![Build Status](https://github.com/wavezync/durable/actions/workflows/ci.yml/badge.svg)](https://github.com/wavezync/durable/actions/workflows/ci.yml) +[![Hex.pm](https://img.shields.io/hexpm/v/durable.svg)](https://hex.pm/packages/durable) + +A durable, resumable workflow engine for Elixir. Similar to Temporal/Inngest. + +## Features + +- **Pipeline Model** - Context flows from step to step, simple and explicit +- **Resumability** - Sleep, wait for events, wait for human input +- **Branching** - Pattern-matched conditional flow control +- **Parallel** - Run steps concurrently with result collection +- **Compensations** - Saga pattern with automatic rollback +- **Cron Scheduling** - Recurring workflows with cron expressions +- **Reliability** - Automatic retries with exponential/linear/constant backoff +- **Orchestration** - Parent/child workflow composition +- **Persistence** - PostgreSQL-backed execution state + +## Installation + +```elixir +def deps do + [{:durable, "~> 0.0.0-alpha"}] +end +``` + +## Quick Start + +### 1. 
Create Migration + +```elixir +defmodule MyApp.Repo.Migrations.AddDurable do + use Ecto.Migration + def up, do: Durable.Migration.up() + def down, do: Durable.Migration.down() +end +``` + +When a Durable upgrade ships new internal migrations, generate a new wrapper +migration and run your normal Ecto migration flow: + +```bash +mix durable.gen.upgrade -r MyApp.Repo +mix ecto.migrate +``` + +Use `mix durable.migrations -r MyApp.Repo --check` in CI or deploy gates to +fail when the database is behind the Durable library version. + +### 2. Add to Supervision Tree + +```elixir +children = [ + MyApp.Repo, + {Durable, repo: MyApp.Repo, queues: %{default: [concurrency: 10]}} +] +``` + +### 3. Define & Run + +```elixir +defmodule MyApp.OrderWorkflow do + use Durable + use Durable.Helpers + + workflow "process_order", timeout: hours(2) do + # First step receives workflow input + step :validate, fn input -> + {:ok, %{ + order_id: input["id"], + items: input["items"], + customer_id: input["customer_id"] + }} + end + + # Each step receives previous step's output as context + step :calculate_total, fn ctx -> + total = ctx.items |> Enum.map(& &1["price"]) |> Enum.sum() + {:ok, assign(ctx, :total, total)} + end + + step :charge_payment, [retry: [max_attempts: 3, backoff: :exponential]], fn ctx -> + {:ok, charge} = PaymentService.charge(ctx.order_id, ctx.total) + {:ok, assign(ctx, :charge_id, charge.id)} + end + + step :send_confirmation, fn ctx -> + EmailService.send_confirmation(ctx.order_id) + {:ok, ctx} + end + end +end + +# Start it +{:ok, id} = Durable.start(MyApp.OrderWorkflow, %{"id" => "order_123", "items" => items}) +``` + +## Examples + +### Approval Workflow + +Wait for human approval with timeout fallback. 
+ +```elixir +defmodule MyApp.ExpenseApproval do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "expense_approval" do + step :request_approval, fn ctx -> + result = wait_for_approval("manager", + prompt: "Approve $#{ctx["amount"]} expense?", + timeout: days(3), + timeout_value: :auto_rejected + ) + {:ok, assign(ctx, :decision, result)} + end + + branch on: fn ctx -> ctx.decision end do + :approved -> + step :process, fn ctx -> + Expenses.reimburse(ctx["employee_id"], ctx["amount"]) + {:ok, assign(ctx, :status, :reimbursed)} + end + + _ -> + step :notify_rejection, fn ctx -> + Mailer.send_rejection(ctx["employee_id"]) + {:ok, assign(ctx, :status, :rejected)} + end + end + end +end + +# Approve externally +Durable.provide_input(workflow_id, "manager", :approved) +``` + +### Parallel Data Fetch + +Fetch data concurrently, then combine results. + +```elixir +defmodule MyApp.DashboardBuilder do + use Durable + use Durable.Helpers + + workflow "build_dashboard" do + step :init, fn input -> + {:ok, %{user_id: input["user_id"]}} + end + + # Parallel steps produce results in __results__ map + parallel do + step :user, fn ctx -> + {:ok, %{user: Users.get(ctx.user_id)}} + end + + step :orders, fn ctx -> + {:ok, %{orders: Orders.recent(ctx.user_id)}} + end + + step :notifications, fn ctx -> + {:ok, %{notifs: Notifications.unread(ctx.user_id)}} + end + end + + # Access results from __results__ map + step :render, fn ctx -> + results = ctx[:__results__] + + # Results are tagged tuples: ["ok", data] or ["error", reason] + user = case results["user"] do + ["ok", data] -> data.user + _ -> nil + end + + orders = case results["orders"] do + ["ok", data] -> data.orders + _ -> [] + end + + notifs = case results["notifications"] do + ["ok", data] -> data.notifs + _ -> [] + end + + dashboard = Dashboard.build(user, orders, notifs) + {:ok, assign(ctx, :dashboard, dashboard)} + end + end +end + +# Or use into: to transform results directly +defmodule 
MyApp.DashboardBuilderWithInto do + use Durable + use Durable.Helpers + + workflow "build_dashboard_v2" do + step :init, fn input -> + {:ok, %{user_id: input["user_id"]}} + end + + parallel into: fn ctx, results -> + # results contains tuples: %{user: {:ok, data}, orders: {:ok, data}, ...} + case {results[:user], results[:orders], results[:notifications]} do + {{:ok, user_data}, {:ok, orders_data}, {:ok, notifs_data}} -> + {:ok, Map.merge(ctx, %{ + user: user_data.user, + orders: orders_data.orders, + notifs: notifs_data.notifs + })} + + _ -> + {:error, "Failed to fetch dashboard data"} + end + end do + step :user, fn ctx -> {:ok, %{user: Users.get(ctx.user_id)}} end + step :orders, fn ctx -> {:ok, %{orders: Orders.recent(ctx.user_id)}} end + step :notifications, fn ctx -> {:ok, %{notifs: Notifications.unread(ctx.user_id)}} end + end + + step :render, fn ctx -> + dashboard = Dashboard.build(ctx.user, ctx.orders, ctx.notifs) + {:ok, assign(ctx, :dashboard, dashboard)} + end + end +end +``` + +### Batch Processing + +Process items with controlled concurrency using `Task.async_stream`. 
+ +```elixir +defmodule MyApp.BulkEmailer do + use Durable + use Durable.Helpers + + workflow "send_campaign" do + step :load, fn input -> + recipients = Subscribers.active(input["campaign_id"]) + {:ok, %{campaign_id: input["campaign_id"], recipients: recipients}} + end + + step :send_emails, fn ctx -> + results = + ctx.recipients + |> Task.async_stream( + fn recipient -> + case Mailer.send_campaign(recipient, ctx.campaign_id) do + :ok -> {:ok, recipient} + {:error, reason} -> {:error, {recipient, reason}} + end + end, + max_concurrency: 10, + timeout: :infinity + ) + |> Enum.map(fn {:ok, r} -> r end) + + sent = for {:ok, _} <- results, do: 1 + failed = for {:error, _} <- results, do: 1 + + {:ok, ctx + |> assign(:sent_count, length(sent)) + |> assign(:failed_count, length(failed))} + end + end +end +``` + +### Trip Booking (Saga) + +Book multiple services with automatic rollback on failure. + +```elixir +defmodule MyApp.TripBooking do + use Durable + use Durable.Helpers + + workflow "book_trip" do + step :book_flight, [compensate: :cancel_flight], fn ctx -> + booking = Flights.book(ctx["flight"]) + {:ok, assign(ctx, :flight, booking)} + end + + step :book_hotel, [compensate: :cancel_hotel], fn ctx -> + booking = Hotels.book(ctx["hotel"]) + {:ok, assign(ctx, :hotel, booking)} + end + + step :charge, fn ctx -> + total = ctx.flight.price + ctx.hotel.price + Payments.charge(ctx["card"], total) + {:ok, assign(ctx, :charged, true)} + end + + compensate :cancel_flight, fn ctx -> + Flights.cancel(ctx.flight.id) + {:ok, ctx} + end + + compensate :cancel_hotel, fn ctx -> + Hotels.cancel(ctx.hotel.id) + {:ok, ctx} + end + end +end +``` + +### Scheduled Reports + +Run daily at 9am. 
+ +```elixir +defmodule MyApp.DailyReport do + use Durable + use Durable.Helpers + use Durable.Scheduler.DSL + + @schedule cron: "0 9 * * *", timezone: "America/New_York" + workflow "daily_sales_report" do + step :generate, fn _input -> + report = Reports.sales_summary(Date.utc_today()) + {:ok, %{report: report}} + end + + step :distribute, fn ctx -> + Mailer.send_report(ctx.report, to: "team@company.com") + Slack.post_summary(ctx.report, channel: "#sales") + {:ok, ctx} + end + end +end + +# Register in supervision tree +{Durable, repo: MyApp.Repo, scheduled_modules: [MyApp.DailyReport]} +``` + +### Delayed & Scheduled Execution + +Sleep, schedule for specific times, and wait for events. + +```elixir +defmodule MyApp.TrialReminder do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "trial_reminder" do + step :welcome, fn ctx -> + Mailer.send_welcome(ctx["user_id"]) + {:ok, %{user_id: ctx["user_id"], trial_started_at: ctx["trial_started_at"]}} + end + + step :wait_3_days, fn ctx -> + sleep(days(3)) + {:ok, ctx} + end + + step :check_in, fn ctx -> + Mailer.send_tips(ctx.user_id) + {:ok, ctx} + end + + step :wait_until_trial_ends, fn ctx -> + trial_end = DateTime.add(ctx.trial_started_at, 14, :day) + schedule_at(trial_end) + {:ok, ctx} + end + + step :convert_or_remind, fn ctx -> + if Subscriptions.active?(ctx.user_id) do + {:ok, assign(ctx, :converted, true)} + else + Mailer.send_upgrade_reminder(ctx.user_id) + {:ok, assign(ctx, :converted, false)} + end + end + end +end +``` + +### Event-Driven Workflow + +Wait for external webhook events. 
+ +```elixir +defmodule MyApp.PaymentFlow do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "payment_flow" do + step :create_invoice, fn ctx -> + invoice = Invoices.create(ctx["order_id"], ctx["amount"]) + {:ok, %{order_id: ctx["order_id"], invoice_id: invoice.id}} + end + + step :await_payment, fn ctx -> + {event, _payload} = wait_for_any(["payment.success", "payment.failed"], + timeout: days(7), + timeout_value: {"payment.expired", nil} + ) + {:ok, assign(ctx, :result, event)} + end + + branch on: fn ctx -> ctx.result end do + "payment.success" -> + step :fulfill, fn ctx -> + Orders.fulfill(ctx.order_id) + {:ok, assign(ctx, :status, :fulfilled)} + end + + _ -> + step :cancel, fn ctx -> + Orders.cancel(ctx.order_id) + {:ok, assign(ctx, :status, :cancelled)} + end + end + end +end + +# Webhook handler sends event +Durable.send_event(workflow_id, "payment.success", %{transaction_id: "txn_123"}) +``` + +## Reference + +### Helper Functions + +```elixir +use Durable.Helpers + +assign(ctx, :key, value) # Set a value +assign(ctx, %{a: 1, b: 2}) # Merge multiple values +update(ctx, :key, default, fn old -> new end) +append(ctx, :list, item) # Append to list +increment(ctx, :count) # Increment by 1 +increment(ctx, :count, 5) # Increment by 5 +``` + +### Time Helpers + +```elixir +seconds(30) # 30_000 ms +minutes(5) # 300_000 ms +hours(2) # 7_200_000 ms +days(7) # 604_800_000 ms +``` + +### Orchestration + +```elixir +use Durable.Orchestration + +# Synchronous: call child and wait for result +case call_workflow(MyApp.PaymentWorkflow, %{"amount" => 100}, timeout: hours(1)) do + {:ok, result} -> {:ok, assign(data, :payment, result)} + {:error, reason} -> {:error, reason} +end + +# Fire-and-forget: start child and continue +{:ok, child_id} = start_workflow(MyApp.EmailWorkflow, %{"to" => email}, ref: :welcome) + +# call_workflow also works inside parallel blocks (executed inline) +parallel do + step :payment, fn data -> + case 
call_workflow(MyApp.PaymentWorkflow, %{"amount" => data.total}, ref: :pay) do + {:ok, result} -> {:ok, assign(data, :payment, result)} + {:error, reason} -> {:error, reason} + end + end + + step :shipping, fn data -> + case call_workflow(MyApp.ShippingWorkflow, %{"id" => data.order_id}, ref: :ship) do + {:ok, result} -> {:ok, assign(data, :shipping, result)} + {:error, reason} -> {:error, reason} + end + end +end +``` + +### API + +```elixir +Durable.start(Module, input) +Durable.start(Module, input, queue: :priority, scheduled_at: datetime) +Durable.get_execution(id) +Durable.list_executions(workflow: Module, status: :running) +Durable.cancel(id, "reason") +Durable.send_event(id, "event", payload) +Durable.provide_input(id, "input_name", data) +Durable.list_children(parent_id) +``` + +## Mix Tasks + +Durable includes mix tasks for managing workflows from the command line. + +```bash +# Show queue status and workflow summary +mix durable.status + +# List workflow executions (with filters) +mix durable.list # all executions +mix durable.list --status running # filter by status +mix durable.list --workflow MyApp.OrderWorkflow # filter by workflow +mix durable.list --limit 20 --format json # limit results, JSON output + +# Start a workflow +mix durable.run MyApp.OrderWorkflow # no input +mix durable.run MyApp.OrderWorkflow --input '{"id": 123}' # with JSON input +mix durable.run MyApp.OrderWorkflow --queue high_priority # specific queue + +# Cancel a workflow (WORKFLOW_ID is the execution id returned by Durable.start/2) +mix durable.cancel WORKFLOW_ID +mix durable.cancel WORKFLOW_ID --reason "no longer needed" + +# Clean up old executions +mix durable.cleanup --older-than 30d # completed/failed older than 30 days +mix durable.cleanup --older-than 7d --status completed # only completed, older than 7 days +mix durable.cleanup --older-than 24h --dry-run # preview what would be deleted +``` + +## Guides + +- [Branching](guides/branching.md) - Conditional flow control +- [Parallel](guides/parallel.md) - Concurrent execution +- 
[Compensations](guides/compensations.md) - Saga pattern +- [Waiting](guides/waiting.md) - Sleep, events, human input +- [Orchestration](guides/orchestration.md) - Parent/child workflow composition + +## Coming Soon + +- Phoenix LiveView dashboard + +## License + +MIT diff --git a/config/config.exs b/durable/config/config.exs similarity index 100% rename from config/config.exs rename to durable/config/config.exs diff --git a/config/dev.exs b/durable/config/dev.exs similarity index 100% rename from config/dev.exs rename to durable/config/dev.exs diff --git a/config/prod.exs b/durable/config/prod.exs similarity index 100% rename from config/prod.exs rename to durable/config/prod.exs diff --git a/config/runtime.exs b/durable/config/runtime.exs similarity index 100% rename from config/runtime.exs rename to durable/config/runtime.exs diff --git a/config/test.exs b/durable/config/test.exs similarity index 100% rename from config/test.exs rename to durable/config/test.exs diff --git a/guides/ai_workflows.md b/durable/guides/ai_workflows.md similarity index 100% rename from guides/ai_workflows.md rename to durable/guides/ai_workflows.md diff --git a/guides/branching.md b/durable/guides/branching.md similarity index 100% rename from guides/branching.md rename to durable/guides/branching.md diff --git a/guides/compensations.md b/durable/guides/compensations.md similarity index 100% rename from guides/compensations.md rename to durable/guides/compensations.md diff --git a/guides/orchestration.md b/durable/guides/orchestration.md similarity index 100% rename from guides/orchestration.md rename to durable/guides/orchestration.md diff --git a/guides/parallel.md b/durable/guides/parallel.md similarity index 100% rename from guides/parallel.md rename to durable/guides/parallel.md diff --git a/guides/waiting.md b/durable/guides/waiting.md similarity index 100% rename from guides/waiting.md rename to durable/guides/waiting.md diff --git a/lib/durable.ex b/durable/lib/durable.ex 
similarity index 100% rename from lib/durable.ex rename to durable/lib/durable.ex diff --git a/lib/durable/application.ex b/durable/lib/durable/application.ex similarity index 100% rename from lib/durable/application.ex rename to durable/lib/durable/application.ex diff --git a/lib/durable/config.ex b/durable/lib/durable/config.ex similarity index 80% rename from lib/durable/config.ex rename to durable/lib/durable/config.ex index 0e70ca9..fbf7f6d 100644 --- a/lib/durable/config.ex +++ b/durable/lib/durable/config.ex @@ -45,7 +45,9 @@ defmodule Durable.Config do heartbeat_interval: pos_integer(), scheduled_modules: [module()], scheduler_interval: pos_integer(), - log_level: false | :debug | :info | :warning | :error + log_level: false | :debug | :info | :warning | :error, + pubsub: atom() | nil, + owns_pubsub?: boolean() } defstruct [ @@ -58,7 +60,9 @@ defmodule Durable.Config do :heartbeat_interval, :scheduled_modules, :scheduler_interval, - :log_level + :log_level, + :pubsub, + owns_pubsub?: false ] @schema [ @@ -111,6 +115,15 @@ defmodule Durable.Config do type: {:in, [false, :debug, :info, :warning, :error]}, default: false, doc: "Log level for Ecto queries (false disables logging, default: false)" + ], + pubsub: [ + type: :atom, + default: nil, + doc: + "Phoenix.PubSub server name for lifecycle broadcasts. " <> + "Pass an atom like `MyApp.PubSub` to reuse a PubSub started by the host app. " <> + "Pass `:start` to have Durable start its own (named after the instance). " <> + "Leave as `nil` to disable broadcasting (default)." ] ] @@ -118,11 +131,19 @@ defmodule Durable.Config do Creates a new validated configuration from options. Returns `{:ok, config}` if valid, `{:error, reason}` otherwise. + + Respects the `:durable` app env key `:disable_queue_processing`. When + set to `true`, it forces `queue_enabled: false` regardless of the + user's opts. Mix tasks set this before booting the host app so they + don't accidentally claim jobs they can't finish. 
""" @spec new(keyword()) :: {:ok, t()} | {:error, NimbleOptions.ValidationError.t()} def new(opts) do + opts = maybe_force_queue_disabled(opts) + case NimbleOptions.validate(opts, @schema) do {:ok, validated} -> + validated = resolve_pubsub(validated) {:ok, struct(__MODULE__, validated)} {:error, %NimbleOptions.ValidationError{}} = error -> @@ -130,6 +151,32 @@ defmodule Durable.Config do end end + defp maybe_force_queue_disabled(opts) do + if Application.get_env(:durable, :disable_queue_processing, false) do + Keyword.put(opts, :queue_enabled, false) + else + opts + end + end + + # Resolve `:pubsub` sentinel values to concrete server names. + # `:start` becomes `Module.concat([name, PubSub])` (the instance `:name`, default `Durable`, plus `PubSub`) so the + # supervisor can start its own PubSub under that name; it also sets the `:owns_pubsub?` flag. + defp resolve_pubsub(opts) do + case Keyword.get(opts, :pubsub) do + :start -> + name = Keyword.get(opts, :name, Durable) + owned = Module.concat([name, PubSub]) + + opts + |> Keyword.put(:pubsub, owned) + |> Keyword.put(:owns_pubsub?, true) + + _ -> + opts + end + end + @doc """ Creates a new validated configuration, raising on error. """ diff --git a/lib/durable/context.ex b/durable/lib/durable/context.ex similarity index 100% rename from lib/durable/context.ex rename to durable/lib/durable/context.ex diff --git a/lib/durable/definition.ex b/durable/lib/durable/definition.ex similarity index 100% rename from lib/durable/definition.ex rename to durable/lib/durable/definition.ex diff --git a/lib/durable/dsl/step.ex b/durable/lib/durable/dsl/step.ex similarity index 91% rename from lib/durable/dsl/step.ex rename to durable/lib/durable/dsl/step.ex index 9ef1347..bcaf72a 100644 --- a/lib/durable/dsl/step.ex +++ b/durable/lib/durable/dsl/step.ex @@ -7,6 +7,14 @@ defmodule Durable.DSL.Step do Data flows from step to step. Each step receives the previous step's output and returns `{:ok, data}` or `{:error, reason}`. 
+ ## Future work — `each/foreach` isolation (L-5) + + If/when this DSL gains an iteration primitive (e.g. `each/3` to run a step + once per item in a list), design with **context isolation per iteration**. + Parallel-style shared context across concurrent iterations creates races on + `put_context/2` writes; isolated contexts avoid that class of bug. See + `docs/bug-reports/2026-04-13-follow-up-audit.md`. + ## Usage workflow "process_order" do @@ -134,7 +142,18 @@ defmodule Durable.DSL.Step do end defp build_decision(name, opts, body_fn) do - normalized_opts = normalize_step_opts(opts) + # Walk the body AST for `{:goto, atom, _}` patterns so the dashboard + # can render the conditional branches in the workflow graph. Best- + # effort: `:goto` calls with computed atoms (`{:goto, var, _}`) are + # not detected and operators can supply an explicit `branches:` + # opt to override. + branches = extract_goto_targets(body_fn) + + normalized_opts = + opts + |> normalize_step_opts() + |> Map.put(:branches, branches) + func_name = :"__decision_body_#{name}__" quote do @@ -151,6 +170,28 @@ defmodule Durable.DSL.Step do end end + # Recognises `{:goto, :atom, _}` *tuple literals* anywhere inside the + # body AST (decision body shapes are typically `cond do … end` / + # `case do … end`). 3-tuples render in AST as + # `{:{}, meta, [:goto, target_atom, _]}`. 2-tuples (rare: + # `{:goto, :atom}`) render as `{:goto, target_atom}` directly. Returns + # unique target atoms in source order. + defp extract_goto_targets(body_ast) do + {_, found} = + Macro.prewalk(body_ast, [], fn + {:{}, _meta, [:goto, target | _]} = node, acc when is_atom(target) -> + {node, [target | acc]} + + {:goto, target} = node, acc when is_atom(target) -> + {node, [target | acc]} + + other, acc -> + {other, acc} + end) + + found |> Enum.reverse() |> Enum.uniq() + end + @doc """ Defines a compensation handler for saga pattern. 
diff --git a/lib/durable/dsl/time_helpers.ex b/durable/lib/durable/dsl/time_helpers.ex similarity index 100% rename from lib/durable/dsl/time_helpers.ex rename to durable/lib/durable/dsl/time_helpers.ex diff --git a/lib/durable/dsl/workflow.ex b/durable/lib/durable/dsl/workflow.ex similarity index 100% rename from lib/durable/dsl/workflow.ex rename to durable/lib/durable/dsl/workflow.ex diff --git a/lib/durable/executor.ex b/durable/lib/durable/executor.ex similarity index 80% rename from lib/durable/executor.ex rename to durable/lib/durable/executor.ex index ce9ef25..29f0ee7 100644 --- a/lib/durable/executor.ex +++ b/durable/lib/durable/executor.ex @@ -14,6 +14,7 @@ defmodule Durable.Executor do alias Durable.Definition.Workflow alias Durable.Executor.CompensationRunner alias Durable.Executor.StepRunner + alias Durable.PubSub, as: DurablePubSub alias Durable.Repo alias Durable.Storage.Schemas.PendingEvent alias Durable.Storage.Schemas.PendingInput @@ -48,6 +49,8 @@ defmodule Durable.Executor do with {:ok, workflow_def} <- get_workflow_definition(module, opts), {:ok, execution} <- create_execution(config, module, workflow_def, input, opts) do + DurablePubSub.broadcast_workflow(config, :workflow_started, workflow_event(execution)) + # For inline/synchronous execution (useful for testing) if Keyword.get(opts, :inline, false) do execute_workflow(execution.id, config) @@ -73,12 +76,15 @@ defmodule Durable.Executor do execution when execution.status in [:pending, :running, :waiting] -> error = if reason, do: %{type: "cancelled", message: reason}, else: %{type: "cancelled"} - execution - |> WorkflowExecution.status_changeset(:cancelled, %{ - error: error, - completed_at: DateTime.utc_now() - }) - |> Repo.update(config) + {:ok, cancelled} = + execution + |> WorkflowExecution.status_changeset(:cancelled, %{ + error: error, + completed_at: DateTime.utc_now() + }) + |> Repo.update(config) + + DurablePubSub.broadcast_workflow(config, :workflow_cancelled, 
workflow_event(cancelled)) # Cascade cancel to child workflows cancel_child_workflows(config, workflow_id) @@ -153,11 +159,14 @@ defmodule Durable.Executor do def resume_workflow(workflow_id, additional_context \\ %{}, opts \\ []) do durable_name = Keyword.get(opts, :durable, Durable) config = Config.get(durable_name) + # Sanitize at the API boundary — callers (provide_input, send_event, + # timeout workers) supply maps that can carry arbitrary user terms. + safe_additional = sanitize_for_json(additional_context) with {:ok, execution} <- load_execution(config, workflow_id), true <- execution.status == :waiting || {:error, :not_waiting} do # Merge additional context - new_context = Map.merge(execution.context || %{}, additional_context) + new_context = Map.merge(execution.context || %{}, safe_additional) execution |> Ecto.Changeset.change( @@ -226,9 +235,16 @@ defmodule Durable.Executor do end defp mark_running(config, execution) do - execution - |> WorkflowExecution.status_changeset(:running, %{started_at: DateTime.utc_now()}) - |> Repo.update(config) + case execution + |> WorkflowExecution.status_changeset(:running, %{started_at: DateTime.utc_now()}) + |> Repo.update(config) do + {:ok, running} = ok -> + DurablePubSub.broadcast_workflow(config, :workflow_resumed, workflow_event(running)) + ok + + other -> + other + end end defp execute_steps(steps, execution, config, initial_data) do @@ -376,7 +392,17 @@ defmodule Durable.Executor do case find_jump_target(target_step, remaining_steps, step.name, step_index) do {:ok, target_steps} -> - execute_steps_recursive(target_steps, exec, step_index, workflow_def, config, data) + # Use the merged exec.context (prior put_context writes + goto data), + # not the goto's raw data — otherwise put_context writes from prior + # steps would silently vanish across the goto boundary. 
+ execute_steps_recursive( + target_steps, + exec, + step_index, + workflow_def, + config, + exec.context + ) {:error, reason} -> handle_step_failure( @@ -751,6 +777,12 @@ defmodule Durable.Executor do # Create a child workflow execution for a single parallel step defp create_parallel_child(parent_exec, step, data, queue, config) do + # Inherit the parent's accumulated context so prior put_context/2 calls + # are visible to the child step via get_context/1. This is a snapshot at + # spawn time — concurrent siblings still can't see each other's writes, + # which is intentional (avoids races across parallel branches). + parent_context = parent_exec.context || %{} + attrs = %{ workflow_module: parent_exec.workflow_module, workflow_name: parent_exec.workflow_name, @@ -758,7 +790,7 @@ defmodule Durable.Executor do queue: to_string(queue), priority: 0, input: data, - context: %{"__parallel_step" => Atom.to_string(step.name)}, + context: Map.put(parent_context, "__parallel_step", Atom.to_string(step.name)), parent_workflow_id: parent_exec.id, current_step: Atom.to_string(step.name) } @@ -784,8 +816,21 @@ defmodule Durable.Executor do message: "Step #{step_name} not found in workflow" }) else - # Use parent's input as the pipeline data (stored in child's input) - data = atomize_keys(execution.input) + # Merge the inherited parent context into the pipeline data so that + # get_context/1 inside the step body resolves keys set by prior + # put_context/2 calls in the parent's non-parallel steps. + # + # StepRunner will Process.put(:durable_context, data) before running + # the body, which is why we need these keys to live inside `data`. + # The __parallel_step marker is internal plumbing — strip it from + # what the user sees. 
+ inherited = + execution.context + |> Map.drop(["__parallel_step", :__parallel_step]) + |> atomize_keys() + + input = atomize_keys(execution.input) + data = Map.merge(inherited, input) case StepRunner.execute(step_def, data, execution.id, config) do {:ok, output_data} -> @@ -965,6 +1010,11 @@ defmodule Durable.Executor do end defp apply_parallel_into(into_fn, base_ctx, results) when is_function(into_fn, 2) do + # User callbacks get raw {:ok, value} / {:error, reason} tuples so they + # can pattern-match cleanly — this is the intended ergonomic API. + # Any tuples that leak into the callback's return are sanitized at the + # save boundary (save_data_as_context -> sanitize_for_json), so storage + # can't crash regardless of what shape the user returns. into_fn.(base_ctx, results) rescue e -> @@ -1079,70 +1129,145 @@ defmodule Durable.Executor do # Also merges orchestration keys from process dict to ensure child workflow # references are persisted through DB round-trips defp save_data_as_context(config, execution, data) do - merged = merge_orchestration_context(data) - - execution - |> Ecto.Changeset.change(context: merged) - |> Repo.update(config) - end - - # Merge orchestration keys (__child:*, __fire_forget:*, __child_done:*) from - # process dict into the data to persist. These keys are set by - # Durable.Orchestration.call_workflow/start_workflow via put_context. - defp merge_orchestration_context(data) do + # Persist the step's cumulative context — everything the user wrote via + # put_context/2 during the step body PLUS everything they returned from + # the step. Step return wins on key collision because the return value + # is the step's explicit contract. + # + # Rationale: Context is cumulative. Before this change, put_context/2 + # writes were silently dropped unless the user also returned those keys, + # which made the put_context API a footgun (see bug report C-1, + # 2026-04-13-follow-up-audit.md). 
process_ctx = Process.get(:durable_context, %{}) - orchestration_keys = + # Merge preserving key shapes: step return wins over prior writes on + # collision. We deliberately don't atomize keys here — users may return + # maps with mixed atom/string keys (e.g., via Helpers.assign/3 layered + # on top of string-keyed DB-round-tripped input) and atomize_keys/1 + # deduplicates non-deterministically via Map.new, which corrupted + # values in the branch DSL path. + merged = process_ctx - |> Enum.filter(fn {key, _} -> orchestration_key?(key) end) - |> Map.new() + |> Map.merge(data) + |> sanitize_for_json() - Map.merge(data, orchestration_keys) - end - - defp orchestration_key?(key) when is_atom(key) do - orchestration_key?(Atom.to_string(key)) - end - - defp orchestration_key?(key) when is_binary(key) do - String.starts_with?(key, "__child:") or - String.starts_with?(key, "__fire_forget:") or - String.starts_with?(key, "__child_done:") + execution + |> Ecto.Changeset.change(context: merged) + |> Repo.update(config) end - defp orchestration_key?(_), do: false - defp mark_completed(config, execution, final_data) do + # Sanitize before persisting — the final step may return data containing + # raw tuples (e.g., child executions in a parallel block complete with + # {:ok, value} that gets unwrapped but sometimes leaks through). 
+ safe_final = sanitize_for_json(final_data) + {:ok, execution} = execution |> WorkflowExecution.status_changeset(:completed, %{ - context: final_data, + context: safe_final, completed_at: DateTime.utc_now(), current_step: nil }) |> Ecto.Changeset.change(locked_by: nil, locked_at: nil) |> Repo.update(config) - maybe_notify_parent(config, execution, :completed, final_data) + DurablePubSub.broadcast_workflow(config, :workflow_completed, workflow_event(execution)) + + maybe_notify_parent(config, execution, :completed, safe_final) {:ok, execution} end defp mark_failed(config, execution, error) do - {:ok, execution} = - execution - |> WorkflowExecution.status_changeset(:failed, %{ - error: error, - completed_at: DateTime.utc_now() - }) - |> Ecto.Changeset.change(locked_by: nil, locked_at: nil) - |> Repo.update(config) + # Sanitize the error before persisting — a step crash may hand us a map + # containing tuples, PIDs, or functions (via stacktrace, raw values, etc). + # Postgres JSONB can't store those, so we recursively replace unencodable + # leaves with their inspect/1 string. If save STILL fails we fall back to + # a minimal diagnostic error so the workflow is never left as a zombie. 
+ safe_error = sanitize_for_json(error) + + result = + try do + execution + |> WorkflowExecution.status_changeset(:failed, %{ + error: safe_error, + completed_at: DateTime.utc_now() + }) + |> Ecto.Changeset.change(locked_by: nil, locked_at: nil) + |> Repo.update(config) + rescue + e -> + fallback = %{ + type: "unrecorded_error", + message: + "Workflow failed, but original error could not be persisted: #{Exception.message(e)}", + original_error_inspect: inspect(error, limit: :infinity) + } + + execution + |> WorkflowExecution.status_changeset(:failed, %{ + error: fallback, + completed_at: DateTime.utc_now() + }) + |> Ecto.Changeset.change(locked_by: nil, locked_at: nil) + |> Repo.update(config) + end + + case result do + {:ok, execution} -> + DurablePubSub.broadcast_workflow(config, :workflow_failed, workflow_event(execution)) + maybe_notify_parent(config, execution, :failed, safe_error) + {:error, safe_error} + + {:error, changeset} -> + # Last resort — we can't even save the fallback. Log and move on so + # the worker doesn't crash. + require Logger + + Logger.error( + "[Durable] Failed to mark workflow #{execution.id} as failed: #{inspect(changeset)}" + ) + + {:error, safe_error} + end + end + + # Recursively convert a term into something Jason can encode. Tuples become + # lists, non-Date/DateTime atoms pass through, and anything else exotic + # (PIDs, ports, functions, refs) is replaced by its inspect/1 form. 
+ @doc false + @spec sanitize_for_json(term()) :: term() + def sanitize_for_json(%module{} = struct) + when module in [Date, DateTime, NaiveDateTime, Time, Decimal], + do: struct + + def sanitize_for_json(%_{} = struct), + do: sanitize_for_json(Map.from_struct(struct)) + + def sanitize_for_json(map) when is_map(map) do + Map.new(map, fn {k, v} -> {sanitize_json_key(k), sanitize_for_json(v)} end) + end - maybe_notify_parent(config, execution, :failed, error) + def sanitize_for_json(list) when is_list(list) do + Enum.map(list, &sanitize_for_json/1) + end - {:error, error} + def sanitize_for_json(tuple) when is_tuple(tuple) do + tuple |> Tuple.to_list() |> Enum.map(&sanitize_for_json/1) end + def sanitize_for_json(term) + when is_binary(term) or is_number(term) or is_boolean(term) or is_nil(term), + do: term + + def sanitize_for_json(term) when is_atom(term), do: term + + def sanitize_for_json(term), do: inspect(term) + + defp sanitize_json_key(k) when is_atom(k) or is_binary(k), do: k + defp sanitize_json_key(k), do: inspect(k) + # ============================================================================ # Parent Notification (Orchestration) # ============================================================================ @@ -1220,22 +1345,74 @@ defmodule Durable.Executor do case Repo.one(config, query) do nil -> - # Parent not waiting (fire-and-forget case, or already timed out) + # Parent not waiting (fire-and-forget case, or already timed out). + # Log this as an orphan completion — useful when investigating why + # a parent never received its child's result. 
+ if execution.parent_workflow_id do + Logger.warning( + "[Durable] child completion arrived for non-waiting parent — " <> + "child=#{execution.id} parent=#{execution.parent_workflow_id} " <> + "event=#{event_name} status=#{status}" + ) + end + :ok pending_event -> - # Fulfill the pending event - {:ok, _} = - pending_event - |> PendingEvent.receive_changeset(payload) - |> Repo.update(config) - - # Find the child ref from parent's context to store result under the right key + # Atomic: fulfill the pending event AND transition the parent back + # to :pending in a single transaction. Before this change a crash + # between the two updates left the event :received but the parent + # stuck in :waiting (Bug M-3). parent = Repo.get(config, WorkflowExecution, execution.parent_workflow_id) result_context = build_parent_result_context(parent, execution.id, payload) + atomic_fulfill_event_and_resume_parent(config, pending_event, parent, result_context) + end + end - # Resume the parent workflow - resume_workflow(execution.parent_workflow_id, result_context) + defp atomic_fulfill_event_and_resume_parent(config, pending_event, parent, result_context) do + safe_context = sanitize_for_json(result_context) + + multi = + Ecto.Multi.new() + |> Ecto.Multi.update( + :event, + PendingEvent.receive_changeset(pending_event, pending_event.payload || %{}) + ) + |> Ecto.Multi.run(:parent, fn repo, _changes -> + case repo.get(WorkflowExecution, parent.id) do + nil -> + {:error, :parent_not_found} + + %WorkflowExecution{status: :waiting} = exec -> + new_context = Map.merge(exec.context || %{}, safe_context) + + exec + |> Ecto.Changeset.change( + context: new_context, + status: :pending, + locked_by: nil, + locked_at: nil + ) + |> repo.update() + + %WorkflowExecution{status: status} -> + # Parent already moved on (cancelled, completed, etc.). Tolerate; + # the event is still marked :received via the Multi above. 
+ {:ok, %{status: status, no_op: true}} + end + end) + + case Repo.transaction(config, multi) do + {:ok, _} -> + :ok + + {:error, stage, reason, _} -> + Logger.error( + "[Durable] failed to atomically fulfill child event + resume parent: " <> + "stage=#{stage} reason=#{inspect(reason)} parent=#{parent.id}" + ) + + {:error, reason} end end @@ -1434,7 +1611,7 @@ defmodule Durable.Executor do on_timeout: Keyword.get(opts, :on_timeout, :resume) } - {:ok, _pending_input} = + {:ok, pending_input} = %PendingInput{} |> PendingInput.changeset(attrs) |> Repo.insert(config) @@ -1444,6 +1621,9 @@ defmodule Durable.Executor do |> Ecto.Changeset.change(status: :waiting) |> Repo.update(config) + DurablePubSub.broadcast_workflow(config, :workflow_waiting, workflow_event(execution)) + DurablePubSub.broadcast_input(config, :input_requested, pending_input_event(pending_input)) + {:waiting, execution} end @@ -1535,11 +1715,52 @@ defmodule Durable.Executor do end end + # Serialize a user-supplied timeout_value for JSONB storage in PendingInput / + # PendingEvent / WaitGroup rows. Users naturally pass idiomatic Elixir shapes + # like `{:error, :timeout}` or `%{nested: {:tuple}}`; sanitize_for_json/1 + # normalizes tuples to lists and exotic terms (PIDs, functions, refs) to + # their inspect/1 form so Postgrex never crashes on encode. 
defp serialize_timeout_value(nil), do: nil - defp serialize_timeout_value(value) when is_map(value), do: value + + defp serialize_timeout_value(value) when is_map(value), + do: sanitize_for_json(value) defp serialize_timeout_value(value) when is_atom(value), do: %{"__atom__" => Atom.to_string(value)} - defp serialize_timeout_value(value), do: %{"__value__" => value} + defp serialize_timeout_value(value), + do: %{"__value__" => sanitize_for_json(value)} + + # ============================================================================ + # PubSub event builders + # ============================================================================ + + defp workflow_event(%WorkflowExecution{} = execution) do + %{ + id: execution.id, + workflow_module: execution.workflow_module, + workflow_name: execution.workflow_name, + status: execution.status, + queue: execution.queue, + current_step: execution.current_step, + started_at: execution.started_at, + completed_at: execution.completed_at, + inserted_at: execution.inserted_at, + updated_at: execution.updated_at + } + end + + defp pending_input_event(%PendingInput{} = pending) do + %{ + id: pending.id, + workflow_id: pending.workflow_id, + input_name: pending.input_name, + step_name: pending.step_name, + input_type: pending.input_type, + status: pending.status, + prompt: pending.prompt, + timeout_at: pending.timeout_at, + inserted_at: pending.inserted_at + } + end end diff --git a/lib/durable/executor/backoff.ex b/durable/lib/durable/executor/backoff.ex similarity index 100% rename from lib/durable/executor/backoff.ex rename to durable/lib/durable/executor/backoff.ex diff --git a/lib/durable/executor/compensation_runner.ex b/durable/lib/durable/executor/compensation_runner.ex similarity index 100% rename from lib/durable/executor/compensation_runner.ex rename to durable/lib/durable/executor/compensation_runner.ex diff --git a/lib/durable/executor/step_runner.ex b/durable/lib/durable/executor/step_runner.ex similarity index 73% 
rename from lib/durable/executor/step_runner.ex rename to durable/lib/durable/executor/step_runner.ex index 0b01a26..70f80fe 100644 --- a/lib/durable/executor/step_runner.ex +++ b/durable/lib/durable/executor/step_runner.ex @@ -12,6 +12,7 @@ defmodule Durable.Executor.StepRunner do alias Durable.Context alias Durable.Definition.Step alias Durable.Executor.Backoff + alias Durable.PubSub, as: DurablePubSub alias Durable.Repo alias Durable.Storage.Schemas.StepExecution @@ -50,9 +51,13 @@ defmodule Durable.Executor.StepRunner do # Set current step for logging/observability Context.set_current_step(step.name) - # Set current data in process dictionary for wait functions to access - # This is needed because wait_for_event etc. check the context for resumed data - Process.put(:durable_context, data) + # Set current data in process dictionary for wait functions to access. + # On retry attempts (attempt > 1) we MERGE rather than overwrite so that + # put_context/2 writes from the prior failed attempt remain visible to + # the user step body — matching the cumulative-context contract that + # save_data_as_context/3 enforces between sequential steps. 
+ prior_ctx = if attempt > 1, do: Process.get(:durable_context, %{}), else: %{} + Process.put(:durable_context, Map.merge(prior_ctx, data)) # Create step execution record {:ok, step_exec} = create_step_execution(config, workflow_id, step, attempt) @@ -246,39 +251,93 @@ defmodule Durable.Executor.StepRunner do end defp update_step_execution(config, step_exec, :running) do - step_exec - |> StepExecution.start_changeset() - |> Repo.update(config) + case step_exec + |> StepExecution.start_changeset() + |> Repo.update(config) do + {:ok, updated} = ok -> + DurablePubSub.broadcast_step(config, :step_started, step_event(updated)) + ok + + other -> + other + end end defp update_step_execution(config, step_exec, :waiting) do - step_exec - |> Ecto.Changeset.change(status: :waiting) - |> Repo.update(config) + case step_exec + |> Ecto.Changeset.change(status: :waiting) + |> Repo.update(config) do + {:ok, updated} = ok -> + DurablePubSub.broadcast_step(config, :step_waiting, step_event(updated)) + ok + + other -> + other + end end defp complete_step_execution(config, step_exec, output, logs, duration_ms) do serializable_output = serialize_output(output) - step_exec - |> StepExecution.complete_changeset(serializable_output, logs, duration_ms) - |> Repo.update(config) + case step_exec + |> StepExecution.complete_changeset(serializable_output, logs, duration_ms) + |> Repo.update(config) do + {:ok, updated} = ok -> + DurablePubSub.broadcast_step(config, :step_completed, step_event(updated)) + ok + + other -> + other + end end defp fail_step_execution(config, step_exec, error, logs, duration_ms) do - step_exec - |> StepExecution.fail_changeset(error, logs, duration_ms) - |> Repo.update(config) + # Sanitize the error map — exception payloads frequently carry tuples + # (e.g., FunctionClauseError args), PIDs, refs, and functions in the + # stacktrace area, none of which can survive a JSONB write. Mirror the + # defense applied to workflow-level mark_failed in lib/durable/executor.ex. 
+ safe_error = Durable.Executor.sanitize_for_json(error) + + case step_exec + |> StepExecution.fail_changeset(safe_error, logs, duration_ms) + |> Repo.update(config) do + {:ok, updated} = ok -> + DurablePubSub.broadcast_step(config, :step_failed, step_event(updated)) + ok + + other -> + other + end end - defp serialize_output(output) when is_map(output), do: output - defp serialize_output(output) when is_list(output), do: %{value: output} - defp serialize_output(output) when is_binary(output), do: %{value: output} - defp serialize_output(output) when is_number(output), do: %{value: output} - defp serialize_output(output) when is_atom(output), do: %{value: Atom.to_string(output)} - defp serialize_output(output) when is_tuple(output), do: %{value: Tuple.to_list(output)} + defp step_event(%StepExecution{} = step_exec) do + %{ + id: step_exec.id, + workflow_id: step_exec.workflow_id, + step_name: step_exec.step_name, + step_type: step_exec.step_type, + status: step_exec.status, + attempt: step_exec.attempt, + duration_ms: step_exec.duration_ms, + started_at: step_exec.started_at, + completed_at: step_exec.completed_at + } + end + + # Convert an arbitrary step output into a JSONB-safe map. The schema field + # is `:map`, so non-map outputs are wrapped under a `:value` key. Recursive + # sanitization handles deeply-nested tuples / PIDs / refs / functions — + # the previous shallow implementation only flattened top-level shapes and + # let nested tuples crash JSONB encoding. 
defp serialize_output(nil), do: nil - defp serialize_output(output), do: %{value: inspect(output)} + + defp serialize_output(output) when is_map(output) do + Durable.Executor.sanitize_for_json(output) + end + + defp serialize_output(output) do + %{value: Durable.Executor.sanitize_for_json(output)} + end # Normalize error to map format for database storage # The error field in StepExecution expects a map diff --git a/lib/durable/helpers.ex b/durable/lib/durable/helpers.ex similarity index 100% rename from lib/durable/helpers.ex rename to durable/lib/durable/helpers.ex diff --git a/lib/durable/log_capture.ex b/durable/lib/durable/log_capture.ex similarity index 85% rename from lib/durable/log_capture.ex rename to durable/lib/durable/log_capture.ex index 200d317..98b3477 100644 --- a/lib/durable/log_capture.ex +++ b/durable/lib/durable/log_capture.ex @@ -35,6 +35,13 @@ defmodule Durable.LogCapture do alias Durable.LogCapture.IOServer + require Logger + + # Process-dict flag: track whether we've already emitted the M-6 + # "metadata stringified" warning during the current step. Cleared in + # start_capture/0. + @warn_flag :durable_log_capture_warned + @logs_key :durable_logs @original_gl_key :durable_original_group_leader @io_server_key :durable_io_capture_pid @@ -54,6 +61,9 @@ defmodule Durable.LogCapture do if Keyword.get(config, :enabled, true) do # Initialize log buffer Process.put(@logs_key, []) + # Reset the per-step "warned about non-JSON metadata" flag so the + # warning fires at most once per step (Bug M-6). 
+ Process.delete(@warn_flag) # Start IO capture if enabled if Keyword.get(config, :io_capture, true) do @@ -266,7 +276,32 @@ defmodule Durable.LogCapture do defp serialize_value(v) when is_number(v), do: v defp serialize_value(v) when is_boolean(v), do: v defp serialize_value(nil), do: nil - defp serialize_value(v), do: inspect(v) + + defp serialize_value(v) do + # Bug M-6: surface (once per step) when log metadata has been silently + # stringified — devs often don't realize they passed a non-JSON-safe + # value and end up debugging an `inspect/1` blob instead of structured + # data. Rate-limited via a process-dict flag. + warn_once_about_unsafe_metadata(v) + inspect(v) + end + + defp warn_once_about_unsafe_metadata(v) do + case Process.get(@warn_flag) do + true -> + :ok + + _ -> + Process.put(@warn_flag, true) + + Logger.warning( + "[Durable.LogCapture] non-JSON-safe metadata value stringified " <> + "to inspect/1 form: #{inspect(v, limit: 80)}. To preserve " <> + "structured data, only pass binaries, atoms, numbers, booleans, " <> + "or nil in Logger metadata." 
+ ) + end + end defp format_logs_for_storage(logs) do # Convert string keys to atoms for consistency with schema diff --git a/lib/durable/log_capture/handler.ex b/durable/lib/durable/log_capture/handler.ex similarity index 100% rename from lib/durable/log_capture/handler.ex rename to durable/lib/durable/log_capture/handler.ex diff --git a/lib/durable/log_capture/io_server.ex b/durable/lib/durable/log_capture/io_server.ex similarity index 100% rename from lib/durable/log_capture/io_server.ex rename to durable/lib/durable/log_capture/io_server.ex diff --git a/lib/durable/migration.ex b/durable/lib/durable/migration.ex similarity index 59% rename from lib/durable/migration.ex rename to durable/lib/durable/migration.ex index 6f12b3b..8637ad7 100644 --- a/lib/durable/migration.ex +++ b/durable/lib/durable/migration.ex @@ -20,6 +20,16 @@ defmodule Durable.Migration do mix ecto.migrate + When Durable adds new internal migrations in a future release, generate a + new host-app wrapper migration: + + mix durable.gen.upgrade -r MyApp.Repo + mix ecto.migrate + + To check whether a database is behind the Durable library version: + + mix durable.migrations -r MyApp.Repo --check + ## Options * `:prefix` - The PostgreSQL schema name (default: `"durable"`) @@ -83,6 +93,18 @@ defmodule Durable.Migration do @spec all_versions() :: [pos_integer()] defdelegate all_versions(), to: Migrator + @doc """ + Returns the latest available Durable migration version. + """ + @spec current_version() :: pos_integer() + defdelegate current_version(), to: Migrator + + @doc """ + Returns the migration version immediately before `version`, or 0 for the first migration. + """ + @spec previous_version(pos_integer()) :: non_neg_integer() + defdelegate previous_version(version \\ Migrator.current_version()), to: Migrator + @doc """ Returns the list of applied migration versions. 
@@ -91,9 +113,34 @@ defmodule Durable.Migration do @spec migrated_versions(keyword()) :: [pos_integer()] defdelegate migrated_versions(opts \\ []), to: Migrator + @doc """ + Returns the latest applied Durable migration version, or 0 when none are applied. + + Pass an Ecto repo to check outside an Ecto migration: + + Durable.Migration.migrated_version(MyApp.Repo) + Durable.Migration.migrated_version(MyApp.Repo, prefix: "private") + + Without a repo, this uses the current Ecto migration runner context. + """ + @spec migrated_version(keyword() | module()) :: non_neg_integer() + def migrated_version(opts_or_repo \\ []) + + def migrated_version(opts) when is_list(opts), do: Migrator.migrated_version(opts) + def migrated_version(repo) when is_atom(repo), do: Migrator.migrated_version(repo) + + @spec migrated_version(module(), keyword()) :: non_neg_integer() + defdelegate migrated_version(repo, opts), to: Migrator + @doc """ Returns pending migrations (not yet applied). """ - @spec pending_versions(keyword()) :: [pos_integer()] - defdelegate pending_versions(opts \\ []), to: Migrator + @spec pending_versions(keyword() | module()) :: [pos_integer()] + def pending_versions(opts_or_repo \\ []) + + def pending_versions(opts) when is_list(opts), do: Migrator.pending_versions(opts) + def pending_versions(repo) when is_atom(repo), do: Migrator.pending_versions(repo) + + @spec pending_versions(module(), keyword()) :: [pos_integer()] + defdelegate pending_versions(repo, opts), to: Migrator end diff --git a/lib/durable/migration/base.ex b/durable/lib/durable/migration/base.ex similarity index 100% rename from lib/durable/migration/base.ex rename to durable/lib/durable/migration/base.ex diff --git a/lib/durable/migration/migrations/v20260103000000_initial_schema.ex b/durable/lib/durable/migration/migrations/v20260103000000_initial_schema.ex similarity index 100% rename from lib/durable/migration/migrations/v20260103000000_initial_schema.ex rename to 
durable/lib/durable/migration/migrations/v20260103000000_initial_schema.ex diff --git a/lib/durable/migration/migrations/v20260104000000_add_wait_primitives.ex b/durable/lib/durable/migration/migrations/v20260104000000_add_wait_primitives.ex similarity index 100% rename from lib/durable/migration/migrations/v20260104000000_add_wait_primitives.ex rename to durable/lib/durable/migration/migrations/v20260104000000_add_wait_primitives.ex diff --git a/durable/lib/durable/migration/migrations/v20260413000000_add_scheduler_resilience.ex b/durable/lib/durable/migration/migrations/v20260413000000_add_scheduler_resilience.ex new file mode 100644 index 0000000..e2c10c2 --- /dev/null +++ b/durable/lib/durable/migration/migrations/v20260413000000_add_scheduler_resilience.ex @@ -0,0 +1,36 @@ +defmodule Durable.Migration.Migrations.V20260413000000AddSchedulerResilience do + @moduledoc false + # Bug L-1: scheduler resilience. + # Adds tracking fields so the scheduler can detect persistently-failing + # ScheduledWorkflow rows (e.g., the workflow_module no longer exists after + # a code reload) and auto-disable them after a configurable failure count + # instead of looping noisily forever. 
+ use Durable.Migration.Base + + @impl true + def version, do: 20_260_413_000_000 + + @impl true + def up(prefix) do + alter table(:scheduled_workflows, prefix: prefix) do + add_if_not_exists(:last_error, :text) + add_if_not_exists(:last_error_at, :utc_datetime_usec) + add_if_not_exists(:consecutive_failures, :integer, default: 0) + add_if_not_exists(:auto_disabled_at, :utc_datetime_usec) + end + + :ok + end + + @impl true + def down(prefix) do + alter table(:scheduled_workflows, prefix: prefix) do + remove_if_exists(:last_error, :text) + remove_if_exists(:last_error_at, :utc_datetime_usec) + remove_if_exists(:consecutive_failures, :integer) + remove_if_exists(:auto_disabled_at, :utc_datetime_usec) + end + + :ok + end +end diff --git a/lib/durable/migration/migrator.ex b/durable/lib/durable/migration/migrator.ex similarity index 72% rename from lib/durable/migration/migrator.ex rename to durable/lib/durable/migration/migrator.ex index f3d350a..4139bc7 100644 --- a/lib/durable/migration/migrator.ex +++ b/durable/lib/durable/migration/migrator.ex @@ -14,7 +14,8 @@ defmodule Durable.Migration.Migrator do # When adding new migrations, append them to this list. @migrations [ Durable.Migration.Migrations.V20260103000000InitialSchema, - Durable.Migration.Migrations.V20260104000000AddWaitPrimitives + Durable.Migration.Migrations.V20260104000000AddWaitPrimitives, + Durable.Migration.Migrations.V20260413000000AddSchedulerResilience ] @doc """ @@ -37,6 +38,26 @@ defmodule Durable.Migration.Migrator do |> Enum.sort_by(&elem(&1, 0)) end + @doc """ + Returns the latest Durable migration version. + """ + @spec current_version() :: pos_integer() + def current_version do + all_versions() + |> List.last() + end + + @doc """ + Returns the migration version before the given version, or 0 for the first migration. 
+ """ + @spec previous_version(pos_integer()) :: non_neg_integer() + def previous_version(version \\ current_version()) do + all_versions() + |> Enum.filter(&(&1 < version)) + |> List.last() || + 0 + end + @doc """ Runs pending migrations up. @@ -156,17 +177,61 @@ defmodule Durable.Migration.Migrator do SchemaMigration.versions(prefix) end + @spec migrated_versions(module(), keyword()) :: [pos_integer()] + def migrated_versions(repo, opts) when is_atom(repo) and is_list(opts) do + prefix = Keyword.get(opts, :prefix, "durable") + SchemaMigration.versions(repo, prefix) + end + + @doc """ + Returns the latest applied migration version, or 0 when Durable hasn't been migrated. + """ + @spec migrated_version(keyword() | module()) :: non_neg_integer() + def migrated_version(opts_or_repo \\ []) + + def migrated_version(opts) when is_list(opts) do + opts + |> migrated_versions() + |> latest_version() + end + + def migrated_version(repo) when is_atom(repo) do + migrated_version(repo, []) + end + + @spec migrated_version(module(), keyword()) :: non_neg_integer() + def migrated_version(repo, opts) when is_atom(repo) and is_list(opts) do + repo + |> migrated_versions(opts) + |> latest_version() + end + @doc """ Returns pending migration versions. 
""" - @spec pending_versions(keyword()) :: [pos_integer()] - def pending_versions(opts \\ []) do + @spec pending_versions(keyword() | module()) :: [pos_integer()] + def pending_versions(opts_or_repo \\ []) + + def pending_versions(opts) when is_list(opts) do applied = migrated_versions(opts) all_versions() -- applied end + def pending_versions(repo) when is_atom(repo) do + pending_versions(repo, []) + end + + @spec pending_versions(module(), keyword()) :: [pos_integer()] + def pending_versions(repo, opts) when is_atom(repo) and is_list(opts) do + applied = migrated_versions(repo, opts) + all_versions() -- applied + end + # Private helpers + defp latest_version([]), do: 0 + defp latest_version(versions), do: List.last(versions) + defp filter_to_version(migrations, nil, _direction), do: migrations defp filter_to_version(migrations, target, :up) do diff --git a/lib/durable/migration/schema_migration.ex b/durable/lib/durable/migration/schema_migration.ex similarity index 79% rename from lib/durable/migration/schema_migration.ex rename to durable/lib/durable/migration/schema_migration.ex index 103bad4..c567c42 100644 --- a/lib/durable/migration/schema_migration.ex +++ b/durable/lib/durable/migration/schema_migration.ex @@ -14,6 +14,11 @@ defmodule Durable.Migration.SchemaMigration do """ @spec ensure_table!(String.t()) :: :ok def ensure_table!(prefix) do + ensure_table!(get_repo(), prefix) + end + + @spec ensure_table!(module(), String.t()) :: :ok + def ensure_table!(repo, prefix) do # Use direct repo query to ensure table is created immediately # (not deferred like Ecto.Migration's execute) query = """ @@ -23,7 +28,7 @@ defmodule Durable.Migration.SchemaMigration do ) """ - get_repo().query!(query, []) + repo.query!(query, []) :ok end @@ -32,6 +37,11 @@ defmodule Durable.Migration.SchemaMigration do """ @spec table_exists?(String.t()) :: boolean() def table_exists?(prefix) do + table_exists?(get_repo(), prefix) + end + + @spec table_exists?(module(), String.t()) :: 
boolean() + def table_exists?(repo, prefix) do query = """ SELECT EXISTS ( SELECT FROM information_schema.tables @@ -40,7 +50,7 @@ defmodule Durable.Migration.SchemaMigration do ) """ - %{rows: [[exists]]} = get_repo().query!(query, [prefix, @table_name]) + %{rows: [[exists]]} = repo.query!(query, [prefix, @table_name]) exists end @@ -49,9 +59,14 @@ defmodule Durable.Migration.SchemaMigration do """ @spec versions(String.t()) :: [pos_integer()] def versions(prefix) do - if table_exists?(prefix) do + versions(get_repo(), prefix) + end + + @spec versions(module(), String.t()) :: [pos_integer()] + def versions(repo, prefix) do + if table_exists?(repo, prefix) do query = "SELECT version FROM #{prefix}.#{@table_name} ORDER BY version" - %{rows: rows} = get_repo().query!(query, []) + %{rows: rows} = repo.query!(query, []) Enum.map(rows, fn [v] -> v end) else [] diff --git a/lib/durable/orchestration.ex b/durable/lib/durable/orchestration.ex similarity index 84% rename from lib/durable/orchestration.ex rename to durable/lib/durable/orchestration.ex index ce42b2f..f9b88f9 100644 --- a/lib/durable/orchestration.ex +++ b/durable/lib/durable/orchestration.ex @@ -65,6 +65,20 @@ defmodule Durable.Orchestration do - `:queue` - Queue for the child workflow (default: "default") - `:durable` - Durable instance name (default: Durable) + ## What the parent sees on success + + The `result` returned on success is the child workflow's **entire final + context**, not just its explicit outputs. This includes every + `put_context/2` key the child set, plus the final step's return value. 
+ + If you need to expose only specific outputs, return a clean shape from the + child's final step or pick only the keys you need in the parent: + + case call_workflow(MyApp.PaymentWorkflow, %{"amount" => 100}) do + {:ok, %{"payment_id" => id}} -> {:ok, assign(data, :payment_id, id)} + {:error, reason} -> {:error, reason} + end + ## Examples case call_workflow(MyApp.PaymentWorkflow, %{"amount" => 100}, timeout: hours(1)) do @@ -128,10 +142,33 @@ defmodule Durable.Orchestration do # Create child and continue (no throw) {:ok, child_id} = create_child_execution(module, input, parent_id, opts) Context.put_context(fire_key, child_id) + record_call_child(child_id, :fire, ref) {:ok, child_id} end end + # Records a step → child mapping in the parent's context so the dashboard + # can attach a drill-in chevron to the calling step. Mirrors the + # `__parallel_children` pattern used by parallel steps in + # `lib/durable/executor.ex`. + defp record_call_child(child_id, kind, ref) do + step_name = Context.current_step() + + meta = %{ + "step_name" => step_name && Atom.to_string(step_name), + "kind" => Atom.to_string(kind), + "ref" => to_string(ref) + } + + children = + Context.get_context(:__call_children, %{}) || + Context.get_context("__call_children", %{}) || + %{} + + Context.put_context(:__call_children, Map.put(children, child_id, meta)) + :ok + end + # ============================================================================ # Helpers # ============================================================================ @@ -139,6 +176,7 @@ defmodule Durable.Orchestration do defp create_and_wait(module, input, parent_id, child_key, opts) do {:ok, child_id} = create_child_execution(module, input, parent_id, opts) Context.put_context(child_key, child_id) + record_call_child(child_id, :call, Keyword.get(opts, :ref, module_to_ref(module))) throw( {:call_workflow, diff --git a/durable/lib/durable/pubsub.ex b/durable/lib/durable/pubsub.ex new file mode 100644 index 0000000..09774bc 
--- /dev/null +++ b/durable/lib/durable/pubsub.ex @@ -0,0 +1,198 @@ +defmodule Durable.PubSub do + @moduledoc """ + Thin wrapper around `Phoenix.PubSub` for broadcasting Durable lifecycle events. + + This module is a no-op when `phoenix_pubsub` is not loaded, so it is safe to + call from executor code regardless of whether a dashboard or external consumer + is subscribed. + + ## Topics + + Broadcasts go to four topic families, scoped per Durable instance (the + instance name is taken from the config): + + * `"durable:{name}:workflows"` — every workflow lifecycle event + * `"durable:{name}:workflow:{workflow_id}"` — events for one specific workflow + * `"durable:{name}:schedules"` — schedule CRUD events + * `"durable:{name}:inputs"` — pending input lifecycle events + + ## Event shape + + Every broadcast is the tuple `{:durable_event, kind, payload}` where `kind` is + one of the atoms enumerated in `t:kind/0` and `payload` is a map with fields + relevant to the event. See the individual `broadcast_*` helpers for specifics. + + ## Enabling + + Add `{:phoenix_pubsub, "~> 2.1"}` to your dependencies and either: + + * Pass `pubsub: MyApp.PubSub` in the Durable supervision-tree args to reuse + a PubSub started by the host app, or + * Pass nothing — Durable will start its own `Phoenix.PubSub` named + `{InstanceName}.PubSub` (see `default_name/1`). + + When neither a dependency nor a `:pubsub` option is present, all broadcasts + silently no-op and subscribers simply never receive messages. + """ + + alias Durable.Config + + @type kind :: + :workflow_started + | :workflow_resumed + | :workflow_waiting + | :workflow_completed + | :workflow_failed + | :workflow_cancelled + | :step_started + | :step_completed + | :step_failed + | :step_waiting + | :input_requested + | :input_provided + | :schedule_toggled + | :schedule_triggered + + @type payload :: map() + + @doc """ + Returns the PubSub server name for a Durable instance. + + Returns `nil` when the instance has no PubSub configured.
+ """ + @spec server(Config.t()) :: module() | nil + def server(%Config{pubsub: nil}), do: nil + def server(%Config{pubsub: name}) when is_atom(name), do: name + + @doc """ + Returns the PubSub server name for an instance by name (convenience). + """ + @spec server_for(atom()) :: module() | nil + def server_for(instance_name) do + case Config.get_safe(instance_name) do + nil -> nil + config -> server(config) + end + end + + @doc """ + Returns the default `Phoenix.PubSub` child spec name for a Durable instance. + + Used by the supervisor to start an owned PubSub when the caller did not + provide one. + """ + @spec default_name(atom()) :: atom() + def default_name(instance_name) do + Module.concat([instance_name, PubSub]) + end + + @doc """ + Returns the topic string for all workflow events on this instance. + """ + @spec workflows_topic(Config.t()) :: String.t() + def workflows_topic(%Config{name: name}), do: "durable:#{name}:workflows" + + @doc """ + Returns the topic string for one specific workflow's events. + """ + @spec workflow_topic(Config.t(), String.t()) :: String.t() + def workflow_topic(%Config{name: name}, workflow_id) do + "durable:#{name}:workflow:#{workflow_id}" + end + + @doc """ + Returns the topic string for schedule events on this instance. + """ + @spec schedules_topic(Config.t()) :: String.t() + def schedules_topic(%Config{name: name}), do: "durable:#{name}:schedules" + + @doc """ + Returns the topic string for pending-input events on this instance. + """ + @spec inputs_topic(Config.t()) :: String.t() + def inputs_topic(%Config{name: name}), do: "durable:#{name}:inputs" + + @doc """ + Subscribes the calling process to a topic. + + Returns `:ok` if PubSub is configured, `{:error, :no_pubsub}` otherwise. 
+ """ + @spec subscribe(Config.t(), String.t()) :: :ok | {:error, :no_pubsub} + def subscribe(%Config{} = config, topic) do + with server when not is_nil(server) <- server(config), + true <- pubsub_loaded?() do + Phoenix.PubSub.subscribe(server, topic) + else + _ -> {:error, :no_pubsub} + end + end + + @doc """ + Unsubscribes the calling process from a topic. + """ + @spec unsubscribe(Config.t(), String.t()) :: :ok + def unsubscribe(%Config{} = config, topic) do + with server when not is_nil(server) <- server(config), + true <- pubsub_loaded?() do + Phoenix.PubSub.unsubscribe(server, topic) + else + _ -> :ok + end + end + + @doc """ + Broadcasts a workflow lifecycle event. + + Sends to both the global workflows topic and the per-workflow topic. + """ + @spec broadcast_workflow(Config.t(), kind(), payload()) :: :ok + def broadcast_workflow(%Config{} = config, kind, %{id: workflow_id} = payload) do + msg = {:durable_event, kind, payload} + broadcast(config, workflows_topic(config), msg) + broadcast(config, workflow_topic(config, workflow_id), msg) + end + + @doc """ + Broadcasts a step lifecycle event. + + Sent only to the per-workflow topic (step events would overwhelm the global + topic and aren't generally interesting at that level). + """ + @spec broadcast_step(Config.t(), kind(), payload()) :: :ok + def broadcast_step(%Config{} = config, kind, %{workflow_id: workflow_id} = payload) do + msg = {:durable_event, kind, payload} + broadcast(config, workflow_topic(config, workflow_id), msg) + end + + @doc """ + Broadcasts a pending-input lifecycle event. + + Sent to both the inputs topic and the per-workflow topic. + """ + @spec broadcast_input(Config.t(), kind(), payload()) :: :ok + def broadcast_input(%Config{} = config, kind, %{workflow_id: workflow_id} = payload) do + msg = {:durable_event, kind, payload} + broadcast(config, inputs_topic(config), msg) + broadcast(config, workflow_topic(config, workflow_id), msg) + end + + @doc """ + Broadcasts a schedule event. 
+ """ + @spec broadcast_schedule(Config.t(), kind(), payload()) :: :ok + def broadcast_schedule(%Config{} = config, kind, payload) do + msg = {:durable_event, kind, payload} + broadcast(config, schedules_topic(config), msg) + end + + defp broadcast(%Config{} = config, topic, msg) do + with server when not is_nil(server) <- server(config), + true <- pubsub_loaded?() do + Phoenix.PubSub.broadcast(server, topic, msg) + else + _ -> :ok + end + end + + defp pubsub_loaded?, do: Code.ensure_loaded?(Phoenix.PubSub) +end diff --git a/lib/durable/query.ex b/durable/lib/durable/query.ex similarity index 78% rename from lib/durable/query.ex rename to durable/lib/durable/query.ex index 412dcf9..4d688ff 100644 --- a/lib/durable/query.ex +++ b/durable/lib/durable/query.ex @@ -158,6 +158,70 @@ defmodule Durable.Query do |> Enum.map(&execution_to_map(&1, false, false)) end + @doc """ + Returns workflow execution counts grouped by status. + + Returns a map like `%{pending: 5, running: 3, completed: 100, ...}`. + + ## Options + + - `:durable` - The Durable instance name (default: Durable) + + """ + @spec dashboard_counts(keyword()) :: %{atom() => non_neg_integer()} + def dashboard_counts(opts \\ []) do + config = get_config(opts) + + query = + from(w in WorkflowExecution, + group_by: w.status, + select: {w.status, count(w.id)} + ) + + Repo.all(config, query) + |> Map.new() + end + + @doc """ + Lists workflow executions with total count for pagination. + + Returns `{executions, total_count}`. + + Accepts the same filters as `list_executions/1`. 
+ """ + @spec list_executions_with_total(keyword()) :: {[map()], non_neg_integer()} + def list_executions_with_total(filters \\ []) do + config = get_config(filters) + limit = Keyword.get(filters, :limit, 50) + offset = Keyword.get(filters, :offset, 0) + + base_query = + from(w in WorkflowExecution, + order_by: [desc: w.inserted_at] + ) + + base_query = apply_filters(base_query, filters) + + list_query = from(w in base_query, limit: ^limit, offset: ^offset) + + # The count query mustn't inherit `order_by` — Postgres rejects ORDER BY + # against a column that isn't in GROUP BY when the SELECT is an aggregate + # (`SELECT count(id) FROM ... ORDER BY inserted_at` raises 42803). Strip + # it from the base query before adding the aggregate. + total_query = + base_query + |> exclude(:order_by) + |> select([w], count(w.id)) + + workflows = + Repo.all(config, list_query) + |> Enum.map(&execution_to_map(&1, false, false)) + + total = Repo.one(config, total_query) + + {workflows, total} + end + # Private functions defp get_config(opts) do diff --git a/lib/durable/queue/adapter.ex b/durable/lib/durable/queue/adapter.ex similarity index 80% rename from lib/durable/queue/adapter.ex rename to durable/lib/durable/queue/adapter.ex index 845f771..75d48e8 100644 --- a/lib/durable/queue/adapter.ex +++ b/durable/lib/durable/queue/adapter.ex @@ -104,6 +104,25 @@ defmodule Durable.Queue.Adapter do """ @callback heartbeat(config :: Config.t(), job_id :: job_id()) :: :ok | {:error, term()} + @doc """ + Recovers "zombie" workflows — executions stuck in `:waiting` status with + no pending inputs or events that could ever unblock them. + + This typically happens when a step crashes during a state transition and + the executor can't record a clean error (e.g. due to a secondary error + while serializing). The workflow remains in `:waiting` indefinitely even + though nothing is actually waiting on it. + + Zombies older than the timeout are marked `:failed` with a diagnostic + error. 
Returns the count of workflows recovered. + + This callback is optional so third-party adapters don't break on upgrade. + """ + @callback recover_zombie_workflows(config :: Config.t(), timeout_seconds :: pos_integer()) :: + {:ok, non_neg_integer()} | {:error, term()} + + @optional_callbacks recover_zombie_workflows: 2 + @doc """ Returns the default adapter module. """ diff --git a/lib/durable/queue/adapters/postgres.ex b/durable/lib/durable/queue/adapters/postgres.ex similarity index 60% rename from lib/durable/queue/adapters/postgres.ex rename to durable/lib/durable/queue/adapters/postgres.ex index 1402d4a..5dd0754 100644 --- a/lib/durable/queue/adapters/postgres.ex +++ b/durable/lib/durable/queue/adapters/postgres.ex @@ -11,10 +11,12 @@ defmodule Durable.Queue.Adapters.Postgres do alias Durable.Config alias Durable.Repo - alias Durable.Storage.Schemas.WorkflowExecution + alias Durable.Storage.Schemas.{PendingEvent, PendingInput, StepExecution, WorkflowExecution} import Ecto.Query + require Logger + @impl true def fetch_jobs(%Config{} = config, queue, limit, node_id) when is_binary(queue) and limit > 0 do @@ -52,16 +54,45 @@ defmodule Durable.Queue.Adapters.Postgres do @impl true def ack(%Config{} = config, job_id) when is_binary(job_id) do + ack_with_retry(config, job_id, _attempt = 1, _max_attempts = 3) + end + + # Retry the ack on transient failures (DB blip, connection lost). If we + # gave up here, the workflow would stay locked until stale-recovery + # released it 5 minutes later — at which point it would silently re-execute + # because there's no idempotency key (Bug M-5). The retry buys us time; + # the telemetry surfaces persistent failures so operators can intervene. 
+ defp ack_with_retry(%Config{} = config, job_id, attempt, max_attempts) do case Repo.get(config, WorkflowExecution, job_id) do nil -> {:error, :not_found} execution -> - execution - |> WorkflowExecution.unlock_changeset() - |> Repo.update(config) - - :ok + case execution + |> WorkflowExecution.unlock_changeset() + |> Repo.update(config) do + {:ok, _} -> + :ok + + {:error, _reason} when attempt < max_attempts -> + backoff_ms = :rand.uniform(50) * attempt + Process.sleep(backoff_ms) + ack_with_retry(config, job_id, attempt + 1, max_attempts) + + {:error, reason} -> + :telemetry.execute( + [:durable, :queue, :ack_failed], + %{count: 1, attempts: attempt}, + %{job_id: job_id, reason: reason, durable: config.name} + ) + + Logger.error( + "[Durable] ack failed after #{attempt} attempts for job #{job_id}: " <> + inspect(reason) + ) + + {:error, reason} + end end end @@ -135,6 +166,92 @@ defmodule Durable.Queue.Adapters.Postgres do e -> {:error, Exception.message(e)} end + @impl true + def recover_zombie_workflows(%Config{} = config, timeout_seconds) when timeout_seconds > 0 do + cutoff = DateTime.add(DateTime.utc_now(), -timeout_seconds, :second) + + # Two zombie classes: + # + # 1. :waiting workflows with no pending inputs or events to ever unblock them. + # 2. :compensating workflows with no actively running compensation step + # (Bug M-1: the compensation handler crashed mid-rollback). + # + # Executions with a healthy lock heartbeat are excluded — those are still + # being actively processed by some worker. 
+ waiting_zombies = + from(w in WorkflowExecution, + as: :workflow, + where: w.status == :waiting, + where: w.updated_at < ^cutoff, + where: is_nil(w.locked_by) or w.locked_at < ^cutoff, + where: + not exists( + from(p in PendingInput, + where: p.workflow_id == parent_as(:workflow).id and p.status == :pending + ) + ), + where: + not exists( + from(e in PendingEvent, + where: e.workflow_id == parent_as(:workflow).id and e.status == :pending + ) + ), + select: w.id + ) + + compensating_zombies = + from(w in WorkflowExecution, + as: :workflow, + where: w.status == :compensating, + where: w.updated_at < ^cutoff, + where: is_nil(w.locked_by) or w.locked_at < ^cutoff, + where: + not exists( + from(s in StepExecution, + where: s.workflow_id == parent_as(:workflow).id and s.status == :running + ) + ), + select: w.id + ) + + zombie_ids = Repo.all(config, waiting_zombies) ++ Repo.all(config, compensating_zombies) + + if zombie_ids == [] do + {:ok, 0} + else + now = DateTime.utc_now() + + zombie_error = %{ + "type" => "zombie_detected", + "message" => + "Workflow was in :waiting or :compensating status with no active work for longer than the stale lock timeout. 
Likely crashed during a state transition.", + "detected_at" => DateTime.to_iso8601(now) + } + + {count, _} = + Repo.update_all( + config, + from(w in WorkflowExecution, where: w.id in ^zombie_ids), + set: [ + status: :failed, + error: zombie_error, + locked_by: nil, + locked_at: nil, + completed_at: now, + updated_at: now + ] + ) + + Logger.warning( + "[Durable] Zombie recovery marked #{count} workflow(s) as :failed: #{inspect(zombie_ids)}" + ) + + {:ok, count} + end + rescue + e -> {:error, Exception.message(e)} + end + @impl true def heartbeat(%Config{} = config, job_id) when is_binary(job_id) do now = DateTime.utc_now() diff --git a/lib/durable/queue/manager.ex b/durable/lib/durable/queue/manager.ex similarity index 100% rename from lib/durable/queue/manager.ex rename to durable/lib/durable/queue/manager.ex diff --git a/lib/durable/queue/poller.ex b/durable/lib/durable/queue/poller.ex similarity index 100% rename from lib/durable/queue/poller.ex rename to durable/lib/durable/queue/poller.ex diff --git a/lib/durable/queue/stale_job_recovery.ex b/durable/lib/durable/queue/stale_job_recovery.ex similarity index 65% rename from lib/durable/queue/stale_job_recovery.ex rename to durable/lib/durable/queue/stale_job_recovery.ex index d1cc662..1ad65fa 100644 --- a/lib/durable/queue/stale_job_recovery.ex +++ b/durable/lib/durable/queue/stale_job_recovery.ex @@ -81,22 +81,33 @@ defmodule Durable.Queue.StaleJobRecovery do defp do_recovery(%Config{} = config) do adapter = Adapter.default_adapter() - case adapter.recover_stale_locks(config, config.stale_lock_timeout) do - {:ok, 0} -> - {:ok, 0} + stale_result = adapter.recover_stale_locks(config, config.stale_lock_timeout) + log_recovery(:stale, stale_result, config.name) - {:ok, count} -> - Logger.info("Recovered #{count} stale job(s) for #{inspect(config.name)}") - emit_telemetry(count, config.name) - {:ok, count} + # Zombie recovery is an optional adapter capability. Skip if not implemented. 
+ if function_exported?(adapter, :recover_zombie_workflows, 2) do + zombie_result = adapter.recover_zombie_workflows(config, config.stale_lock_timeout) + log_recovery(:zombie, zombie_result, config.name) + end + + # Preserve the return shape expected by `recover_now/1` callers. + stale_result + end - {:error, reason} = error -> - Logger.error( - "Failed to recover stale locks for #{inspect(config.name)}: #{inspect(reason)}" - ) + defp log_recovery(_kind, {:ok, 0}, _name), do: :ok - error - end + defp log_recovery(:stale, {:ok, count}, name) do + Logger.info("Recovered #{count} stale job(s) for #{inspect(name)}") + emit_telemetry(count, name) + end + + defp log_recovery(:zombie, {:ok, count}, name) do + Logger.warning("Marked #{count} zombie workflow(s) as failed for #{inspect(name)}") + emit_zombie_telemetry(count, name) + end + + defp log_recovery(kind, {:error, reason}, name) do + Logger.error("Failed #{kind} recovery for #{inspect(name)}: #{inspect(reason)}") end defp schedule_recovery(interval) do @@ -114,4 +125,12 @@ defmodule Durable.Queue.StaleJobRecovery do %{durable: durable_name} ) end + + defp emit_zombie_telemetry(count, durable_name) do + :telemetry.execute( + [:durable, :queue, :zombie_recovered], + %{count: count}, + %{durable: durable_name} + ) + end end diff --git a/lib/durable/queue/worker.ex b/durable/lib/durable/queue/worker.ex similarity index 100% rename from lib/durable/queue/worker.ex rename to durable/lib/durable/queue/worker.ex diff --git a/lib/durable/repo.ex b/durable/lib/durable/repo.ex similarity index 87% rename from lib/durable/repo.ex rename to durable/lib/durable/repo.ex index f569083..c4392da 100644 --- a/lib/durable/repo.ex +++ b/durable/lib/durable/repo.ex @@ -126,6 +126,20 @@ defmodule Durable.Repo do config.repo.delete_all(queryable, merge_opts(config, opts)) end + # ============================================================================ + # Transactions + # 
============================================================================ + + @doc """ + Runs an `Ecto.Multi` (or a function) inside a database transaction on the + configured repo. Used to atomically couple state transitions (e.g. marking + a PendingInput `:timeout` and resuming the workflow it belongs to) so a + crash between the two updates can't orphan the workflow. + """ + def transaction(%Config{} = config, fun_or_multi, opts \\ []) do + config.repo.transaction(fun_or_multi, merge_opts(config, opts)) + end + # ============================================================================ # Raw SQL # ============================================================================ diff --git a/lib/durable/scheduler/api.ex b/durable/lib/durable/scheduler/api.ex similarity index 98% rename from lib/durable/scheduler/api.ex rename to durable/lib/durable/scheduler/api.ex index baf40aa..42f4c95 100644 --- a/lib/durable/scheduler/api.ex +++ b/durable/lib/durable/scheduler/api.ex @@ -315,6 +315,12 @@ defmodule Durable.Scheduler.API do def register(module, opts \\ []) do durable_name = Keyword.get(opts, :durable, Durable) + # Modules are lazy-loaded in dev; `function_exported?/3` returns false + # for an existing-but-not-yet-loaded module, which would silently skip + # registration at scheduler boot. Force the load first so the export + # check is meaningful. 
+ _ = Code.ensure_loaded(module) + if function_exported?(module, :__schedules__, 0) do schedules = module.__schedules__() register_schedules(module, schedules, durable_name) diff --git a/lib/durable/scheduler/dsl.ex b/durable/lib/durable/scheduler/dsl.ex similarity index 100% rename from lib/durable/scheduler/dsl.ex rename to durable/lib/durable/scheduler/dsl.ex diff --git a/lib/durable/scheduler/scheduler.ex b/durable/lib/durable/scheduler/scheduler.ex similarity index 75% rename from lib/durable/scheduler/scheduler.ex rename to durable/lib/durable/scheduler/scheduler.ex index 91611db..f15c06d 100644 --- a/lib/durable/scheduler/scheduler.ex +++ b/durable/lib/durable/scheduler/scheduler.ex @@ -26,7 +26,11 @@ defmodule Durable.Scheduler do require Logger + alias Crontab.CronExpression.Parser, as: CronParser + alias Crontab.Scheduler, as: CronScheduler + alias Durable.Repo alias Durable.Scheduler.API + alias Durable.Storage.Schemas.ScheduledWorkflow @default_interval 60_000 @@ -179,7 +183,64 @@ defmodule Durable.Scheduler do start_workflow(schedule, mod, config) {:error, reason} -> - Logger.error("Failed to load module for schedule #{schedule.name}: #{inspect(reason)}") + # Bug L-1: instead of just logging and re-firing on every poll cycle, + # record the failure on the schedule row, advance next_run_at past + # the current trigger so we don't loop hot, and auto-disable after + # too many consecutive failures. + record_failure( + schedule, + config, + "Failed to load module #{schedule.workflow_module}: #{inspect(reason)}" + ) + end + end + + defp record_failure(schedule, config, message) do + # Compute the next run time so we don't immediately re-fire on the next + # poll. If the cron parse itself fails, just defer 5 minutes. 
+ next_run_at = compute_next_run(schedule) + + changeset = + ScheduledWorkflow.failure_changeset(schedule, message, + next_run_at: next_run_at, + auto_disable_after: 5 + ) + + case Repo.update(changeset, config) do + {:ok, updated} -> + if updated.auto_disabled_at do + Logger.error( + "[Durable.Scheduler] Auto-disabled #{schedule.name} after " <> + "#{updated.consecutive_failures} consecutive failures: #{message}" + ) + else + Logger.warning( + "[Durable.Scheduler] Schedule #{schedule.name} failure " <> + "(#{updated.consecutive_failures}): #{message}" + ) + end + + {:error, cs} -> + Logger.error( + "[Durable.Scheduler] Failed to record failure for #{schedule.name}: " <> + inspect(cs.errors) + ) + end + end + + defp compute_next_run(schedule) do + case CronParser.parse(schedule.cron_expression) do + {:ok, cron} -> + case CronScheduler.get_next_run_date(cron, DateTime.utc_now() |> DateTime.to_naive()) do + {:ok, naive} -> + DateTime.from_naive!(naive, "Etc/UTC") + + _ -> + DateTime.add(DateTime.utc_now(), 300, :second) + end + + _ -> + DateTime.add(DateTime.utc_now(), 300, :second) end end diff --git a/lib/durable/storage/schemas/pending_event.ex b/durable/lib/durable/storage/schemas/pending_event.ex similarity index 100% rename from lib/durable/storage/schemas/pending_event.ex rename to durable/lib/durable/storage/schemas/pending_event.ex diff --git a/lib/durable/storage/schemas/pending_input.ex b/durable/lib/durable/storage/schemas/pending_input.ex similarity index 100% rename from lib/durable/storage/schemas/pending_input.ex rename to durable/lib/durable/storage/schemas/pending_input.ex diff --git a/lib/durable/storage/schemas/scheduled_workflow.ex b/durable/lib/durable/storage/schemas/scheduled_workflow.ex similarity index 61% rename from lib/durable/storage/schemas/scheduled_workflow.ex rename to durable/lib/durable/storage/schemas/scheduled_workflow.ex index 645148a..fba08cc 100644 --- a/lib/durable/storage/schemas/scheduled_workflow.ex +++ 
b/durable/lib/durable/storage/schemas/scheduled_workflow.ex @@ -22,6 +22,10 @@ defmodule Durable.Storage.Schemas.ScheduledWorkflow do enabled: boolean(), last_run_at: DateTime.t() | nil, next_run_at: DateTime.t() | nil, + last_error: String.t() | nil, + last_error_at: DateTime.t() | nil, + consecutive_failures: integer(), + auto_disabled_at: DateTime.t() | nil, inserted_at: DateTime.t(), updated_at: DateTime.t() } @@ -40,6 +44,11 @@ defmodule Durable.Storage.Schemas.ScheduledWorkflow do field(:enabled, :boolean, default: true) field(:last_run_at, :utc_datetime_usec) field(:next_run_at, :utc_datetime_usec) + # Bug L-1 — scheduler resilience tracking + field(:last_error, :string) + field(:last_error_at, :utc_datetime_usec) + field(:consecutive_failures, :integer, default: 0) + field(:auto_disabled_at, :utc_datetime_usec) timestamps(type: :utc_datetime_usec) end @@ -51,7 +60,11 @@ defmodule Durable.Storage.Schemas.ScheduledWorkflow do :queue, :enabled, :last_run_at, - :next_run_at + :next_run_at, + :last_error, + :last_error_at, + :consecutive_failures, + :auto_disabled_at ] @doc """ @@ -81,6 +94,70 @@ defmodule Durable.Storage.Schemas.ScheduledWorkflow do |> cast(%{enabled: enabled}, [:enabled]) end + @doc """ + Creates a changeset that records a failure to load / start the scheduled + workflow. Increments `consecutive_failures` and stamps `last_error*`. + When the failure count reaches `auto_disable_after`, the schedule is + automatically disabled and `auto_disabled_at` is set so operators can + tell why the schedule stopped firing. + """ + def failure_changeset(scheduled_workflow, error_message, opts \\ []) do + auto_disable_after = Keyword.get(opts, :auto_disable_after, 5) + next_run_at = Keyword.get(opts, :next_run_at) + now = DateTime.utc_now() + new_count = (scheduled_workflow.consecutive_failures || 0) + 1 + auto_disable? 
= new_count >= auto_disable_after + + attrs = %{ + last_error: String.slice(to_string(error_message), 0, 1024), + last_error_at: now, + consecutive_failures: new_count, + next_run_at: next_run_at + } + + attrs = + if auto_disable? do + attrs + |> Map.put(:enabled, false) + |> Map.put(:auto_disabled_at, now) + else + attrs + end + + cast(scheduled_workflow, attrs, [ + :last_error, + :last_error_at, + :consecutive_failures, + :enabled, + :auto_disabled_at, + :next_run_at + ]) + end + + @doc """ + Creates a changeset that records a successful trigger. Resets + `consecutive_failures` to 0 and clears the last_error fields. + """ + def success_changeset(scheduled_workflow, last_run_at, next_run_at) do + cast( + scheduled_workflow, + %{ + last_run_at: last_run_at, + next_run_at: next_run_at, + last_error: nil, + last_error_at: nil, + consecutive_failures: 0 + }, + [ + :last_run_at, + :next_run_at, + :last_error, + :last_error_at, + :consecutive_failures + ] + ) + end + @doc """ Creates a changeset for upserting during registration. 
diff --git a/lib/durable/storage/schemas/step_execution.ex b/durable/lib/durable/storage/schemas/step_execution.ex similarity index 100% rename from lib/durable/storage/schemas/step_execution.ex rename to durable/lib/durable/storage/schemas/step_execution.ex diff --git a/lib/durable/storage/schemas/wait_group.ex b/durable/lib/durable/storage/schemas/wait_group.ex similarity index 100% rename from lib/durable/storage/schemas/wait_group.ex rename to durable/lib/durable/storage/schemas/wait_group.ex diff --git a/lib/durable/storage/schemas/workflow_execution.ex b/durable/lib/durable/storage/schemas/workflow_execution.ex similarity index 100% rename from lib/durable/storage/schemas/workflow_execution.ex rename to durable/lib/durable/storage/schemas/workflow_execution.ex diff --git a/lib/durable/supervisor.ex b/durable/lib/durable/supervisor.ex similarity index 83% rename from lib/durable/supervisor.ex rename to durable/lib/durable/supervisor.ex index a4d0412..35bcf49 100644 --- a/lib/durable/supervisor.ex +++ b/durable/lib/durable/supervisor.ex @@ -95,10 +95,15 @@ defmodule Durable.Supervisor do # Task supervisor for parallel step execution task_sup_name = task_supervisor_name(config.name) - # Base children always include the task supervisor - base_children = [ - {Task.Supervisor, name: task_sup_name} - ] + pubsub_children = + if config.owns_pubsub? do + check_pubsub_dependency!(config) + [{Phoenix.PubSub, name: config.pubsub}] + else + [] + end + + base_children = pubsub_children ++ [{Task.Supervisor, name: task_sup_name}] children = if config.queue_enabled do @@ -123,6 +128,20 @@ defmodule Durable.Supervisor do Supervisor.init(children, strategy: :one_for_one) end + defp check_pubsub_dependency!(%Config{name: name}) do + unless Code.ensure_loaded?(Phoenix.PubSub) do + raise """ + Durable instance #{inspect(name)} was configured with `pubsub: :start` but + `:phoenix_pubsub` is not available. 
Add it to your dependencies: + + {:phoenix_pubsub, "~> 2.1"} + + Or pass `pubsub: MyApp.PubSub` to reuse a PubSub already started by the + host application, or omit `:pubsub` to disable broadcasting. + """ + end + end + @doc """ Returns the Task.Supervisor name for a Durable instance. """ diff --git a/lib/durable/wait.ex b/durable/lib/durable/wait.ex similarity index 89% rename from lib/durable/wait.ex rename to durable/lib/durable/wait.ex index 19c3ea1..8eacd2a 100644 --- a/lib/durable/wait.ex +++ b/durable/lib/durable/wait.ex @@ -41,6 +41,7 @@ defmodule Durable.Wait do """ alias Durable.Config + alias Durable.PubSub, as: DurablePubSub alias Durable.Repo alias Durable.Storage.Schemas.{PendingEvent, PendingInput, WaitGroup, WorkflowExecution} @@ -135,6 +136,29 @@ defmodule Durable.Wait do - `:timeout` - Timeout in milliseconds (optional) - `:timeout_value` - Value to return on timeout (optional) + ## Resumption semantics + + `wait_for_event/2` is a **resumption barrier**, not a pause inside the + step body. When the event arrives, the step body re-executes from the + top — it doesn't "continue" from the line where the wait was called. + On re-execution, the wait finds the event data in the restored context + and returns immediately without re-suspending. + + This means any side effects BEFORE `wait_for_event/2` run each time the + step is invoked (once on suspend, again on resume). 
Make them idempotent + or move them into a separate prior step: + + # 🚫 not idempotent — sends two emails + step :wait_for_approval, fn data -> + Mailer.send_approval_request(data) # runs on both passes + result = wait_for_event("approved") + {:ok, assign(data, :approved, result)} + end + + # ✅ side effect lives in its own step + step :request_approval, fn data -> Mailer.send_approval_request(data); {:ok, data} end + step :await_approval, fn data -> {:ok, assign(data, :approved, wait_for_event("approved"))} end + ## Examples # Wait indefinitely @@ -545,10 +569,16 @@ defmodule Durable.Wait do @spec provide_input(String.t(), String.t(), map(), keyword()) :: :ok | {:error, term()} def provide_input(workflow_id, input_name, data, opts \\ []) do config = get_config(opts) + # Sanitize at the API boundary so any tuples / PIDs / functions that + # leak in from controllers, LiveViews, or external callers can't crash + # JSONB persistence in PendingInput.response or workflow.context. + safe_data = Durable.Executor.sanitize_for_json(data) with {:ok, pending} <- find_pending_input(config, workflow_id, input_name), - {:ok, _} <- complete_pending_input(config, pending, data), - {:ok, _} <- Durable.Executor.resume_workflow(workflow_id, %{input_name => data}, opts) do + {:ok, completed} <- complete_pending_input(config, pending, safe_data), + {:ok, _} <- + Durable.Executor.resume_workflow(workflow_id, %{input_name => safe_data}, opts) do + DurablePubSub.broadcast_input(config, :input_provided, pending_input_to_map(completed)) :ok end end @@ -572,10 +602,12 @@ defmodule Durable.Wait do @spec send_event(String.t(), String.t(), map(), keyword()) :: :ok | {:error, term()} def send_event(workflow_id, event_name, payload, opts \\ []) do config = get_config(opts) + # Sanitize at the API boundary — same rationale as provide_input/4. 
+ safe_payload = Durable.Executor.sanitize_for_json(payload) with {:ok, pending_event} <- find_pending_event(config, workflow_id, event_name), - {:ok, _} <- receive_pending_event(config, pending_event, payload), - {:ok, _} <- maybe_resume_workflow(config, workflow_id, event_name, payload, opts) do + {:ok, _} <- receive_pending_event(config, pending_event, safe_payload), + {:ok, _} <- maybe_resume_workflow(config, workflow_id, event_name, safe_payload, opts) do :ok end end @@ -794,14 +826,40 @@ defmodule Durable.Wait do end defp complete_pending_input(config, pending, response) do + # PendingInput.response is typed `:map`. Non-map responses (a single + # string from wait_for_choice/wait_for_text, an approval atom, etc.) + # would silently fail Ecto's cast and short-circuit the resume flow, + # leaving the workflow stuck in :waiting until timeout. + # + # The workflow's downstream context still receives the unwrapped value + # via Executor.resume_workflow, so this wrapping is purely for the + # audit/display field on PendingInput. + storable_response = + if is_map(response) do + response + else + %{"value" => response} + end + pending - |> PendingInput.complete_changeset(response) + |> PendingInput.complete_changeset(storable_response) |> Repo.update(config) end defp receive_pending_event(config, pending_event, payload) do + # PendingEvent.payload is typed `:map`. Same wrap pattern as + # complete_pending_input/3 — non-map payloads (e.g. an event sent with + # just a string identifier) get stored under `"value"` so the cast + # always succeeds and the workflow resume isn't blocked. 
+ storable_payload = + if is_map(payload) do + payload + else + %{"value" => payload} + end + pending_event - |> PendingEvent.receive_changeset(payload) + |> PendingEvent.receive_changeset(storable_payload) |> Repo.update(config) end diff --git a/durable/lib/durable/wait/timeout_worker.ex b/durable/lib/durable/wait/timeout_worker.ex new file mode 100644 index 0000000..347f560 --- /dev/null +++ b/durable/lib/durable/wait/timeout_worker.ex @@ -0,0 +1,410 @@ +defmodule Durable.Wait.TimeoutWorker do + @moduledoc """ + Background worker that enforces timeouts for pending inputs and events. + + Periodically checks for timed-out waits and either: + - Resumes the workflow with the timeout_value (if on_timeout: :resume) + - Fails the workflow (if on_timeout: :fail) + """ + + use GenServer + + alias Durable.Executor + alias Durable.Repo + alias Durable.Storage.Schemas.{PendingEvent, PendingInput, WaitGroup, WorkflowExecution} + + import Ecto.Query + + require Logger + + @default_interval 60_000 + + # ============================================================================ + # Client API + # ============================================================================ + + @doc """ + Starts the timeout worker. + + ## Options + + - `:config` - The Durable config (required) + - `:interval` - Check interval in milliseconds (default: 60_000) + """ + def start_link(opts) do + config = Keyword.fetch!(opts, :config) + interval = Keyword.get(opts, :interval, @default_interval) + + GenServer.start_link( + __MODULE__, + %{config: config, interval: interval}, + name: worker_name(config.name) + ) + end + + @doc """ + Returns the process name for a given Durable instance. + """ + def worker_name(durable_name) do + Module.concat([durable_name, Wait, TimeoutWorker]) + end + + @doc """ + Manually triggers a timeout check. 
+ """ + def check_timeouts(durable_name \\ Durable) do + GenServer.cast(worker_name(durable_name), :check_timeouts) + end + + # ============================================================================ + # GenServer Callbacks + # ============================================================================ + + @impl true + def init(state) do + schedule_check(state.interval) + {:ok, state} + end + + @impl true + def handle_info(:check_timeouts, state) do + do_check_timeouts(state.config) + schedule_check(state.interval) + {:noreply, state} + end + + @impl true + def handle_cast(:check_timeouts, state) do + do_check_timeouts(state.config) + {:noreply, state} + end + + # ============================================================================ + # Private Functions + # ============================================================================ + + defp schedule_check(interval) do + Process.send_after(self(), :check_timeouts, interval) + end + + defp do_check_timeouts(config) do + now = DateTime.utc_now() + + process_timed_out_inputs(config, now) + process_timed_out_events(config, now) + process_timed_out_wait_groups(config, now) + end + + defp process_timed_out_inputs(config, now) do + # Find pending inputs that have timed out + query = + from(p in PendingInput, + where: + p.status == :pending and + not is_nil(p.timeout_at) and + p.timeout_at <= ^now, + preload: [:workflow] + ) + + timed_out = Repo.all(config, query) + + Enum.each(timed_out, fn pending_input -> + handle_input_timeout(config, pending_input) + end) + end + + defp handle_input_timeout(config, pending_input) do + on_timeout = pending_input.on_timeout || :resume + + case on_timeout do + :resume -> + timeout_value = deserialize_timeout_value(pending_input.timeout_value) + + resume_data = %{ + pending_input.input_name => timeout_value, + :__timeout__ => true + } + + # Atomic: mark the input :timeout AND transition the workflow back + # to :pending in a single transaction. 
Before this change, a crash + # between the two updates left the input as :timeout but the workflow + # stuck in :waiting forever (Bug M-2). + result = + atomic_resume_after_timeout( + config, + PendingInput.timeout_changeset(pending_input), + pending_input.workflow_id, + resume_data + ) + + case result do + {:ok, _} -> + Logger.info( + "Timeout handled for pending input #{pending_input.input_name} " <> + "in workflow #{pending_input.workflow_id} (resume)" + ) + + {:error, stage, reason, _changes} -> + Logger.error( + "Failed input timeout transaction for #{pending_input.workflow_id}: " <> + "#{stage} → #{inspect(reason)}" + ) + end + + :fail -> + # Mark input :timeout then cancel — same Multi pattern. + case atomic_cancel_after_timeout( + config, + PendingInput.timeout_changeset(pending_input), + pending_input.workflow_id, + "Timeout waiting for input: #{pending_input.input_name}" + ) do + {:ok, _} -> + Logger.info( + "Timeout handled for pending input #{pending_input.input_name} " <> + "in workflow #{pending_input.workflow_id} (fail)" + ) + + {:error, stage, reason, _changes} -> + Logger.error( + "Failed input timeout transaction for #{pending_input.workflow_id}: " <> + "#{stage} → #{inspect(reason)}" + ) + end + end + end + + defp process_timed_out_events(config, now) do + # Find pending events that have timed out (only single events, not in groups) + query = + from(p in PendingEvent, + where: + p.status == :pending and + p.wait_type == :single and + is_nil(p.wait_group_id) and + not is_nil(p.timeout_at) and + p.timeout_at <= ^now, + preload: [:workflow] + ) + + timed_out = Repo.all(config, query) + + Enum.each(timed_out, fn pending_event -> + handle_event_timeout(config, pending_event) + end) + end + + defp handle_event_timeout(config, pending_event) do + timeout_value = deserialize_timeout_value(pending_event.timeout_value) + + resume_data = %{ + pending_event.event_name => timeout_value, + :__timeout__ => true + } + + case atomic_resume_after_timeout( + 
config, + PendingEvent.timeout_changeset(pending_event), + pending_event.workflow_id, + resume_data + ) do + {:ok, _} -> + Logger.info( + "Timeout handled for pending event #{pending_event.event_name} " <> + "in workflow #{pending_event.workflow_id}" + ) + + {:error, stage, reason, _changes} -> + Logger.error( + "Failed event timeout transaction for #{pending_event.workflow_id}: " <> + "#{stage} → #{inspect(reason)}" + ) + end + end + + # Atomically: persist the pending row's :timeout transition AND flip the + # owning workflow back to :pending with the timeout payload merged into + # context. If either step fails the entire transaction rolls back, so the + # workflow can never end up "input/event marked timeout but workflow still + # stuck in :waiting". + defp atomic_resume_after_timeout(config, pending_changeset, workflow_id, resume_data) do + safe_resume = Executor.sanitize_for_json(resume_data) + + multi = + Ecto.Multi.new() + |> Ecto.Multi.update(:pending, pending_changeset) + |> Ecto.Multi.run(:workflow, fn repo, _changes -> + case repo.get(WorkflowExecution, workflow_id) do + nil -> + {:error, :workflow_not_found} + + %WorkflowExecution{status: :waiting} = exec -> + new_context = Map.merge(exec.context || %{}, safe_resume) + + exec + |> Ecto.Changeset.change( + context: new_context, + status: :pending, + locked_by: nil, + locked_at: nil + ) + |> repo.update() + + %WorkflowExecution{status: status} -> + # The workflow already moved on (e.g. someone provided input + # before the timeout sweep ran). Tolerate this — the pending + # row is still marked :timeout via the Multi above, which is + # the right outcome. 
+ {:ok, %{status: status, no_op: true}} + end + end) + + Repo.transaction(config, multi) + end + + defp atomic_cancel_after_timeout(config, pending_changeset, workflow_id, reason) do + multi = + Ecto.Multi.new() + |> Ecto.Multi.update(:pending, pending_changeset) + |> Ecto.Multi.run(:workflow, fn repo, _changes -> + case repo.get(WorkflowExecution, workflow_id) do + nil -> + {:error, :workflow_not_found} + + %WorkflowExecution{} = exec -> + exec + |> WorkflowExecution.status_changeset(:cancelled, %{ + error: %{"type" => "timeout", "message" => reason}, + completed_at: DateTime.utc_now() + }) + |> Ecto.Changeset.change(locked_by: nil, locked_at: nil) + |> repo.update() + end + end) + + Repo.transaction(config, multi) + end + + defp process_timed_out_wait_groups(config, now) do + # Find wait groups that have timed out + query = + from(w in WaitGroup, + where: + w.status == :pending and + not is_nil(w.timeout_at) and + w.timeout_at <= ^now, + preload: [:workflow] + ) + + timed_out = Repo.all(config, query) + + Enum.each(timed_out, fn wait_group -> + handle_wait_group_timeout(config, wait_group) + end) + end + + defp handle_wait_group_timeout(config, wait_group) do + resume_data = build_wait_group_resume_data(wait_group) + + case atomic_resume_after_wait_group_timeout(config, wait_group, resume_data) do + {:ok, _} -> + log_wait_group_timeout_success(wait_group) + + {:error, stage, reason, _} -> + Logger.error( + "Failed wait_group timeout transaction for #{wait_group.workflow_id}: " <> + "#{stage} → #{inspect(reason)}" + ) + end + end + + # Builds the resume context for a timed-out wait group. + # + # Bug M-4: include per-event status in the resume payload so user code + # can distinguish events that were :received from events that :timed_out. 
+ defp build_wait_group_resume_data(wait_group) do + timeout_value = deserialize_timeout_value(wait_group.timeout_value) + received = wait_group.received_events || %{} + + per_event_status = + Map.new(wait_group.event_names || [], fn name -> + case Map.get(received, name) do + nil -> {name, %{"status" => "timeout", "value" => nil}} + payload -> {name, %{"status" => "received", "value" => payload}} + end + end) + + fallback_result = + case wait_group.wait_type do + :any -> {:timeout, nil} + :all -> {:timeout, received} + end + + %{ + :__wait_group_result__ => timeout_value || fallback_result, + :__wait_group_status__ => per_event_status, + :__timeout__ => true + } + end + + defp log_wait_group_timeout_success(wait_group) do + received_count = map_size(wait_group.received_events || %{}) + expected_count = length(wait_group.event_names || []) + + Logger.info( + "Timeout handled for wait group #{wait_group.id} " <> + "(#{wait_group.wait_type}) in workflow #{wait_group.workflow_id} — " <> + "received #{received_count} of #{expected_count} events" + ) + end + + defp atomic_resume_after_wait_group_timeout(config, wait_group, resume_data) do + safe_resume = Executor.sanitize_for_json(resume_data) + now = DateTime.utc_now() + + multi = + Ecto.Multi.new() + |> Ecto.Multi.update(:wait_group, WaitGroup.timeout_changeset(wait_group)) + |> Ecto.Multi.update_all( + :pending_events, + from(p in PendingEvent, + where: p.wait_group_id == ^wait_group.id and p.status == :pending + ), + set: [status: :timeout, completed_at: now] + ) + |> Ecto.Multi.run(:workflow, fn repo, _changes -> + case repo.get(WorkflowExecution, wait_group.workflow_id) do + nil -> + {:error, :workflow_not_found} + + %WorkflowExecution{status: :waiting} = exec -> + new_context = Map.merge(exec.context || %{}, safe_resume) + + exec + |> Ecto.Changeset.change( + context: new_context, + status: :pending, + locked_by: nil, + locked_at: nil + ) + |> repo.update() + + %WorkflowExecution{status: status} -> + {:ok, 
%{status: status, no_op: true}} + end + end) + + Repo.transaction(config, multi) + end + + defp deserialize_timeout_value(nil), do: nil + + defp deserialize_timeout_value(%{"__atom__" => atom_string}) do + String.to_existing_atom(atom_string) + rescue + ArgumentError -> String.to_atom(atom_string) + end + + defp deserialize_timeout_value(%{"__value__" => value}), do: value + defp deserialize_timeout_value(value) when is_map(value), do: value +end diff --git a/lib/mix/helpers.ex b/durable/lib/mix/helpers.ex similarity index 58% rename from lib/mix/helpers.ex rename to durable/lib/mix/helpers.ex index 0244104..ff223b8 100644 --- a/lib/mix/helpers.ex +++ b/durable/lib/mix/helpers.ex @@ -10,6 +10,24 @@ defmodule Durable.Mix.Helpers do Mix.Task.run("app.start") end + @doc """ + Same as `ensure_started/0`, but forces Durable's queue processing off + for this BEAM. Use this for diagnostic/read-only tasks (inspect, + pending, doctor, list, status) so the short-lived mix process doesn't + claim queue jobs it can't finish — any unlocked job would otherwise + be immediately re-locked by the task's own poller and then orphaned + on task exit. + + The override goes through `Application.put_env(:durable, + :disable_queue_processing, true)`, which `Durable.Config.new/1` + consults when the host app's supervisor tree boots. No effect on + other BEAMs (each OS process has its own app env). + """ + def ensure_started_readonly do + Application.put_env(:durable, :disable_queue_processing, true) + Mix.Task.run("app.start") + end + @doc """ Parses --name option, returns Durable instance name atom. """ @@ -117,4 +135,52 @@ defmodule Durable.Mix.Helpers do def strip_elixir_prefix(module_str) when is_binary(module_str) do String.replace_prefix(module_str, "Elixir.", "") end + + @doc """ + Resolves a workflow ID from a full UUID or a prefix. Returns `{:ok, id}` + when exactly one workflow matches, `{:error, :ambiguous, [ids]}` on + multiple matches, or `{:error, :not_found}`. 
+ + Operators rarely type full UUIDs; prefix lookup mirrors what they + already paste from `mix durable.list` truncated IDs. + """ + import Ecto.Query + + def resolve_workflow_id(durable_name, input) when is_binary(input) do + config = Durable.Config.get(durable_name) + trimmed = String.trim(input) + + if valid_uuid?(trimmed) do + case Durable.Repo.get(config, Durable.Storage.Schemas.WorkflowExecution, trimmed) do + nil -> {:error, :not_found} + exec -> {:ok, exec.id} + end + else + resolve_by_prefix(config, trimmed) + end + end + + defp valid_uuid?(str) do + case Ecto.UUID.cast(str) do + {:ok, _} -> true + :error -> false + end + end + + defp resolve_by_prefix(config, prefix) do + pattern = prefix <> "%" + + query = + from(w in Durable.Storage.Schemas.WorkflowExecution, + where: fragment("?::text LIKE ?", w.id, ^pattern), + select: w.id, + limit: 10 + ) + + case Durable.Repo.all(config, query) do + [] -> {:error, :not_found} + [id] -> {:ok, id} + ids -> {:error, :ambiguous, ids} + end + end end diff --git a/lib/mix/tasks/durable.cancel.ex b/durable/lib/mix/tasks/durable.cancel.ex similarity index 97% rename from lib/mix/tasks/durable.cancel.ex rename to durable/lib/mix/tasks/durable.cancel.ex index e873afd..fa8c67b 100644 --- a/lib/mix/tasks/durable.cancel.ex +++ b/durable/lib/mix/tasks/durable.cancel.ex @@ -26,7 +26,7 @@ defmodule Mix.Tasks.Durable.Cancel do @impl Mix.Task def run(args) do - Helpers.ensure_started() + Helpers.ensure_started_readonly() {opts, positional, _} = OptionParser.parse(args, strict: [reason: :string, name: :string]) diff --git a/lib/mix/tasks/durable.cleanup.ex b/durable/lib/mix/tasks/durable.cleanup.ex similarity index 99% rename from lib/mix/tasks/durable.cleanup.ex rename to durable/lib/mix/tasks/durable.cleanup.ex index a37a598..0382dcf 100644 --- a/lib/mix/tasks/durable.cleanup.ex +++ b/durable/lib/mix/tasks/durable.cleanup.ex @@ -41,7 +41,7 @@ defmodule Mix.Tasks.Durable.Cleanup do @impl Mix.Task def run(args) do - 
Helpers.ensure_started() + Helpers.ensure_started_readonly() {opts, _, _} = OptionParser.parse(args, diff --git a/durable/lib/mix/tasks/durable.doctor.ex b/durable/lib/mix/tasks/durable.doctor.ex new file mode 100644 index 0000000..b963dd4 --- /dev/null +++ b/durable/lib/mix/tasks/durable.doctor.ex @@ -0,0 +1,290 @@ +defmodule Mix.Tasks.Durable.Doctor do + @shortdoc "Diagnoses common workflow/queue/scheduler health problems" + + @moduledoc """ + Health check for a Durable instance. Surfaces the classes of stuck + state that recovery doesn't always fix on its own: + + * zombies — `:waiting` workflows with no pending inputs or events + * stale locks — `:running` workflows whose lock hasn't been + refreshed for longer than `stale_lock_timeout` + * compensation stragglers — `:compensating` workflows that never + reached `:compensated` or `:compensation_failed` + * disabled schedules — scheduled workflows auto-disabled after + repeated failures + + Pass `--fix` to invoke the adapter's recovery (same path the + background `StaleJobRecovery` GenServer runs on its interval). 
+ + ## Usage + + mix durable.doctor [--fix] [--name NAME] + + ## Options + + * `--fix` - Run recovery for stale locks + zombies after reporting + * `--json` - Emit the findings as JSON + * `--name NAME` - The Durable instance name (default: Durable) + """ + + use Mix.Task + + alias Durable.Mix.Helpers + alias Durable.Queue.Adapter + alias Durable.Repo + alias Durable.Storage.Schemas.{PendingEvent, PendingInput, ScheduledWorkflow, WorkflowExecution} + + import Ecto.Query + + @impl Mix.Task + def run(args) do + Helpers.ensure_started_readonly() + + {opts, _, _} = + OptionParser.parse(args, strict: [fix: :boolean, json: :boolean, name: :string]) + + durable_name = Helpers.get_durable_name(opts) + config = Durable.Config.get(durable_name) + + report = %{ + zombies: find_zombies(config), + stale_locks: find_stale_locks(config), + stuck_compensating: find_stuck_compensating(config), + disabled_schedules: find_disabled_schedules(config) + } + + if Keyword.get(opts, :json, false) do + Mix.shell().info(Jason.encode!(report, pretty: true)) + else + print_report(report, config) + end + + if Keyword.get(opts, :fix, false), do: run_fix(durable_name) + end + + # --- Finders --------------------------------------------------------------- + + # A zombie is a :waiting workflow with no pending inputs or events — + # nothing can ever unblock it. StaleJobRecovery's adapter fix targets + # exactly this shape; surfacing it separately here lets operators see + # the problem without running recovery. 
+ defp find_zombies(config) do + query = + from(w in WorkflowExecution, + as: :w, + where: w.status == :waiting, + where: + not exists( + from(p in PendingInput, + where: p.workflow_id == parent_as(:w).id and p.status == :pending + ) + ), + where: + not exists( + from(e in PendingEvent, + where: e.workflow_id == parent_as(:w).id and e.status == :pending + ) + ), + select: %{ + id: w.id, + name: w.workflow_name, + current_step: w.current_step, + updated_at: w.updated_at + }, + limit: 100 + ) + + Repo.all(config, query) + end + + defp find_stale_locks(config) do + cutoff = DateTime.add(DateTime.utc_now(), -config.stale_lock_timeout, :second) + + query = + from(w in WorkflowExecution, + where: w.status == :running, + where: not is_nil(w.locked_by), + where: w.locked_at < ^cutoff, + select: %{ + id: w.id, + name: w.workflow_name, + locked_by: w.locked_by, + locked_at: w.locked_at + }, + limit: 100 + ) + + Repo.all(config, query) + end + + # Compensation that got stuck before reaching a terminal state. Rare, + # but devastating when it happens because the rollback is half-done. 
# Prints one titled section of the doctor report as a table.
#
# `rows` are raw result maps, `row_fn` turns one of them into a list of
# cells, and `headers` are the column titles handed to
# `Helpers.format_table/2`. A section without rows prints nothing at all.
defp print_section(_title, [], _row_fn, _headers), do: :ok

defp print_section(title, rows, row_fn, headers) do
  shell = Mix.shell()

  shell.info("")
  shell.info("#{title} (#{length(rows)})")

  cells = for row <- rows, do: row_fn.(row)

  for line <- Helpers.format_table(cells, headers) do
    shell.info(" #{line}")
  end

  :ok
end
# Runs every recovery pass for the given Durable instance and reports the
# outcome of each pass on the Mix shell.
defp run_fix(durable_name) do
  config = Durable.Config.get(durable_name)
  adapter = Adapter.default_adapter()

  Mix.shell().info("")
  Mix.shell().info("Running recovery...")

  run_stale_lock_fix(adapter, config)
  run_zombie_fix(adapter, config)
end

# Asks the adapter to recover stale locks, using the configured
# `stale_lock_timeout` as the threshold, and prints the result.
defp run_stale_lock_fix(adapter, config) do
  case adapter.recover_stale_locks(config, config.stale_lock_timeout) do
    {:ok, count} -> Mix.shell().info(" Stale-lock recovery: #{count} row(s)")
    {:error, reason} -> Mix.shell().error(" Stale-lock recovery failed: #{inspect(reason)}")
  end
end

# Runs zombie recovery when the adapter exports `recover_zombie_workflows/2`;
# otherwise prints a "skipping" note.
#
# `function_exported?/3` never loads a module, so the adapter is loaded
# explicitly first. Without this the check only works when some earlier
# call already happened to load the adapter module.
defp run_zombie_fix(adapter, config) do
  if Code.ensure_loaded?(adapter) and function_exported?(adapter, :recover_zombie_workflows, 2) do
    case adapter.recover_zombie_workflows(config, config.stale_lock_timeout) do
      {:ok, count} -> Mix.shell().info(" Zombie recovery: #{count} row(s)")
      {:error, reason} -> Mix.shell().error(" Zombie recovery failed: #{inspect(reason)}")
    end
  else
    Mix.shell().info(" Zombie recovery: adapter does not implement it, skipping")
  end
end
$ mix durable.gen.migration add_compensation_tracking diff --git a/durable/lib/mix/tasks/durable.gen.upgrade.ex b/durable/lib/mix/tasks/durable.gen.upgrade.ex new file mode 100644 index 0000000..d38f3ad --- /dev/null +++ b/durable/lib/mix/tasks/durable.gen.upgrade.ex @@ -0,0 +1,139 @@ +defmodule Mix.Tasks.Durable.Gen.Upgrade do + @shortdoc "Generates a host-app migration for pending Durable migrations" + + @moduledoc """ + Generates an Ecto migration that upgrades Durable's internal database schema. + + This is the host-application counterpart to `mix durable.gen.migration`. + Durable migrations are explicit: when Durable adds new internal migrations, + generate a new wrapper migration and run `mix ecto.migrate`. + + ## Usage + + mix durable.gen.upgrade + mix durable.gen.upgrade -r MyApp.Repo + mix durable.gen.upgrade --prefix private + mix durable.gen.upgrade --to 20260413000000 + + ## Options + + * `-r`, `--repo` - The Ecto repo to generate the migration for + * `--prefix` - Durable PostgreSQL schema prefix (default: `"durable"`) + * `--to` - Durable migration version to upgrade to (default: latest) + * `--migrations-path` - Destination migrations path + """ + + use Mix.Task + + import Macro, only: [camelize: 1] + import Mix.Ecto + import Mix.EctoSQL + import Mix.Generator + + alias Durable.Migration + + @aliases [r: :repo] + @switches [ + repo: [:string, :keep], + prefix: :string, + to: :integer, + migrations_path: :string, + no_compile: :boolean, + no_deps_check: :boolean + ] + + @impl Mix.Task + def run(args) do + repos = parse_repos!(args) + {opts, _argv} = OptionParser.parse!(args, strict: @switches, aliases: @aliases) + + Enum.map(repos, fn repo -> + ensure_repo(repo, args) + generate(repo, opts) + end) + end + + defp parse_repos!(args) do + case parse_repo(args) do + [] -> Mix.raise("Could not find an Ecto repo. 
# Generates the wrapper migration file for `repo` and returns its path.
#
# Options read here: `:to` (via `target_version!/1`), `:prefix`
# (default "durable") and `:migrations_path` (default: the repo's
# `priv/.../migrations` directory).
defp generate(repo, opts) do
  target_version = target_version!(opts)
  prefix = Keyword.get(opts, :prefix, "durable")
  previous_version = Migration.previous_version(target_version)

  migrations_dir =
    Keyword.get(opts, :migrations_path) || Path.join(source_repo_priv(repo), "migrations")

  base_name = "upgrade_durable_to_v#{target_version}"
  file = Path.join(migrations_dir, "#{timestamp()}_#{base_name}.exs")

  if not File.dir?(migrations_dir), do: create_directory(migrations_dir)
  ensure_unique!(migrations_dir, base_name)

  contents = migration_template(repo, target_version, previous_version, prefix, base_name)
  create_file(file, contents)

  Mix.shell().info("""

  Run `mix ecto.migrate` to apply Durable migrations up to #{target_version}.
  """)

  file
end

# Returns the requested `:to` version (default: the library's current
# version), raising a Mix error when it is not a known Durable version.
defp target_version!(opts) do
  requested = Keyword.get(opts, :to, Migration.current_version())
  known_versions = Migration.all_versions()

  if requested not in known_versions do
    Mix.raise(
      "Unknown Durable migration version #{inspect(requested)}. " <>
        "Known versions: #{Enum.join(known_versions, ", ")}"
    )
  end

  requested
end

# Raises when a migration file for `base_name` (under any timestamp)
# already exists in `path`.
defp ensure_unique!(path, base_name) do
  existing =
    path
    |> Path.join("*_#{base_name}.exs")
    |> Path.wildcard()

  case existing do
    [] ->
      nil

    _already_there ->
      Mix.raise("migration can't be created, there is already a migration file for #{base_name}.")
  end
end

# Renders the source of the wrapper migration module.
defp migration_template(repo, target_version, previous_version, prefix, base_name) do
  module_name = Module.concat([repo, Migrations, camelize(base_name)])

  """
  defmodule #{inspect(module_name)} do
    use Ecto.Migration

    def up do
      Durable.Migration.up(to: #{target_version}, prefix: #{inspect(prefix)})
    end

    def down do
      Durable.Migration.down(to: #{previous_version}, prefix: #{inspect(prefix)})
    end
  end
  """
end

# Current UTC time as a `YYYYMMDDHHMMSS` migration timestamp.
defp timestamp do
  {{year, month, day}, {hour, minute, second}} = :calendar.universal_time()

  "#{year}" <> Enum.map_join([month, day, hour, minute, second], &pad/1)
end

# Zero-pads a single-digit number to two characters.
defp pad(value) do
  if value < 10, do: "0#{value}", else: to_string(value)
end
+ + ## Options + + * `--json` - Emit a JSON document instead of the text report + * `--context` - Include full context as pretty JSON (default: truncated) + * `--name NAME` - The Durable instance name (default: Durable) + """ + + use Mix.Task + + alias Durable.Mix.Helpers + alias Durable.Repo + alias Durable.Storage.Schemas.{PendingEvent, PendingInput, StepExecution, WorkflowExecution} + + import Ecto.Query + + @impl Mix.Task + def run(args) do + Helpers.ensure_started_readonly() + + {opts, positional, _} = + OptionParser.parse(args, strict: [json: :boolean, context: :boolean, name: :string]) + + case positional do + [id | _] -> inspect_workflow(id, opts) + [] -> Mix.shell().error("Usage: mix durable.inspect WORKFLOW_ID [--json] [--context]") + end + end + + defp inspect_workflow(input, opts) do + durable_name = Helpers.get_durable_name(opts) + + with {:ok, id} <- resolve_id(durable_name, input), + {:ok, report} <- collect_report(durable_name, id) do + if Keyword.get(opts, :json, false) do + Mix.shell().info(Jason.encode!(report, pretty: true)) + else + print_text_report(report, opts) + end + else + {:error, :not_found} -> + Mix.shell().error("No workflow matches: #{input}") + + {:error, :ambiguous, matches} -> + Mix.shell().error( + "Prefix matches #{length(matches)} workflows — narrow it down:\n " <> + Enum.join(matches, "\n ") + ) + end + end + + defp resolve_id(durable_name, input), do: Helpers.resolve_workflow_id(durable_name, input) + + defp collect_report(durable_name, id) do + config = Durable.Config.get(durable_name) + + case Repo.get(config, WorkflowExecution, id) do + nil -> + {:error, :not_found} + + execution -> + steps = fetch_steps(config, id) + pending_inputs = fetch_pending_inputs(config, id) + pending_events = fetch_pending_events(config, id) + children = fetch_children(config, id) + + report = %{ + workflow: serialize_workflow(execution), + step_timeline: serialize_steps(steps), + pending_inputs: Enum.map(pending_inputs, 
&serialize_pending_input/1), + pending_events: Enum.map(pending_events, &serialize_pending_event/1), + children: Enum.map(children, &serialize_child/1) + } + + {:ok, report} + end + end + + defp fetch_steps(config, id) do + Repo.all( + config, + from(s in StepExecution, where: s.workflow_id == ^id, order_by: [asc: s.inserted_at]) + ) + end + + defp fetch_pending_inputs(config, id) do + Repo.all( + config, + from(p in PendingInput, where: p.workflow_id == ^id, order_by: [asc: p.inserted_at]) + ) + end + + defp fetch_pending_events(config, id) do + Repo.all( + config, + from(p in PendingEvent, where: p.workflow_id == ^id, order_by: [asc: p.inserted_at]) + ) + end + + defp fetch_children(config, id) do + Repo.all( + config, + from(w in WorkflowExecution, + where: w.parent_workflow_id == ^id, + order_by: [asc: w.inserted_at] + ) + ) + end + + # --- Serialization --------------------------------------------------------- + + defp serialize_workflow(exec) do + %{ + id: exec.id, + module: Helpers.strip_elixir_prefix(exec.workflow_module), + name: exec.workflow_name, + status: exec.status, + current_step: exec.current_step, + queue: exec.queue, + priority: exec.priority, + parent_workflow_id: exec.parent_workflow_id, + inserted_at: exec.inserted_at, + started_at: exec.started_at, + completed_at: exec.completed_at, + scheduled_at: exec.scheduled_at, + locked_by: exec.locked_by, + locked_at: exec.locked_at, + duration: Helpers.format_duration(exec.started_at, exec.completed_at), + error: exec.error, + context: exec.context, + input: exec.input + } + end + + # Group step rows by step_name and emit a timeline that clearly shows + # wait/resume pairs — operators need to see BOTH the pause and the + # resume to reason about what actually happened. 
# Builds the step timeline: one entry per `step_name`, carrying every row
# recorded for that step (oldest first) plus the type and status of the
# newest row. Entries are ordered by when each step was first recorded.
defp serialize_steps(steps) do
  timeline =
    for {step_name, rows} <- Enum.group_by(steps, fn row -> row.step_name end) do
      chronological = Enum.sort_by(rows, fn row -> row.inserted_at end, DateTime)
      newest = List.last(chronological)

      %{
        step_name: step_name,
        step_type: newest.step_type,
        latest_status: newest.status,
        attempts: Enum.map(chronological, &serialize_step_row/1)
      }
    end

  Enum.sort_by(timeline, fn entry -> hd(entry.attempts).inserted_at end, DateTime)
end

# Reduces one step row to the fields shown in the report.
defp serialize_step_row(row) do
  Map.take(row, [
    :status,
    :attempt,
    :inserted_at,
    :started_at,
    :completed_at,
    :duration_ms,
    :error
  ])
end
# Prints the step timeline as a table through the `info` callback, or a
# one-line placeholder when no step rows exist yet.
defp print_steps(info, []), do: info.("\nSteps (no step executions yet)")

defp print_steps(info, steps) do
  info.("")
  info.("Steps")

  rows =
    Enum.map(steps, fn step ->
      [
        step.step_name,
        to_string(step.step_type),
        to_string(step.latest_status),
        # Convert the row count so every table cell is a string, like the
        # other columns handed to `Helpers.format_table/2`.
        to_string(length(step.attempts)),
        duration_for(step),
        resume_note(step)
      ]
    end)

  headers = ["Step", "Type", "Status", "Rows", "Duration", "Note"]

  rows
  |> Helpers.format_table(headers)
  |> Enum.each(fn line -> info.(" #{line}") end)
end

# Duration of the most recent row of a step, or a dash when none was recorded.
defp duration_for(%{attempts: attempts}) do
  case List.last(attempts) do
    %{duration_ms: nil} -> "—"
    %{duration_ms: ms} -> "#{ms}ms"
  end
end

# Multiple rows per step_name means the step paused (wait_for_*) and
# then resumed; flag that so operators don't read a completed step as
# "never waited".
defp resume_note(%{attempts: attempts}) when length(attempts) > 1, do: "wait→resume"
defp resume_note(_), do: ""
# Terse diagnostics — surface the most common "why is this stuck?"
# signals so operators don't have to eyeball every field.
#
# Findings are prepended as they are collected, so they print in reverse
# order of the checks below. Nothing is printed when there are no findings.
defp diagnose(info, %{workflow: wf} = report) do
  findings =
    []
    |> maybe_zombie(wf, report)
    |> maybe_stale_lock(wf)
    |> maybe_failed_no_compensation(wf)

  if findings != [] do
    info.("")
    info.("Diagnostics")
    Enum.each(findings, fn line -> info.(" • #{line}") end)
  end
end

# A :waiting workflow with nothing pending has nothing left to wake it up.
defp maybe_zombie(findings, %{status: :waiting}, %{pending_inputs: [], pending_events: []}) do
  [
    "Status is :waiting but no pending inputs or events — likely a zombie. Run `mix durable.doctor --fix`."
    | findings
  ]
end

defp maybe_zombie(findings, _, _), do: findings

# Flags locks older than 600 seconds.
defp maybe_stale_lock(findings, %{locked_by: nil}), do: findings

# A lock owner without a timestamp has no age to report; skip it instead
# of crashing in `DateTime.diff/3`.
defp maybe_stale_lock(findings, %{locked_at: nil}), do: findings

defp maybe_stale_lock(findings, %{locked_at: locked_at}) do
  age = DateTime.diff(DateTime.utc_now(), locked_at, :second)

  if age > 600 do
    ["Lock is #{Helpers.format_seconds(age)} old — possible stale lock." | findings]
  else
    findings
  end
end

# Points the operator at the Error section only when one was actually
# printed: the text report skips both `nil` and `%{}` errors, so an empty
# map counts as "no error payload" here as well.
defp maybe_failed_no_compensation(findings, %{status: :failed} = wf) do
  if error_recorded?(wf.error) do
    ["Workflow failed. See the Error section above for details." | findings]
  else
    ["Workflow failed but no error payload was recorded — inspect the queue logs." | findings]
  end
end

defp maybe_failed_no_compensation(findings, _), do: findings

# True when the workflow carries a non-empty error payload.
defp error_recorded?(err) when err in [nil, false], do: false
defp error_recorded?(err) when err == %{}, do: false
defp error_recorded?(_err), do: true
+ + ## Usage + + mix durable.migrations + mix durable.migrations -r MyApp.Repo --check + mix durable.migrations --prefix private + mix durable.migrations --json + + ## Options + + * `-r`, `--repo` - The Ecto repo to inspect + * `--prefix` - Durable PostgreSQL schema prefix (default: `"durable"`) + * `--json` - Emit JSON instead of text + * `--check` - Raise if any Durable migrations are pending + """ + + use Mix.Task + + import Mix.Ecto + + alias Durable.Migration + + @aliases [r: :repo] + @switches [ + repo: [:string, :keep], + prefix: :string, + json: :boolean, + check: :boolean, + no_compile: :boolean, + no_deps_check: :boolean + ] + + @impl Mix.Task + def run(args) do + repos = parse_repos!(args) + {opts, _argv} = OptionParser.parse!(args, strict: @switches, aliases: @aliases) + + reports = Enum.map(repos, &build_report(&1, args, opts)) + emit_reports(reports, opts) + maybe_raise_for_pending(reports, opts) + + :ok + end + + defp parse_repos!(args) do + case parse_repo(args) do + [] -> Mix.raise("Could not find an Ecto repo. 
# Starts `repo` temporarily and returns its Durable migration status map.
# Raises a Mix error when the repo cannot be started.
defp build_report(repo, args, opts) do
  ensure_repo(repo, args)
  prefix = Keyword.get(opts, :prefix, "durable")
  collect = fn started_repo -> migration_status(started_repo, prefix) end

  outcome = Ecto.Migrator.with_repo(repo, collect, mode: :temporary)

  case outcome do
    {:error, error} ->
      Mix.raise("Could not start repo #{inspect(repo)}, error: #{inspect(error)}")

    {:ok, report, _started} ->
      report
  end
end

# Gathers library version, database version and pending versions for one
# repo/prefix pair.
defp migration_status(repo, prefix) do
  scope = [prefix: prefix]
  current = Migration.current_version()
  migrated = Migration.migrated_version(repo, scope)
  pending = Migration.pending_versions(repo, scope)

  %{
    repo: inspect(repo),
    prefix: prefix,
    current_version: current,
    migrated_version: migrated,
    pending_versions: pending,
    status: status(pending)
  }
end

# "up" when nothing is pending, "pending" otherwise.
defp status(pending_versions) do
  case pending_versions do
    [] -> "up"
    _ -> "pending"
  end
end
Enum.filter(&pending?/1) + |> Enum.map_join("\n", fn report -> + " #{report.repo} prefix=#{inspect(report.prefix)} pending=#{format_pending(report.pending_versions)}" + end) + + """ + Durable migrations are pending: + #{pending} + + Generate an upgrade migration with `mix durable.gen.upgrade -r YourApp.Repo`, + then run `mix ecto.migrate`. + """ + end +end diff --git a/durable/lib/mix/tasks/durable.pending.ex b/durable/lib/mix/tasks/durable.pending.ex new file mode 100644 index 0000000..a615fff --- /dev/null +++ b/durable/lib/mix/tasks/durable.pending.ex @@ -0,0 +1,214 @@ +defmodule Mix.Tasks.Durable.Pending do + @shortdoc "Lists pending inputs and events blocking workflows" + + @moduledoc """ + Lists outstanding pending inputs and pending events across all + workflows. Use this to answer "what is waiting on me?" without + clicking through the dashboard. + + ## Usage + + mix durable.pending [options] + + ## Options + + * `--inputs` - Only list pending inputs + * `--events` - Only list pending events + * `--limit N` - Max rows per section (default: 50) + * `--expiring-in HOURS` - Only show rows that time out within N hours + * `--json` - Emit JSON instead of tables + * `--name NAME` - The Durable instance name (default: Durable) + """ + + use Mix.Task + + alias Durable.Mix.Helpers + alias Durable.Repo + alias Durable.Storage.Schemas.{PendingEvent, PendingInput, WorkflowExecution} + + import Ecto.Query + + @impl Mix.Task + def run(args) do + Helpers.ensure_started_readonly() + + {opts, _, _} = + OptionParser.parse(args, + strict: [ + inputs: :boolean, + events: :boolean, + limit: :integer, + expiring_in: :integer, + json: :boolean, + name: :string + ] + ) + + durable_name = Helpers.get_durable_name(opts) + only_inputs = Keyword.get(opts, :inputs, false) + only_events = Keyword.get(opts, :events, false) + + show_inputs = only_inputs or not only_events + show_events = only_events or not only_inputs + + inputs = if show_inputs, do: fetch_inputs(durable_name, opts), 
# Translates `--expiring-in HOURS` into an absolute UTC cutoff.
# Returns nil when the option was not given.
defp expiring_cutoff(opts) do
  with hours when not is_nil(hours) <- Keyword.get(opts, :expiring_in) do
    DateTime.utc_now() |> DateTime.add(hours * 3600, :second)
  end
end
# Renders a timeout relative to now: "in 2h" for future timeouts,
# "overdue 5m" for past ones — the fastest way to spot rows that should
# already have been handled. Rows without a timeout render as a dash.
defp format_timeout(nil), do: "—"

defp format_timeout(dt) do
  case DateTime.diff(dt, DateTime.utc_now(), :second) do
    remaining when remaining > 0 -> "in #{Helpers.format_seconds(remaining)}"
    elapsed -> "overdue #{Helpers.format_seconds(-elapsed)}"
  end
end

# Whole seconds elapsed since `inserted_at`.
defp age_seconds(inserted_at) do
  now = DateTime.utc_now()
  DateTime.diff(now, inserted_at, :second)
end
# Builds the payload to submit.
#
# With a positional DATA argument the payload is `parse_data/1` of that
# argument. Without one, the payload is the decoded JSON content of
# `--json-file`. Returns `{:ok, payload}` or `{:error, message}` where
# `message` is a printable string.
defp build_payload([], opts) do
  case Keyword.get(opts, :json_file) do
    nil ->
      {:error, "missing DATA positional arg or --json-file"}

    path ->
      with {:ok, body} <- read_file(path),
           {:ok, json} <- Jason.decode(body) do
        {:ok, json}
      else
        # JSON decode failures arrive as exception structs.
        {:error, e} when is_exception(e) -> {:error, Exception.message(e)}
        # `read_file/1` already returns a formatted message; inspecting it
        # again would wrap it in quotes.
        {:error, reason} when is_binary(reason) -> {:error, reason}
        {:error, reason} -> {:error, inspect(reason)}
      end
  end
end

defp build_payload([raw | _], _opts), do: {:ok, parse_data(raw)}

# Reads `path`, turning a POSIX error into a readable message.
defp read_file(path) do
  case File.read(path) do
    {:ok, body} -> {:ok, body}
    {:error, reason} -> {:error, "could not read #{path}: #{:file.format_error(reason)}"}
  end
end

# Parse as JSON when it starts with a JSON sentinel; otherwise treat
# as a raw string (the library wraps non-map values as `%{"value" =>
# raw}` before storage, which keeps single_choice / text / approval
# submissions working the same way the dashboard does them).
# Input that looks like JSON but fails to decode is also passed through raw.
defp parse_data(raw) do
  trimmed = String.trim(raw)

  if json_like?(trimmed) do
    case Jason.decode(trimmed) do
      {:ok, decoded} -> decoded
      _ -> raw
    end
  else
    raw
  end
end

# True when the first byte could start a JSON value: object, array,
# string, negative number, `true`, `false`, `null`, or a digit.
defp json_like?(<<c, _::binary>>) when c in [?{, ?[, ?", ?-, ?t, ?f, ?n], do: true
defp json_like?(<<c, _::binary>>) when c in ?0..?9, do: true
defp json_like?(_), do: false
+ + ## Options + + * `--name NAME` - The Durable instance name (default: Durable) + """ + + use Mix.Task + + alias Durable.Mix.Helpers + alias Durable.Repo + alias Durable.Storage.Schemas.WorkflowExecution + + @impl Mix.Task + def run(args) do + Helpers.ensure_started_readonly() + + {opts, positional, _} = OptionParser.parse(args, strict: [name: :string]) + + case positional do + [id | _] -> + durable = Helpers.get_durable_name(opts) + + case Helpers.resolve_workflow_id(durable, id) do + {:ok, resolved_id} -> retry(resolved_id, durable) + {:error, :not_found} -> Mix.shell().error("No workflow matches: #{id}") + {:error, :ambiguous, ms} -> Mix.shell().error("Ambiguous:\n " <> Enum.join(ms, "\n ")) + end + + [] -> + Mix.shell().error("Usage: mix durable.retry WORKFLOW_ID") + end + end + + defp retry(id, durable) do + config = Durable.Config.get(durable) + + case Repo.get(config, WorkflowExecution, id) do + nil -> + Mix.shell().error("Workflow #{id} vanished between resolution and retry.") + + exec -> + module = Module.safe_concat([exec.workflow_module]) + + case Durable.start(module, exec.input, queue: exec.queue, durable: durable) do + {:ok, new_id} -> + Mix.shell().info( + "Started new execution #{Helpers.truncate_id(new_id)} " <> + "(retry of #{Helpers.truncate_id(id)})" + ) + + Mix.shell().info(" module: #{Helpers.strip_elixir_prefix(exec.workflow_module)}") + Mix.shell().info(" queue: #{exec.queue}") + + {:error, reason} -> + Mix.shell().error("Failed to start retry: #{inspect(reason)}") + end + end + rescue + ArgumentError -> + Mix.shell().error( + "Could not load module #{inspect(:ignored)} — is the workflow module available?" 
+ ) + end +end diff --git a/lib/mix/tasks/durable.run.ex b/durable/lib/mix/tasks/durable.run.ex similarity index 98% rename from lib/mix/tasks/durable.run.ex rename to durable/lib/mix/tasks/durable.run.ex index ddae75d..f4d6772 100644 --- a/lib/mix/tasks/durable.run.ex +++ b/durable/lib/mix/tasks/durable.run.ex @@ -29,7 +29,7 @@ defmodule Mix.Tasks.Durable.Run do @impl Mix.Task def run(args) do - Helpers.ensure_started() + Helpers.ensure_started_readonly() {opts, positional, _} = OptionParser.parse(args, diff --git a/durable/lib/mix/tasks/durable.send_event.ex b/durable/lib/mix/tasks/durable.send_event.ex new file mode 100644 index 0000000..b858a9b --- /dev/null +++ b/durable/lib/mix/tasks/durable.send_event.ex @@ -0,0 +1,117 @@ +defmodule Mix.Tasks.Durable.SendEvent do + @shortdoc "Sends an event to a waiting workflow" + + @moduledoc """ + Delivers an event to a workflow that is suspended on + `wait_for_event/2`, `wait_for_any/2`, or `wait_for_all/2`. + + Payload parsing rules match `mix durable.provide_input` — JSON when + it looks like JSON, raw string otherwise. 
+ + ## Usage + + mix durable.send_event WORKFLOW_ID EVENT_NAME PAYLOAD [options] + + ## Options + + * `--json-file PATH` - Read payload from a file instead + * `--name NAME` - The Durable instance name (default: Durable) + + ## Examples + + mix durable.send_event ab12 payment_confirmed '{"amount":99.99}' + mix durable.send_event ab12 shipping_update delivered + """ + + use Mix.Task + + alias Durable.Mix.Helpers + + @impl Mix.Task + def run(args) do + Helpers.ensure_started_readonly() + + {opts, positional, _} = + OptionParser.parse(args, strict: [json_file: :string, name: :string]) + + case positional do + [id, event_name | rest] -> + with {:ok, payload} <- build_payload(rest, opts), + {:ok, resolved_id} <- + Helpers.resolve_workflow_id(Helpers.get_durable_name(opts), id) do + submit(resolved_id, event_name, payload, opts) + else + {:error, :ambiguous, matches} -> + Mix.shell().error("Prefix is ambiguous:\n " <> Enum.join(matches, "\n ")) + + {:error, :not_found} -> + Mix.shell().error("No workflow matches: #{id}") + + {:error, reason} -> + Mix.shell().error("Invalid payload: #{reason}") + end + + _ -> + Mix.shell().error( + "Usage: mix durable.send_event WORKFLOW_ID EVENT_NAME PAYLOAD [--json-file PATH]" + ) + end + end + + defp submit(workflow_id, event_name, payload, opts) do + durable = Helpers.get_durable_name(opts) + + case Durable.Wait.send_event(workflow_id, event_name, payload, durable: durable) do + :ok -> + Mix.shell().info("Sent event '#{event_name}' to #{Helpers.truncate_id(workflow_id)}.") + + {:error, reason} -> + Mix.shell().error("Failed: #{inspect(reason)}") + end + end + + # Shares parsing with durable.provide_input. Keeping it duplicated is + # cheaper than making a shared helper at this size, and each task + # stays standalone. 
+ defp build_payload([], opts) do + case Keyword.get(opts, :json_file) do + nil -> + {:error, "missing PAYLOAD positional arg or --json-file"} + + path -> + with {:ok, body} <- read_file(path), + {:ok, json} <- Jason.decode(body) do + {:ok, json} + else + {:error, %Jason.DecodeError{} = e} -> {:error, Exception.message(e)} + {:error, reason} -> {:error, inspect(reason)} + end + end + end + + defp build_payload([raw | _], _opts), do: {:ok, parse_payload(raw)} + + defp read_file(path) do + case File.read(path) do + {:ok, body} -> {:ok, body} + {:error, reason} -> {:error, "could not read #{path}: #{:file.format_error(reason)}"} + end + end + + defp parse_payload(raw) do + trimmed = String.trim(raw) + + if json_like?(trimmed) do + case Jason.decode(trimmed) do + {:ok, decoded} -> decoded + _ -> raw + end + else + raw + end + end + + defp json_like?(<>) when c in [?{, ?[, ?", ?-, ?t, ?f, ?n], do: true + defp json_like?(<>) when c in ?0..?9, do: true + defp json_like?(_), do: false +end diff --git a/lib/mix/tasks/durable.status.ex b/durable/lib/mix/tasks/durable.status.ex similarity index 98% rename from lib/mix/tasks/durable.status.ex rename to durable/lib/mix/tasks/durable.status.ex index dcb8f7f..6b9d800 100644 --- a/lib/mix/tasks/durable.status.ex +++ b/durable/lib/mix/tasks/durable.status.ex @@ -33,7 +33,7 @@ defmodule Mix.Tasks.Durable.Status do @impl Mix.Task def run(args) do - Helpers.ensure_started() + Helpers.ensure_started_readonly() {opts, _, _} = OptionParser.parse(args, strict: [name: :string]) durable_name = Helpers.get_durable_name(opts) diff --git a/durable/mix.exs b/durable/mix.exs new file mode 100644 index 0000000..dd482b2 --- /dev/null +++ b/durable/mix.exs @@ -0,0 +1,106 @@ +defmodule Durable.MixProject do + use Mix.Project + + @version "0.0.0-alpha" + @source_url "https://github.com/wavezync/durable" + @homepage_url "https://durable.wavezync.com" + + def project do + [ + app: :durable, + version: @version, + elixir: "~> 1.15", + elixirc_paths: 
elixirc_paths(Mix.env()), + start_permanent: Mix.env() == :prod, + aliases: aliases(), + deps: deps(), + name: "Durable", + homepage_url: @homepage_url, + description: "A durable, resumable workflow engine for Elixir", + source_url: @source_url, + docs: docs(), + package: package() + ] + end + + def cli do + [ + preferred_envs: [precommit: :test] + ] + end + + def application do + [ + extra_applications: [:logger], + mod: {Durable.Application, []} + ] + end + + defp elixirc_paths(:test), do: ["lib", "test/support"] + defp elixirc_paths(_), do: ["lib"] + + defp deps do + [ + # Core + {:ecto_sql, "~> 3.12"}, + {:postgrex, "~> 0.19"}, + {:jason, "~> 1.4"}, + {:telemetry, "~> 1.3"}, + {:nimble_options, "~> 1.1"}, + {:crontab, "~> 1.1"}, + {:igniter, "~> 0.6", optional: true}, + {:phoenix_pubsub, "~> 2.1", optional: true}, + + # Dev/Test + {:ex_doc, "~> 0.34", only: :dev, runtime: false}, + {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, + {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false} + ] + end + + defp aliases do + [ + setup: ["deps.get", "ecto.setup"], + "ecto.setup": ["ecto.create", "ecto.migrate"], + "ecto.reset": ["ecto.drop", "ecto.setup"], + test: ["ecto.create --quiet", "ecto.migrate --quiet", "test"], + precommit: ["format", "compile --warnings-as-errors", "credo --strict", "test"] + ] + end + + defp docs do + [ + main: "readme", + source_url: @source_url, + source_ref: "v#{@version}", + extras: [ + "README.md", + "guides/ai_workflows.md", + "guides/branching.md", + "guides/compensations.md", + "guides/orchestration.md", + "guides/parallel.md", + "guides/waiting.md" + ], + groups_for_modules: [ + "Mix Tasks": [ + Mix.Tasks.Durable.Migrations, + Mix.Tasks.Durable.Gen.Upgrade, + Mix.Tasks.Durable.Status, + Mix.Tasks.Durable.List, + Mix.Tasks.Durable.Run, + Mix.Tasks.Durable.Cancel, + Mix.Tasks.Durable.Cleanup + ] + ] + ] + end + + defp package do + [ + licenses: ["MIT"], + links: %{"GitHub" => @source_url}, + files: ~w(lib priv 
.formatter.exs mix.exs README.md LICENSE) + ] + end +end diff --git a/mix.lock b/durable/mix.lock similarity index 98% rename from mix.lock rename to durable/mix.lock index 8f01a80..bab22bb 100644 --- a/mix.lock +++ b/durable/mix.lock @@ -25,6 +25,7 @@ "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "owl": {:hex, :owl, "0.13.0", "26010e066d5992774268f3163506972ddac0a7e77bfe57fa42a250f24d6b876e", [:mix], [{:ucwidth, "~> 0.2", [hex: :ucwidth, repo: "hexpm", optional: true]}], "hexpm", "59bf9d11ce37a4db98f57cb68fbfd61593bf419ec4ed302852b6683d3d2f7475"}, + "phoenix_pubsub": {:hex, :phoenix_pubsub, "2.2.0", "ff3a5616e1bed6804de7773b92cbccfc0b0f473faf1f63d7daf1206c7aeaaa6f", [:mix], [], "hexpm", "adc313a5bf7136039f63cfd9668fde73bba0765e0614cba80c06ac9460ff3e96"}, "postgrex": {:hex, :postgrex, "0.21.1", "2c5cc830ec11e7a0067dd4d623c049b3ef807e9507a424985b8dcf921224cd88", [:mix], [{:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "27d8d21c103c3cc68851b533ff99eef353e6a0ff98dc444ea751de43eb48bdac"}, "req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: 
false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"}, "rewrite": {:hex, :rewrite, "1.2.0", "80220eb14010e175b67c939397e1a8cdaa2c32db6e2e0a9d5e23e45c0414ce21", [:mix], [{:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:sourceror, "~> 1.0", [hex: :sourceror, repo: "hexpm", optional: false]}, {:text_diff, "~> 0.1", [hex: :text_diff, repo: "hexpm", optional: false]}], "hexpm", "a1cd702bbb9d51613ab21091f04a386d750fc6f4516b81900df082d78b2d8c50"}, diff --git a/priv/repo/migrations/20241229000001_create_workflow_executions.exs b/durable/priv/repo/migrations/20241229000001_create_workflow_executions.exs similarity index 100% rename from priv/repo/migrations/20241229000001_create_workflow_executions.exs rename to durable/priv/repo/migrations/20241229000001_create_workflow_executions.exs diff --git a/priv/repo/migrations/20241229000002_create_step_executions.exs b/durable/priv/repo/migrations/20241229000002_create_step_executions.exs similarity index 100% rename from priv/repo/migrations/20241229000002_create_step_executions.exs rename to durable/priv/repo/migrations/20241229000002_create_step_executions.exs diff --git a/priv/repo/migrations/20241229000003_create_pending_inputs.exs b/durable/priv/repo/migrations/20241229000003_create_pending_inputs.exs similarity index 100% rename from priv/repo/migrations/20241229000003_create_pending_inputs.exs rename to durable/priv/repo/migrations/20241229000003_create_pending_inputs.exs diff --git a/priv/repo/migrations/20241229000004_create_scheduled_workflows.exs b/durable/priv/repo/migrations/20241229000004_create_scheduled_workflows.exs similarity index 100% rename from priv/repo/migrations/20241229000004_create_scheduled_workflows.exs rename to 
durable/priv/repo/migrations/20241229000004_create_scheduled_workflows.exs diff --git a/priv/test_repo/migrations/20241229000001_create_durable_tables.exs b/durable/priv/test_repo/migrations/20241229000001_create_durable_tables.exs similarity index 100% rename from priv/test_repo/migrations/20241229000001_create_durable_tables.exs rename to durable/priv/test_repo/migrations/20241229000001_create_durable_tables.exs diff --git a/test/durable/branch_test.exs b/durable/test/durable/branch_test.exs similarity index 100% rename from test/durable/branch_test.exs rename to durable/test/durable/branch_test.exs diff --git a/test/durable/compensation_test.exs b/durable/test/durable/compensation_test.exs similarity index 100% rename from test/durable/compensation_test.exs rename to durable/test/durable/compensation_test.exs diff --git a/test/durable/context_test.exs b/durable/test/durable/context_test.exs similarity index 71% rename from test/durable/context_test.exs rename to durable/test/durable/context_test.exs index 62fb18c..dc2bbb8 100644 --- a/test/durable/context_test.exs +++ b/durable/test/durable/context_test.exs @@ -135,6 +135,42 @@ defmodule Durable.ContextTest do end end + # ============================================================================ + # Bug C-1 regression — put_context must persist across steps + # ============================================================================ + + describe "put_context persistence (Bug C-1 regression)" do + test "put_context writes persist to the next step without being in the return map" do + {:ok, execution} = create_and_execute_workflow(PutContextPersistWorkflow, %{}) + + assert execution.status == :completed + # Step 1 did put_context(:charge_id, "ch_1") but returned {:ok, %{step1_done: true}}. + # Step 2 read get_context(:charge_id) and stored it. + # Before the fix, Step 2 saw nil because save_data_as_context dropped + # the put_context write. 
+ assert execution.context["seen_charge_id"] == "ch_1" + assert execution.context["step1_done"] == true + end + + test "put_context writes persist across decision {:goto, ...}" do + {:ok, execution} = create_and_execute_workflow(PutContextGotoWorkflow, %{}) + + assert execution.status == :completed + # Step 1 put_context'd :prior, decision goto'd to :after_goto, which reads it. + # Before the fix, the goto's new_data replaced context wholesale and :prior vanished. + assert execution.context["saw_prior"] == "before_decision" + end + + test "step return value wins over prior put_context on key collision" do + {:ok, execution} = create_and_execute_workflow(PutContextCollisionWorkflow, %{}) + + assert execution.status == :completed + # Step 1 did put_context(:mode, "from_ctx") then returned %{mode: "from_return"}. + # Step 2 reads :mode — step return should win. + assert execution.context["final_mode"] == "from_return" + end + end + # ============================================================================ # Helper Functions # ============================================================================ @@ -325,3 +361,70 @@ defmodule NumericContextWorkflow do end) end end + +# ============================================================================ +# Bug C-1 regression fixtures +# ============================================================================ + +defmodule PutContextPersistWorkflow do + use Durable + use Durable.Helpers + use Durable.Context + + workflow "put_context_persist" do + step(:step1, fn _data -> + # Write to context via put_context — it should persist to the next step + # EVEN THOUGH the step's return map doesn't include :charge_id. + put_context(:charge_id, "ch_1") + {:ok, %{step1_done: true}} + end) + + step(:step2, fn _data -> + # Before the fix, get_context(:charge_id) returned nil here. 
+ {:ok, %{seen_charge_id: get_context(:charge_id)}} + end) + end +end + +defmodule PutContextGotoWorkflow do + use Durable + use Durable.Helpers + use Durable.Context + + workflow "put_context_goto" do + step(:step1, fn _data -> + put_context(:prior, "before_decision") + {:ok, %{step1_done: true}} + end) + + decision(:decide, fn data -> + # Goto new_data deliberately does NOT include :prior — + # the fix should preserve it via the process-dict merge. + {:goto, :after_goto, Map.drop(data, [:prior, "prior"])} + end) + + step(:skipped, fn _data -> {:ok, %{should_not_run: true}} end) + + step(:after_goto, fn _data -> + {:ok, %{saw_prior: get_context(:prior)}} + end) + end +end + +defmodule PutContextCollisionWorkflow do + use Durable + use Durable.Helpers + use Durable.Context + + workflow "put_context_collision" do + step(:step1, fn _data -> + put_context(:mode, "from_ctx") + # Step return has the same key — return value MUST win on collision. + {:ok, %{mode: "from_return"}} + end) + + step(:step2, fn _data -> + {:ok, %{final_mode: get_context(:mode)}} + end) + end +end diff --git a/test/durable/decision_test.exs b/durable/test/durable/decision_test.exs similarity index 100% rename from test/durable/decision_test.exs rename to durable/test/durable/decision_test.exs diff --git a/durable/test/durable/executor/crash_modes_test.exs b/durable/test/durable/executor/crash_modes_test.exs new file mode 100644 index 0000000..76473b7 --- /dev/null +++ b/durable/test/durable/executor/crash_modes_test.exs @@ -0,0 +1,83 @@ +defmodule Durable.Executor.CrashModesTest do + @moduledoc """ + Resilience matrix — every shape of step-body crash should produce a workflow + in `:failed` with a serializable error payload, locks cleared, and a + `:workflow_failed` PubSub broadcast. Mirrors oban's "safely executing jobs + with any type of exit" test. 
+ + TASK_CRASH (`Process.exit(self(), :kill)`) is intentionally NOT in this + matrix — in inline mode `self()` is the test process, so killing it would + abort the test. That path is exercised in the supervised Worker test. + """ + + use Durable.DataCase, async: false + + import Durable.DataCase, only: [create_and_execute_workflow: 2, pid_to_bin: 0] + + alias Durable.Config + alias Durable.PubSub, as: DurablePubSub + alias Durable.Storage.Schemas.WorkflowExecution + alias Durable.TestWorkflows.SinkWorkflow + + @crash_actions [ + {"ERROR", "sink_error"}, + {"RAISE", "RuntimeError"}, + {"EXIT", "exit"} + ] + + describe "step-body crash modes" do + for {action, expected_type} <- @crash_actions do + test "#{action} produces :failed with error.type=#{expected_type} and clears the lock" do + action = unquote(action) + expected_type = unquote(expected_type) + + config = Config.get(Durable) + repo = config.repo + + :ok = DurablePubSub.subscribe(config, DurablePubSub.workflows_topic(config)) + + ref = "ref-#{action}" + + input = %{ + "action" => action, + "ref" => ref, + "bin_pid" => pid_to_bin() + } + + {:ok, exec} = create_and_execute_workflow(SinkWorkflow, input) + + assert exec.status == :failed, "expected #{action} → :failed, got #{exec.status}" + assert exec.error["type"] == expected_type + assert is_binary(exec.error["message"]) + assert exec.error["message"] != "" + assert {:ok, _} = Jason.encode(exec.error) + + # Lock cleared on failure + assert exec.locked_by == nil + assert exec.locked_at == nil + + # Workflow row reloads identical (eliminates "is the in-memory copy + # different from what's persisted?" doubt). 
+ reloaded = repo.get!(WorkflowExecution, exec.id) + assert reloaded.status == :failed + assert reloaded.locked_by == nil + + # PubSub broadcast happened + wf_id = exec.id + assert_receive {:durable_event, :workflow_failed, %{id: ^wf_id, status: :failed}}, 200 + end + end + end + + describe "non-crashing actions sanity (smoke)" do + test "OK action completes cleanly" do + ref = "ref-OK" + input = %{"action" => "OK", "ref" => ref, "bin_pid" => pid_to_bin()} + + {:ok, exec} = create_and_execute_workflow(SinkWorkflow, input) + + assert exec.status == :completed + assert_receive {:done, ^ref} + end + end +end diff --git a/durable/test/durable/executor/error_sanitization_test.exs b/durable/test/durable/executor/error_sanitization_test.exs new file mode 100644 index 0000000..60654cf --- /dev/null +++ b/durable/test/durable/executor/error_sanitization_test.exs @@ -0,0 +1,149 @@ +defmodule Durable.Executor.ErrorSanitizationTest do + @moduledoc """ + Regression coverage for Bug #3 — secondary JSON encoding errors hiding the + root cause (see `docs/bug-reports/2026-04-12-parallel-context-and-serialization.md`). + + When a step crashed with an error payload that contained tuples, PIDs, + functions, or refs, `Repo.update` would fail to encode it as JSONB and + throw `Protocol.UndefinedError`. That secondary error masked the original + crash and left the workflow in an unrecoverable `:waiting` state. + + The fix: recursively sanitize the error payload through `sanitize_for_json/1` + before persisting, with a try/rescue fallback that stores a minimal + diagnostic if even the sanitized payload fails to save. 
+ """ + + use Durable.DataCase, async: false + + alias Durable.Config + alias Durable.Executor + alias Durable.Storage.Schemas.WorkflowExecution + + describe "sanitize_for_json/1" do + test "passes through JSON-safe primitives unchanged" do + assert Executor.sanitize_for_json(nil) == nil + assert Executor.sanitize_for_json(true) == true + assert Executor.sanitize_for_json(42) == 42 + assert Executor.sanitize_for_json(3.14) == 3.14 + assert Executor.sanitize_for_json("hello") == "hello" + assert Executor.sanitize_for_json(:an_atom) == :an_atom + end + + test "converts tuples to lists recursively" do + assert Executor.sanitize_for_json({:ok, "yes"}) == [:ok, "yes"] + assert Executor.sanitize_for_json({1, 2, 3}) == [1, 2, 3] + + assert Executor.sanitize_for_json({{:nested, "tuple"}, "outer"}) == + [[:nested, "tuple"], "outer"] + end + + test "walks maps recursively" do + input = %{status: {:ok, "done"}, count: 5} + assert Executor.sanitize_for_json(input) == %{status: [:ok, "done"], count: 5} + end + + test "walks lists recursively" do + input = [{:ok, 1}, {:error, :boom}, "ok"] + assert Executor.sanitize_for_json(input) == [[:ok, 1], [:error, :boom], "ok"] + end + + test "preserves Date/DateTime/Time structs (Jason knows how to encode them)" do + now = DateTime.utc_now() + today = Date.utc_today() + time = Time.utc_now() + naive = NaiveDateTime.utc_now() + + assert Executor.sanitize_for_json(now) == now + assert Executor.sanitize_for_json(today) == today + assert Executor.sanitize_for_json(time) == time + assert Executor.sanitize_for_json(naive) == naive + end + + test "converts other structs to plain maps" do + range = 1..5 + # Range is a struct that Jason cannot encode directly. 
+ result = Executor.sanitize_for_json(range) + assert is_map(result) + refute is_struct(result) + end + + test "stringifies PIDs, references, and functions" do + pid_result = Executor.sanitize_for_json(self()) + assert is_binary(pid_result) + assert String.contains?(pid_result, "PID") + + ref_result = Executor.sanitize_for_json(make_ref()) + assert is_binary(ref_result) + + fun_result = Executor.sanitize_for_json(&Enum.map/2) + assert is_binary(fun_result) + end + + test "handles deeply nested unencodable values" do + messy_error = %{ + type: "crash", + details: [{:ok, {:some, "tuple"}}, [{:error, self()}]], + meta: %{inner: {make_ref(), :boom}} + } + + result = Executor.sanitize_for_json(messy_error) + + # Verify the result can actually be encoded to JSON — the point of + # this whole function. + assert {:ok, _json} = Jason.encode(result) + end + end + + describe "mark_failed survives unencodable error payloads (end-to-end)" do + test "workflow with error payload containing tuples can be persisted as :failed" do + config = Config.get(Durable) + repo = config.repo + + {:ok, execution} = + %WorkflowExecution{} + |> WorkflowExecution.changeset(%{ + workflow_module: "TestWorkflow", + workflow_name: "test", + status: :running, + queue: "default", + priority: 0, + input: %{}, + context: %{}, + locked_by: "test_node", + locked_at: DateTime.utc_now() + }) + |> repo.insert() + + # A payload that would have crashed Repo.update before the sanitizer. + nasty_error = %{ + type: "FunctionClauseError", + message: "no function clause matching", + details: {:ok, {:some, "tuple"}}, + worker_pid: self(), + request_ref: make_ref(), + retries: [{:attempt, 1, {:error, :timeout}}] + } + + # Simulate what mark_failed does internally: sanitize, then persist. + # This is a focused integration check on the save path. 
+ safe = Executor.sanitize_for_json(nasty_error) + + {:ok, _} = + execution + |> Ecto.Changeset.change(status: :failed, error: safe) + |> repo.update() + + # Re-fetch so we read the JSONB-round-tripped string-keyed version + # (the same shape the dashboard and consumers will see). + reloaded = repo.get!(WorkflowExecution, execution.id) + + assert reloaded.status == :failed + assert reloaded.error["type"] == "FunctionClauseError" + # Tuples became lists (atoms survive round-trip as strings via JSON) + assert reloaded.error["details"] == ["ok", ["some", "tuple"]] + # PID became a string + assert is_binary(reloaded.error["worker_pid"]) + assert String.contains?(reloaded.error["worker_pid"], "PID") + end + end +end diff --git a/test/durable/integration_test.exs b/durable/test/durable/integration_test.exs similarity index 100% rename from test/durable/integration_test.exs rename to durable/test/durable/integration_test.exs diff --git a/test/durable/log_capture/handler_test.exs b/durable/test/durable/log_capture/handler_test.exs similarity index 100% rename from test/durable/log_capture/handler_test.exs rename to durable/test/durable/log_capture/handler_test.exs diff --git a/test/durable/log_capture/integration_test.exs b/durable/test/durable/log_capture/integration_test.exs similarity index 77% rename from test/durable/log_capture/integration_test.exs rename to durable/test/durable/log_capture/integration_test.exs index 05f3dcb..5ee4c6b 100644 --- a/test/durable/log_capture/integration_test.exs +++ b/durable/test/durable/log_capture/integration_test.exs @@ -33,13 +33,10 @@ defmodule Durable.LogCapture.IntegrationTest do assert step_exec != nil, "Step execution should be created" assert step_exec.status == :completed assert is_list(step_exec.logs) + assert step_exec.logs != [], "expected Logger calls to be captured in step.logs" - # Check that logs were captured - if step_exec.logs != [] do - messages = Enum.map_join(step_exec.logs, " ", & &1["message"]) - # At minimum 
we should see some log content - assert messages =~ "message" or messages =~ "workflow" - end + messages = Enum.map_join(step_exec.logs, " ", & &1["message"]) + assert messages =~ "message" or messages =~ "workflow" end test "captures IO output in workflow step" do @@ -59,14 +56,13 @@ defmodule Durable.LogCapture.IntegrationTest do assert step_exec != nil, "Step execution should be created" assert step_exec.status == :completed assert is_list(step_exec.logs) + assert step_exec.logs != [], "expected IO output to be captured in step.logs" - # Check for IO logs io_logs = Enum.filter(step_exec.logs, fn log -> log["source"] == "io" end) + assert io_logs != [], "expected at least one log entry with source=io" - if io_logs != [] do - io_messages = Enum.map_join(io_logs, " ", & &1["message"]) - assert io_messages =~ "IO" or io_messages =~ "output" - end + io_messages = Enum.map_join(io_logs, " ", & &1["message"]) + assert io_messages =~ "IO" or io_messages =~ "output" end test "each step has isolated logs" do @@ -88,14 +84,19 @@ defmodule Durable.LogCapture.IntegrationTest do [first, second] = step_execs - # Each step should have its own logs + assert first.logs != [], "first step should have captured logs" + assert second.logs != [], "second step should have captured logs" + first_messages = Enum.map_join(first.logs, " ", & &1["message"]) second_messages = Enum.map_join(second.logs, " ", & &1["message"]) - # Check logs are isolated (first step shouldn't have second step's log) - if first_messages != "" and second_messages != "" do - assert first_messages =~ "First" or not (first_messages =~ "Second") - end + # Logs must be isolated: the first step's buffer must not contain the + # second step's log message, and vice versa. 
+ refute first_messages =~ "Second step log", + "first step's logs leaked the second step's message" + + refute second_messages =~ "First step log", + "second step's logs leaked the first step's message" end end @@ -115,16 +116,15 @@ defmodule Durable.LogCapture.IntegrationTest do ) assert step_exec != nil + assert step_exec.logs != [], "expected at least one captured log entry" - if step_exec.logs != [] do - [log | _] = step_exec.logs + [log | _] = step_exec.logs - assert Map.has_key?(log, "timestamp") - assert Map.has_key?(log, "level") - assert Map.has_key?(log, "message") - assert Map.has_key?(log, "source") - assert Map.has_key?(log, "metadata") - end + assert Map.has_key?(log, "timestamp") + assert Map.has_key?(log, "level") + assert Map.has_key?(log, "message") + assert Map.has_key?(log, "source") + assert Map.has_key?(log, "metadata") end test "timestamp is ISO8601 format" do @@ -142,12 +142,10 @@ defmodule Durable.LogCapture.IntegrationTest do ) assert step_exec != nil + assert step_exec.logs != [], "expected at least one captured log entry" - if step_exec.logs != [] do - [log | _] = step_exec.logs - # Should be parseable as DateTime - assert {:ok, _, _} = DateTime.from_iso8601(log["timestamp"]) - end + [log | _] = step_exec.logs + assert {:ok, _, _} = DateTime.from_iso8601(log["timestamp"]) end end diff --git a/test/durable/log_capture/io_server_test.exs b/durable/test/durable/log_capture/io_server_test.exs similarity index 100% rename from test/durable/log_capture/io_server_test.exs rename to durable/test/durable/log_capture/io_server_test.exs diff --git a/test/durable/log_capture_test.exs b/durable/test/durable/log_capture_test.exs similarity index 100% rename from test/durable/log_capture_test.exs rename to durable/test/durable/log_capture_test.exs diff --git a/durable/test/durable/migration_test.exs b/durable/test/durable/migration_test.exs new file mode 100644 index 0000000..c768db4 --- /dev/null +++ b/durable/test/durable/migration_test.exs @@ -0,0 
+1,57 @@ +defmodule Durable.MigrationTest do + use Durable.DataCase, async: false + + alias Durable.Migration + alias Durable.Migration.Migrator + + test "current_version returns the latest registered migration version" do + assert Migration.current_version() == List.last(Migration.all_versions()) + end + + test "previous_version returns the version before the target" do + [first, second | _] = Migration.all_versions() + + assert Migration.previous_version(first) == 0 + assert Migration.previous_version(second) == first + assert Migration.previous_version() == Enum.at(Migration.all_versions(), -2) + end + + test "explicit repo helpers report default schema state" do + assert Migration.migrated_version(Durable.TestRepo) == Migration.current_version() + assert Migration.pending_versions(Durable.TestRepo) == [] + end + + test "explicit repo helpers report missing prefixes as unmigrated" do + prefix = "durable_missing_#{System.unique_integer([:positive])}" + + assert Migration.migrated_version(Durable.TestRepo, prefix: prefix) == 0 + + assert Migration.pending_versions(Durable.TestRepo, prefix: prefix) == + Migration.all_versions() + end + + test "all migration files are registered with the migrator" do + file_modules = + "lib/durable/migration/migrations/v*.ex" + |> Path.wildcard() + |> Enum.map(&module_from_file!/1) + |> Enum.sort() + + registered_modules = + Migrator.all_migrations() + |> Enum.map(fn {_version, mod} -> mod end) + |> Enum.sort() + + assert registered_modules == file_modules + end + + defp module_from_file!(path) do + path + |> File.read!() + |> then(&Regex.run(~r/defmodule\s+([A-Za-z0-9_.]+)/, &1)) + |> case do + [_match, module] -> String.to_existing_atom("Elixir." 
<> module) + nil -> flunk("Expected #{path} to define a module") + end + end +end diff --git a/test/durable/orchestration_test.exs b/durable/test/durable/orchestration_test.exs similarity index 100% rename from test/durable/orchestration_test.exs rename to durable/test/durable/orchestration_test.exs diff --git a/test/durable/parallel_test.exs b/durable/test/durable/parallel_test.exs similarity index 78% rename from test/durable/parallel_test.exs rename to durable/test/durable/parallel_test.exs index 20e17ae..f1b0fe9 100644 --- a/test/durable/parallel_test.exs +++ b/durable/test/durable/parallel_test.exs @@ -269,6 +269,91 @@ defmodule Durable.ParallelTest do end end + describe "parallel execution - context inheritance (Bug #1 regression)" do + test "parallel children can read context set by earlier non-parallel steps" do + config = Config.get(Durable) + repo = config.repo + + {:ok, parent} = create_and_execute_workflow(ParallelContextInheritWorkflow, %{}) + assert parent.status == :waiting + + # Each child should have inherited the parent's accumulated context + children = get_child_executions(repo, parent.id) + + Enum.each(children, fn child -> + # The shared key was returned by :seed (in its result map) before the parallel block. + # It must now live in the child's context (in addition to __parallel_step).
+ assert child.context["shared_key"] == "hello", + "Child #{child.id} did not inherit shared_key; got: #{inspect(child.context)}" + + assert child.context["shared_count"] == 42 + # Internal marker is still there for the executor's use + assert child.context["__parallel_step"] != nil + end) + + # Execute children — they use get_context to read the inherited values + execute_children(repo, parent.id, config) + + # Finalize parent + parent = repo.get!(WorkflowExecution, parent.id) + Executor.execute_workflow(parent.id, config) + parent = repo.get!(WorkflowExecution, parent.id) + + assert parent.status == :completed + + # Both readers saw the inherited values + results = parent.context["__results__"] + assert results["reader_a"] == ["ok", %{"saw" => "hello", "count" => 42}] + assert results["reader_b"] == ["ok", %{"saw" => "hello"}] + end + + test "parallel children don't see each other's put_context writes (intentional isolation)" do + # Siblings run concurrently; allowing them to share mutations would race. + # This test documents the current boundary: inherited context is a + # snapshot taken at spawn time. 
+ config = Config.get(Durable) + repo = config.repo + + {:ok, parent} = create_and_execute_workflow(ParallelIsolationWorkflow, %{}) + execute_children(repo, parent.id, config) + + parent = repo.get!(WorkflowExecution, parent.id) + Executor.execute_workflow(parent.id, config) + parent = repo.get!(WorkflowExecution, parent.id) + + assert parent.status == :completed + results = parent.context["__results__"] + # writer_step put_context(:sibling_wrote, true), but peer_step can't see it + assert results["peer_step"] == ["ok", %{"saw_sibling" => nil}] + end + end + + describe "parallel execution - into: callback safety (Bug #2 regression)" do + test "raw tuples returned by into: callback are sanitized before persisting" do + config = Config.get(Durable) + repo = config.repo + + {:ok, parent} = create_and_execute_workflow(IntoPassthroughWorkflow, %{}) + execute_children(repo, parent.id, config) + + parent = repo.get!(WorkflowExecution, parent.id) + Executor.execute_workflow(parent.id, config) + parent = repo.get!(WorkflowExecution, parent.id) + + # Before the fix, this would crash with Protocol.UndefinedError because + # the user's callback returned a map containing raw {:ok, _} tuples + # from `results` and Jason can't encode tuples to JSONB. After the + # fix, save_data_as_context runs sanitize_for_json on everything it + # persists — tuples become lists automatically. 
+ assert parent.status == :completed + pass_through = parent.context["pass_through"] + assert is_map(pass_through) + # Tuples have been converted to lists + assert match?(["ok", _], pass_through["task_x"]) + assert match?(["ok", _], pass_through["task_y"]) + end + end + describe "parallel execution - distributed" do test "children can be executed on separate workers" do config = Config.get(Durable) @@ -780,3 +865,84 @@ defmodule QueueRoutingWorkflow do end) end end + +# =========================================================================== +# Test fixtures for Bug #1 regression — parallel context inheritance +# =========================================================================== + +defmodule ParallelContextInheritWorkflow do + use Durable + use Durable.Helpers + use Durable.Context + + workflow "parallel_context_inherit" do + step(:seed, fn _data -> + # Context values flow to downstream steps via the step's return map. + # This is the standard Durable pipeline pattern. + {:ok, %{shared_key: "hello", shared_count: 42, seeded: true}} + end) + + parallel into: fn _ctx, results -> {:ok, %{__results__: results}} end do + step(:reader_a, fn _data -> + # Before the fix, these would return nil because parallel children + # did not inherit the parent's accumulated context. + {:ok, %{saw: get_context(:shared_key), count: get_context(:shared_count)}} + end) + + step(:reader_b, fn _data -> + {:ok, %{saw: get_context(:shared_key, "missing")}} + end) + end + end +end + +defmodule ParallelIsolationWorkflow do + use Durable + use Durable.Helpers + use Durable.Context + + workflow "parallel_isolation" do + step(:setup, fn _data -> + {:ok, %{}} + end) + + parallel into: fn _ctx, results -> {:ok, %{__results__: results}} end do + step(:writer_step, fn _data -> + # Writes made inside one parallel sibling must NOT be visible to another. 
+ put_context(:sibling_wrote, true) + {:ok, %{wrote: true}} + end) + + step(:peer_step, fn _data -> + # Give the writer a head start — this is still not enough for + # cross-sibling reads to work because parallel siblings are isolated. + Process.sleep(5) + {:ok, %{saw_sibling: get_context(:sibling_wrote)}} + end) + end + end +end + +# =========================================================================== +# Test fixture for Bug #2 regression — into: callback pre-serialization +# =========================================================================== + +defmodule IntoPassthroughWorkflow do + use Durable + use Durable.Helpers + use Durable.Context + + workflow "into_passthrough" do + step(:setup, fn _data -> + {:ok, %{}} + end) + + # The user's into: callback passes the `results` map through untouched. + # Before the fix, `results` contained raw {:ok, _} tuples and storage + # crashed. After the fix, results are pre-serialized to lists. + parallel into: fn _ctx, results -> {:ok, %{pass_through: results}} end do + step(:task_x, fn _data -> {:ok, %{x: 1}} end) + step(:task_y, fn _data -> {:ok, %{y: 2}} end) + end + end +end diff --git a/durable/test/durable/pubsub_test.exs b/durable/test/durable/pubsub_test.exs new file mode 100644 index 0000000..302485e --- /dev/null +++ b/durable/test/durable/pubsub_test.exs @@ -0,0 +1,150 @@ +defmodule Durable.PubSubTest do + @moduledoc """ + Tests for `Durable.PubSub` lifecycle broadcasts. + + Subscribes to workflow and input topics, drives workflows inline via the + executor, and asserts that the expected `{:durable_event, kind, payload}` + messages arrive. 
+ """ + use Durable.DataCase, async: false + + alias Durable.Config + alias Durable.Executor + alias Durable.PubSub, as: DurablePubSub + alias Durable.Wait + + setup do + config = Config.get(Durable) + {:ok, config: config} + end + + describe "subscribe/2" do + test "returns :ok when pubsub is configured", %{config: config} do + assert :ok = DurablePubSub.subscribe(config, DurablePubSub.workflows_topic(config)) + end + + test "returns {:error, :no_pubsub} when config has no pubsub", %{config: config} do + nilled = %{config | pubsub: nil} + assert {:error, :no_pubsub} = DurablePubSub.subscribe(nilled, "whatever") + end + end + + describe "workflow lifecycle broadcasts" do + test "broadcasts :workflow_started on start_workflow", %{config: config} do + :ok = DurablePubSub.subscribe(config, DurablePubSub.workflows_topic(config)) + + {:ok, wf_id} = Executor.start_workflow(Durable.TestWorkflows.SimpleWorkflow, %{}) + + assert_receive {:durable_event, :workflow_started, %{id: ^wf_id, status: :pending}} + end + + test "broadcasts :workflow_resumed → :workflow_completed on inline execute", %{ + config: config + } do + :ok = DurablePubSub.subscribe(config, DurablePubSub.workflows_topic(config)) + + {:ok, wf_id} = Executor.start_workflow(Durable.TestWorkflows.SimpleWorkflow, %{}) + Executor.execute_workflow(wf_id, config) + + assert_receive {:durable_event, :workflow_started, %{id: ^wf_id}} + assert_receive {:durable_event, :workflow_resumed, %{id: ^wf_id, status: :running}} + + assert_receive {:durable_event, :workflow_completed, %{id: ^wf_id, status: :completed}} + end + + test "broadcasts :workflow_cancelled on cancel", %{config: config} do + :ok = DurablePubSub.subscribe(config, DurablePubSub.workflows_topic(config)) + + {:ok, wf_id} = Executor.start_workflow(Durable.TestWorkflows.SimpleWorkflow, %{}) + # drain the start event + assert_receive {:durable_event, :workflow_started, _} + + :ok = Executor.cancel_workflow(wf_id, "test") + + assert_receive {:durable_event, 
:workflow_cancelled, %{id: ^wf_id, status: :cancelled}} + end + + test "per-workflow topic also receives events", %{config: config} do + {:ok, wf_id} = Executor.start_workflow(Durable.TestWorkflows.SimpleWorkflow, %{}) + + # Subscribe AFTER start — check completed event arrives via per-workflow topic + :ok = DurablePubSub.subscribe(config, DurablePubSub.workflow_topic(config, wf_id)) + + Executor.execute_workflow(wf_id, config) + + assert_receive {:durable_event, :workflow_completed, %{id: ^wf_id}} + end + end + + describe "step lifecycle broadcasts" do + test "broadcasts :step_started and :step_completed", %{config: config} do + {:ok, wf_id} = Executor.start_workflow(Durable.TestWorkflows.SimpleWorkflow, %{}) + :ok = DurablePubSub.subscribe(config, DurablePubSub.workflow_topic(config, wf_id)) + + Executor.execute_workflow(wf_id, config) + + assert_receive {:durable_event, :step_started, %{step_name: "hello", status: :running}} + + assert_receive {:durable_event, :step_completed, %{step_name: "hello", status: :completed}} + end + end + + describe "input lifecycle broadcasts" do + defmodule InputWorkflow do + @moduledoc false + use Durable + use Durable.Wait + + workflow "approval" do + step(:ask, fn _data -> + approval = wait_for_input("approve", type: :approval, prompt: "OK?") + {:ok, %{approval: approval}} + end) + end + end + + test "broadcasts :input_requested when workflow waits for input", %{config: config} do + :ok = DurablePubSub.subscribe(config, DurablePubSub.inputs_topic(config)) + + {:ok, wf_id} = Executor.start_workflow(InputWorkflow, %{}) + Executor.execute_workflow(wf_id, config) + + assert_receive {:durable_event, :input_requested, + %{workflow_id: ^wf_id, input_name: "approve"}} + end + + test "broadcasts :input_provided when input is supplied", %{config: config} do + {:ok, wf_id} = Executor.start_workflow(InputWorkflow, %{}) + Executor.execute_workflow(wf_id, config) + + :ok = DurablePubSub.subscribe(config, DurablePubSub.inputs_topic(config)) + + 
:ok = Wait.provide_input(wf_id, "approve", %{approved: true}) + + assert_receive {:durable_event, :input_provided, + %{workflow_id: ^wf_id, input_name: "approve", status: :completed}} + end + end + + describe "topic naming" do + test "workflows_topic is scoped by instance name", %{config: config} do + assert DurablePubSub.workflows_topic(config) == "durable:Elixir.Durable:workflows" + end + + test "workflow_topic includes workflow id", %{config: config} do + assert DurablePubSub.workflow_topic(config, "abc-123") == + "durable:Elixir.Durable:workflow:abc-123" + end + + test "inputs_topic is scoped by instance name", %{config: config} do + assert DurablePubSub.inputs_topic(config) == "durable:Elixir.Durable:inputs" + end + end + + describe "no-pubsub fallback" do + test "broadcast_workflow is a no-op when pubsub is nil" do + config = %Config{name: :test, pubsub: nil} + assert :ok = DurablePubSub.broadcast_workflow(config, :workflow_started, %{id: "x"}) + end + end +end diff --git a/test/durable/queue/adapters/postgres_test.exs b/durable/test/durable/queue/adapters/postgres_test.exs similarity index 53% rename from test/durable/queue/adapters/postgres_test.exs rename to durable/test/durable/queue/adapters/postgres_test.exs index 339dd39..698a85a 100644 --- a/test/durable/queue/adapters/postgres_test.exs +++ b/durable/test/durable/queue/adapters/postgres_test.exs @@ -1,6 +1,8 @@ defmodule Durable.Queue.Adapters.PostgresTest do use Durable.DataCase, async: false + import Ecto.Query + alias Durable.Config alias Durable.Queue.Adapters.Postgres alias Durable.Storage.Schemas.WorkflowExecution @@ -158,6 +160,190 @@ defmodule Durable.Queue.Adapters.PostgresTest do end end + describe "recover_zombie_workflows/2 (Bug #4 regression)" do + alias Durable.Storage.Schemas.{PendingEvent, PendingInput} + + test "marks :waiting workflows with no pending inputs/events as :failed" do + long_ago = DateTime.add(DateTime.utc_now(), -3600, :second) + + zombie = 
insert_execution(workflow_name: "zombie_a", status: :waiting) + + # Backdate updated_at so the zombie is past the stale cutoff + {1, _} = + repo().update_all( + from(w in WorkflowExecution, where: w.id == ^zombie.id), + set: [updated_at: long_ago] + ) + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert count == 1 + + reloaded = repo().get!(WorkflowExecution, zombie.id) + assert reloaded.status == :failed + assert reloaded.error["type"] == "zombie_detected" + assert reloaded.error["message"] =~ "waiting" + assert reloaded.completed_at != nil + end + + test "leaves healthy :waiting workflows alone (they have a pending input)" do + long_ago = DateTime.add(DateTime.utc_now(), -3600, :second) + + waiter = insert_execution(workflow_name: "healthy_waiter", status: :waiting) + + {1, _} = + repo().update_all( + from(w in WorkflowExecution, where: w.id == ^waiter.id), + set: [updated_at: long_ago] + ) + + # Insert a pending input that this waiter depends on + {:ok, _} = + %PendingInput{} + |> PendingInput.changeset(%{ + workflow_id: waiter.id, + step_name: "step", + input_name: "approval", + input_type: :approval, + status: :pending + }) + |> repo().insert() + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert count == 0 + + reloaded = repo().get!(WorkflowExecution, waiter.id) + assert reloaded.status == :waiting + end + + test "leaves healthy :waiting workflows alone (they have a pending event)" do + long_ago = DateTime.add(DateTime.utc_now(), -3600, :second) + + waiter = insert_execution(workflow_name: "event_waiter", status: :waiting) + + {1, _} = + repo().update_all( + from(w in WorkflowExecution, where: w.id == ^waiter.id), + set: [updated_at: long_ago] + ) + + {:ok, _} = + %PendingEvent{} + |> PendingEvent.changeset(%{ + workflow_id: waiter.id, + event_name: "payment_confirmed", + step_name: "await", + status: :pending + }) + |> repo().insert() + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert 
count == 0 + + reloaded = repo().get!(WorkflowExecution, waiter.id) + assert reloaded.status == :waiting + end + + test "leaves recently updated :waiting workflows alone (within stale timeout)" do + # This waiter has no pending inputs/events, but its updated_at is fresh + # so we don't know yet if it's stuck — maybe it just transitioned. + recent_waiter = + insert_execution(workflow_name: "recent_waiter", status: :waiting) + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert count == 0 + + reloaded = repo().get!(WorkflowExecution, recent_waiter.id) + assert reloaded.status == :waiting + end + + test "leaves :running and :completed workflows alone" do + long_ago = DateTime.add(DateTime.utc_now(), -3600, :second) + + running = insert_execution(workflow_name: "running", status: :running) + completed = insert_execution(workflow_name: "completed", status: :completed) + + {2, _} = + repo().update_all( + from(w in WorkflowExecution, where: w.id in ^[running.id, completed.id]), + set: [updated_at: long_ago] + ) + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert count == 0 + + assert repo().get!(WorkflowExecution, running.id).status == :running + assert repo().get!(WorkflowExecution, completed.id).status == :completed + end + + test "marks :compensating workflows with no running compensation as :failed (M-1)" do + long_ago = DateTime.add(DateTime.utc_now(), -3600, :second) + + zombie = insert_execution(workflow_name: "comp_zombie", status: :compensating) + + {1, _} = + repo().update_all( + from(w in WorkflowExecution, where: w.id == ^zombie.id), + set: [updated_at: long_ago] + ) + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert count == 1 + assert repo().get!(WorkflowExecution, zombie.id).status == :failed + end + + test "leaves :compensating workflows with a running compensation step alone" do + alias Durable.Storage.Schemas.StepExecution + long_ago = DateTime.add(DateTime.utc_now(), -3600, 
:second) + + live = insert_execution(workflow_name: "comp_live", status: :compensating) + + {1, _} = + repo().update_all( + from(w in WorkflowExecution, where: w.id == ^live.id), + set: [updated_at: long_ago] + ) + + # Insert an actively-running compensation step + {:ok, _step} = + %StepExecution{} + |> StepExecution.changeset(%{ + workflow_id: live.id, + step_name: "comp_step", + step_type: "compensation", + attempt: 1, + status: :running + }) + |> repo().insert() + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert count == 0 + assert repo().get!(WorkflowExecution, live.id).status == :compensating + end + + test "handles multiple zombies in a single sweep" do + long_ago = DateTime.add(DateTime.utc_now(), -3600, :second) + + zombies = + for i <- 1..3 do + z = insert_execution(workflow_name: "zombie_#{i}", status: :waiting) + z.id + end + + {3, _} = + repo().update_all( + from(w in WorkflowExecution, where: w.id in ^zombies), + set: [updated_at: long_ago] + ) + + {:ok, count} = Postgres.recover_zombie_workflows(config(), 300) + assert count == 3 + + for id <- zombies do + assert repo().get!(WorkflowExecution, id).status == :failed + end + end + end + describe "ack/2" do test "clears lock fields" do job = @@ -175,6 +361,51 @@ defmodule Durable.Queue.Adapters.PostgresTest do assert execution.locked_at == nil end + test "ack of a successfully-finished job is idempotent (M-5 regression)" do + # Bug M-5: a transient ack failure used to silently re-execute the job + # via stale-recovery. The retry path makes this less likely. Calling + # ack twice on the same job should be a no-op (no errors, no duplicates). 
+ job = + insert_execution( + workflow_name: "test", + locked_by: "node_a", + locked_at: DateTime.utc_now(), + status: :running + ) + + assert :ok = Postgres.ack(config(), job.id) + assert :ok = Postgres.ack(config(), job.id) + end + end + + describe "ack/2 telemetry" do + test "ack_failed telemetry does not fire for non-existent jobs" do + # Use a non-existent job ID — the get returns nil, so the existing + # :not_found branch fires. To exercise the failure-after-retries + # branch we'd need to fault-inject Repo.update; here we only assert + # that the :not_found path does not emit the :ack_failed event. + ref = make_ref() + test_pid = self() + + :ok = + :telemetry.attach( + "ack-failed-test-#{inspect(ref)}", + [:durable, :queue, :ack_failed], + fn _event, measurements, metadata, _ -> + send(test_pid, {:ack_failed, ref, measurements, metadata}) + end, + nil + ) + + # Non-existent jobs return :not_found without firing telemetry; wait 100 ms to be sure. + assert {:error, :not_found} = Postgres.ack(config(), Ecto.UUID.generate()) + refute_receive {:ack_failed, ^ref, _, _}, 100 + + :telemetry.detach("ack-failed-test-#{inspect(ref)}") + end + end + + describe "ack/2 not_found" do test "returns error for non-existent job" do result = Postgres.ack(config(), Ecto.UUID.generate()) assert result == {:error, :not_found} diff --git a/durable/test/durable/queue/stale_job_recovery_integration_test.exs b/durable/test/durable/queue/stale_job_recovery_integration_test.exs new file mode 100644 index 0000000..936fc9d --- /dev/null +++ b/durable/test/durable/queue/stale_job_recovery_integration_test.exs @@ -0,0 +1,172 @@ +defmodule Durable.Queue.StaleJobRecoveryIntegrationTest do + @moduledoc """ + Drives the live `Durable.Queue.StaleJobRecovery` GenServer (not the adapter + function in isolation — that path is covered by + `test/durable/queue/adapters/postgres_test.exs`).
+ + These tests exist to verify the wiring around the recovery sweep: + - `recover_now/1` triggers a synchronous run while the GenServer is alive + - Telemetry fires with the right counts + - Both stale-lock recovery AND zombie detection happen on a single tick + - Recently-locked / healthy rows are left alone + + All `async: false` — we rely on a globally-named supervised Durable instance + under the shared sandbox. + """ + + use Durable.DataCase, async: false + + @moduletag :supervised + + alias Durable.Config + alias Durable.Queue.StaleJobRecovery + alias Durable.Storage.Schemas.{PendingEvent, WorkflowExecution} + alias Durable.TelemetryHandler + + setup do + # `stale_lock_timeout: 1` (seconds) lets us age a row past the cutoff in + # ~1.1s of sleep without dragging the suite. + Durable.DataCase.start_supervised_durable!(stale_lock_timeout: 1) + TelemetryHandler.attach_events() + :ok + end + + describe "stale-lock recovery via recover_now/1" do + test "rescues a row with a stale lock and emits :stale_recovered telemetry" do + config = Config.get(Durable) + repo = config.repo + + # Stale: locked 10s ago. + stale = + insert_execution(repo, + workflow_name: "stale_one", + status: :running, + locked_by: "dead_node", + locked_at: DateTime.add(DateTime.utc_now(), -10, :second) + ) + + # Healthy: locked just now. Should be left alone. + healthy = + insert_execution(repo, + workflow_name: "healthy_one", + status: :running, + locked_by: "live_node", + locked_at: DateTime.utc_now() + ) + + assert {:ok, count} = StaleJobRecovery.recover_now(Durable) + assert count >= 1 + + assert_receive {:event, :stale_recovered, %{count: ^count}, %{durable: Durable}}, 500 + + # Stale row reset. + stale_after = repo.get!(WorkflowExecution, stale.id) + assert stale_after.status == :pending + assert stale_after.locked_by == nil + assert stale_after.locked_at == nil + + # Healthy row untouched. 
+ healthy_after = repo.get!(WorkflowExecution, healthy.id) + assert healthy_after.status == :running + assert healthy_after.locked_by == "live_node" + end + + test "no stale rows → no :stale_recovered telemetry" do + assert {:ok, 0} = StaleJobRecovery.recover_now(Durable) + refute_receive {:event, :stale_recovered, _, _}, 100 + end + end + + describe "zombie detection via recover_now/1" do + test "marks :waiting workflows with no pending events/inputs as :failed and emits :zombie_recovered" do + config = Config.get(Durable) + repo = config.repo + + # Insert a :waiting row, then backdate updated_at past the stale cutoff. + zombie = + insert_execution(repo, + workflow_name: "zombie_one", + status: :waiting, + locked_by: nil, + locked_at: nil + ) + + long_ago = DateTime.add(DateTime.utc_now(), -3600, :second) + + {1, _} = + repo.update_all( + Ecto.Query.from(w in WorkflowExecution, where: w.id == ^zombie.id), + set: [updated_at: long_ago] + ) + + # Healthy waiter: same shape but with a pending event keeping it alive. 
+ healthy_waiter = + insert_execution(repo, + workflow_name: "healthy_waiter", + status: :waiting, + locked_by: nil, + locked_at: nil + ) + + {1, _} = + repo.update_all( + Ecto.Query.from(w in WorkflowExecution, where: w.id == ^healthy_waiter.id), + set: [updated_at: long_ago] + ) + + {:ok, _} = + %PendingEvent{} + |> PendingEvent.changeset(%{ + workflow_id: healthy_waiter.id, + event_name: "alive", + step_name: "await", + status: :pending + }) + |> repo.insert() + + assert {:ok, _stale_count} = StaleJobRecovery.recover_now(Durable) + + assert_receive {:event, :zombie_recovered, %{count: zc}, %{durable: Durable}} + when zc >= 1, + 500 + + zombie_after = repo.get!(WorkflowExecution, zombie.id) + assert zombie_after.status == :failed + assert zombie_after.error["type"] == "zombie_detected" + + healthy_after = repo.get!(WorkflowExecution, healthy_waiter.id) + assert healthy_after.status == :waiting + end + end + + describe "background tick" do + test "GenServer survives an empty sweep (smoke test that the live process is healthy)" do + pid = Process.whereis(Durable.Queue.StaleJobRecovery) + assert is_pid(pid) + assert Process.alive?(pid) + + # Run a sweep, then a fence call to confirm the GenServer is still + # responsive and didn't crash. 
+ assert {:ok, 0} = StaleJobRecovery.recover_now(Durable) + assert :sys.get_state(pid).config.name == Durable + end + end + + defp insert_execution(repo, opts) do + attrs = %{ + workflow_module: "TestWorkflow", + workflow_name: Keyword.get(opts, :workflow_name, "test"), + status: Keyword.get(opts, :status, :pending), + queue: "default", + priority: 0, + input: %{}, + context: %{}, + locked_by: Keyword.get(opts, :locked_by), + locked_at: Keyword.get(opts, :locked_at) + } + + %WorkflowExecution{} + |> WorkflowExecution.changeset(attrs) + |> repo.insert!() + end +end diff --git a/durable/test/durable/queue/worker_test.exs b/durable/test/durable/queue/worker_test.exs new file mode 100644 index 0000000..4f713e6 --- /dev/null +++ b/durable/test/durable/queue/worker_test.exs @@ -0,0 +1,160 @@ +defmodule Durable.Queue.WorkerTest do + @moduledoc """ + Drives the real `Durable.Queue.Worker` GenServer end-to-end. + + Until now, the worker process was untested — `Durable.DataCase` starts + Durable with `queue_enabled: false`, so the Poller / DynamicSupervisor / + Worker chain never ran in CI. This file fills that gap by starting Durable + with `queue_enabled: true` and observing real Worker lifecycles via + telemetry, the sink workflow's test-pid messages, and DB state. + + All tests are `async: false` (mandatory — they stand up a global Durable + supervisor with the default `Durable` name and rely on shared sandbox mode). 
+ """ + + use Durable.DataCase, async: false + + @moduletag :supervised + + import Durable.DataCase, + only: [pid_to_bin: 0, with_backoff: 1, with_backoff: 2] + + alias Durable.Config + alias Durable.Queue.StaleJobRecovery + alias Durable.Storage.Schemas.WorkflowExecution + alias Durable.TelemetryHandler + alias Durable.TestWorkflows.SinkWorkflow + + setup do + Durable.DataCase.start_supervised_durable!(stale_lock_timeout: 1) + TelemetryHandler.attach_events() + :ok + end + + describe "happy path" do + test "Worker executes an OK job, emits :job_completed telemetry, and acks" do + ref = "ok-#{System.unique_integer([:positive])}" + input = %{"action" => "OK", "ref" => ref, "bin_pid" => pid_to_bin()} + + {:ok, wf_id} = Durable.start(SinkWorkflow, input) + + # The sink workflow's :run step sends {:done, ref} on success. + assert_receive {:done, ^ref}, 2_000 + + # Worker emitted job_completed telemetry with status=:completed. + assert_receive {:event, :job_completed, %{duration_ms: dur}, + %{status: :completed, job_id: ^wf_id}}, + 2_000 + + assert dur >= 0 + + # Persisted row reflects success and the lock is cleared. 
+ with_backoff(fn -> + config = Config.get(Durable) + exec = config.repo.get!(WorkflowExecution, wf_id) + assert exec.status == :completed + assert exec.locked_by == nil + assert exec.locked_at == nil + end) + end + end + + describe "step error path through the Worker" do + @tag :capture_log + test "RAISE inside the step → executor marks :failed → Worker emits status=:failed" do + ref = "raise-#{System.unique_integer([:positive])}" + input = %{"action" => "RAISE", "ref" => ref, "bin_pid" => pid_to_bin()} + + {:ok, wf_id} = Durable.start(SinkWorkflow, input) + + assert_receive {:event, :job_completed, _measure, %{status: :failed, job_id: ^wf_id}}, + 2_000 + + with_backoff(fn -> + config = Config.get(Durable) + exec = config.repo.get!(WorkflowExecution, wf_id) + assert exec.status == :failed + assert exec.error["type"] == "RuntimeError" + assert is_binary(exec.error["message"]) + assert exec.locked_by == nil + assert exec.locked_at == nil + end) + end + end + + describe "worker killed mid-step" do + @tag :capture_log + test "external kill leaves the lock, StaleJobRecovery rescues the row" do + ref = "stale-#{System.unique_integer([:positive])}" + + # Long enough sleep to give us time to find and kill the worker before + # the task finishes naturally. + input = %{ + "action" => "SLEEP", + "ref" => ref, + "bin_pid" => pid_to_bin(), + "sleep_ms" => 10_000 + } + + {:ok, wf_id} = Durable.start(SinkWorkflow, input) + + # Wait for the step to actually start. + assert_receive {:started, ^ref}, 2_000 + + worker_pid = + with_backoff([total: 50, sleep: 10], fn -> + pid = Durable.DataCase.get_worker_pid(Durable, "default", wf_id) + assert is_pid(pid), "expected to find a Worker pid for job #{wf_id}" + pid + end) + + # Capture the locked_at before we kill — used to assert it doesn't change + # after the kill (no further heartbeats land). 
+ config = Config.get(Durable) + exec_before = config.repo.get!(WorkflowExecution, wf_id) + assert exec_before.status == :running + assert exec_before.locked_by != nil + locked_at_before = exec_before.locked_at + + Process.exit(worker_pid, :kill) + + # The worker death never sends {:done, _}. + refute_receive {:done, ^ref}, 200 + + # Row remains :running with the same (or older) locked_at — recovery + # hasn't run yet. + exec_during = config.repo.get!(WorkflowExecution, wf_id) + assert exec_during.status == :running + assert DateTime.compare(exec_during.locked_at, locked_at_before) != :gt + + # Wait long enough for the lock to be considered stale (we set + # stale_lock_timeout: 1 in the setup, in seconds). Plus margin so the + # row's last-heartbeat `locked_at` is comfortably past the cutoff + # before recover_now runs. + Process.sleep(2_100) + + assert {:ok, count} = StaleJobRecovery.recover_now(Durable) + assert count >= 1 + + # Stale recovery telemetry fired. + assert_receive {:event, :stale_recovered, %{count: c}, _meta} when c >= 1, 500 + + # Row back to :pending, lock cleared. Use a generous backoff window — + # if the previous recover_now didn't reach our row (e.g. a prior test + # left a stale row that won the race), the test should re-trigger + # rather than flake. + with_backoff([total: 200, sleep: 25], fn -> + case config.repo.get!(WorkflowExecution, wf_id) do + %{status: :pending} = exec -> + assert exec.locked_by == nil + assert exec.locked_at == nil + + other -> + # Re-trigger recovery in case our row didn't make the first sweep. 
+ _ = StaleJobRecovery.recover_now(Durable) + flunk("expected :pending, got #{inspect(other.status)}") + end + end) + end + end +end diff --git a/test/durable/resume_edge_cases_test.exs b/durable/test/durable/resume_edge_cases_test.exs similarity index 100% rename from test/durable/resume_edge_cases_test.exs rename to durable/test/durable/resume_edge_cases_test.exs diff --git a/durable/test/durable/retry_context_test.exs b/durable/test/durable/retry_context_test.exs new file mode 100644 index 0000000..034aec9 --- /dev/null +++ b/durable/test/durable/retry_context_test.exs @@ -0,0 +1,72 @@ +defmodule Durable.RetryContextTest do + @moduledoc """ + Regression coverage for PR 3 / H-5 — `put_context/2` writes made during a + failed retry attempt must remain visible to subsequent attempts. Before + the fix, `Process.put(:durable_context, data)` at the top of every retry + attempt overwrote prior writes. + """ + + use Durable.DataCase, async: false + + alias Durable.Config + alias Durable.Executor + alias Durable.Storage.Schemas.WorkflowExecution + + test "put_context writes survive across retry attempts" do + config = Config.get(Durable) + repo = config.repo + + {:ok, workflow_def} = RetryAccumulator.__default_workflow__() + + {:ok, exec} = + %WorkflowExecution{} + |> WorkflowExecution.changeset(%{ + workflow_module: Atom.to_string(RetryAccumulator), + workflow_name: workflow_def.name, + status: :pending, + queue: "default", + priority: 0, + input: %{}, + context: %{} + }) + |> repo.insert() + + Executor.execute_workflow(exec.id, config) + + reloaded = repo.get!(WorkflowExecution, exec.id) + assert reloaded.status == :completed + + # The step succeeds on attempt #3 only; if put_context was preserved + # across attempts, :seen_attempts will be 3. 
+ assert reloaded.context["seen_attempts"] == 3 + end +end + +defmodule RetryAccumulator do + use Durable + use Durable.Helpers + use Durable.Context + + workflow "retry_accumulator" do + step( + :flaky, + [retry: [max_attempts: 3, backoff: :linear]], + fn _data -> + # Increment a counter via put_context — this should accumulate + # across retries. Before H-5's fix, it always read nil and reset + # to 1 on every attempt. + prior = get_context(:seen_attempts) || 0 + current = prior + 1 + put_context(:seen_attempts, current) + + if current < 3 do + {:error, %{type: "transient", message: "retry me"}} + else + # Don't bother returning :seen_attempts in the map — the C-1 fix + # ensures put_context writes persist regardless. + {:ok, %{done: true}} + end + end + ) + end +end diff --git a/durable/test/durable/sanitization_boundaries_test.exs b/durable/test/durable/sanitization_boundaries_test.exs new file mode 100644 index 0000000..8e9d153 --- /dev/null +++ b/durable/test/durable/sanitization_boundaries_test.exs @@ -0,0 +1,277 @@ +defmodule Durable.SanitizationBoundariesTest do + @moduledoc """ + Regression coverage for PR 2 — the "sanitization sweep" at every user-data + write boundary. Each test pushes a pathological payload (tuples, PIDs, + refs, functions, deeply-nested mixes) through one user-facing API and + verifies the workflow / step / pending-input / pending-event row was + persisted without a `Protocol.UndefinedError`. + + Boundaries covered in PR 2: + H-1 Durable.Wait.provide_input/4 → PendingInput.response + H-2 Durable.Wait.send_event/4 → PendingEvent.payload + H-3 StepRunner.fail_step_execution → StepExecution.error + H-4 StepRunner.serialize_output → StepExecution.output + H-7 Executor.resume_workflow/3 → WorkflowExecution.context + + See docs/bug-reports/2026-04-13-follow-up-audit.md for context. 
+ """ + + use Durable.DataCase, async: false + + alias Durable.Config + alias Durable.Executor + alias Durable.Storage.Schemas.{PendingEvent, PendingInput, StepExecution, WorkflowExecution} + alias Durable.Wait + + import Ecto.Query + + defp config, do: Config.get(Durable) + defp repo, do: config().repo + + defp pathological_payload do + %{ + "string_key" => {:tuple, "value"}, + tagged: {:error, :timeout}, + nested: [{:ok, %{ref: make_ref()}}, {:error, self()}], + func: &Enum.map/2, + pid: self(), + tuple_in_list: [{:a, 1}, {:b, 2}] + } + end + + # -------------------------------------------------------------------------- + + describe "H-1 — Wait.provide_input/4 sanitizes user-supplied data" do + test "accepts pathological data and stores a JSON-encodable response" do + {:ok, execution} = start_workflow_waiting_for_input("manager_approval") + + assert :ok = Wait.provide_input(execution.id, "manager_approval", pathological_payload()) + + pending = get_pending_input(execution.id, "manager_approval") + assert pending.status == :completed + # Round-trip via Jason — the assertion that matters: no encode crash. + assert {:ok, _} = Jason.encode(pending.response) + # Spot-check: tuples became lists. 
+ assert pending.response["tagged"] == ["error", "timeout"] + assert is_binary(pending.response["pid"]) + end + end + + # -------------------------------------------------------------------------- + + describe "H-2 — Wait.send_event/4 sanitizes user-supplied payload" do + test "accepts pathological payload and stores a JSON-encodable event" do + {:ok, execution} = start_workflow_waiting_for_event("payment_confirmed") + + assert :ok = Wait.send_event(execution.id, "payment_confirmed", pathological_payload()) + + pending = get_pending_event(execution.id, "payment_confirmed") + assert pending.status == :received + assert {:ok, _} = Jason.encode(pending.payload) + assert pending.payload["tagged"] == ["error", "timeout"] + end + end + + # -------------------------------------------------------------------------- + + describe "H-3 — StepRunner.fail_step_execution sanitizes errors" do + test "step that fails with a tuple-bearing error map persists cleanly" do + {:ok, execution} = start_workflow_module(FailingStepWorkflow, %{}) + + # The workflow should reach :failed and the step's error must be JSON-safe. 
+ reloaded = repo().get!(WorkflowExecution, execution.id) + assert reloaded.status == :failed + + step = repo().one(from(s in StepExecution, where: s.workflow_id == ^execution.id)) + assert step.status == :failed + assert {:ok, _} = Jason.encode(step.error) + # Tuples flattened + assert step.error["details"] == ["error", "boom"] + end + end + + # -------------------------------------------------------------------------- + + describe "H-4 — StepRunner.serialize_output sanitizes step outputs" do + test "step that returns a map containing nested tuples persists cleanly" do + {:ok, execution} = start_workflow_module(NestedTupleOutputWorkflow, %{}) + + reloaded = repo().get!(WorkflowExecution, execution.id) + assert reloaded.status == :completed + + step = + repo().one( + from(s in StepExecution, + where: s.workflow_id == ^execution.id and s.step_name == "produce_tuples" + ) + ) + + assert {:ok, _} = Jason.encode(step.output) + # The shallow old serializer left nested tuples — verify recursion now flattens them. 
+ assert step.output["nested"]["tag"] == ["ok", "deep"] + end + end + + # -------------------------------------------------------------------------- + + describe "H-7 — Executor.resume_workflow/3 sanitizes additional_context" do + test "resume with additional_context containing tuples doesn't crash" do + {:ok, execution} = start_workflow_waiting_for_event("any_event") + + bad_ctx = %{"resumed_with" => {:ok, %{ref: make_ref()}}} + assert {:ok, _} = Executor.resume_workflow(execution.id, bad_ctx) + + reloaded = repo().get!(WorkflowExecution, execution.id) + assert {:ok, _} = Jason.encode(reloaded.context) + assert is_list(reloaded.context["resumed_with"]) + end + end + + # ========================================================================== + # Helpers + # ========================================================================== + + defp start_workflow_waiting_for_input(input_name) do + workflow_module = + case input_name do + "manager_approval" -> InputWaitFixtureWorkflow + _ -> raise "unknown input fixture: #{input_name}" + end + + {:ok, execution} = start_workflow_module(workflow_module, %{}) + reloaded = repo().get!(WorkflowExecution, execution.id) + assert reloaded.status == :waiting + {:ok, reloaded} + end + + defp start_workflow_waiting_for_event(event_name) do + workflow_module = + case event_name do + "payment_confirmed" -> EventWaitFixtureWorkflow + "any_event" -> AnyEventWaitFixtureWorkflow + _ -> raise "unknown event fixture: #{event_name}" + end + + {:ok, execution} = start_workflow_module(workflow_module, %{}) + reloaded = repo().get!(WorkflowExecution, execution.id) + assert reloaded.status == :waiting + {:ok, reloaded} + end + + defp start_workflow_module(module, input) do + {:ok, workflow_def} = module.__default_workflow__() + + attrs = %{ + workflow_module: Atom.to_string(module), + workflow_name: workflow_def.name, + status: :pending, + queue: "default", + priority: 0, + input: input, + context: %{} + } + + {:ok, execution} = + 
%WorkflowExecution{} + |> WorkflowExecution.changeset(attrs) + |> repo().insert() + + Executor.execute_workflow(execution.id, config()) + {:ok, repo().get!(WorkflowExecution, execution.id)} + end + + defp get_pending_input(workflow_id, name) do + repo().one( + from(p in PendingInput, + where: p.workflow_id == ^workflow_id and p.input_name == ^name + ) + ) + end + + defp get_pending_event(workflow_id, name) do + repo().one( + from(p in PendingEvent, + where: p.workflow_id == ^workflow_id and p.event_name == ^name + ) + ) + end +end + +# ============================================================================ +# Fixtures +# ============================================================================ + +defmodule InputWaitFixtureWorkflow do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "input_wait_fixture" do + step(:await, fn data -> + result = wait_for_approval("manager_approval", prompt: "ok?") + {:ok, assign(data, :result, result)} + end) + end +end + +defmodule EventWaitFixtureWorkflow do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "event_wait_fixture" do + step(:await, fn data -> + result = wait_for_event("payment_confirmed", timeout: hours(1)) + {:ok, assign(data, :result, result)} + end) + end +end + +defmodule AnyEventWaitFixtureWorkflow do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "any_event_wait_fixture" do + step(:await, fn data -> + result = wait_for_event("any_event", timeout: hours(1)) + {:ok, assign(data, :result, result)} + end) + end +end + +defmodule FailingStepWorkflow do + use Durable + use Durable.Helpers + + workflow "failing_step" do + step(:boom, fn _data -> + # User returns a tuple-bearing error map — H-3 says this must persist. 
+ {:error, + %{ + type: "test_failure", + message: "intentional", + details: {:error, :boom}, + worker: self(), + ref: make_ref() + }} + end) + end +end + +defmodule NestedTupleOutputWorkflow do + use Durable + use Durable.Helpers + + workflow "nested_tuple_output" do + step(:produce_tuples, fn _data -> + # User returns a map whose nested values include raw tuples. The old + # shallow serialize_output left these untouched — JSONB would crash. + {:ok, + %{ + status: "ok", + nested: %{tag: {:ok, "deep"}, list: [{:a, 1}, {:b, 2}]} + }} + end) + end +end diff --git a/durable/test/durable/scheduler_resilience_test.exs b/durable/test/durable/scheduler_resilience_test.exs new file mode 100644 index 0000000..3d057d7 --- /dev/null +++ b/durable/test/durable/scheduler_resilience_test.exs @@ -0,0 +1,99 @@ +defmodule Durable.SchedulerResilienceTest do + @moduledoc """ + Regression coverage for PR 7 / L-1 — the scheduler must surface persistent + failures in `ScheduledWorkflow.failure_changeset/3` and auto-disable the + schedule after N consecutive failures, rather than looping hot on the + same unresolvable schedule every poll cycle. 
+ """ + use Durable.DataCase, async: false + + alias Durable.Config + alias Durable.Repo + alias Durable.Storage.Schemas.ScheduledWorkflow + + defp config, do: Config.get(Durable) + + defp insert_schedule(attrs \\ %{}) do + defaults = %{ + name: "test_resilience_#{System.unique_integer([:positive])}", + workflow_module: "Elixir.NonExistentModuleXYZ", + workflow_name: "never_runs", + cron_expression: "*/5 * * * *", + enabled: true + } + + attrs = Map.merge(defaults, attrs) + + {:ok, s} = + %ScheduledWorkflow{} + |> ScheduledWorkflow.changeset(attrs) + |> Repo.insert(config()) + + s + end + + test "failure_changeset/3 records error + increments counter" do + schedule = insert_schedule() + + {:ok, updated} = + schedule + |> ScheduledWorkflow.failure_changeset("module not found") + |> Repo.update(config()) + + assert updated.consecutive_failures == 1 + assert updated.last_error =~ "module not found" + assert updated.last_error_at != nil + # Not auto-disabled yet + assert updated.enabled == true + assert is_nil(updated.auto_disabled_at) + end + + test "failure_changeset auto-disables after N consecutive failures" do + schedule = insert_schedule(%{consecutive_failures: 4}) + + {:ok, updated} = + schedule + |> ScheduledWorkflow.failure_changeset("still broken", auto_disable_after: 5) + |> Repo.update(config()) + + assert updated.consecutive_failures == 5 + assert updated.enabled == false + assert updated.auto_disabled_at != nil + end + + test "success_changeset/3 resets the failure counter and clears last_error" do + schedule = + insert_schedule(%{ + consecutive_failures: 3, + last_error: "previous failure", + last_error_at: DateTime.utc_now() + }) + + now = DateTime.utc_now() + next = DateTime.add(now, 300, :second) + + {:ok, updated} = + schedule + |> ScheduledWorkflow.success_changeset(now, next) + |> Repo.update(config()) + + assert updated.consecutive_failures == 0 + assert is_nil(updated.last_error) + assert is_nil(updated.last_error_at) + assert updated.last_run_at 
== now + assert updated.next_run_at == next + end + + test "failure_changeset advances next_run_at when supplied" do + schedule = insert_schedule() + future = DateTime.utc_now() |> DateTime.add(600, :second) |> DateTime.truncate(:microsecond) + + {:ok, updated} = + schedule + |> ScheduledWorkflow.failure_changeset("boom", next_run_at: future) + |> Repo.update(config()) + + # Defer the next fire so we don't loop hot on the same broken schedule + assert updated.next_run_at == future + end +end diff --git a/test/durable/scheduler_test.exs b/durable/test/durable/scheduler_test.exs similarity index 100% rename from test/durable/scheduler_test.exs rename to durable/test/durable/scheduler_test.exs diff --git a/test/durable/validation_test.exs b/durable/test/durable/validation_test.exs similarity index 100% rename from test/durable/validation_test.exs rename to durable/test/durable/validation_test.exs diff --git a/durable/test/durable/wait/timeout_worker_integration_test.exs b/durable/test/durable/wait/timeout_worker_integration_test.exs new file mode 100644 index 0000000..57fe84a --- /dev/null +++ b/durable/test/durable/wait/timeout_worker_integration_test.exs @@ -0,0 +1,105 @@ +defmodule Durable.Wait.TimeoutWorkerIntegrationTest do + @moduledoc """ + End-to-end coverage for `Durable.Wait.TimeoutWorker` driving a real workflow + through the timeout → resume path. Closes the TODO at + `test/durable/wait_test.exs:568` ("would require queue_enabled: true"). + + The TimeoutWorker only exposes `check_timeouts/1` as a `cast`. We use + `:sys.get_state/1` as a synchronization fence: the OTP message queue + serializes the trailing `call` behind any pending `cast`, so when + `:sys.get_state` returns, the cast has been fully processed. 
+ """ + + use Durable.DataCase, async: false + + @moduletag :supervised + + import Durable.DataCase, only: [pid_to_bin: 0, with_backoff: 1, with_backoff: 2] + + alias Durable.Config + alias Durable.Storage.Schemas.{PendingEvent, WorkflowExecution} + alias Durable.TestWorkflows.SinkWorkflow + alias Durable.Wait.TimeoutWorker + + setup do + Durable.DataCase.start_supervised_durable!() + :ok + end + + describe "PendingEvent timeout → resume" do + test "workflow waiting on an event past its timeout_at resumes with the timeout_value" do + ref = "tmo-#{System.unique_integer([:positive])}" + + input = %{ + "action" => "WAIT_EVENT", + "ref" => ref, + "bin_pid" => pid_to_bin(), + "event_name" => "evt_#{ref}", + "timeout_ms" => 100, + "timeout_value" => %{"timed_out" => true, "ref" => ref} + } + + {:ok, wf_id} = Durable.start(SinkWorkflow, input) + + # Step started — workflow is in :waiting. + assert_receive {:started, ^ref}, 2_000 + + config = Config.get(Durable) + + # Confirm pending event exists with the right shape. The {:started, ref} + # arrives BEFORE wait_for_event throws and the pending row is committed, + # so poll briefly for the row to appear. + pending = + with_backoff([total: 100, sleep: 10], fn -> + row = repo_pending_event(config, wf_id, "evt_#{ref}") + assert row != nil + assert row.status == :pending + assert row.timeout_at != nil + row + end) + + _ = pending + + # Wait long enough for timeout_at to elapse. + Process.sleep(200) + + # Trigger the timeout sweep and fence on the GenServer. + TimeoutWorker.check_timeouts(Durable) + _ = :sys.get_state(TimeoutWorker.worker_name(Durable)) + + # PendingEvent transitioned to :timeout. The workflow row is :pending or + # has already been re-claimed by the poller (status :running) — both + # are valid intermediate states. 
+ pending_after = repo_pending_event(config, wf_id, "evt_#{ref}") + assert pending_after.status == :timeout + + # On resume, the step body re-enters and sends {:started, ref} again, + # then wait_for_event returns the timeout_value and {:done, ref} fires. + assert_receive {:started, ^ref}, 3_000 + assert_receive {:done, ^ref}, 3_000 + + # Workflow completes with the timeout_value visible. + with_backoff(fn -> + exec = config.repo.get!(WorkflowExecution, wf_id) + assert exec.status == :completed + + # The step echoed the timeout payload back into context. Keys round + # trip through JSONB as strings. + assert get_in(exec.context, ["payload"]) == %{ + "timed_out" => true, + "ref" => ref + } + end) + end + end + + defp repo_pending_event(%Config{repo: repo}, workflow_id, event_name) do + import Ecto.Query + + repo.one( + from(p in PendingEvent, + where: p.workflow_id == ^workflow_id and p.event_name == ^event_name + ) + ) + end +end diff --git a/test/durable/wait_test.exs b/durable/test/durable/wait_test.exs similarity index 89% rename from test/durable/wait_test.exs rename to durable/test/durable/wait_test.exs index 274fdb3..60bfb92 100644 --- a/test/durable/wait_test.exs +++ b/durable/test/durable/wait_test.exs @@ -137,6 +137,35 @@ defmodule Durable.WaitTest do end end + describe "wait_for_event/2 timeout_value sanitization (Bug C-2 regression)" do + test "accepts a tagged tuple as timeout_value without crashing JSONB insert" do + # Before the fix, `timeout_value: {:error, :timeout}` crashed at PendingEvent + # insertion because Postgrex/Jason can't encode raw tuples. 
+ {:ok, execution} = create_and_execute_workflow(TupleTimeoutValueWorkflow, %{}) + + assert execution.status == :waiting + + pending = get_pending_event(Durable.Config.get(Durable).repo, execution.id, "evt") + assert pending != nil + # Sanitized: tuple becomes a list + assert pending.timeout_value == %{"__value__" => ["error", "timeout"]} + end + + test "accepts a deeply nested map with tuples as timeout_value" do + {:ok, execution} = create_and_execute_workflow(NestedTupleTimeoutValueWorkflow, %{}) + + assert execution.status == :waiting + + pending = get_pending_event(Durable.Config.get(Durable).repo, execution.id, "evt") + assert pending != nil + # The map path also goes through sanitize_for_json now. + timeout_value = pending.timeout_value + assert is_map(timeout_value) + # Nested tuple flattened to a list inside the map + assert get_in(timeout_value, ["nested", "tag"]) == ["err", "boom"] + end + end + describe "wait_for_any/2" do test "creates WaitGroup with wait_type :any" do config = Config.get(Durable) @@ -427,6 +456,39 @@ defmodule Durable.WaitTest do result = Wait.provide_input(fake_uuid, "unknown", %{}) assert result == {:error, :not_found} end + + test "accepts a plain string response (single_choice / text / approval)" do + # Regression: dashboard's SingleChoiceForm submits just the raw choice + # string (e.g. "morning"). PendingInput.response is typed :map, so + # without the wrap the cast silently fails and the workflow never + # resumes. This test locks in that non-map responses are accepted. + config = Config.get(Durable) + repo = config.repo + + {:ok, execution} = create_and_execute_workflow(InputWaitTestWorkflow, %{}) + + assert :ok = Wait.provide_input(execution.id, "manager_approval", "approved") + + pending = get_pending_input(repo, execution.id, "manager_approval") + assert pending.status == :completed + assert pending.response == %{"value" => "approved"} + end + + test "accepts an atom response (e.g. 
:approved)" do + config = Config.get(Durable) + repo = config.repo + + {:ok, execution} = create_and_execute_workflow(InputWaitTestWorkflow, %{}) + + assert :ok = Wait.provide_input(execution.id, "manager_approval", :approved) + + pending = get_pending_input(repo, execution.id, "manager_approval") + assert pending.status == :completed + # sanitize_for_json leaves atoms alone; complete_pending_input wraps + # them under "value" because the column type is :map. JSONB then + # stores the atom as its string form. + assert pending.response == %{"value" => "approved"} + end end describe "send_event/4" do @@ -443,6 +505,21 @@ defmodule Durable.WaitTest do assert pending.payload == %{"amount" => 50} end + test "accepts a plain string payload" do + # Same wrap pattern as provide_input/4: send_event("foo", "bar") + # should succeed and store %{"value" => "bar"} in payload. + config = Config.get(Durable) + repo = config.repo + + {:ok, execution} = create_and_execute_workflow(EventWaitTestWorkflow, %{}) + + assert :ok = Wait.send_event(execution.id, "payment_confirmed", "done") + + pending = get_pending_event(repo, execution.id, "payment_confirmed") + assert pending.status == :received + assert pending.payload == %{"value" => "done"} + end + test "updates WaitGroup for grouped events" do config = Config.get(Durable) repo = config.repo @@ -1237,3 +1314,39 @@ defmodule UnicodeEventWorkflow do end) end end + +# ============================================================================ +# Bug C-2 regression fixtures — timeout_value with raw tuples / nested terms +# ============================================================================ + +defmodule TupleTimeoutValueWorkflow do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "tuple_timeout" do + step(:wait_step, fn data -> + # Idiomatic Elixir tuple — would crash JSONB insert before fix. 
+ result = wait_for_event("evt", timeout: hours(1), timeout_value: {:error, :timeout}) + {:ok, assign(data, :result, result)} + end) + end +end + +defmodule NestedTupleTimeoutValueWorkflow do + use Durable + use Durable.Helpers + use Durable.Wait + + workflow "nested_tuple_timeout" do + step(:wait_step, fn data -> + result = + wait_for_event("evt", + timeout: hours(1), + timeout_value: %{"nested" => %{"tag" => {:err, :boom}}} + ) + + {:ok, assign(data, :result, result)} + end) + end +end diff --git a/test/durable_test.exs b/durable/test/durable_test.exs similarity index 100% rename from test/durable_test.exs rename to durable/test/durable_test.exs diff --git a/test/mix/tasks/durable_cancel_test.exs b/durable/test/mix/tasks/durable_cancel_test.exs similarity index 100% rename from test/mix/tasks/durable_cancel_test.exs rename to durable/test/mix/tasks/durable_cancel_test.exs diff --git a/test/mix/tasks/durable_cleanup_test.exs b/durable/test/mix/tasks/durable_cleanup_test.exs similarity index 100% rename from test/mix/tasks/durable_cleanup_test.exs rename to durable/test/mix/tasks/durable_cleanup_test.exs diff --git a/durable/test/mix/tasks/durable_gen_upgrade_test.exs b/durable/test/mix/tasks/durable_gen_upgrade_test.exs new file mode 100644 index 0000000..eeaba01 --- /dev/null +++ b/durable/test/mix/tasks/durable_gen_upgrade_test.exs @@ -0,0 +1,95 @@ +defmodule Mix.Tasks.Durable.GenUpgradeTest do + use ExUnit.Case, async: false + + alias Durable.Migration + alias Mix.Tasks.Durable.Gen.Upgrade, as: GenUpgradeTask + + setup do + Mix.shell(Mix.Shell.Process) + on_exit(fn -> Mix.shell(Mix.Shell.IO) end) + end + + @tag :tmp_dir + test "generates a Durable upgrade migration for the current version", %{tmp_dir: tmp_dir} do + target = Migration.current_version() + previous = Migration.previous_version(target) + + assert [file] = + GenUpgradeTask.run([ + "-r", + "Durable.TestRepo", + "--migrations-path", + tmp_dir + ]) + + assert Path.basename(file) =~ 
"_upgrade_durable_to_v#{target}.exs" + + content = File.read!(file) + assert content =~ "defmodule Durable.TestRepo.Migrations.UpgradeDurableToV#{target}" + assert content =~ "Durable.Migration.up(to: #{target}, prefix: \"durable\")" + assert content =~ "Durable.Migration.down(to: #{previous}, prefix: \"durable\")" + end + + @tag :tmp_dir + test "supports custom prefix and target version", %{tmp_dir: tmp_dir} do + target = hd(Migration.all_versions()) + + assert [file] = + GenUpgradeTask.run([ + "-r", + "Durable.TestRepo", + "--migrations-path", + tmp_dir, + "--prefix", + "private", + "--to", + Integer.to_string(target) + ]) + + content = File.read!(file) + assert content =~ "Durable.Migration.up(to: #{target}, prefix: \"private\")" + assert content =~ "Durable.Migration.down(to: 0, prefix: \"private\")" + end + + @tag :tmp_dir + test "refuses to generate a duplicate upgrade migration", %{tmp_dir: tmp_dir} do + target = Migration.current_version() + base_name = "upgrade_durable_to_v#{target}" + File.write!(Path.join(tmp_dir, "20250101000000_#{base_name}.exs"), "") + + assert_raise Mix.Error, ~r/already a migration file/, fn -> + GenUpgradeTask.run([ + "-r", + "Durable.TestRepo", + "--migrations-path", + tmp_dir + ]) + end + end + + @tag :tmp_dir + test "raises for unknown target versions", %{tmp_dir: tmp_dir} do + assert_raise Mix.Error, ~r/Unknown Durable migration version/, fn -> + GenUpgradeTask.run([ + "-r", + "Durable.TestRepo", + "--migrations-path", + tmp_dir, + "--to", + "99999999999999" + ]) + end + end + + @tag :tmp_dir + test "raises for an invalid repo", %{tmp_dir: tmp_dir} do + assert_raise Mix.Error, ~r/Could not load Missing.Repo/, fn -> + GenUpgradeTask.run([ + "-r", + "Missing.Repo", + "--migrations-path", + tmp_dir + ]) + end + end +end diff --git a/test/mix/tasks/durable_list_test.exs b/durable/test/mix/tasks/durable_list_test.exs similarity index 100% rename from test/mix/tasks/durable_list_test.exs rename to 
durable/test/mix/tasks/durable_list_test.exs diff --git a/durable/test/mix/tasks/durable_migrations_test.exs b/durable/test/mix/tasks/durable_migrations_test.exs new file mode 100644 index 0000000..429ae1a --- /dev/null +++ b/durable/test/mix/tasks/durable_migrations_test.exs @@ -0,0 +1,93 @@ +defmodule Mix.Tasks.Durable.MigrationsTest do + use Durable.DataCase, async: false + + alias Durable.Migration + alias Durable.Migration.SchemaMigration + alias Mix.Tasks.Durable.Migrations, as: MigrationsTask + + setup do + Mix.shell(Mix.Shell.Process) + on_exit(fn -> Mix.shell(Mix.Shell.IO) end) + end + + test "reports the default schema as migrated" do + assert :ok = MigrationsTask.run(["-r", "Durable.TestRepo"]) + + output = collect_all_output() + assert output =~ "Repo: Durable.TestRepo" + assert output =~ "Current Durable version: #{Migration.current_version()}" + assert output =~ "Migrated database version: #{Migration.current_version()}" + assert output =~ "Pending versions: none" + assert output =~ "Status: up" + end + + test "emits JSON status" do + assert :ok = MigrationsTask.run(["-r", "Durable.TestRepo", "--json"]) + + output = collect_all_output() + decoded = Jason.decode!(output) + + assert decoded["repo"] == "Durable.TestRepo" + assert decoded["prefix"] == "durable" + assert decoded["current_version"] == Migration.current_version() + assert decoded["migrated_version"] == Migration.current_version() + assert decoded["pending_versions"] == [] + assert decoded["status"] == "up" + end + + test "--check passes when no Durable migrations are pending" do + assert :ok = MigrationsTask.run(["-r", "Durable.TestRepo", "--check"]) + end + + test "--check raises when Durable migrations are pending" do + prefix = unique_prefix("missing") + + assert_raise Mix.Error, ~r/Durable migrations are pending/, fn -> + MigrationsTask.run(["-r", "Durable.TestRepo", "--prefix", prefix, "--check"]) + end + end + + test "reports partially migrated prefixes" do + prefix = 
unique_prefix("partial") + [applied | pending] = Migration.all_versions() + + create_schema(prefix) + SchemaMigration.ensure_table!(Durable.TestRepo, prefix) + insert_schema_version(prefix, applied) + + assert :ok = MigrationsTask.run(["-r", "Durable.TestRepo", "--prefix", prefix]) + + output = collect_all_output() + assert output =~ "Migrated database version: #{applied}" + assert output =~ "Pending versions: #{Enum.join(pending, ", ")}" + assert output =~ "Status: pending" + end + + defp unique_prefix(label) do + "durable_#{label}_#{System.unique_integer([:positive])}" + end + + defp create_schema(prefix) do + Durable.TestRepo.query!("CREATE SCHEMA IF NOT EXISTS #{prefix}", []) + end + + defp insert_schema_version(prefix, version) do + Durable.TestRepo.query!( + "INSERT INTO #{prefix}.durable_schema_migrations (version, inserted_at) VALUES ($1, $2)", + [version, DateTime.utc_now()] + ) + end + + defp collect_all_output do + collect_all_output("") + end + + defp collect_all_output(acc) do + receive do + {:mix_shell, :info, [line]} -> collect_all_output(acc <> "\n" <> line) + {:mix_shell, :error, [line]} -> collect_all_output(acc <> "\n" <> line) + after + 100 -> String.trim(acc) + end + end +end diff --git a/test/mix/tasks/durable_run_test.exs b/durable/test/mix/tasks/durable_run_test.exs similarity index 100% rename from test/mix/tasks/durable_run_test.exs rename to durable/test/mix/tasks/durable_run_test.exs diff --git a/test/mix/tasks/durable_status_test.exs b/durable/test/mix/tasks/durable_status_test.exs similarity index 100% rename from test/mix/tasks/durable_status_test.exs rename to durable/test/mix/tasks/durable_status_test.exs diff --git a/durable/test/support/data_case.ex b/durable/test/support/data_case.ex new file mode 100644 index 0000000..01aa364 --- /dev/null +++ b/durable/test/support/data_case.ex @@ -0,0 +1,355 @@ +defmodule Durable.DataCase do + @moduledoc """ + This module defines the setup for tests requiring access to the + application's data 
layer. + + You may define functions here to be used as helpers in your tests. + + ## Supervised mode + + The default setup starts Durable with `queue_enabled: false`. Tests that + need the supervised runtime (queue pollers, stale-job recovery, timeout + worker, scheduler) should opt out via `@moduletag :supervised` and call + `start_supervised_durable!/1` themselves. + """ + + use ExUnit.CaseTemplate + + import Ecto.Query + + alias Durable.Config + alias Durable.Executor + + alias Durable.Storage.Schemas.{ + PendingEvent, + PendingInput, + StepExecution, + WaitGroup, + WorkflowExecution + } + + alias Ecto.Adapters.SQL.Sandbox + + using do + quote do + alias Durable.TestRepo + import Ecto + import Ecto.Changeset + import Ecto.Query + + # Only auto-import helpers that were originally shared OR that don't + # collide with local `defp` definitions in existing test files. Tests + # that want the promoted helpers (create_and_execute_workflow, + # get_step_executions, get_pending_event, get_pending_input, + # get_wait_group, get_child_executions, execute_children, + # get_worker_pid) should `import Durable.DataCase, only: [...]` + # explicitly or call them via the fully-qualified module name. + import Durable.DataCase, + only: [ + assert_eventually: 1, + assert_eventually: 2, + assert_eventually: 3, + with_backoff: 1, + with_backoff: 2, + setup_sandbox: 1, + start_supervised_durable!: 0, + start_supervised_durable!: 1, + pid_to_bin: 0, + pid_to_bin: 1, + bin_to_pid: 1 + ] + end + end + + setup tags do + Durable.DataCase.setup_sandbox(tags) + + unless tags[:supervised] do + start_supervised!({Durable, repo: Durable.TestRepo, queue_enabled: false, pubsub: :start}) + end + + :ok + end + + @doc """ + Sets up the sandbox based on the test tags. + """ + def setup_sandbox(tags) do + pid = Sandbox.start_owner!(Durable.TestRepo, shared: not tags[:async]) + on_exit(fn -> Sandbox.stop_owner(pid) end) + end + + @doc """ + A helper that polls until a condition is met or timeout. 
+ + ## Examples + + assert_eventually(fn -> + {:ok, exec} = Durable.get_execution(id) + exec.status == :completed + end) + """ + def assert_eventually(fun, timeout \\ 5000, interval \\ 100) do + deadline = System.monotonic_time(:millisecond) + timeout + do_assert_eventually(fun, deadline, interval) + end + + defp do_assert_eventually(fun, deadline, interval) do + if fun.() do + true + else + if System.monotonic_time(:millisecond) < deadline do + Process.sleep(interval) + do_assert_eventually(fun, deadline, interval) + else + ExUnit.Assertions.flunk("Condition not met within timeout") + end + end + end + + @doc """ + Re-runs an assertion block until it passes or the deadline elapses. + + Unlike `assert_eventually/3`, this preserves the original `ExUnit.AssertionError` + so the failure message points at the real mismatch. Use for DB/telemetry + assertions where the readable failure matters. + + ## Options + + - `:total` - Total attempts before giving up (default: 100) + - `:sleep` - Sleep between attempts in ms (default: 10) + """ + def with_backoff(opts \\ [], fun) do + total = Keyword.get(opts, :total, 100) + sleep = Keyword.get(opts, :sleep, 10) + + do_with_backoff(fun, 0, total, sleep) + end + + defp do_with_backoff(fun, count, total, sleep) do + fun.() + rescue + exception in [ExUnit.AssertionError] -> + if count < total do + Process.sleep(sleep) + do_with_backoff(fun, count + 1, total, sleep) + else + reraise(exception, __STACKTRACE__) + end + end + + @doc """ + Starts Durable under the test supervision tree with queue processing enabled + by default. Returns the instance name. + + Intended for tests with `@moduletag :supervised`. Defaults: + - `:name` - `Durable` (tests are `async: false` under shared sandbox, so + only one instance runs at a time — reusing the default name keeps + `Durable.start/3` / `Executor.start_workflow/3` etc. 
callable without + passing `durable: name` everywhere) + - `:repo` - `Durable.TestRepo` + - `:queue_enabled` - `true` + - `:pubsub` - `:start` + - `:queues` - `%{default: [concurrency: 1, poll_interval: 50]}` + - `:stale_lock_timeout` - `300` (seconds) + - `:heartbeat_interval` - `100` (ms) + + Pass any of these to override. Other `Durable` options (`:scheduler_interval`, + etc.) pass through untouched. + """ + def start_supervised_durable!(opts \\ []) do + opts = + opts + |> Keyword.put_new(:name, Durable) + |> Keyword.put_new(:repo, Durable.TestRepo) + |> Keyword.put_new(:queue_enabled, true) + |> Keyword.put_new(:pubsub, :start) + |> Keyword.put_new(:queues, %{default: [concurrency: 1, poll_interval: 50]}) + |> Keyword.put_new(:stale_lock_timeout, 300) + |> Keyword.put_new(:heartbeat_interval, 100) + + name = Keyword.fetch!(opts, :name) + + # Mix-task tests (Mix.Tasks.Durable.List etc.) set + # `:durable, :disable_queue_processing, true` via + # `Durable.Mix.Helpers.ensure_started_readonly/0` and never reset it. + # That flag survives in app env and silently forces queue_enabled to + # false on every subsequent supervisor start. Reset it here so + # supervised tests get the queue they asked for. Restore on exit so + # we don't accidentally enable queues for later mix-task tests. + prior = Application.get_env(:durable, :disable_queue_processing) + Application.put_env(:durable, :disable_queue_processing, false) + ExUnit.Callbacks.on_exit(fn -> restore_disable_flag(prior) end) + + # Ensure a clean start. A previous test may have left the named supervisor + # alive (start_supervised cleanup terminates the test child but if the + # previous test had its own setup blocks, the named registration can + # outlive them). Without this, the next start_supervised! call returns + # `{:error, {:already_started, pid}}` and ExUnit silently uses the existing + # (potentially queue-disabled) instance, ignoring the new opts. 
+ Durable.Supervisor.stop(name) + + start_supervised!({Durable, opts}) + name + end + + defp restore_disable_flag(nil), do: Application.delete_env(:durable, :disable_queue_processing) + + defp restore_disable_flag(value), + do: Application.put_env(:durable, :disable_queue_processing, value) + + # ============================================================================ + # PID <-> binary transport (for use with sink workflows that close over the + # test pid at workflow-input time) + # ============================================================================ + + @doc "Encodes a pid as an opaque base64 binary safe to put in workflow input." + def pid_to_bin(pid \\ self()) do + pid + |> :erlang.term_to_binary() + |> Base.encode64() + end + + @doc "Inverse of `pid_to_bin/1`." + def bin_to_pid(bin) do + bin + |> Base.decode64!() + |> :erlang.binary_to_term() + end + + # ============================================================================ + # Workflow execution helpers + # ============================================================================ + + @doc """ + Creates a workflow execution row for `module` with `input` and then drives it + synchronously via `Durable.Executor.execute_workflow/2`. Returns `{:ok, execution}` + holding the reloaded `WorkflowExecution` struct. + + Works in both queue-disabled (unit) and queue-enabled (supervised) test modes. 
+ """ + def create_and_execute_workflow(module, input, opts \\ []) do + config = Config.get(Durable) + repo = config.repo + {:ok, workflow_def} = module.__default_workflow__() + + attrs = %{ + workflow_module: Atom.to_string(module), + workflow_name: workflow_def.name, + status: :pending, + queue: Keyword.get(opts, :queue, "default"), + priority: Keyword.get(opts, :priority, 0), + input: input, + context: %{} + } + + {:ok, execution} = + %WorkflowExecution{} + |> WorkflowExecution.changeset(attrs) + |> repo.insert() + + Executor.execute_workflow(execution.id, config) + {:ok, repo.get!(WorkflowExecution, execution.id)} + end + + @doc "Loads all `StepExecution` rows for a workflow, oldest first." + def get_step_executions(workflow_id) do + repo = Config.get(Durable).repo + + repo.all( + from(s in StepExecution, + where: s.workflow_id == ^workflow_id, + order_by: [asc: s.inserted_at] + ) + ) + end + + @doc "Fetches a single `PendingInput` for a workflow by input name." + def get_pending_input(repo, workflow_id, input_name) do + repo.one( + from(p in PendingInput, + where: p.workflow_id == ^workflow_id and p.input_name == ^input_name + ) + ) + end + + @doc "Fetches a single `PendingEvent` for a workflow by event name." + def get_pending_event(repo, workflow_id, event_name) do + repo.one( + from(p in PendingEvent, + where: p.workflow_id == ^workflow_id and p.event_name == ^event_name + ) + ) + end + + @doc "Fetches the active `WaitGroup` for a workflow." + def get_wait_group(repo, workflow_id) do + repo.one(from(w in WaitGroup, where: w.workflow_id == ^workflow_id)) + end + + @doc "Returns child `WorkflowExecution` rows for a parent workflow." + def get_child_executions(repo, parent_id) do + repo.all(from(w in WorkflowExecution, where: w.parent_workflow_id == ^parent_id)) + end + + @doc """ + Executes every pending child of `parent_id` synchronously via the inline + executor. Useful for driving parallel fan-out deterministically. 
+ """ + def execute_children(repo, parent_id, config) do + parent_id + |> (&get_child_executions(repo, &1)).() + |> Enum.each(fn child -> + if child.status == :pending do + Executor.execute_workflow(child.id, config) + end + end) + end + + # ============================================================================ + # Supervised-runtime lookups + # ============================================================================ + + @doc """ + Returns the pid of the `Durable.Queue.Worker` GenServer currently executing + `job_id`, or `nil` if no worker matches. + + Uses `:sys.get_state/2` on each live worker — safe within tests, avoid in + hot loops. Only works when Durable is started with `queue_enabled: true`. + """ + def get_worker_pid(durable_name \\ Durable, queue_name \\ "default", job_id) do + worker_sup = + Module.concat([durable_name, Queue, WorkerSupervisor, camelize(queue_name)]) + + case Process.whereis(worker_sup) do + nil -> + nil + + sup_pid -> + sup_pid + |> DynamicSupervisor.which_children() + |> Enum.find_value(fn {_, pid, _, _} when is_pid(pid) -> + find_worker_for_job(pid, job_id) + end) + end + end + + defp find_worker_for_job(pid, job_id) do + state = :sys.get_state(pid, 100) + + if (is_map(state) and Map.get(state, :job)) && state.job.id == job_id do + pid + end + rescue + _ -> nil + catch + :exit, _ -> nil + end + + defp camelize(string) do + string + |> String.split("_") + |> Enum.map_join(&String.capitalize/1) + |> String.to_atom() + end +end diff --git a/durable/test/support/telemetry_handler.ex b/durable/test/support/telemetry_handler.ex new file mode 100644 index 0000000..0093a9f --- /dev/null +++ b/durable/test/support/telemetry_handler.ex @@ -0,0 +1,51 @@ +defmodule Durable.TelemetryHandler do + @moduledoc """ + Attaches a telemetry handler that forwards Durable events to the calling test + process as `{:event, suffix, measurements, metadata}` messages. 
+ + Each test gets an independent handler keyed by a unique name so handlers from + async-adjacent tests don't cross-talk. The handler is detached automatically + on test exit. + + ## Usage + + Durable.TelemetryHandler.attach_events() + + # run code that emits [:durable, :queue, :job_completed] + + assert_receive {:event, :job_completed, %{duration_ms: _}, %{status: :completed}} + """ + + import ExUnit.Callbacks, only: [on_exit: 1] + + @default_events [ + [:durable, :queue, :job_completed], + [:durable, :queue, :heartbeat], + [:durable, :queue, :stale_recovered], + [:durable, :queue, :zombie_recovered], + [:durable, :queue, :ack_failed] + ] + + @doc """ + Attaches a handler for `events` and returns `:ok`. + + Defaults to the full queue event list. Pass a subset to keep the mailbox lean. + The handler forwards each event to `test_pid` as + `{:event, last_segment, measurements, metadata}`. + """ + def attach_events(events \\ @default_events, test_pid \\ self()) do + handler_id = "durable-telemetry-handler-#{System.unique_integer([:positive])}" + + :telemetry.attach_many(handler_id, events, &__MODULE__.handle/4, test_pid) + + on_exit(fn -> :telemetry.detach(handler_id) end) + + :ok + end + + @doc false + def handle(event, measurements, metadata, test_pid) do + suffix = List.last(event) + send(test_pid, {:event, suffix, measurements, metadata}) + end +end diff --git a/test/support/test_repo.ex b/durable/test/support/test_repo.ex similarity index 100% rename from test/support/test_repo.ex rename to durable/test/support/test_repo.ex diff --git a/test/support/test_workflows.ex b/durable/test/support/test_workflows.ex similarity index 100% rename from test/support/test_workflows.ex rename to durable/test/support/test_workflows.ex diff --git a/durable/test/support/workflows/sink_workflow.ex b/durable/test/support/workflows/sink_workflow.ex new file mode 100644 index 0000000..80ba6b7 --- /dev/null +++ b/durable/test/support/workflows/sink_workflow.ex @@ -0,0 +1,114 @@ 
+defmodule Durable.TestWorkflows.SinkWorkflow do + @moduledoc """ + A test workflow whose behavior is chosen at runtime from its input `action`. + + Modeled on oban's `test/support/worker.ex` pattern. Eliminates the need to + define a new workflow module for every resilience scenario. + + ## Input + + %{ + "action" => "OK" | "ERROR" | "RAISE" | "EXIT" | "TASK_CRASH" | + "SLEEP" | "WAIT_EVENT" | "WAIT_INPUT", + "ref" => arbitrary ref echoed back to the test pid, + "bin_pid" => Durable.DataCase.pid_to_bin(self()), + + # Action-specific: + "sleep_ms" => integer (SLEEP) default: 50 + "event_name" => string (WAIT_EVENT) default: "evt" + "timeout_ms" => integer (WAIT_EVENT) optional + "input_name" => string (WAIT_INPUT) default: "input" + } + + The step sends `{:started, ref}` to the test pid for actions that *start* work + (SLEEP, WAIT_EVENT, WAIT_INPUT), and `{:done, ref}` on successful completion + (OK sends only `{:done, ref}`). Crash actions never send + `:done`; the test observes the crash through the workflow's persisted state. + + ## Intentional caveats + + - `"TASK_CRASH"` calls `Process.exit(self(), :kill)`. In supervised mode + `self()` is the Task the Worker spawned, so the Worker survives via + `{:DOWN, ...}`. In inline mode `self()` is the test process — DO NOT use + TASK_CRASH in non-supervised tests. + - `"EXIT"` uses the catchable `exit/1` form so the step runner's catch-all + translates it to `%{type: "exit", ...}`. + """ + + use Durable + use Durable.Context + use Durable.Wait + + workflow "sink" do + step(:run, fn data -> + # On the first invocation, `data` is the workflow input (string keys). + # On resume after a `wait_for_event`/`wait_for_input` timeout, `data` is + # the atomized merge of the workflow's persisted context with the + # timeout payload — the original input keys are not in there. Stash + # everything we need up front so the step body re-runs symmetrically. 
+ ref = data["ref"] || get_context(:ref) + bin_pid = data["bin_pid"] || get_context(:bin_pid) + action = data["action"] || get_context(:action) + event_name = data["event_name"] || get_context(:event_name) || "evt" + input_name = data["input_name"] || get_context(:input_name) || "input" + sleep_ms = data["sleep_ms"] || get_context(:sleep_ms) || 50 + timeout_ms = data["timeout_ms"] || get_context(:timeout_ms) + timeout_value = data["timeout_value"] || get_context(:timeout_value) + + put_context(:ref, ref) + put_context(:bin_pid, bin_pid) + put_context(:action, action) + put_context(:event_name, event_name) + put_context(:input_name, input_name) + put_context(:sleep_ms, sleep_ms) + if timeout_ms, do: put_context(:timeout_ms, timeout_ms) + if timeout_value, do: put_context(:timeout_value, timeout_value) + + pid = Durable.DataCase.bin_to_pid(bin_pid) + + case action do + "OK" -> + send(pid, {:done, ref}) + {:ok, %{"ran" => "OK", "ref" => ref}} + + "ERROR" -> + {:error, %{type: "sink_error", message: "ERROR action"}} + + "RAISE" -> + raise RuntimeError, "sink raise: ref=#{inspect(ref)}" + + "EXIT" -> + exit(:sink_exit) + + "TASK_CRASH" -> + Process.exit(self(), :kill) + + "SLEEP" -> + send(pid, {:started, ref}) + Process.sleep(sleep_ms) + send(pid, {:done, ref}) + {:ok, %{"ran" => "SLEEP", "slept_ms" => sleep_ms}} + + "WAIT_EVENT" -> + send(pid, {:started, ref}) + wait_opts = build_wait_opts(timeout_ms, timeout_value) + payload = wait_for_event(event_name, wait_opts) + send(pid, {:done, ref}) + {:ok, %{"ran" => "WAIT_EVENT", "payload" => payload}} + + "WAIT_INPUT" -> + send(pid, {:started, ref}) + response = wait_for_input(input_name, type: :approval, prompt: "OK?") + send(pid, {:done, ref}) + {:ok, %{"ran" => "WAIT_INPUT", "response" => response}} + end + end) + end + + defp build_wait_opts(timeout_ms, timeout_value) do + opts = [] + opts = if timeout_ms, do: Keyword.put(opts, :timeout, timeout_ms), else: opts + opts = if timeout_value, do: Keyword.put(opts, 
:timeout_value, timeout_value), else: opts + opts + end +end diff --git a/test/test_helper.exs b/durable/test/test_helper.exs similarity index 100% rename from test/test_helper.exs rename to durable/test/test_helper.exs diff --git a/durable_dashboard/.formatter.exs b/durable_dashboard/.formatter.exs new file mode 100644 index 0000000..4872046 --- /dev/null +++ b/durable_dashboard/.formatter.exs @@ -0,0 +1,5 @@ +[ + import_deps: [:ecto, :ecto_sql, :phoenix, :phoenix_live_view], + subdirectories: ["priv/*/migrations"], + inputs: ["*.{ex,exs}", "{config,lib,test}/**/*.{ex,exs}"] +] diff --git a/durable_dashboard/.gitignore b/durable_dashboard/.gitignore new file mode 100644 index 0000000..235c201 --- /dev/null +++ b/durable_dashboard/.gitignore @@ -0,0 +1,10 @@ +# Elixir +/_build/ +/deps/ +/cover/ +/doc/ +*.ez +erl_crash.dump + +# Node (assets/) +assets/node_modules/ diff --git a/durable_dashboard/CLAUDE.md b/durable_dashboard/CLAUDE.md new file mode 100644 index 0000000..f32e427 --- /dev/null +++ b/durable_dashboard/CLAUDE.md @@ -0,0 +1,41 @@ +# Durable Dashboard + +Hex package providing the web dashboard for the Durable workflow engine. +Architecture: Plug.Router entry point → Phoenix Router → LiveView pages, +with a single ReactFlow island for the workflow graph view (mounted via a +`phx-hook`). + +## Before changing any UI + +Read `DESIGN.md` in this directory. It is the source of truth for colors, +typography, spacing, motion, status semantics, component primitives, and +composition patterns. New visual decisions are made *there*, then applied +in code — not the other way around. + +If a needed pattern isn't in `DESIGN.md`, add it there in the same PR. + +## Stateless visual components + +Live in `lib/durable_dashboard/components/core.ex`: +`button, badge, status_pill, card, heading, code, kbd, relative_time, +icon, skeleton, empty_state, error_state`. Use these instead of +hand-rolling HTML — see `DESIGN.md` §5 for the API contract. 
+ +## Build & test + +```bash +# Assets (use pnpm, not npm) +cd assets && pnpm install && pnpm build + +# Type / lint / format checks +cd assets && pnpm exec tsc --noEmit && pnpm exec biome check src/ + +# Elixir +mix compile --warnings-as-errors +mix test +``` + +## Test DB + +Phoenix demo runs against port `53412`. See `examples/phoenix_demo` in the +repo root. diff --git a/durable_dashboard/DESIGN.md b/durable_dashboard/DESIGN.md new file mode 100644 index 0000000..025dbfe --- /dev/null +++ b/durable_dashboard/DESIGN.md @@ -0,0 +1,524 @@ +# Durable Dashboard — Design Language + +This document is the source of truth for every visual decision in the Durable +Dashboard. Read it before adding or modifying any UI surface. If a decision +isn't here, codify it here first; don't make it twice. + +> **Audience.** Any contributor (human or AI assistant) shipping a UI piece. +> **Goal.** A new contributor can answer *"how do I render a list of executions?"* +> after five minutes with this doc. + +## 1. Philosophy + +Durable Dashboard is a **workflow-engine console**. The aesthetic priorities +are, in order: + +1. **Data density.** Operators read this thing all day. Every pixel that + isn't carrying information is a tax. +2. **Dark-first.** The default theme is dark; light theme is supported with + the same visual fidelity for ops on bright monitors. +3. **Restrained color.** Color carries semantic meaning (status). It is + never used for branding or decoration. +4. **No chrome.** No drop shadows for depth, no gradients for polish, no + illustrations, no marketing copy. Elevation comes from background + contrast, not effects. + +**Inspirations** (not to imitate, but to calibrate against): + +- **Temporal Web** — data density and disciplined hierarchy. +- **Inngest** — polish and motion vocabulary. +- **Linear** — typography rhythm and command-palette discipline. +- **Argo Workflows / n8n** — workflow graph conventions. + +**Visual budget.** Every surface earns its weight. 
No card-in-card-in-card. +Whitespace is structural, not decorative — it separates regions, not +paragraphs. + +## 2. Foundations + +### 2.1 Colors + +Every color comes from a CSS custom property declared in +`assets/src/index.css`. Light + dark values are paired. Tailwind utility +classes (`bg-card`, `text-primary`, `border-border`) read these tokens via +the `@theme inline` block. + +| Token | Light | Dark | Use | +| ----------------------- | ------------------------ | ------------------------- | --------------------------------------------- | +| `--background` | `oklch(1 0 0)` | `oklch(0.145 0 0)` | App canvas | +| `--foreground` | `oklch(0.18 0 0)` | `oklch(0.98 0 0)` | Primary text | +| `--card` | `oklch(0.985 0 0)` | `oklch(0.185 0 0)` | Surfaces above the canvas | +| `--popover` | `oklch(0.99 0 0)` | `oklch(0.205 0 0)` | Floating surfaces | +| `--primary` | `oklch(0.5 0.2 250)` | `oklch(0.72 0.16 250)` | Primary actions, focus, active nav | +| `--secondary` | `oklch(0.96 0 0)` | `oklch(0.22 0 0)` | Secondary buttons, neutral chips | +| `--accent` | `oklch(0.95 0 0)` | `oklch(0.24 0 0)` | Hover backgrounds | +| `--muted` | `oklch(0.96 0 0)` | `oklch(0.21 0 0)` | Subdued surfaces | +| `--muted-foreground` | `oklch(0.5 0 0)` | `oklch(0.62 0 0)` | Secondary text | +| `--success` | `oklch(0.55 0.18 155)` | `oklch(0.78 0.16 155)` | Status: completed / running | +| `--warning` | `oklch(0.65 0.18 75)` | `oklch(0.82 0.16 80)` | Status: waiting / compensating | +| `--destructive` | `oklch(0.55 0.22 22)` | `oklch(0.72 0.20 22)` | Status: failed / timeout, destructive actions | +| `--info` | `oklch(0.55 0.16 230)` | `oklch(0.78 0.13 230)` | Status: scheduled | +| `--border` | `oklch(0.92 0 0)` | `oklch(0.27 0 0)` | Hairline dividers | +| `--input` | `oklch(0.94 0 0)` | `oklch(1 0 0 / 8%)` | Input borders/backgrounds | +| `--ring` | `oklch(0.5 0.2 250 / 0.4)` | `oklch(0.72 0.16 250 / 0.55)` | Focus rings | + +#### Forbidden + +- Hex literals — `#3b82f6`, `#22c55e`, 
`#fff`. Use tokens. +- Inline OKLCH/HSL/RGB in components — `oklch(0.78 0.16 155)`, + `rgb(34 197 94)`. Use tokens. +- Tailwind palette colors — `text-blue-500`, `bg-green-100`. Use tokens. +- Any new color outside the table without first adding it here. + +#### Opacity + +Use the slash modifier on token classes for tints: `bg-success/10`, +`text-primary/80`, `border-destructive/20`. Standard tints: `/5`, `/10`, +`/15`, `/20`, `/40`, `/60`, `/80`. Anything else needs justification. + +### 2.2 Typography + +- `--font-sans` = **Inter Variable** with `cv11` enabled. +- `--font-mono` = **JetBrains Mono Variable** with ligatures off. + +Use `font-mono` (or the `text-numeric` utility for tabular numerics) on: +IDs, durations, timestamps, JSON, code, status pills, dense numeric tables. + +#### Type scale + +Seven sizes, plus one graph-only exception. Anything else needs a comment explaining why. + +| Class | Size | Use | +| -------------- | ---- | ------------------------------------------------------ | +| `text-[9px]` | 9 | *Exception only:* graph-marker micro-eyebrows (start/end labels, group badges). | +| `text-[10px]` | 10 | Eyebrow / uppercase chip / kbd / footer hints | +| `text-[11px]` | 11 | Footnote, dense table cell, tertiary metadata | +| `text-xs` | 12 | Body small, control labels, navigation items | +| `text-[13px]` | 13 | Default body, default button text, form fields | +| `text-sm` | 14 | Card titles, primary body, list-row titles | +| `text-[18px]` | 18 | Section heading (h2) | +| `text-[22px]` | 22 | Page heading (h1) | + +Two text utilities are declared in `index.css`: + +- `text-heading` — applies `font-weight: 600`, `letter-spacing: -0.015em`, + `line-height: 1.2`. Use on h1/h2/h3. +- `text-numeric` — applies `font-mono`, tabular nums, slashed zero. + +### 2.3 Spacing + +4 px grid. Allowed tailwind values: `0.5, 1, 1.5, 2, 2.5, 3, 4, 6, 8, 12, 16`. +Snap everything else. 
+ +Common rhythms: + +- `gap-1.5` — chips, inline status indicators +- `gap-2` — adjacent controls (button + button) +- `gap-3` — related controls (input + button group) +- `gap-4` — within a card body +- `gap-6` — between major sections + +Page padding: `px-6 py-4` standard. Sheet / dialog inner padding: `p-4`. + +### 2.4 Radius + +Maps to `--radius-*` tokens; use the Tailwind shortcuts: + +| Class | px | Use | +| ---------------- | --- | ------------------------------------------------ | +| `rounded-sm` | 2 | Chips, badges, kbd, dense controls | +| `rounded-md` | 4 | Buttons, inputs, cards, surface containers | +| `rounded-lg` | 6 | Sheets, dialogs, large surfaces | +| `rounded-full` | ∞ | Avatars, status dots, circular buttons | + +No 8 px or 10 px radii. They look like marketing components. + +### 2.5 Borders + +- 1 px hairline default — `border border-border`. +- 1.5 px only for focus rings (`ring-2 ring-ring/40`). +- 2 px reserved for status emphasis (e.g. "current execution" outline on a + graph node — see §11). +- Avoid double borders (border + ring without offset). + +### 2.6 Shadows + +`shadow-sm` only. Elevation is achieved via *background contrast* (card +above canvas), not blurred shadows. The one exception: popovers +and other floating overlays may use `shadow-lg`. + +## 3. Motion + +Three named animations only. Defined in `assets/src/index.css`. + +| Name | Duration | Easing | Use | +| ----------------- | -------- | ----------- | ---------------------------------------------------- | +| `led-dot` | 1.6 s | ease-in-out | Running/active status dots; pulses opacity + glow. | +| `dash-flow` | 1.0 s | linear | Flowing edges in the workflow graph. | +| `animate-pulse` | 2.0 s | cubic-bezier| Skeleton loaders only (never on real content). | + +Transitions: `transition-colors duration-150` for hover/focus. Nothing +slower than 200 ms. Layout transitions (`transition-all`) are forbidden — +they make data-dense UIs feel sluggish. 
+ +`prefers-reduced-motion: reduce` disables all three animations through a +single media query in `index.css`. Don't bypass it. + +## 4. Status semantics — canonical table + +This is the **single source of truth** for every status string the system +emits. Do not invent local mappings. + +| Status | Color tier | Dot | Label | +| ----------------- | --------------- | ------- | -------------- | +| `pending` | `muted` | none | "pending" | +| `running` | `success` | pulse | "running" | +| `waiting` | `warning` | solid | "waiting" | +| `completed` | `success` | solid | "completed" | +| `failed` | `destructive` | none | "failed" | +| `cancelled` | `muted` | none | "cancelled" | +| `compensating` | `warning` | pulse | "compensating" | +| `compensated` | `muted` | none | "compensated" | +| `compensation_failed` | `destructive` | none | "comp. failed" | +| `scheduled` | `info` | none | "scheduled" | +| `timeout` | `destructive` | none | "timeout" | + +To add a status: + +1. Update this table. +2. Update `Components.Core.status_meta/1` (HEEx-side) and the `toneFor` + helpers in `step_node.tsx` and any other React node components. +3. Update the workflow query/schema if needed. + +If those four don't agree, the table wins. + +## 5. Component primitives — `Components.Core` API contract + +Stateless visual primitives live in +`lib/durable_dashboard/components/core.ex`. Every primitive listed here is +already implemented; see the source for the full attr list. + +### `<.button>` + +Variants: `primary | secondary | ghost | destructive | link`. +Sizes: `sm` (28h) | `md` (32h, default) | `lg` (40h). 
+ +```heex +<.button kind="primary" type="submit">Save +<.button kind="ghost" size="sm" phx-click="cancel">Cancel +``` + +**Don't** roll your own ` + ); +} diff --git a/durable_dashboard/assets/src/components/layout/TopBar.tsx b/durable_dashboard/assets/src/components/layout/TopBar.tsx new file mode 100644 index 0000000..665eb7b --- /dev/null +++ b/durable_dashboard/assets/src/components/layout/TopBar.tsx @@ -0,0 +1,103 @@ +import { useMemo } from "react"; +import { + Breadcrumb, + BreadcrumbItem, + BreadcrumbLink, + BreadcrumbList, + BreadcrumbPage, + BreadcrumbSeparator, +} from "@/components/ui/breadcrumb"; +import { Separator } from "@/components/ui/separator"; +import { SidebarTrigger } from "@/components/ui/sidebar"; +import type { NavigateFn, ViewName } from "@/lib/types"; + +interface TopBarProps { + currentView: ViewName; + viewParams: Record; + navigate: NavigateFn; +} + +type Crumb = { + label: string; + onClick?: () => void; +}; + +const VIEW_LABELS: Record = { + overview: "Overview", + workflows: "Workflows", + workflow_detail: "Workflows", + schedules: "Schedules", + inputs: "Inputs", + settings: "Settings", +}; + +const TAB_LABELS: Record = { + summary: "Summary", + flow: "Flow", + topology: "Topology", + logs: "Logs", + io: "I/O", + history: "History", +}; + +export function TopBar({ currentView, viewParams, navigate }: TopBarProps) { + const crumbs = useMemo(() => { + const items: Crumb[] = []; + + if (currentView === "workflow_detail") { + items.push({ + label: "Workflows", + onClick: () => navigate("workflows"), + }); + + const name = viewParams.name || viewParams.id?.slice(0, 8) || "Detail"; + items.push({ + label: name, + onClick: viewParams.tab + ? 
() => + navigate("workflow_detail", { + id: viewParams.id, + }) + : undefined, + }); + + if (viewParams.tab) { + items.push({ + label: TAB_LABELS[viewParams.tab] || viewParams.tab, + }); + } + } else { + items.push({ + label: VIEW_LABELS[currentView] || currentView, + }); + } + + return items; + }, [currentView, viewParams, navigate]); + + return ( +
+ + + + + {crumbs.map((crumb, i) => { + const isLast = i === crumbs.length - 1; + return ( + + {i > 0 && } + {isLast || !crumb.onClick ? ( + {crumb.label} + ) : ( + + {crumb.label} + + )} + + ); + })} + + +
+ ); +} diff --git a/durable_dashboard/assets/src/components/shared/InputForm.tsx b/durable_dashboard/assets/src/components/shared/InputForm.tsx new file mode 100644 index 0000000..254d6a5 --- /dev/null +++ b/durable_dashboard/assets/src/components/shared/InputForm.tsx @@ -0,0 +1,258 @@ +import { useState } from "react"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { Label } from "@/components/ui/label"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { Textarea } from "@/components/ui/textarea"; +import type { PendingInput } from "@/lib/types"; + +interface InputFormProps { + input: PendingInput; + onSubmit: (data: unknown) => void; +} + +export function InputForm({ input, onSubmit }: InputFormProps) { + switch (input.input_type) { + case "approval": + return ; + case "single_choice": + return ; + case "multi_choice": + return ; + case "free_text": + return ; + case "form": + return ; + default: + return ; + } +} + +function ApprovalForm({ onSubmit }: { input: PendingInput; onSubmit: (data: unknown) => void }) { + const [submitting, setSubmitting] = useState(false); + + const handle = async (approved: boolean) => { + setSubmitting(true); + await onSubmit(approved ? "approved" : "rejected"); + setSubmitting(false); + }; + + return ( +
+ + +
+ ); +} + +function SingleChoiceForm({ + input, + onSubmit, +}: { + input: PendingInput; + onSubmit: (data: unknown) => void; +}) { + const [selected, setSelected] = useState(""); + const [submitting, setSubmitting] = useState(false); + const choices = input.fields || []; + + const handle = async () => { + if (!selected) return; + setSubmitting(true); + await onSubmit(selected); + setSubmitting(false); + }; + + return ( +
+
+ {choices.map((choice, i) => { + const val = choice.value || choice.label || String(i); + return ( + + ); + })} +
+ +
+ ); +} + +function MultiChoiceForm({ + input, + onSubmit, +}: { + input: PendingInput; + onSubmit: (data: unknown) => void; +}) { + const [selected, setSelected] = useState>(new Set()); + const [submitting, setSubmitting] = useState(false); + const choices = input.fields || []; + + const toggle = (value: string) => { + setSelected((prev) => { + const next = new Set(prev); + if (next.has(value)) next.delete(value); + else next.add(value); + return next; + }); + }; + + const handle = async () => { + setSubmitting(true); + await onSubmit(Array.from(selected)); + setSubmitting(false); + }; + + return ( +
+
+ {choices.map((choice, i) => { + const val = choice.value || choice.label || String(i); + return ( + + ); + })} +
+ +
+ ); +} + +function FreeTextForm({ + input, + onSubmit, +}: { + input: PendingInput; + onSubmit: (data: unknown) => void; +}) { + const [text, setText] = useState(""); + const [submitting, setSubmitting] = useState(false); + + const handle = async () => { + if (!text.trim()) return; + setSubmitting(true); + await onSubmit(text); + setSubmitting(false); + }; + + return ( +
+ + <% "number" -> %> + + <% _ -> %> + + <% end %> + + """ + end + + defp option_value(%{value: v}), do: to_string(v) + defp option_value(%{"value" => v}), do: to_string(v) + defp option_value(s) when is_binary(s), do: s + defp option_value(s) when is_atom(s), do: Atom.to_string(s) + + defp option_label(%{label: l}), do: l + defp option_label(%{"label" => l}), do: l + defp option_label(s) when is_binary(s), do: s + defp option_label(s) when is_atom(s), do: Atom.to_string(s) + + defp sget(map, key) when is_binary(key) do + Map.get(map, key) || Map.get(map, String.to_atom(key)) + end + + defp humanize(nil), do: "" + defp humanize(name), do: name |> to_string() |> String.replace("_", " ") |> String.capitalize() +end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_controller.ex b/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_controller.ex deleted file mode 100644 index 6d6b312..0000000 --- a/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_controller.ex +++ /dev/null @@ -1,7 +0,0 @@ -defmodule PhoenixDemoWeb.PageController do - use PhoenixDemoWeb, :controller - - def home(conn, _params) do - redirect(conn, to: ~p"/workflows") - end -end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_html.ex b/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_html.ex index a7bd233..e69de29 100644 --- a/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_html.ex +++ b/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_html.ex @@ -1,10 +0,0 @@ -defmodule PhoenixDemoWeb.PageHTML do - @moduledoc """ - This module contains pages rendered by PageController. - - See the `page_html` directory for all templates available. 
- """ - use PhoenixDemoWeb, :html - - embed_templates "page_html/*" -end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_html/home.html.heex b/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_html/home.html.heex deleted file mode 100644 index b107fd0..0000000 --- a/examples/phoenix_demo/lib/phoenix_demo_web/controllers/page_html/home.html.heex +++ /dev/null @@ -1,202 +0,0 @@ - - -
-
- -
-

- Phoenix Framework - - v{Application.spec(:phoenix, :vsn)} - -

- -
- -

- Peace of mind from prototype to production. -

-

- Build rich, interactive web applications quickly, with less code and fewer moving parts. Join our growing community of developers using Phoenix to craft APIs, HTML5 apps and more, for fun or at scale. -

- -
-
diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/approval_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/approval_live.ex deleted file mode 100644 index 9070a2b..0000000 --- a/examples/phoenix_demo/lib/phoenix_demo_web/live/approval_live.ex +++ /dev/null @@ -1,287 +0,0 @@ -defmodule PhoenixDemoWeb.ApprovalLive do - @moduledoc """ - LiveView for managing human-in-the-loop approvals. - Shows pending approval requests and allows users to approve or reject them. - """ - - use PhoenixDemoWeb, :live_view - - alias Durable.Config - alias Durable.Storage.Schemas.{PendingInput, WorkflowExecution} - alias Durable.Wait - - import Ecto.Query - - @impl true - def mount(_params, _session, socket) do - if connected?(socket) do - Phoenix.PubSub.subscribe(PhoenixDemo.PubSub, "workflows") - :timer.send_interval(2000, self(), :refresh) - end - - {:ok, - assign(socket, - page_title: "Pending Approvals", - pending_approvals: list_pending_approvals(), - selected: nil, - reason: "" - )} - end - - @impl true - def handle_info(:refresh, socket) do - {:noreply, assign(socket, pending_approvals: list_pending_approvals())} - end - - def handle_info({:workflow_started, _id}, socket) do - {:noreply, assign(socket, pending_approvals: list_pending_approvals())} - end - - def handle_info({:workflow_completed, _id, _report}, socket) do - {:noreply, assign(socket, pending_approvals: list_pending_approvals())} - end - - def handle_info({:workflow_rejected, _id, _result}, socket) do - {:noreply, assign(socket, pending_approvals: list_pending_approvals())} - end - - @impl true - def handle_event("select", %{"id" => id}, socket) do - selected = Enum.find(socket.assigns.pending_approvals, &(&1.id == id)) - {:noreply, assign(socket, selected: selected, reason: "")} - end - - @impl true - def handle_event("close_modal", _params, socket) do - {:noreply, assign(socket, selected: nil, reason: "")} - end - - @impl true - def handle_event("update_reason", %{"reason" => reason}, socket) do 
- {:noreply, assign(socket, reason: reason)} - end - - @impl true - def handle_event("approve", %{"id" => id}, socket) do - pending = Enum.find(socket.assigns.pending_approvals, &(&1.id == id)) - - if pending do - response = %{ - "approved" => true, - "approved_by" => "manager", - "approved_at" => DateTime.utc_now() |> DateTime.to_iso8601() - } - - case Wait.provide_input(pending.workflow_id, pending.input_name, response) do - :ok -> - {:noreply, - socket - |> assign( - pending_approvals: list_pending_approvals(), - selected: nil, - reason: "" - ) - |> put_flash(:info, "Approval granted successfully!")} - - {:error, reason} -> - {:noreply, put_flash(socket, :error, "Failed to approve: #{inspect(reason)}")} - end - else - {:noreply, socket} - end - end - - @impl true - def handle_event("reject", %{"id" => id}, socket) do - pending = Enum.find(socket.assigns.pending_approvals, &(&1.id == id)) - - if pending do - response = %{ - "approved" => false, - "reason" => socket.assigns.reason || "Rejected by reviewer", - "rejected_by" => "manager", - "rejected_at" => DateTime.utc_now() |> DateTime.to_iso8601() - } - - case Wait.provide_input(pending.workflow_id, pending.input_name, response) do - :ok -> - {:noreply, - socket - |> assign( - pending_approvals: list_pending_approvals(), - selected: nil, - reason: "" - ) - |> put_flash(:info, "Workflow rejected.")} - - {:error, reason} -> - {:noreply, put_flash(socket, :error, "Failed to reject: #{inspect(reason)}")} - end - else - {:noreply, socket} - end - end - - defp list_pending_approvals do - config = Config.get(Durable) - repo = config.repo - - repo.all( - from(p in PendingInput, - join: w in WorkflowExecution, - on: p.workflow_id == w.id, - where: p.status == :pending and p.input_type == :approval, - order_by: [asc: p.inserted_at], - preload: [:workflow], - select_merge: %{workflow: w} - ) - ) - end - - defp format_time(nil), do: "-" - - defp format_time(datetime) do - Calendar.strftime(datetime, "%Y-%m-%d %H:%M:%S") - 
end - - @impl true - def render(assigns) do - ~H""" - <.header> - Pending Approvals - <:subtitle>Review and approve document processing requests - <:actions> - <.button navigate={~p"/workflows"}> - <.icon name="hero-arrow-left" class="size-4 mr-1" /> Back to Dashboard - - - - -
-
- <.icon name="hero-inbox" class="size-16 mx-auto text-base-content/30" /> -

No pending approvals

-

- When workflows need approval, they'll appear here. -

- <.button navigate={~p"/workflows/new"} class="mt-4" variant="primary"> - Create a Workflow - -
- -
-
-
-
-
-

- <.icon name="hero-document-text" class="size-5" /> - {approval.input_name} -

-

- {approval.prompt} -

-
- Pending -
- -
- -
-
- Workflow ID: - - {String.slice(approval.workflow_id, 0, 8)}... - -
-
- Step: - {approval.step_name} -
-
- Created: - {format_time(approval.inserted_at)} -
-
- Timeout: - {format_time(approval.timeout_at)} -
-
- -
-
- - View Document Data - -
-
{Jason.encode!(approval.metadata, pretty: true)}
-
-
-
- -
- -
-
-
-
-
- - - - """ - end -end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/document_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/document_live.ex deleted file mode 100644 index c3cfbb0..0000000 --- a/examples/phoenix_demo/lib/phoenix_demo_web/live/document_live.ex +++ /dev/null @@ -1,248 +0,0 @@ -defmodule PhoenixDemoWeb.DocumentLive do - @moduledoc """ - LiveView for uploading documents and starting processing workflows. - Supports actual file uploads with real processing. - """ - - use PhoenixDemoWeb, :live_view - - alias PhoenixDemo.Workflows.DocumentWorkflow - - @impl true - def mount(_params, _session, socket) do - {:ok, - socket - |> assign(page_title: "Upload Document", workflow_id: nil, error: nil) - |> allow_upload(:document, - accept: ~w(.pdf .txt .csv .md .json .xml), - max_entries: 1, - max_file_size: 10_000_000 - )} - end - - @impl true - def handle_event("validate", _params, socket) do - {:noreply, socket} - end - - @impl true - def handle_event("cancel-upload", %{"ref" => ref}, socket) do - {:noreply, cancel_upload(socket, :document, ref)} - end - - @impl true - def handle_event("submit", _params, socket) do - case uploaded_entries(socket, :document) do - {[_ | _], []} -> - # Process the upload - [result] = - consume_uploaded_entries(socket, :document, fn %{path: path}, entry -> - # Save file to uploads directory - filename = entry.client_name - dest_path = Path.join(uploads_dir(), "#{entry.uuid}_#{filename}") - File.cp!(path, dest_path) - - {:ok, - %{ - "filename" => filename, - "path" => dest_path, - "size" => entry.client_size, - "content_type" => entry.client_type - }} - end) - - # Start the workflow with file info - case Durable.start(DocumentWorkflow, result) do - {:ok, workflow_id} -> - Phoenix.PubSub.broadcast( - PhoenixDemo.PubSub, - "workflows", - {:workflow_started, workflow_id} - ) - - {:noreply, - socket - |> assign(workflow_id: workflow_id, error: nil) - |> put_flash(:info, "Document uploaded and workflow 
started!")} - - {:error, reason} -> - {:noreply, assign(socket, error: "Failed to start workflow: #{inspect(reason)}")} - end - - {[], []} -> - {:noreply, assign(socket, error: "Please select a file to upload")} - - {_, [_ | _]} -> - {:noreply, socket} - end - end - - defp uploads_dir do - Path.join(:code.priv_dir(:phoenix_demo), "uploads") - end - - defp error_to_string(:too_large), do: "File is too large (max 10MB)" - defp error_to_string(:not_accepted), do: "File type not accepted" - defp error_to_string(:too_many_files), do: "Only one file allowed" - defp error_to_string(err), do: "Error: #{inspect(err)}" - - @impl true - def render(assigns) do - ~H""" - <.header> - Upload Document - <:subtitle>Upload a document to start the processing workflow - <:actions> - <.button navigate={~p"/workflows"}> - <.icon name="hero-arrow-left" class="size-4 mr-1" /> Back to Dashboard - - - - -
-
-
-

Select Document

- - <.form for={%{}} phx-change="validate" phx-submit="submit" class="mt-4"> -
- <.live_file_input upload={@uploads.document} class="hidden" /> - -
- <.icon name="hero-cloud-arrow-up" class="size-12 mx-auto text-base-content/40" /> -

- Drag and drop a file here, or - -

-

- Supported: PDF, TXT, CSV, MD, JSON, XML (max 10MB) -

-
- - <%= for entry <- @uploads.document.entries do %> -
-
- <.icon name={file_icon(entry.client_name)} class="size-8 text-primary" /> -
-

{entry.client_name}

-

- {format_bytes(entry.client_size)} -

-
-
- -
- -
0 and entry.progress < 100} class="mt-2"> - - -
- - <%= for err <- upload_errors(@uploads.document, entry) do %> -

{error_to_string(err)}

- <% end %> - <% end %> -
- -
- <.icon name="hero-exclamation-circle" class="size-5" /> - {@error} -
- -
- <.button - type="submit" - variant="primary" - disabled={@uploads.document.entries == []} - > - <.icon name="hero-arrow-up-tray" class="size-4 mr-1" /> Upload & Process - -
- -
-
- -
-
- <.icon name="hero-check-circle" class="size-6" /> -
-

Workflow Started!

-

- Workflow ID: {@workflow_id} -

-
-
- <.link navigate={~p"/workflows/#{@workflow_id}"} class="btn btn-sm"> - View Progress - -
-
-
- -
-
-
-

- <.icon name="hero-cog-6-tooth" class="size-5" /> Processing Steps -

-
    -
  • Validate file format
  • -
  • Analyze content (size, lines, etc.)
  • -
  • Check if approval needed
  • -
  • Transform & generate report
  • -
-
-
- -
-
-

- <.icon name="hero-shield-check" class="size-5" /> Approval Rules -

-
-
- Requires Approval - PDF files -
-
- Auto-Approved - TXT, CSV, MD, JSON, XML -
-
-
-
-
-
- """ - end - - defp file_icon(filename) do - ext = Path.extname(filename) |> String.downcase() - - case ext do - ".pdf" -> "hero-document" - ".txt" -> "hero-document-text" - ".csv" -> "hero-table-cells" - ".md" -> "hero-document-text" - ".json" -> "hero-code-bracket" - ".xml" -> "hero-code-bracket" - _ -> "hero-document" - end - end - - defp format_bytes(bytes) when bytes < 1024, do: "#{bytes} B" - defp format_bytes(bytes) when bytes < 1_048_576, do: "#{Float.round(bytes / 1024, 1)} KB" - defp format_bytes(bytes), do: "#{Float.round(bytes / 1_048_576, 1)} MB" -end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/executions_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/executions_live.ex new file mode 100644 index 0000000..868f971 --- /dev/null +++ b/examples/phoenix_demo/lib/phoenix_demo_web/live/executions_live.ex @@ -0,0 +1,211 @@ +defmodule PhoenixDemoWeb.ExecutionsLive do + @moduledoc """ + Lists workflow executions with URL-driven filters. Replaces the older + WorkflowLive dashboard. 
+ """ + use PhoenixDemoWeb, :live_view + + alias Durable.Config + alias Durable.Storage.Schemas.WorkflowExecution + + import Ecto.Query + + @statuses ~w(all pending running waiting completed failed cancelled) + + @impl true + def mount(_params, _session, socket) do + if connected?(socket) do + Phoenix.PubSub.subscribe(PhoenixDemo.PubSub, "workflows") + :timer.send_interval(2_000, self(), :refresh) + end + + {:ok, + assign(socket, + page_title: "Executions", + active_nav: :executions, + status: "all", + module: nil, + workflows: [] + )} + end + + @impl true + def handle_params(params, _uri, socket) do + status = if params["status"] in @statuses, do: params["status"], else: "all" + module = params["workflow"] + + socket = + socket + |> assign(status: status, module: module) + |> assign(workflows: list_workflows(status, module)) + |> assign(modules: list_modules()) + + {:noreply, socket} + end + + @impl true + def handle_info(:refresh, socket) do + {:noreply, assign(socket, workflows: list_workflows(socket.assigns.status, socket.assigns.module), modules: list_modules())} + end + + def handle_info({event, _, _}, socket) when event in [:workflow_completed, :workflow_rejected] do + {:noreply, assign(socket, workflows: list_workflows(socket.assigns.status, socket.assigns.module))} + end + + def handle_info(_, socket), do: {:noreply, socket} + + defp list_workflows(status, module) do + config = Config.get(Durable) + repo = config.repo + + query = + from(w in WorkflowExecution, + order_by: [desc: w.inserted_at], + limit: 100 + ) + + query = + case status do + "all" -> query + s -> from(w in query, where: w.status == ^String.to_atom(s)) + end + + query = + case module do + nil -> query + "" -> query + m -> from(w in query, where: w.workflow_module == ^m) + end + + repo.all(query) + end + + defp list_modules do + config = Config.get(Durable) + repo = config.repo + + repo.all( + from(w in WorkflowExecution, + select: w.workflow_module, + distinct: true, + order_by: 
w.workflow_module + ) + ) + |> Enum.reject(&is_nil/1) + end + + defp status_badge(status) do + case status do + :pending -> "badge badge-warning" + :running -> "badge badge-info" + :waiting -> "badge badge-secondary" + :completed -> "badge badge-success" + :failed -> "badge badge-error" + :cancelled -> "badge badge-ghost" + _ -> "badge" + end + end + + defp short_module(nil), do: "-" + defp short_module(mod), do: mod |> String.split(".") |> List.last() + + defp format_time(nil), do: "-" + defp format_time(dt), do: Calendar.strftime(dt, "%Y-%m-%d %H:%M:%S") + + @impl true + def render(assigns) do + ~H""" + +
+
+

Executions

+

All workflow runs across the demo.

+
+ <.link navigate={~p"/"} class="btn btn-sm btn-ghost"> + <.icon name="hero-plus" class="size-4" /> Run a workflow + +
+ +
+
+ <.link + :for={s <- ~w(all pending running waiting completed failed cancelled)} + patch={status_path(s, @module)} + role="tab" + class={["tab", @status == s && "tab-active"]} + > + {s} + +
+ +
+
+ +
+
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + +
IDWorkflowStatusStepCreated
{String.slice(w.id, 0, 8)} +
{w.workflow_name}
+
{short_module(w.workflow_module)}
+
{w.status}{w.current_step || "-"}{format_time(w.inserted_at)} + <.link navigate={~p"/executions/#{w.id}"} class="btn btn-xs btn-ghost"> + View + +
+ No executions match these filters. +
+
+
+ """ + end + + @impl true + def handle_event("filter_module", %{"module" => mod}, socket) do + {:noreply, push_patch(socket, to: status_path(socket.assigns.status, blank_to_nil(mod)))} + end + + defp blank_to_nil(""), do: nil + defp blank_to_nil(v), do: v + + defp status_path(status, module) do + params = + [{"status", status}, {"workflow", module}] + |> Enum.reject(fn {_k, v} -> v in [nil, "", "all"] end) + + case params do + [] -> ~p"/executions" + _ -> ~p"/executions?#{params}" + end + end +end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/home_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/home_live.ex new file mode 100644 index 0000000..b00c59a --- /dev/null +++ b/examples/phoenix_demo/lib/phoenix_demo_web/live/home_live.ex @@ -0,0 +1,277 @@ +defmodule PhoenixDemoWeb.HomeLive do + @moduledoc """ + Workflow hub: a card per demo workflow. Each card opens a modal with a + simulated-input form that triggers the workflow via `Durable.start/2` + (or `Durable.trigger_schedule/1` for the cron entry). + """ + use PhoenixDemoWeb, :live_view + + import PhoenixDemoWeb.WorkflowForm + + alias PhoenixDemo.Workflows + + @workflows [ + %{ + key: "order_fulfillment", + title: "Order Fulfillment", + module: Workflows.OrderFulfillmentWorkflow, + description: + "End-to-end order: reserve inventory → call PaymentWorkflow → ship → confirm. Toggle the failure flag to watch saga compensation cascade.", + pills: ["call_workflow", "saga", "compensate"], + fields: [ + %{name: "order_id", label: "Order ID", type: "text", default: "ORD-1042"}, + %{name: "amount", label: "Amount ($)", type: "number", default: 99.99, required: true}, + %{ + name: "force_failure", + label: "Force shipping failure (triggers saga rollback)", + type: "checkbox", + default: false + } + ] + }, + %{ + key: "payment", + title: "Payment", + module: Workflows.PaymentWorkflow, + description: + "Standalone or child workflow. 
Authorize step has retry: max_attempts: 3, backoff: :exponential — random failures show as separate attempts.", + pills: ["retry", "exponential backoff", "composition"], + fields: [ + %{name: "amount", label: "Amount ($)", type: "number", default: 49.50, required: true}, + %{ + name: "force_failure", + label: "Force authorization failure", + type: "checkbox", + default: false + } + ] + }, + %{ + key: "expense_approval", + title: "Expense Approval", + module: Workflows.ExpenseApprovalWorkflow, + description: + "Collects an expense via wait_for_form, then routes to single (manager) or dual (manager + CFO via wait_for_all) approval based on amount.", + pills: ["wait_for_form", "wait_for_all", "decision"], + fields: [ + %{name: "employee", label: "Employee", type: "text", default: "Alice", required: true} + ] + }, + %{ + key: "content_moderation", + title: "Content Moderation", + module: Workflows.ContentModerationWorkflow, + description: + "Three parallel mock-AI scans aggregate to a max-score, then branch routes to auto-remove / human-review / auto-approve.", + pills: ["parallel", "branch", "wait_for_choice"], + fields: [ + %{name: "content_id", label: "Content ID", type: "text", default: "POST-9001"}, + %{ + name: "content_type", + label: "Content type", + type: "select", + options: ["text", "image", "video"], + default: "image" + } + ] + }, + %{ + key: "payment_reconciliation", + title: "Payment Reconciliation", + module: Workflows.PaymentReconciliationWorkflow, + description: + "Submits to a (mock) processor and parks waiting for a webhook event. Send the event from /pending-events to resume.", + pills: ["wait_for_event", "send_event"], + fields: [ + %{name: "transaction_id", label: "Transaction ID", type: "text", default: "TXN-7700"} + ] + }, + %{ + key: "drip_email", + title: "Drip Email Campaign", + module: Workflows.DripEmailCampaignWorkflow, + description: + "Welcome → schedule_at(+30s) → day-2 → sleep(30s) → day-7. 
The status flips :running ↔ :waiting visibly across the run.", + pills: ["sleep", "schedule_at", "multi-stage"], + fields: [ + %{ + name: "customer_email", + label: "Customer email", + type: "text", + default: "alice@example.com", + required: true + }, + %{name: "campaign", label: "Campaign", type: "text", default: "welcome"} + ] + }, + %{ + key: "hourly_metrics_cron", + kind: :cron, + schedule_name: "hourly_metrics", + title: "Hourly Metrics (cron)", + module: Workflows.HourlyMetricsCronWorkflow, + description: + "Auto-runs every minute via @schedule. Click Run now to fire one immediately, or visit /schedules to watch it tick.", + pills: ["@schedule", "cron", "scheduled"], + fields: [] + } + ] + + @impl true + def mount(_params, _session, socket) do + {:ok, + assign(socket, + page_title: "Durable Demo", + active_nav: :home, + workflows: @workflows, + open_modal: nil + )} + end + + @impl true + def handle_event("open_modal", %{"key" => key}, socket) do + case Enum.find(@workflows, &(&1.key == key)) do + nil -> {:noreply, socket} + wf -> {:noreply, assign(socket, open_modal: wf)} + end + end + + def handle_event("close_modal", _params, socket) do + {:noreply, assign(socket, open_modal: nil)} + end + + def handle_event("run_workflow", params, socket) do + case socket.assigns.open_modal do + nil -> + {:noreply, socket} + + %{kind: :cron, schedule_name: name} -> + case Durable.trigger_schedule(name) do + {:ok, workflow_id} -> + {:noreply, + socket + |> assign(open_modal: nil) + |> put_flash(:info, "Triggered scheduled run.") + |> push_navigate(to: ~p"/executions/#{workflow_id}")} + + {:error, reason} -> + {:noreply, put_flash(socket, :error, "Trigger failed: #{inspect(reason)}")} + end + + %{module: module, fields: fields} -> + input = build_input(fields, params["fields"] || %{}) + + case Durable.start(module, input) do + {:ok, workflow_id} -> + {:noreply, + socket + |> assign(open_modal: nil) + |> put_flash(:info, "Workflow started.") + |> push_navigate(to: 
~p"/executions/#{workflow_id}")} + + {:error, reason} -> + {:noreply, put_flash(socket, :error, "Start failed: #{inspect(reason)}")} + end + end + end + + defp build_input(fields, submitted) do + Enum.reduce(fields, %{}, fn field, acc -> + name = field[:name] + raw = Map.get(submitted, name, field[:default]) + Map.put(acc, name, coerce(field[:type], raw)) + end) + end + + defp coerce("number", v) when is_number(v), do: v + defp coerce("number", v) when is_binary(v) do + case Float.parse(v) do + {n, _} -> n + :error -> 0 + end + end + + defp coerce("checkbox", "true"), do: true + defp coerce("checkbox", true), do: true + defp coerce("checkbox", _), do: false + + defp coerce(_, v), do: v + + @impl true + def render(assigns) do + ~H""" + +
+

Workflow Showcase

+

+ Each card runs a workflow that demonstrates a specific Durable feature. Click Run demo + to trigger one with simulated input. Crons fire automatically — open <.link navigate={~p"/schedules"} class="link link-primary">/schedules + to watch them tick. +

+
+ +
+
+
+

+ <.icon :if={wf[:kind] == :cron} name="hero-clock" class="size-4 text-accent" /> + {wf.title} +

+

{wf.description}

+
+ + {pill} + +
+
+ +
+
+
+
+ + +
+ """ + end +end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/pending_events_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/pending_events_live.ex new file mode 100644 index 0000000..32736d1 --- /dev/null +++ b/examples/phoenix_demo/lib/phoenix_demo_web/live/pending_events_live.ex @@ -0,0 +1,195 @@ +defmodule PhoenixDemoWeb.PendingEventsLive do + @moduledoc """ + Sends external events to workflows blocked on `wait_for_event`. For + recognised event names (e.g. `webhook_received`) the page offers preset + payload buttons; everything else gets a free-form payload editor. + """ + use PhoenixDemoWeb, :live_view + + alias Durable.Wait + alias Durable.Storage.Schemas.WorkflowExecution + alias Durable.Config + + import Ecto.Query + + @presets %{ + "webhook_received" => [ + %{label: "Send: settled", payload: %{"status" => "settled", "amount" => 99.99}}, + %{label: "Send: failed", payload: %{"status" => "failed", "code" => "card_declined"}}, + %{label: "Send: timeout marker", payload: %{"status" => "timeout"}} + ] + } + + @impl true + def mount(_params, _session, socket) do + if connected?(socket) do + Phoenix.PubSub.subscribe(PhoenixDemo.PubSub, "workflows") + :timer.send_interval(2_000, self(), :refresh) + end + + {:ok, + assign(socket, + page_title: "Pending Events", + active_nav: :pending_events, + pending: load_pending(), + payloads: %{} + )} + end + + @impl true + def handle_info(:refresh, socket) do + {:noreply, assign(socket, pending: load_pending())} + end + + def handle_info(_, socket), do: {:noreply, socket} + + defp load_pending do + pending = Wait.list_pending_events(limit: 100) + workflows = preload_workflows(Enum.map(pending, & &1.workflow_id)) + + Enum.map(pending, fn p -> + Map.put(p, :workflow, Map.get(workflows, p.workflow_id)) + end) + end + + defp preload_workflows([]), do: %{} + + defp preload_workflows(ids) do + config = Config.get(Durable) + repo = config.repo + + from(w in WorkflowExecution, where: w.id in ^ids, select: 
{w.id, w}) + |> repo.all() + |> Map.new() + end + + @impl true + def handle_event("update_payload", %{"_id" => id, "payload" => payload}, socket) do + {:noreply, assign(socket, payloads: Map.put(socket.assigns.payloads, id, payload))} + end + + def handle_event("send_preset", %{"id" => id, "preset" => preset_idx}, socket) do + pending = Enum.find(socket.assigns.pending, &(&1.id == id)) + + with %{} = p <- pending, + presets when is_list(presets) <- Map.get(@presets, p.event_name), + idx when is_integer(idx) <- parse_int(preset_idx), + %{payload: payload} <- Enum.at(presets, idx) do + send_event(socket, p, payload) + else + _ -> {:noreply, put_flash(socket, :error, "Preset not found")} + end + end + + def handle_event("send_custom", %{"_id" => id}, socket) do + pending = Enum.find(socket.assigns.pending, &(&1.id == id)) + + with %{} = p <- pending, + raw <- Map.get(socket.assigns.payloads, id, "{}"), + {:ok, decoded} <- Jason.decode(raw) do + send_event(socket, p, decoded) + else + {:error, %Jason.DecodeError{}} -> + {:noreply, put_flash(socket, :error, "Payload is not valid JSON")} + + _ -> + {:noreply, socket} + end + end + + defp send_event(socket, p, payload) do + case Wait.send_event(p.workflow_id, p.event_name, payload) do + :ok -> + {:noreply, + socket + |> put_flash(:info, "Sent #{p.event_name} to #{String.slice(p.workflow_id, 0, 8)}") + |> assign(pending: load_pending())} + + {:error, reason} -> + {:noreply, put_flash(socket, :error, "Failed: #{inspect(reason)}")} + end + end + + defp parse_int(s) when is_binary(s), do: String.to_integer(s) + defp parse_int(n) when is_integer(n), do: n + defp parse_int(_), do: nil + + defp short_module(nil), do: "-" + defp short_module(mod), do: mod |> String.split(".") |> List.last() + + defp format_time(nil), do: "-" + defp format_time(dt), do: Calendar.strftime(dt, "%Y-%m-%d %H:%M:%S") + + @impl true + def render(assigns) do + presets = @presets + + assigns = assign(assigns, presets: presets) + + ~H""" + +
+

Pending Events

+

+ Workflows blocked on wait_for_event. Send a payload below to resume them. +

+
+ +
+ <.icon name="hero-bolt" class="size-12 mx-auto text-base-content/30" /> +

No workflows waiting on events

+

Trigger Payment Reconciliation from the home page to populate this list.

+
+ +
+
+
+
+
+
{p.event_name}
+
+ {short_module(p.workflow && p.workflow.workflow_module)} · step {p.step_name} +
+
+ {p.wait_type} +
+ +
+ WF {String.slice(p.workflow_id, 0, 8)} + timeout {format_time(p.timeout_at)} +
+ +
+ +
+ +
+ Custom payload +
+ + +
+ +
+
+
+
+
+
+
+ """ + end +end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/pending_inputs_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/pending_inputs_live.ex new file mode 100644 index 0000000..abd3718 --- /dev/null +++ b/examples/phoenix_demo/lib/phoenix_demo_web/live/pending_inputs_live.ex @@ -0,0 +1,377 @@ +defmodule PhoenixDemoWeb.PendingInputsLive do + @moduledoc """ + Surfaces every workflow blocked on a pending input. Each card renders an + inline form whose fields come from the wait spec (form / choice / text / + approval) and submits via `Durable.Wait.provide_input/3`. + """ + use PhoenixDemoWeb, :live_view + + alias Durable.Wait + alias Durable.Storage.Schemas.WorkflowExecution + alias Durable.Config + + import Ecto.Query + + @impl true + def mount(params, _session, socket) do + if connected?(socket) do + Phoenix.PubSub.subscribe(PhoenixDemo.PubSub, "workflows") + :timer.send_interval(2_000, self(), :refresh) + end + + type_filter = normalize_type_filter(params["type"]) + + {:ok, + assign(socket, + page_title: "Pending Inputs", + active_nav: :pending_inputs, + type_filter: type_filter, + pending: load_pending(type_filter), + reasons: %{} + )} + end + + @impl true + def handle_params(params, _uri, socket) do + type_filter = normalize_type_filter(params["type"]) + {:noreply, assign(socket, type_filter: type_filter, pending: load_pending(type_filter))} + end + + @impl true + def handle_info(:refresh, socket) do + {:noreply, assign(socket, pending: load_pending(socket.assigns.type_filter))} + end + + def handle_info(_, socket), do: {:noreply, socket} + + defp normalize_type_filter(nil), do: :all + defp normalize_type_filter(""), do: :all + defp normalize_type_filter("all"), do: :all + defp normalize_type_filter("approval"), do: :approval + defp normalize_type_filter("single_choice"), do: :single_choice + defp normalize_type_filter("free_text"), do: :free_text + defp normalize_type_filter("form"), do: :form + defp normalize_type_filter(_), do: 
:all + + defp load_pending(filter) do + pending = Wait.list_pending_inputs(limit: 100) + + pending = + case filter do + :all -> pending + type -> Enum.filter(pending, &(&1.input_type == type)) + end + + workflows = preload_workflows(Enum.map(pending, & &1.workflow_id)) + + Enum.map(pending, fn p -> + Map.put(p, :workflow, Map.get(workflows, p.workflow_id)) + end) + end + + defp preload_workflows([]), do: %{} + + defp preload_workflows(ids) do + config = Config.get(Durable) + repo = config.repo + + from(w in WorkflowExecution, where: w.id in ^ids, select: {w.id, w}) + |> repo.all() + |> Map.new() + end + + @impl true + def handle_event("submit_form", %{"_id" => id, "fields" => fields}, socket) do + pending = Enum.find(socket.assigns.pending, &(&1.id == id)) + + if pending do + case Wait.provide_input(pending.workflow_id, pending.input_name, fields) do + :ok -> + {:noreply, + socket + |> put_flash(:info, "Input provided.") + |> assign(pending: load_pending(socket.assigns.type_filter))} + + {:error, reason} -> + {:noreply, put_flash(socket, :error, "Failed: #{inspect(reason)}")} + end + else + {:noreply, socket} + end + end + + def handle_event("submit_choice", %{"_id" => id, "fields" => %{"value" => value}}, socket) do + do_provide(socket, id, value) + end + + def handle_event("submit_text", %{"_id" => id, "fields" => %{"text" => text}}, socket) do + do_provide(socket, id, text) + end + + def handle_event("approve", %{"id" => id}, socket) do + response = %{ + "approved" => true, + "approved_by" => "demo-user", + "approved_at" => DateTime.utc_now() |> DateTime.to_iso8601() + } + + do_provide(socket, id, response) + end + + def handle_event("reject", %{"_id" => id}, socket) do + reason = Map.get(socket.assigns.reasons, id, "") + + response = %{ + "approved" => false, + "reason" => reason, + "rejected_by" => "demo-user", + "rejected_at" => DateTime.utc_now() |> DateTime.to_iso8601() + } + + do_provide(socket, id, response) + end + + def handle_event("update_reason", 
%{"_id" => id, "reason" => reason}, socket) do + {:noreply, assign(socket, reasons: Map.put(socket.assigns.reasons, id, reason))} + end + + def handle_event("update_reason", _params, socket), do: {:noreply, socket} + + defp do_provide(socket, id, response) do + pending = Enum.find(socket.assigns.pending, &(&1.id == id)) + + if pending do + case Wait.provide_input(pending.workflow_id, pending.input_name, response) do + :ok -> + {:noreply, + socket + |> put_flash(:info, "Response submitted.") + |> assign(pending: load_pending(socket.assigns.type_filter))} + + {:error, reason} -> + {:noreply, put_flash(socket, :error, "Failed: #{inspect(reason)}")} + end + else + {:noreply, socket} + end + end + + defp short_module(nil), do: "-" + defp short_module(mod), do: mod |> String.split(".") |> List.last() + + defp format_time(nil), do: "-" + defp format_time(dt), do: Calendar.strftime(dt, "%Y-%m-%d %H:%M:%S") + + defp type_label(:approval), do: "Approval" + defp type_label(:single_choice), do: "Choice" + defp type_label(:free_text), do: "Text" + defp type_label(:form), do: "Form" + defp type_label(other), do: to_string(other) + + defp normalize_fields(fields) when is_list(fields), do: fields + defp normalize_fields(_), do: [] + + @impl true + def render(assigns) do + ~H""" + +
+

Pending Inputs

+

+ Workflows blocked on a human response. Submit one to resume the workflow. +

+
+ +
+ <.link patch={~p"/pending-inputs"} role="tab" class={["tab", @type_filter == :all && "tab-active"]}>All + <.link patch={~p"/pending-inputs?type=approval"} role="tab" class={["tab", @type_filter == :approval && "tab-active"]}>Approvals + <.link patch={~p"/pending-inputs?type=form"} role="tab" class={["tab", @type_filter == :form && "tab-active"]}>Forms + <.link patch={~p"/pending-inputs?type=single_choice"} role="tab" class={["tab", @type_filter == :single_choice && "tab-active"]}>Choices + <.link patch={~p"/pending-inputs?type=free_text"} role="tab" class={["tab", @type_filter == :free_text && "tab-active"]}>Text +
+ +
+ <.icon name="hero-inbox" class="size-12 mx-auto text-base-content/30" /> +

No pending inputs

+

Trigger a workflow that waits for input to populate this list.

+
+ +
+
+
+
+
+
{p.input_name}
+
+ {short_module(p.workflow && p.workflow.workflow_module)} · step {p.step_name} +
+
+ {type_label(p.input_type)} +
+ +

{p.prompt}

+ +
+ WF {String.slice(p.workflow_id, 0, 8)} + timeout {format_time(p.timeout_at)} +
+ +
+ Metadata +
{Jason.encode!(p.metadata, pretty: true)}
+
+ +
+ + <%= case p.input_type do %> + <% :approval -> %> +
+ + +
+ + +
+
+ + <% :single_choice -> %> +
+ +
+ +
+
+ +
+
+ + <% :free_text -> %> +
+ + +
+ +
+
+ + <% :form -> %> +
+ + <.form_fields fields={normalize_fields(p.fields)} /> +
+ +
+
+ + <% _ -> %> +

Unsupported input type: {p.input_type}

+ <% end %> +
+
+
+
+ """ + end + + attr :fields, :list, required: true + + defp form_fields(assigns) do + ~H""" +
+ +
+ """ + end + + defp field_name(f), do: get_str(f, "name") + defp field_label(f), do: get_str(f, "label") || get_str(f, "name") + defp field_type(f), do: get_str(f, "type") || "text" + + defp field_required?(f) do + case get_str(f, "required") do + true -> true + "true" -> true + _ -> false + end + end + + defp field_options(f) do + case get_str(f, "options") do + list when is_list(list) -> list + _ -> [] + end + end + + defp choice_value(%{"value" => v}), do: to_string(v) + defp choice_value(%{value: v}), do: to_string(v) + defp choice_value(s) when is_binary(s), do: s + defp choice_value(s) when is_atom(s), do: Atom.to_string(s) + + defp choice_label(%{"label" => l}), do: l + defp choice_label(%{label: l}), do: l + defp choice_label(s) when is_binary(s), do: s + defp choice_label(s) when is_atom(s), do: Atom.to_string(s) + + defp get_str(map, key) when is_map(map) do + Map.get(map, key) || Map.get(map, String.to_atom(key)) + end + + defp get_str(_, _), do: nil +end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/schedules_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/schedules_live.ex new file mode 100644 index 0000000..c75ec05 --- /dev/null +++ b/examples/phoenix_demo/lib/phoenix_demo_web/live/schedules_live.ex @@ -0,0 +1,125 @@ +defmodule PhoenixDemoWeb.SchedulesLive do + @moduledoc """ + Lists registered scheduled workflows. Supports Run-now, Enable, and + Disable actions per row. 
+ """ + use PhoenixDemoWeb, :live_view + + @impl true + def mount(_params, _session, socket) do + if connected?(socket) do + :timer.send_interval(5_000, self(), :refresh) + end + + {:ok, + assign(socket, + page_title: "Schedules", + active_nav: :schedules, + schedules: Durable.list_schedules(limit: 100) + )} + end + + @impl true + def handle_info(:refresh, socket) do + {:noreply, assign(socket, schedules: Durable.list_schedules(limit: 100))} + end + + @impl true + def handle_event("trigger", %{"name" => name}, socket) do + case Durable.trigger_schedule(name) do + {:ok, workflow_id} -> + {:noreply, + socket + |> put_flash(:info, "Triggered: workflow #{String.slice(workflow_id, 0, 8)}") + |> assign(schedules: Durable.list_schedules(limit: 100))} + + {:error, reason} -> + {:noreply, put_flash(socket, :error, "Trigger failed: #{inspect(reason)}")} + end + end + + def handle_event("toggle", %{"name" => name, "enabled" => "true"}, socket) do + case Durable.disable_schedule(name) do + {:ok, _} -> {:noreply, assign(socket, schedules: Durable.list_schedules(limit: 100))} + err -> {:noreply, put_flash(socket, :error, "Disable failed: #{inspect(err)}")} + end + end + + def handle_event("toggle", %{"name" => name}, socket) do + case Durable.enable_schedule(name) do + {:ok, _} -> {:noreply, assign(socket, schedules: Durable.list_schedules(limit: 100))} + err -> {:noreply, put_flash(socket, :error, "Enable failed: #{inspect(err)}")} + end + end + + defp short_module(nil), do: "-" + defp short_module(mod), do: mod |> to_string() |> String.split(".") |> List.last() + + defp format_time(nil), do: "—" + defp format_time(dt), do: Calendar.strftime(dt, "%Y-%m-%d %H:%M:%S") + + @impl true + def render(assigns) do + ~H""" + +
+

Schedules

+

+ Cron-driven workflows registered via @schedule. The scheduler polls every 5 seconds, so the demo's * * * * * entry fires within a minute of boot. +

+
+ +
+ <.icon name="hero-clock" class="size-12 mx-auto text-base-content/30" /> +

No schedules registered

+

Confirm scheduled_modules: is set in application.ex.

+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameModuleCronEnabledLast runNext runFailures
{s.name}{short_module(s.workflow_module)}{s.cron_expression} + + {if s.enabled, do: "enabled", else: "disabled"} + + {format_time(s.last_run_at)}{format_time(s.next_run_at)}{s.consecutive_failures || 0} + + +
+
+
+ """ + end +end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/workflow_detail_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/workflow_detail_live.ex index 8f07cf7..95a367d 100644 --- a/examples/phoenix_demo/lib/phoenix_demo_web/live/workflow_detail_live.ex +++ b/examples/phoenix_demo/lib/phoenix_demo_web/live/workflow_detail_live.ex @@ -1,7 +1,7 @@ defmodule PhoenixDemoWeb.WorkflowDetailLive do @moduledoc """ - LiveView for viewing detailed workflow execution information. - Shows the workflow status, context, and step execution history. + Detailed view for a single workflow execution: status header, input, + context, error, child workflows, and step timeline. """ use PhoenixDemoWeb, :live_view @@ -15,7 +15,7 @@ defmodule PhoenixDemoWeb.WorkflowDetailLive do def mount(%{"id" => id}, _session, socket) do if connected?(socket) do Phoenix.PubSub.subscribe(PhoenixDemo.PubSub, "workflows") - :timer.send_interval(2000, self(), :refresh) + :timer.send_interval(2_000, self(), :refresh) end case get_workflow(id) do @@ -23,38 +23,46 @@ defmodule PhoenixDemoWeb.WorkflowDetailLive do {:ok, socket |> put_flash(:error, "Workflow not found") - |> redirect(to: ~p"/workflows")} + |> redirect(to: ~p"/executions")} workflow -> {:ok, assign(socket, page_title: "Workflow Details", + active_nav: :executions, workflow: workflow, - steps: get_steps(id) + steps: get_steps(id), + children: Durable.list_children(id) )} end end @impl true - def handle_info(:refresh, socket) do - workflow = get_workflow(socket.assigns.workflow.id) - steps = get_steps(socket.assigns.workflow.id) - {:noreply, assign(socket, workflow: workflow, steps: steps)} - end + def handle_info(:refresh, socket), do: refresh(socket) + def handle_info({:workflow_completed, _, _}, socket), do: refresh(socket) + def handle_info({:workflow_rejected, _, _}, socket), do: refresh(socket) + def handle_info(_, socket), do: {:noreply, socket} - def handle_info({:workflow_completed, id, _report}, socket) do - 
if id == socket.assigns.workflow.id do - {:noreply, assign(socket, workflow: get_workflow(id), steps: get_steps(id))} - else - {:noreply, socket} - end + defp refresh(socket) do + id = socket.assigns.workflow.id + workflow = get_workflow(id) + steps = get_steps(id) + children = Durable.list_children(id) + + {:noreply, assign(socket, workflow: workflow, steps: steps, children: children)} end - def handle_info({:workflow_rejected, id, _result}, socket) do - if id == socket.assigns.workflow.id do - {:noreply, assign(socket, workflow: get_workflow(id), steps: get_steps(id))} - else - {:noreply, socket} + @impl true + def handle_event("cancel_workflow", _params, socket) do + case Durable.cancel(socket.assigns.workflow.id, "Cancelled from UI") do + :ok -> + {:noreply, + socket + |> put_flash(:info, "Workflow cancelled (parent only — children continue).") + |> assign(workflow: get_workflow(socket.assigns.workflow.id))} + + {:error, reason} -> + {:noreply, put_flash(socket, :error, "Cancel failed: #{inspect(reason)}")} end end @@ -100,162 +108,162 @@ defmodule PhoenixDemoWeb.WorkflowDetailLive do end end - defp format_time(nil), do: "-" + defp short_module(nil), do: "-" + defp short_module(mod), do: mod |> to_string() |> String.split(".") |> List.last() - defp format_time(datetime) do - Calendar.strftime(datetime, "%Y-%m-%d %H:%M:%S") - end + defp format_time(nil), do: "-" + defp format_time(dt), do: Calendar.strftime(dt, "%Y-%m-%d %H:%M:%S") @impl true def render(assigns) do ~H""" - <.header> - Workflow Details - <:subtitle> - {@workflow.id} - - <:actions> - <.button navigate={~p"/workflows"}> - <.icon name="hero-arrow-left" class="size-4 mr-1" /> Back to Dashboard - - - - -
- -
-
-
-

Status

- - {@workflow.status} - -
+ +
+
+

{@workflow.workflow_name}

+ {@workflow.id} +
+
+ <.link navigate={~p"/executions"} class="btn btn-sm btn-ghost"> + <.icon name="hero-arrow-left" class="size-4" /> Executions + + +
+
-
-
-
Workflow
-
{@workflow.workflow_name}
-
-
-
Current Step
-
{@workflow.current_step || "-"}
+
+
+
+
+

Status

+ {@workflow.status}
-
-
Queue
-
{@workflow.queue}
+ +
+
+
Module
+
{short_module(@workflow.workflow_module)}
+
+
+
Current step
+
{@workflow.current_step || "-"}
+
+
+
Queue
+
{@workflow.queue}
+
+
+
Priority
+
{@workflow.priority}
+
-
-
Priority
-
{@workflow.priority}
+ +
+
Created: {format_time(@workflow.inserted_at)}
+
Started: {format_time(@workflow.started_at)}
+
Completed: {format_time(@workflow.completed_at)}
+
Scheduled: {format_time(@workflow.scheduled_at)}
+
-
-
-
Created
-
{format_time(@workflow.inserted_at)}
-
-
-
Started
-
{format_time(@workflow.started_at)}
-
-
-
Completed
-
{format_time(@workflow.completed_at)}
-
-
-
Scheduled
-
{format_time(@workflow.scheduled_at)}
+
+
+

Child workflows

+
    +
  • +
    + {c.status} + {c.workflow_name} + {String.slice(c.id, 0, 8)} +
    + <.link navigate={~p"/executions/#{c.id}"} class="btn btn-xs btn-ghost">View +
  • +
+
+
+ +
+
+
+

Input

+
{Jason.encode!(@workflow.input || %{}, pretty: true)}
-
- <.link navigate={~p"/approvals"} class="btn btn-primary btn-sm"> - <.icon name="hero-hand-raised" class="size-4 mr-1" /> Go to Approvals - +
+
+

Context

+
{Jason.encode!(@workflow.context || %{}, pretty: true)}
+
-
- - -
-
-

Input

-
{Jason.encode!(@workflow.input || %{}, pretty: true)}
-
-
- - -
-
-

Context

-
{Jason.encode!(@workflow.context || %{}, pretty: true)}
-
-
- - -
-
-

- <.icon name="hero-exclamation-triangle" class="size-5" /> Error -

-
{Jason.encode!(@workflow.error, pretty: true)}
-
-
- - -
-
-

Step History

-
- No steps executed yet +
+
+

+ <.icon name="hero-exclamation-triangle" class="size-4" /> Error +

+
{Jason.encode!(@workflow.error, pretty: true)}
+
-
    -
  • -
    0} class={step.status == :completed && "bg-success"} /> -
    - {format_time(step.started_at)} -
    -
    - <.icon - name={step_icon(step.status)} - class={[ - "size-5", - step.status == :completed && "text-success", - step.status == :failed && "text-error", - step.status == :running && "text-info animate-spin" - ]} - /> -
    -
    -
    - {step.step_name} - - {step.status} - +
    +
    +

    Step history

    + +
    + No steps executed yet +
    + +
      +
    • +
      0} class={step.status == :completed && "bg-success"} /> +
      + {format_time(step.started_at)}
      -
      - Type: {step.step_type} | Attempt: {step.attempt} - | Duration: {step.duration_ms}ms +
      + <.icon + name={step_icon(step.status)} + class={[ + "size-5", + step.status == :completed && "text-success", + step.status == :failed && "text-error", + step.status == :running && "text-info motion-safe:animate-spin" + ]} + />
      -
      - View Output -
      {Jason.encode!(step.output, pretty: true)}
      -
      -
      - View Error -
      {Jason.encode!(step.error, pretty: true)}
      -
      -
      -
      -
    • -
    +
    +
    + {step.step_name} + {step.status} +
    +
    + Type: {step.step_type} · attempt {step.attempt} · {step.duration_ms}ms +
    +
    + Output +
    {Jason.encode!(step.output, pretty: true)}
    +
    +
    + Error +
    {Jason.encode!(step.error, pretty: true)}
    +
    +
    +
    +
  • +
+
-
+ """ end end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/live/workflow_live.ex b/examples/phoenix_demo/lib/phoenix_demo_web/live/workflow_live.ex deleted file mode 100644 index e3c984f..0000000 --- a/examples/phoenix_demo/lib/phoenix_demo_web/live/workflow_live.ex +++ /dev/null @@ -1,154 +0,0 @@ -defmodule PhoenixDemoWeb.WorkflowLive do - @moduledoc """ - LiveView dashboard for monitoring workflow executions. - Shows all workflows with real-time status updates via PubSub. - """ - - use PhoenixDemoWeb, :live_view - - alias Durable.Config - alias Durable.Storage.Schemas.WorkflowExecution - - import Ecto.Query - - @impl true - def mount(_params, _session, socket) do - if connected?(socket) do - Phoenix.PubSub.subscribe(PhoenixDemo.PubSub, "workflows") - # Poll for updates every 2 seconds - :timer.send_interval(2000, self(), :refresh) - end - - {:ok, assign(socket, page_title: "Workflow Dashboard", workflows: list_workflows())} - end - - @impl true - def handle_info(:refresh, socket) do - {:noreply, assign(socket, workflows: list_workflows())} - end - - def handle_info({:workflow_completed, _id, _report}, socket) do - {:noreply, assign(socket, workflows: list_workflows())} - end - - def handle_info({:workflow_rejected, _id, _result}, socket) do - {:noreply, assign(socket, workflows: list_workflows())} - end - - defp list_workflows do - config = Config.get(Durable) - repo = config.repo - - repo.all( - from(w in WorkflowExecution, - order_by: [desc: w.inserted_at], - limit: 50 - ) - ) - end - - defp status_badge(status) do - case status do - :pending -> "badge badge-warning" - :running -> "badge badge-info" - :waiting -> "badge badge-secondary" - :completed -> "badge badge-success" - :failed -> "badge badge-error" - :cancelled -> "badge badge-ghost" - _ -> "badge" - end - end - - defp format_time(nil), do: "-" - - defp format_time(datetime) do - Calendar.strftime(datetime, "%Y-%m-%d %H:%M:%S") - end - - @impl true - def render(assigns) do - ~H""" - <.header> 
- Workflow Dashboard - <:subtitle>Monitor and manage workflow executions - <:actions> - <.button navigate={~p"/workflows/new"} variant="primary"> - <.icon name="hero-plus" class="size-4 mr-1" /> New Workflow - - - - -
-
-
-
Total
-
{length(@workflows)}
-
-
-
Running
-
{Enum.count(@workflows, &(&1.status == :running))}
-
-
-
Waiting
-
- {Enum.count(@workflows, &(&1.status == :waiting))} -
-
-
-
Completed
-
- {Enum.count(@workflows, &(&1.status == :completed))} -
-
-
- -
- - - - - - - - - - - - - - - - - - - - - - - - -
IDWorkflowStatusCurrent StepCreatedActions
- {String.slice(workflow.id, 0, 8)}... - {workflow.workflow_name} - - {workflow.status} - - {workflow.current_step || "-"}{format_time(workflow.inserted_at)} - <.link navigate={~p"/workflows/#{workflow.id}"} class="btn btn-ghost btn-xs"> - View - - <.link - :if={workflow.status == :waiting} - navigate={~p"/approvals"} - class="btn btn-primary btn-xs" - > - Review - -
- No workflows yet. Create one to get started! -
-
-
- """ - end -end diff --git a/examples/phoenix_demo/lib/phoenix_demo_web/router.ex b/examples/phoenix_demo/lib/phoenix_demo_web/router.ex index 9270cbf..114bbbf 100644 --- a/examples/phoenix_demo/lib/phoenix_demo_web/router.ex +++ b/examples/phoenix_demo/lib/phoenix_demo_web/router.ex @@ -17,20 +17,19 @@ defmodule PhoenixDemoWeb.Router do scope "/", PhoenixDemoWeb do pipe_through :browser - # Redirect home to workflows dashboard - get "/", PageController, :home + live "/", HomeLive, :index + live "/executions", ExecutionsLive, :index + live "/executions/:id", WorkflowDetailLive, :show + live "/pending-inputs", PendingInputsLive, :index + live "/pending-events", PendingEventsLive, :index + live "/schedules", SchedulesLive, :index - # Workflow routes - live "/workflows", WorkflowLive, :index - live "/workflows/new", DocumentLive, :new + # Legacy redirects so older bookmarks keep working. + live "/workflows", ExecutionsLive, :index live "/workflows/:id", WorkflowDetailLive, :show - - # Approval routes - live "/approvals", ApprovalLive, :index end - # Other scopes may use custom stacks. - # scope "/api", PhoenixDemoWeb do - # pipe_through :api - # end + # Durable Dashboard — one line, mounts at /dashboard with its own + # pipelines (asset routes skip CSRF for cross-origin script-tag fetches). + use DurableDashboard.Router, mount: "/dashboard", durable: Durable end diff --git a/examples/phoenix_demo/mix.exs b/examples/phoenix_demo/mix.exs index 6d1c3e1..421b920 100644 --- a/examples/phoenix_demo/mix.exs +++ b/examples/phoenix_demo/mix.exs @@ -63,7 +63,8 @@ defmodule PhoenixDemo.MixProject do {:dns_cluster, "~> 0.2.0"}, {:bandit, "~> 1.5"}, # Durable workflow engine - {:durable, path: "../.."} + {:durable, path: "../../durable"}, + {:durable_dashboard, path: "../../durable_dashboard"} ] end @@ -75,15 +76,27 @@ defmodule PhoenixDemo.MixProject do # See the documentation for `Mix` for more info on aliases. 
defp aliases do [ - setup: ["deps.get", "ecto.setup", "assets.setup", "assets.build"], + setup: [ + "deps.get", + "cmd --cd ../../durable_dashboard/assets pnpm install", + "ecto.setup", + "assets.setup", + "assets.build" + ], "ecto.setup": ["ecto.create", "ecto.migrate", "run priv/repo/seeds.exs"], "ecto.reset": ["ecto.drop", "ecto.setup"], test: ["ecto.create --quiet", "ecto.migrate --quiet", "test"], "assets.setup": ["tailwind.install --if-missing", "esbuild.install --if-missing"], - "assets.build": ["compile", "tailwind phoenix_demo", "esbuild phoenix_demo"], + "assets.build": [ + "compile", + "tailwind phoenix_demo", + "esbuild phoenix_demo", + "cmd --cd ../../durable_dashboard/assets pnpm build" + ], "assets.deploy": [ "tailwind phoenix_demo --minify", "esbuild phoenix_demo --minify", + "cmd --cd ../../durable_dashboard/assets pnpm build", "phx.digest" ], precommit: ["compile --warnings-as-errors", "deps.unlock --unused", "format", "test"] diff --git a/examples/phoenix_demo/priv/repo/migrations/20260413000000_upgrade_durable.exs b/examples/phoenix_demo/priv/repo/migrations/20260413000000_upgrade_durable.exs new file mode 100644 index 0000000..3a407fb --- /dev/null +++ b/examples/phoenix_demo/priv/repo/migrations/20260413000000_upgrade_durable.exs @@ -0,0 +1,17 @@ +defmodule PhoenixDemo.Repo.Migrations.UpgradeDurable do + @moduledoc """ + Pulls in any new Durable library migrations. + + `Durable.Migration.up/1` tracks its own applied-versions table inside the + `durable` schema, so calling it again is idempotent — only migrations that + haven't been applied yet will run. + + As of 2026-04-13 this picks up `V20260413000000AddSchedulerResilience`, + which adds the `last_error`, `last_error_at`, `consecutive_failures`, and + `auto_disabled_at` columns on `scheduled_workflows` (audit fix L-1). 
+ """ + use Ecto.Migration + + def up, do: Durable.Migration.up() + def down, do: Durable.Migration.down() +end diff --git a/lib/durable/wait/timeout_worker.ex b/lib/durable/wait/timeout_worker.ex deleted file mode 100644 index 96fe41c..0000000 --- a/lib/durable/wait/timeout_worker.ex +++ /dev/null @@ -1,277 +0,0 @@ -defmodule Durable.Wait.TimeoutWorker do - @moduledoc """ - Background worker that enforces timeouts for pending inputs and events. - - Periodically checks for timed-out waits and either: - - Resumes the workflow with the timeout_value (if on_timeout: :resume) - - Fails the workflow (if on_timeout: :fail) - """ - - use GenServer - - alias Durable.Repo - alias Durable.Storage.Schemas.{PendingEvent, PendingInput, WaitGroup} - - import Ecto.Query - - require Logger - - @default_interval 60_000 - - # ============================================================================ - # Client API - # ============================================================================ - - @doc """ - Starts the timeout worker. - - ## Options - - - `:config` - The Durable config (required) - - `:interval` - Check interval in milliseconds (default: 60_000) - """ - def start_link(opts) do - config = Keyword.fetch!(opts, :config) - interval = Keyword.get(opts, :interval, @default_interval) - - GenServer.start_link( - __MODULE__, - %{config: config, interval: interval}, - name: worker_name(config.name) - ) - end - - @doc """ - Returns the process name for a given Durable instance. - """ - def worker_name(durable_name) do - Module.concat([durable_name, Wait, TimeoutWorker]) - end - - @doc """ - Manually triggers a timeout check. 
- """ - def check_timeouts(durable_name \\ Durable) do - GenServer.cast(worker_name(durable_name), :check_timeouts) - end - - # ============================================================================ - # GenServer Callbacks - # ============================================================================ - - @impl true - def init(state) do - schedule_check(state.interval) - {:ok, state} - end - - @impl true - def handle_info(:check_timeouts, state) do - do_check_timeouts(state.config) - schedule_check(state.interval) - {:noreply, state} - end - - @impl true - def handle_cast(:check_timeouts, state) do - do_check_timeouts(state.config) - {:noreply, state} - end - - # ============================================================================ - # Private Functions - # ============================================================================ - - defp schedule_check(interval) do - Process.send_after(self(), :check_timeouts, interval) - end - - defp do_check_timeouts(config) do - now = DateTime.utc_now() - - process_timed_out_inputs(config, now) - process_timed_out_events(config, now) - process_timed_out_wait_groups(config, now) - end - - defp process_timed_out_inputs(config, now) do - # Find pending inputs that have timed out - query = - from(p in PendingInput, - where: - p.status == :pending and - not is_nil(p.timeout_at) and - p.timeout_at <= ^now, - preload: [:workflow] - ) - - timed_out = Repo.all(config, query) - - Enum.each(timed_out, fn pending_input -> - handle_input_timeout(config, pending_input) - end) - end - - defp handle_input_timeout(config, pending_input) do - # Mark as timed out - {:ok, _} = - pending_input - |> PendingInput.timeout_changeset() - |> Repo.update(config) - - # Determine how to handle - on_timeout = pending_input.on_timeout || :resume - - case on_timeout do - :resume -> - # Resume workflow with timeout value - timeout_value = deserialize_timeout_value(pending_input.timeout_value) - - resume_data = %{ - pending_input.input_name => 
timeout_value, - :__timeout__ => true - } - - Durable.Executor.resume_workflow( - pending_input.workflow_id, - resume_data, - durable: config.name - ) - - :fail -> - # Cancel the workflow with timeout error - reason = "Timeout waiting for input: #{pending_input.input_name}" - Durable.Executor.cancel_workflow(pending_input.workflow_id, reason, durable: config.name) - end - - Logger.info( - "Timeout handled for pending input #{pending_input.input_name} " <> - "in workflow #{pending_input.workflow_id} (#{on_timeout})" - ) - end - - defp process_timed_out_events(config, now) do - # Find pending events that have timed out (only single events, not in groups) - query = - from(p in PendingEvent, - where: - p.status == :pending and - p.wait_type == :single and - is_nil(p.wait_group_id) and - not is_nil(p.timeout_at) and - p.timeout_at <= ^now, - preload: [:workflow] - ) - - timed_out = Repo.all(config, query) - - Enum.each(timed_out, fn pending_event -> - handle_event_timeout(config, pending_event) - end) - end - - defp handle_event_timeout(config, pending_event) do - # Mark as timed out - {:ok, _} = - pending_event - |> PendingEvent.timeout_changeset() - |> Repo.update(config) - - # Resume workflow with timeout value - timeout_value = deserialize_timeout_value(pending_event.timeout_value) - - resume_data = %{ - pending_event.event_name => timeout_value, - :__timeout__ => true - } - - Durable.Executor.resume_workflow( - pending_event.workflow_id, - resume_data, - durable: config.name - ) - - Logger.info( - "Timeout handled for pending event #{pending_event.event_name} " <> - "in workflow #{pending_event.workflow_id}" - ) - end - - defp process_timed_out_wait_groups(config, now) do - # Find wait groups that have timed out - query = - from(w in WaitGroup, - where: - w.status == :pending and - not is_nil(w.timeout_at) and - w.timeout_at <= ^now, - preload: [:workflow] - ) - - timed_out = Repo.all(config, query) - - Enum.each(timed_out, fn wait_group -> - 
handle_wait_group_timeout(config, wait_group) - end) - end - - defp handle_wait_group_timeout(config, wait_group) do - # Mark wait group as timed out - {:ok, _} = - wait_group - |> WaitGroup.timeout_changeset() - |> Repo.update(config) - - # Mark all related pending events as timed out - query = - from(p in PendingEvent, - where: p.wait_group_id == ^wait_group.id and p.status == :pending - ) - - Repo.update_all(config, query, set: [status: :timeout, completed_at: DateTime.utc_now()]) - - # Resume workflow with timeout value and partial results - timeout_value = deserialize_timeout_value(wait_group.timeout_value) - - resume_data = - case wait_group.wait_type do - :any -> - # For wait_for_any, return {:timeout, nil} or timeout_value - %{ - :__wait_group_result__ => timeout_value || {:timeout, nil}, - :__timeout__ => true - } - - :all -> - # For wait_for_all, return {:timeout, partial_results} - partial = wait_group.received_events || %{} - - %{ - :__wait_group_result__ => timeout_value || {:timeout, partial}, - :__timeout__ => true - } - end - - Durable.Executor.resume_workflow( - wait_group.workflow_id, - resume_data, - durable: config.name - ) - - Logger.info( - "Timeout handled for wait group #{wait_group.id} " <> - "(#{wait_group.wait_type}) in workflow #{wait_group.workflow_id}" - ) - end - - defp deserialize_timeout_value(nil), do: nil - - defp deserialize_timeout_value(%{"__atom__" => atom_string}) do - String.to_existing_atom(atom_string) - rescue - ArgumentError -> String.to_atom(atom_string) - end - - defp deserialize_timeout_value(%{"__value__" => value}), do: value - defp deserialize_timeout_value(value) when is_map(value), do: value -end diff --git a/mix.exs b/mix.exs index ab96da9..ca72dbe 100644 --- a/mix.exs +++ b/mix.exs @@ -1,103 +1,38 @@ -defmodule Durable.MixProject do +defmodule DurableWorkspace.MixProject do use Mix.Project - @version "0.0.0-alpha" - @source_url "https://github.com/wavezync/durable" - @homepage_url "https://durable.wavezync.com" 
+ @apps ~w(durable durable_dashboard) def project do [ - app: :durable, - version: @version, + app: :durable_workspace, + version: "0.0.0", elixir: "~> 1.15", - elixirc_paths: elixirc_paths(Mix.env()), - start_permanent: Mix.env() == :prod, - aliases: aliases(), - deps: deps(), - name: "Durable", - homepage_url: @homepage_url, - description: "A durable, resumable workflow engine for Elixir", - source_url: @source_url, - docs: docs(), - package: package() - ] - end - - def cli do - [ - preferred_envs: [precommit: :test] - ] - end - - def application do - [ - extra_applications: [:logger], - mod: {Durable.Application, []} - ] - end - - defp elixirc_paths(:test), do: ["lib", "test/support"] - defp elixirc_paths(_), do: ["lib"] - - defp deps do - [ - # Core - {:ecto_sql, "~> 3.12"}, - {:postgrex, "~> 0.19"}, - {:jason, "~> 1.4"}, - {:telemetry, "~> 1.3"}, - {:nimble_options, "~> 1.1"}, - {:crontab, "~> 1.1"}, - {:igniter, "~> 0.6", optional: true}, - - # Dev/Test - {:ex_doc, "~> 0.34", only: :dev, runtime: false}, - {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, - {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false} - ] - end - - defp aliases do - [ - setup: ["deps.get", "ecto.setup"], - "ecto.setup": ["ecto.create", "ecto.migrate"], - "ecto.reset": ["ecto.drop", "ecto.setup"], - test: ["ecto.create --quiet", "ecto.migrate --quiet", "test"], - precommit: ["format", "compile --warnings-as-errors", "credo --strict", "test"] - ] - end - - defp docs do - [ - main: "readme", - source_url: @source_url, - source_ref: "v#{@version}", - extras: [ - "README.md", - "guides/ai_workflows.md", - "guides/branching.md", - "guides/compensations.md", - "guides/orchestration.md", - "guides/parallel.md", - "guides/waiting.md" + deps: [ + {:durable, path: "durable"}, + {:durable_dashboard, path: "durable_dashboard"} ], - groups_for_modules: [ - "Mix Tasks": [ - Mix.Tasks.Durable.Status, - Mix.Tasks.Durable.List, - Mix.Tasks.Durable.Run, - Mix.Tasks.Durable.Cancel, - 
Mix.Tasks.Durable.Cleanup - ] + aliases: [ + setup: cmd("deps.get"), + compile: cmd("compile"), + test: cmd("test"), + format: cmd("format"), + precommit: cmd("precommit") ] ] end - defp package do - [ - licenses: ["MIT"], - links: %{"GitHub" => @source_url}, - files: ~w(lib priv .formatter.exs mix.exs README.md LICENSE) - ] + defp cmd(command) do + for app <- @apps do + fn args -> + {_, code} = + System.cmd("mix", [command | args], + into: IO.binstream(:stdio, :line), + cd: app + ) + + if code > 0, do: System.at_exit(fn _ -> exit({:shutdown, 1}) end) + end + end end end diff --git a/test/support/data_case.ex b/test/support/data_case.ex deleted file mode 100644 index 2ea2421..0000000 --- a/test/support/data_case.ex +++ /dev/null @@ -1,67 +0,0 @@ -defmodule Durable.DataCase do - @moduledoc """ - This module defines the setup for tests requiring access to the - application's data layer. - - You may define functions here to be used as helpers in your tests. - """ - - use ExUnit.CaseTemplate - - alias Ecto.Adapters.SQL.Sandbox - - using do - quote do - alias Durable.TestRepo - import Ecto - import Ecto.Changeset - import Ecto.Query - import Durable.DataCase - end - end - - setup tags do - Durable.DataCase.setup_sandbox(tags) - - # Start Durable with test repo and queue disabled for unit tests - start_supervised!({Durable, repo: Durable.TestRepo, queue_enabled: false}) - - :ok - end - - @doc """ - Sets up the sandbox based on the test tags. - """ - def setup_sandbox(tags) do - pid = Sandbox.start_owner!(Durable.TestRepo, shared: not tags[:async]) - on_exit(fn -> Sandbox.stop_owner(pid) end) - end - - @doc """ - A helper that polls until a condition is met or timeout. 
- - ## Examples - - assert_eventually(fn -> - {:ok, exec} = Durable.get_execution(id) - exec.status == :completed - end) - """ - def assert_eventually(fun, timeout \\ 5000, interval \\ 100) do - deadline = System.monotonic_time(:millisecond) + timeout - do_assert_eventually(fun, deadline, interval) - end - - defp do_assert_eventually(fun, deadline, interval) do - if fun.() do - true - else - if System.monotonic_time(:millisecond) < deadline do - Process.sleep(interval) - do_assert_eventually(fun, deadline, interval) - else - ExUnit.Assertions.flunk("Condition not met within timeout") - end - end - end -end