From 0b33b808e74a05e72e5e64bd1a2241f16bb58338 Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 21:58:42 -0400 Subject: [PATCH 1/9] fix(core,runtime,client): spec-conformance, security, and bug fixes Protocol/spec conformance: - Lease-expiry termination now emits final_status "error" with LEASE_EXPIRED (#74) - Enforce max_runtime_sec via a watchdog emitting TIMEOUT/"timed_out" (#89) - Release idempotency claim on failed accept so identical retries are not poisoned with DUPLICATE_KEY (#90) - Canonical idempotency fingerprint: sort JsonNode properties (#91) - Subscriber events use the subscriber session's own event_seq space (#76) - Invalid/undecodable job.submit now returns INVALID_REQUEST (#78) - Idempotent replay returns the budget captured at acceptance (#79) - add session.close/session.closed (alias session.bye) ack (#96) - add job.cancelled ack; cancel of unknown job -> JOB_NOT_FOUND, no silent drops (#95) - Unknown credential scheme decodes to UNKNOWN instead of throwing (#97) - list_jobs maps only the requested page (#83) - Lease.contains compares cost.budget numerically; add LeaseSubset validator (#77) Security: - credential_rotated is no longer fanned out to subscribers (#75) - subscribe history replay re-envelopes events with the subscriber's session id and seq, and redacts credential_rotated (#94) - rotateCredential tracks/revokes surplus minted credentials and fails loudly on empty issue rather than aliasing a revoked handle (#98) Bugs: - runJob catch blocks guard terminal emission to avoid duplicate job.error (#92) - terminal job.result/job.error fan out to subscriber sessions (#93) - bound per-job event history and evict terminal jobs after the resume window; unlink closed sessions from subscriber lists (#101) - top-level errors echo the originating request id; client correlates them so a list/subscribe error never fails an unrelated submit (#102) - handshake uses CAS and volatile heartbeat handle to avoid resurrecting a closed session or leaking a heartbeat task (#103) - start worker before scheduling watchdog; cancelled() covers all terminal states (#104) - client completes subscribe publishers when a job terminates (#105) - blocking submit is bounded by a timeout and rejects dispatch-thread reentrancy; add submitAsync (#106) - ArcpRuntime.accept inserts the session before start() to avoid zombie entries (#107) - cost.budget.* gauges are not treated as spend; guard null metric value (#108) - ResultStream rejects divergent duplicate pending chunks, tolerates identical (#111) Adds protocol/unit tests covering the above (#33). Co-authored-by: Cursor --- .../main/java/dev/arcp/client/ArcpClient.java | 167 +++++- .../java/dev/arcp/client/ResultStream.java | 9 + .../java/dev/arcp/client/ClientAuditTest.java | 180 +++++++ .../dev/arcp/client/ResultStreamTest.java | 13 + .../core/credentials/CredentialScheme.java | 17 +- .../main/java/dev/arcp/core/lease/Lease.java | 22 +- .../java/dev/arcp/core/lease/LeaseSubset.java | 92 ++++ .../dev/arcp/core/messages/JobCancelled.java | 24 + .../java/dev/arcp/core/messages/Message.java | 10 +- .../java/dev/arcp/core/messages/Messages.java | 2 + .../dev/arcp/core/messages/SessionClosed.java | 23 + .../CredentialJsonRoundTripTest.java | 18 + .../dev/arcp/core/lease/LeaseSubsetTest.java | 65 +++ .../core/messages/MessageCatalogTest.java | 2 + .../java/dev/arcp/runtime/ArcpRuntime.java | 8 +- .../credentials/CredentialBinding.java | 14 + .../runtime/idempotency/IdempotencyStore.java | 22 + .../dev/arcp/runtime/session/JobRecord.java | 82 ++- .../dev/arcp/runtime/session/SessionLoop.java | 475 +++++++++++++----- .../arcp/runtime/SessionLoopAuditTest.java | 358 +++++++++++++ 20 files changed, 1443 insertions(+), 160 deletions(-) create mode 100644 arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java create mode 100644 arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java create mode 100644 arcp-core/src/main/java/dev/arcp/core/messages/JobCancelled.java create mode 100644 arcp-core/src/main/java/dev/arcp/core/messages/SessionClosed.java create mode 100644 arcp-core/src/test/java/dev/arcp/core/lease/LeaseSubsetTest.java create mode 100644 arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java diff --git a/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java b/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java index ea4fefb..54a2869 100644 --- a/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java +++ b/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java @@ -7,6 +7,8 @@ import dev.arcp.core.auth.Auth; import dev.arcp.core.capabilities.Capabilities; import dev.arcp.core.capabilities.Feature; +import dev.arcp.core.credentials.Credential; +import dev.arcp.core.credentials.CredentialScheme; import dev.arcp.core.error.ArcpException; import dev.arcp.core.error.ErrorPayload; import dev.arcp.core.events.EventBody; @@ -20,6 +22,7 @@ import dev.arcp.core.messages.ClientInfo; import dev.arcp.core.messages.JobAccepted; import dev.arcp.core.messages.JobCancel; +import dev.arcp.core.messages.JobCancelled; import dev.arcp.core.messages.JobError; import dev.arcp.core.messages.JobEvent; import dev.arcp.core.messages.JobFilter; @@ -33,6 +36,7 @@ import dev.arcp.core.messages.Messages; import dev.arcp.core.messages.SessionAck; import dev.arcp.core.messages.SessionBye; +import dev.arcp.core.messages.SessionClosed; import dev.arcp.core.messages.SessionHello; import dev.arcp.core.messages.SessionJobs; import dev.arcp.core.messages.SessionListJobs; @@ -90,6 +94,10 @@ static EnumSet safeFeatureCopy(Set features) { private final ScheduledExecutorService scheduler; private final boolean autoAck; private final Duration ackInterval; + private final Duration submitTimeout; + // Set while the transport's inbound delivery thread is inside dispatch(); used to fail fast if a + // user completes a future on this thread and then calls blocking submit (#106). + private final ThreadLocal inDispatch = ThreadLocal.withInitial(() -> Boolean.FALSE); private final AtomicLong lastSeenSeq = new AtomicLong(-1); private final AtomicLong lastAckedSeq = new AtomicLong(-1); private final AtomicLong lastInboundMillis = new AtomicLong(System.currentTimeMillis()); @@ -121,6 +129,7 @@ private ArcpClient(Builder b) { this.requestedFeatures = safeFeatureCopy(b.features); this.autoAck = b.autoAck; this.ackInterval = b.ackInterval; + this.submitTimeout = b.submitTimeout; if (b.scheduler != null) { this.scheduler = b.scheduler; this.ownedScheduler = false; @@ -165,7 +174,45 @@ public JobHandle submit(JobSubmit submit) { return submit(submit, null); } + /** + * Blocking submit. Returns once the runtime acknowledges with {@code job.accepted} (or fails on + * rejection). Bounded by the configured submit timeout so it can never block forever (#106). + * + *

Must not be called from a dispatch/result callback (i.e. the transport inbound thread); doing + * so would deadlock because the acknowledgement is delivered by that same thread. Such a call + * fails fast with {@link IllegalStateException}. + */ public JobHandle submit(JobSubmit submit, @Nullable TraceId traceId) { + if (Boolean.TRUE.equals(inDispatch.get())) { + throw new IllegalStateException( + "submit() must not be called from an event/result callback; use submitAsync()"); + } + CompletableFuture future = submitAsync(submit, traceId); + try { + return future.get(submitTimeout.toMillis(), TimeUnit.MILLISECONDS); + } catch (TimeoutException e) { + pendingSubmits.removeIf(p -> p.outstanding().handleFuture == future); + future.completeExceptionally(e); + throw new IllegalStateException( + "submit timed out after " + submitTimeout + " awaiting job.accepted", e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IllegalStateException("submit interrupted", e); + } catch (java.util.concurrent.ExecutionException e) { + // Preserve the prior join() behavior: surface the cause (e.g. an ArcpException such as + // DuplicateKeyException) wrapped in an unchecked CompletionException so callers can inspect + // getCause(). + Throwable cause = e.getCause() != null ? e.getCause() : e; + throw new java.util.concurrent.CompletionException(cause); + } + } + + /** Non-blocking submit. Completes with the {@link JobHandle} on {@code job.accepted} (#106). */ + public CompletableFuture submitAsync(JobSubmit submit) { + return submitAsync(submit, null); + } + + public CompletableFuture submitAsync(JobSubmit submit, @Nullable TraceId traceId) { Outstanding o = new Outstanding(); MessageId requestId = MessageId.generate(); // The put-then-send pair must be atomic w.r.t. other submits so that the @@ -180,7 +227,7 @@ public JobHandle submit(JobSubmit submit, @Nullable TraceId traceId) { } finally { submitLock.unlock(); } - return o.handleFuture.join(); + return o.handleFuture; } public Page listJobs(@Nullable JobFilter filter) @@ -330,10 +377,13 @@ public void onNext(Envelope envelope) { if (seq != null) { lastSeenSeq.updateAndGet(prev -> Math.max(prev, seq)); } + inDispatch.set(Boolean.TRUE); try { dispatch(envelope); } catch (RuntimeException e) { log.warn("client dispatch error for {}: {}", envelope.type(), e.toString()); + } finally { + inDispatch.set(Boolean.FALSE); } } @@ -403,6 +453,8 @@ private void dispatch(Envelope envelope) { case JobSubscribed ignored -> { /* signal */ } + case JobCancelled cancelled -> handleCancelled(envelope, cancelled); + case SessionClosed ignored -> log.debug("session closed acknowledged by runtime"); case SessionJobs jobs -> handleListResponse(jobs); case SessionPing ping -> handlePing(ping); case SessionPong ignored -> log.debug("client ignored: {}", envelope.type()); @@ -481,10 +533,40 @@ private void handleAccepted(Envelope envelope, JobAccepted accepted) { if (head == null) { return; } + // §9.8.1: drop credentials whose scheme this client does not recognize rather than failing the + // whole acceptance (#97). Unknown schemes decode to CredentialScheme.UNKNOWN. + JobAccepted visible = withRecognizedCredentials(accepted); Outstanding o = head.outstanding(); - o.jobId = accepted.jobId(); - outstanding.put(accepted.jobId(), o); - o.handleFuture.complete(new ClientJobHandle(accepted, o)); + o.jobId = visible.jobId(); + outstanding.put(visible.jobId(), o); + o.handleFuture.complete(new ClientJobHandle(visible, o)); + } + + private static JobAccepted withRecognizedCredentials(JobAccepted accepted) { + List credentials = accepted.credentials(); + if (credentials == null || credentials.isEmpty()) { + return accepted; + } + List recognized = + credentials.stream().filter(c -> c.scheme() != CredentialScheme.UNKNOWN).toList(); + if (recognized.size() == credentials.size()) { + return accepted; + } + return new JobAccepted( + accepted.jobId(), + accepted.agent(), + accepted.lease(), + accepted.leaseConstraints(), + accepted.budget(), + recognized.isEmpty() ? null : recognized, + accepted.acceptedAt(), + accepted.traceId()); + } + + private void handleCancelled(Envelope envelope, JobCancelled cancelled) { + // §7.4 ack: the terminal job.error CANCELLED that follows completes the result/subscriber; the + // ack itself is informational. + log.debug("job {} cancellation acknowledged: {}", envelope.jobId(), cancelled.reason()); } private void handleJobEvent(Envelope envelope, JobEvent event) { @@ -509,28 +591,70 @@ private void handleResult(Envelope envelope, JobResult result) { return; } Outstanding o = outstanding.remove(jid); - if (o == null) { - return; + if (o != null) { + o.events.close(); + o.resultFuture.complete(result); } - o.events.close(); - o.resultFuture.complete(result); + // Complete the live subscriber publisher (if any) so "subscribe and iterate until complete" + // consumers see onComplete instead of blocking forever, and per-job executors are released + // (#105). + completeLiveSubscriber(jid, null); } private void handleError(Envelope envelope, JobError err) { JobId jid = envelope.jobId(); - Outstanding o = jid != null ? outstanding.remove(jid) : null; - if (o == null) { - // Top-level (unassigned) error: fail the oldest pending submit. - PendingSubmit head = pendingSubmits.pollFirst(); - if (head != null) { - ArcpException ex = ArcpException.from(ErrorPayload.of(err.code(), err.message())); - head.outstanding().handleFuture.completeExceptionally(ex); + ArcpException ex = ArcpException.from(ErrorPayload.of(err.code(), err.message())); + if (jid != null) { + // Terminal error for a known job: fail its result and complete its subscriber. A jobful error + // is never treated as a submit rejection (#102). + Outstanding o = outstanding.remove(jid); + if (o != null) { + o.events.close(); + o.resultFuture.completeExceptionally(ex); } + completeLiveSubscriber(jid, ex); return; } - o.events.close(); - ArcpException ex = ArcpException.from(ErrorPayload.of(err.code(), err.message())); - o.resultFuture.completeExceptionally(ex); + // Top-level (jobless) error: correlate to the originating request via the echoed envelope id so + // a list_jobs/subscribe error never fails an unrelated in-flight submit (#102). + MessageId correlationId = envelope.id(); + CompletableFuture listFuture = listRequests.remove(correlationId); + if (listFuture != null) { + listFuture.completeExceptionally(ex); + return; + } + PendingSubmit match = removePendingSubmit(correlationId); + if (match != null) { + match.outstanding().handleFuture.completeExceptionally(ex); + return; + } + log.warn("dropping uncorrelated top-level error {}: {}", err.code(), err.message()); + } + + private @Nullable PendingSubmit removePendingSubmit(MessageId requestId) { + for (java.util.Iterator it = pendingSubmits.iterator(); it.hasNext(); ) { + PendingSubmit p = it.next(); + if (p.requestId().equals(requestId)) { + it.remove(); + return p; + } + } + return null; + } + + private void completeLiveSubscriber(JobId jid, @Nullable Throwable error) { + SubmissionPublisher pub = liveSubscribers.remove(jid); + if (pub != null) { + if (error != null) { + pub.closeExceptionally(error); + } else { + pub.close(); + } + } + ExecutorService exec = liveExecutors.remove(jid); + if (exec != null) { + exec.shutdown(); + } } private void handleListResponse(SessionJobs jobs) { @@ -629,6 +753,7 @@ public static final class Builder { private Set features = EnumSet.allOf(Feature.class); private boolean autoAck = true; private Duration ackInterval = Duration.ofMillis(200); + private Duration submitTimeout = Duration.ofSeconds(30); private @Nullable ScheduledExecutorService scheduler; private @Nullable String resumeToken; private @Nullable Long lastEventSeq; @@ -672,6 +797,12 @@ public Builder ackInterval(Duration interval) { return this; } + /** Maximum time blocking {@link #submit(JobSubmit)} waits for {@code job.accepted} (#106). */ + public Builder submitTimeout(Duration timeout) { + this.submitTimeout = timeout; + return this; + } + public Builder scheduler(ScheduledExecutorService s) { this.scheduler = s; return this; diff --git a/arcp-client/src/main/java/dev/arcp/client/ResultStream.java b/arcp-client/src/main/java/dev/arcp/client/ResultStream.java index bc94718..9604dd3 100644 --- a/arcp-client/src/main/java/dev/arcp/client/ResultStream.java +++ b/arcp-client/src/main/java/dev/arcp/client/ResultStream.java @@ -74,6 +74,15 @@ public synchronized void accept(ResultChunkEvent chunk) throws IOException { if (chunk.chunkSeq() < nextExpected) { throw new DuplicateChunkException(chunk.chunkSeq()); } + ResultChunkEvent existing = pending.get(chunk.chunkSeq()); + if (existing != null) { + // §8.4: a duplicate of a still-pending chunk is rejected like any other duplicate. A + // byte-identical retransmission is tolerated (idempotent); a divergent copy is an error. + if (existing.equals(chunk)) { + return; + } + throw new DuplicateChunkException(chunk.chunkSeq()); + } pending.put(chunk.chunkSeq(), chunk); while (pending.containsKey(nextExpected)) { ResultChunkEvent ready = pending.remove(nextExpected); diff --git a/arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java b/arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java new file mode 100644 index 0000000..df58cf4 --- /dev/null +++ b/arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java @@ -0,0 +1,180 @@ +package dev.arcp.client; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import dev.arcp.core.error.ArcpException; +import dev.arcp.core.error.ErrorCode; +import dev.arcp.core.events.EventBody; +import dev.arcp.core.transport.MemoryTransport; +import dev.arcp.runtime.ArcpRuntime; +import dev.arcp.runtime.agent.JobOutcome; +import java.time.Duration; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Flow; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.Test; + +class ClientAuditTest { + + @Test + void subscribePublisherCompletesWhenJobTerminates() throws Exception { + CountDownLatch release = new CountDownLatch(1); + CountDownLatch started = new CountDownLatch(1); + ArcpRuntime runtime = + ArcpRuntime.builder() + .agent( + "block", + "1.0.0", + (input, ctx) -> { + started.countDown(); + release.await(); + return JobOutcome.Success.inline(input.payload()); + }) + .build(); + MemoryTransport.Pair pair = MemoryTransport.pair(); + runtime.accept(pair.runtime()); + try (ArcpClient client = ArcpClient.builder(pair.client()).build()) { + client.connect(Duration.ofSeconds(5)); + JobHandle handle = + client.submit(ArcpClient.jobSubmit("block@1.0.0", JsonNodeFactory.instance.objectNode())); + assertThat(started.await(3, TimeUnit.SECONDS)).isTrue(); + + CountDownLatch completed = new CountDownLatch(1); + client + .subscribe(handle.jobId(), SubscribeOptions.live()) + .subscribe( + new Flow.Subscriber() { + @Override + public void onSubscribe(Flow.Subscription s) { + s.request(Long.MAX_VALUE); + } + + @Override + public void onNext(EventBody item) {} + + @Override + public void onError(Throwable throwable) { + completed.countDown(); + } + + @Override + public void onComplete() { + completed.countDown(); + } + }); + + release.countDown(); + handle.result().get(5, TimeUnit.SECONDS); + // #105: the live subscriber publisher is completed when the job terminates. + assertThat(completed.await(5, TimeUnit.SECONDS)).isTrue(); + } + runtime.close(); + } + + @Test + void listJobsBadCursorThrowsInvalidRequestNotTimeout() throws Exception { + ArcpRuntime runtime = + ArcpRuntime.builder() + .agent("echo", "1.0.0", (input, ctx) -> JobOutcome.Success.inline(input.payload())) + .build(); + MemoryTransport.Pair pair = MemoryTransport.pair(); + runtime.accept(pair.runtime()); + try (ArcpClient client = ArcpClient.builder(pair.client()).build()) { + client.connect(Duration.ofSeconds(5)); + // #102: a list_jobs error is correlated to the list request, surfacing INVALID_REQUEST + // promptly rather than timing out (or failing an unrelated submit). + assertThatThrownBy(() -> client.listJobs(null, 10, "#not-base64")) + .isInstanceOfSatisfying( + ArcpException.class, e -> assertThat(e.code()).isEqualTo(ErrorCode.INVALID_REQUEST)); + } + runtime.close(); + } + + @Test + void submitAsyncCompletesWithHandle() throws Exception { + ArcpRuntime runtime = + ArcpRuntime.builder() + .agent("echo", "1.0.0", (input, ctx) -> JobOutcome.Success.inline(input.payload())) + .build(); + MemoryTransport.Pair pair = MemoryTransport.pair(); + runtime.accept(pair.runtime()); + try (ArcpClient client = ArcpClient.builder(pair.client()).build()) { + client.connect(Duration.ofSeconds(5)); + CompletableFuture future = + client.submitAsync( + ArcpClient.jobSubmit("echo@1.0.0", JsonNodeFactory.instance.objectNode())); + JobHandle handle = future.get(5, TimeUnit.SECONDS); + assertThat(handle.jobId()).isNotNull(); + assertThat(handle.result().get(5, TimeUnit.SECONDS).finalStatus()) + .isEqualTo(dev.arcp.core.messages.JobResult.SUCCESS); + } + runtime.close(); + } + + @Test + void crossSessionSubscriberObservesTermination() throws Exception { + CountDownLatch release = new CountDownLatch(1); + CountDownLatch started = new CountDownLatch(1); + ArcpRuntime runtime = + ArcpRuntime.builder() + .agent( + "block", + "1.0.0", + (input, ctx) -> { + started.countDown(); + release.await(); + return JobOutcome.Success.inline(input.payload()); + }) + .build(); + MemoryTransport.Pair submitPair = MemoryTransport.pair(); + MemoryTransport.Pair watchPair = MemoryTransport.pair(); + runtime.accept(submitPair.runtime()); + runtime.accept(watchPair.runtime()); + + try (ArcpClient submitter = ArcpClient.builder(submitPair.client()).bearer("shared").build(); + ArcpClient watcher = ArcpClient.builder(watchPair.client()).bearer("shared").build()) { + submitter.connect(Duration.ofSeconds(5)); + watcher.connect(Duration.ofSeconds(5)); + JobHandle handle = + submitter.submit( + ArcpClient.jobSubmit("block@1.0.0", JsonNodeFactory.instance.objectNode())); + assertThat(started.await(3, TimeUnit.SECONDS)).isTrue(); + + CountDownLatch watcherDone = new CountDownLatch(1); + watcher + .subscribe(handle.jobId(), SubscribeOptions.live()) + .subscribe( + new Flow.Subscriber() { + @Override + public void onSubscribe(Flow.Subscription s) { + s.request(Long.MAX_VALUE); + } + + @Override + public void onNext(EventBody item) {} + + @Override + public void onError(Throwable throwable) { + watcherDone.countDown(); + } + + @Override + public void onComplete() { + watcherDone.countDown(); + } + }); + + // Give the subscribe time to register before completion. + Thread.sleep(200); + release.countDown(); + handle.result().get(5, TimeUnit.SECONDS); + // #93 + #105: the cross-session subscriber observes the terminal message and its publisher + // completes. + assertThat(watcherDone.await(5, TimeUnit.SECONDS)).isTrue(); + } + runtime.close(); + } +} diff --git a/arcp-client/src/test/java/dev/arcp/client/ResultStreamTest.java b/arcp-client/src/test/java/dev/arcp/client/ResultStreamTest.java index 9bdf3bc..60342dd 100644 --- a/arcp-client/src/test/java/dev/arcp/client/ResultStreamTest.java +++ b/arcp-client/src/test/java/dev/arcp/client/ResultStreamTest.java @@ -57,6 +57,19 @@ void duplicateChunkRejected() throws Exception { .isInstanceOf(ResultStream.DuplicateChunkException.class); } + @Test + void duplicatePendingChunkRejectedAndIdenticalTolerated() throws Exception { + ResultId id = ResultId.of("res_pending"); + ResultStream stream = ResultStream.toMemory(id); + // chunk 1 arrives before its predecessor (still pending). + stream.accept(new ResultChunkEvent(id, 1, "world", "utf8", false)); + // identical retransmission of a still-pending chunk is tolerated (#111). + stream.accept(new ResultChunkEvent(id, 1, "world", "utf8", false)); + // a divergent copy of a still-pending chunk is rejected (#111). + assertThatThrownBy(() -> stream.accept(new ResultChunkEvent(id, 1, "WORLD", "utf8", false))) + .isInstanceOf(ResultStream.DuplicateChunkException.class); + } + @Test void rejectsWrongResultIdAndEncodingSwitches() throws Exception { ResultStream stream = ResultStream.toMemory(ResultId.of("res_expected")); diff --git a/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java b/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java index 99d8c7b..bf1501a 100644 --- a/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java +++ b/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java @@ -5,7 +5,16 @@ import java.util.Arrays; public enum CredentialScheme { - BEARER("bearer"); + BEARER("bearer"), + + /** + * §9.8.1: implementations MAY define extension schemes (e.g. {@code basic}, {@code signed_url}); + * unknown schemes MUST be ignored by clients that do not recognize them. Decoding an unrecognized + * scheme yields {@code UNKNOWN} rather than throwing, so a single extension credential cannot fail + * decode of an entire {@code job.accepted} (see #97). Consumers filter credentials whose scheme is + * {@code UNKNOWN}. + */ + UNKNOWN("unknown"); private final String wire; @@ -18,11 +27,15 @@ public String wire() { return wire; } + public boolean isBearer() { + return this == BEARER; + } + @JsonCreator public static CredentialScheme fromWire(String wire) { return Arrays.stream(values()) .filter(scheme -> scheme.wire.equals(wire)) .findFirst() - .orElseThrow(() -> new IllegalArgumentException("unknown credential scheme: " + wire)); + .orElse(UNKNOWN); } } diff --git a/arcp-core/src/main/java/dev/arcp/core/lease/Lease.java b/arcp-core/src/main/java/dev/arcp/core/lease/Lease.java index 62dd764..a86c163 100644 --- a/arcp-core/src/main/java/dev/arcp/core/lease/Lease.java +++ b/arcp-core/src/main/java/dev/arcp/core/lease/Lease.java @@ -69,11 +69,19 @@ static Lease fromJson(Map> wire) { return wire == null ? empty() : new Lease(wire); } - /** §9.4 subset check: every child pattern is covered by a parent pattern. */ + /** + * §9.4 subset check: every child capability must be covered by the parent. For pattern namespaces + * a child pattern is covered when some parent pattern covers it; for {@code cost.budget} the + * comparison is numeric — every child currency must be present in the parent and its amount must + * not exceed the parent's amount (a child cannot grant more spend than the parent holds). + */ public boolean contains(Lease child) { return child.capabilities.entrySet().stream() .allMatch( e -> { + if ("cost.budget".equals(e.getKey())) { + return budgetContains(child); + } List parent = capabilities.get(e.getKey()); return parent != null && e.getValue().stream() @@ -84,6 +92,18 @@ public boolean contains(Lease child) { }); } + private boolean budgetContains(Lease child) { + Map parentBudget = budget(); + Map childBudget = child.budget(); + for (Map.Entry entry : childBudget.entrySet()) { + BigDecimal parentAmount = parentBudget.get(entry.getKey()); + if (parentAmount == null || entry.getValue().compareTo(parentAmount) > 0) { + return false; + } + } + return true; + } + private static boolean covers(String parentPattern, String childPattern) { if (parentPattern.equals(childPattern) || "**".equals(parentPattern)) { return true; diff --git a/arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java b/arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java new file mode 100644 index 0000000..2d39502 --- /dev/null +++ b/arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java @@ -0,0 +1,92 @@ +package dev.arcp.core.lease; + +import dev.arcp.core.error.LeaseSubsetViolationException; +import java.math.BigDecimal; +import java.time.Instant; +import java.util.List; +import java.util.Map; +import org.jspecify.annotations.Nullable; + +/** + * §9.4 / §10 delegation subset enforcement. A delegated (child) lease MUST be a strict subset of its + * parent: every capability pattern covered by the parent, every {@code cost.budget} amount no + * greater than the parent's remaining amount, every {@code model.use} pattern covered, and an {@code + * expires_at} no later than the parent's. Violations are rejected with {@code + * LEASE_SUBSET_VIOLATION}. + * + *

The numeric/temporal comparisons here intentionally do not reuse the glob {@code covers()} + * logic, which cannot bound a budget amount or an expiry instant. + */ +public final class LeaseSubset { + + private LeaseSubset() {} + + /** + * Validate that {@code child} is a strict subset of {@code parent}. + * + * @param parent the parent (delegating) lease + * @param parentExpiresAt the parent lease expiry, or {@code null} if unbounded + * @param child the requested child (delegated) lease + * @param childExpiresAt the requested child expiry, or {@code null} if unbounded + * @throws LeaseSubsetViolationException if the child exceeds the parent in any dimension + */ + public static void validate( + Lease parent, + @Nullable Instant parentExpiresAt, + Lease child, + @Nullable Instant childExpiresAt) + throws LeaseSubsetViolationException { + for (Map.Entry> entry : child.capabilities().entrySet()) { + String namespace = entry.getKey(); + if ("cost.budget".equals(namespace)) { + validateBudget(parent.budget(), child.budget()); + continue; + } + List parentPatterns = parent.patterns(namespace); + if (parentPatterns.isEmpty()) { + throw new LeaseSubsetViolationException( + "delegated capability namespace not granted by parent: " + namespace); + } + // Reuse Lease.contains for single-namespace pattern coverage. + Lease childSlice = new Lease(Map.of(namespace, entry.getValue())); + if (!parent.contains(childSlice)) { + throw new LeaseSubsetViolationException( + "delegated patterns exceed parent for namespace " + namespace + ": " + entry.getValue()); + } + } + validateExpiry(parentExpiresAt, childExpiresAt); + } + + private static void validateBudget( + Map parentBudget, Map childBudget) + throws LeaseSubsetViolationException { + for (Map.Entry entry : childBudget.entrySet()) { + BigDecimal parentAmount = parentBudget.get(entry.getKey()); + if (parentAmount == null) { + throw new LeaseSubsetViolationException( + "delegated budget currency not granted by parent: " + entry.getKey()); + } + if (entry.getValue().compareTo(parentAmount) > 0) { + throw new LeaseSubsetViolationException( + "delegated budget " + + entry.getKey() + + ":" + + entry.getValue() + + " exceeds parent " + + parentAmount); + } + } + } + + private static void validateExpiry( + @Nullable Instant parentExpiresAt, @Nullable Instant childExpiresAt) + throws LeaseSubsetViolationException { + if (parentExpiresAt == null) { + return; // parent unbounded → any child expiry is a subset + } + if (childExpiresAt == null || childExpiresAt.isAfter(parentExpiresAt)) { + throw new LeaseSubsetViolationException( + "delegated expires_at " + childExpiresAt + " exceeds parent " + parentExpiresAt); + } + } +} diff --git a/arcp-core/src/main/java/dev/arcp/core/messages/JobCancelled.java b/arcp-core/src/main/java/dev/arcp/core/messages/JobCancelled.java new file mode 100644 index 0000000..3d2397e --- /dev/null +++ b/arcp-core/src/main/java/dev/arcp/core/messages/JobCancelled.java @@ -0,0 +1,24 @@ +package dev.arcp.core.messages; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.jspecify.annotations.Nullable; + +/** + * §7.4 cancel acknowledgement. The runtime sends {@code job.cancelled} (carrying the job id in the + * envelope) to acknowledge a {@code job.cancel}, followed by a terminal {@code job.error} with code + * {@code CANCELLED}. + */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public record JobCancelled(@Nullable String reason) implements Message { + @JsonCreator + public JobCancelled(@JsonProperty("reason") @Nullable String reason) { + this.reason = reason; + } + + @Override + public Type kind() { + return Type.JOB_CANCELLED; + } +} diff --git a/arcp-core/src/main/java/dev/arcp/core/messages/Message.java b/arcp-core/src/main/java/dev/arcp/core/messages/Message.java index 51d1e94..f89afe2 100644 --- a/arcp-core/src/main/java/dev/arcp/core/messages/Message.java +++ b/arcp-core/src/main/java/dev/arcp/core/messages/Message.java @@ -8,6 +8,7 @@ public sealed interface Message permits SessionHello, SessionWelcome, SessionBye, + SessionClosed, SessionPing, SessionPong, SessionAck, @@ -19,6 +20,7 @@ public sealed interface Message JobResult, JobError, JobCancel, + JobCancelled, JobSubscribe, JobSubscribed, JobUnsubscribe { @@ -28,7 +30,10 @@ public sealed interface Message enum Type { SESSION_HELLO("session.hello"), SESSION_WELCOME("session.welcome"), - SESSION_BYE("session.bye"), + // §6.7: the canonical graceful-close request is `session.close`; `session.bye` is accepted as a + // deprecated alias on decode for one release (see #96). + SESSION_BYE("session.close"), + SESSION_CLOSED("session.closed"), SESSION_PING("session.ping"), SESSION_PONG("session.pong"), SESSION_ACK("session.ack"), @@ -40,6 +45,7 @@ enum Type { JOB_RESULT("job.result"), JOB_ERROR("job.error"), JOB_CANCEL("job.cancel"), + JOB_CANCELLED("job.cancelled"), JOB_SUBSCRIBE("job.subscribe"), JOB_SUBSCRIBED("job.subscribed"), JOB_UNSUBSCRIBE("job.unsubscribe"); @@ -61,6 +67,8 @@ public String wire() { for (Type t : values()) { m.put(t.wire, t); } + // §6.7 backward-compat alias: legacy `session.bye` decodes to the graceful-close type. + m.put("session.bye", SESSION_BYE); BY_WIRE = java.util.Collections.unmodifiableMap(m); } diff --git a/arcp-core/src/main/java/dev/arcp/core/messages/Messages.java b/arcp-core/src/main/java/dev/arcp/core/messages/Messages.java index ec1d831..7d795f0 100644 --- a/arcp-core/src/main/java/dev/arcp/core/messages/Messages.java +++ b/arcp-core/src/main/java/dev/arcp/core/messages/Messages.java @@ -20,6 +20,7 @@ public static Message decode(ObjectMapper mapper, Envelope envelope) { case SESSION_HELLO -> mapper.convertValue(payload, SessionHello.class); case SESSION_WELCOME -> mapper.convertValue(payload, SessionWelcome.class); case SESSION_BYE -> mapper.convertValue(payload, SessionBye.class); + case SESSION_CLOSED -> mapper.convertValue(payload, SessionClosed.class); case SESSION_PING -> mapper.convertValue(payload, SessionPing.class); case SESSION_PONG -> mapper.convertValue(payload, SessionPong.class); case SESSION_ACK -> mapper.convertValue(payload, SessionAck.class); @@ -31,6 +32,7 @@ public static Message decode(ObjectMapper mapper, Envelope envelope) { case JOB_RESULT -> mapper.convertValue(payload, JobResult.class); case JOB_ERROR -> mapper.convertValue(payload, JobError.class); case JOB_CANCEL -> mapper.convertValue(payload, JobCancel.class); + case JOB_CANCELLED -> mapper.convertValue(payload, JobCancelled.class); case JOB_SUBSCRIBE -> mapper.convertValue(payload, JobSubscribe.class); case JOB_SUBSCRIBED -> mapper.convertValue(payload, JobSubscribed.class); case JOB_UNSUBSCRIBE -> mapper.convertValue(payload, JobUnsubscribe.class); diff --git a/arcp-core/src/main/java/dev/arcp/core/messages/SessionClosed.java b/arcp-core/src/main/java/dev/arcp/core/messages/SessionClosed.java new file mode 100644 index 0000000..023afdd --- /dev/null +++ b/arcp-core/src/main/java/dev/arcp/core/messages/SessionClosed.java @@ -0,0 +1,23 @@ +package dev.arcp.core.messages; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.jspecify.annotations.Nullable; + +/** + * §6.7 graceful-close acknowledgement. The runtime sends {@code session.closed} in response to a + * {@code session.close} request before tearing the session down. + */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public record SessionClosed(@Nullable String reason) implements Message { + @JsonCreator + public SessionClosed(@JsonProperty("reason") @Nullable String reason) { + this.reason = reason; + } + + @Override + public Type kind() { + return Type.SESSION_CLOSED; + } +} diff --git a/arcp-core/src/test/java/dev/arcp/core/credentials/CredentialJsonRoundTripTest.java b/arcp-core/src/test/java/dev/arcp/core/credentials/CredentialJsonRoundTripTest.java index 2e08492..c6dd159 100644 --- a/arcp-core/src/test/java/dev/arcp/core/credentials/CredentialJsonRoundTripTest.java +++ b/arcp-core/src/test/java/dev/arcp/core/credentials/CredentialJsonRoundTripTest.java @@ -46,4 +46,22 @@ void credentialRoundTripsAndRedactsValue() throws Exception { assertThat(ArcpMapper.shared().readValue(json, Credential.class)).isEqualTo(credential); assertThat(credential.toString()).doesNotContain("secret-token"); } + + @Test + void unknownSchemeDecodesToUnknownInsteadOfThrowing() throws Exception { + // §9.8.1: an unrecognized extension scheme must not fail decode (#97). + String json = + """ + { + "id": "cred_x", + "scheme": "signed_url", + "value": "https://signed.example.test/abc", + "endpoint": "https://api.example.test" + } + """; + Credential decoded = ArcpMapper.shared().readValue(json, Credential.class); + assertThat(decoded.scheme()).isEqualTo(CredentialScheme.UNKNOWN); + assertThat(CredentialScheme.BEARER.isBearer()).isTrue(); + assertThat(CredentialScheme.UNKNOWN.isBearer()).isFalse(); + } } diff --git a/arcp-core/src/test/java/dev/arcp/core/lease/LeaseSubsetTest.java b/arcp-core/src/test/java/dev/arcp/core/lease/LeaseSubsetTest.java new file mode 100644 index 0000000..f483d09 --- /dev/null +++ b/arcp-core/src/test/java/dev/arcp/core/lease/LeaseSubsetTest.java @@ -0,0 +1,65 @@ +package dev.arcp.core.lease; + +import static org.assertj.core.api.Assertions.assertThatCode; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import dev.arcp.core.error.LeaseSubsetViolationException; +import java.time.Instant; +import org.junit.jupiter.api.Test; + +class LeaseSubsetTest { + + @Test + void numericBudgetSubsetIsEnforced() { + Lease parent = Lease.builder().allow("cost.budget", "usd:10.00").build(); + Lease within = Lease.builder().allow("cost.budget", "usd:4.00").build(); + Lease over = Lease.builder().allow("cost.budget", "usd:11.00").build(); + org.assertj.core.api.Assertions.assertThat(parent.contains(within)).isTrue(); + org.assertj.core.api.Assertions.assertThat(parent.contains(over)).isFalse(); + } + + @Test + void validateAcceptsStrictSubset() { + Lease parent = + Lease.builder() + .allow("fs.read", "/workspace/**") + .allow("cost.budget", "usd:10") + .allow("model.use", "tier-fast/*") + .build(); + Lease child = + Lease.builder() + .allow("fs.read", "/workspace/app/**") + .allow("cost.budget", "usd:3") + .allow("model.use", "tier-fast/sonnet") + .build(); + Instant parentExpiry = Instant.parse("2026-06-01T00:00:00Z"); + Instant childExpiry = Instant.parse("2026-05-30T00:00:00Z"); + assertThatCode(() -> LeaseSubset.validate(parent, parentExpiry, child, childExpiry)) + .doesNotThrowAnyException(); + } + + @Test + void validateRejectsExceedingBudget() { + Lease parent = Lease.builder().allow("cost.budget", "usd:5").build(); + Lease child = Lease.builder().allow("cost.budget", "usd:6").build(); + assertThatThrownBy(() -> LeaseSubset.validate(parent, null, child, null)) + .isInstanceOf(LeaseSubsetViolationException.class); + } + + @Test + void validateRejectsUngrantedCapability() { + Lease parent = Lease.builder().allow("fs.read", "/workspace/**").build(); + Lease child = Lease.builder().allow("tool.call", "shell").build(); + assertThatThrownBy(() -> LeaseSubset.validate(parent, null, child, null)) + .isInstanceOf(LeaseSubsetViolationException.class); + } + + @Test + void validateRejectsExpiryBeyondParent() { + Lease lease = Lease.builder().allow("fs.read", "/workspace/**").build(); + Instant parentExpiry = Instant.parse("2026-06-01T00:00:00Z"); + Instant childExpiry = Instant.parse("2026-06-02T00:00:00Z"); + assertThatThrownBy(() -> LeaseSubset.validate(lease, parentExpiry, lease, childExpiry)) + .isInstanceOf(LeaseSubsetViolationException.class); + } +} diff --git a/arcp-core/src/test/java/dev/arcp/core/messages/MessageCatalogTest.java b/arcp-core/src/test/java/dev/arcp/core/messages/MessageCatalogTest.java index ca08548..097f276 100644 --- a/arcp-core/src/test/java/dev/arcp/core/messages/MessageCatalogTest.java +++ b/arcp-core/src/test/java/dev/arcp/core/messages/MessageCatalogTest.java @@ -78,6 +78,7 @@ void everyMessageKindRoundTripsThroughEnvelopePayload() { EnumSet.of(Feature.SUBSCRIBE, Feature.LIST_JOBS), List.of(new AgentDescriptor("echo", List.of("1.0.0", "2.0.0"), "2.0.0")))), new SessionBye("done"), + new SessionClosed("done"), new SessionPing("nonce-1", NOW), new SessionPong("nonce-1", NOW.plusMillis(5)), new SessionAck(42L), @@ -125,6 +126,7 @@ void everyMessageKindRoundTripsThroughEnvelopePayload() { null, JsonNodeFactory.instance.objectNode().put("path", "/etc/passwd")), new JobCancel("user requested"), + new JobCancelled("user requested"), new JobSubscribe(JOB_ID, 3L, true), new JobSubscribed(JOB_ID, "running", "echo@1.0.0", LEASE, null, TRACE_ID, 3L, true), new JobUnsubscribe(JOB_ID)); diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java b/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java index e78700f..9314079 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java @@ -123,11 +123,17 @@ public static Builder builder() { /** Attach a transport; the returned handle is closed on session.bye or transport close. */ public SessionLoop accept(Transport transport) { SessionLoop loop = new SessionLoop(this, transport); - loop.start(); + // Insert before start() so that if the transport completes/errors synchronously during start + // (e.g. an already-closed transport), shutdown -> removeSession finds and removes this entry + // instead of racing a later put that would leave a zombie CLOSED loop in the map (#107). // Always key by the stable pending id, not idOrPending() — the latter flips to the // real session id after handshake and would leave the map keyed by a now-orphaned // string, so removeSession would never find the entry (#23). sessions.put(loop.pendingKey(), loop); + loop.start(); + if (loop.phase() == SessionLoop.Phase.CLOSED) { + removeSession(loop); + } return loop; } diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java b/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java index 1f4ce7c..890e975 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java @@ -81,6 +81,20 @@ public void revokeAll(JobRecord record) { } } + /** + * Record then revoke a credential that was minted upstream but not attached to a job (e.g. surplus + * credentials returned during rotation), so its spend authority is tracked and released rather + * than dangling (§14, #98). + */ + public void revokeMinted(IssuedCredential credential) { + store.record( + credential.wire().id(), + credential.providerHandle() != null + ? credential.providerHandle() + : credential.wire().id().value()); + revoke(credential); + } + private void revoke(IssuedCredential credential) { CredentialId id = credential.wire().id(); for (int attempt = 1; attempt <= MAX_REVOKE_ATTEMPTS; attempt++) { diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyStore.java b/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyStore.java index c08846a..4215922 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyStore.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyStore.java @@ -90,6 +90,28 @@ public boolean matchesPayload(Principal principal, String idempotencyKey, String return e != null && e.fingerprint.equals(fingerprint); } + /** + * Release a previously-claimed key, but only if it still maps to {@code expected}. Used to undo a + * claim when the corresponding accept fails (§7.2): without this the key stays poisoned for the + * full TTL and an identical retry is wrongly rejected with {@code DUPLICATE_KEY} (#90). + * + * @return {@code true} if an entry for {@code expected} was removed + */ + public boolean release(Principal principal, String idempotencyKey, JobId expected) { + Key key = new Key(principal.id(), idempotencyKey); + Entry[] removed = new Entry[1]; + entries.computeIfPresent( + key, + (k, entry) -> { + if (entry.jobId.equals(expected)) { + removed[0] = entry; + return null; + } + return entry; + }); + return removed[0] != null; + } + /** * Evict entries older than {@code ttl}. Exposed for deterministic test control; the scheduled * background task invokes this method automatically when a scheduler was supplied at construction diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobRecord.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobRecord.java index 0f457f1..3b04522 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobRecord.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobRecord.java @@ -6,13 +6,17 @@ import dev.arcp.core.ids.TraceId; import dev.arcp.core.lease.Lease; import dev.arcp.core.lease.LeaseConstraints; -import dev.arcp.core.wire.Envelope; +import dev.arcp.core.messages.JobEvent; import dev.arcp.runtime.credentials.IssuedCredential; import dev.arcp.runtime.lease.BudgetCounters; +import java.math.BigDecimal; import java.time.Instant; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; +import java.util.Deque; import java.util.List; +import java.util.Map; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.Future; import java.util.concurrent.ScheduledFuture; @@ -47,6 +51,12 @@ public String wire() { } } + /** Default per-job event-history cap when none is supplied (e.g. in unit tests). */ + public static final int DEFAULT_HISTORY_CAPACITY = 1024; + + /** A recorded event retained for §7.6 subscribe-history replay. */ + public record RecordedEvent(long producerSeq, JobEvent event) {} + private final JobId jobId; private final String resolvedAgent; private final Principal principal; @@ -55,12 +65,17 @@ public String wire() { private final BudgetCounters budget; private final Instant createdAt; private final @Nullable TraceId traceId; + private final int historyCapacity; private final AtomicReference status = new AtomicReference<>(Status.PENDING); private final AtomicReference<@Nullable Long> lastEventSeq = new AtomicReference<>(null); - private final CopyOnWriteArrayList eventHistory = new CopyOnWriteArrayList<>(); + // Bounded ring of recent events (§7.6: replay is bounded by the resume buffer window). Guarded by + // its own monitor; appends are O(1) and the buffer never grows past historyCapacity. + private final Deque eventHistory = new ArrayDeque<>(); private final CopyOnWriteArrayList subscribers = new CopyOnWriteArrayList<>(); private volatile @Nullable Future worker; private volatile @Nullable ScheduledFuture expiryWatchdog; + private volatile @Nullable ScheduledFuture maxRuntimeWatchdog; + private volatile @Nullable Map acceptedBudget; private final Object credentialsLock = new Object(); private final ArrayList credentials = new ArrayList<>(); @@ -73,6 +88,28 @@ public JobRecord( BudgetCounters budget, Instant createdAt, @Nullable TraceId traceId) { + this( + jobId, + resolvedAgent, + principal, + lease, + constraints, + budget, + createdAt, + traceId, + DEFAULT_HISTORY_CAPACITY); + } + + public JobRecord( + JobId jobId, + String resolvedAgent, + Principal principal, + Lease lease, + LeaseConstraints constraints, + BudgetCounters budget, + Instant createdAt, + @Nullable TraceId traceId, + int historyCapacity) { this.jobId = jobId; this.resolvedAgent = resolvedAgent; this.principal = principal; @@ -81,6 +118,7 @@ public JobRecord( this.budget = budget; this.createdAt = createdAt; this.traceId = traceId; + this.historyCapacity = historyCapacity > 0 ? historyCapacity : DEFAULT_HISTORY_CAPACITY; } public JobId jobId() { @@ -146,6 +184,23 @@ public void setExpiryWatchdog(ScheduledFuture watchdog) { return expiryWatchdog; } + public void setMaxRuntimeWatchdog(ScheduledFuture watchdog) { + this.maxRuntimeWatchdog = watchdog; + } + + public @Nullable ScheduledFuture maxRuntimeWatchdog() { + return maxRuntimeWatchdog; + } + + /** Snapshot of the budget map returned in the original {@code job.accepted} (§7.2 replay). */ + public void setAcceptedBudget(Map snapshot) { + this.acceptedBudget = Map.copyOf(snapshot); + } + + public @Nullable Map acceptedBudget() { + return acceptedBudget; + } + public void setLastEventSeq(long seq) { lastEventSeq.set(seq); } @@ -154,14 +209,25 @@ public void setLastEventSeq(long seq) { return lastEventSeq.get(); } - public void recordEvent(Envelope envelope) { - eventHistory.add(envelope); + public void recordEvent(long producerSeq, JobEvent event) { + synchronized (eventHistory) { + if (eventHistory.size() == historyCapacity) { + eventHistory.removeFirst(); + } + eventHistory.addLast(new RecordedEvent(producerSeq, event)); + } + } + + public List eventsSince(long fromSeq) { + synchronized (eventHistory) { + return eventHistory.stream().filter(e -> e.producerSeq() > fromSeq).toList(); + } } - public List eventsSince(long eventSeq) { - return eventHistory.stream() - .filter(envelope -> envelope.eventSeq() != null && envelope.eventSeq() > eventSeq) - .toList(); + public int eventHistorySize() { + synchronized (eventHistory) { + return eventHistory.size(); + } } public List subscribers() { diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java index 0cb58b8..4445c1c 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java @@ -13,6 +13,7 @@ import dev.arcp.core.error.UpstreamBudgetExhaustedException; import dev.arcp.core.events.EventBody; import dev.arcp.core.events.MetricEvent; +import dev.arcp.core.events.StatusEvent; import dev.arcp.core.ids.JobId; import dev.arcp.core.ids.MessageId; import dev.arcp.core.ids.SessionId; @@ -21,6 +22,7 @@ import dev.arcp.core.lease.LeaseConstraints; import dev.arcp.core.messages.JobAccepted; import dev.arcp.core.messages.JobCancel; +import dev.arcp.core.messages.JobCancelled; import dev.arcp.core.messages.JobError; import dev.arcp.core.messages.JobEvent; import dev.arcp.core.messages.JobFilter; @@ -35,6 +37,7 @@ import dev.arcp.core.messages.RuntimeInfo; import dev.arcp.core.messages.SessionAck; import dev.arcp.core.messages.SessionBye; +import dev.arcp.core.messages.SessionClosed; import dev.arcp.core.messages.SessionHello; import dev.arcp.core.messages.SessionJobs; import dev.arcp.core.messages.SessionListJobs; @@ -103,7 +106,7 @@ public enum Phase { private final ResumeBuffer resumeBuffer; private final HeartbeatTracker heartbeat; private final CredentialBinding credentialBinding; - private @Nullable ScheduledFuture heartbeatTick; + private volatile @Nullable ScheduledFuture heartbeatTick; private final Set ownedJobs = ConcurrentHashMap.newKeySet(); @@ -185,12 +188,14 @@ public void shutdown(String reason) { w.cancel(true); } credentialBinding.revokeAll(rec); - ScheduledFuture watchdog = rec.expiryWatchdog(); - if (watchdog != null) { - watchdog.cancel(false); - } + cancelWatchdogs(rec); } } + // Unlink this (now closed) session from every job's subscriber list so closed sessions are not + // pinned in memory and never receive further fan-out (#101). + for (JobRecord rec : runtime.jobs()) { + rec.removeSubscribersWhere(s -> s.session() == this); + } try { transport.close(); } catch (RuntimeException ignored) { @@ -199,6 +204,17 @@ public void shutdown(String reason) { runtime.removeSession(this); } + private static void cancelWatchdogs(JobRecord rec) { + ScheduledFuture expiry = rec.expiryWatchdog(); + if (expiry != null) { + expiry.cancel(false); + } + ScheduledFuture maxRuntime = rec.maxRuntimeWatchdog(); + if (maxRuntime != null) { + maxRuntime.cancel(false); + } + } + private void handle(Envelope envelope) { Phase p = phase.get(); Message m; @@ -206,6 +222,12 @@ private void handle(Envelope envelope) { m = Messages.decode(mapper, envelope); } catch (RuntimeException e) { log.warn("rejecting malformed envelope type={}: {}", envelope.type(), e.getMessage()); + // §9.5/§12: a decodable top-level envelope that fails payload validation (e.g. a non-UTC + // expires_at) must be answered with INVALID_REQUEST rather than silently dropped (#78). + if (p == Phase.ACTIVE) { + sendJobErrorTopLevel( + envelope, ErrorCode.INVALID_REQUEST, "invalid request: " + e.getMessage()); + } return; } @@ -228,11 +250,13 @@ private void handle(Envelope envelope) { /* heartbeat already updated onNext */ } case SessionAck ack -> handleAck(ack); - case SessionListJobs listJobs -> handleListJobs(envelope.id(), listJobs); + case SessionListJobs listJobs -> handleListJobs(envelope, listJobs); case JobSubmit submit -> handleSubmit(envelope, submit); case JobCancel cancel -> handleCancel(envelope, cancel); - case JobSubscribe sub -> handleSubscribe(sub); + case JobSubscribe sub -> handleSubscribe(envelope, sub); case JobUnsubscribe unsub -> handleUnsubscribe(unsub); + case SessionClosed ignored -> log.warn("client-only message received: {}", m); + case JobCancelled ignored -> log.warn("client-only message received: {}", m); case SessionWelcome ignored -> log.warn("client-only message received: {}", m); case JobAccepted ignored -> log.warn("client-only message received: {}", m); case JobEvent ignored -> log.warn("client-only message received: {}", m); @@ -251,7 +275,12 @@ private void doHandshake(SessionHello hello) { this.resumeToken = UUID.randomUUID().toString(); this.negotiated = Capabilities.intersect(hello.capabilities().features(), runtime.advertised()); - this.phase.set(Phase.ACTIVE); + // §6: only activate from AWAITING_HELLO. If a concurrent shutdown already moved us to CLOSED, + // abort the handshake instead of resurrecting a torn-down session (#103). + if (!this.phase.compareAndSet(Phase.AWAITING_HELLO, Phase.ACTIVE)) { + shutdown("handshake raced shutdown"); + return; + } Capabilities welcomeCaps = new Capabilities(List.of("json"), negotiated, agents.describe()); SessionWelcome welcome = @@ -276,6 +305,11 @@ private void doHandshake(SessionHello hello) { interval.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS); + // If shutdown won the race after the CAS but before this assignment, cancel immediately so + // no orphaned heartbeat task ticks forever on the shared scheduler (#103). + if (phase.get() == Phase.CLOSED) { + heartbeatTick.cancel(false); + } } } catch (RuntimeException | ArcpException e) { log.info("handshake rejected: {}", e.getMessage()); @@ -315,8 +349,10 @@ private void tickHeartbeat(Duration interval) { } private void handleBye(SessionBye bye) { - log.debug("session {} bye: {}", sessionId, bye.reason()); - shutdown("client bye"); + log.debug("session {} close: {}", sessionId, bye.reason()); + // §6.7: acknowledge with session.closed before tearing down the transport. + send(Message.Type.SESSION_CLOSED, new SessionClosed(bye.reason()), sessionId, null, null, null); + shutdown("client close"); } private void handlePing(SessionPing ping) { @@ -328,12 +364,34 @@ private void handleAck(SessionAck ack) { lastProcessedSeq.updateAndGet(prev -> Math.max(prev, ack.lastProcessedSeq())); } - private void handleListJobs(MessageId requestId, SessionListJobs req) { + private void handleListJobs(Envelope envelope, SessionListJobs req) { if (!negotiated.contains(Feature.LIST_JOBS)) { return; } + MessageId requestId = envelope.id(); + + // Decode the cursor first so a bad cursor fails fast without scanning the registry. The error + // echoes the request envelope id so the client correlates it to this list call, not an + // unrelated in-flight submit (#102). + int startIndex = 0; + if (req.cursor() != null && !req.cursor().isBlank()) { + try { + byte[] decoded = java.util.Base64.getUrlDecoder().decode(req.cursor()); + startIndex = Integer.parseInt(new String(decoded, java.nio.charset.StandardCharsets.UTF_8)); + if (startIndex < 0) { + startIndex = 0; + } + } catch (IllegalArgumentException e) { + sendJobErrorTopLevel(envelope, ErrorCode.INVALID_REQUEST, "invalid list_jobs cursor"); + return; + } + } + JobFilter filter = req.filter() != null ? req.filter() : JobFilter.all(); - List matching = + // Filter and sort matching records, but only materialize JobSummary for the requested page so a + // small limit does not map every matching job (#83). Ordering and cursor use the same stable key + // (createdAt, then jobId). + List matching = runtime.jobs().stream() .filter(rec -> rec.principal().equals(principal)) .filter(rec -> filter.status() == null || filter.status().contains(rec.status().wire())) @@ -345,42 +403,17 @@ private void handleListJobs(MessageId requestId, SessionListJobs req) { .filter( rec -> filter.createdAfter() == null || rec.createdAt().isAfter(filter.createdAfter())) - // Deterministic order: createdAt then jobId for ties. .sorted( java.util.Comparator.comparing((JobRecord rec) -> rec.createdAt()) .thenComparing(rec -> rec.jobId().value())) - .map( - rec -> - new JobSummary( - rec.jobId(), - rec.resolvedAgent(), - rec.status().wire(), - rec.lease(), - null, - rec.createdAt(), - rec.traceId(), - rec.lastEventSeq())) .toList(); - int startIndex = 0; - if (req.cursor() != null && !req.cursor().isBlank()) { - try { - byte[] decoded = java.util.Base64.getUrlDecoder().decode(req.cursor()); - startIndex = Integer.parseInt(new String(decoded, java.nio.charset.StandardCharsets.UTF_8)); - if (startIndex < 0) { - startIndex = 0; - } - } catch (IllegalArgumentException e) { - sendJobErrorTopLevel(null, ErrorCode.INVALID_REQUEST, "invalid list_jobs cursor"); - return; - } - } int limit = req.limit() != null && req.limit() > 0 ? req.limit() : matching.size(); int endIndex = Math.min(matching.size(), startIndex + limit); List page = startIndex >= matching.size() ? List.of() - : List.copyOf(matching.subList(startIndex, endIndex)); + : matching.subList(startIndex, endIndex).stream().map(SessionLoop::toSummary).toList(); String nextCursor = endIndex < matching.size() ? java.util.Base64.getUrlEncoder() @@ -392,6 +425,18 @@ private void handleListJobs(MessageId requestId, SessionListJobs req) { send(Message.Type.SESSION_JOBS, response, sessionId, null, null, null); } + private static JobSummary toSummary(JobRecord rec) { + return new JobSummary( + rec.jobId(), + rec.resolvedAgent(), + rec.status().wire(), + rec.lease(), + null, + rec.createdAt(), + rec.traceId(), + rec.lastEventSeq()); + } + private void handleSubmit(Envelope envelope, JobSubmit submit) { Principal pr = principal; if (pr == null) { @@ -422,6 +467,19 @@ private void handleSubmit(Envelope envelope, JobSubmit submit) { emitReplayAccepted(prior, envelope.traceId()); return; } + // Same key + identical params, but the prior job was never registered or was removed (a + // failed earlier accept). Release the stale claim and re-claim so this identical retry is + // accepted rather than poisoned with DUPLICATE_KEY for the whole TTL (#90). + runtime.idempotency().release(pr, idempotencyKey, conflict.existing()); + if (runtime.idempotency().claim(pr, idempotencyKey, fingerprint, fresh) != null) { + sendJobErrorTopLevel( + envelope, + ErrorCode.DUPLICATE_KEY, + "idempotency_key reuse with conflicting parameters: " + idempotencyKey); + return; + } + acceptJob(envelope, submit, pr, now, fresh, idempotencyKey); + return; } sendJobErrorTopLevel( envelope, @@ -429,10 +487,16 @@ private void handleSubmit(Envelope envelope, JobSubmit submit) { "idempotency_key reuse with conflicting parameters: " + idempotencyKey); return; } - acceptJob(envelope, submit, pr, now, fresh); + acceptJob(envelope, submit, pr, now, fresh, idempotencyKey); return; } - acceptJob(envelope, submit, pr, now, JobId.generate()); + acceptJob(envelope, submit, pr, now, JobId.generate(), null); + } + + private void releaseIdempotency(Principal pr, @Nullable String idempotencyKey, JobId jobId) { + if (idempotencyKey != null) { + runtime.idempotency().release(pr, idempotencyKey, jobId); + } } /** @@ -447,6 +511,11 @@ private String idempotencyFingerprint(JobSubmit submit) { ObjectMapper canonical = mapper.copy(); canonical.configure( com.fasterxml.jackson.databind.SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, true); + // ORDER_MAP_ENTRIES_BY_KEYS sorts java.util.Map only; submit.input() is a JsonNode/ObjectNode + // whose properties must also be sorted so semantically equal payloads with differing key order + // hash identically (§7.2, #91). + canonical.configure( + com.fasterxml.jackson.databind.cfg.JsonNodeFeature.WRITE_PROPERTIES_SORTED, true); canonical.setNodeFactory( com.fasterxml.jackson.databind.node.JsonNodeFactory.withExactBigDecimals(true)); ObjectNode canon = canonical.createObjectNode(); @@ -471,7 +540,10 @@ private String idempotencyFingerprint(JobSubmit submit) { } private void emitReplayAccepted(JobRecord prior, @Nullable TraceId traceId) { - Map budgetSnapshot = prior.budget().snapshot(); + // §7.2: replay the budget captured at the original acceptance, not the live (decremented) + // counters, so an identical retry returns the same job.accepted payload (#79). + Map budgetSnapshot = + prior.acceptedBudget() != null ? prior.acceptedBudget() : prior.budget().snapshot(); JobAccepted accepted = new JobAccepted( prior.jobId(), @@ -486,14 +558,21 @@ private void emitReplayAccepted(JobRecord prior, @Nullable TraceId traceId) { } private void acceptJob( - Envelope envelope, JobSubmit submit, Principal pr, Instant now, JobId jobId) { + Envelope envelope, + JobSubmit submit, + Principal pr, + Instant now, + JobId jobId, + @Nullable String idempotencyKey) { AgentRegistry.Resolved resolved; try { resolved = agents.resolve(submit.agent()); } catch (dev.arcp.core.error.AgentVersionNotAvailableException e) { + releaseIdempotency(pr, idempotencyKey, jobId); sendJobErrorTopLevel(envelope, ErrorCode.AGENT_VERSION_NOT_AVAILABLE, e.getMessage()); return; } catch (dev.arcp.core.error.AgentNotAvailableException e) { + releaseIdempotency(pr, idempotencyKey, jobId); sendJobErrorTopLevel(envelope, ErrorCode.AGENT_NOT_AVAILABLE, e.getMessage()); return; } @@ -504,7 +583,16 @@ private void acceptJob( BudgetCounters budget = new BudgetCounters(lease.budget()); TraceId traceId = envelope.traceId(); JobRecord record = - new JobRecord(jobId, resolved.wire(), pr, lease, constraints, budget, now, traceId); + new JobRecord( + jobId, + resolved.wire(), + pr, + lease, + constraints, + budget, + now, + traceId, + runtime.resumeBufferCapacity()); runtime.registerJob(record); ownedJobs.add(jobId); @@ -517,6 +605,7 @@ private void acceptJob( } catch (RuntimeException e) { ownedJobs.remove(jobId); runtime.removeJob(jobId); + releaseIdempotency(pr, idempotencyKey, jobId); Throwable root = rootCause(e); if (root instanceof UpstreamBudgetExhaustedException budgetError) { sendJobErrorTopLevel(envelope, ErrorCode.BUDGET_EXHAUSTED, budgetError.getMessage()); @@ -531,6 +620,9 @@ private void acceptJob( } Map budgetSnapshot = budget.snapshot(); + // §7.2: capture the budget returned at acceptance so an idempotent replay returns the same + // payload regardless of intervening spend (#79). + record.setAcceptedBudget(budgetSnapshot); JobAccepted accepted = new JobAccepted( jobId, @@ -543,38 +635,72 @@ private void acceptJob( traceId); send(Message.Type.JOB_ACCEPTED, accepted, sessionId, traceId, jobId, null); - // §9.5 watchdog: schedule a terminator if the lease has an expiry. + // Start the worker before scheduling watchdogs so a short-fused watchdog cannot fire in the gap + // before setWorker and lose the interrupt (#104). + record.setWorker(runtime.workerPool().submit(() -> runJob(record, resolved.agent(), submit))); + + // §9.5 watchdog: terminate the job if the lease expires. if (constraints.expiresAt() != null) { long delayMillis = Duration.between(now, constraints.expiresAt()).toMillis(); - if (delayMillis > 0) { - ScheduledFuture watchdog = - runtime - .scheduler() - .schedule(() -> terminateExpiredJob(record), delayMillis, TimeUnit.MILLISECONDS); - record.setExpiryWatchdog(watchdog); - } + ScheduledFuture watchdog = + runtime + .scheduler() + .schedule( + () -> terminateExpiredJob(record), + Math.max(0, delayMillis), + TimeUnit.MILLISECONDS); + record.setExpiryWatchdog(watchdog); } - record.setWorker(runtime.workerPool().submit(() -> runJob(record, resolved.agent(), submit))); + // §7.1/§12 watchdog: enforce max_runtime_sec by emitting TIMEOUT if exceeded (#89). + if (submit.maxRuntimeSec() != null && submit.maxRuntimeSec() > 0) { + ScheduledFuture timeout = + runtime + .scheduler() + .schedule( + () -> terminateTimedOutJob(record), + submit.maxRuntimeSec().longValue(), + TimeUnit.SECONDS); + record.setMaxRuntimeWatchdog(timeout); + } } private void terminateExpiredJob(JobRecord record) { - if (record.transitionTo(JobRecord.Status.TIMED_OUT)) { + // §9.5/§7.3: lease expiry is a LEASE_EXPIRED error with final_status "error", not a timeout + // (#74). The terminal record status is ERROR. + if (record.transitionTo(JobRecord.Status.ERROR)) { var w = record.worker(); if (w != null) { w.cancel(true); } emitJobError( record, - JobError.TIMED_OUT, + JobError.ERROR, ErrorCode.LEASE_EXPIRED, "lease expired at " + record.constraints().expiresAt()); credentialBinding.revokeAll(record); } } + private void terminateTimedOutJob(JobRecord record) { + // §12 TIMEOUT: the job exceeded max_runtime_sec → final_status "timed_out" (#89). + if (record.transitionTo(JobRecord.Status.TIMED_OUT)) { + var w = record.worker(); + if (w != null) { + w.cancel(true); + } + emitJobError( + record, JobError.TIMED_OUT, ErrorCode.TIMEOUT, "job exceeded max_runtime_sec"); + credentialBinding.revokeAll(record); + } + } + private void runJob(JobRecord record, Agent agent, JobSubmit submit) { - record.transitionTo(JobRecord.Status.RUNNING); + // If a watchdog/cancel already moved the record to a terminal state in the gap before this + // worker started, do not start the agent (#104). + if (!record.transitionTo(JobRecord.Status.RUNNING)) { + return; + } JobInput input = new JobInput( submit.input(), @@ -595,7 +721,11 @@ public void emit(EventBody body) { if (body instanceof MetricEvent metric && metric.unit() != null && metric.name() != null + && metric.value() != null && metric.name().startsWith("cost.") + // §9.6: cost.budget.* are telemetry gauges (e.g. cost.budget.remaining carries the + // remaining balance), not spend reports — never decrement on them (#108). + && !metric.name().startsWith("cost.budget.") && record.budget().tracks(metric.unit())) { record.budget().decrement(metric.unit(), metric.value()); } @@ -604,8 +734,10 @@ public void emit(EventBody body) { @Override public boolean cancelled() { + // §9.5: any terminal status (CANCELLED, TIMED_OUT, ERROR) means a polling agent should + // stop, not just CANCELLED (#104). return Thread.currentThread().isInterrupted() - || record.status() == JobRecord.Status.CANCELLED + || record.status().terminal() || phase.get() == Phase.CLOSED; } @@ -625,21 +757,29 @@ public List credentials() { @Override public void rotateCredential(CredentialId id, String newValue) { - IssuedCredential current = - record.credentials().stream() - .filter(issued -> issued.wire().id().equals(id)) - .findFirst() - .orElseThrow( - () -> new IllegalArgumentException("unknown credential id: " + id)); - IssuedCredential reissued = + record.credentials().stream() + .filter(issued -> issued.wire().id().equals(id)) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("unknown credential id: " + id)); + List reissued = runtime .credentialProvisioner() .issue(record.lease(), record.constraints(), this) - .join() - .stream() - .findFirst() - .orElse(current); - Credential wire = reissued.wire(); + .join(); + if (reissued.isEmpty()) { + // §9.8.2/§14: without a freshly minted credential there is nothing to rotate to; fail + // loudly rather than aliasing the current handle (which rotate() would then revoke, + // killing the live credential) (#98). + throw new IllegalStateException( + "credential rotation produced no credential for " + id); + } + // Track and revoke every minted credential except the one we keep, so no minted + // credential is left with untracked, unrevoked spend authority upstream (#98). + IssuedCredential selected = reissued.get(0); + for (int i = 1; i < reissued.size(); i++) { + credentialBinding.revokeMinted(reissued.get(i)); + } + Credential wire = selected.wire(); IssuedCredential next = new IssuedCredential( new Credential( @@ -649,34 +789,37 @@ public void rotateCredential(CredentialId id, String newValue) { wire.endpoint(), wire.profile(), wire.constraints()), - reissued.providerHandle()); + selected.providerHandle()); credentialBinding.rotate(record, id, next); } }; try { JobOutcome outcome = agent.run(input, ctx); - if (record.status() == JobRecord.Status.CANCELLED) { - emitJobError(record, JobError.CANCELLED, ErrorCode.CANCELLED, "cancelled"); - credentialBinding.revokeAll(record); - return; - } + // If a cancel/watchdog already terminated this job (even if the agent swallowed the + // interrupt and returned normally), the terminal message was already emitted — do not emit a + // second, possibly contradictory one (#92). if (record.status().terminal()) { return; } switch (outcome) { case JobOutcome.Success s -> { - record.transitionTo(JobRecord.Status.SUCCESS); - JobResult result = - new JobResult( - JobResult.SUCCESS, s.resultId(), s.resultSize(), s.inline(), s.summary()); - sendJobMessage(record, Message.Type.JOB_RESULT, result, nextSeq()); - credentialBinding.revokeAll(record); + if (record.transitionTo(JobRecord.Status.SUCCESS)) { + JobResult result = + new JobResult( + JobResult.SUCCESS, s.resultId(), s.resultSize(), s.inline(), s.summary()); + long seq = nextSeq(); + sendJobMessage(record, Message.Type.JOB_RESULT, result, seq); + // §7.6/§13.3: subscribers must also observe termination (#93). + fanOutTerminal(record, Message.Type.JOB_RESULT, result); + credentialBinding.revokeAll(record); + } } case JobOutcome.Failure f -> { - record.transitionTo(JobRecord.Status.ERROR); - emitJobError(record, JobError.ERROR, f.code(), f.message()); - credentialBinding.revokeAll(record); + if (record.transitionTo(JobRecord.Status.ERROR)) { + emitJobError(record, JobError.ERROR, f.code(), f.message()); + credentialBinding.revokeAll(record); + } } } } catch (InterruptedException e) { @@ -688,56 +831,65 @@ public void rotateCredential(CredentialId id, String newValue) { credentialBinding.revokeAll(record); } } catch (dev.arcp.core.error.LeaseExpiredException e) { - record.transitionTo(JobRecord.Status.ERROR); - emitJobError(record, JobError.ERROR, ErrorCode.LEASE_EXPIRED, e.getMessage()); - credentialBinding.revokeAll(record); + // §7.3: only emit a terminal error if this thread wins the transition; otherwise a watchdog or + // cancel already produced the single terminal message (#92). + if (record.transitionTo(JobRecord.Status.ERROR)) { + emitJobError(record, JobError.ERROR, ErrorCode.LEASE_EXPIRED, e.getMessage()); + credentialBinding.revokeAll(record); + } } catch (dev.arcp.core.error.PermissionDeniedException e) { - record.transitionTo(JobRecord.Status.ERROR); - emitJobError(record, JobError.ERROR, ErrorCode.PERMISSION_DENIED, e.getMessage()); - credentialBinding.revokeAll(record); + if (record.transitionTo(JobRecord.Status.ERROR)) { + emitJobError(record, JobError.ERROR, ErrorCode.PERMISSION_DENIED, e.getMessage()); + credentialBinding.revokeAll(record); + } } catch (dev.arcp.core.error.BudgetExhaustedException e) { - record.transitionTo(JobRecord.Status.ERROR); - emitJobError(record, JobError.ERROR, ErrorCode.BUDGET_EXHAUSTED, e.getMessage()); - credentialBinding.revokeAll(record); + if (record.transitionTo(JobRecord.Status.ERROR)) { + emitJobError(record, JobError.ERROR, ErrorCode.BUDGET_EXHAUSTED, e.getMessage()); + credentialBinding.revokeAll(record); + } } catch (Exception e) { - record.transitionTo(JobRecord.Status.ERROR); - emitJobError( - record, - JobError.ERROR, - ErrorCode.INTERNAL_ERROR, - e.getMessage() != null ? e.getMessage() : e.getClass().getSimpleName()); - credentialBinding.revokeAll(record); - } finally { - ScheduledFuture w = record.expiryWatchdog(); - if (w != null) { - w.cancel(false); + if (record.transitionTo(JobRecord.Status.ERROR)) { + emitJobError( + record, + JobError.ERROR, + ErrorCode.INTERNAL_ERROR, + e.getMessage() != null ? e.getMessage() : e.getClass().getSimpleName()); + credentialBinding.revokeAll(record); } + } finally { + // Cancel both watchdogs once the worker finishes so no stray scheduler task remains (#89). + cancelWatchdogs(record); + scheduleEviction(record); } } private void handleCancel(Envelope envelope, JobCancel cancel) { JobId jobId = envelope.jobId(); if (jobId == null) { + // §12: a cancel without a job_id is malformed, not silently dropped (#95). + sendJobErrorTopLevel(envelope, ErrorCode.INVALID_REQUEST, "job.cancel missing job_id"); return; } JobRecord rec = runtime.job(jobId); - if (rec == null) { + // §7.4/§7.6: unknown, evicted, or not-visible (non-submitter) jobs all answer JOB_NOT_FOUND so + // a cancel is never silently ignored and job existence is not leaked to non-submitters (#95). + if (rec == null || !rec.principal().equals(principal)) { + sendJobErrorTopLevel(envelope, ErrorCode.JOB_NOT_FOUND, "job not found: " + jobId); return; } - if (!rec.principal().equals(principal)) { - return; // §7.6: only the submitter may cancel - } + String reason = cancel.reason() != null ? cancel.reason() : "cancelled"; if (rec.transitionTo(JobRecord.Status.CANCELLED)) { var w = rec.worker(); if (w != null) { w.cancel(true); } - emitJobError( - rec, - JobError.CANCELLED, - ErrorCode.CANCELLED, - cancel.reason() != null ? cancel.reason() : "cancelled"); + // §7.4: acknowledge with job.cancelled, then emit the terminal job.error CANCELLED. + sendJobMessage(rec, Message.Type.JOB_CANCELLED, new JobCancelled(reason), nextSeq()); + emitJobError(rec, JobError.CANCELLED, ErrorCode.CANCELLED, reason); credentialBinding.revokeAll(rec); + } else { + // Already terminal: acknowledge idempotently rather than going silent. + sendJobMessage(rec, Message.Type.JOB_CANCELLED, new JobCancelled(reason), nextSeq()); } } @@ -787,14 +939,16 @@ private static Throwable rootCause(Throwable throwable) { return root; } - private void handleSubscribe(JobSubscribe sub) { + private void handleSubscribe(Envelope envelope, JobSubscribe sub) { if (!negotiated.contains(Feature.SUBSCRIBE)) { return; } JobRecord rec = runtime.job(sub.jobId()); if (rec == null || !rec.principal().equals(principal)) { + // Echo the request envelope id so the client correlates this error to the subscribe call, not + // an unrelated in-flight submit (#102). sendJobErrorTopLevel( - null, ErrorCode.JOB_NOT_FOUND, "job not found or not visible: " + sub.jobId()); + envelope, ErrorCode.JOB_NOT_FOUND, "job not found or not visible: " + sub.jobId()); return; } boolean alreadySubscribed = @@ -819,17 +973,25 @@ private void handleSubscribe(JobSubscribe sub) { send(Message.Type.JOB_SUBSCRIBED, response, sessionId, rec.traceId(), rec.jobId(), null); if (wantHistory) { - for (Envelope replay : rec.eventsSince(subscribedFrom)) { - try { - transport.send(replay); - } catch (RuntimeException e) { - log.warn("replay send failed: {}", e.toString()); - return; + // §8.1/§8.3/§14: re-envelope each replayed event with this (subscriber) session's session_id + // and a freshly allocated event_seq, and never echo credential material to a subscriber (#94). + boolean isOwner = ownedJobs.contains(rec.jobId()); + for (JobRecord.RecordedEvent replay : rec.eventsSince(subscribedFrom)) { + JobEvent event = replay.event(); + if (!isOwner && isCredentialRotated(event)) { + continue; // never replay credential_rotated (with its value) to a non-owning subscriber } + sendJobMessage(rec, Message.Type.JOB_EVENT, event, nextSeq()); } } } + private static boolean isCredentialRotated(JobEvent event) { + return "status".equals(event.eventKind()) + && event.body() != null + && "credential_rotated".equals(event.body().path("phase").asText("")); + } + private void handleUnsubscribe(JobUnsubscribe unsub) { JobRecord rec = runtime.job(unsub.jobId()); if (rec != null) { @@ -842,14 +1004,36 @@ private void emitJobEvent(JobRecord record, EventBody body) { record.setLastEventSeq(seq); JobEvent event = new JobEvent(body.kind().wire(), runtime.clock().instant(), mapper.valueToTree(body)); - Envelope sent = - send(Message.Type.JOB_EVENT, event, sessionId, record.traceId(), record.jobId(), seq); - if (sent != null) { - record.recordEvent(sent); + send(Message.Type.JOB_EVENT, event, sessionId, record.traceId(), record.jobId(), seq); + // Record the decoded event (not the wire envelope) so history replay can re-envelope it per + // subscriber with that subscriber's own session_id and event_seq (#94). + record.recordEvent(seq, event); + + // §14: credential_rotated carries the new credential value, which only the submitting session + // may receive — never fan it out to subscribers (#75). + boolean redactFromSubscribers = + body instanceof StatusEvent se && "credential_rotated".equals(se.phase()); + if (redactFromSubscribers) { + return; } for (JobRecord.Subscriber sub : record.subscribers()) { if (sub.session() != this) { - sub.session().sendJobMessage(record, Message.Type.JOB_EVENT, event, seq); + // §7.6/§8.3: allocate the event_seq from the subscriber session's own counter (#76). + sub.session().forwardJobMessage(record, Message.Type.JOB_EVENT, event); + } + } + } + + /** Forward a job message to a subscriber session using this (subscriber) session's seq space. */ + void forwardJobMessage(JobRecord rec, Message.Type type, Message msg) { + sendJobMessage(rec, type, msg, nextSeq()); + } + + /** Deliver a terminal message (job.result/job.error) to every subscriber session (§7.6, #93). */ + private void fanOutTerminal(JobRecord record, Message.Type type, Message msg) { + for (JobRecord.Subscriber sub : record.subscribers()) { + if (sub.session() != this) { + sub.session().forwardJobMessage(record, type, msg); } } } @@ -857,11 +1041,35 @@ private void emitJobEvent(JobRecord record, EventBody body) { private void emitJobError(JobRecord record, String finalStatus, ErrorCode code, String message) { JobError err = JobError.fromJson(finalStatus, code, message, null, null); sendJobMessage(record, Message.Type.JOB_ERROR, err, nextSeq()); + // §7.6/§13.3: subscribers must also observe terminal errors (#93). + fanOutTerminal(record, Message.Type.JOB_ERROR, err); + } + + private void scheduleEviction(JobRecord record) { + // §7.6: keep terminal jobs visible to list_jobs / late subscribe for the resume window, then + // evict so the runtime's job map and event history do not grow without bound (#101). + try { + runtime + .scheduler() + .schedule( + () -> { + ownedJobs.remove(record.jobId()); + runtime.removeJob(record.jobId()); + }, + Math.max(0, runtime.resumeWindowSec()), + TimeUnit.SECONDS); + } catch (java.util.concurrent.RejectedExecutionException ignored) { + // scheduler already shut down (runtime closing); nothing to evict + } } private void sendJobErrorTopLevel(@Nullable Envelope origin, ErrorCode code, String message) { JobError err = JobError.fromJson(JobError.ERROR, code, message, null, null); - send( + // Echo the originating request's envelope id so the client can correlate a top-level error to + // the exact request that caused it instead of failing an unrelated pending submit (#102). + MessageId responseId = origin != null ? origin.id() : MessageId.generate(); + sendWithId( + responseId, Message.Type.JOB_ERROR, err, sessionId, @@ -885,13 +1093,22 @@ private void sendJobMessage(JobRecord rec, Message.Type type, Message msg, long @Nullable TraceId tid, @Nullable JobId jid, @Nullable Long seq) { + return sendWithId(MessageId.generate(), type, payload, sid, tid, jid, seq); + } + + private @Nullable Envelope sendWithId( + MessageId id, + Message.Type type, + Message payload, + @Nullable SessionId sid, + @Nullable TraceId tid, + @Nullable JobId jid, + @Nullable Long seq) { if (phase.get() == Phase.CLOSED) { return null; } ObjectNode payloadJson = Messages.encodePayload(mapper, payload); - Envelope env = - new Envelope( - Envelope.VERSION, MessageId.generate(), type.wire(), sid, tid, jid, seq, payloadJson); + Envelope env = new Envelope(Envelope.VERSION, id, type.wire(), sid, tid, jid, seq, payloadJson); if (seq != null) { resumeBuffer.record(env); } diff --git a/arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java new file mode 100644 index 0000000..0291811 --- /dev/null +++ b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java @@ -0,0 +1,358 @@ +package dev.arcp.runtime; + +import static org.assertj.core.api.Assertions.assertThat; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import dev.arcp.core.agents.AgentRef; +import dev.arcp.core.auth.Auth; +import dev.arcp.core.capabilities.Capabilities; +import dev.arcp.core.capabilities.Feature; +import dev.arcp.core.error.ErrorCode; +import dev.arcp.core.events.MetricEvent; +import dev.arcp.core.ids.JobId; +import dev.arcp.core.ids.MessageId; +import dev.arcp.core.ids.SessionId; +import dev.arcp.core.lease.Lease; +import dev.arcp.core.lease.LeaseConstraints; +import dev.arcp.core.messages.ClientInfo; +import dev.arcp.core.messages.JobAccepted; +import dev.arcp.core.messages.JobCancel; +import dev.arcp.core.messages.JobCancelled; +import dev.arcp.core.messages.JobError; +import dev.arcp.core.messages.JobResult; +import dev.arcp.core.messages.JobSubmit; +import dev.arcp.core.messages.Message; +import dev.arcp.core.messages.Messages; +import dev.arcp.core.messages.SessionBye; +import dev.arcp.core.messages.SessionClosed; +import dev.arcp.core.messages.SessionHello; +import dev.arcp.core.messages.SessionWelcome; +import dev.arcp.core.transport.MemoryTransport; +import dev.arcp.core.wire.ArcpMapper; +import dev.arcp.core.wire.Envelope; +import dev.arcp.runtime.agent.JobOutcome; +import dev.arcp.runtime.session.SessionLoop; +import java.math.BigDecimal; +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayDeque; +import java.util.EnumSet; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Flow; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.Test; + +/** Protocol-level coverage for the audit fixes (#74, #89, #90, #91, #95, #96, #108). */ +class SessionLoopAuditTest { + private static final ObjectMapper MAPPER = ArcpMapper.shared(); + private static final SessionId CLIENT_SESSION = SessionId.of("sess_client"); + + @Test + void leaseExpiryEmitsErrorFinalStatusWithLeaseExpired() throws Exception { + CountDownLatch release = new CountDownLatch(1); + try (ArcpRuntime runtime = + ArcpRuntime.builder().agent("block", "1.0.0", blockingAgent(release)).build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + h.send( + Message.Type.JOB_SUBMIT, + new JobSubmit( + AgentRef.parse("block@1.0.0"), + JsonNodeFactory.instance.objectNode(), + null, + LeaseConstraints.of(Instant.now().plusMillis(600)), + null, + null)); + h.take(Message.Type.JOB_ACCEPTED, JobAccepted.class); + JobError error = h.take(Message.Type.JOB_ERROR, JobError.class); + assertThat(error.code()).isEqualTo(ErrorCode.LEASE_EXPIRED); + assertThat(error.finalStatus()).isEqualTo(JobError.ERROR); + release.countDown(); + } + } + + @Test + void maxRuntimeSecEmitsTimeout() throws Exception { + CountDownLatch release = new CountDownLatch(1); + try (ArcpRuntime runtime = + ArcpRuntime.builder().agent("block", "1.0.0", blockingAgent(release)).build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + h.send( + Message.Type.JOB_SUBMIT, + new JobSubmit( + AgentRef.parse("block@1.0.0"), + JsonNodeFactory.instance.objectNode(), + null, + null, + null, + 1)); + h.take(Message.Type.JOB_ACCEPTED, JobAccepted.class); + JobError error = h.take(Message.Type.JOB_ERROR, JobError.class); + assertThat(error.code()).isEqualTo(ErrorCode.TIMEOUT); + assertThat(error.finalStatus()).isEqualTo(JobError.TIMED_OUT); + release.countDown(); + } + } + + @Test + void cancelIsAcknowledgedThenErrors() throws Exception { + CountDownLatch started = new CountDownLatch(1); + CountDownLatch release = new CountDownLatch(1); + try (ArcpRuntime runtime = + ArcpRuntime.builder() + .agent( + "block", + "1.0.0", + (input, ctx) -> { + started.countDown(); + release.await(); + return JobOutcome.Success.inline(input.payload()); + }) + .build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + h.send( + Message.Type.JOB_SUBMIT, + new JobSubmit( + AgentRef.parse("block@1.0.0"), + JsonNodeFactory.instance.objectNode(), + null, + null, + null, + null)); + JobAccepted accepted = h.take(Message.Type.JOB_ACCEPTED, JobAccepted.class); + assertThat(started.await(3, TimeUnit.SECONDS)).isTrue(); + h.sendJob(Message.Type.JOB_CANCEL, new JobCancel("stop"), accepted.jobId()); + JobCancelled ack = h.take(Message.Type.JOB_CANCELLED, JobCancelled.class); + assertThat(ack.reason()).isEqualTo("stop"); + JobError error = h.take(Message.Type.JOB_ERROR, JobError.class); + assertThat(error.code()).isEqualTo(ErrorCode.CANCELLED); + release.countDown(); + } + } + + @Test + void cancelOfUnknownJobYieldsJobNotFound() throws Exception { + try (ArcpRuntime runtime = ArcpRuntime.builder().build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + h.sendJob(Message.Type.JOB_CANCEL, new JobCancel("x"), JobId.of("job_missing")); + assertThat(h.take(Message.Type.JOB_ERROR, JobError.class).code()) + .isEqualTo(ErrorCode.JOB_NOT_FOUND); + } + } + + @Test + void cancelWithoutJobIdYieldsInvalidRequest() throws Exception { + try (ArcpRuntime runtime = ArcpRuntime.builder().build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + h.send(Message.Type.JOB_CANCEL, new JobCancel("x")); + assertThat(h.take(Message.Type.JOB_ERROR, JobError.class).code()) + .isEqualTo(ErrorCode.INVALID_REQUEST); + } + } + + @Test + void sessionCloseIsAcknowledged() throws Exception { + try (ArcpRuntime runtime = ArcpRuntime.builder().build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + h.send(Message.Type.SESSION_BYE, new SessionBye("bye")); + SessionClosed closed = h.take(Message.Type.SESSION_CLOSED, SessionClosed.class); + assertThat(closed.reason()).isEqualTo("bye"); + } + } + + @Test + void budgetRemainingGaugeDoesNotDecrementBudget() throws Exception { + try (ArcpRuntime runtime = + ArcpRuntime.builder() + .agent( + "gauge", + "1.0.0", + (input, ctx) -> { + ctx.emit(new MetricEvent("cost.budget.remaining", new BigDecimal("4"), "usd", null)); + ctx.emit(new MetricEvent("cost.budget.remaining", null, "usd", null)); + ctx.authorize("fs.read", "/workspace/x"); + return JobOutcome.Success.inline(input.payload()); + }) + .build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + h.send( + Message.Type.JOB_SUBMIT, + new JobSubmit( + AgentRef.parse("gauge@1.0.0"), + JsonNodeFactory.instance.objectNode(), + Lease.builder().allow("fs.read", "/workspace/**").allow("cost.budget", "usd:5").build(), + null, + null, + null)); + h.take(Message.Type.JOB_ACCEPTED, JobAccepted.class); + // No BUDGET_EXHAUSTED, no NPE: the job completes successfully. + assertThat(h.take(Message.Type.JOB_RESULT, JobResult.class).finalStatus()) + .isEqualTo(JobResult.SUCCESS); + } + } + + @Test + void failedAcceptDoesNotPoisonIdempotencyKey() throws Exception { + try (ArcpRuntime runtime = ArcpRuntime.builder().build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + ObjectNode input = JsonNodeFactory.instance.objectNode().put("x", 1); + JobSubmit submit = + new JobSubmit(AgentRef.parse("missing@1.0.0"), input, null, null, "retry-key", null); + h.send(Message.Type.JOB_SUBMIT, submit); + assertThat(h.take(Message.Type.JOB_ERROR, JobError.class).code()) + .isEqualTo(ErrorCode.AGENT_NOT_AVAILABLE); + // Identical retry must NOT be poisoned with DUPLICATE_KEY. + h.send(Message.Type.JOB_SUBMIT, submit); + assertThat(h.take(Message.Type.JOB_ERROR, JobError.class).code()) + .isEqualTo(ErrorCode.AGENT_NOT_AVAILABLE); + } + } + + @Test + void idempotencyFingerprintIsKeyOrderInsensitive() throws Exception { + try (ArcpRuntime runtime = + ArcpRuntime.builder() + .agent("echo", "1.0.0", (input, ctx) -> JobOutcome.Success.inline(input.payload())) + .build()) { + Harness h = Harness.connect(runtime); + h.handshake(); + ObjectNode first = JsonNodeFactory.instance.objectNode(); + first.put("b", 1); + first.put("a", 2); + ObjectNode second = JsonNodeFactory.instance.objectNode(); + second.put("a", 2); + second.put("b", 1); + h.send( + Message.Type.JOB_SUBMIT, + new JobSubmit(AgentRef.parse("echo@1.0.0"), first, null, null, "order", null)); + JobAccepted a1 = h.take(Message.Type.JOB_ACCEPTED, JobAccepted.class); + h.send( + Message.Type.JOB_SUBMIT, + new JobSubmit(AgentRef.parse("echo@1.0.0"), second, null, null, "order", null)); + JobAccepted a2 = h.take(Message.Type.JOB_ACCEPTED, JobAccepted.class); + assertThat(a2.jobId()).isEqualTo(a1.jobId()); + } + } + + private static dev.arcp.runtime.agent.Agent blockingAgent(CountDownLatch release) { + return (input, ctx) -> { + release.await(); + return JobOutcome.Success.inline(input.payload()); + }; + } + + private static final class Harness { + private final MemoryTransport client; + private final Probe probe; + + private Harness(MemoryTransport client, Probe probe) { + this.client = client; + this.probe = probe; + } + + static Harness connect(ArcpRuntime runtime) { + MemoryTransport.Pair pair = MemoryTransport.pair(); + Probe probe = new Probe(); + pair.client().incoming().subscribe(probe); + SessionLoop loop = runtime.accept(pair.runtime()); + assertThat(loop).isNotNull(); + return new Harness(pair.client(), probe); + } + + SessionWelcome handshake() throws Exception { + send( + Message.Type.SESSION_HELLO, + new SessionHello( + new ClientInfo("audit-test", "1.0.0"), + Auth.anonymous(), + new Capabilities( + List.of("json"), + EnumSet.of(Feature.SUBSCRIBE, Feature.LIST_JOBS, Feature.COST_BUDGET), + null), + null, + null)); + return take(Message.Type.SESSION_WELCOME, SessionWelcome.class); + } + + void send(Message.Type type, Message message) { + sendInternal(type, message, MessageId.generate(), null); + } + + void sendJob(Message.Type type, Message message, JobId jobId) { + sendInternal(type, message, MessageId.generate(), jobId); + } + + private void sendInternal(Message.Type type, Message message, MessageId id, JobId jobId) { + client.send( + new Envelope( + Envelope.VERSION, + id, + type.wire(), + CLIENT_SESSION, + null, + jobId, + null, + Messages.encodePayload(MAPPER, message))); + } + + T take(Message.Type type, Class messageClass) throws Exception { + return messageClass.cast(Messages.decode(MAPPER, probe.take(type))); + } + } + + private static final class Probe implements Flow.Subscriber { + private final BlockingQueue envelopes = new LinkedBlockingQueue<>(); + private final Queue backlog = new ArrayDeque<>(); + + @Override + public void onSubscribe(Flow.Subscription subscription) { + subscription.request(Long.MAX_VALUE); + } + + @Override + public void onNext(Envelope item) { + envelopes.add(item); + } + + @Override + public void onError(Throwable throwable) {} + + @Override + public void onComplete() {} + + Envelope take(Message.Type type) throws InterruptedException { + long deadline = System.nanoTime() + Duration.ofSeconds(5).toNanos(); + while (System.nanoTime() < deadline) { + for (Envelope existing : List.copyOf(backlog)) { + if (existing.type().equals(type.wire())) { + backlog.remove(existing); + return existing; + } + } + long remaining = deadline - System.nanoTime(); + Envelope envelope = envelopes.poll(Math.max(1L, remaining), TimeUnit.NANOSECONDS); + if (envelope == null) { + break; + } + if (envelope.type().equals(type.wire())) { + return envelope; + } + backlog.add(envelope); + } + throw new AssertionError("timed out waiting for " + type.wire() + "; backlog=" + backlog); + } + } +} From 60c393f0327e1d8b67284af77305d1e4b4833b9c Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 21:59:00 -0400 Subject: [PATCH 2/9] fix(middleware): enforce Host allowlist and detect Vert.x write failures - Jetty: enforce allowedHosts with a servlet filter returning 403 before the WebSocket upgrade, instead of a builder field that was never read (#99) - Spring Boot: enforce arcp.middleware.allowed-hosts with a HandshakeInterceptor rejecting disallowed Hosts with 403 (#99) - Jakarta: record the host decision per handshake and close the session with VIOLATED_POLICY before ArcpRuntime.accept, instead of merely clearing response headers (which the container ignored) (#100) - Vert.x: observe the async write result and fail the transport on write error so the runtime detects a dead session instead of silently dropping events (#110) Co-authored-by: Cursor --- .../jakarta/ArcpJakartaAdapter.java | 18 +++++-- .../jakarta/ArcpJakartaEndpoint.java | 20 ++++++++ .../ArcpSpringBootAutoConfiguration.java | 10 +++- .../HostAllowlistHandshakeInterceptor.java | 48 +++++++++++++++++++ .../vertx/VertxWebSocketTransport.java | 11 ++++- .../arcp/runtime/jetty/ArcpJettyServer.java | 14 ++++++ .../runtime/jetty/HostAllowlistFilter.java | 42 ++++++++++++++++ 7 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/HostAllowlistHandshakeInterceptor.java create mode 100644 arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/HostAllowlistFilter.java diff --git a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java index e47cdc6..aca5d7a 100644 --- a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java +++ b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java @@ -44,11 +44,20 @@ public static Builder builder() { /** Build the {@link ServerEndpointConfig} ready to hand to a container. */ public ServerEndpointConfig serverEndpointConfig() { + // The Jakarta WebSocket API offers no veto in modifyHandshake: clearing response headers does + // not stop the server from completing the upgrade, so a client that ignores + // Sec-WebSocket-Accept would still reach the runtime. Instead, record the host decision per + // handshake and have the endpoint close the session before runtime.accept (#100). The handshake + // and endpoint construction run on the same container thread, so a ThreadLocal reliably carries + // the decision from modifyHandshake to getEndpointInstance. + ThreadLocal hostRejected = ThreadLocal.withInitial(() -> Boolean.FALSE); ServerEndpointConfig.Configurator configurator = new ServerEndpointConfig.Configurator() { @Override public T getEndpointInstance(Class endpointClass) { - return endpointClass.cast(new ArcpJakartaEndpoint()); + boolean rejected = hostRejected.get(); + hostRejected.remove(); + return endpointClass.cast(new ArcpJakartaEndpoint(rejected)); } @Override @@ -63,13 +72,12 @@ public boolean checkOrigin(String originHeaderValue) { public void modifyHandshake( ServerEndpointConfig sec, HandshakeRequest request, HandshakeResponse response) { if (allowedHosts.isEmpty()) { + hostRejected.set(Boolean.FALSE); return; } List hosts = request.getHeaders().get("Host"); - if (hosts == null || hosts.isEmpty() || !allowedHosts.contains(hosts.get(0))) { - // Signal handshake rejection by stripping accept fields. - response.getHeaders().clear(); - } + boolean allowed = hosts != null && !hosts.isEmpty() && allowedHosts.contains(hosts.get(0)); + hostRejected.set(!allowed); } }; ServerEndpointConfig config = diff --git a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java index a57b91f..4874f81 100644 --- a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java +++ b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java @@ -24,9 +24,29 @@ public final class ArcpJakartaEndpoint extends Endpoint { public static final String RUNTIME_KEY = ArcpRuntime.class.getName(); private final Map transports = new ConcurrentHashMap<>(); + private final boolean hostRejected; + + public ArcpJakartaEndpoint() { + this(false); + } + + ArcpJakartaEndpoint(boolean hostRejected) { + this.hostRejected = hostRejected; + } @Override public void onOpen(Session session, EndpointConfig config) { + if (hostRejected) { + // §14: the Host was not on the allowlist; close before the runtime ever sees the session, so + // a client that ignores Sec-WebSocket-Accept validation cannot talk to the runtime (#100). + try { + session.close( + new CloseReason(CloseReason.CloseCodes.VIOLATED_POLICY, "host not allowed")); + } catch (java.io.IOException ignored) { + // best-effort close + } + return; + } ArcpRuntime runtime = (ArcpRuntime) config.getUserProperties().get(RUNTIME_KEY); if (runtime == null) { log.warn("ArcpRuntime missing from EndpointConfig user properties"); diff --git a/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java b/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java index 4a1471c..d5b2933 100644 --- a/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java +++ b/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java @@ -1,6 +1,7 @@ package dev.arcp.middleware.spring; import dev.arcp.runtime.ArcpRuntime; +import java.util.List; import org.springframework.boot.autoconfigure.AutoConfiguration; import org.springframework.boot.autoconfigure.condition.ConditionalOnBean; import org.springframework.boot.autoconfigure.condition.ConditionalOnClass; @@ -40,6 +41,13 @@ public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) { properties.getAllowedOrigins().isEmpty() ? new String[] {"*"} : properties.getAllowedOrigins().toArray(new String[0]); - registry.addHandler(arcpWebSocketHandler(), properties.getPath()).setAllowedOrigins(origins); + var registration = + registry.addHandler(arcpWebSocketHandler(), properties.getPath()).setAllowedOrigins(origins); + // §14: enforce the Host allowlist before the upgrade completes so a disallowed Host is rejected + // (403) rather than being a silently ignored security control (#99). + List allowedHosts = properties.getAllowedHosts(); + if (!allowedHosts.isEmpty()) { + registration.addInterceptors(new HostAllowlistHandshakeInterceptor(allowedHosts)); + } } } diff --git a/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/HostAllowlistHandshakeInterceptor.java b/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/HostAllowlistHandshakeInterceptor.java new file mode 100644 index 0000000..97a6ca8 --- /dev/null +++ b/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/HostAllowlistHandshakeInterceptor.java @@ -0,0 +1,48 @@ +package dev.arcp.middleware.spring; + +import java.util.List; +import org.springframework.http.HttpStatus; +import org.springframework.http.server.ServerHttpRequest; +import org.springframework.http.server.ServerHttpResponse; +import org.springframework.web.socket.WebSocketHandler; +import org.springframework.web.socket.server.HandshakeInterceptor; + +/** + * Rejects WebSocket upgrades whose {@code Host} header is not on the configured allowlist with HTTP + * 403 before the handshake completes, so the runtime never sees a session from a disallowed Host + * (§14, #99). An empty allowlist disables the check. + */ +final class HostAllowlistHandshakeInterceptor implements HandshakeInterceptor { + + private final List allowedHosts; + + HostAllowlistHandshakeInterceptor(List allowedHosts) { + this.allowedHosts = List.copyOf(allowedHosts); + } + + @Override + public boolean beforeHandshake( + ServerHttpRequest request, + ServerHttpResponse response, + WebSocketHandler wsHandler, + java.util.Map attributes) { + if (allowedHosts.isEmpty()) { + return true; + } + String host = request.getHeaders().getFirst("Host"); + if (host != null && allowedHosts.contains(host)) { + return true; + } + response.setStatusCode(HttpStatus.FORBIDDEN); + return false; + } + + @Override + public void afterHandshake( + ServerHttpRequest request, + ServerHttpResponse response, + WebSocketHandler wsHandler, + Exception exception) { + // no-op + } +} diff --git a/arcp-middleware-vertx/src/main/java/dev/arcp/middleware/vertx/VertxWebSocketTransport.java b/arcp-middleware-vertx/src/main/java/dev/arcp/middleware/vertx/VertxWebSocketTransport.java index a9b8709..4dc02ea 100644 --- a/arcp-middleware-vertx/src/main/java/dev/arcp/middleware/vertx/VertxWebSocketTransport.java +++ b/arcp-middleware-vertx/src/main/java/dev/arcp/middleware/vertx/VertxWebSocketTransport.java @@ -51,7 +51,16 @@ public void send(Envelope envelope) { writeLock.lock(); try { String json = mapper.writeValueAsString(envelope); - socket.writeTextMessage(json); + // Vert.x writes are async: observe the result so a failed write (closed socket, broken pipe) + // is surfaced as a transport failure instead of being silently dropped, letting the runtime + // detect a dead session (#110). + socket + .writeTextMessage(json) + .onFailure( + cause -> { + failInbound(cause); + socket.close(); + }); } catch (IOException e) { throw new UncheckedIOException(e); } finally { diff --git a/arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/ArcpJettyServer.java b/arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/ArcpJettyServer.java index 2ce1189..3c67037 100644 --- a/arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/ArcpJettyServer.java +++ b/arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/ArcpJettyServer.java @@ -1,10 +1,13 @@ package dev.arcp.runtime.jetty; import dev.arcp.runtime.ArcpRuntime; +import jakarta.servlet.DispatcherType; import jakarta.websocket.server.ServerEndpointConfig; import java.net.URI; +import java.util.EnumSet; import java.util.List; import java.util.Objects; +import org.eclipse.jetty.ee10.servlet.FilterHolder; import org.eclipse.jetty.ee10.servlet.ServletContextHandler; import org.eclipse.jetty.ee10.websocket.jakarta.server.config.JakartaWebSocketServletContainerInitializer; import org.eclipse.jetty.server.Server; @@ -17,12 +20,14 @@ public final class ArcpJettyServer implements AutoCloseable { private final ArcpRuntime runtime; private final String path; private final int port; + private final List allowedHosts; private final Server server; private ArcpJettyServer(Builder b) { this.runtime = Objects.requireNonNull(b.runtime, "runtime"); this.path = b.path; this.port = b.port; + this.allowedHosts = b.allowedHosts; VirtualThreadPool pool = new VirtualThreadPool(); this.server = new Server(pool); ServerConnector connector = new ServerConnector(server); @@ -32,6 +37,15 @@ private ArcpJettyServer(Builder b) { ServletContextHandler context = new ServletContextHandler("/"); server.setHandler(context); + // §14: reject upgrades from a non-allowlisted Host with 403 before the WebSocket handshake, so + // allowedHosts is an enforced control rather than a silent no-op (#99). + if (!allowedHosts.isEmpty()) { + context.addFilter( + new FilterHolder(new HostAllowlistFilter(allowedHosts)), + "/*", + EnumSet.of(DispatcherType.REQUEST)); + } + JakartaWebSocketServletContainerInitializer.configure( context, (servletContext, container) -> { diff --git a/arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/HostAllowlistFilter.java b/arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/HostAllowlistFilter.java new file mode 100644 index 0000000..a071f96 --- /dev/null +++ b/arcp-runtime-jetty/src/main/java/dev/arcp/runtime/jetty/HostAllowlistFilter.java @@ -0,0 +1,42 @@ +package dev.arcp.runtime.jetty; + +import jakarta.servlet.Filter; +import jakarta.servlet.FilterChain; +import jakarta.servlet.ServletException; +import jakarta.servlet.ServletRequest; +import jakarta.servlet.ServletResponse; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import java.io.IOException; +import java.util.List; + +/** + * Servlet filter that rejects requests (including WebSocket upgrades) whose {@code Host} header is + * not on the configured allowlist with HTTP 403, before the WebSocket handshake runs (§14, #99). + */ +final class HostAllowlistFilter implements Filter { + + private final List allowedHosts; + + HostAllowlistFilter(List allowedHosts) { + this.allowedHosts = List.copyOf(allowedHosts); + } + + @Override + public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) + throws IOException, ServletException { + if (allowedHosts.isEmpty()) { + chain.doFilter(request, response); + return; + } + if (request instanceof HttpServletRequest httpRequest + && response instanceof HttpServletResponse httpResponse) { + String host = httpRequest.getHeader("Host"); + if (host == null || !allowedHosts.contains(host)) { + httpResponse.sendError(HttpServletResponse.SC_FORBIDDEN, "host not allowed"); + return; + } + } + chain.doFilter(request, response); + } +} From 0d0058e7064441ee5dbb8702e786e6ab3458ad15 Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 22:07:52 -0400 Subject: [PATCH 3/9] fix(runtime): implement session resume semantics (#22) An unexpected transport drop now parks the session for the resume window instead of cancelling its in-flight jobs: job ownership, event history, and the resume buffer are retained and registered by resume token. A session.hello carrying a valid resume_token (and matching principal) reattaches the new transport to the existing session identity, replays buffered envelopes with event_seq greater than last_event_seq, and continues delivering live events; an unknown or expired token returns RESUME_WINDOW_EXPIRED. Explicit session.close still cancels in-flight jobs. Parked sessions are torn down when the resume window elapses or the runtime closes. Adds resume + explicit-close tests. Co-authored-by: Cursor --- .../java/dev/arcp/runtime/ArcpRuntime.java | 22 ++ .../dev/arcp/runtime/session/SessionLoop.java | 173 +++++++++++- .../dev/arcp/runtime/SessionResumeTest.java | 266 ++++++++++++++++++ 3 files changed, 455 insertions(+), 6 deletions(-) create mode 100644 arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java b/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java index 9314079..22e6877 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/ArcpRuntime.java @@ -52,6 +52,9 @@ public final class ArcpRuntime implements AutoCloseable { private final CredentialRevocationStore credentialRevocationStore; private final ConcurrentHashMap sessions = new ConcurrentHashMap<>(); private final ConcurrentHashMap jobs = new ConcurrentHashMap<>(); + // §6.3 resume: sessions whose transport dropped unexpectedly are parked here by resume token for + // the resume window so a reconnect can reattach to the same identity and in-flight jobs (#22). + private final ConcurrentHashMap resumable = new ConcurrentHashMap<>(); private ArcpRuntime(Builder b) { this.mapper = b.mapper != null ? b.mapper : ArcpMapper.shared(); @@ -213,6 +216,21 @@ public Collection jobs() { return jobs.values(); } + /** Park a session for resume, keyed by its resume token (§6.3, #22). */ + public void parkResumable(String resumeToken, SessionLoop loop) { + resumable.put(resumeToken, loop); + } + + /** Atomically claim a parked session for resume, or {@code null} if unknown/expired (#22). */ + public @Nullable SessionLoop takeResumable(String resumeToken) { + return resumable.remove(resumeToken); + } + + /** Remove a parked session only if it still maps to {@code loop} (#22). */ + public void removeResumable(String resumeToken, SessionLoop loop) { + resumable.remove(resumeToken, loop); + } + public void removeSession(SessionLoop loop) { // Always remove via the pending key the loop was inserted under, since // idOrPending() flips to the real session id after handshake and would @@ -226,6 +244,10 @@ public void close() { for (SessionLoop loop : sessions.values()) { loop.shutdown("runtime closing"); } + for (SessionLoop loop : resumable.values()) { + loop.shutdown("runtime closing"); + } + resumable.clear(); sessions.clear(); idempotency.close(); if (ownedScheduler) { diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java index 4445c1c..f1dabe1 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java @@ -85,11 +85,14 @@ public final class SessionLoop implements Flow.Subscriber { public enum Phase { AWAITING_HELLO, ACTIVE, + PARKED, CLOSED } private final ArcpRuntime runtime; - private final Transport transport; + // Swappable so a resume can reattach the same session identity (and its in-flight jobs) to a new + // transport (§6.3, #22). + private volatile Transport transport; private final ObjectMapper mapper; private final AgentRegistry agents; private final String pendingId = "pending:" + UUID.randomUUID(); @@ -107,6 +110,14 @@ public enum Phase { private final HeartbeatTracker heartbeat; private final CredentialBinding credentialBinding; private volatile @Nullable ScheduledFuture heartbeatTick; + private volatile @Nullable ScheduledFuture parkExpiry; + // Set when the client requested an explicit graceful close (session.close) so an unexpected + // transport drop can be distinguished from intentional teardown (#22). + private volatile boolean explicitClose; + // When this loop is just a forwarder for a resumed session, inbound is delegated to the resumed + // (parked) loop instead of being dispatched here (#22). + private volatile @Nullable SessionLoop resumeDelegate; + private volatile @Nullable Duration heartbeatInterval; private final Set ownedJobs = ConcurrentHashMap.newKeySet(); @@ -150,6 +161,13 @@ public void onSubscribe(Flow.Subscription s) { @Override public void onNext(Envelope envelope) { + SessionLoop delegate = resumeDelegate; + if (delegate != null) { + // This loop only carried the resume handshake; subsequent inbound belongs to the resumed + // session (#22). + delegate.onNext(envelope); + return; + } heartbeat.onInbound(); try { handle(envelope); @@ -160,22 +178,81 @@ public void onNext(Envelope envelope) { @Override public void onError(Throwable throwable) { - shutdown("transport error: " + throwable.getMessage()); + onTransportDropped("transport error: " + throwable.getMessage()); } @Override public void onComplete() { - shutdown("transport closed"); + onTransportDropped("transport closed"); + } + + private void onTransportDropped(String reason) { + SessionLoop delegate = resumeDelegate; + if (delegate != null) { + // The forwarder's transport dropped; let the resumed session decide whether to park. + delegate.onTransportDropped(reason); + return; + } + // §6.3: an unexpected transport drop (not an explicit session.close) parks the session for the + // resume window, keeping in-flight jobs alive, rather than cancelling everything (#22). + if (!explicitClose && resumeToken != null && phase.compareAndSet(Phase.ACTIVE, Phase.PARKED)) { + park(reason); + return; + } + shutdown(reason); + } + + private void park(String reason) { + log.debug("session {} parked for resume: {}", sessionId, reason); + ScheduledFuture hb = heartbeatTick; + if (hb != null) { + hb.cancel(false); + } + String token = resumeToken; + if (token == null) { + shutdown(reason); + return; + } + runtime.parkResumable(token, this); + try { + parkExpiry = + runtime + .scheduler() + .schedule( + () -> expirePark(token), runtime.resumeWindowSec(), TimeUnit.SECONDS); + } catch (java.util.concurrent.RejectedExecutionException e) { + expirePark(token); + } + } + + private void expirePark(String token) { + if (phase.compareAndSet(Phase.PARKED, Phase.CLOSED)) { + runtime.removeResumable(token, this); + teardownJobsAndSession(); + } } public void shutdown(String reason) { - if (phase.getAndSet(Phase.CLOSED) == Phase.CLOSED) { + Phase prev = phase.getAndSet(Phase.CLOSED); + if (prev == Phase.CLOSED) { return; } ScheduledFuture hb = heartbeatTick; if (hb != null) { hb.cancel(false); } + ScheduledFuture pe = parkExpiry; + if (pe != null) { + pe.cancel(false); + } + if (resumeToken != null) { + runtime.removeResumable(resumeToken, this); + } + teardownJobsAndSession(); + } + + /** Cancel in-flight jobs, unlink subscribers, close the transport, and deregister (#22, #101). */ + private void teardownJobsAndSession() { for (JobId jobId : ownedJobs) { JobRecord rec = runtime.job(jobId); if (rec == null) { @@ -270,6 +347,24 @@ private void handle(Envelope envelope) { private void doHandshake(SessionHello hello) { try { Principal pr = authenticate(hello); + + // §6.3 resume: if the hello carries a resume token for a parked session owned by the same + // principal, reattach to it and replay missed events instead of starting fresh (#22). + String token = hello.resumeToken(); + if (token != null) { + SessionLoop parked = runtime.takeResumable(token); + if (parked != null && parked.phase() == Phase.PARKED && pr.equals(parked.principal())) { + parked.resumeOnto(this, hello.lastEventSeq()); + return; + } + if (parked != null) { + // token found but not resumable by this principal: put it back for the legitimate owner. + runtime.parkResumable(token, parked); + } + rejectResume(); + return; + } + this.principal = pr; this.sessionId = SessionId.generate(); this.resumeToken = UUID.randomUUID().toString(); @@ -296,6 +391,7 @@ private void doHandshake(SessionHello hello) { // §6.4: schedule heartbeat ticks if both peers negotiated heartbeat. if (negotiated.contains(Feature.HEARTBEAT)) { Duration interval = Duration.ofSeconds(runtime.heartbeatIntervalSec()); + this.heartbeatInterval = interval; heartbeat.onInbound(); heartbeatTick = runtime @@ -317,6 +413,62 @@ private void doHandshake(SessionHello hello) { } } + /** §6.3: reattach this parked session to {@code incoming}'s transport and replay missed events. */ + private void resumeOnto(SessionLoop incoming, @Nullable Long lastEventSeq) { + ScheduledFuture pe = parkExpiry; + if (pe != null) { + pe.cancel(false); + } + // Adopt the new connection's transport; route its inbound to this session; retire the + // forwarder loop as a standalone session. + this.transport = incoming.transport; + incoming.resumeDelegate = this; + runtime.removeSession(incoming); + this.phase.set(Phase.ACTIVE); + + Capabilities welcomeCaps = new Capabilities(List.of("json"), negotiated, agents.describe()); + SessionWelcome welcome = + new SessionWelcome( + new RuntimeInfo(runtime.runtimeName(), runtime.runtimeVersion()), + resumeToken, + runtime.resumeWindowSec(), + negotiated.contains(Feature.HEARTBEAT) ? runtime.heartbeatIntervalSec() : null, + welcomeCaps); + send(Message.Type.SESSION_WELCOME, welcome, sessionId, null, null, null); + + // §6.3: replay buffered envelopes the client missed (event_seq > last_event_seq). + long from = lastEventSeq != null ? lastEventSeq : -1L; + for (Envelope replay : resumeBuffer.since(from)) { + try { + transport.send(replay); + } catch (RuntimeException e) { + log.warn("resume replay send failed: {}", e.toString()); + break; + } + } + + // Reschedule heartbeat on the new connection. + Duration interval = heartbeatInterval; + if (negotiated.contains(Feature.HEARTBEAT) && interval != null) { + heartbeat.onInbound(); + heartbeatTick = + runtime + .scheduler() + .scheduleAtFixedRate( + () -> tickHeartbeat(interval), + interval.toMillis(), + interval.toMillis(), + TimeUnit.MILLISECONDS); + } + log.debug("session {} resumed", sessionId); + } + + private void rejectResume() { + // §6.3: unknown or expired resume token. Surface RESUME_WINDOW_EXPIRED then tear down. + sendJobErrorTopLevel(null, ErrorCode.RESUME_WINDOW_EXPIRED, "resume token unknown or expired"); + shutdown("resume rejected"); + } + private Principal authenticate(SessionHello hello) throws ArcpException { Auth auth = hello.auth(); if (Auth.BEARER.equals(auth.scheme())) { @@ -350,7 +502,9 @@ private void tickHeartbeat(Duration interval) { private void handleBye(SessionBye bye) { log.debug("session {} close: {}", sessionId, bye.reason()); - // §6.7: acknowledge with session.closed before tearing down the transport. + // §6.7: an explicit graceful close cancels in-flight jobs (it is not a resumable drop, #22). + explicitClose = true; + // Acknowledge with session.closed before tearing down the transport. send(Message.Type.SESSION_CLOSED, new SessionClosed(bye.reason()), sessionId, null, null, null); shutdown("client close"); } @@ -1112,11 +1266,18 @@ private void sendJobMessage(JobRecord rec, Message.Type type, Message msg, long if (seq != null) { resumeBuffer.record(env); } + // §6.3: while parked the transport is gone; buffer sequenced messages for replay on resume and + // do not tear the session down (#22). + if (phase.get() == Phase.PARKED) { + return env; + } try { transport.send(env); } catch (RuntimeException e) { log.warn("send failed: {}", e.toString()); - shutdown("send failure"); + if (phase.get() == Phase.ACTIVE) { + shutdown("send failure"); + } } return env; } diff --git a/arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java new file mode 100644 index 0000000..6a13036 --- /dev/null +++ b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java @@ -0,0 +1,266 @@ +package dev.arcp.runtime; + +import static org.assertj.core.api.Assertions.assertThat; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import dev.arcp.core.agents.AgentRef; +import dev.arcp.core.auth.Auth; +import dev.arcp.core.capabilities.Capabilities; +import dev.arcp.core.capabilities.Feature; +import dev.arcp.core.error.ErrorCode; +import dev.arcp.core.events.LogEvent; +import dev.arcp.core.ids.MessageId; +import dev.arcp.core.ids.SessionId; +import dev.arcp.core.messages.ClientInfo; +import dev.arcp.core.messages.JobAccepted; +import dev.arcp.core.messages.JobError; +import dev.arcp.core.messages.JobEvent; +import dev.arcp.core.messages.JobResult; +import dev.arcp.core.messages.JobSubmit; +import dev.arcp.core.messages.Message; +import dev.arcp.core.messages.Messages; +import dev.arcp.core.messages.SessionBye; +import dev.arcp.core.messages.SessionHello; +import dev.arcp.core.messages.SessionWelcome; +import dev.arcp.core.transport.MemoryTransport; +import dev.arcp.core.wire.ArcpMapper; +import dev.arcp.core.wire.Envelope; +import dev.arcp.runtime.agent.JobOutcome; +import dev.arcp.runtime.session.SessionLoop; +import java.time.Duration; +import java.util.ArrayDeque; +import java.util.EnumSet; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Flow; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.Test; + +/** §6.3 session resume coverage (#22). */ +class SessionResumeTest { + private static final ObjectMapper MAPPER = ArcpMapper.shared(); + private static final SessionId CLIENT_SESSION = SessionId.of("sess_client"); + + @Test + void resumeReattachesReplaysAndContinues() throws Exception { + CountDownLatch gate1 = new CountDownLatch(1); + CountDownLatch gate2 = new CountDownLatch(1); + try (ArcpRuntime runtime = + ArcpRuntime.builder() + .agent( + "stream", + "1.0.0", + (input, ctx) -> { + ctx.emit(new LogEvent("info", "e1")); + gate1.await(); + ctx.emit(new LogEvent("info", "e2")); + gate2.await(); + return JobOutcome.Success.inline(input.payload()); + }) + .build()) { + + MemoryTransport.Pair pair1 = MemoryTransport.pair(); + Probe probe1 = new Probe(); + pair1.client().incoming().subscribe(probe1); + SessionLoop loop = runtime.accept(pair1.runtime()); + + SessionWelcome welcome = handshake(pair1.client(), probe1); + String resumeToken = welcome.resumeToken(); + assertThat(resumeToken).isNotNull(); + + send( + pair1.client(), + Message.Type.JOB_SUBMIT, + new JobSubmit( + AgentRef.parse("stream@1.0.0"), + JsonNodeFactory.instance.objectNode(), + null, + null, + null, + null)); + probe1.take(Message.Type.JOB_ACCEPTED); + Envelope e1 = probe1.take(Message.Type.JOB_EVENT); + long lastSeq = e1.eventSeq(); + + // Simulate an unexpected transport drop (server endpoint closes; no session.close was sent). + pair1.runtime().close(); + assertThat(awaitPhase(loop, SessionLoop.Phase.PARKED)).isTrue(); + + // The still-running job emits another event while parked; it is buffered for replay. + gate1.countDown(); + + // Reconnect with the saved token and last seen sequence. + MemoryTransport.Pair pair2 = MemoryTransport.pair(); + Probe probe2 = new Probe(); + pair2.client().incoming().subscribe(probe2); + runtime.accept(pair2.runtime()); + sendHello(pair2.client(), resumeToken, lastSeq); + + SessionWelcome resumed = probe2.takeMessage(Message.Type.SESSION_WELCOME, SessionWelcome.class); + assertThat(resumed.resumeToken()).isEqualTo(resumeToken); + + // Replay: the missed event (e2) is delivered on the new transport. + JobEvent replayed = probe2.takeMessage(Message.Type.JOB_EVENT, JobEvent.class); + assertThat(replayed.eventKind()).isEqualTo("log"); + + // Live continuation: the job finishes and its result lands on the resumed transport. + gate2.countDown(); + JobResult result = probe2.takeMessage(Message.Type.JOB_RESULT, JobResult.class); + assertThat(result.finalStatus()).isEqualTo(JobResult.SUCCESS); + } + } + + @Test + void explicitCloseCancelsAndIsNotResumable() throws Exception { + CountDownLatch started = new CountDownLatch(1); + CountDownLatch release = new CountDownLatch(1); + try (ArcpRuntime runtime = + ArcpRuntime.builder() + .agent( + "block", + "1.0.0", + (input, ctx) -> { + started.countDown(); + release.await(); + return JobOutcome.Success.inline(input.payload()); + }) + .build()) { + MemoryTransport.Pair pair1 = MemoryTransport.pair(); + Probe probe1 = new Probe(); + pair1.client().incoming().subscribe(probe1); + runtime.accept(pair1.runtime()); + SessionWelcome welcome = handshake(pair1.client(), probe1); + + send( + pair1.client(), + Message.Type.JOB_SUBMIT, + new JobSubmit( + AgentRef.parse("block@1.0.0"), + JsonNodeFactory.instance.objectNode(), + null, + null, + null, + null)); + JobAccepted accepted = probe1.takeMessage(Message.Type.JOB_ACCEPTED, JobAccepted.class); + assertThat(started.await(3, TimeUnit.SECONDS)).isTrue(); + + // §6.7/#22: explicit close cancels in-flight jobs and is not resumable. + send(pair1.client(), Message.Type.SESSION_BYE, new SessionBye("done")); + probe1.take(Message.Type.SESSION_CLOSED); + release.countDown(); + + // The in-flight job is cancelled by the explicit close. + long deadline = System.nanoTime() + Duration.ofSeconds(3).toNanos(); + while (System.nanoTime() < deadline) { + var rec = runtime.job(accepted.jobId()); + if (rec != null && rec.status().terminal()) { + break; + } + Thread.sleep(10); + } + assertThat(runtime.job(accepted.jobId()).status().terminal()).isTrue(); + + MemoryTransport.Pair pair2 = MemoryTransport.pair(); + Probe probe2 = new Probe(); + pair2.client().incoming().subscribe(probe2); + runtime.accept(pair2.runtime()); + sendHello(pair2.client(), welcome.resumeToken(), 0L); + JobError resumeError = probe2.takeMessage(Message.Type.JOB_ERROR, JobError.class); + assertThat(resumeError.code()).isEqualTo(ErrorCode.RESUME_WINDOW_EXPIRED); + } + } + + private static boolean awaitPhase(SessionLoop loop, SessionLoop.Phase target) + throws InterruptedException { + long deadline = System.nanoTime() + Duration.ofSeconds(3).toNanos(); + while (System.nanoTime() < deadline) { + if (loop.phase() == target) { + return true; + } + Thread.sleep(10); + } + return loop.phase() == target; + } + + private static SessionWelcome handshake(MemoryTransport client, Probe probe) throws Exception { + sendHello(client, null, null); + return probe.takeMessage(Message.Type.SESSION_WELCOME, SessionWelcome.class); + } + + private static void sendHello(MemoryTransport client, String resumeToken, Long lastEventSeq) { + send( + client, + Message.Type.SESSION_HELLO, + new SessionHello( + new ClientInfo("resume-test", "1.0.0"), + // Bearer auth yields a stable principal across reconnects, which resume requires. + Auth.bearer("resume-principal"), + new Capabilities(List.of("json"), EnumSet.of(Feature.SUBSCRIBE), null), + resumeToken, + lastEventSeq)); + } + + private static void send(MemoryTransport client, Message.Type type, Message message) { + client.send( + new Envelope( + Envelope.VERSION, + MessageId.generate(), + type.wire(), + CLIENT_SESSION, + null, + null, + null, + Messages.encodePayload(MAPPER, message))); + } + + private static final class Probe implements Flow.Subscriber { + private final BlockingQueue envelopes = new LinkedBlockingQueue<>(); + private final Queue backlog = new ArrayDeque<>(); + + @Override + public void onSubscribe(Flow.Subscription subscription) { + subscription.request(Long.MAX_VALUE); + } + + @Override + public void onNext(Envelope item) { + envelopes.add(item); + } + + @Override + public void onError(Throwable throwable) {} + + @Override + public void onComplete() {} + + T takeMessage(Message.Type type, Class messageClass) throws Exception { + return messageClass.cast(Messages.decode(MAPPER, take(type))); + } + + Envelope take(Message.Type type) throws InterruptedException { + long deadline = System.nanoTime() + Duration.ofSeconds(5).toNanos(); + while (System.nanoTime() < deadline) { + for (Envelope existing : List.copyOf(backlog)) { + if (existing.type().equals(type.wire())) { + backlog.remove(existing); + return existing; + } + } + long remaining = deadline - System.nanoTime(); + Envelope envelope = envelopes.poll(Math.max(1L, remaining), TimeUnit.NANOSECONDS); + if (envelope == null) { + break; + } + if (envelope.type().equals(type.wire())) { + return envelope; + } + backlog.add(envelope); + } + throw new AssertionError("timed out waiting for " + type.wire() + "; backlog=" + backlog); + } + } +} From afa75067791ab90675ce6b02201edf6023e9f30b Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 22:12:17 -0400 Subject: [PATCH 4/9] refactor(runtime): extract idempotency fingerprint and list-jobs paging (#80, #82) Begin splitting the oversized SessionLoop by moving two cohesive responsibilities into their own tested types: IdempotencyFingerprint (canonical 7.2 hashing) and JobListing (6.6 filter/sort/paging). SessionLoop no longer contains the list-jobs implementation directly. Partial: the full job-execution/submit/ subscribe extraction and the under-800-line target remain follow-up work, deferred to avoid regression risk to the behavioral fixes in this PR. Co-authored-by: Cursor --- .../idempotency/IdempotencyFingerprint.java | 51 ++++++++ .../dev/arcp/runtime/session/JobListing.java | 89 +++++++++++++ .../dev/arcp/runtime/session/SessionLoop.java | 119 ++---------------- .../IdempotencyFingerprintTest.java | 49 ++++++++ .../arcp/runtime/session/JobListingTest.java | 68 ++++++++++ 5 files changed, 269 insertions(+), 107 deletions(-) create mode 100644 arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java create mode 100644 arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java create mode 100644 arcp-runtime/src/test/java/dev/arcp/runtime/idempotency/IdempotencyFingerprintTest.java create mode 100644 arcp-runtime/src/test/java/dev/arcp/runtime/session/JobListingTest.java diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java b/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java new file mode 100644 index 0000000..dd96c57 --- /dev/null +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java @@ -0,0 +1,51 @@ +package dev.arcp.runtime.idempotency; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.databind.cfg.JsonNodeFeature; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import dev.arcp.core.messages.JobSubmit; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.HexFormat; + +/** + * §7.2 idempotency fingerprint: a collision-resistant SHA-256 over the semantically meaningful + * {@link JobSubmit} fields (agent, input, lease_request, lease_constraints, max_runtime_sec). + * + *

The serialization is canonical — both {@link java.util.Map} entries and {@code ObjectNode} + * properties are sorted — so two payloads that differ only in JSON object key order hash identically + * (#91). + */ +public final class IdempotencyFingerprint { + + private IdempotencyFingerprint() {} + + public static String of(ObjectMapper mapper, JobSubmit submit) { + try { + ObjectMapper canonical = mapper.copy(); + canonical.configure(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, true); + canonical.configure(JsonNodeFeature.WRITE_PROPERTIES_SORTED, true); + canonical.setNodeFactory(JsonNodeFactory.withExactBigDecimals(true)); + ObjectNode canon = canonical.createObjectNode(); + canon.put("agent", submit.agent().wire()); + canon.set("input", canonical.valueToTree(submit.input())); + if (submit.leaseRequest() != null) { + canon.set("lease_request", canonical.valueToTree(submit.leaseRequest())); + } + if (submit.leaseConstraints() != null) { + canon.set("lease_constraints", canonical.valueToTree(submit.leaseConstraints())); + } + if (submit.maxRuntimeSec() != null) { + canon.put("max_runtime_sec", submit.maxRuntimeSec()); + } + byte[] bytes = canonical.writerWithDefaultPrettyPrinter().writeValueAsBytes(canon); + MessageDigest md = MessageDigest.getInstance("SHA-256"); + return HexFormat.of().formatHex(md.digest(bytes)); + } catch (JsonProcessingException | NoSuchAlgorithmException e) { + throw new IllegalStateException("idempotency fingerprint failure", e); + } + } +} diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java new file mode 100644 index 0000000..b4773bc --- /dev/null +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java @@ -0,0 +1,89 @@ +package dev.arcp.runtime.session; + +import dev.arcp.core.auth.Principal; +import dev.arcp.core.messages.JobFilter; +import dev.arcp.core.messages.JobSummary; +import java.nio.charset.StandardCharsets; +import java.util.Base64; +import java.util.Collection; +import java.util.Comparator; +import java.util.List; +import org.jspecify.annotations.Nullable; + +/** + * §6.6 {@code session.list_jobs} filtering and paging. Filters a job registry by principal and the + * requested {@link JobFilter}, orders by a stable key (createdAt, then jobId), and slices the + * requested page — mapping only the page to {@link JobSummary} rather than every matching job (#83). + */ +public final class JobListing { + + private JobListing() {} + + /** A page of job summaries plus the opaque cursor for the next page (or {@code null}). */ + public record Page(List jobs, @Nullable String nextCursor) {} + + /** + * @throws IllegalArgumentException if {@code cursor} is non-blank but not a valid cursor + */ + public static Page page( + Collection jobs, + Principal principal, + JobFilter filter, + @Nullable Integer limit, + @Nullable String cursor) { + int startIndex = decodeCursor(cursor); + + List matching = + jobs.stream() + .filter(rec -> rec.principal().equals(principal)) + .filter(rec -> filter.status() == null || filter.status().contains(rec.status().wire())) + .filter( + rec -> + filter.agent() == null + || filter.agent().equals(rec.resolvedAgent()) + || filter.agent().equals(rec.resolvedAgent().split("@", 2)[0])) + .filter( + rec -> + filter.createdAfter() == null || rec.createdAt().isAfter(filter.createdAfter())) + .sorted( + Comparator.comparing((JobRecord rec) -> rec.createdAt()) + .thenComparing(rec -> rec.jobId().value())) + .toList(); + + int effectiveLimit = limit != null && limit > 0 ? limit : matching.size(); + int endIndex = Math.min(matching.size(), startIndex + effectiveLimit); + List page = + startIndex >= matching.size() + ? List.of() + : matching.subList(startIndex, endIndex).stream().map(JobListing::toSummary).toList(); + String nextCursor = endIndex < matching.size() ? encodeCursor(endIndex) : null; + return new Page(page, nextCursor); + } + + private static int decodeCursor(@Nullable String cursor) { + if (cursor == null || cursor.isBlank()) { + return 0; + } + byte[] decoded = Base64.getUrlDecoder().decode(cursor); + int startIndex = Integer.parseInt(new String(decoded, StandardCharsets.UTF_8)); + return Math.max(0, startIndex); + } + + private static String encodeCursor(int index) { + return Base64.getUrlEncoder() + .withoutPadding() + .encodeToString(Integer.toString(index).getBytes(StandardCharsets.UTF_8)); + } + + private static JobSummary toSummary(JobRecord rec) { + return new JobSummary( + rec.jobId(), + rec.resolvedAgent(), + rec.status().wire(), + rec.lease(), + null, + rec.createdAt(), + rec.traceId(), + rec.lastEventSeq()); + } +} diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java index f1dabe1..e2017dc 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java @@ -30,7 +30,6 @@ import dev.arcp.core.messages.JobSubmit; import dev.arcp.core.messages.JobSubscribe; import dev.arcp.core.messages.JobSubscribed; -import dev.arcp.core.messages.JobSummary; import dev.arcp.core.messages.JobUnsubscribe; import dev.arcp.core.messages.Message; import dev.arcp.core.messages.Messages; @@ -55,6 +54,7 @@ import dev.arcp.runtime.credentials.CredentialBinding; import dev.arcp.runtime.credentials.IssuedCredential; import dev.arcp.runtime.heartbeat.HeartbeatTracker; +import dev.arcp.runtime.idempotency.IdempotencyFingerprint; import dev.arcp.runtime.lease.BudgetCounters; import dev.arcp.runtime.lease.LeaseGuard; import java.math.BigDecimal; @@ -522,75 +522,20 @@ private void handleListJobs(Envelope envelope, SessionListJobs req) { if (!negotiated.contains(Feature.LIST_JOBS)) { return; } - MessageId requestId = envelope.id(); - - // Decode the cursor first so a bad cursor fails fast without scanning the registry. The error - // echoes the request envelope id so the client correlates it to this list call, not an - // unrelated in-flight submit (#102). - int startIndex = 0; - if (req.cursor() != null && !req.cursor().isBlank()) { - try { - byte[] decoded = java.util.Base64.getUrlDecoder().decode(req.cursor()); - startIndex = Integer.parseInt(new String(decoded, java.nio.charset.StandardCharsets.UTF_8)); - if (startIndex < 0) { - startIndex = 0; - } - } catch (IllegalArgumentException e) { - sendJobErrorTopLevel(envelope, ErrorCode.INVALID_REQUEST, "invalid list_jobs cursor"); - return; - } - } - JobFilter filter = req.filter() != null ? req.filter() : JobFilter.all(); - // Filter and sort matching records, but only materialize JobSummary for the requested page so a - // small limit does not map every matching job (#83). Ordering and cursor use the same stable key - // (createdAt, then jobId). - List matching = - runtime.jobs().stream() - .filter(rec -> rec.principal().equals(principal)) - .filter(rec -> filter.status() == null || filter.status().contains(rec.status().wire())) - .filter( - rec -> - filter.agent() == null - || filter.agent().equals(rec.resolvedAgent()) - || filter.agent().equals(rec.resolvedAgent().split("@", 2)[0])) - .filter( - rec -> - filter.createdAfter() == null || rec.createdAt().isAfter(filter.createdAfter())) - .sorted( - java.util.Comparator.comparing((JobRecord rec) -> rec.createdAt()) - .thenComparing(rec -> rec.jobId().value())) - .toList(); - - int limit = req.limit() != null && req.limit() > 0 ? req.limit() : matching.size(); - int endIndex = Math.min(matching.size(), startIndex + limit); - List page = - startIndex >= matching.size() - ? List.of() - : matching.subList(startIndex, endIndex).stream().map(SessionLoop::toSummary).toList(); - String nextCursor = - endIndex < matching.size() - ? java.util.Base64.getUrlEncoder() - .withoutPadding() - .encodeToString( - Integer.toString(endIndex).getBytes(java.nio.charset.StandardCharsets.UTF_8)) - : null; - SessionJobs response = new SessionJobs(requestId, page, nextCursor); + JobListing.Page page; + try { + page = JobListing.page(runtime.jobs(), principal, filter, req.limit(), req.cursor()); + } catch (IllegalArgumentException e) { + // The error echoes the request envelope id so the client correlates it to this list call, + // not an unrelated in-flight submit (#102). + sendJobErrorTopLevel(envelope, ErrorCode.INVALID_REQUEST, "invalid list_jobs cursor"); + return; + } + SessionJobs response = new SessionJobs(envelope.id(), page.jobs(), page.nextCursor()); send(Message.Type.SESSION_JOBS, response, sessionId, null, null, null); } - private static JobSummary toSummary(JobRecord rec) { - return new JobSummary( - rec.jobId(), - rec.resolvedAgent(), - rec.status().wire(), - rec.lease(), - null, - rec.createdAt(), - rec.traceId(), - rec.lastEventSeq()); - } - private void handleSubmit(Envelope envelope, JobSubmit submit) { Principal pr = principal; if (pr == null) { @@ -611,7 +556,7 @@ private void handleSubmit(Envelope envelope, JobSubmit submit) { // §7.2: idempotency. Identical (principal, key, full-submit fingerprint) returns prior job_id. String idempotencyKey = submit.idempotencyKey(); if (idempotencyKey != null) { - String fingerprint = idempotencyFingerprint(submit); + String fingerprint = IdempotencyFingerprint.of(mapper, submit); JobId fresh = JobId.generate(); var conflict = runtime.idempotency().claim(pr, idempotencyKey, fingerprint, fresh); if (conflict != null) { @@ -653,46 +598,6 @@ private void releaseIdempotency(Principal pr, @Nullable String idempotencyKey, J } } - /** - * Build a collision-resistant fingerprint over the semantically meaningful {@link JobSubmit} - * fields: agent, input, lease_request, lease_constraints, max_runtime_sec. The serialization is - * canonical (alphabetically sorted keys) so that semantically equal payloads always produce the - * same fingerprint and the comparison in {@link - * dev.arcp.runtime.idempotency.IdempotencyStore#matchesPayload} is robust to key-order changes. - */ - private String idempotencyFingerprint(JobSubmit submit) { - try { - ObjectMapper canonical = mapper.copy(); - canonical.configure( - com.fasterxml.jackson.databind.SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, true); - // ORDER_MAP_ENTRIES_BY_KEYS sorts java.util.Map only; submit.input() is a JsonNode/ObjectNode - // whose properties must also be sorted so semantically equal payloads with differing key order - // hash identically (§7.2, #91). - canonical.configure( - com.fasterxml.jackson.databind.cfg.JsonNodeFeature.WRITE_PROPERTIES_SORTED, true); - canonical.setNodeFactory( - com.fasterxml.jackson.databind.node.JsonNodeFactory.withExactBigDecimals(true)); - ObjectNode canon = canonical.createObjectNode(); - canon.put("agent", submit.agent().wire()); - canon.set("input", canonical.valueToTree(submit.input())); - if (submit.leaseRequest() != null) { - canon.set("lease_request", canonical.valueToTree(submit.leaseRequest())); - } - if (submit.leaseConstraints() != null) { - canon.set("lease_constraints", canonical.valueToTree(submit.leaseConstraints())); - } - if (submit.maxRuntimeSec() != null) { - canon.put("max_runtime_sec", submit.maxRuntimeSec()); - } - byte[] bytes = canonical.writerWithDefaultPrettyPrinter().writeValueAsBytes(canon); - java.security.MessageDigest md = java.security.MessageDigest.getInstance("SHA-256"); - return java.util.HexFormat.of().formatHex(md.digest(bytes)); - } catch (com.fasterxml.jackson.core.JsonProcessingException - | java.security.NoSuchAlgorithmException e) { - throw new IllegalStateException("idempotency fingerprint failure", e); - } - } - private void emitReplayAccepted(JobRecord prior, @Nullable TraceId traceId) { // §7.2: replay the budget captured at the original acceptance, not the live (decremented) // counters, so an identical retry returns the same job.accepted payload (#79). diff --git a/arcp-runtime/src/test/java/dev/arcp/runtime/idempotency/IdempotencyFingerprintTest.java b/arcp-runtime/src/test/java/dev/arcp/runtime/idempotency/IdempotencyFingerprintTest.java new file mode 100644 index 0000000..f40ab1c --- /dev/null +++ b/arcp-runtime/src/test/java/dev/arcp/runtime/idempotency/IdempotencyFingerprintTest.java @@ -0,0 +1,49 @@ +package dev.arcp.runtime.idempotency; + +import static org.assertj.core.api.Assertions.assertThat; + +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import dev.arcp.core.agents.AgentRef; +import dev.arcp.core.messages.JobSubmit; +import dev.arcp.core.wire.ArcpMapper; +import org.junit.jupiter.api.Test; + +class IdempotencyFingerprintTest { + + @Test + void keyOrderDoesNotAffectFingerprint() { + ObjectNode a = JsonNodeFactory.instance.objectNode(); + a.put("b", 1); + a.put("a", 2); + ObjectNode b = JsonNodeFactory.instance.objectNode(); + b.put("a", 2); + b.put("b", 1); + JobSubmit s1 = new JobSubmit(AgentRef.parse("echo@1.0.0"), a, null, null, "k", null); + JobSubmit s2 = new JobSubmit(AgentRef.parse("echo@1.0.0"), b, null, null, "k", null); + assertThat(IdempotencyFingerprint.of(ArcpMapper.shared(), s1)) + .isEqualTo(IdempotencyFingerprint.of(ArcpMapper.shared(), s2)); + } + + @Test + void differentInputsDifferentFingerprint() { + JobSubmit s1 = + new JobSubmit( + AgentRef.parse("echo@1.0.0"), + JsonNodeFactory.instance.objectNode().put("x", 1), + null, + null, + "k", + null); + JobSubmit s2 = + new JobSubmit( + AgentRef.parse("echo@1.0.0"), + JsonNodeFactory.instance.objectNode().put("x", 2), + null, + null, + "k", + null); + assertThat(IdempotencyFingerprint.of(ArcpMapper.shared(), s1)) + .isNotEqualTo(IdempotencyFingerprint.of(ArcpMapper.shared(), s2)); + } +} diff --git a/arcp-runtime/src/test/java/dev/arcp/runtime/session/JobListingTest.java b/arcp-runtime/src/test/java/dev/arcp/runtime/session/JobListingTest.java new file mode 100644 index 0000000..b77169a --- /dev/null +++ b/arcp-runtime/src/test/java/dev/arcp/runtime/session/JobListingTest.java @@ -0,0 +1,68 @@ +package dev.arcp.runtime.session; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import dev.arcp.core.auth.Principal; +import dev.arcp.core.ids.JobId; +import dev.arcp.core.lease.Lease; +import dev.arcp.core.lease.LeaseConstraints; +import dev.arcp.core.messages.JobFilter; +import dev.arcp.runtime.lease.BudgetCounters; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Test; + +class JobListingTest { + + private static final Principal ALICE = new Principal("alice"); + + @Test + void pagesAndProducesContinuationCursor() { + List jobs = records(ALICE, 5); + JobListing.Page page1 = JobListing.page(jobs, ALICE, JobFilter.all(), 2, null); + assertThat(page1.jobs()).hasSize(2); + assertThat(page1.nextCursor()).isNotNull(); + + JobListing.Page page2 = JobListing.page(jobs, ALICE, JobFilter.all(), 2, page1.nextCursor()); + assertThat(page2.jobs()).hasSize(2); + JobListing.Page page3 = JobListing.page(jobs, ALICE, JobFilter.all(), 2, page2.nextCursor()); + assertThat(page3.jobs()).hasSize(1); + assertThat(page3.nextCursor()).isNull(); + } + + @Test + void filtersByPrincipal() { + List jobs = records(ALICE, 2); + jobs.addAll(records(new Principal("bob"), 3)); + JobListing.Page page = JobListing.page(jobs, ALICE, JobFilter.all(), null, null); + assertThat(page.jobs()).hasSize(2); + } + + @Test + void badCursorThrows() { + assertThatThrownBy( + () -> JobListing.page(records(ALICE, 1), ALICE, JobFilter.all(), 10, "#not-base64")) + .isInstanceOf(IllegalArgumentException.class); + } + + private static List records(Principal principal, int count) { + List out = new ArrayList<>(); + Instant base = Instant.parse("2026-05-21T12:00:00Z"); + for (int i = 0; i < count; i++) { + out.add( + new JobRecord( + JobId.of("job_" + principal.id() + "_" + i), + "echo@1.0.0", + principal, + Lease.empty(), + LeaseConstraints.none(), + new BudgetCounters(Map.of()), + base.plusSeconds(i), + null)); + } + return out; + } +} From 6c32e7dc898b81582dab36433be31fca587c78e1 Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 22:55:36 -0400 Subject: [PATCH 5/9] style: apply spotless formatting CI's spotless:check gate (JDK 21) flagged formatting drift in the audit-fix commits, which were verified locally with -Darcp.skip.spotless=true on JDK 25. Co-Authored-By: Claude Fable 5 --- .../arcp/core/credentials/CredentialScheme.java | 6 +++--- .../java/dev/arcp/core/lease/LeaseSubset.java | 13 ++++++++----- .../middleware/jakarta/ArcpJakartaAdapter.java | 3 ++- .../middleware/jakarta/ArcpJakartaEndpoint.java | 3 +-- .../spring/ArcpSpringBootAutoConfiguration.java | 4 +++- .../runtime/credentials/CredentialBinding.java | 6 +++--- .../idempotency/IdempotencyFingerprint.java | 4 ++-- .../dev/arcp/runtime/session/JobListing.java | 3 ++- .../dev/arcp/runtime/session/SessionLoop.java | 16 +++++++++------- .../dev/arcp/runtime/SessionLoopAuditTest.java | 8 ++++++-- .../java/dev/arcp/runtime/SessionResumeTest.java | 3 ++- 11 files changed, 41 insertions(+), 28 deletions(-) diff --git a/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java b/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java index bf1501a..8a6ff0b 100644 --- a/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java +++ b/arcp-core/src/main/java/dev/arcp/core/credentials/CredentialScheme.java @@ -10,9 +10,9 @@ public enum CredentialScheme { /** * §9.8.1: implementations MAY define extension schemes (e.g. {@code basic}, {@code signed_url}); * unknown schemes MUST be ignored by clients that do not recognize them. Decoding an unrecognized - * scheme yields {@code UNKNOWN} rather than throwing, so a single extension credential cannot fail - * decode of an entire {@code job.accepted} (see #97). Consumers filter credentials whose scheme is - * {@code UNKNOWN}. + * scheme yields {@code UNKNOWN} rather than throwing, so a single extension credential cannot + * fail decode of an entire {@code job.accepted} (see #97). Consumers filter credentials whose + * scheme is {@code UNKNOWN}. */ UNKNOWN("unknown"); diff --git a/arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java b/arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java index 2d39502..993e382 100644 --- a/arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java +++ b/arcp-core/src/main/java/dev/arcp/core/lease/LeaseSubset.java @@ -8,10 +8,10 @@ import org.jspecify.annotations.Nullable; /** - * §9.4 / §10 delegation subset enforcement. A delegated (child) lease MUST be a strict subset of its - * parent: every capability pattern covered by the parent, every {@code cost.budget} amount no - * greater than the parent's remaining amount, every {@code model.use} pattern covered, and an {@code - * expires_at} no later than the parent's. Violations are rejected with {@code + * §9.4 / §10 delegation subset enforcement. A delegated (child) lease MUST be a strict subset of + * its parent: every capability pattern covered by the parent, every {@code cost.budget} amount no + * greater than the parent's remaining amount, every {@code model.use} pattern covered, and an + * {@code expires_at} no later than the parent's. Violations are rejected with {@code * LEASE_SUBSET_VIOLATION}. * *

The numeric/temporal comparisons here intentionally do not reuse the glob {@code covers()} @@ -51,7 +51,10 @@ public static void validate( Lease childSlice = new Lease(Map.of(namespace, entry.getValue())); if (!parent.contains(childSlice)) { throw new LeaseSubsetViolationException( - "delegated patterns exceed parent for namespace " + namespace + ": " + entry.getValue()); + "delegated patterns exceed parent for namespace " + + namespace + + ": " + + entry.getValue()); } } validateExpiry(parentExpiresAt, childExpiresAt); diff --git a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java index aca5d7a..2055712 100644 --- a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java +++ b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaAdapter.java @@ -76,7 +76,8 @@ public void modifyHandshake( return; } List hosts = request.getHeaders().get("Host"); - boolean allowed = hosts != null && !hosts.isEmpty() && allowedHosts.contains(hosts.get(0)); + boolean allowed = + hosts != null && !hosts.isEmpty() && allowedHosts.contains(hosts.get(0)); hostRejected.set(!allowed); } }; diff --git a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java index 4874f81..ec23d3c 100644 --- a/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java +++ b/arcp-middleware-jakarta/src/main/java/dev/arcp/middleware/jakarta/ArcpJakartaEndpoint.java @@ -40,8 +40,7 @@ public void onOpen(Session session, EndpointConfig config) { // §14: the Host was not on the allowlist; close before the runtime ever sees the session, so // a client that ignores Sec-WebSocket-Accept validation cannot talk to the runtime (#100). try { - session.close( - new CloseReason(CloseReason.CloseCodes.VIOLATED_POLICY, "host not allowed")); + session.close(new CloseReason(CloseReason.CloseCodes.VIOLATED_POLICY, "host not allowed")); } catch (java.io.IOException ignored) { // best-effort close } diff --git a/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java b/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java index d5b2933..716a1a4 100644 --- a/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java +++ b/arcp-middleware-spring-boot/src/main/java/dev/arcp/middleware/spring/ArcpSpringBootAutoConfiguration.java @@ -42,7 +42,9 @@ public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) { ? new String[] {"*"} : properties.getAllowedOrigins().toArray(new String[0]); var registration = - registry.addHandler(arcpWebSocketHandler(), properties.getPath()).setAllowedOrigins(origins); + registry + .addHandler(arcpWebSocketHandler(), properties.getPath()) + .setAllowedOrigins(origins); // §14: enforce the Host allowlist before the upgrade completes so a disallowed Host is rejected // (403) rather than being a silently ignored security control (#99). List allowedHosts = properties.getAllowedHosts(); diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java b/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java index 890e975..4783788 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/credentials/CredentialBinding.java @@ -82,9 +82,9 @@ public void revokeAll(JobRecord record) { } /** - * Record then revoke a credential that was minted upstream but not attached to a job (e.g. surplus - * credentials returned during rotation), so its spend authority is tracked and released rather - * than dangling (§14, #98). + * Record then revoke a credential that was minted upstream but not attached to a job (e.g. + * surplus credentials returned during rotation), so its spend authority is tracked and released + * rather than dangling (§14, #98). */ public void revokeMinted(IssuedCredential credential) { store.record( diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java b/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java index dd96c57..b293cde 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/idempotency/IdempotencyFingerprint.java @@ -16,8 +16,8 @@ * {@link JobSubmit} fields (agent, input, lease_request, lease_constraints, max_runtime_sec). * *

The serialization is canonical — both {@link java.util.Map} entries and {@code ObjectNode} - * properties are sorted — so two payloads that differ only in JSON object key order hash identically - * (#91). + * properties are sorted — so two payloads that differ only in JSON object key order hash + * identically (#91). */ public final class IdempotencyFingerprint { diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java index b4773bc..8e795bc 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java @@ -13,7 +13,8 @@ /** * §6.6 {@code session.list_jobs} filtering and paging. Filters a job registry by principal and the * requested {@link JobFilter}, orders by a stable key (createdAt, then jobId), and slices the - * requested page — mapping only the page to {@link JobSummary} rather than every matching job (#83). + * requested page — mapping only the page to {@link JobSummary} rather than every matching job + * (#83). */ public final class JobListing { diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java index e2017dc..a9de4af 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java @@ -218,8 +218,7 @@ private void park(String reason) { parkExpiry = runtime .scheduler() - .schedule( - () -> expirePark(token), runtime.resumeWindowSec(), TimeUnit.SECONDS); + .schedule(() -> expirePark(token), runtime.resumeWindowSec(), TimeUnit.SECONDS); } catch (java.util.concurrent.RejectedExecutionException e) { expirePark(token); } @@ -413,7 +412,9 @@ private void doHandshake(SessionHello hello) { } } - /** §6.3: reattach this parked session to {@code incoming}'s transport and replay missed events. */ + /** + * §6.3: reattach this parked session to {@code incoming}'s transport and replay missed events. + */ private void resumeOnto(SessionLoop incoming, @Nullable Long lastEventSeq) { ScheduledFuture pe = parkExpiry; if (pe != null) { @@ -748,8 +749,7 @@ private void terminateTimedOutJob(JobRecord record) { if (w != null) { w.cancel(true); } - emitJobError( - record, JobError.TIMED_OUT, ErrorCode.TIMEOUT, "job exceeded max_runtime_sec"); + emitJobError(record, JobError.TIMED_OUT, ErrorCode.TIMEOUT, "job exceeded max_runtime_sec"); credentialBinding.revokeAll(record); } } @@ -890,7 +890,8 @@ public void rotateCredential(CredentialId id, String newValue) { credentialBinding.revokeAll(record); } } catch (dev.arcp.core.error.LeaseExpiredException e) { - // §7.3: only emit a terminal error if this thread wins the transition; otherwise a watchdog or + // §7.3: only emit a terminal error if this thread wins the transition; otherwise a watchdog + // or // cancel already produced the single terminal message (#92). if (record.transitionTo(JobRecord.Status.ERROR)) { emitJobError(record, JobError.ERROR, ErrorCode.LEASE_EXPIRED, e.getMessage()); @@ -1033,7 +1034,8 @@ private void handleSubscribe(Envelope envelope, JobSubscribe sub) { if (wantHistory) { // §8.1/§8.3/§14: re-envelope each replayed event with this (subscriber) session's session_id - // and a freshly allocated event_seq, and never echo credential material to a subscriber (#94). + // and a freshly allocated event_seq, and never echo credential material to a subscriber + // (#94). boolean isOwner = ownedJobs.contains(rec.jobId()); for (JobRecord.RecordedEvent replay : rec.eventsSince(subscribedFrom)) { JobEvent event = replay.event(); diff --git a/arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java index 0291811..c5c5a16 100644 --- a/arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java +++ b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionLoopAuditTest.java @@ -179,7 +179,8 @@ void budgetRemainingGaugeDoesNotDecrementBudget() throws Exception { "gauge", "1.0.0", (input, ctx) -> { - ctx.emit(new MetricEvent("cost.budget.remaining", new BigDecimal("4"), "usd", null)); + ctx.emit( + new MetricEvent("cost.budget.remaining", new BigDecimal("4"), "usd", null)); ctx.emit(new MetricEvent("cost.budget.remaining", null, "usd", null)); ctx.authorize("fs.read", "/workspace/x"); return JobOutcome.Success.inline(input.payload()); @@ -192,7 +193,10 @@ void budgetRemainingGaugeDoesNotDecrementBudget() throws Exception { new JobSubmit( AgentRef.parse("gauge@1.0.0"), JsonNodeFactory.instance.objectNode(), - Lease.builder().allow("fs.read", "/workspace/**").allow("cost.budget", "usd:5").build(), + Lease.builder() + .allow("fs.read", "/workspace/**") + .allow("cost.budget", "usd:5") + .build(), null, null, null)); diff --git a/arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java index 6a13036..8a3e992 100644 --- a/arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java +++ b/arcp-runtime/src/test/java/dev/arcp/runtime/SessionResumeTest.java @@ -100,7 +100,8 @@ void resumeReattachesReplaysAndContinues() throws Exception { runtime.accept(pair2.runtime()); sendHello(pair2.client(), resumeToken, lastSeq); - SessionWelcome resumed = probe2.takeMessage(Message.Type.SESSION_WELCOME, SessionWelcome.class); + SessionWelcome resumed = + probe2.takeMessage(Message.Type.SESSION_WELCOME, SessionWelcome.class); assertThat(resumed.resumeToken()).isEqualTo(resumeToken); // Replay: the missed event (e2) is delivered on the new transport. From c3d7235a69796da698aa9ad690de77455cd3e8c9 Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 22:55:48 -0400 Subject: [PATCH 6/9] fix(client): fail connect() fast on resume rejection; fix resume demos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resume semantics introduced for #22 broke the resume example and the stream-resume recipe in two ways: - ArcpClient dropped a top-level RESUME_WINDOW_EXPIRED error that arrives before session.welcome as uncorrelated, leaving connect() to hang until its timeout. The handshake is now rejected with the decoded error so callers can fall back to a fresh session (and the in-memory transport never completes the client's inbound stream on a runtime-side close, so the existing transport-closed fallback could not catch this). - examples/resume and recipes/stream-resume closed the first client gracefully, which cancels the session (§6.7) and invalidates its resume token. They now authenticate with a stable bearer principal, drop the transport without session.close so the runtime parks the session for the resume window, and await the parked phase before reconnecting. Adds a ClientAuditTest regression test asserting connect() surfaces RESUME_WINDOW_EXPIRED promptly for an unknown token. Co-Authored-By: Claude Fable 5 --- .../main/java/dev/arcp/client/ArcpClient.java | 12 +++-- .../java/dev/arcp/client/ClientAuditTest.java | 25 +++++++++ .../java/dev/arcp/examples/resume/Main.java | 52 ++++++++++++------- .../dev/arcp/recipes/streamresume/Main.java | 48 +++++++++++------ 4 files changed, 100 insertions(+), 37 deletions(-) diff --git a/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java b/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java index 54a2869..7a7587b 100644 --- a/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java +++ b/arcp-client/src/main/java/dev/arcp/client/ArcpClient.java @@ -178,9 +178,9 @@ public JobHandle submit(JobSubmit submit) { * Blocking submit. Returns once the runtime acknowledges with {@code job.accepted} (or fails on * rejection). Bounded by the configured submit timeout so it can never block forever (#106). * - *

Must not be called from a dispatch/result callback (i.e. the transport inbound thread); doing - * so would deadlock because the acknowledgement is delivered by that same thread. Such a call - * fails fast with {@link IllegalStateException}. + *

Must not be called from a dispatch/result callback (i.e. the transport inbound thread); + * doing so would deadlock because the acknowledgement is delivered by that same thread. Such a + * call fails fast with {@link IllegalStateException}. */ public JobHandle submit(JobSubmit submit, @Nullable TraceId traceId) { if (Boolean.TRUE.equals(inDispatch.get())) { @@ -628,6 +628,12 @@ private void handleError(Envelope envelope, JobError err) { match.outstanding().handleFuture.completeExceptionally(ex); return; } + // A top-level error before session.welcome rejects the handshake itself (e.g. + // RESUME_WINDOW_EXPIRED for an unknown/expired resume token, §6.3); fail connect() so the + // caller can fall back to a fresh session instead of timing out. + if (sessionFuture.completeExceptionally(ex)) { + return; + } log.warn("dropping uncorrelated top-level error {}: {}", err.code(), err.message()); } diff --git a/arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java b/arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java index df58cf4..d8ee92f 100644 --- a/arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java +++ b/arcp-client/src/test/java/dev/arcp/client/ClientAuditTest.java @@ -177,4 +177,29 @@ public void onComplete() { } runtime.close(); } + + @Test + void connectWithInvalidResumeTokenFailsFast() throws Exception { + ArcpRuntime runtime = + ArcpRuntime.builder() + .agent("echo", "1.0.0", (input, ctx) -> JobOutcome.Success.inline(input.payload())) + .build(); + MemoryTransport.Pair pair = MemoryTransport.pair(); + runtime.accept(pair.runtime()); + try (ArcpClient client = + ArcpClient.builder(pair.client()) + .bearer("resume-principal") + .resumeToken("tok_unknown") + .lastEventSeq(0L) + .build()) { + // §6.3: an unknown/expired resume token rejects the handshake with RESUME_WINDOW_EXPIRED. + // connect() surfaces the top-level error instead of timing out, so the caller can fall back + // to a fresh session. + assertThatThrownBy(() -> client.connect(Duration.ofSeconds(5))) + .isInstanceOfSatisfying( + ArcpException.class, + e -> assertThat(e.code()).isEqualTo(ErrorCode.RESUME_WINDOW_EXPIRED)); + } + runtime.close(); + } } diff --git a/examples/resume/src/main/java/dev/arcp/examples/resume/Main.java b/examples/resume/src/main/java/dev/arcp/examples/resume/Main.java index d3209f2..ffb2ee9 100644 --- a/examples/resume/src/main/java/dev/arcp/examples/resume/Main.java +++ b/examples/resume/src/main/java/dev/arcp/examples/resume/Main.java @@ -3,18 +3,22 @@ import com.fasterxml.jackson.databind.node.JsonNodeFactory; import dev.arcp.client.ArcpClient; import dev.arcp.client.JobHandle; -import dev.arcp.client.Session; import dev.arcp.core.messages.JobResult; import dev.arcp.core.transport.MemoryTransport; import dev.arcp.runtime.ArcpRuntime; import dev.arcp.runtime.agent.JobOutcome; +import dev.arcp.runtime.session.SessionLoop; import java.time.Duration; import java.util.concurrent.TimeUnit; /** - * Demonstrates session resume: after a first connection completes a job, the resume token and last - * seen sequence are captured; a second client reconnects using those values and submits another - * job. + * Demonstrates session resume (§6.3): after an unexpected transport drop the runtime parks the + * session for the resume window; a second connection presenting the captured {@code resume_token} + * reattaches to the same logical session and continues where it left off. + * + *

Resume requires a stable principal across connections, so both clients authenticate with the + * same bearer token. An explicit {@code session.close} cancels the session instead of parking it, + * so the first connection is dropped at the transport level rather than closed gracefully. */ public final class Main { public static void main(String[] args) throws Exception { @@ -25,12 +29,12 @@ public static void main(String[] args) throws Exception { // First connection: submit a job and capture resume state. MemoryTransport.Pair pair1 = MemoryTransport.pair(); - runtime.accept(pair1.runtime()); + SessionLoop serverSession = runtime.accept(pair1.runtime()); - String resumeToken = null; - long lastSeq = 0L; - - try (ArcpClient client1 = ArcpClient.builder(pair1.client()).build()) { + ArcpClient client1 = ArcpClient.builder(pair1.client()).bearer("resume-example").build(); + String resumeToken; + long lastSeq; + try { client1.connect(Duration.ofSeconds(5)); JobHandle handle = client1.submit( @@ -38,21 +42,33 @@ public static void main(String[] args) throws Exception { "echo@1.0.0", JsonNodeFactory.instance.objectNode().put("pass", 1))); handle.result().get(5, TimeUnit.SECONDS); - Session session = client1.session(); - resumeToken = session.resumeToken(); + resumeToken = client1.session().resumeToken(); lastSeq = client1.lastSeenSeq(); + + // Simulate an unexpected transport drop: no session.close is sent, so the runtime parks the + // session for the resume window instead of cancelling it. + pair1.runtime().close(); + } finally { + client1.close(); + } + + // The in-memory transport delivers the drop asynchronously; wait until the runtime has parked + // the session. Over a real network the reconnect delay dwarfs this detection time. + long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(5); + while (serverSession.phase() != SessionLoop.Phase.PARKED && System.nanoTime() < deadline) { + Thread.sleep(10); } - // Second connection: optionally resume from captured token. + // Second connection: present the captured token to reattach to the parked session. MemoryTransport.Pair pair2 = MemoryTransport.pair(); runtime.accept(pair2.runtime()); - ArcpClient.Builder builder2 = ArcpClient.builder(pair2.client()); - if (resumeToken != null) { - builder2 = builder2.resumeToken(resumeToken).lastEventSeq(lastSeq); - } - - try (ArcpClient client2 = builder2.build()) { + try (ArcpClient client2 = + ArcpClient.builder(pair2.client()) + .bearer("resume-example") + .resumeToken(resumeToken) + .lastEventSeq(lastSeq) + .build()) { client2.connect(Duration.ofSeconds(5)); JobHandle handle = client2.submit( diff --git a/recipes/stream-resume/src/main/java/dev/arcp/recipes/streamresume/Main.java b/recipes/stream-resume/src/main/java/dev/arcp/recipes/streamresume/Main.java index f43bf30..8e16b4d 100644 --- a/recipes/stream-resume/src/main/java/dev/arcp/recipes/streamresume/Main.java +++ b/recipes/stream-resume/src/main/java/dev/arcp/recipes/streamresume/Main.java @@ -3,7 +3,6 @@ import com.fasterxml.jackson.databind.node.JsonNodeFactory; import dev.arcp.client.ArcpClient; import dev.arcp.client.JobHandle; -import dev.arcp.client.Session; import dev.arcp.client.SubscribeOptions; import dev.arcp.core.events.EventBody; import dev.arcp.core.events.StatusEvent; @@ -11,6 +10,7 @@ import dev.arcp.core.transport.MemoryTransport; import dev.arcp.runtime.ArcpRuntime; import dev.arcp.runtime.agent.JobOutcome; +import dev.arcp.runtime.session.SessionLoop; import java.time.Duration; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Flow; @@ -21,8 +21,11 @@ * Recipe: Stream resume across connections. * *

Client 1 submits a streaming job (emits 10 {@link StatusEvent}s) and awaits completion, - * recording the resume token and last-seen sequence. Client 2 reconnects using those values and - * re-subscribes to replay the event history, demonstrating fault-tolerant stream consumption. + * recording the resume token and last-seen sequence. Its transport then drops unexpectedly (no + * {@code session.close}), so the runtime parks the session for the resume window (§6.3). Client 2 + * reconnects with the recorded token and re-subscribes with history to replay the event stream, + * demonstrating fault-tolerant stream consumption. Both connections authenticate with the same + * bearer token: resume requires a stable principal. */ public final class Main { public static void main(String[] args) throws Exception { @@ -41,15 +44,16 @@ public static void main(String[] args) throws Exception { // --- Client 1: live consumption. --- MemoryTransport.Pair pair1 = MemoryTransport.pair(); - runtime.accept(pair1.runtime()); + SessionLoop serverSession = runtime.accept(pair1.runtime()); - String resumeToken = null; - long lastSeq = 0L; + String resumeToken; + long lastSeq; JobId jobId; AtomicInteger firstPassCount = new AtomicInteger(); CompletableFuture firstPassDone = new CompletableFuture<>(); - try (ArcpClient client1 = ArcpClient.builder(pair1.client()).bearer("demo").build()) { + ArcpClient client1 = ArcpClient.builder(pair1.client()).bearer("demo").build(); + try { client1.connect(Duration.ofSeconds(5)); JobHandle handle = @@ -87,26 +91,38 @@ public void onComplete() {} firstPassDone.get(5, TimeUnit.SECONDS); jobId = handle.jobId(); - Session session = client1.session(); - resumeToken = session.resumeToken(); + resumeToken = client1.session().resumeToken(); lastSeq = client1.lastSeenSeq(); + + // Simulate an unexpected transport drop: no session.close is sent, so the runtime parks the + // session for the resume window instead of cancelling it. + pair1.runtime().close(); + } finally { + client1.close(); + } + + // The in-memory transport delivers the drop asynchronously; wait until the runtime has parked + // the session. Over a real network the reconnect delay dwarfs this detection time. + long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(5); + while (serverSession.phase() != SessionLoop.Phase.PARKED && System.nanoTime() < deadline) { + Thread.sleep(10); } int firstCount = firstPassCount.get(); - // --- Client 2: replay via history subscription. --- + // --- Client 2: resume the parked session and replay via history subscription. --- MemoryTransport.Pair pair2 = MemoryTransport.pair(); runtime.accept(pair2.runtime()); AtomicInteger replayCount = new AtomicInteger(); CompletableFuture replayDone = new CompletableFuture<>(); - ArcpClient.Builder builder2 = ArcpClient.builder(pair2.client()).bearer("demo"); - if (resumeToken != null) { - builder2 = builder2.resumeToken(resumeToken).lastEventSeq(lastSeq); - } - - try (ArcpClient client2 = builder2.build()) { + try (ArcpClient client2 = + ArcpClient.builder(pair2.client()) + .bearer("demo") + .resumeToken(resumeToken) + .lastEventSeq(lastSeq) + .build()) { client2.connect(Duration.ofSeconds(5)); Flow.Publisher replayEvents = From 95e646eb4dbef293ae2232799141caa591341108 Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 22:58:07 -0400 Subject: [PATCH 7/9] refactor(runtime): wrap malformed list_jobs cursors in a stable error Integer.parseInt on a valid-Base64 but non-numeric cursor threw a raw NumberFormatException whose parser message was echoed to the client in INVALID_REQUEST. Both decode failures now surface as a uniform "invalid cursor" IllegalArgumentException. Addresses the code-quality review finding on decodeCursor. Co-Authored-By: Claude Fable 5 --- .../java/dev/arcp/runtime/session/JobListing.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java index 8e795bc..345a2d1 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/JobListing.java @@ -65,9 +65,14 @@ private static int decodeCursor(@Nullable String cursor) { if (cursor == null || cursor.isBlank()) { return 0; } - byte[] decoded = Base64.getUrlDecoder().decode(cursor); - int startIndex = Integer.parseInt(new String(decoded, StandardCharsets.UTF_8)); - return Math.max(0, startIndex); + try { + byte[] decoded = Base64.getUrlDecoder().decode(cursor); + return Math.max(0, Integer.parseInt(new String(decoded, StandardCharsets.UTF_8))); + } catch (IllegalArgumentException e) { + // Covers malformed Base64 and non-numeric content (NumberFormatException): surface a stable + // message instead of the parser's, since the caller echoes it in INVALID_REQUEST. + throw new IllegalArgumentException("invalid cursor", e); + } } private static String encodeCursor(int index) { From 5e0e13dfc6139b5461b63b65efd4be5c2a404eee Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 23:07:54 -0400 Subject: [PATCH 8/9] ci: build modules before javadoc so reactor deps resolve fresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit javadoc:javadoc alone does not compile, so arcp-runtime's javadoc resolved arcp-core from the runner's cached SNAPSHOT — which lacks classes a PR adds upstream (here JobCancelled/SessionClosed from #95, #96). Run package in the same invocation so inter-module dependencies come from the reactor. Verified against a cold local repository. Co-Authored-By: Claude Fable 5 --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 90e390b..da80a32 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,7 +146,11 @@ jobs: java-version: "21" cache: maven - name: Build javadoc for all published modules + # `package` before javadoc:javadoc so inter-module dependencies resolve + # from this reactor's output instead of a (possibly stale) cached + # SNAPSHOT in the runner's local repository — otherwise javadoc cannot + # see classes a PR adds to an upstream module like arcp-core. run: | mvn -B -ntp -am -DskipTests -Darcp.skip.spotless=true \ -pl arcp-core,arcp-client,arcp-runtime,arcp,arcp-otel,arcp-runtime-jetty,arcp-middleware-jakarta,arcp-middleware-spring-boot,arcp-middleware-vertx,arcp-tck \ - javadoc:javadoc + package javadoc:javadoc From 5ef793a42360cf8d33b97d2df8a12a04a0c0cbea Mon Sep 17 00:00:00 2001 From: Nick Ficano Date: Thu, 11 Jun 2026 23:15:31 -0400 Subject: [PATCH 9/9] Potential fix for pull request finding 'Unread local variable' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- .../src/main/java/dev/arcp/runtime/session/SessionLoop.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java index a9de4af..6a3e0d8 100644 --- a/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java +++ b/arcp-runtime/src/main/java/dev/arcp/runtime/session/SessionLoop.java @@ -332,7 +332,7 @@ private void handle(Envelope envelope) { case JobSubscribe sub -> handleSubscribe(envelope, sub); case JobUnsubscribe unsub -> handleUnsubscribe(unsub); case SessionClosed ignored -> log.warn("client-only message received: {}", m); - case JobCancelled ignored -> log.warn("client-only message received: {}", m); + case JobCancelled jc -> log.warn("client-only message received: {}", jc); case SessionWelcome ignored -> log.warn("client-only message received: {}", m); case JobAccepted ignored -> log.warn("client-only message received: {}", m); case JobEvent ignored -> log.warn("client-only message received: {}", m);