From 99ad2b1f6cb1ff8775a71affb6cbb9b97e1b9cd9 Mon Sep 17 00:00:00 2001 From: MK Date: Tue, 16 Jun 2026 15:15:34 -0400 Subject: [PATCH] fix(scheduler): derive k8s service_url default at runtime (closes #179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix, the K8s scheduler backend's runtime constructor hard-errored when scheduler.kubernetes.service_url was empty: Error: kubernetes scheduler backend: scheduler.kubernetes.service_url is required …even though the build-time schedule-manifest stage at forge-cli/build/schedule_manifest_stage.go already knew how to default the same field to http://..svc:/ when unset. Two adjacent code paths reaching opposite conclusions for the same missing field — operators who deployed in-cluster without an explicit service_url couldn't start the agent. Fix: mirror the build-time default in the runtime constructor. - K8sBackendConfig gains a Port int field; selectScheduleBackend plumbs r.cfg.Port into it. - NewKubernetesBackend (and the -WithClient test seam) derives http://..svc:/ when cfg.ServiceURL is empty. defaultK8sServiceURL is the shared helper. - Port=0 falls back to 8080 (matches the runner's listen-port default at forge-cli/runtime/runner.go:152-153). - The hard-error branch is gone — there's no scenario in-cluster where we can't derive a sensible default. - Operator override semantics unchanged: an explicit service_url always wins. Pinned by TestKubernetesBackend_ServiceURLExplicitOverride. Tests added: - TestKubernetesBackend_ServiceURLDefaultDerivation — the #179 pin: empty ServiceURL + Port=9090 → http://my-agent.ns-a.svc:9090/ - TestKubernetesBackend_ServiceURLDefaultPortFallback — Port=0 falls back to 8080 - TestKubernetesBackend_ServiceURLExplicitOverride — explicit ServiceURL (e.g. https://gateway.example.com/...) passes through untouched Docs: - docs/deployment/scheduler-kubernetes.md — new "service_url defaulting" subsection; YAML comment updated to note the auto-derivation - docs/core-concepts/scheduling.md — YAML comment updated with cross-link - CHANGELOG.md — Unreleased / Fixed entry --- CHANGELOG.md | 18 ++++++++ docs/core-concepts/scheduling.md | 2 +- docs/deployment/scheduler-kubernetes.md | 6 ++- forge-cli/runtime/runner.go | 1 + forge-cli/runtime/scheduler_k8s_backend.go | 31 +++++++++++++- .../runtime/scheduler_k8s_backend_test.go | 41 +++++++++++++++++++ 6 files changed, 95 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0b3f3c..6bd02c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Changelog +## Unreleased + +### Fixed + +- **K8s scheduler backend no longer hard-errors when `scheduler.kubernetes.service_url` + is unset (issue #179).** Pre-fix, an agent deployed in-cluster with a default + `scheduler.backend: auto` and no explicit `service_url` aborted startup with + `kubernetes scheduler backend: scheduler.kubernetes.service_url is required` — + even though the build-time `schedule_manifest_stage` already knew how to default + the same field to `http://..svc:/`. Runtime now + mirrors the build-time default: when `ServiceURL` is empty, the constructor + derives the in-cluster Service DNS using `agent_id` + resolved namespace + + the runner's listen port (default 8080). Explicit `service_url` overrides + still pass through untouched, so operators behind an Ingress / Gateway are + unaffected. Pinned by `TestKubernetesBackend_ServiceURLDefaultDerivation`, + `TestKubernetesBackend_ServiceURLDefaultPortFallback`, and + `TestKubernetesBackend_ServiceURLExplicitOverride`. + ## v0.14.2 — 2026-06-10 ### Fixed diff --git a/docs/core-concepts/scheduling.md b/docs/core-concepts/scheduling.md index 613e9d5..826e135 100644 --- a/docs/core-concepts/scheduling.md +++ b/docs/core-concepts/scheduling.md @@ -62,7 +62,7 @@ scheduler: backend: auto # auto (default) | file | kubernetes kubernetes: namespace: "" # defaults to the agent pod's own namespace - service_url: "" # in-cluster URL CronJob trigger pods POST to + service_url: "" # in-cluster URL CronJob trigger pods POST to; auto-derived to http://..svc:/ when empty (issue #179) allow_dynamic: false # whether schedule_set can create CronJobs at runtime trigger_image: "" # default: curlimages/curl:8.10.1 auth_secret_name: "" # default: -internal-token diff --git a/docs/deployment/scheduler-kubernetes.md b/docs/deployment/scheduler-kubernetes.md index 97e26a5..4154f85 100644 --- a/docs/deployment/scheduler-kubernetes.md +++ b/docs/deployment/scheduler-kubernetes.md @@ -24,7 +24,7 @@ scheduler: backend: auto # auto (default) | file | kubernetes kubernetes: namespace: "" # defaults to the agent pod's own namespace at runtime - service_url: "" # in-cluster URL CronJob trigger pods POST to (required in k8s mode) + service_url: "" # in-cluster URL CronJob trigger pods POST to; auto-derived to `http://..svc:/` when empty (mirrors the build-time default, issue #179) allow_dynamic: false # whether schedule_set (LLM-driven) can create CronJobs at runtime trigger_image: "" # container image the trigger Job runs; default: curlimages/curl:8.10.1 auth_secret_name: "" # K8s Secret holding the internal token; default: -internal-token @@ -38,6 +38,10 @@ Resolution at startup: The escape hatch `FORGE_IN_CLUSTER=true|false` overrides the file-presence check — useful for forcing file behavior in a single-replica dev pod, or for running the K8s backend's unit tests on a developer laptop. +### `service_url` defaulting + +When `scheduler.kubernetes.service_url` is empty, the runtime derives `http://..svc:/` (matching the in-cluster Service DNS that `forge package` stamps into the generated CronJob YAML at build time — see `forge-cli/build/schedule_manifest_stage.go`). `` is the agent's A2A listen port (`--port` or `forge.yaml` default 8080). Operators only need to set `service_url` explicitly when the agent sits behind an Ingress / Gateway or uses a non-standard hostname. Pinned by `TestKubernetesBackend_ServiceURLDefaultDerivation` (issue #179). + ## CronJob manifest shape Forge generates one CronJob per schedule. The shape is fixed (no Helm templates) so `kubectl diff` and `kubectl apply -k` work without runtime substitution: diff --git a/forge-cli/runtime/runner.go b/forge-cli/runtime/runner.go index dc46ecb..b5409cc 100644 --- a/forge-cli/runtime/runner.go +++ b/forge-cli/runtime/runner.go @@ -3777,6 +3777,7 @@ func (r *Runner) selectScheduleBackend( k8sCfg.Namespace, K8sBackendConfig{ ServiceURL: k8sCfg.ServiceURL, + Port: r.cfg.Port, AuthSecretName: k8sCfg.AuthSecretName, TriggerImage: k8sCfg.TriggerImage, AllowDynamic: k8sCfg.AllowDynamic, diff --git a/forge-cli/runtime/scheduler_k8s_backend.go b/forge-cli/runtime/scheduler_k8s_backend.go index f7768cc..215dfe1 100644 --- a/forge-cli/runtime/scheduler_k8s_backend.go +++ b/forge-cli/runtime/scheduler_k8s_backend.go @@ -61,8 +61,18 @@ type KubernetesBackend struct { // in-cluster service URL. type K8sBackendConfig struct { // ServiceURL is the in-cluster URL CronJob trigger pods POST to. - // Required. + // When empty, the constructor derives the standard in-cluster + // Service DNS: http://..svc:/ . This + // matches the value the build-time schedule-manifest stage stamps + // into generated CronJob YAML (see forge-cli/build/schedule_manifest_stage.go). + // Operators set this explicitly when the agent listens on a + // non-standard port or sits behind an Ingress / Gateway. ServiceURL string + // Port is the port the agent's A2A server listens on; combined + // with agent_id and namespace to derive ServiceURL when unset. + // Defaults to 8080 when zero (matches the runner's listen-port + // default in forge-cli/runtime/runner.go). + Port int // AuthSecretName is the K8s Secret containing the internal bearer // token CronJobs mount. Defaults to "-internal-token" // when empty (matches `forge auth secret-yaml`). @@ -78,6 +88,20 @@ type K8sBackendConfig struct { AllowDynamic bool } +// defaultK8sServiceURL returns the in-cluster Service DNS URL the +// runtime falls back to when ServiceURL is unset. Mirrors the +// build-time default in forge-cli/build/schedule_manifest_stage.go so +// a `forge run` inside a pod without an explicit service_url still +// dispatches CronJob triggers at the same address `forge package` +// would have stamped into the generated CronJob YAML. Port defaults +// to 8080 when port <= 0. +func defaultK8sServiceURL(agentID, namespace string, port int) string { + if port <= 0 { + port = 8080 + } + return fmt.Sprintf("http://%s.%s.svc:%d/", agentID, namespace, port) +} + // NewKubernetesBackend builds a backend wired to an in-cluster // kubernetes.Interface. The namespace defaults to the pod's own // namespace, read from the standard projected @@ -102,7 +126,7 @@ func NewKubernetesBackend(agentID, namespace string, cfg K8sBackendConfig, logge namespace = "default" } if cfg.ServiceURL == "" { - return nil, fmt.Errorf("kubernetes scheduler backend: scheduler.kubernetes.service_url is required") + cfg.ServiceURL = defaultK8sServiceURL(agentID, namespace, cfg.Port) } if cfg.AuthSecretName == "" { cfg.AuthSecretName = agentID + "-internal-token" @@ -127,6 +151,9 @@ func NewKubernetesBackendWithClient(client kubernetes.Interface, agentID, namesp if namespace == "" { namespace = "default" } + if cfg.ServiceURL == "" { + cfg.ServiceURL = defaultK8sServiceURL(agentID, namespace, cfg.Port) + } if cfg.AuthSecretName == "" { cfg.AuthSecretName = agentID + "-internal-token" } diff --git a/forge-cli/runtime/scheduler_k8s_backend_test.go b/forge-cli/runtime/scheduler_k8s_backend_test.go index 23e3219..9798275 100644 --- a/forge-cli/runtime/scheduler_k8s_backend_test.go +++ b/forge-cli/runtime/scheduler_k8s_backend_test.go @@ -268,3 +268,44 @@ func TestKubernetesBackend_HistoryIsEmptyWithWarning(t *testing.T) { t.Errorf("History should return empty list; got %d", len(hist)) } } + +// TestKubernetesBackend_ServiceURLDefaultDerivation is the #179 regression +// pin: when scheduler.kubernetes.service_url is unset, the runtime must +// fall back to the same in-cluster Service DNS the build-time +// schedule-manifest stage stamps into generated CronJob YAML. Pre-fix +// the constructor hard-errored when ServiceURL was empty; an operator +// who deployed without an explicit service_url couldn't start the +// agent in-cluster. +func TestKubernetesBackend_ServiceURLDefaultDerivation(t *testing.T) { + cs := fake.NewSimpleClientset() + b := NewKubernetesBackendWithClient(cs, "my-agent", "ns-a", K8sBackendConfig{Port: 9090}, fakeBackendLogger{}) + if got, want := b.cfg.ServiceURL, "http://my-agent.ns-a.svc:9090/"; got != want { + t.Errorf("derived ServiceURL = %q, want %q", got, want) + } +} + +// TestKubernetesBackend_ServiceURLDefaultPortFallback covers the +// port-unset branch: when K8sBackendConfig.Port is zero (e.g. the +// caller didn't plumb r.cfg.Port through), the derivation falls back +// to 8080 to match the runner's listen-port default. +func TestKubernetesBackend_ServiceURLDefaultPortFallback(t *testing.T) { + cs := fake.NewSimpleClientset() + b := NewKubernetesBackendWithClient(cs, "my-agent", "ns-a", K8sBackendConfig{}, fakeBackendLogger{}) + if got, want := b.cfg.ServiceURL, "http://my-agent.ns-a.svc:8080/"; got != want { + t.Errorf("derived ServiceURL with Port=0 = %q, want %q", got, want) + } +} + +// TestKubernetesBackend_ServiceURLExplicitOverride confirms an +// operator-supplied ServiceURL passes through untouched — the +// derivation only applies when the field is empty. Pins the +// non-regression case for operators behind an Ingress / Gateway. +func TestKubernetesBackend_ServiceURLExplicitOverride(t *testing.T) { + cs := fake.NewSimpleClientset() + b := NewKubernetesBackendWithClient(cs, "my-agent", "ns-a", + K8sBackendConfig{ServiceURL: "https://gateway.example.com/agents/my-agent/", Port: 9090}, + fakeBackendLogger{}) + if got, want := b.cfg.ServiceURL, "https://gateway.example.com/agents/my-agent/"; got != want { + t.Errorf("explicit ServiceURL not preserved: got %q, want %q", got, want) + } +}