From 937a3b50d217d4b9540fba4542e982a0a1c5369d Mon Sep 17 00:00:00 2001 From: sdairs Date: Tue, 9 Jun 2026 20:56:30 +0100 Subject: [PATCH 1/2] Wait for service readiness before auto-provisioning Query API endpoint The query endpoint upsert returns 500 "Internal error" while a service is still provisioning, so the create-time auto-provision hook always failed and told the user to retry later (#242). `service create` now polls the service state (every 5s, up to 10 minutes) and only enables the endpoint once the service is ready, with state transitions printed to stderr instead of a silent hang. Also stop leaking the dedicated API key when the endpoint binding fails: it is deleted (best-effort) so each retry no longer leaves an orphaned key in the org. Fixes #242 Co-Authored-By: Claude Fable 5 --- README.md | 2 +- crates/clickhousectl/src/cloud/commands.rs | 17 +- .../clickhousectl/src/cloud/service_query.rs | 150 ++++++++++++++++-- 3 files changed, 151 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 119c701..c21fb50 100644 --- a/README.md +++ b/README.md @@ -460,7 +460,7 @@ clickhousectl cloud service delete --force #### Query API auto-provisioning -By default, `cloud service create` provisions a Query API endpoint for the new service and creates a dedicated API key bound to it. The key (`keyId`, `keySecret`, and `endpointId`) is stored in `.clickhouse/credentials.json` under `service_query_keys.`, alongside any user-level API key. `cloud service query` then runs SQL over HTTP using that key — no `clickhouse` binary and no service password required. The key is scoped to a single service, so it can read and write (SELECT, INSERT, DDL) against that service but cannot reach any other service in the org. +By default, `cloud service create` provisions a Query API endpoint for the new service and creates a dedicated API key bound to it. The endpoint can only be bound once the service has finished provisioning, so after printing the new service's credentials the command waits for the service to become ready (polling every 5 seconds, up to 10 minutes) before enabling the endpoint; pass `--no-enable-query` to skip the wait and the endpoint setup. If the wait or setup fails, the service itself is unaffected — rerun `cloud service query` later to provision lazily. The key (`keyId`, `keySecret`, and `endpointId`) is stored in `.clickhouse/credentials.json` under `service_query_keys.`, alongside any user-level API key. `cloud service query` then runs SQL over HTTP using that key — no `clickhouse` binary and no service password required. The key is scoped to a single service, so it can read and write (SELECT, INSERT, DDL) against that service but cannot reach any other service in the org. For existing services without a stored key, `cloud service query` provisions one lazily on first use. Pass `--no-auto-enable` to fail instead, or `--no-enable-query` on `service create` to skip the create-time hook. diff --git a/crates/clickhousectl/src/cloud/commands.rs b/crates/clickhousectl/src/cloud/commands.rs index 13981df..005b2b1 100644 --- a/crates/clickhousectl/src/cloud/commands.rs +++ b/crates/clickhousectl/src/cloud/commands.rs @@ -863,11 +863,24 @@ pub async fn service_create( } if !no_enable_query && !json { - match crate::cloud::service_query::ensure_service_query_setup( - client, &org_id, &svc_id, &svc_name, + // The endpoint upsert 500s while the service is still provisioning + // (issue #242), so wait for the service to come up first. + eprintln!(); + eprintln!("Waiting for the service to be ready to enable the Query API endpoint..."); + let setup = match crate::cloud::service_query::wait_for_service_ready( + client, &org_id, &svc_id, ) .await { + Ok(()) => { + crate::cloud::service_query::ensure_service_query_setup( + client, &org_id, &svc_id, &svc_name, + ) + .await + } + Err(e) => Err(e), + }; + match setup { Ok(_) => { println!(); println!( diff --git a/crates/clickhousectl/src/cloud/service_query.rs b/crates/clickhousectl/src/cloud/service_query.rs index 3c1ff25..cbd5903 100644 --- a/crates/clickhousectl/src/cloud/service_query.rs +++ b/crates/clickhousectl/src/cloud/service_query.rs @@ -25,6 +25,67 @@ const QUERY_ENDPOINT_ROLE: &str = "sql_console_admin"; /// caller so CORS doesn't apply, but the API still requires a value. const ALLOWED_ORIGINS: &str = "*"; +/// Polling cadence while waiting for a service to become ready. +const READY_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(5); + +/// Upper bound on the readiness wait: 120 polls × 5s ≈ 10 minutes, well +/// above normal provisioning time. On timeout the caller falls back to its +/// "retry later" path rather than blocking forever. +const READY_POLL_MAX_ATTEMPTS: u32 = 120; + +/// What a service state means for binding a query endpoint. The control +/// plane returns 500 "Internal error" on the endpoint upsert while the +/// service is still provisioning, so callers must hold off until the +/// service is ready (issue #242). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ReadyState { + /// Provisioned — safe to bind the query endpoint. + Ready, + /// Still coming up — keep polling. + Pending, + /// Won't become ready by waiting (stopped/failed/terminating/unknown). + Unavailable, +} + +fn classify_ready_state(state: &str) -> ReadyState { + match state { + "running" | "idle" | "partially_running" => ReadyState::Ready, + "provisioning" | "starting" | "awaking" => ReadyState::Pending, + _ => ReadyState::Unavailable, + } +} + +/// Poll until `service_id` reaches a state where the query endpoint can be +/// bound, printing each state transition to stderr. Errors out (rather than +/// waiting) on states that won't resolve on their own, and after +/// [`READY_POLL_MAX_ATTEMPTS`] polls. +pub async fn wait_for_service_ready( + client: &CloudClient, + org_id: &str, + service_id: &str, +) -> Result<(), Box> { + let mut last_state = String::new(); + for _ in 0..READY_POLL_MAX_ATTEMPTS { + let svc = client.get_service(org_id, service_id).await?; + let state = svc.state.to_string(); + if state != last_state { + eprintln!(" state: {state}"); + last_state = state.clone(); + } + match classify_ready_state(&state) { + ReadyState::Ready => return Ok(()), + ReadyState::Pending => tokio::time::sleep(READY_POLL_INTERVAL).await, + ReadyState::Unavailable => { + return Err(format!( + "service is in state '{state}' and will not become ready by waiting" + ) + .into()) + } + } + } + Err("timed out waiting for service to become ready".into()) +} + /// Ensure a query endpoint is provisioned for `service_id` and return the /// persisted key. If a key is already cached locally, returns it unchanged; /// otherwise creates the API key, binds it to the query endpoint (merging @@ -65,8 +126,38 @@ pub async fn ensure_service_query_setup( // endpoints (GET/DELETE /v1/.../keys/{keyId}) accept. let api_key_uuid = key_response.key.id.to_string(); - // Merge our new key UUID into any existing endpoint config so we don't - // silently revoke other bindings the user set up. + let endpoint = match bind_query_endpoint(client, org_id, service_id, &api_key_uuid).await { + Ok(endpoint) => endpoint, + Err(e) => { + // The key was created but never bound or persisted, so nothing + // can use it. Delete it (best-effort) so a later retry doesn't + // leave an orphaned key behind per attempt. + let _ = client.delete_api_key(org_id, &api_key_uuid).await; + return Err(e); + } + }; + + let stored = ServiceQueryKey { + key_id, + key_secret, + endpoint_id: endpoint.id, + service_name: service_name.to_string(), + created_at: Utc::now(), + }; + credentials::set_service_query_key(service_id, stored.clone())?; + + Ok(stored) +} + +/// Bind `api_key_uuid` to the service's query endpoint, merging into any +/// existing endpoint configuration so we don't silently revoke other +/// bindings the user set up. +async fn bind_query_endpoint( + client: &CloudClient, + org_id: &str, + service_id: &str, + api_key_uuid: &str, +) -> Result> { let mut open_api_keys = match client .api() .instance_query_endpoint_get(org_id, service_id) @@ -76,8 +167,8 @@ pub async fn ensure_service_query_setup( Err(clickhouse_cloud_api::Error::Api { status: 404, .. }) => Vec::new(), Err(e) => return Err(client.convert_error(e).into()), }; - if !open_api_keys.contains(&api_key_uuid) { - open_api_keys.push(api_key_uuid); + if !open_api_keys.iter().any(|k| k == api_key_uuid) { + open_api_keys.push(api_key_uuid.to_string()); } let endpoint_request = InstanceServiceQueryApiEndpointsPostRequest { @@ -86,18 +177,47 @@ pub async fn ensure_service_query_setup( allowed_origins: ALLOWED_ORIGINS.to_string(), }; - let endpoint = client + Ok(client .create_query_endpoint(org_id, service_id, &endpoint_request) - .await?; + .await?) +} - let stored = ServiceQueryKey { - key_id, - key_secret, - endpoint_id: endpoint.id, - service_name: service_name.to_string(), - created_at: Utc::now(), - }; - credentials::set_service_query_key(service_id, stored.clone())?; +#[cfg(test)] +mod tests { + use super::*; - Ok(stored) + #[test] + fn ready_states() { + for state in ["running", "idle", "partially_running"] { + assert_eq!(classify_ready_state(state), ReadyState::Ready, "{state}"); + } + } + + #[test] + fn pending_states() { + for state in ["provisioning", "starting", "awaking"] { + assert_eq!(classify_ready_state(state), ReadyState::Pending, "{state}"); + } + } + + #[test] + fn unavailable_states() { + for state in [ + "stopping", + "stopped", + "terminating", + "terminated", + "softdeleting", + "softdeleted", + "degraded", + "failed", + "some-future-state", + ] { + assert_eq!( + classify_ready_state(state), + ReadyState::Unavailable, + "{state}" + ); + } + } } From b6179aba20dfaaf12683924aebd032f141c4e4c2 Mon Sep 17 00:00:00 2001 From: sdairs Date: Tue, 9 Jun 2026 21:14:49 +0100 Subject: [PATCH 2/2] Provision Query API endpoint lazily instead of at service create A freshly created service is still provisioning, so the create-time endpoint upsert either blocked for minutes or failed with 500. Rather than waiting in `service create`, drop the create-time hook entirely: the endpoint is provisioned on first `cloud service query`, which was already the behavior for --json/agent output and for pre-existing services. `service create` now returns immediately and prints a hint that the endpoint is provisioned on first use. The now-meaningless `--no-enable-query` flag is removed (`service query --no-auto-enable` still opts out of the lazy path). The orphaned-key cleanup in ensure_service_query_setup stays. Co-Authored-By: Claude Fable 5 --- README.md | 5 +- crates/clickhousectl/src/cloud/cli.rs | 9 +- crates/clickhousectl/src/cloud/commands.rs | 48 ++------- .../clickhousectl/src/cloud/service_query.rs | 101 ------------------ crates/clickhousectl/src/main.rs | 2 - 5 files changed, 10 insertions(+), 155 deletions(-) diff --git a/README.md b/README.md index c21fb50..2172b22 100644 --- a/README.md +++ b/README.md @@ -456,13 +456,12 @@ clickhousectl cloud service delete --force | `--enable-endpoint` / `--disable-endpoint` | Toggle GA service endpoints (currently `mysql`) | | `--private-preview-terms-checked` | Accept private preview terms when required | | `--enable-core-dumps` | Enable or disable service core dump collection | -| `--no-enable-query` | Skip auto-provisioning of the Query API endpoint + per-service key | #### Query API auto-provisioning -By default, `cloud service create` provisions a Query API endpoint for the new service and creates a dedicated API key bound to it. The endpoint can only be bound once the service has finished provisioning, so after printing the new service's credentials the command waits for the service to become ready (polling every 5 seconds, up to 10 minutes) before enabling the endpoint; pass `--no-enable-query` to skip the wait and the endpoint setup. If the wait or setup fails, the service itself is unaffected — rerun `cloud service query` later to provision lazily. The key (`keyId`, `keySecret`, and `endpointId`) is stored in `.clickhouse/credentials.json` under `service_query_keys.`, alongside any user-level API key. `cloud service query` then runs SQL over HTTP using that key — no `clickhouse` binary and no service password required. The key is scoped to a single service, so it can read and write (SELECT, INSERT, DDL) against that service but cannot reach any other service in the org. +The first time `cloud service query` runs against a service without a stored key, it provisions a Query API endpoint for that service and creates a dedicated API key bound to it. The key (`keyId`, `keySecret`, and `endpointId`) is stored in `.clickhouse/credentials.json` under `service_query_keys.`, alongside any user-level API key. Subsequent queries run SQL over HTTP using that key — no `clickhouse` binary and no service password required. The key is scoped to a single service, so it can read and write (SELECT, INSERT, DDL) against that service but cannot reach any other service in the org. Pass `--no-auto-enable` to fail instead of provisioning. -For existing services without a stored key, `cloud service query` provisions one lazily on first use. Pass `--no-auto-enable` to fail instead, or `--no-enable-query` on `service create` to skip the create-time hook. +Provisioning happens lazily (rather than at `service create` time) because the endpoint can only be bound once the service has finished provisioning, which can take several minutes — `service create` returns immediately instead of blocking on it. Per-service scoping is enforced at the query endpoint binding, which is created with role `sql_console_admin` (read + write inside the bound service only). The API key itself has no org-level roles, so the binding is the only thing that grants it any access. `cloud service delete` removes the stored key from `credentials.json`. diff --git a/crates/clickhousectl/src/cloud/cli.rs b/crates/clickhousectl/src/cloud/cli.rs index 85ee720..57e5e59 100644 --- a/crates/clickhousectl/src/cloud/cli.rs +++ b/crates/clickhousectl/src/cloud/cli.rs @@ -572,11 +572,6 @@ CONTEXT FOR AGENTS: #[arg(long)] enable_core_dumps: Option, - /// Skip auto-provisioning of the Query API endpoint and per-service - /// API key - #[arg(long)] - no_enable_query: bool, - /// Organization ID (auto-detected if not specified) #[arg(long)] org_id: Option, @@ -779,8 +774,8 @@ CONTEXT FOR AGENTS: CONTEXT FOR AGENTS: Runs SQL over HTTP — no local clickhouse binary or service password required. Uses a per-service API key (read+write, scoped to this service via the - query endpoint binding) auto-provisioned on first use (or on - `cloud service create`) and stored in .clickhouse/credentials.json. + query endpoint binding) auto-provisioned on first use and stored in + .clickhouse/credentials.json. SQL precedence: --query > --queries-file > stdin. Default format: PrettyCompact on a TTY, TabSeparated when piped.")] Query { diff --git a/crates/clickhousectl/src/cloud/commands.rs b/crates/clickhousectl/src/cloud/commands.rs index 005b2b1..0dacd8d 100644 --- a/crates/clickhousectl/src/cloud/commands.rs +++ b/crates/clickhousectl/src/cloud/commands.rs @@ -554,7 +554,6 @@ pub struct CreateServiceOptions { pub disable_endpoints: Vec, pub private_preview_terms_checked: bool, pub enable_core_dumps: Option, - pub no_enable_query: bool, pub org_id: Option, } @@ -843,12 +842,10 @@ pub async fn service_create( // Validate input before any network call so typos like --provider awss // fail locally instead of on the /organizations lookup. let request = build_create_service_request(&opts)?; - let no_enable_query = opts.no_enable_query; let org_id = resolve_org_id(client, opts.org_id.as_deref()).await?; let response = client.create_service(&org_id, &request).await?; let svc_id = response.service.id.to_string(); - let svc_name = response.service.name.clone(); if json { println!("{}", serde_json::to_string_pretty(&response)?); @@ -860,42 +857,12 @@ pub async fn service_create( println!("Credentials (save these, password shown only once):"); println!(" Username: default"); println!(" Password: {}", response.password); - } - - if !no_enable_query && !json { - // The endpoint upsert 500s while the service is still provisioning - // (issue #242), so wait for the service to come up first. - eprintln!(); - eprintln!("Waiting for the service to be ready to enable the Query API endpoint..."); - let setup = match crate::cloud::service_query::wait_for_service_ready( - client, &org_id, &svc_id, - ) - .await - { - Ok(()) => { - crate::cloud::service_query::ensure_service_query_setup( - client, &org_id, &svc_id, &svc_name, - ) - .await - } - Err(e) => Err(e), - }; - match setup { - Ok(_) => { - println!(); - println!( - "Query API endpoint enabled. Run SQL with: clickhousectl cloud service query --id {} --query \"SELECT 1\"", - svc_id - ); - } - Err(e) => { - eprintln!( - "warning: failed to auto-provision Query API endpoint: {e}. \ - Run `clickhousectl cloud service query --id {}` later to retry.", - svc_id - ); - } - } + println!(); + println!( + "Run SQL with: clickhousectl cloud service query --id {} --query \"SELECT 1\"", + svc_id + ); + println!("(the Query API endpoint is provisioned automatically on first use)"); } Ok(()) } @@ -2884,7 +2851,6 @@ mod tests { disable_endpoints: vec![], private_preview_terms_checked: true, enable_core_dumps: Some(true), - no_enable_query: false, org_id: None, }; @@ -2924,7 +2890,6 @@ mod tests { disable_endpoints: vec![], private_preview_terms_checked: false, enable_core_dumps: None, - no_enable_query: false, org_id: None, }; @@ -2960,7 +2925,6 @@ mod tests { disable_endpoints: vec![], private_preview_terms_checked: false, enable_core_dumps: None, - no_enable_query: false, org_id: None, }; diff --git a/crates/clickhousectl/src/cloud/service_query.rs b/crates/clickhousectl/src/cloud/service_query.rs index cbd5903..f6959a0 100644 --- a/crates/clickhousectl/src/cloud/service_query.rs +++ b/crates/clickhousectl/src/cloud/service_query.rs @@ -25,67 +25,6 @@ const QUERY_ENDPOINT_ROLE: &str = "sql_console_admin"; /// caller so CORS doesn't apply, but the API still requires a value. const ALLOWED_ORIGINS: &str = "*"; -/// Polling cadence while waiting for a service to become ready. -const READY_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(5); - -/// Upper bound on the readiness wait: 120 polls × 5s ≈ 10 minutes, well -/// above normal provisioning time. On timeout the caller falls back to its -/// "retry later" path rather than blocking forever. -const READY_POLL_MAX_ATTEMPTS: u32 = 120; - -/// What a service state means for binding a query endpoint. The control -/// plane returns 500 "Internal error" on the endpoint upsert while the -/// service is still provisioning, so callers must hold off until the -/// service is ready (issue #242). -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum ReadyState { - /// Provisioned — safe to bind the query endpoint. - Ready, - /// Still coming up — keep polling. - Pending, - /// Won't become ready by waiting (stopped/failed/terminating/unknown). - Unavailable, -} - -fn classify_ready_state(state: &str) -> ReadyState { - match state { - "running" | "idle" | "partially_running" => ReadyState::Ready, - "provisioning" | "starting" | "awaking" => ReadyState::Pending, - _ => ReadyState::Unavailable, - } -} - -/// Poll until `service_id` reaches a state where the query endpoint can be -/// bound, printing each state transition to stderr. Errors out (rather than -/// waiting) on states that won't resolve on their own, and after -/// [`READY_POLL_MAX_ATTEMPTS`] polls. -pub async fn wait_for_service_ready( - client: &CloudClient, - org_id: &str, - service_id: &str, -) -> Result<(), Box> { - let mut last_state = String::new(); - for _ in 0..READY_POLL_MAX_ATTEMPTS { - let svc = client.get_service(org_id, service_id).await?; - let state = svc.state.to_string(); - if state != last_state { - eprintln!(" state: {state}"); - last_state = state.clone(); - } - match classify_ready_state(&state) { - ReadyState::Ready => return Ok(()), - ReadyState::Pending => tokio::time::sleep(READY_POLL_INTERVAL).await, - ReadyState::Unavailable => { - return Err(format!( - "service is in state '{state}' and will not become ready by waiting" - ) - .into()) - } - } - } - Err("timed out waiting for service to become ready".into()) -} - /// Ensure a query endpoint is provisioned for `service_id` and return the /// persisted key. If a key is already cached locally, returns it unchanged; /// otherwise creates the API key, binds it to the query endpoint (merging @@ -181,43 +120,3 @@ async fn bind_query_endpoint( .create_query_endpoint(org_id, service_id, &endpoint_request) .await?) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn ready_states() { - for state in ["running", "idle", "partially_running"] { - assert_eq!(classify_ready_state(state), ReadyState::Ready, "{state}"); - } - } - - #[test] - fn pending_states() { - for state in ["provisioning", "starting", "awaking"] { - assert_eq!(classify_ready_state(state), ReadyState::Pending, "{state}"); - } - } - - #[test] - fn unavailable_states() { - for state in [ - "stopping", - "stopped", - "terminating", - "terminated", - "softdeleting", - "softdeleted", - "degraded", - "failed", - "some-future-state", - ] { - assert_eq!( - classify_ready_state(state), - ReadyState::Unavailable, - "{state}" - ); - } - } -} diff --git a/crates/clickhousectl/src/main.rs b/crates/clickhousectl/src/main.rs index af9e2e8..4e5b7fd 100644 --- a/crates/clickhousectl/src/main.rs +++ b/crates/clickhousectl/src/main.rs @@ -434,7 +434,6 @@ async fn run_cloud(args: CloudArgs) -> Result<()> { disable_endpoint, private_preview_terms_checked, enable_core_dumps, - no_enable_query, org_id, } => { let opts = cloud::commands::CreateServiceOptions { @@ -461,7 +460,6 @@ async fn run_cloud(args: CloudArgs) -> Result<()> { disable_endpoints: disable_endpoint, private_preview_terms_checked, enable_core_dumps, - no_enable_query, org_id, }; cloud::commands::service_create(&client, opts, json).await