From 422406d5cc8290021df0d68c20978cd48d29fa22 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 May 2026 17:45:37 +0545 Subject: [PATCH 1/3] Add Dynamo docs --- mkdocs.yml | 1 + mkdocs/docs/concepts/services.md | 109 ++++++++++++++- mkdocs/docs/examples/inference/dynamo.md | 164 +++++++++++++++++++++++ mkdocs/docs/examples/inference/sglang.md | 1 - 4 files changed, 271 insertions(+), 4 deletions(-) create mode 100644 mkdocs/docs/examples/inference/dynamo.md diff --git a/mkdocs.yml b/mkdocs.yml index 0bab8c329..4c057b500 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -321,6 +321,7 @@ nav: - NCCL/RCCL tests: docs/examples/clusters/nccl-rccl-tests.md - Inference: - SGLang: docs/examples/inference/sglang.md + - Dynamo: docs/examples/inference/dynamo.md - vLLM: docs/examples/inference/vllm.md - NIM: docs/examples/inference/nim.md - TensorRT-LLM: docs/examples/inference/trtllm.md diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index d9c61ec54..7cf95404c 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -342,13 +342,15 @@ Setting the minimum number of replicas to `0` allows the service to scale down t -Since 0.20.17, `dstack` supports serving a model using PD disaggregation. To use it, configure three replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers, and one for decode workers. +Since 0.20.17, `dstack` supports serving a model using PD (Prefill-Decode) disaggregation. To use it, configure three replica groups: one for the router, one for prefill workers, and one for decode workers. -> Currently, Prefill-Decode disaggregation is supported only for SGLang. +`dstack` integrates with two routers for PD disaggregation: [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html) and [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo). 
+ +> Currently, with SMG router Prefill-Decode disaggregation is supported only for SGLang. Below is an example for running `zai-org/GLM-4.5-Air-FP8`: -=== "NVIDIA" +=== "SMG"
@@ -418,6 +420,107 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
+=== "Dynamo" + +
+ + ```yaml + type: service + name: dynamo-pd + + env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + + replicas: + - count: 1 + docker: true + router: + type: dynamo + commands: + - apt-get update + - apt-get install -y python3-dev python3-venv + - python3 -m venv ~/dyn-venv + - source ~/dyn-venv/bin/activate + - pip install -U pip + - pip install --pre "ai-dynamo[sglang]" + - git clone https://github.com/ai-dynamo/dynamo.git + # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend. + - docker compose -f dynamo/deploy/docker-compose.yml up -d + - | + python3 -m dynamo.frontend \ + --http-host 0.0.0.0 --http-port 8000 \ + --discovery-backend etcd --router-mode kv \ + --kv-cache-block-size 64 + resources: + cpu: 4 + + - count: 1..4 + scaling: + metric: rps + target: 3 + python: "3.12" + nvcc: true + commands: + # dstack injects DSTACK_ROUTER_INTERNAL_IP after the router replica + # is provisioned. Compose the etcd/NATS endpoints from it. + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + # Set to enable /health endpoint required by dstack probes. + - export DYN_SYSTEM_PORT="8000" + # Wait until the router's etcd and NATS ports are actually accepting connections. 
+ - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install --pre "ai-dynamo[sglang]" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode prefill --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + python: "3.12" + nvcc: true + commands: + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + - export DYN_SYSTEM_PORT="8000" + - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install --pre "ai-dynamo[sglang]" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode decode --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + port: 8000 + model: zai-org/GLM-4.5-Air-FP8 + + # Custom probe is required for PD disaggregation. + probes: + - type: http + url: /health + interval: 15s + ``` + +
+ !!! info "Cluster" PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. diff --git a/mkdocs/docs/examples/inference/dynamo.md b/mkdocs/docs/examples/inference/dynamo.md new file mode 100644 index 000000000..d4f930fed --- /dev/null +++ b/mkdocs/docs/examples/inference/dynamo.md @@ -0,0 +1,164 @@ +--- +title: Dynamo +description: Deploying zai-org/GLM-4.5-Air-FP8 using NVIDIA Dynamo +--- + +# Dynamo + +This example shows how to deploy `zai-org/GLM-4.5-Air-FP8` using +[NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) and `dstack`. + + +## Apply a configuration + +Here's an example of a service that deploys `zai-org/GLM-4.5-Air-FP8` using +Dynamo with PD disaggregation. + +
+ +```yaml +type: service +name: dynamo-pd + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1 + docker: true + router: + type: dynamo + commands: + - apt-get update + - apt-get install -y python3-dev python3-venv + - python3 -m venv ~/dyn-venv + - source ~/dyn-venv/bin/activate + - pip install -U pip + - pip install --pre "ai-dynamo[sglang]" + - git clone https://github.com/ai-dynamo/dynamo.git + # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend. + - docker compose -f dynamo/deploy/docker-compose.yml up -d + - | + python3 -m dynamo.frontend \ + --http-host 0.0.0.0 --http-port 8000 \ + --discovery-backend etcd --router-mode kv \ + --kv-cache-block-size 64 + resources: + cpu: 4 + + - count: 1..4 + scaling: + metric: rps + target: 3 + python: "3.12" + nvcc: true + commands: + # dstack injects DSTACK_ROUTER_INTERNAL_IP after the router replica + # is provisioned. Compose the etcd/NATS endpoints from it. + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + # Set to enable /health endpoint required by dstack probes. + - export DYN_SYSTEM_PORT="8000" + # Wait until the router's etcd and NATS ports are actually accepting connections. 
+ - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install --pre "ai-dynamo[sglang]" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode prefill --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + python: "3.12" + nvcc: true + commands: + - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379" + - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222" + - export DYN_SYSTEM_PORT="8000" + - | + until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \ + && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do + echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 + done + - pip install --pre "ai-dynamo[sglang]" + - | + python3 -m dynamo.sglang \ + --model-path $MODEL_ID --served-model-name $MODEL_ID \ + --discovery-backend etcd --host 0.0.0.0 \ + --page-size 64 \ + --disaggregation-mode decode --disaggregation-transfer-backend nixl + resources: + gpu: H200 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +# Custom probe is required for PD disaggregation. +probes: + - type: http + url: /health + interval: 15s +``` + +
+ +Save the configuration as `service.dstack.yml`, then use the +[`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
+ +```shell +$ dstack apply -f service.dstack.yml +``` + +
+ +If no gateway is created, the service endpoint will be available at `<dstack server URL>/proxy/services/<project name>/<run name>/`. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/dynamo-pd/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <user token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "zai-org/GLM-4.5-Air-FP8", + "messages": [ + { + "role": "user", + "content": "What is prefill-decode disaggregation?" + } + ], + "max_tokens": 1024 + }' +``` + +
+ +> If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://dynamo-pd.<gateway domain>/`. + +## Configuration options + +Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. + +!!! info "Cluster" + PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. + + While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster. + +## What's next? + +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) +2. Browse the [NVIDIA Dynamo GitHub repository](https://github.com/ai-dynamo/dynamo) and the [SGLang](./sglang.md) example diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md index e900a5f0b..42d49d375 100644 --- a/mkdocs/docs/examples/inference/sglang.md +++ b/mkdocs/docs/examples/inference/sglang.md @@ -92,7 +92,6 @@ Here's an example of a service that deploys The AMD example keeps the deployment close to the upstream Qwen and SGLang guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the standard `qwen3` reasoning parser without extra ROCm-specific tuning flags. -The first startup on MI300X can take longer while SGLang compiles ROCm kernels. Save one of the configurations above as `service.dstack.yml`, then use the [`dstack apply`](../../reference/cli/dstack/apply.md) command. 
From a5569b7585eecd4e90c35d9e38f27c9f4d553ed2 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 14 May 2026 13:23:58 +0545 Subject: [PATCH 2/3] Minor Update --- mkdocs/docs/concepts/services.md | 6 +++--- mkdocs/docs/examples/inference/dynamo.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index 7cf95404c..fce30adac 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -443,7 +443,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`: - python3 -m venv ~/dyn-venv - source ~/dyn-venv/bin/activate - pip install -U pip - - pip install --pre "ai-dynamo[sglang]" + - pip install "ai-dynamo[sglang]==1.1.1" - git clone https://github.com/ai-dynamo/dynamo.git # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend. - docker compose -f dynamo/deploy/docker-compose.yml up -d @@ -474,7 +474,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`: && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 done - - pip install --pre "ai-dynamo[sglang]" + - pip install "ai-dynamo[sglang]==1.1.1" - | python3 -m dynamo.sglang \ --model-path $MODEL_ID --served-model-name $MODEL_ID \ @@ -499,7 +499,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`: && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 done - - pip install --pre "ai-dynamo[sglang]" + - pip install "ai-dynamo[sglang]==1.1.1" - | python3 -m dynamo.sglang \ --model-path $MODEL_ID --served-model-name $MODEL_ID \ diff --git a/mkdocs/docs/examples/inference/dynamo.md b/mkdocs/docs/examples/inference/dynamo.md index d4f930fed..ba3b94cf9 100644 --- a/mkdocs/docs/examples/inference/dynamo.md +++ b/mkdocs/docs/examples/inference/dynamo.md @@ -35,7 +35,7 @@ replicas: - python3 -m venv 
~/dyn-venv - source ~/dyn-venv/bin/activate - pip install -U pip - - pip install --pre "ai-dynamo[sglang]" + - pip install "ai-dynamo[sglang]==1.1.1" - git clone https://github.com/ai-dynamo/dynamo.git # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend. - docker compose -f dynamo/deploy/docker-compose.yml up -d @@ -66,7 +66,7 @@ replicas: && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 done - - pip install --pre "ai-dynamo[sglang]" + - pip install "ai-dynamo[sglang]==1.1.1" - | python3 -m dynamo.sglang \ --model-path $MODEL_ID --served-model-name $MODEL_ID \ @@ -91,7 +91,7 @@ replicas: && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3 done - - pip install --pre "ai-dynamo[sglang]" + - pip install "ai-dynamo[sglang]==1.1.1" - | python3 -m dynamo.sglang \ --model-path $MODEL_ID --served-model-name $MODEL_ID \ From 6481878324ac367eb299102fd1b026a6a1690548 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 14 May 2026 14:34:51 +0200 Subject: [PATCH 3/3] [Docs] NVIDIA Dynamo docs minor edits --- mkdocs/docs/concepts/services.md | 16 +++++++++------- mkdocs/docs/examples/inference/dynamo.md | 6 ++++-- mkdocs/docs/examples/inference/sglang.md | 6 ++++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index fce30adac..c998bc47e 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -342,12 +342,10 @@ Setting the minimum number of replicas to `0` allows the service to scale down t -Since 0.20.17, `dstack` supports serving a model using PD (Prefill-Decode) disaggregation. To use it, configure three replica groups: one for the router, one for prefill workers, and one for decode workers. 
+Since 0.20.17, `dstack` supports serving a model using Prefill-Decode disaggregation. To use it, configure three replica groups: one for the router, one for prefill workers, and one for decode workers. `dstack` integrates with two routers for PD disaggregation: [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html) and [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo). -> Currently, with SMG router Prefill-Decode disaggregation is supported only for SGLang. - Below is an example for running `zai-org/GLM-4.5-Air-FP8`: === "SMG" @@ -374,10 +372,10 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`: --port 8000 \ --pd-disaggregation \ --prefill-policy cache_aware - router: - type: sglang resources: cpu: 4 + router: + type: sglang - count: 1..4 scaling: @@ -420,6 +418,8 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`: + > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. + === "Dynamo"
@@ -435,8 +435,6 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`: replicas: - count: 1 docker: true - router: - type: dynamo commands: - apt-get update - apt-get install -y python3-dev python3-venv @@ -454,6 +452,8 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`: --kv-cache-block-size 64 resources: cpu: 4 + router: + type: dynamo - count: 1..4 scaling: @@ -521,6 +521,8 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
+ > With the `dynamo` router, you can use SGLang, vLLM, and TensorRT-LLM prefill and decode workers. + !!! info "Cluster" PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. diff --git a/mkdocs/docs/examples/inference/dynamo.md b/mkdocs/docs/examples/inference/dynamo.md index ba3b94cf9..0c30f19e7 100644 --- a/mkdocs/docs/examples/inference/dynamo.md +++ b/mkdocs/docs/examples/inference/dynamo.md @@ -27,8 +27,6 @@ env: replicas: - count: 1 docker: true - router: - type: dynamo commands: - apt-get update - apt-get install -y python3-dev python3-venv @@ -46,6 +44,8 @@ replicas: --kv-cache-block-size 64 resources: cpu: 4 + router: + type: dynamo - count: 1..4 scaling: @@ -113,6 +113,8 @@ probes: +> With the `dynamo` router, you can use SGLang, vLLM, and TensorRT-LLM prefill and decode workers. + Save the configuration as `service.dstack.yml`, then use the [`dstack apply`](../../reference/cli/dstack/apply.md) command. diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md index 42d49d375..6e67eecdd 100644 --- a/mkdocs/docs/examples/inference/sglang.md +++ b/mkdocs/docs/examples/inference/sglang.md @@ -163,10 +163,10 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/ --port 8000 \ --pd-disaggregation \ --prefill-policy cache_aware - router: - type: sglang resources: cpu: 4 + router: + type: sglang - count: 1..4 scaling: @@ -211,6 +211,8 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/ +> With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. + Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. !!! info "Cluster"