From 422406d5cc8290021df0d68c20978cd48d29fa22 Mon Sep 17 00:00:00 2001
From: Bihan  Rana <bihan@Bihans-MacBook-Pro.local>
Date: Wed, 13 May 2026 17:45:37 +0545
Subject: [PATCH 1/3] Add Dynamo docs

---
 mkdocs.yml                               |   1 +
 mkdocs/docs/concepts/services.md         | 109 ++++++++++++++-
 mkdocs/docs/examples/inference/dynamo.md | 164 +++++++++++++++++++++++
 mkdocs/docs/examples/inference/sglang.md |   1 -
 4 files changed, 271 insertions(+), 4 deletions(-)
 create mode 100644 mkdocs/docs/examples/inference/dynamo.md
diff --git a/mkdocs.yml b/mkdocs.yml
index 0bab8c329..4c057b500 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -321,6 +321,7 @@ nav:
             - NCCL/RCCL tests: docs/examples/clusters/nccl-rccl-tests.md
         - Inference:
             - SGLang: docs/examples/inference/sglang.md
+            - Dynamo: docs/examples/inference/dynamo.md
             - vLLM: docs/examples/inference/vllm.md
             - NIM: docs/examples/inference/nim.md
             - TensorRT-LLM: docs/examples/inference/trtllm.md
diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md
index d9c61ec54..7cf95404c 100644
--- a/mkdocs/docs/concepts/services.md
+++ b/mkdocs/docs/concepts/services.md
@@ -342,13 +342,15 @@ Setting the minimum number of replicas to `0` allows the service to scale down t
 
 <!-- NOTE: this section is referenced from the CLI, keep the URL unchanged -->
 
-Since 0.20.17, `dstack` supports serving a model using PD disaggregation. To use it, configure three replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers, and one for decode workers.
+Since 0.20.17, `dstack` supports serving a model using PD (Prefill-Decode) disaggregation. To use it, configure three replica groups: one for the router, one for prefill workers, and one for decode workers.
 
-> Currently, Prefill-Decode disaggregation is supported only for SGLang.
+`dstack` integrates with two routers for PD disaggregation: [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html) and [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo).
+
+> Currently, with SMG router Prefill-Decode disaggregation is supported only for SGLang.
 
 Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
 
-=== "NVIDIA"
+=== "SMG"
 
     <div editor-title="pd.dstack.yml">
 
@@ -418,6 +420,107 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
 
     </div>
 
+=== "Dynamo"
+
+    <div editor-title="pd.dstack.yml">
+
+    ```yaml
+    type: service
+    name: dynamo-pd
+
+    env:
+      - HF_TOKEN
+      - MODEL_ID=zai-org/GLM-4.5-Air-FP8
+
+    replicas:
+      - count: 1
+        docker: true
+        router:
+          type: dynamo
+        commands:
+          - apt-get update
+          - apt-get install -y python3-dev python3-venv
+          - python3 -m venv ~/dyn-venv
+          - source ~/dyn-venv/bin/activate
+          - pip install -U pip
+          - pip install --pre "ai-dynamo[sglang]"
+          - git clone https://github.com/ai-dynamo/dynamo.git
+          # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend.
+          - docker compose -f dynamo/deploy/docker-compose.yml up -d
+          - |
+            python3 -m dynamo.frontend \
+              --http-host 0.0.0.0 --http-port 8000 \
+              --discovery-backend etcd --router-mode kv \
+              --kv-cache-block-size 64
+        resources:
+          cpu: 4
+
+      - count: 1..4
+        scaling:
+          metric: rps
+          target: 3
+        python: "3.12"
+        nvcc: true
+        commands:
+          # dstack injects DSTACK_ROUTER_INTERNAL_IP after the router replica
+          # is provisioned. Compose the etcd/NATS endpoints from it.
+          - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379"
+          - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222"
+          # Set to enable /health endpoint required by dstack probes.
+          - export DYN_SYSTEM_PORT="8000"
+          # Wait until the router's etcd and NATS ports are actually accepting connections.
+          - |
+            until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \
+               && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
+              echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
+            done
+          - pip install --pre "ai-dynamo[sglang]"
+          - |
+            python3 -m dynamo.sglang \
+              --model-path $MODEL_ID --served-model-name $MODEL_ID \
+              --discovery-backend etcd --host 0.0.0.0 \
+              --page-size 64 \
+              --disaggregation-mode prefill --disaggregation-transfer-backend nixl
+        resources:
+          gpu: H200
+
+      - count: 1..8
+        scaling:
+          metric: rps
+          target: 2
+        python: "3.12"
+        nvcc: true
+        commands:
+          - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379"
+          - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222"
+          - export DYN_SYSTEM_PORT="8000"
+          - |
+            until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \
+               && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
+              echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
+            done
+          - pip install --pre "ai-dynamo[sglang]"
+          - |
+            python3 -m dynamo.sglang \
+              --model-path $MODEL_ID --served-model-name $MODEL_ID \
+              --discovery-backend etcd --host 0.0.0.0 \
+              --page-size 64 \
+              --disaggregation-mode decode --disaggregation-transfer-backend nixl
+        resources:
+          gpu: H200
+
+    port: 8000
+    model: zai-org/GLM-4.5-Air-FP8
+
+    # Custom probe is required for PD disaggregation.
+    probes:
+      - type: http
+        url: /health
+        interval: 15s
+    ```
+
+    </div>
+
 !!! info "Cluster"
     PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances.
 
diff --git a/mkdocs/docs/examples/inference/dynamo.md b/mkdocs/docs/examples/inference/dynamo.md
new file mode 100644
index 000000000..d4f930fed
--- /dev/null
+++ b/mkdocs/docs/examples/inference/dynamo.md
@@ -0,0 +1,164 @@
+---
+title: Dynamo
+description: Deploying zai-org/GLM-4.5-Air-FP8 using NVIDIA Dynamo
+---
+
+# Dynamo
+
+This example shows how to deploy `zai-org/GLM-4.5-Air-FP8` using
+[NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) and `dstack`.
+
+
+## Apply a configuration
+
+Here's an example of a service that deploys `zai-org/GLM-4.5-Air-FP8` using
+Dynamo with PD disaggregation.
+
+<div editor-title="service.dstack.yml">
+
+```yaml
+type: service
+name: dynamo-pd
+
+env:
+  - HF_TOKEN
+  - MODEL_ID=zai-org/GLM-4.5-Air-FP8
+
+replicas:
+  - count: 1
+    docker: true
+    router:
+      type: dynamo
+    commands:
+      - apt-get update
+      - apt-get install -y python3-dev python3-venv
+      - python3 -m venv ~/dyn-venv
+      - source ~/dyn-venv/bin/activate
+      - pip install -U pip
+      - pip install --pre "ai-dynamo[sglang]"
+      - git clone https://github.com/ai-dynamo/dynamo.git
+      # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend.
+      - docker compose -f dynamo/deploy/docker-compose.yml up -d
+      - |
+        python3 -m dynamo.frontend \
+          --http-host 0.0.0.0 --http-port 8000 \
+          --discovery-backend etcd --router-mode kv \
+          --kv-cache-block-size 64
+    resources:
+      cpu: 4
+
+  - count: 1..4
+    scaling:
+      metric: rps
+      target: 3
+    python: "3.12"
+    nvcc: true
+    commands:
+      # dstack injects DSTACK_ROUTER_INTERNAL_IP after the router replica
+      # is provisioned. Compose the etcd/NATS endpoints from it.
+      - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379"
+      - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222"
+      # Set to enable /health endpoint required by dstack probes.
+      - export DYN_SYSTEM_PORT="8000"
+      # Wait until the router's etcd and NATS ports are actually accepting connections.
+      - |
+        until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \
+           && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
+          echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
+        done
+      - pip install --pre "ai-dynamo[sglang]"
+      - |
+        python3 -m dynamo.sglang \
+          --model-path $MODEL_ID --served-model-name $MODEL_ID \
+          --discovery-backend etcd --host 0.0.0.0 \
+          --page-size 64 \
+          --disaggregation-mode prefill --disaggregation-transfer-backend nixl
+    resources:
+      gpu: H200
+
+  - count: 1..8
+    scaling:
+      metric: rps
+      target: 2
+    python: "3.12"
+    nvcc: true
+    commands:
+      - export ETCD_ENDPOINTS="http://$DSTACK_ROUTER_INTERNAL_IP:2379"
+      - export NATS_SERVER="nats://$DSTACK_ROUTER_INTERNAL_IP:4222"
+      - export DYN_SYSTEM_PORT="8000"
+      - |
+        until (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/2379) 2>/dev/null \
+           && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
+          echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
+        done
+      - pip install --pre "ai-dynamo[sglang]"
+      - |
+        python3 -m dynamo.sglang \
+          --model-path $MODEL_ID --served-model-name $MODEL_ID \
+          --discovery-backend etcd --host 0.0.0.0 \
+          --page-size 64 \
+          --disaggregation-mode decode --disaggregation-transfer-backend nixl
+    resources:
+      gpu: H200
+
+port: 8000
+model: zai-org/GLM-4.5-Air-FP8
+
+# Custom probe is required for PD disaggregation.
+probes:
+  - type: http
+    url: /health
+    interval: 15s
+```
+
+</div>
+
+Save the configuration as `service.dstack.yml`, then use the
+[`dstack apply`](../../reference/cli/dstack/apply.md) command.
+
+<div class="termy">
+
+```shell
+$ dstack apply -f service.dstack.yml
+```
+
+</div>
+
+If no gateway is created, the service endpoint will be available at `<dstack server URL>/proxy/services/<project name>/<run name>/`.
+
+<div class="termy">
+
+```shell
+$ curl http://127.0.0.1:3000/proxy/services/main/dynamo-pd/v1/chat/completions \
+    -X POST \
+    -H 'Authorization: Bearer &lt;user token&gt;' \
+    -H 'Content-Type: application/json' \
+    -d '{
+      "model": "zai-org/GLM-4.5-Air-FP8",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is prefill-decode disaggregation?"
+        }
+      ],
+      "max_tokens": 1024
+    }'
+```
+
+</div>
+
+> If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://dynamo-pd.<gateway domain>/`.
+
+## Configuration options
+
+Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon.
+
+!!! info "Cluster"
+    PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances.
+
+    While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster.
+
+## What's next?
+
+1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md)
+2. Browse the [NVIDIA Dynamo GitHub repository](https://github.com/ai-dynamo/dynamo) and the [SGLang](./sglang.md) example
diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md
index e900a5f0b..42d49d375 100644
--- a/mkdocs/docs/examples/inference/sglang.md
+++ b/mkdocs/docs/examples/inference/sglang.md
@@ -92,7 +92,6 @@ Here's an example of a service that deploys
 The AMD example keeps the deployment close to the upstream Qwen and SGLang
 guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the
 standard `qwen3` reasoning parser without extra ROCm-specific tuning flags.
-The first startup on MI300X can take longer while SGLang compiles ROCm kernels.
 
 Save one of the configurations above as `service.dstack.yml`, then use the
 [`dstack apply`](../../reference/cli/dstack/apply.md) command.

From a5569b7585eecd4e90c35d9e38f27c9f4d553ed2 Mon Sep 17 00:00:00 2001
From: Bihan  Rana <bihan@Bihans-MacBook-Pro.local>
Date: Thu, 14 May 2026 13:23:58 +0545
Subject: [PATCH 2/3] Minor Update

---
 mkdocs/docs/concepts/services.md         | 6 +++---
 mkdocs/docs/examples/inference/dynamo.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md
index 7cf95404c..fce30adac 100644
--- a/mkdocs/docs/concepts/services.md
+++ b/mkdocs/docs/concepts/services.md
@@ -443,7 +443,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
           - python3 -m venv ~/dyn-venv
           - source ~/dyn-venv/bin/activate
           - pip install -U pip
-          - pip install --pre "ai-dynamo[sglang]"
+          - pip install "ai-dynamo[sglang]==1.1.1"
           - git clone https://github.com/ai-dynamo/dynamo.git
           # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend.
           - docker compose -f dynamo/deploy/docker-compose.yml up -d
@@ -474,7 +474,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
                && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
               echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
             done
-          - pip install --pre "ai-dynamo[sglang]"
+          - pip install "ai-dynamo[sglang]==1.1.1"
           - |
             python3 -m dynamo.sglang \
               --model-path $MODEL_ID --served-model-name $MODEL_ID \
@@ -499,7 +499,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
                && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
               echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
             done
-          - pip install --pre "ai-dynamo[sglang]"
+          - pip install "ai-dynamo[sglang]==1.1.1"
           - |
             python3 -m dynamo.sglang \
               --model-path $MODEL_ID --served-model-name $MODEL_ID \
diff --git a/mkdocs/docs/examples/inference/dynamo.md b/mkdocs/docs/examples/inference/dynamo.md
index d4f930fed..ba3b94cf9 100644
--- a/mkdocs/docs/examples/inference/dynamo.md
+++ b/mkdocs/docs/examples/inference/dynamo.md
@@ -35,7 +35,7 @@ replicas:
       - python3 -m venv ~/dyn-venv
       - source ~/dyn-venv/bin/activate
       - pip install -U pip
-      - pip install --pre "ai-dynamo[sglang]"
+      - pip install "ai-dynamo[sglang]==1.1.1"
       - git clone https://github.com/ai-dynamo/dynamo.git
       # Brings up the NATS / etcd compose stack and runs the Dynamo HTTP frontend.
       - docker compose -f dynamo/deploy/docker-compose.yml up -d
@@ -66,7 +66,7 @@ replicas:
            && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
           echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
         done
-      - pip install --pre "ai-dynamo[sglang]"
+      - pip install "ai-dynamo[sglang]==1.1.1"
       - |
         python3 -m dynamo.sglang \
           --model-path $MODEL_ID --served-model-name $MODEL_ID \
@@ -91,7 +91,7 @@ replicas:
            && (echo > /dev/tcp/$DSTACK_ROUTER_INTERNAL_IP/4222) 2>/dev/null; do
           echo "waiting for etcd/NATS on $DSTACK_ROUTER_INTERNAL_IP..."; sleep 3
         done
-      - pip install --pre "ai-dynamo[sglang]"
+      - pip install "ai-dynamo[sglang]==1.1.1"
       - |
         python3 -m dynamo.sglang \
           --model-path $MODEL_ID --served-model-name $MODEL_ID \

From 6481878324ac367eb299102fd1b026a6a1690548 Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov <andrey.cheptsov@github.com>
Date: Thu, 14 May 2026 14:34:51 +0200
Subject: [PATCH 3/3] [Docs] NVIDIA Dynamo docs minor edits

---
 mkdocs/docs/concepts/services.md         | 16 +++++++++-------
 mkdocs/docs/examples/inference/dynamo.md |  6 ++++--
 mkdocs/docs/examples/inference/sglang.md |  6 ++++--
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md
index fce30adac..c998bc47e 100644
--- a/mkdocs/docs/concepts/services.md
+++ b/mkdocs/docs/concepts/services.md
@@ -342,12 +342,10 @@ Setting the minimum number of replicas to `0` allows the service to scale down t
 
 <!-- NOTE: this section is referenced from the CLI, keep the URL unchanged -->
 
-Since 0.20.17, `dstack` supports serving a model using PD (Prefill-Decode) disaggregation. To use it, configure three replica groups: one for the router, one for prefill workers, and one for decode workers.
+Since 0.20.17, `dstack` supports serving a model using Prefill-Decode disaggregation. To use it, configure three replica groups: one for the router, one for prefill workers, and one for decode workers.
 
 `dstack` integrates with two routers for PD disaggregation: [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html) and [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo).
 
-> Currently, with SMG router Prefill-Decode disaggregation is supported only for SGLang.
-
 Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
 
 === "SMG"
@@ -374,10 +372,10 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
               --port 8000 \
               --pd-disaggregation \
               --prefill-policy cache_aware
-        router:
-          type: sglang
         resources:
           cpu: 4
+        router:
+          type: sglang
 
       - count: 1..4
         scaling:
@@ -420,6 +418,8 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
 
     </div>
 
+    > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon.
+
 === "Dynamo"
 
     <div editor-title="pd.dstack.yml">
@@ -435,8 +435,6 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
     replicas:
       - count: 1
         docker: true
-        router:
-          type: dynamo
         commands:
           - apt-get update
           - apt-get install -y python3-dev python3-venv
@@ -454,6 +452,8 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
               --kv-cache-block-size 64
         resources:
           cpu: 4
+        router:
+          type: dynamo
 
       - count: 1..4
         scaling:
@@ -521,6 +521,8 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8`:
 
     </div>
 
+    > With the `dynamo` router, you can use SGLang, vLLM, and TensorRT-LLM prefill and decode workers.
+
 !!! info "Cluster"
     PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances.
 
diff --git a/mkdocs/docs/examples/inference/dynamo.md b/mkdocs/docs/examples/inference/dynamo.md
index ba3b94cf9..0c30f19e7 100644
--- a/mkdocs/docs/examples/inference/dynamo.md
+++ b/mkdocs/docs/examples/inference/dynamo.md
@@ -27,8 +27,6 @@ env:
 replicas:
   - count: 1
     docker: true
-    router:
-      type: dynamo
     commands:
       - apt-get update
       - apt-get install -y python3-dev python3-venv
@@ -46,6 +44,8 @@ replicas:
           --kv-cache-block-size 64
     resources:
       cpu: 4
+    router:
+      type: dynamo
 
   - count: 1..4
     scaling:
@@ -113,6 +113,8 @@ probes:
 
 </div>
 
+> With the `dynamo` router, you can use SGLang, vLLM, and TensorRT-LLM prefill and decode workers.
+
 Save the configuration as `service.dstack.yml`, then use the
 [`dstack apply`](../../reference/cli/dstack/apply.md) command.
 
diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md
index 42d49d375..6e67eecdd 100644
--- a/mkdocs/docs/examples/inference/sglang.md
+++ b/mkdocs/docs/examples/inference/sglang.md
@@ -163,10 +163,10 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/
               --port 8000 \
               --pd-disaggregation \
               --prefill-policy cache_aware
-        router:
-          type: sglang
         resources:
           cpu: 4
+        router:
+          type: sglang
 
       - count: 1..4
         scaling:
@@ -211,6 +211,8 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/
 
     </div>
 
+> With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon.
+
 Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon.
 
 !!! info "Cluster"