diff --git a/src/main/java/dev/netcopy/transfer/Puller.java b/src/main/java/dev/netcopy/transfer/Puller.java
index bb609ee..1bfaa3c 100644
--- a/src/main/java/dev/netcopy/transfer/Puller.java
+++ b/src/main/java/dev/netcopy/transfer/Puller.java
@@ -753,8 +753,19 @@ private BlobPuller createBlobPuller(JobState job) {
                 if (host == null) {
                     throw new IllegalArgumentException("peerUrl has no host: " + job.peerUrl());
                 }
+                // Pool sized to the GLOBAL chunk-worker concurrency, not per-file. The puller
+                // can have up to `fileParallelism × chunksPerFile` chunk workers running at
+                // once (each file gets its own chunk semaphore but all files share the
+                // BlobPuller's connection pool). Pre-v0.4.1 the pool was sized to just
+                // `chunksPerFile`, so with the defaults 8×4=32 chunk workers were competing
+                // for 8 sockets and the Performance modal's "pool acquire wait" sat at
+                // p50 ~280ms — a quarter of every chunk's wall clock. Multiplying out gives
+                // us a 1:1 socket-per-worker ratio with no contention. The TCP server's
+                // MAX_CONCURRENT_CONNECTIONS=1024 cap is well above any sensible product.
+                int poolSize = Math.max(1, job.chunksPerFile())
+                             * Math.max(1, job.fileParallelism());
                 yield new TcpBlobPuller(host, job.peerTcpPort(), peerToken,
-                    Math.max(1, job.chunksPerFile()), bytesObserver);
+                    poolSize, bytesObserver);
             }
         };
     }