Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@
/TODO.md
/gguf/
*.o
*.exe
*.dSYM/
# Generated MSVC import libs for the Windows ROCm build (regenerated from the
# HIP SDK's lib*.dll by win/build-rocm.sh).
/win/third_party/hipblas.lib
/win/third_party/hipblaslt.lib
__pycache__/
*.pyc
/misc/
.*.swp
.DS_Store
ds4.lock
37 changes: 35 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,40 @@ DS4_LINK_LIBS ?= $(CUDA_LDLIBS)
METAL_LDLIBS := $(LDLIBS)
endif

# Targets that name commands rather than files. The second declaration repeats
# the first list and adds windows-rocm and windows-cpu.
# NOTE(review): the two lines look like the before/after sides of a diff;
# confirm that only the longer declaration is meant to remain.
.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo rocm
.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo rocm windows-rocm windows-cpu

# windows-rocm: build ds4-bench.exe natively on Windows against the AMD HIP SDK
# (ROCm/HIP, gfx1151) without going through WSL.
#
# The compile and link steps live in win/build-rocm.sh rather than in this
# recipe because hipcc.exe's .bat wrapper splits its arguments on spaces. The
# same script also synthesizes the MSVC import libs and vendors rocWMMA.
#
# Knobs: ROCM_PATH and ROCM_ARCH; both are handed to the script through its
# environment and can be overridden as needed.
ROCM_PATH ?= C:/Program Files/AMD/ROCm/7.1

windows-rocm:
	ROCM_PATH="$(ROCM_PATH)" \
	ROCM_ARCH="$(ROCM_ARCH)" \
	bash win/build-rocm.sh

# windows-cpu: CPU-only ds4-bench.exe built natively on Windows with MinGW-w64
# GCC. No GPU backend is compiled in, and neither WSL nor MSVC is required;
# DeepSeek V4 runs on the CPU backend in this build. The target mirrors the
# old `windows-cpu` target from the pre-refactor base.
#
# POSIX surface: the mmap/stat/socket shims (ds4_win.h,
# win/ds4_sockets_win.h) are wired in-tree behind _WIN32. MinGW already
# provides pthread/clock_gettime/ftruncate, so the MSVC pthread shim, which is
# guarded by !__MINGW32__, is not used here.
#
# Link set: the main branch moved ds4_distributed.c and ds4_ssd.c into the
# shared core, so both are linked into the bench as well. ds4_distributed.c
# uses Winsock, and the MSVC `#pragma comment(lib,...)` in the sockets shim is
# a no-op under gcc, so -lws2_32 and -liphlpapi are named explicitly.
#
# Isolation: the rule carries its own compiler and flag variables, so it
# builds regardless of the host uname -s branch above and leaves the
# POSIX/cuda/rocm targets untouched.
WIN_CPU_CC ?= gcc
WIN_CPU_CFLAGS ?= -O3 -ffast-math -march=native -g \
    -Wall -Wextra -std=c99 -D_GNU_SOURCE \
    -fno-finite-math-only -DDS4_NO_GPU
WIN_CPU_LDLIBS ?= -lm -lpthread -lws2_32 -liphlpapi
WIN_CPU_OBJS = \
    ds4_bench_cpu.o \
    ds4_help.o \
    ds4_cpu.o \
    ds4_distributed.o \
    ds4_ssd.o

# $(call win_cpu_compile,OBJECT,SOURCE): compile one translation unit with the
# Windows CPU toolchain settings above.
win_cpu_compile = $(WIN_CPU_CC) $(WIN_CPU_CFLAGS) -c -o $(1) $(2)

windows-cpu:
	$(call win_cpu_compile,ds4_cpu.o,ds4.c)
	$(call win_cpu_compile,ds4_bench_cpu.o,ds4_bench.c)
	$(call win_cpu_compile,ds4_help.o,ds4_help.c)
	$(call win_cpu_compile,ds4_distributed.o,ds4_distributed.c)
	$(call win_cpu_compile,ds4_ssd.o,ds4_ssd.c)
	$(WIN_CPU_CC) $(WIN_CPU_CFLAGS) -o ds4-bench.exe $(WIN_CPU_OBJS) $(WIN_CPU_LDLIBS)

ifeq ($(UNAME_S),Darwin)
all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
Expand Down Expand Up @@ -230,4 +263,4 @@ q4k-dot-test: tests/test_q4k_dot.c
./tests/test_q4k_dot

# Remove build products. Both rm lines list the same files; the second one
# additionally removes the Windows .exe binaries.
# NOTE(review): the two rm lines look like the before/after sides of a diff;
# confirm that only the longer one is meant to remain.
clean:
rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o ds4-bench.exe ds4.exe ds4-server.exe ds4-eval.exe ds4-agent.exe
26 changes: 26 additions & 0 deletions ds4.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,46 @@
#include <ctype.h>
#include <limits.h>
#include <math.h>
#if defined(_WIN32) && defined(DS4_WIN_PTHREAD)
/* Native Windows GPU (HIP/MSVC-ABI) build: MSVC has no <pthread.h>; use the
* Win32 pthread shim. The MinGW CPU build (no DS4_WIN_PTHREAD) keeps real
* winpthreads, so its behavior is unchanged. */
#include "win/ds4_pthread_win.h"
#else
#include <pthread.h>
#endif
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
/* Native Windows CPU (MinGW-w64) and GPU (HIP/clang-MSVC) builds: a small
* dependency-free POSIX shim supplies mmap/flock/pread/sysconf/dprintf/
* fmemopen. See ds4_win.h. The shim body is guarded by _WIN32, so POSIX
* builds are byte-for-byte unchanged. */
#include "ds4_win.h"
#include <sys/stat.h>
#include <stdarg.h>
#include <time.h>
#if defined(__MINGW32__)
#include <strings.h> /* MinGW provides strcasecmp/strncasecmp */
#include <unistd.h> /* MinGW provides POSIX unistd surface */
#else
#include <io.h> /* MSVC-ABI build: read/write/close/lseek/isatty */
#include <process.h> /* getpid */
#include <direct.h>
#endif
#else
#include <strings.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <stdarg.h>
#include <time.h>
#include <unistd.h>
#endif

#include "ds4.h"
#include "ds4_distributed.h"
Expand Down
5 changes: 5 additions & 0 deletions ds4_bench.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef _WIN32
/* Native Windows build: the POSIX shim supplies clock_gettime/CLOCK_MONOTONIC,
* nanosleep and PATH_MAX. Body guarded by _WIN32. See ds4_win.h. */
#include "ds4_win.h"
#endif

typedef struct {
const char *model_path;
Expand Down
36 changes: 29 additions & 7 deletions ds4_distributed.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,49 @@

#include "ds4_distributed.h"

#include <arpa/inet.h>
#include <errno.h>
#include <float.h>
#include <math.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#if defined(_WIN32)
/* Native Windows build: the main branch moved the distributed runtime into
 * CORE_OBJS, so it links into ds4-bench. The POSIX sockets surface it uses
 * (arpa/inet, netdb, sys/socket, poll, …) is supplied by a Winsock2 shim, and
 * the POSIX shim (ds4_win.h) covers nanosleep/clock_gettime/etc. Both shim
 * bodies are guarded by _WIN32, so POSIX builds are byte-for-byte unchanged. */
#include "win/ds4_sockets_win.h"
#if defined(DS4_WIN_PTHREAD)
#include "win/ds4_pthread_win.h"
#else
#include <pthread.h>
#endif
#if defined(__MINGW32__)
/* MinGW supplies POSIX sleep()/usleep() via <unistd.h>; the MSVC ABI build does
* not, and relies on the sleep() shim in ds4_win.h instead (guarded there by
* !__MINGW32__). Pulling <unistd.h> here keeps the MinGW CPU build's sleep()
* declared without disturbing the MSVC ROCm path. */
#include <unistd.h>
#endif
#include "ds4_win.h"
#else
#include <arpa/inet.h>
#include <netdb.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#endif

/* =========================================================================
* Protocol Constants And Wire Records
Expand Down
4 changes: 4 additions & 0 deletions ds4_help.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#if defined(_WIN32) && !defined(__MINGW32__)
#include <io.h> /* MSVC-ABI build: isatty/fileno (no <unistd.h>) */
#else
#include <unistd.h>
#endif

typedef struct {
const char *off;
Expand Down
10 changes: 10 additions & 0 deletions ds4_rocm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,17 @@
#include <string.h>
#include <sys/stat.h>
#include <time.h>
#ifdef _WIN32
/* Native Windows ROCm build: the same dependency-free POSIX shim used by
* ds4.c supplies mmap/sysconf/pread/fcntl/flock. <io.h> provides the
* _open/_read/_write/_close family; the aliases below map the POSIX names the
* device-host code uses. The shim body is guarded by _WIN32, so POSIX/CUDA
* builds are byte-for-byte unchanged. See win/README.md and ds4_win.h. */
#include "ds4_win.h"
#include <io.h>
#else
#include <unistd.h>
#endif
#include <unordered_map>
#include <vector>

Expand Down
10 changes: 10 additions & 0 deletions ds4_ssd.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,18 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
/* Native Windows build: the dependency-free POSIX shim supplies mmap (incl.
* anonymous MAP_PRIVATE|MAP_ANONYMOUS used by --simulate-used-memory),
* munmap, mlock/munlock and sysconf. Body guarded by _WIN32. See ds4_win.h. */
#include "ds4_win.h"
#if defined(__MINGW32__)
#include <unistd.h>
#endif
#else
#include <sys/mman.h>
#include <unistd.h>
#endif

#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
Expand Down
Loading