Skip to content

Commit ec91df9

Browse files
committed
Debug for remote benchmark
1 parent cc0ecaf commit ec91df9

File tree

9 files changed

+247
-35
lines changed

9 files changed

+247
-35
lines changed

gpu_driven/Makefile

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@ INC = -I./ -I../include -I$(HIP_HOME)/include -I${CONDA_LIB_HOME}/../include -L$
88
LIBS = -lz -lelf -libverbs -L ${HIP_HOME}/lib -lamdhip64
99
LIBS_SHARED = -lglog -lgflags -lgtest -lz -lelf -libverbs
1010
override CXXFLAGS += -O3 -g -std=c++17 -Wno-pointer-arith -Wno-interference-size -fPIC -D__HIP_PLATFORM_AMD__
11+
override HIPFLAGS += -O3 -g -std=c++17 -Wno-pointer-arith -Wno-interference-size -fPIC -D__HIP_PLATFORM_AMD__ --offload-arch=gfx942
1112
DEPS = *.h
12-
PLUGIN_SO = librccl-net-uccl.so
13-
NCCL_INC:= -I$(RCCL_HOME)/build/release/include -I$(RCCL_HOME)/src/include -I$(HIP_HOME)/include
13+
# PLUGIN_SO = librccl-net-uccl.so
14+
# NCCL_INC:= -I$(RCCL_HOME)/build/release/include -I$(RCCL_HOME)/src/include -I$(HIP_HOME)/include
1415

1516
# HIP compiler
1617
HIPCC := $(HIP_HOME)/bin/hipcc
@@ -48,7 +49,7 @@ TARGET_LOCAL := benchmark_local
4849
TARGET_REMOTE := benchmark_remote
4950

5051
.PHONY: build
51-
build: $(test_bin) $(lib_obj) $(PLUGIN_SO) librdma_hip.a $(TARGET_LOCAL) $(TARGET_REMOTE)
52+
build: $(test_bin) $(lib_obj) librdma_hip.a $(TARGET_LOCAL) $(TARGET_REMOTE)
5253

5354
# Test binary compilation
5455
%_test: %_test.cc $(DEPS) $(lib_obj)
@@ -64,27 +65,27 @@ build: $(test_bin) $(lib_obj) $(PLUGIN_SO) librdma_hip.a $(TARGET_LOCAL) $(TARGE
6465

6566
# HIP compilation rule
6667
%.o: %.hip
67-
$(HIPCC) $(CXXFLAGS) $(INCLUDES) -MMD -MP -c $< -o $@
68+
$(HIPCC) $(HIPFLAGS) $(INCLUDES) -MMD -MP -c $< -o $@
6869

69-
# Plugin shared library
70-
$(PLUGIN_SO): nccl_plugin.cc $(DEPS) $(lib_obj)
71-
g++ $(NCCL_INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) nccl_plugin.cc $(lib_obj) $(INC) $(LIBS_SHARED) $(CXXFLAGS)
70+
# Plugin shared library - commented out as NCCL plugin is not part of this project
71+
# $(PLUGIN_SO): nccl_plugin.cc $(DEPS) $(lib_obj)
72+
# g++ $(NCCL_INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) nccl_plugin.cc $(lib_obj) $(INC) $(LIBS_SHARED) $(CXXFLAGS)
7273

7374
# Static library
7475
librdma_hip.a: $(lib_obj)
7576
ar rcs $@ $(lib_obj)
7677

7778
# Benchmark targets - linking with HIP
7879
$(TARGET_LOCAL): $(OBJ_LOCAL)
79-
$(HIPCC) $(CXXFLAGS) $(INCLUDES) $(OBJ_LOCAL) $(LIBS) -o $@
80+
$(HIPCC) $(HIPFLAGS) $(INCLUDES) $(OBJ_LOCAL) $(LIBS) -o $@
8081

8182
$(TARGET_REMOTE): $(OBJ_REMOTE)
82-
$(HIPCC) $(CXXFLAGS) $(INCLUDES) $(OBJ_REMOTE) $(LIBS) -o $@
83+
$(HIPCC) $(HIPFLAGS) $(INCLUDES) $(OBJ_REMOTE) $(LIBS) -o $@
8384

8485
# Clean all generated files
8586
.PHONY: clean
8687
clean:
87-
rm -f *.o $(test_bin) $(PLUGIN_SO) librdma_hip.a $(TARGET_LOCAL) $(TARGET_REMOTE) *.d src/*.d bench/*.d
88+
rm -f *.o $(test_bin) librdma_hip.a $(TARGET_LOCAL) $(TARGET_REMOTE) *.d src/*.d bench/*.d
8889

8990
# Automatically include dependency files if they exist
9091
-include $(OBJ_LOCAL:.o=.d) $(OBJ_REMOTE:.o=.d)

gpu_driven/bench/benchmark_local.hip

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,7 @@ int main(int argc, char** argv) {
1515
return 1;
1616
}
1717

18-
GdrSupportInitOnce();
19-
if (!GdrSupportInitOnce()) {
20-
printf(
21-
"Error: GPUDirect RDMA module is not loaded. Please load "
22-
"amdgpu or rocm_smi modules!\n");
23-
exit(1);
24-
}
18+
2519

2620
hipStream_t stream1;
2721
hipStreamCreate(&stream1);

gpu_driven/bench/benchmark_remote.hip

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,25 @@ int main(int argc, char** argv) {
4949

5050
printf("Allocated %zu bytes of GPU buffer at %p\n", total_size, gpu_buffer);
5151

52+
// Initialize per_GPU_device_buf array
53+
for (int d = 0; d < NUM_GPUS; ++d) {
54+
per_GPU_device_buf[d] = nullptr;
55+
}
56+
57+
// Initialize per_GPU_device_buf for remote node
58+
if (rank == 1) {
59+
for (int d = 0; d < NUM_GPUS; ++d) {
60+
hipSetDevice(d);
61+
hipMalloc(&per_GPU_device_buf[d], total_size);
62+
if (per_GPU_device_buf[d] == nullptr) {
63+
fprintf(stderr, "Failed to allocate GPU buffer on GPU %d\n", d);
64+
exit(1);
65+
}
66+
printf("Allocated %zu bytes for per_GPU_device_buf[%d] at %p\n", total_size, d, per_GPU_device_buf[d]);
67+
}
68+
hipSetDevice(0);
69+
}
70+
5271
// Initialize global RDMA resources
5372
RDMAConnectionInfo local_info;
5473
global_rdma_init(gpu_buffer, total_size, &local_info, rank);

gpu_driven/include/common.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@
2323
} \
2424
} while (0)
2525

26+
#define HIP_CHECK(cmd) \
27+
do { \
28+
hipError_t err = cmd; \
29+
if (err != hipSuccess) { \
30+
fprintf(stderr, "HIP error (%s:%d): %s\n", __FILE__, __LINE__, hipGetErrorString(err)); \
31+
exit(EXIT_FAILURE); \
32+
} \
33+
} while (0)
34+
2635
#define hipCheckErrors(msg) \
2736
do { \
2837
hipError_t __err = hipGetLastError(); \

gpu_driven/include/rdma.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ void handle_peer_copy(uint64_t wr_id, uint32_t imm, int src_dev, int dst_dev,
8181
void* src_ptr, void* dst_ptr, size_t num_bytes);
8282

8383
void per_thread_rdma_init(void* gpu_buf, size_t bytes, int rank, int block_idx);
84+
void fill_local_gid(RDMAConnectionInfo* local_info);
8485

8586
extern void* per_GPU_device_buf[NUM_GPUS];
8687

gpu_driven/include/util_simple.h

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#pragma once
2+
3+
#include <filesystem>
4+
#include <vector>
5+
#include <string>
6+
#include <algorithm>
7+
#include <regex>
8+
#include <fstream>
9+
#include <iostream>
10+
11+
namespace uccl {
12+
13+
namespace fs = std::filesystem;
14+
15+
static bool is_bdf(std::string const& s) {
16+
// Match full PCI BDF allowing hexadecimal digits
17+
static const std::regex re(
18+
R"([0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F])");
19+
return std::regex_match(s, re);
20+
}
21+
22+
static int cal_pcie_distance(fs::path const& devA, fs::path const& devB) {
23+
auto devA_parent = devA.parent_path();
24+
auto devB_parent = devB.parent_path();
25+
26+
auto build_chain = [](fs::path const& dev) {
27+
std::vector<std::string> chain;
28+
for (fs::path p = fs::canonical(dev);; p = p.parent_path()) {
29+
std::string leaf = p.filename();
30+
if (is_bdf(leaf)) chain.push_back(leaf); // collect BDF components
31+
if (p == p.root_path()) break; // reached filesystem root
32+
}
33+
return chain; /* self → root */
34+
};
35+
36+
auto chainA = build_chain(devA_parent);
37+
auto chainB = build_chain(devB_parent);
38+
39+
// Walk back from root until paths diverge
40+
size_t i = chainA.size();
41+
size_t j = chainB.size();
42+
while (i > 0 && j > 0 && chainA[i - 1] == chainB[j - 1]) {
43+
--i;
44+
--j;
45+
}
46+
// Distance = remaining unique hops in each chain
47+
return static_cast<int>(i + j);
48+
}
49+
50+
static std::vector<fs::path> get_gpu_cards() {
51+
// Discover GPU BDF using /sys/class/drm/cardX/device symlinks
52+
std::vector<fs::path> gpu_cards;
53+
const fs::path drm_class{"/sys/class/drm"};
54+
const std::regex card_re(R"(card(\d+))");
55+
56+
if (fs::exists(drm_class)) {
57+
for (auto const& entry : fs::directory_iterator(drm_class)) {
58+
const std::string name = entry.path().filename();
59+
std::smatch m;
60+
if (!std::regex_match(name, m, card_re)) continue;
61+
62+
fs::path dev_path = fs::canonical(entry.path() / "device");
63+
64+
// check vendor id
65+
std::ifstream vf(dev_path / "vendor");
66+
std::string vs;
67+
if (!(vf >> vs)) continue;
68+
uint32_t vendor = std::stoul(vs, nullptr, 0); // handles "0x10de"
69+
70+
if (vendor != 0x10de && vendor != 0x1002) continue; // NVIDIA or AMD
71+
72+
gpu_cards.push_back(dev_path);
73+
}
74+
}
75+
76+
const fs::path nvidia_gpus{"/proc/driver/nvidia/gpus"};
77+
if (gpu_cards.empty() && fs::exists(nvidia_gpus)) {
78+
for (auto const& entry : fs::directory_iterator(nvidia_gpus)) {
79+
gpu_cards.push_back(entry.path());
80+
}
81+
}
82+
83+
std::sort(gpu_cards.begin(), gpu_cards.end(),
84+
[](fs::path const& a, fs::path const& b) {
85+
return a.filename() < b.filename();
86+
});
87+
88+
return gpu_cards;
89+
}
90+
91+
static std::vector<std::pair<std::string, fs::path>> get_rdma_nics() {
92+
// Discover RDMA NICs under /sys/class/infiniband
93+
std::vector<std::pair<std::string, fs::path>> ib_nics;
94+
const fs::path ib_class{"/sys/class/infiniband"};
95+
if (!fs::exists(ib_class)) {
96+
std::cerr << "No /sys/class/infiniband directory found. Are RDMA drivers "
97+
"loaded?\n";
98+
return ib_nics;
99+
}
100+
101+
for (auto const& ib_entry : fs::directory_iterator(ib_class)) {
102+
std::string ibdev = ib_entry.path().filename();
103+
fs::path ib_device_path = fs::canonical(ib_entry.path() / "device");
104+
105+
// Collect interface names under RDMA device
106+
fs::path netdir = ib_device_path / "net";
107+
if (fs::exists(netdir) && fs::is_directory(netdir)) {
108+
ib_nics.push_back(std::make_pair(ibdev, ib_device_path));
109+
}
110+
}
111+
std::sort(ib_nics.begin(), ib_nics.end(),
112+
[](std::pair<std::string, fs::path> const& a,
113+
std::pair<std::string, fs::path> const& b) {
114+
return a.first < b.first;
115+
});
116+
return ib_nics;
117+
}
118+
119+
} // namespace uccl

gpu_driven/src/gpu_kernel.hip

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@
77

88
// HIP equivalent of clock64() - using wall clock time
99
__device__ __forceinline__ unsigned long long hip_clock64() {
10-
#if defined(__AMDGCN__)
11-
// AMD specific clock implementation
10+
#if defined(__AMDGCN__) && defined(__GFX942__)
11+
// AMD specific clock implementation for gfx942
1212
unsigned long long clock_val;
1313
asm volatile("s_memrealtime %0" : "=s"(clock_val));
1414
return clock_val;
15+
#elif defined(__AMDGCN__)
16+
// More compatible AMD implementation
17+
return __builtin_amdgcn_s_memrealtime();
1518
#else
1619
// Fallback implementation
1720
return static_cast<unsigned long long>(clock());

gpu_driven/src/peer_copy.hip

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,11 @@ hipError_t launch_peer_bulk_copy2(CopyTask const* host_tasks, int num_tasks,
108108
hipLaunchKernelGGL(peer_copy_kernel_vec, blocks, threads_per_block, 0, stream,
109109
d_tasks, num_tasks);
110110
} else if (true) {
111-
int tasks_per_block = num_tasks / NVLINK_SM_PER_PROCESS;
111+
int tasks_per_block = (num_tasks + NVLINK_SM_PER_PROCESS - 1) / NVLINK_SM_PER_PROCESS;
112112
hipLaunchKernelGGL(peer_copy_kernel_vec_batched, blocks, threads_per_block, 0, stream,
113113
d_tasks, num_tasks, tasks_per_block);
114114
} else {
115-
int tasks_per_block = num_tasks / NVLINK_SM_PER_PROCESS;
115+
int tasks_per_block = (num_tasks + NVLINK_SM_PER_PROCESS - 1) / NVLINK_SM_PER_PROCESS;
116116
size_t shmem = threads_per_block * 2 /*PIPE_DEPTH*/ * sizeof(int4);
117117
hipLaunchKernelGGL(HIP_KERNEL_NAME(peer_copy_kernel_vec_pipelined<2, int4>),
118118
blocks, threads_per_block, shmem, stream, d_tasks, num_tasks, tasks_per_block);

0 commit comments

Comments
 (0)