From d2caf76e6a8f70c85f2012213bc9512bef848541 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Wed, 4 Jun 2025 11:14:03 +0800 Subject: [PATCH 01/58] add XPU symm --- src/xccl/XPUSymmetricMemory.cpp | 647 +++++++++++++++++++++++++++ src/xccl/XPUSymmetricMemory.hpp | 126 ++++++ src/xccl/XPUSymmetricMemoryTypes.hpp | 8 + src/xccl/XPUSymmetricMemoryUtils.cpp | 241 ++++++++++ src/xccl/XPUSymmetricMemoryUtils.hpp | 111 +++++ 5 files changed, 1133 insertions(+) create mode 100644 src/xccl/XPUSymmetricMemory.cpp create mode 100644 src/xccl/XPUSymmetricMemory.hpp create mode 100644 src/xccl/XPUSymmetricMemoryTypes.hpp create mode 100644 src/xccl/XPUSymmetricMemoryUtils.cpp create mode 100644 src/xccl/XPUSymmetricMemoryUtils.hpp diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp new file mode 100644 index 0000000000..ae83337a24 --- /dev/null +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -0,0 +1,647 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +// todo: check this point +#include + +namespace c10d { +namespace symmetric_memory { + +/* Start of XPUSymmetricMemory implementation */ + +// A set of exchange methods with prefix "XPUSymmetricMemory" +static StoreExchange storeExchange = StoreExchange("XPUSymmetricMemory"); + +AllocationRef::AllocationRef( + void* ptr, + HandleType handle, + size_t block_size, + int device_idx) + : ptr(ptr), + handle(handle), + block_size(block_size), + device_idx(device_idx) {} + +AllocationRef::~AllocationRef() { + if (is_finalizing()) { + return; + } + c10::xpu::XPUGuard guard(device_idx); + C10_XPU_CHECK(xpuDeviceSynchronize()); +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + // Leak the cuda allocations during static deinitialization + auto driver_api = c10::cuda::DriverAPI::get(); + C10_CUDA_DRIVER_CHECK( + driver_api->cuMemUnmap_(reinterpret_cast(ptr), block_size)); + C10_CUDA_DRIVER_CHECK(driver_api->cuMemRelease_(handle)); +#elif 
defined(USE_ROCM) + C10_HIP_CHECK(hipMemUnmap(reinterpret_cast(ptr), block_size)); + C10_HIP_CHECK(hipMemRelease(handle)); +#else + TORCH_CHECK( + false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); +#endif +} + +XPUSymmetricMemory::XPUSymmetricMemory( + std::vector> alloc_refs, + std::vector buffers, + std::vector signal_pads, + HandleType mc_handle, + void* mc_addr, + size_t buffer_size, + int local_device_idx, + int rank, + int world_size) + : alloc_refs_(std::move(alloc_refs)), + buffers_(std::move(buffers)), + signal_pads_(std::move(signal_pads)), + mc_handle_(mc_handle), + mc_addr_(mc_addr), + buffer_size_(buffer_size), + local_device_idx_(local_device_idx), + rank_(rank), + world_size_(world_size) { + const size_t arr_size = sizeof(void*) * world_size_; + buffers_dev_ = reinterpret_cast( + c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); + signal_pads_dev_ = reinterpret_cast( + c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); + + c10::xpu::XPUGuard guard(local_device_idx); + // todo: zl_debug + xpu::getCurrentXPUStream().queue().memcpy(buffers_dev_, buffers_.data(), arr_size); + xpu::getCurrentXPUStream().queue().memcpy(signal_pads_dev_, signal_pads_.data(), arr_size); +} + +std::vector XPUSymmetricMemory::get_buffer_ptrs() { + return buffers_; +} + +std::vector XPUSymmetricMemory::get_signal_pad_ptrs() { + return signal_pads_; +} + +void** XPUSymmetricMemory::get_buffer_ptrs_dev() { + return buffers_dev_; +} + +void** XPUSymmetricMemory::get_signal_pad_ptrs_dev() { + return signal_pads_dev_; +} + +size_t XPUSymmetricMemory::get_buffer_size() { + return buffer_size_; +} + +size_t XPUSymmetricMemory::get_signal_pad_size() { + return signal_pad_size; +} + +bool XPUSymmetricMemory::has_multicast_support() { + return mc_addr_ != nullptr; +} + +void* XPUSymmetricMemory::get_multicast_ptr() { + return mc_addr_; +} + +at::Tensor XPUSymmetricMemory::get_buffer( + int rank, + c10::IntArrayRef sizes, + c10::ScalarType dtype, + int64_t 
storage_offset) { + const size_t numel = std::accumulate( + sizes.begin(), + sizes.end(), + static_cast(1), + std::multiplies()); + const auto element_size = c10::elementSize(dtype); + const auto req_size = (numel + storage_offset) * element_size; + TORCH_CHECK( + req_size <= buffer_size_, + "XPUSymmetricMemory::get_buffer: the requested size (", + req_size, + " bytes) exceeds the allocated size (", + buffer_size_, + " bytes)"); + auto data_ptr = reinterpret_cast(buffers_[rank]) + + storage_offset * element_size; + auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); + auto options = at::TensorOptions().dtype(dtype).device(device); + return at::for_blob(data_ptr, sizes) + .options(options) + .target_device(device) + .make_tensor(); +} + +at::Tensor XPUSymmetricMemory::get_signal_pad( + int rank, + c10::IntArrayRef sizes, + std::optional dtype, + int64_t storage_offset) { + // If the dtype is unspecified, default it to UInt32, as it + // is the most common type for signaling purposes. + if (!dtype.has_value()) { + dtype = c10::ScalarType::UInt32; + } + + // If the shape is unspecified, treat the signal pad as a 1d tensor. 
+ const auto element_size = c10::elementSize(*dtype); + std::vector shape; + if (sizes.size() != 0) { + shape = sizes.vec(); + } else { + shape.push_back(signal_pad_size / element_size); + } + + const size_t numel = std::accumulate( + shape.begin(), + + shape.end(), + static_cast(1), + std::multiplies()); + const auto req_size = (numel + storage_offset) * element_size; + TORCH_CHECK( + req_size <= signal_pad_size, + "CUDASymmetricMemory::get_signal_pad: the requested size (", + req_size, + " bytes) exceeds the allocated size (", + signal_pad_size, + " bytes)"); + auto data_ptr = reinterpret_cast(signal_pads_[rank]) + + storage_offset * element_size; + auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); + auto options = at::TensorOptions().dtype(*dtype).device(device); + return at::for_blob(data_ptr, shape) + .options(options) + .target_device(device) + .make_tensor(); +} + +void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { + + LOG(ERROR) << "XPUSymmetricMemory::barrier not supported"; + +// check_channel(channel, world_size_); +// c10::xpu::CUDAGuard guard(local_device_idx_); +// barrier_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( +// reinterpret_cast(signal_pads_dev_), +// channel, +// rank_, +// world_size_, +// timeout_ms); +// C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +void XPUSymmetricMemory::put_signal( + int dst_rank, + int channel, + size_t timeout_ms) { + + LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; + +// check_channel(channel, world_size_); +// c10::cuda::CUDAGuard guard(local_device_idx_); +// put_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( +// reinterpret_cast(signal_pads_dev_), +// dst_rank, +// channel, +// rank_, +// world_size_, +// timeout_ms); +// C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +void XPUSymmetricMemory::wait_signal( + int src_rank, + int channel, + size_t timeout_ms) { + + LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; +// 
check_channel(channel, world_size_); +// c10::cuda::CUDAGuard guard(local_device_idx_); +// wait_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( +// reinterpret_cast(signal_pads_dev_), +// src_rank, +// channel, +// rank_, +// world_size_, +// timeout_ms); +// C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +int XPUSymmetricMemory::get_rank() { + return rank_; +} + +int XPUSymmetricMemory::get_world_size() { + return world_size_; +} + +Block::Block( + c10::intrusive_ptr alloc_ref, + int device_idx, + size_t block_size, + size_t buffer_size, + size_t signal_pad_offset, + const std::optional& group_name) + : alloc_ref(std::move(alloc_ref)), + device_idx(device_idx), + block_size(block_size), + buffer_size(buffer_size), + signal_pad_offset(signal_pad_offset), + default_group_name(std::move(group_name)) {} + +void* XPUSymmetricMemoryAllocator::alloc( + size_t size, + int device_idx, + const std::optional& group_name) { + + size_t signal_pad_offset = at::round_up(size, 16UL); + size_t block_size = signal_pad_offset + signal_pad_size; + c10::xpu::XPUGuard guard(device_idx); + device_idx = static_cast(guard.current_device().index()); + + sycl::queue current_queue = xpu::getCurrentXPUStream().queue() + sycl::context sycl_context = queue.get_context() + sycl::device sycl_device = queue.get_device() + zePhysicalMemCreate(sycl_context, sycl_device) + + + + +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + // NOLINTNEXTLINE(bugprone-signed-char-misuse) + prop.location.id = device_idx; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + + size_t granularity; + auto driver_api = c10::cuda::DriverAPI::get(); + C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + block_size = at::round_up(block_size, granularity); + + 
HandleType handle; + C10_CUDA_DRIVER_CHECK( + driver_api->cuMemCreate_(&handle, block_size, &prop, 0)); + +#elif defined(USE_ROCM) + hipMemAllocationProp prop = {}; + prop.type = hipMemAllocationTypePinned; + prop.location.type = hipMemLocationTypeDevice; + // NOLINTNEXTLINE(bugprone-signed-char-misuse) + prop.location.id = device_idx; + prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor; + + + size_t granularity; + C10_HIP_CHECK(hipMemGetAllocationGranularity( + &granularity, &prop, hipMemAllocationGranularityRecommended)); + block_size = at::round_up(block_size, granularity); + + HandleType handle; + C10_HIP_CHECK(hipMemCreate(reinterpret_cast(&handle), block_size, &prop, 0)); + +#else + TORCH_CHECK( + false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); +#endif + void* ptr = nullptr; + map_block(&ptr, handle, block_size, device_idx); + + AT_CUDA_CHECK(cudaMemset(ptr, 0, block_size)); + + auto alloc_ref = + c10::make_intrusive(ptr, handle, block_size, device_idx); + auto block = c10::make_intrusive( + std::move(alloc_ref), + device_idx, + block_size, + size, + signal_pad_offset, + group_name); + { + std::unique_lock lock(mutex_); + ptr_to_block_.emplace(ptr, std::move(block)); + } + return ptr; +} + +void XPUSymmetricMemoryAllocator::free(void* ptr) { + std::unique_lock lock(mutex_); + ptr_to_block_.erase(ptr); +} + +size_t XPUSymmetricMemoryAllocator::get_alloc_size(void* ptr) { + auto block = find_block(ptr); + TORCH_CHECK( + block != nullptr, + "XPUSymmetricMemoryAllocator::get_alloc_size: input must be allocated ", + "via XPUSymmetricMemoryAllocator::alloc"); + return block->buffer_size; +} + +struct RendezvousRequest { + int device_idx; + int pid; + size_t block_size; + size_t buffer_size; + size_t signal_pad_offset; + bool has_multicast_support; +}; + +void validate_rendezvous_requests( + const std::vector& reqs, + int world_size) { + TORCH_CHECK(reqs.size() == (size_t)world_size); + + std::unordered_set device_indices; + 
device_indices.reserve(world_size); + for (auto req : reqs) { + device_indices.insert(req.device_idx); + } + if (!allow_overlapping_devices() && + device_indices.size() < (size_t)world_size) { + TORCH_CHECK( + false, + "XPUSymmetricMemoryAllocator::rendezvous: ", + "detected allocations from overlapping devices ", + "from different ranks."); + } + + for (int r = 1; r < world_size; ++r) { + TORCH_CHECK(reqs[r].block_size == reqs[0].block_size); + TORCH_CHECK(reqs[r].buffer_size == reqs[0].buffer_size); + TORCH_CHECK(reqs[r].signal_pad_offset == reqs[0].signal_pad_offset); + } +} + +static bool check_group_multicast_support( + const std::vector& reqs) { + std::vector ranks_with_multicast_support; + for (size_t r = 0; r < reqs.size(); ++r) { + if (reqs[r].has_multicast_support) { + ranks_with_multicast_support.push_back(r); + } + } + if (ranks_with_multicast_support.size() == reqs.size()) { + return true; + } else { + // We don't expect this to happen. But we want to let the user to know if + // this happens. + if (ranks_with_multicast_support.size() != 0) { + LOG(WARNING) + << "Only a subset of ranks in the group has multicast support: " + << ranks_with_multicast_support << " (world_size=" << reqs.size() + << "). 
Skipping multicast initialization because this is unexpected."; + } + return false; + } +} + +static void init_multicast_for_block( + HandleType& mc_handle, + void*& mc_addr, + const c10::intrusive_ptr& block, + IpcChannel& ipc_channel, + const std::vector& pids, + const c10::intrusive_ptr& store, + int rank, + int world_size) { +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) && \ + defined(CUDART_SUPPORTS_MULTICAST) + auto driver_api = c10::cuda::DriverAPI::get(); + if (rank == 0) { + CUmulticastObjectProp mc_prop{}; + mc_prop.numDevices = world_size; + mc_prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + mc_prop.size = block->block_size; + + // create a multicast object, which acts as a handle that allows multiple + // devices or processes to access the same memory allocation coherently. + auto err = driver_api->cuMulticastCreate_(&mc_handle, &mc_prop); + if (err != CUDA_SUCCESS) { + const char* err_str; + CUresult get_error_str_err = driver_api->cuGetErrorString_(err, &err_str); + if (get_error_str_err != CUDA_SUCCESS) { + err_str = "unknown cuda driver error"; + } + LOG(WARNING) + << "SymmetricMemory: cuMulticastCreate failed with: \"" << err_str + << "\". Gracefully skipping multicast initialization. " + << "However, this is unexpected. Please report the issue on GitHub."; + // Allow peers gracefully skip multicast initialization by sending -1 + ipc_channel.broadcast_fds(rank, 0, pids, -1); + return; + } + + int mc_fd; + // using the CUDA Driver API to export a multicast object into a POSIX file descriptor. 
+ C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_( + &mc_fd, mc_handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0)); + ipc_channel.broadcast_fds(rank, 0, pids, mc_fd); + // Ref count is incremented as soon as SCM_RIGHTS send happens + close(mc_fd); + } else { + int mc_fd = ipc_channel.broadcast_fds(rank, 0, pids, -1); + if (mc_fd == -1) { + return; + } + // Convert back to a handle from the broadcasted POSIX file descriptor. + C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_( + &mc_handle, + (void*)(uintptr_t)mc_fd, + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + close(mc_fd); + } + + // All rank adds their physical allocation to the multicast object + C10_CUDA_DRIVER_CHECK( + driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx)); + C10_CUDA_DRIVER_CHECK(driver_api->cuMulticastBindMem_( + mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0)); + + map_block(&mc_addr, mc_handle, block->block_size, block->device_idx); + storeExchange.barrier(store, rank, world_size); +#endif +} + +c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( + void* ptr, + const std::optional& group_name) { + + auto block = find_block(ptr); + if (block == nullptr) { + return nullptr; + } + + // The group_name passed to rendezvous() takes precedence over + // the default group_name specified during allocation. 
+ std::string group_name_; + // Treat empty string and std::nullopt the same as empty string seems to be + // implicitly used that way + if (group_name.has_value() && group_name != "") { + group_name_ = *group_name; + } else { + if (!block->default_group_name.has_value()) { + TORCH_CHECK( + false, + "XPUSymmetricMemory::rendezvous: `group_name` is neither " + "specified during allocation nor passed to rendezvous()."); + } + group_name_ = *block->default_group_name; + } + + auto it = block->symm_mems.find(group_name_); + if (it != block->symm_mems.end()) { + return it->second; + } + + c10::cuda::CUDAGuard guard(block->device_idx); + + // Currently, IpcChannel is using a file based socket for inter-process communication + IpcChannel ipc_channel; + auto group_info = get_group_info(group_name_); + auto store = group_info.store; + int rank = group_info.rank; + int world_size = group_info.world_size; + int block_fd; + +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + auto driver_api = c10::cuda::DriverAPI::get(); + // using the CUDA Driver API to export a GPU memory block as a + // POSIX file descriptor (FD), so it can be shared across processes via IPC. 
+ C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_( + &block_fd, + block->alloc_ref->handle, + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + 0)); +#elif defined (USE_ROCM) + C10_HIP_CHECK(hipMemExportToShareableHandle( + &block_fd, block->alloc_ref->handle, hipMemHandleTypePosixFileDescriptor, 0)); +#else + TORCH_CHECK( + false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); +#endif + + auto local_req = RendezvousRequest{ + .device_idx = block->device_idx, + .pid = getpid(), + .block_size = block->block_size, + .buffer_size = block->buffer_size, + .signal_pad_offset = block->signal_pad_offset, + .has_multicast_support = device_has_multicast_support(block->device_idx)}; + auto reqs = storeExchange.all_gather(store, rank, world_size, local_req); + validate_rendezvous_requests(reqs, world_size); + + std::vector pids(world_size); + for (int r = 0; r < world_size; ++r) { + pids[r] = reqs[r].pid; + } + auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); + + std::vector handles(world_size); + std::vector buffers(world_size, nullptr); + std::vector signal_pads(world_size, nullptr); + + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + handles[r] = block->alloc_ref->handle; + buffers[r] = ptr; + signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); + continue; + } + // This api imports a GPU memory allocation that was previously exported as a file + // descriptor and it returns a memory handle. 
+#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_( + &handles[r], + (void*)(uintptr_t)imported_fds[r], + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); +#elif defined (USE_ROCM) + C10_HIP_CHECK(hipMemImportFromShareableHandle( + &handles[r], + (void*)(uintptr_t)&(imported_fds[r]), + hipMemHandleTypePosixFileDescriptor)); +#else + TORCH_CHECK( + false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); +#endif + map_block(&buffers[r], handles[r], block->block_size, block->device_idx); + signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); + close(imported_fds[r]); + } + storeExchange.barrier(store, rank, world_size); + close(block_fd); + + HandleType mc_handle{}; + void* mc_addr = nullptr; + bool group_has_multicast_support = check_group_multicast_support(reqs); + if (!allow_overlapping_devices() && group_has_multicast_support) { + init_multicast_for_block( + mc_handle, mc_addr, block, ipc_channel, pids, store, rank, world_size); + } + + std::vector> alloc_refs; + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + alloc_refs.emplace_back(block->alloc_ref); + continue; + } + alloc_refs.push_back(c10::make_intrusive( + buffers[r], handles[r], block->block_size, block->device_idx)); + } + + auto symm_mem = c10::make_intrusive( + std::move(alloc_refs), + std::move(buffers), + std::move(signal_pads), + mc_handle, + mc_addr, + block->buffer_size, + block->device_idx, + group_info.rank, + group_info.world_size); + block->symm_mems[group_name_] = symm_mem; + return symm_mem; +} + +bool XPUSymmetricMemoryAllocator::has_multicast_support(int device_idx) { + return device_has_multicast_support(device_idx); +} + +c10::intrusive_ptr XPUSymmetricMemoryAllocator::find_block(void* ptr) { + std::shared_lock lock(mutex_); + auto it = ptr_to_block_.find(ptr); + if (it == ptr_to_block_.end()) { + return nullptr; + } + return it->second; +} + +struct 
RegisterXPUSymmetricMemoryAllocator { + RegisterXPUSymmetricMemoryAllocator() { + register_allocator( + c10::DeviceType::XPU, + c10::make_intrusive()); + } +}; + +static RegisterXPUSymmetricMemoryAllocator register_allocator_; + +} // namespace symmetric_memory +} // namespace c10d diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp new file mode 100644 index 0000000000..3ba415711d --- /dev/null +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -0,0 +1,126 @@ +#pragma once + +#include +#include +#include +#include + +namespace c10d::symmetric_memory { + +// Resource wrapper that owns a (vaddr, allocation handle) pair. Upon +// destruction, it unmaps the vaddr and releases the allocation handle. +struct AllocationRef : public c10::intrusive_ptr_target { + void* ptr; + HandleType handle; + size_t block_size; + int device_idx; + + AllocationRef( + void* ptr, + HandleType handle, + size_t block_size, + int device_idx); + + ~AllocationRef(); +}; + +class XPUSymmetricMemory : public SymmetricMemory { + public: + XPUSymmetricMemory( + std::vector> alloc_refs, + std::vector buffers, + std::vector signal_pads, + HandleType mc_handle, + void* mc_addr, + size_t buffer_size, + int local_device_idx, + int rank, + int world_size); + + ~XPUSymmetricMemory() override{}; + + std::vector get_buffer_ptrs() override; + std::vector get_signal_pad_ptrs() override; + void** get_buffer_ptrs_dev() override; + void** get_signal_pad_ptrs_dev() override; + size_t get_buffer_size() override; + size_t get_signal_pad_size() override; + + bool has_multicast_support() override; + void* get_multicast_ptr() override; + + at::Tensor get_buffer( + int rank, + c10::IntArrayRef sizes, + c10::ScalarType dtype, + int64_t storage_offset) override; + + at::Tensor get_signal_pad( + int rank, + c10::IntArrayRef sizes, + std::optional dtype, + int64_t storage_offset) override; + + void barrier(int channel, size_t timeout_ms) override; + void put_signal(int dst_rank, int channel, size_t 
timeout_ms) override; + void wait_signal(int src_rank, int channel, size_t timeout_ms) override; + + int get_rank() override; + int get_world_size() override; + + private: + std::vector> alloc_refs_; + std::vector buffers_; + std::vector signal_pads_; + HandleType mc_handle_; + void* mc_addr_; + size_t buffer_size_; + int local_device_idx_; + int rank_; + int world_size_; + void** buffers_dev_; + void** signal_pads_dev_; +}; + +// Metadata associated with each allocation performed by +// `CUDASymmetricMemoryAllocator`. +struct Block : public c10::intrusive_ptr_target { + c10::intrusive_ptr alloc_ref; + int device_idx; + size_t block_size; + size_t buffer_size; + size_t signal_pad_offset; + std::optional default_group_name; + std::map> symm_mems; + + Block( + c10::intrusive_ptr alloc_ref, + int device_idx, + size_t block_size, + size_t buffer_size, + size_t signal_pad_offset, + const std::optional& group_name); +}; + +class XPUSymmetricMemoryAllocator : public SymmetricMemoryAllocator { + public: + void* alloc( + size_t size, + int device_idx, + const std::optional& group_name) override; + + void free(void* ptr) override; + size_t get_alloc_size(void* ptr) override; + c10::intrusive_ptr rendezvous( + void* ptr, + const std::optional& group_name) override; + bool has_multicast_support(int device_idx) override; + + private: + c10::intrusive_ptr find_block(void* ptr); + + std::shared_mutex mutex_; + std::unordered_map> ptr_to_block_; +}; + +} // namespace c10d::symmetric_memory diff --git a/src/xccl/XPUSymmetricMemoryTypes.hpp b/src/xccl/XPUSymmetricMemoryTypes.hpp new file mode 100644 index 0000000000..4cab3b81f7 --- /dev/null +++ b/src/xccl/XPUSymmetricMemoryTypes.hpp @@ -0,0 +1,8 @@ +#pragma once + +namespace c10d::symmetric_memory { + +constexpr size_t signal_pad_size = 2048; +using HandleType = void*; + +} // namespace c10d::symmetric_memory diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp new file mode 100644 index 
0000000000..84e3a9a8e2 --- /dev/null +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -0,0 +1,241 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace c10d::symmetric_memory { + +bool device_has_multicast_support(int device_idx) { + return true; +} + +bool allow_overlapping_devices() { + return true; +} + +IpcChannel::IpcChannel() + : socket_name_(get_socket_name(getpid())), + socket_(socket(AF_UNIX, SOCK_DGRAM, 0)) { + // On success, a file descriptor for the new socket is returned. + // On error, -1 is returned, and errno is set to indicate the error. + TORCH_CHECK( + socket_ != -1, "Failed to create socket: ", c10::utils::str_error(errno)); + + struct sockaddr_un addr = {.sun_family = AF_UNIX}; + std::copy(socket_name_.begin(), socket_name_.end(), addr.sun_path); + + TORCH_CHECK( + bind(socket_, (struct sockaddr*)&addr, SUN_LEN(&addr)) == 0, + "Failed to bind socket: ", + c10::utils::str_error(errno)); +} + +IpcChannel::~IpcChannel() { + close(socket_); + unlink(socket_name_.c_str()); +} + +void IpcChannel::send_fd(int dst_pid, int fd) { + // Because file descriptors are process-local kernel objects, and we can’t + // pass them via normal socket payloads (like write() or send()). Unix domain + // sockets provide a mechanism to pass actual FDs via sendmsg()/recvmsg(). 
+ // Define destination socket address + struct sockaddr_un addr = {.sun_family = AF_UNIX}; + auto socket_name = get_socket_name(dst_pid); + std::copy(socket_name.begin(), socket_name.end(), addr.sun_path); + + // Prepare data to send + // Data being sent is "fd", the value of fd will be sent as auxiliary data + // (control message) + struct iovec io = {.iov_base = (void*)("fd"), .iov_len = 2}; + + // Prepare control message data buffer and zero it out + // NOLINTNEXTLINE(*array*) + char cbuf[CMSG_SPACE(sizeof(int))]; + memset(cbuf, 0, sizeof(cbuf)); + + // Create message header + struct msghdr msg { + // destination socket address and size of it + // message content in msg_iov and number of such structs (1 in our case) + // auxiliary data with the value of fd and size of it + .msg_name = (void*)&addr, .msg_namelen = sizeof(struct sockaddr_un), + .msg_iov = &io, .msg_iovlen = 1, .msg_control = cbuf, + .msg_controllen = sizeof(cbuf) + }; + + // This points to the first control message header + // With SCM_RIGHTS we let the kernel know that we are passing file + // descriptors. 
+ auto cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + // Specify socket level message + cmsg->cmsg_level = SOL_SOCKET; + // SCM_RIGHTS is the type used to pass file descriptors + cmsg->cmsg_type = SCM_RIGHTS; + + if (fd != -1) { + std::copy( + reinterpret_cast(&fd), + reinterpret_cast(&fd) + sizeof(fd), + reinterpret_cast(CMSG_DATA(cmsg))); + } else { + msg.msg_controllen = 0; + } + + // Finally send the the message + TORCH_CHECK( + sendmsg(socket_, &msg, 0) > 0, + "Failed to send fd: ", + c10::utils::str_error(errno)); +} + +int IpcChannel::recv_fd() { + // Prepare buffer for regular message "fd" + // NOLINTNEXTLINE(*array*) + char buf[2]; + memset(&buf, 0, sizeof(buf)); + struct iovec io = {.iov_base = (void*)buf, .iov_len = sizeof(buf)}; + + // Prepare buffer for control message and zero it out + // NOLINTNEXTLINE(*array*) + char cbuf[CMSG_SPACE(sizeof(int))]; + memset(cbuf, 0, sizeof(cbuf)); + + // Define socket address to receive on: family AF_UNIX means unix domain + // socket + struct sockaddr_un addr = {.sun_family = AF_UNIX}; + std::copy(socket_name_.begin(), socket_name_.end(), addr.sun_path); + + // Prepare message header + struct msghdr msg = { + .msg_name = (void*)&addr, + .msg_namelen = sizeof(struct sockaddr_un), + .msg_iov = &io, + .msg_iovlen = 1, + .msg_control = cbuf, + .msg_controllen = sizeof(cbuf)}; + + // Recieve message on socket_ + TORCH_CHECK( + recvmsg(socket_, &msg, 0) > 0, + "Failed to receive fd: ", + c10::utils::str_error(errno)); + + if (msg.msg_controllen == 0) { + return -1; + } + + // Extract control message and validate its content + auto cmsg = CMSG_FIRSTHDR(&msg); + TORCH_CHECK(cmsg != nullptr); + TORCH_CHECK(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); + TORCH_CHECK(cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS); + return *reinterpret_cast(CMSG_DATA(cmsg)); +} + +std::vector IpcChannel::all_gather_fds( + int rank, + const std::vector& pids, + int fd) { + int world_size = 
(int)pids.size(); + std::vector fds(pids.size()); + fds[rank] = fd; + + int dst_rank = (rank + 1) % world_size; + for (int step = 1; step < world_size; ++step) { + int src_rank = (rank + world_size - step) % world_size; + send_fd(pids[dst_rank], fd); + fd = recv_fd(); + fds[src_rank] = fd; + } + return fds; +} + +int IpcChannel::broadcast_fds( + int rank, + int src_rank, + const std::vector& pids, + int fd) { + int world_size = (int)pids.size(); + + if (rank == src_rank) { + for (int dst_rank = 0; dst_rank < (int)world_size; ++dst_rank) { + if (dst_rank == rank) { + continue; + } + send_fd(pids[dst_rank], fd); + } + return fd; + } + return recv_fd(); +} + +std::string IpcChannel::get_socket_name(int pid) { + const char* tmp_dir = "/tmp"; + for (const char* env_var : {"TMPDIR", "TMP", "TEMP", "TEMPDIR"}) { + if (const char* path = getenv(env_var)) { + tmp_dir = path; + break; + } + } + std::ostringstream oss; + oss << tmp_dir << "/symm_mem-" << pid; + return oss.str(); +} + +void map_block( + void** ptr, + c10d::symmetric_memory::HandleType handle, + size_t size, + int device_idx) { +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + auto driver_api = c10::cuda::DriverAPI::get(); + auto dev_ptr = reinterpret_cast(ptr); + // Allocate virtual address space + C10_CUDA_DRIVER_CHECK( + driver_api->cuMemAddressReserve_(dev_ptr, size, 0ULL, 0, 0ULL)); + // Map the physical memory to the virtual address + C10_CUDA_DRIVER_CHECK(driver_api->cuMemMap_(*dev_ptr, size, 0, handle, 0ULL)); + + // Set access permissions + CUmemAccessDesc desc; + desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + // NOLINTNEXTLINE(bugprone-signed-char-misuse) + desc.location.id = static_cast(device_idx); + desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + C10_CUDA_DRIVER_CHECK(driver_api->cuMemSetAccess_(*dev_ptr, size, &desc, 1)); +#elif defined(USE_ROCM) + C10_HIP_CHECK(hipMemAddressReserve(ptr, size, 0ULL, 0, 0ULL)); + C10_HIP_CHECK(hipMemMap( + *ptr, + size, + 0, + 
reinterpret_cast(handle), + 0ULL)); + C10_HIP_CHECK(hipMemMap( + *ptr, + size, + 0, + reinterpret_cast(handle), + 0ULL)); + + hipMemAccessDesc desc; + desc.location.type = hipMemLocationTypeDevice; + // NOLINTNEXTLINE(bugprone-signed-char-misuse) + desc.location.id = static_cast(device_idx); + desc.flags = hipMemAccessFlagsProtReadWrite; + C10_HIP_CHECK(hipMemSetAccess(*ptr, size, &desc, 1)); +#else + TORCH_CHECK( + false, "CUDASymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); +#endif +} + +} // namespace c10d::symmetric_memory diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp new file mode 100644 index 0000000000..7e0153dd85 --- /dev/null +++ b/src/xccl/XPUSymmetricMemoryUtils.hpp @@ -0,0 +1,111 @@ +#pragma once +#include +#include +#include + +namespace c10d { +namespace symmetric_memory { + +bool device_has_multicast_support(int device_idx); + +bool allow_overlapping_devices(); + +class IpcChannel { + public: + IpcChannel(); + ~IpcChannel(); + + void send_fd(int dst_pid, int fd); + int recv_fd(); + + std::vector all_gather_fds( + int rank, + const std::vector& pids, + int fd); + + int broadcast_fds( + int rank, + int src_rank, + const std::vector& pids, + int fd); + + private: + static std::string get_socket_name(int pid); + + std::string socket_name_; + int socket_; +}; + +// A set of store-based exchange methods with a preset prefix typically type of +// the SymmetricMemory. Most used as static instances at respective +// SymmetricMemory implementation files. +class StoreExchange { + public: + StoreExchange(const std::string& store_prefix) + : store_prefix_(store_prefix) {} + + // Put template function in header file so that compiler can easily access it. 
+ template + std::vector all_gather( + const c10::intrusive_ptr& store, + int rank, + int world_size, + T val) { + static_assert(std::is_trivially_copyable_v); + + std::vector peer_keys; + peer_keys.reserve(world_size); + for (int r = 0; r < world_size; ++r) { + std::ostringstream oss; + oss << store_prefix_ << "/" << seq_id_ << "/" << r; + peer_keys.push_back(oss.str()); + } + ++seq_id_; + + { + std::vector payload( + reinterpret_cast(&val), + reinterpret_cast(&val) + sizeof(T)); + store->set(peer_keys[rank], payload); + } + + std::vector peer_vals; + peer_vals.reserve(world_size); + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + peer_vals.push_back(val); + continue; + } + store->wait({peer_keys[r]}); + auto payload = store->get(peer_keys[r]); + TORCH_CHECK(payload.size() == sizeof(T)); + T peer_val{}; + std::memcpy(&peer_val, payload.data(), sizeof(T)); + peer_vals.push_back(peer_val); + } + return peer_vals; + } + + void barrier( + const c10::intrusive_ptr& store, + int rank, + int world_size) { + // TODO: implement an efficient one? + all_gather(store, rank, world_size, 0); + } + + private: + const std::string store_prefix_; + size_t seq_id_ = 0; +}; + +// Teturns a pointer of virtual address that is mapped to the physical memory +// held by the handle. 
+void map_block( + void** ptr, + c10d::symmetric_memory::HandleType handle, + size_t size, + int device_idx); + +} // namespace symmetric_memory +} // namespace c10d From 059cee656054f4cca8379a61958a1a1c07de4641 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Wed, 4 Jun 2025 11:55:58 +0800 Subject: [PATCH 02/58] correct include --- src/xccl/XPUSymmetricMemory.cpp | 6 +++--- src/xccl/XPUSymmetricMemory.hpp | 2 +- src/xccl/XPUSymmetricMemoryUtils.cpp | 2 +- src/xccl/XPUSymmetricMemoryUtils.hpp | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index ae83337a24..45a18322cb 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -11,7 +11,7 @@ #include // todo: check this point -#include +#include namespace c10d { namespace symmetric_memory { diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index 3ba415711d..9abf519d35 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 84e3a9a8e2..020e267b0a 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -5,7 +5,7 @@ #include -#include +#include #include #include diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp index 7e0153dd85..acc20e00fa 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.hpp +++ b/src/xccl/XPUSymmetricMemoryUtils.hpp @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include namespace c10d { namespace symmetric_memory { From b8a5473d2ab116c89f4c7cef7133792b4d8c30f3 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Wed, 4 Jun 2025 14:57:19 +0800 Subject: [PATCH 03/58] remove XPUGuard --- src/xccl/XPUSymmetricMemory.cpp | 
116 +++++++++----------------------- 1 file changed, 30 insertions(+), 86 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 45a18322cb..d551bc9349 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include @@ -35,21 +35,8 @@ AllocationRef::~AllocationRef() { if (is_finalizing()) { return; } - c10::xpu::XPUGuard guard(device_idx); - C10_XPU_CHECK(xpuDeviceSynchronize()); -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) - // Leak the cuda allocations during static deinitialization - auto driver_api = c10::cuda::DriverAPI::get(); - C10_CUDA_DRIVER_CHECK( - driver_api->cuMemUnmap_(reinterpret_cast(ptr), block_size)); - C10_CUDA_DRIVER_CHECK(driver_api->cuMemRelease_(handle)); -#elif defined(USE_ROCM) - C10_HIP_CHECK(hipMemUnmap(reinterpret_cast(ptr), block_size)); - C10_HIP_CHECK(hipMemRelease(handle)); -#else - TORCH_CHECK( - false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); -#endif + c10::DeviceGuard guard(device_idx); + c10::xpu::syncStreamsOnDevice(); } XPUSymmetricMemory::XPUSymmetricMemory( @@ -77,10 +64,10 @@ XPUSymmetricMemory::XPUSymmetricMemory( signal_pads_dev_ = reinterpret_cast( c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); - c10::xpu::XPUGuard guard(local_device_idx); + c10::DeviceGuard guard(local_device_idx); // todo: zl_debug - xpu::getCurrentXPUStream().queue().memcpy(buffers_dev_, buffers_.data(), arr_size); - xpu::getCurrentXPUStream().queue().memcpy(signal_pads_dev_, signal_pads_.data(), arr_size); + at::xpu::getCurrentXPUStream().queue().memcpy(buffers_dev_, buffers_.data(), arr_size); + at::xpu::getCurrentXPUStream().queue().memcpy(signal_pads_dev_, signal_pads_.data(), arr_size); } std::vector XPUSymmetricMemory::get_buffer_ptrs() { @@ -269,75 +256,32 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t signal_pad_offset = at::round_up(size, 16UL); 
size_t block_size = signal_pad_offset + signal_pad_size; - c10::xpu::XPUGuard guard(device_idx); - device_idx = static_cast(guard.current_device().index()); - - sycl::queue current_queue = xpu::getCurrentXPUStream().queue() - sycl::context sycl_context = queue.get_context() - sycl::device sycl_device = queue.get_device() - zePhysicalMemCreate(sycl_context, sycl_device) - - - - -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - // NOLINTNEXTLINE(bugprone-signed-char-misuse) - prop.location.id = device_idx; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + c10::DeviceGuard guard(device_idx); + //device_idx = static_cast(guard.current_device().index()); // zl_debug todo + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + sycl::context sycl_context = queue.get_context(); + sycl::device sycl_device = queue.get_device(); + zePhysicalMemCreate(sycl_context, sycl_device); - size_t granularity; - auto driver_api = c10::cuda::DriverAPI::get(); - C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_( - &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - block_size = at::round_up(block_size, granularity); - - HandleType handle; - C10_CUDA_DRIVER_CHECK( - driver_api->cuMemCreate_(&handle, block_size, &prop, 0)); - -#elif defined(USE_ROCM) - hipMemAllocationProp prop = {}; - prop.type = hipMemAllocationTypePinned; - prop.location.type = hipMemLocationTypeDevice; - // NOLINTNEXTLINE(bugprone-signed-char-misuse) - prop.location.id = device_idx; - prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor; - - - size_t granularity; - C10_HIP_CHECK(hipMemGetAllocationGranularity( - &granularity, &prop, hipMemAllocationGranularityRecommended)); - block_size = at::round_up(block_size, granularity); - - HandleType handle; - 
C10_HIP_CHECK(hipMemCreate(reinterpret_cast(&handle), block_size, &prop, 0)); - -#else - TORCH_CHECK( - false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); -#endif void* ptr = nullptr; - map_block(&ptr, handle, block_size, device_idx); - - AT_CUDA_CHECK(cudaMemset(ptr, 0, block_size)); - - auto alloc_ref = - c10::make_intrusive(ptr, handle, block_size, device_idx); - auto block = c10::make_intrusive( - std::move(alloc_ref), - device_idx, - block_size, - size, - signal_pad_offset, - group_name); - { - std::unique_lock lock(mutex_); - ptr_to_block_.emplace(ptr, std::move(block)); - } +// map_block(&ptr, handle, block_size, device_idx); +// +// TORCH_CHECK(cudaMemset(ptr, 0, block_size)); +// +// auto alloc_ref = +// c10::make_intrusive(ptr, handle, block_size, device_idx); +// auto block = c10::make_intrusive( +// std::move(alloc_ref), +// device_idx, +// block_size, +// size, +// signal_pad_offset, +// group_name); +// { +// std::unique_lock lock(mutex_); +// ptr_to_block_.emplace(ptr, std::move(block)); +// } return ptr; } @@ -511,7 +455,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( return it->second; } - c10::cuda::CUDAGuard guard(block->device_idx); + c10::DeviceGuard guard(block->device_idx); // Currently, IpcChannel is using a file based socket for inter-process communication IpcChannel ipc_channel; From 3446fa6bb1d0f352775a358d81099721c6c112ba Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Wed, 4 Jun 2025 15:17:47 +0800 Subject: [PATCH 04/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index d551bc9349..054a2b3892 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -35,7 +35,7 @@ AllocationRef::~AllocationRef() { if (is_finalizing()) { return; } - c10::DeviceGuard guard(device_idx); +// c10::DeviceGuard guard(device_idx); // zl_debug: todo 
c10::xpu::syncStreamsOnDevice(); } @@ -64,7 +64,7 @@ XPUSymmetricMemory::XPUSymmetricMemory( signal_pads_dev_ = reinterpret_cast( c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); - c10::DeviceGuard guard(local_device_idx); +// c10::DeviceGuard guard(local_device_idx); //todo // todo: zl_debug at::xpu::getCurrentXPUStream().queue().memcpy(buffers_dev_, buffers_.data(), arr_size); at::xpu::getCurrentXPUStream().queue().memcpy(signal_pads_dev_, signal_pads_.data(), arr_size); @@ -256,13 +256,13 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t signal_pad_offset = at::round_up(size, 16UL); size_t block_size = signal_pad_offset + signal_pad_size; - c10::DeviceGuard guard(device_idx); +// c10::DeviceGuard guard(device_idx); // todo //device_idx = static_cast(guard.current_device().index()); // zl_debug todo sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); sycl::context sycl_context = queue.get_context(); sycl::device sycl_device = queue.get_device(); - zePhysicalMemCreate(sycl_context, sycl_device); +// zePhysicalMemCreate(sycl_context, sycl_device); void* ptr = nullptr; // map_block(&ptr, handle, block_size, device_idx); @@ -455,7 +455,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( return it->second; } - c10::DeviceGuard guard(block->device_idx); +// c10::DeviceGuard guard(block->device_idx); // todo // Currently, IpcChannel is using a file based socket for inter-process communication IpcChannel ipc_channel; From 453fcf2500e23cfe9924c2444c53a832a92d6965 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 10:32:05 +0800 Subject: [PATCH 05/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 206 +++++++++++++++++++++------ src/xccl/XPUSymmetricMemory.hpp | 2 + src/xccl/XPUSymmetricMemoryUtils.cpp | 37 +---- 3 files changed, 166 insertions(+), 79 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 054a2b3892..e75710b42d 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ 
b/src/xccl/XPUSymmetricMemory.cpp @@ -256,32 +256,102 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t signal_pad_offset = at::round_up(size, 16UL); size_t block_size = signal_pad_offset + signal_pad_size; -// c10::DeviceGuard guard(device_idx); // todo - //device_idx = static_cast(guard.current_device().index()); // zl_debug todo + + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + sycl::context sycl_dev = current_queue.get_context(); + sycl::device sycl_ctx = current_queue.get_device(); + ze_context_handle_t ze_ctx = + sycl::get_native(ctx); + ze_device_handle_t ze_device = sycl::get_native(sycl_dev); + + // 申请设备内存描述 + ze_device_mem_alloc_desc_t dev_desc = {}; + dev_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + dev_desc.pNext = nullptr; + dev_desc.ordinal = 0; + dev_desc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED; + + // 推荐粒度对齐(Level Zero 并不显式返回粒度,通常是驱动固定,建议用 64KB) + constexpr size_t granularity = 64 * 1024; + block_size = at::round_up(block_size, granularity); + + // 分配设备内存(此内存支持导出为 IPC) + void* ptr = nullptr; + ze_result_t res = zeMemAllocDevice( + ze_context, + &dev_desc, + block_size, + /* alignment = */ granularity, + ze_device, + &ptr); + TORCH_CHECK(res == ZE_RESULT_SUCCESS, "zeMemAllocDevice failed"); + + // 零初始化内存(Level Zero 没有 native memset,手动写 0) + std::memset(ptr, 0, block_size); + + // 获取 IPC 句柄 + ze_ipc_mem_handle_t ipc_handle; + res = zeMemGetIpcHandle(ze_context, ptr, &ipc_handle); + TORCH_CHECK(res == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); + + // 封装 AllocationRef 和 Block + auto alloc_ref = + c10::make_intrusive(ptr, ipc_handle, block_size, device_idx); + auto block = c10::make_intrusive( + std::move(alloc_ref), + device_idx, + block_size, + size, + signal_pad_offset, + group_name); + + { + std::unique_lock lock(mutex_); + ptr_to_block_.emplace(ptr, std::move(block)); + } + + return ptr; +} + +void* XPUSymmetricMemoryAllocator::alloc( + size_t size, + int device_idx, + const std::optional& group_name) { 
+ + size_t signal_pad_offset = at::round_up(size, 16UL); + size_t block_size = signal_pad_offset + signal_pad_size; sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::context sycl_context = queue.get_context(); - sycl::device sycl_device = queue.get_device(); -// zePhysicalMemCreate(sycl_context, sycl_device); + sycl::context sycl_dev = current_queue.get_context(); + sycl::device sycl_ctx = current_queue.get_device(); + ze_context_handle_t ze_ctx = sycl::get_native(ctx); + ze_device_handle_t ze_device = sycl::get_native(sycl_dev); + // 获取 granularity + ze_physical_mem_desc_t phys_desc = { + ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC, nullptr, + ZE_PHYSICAL_MEM_DESC_FLAG_BIAS_UNCACHED, block_size, 0}; + + // 创建物理内存句柄 + ze_physical_mem_handle_t handle = nullptr; + ZE_CHECK(zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, &handle)); + + // 分配虚拟地址空间(只映射,不物理分配) void* ptr = nullptr; -// map_block(&ptr, handle, block_size, device_idx); -// -// TORCH_CHECK(cudaMemset(ptr, 0, block_size)); -// -// auto alloc_ref = -// c10::make_intrusive(ptr, handle, block_size, device_idx); -// auto block = c10::make_intrusive( -// std::move(alloc_ref), -// device_idx, -// block_size, -// size, -// signal_pad_offset, -// group_name); -// { -// std::unique_lock lock(mutex_); -// ptr_to_block_.emplace(ptr, std::move(block)); -// } + map_block(&ptr, handle, block_size, device_idx); + + // 初始化(memset) + memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset + + // 构造 Block 和 AllocationRef(假设这些结构未变) + auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); + auto block = c10::make_intrusive( + std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); + + { + std::unique_lock lock(mutex_); + ptr_to_block_.emplace(ptr, std::move(block)); + } return ptr; } @@ -424,6 +494,48 @@ static void init_multicast_for_block( #endif } +void XPUSymmetricMemoryAllocator::exchange_peer_ipc_mem(sycl::queue& queue, void* 
ptr) + { + // Step 1: Get base address of the pointer + sycl::context ctx = queue.get_context(); + auto l0_ctx = sycl::get_native(ctx); + + void *base_addr; + size_t base_size; + zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); + + // Step 2: Get IPC mem handle from base address + alignas(64) exchange_contents send_buf; + alignas(64) exchange_contents recv_buf[world]; + + // fill in the exchange info + zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); + send_buf.offset = (char*)ptr - (char*)base_addr; + send_buf.pid = getpid(); + + // Step 3: Exchange the handles and offsets + memset(recv_buf, 0, sizeof(recv_buf)); + // Overkill if we don't really needs all peer's handles + un_allgather(&send_buf, recv_buf, rank, world); + + for (uint32_t i = 0; i < world; i++) + { + // Step 4: Prepare pid file descriptor of next process + auto* peer = recv_buf + i; + // Step 6: Open IPC handle of remote peer + auto l0_device + = sycl::get_native(queue.get_device()); + void* peer_base; + + zeCheck(zeMemOpenIpcHandle( + l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); + buffers[i] = (char*)peer_base + peer->offset; + sync_buffer[i] = (char*)peer_base + peer->offset + data_size_per_buffer * sizeof(data_type); + offsets[i] = peer->offset; + ipc_handle[i] = send_buf.ipc_handle; + } + } + c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* ptr, const std::optional& group_name) { @@ -465,6 +577,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( int world_size = group_info.world_size; int block_fd; +// todo: get fd and handle #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) auto driver_api = c10::cuda::DriverAPI::get(); // using the CUDA Driver API to export a GPU memory block as a @@ -474,13 +587,18 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( block->alloc_ref->handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0)); -#elif defined (USE_ROCM) - 
C10_HIP_CHECK(hipMemExportToShareableHandle( - &block_fd, block->alloc_ref->handle, hipMemHandleTypePosixFileDescriptor, 0)); -#else - TORCH_CHECK( - false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); + + ZE_CALL(zeMemOpenIpcHandle, (remote_context, device, block->alloc_ref->handle,, {}, &ptr)) + #endif + // Step 6: Open IPC handle of remote peer + sycl::context ctx = queue.get_context(); + auto l0_ctx = sycl::get_native(ctx); + auto l0_device = sycl::get_native(queue.get_device()); + + ze_result_t result = zeMemGetIpcHandle(l0_ctx, block->ptr, &ipc_handle); + + TORCH_CHECK(result == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); auto local_req = RendezvousRequest{ .device_idx = block->device_idx, @@ -496,7 +614,9 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( for (int r = 0; r < world_size; ++r) { pids[r] = reqs[r].pid; } - auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); + std::vector imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); + + imported_handles = ipc_channel.all_gather_ipc_handles(rank, group_info.pids, ipc_handle); std::vector handles(world_size); std::vector buffers(world_size, nullptr); @@ -509,28 +629,20 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); continue; } - // This api imports a GPU memory allocation that was previously exported as a file - // descriptor and it returns a memory handle. 
-#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) - C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_( - &handles[r], - (void*)(uintptr_t)imported_fds[r], - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); -#elif defined (USE_ROCM) - C10_HIP_CHECK(hipMemImportFromShareableHandle( - &handles[r], - (void*)(uintptr_t)&(imported_fds[r]), - hipMemHandleTypePosixFileDescriptor)); -#else - TORCH_CHECK( - false, "XPUSymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); -#endif + + void* imported_ptr = nullptr; + result = zeMemOpenIpcHandle( + context, + device, + imported_handles[r], + ZE_IPC_MEMORY_FLAG_NONE, + &imported_ptr); + TORCH_CHECK(result == ZE_RESULT_SUCCESS, "zeMemOpenIpcHandle failed"); + map_block(&buffers[r], handles[r], block->block_size, block->device_idx); signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); - close(imported_fds[r]); } storeExchange.barrier(store, rank, world_size); - close(block_fd); HandleType mc_handle{}; void* mc_addr = nullptr; diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index 9abf519d35..edf783d794 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -115,6 +116,7 @@ class XPUSymmetricMemoryAllocator : public SymmetricMemoryAllocator { void* ptr, const std::optional& group_name) override; bool has_multicast_support(int device_idx) override; + void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr); private: c10::intrusive_ptr find_block(void* ptr); diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 020e267b0a..6631a8cc86 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -7,7 +7,7 @@ #include #include -#include +#include namespace c10d::symmetric_memory { @@ -140,7 +140,7 @@ int IpcChannel::recv_fd() { return *reinterpret_cast(CMSG_DATA(cmsg)); } 
-std::vector IpcChannel::all_gather_fds( +std::vector IpcChannel::all_gather_fds( int rank, const std::vector& pids, int fd) { @@ -195,14 +195,12 @@ void map_block( c10d::symmetric_memory::HandleType handle, size_t size, int device_idx) { -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) auto driver_api = c10::cuda::DriverAPI::get(); auto dev_ptr = reinterpret_cast(ptr); // Allocate virtual address space - C10_CUDA_DRIVER_CHECK( - driver_api->cuMemAddressReserve_(dev_ptr, size, 0ULL, 0, 0ULL)); + zeVirtualMemReserve(dev_ptr, size, 0ULL, 0, 0ULL); // Map the physical memory to the virtual address - C10_CUDA_DRIVER_CHECK(driver_api->cuMemMap_(*dev_ptr, size, 0, handle, 0ULL)); + zeVirtualMemMap(*dev_ptr, size, 0, handle, 0ULL); // Set access permissions CUmemAccessDesc desc; @@ -210,32 +208,7 @@ void map_block( // NOLINTNEXTLINE(bugprone-signed-char-misuse) desc.location.id = static_cast(device_idx); desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - C10_CUDA_DRIVER_CHECK(driver_api->cuMemSetAccess_(*dev_ptr, size, &desc, 1)); -#elif defined(USE_ROCM) - C10_HIP_CHECK(hipMemAddressReserve(ptr, size, 0ULL, 0, 0ULL)); - C10_HIP_CHECK(hipMemMap( - *ptr, - size, - 0, - reinterpret_cast(handle), - 0ULL)); - C10_HIP_CHECK(hipMemMap( - *ptr, - size, - 0, - reinterpret_cast(handle), - 0ULL)); - - hipMemAccessDesc desc; - desc.location.type = hipMemLocationTypeDevice; - // NOLINTNEXTLINE(bugprone-signed-char-misuse) - desc.location.id = static_cast(device_idx); - desc.flags = hipMemAccessFlagsProtReadWrite; - C10_HIP_CHECK(hipMemSetAccess(*ptr, size, &desc, 1)); -#else - TORCH_CHECK( - false, "CUDASymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED"); -#endif + zeVirtualMemSetAccessAttribute(*dev_ptr, size, &desc, 1); } } // namespace c10d::symmetric_memory From d0f7e0bd939a55a2103e9a5027e8f08d862c9e03 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 11:23:22 +0800 Subject: [PATCH 06/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 42 
--------------------------------- 1 file changed, 42 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index e75710b42d..998594ea59 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -313,48 +313,6 @@ void* XPUSymmetricMemoryAllocator::alloc( return ptr; } -void* XPUSymmetricMemoryAllocator::alloc( - size_t size, - int device_idx, - const std::optional& group_name) { - - size_t signal_pad_offset = at::round_up(size, 16UL); - size_t block_size = signal_pad_offset + signal_pad_size; - - sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::context sycl_dev = current_queue.get_context(); - sycl::device sycl_ctx = current_queue.get_device(); - ze_context_handle_t ze_ctx = sycl::get_native(ctx); - ze_device_handle_t ze_device = sycl::get_native(sycl_dev); - - // 获取 granularity - ze_physical_mem_desc_t phys_desc = { - ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC, nullptr, - ZE_PHYSICAL_MEM_DESC_FLAG_BIAS_UNCACHED, block_size, 0}; - - // 创建物理内存句柄 - ze_physical_mem_handle_t handle = nullptr; - ZE_CHECK(zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, &handle)); - - // 分配虚拟地址空间(只映射,不物理分配) - void* ptr = nullptr; - map_block(&ptr, handle, block_size, device_idx); - - // 初始化(memset) - memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset - - // 构造 Block 和 AllocationRef(假设这些结构未变) - auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); - auto block = c10::make_intrusive( - std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); - - { - std::unique_lock lock(mutex_); - ptr_to_block_.emplace(ptr, std::move(block)); - } - return ptr; -} - void XPUSymmetricMemoryAllocator::free(void* ptr) { std::unique_lock lock(mutex_); ptr_to_block_.erase(ptr); From dad01a551630cead2de66e777b92b3c7883dc556 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 11:42:36 +0800 Subject: [PATCH 07/58] debug --- 
src/xccl/XPUSymmetricMemory.cpp | 59 ++++++++++------------------ src/xccl/XPUSymmetricMemoryUtils.cpp | 46 +++++++++++++++------- 2 files changed, 52 insertions(+), 53 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 998594ea59..fe01500db1 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -257,53 +257,34 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t signal_pad_offset = at::round_up(size, 16UL); size_t block_size = signal_pad_offset + signal_pad_size; - sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + // 获取 SYCL/Level Zero context 和 device + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); sycl::context sycl_dev = current_queue.get_context(); sycl::device sycl_ctx = current_queue.get_device(); ze_context_handle_t ze_ctx = - sycl::get_native(ctx); - ze_device_handle_t ze_device = sycl::get_native(sycl_dev); + sycl::get_native(sycl_ctx); + ze_device_handle_t ze_dev = sycl::get_native(sycl_dev); - // 申请设备内存描述 - ze_device_mem_alloc_desc_t dev_desc = {}; - dev_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - dev_desc.pNext = nullptr; - dev_desc.ordinal = 0; - dev_desc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED; + // 获取 granularity + ze_physical_mem_desc_t phys_desc = { + ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC, nullptr, 0, block_size}; - // 推荐粒度对齐(Level Zero 并不显式返回粒度,通常是驱动固定,建议用 64KB) - constexpr size_t granularity = 64 * 1024; - block_size = at::round_up(block_size, granularity); + // 创建物理内存句柄 + ze_physical_mem_handle_t handle = nullptr; + ze_result_t status = zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, &handle); + TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zePhysicalMemCreate failed"); - // 分配设备内存(此内存支持导出为 IPC) + // 分配虚拟地址空间(只映射,不物理分配) void* ptr = nullptr; - ze_result_t res = zeMemAllocDevice( - ze_context, - &dev_desc, - block_size, - /* alignment = */ granularity, - ze_device, - &ptr); - TORCH_CHECK(res == ZE_RESULT_SUCCESS, 
"zeMemAllocDevice failed"); - - // 零初始化内存(Level Zero 没有 native memset,手动写 0) - std::memset(ptr, 0, block_size); - - // 获取 IPC 句柄 - ze_ipc_mem_handle_t ipc_handle; - res = zeMemGetIpcHandle(ze_context, ptr, &ipc_handle); - TORCH_CHECK(res == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); - - // 封装 AllocationRef 和 Block - auto alloc_ref = - c10::make_intrusive(ptr, ipc_handle, block_size, device_idx); + map_block(&ptr, handle, block_size, device_idx); + + // 初始化(memset) + memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset + + // 构造 Block 和 AllocationRef(假设这些结构未变) + auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); auto block = c10::make_intrusive( - std::move(alloc_ref), - device_idx, - block_size, - size, - signal_pad_offset, - group_name); + std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); { std::unique_lock lock(mutex_); diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 6631a8cc86..68dd14afe0 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -195,20 +195,38 @@ void map_block( c10d::symmetric_memory::HandleType handle, size_t size, int device_idx) { - auto driver_api = c10::cuda::DriverAPI::get(); - auto dev_ptr = reinterpret_cast(ptr); - // Allocate virtual address space - zeVirtualMemReserve(dev_ptr, size, 0ULL, 0, 0ULL); - // Map the physical memory to the virtual address - zeVirtualMemMap(*dev_ptr, size, 0, handle, 0ULL); - - // Set access permissions - CUmemAccessDesc desc; - desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - // NOLINTNEXTLINE(bugprone-signed-char-misuse) - desc.location.id = static_cast(device_idx); - desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - zeVirtualMemSetAccessAttribute(*dev_ptr, size, &desc, 1); + // 1. 
Reserve virtual address space + void* virtual_ptr = nullptr; + ze_result_t status = zeVirtualMemReserve( + ze_context, // context + nullptr, // let L0 pick virtual address + size, // size + &virtual_ptr // out: reserved address + ); + TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemReserve failed"); + + // 2. Map physical memory to virtual address + status = zeVirtualMemMap( + ze_context, + virtual_ptr, // virtual memory to map to + size, + handle, // physical memory handle + 0 // flags + ); + TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemMap failed"); + + // 3. Set access attributes + ze_memory_access_attribute_t access = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE; + status = zeVirtualMemSetAccessAttribute( + ze_context, + virtual_ptr, + size, + access + ); + TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemSetAccessAttribute failed"); + + // 4. Return pointer + *ptr = virtual_ptr; } } // namespace c10d::symmetric_memory From d7553ec0ba9661a4893d07cbbe9db1092352a600 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 12:15:58 +0800 Subject: [PATCH 08/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 185 ++++++++------------------- src/xccl/XPUSymmetricMemory.hpp | 2 +- src/xccl/XPUSymmetricMemoryUtils.cpp | 5 + 3 files changed, 57 insertions(+), 135 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index fe01500db1..953a95490e 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -366,114 +366,50 @@ static bool check_group_multicast_support( } } -static void init_multicast_for_block( - HandleType& mc_handle, - void*& mc_addr, - const c10::intrusive_ptr& block, - IpcChannel& ipc_channel, - const std::vector& pids, - const c10::intrusive_ptr& store, - int rank, - int world_size) { -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) && \ - defined(CUDART_SUPPORTS_MULTICAST) - auto driver_api = c10::cuda::DriverAPI::get(); - if (rank == 0) { - 
CUmulticastObjectProp mc_prop{}; - mc_prop.numDevices = world_size; - mc_prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - mc_prop.size = block->block_size; - - // create a multicast object, which acts as a handle that allows multiple - // devices or processes to access the same memory allocation coherently. - auto err = driver_api->cuMulticastCreate_(&mc_handle, &mc_prop); - if (err != CUDA_SUCCESS) { - const char* err_str; - CUresult get_error_str_err = driver_api->cuGetErrorString_(err, &err_str); - if (get_error_str_err != CUDA_SUCCESS) { - err_str = "unknown cuda driver error"; - } - LOG(WARNING) - << "SymmetricMemory: cuMulticastCreate failed with: \"" << err_str - << "\". Gracefully skipping multicast initialization. " - << "However, this is unexpected. Please report the issue on GitHub."; - // Allow peers gracefully skip multicast initialization by sending -1 - ipc_channel.broadcast_fds(rank, 0, pids, -1); - return; - } - - int mc_fd; - // using the CUDA Driver API to export a multicast object into a POSIX file descriptor. - C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_( - &mc_fd, mc_handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0)); - ipc_channel.broadcast_fds(rank, 0, pids, mc_fd); - // Ref count is incremented as soon as SCM_RIGHTS send happens - close(mc_fd); - } else { - int mc_fd = ipc_channel.broadcast_fds(rank, 0, pids, -1); - if (mc_fd == -1) { - return; - } - // Convert back to a handle from the broadcasted POSIX file descriptor. 
- C10_CUDA_DRIVER_CHECK(driver_api->cuMemImportFromShareableHandle_( - &mc_handle, - (void*)(uintptr_t)mc_fd, - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); - close(mc_fd); - } - - // All rank adds their physical allocation to the multicast object - C10_CUDA_DRIVER_CHECK( - driver_api->cuMulticastAddDevice_(mc_handle, block->device_idx)); - C10_CUDA_DRIVER_CHECK(driver_api->cuMulticastBindMem_( - mc_handle, 0, block->alloc_ref->handle, 0, block->block_size, 0)); - - map_block(&mc_addr, mc_handle, block->block_size, block->device_idx); - storeExchange.barrier(store, rank, world_size); -#endif -} - -void XPUSymmetricMemoryAllocator::exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) - { - // Step 1: Get base address of the pointer - sycl::context ctx = queue.get_context(); - auto l0_ctx = sycl::get_native(ctx); - - void *base_addr; - size_t base_size; - zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); - - // Step 2: Get IPC mem handle from base address - alignas(64) exchange_contents send_buf; - alignas(64) exchange_contents recv_buf[world]; - - // fill in the exchange info - zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); - send_buf.offset = (char*)ptr - (char*)base_addr; - send_buf.pid = getpid(); - - // Step 3: Exchange the handles and offsets - memset(recv_buf, 0, sizeof(recv_buf)); - // Overkill if we don't really needs all peer's handles - un_allgather(&send_buf, recv_buf, rank, world); - - for (uint32_t i = 0; i < world; i++) - { - // Step 4: Prepare pid file descriptor of next process - auto* peer = recv_buf + i; - // Step 6: Open IPC handle of remote peer - auto l0_device - = sycl::get_native(queue.get_device()); - void* peer_base; - - zeCheck(zeMemOpenIpcHandle( - l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); - buffers[i] = (char*)peer_base + peer->offset; - sync_buffer[i] = (char*)peer_base + peer->offset + data_size_per_buffer * sizeof(data_type); - offsets[i] = peer->offset; - 
ipc_handle[i] = send_buf.ipc_handle; - } - } +//void XPUSymmetricMemoryAllocator::exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) +// { +// // Step 1: Get base address of the pointer +// sycl::context ctx = queue.get_context(); +// auto l0_ctx = sycl::get_native(ctx); +// +// void *base_addr; +// size_t base_size; +// ze_result_t status = zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size); +// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeMemGetAddressRange failed"); +// +// // Step 2: Get IPC mem handle from base address +// alignas(64) exchange_contents send_buf; +// alignas(64) exchange_contents recv_buf[world]; +// +// // fill in the exchange info +// status = zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle); +// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); +// send_buf.offset = (char*)ptr - (char*)base_addr; +// send_buf.pid = getpid(); +// +// // Step 3: Exchange the handles and offsets +// memset(recv_buf, 0, sizeof(recv_buf)); +// // Overkill if we don't really needs all peer's handles +// un_allgather(&send_buf, recv_buf, rank, world); +// +// for (uint32_t i = 0; i < world; i++) +// { +// // Step 4: Prepare pid file descriptor of next process +// auto* peer = recv_buf + i; +// // Step 6: Open IPC handle of remote peer +// auto l0_device +// = sycl::get_native(queue.get_device()); +// void* peer_base; +// +// status = zeMemOpenIpcHandle( +// l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base); +// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeMemOpenIpcHandle failed"); +// buffers[i] = (char*)peer_base + peer->offset; +// sync_buffer[i] = (char*)peer_base + peer->offset + data_size_per_buffer * sizeof(data_type); +// offsets[i] = peer->offset; +// ipc_handle[i] = send_buf.ipc_handle; +// } +// } c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* ptr, @@ -516,28 +452,13 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( int world_size = group_info.world_size; 
int block_fd; -// todo: get fd and handle -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) - auto driver_api = c10::cuda::DriverAPI::get(); - // using the CUDA Driver API to export a GPU memory block as a - // POSIX file descriptor (FD), so it can be shared across processes via IPC. - C10_CUDA_DRIVER_CHECK(driver_api->cuMemExportToShareableHandle_( - &block_fd, - block->alloc_ref->handle, - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, - 0)); - - ZE_CALL(zeMemOpenIpcHandle, (remote_context, device, block->alloc_ref->handle,, {}, &ptr)) - -#endif - // Step 6: Open IPC handle of remote peer - sycl::context ctx = queue.get_context(); - auto l0_ctx = sycl::get_native(ctx); - auto l0_device = sycl::get_native(queue.get_device()); - - ze_result_t result = zeMemGetIpcHandle(l0_ctx, block->ptr, &ipc_handle); + // Step 6: Open IPC handle of remote peer + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + sycl::context ctx = current_queue.get_context(); + auto l0_ctx = sycl::get_native(ctx); - TORCH_CHECK(result == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); + ze_result_t result = zeMemGetIpcHandle(l0_ctx, block->ptr, &ipc_handle); + TORCH_CHECK(result == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); auto local_req = RendezvousRequest{ .device_idx = block->device_idx, @@ -586,11 +507,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( HandleType mc_handle{}; void* mc_addr = nullptr; bool group_has_multicast_support = check_group_multicast_support(reqs); - if (!allow_overlapping_devices() && group_has_multicast_support) { - init_multicast_for_block( - mc_handle, mc_addr, block, ipc_channel, pids, store, rank, world_size); - } - + //todo: not support multicast now std::vector> alloc_refs; for (int r = 0; r < world_size; ++r) { if (r == rank) { diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index edf783d794..3d7bc778fa 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ 
b/src/xccl/XPUSymmetricMemory.hpp @@ -116,7 +116,7 @@ class XPUSymmetricMemoryAllocator : public SymmetricMemoryAllocator { void* ptr, const std::optional& group_name) override; bool has_multicast_support(int device_idx) override; - void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr); +// void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr); private: c10::intrusive_ptr find_block(void* ptr); diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 68dd14afe0..a16375888b 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -195,6 +195,11 @@ void map_block( c10d::symmetric_memory::HandleType handle, size_t size, int device_idx) { + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + sycl::device sycl_ctx = current_queue.get_device(); + ze_context_handle_t ze_context = + sycl::get_native(sycl_ctx); + // 1. Reserve virtual address space void* virtual_ptr = nullptr; ze_result_t status = zeVirtualMemReserve( From d7886eed821807009869ade5cbca456b70b4c31a Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 13:21:36 +0800 Subject: [PATCH 09/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 4 ++-- src/xccl/XPUSymmetricMemoryTypes.hpp | 2 +- src/xccl/XPUSymmetricMemoryUtils.cpp | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 953a95490e..c8f76b17b9 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -259,8 +259,8 @@ void* XPUSymmetricMemoryAllocator::alloc( // 获取 SYCL/Level Zero context 和 device sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::context sycl_dev = current_queue.get_context(); - sycl::device sycl_ctx = current_queue.get_device(); + sycl::context sycl_ctx = current_queue.get_context(); + sycl::device sycl_dev = current_queue.get_device(); ze_context_handle_t ze_ctx = sycl::get_native(sycl_ctx); 
ze_device_handle_t ze_dev = sycl::get_native(sycl_dev); diff --git a/src/xccl/XPUSymmetricMemoryTypes.hpp b/src/xccl/XPUSymmetricMemoryTypes.hpp index 4cab3b81f7..3b64364162 100644 --- a/src/xccl/XPUSymmetricMemoryTypes.hpp +++ b/src/xccl/XPUSymmetricMemoryTypes.hpp @@ -3,6 +3,6 @@ namespace c10d::symmetric_memory { constexpr size_t signal_pad_size = 2048; -using HandleType = void*; +using HandleType = ze_physical_mem_handle_t; } // namespace c10d::symmetric_memory diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index a16375888b..aad5573fad 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace c10d::symmetric_memory { @@ -140,7 +141,7 @@ int IpcChannel::recv_fd() { return *reinterpret_cast(CMSG_DATA(cmsg)); } -std::vector IpcChannel::all_gather_fds( +std::vector IpcChannel::all_gather_fds( int rank, const std::vector& pids, int fd) { From 002a8ec820c2b3050d88aa89c87ae8c1bf74ac27 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 14:44:05 +0800 Subject: [PATCH 10/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 7 +++++-- src/xccl/XPUSymmetricMemoryTypes.hpp | 2 ++ src/xccl/XPUSymmetricMemoryUtils.cpp | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index c8f76b17b9..bf11842ee0 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -456,6 +457,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); sycl::context ctx = current_queue.get_context(); auto l0_ctx = sycl::get_native(ctx); + sycl::device dev = current_queue.get_device(); + auto l0_dev = sycl::get_native(dev); ze_result_t result = zeMemGetIpcHandle(l0_ctx, block->ptr, &ipc_handle); 
TORCH_CHECK(result == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); @@ -492,8 +495,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* imported_ptr = nullptr; result = zeMemOpenIpcHandle( - context, - device, + l0_ctx, + l0_dev, imported_handles[r], ZE_IPC_MEMORY_FLAG_NONE, &imported_ptr); diff --git a/src/xccl/XPUSymmetricMemoryTypes.hpp b/src/xccl/XPUSymmetricMemoryTypes.hpp index 3b64364162..b03250c86a 100644 --- a/src/xccl/XPUSymmetricMemoryTypes.hpp +++ b/src/xccl/XPUSymmetricMemoryTypes.hpp @@ -1,5 +1,7 @@ #pragma once +#include + namespace c10d::symmetric_memory { constexpr size_t signal_pad_size = 2048; diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index aad5573fad..8d8ba712e4 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -217,7 +217,8 @@ void map_block( virtual_ptr, // virtual memory to map to size, handle, // physical memory handle - 0 // flags + 0, // flags + ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE //ze_memory_access_attribute_t ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemMap failed"); From 8530c7425a5f9625f6935cbf112a6546d43d4f4d Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 14:50:37 +0800 Subject: [PATCH 11/58] debug --- src/xccl/XPUSymmetricMemoryUtils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 8d8ba712e4..e2992e4cec 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -197,7 +198,7 @@ void map_block( size_t size, int device_idx) { sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::device sycl_ctx = current_queue.get_device(); + sycl::context sycl_ctx = current_queue.get_context(); ze_context_handle_t ze_context = sycl::get_native(sycl_ctx); From 
5ffc7f85f499e1ffe60d243f701dd17602697d76 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 9 Jun 2025 22:22:13 +0800 Subject: [PATCH 12/58] debug --- src/xccl/IPCExchange.cpp | 0 src/xccl/XPUSymmetricMemory.cpp | 1 - 2 files changed, 1 deletion(-) create mode 100644 src/xccl/IPCExchange.cpp diff --git a/src/xccl/IPCExchange.cpp b/src/xccl/IPCExchange.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index bf11842ee0..9d49785b9d 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include From e3c722a5b1dbb89631016daed833219182238847 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 10 Jun 2025 10:56:37 +0800 Subject: [PATCH 13/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 35 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 9d49785b9d..09dd8b8a41 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -315,6 +315,7 @@ struct RendezvousRequest { size_t buffer_size; size_t signal_pad_offset; bool has_multicast_support; + size_t base_offset; }; void validate_rendezvous_requests( @@ -459,8 +460,14 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( sycl::device dev = current_queue.get_device(); auto l0_dev = sycl::get_native(dev); - ze_result_t result = zeMemGetIpcHandle(l0_ctx, block->ptr, &ipc_handle); - TORCH_CHECK(result == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); + ze_ipc_mem_handle_t ipc_handle; + // convert to base address + void *base_addr; + size_t base_size; + zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size); + zeMemGetIpcHandle(l0_ctx, base_addr, &ipc_handle); + size_t base_offset = (char*)ptr - (char*)base_addr; + block_fd = *reinterpret_cast(&ipc_handle); auto local_req = RendezvousRequest{ .device_idx = 
block->device_idx, @@ -468,7 +475,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( .block_size = block->block_size, .buffer_size = block->buffer_size, .signal_pad_offset = block->signal_pad_offset, - .has_multicast_support = device_has_multicast_support(block->device_idx)}; + .has_multicast_support = false, + .base_offset = base_offset}; auto reqs = storeExchange.all_gather(store, rank, world_size, local_req); validate_rendezvous_requests(reqs, world_size); @@ -476,9 +484,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( for (int r = 0; r < world_size; ++r) { pids[r] = reqs[r].pid; } - std::vector imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); - - imported_handles = ipc_channel.all_gather_ipc_handles(rank, group_info.pids, ipc_handle); + auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); std::vector handles(world_size); std::vector buffers(world_size, nullptr); @@ -492,16 +498,15 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( continue; } - void* imported_ptr = nullptr; - result = zeMemOpenIpcHandle( - l0_ctx, - l0_dev, - imported_handles[r], - ZE_IPC_MEMORY_FLAG_NONE, - &imported_ptr); - TORCH_CHECK(result == ZE_RESULT_SUCCESS, "zeMemOpenIpcHandle failed"); + ze_ipc_mem_handle_t peer_ipc_handle; + int peer_fd = imported_fds[r]; + std::memcpy(&peer_ipc_handle, &peer_fd, sizeof(int)); - map_block(&buffers[r], handles[r], block->block_size, block->device_idx); + // Open IPC handle of remote peer + void* peer_base; + zeMemOpenIpcHandle(l0_ctx, l0_dev, peer_ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base); + void* physical_buffer_ptr = (char*)peer_base + reqs[r].base_offset; + map_block(&buffers[r], physical_buffer_ptr, block->block_size, block->device_idx); signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } storeExchange.barrier(store, rank, world_size); From dfea229b74760b3d89d896ff031e06e9dc141017 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 10 
Jun 2025 15:22:08 +0800 Subject: [PATCH 14/58] add debug logs --- src/xccl/XPUSymmetricMemory.cpp | 15 +++++++++++++-- src/xccl/XPUSymmetricMemoryUtils.cpp | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 09dd8b8a41..ea309ad3ba 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -265,6 +265,7 @@ void* XPUSymmetricMemoryAllocator::alloc( sycl::get_native(sycl_ctx); ze_device_handle_t ze_dev = sycl::get_native(sycl_dev); + std::cout << "zl_debug get context and device done " << std::endl; // 获取 granularity ze_physical_mem_desc_t phys_desc = { ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC, nullptr, 0, block_size}; @@ -274,23 +275,32 @@ void* XPUSymmetricMemoryAllocator::alloc( ze_result_t status = zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, &handle); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zePhysicalMemCreate failed"); + std::cout << "zl_debug physical device memory allocation done " << std::endl; + // 分配虚拟地址空间(只映射,不物理分配) void* ptr = nullptr; map_block(&ptr, handle, block_size, device_idx); + std::cout << "zl_debug map virtual to physical done " << std::endl; + // 初始化(memset) memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset - + + std::cout << "zl_debug memset to 0 for initialization " << std::endl; // 构造 Block 和 AllocationRef(假设这些结构未变) auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); + std::cout << "zl_debug make AllocationRef " << std::endl; auto block = c10::make_intrusive( std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); + std::cout << "zl_debug make block done " << std::endl; { std::unique_lock lock(mutex_); ptr_to_block_.emplace(ptr, std::move(block)); } + std::cout << "zl_debug before return ptr" << std::endl; + return ptr; } @@ -506,7 +516,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* peer_base; 
zeMemOpenIpcHandle(l0_ctx, l0_dev, peer_ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base); void* physical_buffer_ptr = (char*)peer_base + reqs[r].base_offset; - map_block(&buffers[r], physical_buffer_ptr, block->block_size, block->device_idx); + //map_block(&buffers[r], physical_buffer_ptr, block->block_size, block->device_idx); + buffers[r] = physical_buffer_ptr; signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } storeExchange.barrier(store, rank, world_size); diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index e2992e4cec..2eb515113e 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -201,7 +201,7 @@ void map_block( sycl::context sycl_ctx = current_queue.get_context(); ze_context_handle_t ze_context = sycl::get_native(sycl_ctx); - + std::cout << "zl_debug in map_block to get virtual address " << std::endl; // 1. Reserve virtual address space void* virtual_ptr = nullptr; ze_result_t status = zeVirtualMemReserve( @@ -211,6 +211,7 @@ void map_block( &virtual_ptr // out: reserved address ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemReserve failed"); + std::cout << "zl_debug get zeVirtualMemReserve done " << std::endl; // 2. Map physical memory to virtual address status = zeVirtualMemMap( @@ -222,6 +223,7 @@ void map_block( ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE //ze_memory_access_attribute_t ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemMap failed"); + std::cout << "zl_debug get zeVirtualMemMap done " << std::endl; // 3. Set access attributes ze_memory_access_attribute_t access = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE; @@ -232,6 +234,7 @@ void map_block( access ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemSetAccessAttribute failed"); + std::cout << "zl_debug get zeVirtualMemSetAccessAttribute done " << std::endl; // 4. 
Return pointer *ptr = virtual_ptr; From 2bcde21f5758d0407c53515c506dbbee2b8d703b Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Thu, 12 Jun 2025 11:40:51 +0800 Subject: [PATCH 15/58] check device type --- src/xccl/XPUSymmetricMemory.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index ea309ad3ba..5cb1c59db1 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -284,7 +284,7 @@ void* XPUSymmetricMemoryAllocator::alloc( std::cout << "zl_debug map virtual to physical done " << std::endl; // 初始化(memset) - memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset + //memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset std::cout << "zl_debug memset to 0 for initialization " << std::endl; // 构造 Block 和 AllocationRef(假设这些结构未变) @@ -300,6 +300,9 @@ void* XPUSymmetricMemoryAllocator::alloc( } std::cout << "zl_debug before return ptr" << std::endl; + // check ptr type + auto type = sycl::get_pointer_type(ptr, sycl_ctx); + TORCH_CHECK(type == sycl::usm::alloc::device, "[In symmetric memory] ptr is not a device type pointer."); return ptr; } From ce6f875ef546bb0bb7c8c547518dfd6b532a6bd8 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Thu, 12 Jun 2025 14:49:09 +0800 Subject: [PATCH 16/58] debug async ops --- src/xccl/ProcessGroupXCCL.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index c820a1c486..634dc62d47 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -670,6 +670,8 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); + std::cout << "zl_debug async OP " << asyncOp << std::endl; + if (coalescing_state_ & CoalActive) { if ((coalescing_state_ & CoalColl) == 0) { seqCollective_++; From 
67d9ee3f9e50b51ff0cb5556da0d1944362b8b9d Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Fri, 13 Jun 2025 11:02:21 +0800 Subject: [PATCH 17/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 5cb1c59db1..530a755bf6 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -302,6 +302,11 @@ void* XPUSymmetricMemoryAllocator::alloc( std::cout << "zl_debug before return ptr" << std::endl; // check ptr type auto type = sycl::get_pointer_type(ptr, sycl_ctx); + if (type == sycl::usm::alloc::unknown){ + std::cout << "zl_debug get type as unknown" << std::endl; + } else if (type == sycl::usm::alloc::host) { + std::cout << "zl_debug get type as host" << std::endl; + } TORCH_CHECK(type == sycl::usm::alloc::device, "[In symmetric memory] ptr is not a device type pointer."); return ptr; From eaca2bd3ae1f23494844f7322b39a9936e2a45ea Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Fri, 13 Jun 2025 11:43:02 +0800 Subject: [PATCH 18/58] refine to void* --- src/xccl/XPUSymmetricMemory.cpp | 15 +++++++++++++-- src/xccl/XPUSymmetricMemoryTypes.hpp | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 530a755bf6..19afe8c369 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -279,7 +279,15 @@ void* XPUSymmetricMemoryAllocator::alloc( // 分配虚拟地址空间(只映射,不物理分配) void* ptr = nullptr; - map_block(&ptr, handle, block_size, device_idx); + //map_block(&ptr, handle, block_size, device_idx); + ze_device_mem_alloc_desc_t default_device_mem_alloc_desc = { + .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + .pNext = nullptr, + .flags = 0, + .ordinal = 0 +}; + + zeMemAllocDevice(l0_ctx, default_device_mem_alloc_desc, size, 128, l0_dev, &ptr); std::cout << "zl_debug map virtual to physical done " << std::endl; @@ -288,7 +296,8 @@ void* 
XPUSymmetricMemoryAllocator::alloc( std::cout << "zl_debug memset to 0 for initialization " << std::endl; // 构造 Block 和 AllocationRef(假设这些结构未变) - auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); + //auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); + auto alloc_ref = c10::make_intrusive(ptr, ptr, block_size, device_idx); std::cout << "zl_debug make AllocationRef " << std::endl; auto block = c10::make_intrusive( std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); @@ -306,6 +315,8 @@ void* XPUSymmetricMemoryAllocator::alloc( std::cout << "zl_debug get type as unknown" << std::endl; } else if (type == sycl::usm::alloc::host) { std::cout << "zl_debug get type as host" << std::endl; + } else { + std::cout << "zl_debug get type as device" << std::endl; } TORCH_CHECK(type == sycl::usm::alloc::device, "[In symmetric memory] ptr is not a device type pointer."); diff --git a/src/xccl/XPUSymmetricMemoryTypes.hpp b/src/xccl/XPUSymmetricMemoryTypes.hpp index b03250c86a..133abd2712 100644 --- a/src/xccl/XPUSymmetricMemoryTypes.hpp +++ b/src/xccl/XPUSymmetricMemoryTypes.hpp @@ -5,6 +5,6 @@ namespace c10d::symmetric_memory { constexpr size_t signal_pad_size = 2048; -using HandleType = ze_physical_mem_handle_t; +using HandleType = void*; } // namespace c10d::symmetric_memory From 7be7d365c6f06bad19b9d4198ec1b2be79fbbe9b Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Fri, 13 Jun 2025 12:17:10 +0800 Subject: [PATCH 19/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 60 +++++++++++++++++++++------- src/xccl/XPUSymmetricMemoryUtils.cpp | 2 +- src/xccl/XPUSymmetricMemoryUtils.hpp | 2 +- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 19afe8c369..eab72f46de 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -35,7 +35,8 @@ AllocationRef::~AllocationRef() { if (is_finalizing()) { return; } 
-// c10::DeviceGuard guard(device_idx); // zl_debug: todo +c10::Device local_device(c10::DeviceType::XPU, device_idx); +c10::DeviceGuard guard(local_device); c10::xpu::syncStreamsOnDevice(); } @@ -64,7 +65,9 @@ XPUSymmetricMemory::XPUSymmetricMemory( signal_pads_dev_ = reinterpret_cast( c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); -// c10::DeviceGuard guard(local_device_idx); //todo + c10::Device local_device(c10::DeviceType::XPU, local_device_idx); + c10::DeviceGuard guard(local_device); + // todo: zl_debug at::xpu::getCurrentXPUStream().queue().memcpy(buffers_dev_, buffers_.data(), arr_size); at::xpu::getCurrentXPUStream().queue().memcpy(signal_pads_dev_, signal_pads_.data(), arr_size); @@ -175,19 +178,47 @@ at::Tensor XPUSymmetricMemory::get_signal_pad( .make_tensor(); } +void check_channel(int channel, int world_size) { + TORCH_CHECK( + channel >= 0, + "channel for barrier(), put_signal() and wait_signal() ", + "must be greater than 0 (got ", + channel, + ")"); + const size_t num_channels = signal_pad_size / sizeof(uint32_t) * world_size; + TORCH_CHECK( + static_cast(channel) < num_channels, + "The maximum supported channel for barrier(), put_signal() and wait_signal() is ", + num_channels - 1, + " (got ", + channel, + ")"); +} + void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { - LOG(ERROR) << "XPUSymmetricMemory::barrier not supported"; +// LOG(ERROR) << "XPUSymmetricMemory::barrier not supported"; + check_channel(channel, world_size_); -// check_channel(channel, world_size_); -// c10::xpu::CUDAGuard guard(local_device_idx_); -// barrier_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( -// reinterpret_cast(signal_pads_dev_), -// channel, -// rank_, -// world_size_, -// timeout_ms); -// C10_CUDA_KERNEL_LAUNCH_CHECK(); + c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); + c10::DeviceGuard guard(local_device); + + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + + 
current_queue.submit([&](handler& h) { + h.parallel_for(range<1>(world_size), [=](id<1> idx) { + int target_rank = idx[0]; + if (target_rank == rank) { + return; + } + //todo: implement +// bool put_success = try_put_signal( +// signal_pads[target_rank] + world_size * channel + rank, timeout_ms); +// +// bool wait_success = try_wait_signal( +// signal_pads[rank] + world_size * channel + target_rank, timeout_ms); + }); + }); } void XPUSymmetricMemory::put_signal( @@ -287,7 +318,7 @@ void* XPUSymmetricMemoryAllocator::alloc( .ordinal = 0 }; - zeMemAllocDevice(l0_ctx, default_device_mem_alloc_desc, size, 128, l0_dev, &ptr); + zeMemAllocDevice(ze_ctx, default_device_mem_alloc_desc, size, 128, ze_dev, &ptr); std::cout << "zl_debug map virtual to physical done " << std::endl; @@ -472,7 +503,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( return it->second; } -// c10::DeviceGuard guard(block->device_idx); // todo + c10::Device local_device(c10::DeviceType::XPU, block->device_idx); + c10::DeviceGuard guard(local_device); // Currently, IpcChannel is using a file based socket for inter-process communication IpcChannel ipc_channel; diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 2eb515113e..afb185932e 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -194,7 +194,7 @@ std::string IpcChannel::get_socket_name(int pid) { void map_block( void** ptr, - c10d::symmetric_memory::HandleType handle, + ze_physical_mem_handle_t handle, size_t size, int device_idx) { sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp index acc20e00fa..e7a492abe8 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.hpp +++ b/src/xccl/XPUSymmetricMemoryUtils.hpp @@ -103,7 +103,7 @@ class StoreExchange { // held by the handle. 
void map_block( void** ptr, - c10d::symmetric_memory::HandleType handle, + ze_physical_mem_handle_t handle, size_t size, int device_idx); From e203d62304e58183d452f7c38e16b2625021e783 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Fri, 13 Jun 2025 16:23:31 +0800 Subject: [PATCH 20/58] debug copy --- src/xccl/XPUSymmetricMemory.cpp | 70 +++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index eab72f46de..59a9606a86 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -74,6 +74,7 @@ XPUSymmetricMemory::XPUSymmetricMemory( } std::vector XPUSymmetricMemory::get_buffer_ptrs() { + std::cout << "zl_debug in XPUSymmetricMemory::get_buffer_ptrs" << buffers_[0] << " ___ " << buffers_[1] << std::endl; return buffers_; } @@ -124,6 +125,7 @@ at::Tensor XPUSymmetricMemory::get_buffer( " bytes) exceeds the allocated size (", buffer_size_, " bytes)"); + std::cout << "zl_debug in get_buffer " << rank << "___" << buffers_[rank] << "___" <(buffers_[rank]) + storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); @@ -205,20 +207,20 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - current_queue.submit([&](handler& h) { - h.parallel_for(range<1>(world_size), [=](id<1> idx) { - int target_rank = idx[0]; - if (target_rank == rank) { - return; - } - //todo: implement -// bool put_success = try_put_signal( -// signal_pads[target_rank] + world_size * channel + rank, timeout_ms); -// -// bool wait_success = try_wait_signal( -// signal_pads[rank] + world_size * channel + target_rank, timeout_ms); - }); - }); +// current_queue.submit([&](handler& h) { +// h.parallel_for(range<1>(world_size), [=](id<1> idx) { +// int target_rank = idx[0]; +// if (target_rank == rank) { +// return; +// } +// //todo: implement +//// bool 
put_success = try_put_signal( +//// signal_pads[target_rank] + world_size * channel + rank, timeout_ms); +//// +//// bool wait_success = try_wait_signal( +//// signal_pads[rank] + world_size * channel + target_rank, timeout_ms); +// }); +// }); } void XPUSymmetricMemory::put_signal( @@ -318,18 +320,23 @@ void* XPUSymmetricMemoryAllocator::alloc( .ordinal = 0 }; - zeMemAllocDevice(ze_ctx, default_device_mem_alloc_desc, size, 128, ze_dev, &ptr); + zeMemAllocDevice(ze_ctx, &default_device_mem_alloc_desc, size, 128, ze_dev, &ptr); + + at::Tensor xpu_tensor = at::empty({1000}, c10::TensorOptions().device(c10::kXPU).dtype(c10::kByte)); + + uint8_t* raw_ptr = xpu_tensor.data_ptr(); + std::cout << "zl_debug start copy to local " << std::endl; + current_queue.memcpy(raw_ptr, ptr, 100).wait(); + std::cout << "zl_debug end copy to local " << std::endl; - std::cout << "zl_debug map virtual to physical done " << std::endl; + std::cout << "zl_debug map virtual to physical done " << std::endl; // 初始化(memset) //memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset - - std::cout << "zl_debug memset to 0 for initialization " << std::endl; + // 构造 Block 和 AllocationRef(假设这些结构未变) //auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); auto alloc_ref = c10::make_intrusive(ptr, ptr, block_size, device_idx); - std::cout << "zl_debug make AllocationRef " << std::endl; auto block = c10::make_intrusive( std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); std::cout << "zl_debug make block done " << std::endl; @@ -338,18 +345,8 @@ void* XPUSymmetricMemoryAllocator::alloc( std::unique_lock lock(mutex_); ptr_to_block_.emplace(ptr, std::move(block)); } + // check this ptr copy to sycl buffer - std::cout << "zl_debug before return ptr" << std::endl; - // check ptr type - auto type = sycl::get_pointer_type(ptr, sycl_ctx); - if (type == sycl::usm::alloc::unknown){ - std::cout << "zl_debug get type as unknown" << 
std::endl; - } else if (type == sycl::usm::alloc::host) { - std::cout << "zl_debug get type as host" << std::endl; - } else { - std::cout << "zl_debug get type as device" << std::endl; - } - TORCH_CHECK(type == sycl::usm::alloc::device, "[In symmetric memory] ptr is not a device type pointer."); return ptr; } @@ -555,6 +552,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( if (r == rank) { handles[r] = block->alloc_ref->handle; buffers[r] = ptr; + std::cout << "zl_debug rendevous in rank = " << r << " ptr: " << ptr << std::endl; signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); continue; } @@ -569,6 +567,18 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* physical_buffer_ptr = (char*)peer_base + reqs[r].base_offset; //map_block(&buffers[r], physical_buffer_ptr, block->block_size, block->device_idx); buffers[r] = physical_buffer_ptr; + + //double check this buffer + at::Tensor xpu_tensor = at::empty({1000}, c10::TensorOptions().device(c10::kXPU).dtype(c10::kByte)); + + uint8_t* raw_ptr = xpu_tensor.data_ptr(); + std::cout << "zl_debug start copy to local in rendevous" << std::endl; + current_queue.memcpy(raw_ptr, physical_buffer_ptr, 100).wait(); + std::cout << "zl_debug end copy to local in rendevous in rank = " << r << " ptr: " << physical_buffer_ptr << std::endl; + + at::Tensor cpu_tensor = xpu_tensor.to("cpu"); + std::cout << "zl_debug peer rank = " << r << " data = " << cpu_tensor << std::endl; + signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } storeExchange.barrier(store, rank, world_size); From 83318178638ca3373003966bdd47a3bd9707243c Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 16 Jun 2025 15:31:58 +0800 Subject: [PATCH 21/58] debug sharded handle --- src/xccl/XPUSymmetricMemory.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 59a9606a86..8c61f594ee 100644 --- 
a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -576,8 +576,22 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( current_queue.memcpy(raw_ptr, physical_buffer_ptr, 100).wait(); std::cout << "zl_debug end copy to local in rendevous in rank = " << r << " ptr: " << physical_buffer_ptr << std::endl; - at::Tensor cpu_tensor = xpu_tensor.to("cpu"); - std::cout << "zl_debug peer rank = " << r << " data = " << cpu_tensor << std::endl; + int count = 256; + auto host_ptr = (int *)sycl::malloc_host(512 * sizeof(int), current_queue); + auto tmp_ptr = (int *)sycl::malloc_device(512 * sizeof(int), current_queue); + std::cout << "Sync buffer content at " << address << ": "; + current_queue.memcpy(tmp_ptr, physical_buffer_ptr, count * sizeof(int)); + current_queue.memcpy(host_ptr, tmp_ptr, count * sizeof(int)); + current_queue.wait(); + + for (int i = 0; i < count; i++) { + std::cout << host_ptr[i] << " "; + } + std::cout << std::flush; + std::cout << "zl_debug print done " << std::flush; + +// at::Tensor cpu_tensor = xpu_tensor.to(c10::kCPU); +// std::cout << "zl_debug peer rank = " << r << " data = " << cpu_tensor << std::endl; signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } From 0a83b5633c133b007e6701db148a793b478e0bce Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 17 Jun 2025 11:56:03 +0800 Subject: [PATCH 22/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 52 ++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 8c61f594ee..af2d8265e5 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -518,6 +518,24 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( sycl::device dev = current_queue.get_device(); auto l0_dev = sycl::get_native(dev); + // print original values + int tmp_count = 128; + auto host_ptr = (int *)sycl::malloc_host(tmp_count * 
sizeof(int), current_queue); + auto tmp_ptr = (int *)sycl::malloc_device(tmp_count * sizeof(int), current_queue); + std::cout << "zl_debug start to copy data " << std::endl; + + current_queue.memcpy(tmp_ptr, ptr, tmp_count * sizeof(int)); + current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); + current_queue.wait(); + std::cout << "zl_debug finish copy original local data to host" << std::endl; + + for (int i = 0; i < tmp_count; i++) { + std::cout << host_ptr[i] << " "; + } + std::cout << std::flush; + std::cout << "zl_debug print done " << std::flush; + + ze_ipc_mem_handle_t ipc_handle; // convert to base address void *base_addr; @@ -569,30 +587,30 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( buffers[r] = physical_buffer_ptr; //double check this buffer - at::Tensor xpu_tensor = at::empty({1000}, c10::TensorOptions().device(c10::kXPU).dtype(c10::kByte)); - - uint8_t* raw_ptr = xpu_tensor.data_ptr(); - std::cout << "zl_debug start copy to local in rendevous" << std::endl; - current_queue.memcpy(raw_ptr, physical_buffer_ptr, 100).wait(); - std::cout << "zl_debug end copy to local in rendevous in rank = " << r << " ptr: " << physical_buffer_ptr << std::endl; - - int count = 256; - auto host_ptr = (int *)sycl::malloc_host(512 * sizeof(int), current_queue); - auto tmp_ptr = (int *)sycl::malloc_device(512 * sizeof(int), current_queue); - std::cout << "Sync buffer content at " << address << ": "; - current_queue.memcpy(tmp_ptr, physical_buffer_ptr, count * sizeof(int)); - current_queue.memcpy(host_ptr, tmp_ptr, count * sizeof(int)); + at::Tensor xpu_tensor = at::empty({1024}, c10::TensorOptions().device(c10::kXPU).dtype(c10::kInt)); + + int tmp_count = 128; + int* raw_ptr = xpu_tensor.data_ptr(); + std::cout << "zl_debug start copy to local in rendevous" << std::endl; + current_queue.memcpy(raw_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); + current_queue.wait(); + std::cout << "zl_debug end copy to local in rendevous in rank = " << r 
<< " ptr: " << physical_buffer_ptr << std::endl; + + auto host_ptr = (int *)sycl::malloc_host(tmp_count * sizeof(int), current_queue); + auto tmp_ptr = (int *)sycl::malloc_device(tmp_count * sizeof(int), current_queue); + std::cout << "zl_debug start to copy data " << std::endl; + + current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); + current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); current_queue.wait(); + std::cout << "zl_debug finish copy data to host" << std::endl; - for (int i = 0; i < count; i++) { + for (int i = 0; i < tmp_count; i++) { std::cout << host_ptr[i] << " "; } std::cout << std::flush; std::cout << "zl_debug print done " << std::flush; -// at::Tensor cpu_tensor = xpu_tensor.to(c10::kCPU); -// std::cout << "zl_debug peer rank = " << r << " data = " << cpu_tensor << std::endl; - signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } storeExchange.barrier(store, rank, world_size); From 646d246cc3affb81c1929e9cd0bdb993e7388e1a Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 17 Jun 2025 15:31:37 +0800 Subject: [PATCH 23/58] debug --- src/xccl/XPUSymmetricMemory.cpp | 74 +++++++++++++-------------------- 1 file changed, 30 insertions(+), 44 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index af2d8265e5..a21fb729c5 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -125,11 +125,11 @@ at::Tensor XPUSymmetricMemory::get_buffer( " bytes) exceeds the allocated size (", buffer_size_, " bytes)"); - std::cout << "zl_debug in get_buffer " << rank << "___" << buffers_[rank] << "___" <(buffers_[rank]) + storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); auto options = at::TensorOptions().dtype(dtype).device(device); + std::cout << "[Native] zl_debug in get_buffer on rank = " << rank << " buffer ptr=" << buffers_[rank] << " offset=" <(); - std::cout << "zl_debug start copy 
to local " << std::endl; - current_queue.memcpy(raw_ptr, ptr, 100).wait(); - std::cout << "zl_debug end copy to local " << std::endl; + std::cout << "[Native] zl_debug allocate memory with size = " << size << " allocated ptr=" << ptr << std::endl; - std::cout << "zl_debug map virtual to physical done " << std::endl; + //zeMemAllocDevice(ze_ctx, &default_device_mem_alloc_desc, size, 128, ze_dev, &ptr); +// uint8_t* raw_ptr = xpu_tensor.data_ptr(); +// std::cout << "zl_debug start copy to local " << std::endl; +// current_queue.memcpy(raw_ptr, ptr, 100).wait(); +// std::cout << "zl_debug end copy to local " << std::endl; +// +// std::cout << "zl_debug map virtual to physical done " << std::endl; // 初始化(memset) //memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset @@ -339,7 +338,6 @@ void* XPUSymmetricMemoryAllocator::alloc( auto alloc_ref = c10::make_intrusive(ptr, ptr, block_size, device_idx); auto block = c10::make_intrusive( std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); - std::cout << "zl_debug make block done " << std::endl; { std::unique_lock lock(mutex_); @@ -519,21 +517,20 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( auto l0_dev = sycl::get_native(dev); // print original values - int tmp_count = 128; - auto host_ptr = (int *)sycl::malloc_host(tmp_count * sizeof(int), current_queue); - auto tmp_ptr = (int *)sycl::malloc_device(tmp_count * sizeof(int), current_queue); - std::cout << "zl_debug start to copy data " << std::endl; + int tmp_count = 32768; + auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), current_queue); + auto tmp_ptr = (float *)sycl::malloc_device(tmp_count * sizeof(float), current_queue); - current_queue.memcpy(tmp_ptr, ptr, tmp_count * sizeof(int)); - current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); - current_queue.wait(); - std::cout << "zl_debug finish copy original local data to host" << std::endl; + 
current_queue.memcpy(tmp_ptr, ptr, tmp_count * sizeof(int)); + current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); + current_queue.wait(); + std::cout << "[Native] zl_debug finish copy original local data to host" << std::endl; - for (int i = 0; i < tmp_count; i++) { - std::cout << host_ptr[i] << " "; - } - std::cout << std::flush; - std::cout << "zl_debug print done " << std::flush; + for (int i = 0; i < tmp_count; i++) { + std::cout << host_ptr[i] << " "; + } + std::cout << std::flush; + std::cout << "zl_debug print done " << std::flush; ze_ipc_mem_handle_t ipc_handle; @@ -570,7 +567,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( if (r == rank) { handles[r] = block->alloc_ref->handle; buffers[r] = ptr; - std::cout << "zl_debug rendevous in rank = " << r << " ptr: " << ptr << std::endl; + std::cout << "[Native] zl_debug rendevous in rank = " << r << " ptr = " << ptr << std::endl; signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); continue; } @@ -587,29 +584,18 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( buffers[r] = physical_buffer_ptr; //double check this buffer - at::Tensor xpu_tensor = at::empty({1024}, c10::TensorOptions().device(c10::kXPU).dtype(c10::kInt)); - - int tmp_count = 128; - int* raw_ptr = xpu_tensor.data_ptr(); - std::cout << "zl_debug start copy to local in rendevous" << std::endl; - current_queue.memcpy(raw_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); - current_queue.wait(); - std::cout << "zl_debug end copy to local in rendevous in rank = " << r << " ptr: " << physical_buffer_ptr << std::endl; - - auto host_ptr = (int *)sycl::malloc_host(tmp_count * sizeof(int), current_queue); - auto tmp_ptr = (int *)sycl::malloc_device(tmp_count * sizeof(int), current_queue); - std::cout << "zl_debug start to copy data " << std::endl; - + auto host_ptr = (float *)sycl::malloc_host(32768 * sizeof(float), current_queue); + auto tmp_ptr = (float *)sycl::malloc_device(32768 * 
sizeof(float), current_queue); + std::cout << "[Native] zl_debug start to copy exchanged data to local host " << std::endl; current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); current_queue.wait(); - std::cout << "zl_debug finish copy data to host" << std::endl; + std::cout << "[Native] zl_debug finish copy exchanged data to local host" << std::endl; for (int i = 0; i < tmp_count; i++) { std::cout << host_ptr[i] << " "; } std::cout << std::flush; - std::cout << "zl_debug print done " << std::flush; signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } From fed33270e4a1853c1cbebdb25db4449aa5e7e75f Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 17 Jun 2025 15:46:37 +0800 Subject: [PATCH 24/58] enable torch-ccl exchange --- src/xccl/IPCExchange.cpp | 0 src/xccl/IPCExchange.hpp | 370 ++++++++++++++++++++++++++++++++ src/xccl/XPUSymmetricMemory.cpp | 10 +- src/xccl/ze_exception.hpp | 51 +++++ 4 files changed, 429 insertions(+), 2 deletions(-) delete mode 100644 src/xccl/IPCExchange.cpp create mode 100644 src/xccl/IPCExchange.hpp create mode 100644 src/xccl/ze_exception.hpp diff --git a/src/xccl/IPCExchange.cpp b/src/xccl/IPCExchange.cpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp new file mode 100644 index 0000000000..0ebb885e5c --- /dev/null +++ b/src/xccl/IPCExchange.hpp @@ -0,0 +1,370 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "xccl/ze_exception.hpp" + +#define ELE_COUNT 128 + +struct exchange_contents +{ + // first 4-byte is file descriptor for drmbuf or gem object + union + { + ze_ipc_mem_handle_t ipc_handle; + int fd = -1; + }; + size_t offset = 0; + int pid = -1; +}; + +#define sysCheck(x) \ + if (x == -1) { \ + throw std::system_error( \ + 
std::make_error_code(std::errc(errno))); \ + } + +// We can't inherit it from cmsghdr because flexible array member +struct exchange_fd { + char obscure[CMSG_LEN(sizeof(int)) - sizeof(int)]; + int fd; + + exchange_fd(int cmsg_level, int cmsg_type, int fd) + : fd(fd) { + auto* cmsg = reinterpret_cast(obscure); + cmsg->cmsg_len = sizeof(exchange_fd); + cmsg->cmsg_level = cmsg_level; + cmsg->cmsg_type = cmsg_type; + } + + exchange_fd() : fd(-1) { + memset(obscure, 0, sizeof(obscure)); + }; +}; + +void un_send_fd(int sock, int fd, int rank, size_t offset) { + iovec iov[1]; + msghdr msg; + auto rank_offset = std::make_pair(rank, offset); + + iov[0].iov_base = &rank_offset; + iov[0].iov_len = sizeof(rank_offset); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = nullptr; + msg.msg_namelen = 0; + + exchange_fd cmsg (SOL_SOCKET, SCM_RIGHTS, fd); + + msg.msg_control = &cmsg; + msg.msg_controllen = sizeof(exchange_fd); + sysCheck(sendmsg(sock, &msg, 0)); +} + +std::tuple un_recv_fd(int sock) { + iovec iov[1]; + msghdr msg; + std::pair rank_offset; + + iov[0].iov_base = &rank_offset; + iov[0].iov_len = sizeof(rank_offset); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = nullptr; + msg.msg_namelen = 0; + + exchange_fd cmsg; + msg.msg_control = &cmsg; + msg.msg_controllen = sizeof(exchange_fd); + int n_recv = recvmsg(sock, &msg, 0); + sysCheck(n_recv); + // assert(n_recv == sizeof(int)); + + return std::make_tuple(cmsg.fd, rank_offset.first, rank_offset.second); +} + +int prepare_socket(const char *sockname) { + sockaddr_un un; + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + strcpy(un.sun_path, sockname); + + auto sock = socket(AF_UNIX, SOCK_STREAM, 0); + sysCheck(sock); + + int on = 1; + sysCheck(ioctl(sock, FIONBIO, &on)); + + auto size = offsetof(sockaddr_un, sun_path) + strlen(un.sun_path); + sysCheck(bind(sock, (sockaddr *)&un, size)); + + return sock; +} + +int server_listen(const char *sockname) { + // unlink(sockname); + auto sock = 
prepare_socket(sockname); + sysCheck(listen(sock, 10)); + + return sock; +} + +int serv_accept(int listen_sock) { + sockaddr_un un; + + socklen_t len = sizeof(un); + auto accept_sock = accept(listen_sock, (sockaddr *)&un, &len); + sysCheck(accept_sock); + + return accept_sock; +} + +int client_connect(const char *server, const char *client) { + auto sock = prepare_socket(client); + sockaddr_un sun; + memset(&sun, 0, sizeof(sun)); + sun.sun_family = AF_UNIX; + strcpy(sun.sun_path, server); + auto len = offsetof(sockaddr_un, sun_path) + strlen(server); + sysCheck(connect(sock, (sockaddr *)&sun, len)); + return sock; +} + +void un_allgather(exchange_contents* send_buf, exchange_contents recv_buf[], int rank, int world) { + const char* servername_prefix = "/tmp/open-peer-ipc-mem-server-rank_"; + const char* clientname_prefix = "/tmp/open-peer-ipc-mem-client-rank_"; + char server_name[64]; + /* get username to make server_name unique */ + auto uid = getuid(); + auto pwd = getpwuid(uid); + snprintf(server_name, sizeof(server_name), "%s%d_%s", servername_prefix, rank, pwd->pw_name); + unlink(server_name); + auto s_listen = server_listen(server_name); + + MPI_Barrier(MPI_COMM_WORLD); + + pollfd fdarray[world]; + int recv_socks[world-1]; + + for (auto& pollfd : fdarray) pollfd.fd = -1; + std::fill(recv_socks, recv_socks + world -1, -1); + + auto fd_guard = [&]() { + for (int i = 0, j = 0; i < world; ++ i) { + if ( i != rank && recv_socks[j] != -1) + sysCheck(close(recv_socks[j++])); + if ( fdarray[i].fd != -1 ) + sysCheck(close(fdarray[i].fd)); + } + }; + + struct guard__{ + using F = decltype(fd_guard); + F f; + guard__(const F &f) : f(f) {} + ~guard__() { f(); } + } free_fd(fd_guard); + + // connect to all ranks + for (int i = 0; i < world; ++ i) { + if (rank == i) { + fdarray[i].fd = s_listen; + fdarray[i].events = POLLIN; + fdarray[i].revents = 0; + } else { + char peer_name[64]; + char client_name[64]; + + snprintf(client_name, sizeof(client_name), "%s%d-%d_%s", 
clientname_prefix, rank, i, pwd->pw_name); + unlink(client_name); + + snprintf(peer_name, sizeof(peer_name), "%s%d_%s", servername_prefix, i, pwd->pw_name); + fdarray[i].fd = client_connect(peer_name, client_name); + fdarray[i].events = POLLOUT; + fdarray[i].revents = 0; + } + } + + // std::future> future_fds[world -1]; + int slot = 0; + uint32_t send_progress = 1<fd, rank, send_buf->offset); + send_progress |= 1< +class allreducer +{ +public: + allreducer() + { + initialized = false; + size_per_buffer = 0; + buffer_index = 0; + } + + void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) + { + auto ret = MPI_Init(NULL, NULL); + if (ret == MPI_ERR_OTHER) { + std::cout<<"MPI init error"<(local_buffer); // 1024 counts + int host_data[ELE_COUNT]; + for (int i = 0; i < ELE_COUNT; ++i) host_data[i] = static_cast(rank_in + 2); + queue.memcpy(tmp_buffer, host_data, ELE_COUNT * sizeof(int)).wait(); + debug_print_buffer(queue, static_cast(local_buffer), ELE_COUNT); + std::cout << "start to do IPC exchange " << std::endl; + + // XXX: gain access to remote pointers + exchange_peer_ipc_mem(queue, local_buffer); + initialized = true; + + } + void allreduce(sycl::queue& queue, void* inout_buffer, uint32_t size) {} + void release(sycl::queue& queue) + { + // Clean up, close/put ipc handles, free memory, etc. 
+ auto l0_ctx = sycl::get_native< + sycl::backend::ext_oneapi_level_zero>(queue.get_context()); + for (int i = 0; i < world; i++) + { + if (i != rank) + { + zeCheck(zeMemCloseIpcHandle(l0_ctx, (char *)buffers[i] - offsets[i])); + } + } + + sycl::free(buffers[rank], queue); + initialized = false; + } + +private: +void debug_print_buffer(sycl::queue& queue, int *address, int count) { + auto host_ptr = (int *)sycl::malloc_host(count * sizeof(int), queue); + auto tmp_ptr = (int *)sycl::malloc_device(count * sizeof(int), queue); + + queue.memcpy(tmp_ptr, address, count * sizeof(int)); + queue.memcpy(host_ptr, tmp_ptr, count * sizeof(int)); + + queue.wait(); + + for (int i = 0; i < count; i++) { + std::cout << host_ptr[i] << " "; + } + std::cout << std::endl; +} + void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) + { + // Step 1: Get base address of the pointer + sycl::context ctx = queue.get_context(); + auto l0_ctx = sycl::get_native(ctx); + + void *base_addr; + size_t base_size; + zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); + std::cout << "zl_debug get base address " << base_addr << " base size " << base_size << std::endl; + + // Step 2: Get IPC mem handle from base address + alignas(64) exchange_contents send_buf; + alignas(64) exchange_contents recv_buf[world]; + + // fill in the exchange info + zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); + send_buf.offset = (char*)ptr - (char*)base_addr; + std::cout << "zl_debug get address base offset " << send_buf.offset << std::endl; + send_buf.pid = getpid(); + + // Step 3: Exchange the handles and offsets + memset(recv_buf, 0, sizeof(recv_buf)); + // Overkill if we don't really needs all peer's handles + un_allgather(&send_buf, recv_buf, rank, world); + + for (uint32_t i = 0; i < world; i++) + { + // Step 4: Prepare pid file descriptor of next process + auto* peer = recv_buf + i; + // Step 6: Open IPC handle of remote peer + auto l0_device + = 
sycl::get_native(queue.get_device()); + void* peer_base; + + zeCheck(zeMemOpenIpcHandle( + l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); + std::cout << "zl_debug get peer " << i << " with base address: " << peer_base << " offset: " << peer->offset << std::endl; + buffers[i] = (char*)peer_base + peer->offset; + // make sure data correction + debug_print_buffer(queue, static_cast(buffers[i]), ELE_COUNT); + offsets[i] = peer->offset; + ipc_handle[i] = send_buf.ipc_handle; + } + } + + bool initialized; + void* buffers[max_rank]; + void* sync_buffer[max_rank]; + size_t offsets[max_rank]; + ze_ipc_mem_handle_t ipc_handle[max_rank]; + int rank, world; + int size_per_buffer; + int data_size_per_buffer; + int buffer_index; +}; diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index a21fb729c5..03d0347b92 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -515,6 +516,11 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( auto l0_ctx = sycl::get_native(ctx); sycl::device dev = current_queue.get_device(); auto l0_dev = sycl::get_native(dev); + // check with original ones + allreducer ar; + ar.init(current_queue, rank, world_size); + current_queue.wait(); + std::cout << "!!!![Native] zl_debug torch-ccl exchange done" << std::endl; // print original values int tmp_count = 32768; @@ -584,8 +590,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( buffers[r] = physical_buffer_ptr; //double check this buffer - auto host_ptr = (float *)sycl::malloc_host(32768 * sizeof(float), current_queue); - auto tmp_ptr = (float *)sycl::malloc_device(32768 * sizeof(float), current_queue); + auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), current_queue); + auto tmp_ptr = (float *)sycl::malloc_device(tmp_count * sizeof(float), current_queue); std::cout << "[Native] zl_debug start to copy 
exchanged data to local host " << std::endl; current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); diff --git a/src/xccl/ze_exception.hpp b/src/xccl/ze_exception.hpp new file mode 100644 index 0000000000..68ac25b8da --- /dev/null +++ b/src/xccl/ze_exception.hpp @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include +#include + +// Mapping from status to human readable string +class zeException : std::exception { + const char * zeResultToString(ze_result_t status) const { + static const std::unordered_map zeResultToStringMap{ + {ZE_RESULT_SUCCESS, "[Core] success"}, + {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, + {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, + {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, + {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, + {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, + {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, + {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, + {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, + {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, + {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, + {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, + {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, + {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, + {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, + 
{ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, + }; + auto it = zeResultToStringMap.find(status); + if (it != zeResultToStringMap.end()) + return it->second; + else + return "Unknown Reason"; + } + +public: + zeException(ze_result_t ret) : result_(ret) {} + + ze_result_t result_; + + const char* what() const noexcept override { + return zeResultToString(result_); + } +}; + +#define zeCheck(x) \ + if (x != ZE_RESULT_SUCCESS) { \ + auto e = zeException(x); \ + std::cout<<"Throw "< Date: Tue, 17 Jun 2025 17:18:29 +0800 Subject: [PATCH 25/58] remove unneeded --- src/xccl/XPUSymmetricMemory.cpp | 61 ++++++++++++--------------------- 1 file changed, 21 insertions(+), 40 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 03d0347b92..ed13c3c5f1 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -75,7 +75,6 @@ XPUSymmetricMemory::XPUSymmetricMemory( } std::vector XPUSymmetricMemory::get_buffer_ptrs() { - std::cout << "zl_debug in XPUSymmetricMemory::get_buffer_ptrs" << buffers_[0] << " ___ " << buffers_[1] << std::endl; return buffers_; } @@ -130,7 +129,7 @@ at::Tensor XPUSymmetricMemory::get_buffer( storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); auto options = at::TensorOptions().dtype(dtype).device(device); - std::cout << "[Native] zl_debug in get_buffer on rank = " << rank << " buffer ptr=" << buffers_[rank] << " offset=" < ar; +// ar.init(current_queue, rank, world_size); +// current_queue.wait(); +// std::cout << "!!!![Native] zl_debug torch-ccl exchange done" << std::endl; +// ze_ipc_mem_handle_t ipc_handle; // convert to base address void *base_addr; @@ -573,7 +555,6 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( if (r == rank) { handles[r] = block->alloc_ref->handle; buffers[r] = ptr; - std::cout << "[Native] zl_debug rendevous in rank = " << r << " ptr = " << ptr 
<< std::endl; signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); continue; } @@ -589,19 +570,19 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( //map_block(&buffers[r], physical_buffer_ptr, block->block_size, block->device_idx); buffers[r] = physical_buffer_ptr; - //double check this buffer - auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), current_queue); - auto tmp_ptr = (float *)sycl::malloc_device(tmp_count * sizeof(float), current_queue); - std::cout << "[Native] zl_debug start to copy exchanged data to local host " << std::endl; - current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); - current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); - current_queue.wait(); - std::cout << "[Native] zl_debug finish copy exchanged data to local host" << std::endl; - - for (int i = 0; i < tmp_count; i++) { - std::cout << host_ptr[i] << " "; - } - std::cout << std::flush; +// //double check this buffer +// auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), current_queue); +// auto tmp_ptr = (float *)sycl::malloc_device(tmp_count * sizeof(float), current_queue); +// std::cout << "[Native] zl_debug start to copy exchanged data to local host " << std::endl; +// current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); +// current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); +// current_queue.wait(); +// std::cout << "[Native] zl_debug finish copy exchanged data to local host" << std::endl; +// +// for (int i = 0; i < tmp_count; i++) { +// std::cout << host_ptr[i] << " "; +// } +// std::cout << std::flush; signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } From 997e26bf0cd3047b37d3045f22a3a889772bd62e Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Thu, 19 Jun 2025 10:36:22 +0800 Subject: [PATCH 26/58] fix a bug and move to local IPC exchange --- src/xccl/IPCExchange.hpp | 25 +++++++++--------- 
src/xccl/XPUSymmetricMemory.cpp | 46 ++++++++++++++------------------- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index 0ebb885e5c..0e29a0dd1b 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -251,7 +251,8 @@ class allreducer void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { - auto ret = MPI_Init(NULL, NULL); + if (initialized) return; + auto ret = MPI_Init(NULL, NULL); if (ret == MPI_ERR_OTHER) { std::cout<<"MPI init error"<(local_buffer); // 1024 counts - int host_data[ELE_COUNT]; - for (int i = 0; i < ELE_COUNT; ++i) host_data[i] = static_cast(rank_in + 2); - queue.memcpy(tmp_buffer, host_data, ELE_COUNT * sizeof(int)).wait(); - debug_print_buffer(queue, static_cast(local_buffer), ELE_COUNT); - std::cout << "start to do IPC exchange " << std::endl; + rank = tmp_rank; + world = tmp_world; +// void* local_buffer = sycl::malloc_device(ELE_COUNT * sizeof(int), queue); +// int* tmp_buffer = static_cast(local_buffer); // 1024 counts +// int host_data[ELE_COUNT]; +// for (int i = 0; i < ELE_COUNT; ++i) host_data[i] = static_cast(rank_in + 2); +// queue.memcpy(tmp_buffer, host_data, ELE_COUNT * sizeof(int)).wait(); +// debug_print_buffer(queue, static_cast(local_buffer), ELE_COUNT); +// std::cout << "start to do IPC exchange " << std::endl; // XXX: gain access to remote pointers - exchange_peer_ipc_mem(queue, local_buffer); +// exchange_peer_ipc_mem(queue, local_buffer); initialized = true; } @@ -297,7 +298,6 @@ class allreducer initialized = false; } -private: void debug_print_buffer(sycl::queue& queue, int *address, int count) { auto host_ptr = (int *)sycl::malloc_host(count * sizeof(int), queue); auto tmp_ptr = (int *)sycl::malloc_device(count * sizeof(int), queue); @@ -312,6 +312,7 @@ void debug_print_buffer(sycl::queue& queue, int *address, int count) { } std::cout << std::endl; } + // buffer_size as element size void exchange_peer_ipc_mem(sycl::queue& 
queue, void* ptr) { // Step 1: Get base address of the pointer diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index ed13c3c5f1..c81e560622 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -170,6 +170,7 @@ at::Tensor XPUSymmetricMemory::get_signal_pad( " bytes) exceeds the allocated size (", signal_pad_size, " bytes)"); + std::cout << "[Native] zl_debug get singnal_pads " << std::endl; auto data_ptr = reinterpret_cast(signal_pads_[rank]) + storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); @@ -289,6 +290,7 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t signal_pad_offset = at::round_up(size, 16UL); size_t block_size = signal_pad_offset + signal_pad_size; + std::cout << "[Native] zl_debug in allocation with original size " << size << " with pad size= " << signal_pad_size << " with total block size= " << block_size << std::endl; // 获取 SYCL/Level Zero context 和 device sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); @@ -318,7 +320,8 @@ void* XPUSymmetricMemoryAllocator::alloc( }; // zl_debug create a device memory by sycl - void* ptr = sycl::malloc_device(size, current_queue); + void* ptr = sycl::malloc_device(block_size, current_queue); + current_queue.memset(ptr, 0, block_size); // std::cout << "[Native] zl_debug allocate memory with size = " << size << " allocated ptr=" << ptr << std::endl; @@ -516,20 +519,11 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( sycl::device dev = current_queue.get_device(); auto l0_dev = sycl::get_native(dev); // check with original ones // debug code -// allreducer ar; -// ar.init(current_queue, rank, world_size); -// current_queue.wait(); + // initialize MPI done + allreducer ar; + ar.init(current_queue, rank, world_size); // std::cout << "!!!![Native] zl_debug torch-ccl exchange done" << std::endl; // - ze_ipc_mem_handle_t ipc_handle; - // convert to base address - void *base_addr; 
- size_t base_size; - zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size); - zeMemGetIpcHandle(l0_ctx, base_addr, &ipc_handle); - size_t base_offset = (char*)ptr - (char*)base_addr; - block_fd = *reinterpret_cast(&ipc_handle); - auto local_req = RendezvousRequest{ .device_idx = block->device_idx, .pid = getpid(), @@ -537,7 +531,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( .buffer_size = block->buffer_size, .signal_pad_offset = block->signal_pad_offset, .has_multicast_support = false, - .base_offset = base_offset}; + .base_offset = 0}; auto reqs = storeExchange.all_gather(store, rank, world_size, local_req); validate_rendezvous_requests(reqs, world_size); @@ -545,7 +539,13 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( for (int r = 0; r < world_size; ++r) { pids[r] = reqs[r].pid; } - auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); + + // do IPC exchange for all peer ranks + ar.exchange_peer_ipc_mem(current_queue, ptr, ); + std::cout << "[Native] zl_debug finished ipc exchange " << std::endl; + + +// auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); std::vector handles(world_size); std::vector buffers(world_size, nullptr); @@ -557,18 +557,12 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( buffers[r] = ptr; signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); continue; + } else { + buffers[r] = ar.buffers[r]; + handles[r] = ar.buffers[r]; //ar.ipc_handle[r]; + signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); } - ze_ipc_mem_handle_t peer_ipc_handle; - int peer_fd = imported_fds[r]; - std::memcpy(&peer_ipc_handle, &peer_fd, sizeof(int)); - - // Open IPC handle of remote peer - void* peer_base; - zeMemOpenIpcHandle(l0_ctx, l0_dev, peer_ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base); - void* physical_buffer_ptr = (char*)peer_base + reqs[r].base_offset; - //map_block(&buffers[r], physical_buffer_ptr, block->block_size, 
block->device_idx); - buffers[r] = physical_buffer_ptr; // //double check this buffer // auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), current_queue); @@ -584,7 +578,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( // } // std::cout << std::flush; - signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); +// signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } storeExchange.barrier(store, rank, world_size); From a350b8f17716606552425d87f5a2a7f9a5a7268d Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Thu, 19 Jun 2025 22:00:41 +0800 Subject: [PATCH 27/58] add symm copy_buffer API --- src/xccl/IPCExchange.hpp | 27 +++++++-------- src/xccl/XPUSymmetricMemory.cpp | 60 ++++++++------------------------- src/xccl/XPUSymmetricMemory.hpp | 1 + 3 files changed, 27 insertions(+), 61 deletions(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index 0e29a0dd1b..e112166e2f 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -252,10 +252,17 @@ class allreducer void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { if (initialized) return; - auto ret = MPI_Init(NULL, NULL); - if (ret == MPI_ERR_OTHER) { - std::cout<<"MPI init error"<(local_buffer); // 1024 counts -// int host_data[ELE_COUNT]; -// for (int i = 0; i < ELE_COUNT; ++i) host_data[i] = static_cast(rank_in + 2); -// queue.memcpy(tmp_buffer, host_data, ELE_COUNT * sizeof(int)).wait(); -// debug_print_buffer(queue, static_cast(local_buffer), ELE_COUNT); -// std::cout << "start to do IPC exchange " << std::endl; - - // XXX: gain access to remote pointers -// exchange_peer_ipc_mem(queue, local_buffer); - initialized = true; + initialized = true; } void allreduce(sycl::queue& queue, void* inout_buffer, uint32_t size) {} diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index c81e560622..5a360019db 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ 
b/src/xccl/XPUSymmetricMemory.cpp @@ -106,6 +106,19 @@ void* XPUSymmetricMemory::get_multicast_ptr() { return mc_addr_; } +void XPUSymmetricMemory::copy_buffer(at::Tensor src, at::Tensor dst , size_t size) { + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + auto src_ptr = src.data_ptr(); + auto dst_ptr = dst.data_ptr(); + + size_t copy_size = size * c10::elementSize(src.scalar_type()); + +// std::cout << "[Native] zl_debug start to copy from src to dst with size " << copy_size << std::endl; + current_queue.memcpy(dst_ptr, src_ptr, copy_size); +// current_queue.wait(); +// std::cout << "[Native] zl_debug copy done " << std::endl; + +} at::Tensor XPUSymmetricMemory::get_buffer( int rank, c10::IntArrayRef sizes, @@ -425,51 +438,6 @@ static bool check_group_multicast_support( } } -//void XPUSymmetricMemoryAllocator::exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) -// { -// // Step 1: Get base address of the pointer -// sycl::context ctx = queue.get_context(); -// auto l0_ctx = sycl::get_native(ctx); -// -// void *base_addr; -// size_t base_size; -// ze_result_t status = zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size); -// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeMemGetAddressRange failed"); -// -// // Step 2: Get IPC mem handle from base address -// alignas(64) exchange_contents send_buf; -// alignas(64) exchange_contents recv_buf[world]; -// -// // fill in the exchange info -// status = zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle); -// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeMemGetIpcHandle failed"); -// send_buf.offset = (char*)ptr - (char*)base_addr; -// send_buf.pid = getpid(); -// -// // Step 3: Exchange the handles and offsets -// memset(recv_buf, 0, sizeof(recv_buf)); -// // Overkill if we don't really needs all peer's handles -// un_allgather(&send_buf, recv_buf, rank, world); -// -// for (uint32_t i = 0; i < world; i++) -// { -// // Step 4: Prepare pid file descriptor of next process -// auto* peer = 
recv_buf + i; -// // Step 6: Open IPC handle of remote peer -// auto l0_device -// = sycl::get_native(queue.get_device()); -// void* peer_base; -// -// status = zeMemOpenIpcHandle( -// l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base); -// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeMemOpenIpcHandle failed"); -// buffers[i] = (char*)peer_base + peer->offset; -// sync_buffer[i] = (char*)peer_base + peer->offset + data_size_per_buffer * sizeof(data_type); -// offsets[i] = peer->offset; -// ipc_handle[i] = send_buf.ipc_handle; -// } -// } - c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* ptr, const std::optional& group_name) { @@ -541,7 +509,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( } // do IPC exchange for all peer ranks - ar.exchange_peer_ipc_mem(current_queue, ptr, ); + ar.exchange_peer_ipc_mem(current_queue, ptr); std::cout << "[Native] zl_debug finished ipc exchange " << std::endl; diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index 3d7bc778fa..6bfeec2291 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -65,6 +65,7 @@ class XPUSymmetricMemory : public SymmetricMemory { void barrier(int channel, size_t timeout_ms) override; void put_signal(int dst_rank, int channel, size_t timeout_ms) override; void wait_signal(int src_rank, int channel, size_t timeout_ms) override; + void copy_buffer(at::Tensor src, at::Tensor dst , size_t size) override; int get_rank() override; int get_world_size() override; From 49ca539c1b28055725e939896447e3ed71c798ef Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Mon, 23 Jun 2025 10:20:18 +0800 Subject: [PATCH 28/58] barrier with MPI --- src/xccl/XPUSymmetricMemory.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 5a360019db..0f91d00386 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ 
-14,6 +14,9 @@ // todo: check this point #include +// todo: fixed with kernel barrier +#include + namespace c10d { namespace symmetric_memory { @@ -221,6 +224,10 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + std::cout << "zl_debug finish to do barrier " << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + std::cout << "zl_debug start to do barrier " << std::endl; + // current_queue.submit([&](handler& h) { // h.parallel_for(range<1>(world_size), [=](id<1> idx) { // int target_rank = idx[0]; From c0b600b54e20225ba478a4557047f80eea6097de Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 24 Jun 2025 14:31:42 +0800 Subject: [PATCH 29/58] support arc --- cmake/XCCL.cmake | 6 ++++++ src/xccl/ProcessGroupXCCL.cpp | 2 -- src/xccl/XPUSymmetricMemory.cpp | 8 ++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cmake/XCCL.cmake b/cmake/XCCL.cmake index 6deeb8d3c9..ed8b94c04f 100644 --- a/cmake/XCCL.cmake +++ b/cmake/XCCL.cmake @@ -17,4 +17,10 @@ if(NOT __XCCL_INCLUDED) set_property( TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES ${XCCL_LIBRARY}) + set_property( + TARGET torch::xccl APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + /mnt/cache/drivers/neo/level_zero/include) + set_property( + TARGET torch::xccl APPEND PROPERTY INTERFACE_LINK_LIBRARIES + /usr/lib/x86_64-linux-gnu/libze_loader.so) endif() diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 634dc62d47..c820a1c486 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -670,8 +670,6 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); - std::cout << "zl_debug async OP " << asyncOp << std::endl; - if (coalescing_state_ & CoalActive) { if ((coalescing_state_ & CoalColl) == 0) { seqCollective_++; diff --git a/src/xccl/XPUSymmetricMemory.cpp 
b/src/xccl/XPUSymmetricMemory.cpp index 0f91d00386..108278de0c 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -224,9 +224,9 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - std::cout << "zl_debug finish to do barrier " << std::endl; +// std::cout << "zl_debug finish to do barrier " << std::endl; MPI_Barrier(MPI_COMM_WORLD); - std::cout << "zl_debug start to do barrier " << std::endl; +// std::cout << "zl_debug start to do barrier " << std::endl; // current_queue.submit([&](handler& h) { // h.parallel_for(range<1>(world_size), [=](id<1> idx) { @@ -326,8 +326,8 @@ void* XPUSymmetricMemoryAllocator::alloc( // 创建物理内存句柄 ze_physical_mem_handle_t handle = nullptr; - ze_result_t status = zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, &handle); - TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zePhysicalMemCreate failed"); +// ze_result_t status = zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, &handle); +// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zePhysicalMemCreate failed"); // 分配虚拟地址空间(只映射,不物理分配) // void* ptr = nullptr; From abade523284c6cd4b56e23ac843d702a54be6c69 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 24 Jun 2025 22:53:03 +0800 Subject: [PATCH 30/58] refine barrier --- src/xccl/XPUSymmetricMemory.cpp | 49 +++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 108278de0c..e49677b419 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -17,6 +17,8 @@ // todo: fixed with kernel barrier #include +#define MAX_RANK 8 + namespace c10d { namespace symmetric_memory { @@ -223,10 +225,51 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { c10::DeviceGuard guard(local_device); sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); +// sycl::event dep_event = 
current_queue.ext_oneapi_submit_barrier(); + + std::cout << "zl_debug start to do barrier " << std::endl; +// current_queue.submit([=](sycl::handler& h) { +// h.depends_on({dep_event}); +// h.host_task([=]() { +// MPI_Barrier(MPI_COMM_WORLD); +// }); +// }); -// std::cout << "zl_debug finish to do barrier " << std::endl; - MPI_Barrier(MPI_COMM_WORLD); -// std::cout << "zl_debug start to do barrier " << std::endl; + int *peer_address[MAX_RANK]; + int *local_address[MAX_RANK]; + + for (int i = 0; i < world_size_; i++) { + peer_address[i] = static_cast(singnal_pads[i]) + world_size_ * channel + rank_; + local_address[i] = static_cast(singnal_pads[rank]) + world_size_ * channel + i; + } + int tmp_rank = rank_; + current_queue.submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>{ world_size_ }, [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL{ + int target_rank = idx.get_linear_id(); + sycl::ext::oneapi::experimental::printf("DEBUG loop to rank%d: \n", target_rank); + if (target_rank != tmp_rank) { + simd grf; + grf[0] = 1; + lsc_block_store(peer_address[target_rank], grf); + sycl::ext::oneapi::experimental::printf("DEBUG block store done rank%d: \n", target_rank); + do { +// lsc_fence(); + fence(); + grf = lsc_block_load + (local_address[idx]); + sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); + } while (grf[0] == 0); + grf[0] = 0; + sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); + lsc_block_store(local_address[target_rank], grf); + sycl::ext::oneapi::experimental::printf("DEBUG block store back rank%d:\n", target_rank); + } + } + ); + }); + current_queue.wait(); + std::cout << "zl_debug finish to do barrier " << std::endl; // current_queue.submit([&](handler& h) { // h.parallel_for(range<1>(world_size), [=](id<1> idx) { From eb722a7b652d615695096cd83a276a37e7020634 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Wed, 25 Jun 2025 16:58:46 +0800 Subject: [PATCH 31/58] workaroud 
barrier --- src/xccl/IPCExchange.hpp | 10 ++-- src/xccl/XPUSymmetricMemory.cpp | 101 ++++++++++++-------------------- 2 files changed, 44 insertions(+), 67 deletions(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index e112166e2f..cd5bfbe8f9 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -270,7 +270,7 @@ class allreducer MPI_Comm_size(MPI_COMM_WORLD, &tmp_world); MPI_Comm_rank(MPI_COMM_WORLD, &tmp_rank); - std::cout << "zl_debug get rank & world size after MPI init " << tmp_world << " " << tmp_rank << std::endl; +// std::cout << "zl_debug get rank & world size after MPI init " << tmp_world << " " << tmp_rank << std::endl; rank = tmp_rank; world = tmp_world; @@ -319,7 +319,7 @@ void debug_print_buffer(sycl::queue& queue, int *address, int count) { void *base_addr; size_t base_size; zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); - std::cout << "zl_debug get base address " << base_addr << " base size " << base_size << std::endl; +// std::cout << "zl_debug get base address " << base_addr << " base size " << base_size << std::endl; // Step 2: Get IPC mem handle from base address alignas(64) exchange_contents send_buf; @@ -328,7 +328,7 @@ void debug_print_buffer(sycl::queue& queue, int *address, int count) { // fill in the exchange info zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); send_buf.offset = (char*)ptr - (char*)base_addr; - std::cout << "zl_debug get address base offset " << send_buf.offset << std::endl; +// std::cout << "zl_debug get address base offset " << send_buf.offset << std::endl; send_buf.pid = getpid(); // Step 3: Exchange the handles and offsets @@ -347,10 +347,10 @@ void debug_print_buffer(sycl::queue& queue, int *address, int count) { zeCheck(zeMemOpenIpcHandle( l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); - std::cout << "zl_debug get peer " << i << " with base address: " << peer_base << " offset: " << peer->offset << 
std::endl; +// std::cout << "zl_debug get peer " << i << " with base address: " << peer_base << " offset: " << peer->offset << std::endl; buffers[i] = (char*)peer_base + peer->offset; // make sure data correction - debug_print_buffer(queue, static_cast(buffers[i]), ELE_COUNT); +// debug_print_buffer(queue, static_cast(buffers[i]), ELE_COUNT); offsets[i] = peer->offset; ipc_handle[i] = send_buf.ipc_handle; } diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index e49677b419..fa016c0702 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -188,7 +188,7 @@ at::Tensor XPUSymmetricMemory::get_signal_pad( " bytes) exceeds the allocated size (", signal_pad_size, " bytes)"); - std::cout << "[Native] zl_debug get singnal_pads " << std::endl; +// std::cout << "[Native] zl_debug get singnal_pads " << std::endl; auto data_ptr = reinterpret_cast(signal_pads_[rank]) + storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); @@ -225,66 +225,43 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { c10::DeviceGuard guard(local_device); sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); -// sycl::event dep_event = current_queue.ext_oneapi_submit_barrier(); - - std::cout << "zl_debug start to do barrier " << std::endl; -// current_queue.submit([=](sycl::handler& h) { -// h.depends_on({dep_event}); -// h.host_task([=]() { -// MPI_Barrier(MPI_COMM_WORLD); -// }); -// }); - - int *peer_address[MAX_RANK]; - int *local_address[MAX_RANK]; - - for (int i = 0; i < world_size_; i++) { - peer_address[i] = static_cast(singnal_pads[i]) + world_size_ * channel + rank_; - local_address[i] = static_cast(singnal_pads[rank]) + world_size_ * channel + i; - } - int tmp_rank = rank_; - current_queue.submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::range<1>{ world_size_ }, [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL{ - int target_rank = idx.get_linear_id(); - 
sycl::ext::oneapi::experimental::printf("DEBUG loop to rank%d: \n", target_rank); - if (target_rank != tmp_rank) { - simd grf; - grf[0] = 1; - lsc_block_store(peer_address[target_rank], grf); - sycl::ext::oneapi::experimental::printf("DEBUG block store done rank%d: \n", target_rank); - do { -// lsc_fence(); - fence(); - grf = lsc_block_load - (local_address[idx]); - sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); - } while (grf[0] == 0); - grf[0] = 0; - sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); - lsc_block_store(local_address[target_rank], grf); - sycl::ext::oneapi::experimental::printf("DEBUG block store back rank%d:\n", target_rank); - } - } - ); - }); - current_queue.wait(); - std::cout << "zl_debug finish to do barrier " << std::endl; - -// current_queue.submit([&](handler& h) { -// h.parallel_for(range<1>(world_size), [=](id<1> idx) { -// int target_rank = idx[0]; -// if (target_rank == rank) { -// return; -// } -// //todo: implement -//// bool put_success = try_put_signal( -//// signal_pads[target_rank] + world_size * channel + rank, timeout_ms); -//// -//// bool wait_success = try_wait_signal( -//// signal_pads[rank] + world_size * channel + target_rank, timeout_ms); -// }); -// }); +// std::cout << "zl_debug start to do barrier " << std::endl; +// +// int *peer_address[MAX_RANK]; +// int *local_address[MAX_RANK]; +// +// for (int i = 0; i < world_size_; i++) { +// peer_address[i] = static_cast(singnal_pads[i]) + world_size_ * channel + rank_; +// local_address[i] = static_cast(singnal_pads[rank]) + world_size_ * channel + i; +// } +// int tmp_rank = rank_; +// current_queue.submit([&](sycl::handler& cgh) { +// cgh.parallel_for( +// sycl::range<1>{ world_size_ }, [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL{ +// int target_rank = idx.get_linear_id(); +// sycl::ext::oneapi::experimental::printf("DEBUG loop to rank%d: \n", target_rank); +// if (target_rank != tmp_rank) { +// 
simd grf; +// grf[0] = 1; +// lsc_block_store(peer_address[target_rank], grf); +// sycl::ext::oneapi::experimental::printf("DEBUG block store done rank%d: \n", target_rank); +// do { +//// lsc_fence(); +// fence(); +// grf = lsc_block_load +// (local_address[idx]); +// sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); +// } while (grf[0] == 0); +// grf[0] = 0; +// sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); +// lsc_block_store(local_address[target_rank], grf); +// sycl::ext::oneapi::experimental::printf("DEBUG block store back rank%d:\n", target_rank); +// } +// } +// ); +// }); +// current_queue.wait(); +// std::cout << "zl_debug finish to do barrier " << std::endl; } void XPUSymmetricMemory::put_signal( @@ -353,7 +330,7 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t signal_pad_offset = at::round_up(size, 16UL); size_t block_size = signal_pad_offset + signal_pad_size; - std::cout << "[Native] zl_debug in allocation with original size " << size << " with pad size= " << signal_pad_size << " with total block size= " << block_size << std::endl; +// std::cout << "[Native] zl_debug in allocation with original size " << size << " with pad size= " << signal_pad_size << " with total block size= " << block_size << std::endl; // 获取 SYCL/Level Zero context 和 device sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); From 5d242723ff27507a6a78b32a86a26399d1cb9dc5 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Thu, 3 Jul 2025 21:27:43 +0800 Subject: [PATCH 32/58] refine ipc exchange --- cmake/XCCL.cmake | 2 +- src/xccl/IPCExchange.hpp | 78 ++++++++++++++++++++-------- src/xccl/XPUSymmetricMemory.hpp | 2 +- src/xccl/XPUSymmetricMemoryUtils.hpp | 2 +- 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/cmake/XCCL.cmake b/cmake/XCCL.cmake index ed8b94c04f..2e4afc01a5 100644 --- a/cmake/XCCL.cmake +++ b/cmake/XCCL.cmake @@ -19,7 +19,7 @@ if(NOT __XCCL_INCLUDED) 
${XCCL_LIBRARY}) set_property( TARGET torch::xccl APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES - /mnt/cache/drivers/neo/level_zero/include) + /usr/include) set_property( TARGET torch::xccl APPEND PROPERTY INTERFACE_LINK_LIBRARIES /usr/lib/x86_64-linux-gnu/libze_loader.so) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index cd5bfbe8f9..043d9c6f67 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -9,7 +9,7 @@ #include #include -#include +//#include #include #include @@ -117,7 +117,7 @@ int prepare_socket(const char *sockname) { } int server_listen(const char *sockname) { - // unlink(sockname); + unlink(sockname); auto sock = prepare_socket(sockname); sysCheck(listen(sock, 10)); @@ -134,14 +134,45 @@ int serv_accept(int listen_sock) { return accept_sock; } +bool wait_for_socket_file(const char* path, int max_seconds = 10) { + struct stat buffer; + for (int i = 0; i < max_seconds * 10; ++i) { + if (stat(path, &buffer) == 0) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + return false; +} + int client_connect(const char *server, const char *client) { + if (!wait_for_socket_file(server, 10)) { + std::cerr << "Error: timeout waiting for server socket file: " << server + << std::endl; + exit(EXIT_FAILURE); + } auto sock = prepare_socket(client); sockaddr_un sun; memset(&sun, 0, sizeof(sun)); sun.sun_family = AF_UNIX; strcpy(sun.sun_path, server); auto len = offsetof(sockaddr_un, sun_path) + strlen(server); - sysCheck(connect(sock, (sockaddr *)&sun, len)); + //sysCheck(connect(sock, (sockaddr *)&sun, len)); + // connect重试 + const int max_retries = 50; + int retry = 0; + int ret = -1; + while (retry < max_retries) { + ret = connect(sock, (sockaddr*)&sun, len); + if (ret == 0) + break; // 连接成功 + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + retry++; + } + if (ret != 0) { + perror("connect failed"); + exit(EXIT_FAILURE); + } return sock; } @@ -156,7 +187,7 @@ void 
un_allgather(exchange_contents* send_buf, exchange_contents recv_buf[], int unlink(server_name); auto s_listen = server_listen(server_name); - MPI_Barrier(MPI_COMM_WORLD); +// MPI_Barrier(MPI_COMM_WORLD); pollfd fdarray[world]; int recv_socks[world-1]; @@ -253,27 +284,27 @@ class allreducer { if (initialized) return; int flag = 0; - MPI_Initialized(&flag); - - if (!flag) { - auto ret = MPI_Init(NULL, NULL); - if (ret == MPI_ERR_OTHER) { - std::cout<<"MPI init error"<(buffers[i]), ELE_COUNT); offsets[i] = peer->offset; ipc_handle[i] = send_buf.ipc_handle; + } } } diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index 6bfeec2291..d083fee65f 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace c10d::symmetric_memory { diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp index e7a492abe8..3814f57669 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.hpp +++ b/src/xccl/XPUSymmetricMemoryUtils.hpp @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include namespace c10d { From 3be5ebec259fbe2e27d726dea49d7d6a22df36ab Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Thu, 10 Jul 2025 16:57:26 +0800 Subject: [PATCH 33/58] refine ipc exchange --- src/xccl/IPCExchange.hpp | 369 +++++++++++++++++++++------------------ 1 file changed, 203 insertions(+), 166 deletions(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index 043d9c6f67..12b736f772 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -1,31 +1,31 @@ +#include +#include +#include #include -#include #include +#include #include -#include -#include #include -#include -#include #include +#include +#include //#include #include -#include +#include #include #include -#include +#include // for std::chrono::milliseconds +#include // for std::this_thread::sleep_for #include "xccl/ze_exception.hpp" #define ELE_COUNT 128 
-struct exchange_contents -{ +struct exchange_contents { // first 4-byte is file descriptor for drmbuf or gem object - union - { + union { ze_ipc_mem_handle_t ipc_handle; int fd = -1; }; @@ -33,10 +33,9 @@ struct exchange_contents int pid = -1; }; -#define sysCheck(x) \ - if (x == -1) { \ - throw std::system_error( \ - std::make_error_code(std::errc(errno))); \ +#define sysCheck(x) \ + if (x == -1) { \ + throw std::system_error(std::make_error_code(std::errc(errno))); \ } // We can't inherit it from cmsghdr because flexible array member @@ -44,9 +43,8 @@ struct exchange_fd { char obscure[CMSG_LEN(sizeof(int)) - sizeof(int)]; int fd; - exchange_fd(int cmsg_level, int cmsg_type, int fd) - : fd(fd) { - auto* cmsg = reinterpret_cast(obscure); + exchange_fd(int cmsg_level, int cmsg_type, int fd) : fd(fd) { + auto* cmsg = reinterpret_cast(obscure); cmsg->cmsg_len = sizeof(exchange_fd); cmsg->cmsg_level = cmsg_level; cmsg->cmsg_type = cmsg_type; @@ -69,7 +67,7 @@ void un_send_fd(int sock, int fd, int rank, size_t offset) { msg.msg_name = nullptr; msg.msg_namelen = 0; - exchange_fd cmsg (SOL_SOCKET, SCM_RIGHTS, fd); + exchange_fd cmsg(SOL_SOCKET, SCM_RIGHTS, fd); msg.msg_control = &cmsg; msg.msg_controllen = sizeof(exchange_fd); @@ -98,7 +96,7 @@ std::tuple un_recv_fd(int sock) { return std::make_tuple(cmsg.fd, rank_offset.first, rank_offset.second); } -int prepare_socket(const char *sockname) { +int prepare_socket(const char* sockname) { sockaddr_un un; memset(&un, 0, sizeof(un)); un.sun_family = AF_UNIX; @@ -111,13 +109,13 @@ int prepare_socket(const char *sockname) { sysCheck(ioctl(sock, FIONBIO, &on)); auto size = offsetof(sockaddr_un, sun_path) + strlen(un.sun_path); - sysCheck(bind(sock, (sockaddr *)&un, size)); + sysCheck(bind(sock, (sockaddr*)&un, size)); return sock; } -int server_listen(const char *sockname) { - unlink(sockname); +int server_listen(const char* sockname) { + unlink(sockname); auto sock = prepare_socket(sockname); sysCheck(listen(sock, 10)); @@ 
-125,10 +123,10 @@ int server_listen(const char *sockname) { } int serv_accept(int listen_sock) { - sockaddr_un un; + sockaddr_un un; socklen_t len = sizeof(un); - auto accept_sock = accept(listen_sock, (sockaddr *)&un, &len); + auto accept_sock = accept(listen_sock, (sockaddr*)&un, &len); sysCheck(accept_sock); return accept_sock; @@ -145,19 +143,21 @@ bool wait_for_socket_file(const char* path, int max_seconds = 10) { return false; } -int client_connect(const char *server, const char *client) { +int client_connect(const char* server, const char* client) { + // std::cout << "zl_debug in client connect " << server << "___" << client + // << std::endl; if (!wait_for_socket_file(server, 10)) { std::cerr << "Error: timeout waiting for server socket file: " << server << std::endl; exit(EXIT_FAILURE); } auto sock = prepare_socket(client); + // std::cout << "zl_debug prepare socket done " << std::endl; sockaddr_un sun; memset(&sun, 0, sizeof(sun)); sun.sun_family = AF_UNIX; strcpy(sun.sun_path, server); auto len = offsetof(sockaddr_un, sun_path) + strlen(server); - //sysCheck(connect(sock, (sockaddr *)&sun, len)); // connect重试 const int max_retries = 50; int retry = 0; @@ -173,46 +173,62 @@ int client_connect(const char *server, const char *client) { perror("connect failed"); exit(EXIT_FAILURE); } + + // sysCheck(connect(sock, (sockaddr*)&sun, len)); + // std::cout << "zl_debug connect done " << std::endl; return sock; } -void un_allgather(exchange_contents* send_buf, exchange_contents recv_buf[], int rank, int world) { +void un_allgather( + exchange_contents* send_buf, + exchange_contents recv_buf[], + int rank, + int world) { const char* servername_prefix = "/tmp/open-peer-ipc-mem-server-rank_"; const char* clientname_prefix = "/tmp/open-peer-ipc-mem-client-rank_"; char server_name[64]; /* get username to make server_name unique */ auto uid = getuid(); auto pwd = getpwuid(uid); - snprintf(server_name, sizeof(server_name), "%s%d_%s", servername_prefix, rank, 
pwd->pw_name); + snprintf( + server_name, + sizeof(server_name), + "%s%d_%s", + servername_prefix, + rank, + pwd->pw_name); unlink(server_name); auto s_listen = server_listen(server_name); -// MPI_Barrier(MPI_COMM_WORLD); + // MPI_Barrier(MPI_COMM_WORLD); pollfd fdarray[world]; - int recv_socks[world-1]; + int recv_socks[world - 1]; - for (auto& pollfd : fdarray) pollfd.fd = -1; - std::fill(recv_socks, recv_socks + world -1, -1); + for (auto& pollfd : fdarray) + pollfd.fd = -1; + std::fill(recv_socks, recv_socks + world - 1, -1); auto fd_guard = [&]() { - for (int i = 0, j = 0; i < world; ++ i) { - if ( i != rank && recv_socks[j] != -1) + for (int i = 0, j = 0; i < world; ++i) { + if (i != rank && recv_socks[j] != -1) sysCheck(close(recv_socks[j++])); - if ( fdarray[i].fd != -1 ) + if (fdarray[i].fd != -1) sysCheck(close(fdarray[i].fd)); } }; - struct guard__{ + struct guard__ { using F = decltype(fd_guard); F f; - guard__(const F &f) : f(f) {} - ~guard__() { f(); } + guard__(const F& f) : f(f) {} + ~guard__() { + f(); + } } free_fd(fd_guard); // connect to all ranks - for (int i = 0; i < world; ++ i) { + for (int i = 0; i < world; ++i) { if (rank == i) { fdarray[i].fd = s_listen; fdarray[i].events = POLLIN; @@ -221,10 +237,23 @@ void un_allgather(exchange_contents* send_buf, exchange_contents recv_buf[], int char peer_name[64]; char client_name[64]; - snprintf(client_name, sizeof(client_name), "%s%d-%d_%s", clientname_prefix, rank, i, pwd->pw_name); + snprintf( + client_name, + sizeof(client_name), + "%s%d-%d_%s", + clientname_prefix, + rank, + i, + pwd->pw_name); unlink(client_name); - snprintf(peer_name, sizeof(peer_name), "%s%d_%s", servername_prefix, i, pwd->pw_name); + snprintf( + peer_name, + sizeof(peer_name), + "%s%d_%s", + servername_prefix, + i, + pwd->pw_name); fdarray[i].fd = client_connect(peer_name, client_name); fdarray[i].events = POLLOUT; fdarray[i].revents = 0; @@ -233,12 +262,12 @@ void un_allgather(exchange_contents* send_buf, exchange_contents 
recv_buf[], int // std::future> future_fds[world -1]; int slot = 0; - uint32_t send_progress = 1<fd, rank, send_buf->offset); - send_progress |= 1< -class allreducer -{ -public: - allreducer() - { - initialized = false; - size_per_buffer = 0; - buffer_index = 0; - } - - void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) - { - if (initialized) return; - int flag = 0; -// MPI_Initialized(&flag); -// -// if (!flag) { -// auto ret = MPI_Init(NULL, NULL); -// if (ret == MPI_ERR_OTHER) { -// std::cout<<"MPI init error"< +class allreducer { + public: + allreducer() { + initialized = false; + size_per_buffer = 0; + buffer_index = 0; + } + void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { + if (initialized) + return; + /** + int flag = 0; + MPI_Initialized(&flag); + + if (!flag) { + auto ret = MPI_Init(NULL, NULL); + if (ret == MPI_ERR_OTHER) { + std::cout << "MPI init error" << std::endl; + return; + } else { + std::cout << "MPI init in torch-xpu-ops" << std::endl; + } + } else { + std::cout << "MPI already initialized.\n"; } - void allreduce(sycl::queue& queue, void* inout_buffer, uint32_t size) {} - void release(sycl::queue& queue) - { - // Clean up, close/put ipc handles, free memory, etc. 
- auto l0_ctx = sycl::get_native< - sycl::backend::ext_oneapi_level_zero>(queue.get_context()); - for (int i = 0; i < world; i++) - { - if (i != rank) - { - zeCheck(zeMemCloseIpcHandle(l0_ctx, (char *)buffers[i] - offsets[i])); - } - } - - sycl::free(buffers[rank], queue); - initialized = false; + **/ + + zeCheck(zeInit(0)); + int tmp_rank, tmp_world; + + // MPI_Comm_size(MPI_COMM_WORLD, &tmp_world); + // MPI_Comm_rank(MPI_COMM_WORLD, &tmp_rank); + tmp_world = world_in; + tmp_rank = rank_in; + // std::cout << "zl_debug get rank & world size after MPI init " << + // tmp_world + // << " " << tmp_rank << std::endl; + + rank = tmp_rank; + world = tmp_world; + initialized = true; + } + void allreduce(sycl::queue& queue, void* inout_buffer, uint32_t size) {} + void release(sycl::queue& queue) { + // Clean up, close/put ipc handles, free memory, etc. + auto l0_ctx = sycl::get_native( + queue.get_context()); + for (int i = 0; i < world; i++) { + if (i != rank) { + zeCheck(zeMemCloseIpcHandle(l0_ctx, (char*)buffers[i] - offsets[i])); + } } -void debug_print_buffer(sycl::queue& queue, int *address, int count) { - auto host_ptr = (int *)sycl::malloc_host(count * sizeof(int), queue); - auto tmp_ptr = (int *)sycl::malloc_device(count * sizeof(int), queue); + sycl::free(buffers[rank], queue); + initialized = false; + } + + void debug_print_buffer(sycl::queue& queue, int* address, int count) { + auto host_ptr = (int*)sycl::malloc_host(count * sizeof(int), queue); + auto tmp_ptr = (int*)sycl::malloc_device(count * sizeof(int), queue); queue.memcpy(tmp_ptr, address, count * sizeof(int)); queue.memcpy(host_ptr, tmp_ptr, count * sizeof(int)); @@ -336,69 +371,71 @@ void debug_print_buffer(sycl::queue& queue, int *address, int count) { queue.wait(); for (int i = 0; i < count; i++) { - std::cout << host_ptr[i] << " "; + std::cout << host_ptr[i] << " "; } std::cout << std::endl; -} - // buffer_size as element size - void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) - { - // Step 
1: Get base address of the pointer - sycl::context ctx = queue.get_context(); - auto l0_ctx = sycl::get_native(ctx); - - void *base_addr; - size_t base_size; - zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); -// std::cout << "zl_debug get base address " << base_addr << " base size " << base_size << std::endl; - - // Step 2: Get IPC mem handle from base address - alignas(64) exchange_contents send_buf; - alignas(64) exchange_contents recv_buf[world]; - - // fill in the exchange info - zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); - send_buf.offset = (char*)ptr - (char*)base_addr; -// std::cout << "zl_debug get address base offset " << send_buf.offset << std::endl; - send_buf.pid = getpid(); - - // Step 3: Exchange the handles and offsets - memset(recv_buf, 0, sizeof(recv_buf)); - // Overkill if we don't really needs all peer's handles - un_allgather(&send_buf, recv_buf, rank, world); - - for (uint32_t i = 0; i < world; i++) - { - if (i == rank) { - buffers[i] = ptr; - offsets[i] = 0; - } else { - // Step 4: Prepare pid file descriptor of next process - auto* peer = recv_buf + i; - // Step 6: Open IPC handle of remote peer - auto l0_device - = sycl::get_native(queue.get_device()); - void* peer_base; - - zeCheck(zeMemOpenIpcHandle( - l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); -// std::cout << "zl_debug get peer " << i << " with base address: " << peer_base << " offset: " << peer->offset << std::endl; - buffers[i] = (char*)peer_base + peer->offset; - // make sure data correction -// debug_print_buffer(queue, static_cast(buffers[i]), ELE_COUNT); - offsets[i] = peer->offset; - ipc_handle[i] = send_buf.ipc_handle; - } - } + } + // buffer_size as element size + void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) { + // Step 1: Get base address of the pointer + sycl::context ctx = queue.get_context(); + auto l0_ctx = sycl::get_native(ctx); + + void* base_addr; + size_t base_size; + 
zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); + // std::cout << "zl_debug get base address " << base_addr << " base size " + // << base_size << std::endl; + + // Step 2: Get IPC mem handle from base address + alignas(64) exchange_contents send_buf; + alignas(64) exchange_contents recv_buf[world]; + + // fill in the exchange info + zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); + send_buf.offset = (char*)ptr - (char*)base_addr; + // std::cout << "zl_debug get address base offset " << send_buf.offset + // << std::endl; + send_buf.pid = getpid(); + + // Step 3: Exchange the handles and offsets + memset(recv_buf, 0, sizeof(recv_buf)); + // Overkill if we don't really needs all peer's handles + un_allgather(&send_buf, recv_buf, rank, world); + // std::cout << "zl_debug after un allgather" << std::endl; + for (uint32_t i = 0; i < world; i++) { + // Step 4: Prepare pid file descriptor of next process + auto* peer = recv_buf + i; + // Step 6: Open IPC handle of remote peer + auto l0_device = sycl::get_native( + queue.get_device()); + void* peer_base; + + zeCheck(zeMemOpenIpcHandle( + l0_ctx, + l0_device, + peer->ipc_handle, + ZE_IPC_MEMORY_FLAG_BIAS_CACHED, + &peer_base)); + // std::cout << "zl_debug get peer " << i << " with base + // address: " << peer_base << " offset: " << peer->offset << + // std::endl; + buffers[i] = (char*)peer_base + peer->offset; + // make sure data correction + // debug_print_buffer(queue, static_cast(buffers[i]), + // ELE_COUNT); + offsets[i] = peer->offset; + ipc_handle[i] = send_buf.ipc_handle; } + } - bool initialized; - void* buffers[max_rank]; - void* sync_buffer[max_rank]; - size_t offsets[max_rank]; - ze_ipc_mem_handle_t ipc_handle[max_rank]; - int rank, world; - int size_per_buffer; - int data_size_per_buffer; - int buffer_index; + bool initialized; + void* buffers[max_rank]; + void* sync_buffer[max_rank]; + size_t offsets[max_rank]; + ze_ipc_mem_handle_t ipc_handle[max_rank]; + int rank, 
world; + int size_per_buffer; + int data_size_per_buffer; + int buffer_index; }; From 3d1491c111be3b27a1c6088fa7499386fcdc207a Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 15 Jul 2025 16:25:57 +0800 Subject: [PATCH 34/58] reabse and then workaround barrier --- src/xccl/ProcessGroupXCCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index c820a1c486..84d144511b 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -2078,7 +2078,7 @@ c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { } auto currentStream = at::xpu::getCurrentXPUStream(barDevIdx); - currentStream.synchronize(); +// currentStream.synchronize(); // zl_debug workaround for symm barrier return nullptr; } From fdf3d6d7cdb23eaca5c540cc155e4b1d5b5d8286 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Thu, 17 Jul 2025 15:55:28 +0800 Subject: [PATCH 35/58] impl base class virtual function --- src/xccl/XPUSymmetricMemory.cpp | 9 +++++++++ src/xccl/XPUSymmetricMemory.hpp | 2 ++ 2 files changed, 11 insertions(+) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index fa016c0702..846e19e6b4 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -609,6 +609,15 @@ bool XPUSymmetricMemoryAllocator::has_multicast_support(int device_idx) { return device_has_multicast_support(device_idx); } +c10::DeviceType XPUSymmetricMemoryAllocator::supported_device_type() { + return c10::DeviceType::XPU; +} + +std::string XPUSymmetricMemoryAllocator::name() { + return "XPU"; +} + + c10::intrusive_ptr XPUSymmetricMemoryAllocator::find_block(void* ptr) { std::shared_lock lock(mutex_); auto it = ptr_to_block_.find(ptr); diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index d083fee65f..516cefe1ef 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -118,6 +118,8 @@ class 
XPUSymmetricMemoryAllocator : public SymmetricMemoryAllocator { const std::optional& group_name) override; bool has_multicast_support(int device_idx) override; // void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr); + c10::DeviceType supported_device_type() override; + std::string name() override; private: c10::intrusive_ptr find_block(void* ptr); From 6a7057b32333908442fc181d6c56602a7751782e Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Thu, 31 Jul 2025 15:48:49 +0800 Subject: [PATCH 36/58] format --- src/xccl/ProcessGroupXCCL.cpp | 2 +- src/xccl/XPUSymmetricMemory.cpp | 315 +++++++++++++++------------ src/xccl/XPUSymmetricMemory.hpp | 8 +- src/xccl/XPUSymmetricMemoryUtils.cpp | 39 ++-- src/xccl/ze_exception.hpp | 63 +++--- 5 files changed, 232 insertions(+), 195 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 84d144511b..7042189147 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -2078,7 +2078,7 @@ c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { } auto currentStream = at::xpu::getCurrentXPUStream(barDevIdx); -// currentStream.synchronize(); // zl_debug workaround for symm barrier + // currentStream.synchronize(); // zl_debug workaround for symm barrier return nullptr; } diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 846e19e6b4..bb019d1d3f 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -1,12 +1,12 @@ +#include #include #include -#include #include #include -#include #include #include +#include #include #include @@ -41,8 +41,8 @@ AllocationRef::~AllocationRef() { if (is_finalizing()) { return; } -c10::Device local_device(c10::DeviceType::XPU, device_idx); -c10::DeviceGuard guard(local_device); + c10::Device local_device(c10::DeviceType::XPU, device_idx); + c10::DeviceGuard guard(local_device); c10::xpu::syncStreamsOnDevice(); } @@ -75,8 +75,10 @@ 
XPUSymmetricMemory::XPUSymmetricMemory( c10::DeviceGuard guard(local_device); // todo: zl_debug - at::xpu::getCurrentXPUStream().queue().memcpy(buffers_dev_, buffers_.data(), arr_size); - at::xpu::getCurrentXPUStream().queue().memcpy(signal_pads_dev_, signal_pads_.data(), arr_size); + at::xpu::getCurrentXPUStream().queue().memcpy( + buffers_dev_, buffers_.data(), arr_size); + at::xpu::getCurrentXPUStream().queue().memcpy( + signal_pads_dev_, signal_pads_.data(), arr_size); } std::vector XPUSymmetricMemory::get_buffer_ptrs() { @@ -111,18 +113,21 @@ void* XPUSymmetricMemory::get_multicast_ptr() { return mc_addr_; } -void XPUSymmetricMemory::copy_buffer(at::Tensor src, at::Tensor dst , size_t size) { - sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - auto src_ptr = src.data_ptr(); - auto dst_ptr = dst.data_ptr(); - - size_t copy_size = size * c10::elementSize(src.scalar_type()); +void XPUSymmetricMemory::copy_buffer( + at::Tensor src, + at::Tensor dst, + size_t size) { + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + auto src_ptr = src.data_ptr(); + auto dst_ptr = dst.data_ptr(); -// std::cout << "[Native] zl_debug start to copy from src to dst with size " << copy_size << std::endl; - current_queue.memcpy(dst_ptr, src_ptr, copy_size); -// current_queue.wait(); -// std::cout << "[Native] zl_debug copy done " << std::endl; + size_t copy_size = size * c10::elementSize(src.scalar_type()); + // std::cout << "[Native] zl_debug start to copy from src to dst with size + // " << copy_size << std::endl; + current_queue.memcpy(dst_ptr, src_ptr, copy_size); + // current_queue.wait(); + // std::cout << "[Native] zl_debug copy done " << std::endl; } at::Tensor XPUSymmetricMemory::get_buffer( int rank, @@ -147,7 +152,9 @@ at::Tensor XPUSymmetricMemory::get_buffer( storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); auto options = at::TensorOptions().dtype(dtype).device(device); -// std::cout 
<< "[Native] zl_debug in get_buffer on rank = " << rank << " buffer ptr=" << buffers_[rank] << " offset=" <(signal_pads_[rank]) + storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); @@ -217,88 +224,98 @@ void check_channel(int channel, int world_size) { } void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { - -// LOG(ERROR) << "XPUSymmetricMemory::barrier not supported"; + // LOG(ERROR) << "XPUSymmetricMemory::barrier not supported"; check_channel(channel, world_size_); c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); c10::DeviceGuard guard(local_device); sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); -// std::cout << "zl_debug start to do barrier " << std::endl; -// -// int *peer_address[MAX_RANK]; -// int *local_address[MAX_RANK]; -// -// for (int i = 0; i < world_size_; i++) { -// peer_address[i] = static_cast(singnal_pads[i]) + world_size_ * channel + rank_; -// local_address[i] = static_cast(singnal_pads[rank]) + world_size_ * channel + i; -// } -// int tmp_rank = rank_; -// current_queue.submit([&](sycl::handler& cgh) { -// cgh.parallel_for( -// sycl::range<1>{ world_size_ }, [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL{ -// int target_rank = idx.get_linear_id(); -// sycl::ext::oneapi::experimental::printf("DEBUG loop to rank%d: \n", target_rank); -// if (target_rank != tmp_rank) { -// simd grf; -// grf[0] = 1; -// lsc_block_store(peer_address[target_rank], grf); -// sycl::ext::oneapi::experimental::printf("DEBUG block store done rank%d: \n", target_rank); -// do { -//// lsc_fence(); -// fence(); -// grf = lsc_block_load -// (local_address[idx]); -// sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); -// } while (grf[0] == 0); -// grf[0] = 0; -// sycl::ext::oneapi::experimental::printf("DEBUG block load wip rank%d: \n", target_rank); -// lsc_block_store(local_address[target_rank], grf); -// 
sycl::ext::oneapi::experimental::printf("DEBUG block store back rank%d:\n", target_rank); -// } -// } -// ); -// }); -// current_queue.wait(); -// std::cout << "zl_debug finish to do barrier " << std::endl; + // std::cout << "zl_debug start to do barrier " << std::endl; + // + // int *peer_address[MAX_RANK]; + // int *local_address[MAX_RANK]; + // + // for (int i = 0; i < world_size_; i++) { + // peer_address[i] = static_cast(singnal_pads[i]) + world_size_ + // * channel + rank_; local_address[i] = + // static_cast(singnal_pads[rank]) + world_size_ * channel + i; + // } + // int tmp_rank = rank_; + // current_queue.submit([&](sycl::handler& cgh) { + // cgh.parallel_for( + // sycl::range<1>{ world_size_ }, [=](sycl::item<1> idx) + // SYCL_ESIMD_KERNEL{ + // int target_rank = idx.get_linear_id(); + // sycl::ext::oneapi::experimental::printf("DEBUG loop to rank%d: + // \n", target_rank); if (target_rank != tmp_rank) { + // simd grf; + // grf[0] = 1; + // lsc_block_store(peer_address[target_rank], grf); + // sycl::ext::oneapi::experimental::printf("DEBUG block store + // done rank%d: \n", target_rank); do { + //// lsc_fence(); + // fence(); grf = lsc_block_load + // (local_address[idx]); + // sycl::ext::oneapi::experimental::printf("DEBUG block load + // wip rank%d: \n", target_rank); + // } while (grf[0] == 0); + // grf[0] = 0; + // sycl::ext::oneapi::experimental::printf("DEBUG block load wip + // rank%d: \n", target_rank); lsc_block_store(local_address[target_rank], grf); + // sycl::ext::oneapi::experimental::printf("DEBUG block store + // back rank%d:\n", target_rank); + // } + // } + // ); + // }); + // current_queue.wait(); + // std::cout << "zl_debug finish to do barrier " << std::endl; } void XPUSymmetricMemory::put_signal( int dst_rank, int channel, size_t timeout_ms) { - LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; -// check_channel(channel, world_size_); -// c10::cuda::CUDAGuard guard(local_device_idx_); -// put_signal_kernel<<<1, 
C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( -// reinterpret_cast(signal_pads_dev_), -// dst_rank, -// channel, -// rank_, -// world_size_, -// timeout_ms); -// C10_CUDA_KERNEL_LAUNCH_CHECK(); + // check_channel(channel, world_size_); + // c10::cuda::CUDAGuard guard(local_device_idx_); + // put_signal_kernel<<<1, C10_WARP_SIZE, 0, + // at::cuda::getCurrentCUDAStream()>>>( + // reinterpret_cast(signal_pads_dev_), + // dst_rank, + // channel, + // rank_, + // world_size_, + // timeout_ms); + // C10_CUDA_KERNEL_LAUNCH_CHECK(); } void XPUSymmetricMemory::wait_signal( int src_rank, int channel, size_t timeout_ms) { - - LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; -// check_channel(channel, world_size_); -// c10::cuda::CUDAGuard guard(local_device_idx_); -// wait_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( -// reinterpret_cast(signal_pads_dev_), -// src_rank, -// channel, -// rank_, -// world_size_, -// timeout_ms); -// C10_CUDA_KERNEL_LAUNCH_CHECK(); + LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; + // check_channel(channel, world_size_); + // c10::cuda::CUDAGuard guard(local_device_idx_); + // wait_signal_kernel<<<1, C10_WARP_SIZE, 0, + // at::cuda::getCurrentCUDAStream()>>>( + // reinterpret_cast(signal_pads_dev_), + // src_rank, + // channel, + // rank_, + // world_size_, + // timeout_ms); + // C10_CUDA_KERNEL_LAUNCH_CHECK(); } int XPUSymmetricMemory::get_rank() { @@ -327,60 +344,72 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t size, int device_idx, const std::optional& group_name) { - size_t signal_pad_offset = at::round_up(size, 16UL); size_t block_size = signal_pad_offset + signal_pad_size; -// std::cout << "[Native] zl_debug in allocation with original size " << size << " with pad size= " << signal_pad_size << " with total block size= " << block_size << std::endl; + // std::cout << "[Native] zl_debug in allocation with original size " << size + // << " with pad size= " << 
signal_pad_size << " with total block size= " << + // block_size << std::endl; // 获取 SYCL/Level Zero context 和 device sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::context sycl_ctx = current_queue.get_context(); - sycl::device sycl_dev = current_queue.get_device(); - ze_context_handle_t ze_ctx = - sycl::get_native(sycl_ctx); - ze_device_handle_t ze_dev = sycl::get_native(sycl_dev); - - // 获取 granularity + sycl::context sycl_ctx = current_queue.get_context(); + sycl::device sycl_dev = current_queue.get_device(); + ze_context_handle_t ze_ctx = + sycl::get_native(sycl_ctx); + ze_device_handle_t ze_dev = + sycl::get_native(sycl_dev); + + // 获取 granularity ze_physical_mem_desc_t phys_desc = { ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC, nullptr, 0, block_size}; // 创建物理内存句柄 ze_physical_mem_handle_t handle = nullptr; -// ze_result_t status = zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, &handle); -// TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zePhysicalMemCreate failed"); + // ze_result_t status = zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, + // &handle); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zePhysicalMemCreate + // failed"); // 分配虚拟地址空间(只映射,不物理分配) -// void* ptr = nullptr; - //map_block(&ptr, handle, block_size, device_idx); + // void* ptr = nullptr; + // map_block(&ptr, handle, block_size, device_idx); ze_device_mem_alloc_desc_t default_device_mem_alloc_desc = { - .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, - .pNext = nullptr, - .flags = 0, - .ordinal = 0 -}; + .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + .pNext = nullptr, + .flags = 0, + .ordinal = 0}; // zl_debug create a device memory by sycl void* ptr = sycl::malloc_device(block_size, current_queue); current_queue.memset(ptr, 0, block_size); -// std::cout << "[Native] zl_debug allocate memory with size = " << size << " allocated ptr=" << ptr << std::endl; + // std::cout << "[Native] zl_debug allocate memory with size = " << size << " + // allocated ptr=" << ptr << 
std::endl; - //zeMemAllocDevice(ze_ctx, &default_device_mem_alloc_desc, size, 128, ze_dev, &ptr); -// uint8_t* raw_ptr = xpu_tensor.data_ptr(); -// std::cout << "zl_debug start copy to local " << std::endl; -// current_queue.memcpy(raw_ptr, ptr, 100).wait(); -// std::cout << "zl_debug end copy to local " << std::endl; -// -// std::cout << "zl_debug map virtual to physical done " << std::endl; + // zeMemAllocDevice(ze_ctx, &default_device_mem_alloc_desc, size, 128, ze_dev, + // &ptr); + // uint8_t* raw_ptr = xpu_tensor.data_ptr(); + // std::cout << "zl_debug start copy to local " << std::endl; + // current_queue.memcpy(raw_ptr, ptr, 100).wait(); + // std::cout << "zl_debug end copy to local " << std::endl; + // + // std::cout << "zl_debug map virtual to physical done " << std::endl; // 初始化(memset) - //memset(ptr, 0, block_size); // You may want zeCommandListMemset for GPU-based memset + // memset(ptr, 0, block_size); // You may want zeCommandListMemset for + // GPU-based memset // 构造 Block 和 AllocationRef(假设这些结构未变) - //auto alloc_ref = c10::make_intrusive(ptr, handle, block_size, device_idx); - auto alloc_ref = c10::make_intrusive(ptr, ptr, block_size, device_idx); + // auto alloc_ref = c10::make_intrusive(ptr, handle, + // block_size, device_idx); + auto alloc_ref = + c10::make_intrusive(ptr, ptr, block_size, device_idx); auto block = c10::make_intrusive( - std::move(alloc_ref), device_idx, block_size, size, signal_pad_offset, group_name); + std::move(alloc_ref), + device_idx, + block_size, + size, + signal_pad_offset, + group_name); { std::unique_lock lock(mutex_); @@ -388,7 +417,6 @@ void* XPUSymmetricMemoryAllocator::alloc( } // check this ptr copy to sycl buffer - return ptr; } @@ -468,7 +496,6 @@ static bool check_group_multicast_support( c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* ptr, const std::optional& group_name) { - auto block = find_block(ptr); if (block == nullptr) { return nullptr; @@ -499,7 +526,8 @@ c10::intrusive_ptr 
XPUSymmetricMemoryAllocator::rendezvous( c10::Device local_device(c10::DeviceType::XPU, block->device_idx); c10::DeviceGuard guard(local_device); - // Currently, IpcChannel is using a file based socket for inter-process communication + // Currently, IpcChannel is using a file based socket for inter-process + // communication IpcChannel ipc_channel; auto group_info = get_group_info(group_name_); auto store = group_info.store; @@ -517,8 +545,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( // initialize MPI done allreducer ar; ar.init(current_queue, rank, world_size); -// std::cout << "!!!![Native] zl_debug torch-ccl exchange done" << std::endl; -// + // std::cout << "!!!![Native] zl_debug torch-ccl exchange done" << std::endl; + // auto local_req = RendezvousRequest{ .device_idx = block->device_idx, .pid = getpid(), @@ -535,12 +563,11 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( pids[r] = reqs[r].pid; } - // do IPC exchange for all peer ranks - ar.exchange_peer_ipc_mem(current_queue, ptr); - std::cout << "[Native] zl_debug finished ipc exchange " << std::endl; + // do IPC exchange for all peer ranks + ar.exchange_peer_ipc_mem(current_queue, ptr); + std::cout << "[Native] zl_debug finished ipc exchange " << std::endl; - -// auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); + // auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); std::vector handles(world_size); std::vector buffers(world_size, nullptr); @@ -553,34 +580,35 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); continue; } else { - buffers[r] = ar.buffers[r]; - handles[r] = ar.buffers[r]; //ar.ipc_handle[r]; - signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); + buffers[r] = ar.buffers[r]; + handles[r] = ar.buffers[r]; // ar.ipc_handle[r]; + signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); } - -// //double check this 
buffer -// auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), current_queue); -// auto tmp_ptr = (float *)sycl::malloc_device(tmp_count * sizeof(float), current_queue); -// std::cout << "[Native] zl_debug start to copy exchanged data to local host " << std::endl; -// current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * sizeof(int)); -// current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * sizeof(int)); -// current_queue.wait(); -// std::cout << "[Native] zl_debug finish copy exchanged data to local host" << std::endl; -// -// for (int i = 0; i < tmp_count; i++) { -// std::cout << host_ptr[i] << " "; -// } -// std::cout << std::flush; - -// signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); + // //double check this buffer + // auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), + // current_queue); auto tmp_ptr = (float *)sycl::malloc_device(tmp_count + // * sizeof(float), current_queue); std::cout << "[Native] zl_debug start + // to copy exchanged data to local host " << std::endl; + // current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * + // sizeof(int)); current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * + // sizeof(int)); current_queue.wait(); std::cout << "[Native] zl_debug + // finish copy exchanged data to local host" << std::endl; + // + // for (int i = 0; i < tmp_count; i++) { + // std::cout << host_ptr[i] << " "; + // } + // std::cout << std::flush; + + // signal_pads[r] = (void*)((uintptr_t)buffers[r] + + // block->signal_pad_offset); } storeExchange.barrier(store, rank, world_size); HandleType mc_handle{}; void* mc_addr = nullptr; bool group_has_multicast_support = check_group_multicast_support(reqs); - //todo: not support multicast now + // todo: not support multicast now std::vector> alloc_refs; for (int r = 0; r < world_size; ++r) { if (r == rank) { @@ -617,7 +645,6 @@ std::string XPUSymmetricMemoryAllocator::name() { return "XPU"; } - c10::intrusive_ptr 
XPUSymmetricMemoryAllocator::find_block(void* ptr) { std::shared_lock lock(mutex_); auto it = ptr_to_block_.find(ptr); @@ -629,9 +656,9 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::find_block(void* ptr) { struct RegisterXPUSymmetricMemoryAllocator { RegisterXPUSymmetricMemoryAllocator() { - register_allocator( - c10::DeviceType::XPU, - c10::make_intrusive()); + register_allocator( + c10::DeviceType::XPU, + c10::make_intrusive()); } }; diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index 516cefe1ef..1d13baf40b 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -1,10 +1,10 @@ #pragma once -#include #include -#include +#include #include #include +#include namespace c10d::symmetric_memory { @@ -65,7 +65,7 @@ class XPUSymmetricMemory : public SymmetricMemory { void barrier(int channel, size_t timeout_ms) override; void put_signal(int dst_rank, int channel, size_t timeout_ms) override; void wait_signal(int src_rank, int channel, size_t timeout_ms) override; - void copy_buffer(at::Tensor src, at::Tensor dst , size_t size) override; + void copy_buffer(at::Tensor src, at::Tensor dst, size_t size) override; int get_rank() override; int get_world_size() override; @@ -117,7 +117,7 @@ class XPUSymmetricMemoryAllocator : public SymmetricMemoryAllocator { void* ptr, const std::optional& group_name) override; bool has_multicast_support(int device_idx) override; -// void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr); + // void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr); c10::DeviceType supported_device_type() override; std::string name() override; diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index afb185932e..6764fbc511 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -6,10 +6,10 @@ #include #include -#include -#include #include #include +#include +#include namespace c10d::symmetric_memory { @@ -197,18 
+197,18 @@ void map_block( ze_physical_mem_handle_t handle, size_t size, int device_idx) { - sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::context sycl_ctx = current_queue.get_context(); - ze_context_handle_t ze_context = - sycl::get_native(sycl_ctx); + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + sycl::context sycl_ctx = current_queue.get_context(); + ze_context_handle_t ze_context = + sycl::get_native(sycl_ctx); std::cout << "zl_debug in map_block to get virtual address " << std::endl; // 1. Reserve virtual address space void* virtual_ptr = nullptr; ze_result_t status = zeVirtualMemReserve( - ze_context, // context - nullptr, // let L0 pick virtual address - size, // size - &virtual_ptr // out: reserved address + ze_context, // context + nullptr, // let L0 pick virtual address + size, // size + &virtual_ptr // out: reserved address ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemReserve failed"); std::cout << "zl_debug get zeVirtualMemReserve done " << std::endl; @@ -216,24 +216,21 @@ void map_block( // 2. Map physical memory to virtual address status = zeVirtualMemMap( ze_context, - virtual_ptr, // virtual memory to map to + virtual_ptr, // virtual memory to map to size, - handle, // physical memory handle - 0, // flags - ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE //ze_memory_access_attribute_t + handle, // physical memory handle + 0, // flags + ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE // ze_memory_access_attribute_t ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemMap failed"); std::cout << "zl_debug get zeVirtualMemMap done " << std::endl; // 3. 
Set access attributes ze_memory_access_attribute_t access = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE; - status = zeVirtualMemSetAccessAttribute( - ze_context, - virtual_ptr, - size, - access - ); - TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemSetAccessAttribute failed"); + status = + zeVirtualMemSetAccessAttribute(ze_context, virtual_ptr, size, access); + TORCH_CHECK( + status == ZE_RESULT_SUCCESS, "zeVirtualMemSetAccessAttribute failed"); std::cout << "zl_debug get zeVirtualMemSetAccessAttribute done " << std::endl; // 4. Return pointer diff --git a/src/xccl/ze_exception.hpp b/src/xccl/ze_exception.hpp index 68ac25b8da..a317d98e02 100644 --- a/src/xccl/ze_exception.hpp +++ b/src/xccl/ze_exception.hpp @@ -2,29 +2,42 @@ #include #include -#include #include +#include // Mapping from status to human readable string class zeException : std::exception { - const char * zeResultToString(ze_result_t status) const { - static const std::unordered_map zeResultToStringMap{ - {ZE_RESULT_SUCCESS, "[Core] success"}, - {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, - {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, - {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, - {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, - {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, - {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, - {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, - {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, - {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, - {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, - {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size 
argument is not supported by the device"}, - {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, - {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, - {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, - {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, + const char* zeResultToString(ze_result_t status) const { + static const std::unordered_map zeResultToStringMap{ + {ZE_RESULT_SUCCESS, "[Core] success"}, + {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, + {ZE_RESULT_ERROR_DEVICE_LOST, + "[Core] device hung, reset, was removed, or driver update occurred"}, + {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, + "[Core] insufficient host memory to satisfy call"}, + {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, + "[Core] insufficient device memory to satisfy call"}, + {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, + "[Core] error occurred when building module, see build log for details"}, + {ZE_RESULT_ERROR_UNINITIALIZED, + "[Validation] driver is not initialized"}, + {ZE_RESULT_ERROR_INVALID_NULL_POINTER, + "[Validation] pointer argument may not be nullptr"}, + {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, + "[Validation] object pointed to by handle still in-use by device"}, + {ZE_RESULT_ERROR_INVALID_ENUMERATION, + "[Validation] enumerator argument is not valid"}, + {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, + {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, + "[Validation] size argument is not supported by the device"}, + {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, + "[Validation] alignment argument is not supported by the device"}, + {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, + "[Validation] handle argument is not valid"}, + {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, + "[Validation] generic error code for unsupported features"}, + {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, + "[Validation] 
native binary is not supported by the device"}, }; auto it = zeResultToStringMap.find(status); if (it != zeResultToStringMap.end()) @@ -33,7 +46,7 @@ class zeException : std::exception { return "Unknown Reason"; } -public: + public: zeException(ze_result_t ret) : result_(ret) {} ze_result_t result_; @@ -43,9 +56,9 @@ class zeException : std::exception { } }; -#define zeCheck(x) \ - if (x != ZE_RESULT_SUCCESS) { \ - auto e = zeException(x); \ - std::cout<<"Throw "< Date: Thu, 31 Jul 2025 16:06:27 +0800 Subject: [PATCH 37/58] rm hardcode ze --- cmake/XCCL.cmake | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cmake/XCCL.cmake b/cmake/XCCL.cmake index 2e4afc01a5..6deeb8d3c9 100644 --- a/cmake/XCCL.cmake +++ b/cmake/XCCL.cmake @@ -17,10 +17,4 @@ if(NOT __XCCL_INCLUDED) set_property( TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES ${XCCL_LIBRARY}) - set_property( - TARGET torch::xccl APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES - /usr/include) - set_property( - TARGET torch::xccl APPEND PROPERTY INTERFACE_LINK_LIBRARIES - /usr/lib/x86_64-linux-gnu/libze_loader.so) endif() From 769ddec6d0dbb21c3c644f2ab5d803c27780830e Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Tue, 5 Aug 2025 16:49:48 +0800 Subject: [PATCH 38/58] clean code --- src/xccl/XPUSymmetricMemory.cpp | 138 +------------------------------- src/xccl/XPUSymmetricMemory.hpp | 2 - 2 files changed, 3 insertions(+), 137 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index bb019d1d3f..31bbdaecf3 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -123,11 +123,7 @@ void XPUSymmetricMemory::copy_buffer( size_t copy_size = size * c10::elementSize(src.scalar_type()); - // std::cout << "[Native] zl_debug start to copy from src to dst with size - // " << copy_size << std::endl; current_queue.memcpy(dst_ptr, src_ptr, copy_size); - // current_queue.wait(); - // std::cout << "[Native] zl_debug copy done " << std::endl; } at::Tensor 
XPUSymmetricMemory::get_buffer( int rank, @@ -152,9 +148,7 @@ at::Tensor XPUSymmetricMemory::get_buffer( storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); auto options = at::TensorOptions().dtype(dtype).device(device); - // std::cout << "[Native] zl_debug in get_buffer on rank = " << rank << " - // buffer ptr=" << buffers_[rank] << " offset=" <(signal_pads_[rank]) + storage_offset * element_size; auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); @@ -224,61 +217,12 @@ void check_channel(int channel, int world_size) { } void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { - // LOG(ERROR) << "XPUSymmetricMemory::barrier not supported"; check_channel(channel, world_size_); c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); c10::DeviceGuard guard(local_device); sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - // std::cout << "zl_debug start to do barrier " << std::endl; - // - // int *peer_address[MAX_RANK]; - // int *local_address[MAX_RANK]; - // - // for (int i = 0; i < world_size_; i++) { - // peer_address[i] = static_cast(singnal_pads[i]) + world_size_ - // * channel + rank_; local_address[i] = - // static_cast(singnal_pads[rank]) + world_size_ * channel + i; - // } - // int tmp_rank = rank_; - // current_queue.submit([&](sycl::handler& cgh) { - // cgh.parallel_for( - // sycl::range<1>{ world_size_ }, [=](sycl::item<1> idx) - // SYCL_ESIMD_KERNEL{ - // int target_rank = idx.get_linear_id(); - // sycl::ext::oneapi::experimental::printf("DEBUG loop to rank%d: - // \n", target_rank); if (target_rank != tmp_rank) { - // simd grf; - // grf[0] = 1; - // lsc_block_store(peer_address[target_rank], grf); - // sycl::ext::oneapi::experimental::printf("DEBUG block store - // done rank%d: \n", target_rank); do { - //// lsc_fence(); - // fence(); grf = lsc_block_load - // (local_address[idx]); - // sycl::ext::oneapi::experimental::printf("DEBUG block load - // wip 
rank%d: \n", target_rank); - // } while (grf[0] == 0); - // grf[0] = 0; - // sycl::ext::oneapi::experimental::printf("DEBUG block load wip - // rank%d: \n", target_rank); lsc_block_store(local_address[target_rank], grf); - // sycl::ext::oneapi::experimental::printf("DEBUG block store - // back rank%d:\n", target_rank); - // } - // } - // ); - // }); - // current_queue.wait(); - // std::cout << "zl_debug finish to do barrier " << std::endl; } void XPUSymmetricMemory::put_signal( @@ -286,18 +230,6 @@ void XPUSymmetricMemory::put_signal( int channel, size_t timeout_ms) { LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; - - // check_channel(channel, world_size_); - // c10::cuda::CUDAGuard guard(local_device_idx_); - // put_signal_kernel<<<1, C10_WARP_SIZE, 0, - // at::cuda::getCurrentCUDAStream()>>>( - // reinterpret_cast(signal_pads_dev_), - // dst_rank, - // channel, - // rank_, - // world_size_, - // timeout_ms); - // C10_CUDA_KERNEL_LAUNCH_CHECK(); } void XPUSymmetricMemory::wait_signal( @@ -305,17 +237,6 @@ void XPUSymmetricMemory::wait_signal( int channel, size_t timeout_ms) { LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; - // check_channel(channel, world_size_); - // c10::cuda::CUDAGuard guard(local_device_idx_); - // wait_signal_kernel<<<1, C10_WARP_SIZE, 0, - // at::cuda::getCurrentCUDAStream()>>>( - // reinterpret_cast(signal_pads_dev_), - // src_rank, - // channel, - // rank_, - // world_size_, - // timeout_ms); - // C10_CUDA_KERNEL_LAUNCH_CHECK(); } int XPUSymmetricMemory::get_rank() { @@ -346,11 +267,7 @@ void* XPUSymmetricMemoryAllocator::alloc( const std::optional& group_name) { size_t signal_pad_offset = at::round_up(size, 16UL); size_t block_size = signal_pad_offset + signal_pad_size; - // std::cout << "[Native] zl_debug in allocation with original size " << size - // << " with pad size= " << signal_pad_size << " with total block size= " << - // block_size << std::endl; - // 获取 SYCL/Level Zero context 和 device sycl::queue 
current_queue = at::xpu::getCurrentXPUStream().queue(); sycl::context sycl_ctx = current_queue.get_context(); sycl::device sycl_dev = current_queue.get_device(); @@ -359,48 +276,19 @@ void* XPUSymmetricMemoryAllocator::alloc( ze_device_handle_t ze_dev = sycl::get_native(sycl_dev); - // 获取 granularity ze_physical_mem_desc_t phys_desc = { ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC, nullptr, 0, block_size}; - // 创建物理内存句柄 ze_physical_mem_handle_t handle = nullptr; - // ze_result_t status = zePhysicalMemCreate(ze_ctx, ze_dev, &phys_desc, - // &handle); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zePhysicalMemCreate - // failed"); - // 分配虚拟地址空间(只映射,不物理分配) - // void* ptr = nullptr; - // map_block(&ptr, handle, block_size, device_idx); ze_device_mem_alloc_desc_t default_device_mem_alloc_desc = { .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, .pNext = nullptr, .flags = 0, .ordinal = 0}; - // zl_debug create a device memory by sycl void* ptr = sycl::malloc_device(block_size, current_queue); current_queue.memset(ptr, 0, block_size); - - // std::cout << "[Native] zl_debug allocate memory with size = " << size << " - // allocated ptr=" << ptr << std::endl; - - // zeMemAllocDevice(ze_ctx, &default_device_mem_alloc_desc, size, 128, ze_dev, - // &ptr); - // uint8_t* raw_ptr = xpu_tensor.data_ptr(); - // std::cout << "zl_debug start copy to local " << std::endl; - // current_queue.memcpy(raw_ptr, ptr, 100).wait(); - // std::cout << "zl_debug end copy to local " << std::endl; - // - // std::cout << "zl_debug map virtual to physical done " << std::endl; - - // 初始化(memset) - // memset(ptr, 0, block_size); // You may want zeCommandListMemset for - // GPU-based memset - - // 构造 Block 和 AllocationRef(假设这些结构未变) - // auto alloc_ref = c10::make_intrusive(ptr, handle, - // block_size, device_idx); auto alloc_ref = c10::make_intrusive(ptr, ptr, block_size, device_idx); auto block = c10::make_intrusive( @@ -545,8 +433,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( // initialize MPI 
done allreducer ar; ar.init(current_queue, rank, world_size); - // std::cout << "!!!![Native] zl_debug torch-ccl exchange done" << std::endl; - // + auto local_req = RendezvousRequest{ .device_idx = block->device_idx, .pid = getpid(), @@ -565,7 +452,6 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( // do IPC exchange for all peer ranks ar.exchange_peer_ipc_mem(current_queue, ptr); - std::cout << "[Native] zl_debug finished ipc exchange " << std::endl; // auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); @@ -584,24 +470,6 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( handles[r] = ar.buffers[r]; // ar.ipc_handle[r]; signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); } - - // //double check this buffer - // auto host_ptr = (float *)sycl::malloc_host(tmp_count * sizeof(float), - // current_queue); auto tmp_ptr = (float *)sycl::malloc_device(tmp_count - // * sizeof(float), current_queue); std::cout << "[Native] zl_debug start - // to copy exchanged data to local host " << std::endl; - // current_queue.memcpy(tmp_ptr, physical_buffer_ptr, tmp_count * - // sizeof(int)); current_queue.memcpy(host_ptr, tmp_ptr, tmp_count * - // sizeof(int)); current_queue.wait(); std::cout << "[Native] zl_debug - // finish copy exchanged data to local host" << std::endl; - // - // for (int i = 0; i < tmp_count; i++) { - // std::cout << host_ptr[i] << " "; - // } - // std::cout << std::flush; - - // signal_pads[r] = (void*)((uintptr_t)buffers[r] + - // block->signal_pad_offset); } storeExchange.barrier(store, rank, world_size); diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index 1d13baf40b..aa7f1c1660 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -84,8 +84,6 @@ class XPUSymmetricMemory : public SymmetricMemory { void** signal_pads_dev_; }; -// Metadata associated with each allocation performed by -// `CUDASymmetricMemoryAllocator`. 
struct Block : public c10::intrusive_ptr_target { c10::intrusive_ptr alloc_ref; int device_idx; From 4d7e170434371799821fc4090383f9f721adf27b Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Thu, 7 Aug 2025 17:10:53 +0800 Subject: [PATCH 39/58] update --- src/xccl/IPCExchange.hpp | 66 +++++++---------- src/xccl/ze_exception.hpp | 149 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 39 deletions(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index 12b736f772..dc5a9189d8 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -144,28 +144,24 @@ bool wait_for_socket_file(const char* path, int max_seconds = 10) { } int client_connect(const char* server, const char* client) { - // std::cout << "zl_debug in client connect " << server << "___" << client - // << std::endl; if (!wait_for_socket_file(server, 10)) { std::cerr << "Error: timeout waiting for server socket file: " << server << std::endl; exit(EXIT_FAILURE); } auto sock = prepare_socket(client); - // std::cout << "zl_debug prepare socket done " << std::endl; sockaddr_un sun; memset(&sun, 0, sizeof(sun)); sun.sun_family = AF_UNIX; strcpy(sun.sun_path, server); auto len = offsetof(sockaddr_un, sun_path) + strlen(server); - // connect重试 const int max_retries = 50; int retry = 0; int ret = -1; while (retry < max_retries) { ret = connect(sock, (sockaddr*)&sun, len); if (ret == 0) - break; // 连接成功 + break; std::this_thread::sleep_for(std::chrono::milliseconds(100)); retry++; } @@ -175,7 +171,6 @@ int client_connect(const char* server, const char* client) { } // sysCheck(connect(sock, (sockaddr*)&sun, len)); - // std::cout << "zl_debug connect done " << std::endl; return sock; } @@ -314,33 +309,17 @@ class allreducer { void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { if (initialized) return; - /** - int flag = 0; - MPI_Initialized(&flag); - - if (!flag) { - auto ret = MPI_Init(NULL, NULL); - if (ret == MPI_ERR_OTHER) { - std::cout << 
"MPI init error" << std::endl; - return; - } else { - std::cout << "MPI init in torch-xpu-ops" << std::endl; - } - } else { - std::cout << "MPI already initialized.\n"; + + // 动态加载 Level Zero + if (!load_level_zero_library()) { + throw std::runtime_error("Failed to initialize Level Zero"); } - **/ - zeCheck(zeInit(0)); + zeCheck_dynamic(zeInit_dynamic(0)); int tmp_rank, tmp_world; - // MPI_Comm_size(MPI_COMM_WORLD, &tmp_world); - // MPI_Comm_rank(MPI_COMM_WORLD, &tmp_rank); tmp_world = world_in; tmp_rank = rank_in; - // std::cout << "zl_debug get rank & world size after MPI init " << - // tmp_world - // << " " << tmp_rank << std::endl; rank = tmp_rank; world = tmp_world; @@ -348,12 +327,21 @@ class allreducer { } void allreduce(sycl::queue& queue, void* inout_buffer, uint32_t size) {} void release(sycl::queue& queue) { + if (!initialized) + return; + // Clean up, close/put ipc handles, free memory, etc. + if (!load_level_zero_library()) { + std::cerr << "Warning: Level Zero not available for cleanup" << std::endl; + return; + } + auto l0_ctx = sycl::get_native( queue.get_context()); for (int i = 0; i < world; i++) { if (i != rank) { - zeCheck(zeMemCloseIpcHandle(l0_ctx, (char*)buffers[i] - offsets[i])); + zeCheck_dynamic(zeMemCloseIpcHandle_dynamic( + l0_ctx, (char*)buffers[i] - offsets[i])); } } @@ -377,32 +365,34 @@ class allreducer { } // buffer_size as element size void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) { + if (!load_level_zero_library()) { + throw std::runtime_error("Level Zero not available"); + } + // Step 1: Get base address of the pointer sycl::context ctx = queue.get_context(); auto l0_ctx = sycl::get_native(ctx); void* base_addr; size_t base_size; - zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); - // std::cout << "zl_debug get base address " << base_addr << " base size " - // << base_size << std::endl; + zeCheck_dynamic( + zeMemGetAddressRange_dynamic(l0_ctx, ptr, &base_addr, &base_size)); // Step 2: Get IPC mem 
handle from base address alignas(64) exchange_contents send_buf; alignas(64) exchange_contents recv_buf[world]; // fill in the exchange info - zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); + zeCheck_dynamic( + zeMemGetIpcHandle_dynamic(l0_ctx, base_addr, &send_buf.ipc_handle)); send_buf.offset = (char*)ptr - (char*)base_addr; - // std::cout << "zl_debug get address base offset " << send_buf.offset - // << std::endl; + send_buf.pid = getpid(); // Step 3: Exchange the handles and offsets memset(recv_buf, 0, sizeof(recv_buf)); // Overkill if we don't really needs all peer's handles un_allgather(&send_buf, recv_buf, rank, world); - // std::cout << "zl_debug after un allgather" << std::endl; for (uint32_t i = 0; i < world; i++) { // Step 4: Prepare pid file descriptor of next process auto* peer = recv_buf + i; @@ -411,15 +401,13 @@ class allreducer { queue.get_device()); void* peer_base; - zeCheck(zeMemOpenIpcHandle( + zeCheck_dynamic(zeMemOpenIpcHandle_dynamic( l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); - // std::cout << "zl_debug get peer " << i << " with base - // address: " << peer_base << " offset: " << peer->offset << - // std::endl; + buffers[i] = (char*)peer_base + peer->offset; // make sure data correction // debug_print_buffer(queue, static_cast(buffers[i]), diff --git a/src/xccl/ze_exception.hpp b/src/xccl/ze_exception.hpp index a317d98e02..d975fab48c 100644 --- a/src/xccl/ze_exception.hpp +++ b/src/xccl/ze_exception.hpp @@ -1,10 +1,80 @@ #pragma once +#include #include #include #include #include +// Level Zero API 函数指针类型定义 +typedef ze_result_t (*zeInit_t)(ze_init_flags_t flags); +typedef ze_result_t (*zeMemGetAddressRange_t)( + ze_context_handle_t hContext, + const void* ptr, + void** pBase, + size_t* pSize); +typedef ze_result_t (*zeMemGetIpcHandle_t)( + ze_context_handle_t hContext, + const void* ptr, + ze_ipc_mem_handle_t* pIpcHandle); +typedef ze_result_t (*zeMemOpenIpcHandle_t)( + 
ze_context_handle_t hContext, + ze_device_handle_t hDevice, + ze_ipc_mem_handle_t handle, + ze_ipc_memory_flags_t flags, + void** pptr); +typedef ze_result_t ( + *zeMemCloseIpcHandle_t)(ze_context_handle_t hContext, const void* ptr); +typedef ze_result_t (*zeVirtualMemMap_t)( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_memory_access_attribute_t access, + ze_memory_advice_t advice); +typedef ze_result_t (*zeVirtualMemReserve_t)( + ze_context_handle_t hContext, + const void* pStart, + size_t size, + void** pptr); +typedef ze_result_t (*zeVirtualMemSetAccessAttribute_t)( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_memory_access_attribute_t access); + +// Level Zero 库动态加载函数声明 +bool load_level_zero_library(); +void unload_level_zero_library(); + +#define zeCheck_dynamic(x) \ + do { \ + if (!load_level_zero_library()) { \ + throw std::runtime_error("Level Zero library not available"); \ + } \ + ze_result_t result = (x); \ + if (result != ZE_RESULT_SUCCESS) { \ + auto e = zeException(result); \ + std::cout << "Throw " << e.what() << std::endl; \ + throw e; \ + } \ + } while (0) + +// 动态函数宏 +#define zeInit_dynamic(flags) zeInit_ptr(flags) +#define zeMemGetAddressRange_dynamic(ctx, ptr, base, size) \ + zeMemGetAddressRange_ptr(ctx, ptr, base, size) +#define zeMemGetIpcHandle_dynamic(ctx, ptr, handle) \ + zeMemGetIpcHandle_ptr(ctx, ptr, handle) +#define zeMemOpenIpcHandle_dynamic(ctx, dev, handle, flags, ptr) \ + zeMemOpenIpcHandle_ptr(ctx, dev, handle, flags, ptr) +#define zeMemCloseIpcHandle_dynamic(ctx, ptr) zeMemCloseIpcHandle_ptr(ctx, ptr) +#define zeVirtualMemMap_dynamic(ctx, ptr, size, access, advice) \ + zeVirtualMemMap_ptr(ctx, ptr, size, access, advice) +#define zeVirtualMemReserve_dynamic(ctx, start, size, ptr) \ + zeVirtualMemReserve_ptr(ctx, start, size, ptr) +#define zeVirtualMemSetAccessAttribute_dynamic(ctx, ptr, size, access) \ + zeVirtualMemSetAccessAttribute_ptr(ctx, ptr, size, access) + // Mapping from 
status to human readable string class zeException : std::exception { const char* zeResultToString(ze_result_t status) const { @@ -62,3 +132,82 @@ class zeException : std::exception { std::cout << "Throw " << e.what() << std::endl; \ throw e; \ } + +// 全局函数指针定义 +static zeInit_t zeInit_ptr = nullptr; +static zeMemGetAddressRange_t zeMemGetAddressRange_ptr = nullptr; +static zeMemGetIpcHandle_t zeMemGetIpcHandle_ptr = nullptr; +static zeMemOpenIpcHandle_t zeMemOpenIpcHandle_ptr = nullptr; +static zeMemCloseIpcHandle_t zeMemCloseIpcHandle_ptr = nullptr; +static zeVirtualMemMap_t zeVirtualMemMap_ptr = nullptr; +static zeVirtualMemReserve_t zeVirtualMemReserve_ptr = nullptr; +static zeVirtualMemSetAccessAttribute_t zeVirtualMemSetAccessAttribute_ptr = + nullptr; + +static void* ze_handle = nullptr; + +// Level Zero 库动态加载实现 +inline bool load_level_zero_library() { + if (ze_handle != nullptr) { + return true; // 已经加载 + } + + // 尝试加载 Level Zero 库 + const char* lib_names[] = { + "libze_loader.so.1", + "libze_loader.so", + "/usr/local/lib/libze_loader.so", + "/opt/intel/oneapi/level-zero/latest/lib/libze_loader.so"}; + + for (const char* lib_name : lib_names) { + ze_handle = dlopen(lib_name, RTLD_LAZY); + if (ze_handle != nullptr) { + break; + } + } + + if (ze_handle == nullptr) { + std::cerr << "Failed to load Level Zero library: " << dlerror() + << std::endl; + return false; + } + + // 加载函数指针 + zeInit_ptr = (zeInit_t)dlsym(ze_handle, "zeInit"); + zeMemGetAddressRange_ptr = + (zeMemGetAddressRange_t)dlsym(ze_handle, "zeMemGetAddressRange"); + zeMemGetIpcHandle_ptr = + (zeMemGetIpcHandle_t)dlsym(ze_handle, "zeMemGetIpcHandle"); + zeMemOpenIpcHandle_ptr = + (zeMemOpenIpcHandle_t)dlsym(ze_handle, "zeMemOpenIpcHandle"); + zeMemCloseIpcHandle_ptr = + (zeMemCloseIpcHandle_t)dlsym(ze_handle, "zeMemCloseIpcHandle"); + zeVirtualMemMap_ptr = (zeVirtualMemMap_t)dlsym(ze_handle, "zeVirtualMemMap"); + zeVirtualMemReserve_ptr = + (zeVirtualMemReserve_t)dlsym(ze_handle, 
"zeVirtualMemReserve"); + zeVirtualMemSetAccessAttribute_ptr = (zeVirtualMemSetAccessAttribute_t)dlsym( + ze_handle, "zeVirtualMemSetAccessAttribute"); + + if (!zeVirtualMemMap_ptr || !zeVirtualMemReserve_ptr || + !zeVirtualMemSetAccessAttribute_ptr) { + std::cerr << "Failed to load Level Zero Virtual Memory API functions" + << std::endl; + dlclose(ze_handle); + ze_handle = nullptr; + return false; + } + + return true; +} + +inline void unload_level_zero_library() { + if (ze_handle != nullptr) { + dlclose(ze_handle); + ze_handle = nullptr; + zeInit_ptr = nullptr; + zeMemGetAddressRange_ptr = nullptr; + zeMemGetIpcHandle_ptr = nullptr; + zeMemOpenIpcHandle_ptr = nullptr; + zeMemCloseIpcHandle_ptr = nullptr; + } +} From de4da04e1c3da50ee249d3e4ba3781f36bb841d4 Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Thu, 7 Aug 2025 19:23:09 +0800 Subject: [PATCH 40/58] avoid symbol conflict --- src/xccl/ze_exception.hpp | 111 ++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 35 deletions(-) diff --git a/src/xccl/ze_exception.hpp b/src/xccl/ze_exception.hpp index d975fab48c..99e4cb6e9e 100644 --- a/src/xccl/ze_exception.hpp +++ b/src/xccl/ze_exception.hpp @@ -1,12 +1,20 @@ #pragma once #include -#include #include #include #include -// Level Zero API 函数指针类型定义 +#define zeVirtualMemMap zeVirtualMemMap_original +#define zeVirtualMemReserve zeVirtualMemReserve_original +#define zeVirtualMemSetAccessAttribute zeVirtualMemSetAccessAttribute_original + +#include + +#undef zeVirtualMemMap +#undef zeVirtualMemReserve +#undef zeVirtualMemSetAccessAttribute + typedef ze_result_t (*zeInit_t)(ze_init_flags_t flags); typedef ze_result_t (*zeMemGetAddressRange_t)( ze_context_handle_t hContext, @@ -29,8 +37,9 @@ typedef ze_result_t (*zeVirtualMemMap_t)( ze_context_handle_t hContext, const void* ptr, size_t size, - ze_memory_access_attribute_t access, - ze_memory_advice_t advice); + ze_physical_mem_handle_t hPhysicalMemory, + size_t offset, + 
ze_memory_access_attribute_t access); typedef ze_result_t (*zeVirtualMemReserve_t)( ze_context_handle_t hContext, const void* pStart, @@ -42,7 +51,6 @@ typedef ze_result_t (*zeVirtualMemSetAccessAttribute_t)( size_t size, ze_memory_access_attribute_t access); -// Level Zero 库动态加载函数声明 bool load_level_zero_library(); void unload_level_zero_library(); @@ -59,7 +67,6 @@ void unload_level_zero_library(); } \ } while (0) -// 动态函数宏 #define zeInit_dynamic(flags) zeInit_ptr(flags) #define zeMemGetAddressRange_dynamic(ctx, ptr, base, size) \ zeMemGetAddressRange_ptr(ctx, ptr, base, size) @@ -68,33 +75,25 @@ void unload_level_zero_library(); #define zeMemOpenIpcHandle_dynamic(ctx, dev, handle, flags, ptr) \ zeMemOpenIpcHandle_ptr(ctx, dev, handle, flags, ptr) #define zeMemCloseIpcHandle_dynamic(ctx, ptr) zeMemCloseIpcHandle_ptr(ctx, ptr) -#define zeVirtualMemMap_dynamic(ctx, ptr, size, access, advice) \ - zeVirtualMemMap_ptr(ctx, ptr, size, access, advice) +#define zeVirtualMemMap_dynamic(ctx, ptr, size, phys_mem, offset, access) \ + zeVirtualMemMap_ptr(ctx, ptr, size, phys_mem, offset, access) #define zeVirtualMemReserve_dynamic(ctx, start, size, ptr) \ zeVirtualMemReserve_ptr(ctx, start, size, ptr) #define zeVirtualMemSetAccessAttribute_dynamic(ctx, ptr, size, access) \ zeVirtualMemSetAccessAttribute_ptr(ctx, ptr, size, access) -// Mapping from status to human readable string +// Exception handling class class zeException : std::exception { const char* zeResultToString(ze_result_t status) const { static const std::unordered_map zeResultToStringMap{ {ZE_RESULT_SUCCESS, "[Core] success"}, {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, - {ZE_RESULT_ERROR_DEVICE_LOST, - "[Core] device hung, reset, was removed, or driver update occurred"}, - {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, - "[Core] insufficient host memory to satisfy call"}, - {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, - "[Core] insufficient device memory to satisfy call"}, - 
{ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, - "[Core] error occurred when building module, see build log for details"}, {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, - {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, - "[Validation] object pointed to by handle still in-use by device"}, + {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, + "[Validation] handle argument is not valid"}, {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, @@ -102,12 +101,20 @@ class zeException : std::exception { "[Validation] size argument is not supported by the device"}, {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, - {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, - "[Validation] handle argument is not valid"}, {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, + {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, + "[Core] insufficient host memory to satisfy call"}, + {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, + "[Core] insufficient device memory to satisfy call"}, + {ZE_RESULT_ERROR_DEVICE_LOST, + "[Core] device hung, reset, was removed, or driver update occurred"}, + {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, + "[Core] error occurred when building module, see build log for details"}, + {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, + "[Validation] object pointed to by handle still in-use by device"}, }; auto it = zeResultToStringMap.find(status); if (it != zeResultToStringMap.end()) @@ -133,7 +140,6 @@ class zeException : std::exception { throw e; \ } -// 全局函数指针定义 static zeInit_t zeInit_ptr = nullptr; static zeMemGetAddressRange_t zeMemGetAddressRange_ptr = nullptr; static zeMemGetIpcHandle_t 
zeMemGetIpcHandle_ptr = nullptr; @@ -146,18 +152,11 @@ static zeVirtualMemSetAccessAttribute_t zeVirtualMemSetAccessAttribute_ptr = static void* ze_handle = nullptr; -// Level Zero 库动态加载实现 inline bool load_level_zero_library() { if (ze_handle != nullptr) { - return true; // 已经加载 + return true; } - - // 尝试加载 Level Zero 库 - const char* lib_names[] = { - "libze_loader.so.1", - "libze_loader.so", - "/usr/local/lib/libze_loader.so", - "/opt/intel/oneapi/level-zero/latest/lib/libze_loader.so"}; + const char* lib_names[] = {"/usr/lib/x86_64-linux-gnu/libze_loader.so"}; for (const char* lib_name : lib_names) { ze_handle = dlopen(lib_name, RTLD_LAZY); @@ -172,7 +171,6 @@ inline bool load_level_zero_library() { return false; } - // 加载函数指针 zeInit_ptr = (zeInit_t)dlsym(ze_handle, "zeInit"); zeMemGetAddressRange_ptr = (zeMemGetAddressRange_t)dlsym(ze_handle, "zeMemGetAddressRange"); @@ -188,10 +186,11 @@ inline bool load_level_zero_library() { zeVirtualMemSetAccessAttribute_ptr = (zeVirtualMemSetAccessAttribute_t)dlsym( ze_handle, "zeVirtualMemSetAccessAttribute"); - if (!zeVirtualMemMap_ptr || !zeVirtualMemReserve_ptr || + if (!zeInit_ptr || !zeMemGetAddressRange_ptr || !zeMemGetIpcHandle_ptr || + !zeMemOpenIpcHandle_ptr || !zeMemCloseIpcHandle_ptr || + !zeVirtualMemMap_ptr || !zeVirtualMemReserve_ptr || !zeVirtualMemSetAccessAttribute_ptr) { - std::cerr << "Failed to load Level Zero Virtual Memory API functions" - << std::endl; + std::cerr << "Failed to load Level Zero API functions" << std::endl; dlclose(ze_handle); ze_handle = nullptr; return false; @@ -209,5 +208,47 @@ inline void unload_level_zero_library() { zeMemGetIpcHandle_ptr = nullptr; zeMemOpenIpcHandle_ptr = nullptr; zeMemCloseIpcHandle_ptr = nullptr; + zeVirtualMemMap_ptr = nullptr; + zeVirtualMemReserve_ptr = nullptr; + zeVirtualMemSetAccessAttribute_ptr = nullptr; + } +} + +extern "C" { + +__attribute__((weak)) ze_result_t zeVirtualMemMap( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + 
ze_physical_mem_handle_t hPhysicalMemory, + size_t offset, + ze_memory_access_attribute_t access) { + if (!load_level_zero_library() || !zeVirtualMemMap_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; } + return zeVirtualMemMap_ptr( + hContext, ptr, size, hPhysicalMemory, offset, access); +} + +__attribute__((weak)) ze_result_t zeVirtualMemReserve( + ze_context_handle_t hContext, + const void* pStart, + size_t size, + void** pptr) { + if (!load_level_zero_library() || !zeVirtualMemReserve_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemReserve_ptr(hContext, pStart, size, pptr); +} + +__attribute__((weak)) ze_result_t zeVirtualMemSetAccessAttribute( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_memory_access_attribute_t access) { + if (!load_level_zero_library() || !zeVirtualMemSetAccessAttribute_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemSetAccessAttribute_ptr(hContext, ptr, size, access); +} } From ffd3a90fdd696feda0c7afcee1e4f88633ab962a Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Fri, 8 Aug 2025 10:57:13 +0800 Subject: [PATCH 41/58] refine IPCExchang --- src/xccl/IPCExchange.hpp | 55 ++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index dc5a9189d8..f057dbf593 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -8,9 +8,8 @@ #include #include #include +#include "xccl/ze_exception.hpp" -#include -//#include #include #include @@ -19,8 +18,6 @@ #include // for std::chrono::milliseconds #include // for std::this_thread::sleep_for -#include "xccl/ze_exception.hpp" - #define ELE_COUNT 128 struct exchange_contents { @@ -305,12 +302,35 @@ class allreducer { size_per_buffer = 0; buffer_index = 0; } + allreducer(const allreducer&) = delete; + allreducer& operator=(const allreducer&) = delete; + allreducer(allreducer&& other) noexcept { + *this = std::move(other); + } 
+ allreducer& operator=(allreducer&& other) noexcept { + if (this != &other) { + initialized = other.initialized; + rank = other.rank; + world = other.world; + std::memcpy(buffers, other.buffers, sizeof(buffers)); + std::memcpy(offsets, other.offsets, sizeof(offsets)); + std::memcpy(ipc_handle, other.ipc_handle, sizeof(ipc_handle)); + + other.initialized = false; + } + return *this; + } + ~allreducer() { + if (initialized) { + std::cerr << "Warning: allreducer destroyed without calling release()" + << std::endl; + } + } void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { if (initialized) return; - // 动态加载 Level Zero if (!load_level_zero_library()) { throw std::runtime_error("Failed to initialize Level Zero"); } @@ -329,22 +349,19 @@ class allreducer { void release(sycl::queue& queue) { if (!initialized) return; - - // Clean up, close/put ipc handles, free memory, etc. - if (!load_level_zero_library()) { - std::cerr << "Warning: Level Zero not available for cleanup" << std::endl; - return; - } - - auto l0_ctx = sycl::get_native( - queue.get_context()); - for (int i = 0; i < world; i++) { - if (i != rank) { - zeCheck_dynamic(zeMemCloseIpcHandle_dynamic( - l0_ctx, (char*)buffers[i] - offsets[i])); + try { + auto l0_ctx = sycl::get_native( + queue.get_context()); + for (int i = 0; i < world; i++) { + if (i != rank) { + zeCheck_dynamic(zeMemCloseIpcHandle_dynamic( + l0_ctx, (char*)buffers[i] - offsets[i])); + } } + } catch (const std::exception& e) { + std::cerr << "Warning: Level Zero cleanup failed: " << e.what() + << std::endl; } - sycl::free(buffers[rank], queue); initialized = false; } From 66b77b1b81397ffa79f6d306a9667c50c8074140 Mon Sep 17 00:00:00 2001 From: "Han, Chao1" Date: Fri, 8 Aug 2025 11:08:05 +0800 Subject: [PATCH 42/58] rm header --- src/xccl/XPUSymmetricMemory.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 31bbdaecf3..31a186c3d8 100644 --- 
a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -11,9 +11,6 @@ #include #include -// todo: check this point -#include - // todo: fixed with kernel barrier #include From 868cae8ffad60bb57cc7c69a31ec680fd9917bd0 Mon Sep 17 00:00:00 2001 From: Han Chao Date: Tue, 12 Aug 2025 13:48:08 +0800 Subject: [PATCH 43/58] update --- src/xccl/XPUSymmetricMemory.cpp | 16 ++++++++++------ src/xccl/XPUSymmetricMemoryUtils.cpp | 21 ++++++++++++++------- src/xccl/XPUSymmetricMemoryUtils.hpp | 2 ++ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 31a186c3d8..ae35fa088c 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -71,7 +71,6 @@ XPUSymmetricMemory::XPUSymmetricMemory( c10::Device local_device(c10::DeviceType::XPU, local_device_idx); c10::DeviceGuard guard(local_device); - // todo: zl_debug at::xpu::getCurrentXPUStream().queue().memcpy( buffers_dev_, buffers_.data(), arr_size); at::xpu::getCurrentXPUStream().queue().memcpy( @@ -499,7 +498,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( } bool XPUSymmetricMemoryAllocator::has_multicast_support(int device_idx) { - return device_has_multicast_support(device_idx); + return false; } c10::DeviceType XPUSymmetricMemoryAllocator::supported_device_type() { @@ -521,12 +520,17 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::find_block(void* ptr) { struct RegisterXPUSymmetricMemoryAllocator { RegisterXPUSymmetricMemoryAllocator() { - register_allocator( - c10::DeviceType::XPU, - c10::make_intrusive()); + auto allocator = c10::make_intrusive(); + // Query backend used for XPU + if (getSymmMemBackendXPU() == "XPU") { + // Direct set (static registration) + register_allocator(c10::DeviceType::XPU, allocator); + } else { + // Register availability in case `set_backend` is called dynamically + register_availability("XPU", allocator); + } } }; - static 
RegisterXPUSymmetricMemoryAllocator register_allocator_; } // namespace symmetric_memory diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 6764fbc511..551e12abc5 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -6,19 +6,30 @@ #include #include -#include #include #include #include namespace c10d::symmetric_memory { +std::string getSymmMemBackendXPU() { + static auto val = c10::utils::get_env("TORCH_SYMMMEM"); + if (val.has_value()) { + TORCH_CHECK( + val.value() == "XPU", + "TORCH_SYMMMEM environment variable must be 'XPU'."); + return val.value(); + } + return "XPU"; +} + bool device_has_multicast_support(int device_idx) { - return true; + return false; } bool allow_overlapping_devices() { - return true; + return c10::utils::check_env("TORCH_SYMM_MEM_ALLOW_OVERLAPPING_DEVICES") == + true; } IpcChannel::IpcChannel() @@ -201,7 +212,6 @@ void map_block( sycl::context sycl_ctx = current_queue.get_context(); ze_context_handle_t ze_context = sycl::get_native(sycl_ctx); - std::cout << "zl_debug in map_block to get virtual address " << std::endl; // 1. Reserve virtual address space void* virtual_ptr = nullptr; ze_result_t status = zeVirtualMemReserve( @@ -211,7 +221,6 @@ void map_block( &virtual_ptr // out: reserved address ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemReserve failed"); - std::cout << "zl_debug get zeVirtualMemReserve done " << std::endl; // 2. Map physical memory to virtual address status = zeVirtualMemMap( @@ -223,7 +232,6 @@ void map_block( ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE // ze_memory_access_attribute_t ); TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemMap failed"); - std::cout << "zl_debug get zeVirtualMemMap done " << std::endl; // 3. 
Set access attributes ze_memory_access_attribute_t access = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE; @@ -231,7 +239,6 @@ void map_block( zeVirtualMemSetAccessAttribute(ze_context, virtual_ptr, size, access); TORCH_CHECK( status == ZE_RESULT_SUCCESS, "zeVirtualMemSetAccessAttribute failed"); - std::cout << "zl_debug get zeVirtualMemSetAccessAttribute done " << std::endl; // 4. Return pointer *ptr = virtual_ptr; diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp index 3814f57669..f119928f41 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.hpp +++ b/src/xccl/XPUSymmetricMemoryUtils.hpp @@ -6,6 +6,8 @@ namespace c10d { namespace symmetric_memory { +std::string getSymmMemBackendXPU(); + bool device_has_multicast_support(int device_idx); bool allow_overlapping_devices(); From f3f6f0cb5dcf20d5d09201a3708718de97f80ace Mon Sep 17 00:00:00 2001 From: Han Chao Date: Mon, 18 Aug 2025 09:56:48 +0800 Subject: [PATCH 44/58] refine and add sycl signal --- src/xccl/CMakeLists.txt | 2 + src/xccl/IPCExchange.hpp | 3 - src/xccl/ProcessGroupXCCL.cpp | 2 +- src/xccl/Signal.cpp | 201 +++++++++++++++++++++++++++ src/xccl/Signal.hpp | 114 +++++++++++++++ src/xccl/XPUSymmetricMemory.cpp | 99 ++++++------- src/xccl/XPUSymmetricMemoryUtils.cpp | 1 + 7 files changed, 358 insertions(+), 64 deletions(-) create mode 100644 src/xccl/Signal.cpp create mode 100644 src/xccl/Signal.hpp diff --git a/src/xccl/CMakeLists.txt b/src/xccl/CMakeLists.txt index 74ece226cc..dd04a7dfa4 100644 --- a/src/xccl/CMakeLists.txt +++ b/src/xccl/CMakeLists.txt @@ -3,9 +3,11 @@ file(GLOB xccl_h "*.hpp") file(GLOB xccl_cpp "*.cpp") list(REMOVE_ITEM xccl_cpp "${CMAKE_CURRENT_SOURCE_DIR}/NanCheck_XPU.cpp") +list(REMOVE_ITEM xccl_cpp "${CMAKE_CURRENT_SOURCE_DIR}/Signal.cpp") list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp}) list(APPEND ATen_XPU_SYCL_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/NanCheck_XPU.cpp") +list(APPEND ATen_XPU_SYCL_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/Signal.cpp") set(ATen_XPU_XCCL_SRCS 
${ATen_XPU_XCCL_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index f057dbf593..dc4d3e2d5d 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -426,9 +426,6 @@ class allreducer { &peer_base)); buffers[i] = (char*)peer_base + peer->offset; - // make sure data correction - // debug_print_buffer(queue, static_cast(buffers[i]), - // ELE_COUNT); offsets[i] = peer->offset; ipc_handle[i] = send_buf.ipc_handle; } diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 7042189147..c820a1c486 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -2078,7 +2078,7 @@ c10::intrusive_ptr ProcessGroupXCCL::barrier(const BarrierOptions& opts) { } auto currentStream = at::xpu::getCurrentXPUStream(barDevIdx); - // currentStream.synchronize(); // zl_debug workaround for symm barrier + currentStream.synchronize(); return nullptr; } diff --git a/src/xccl/Signal.cpp b/src/xccl/Signal.cpp new file mode 100644 index 0000000000..451e9c4fc4 --- /dev/null +++ b/src/xccl/Signal.cpp @@ -0,0 +1,201 @@ +#include +#include +#include +#include + +namespace c10d::symmetric_memory { + +struct barrierKernel { + void operator()(sycl::nd_item<1> item) const { + auto thread_id = item.get_local_id(0); + + if (thread_id < world_size) { + auto target_rank = thread_id; + if (target_rank == rank) { + return; + } + auto put_success = try_put_signal_device( + signal_pads[target_rank] + world_size * channel + rank, 10000000); + if (!put_success) { + assert(0); + } + + auto wait_success = try_wait_signal_device( + signal_pads[rank] + world_size * channel + target_rank, 10000000); + if (!wait_success) { + assert(0); + } + } + } + + barrierKernel( + uint32_t** signal_pads, + int channel, + int rank, + int world_size, + size_t timeout_ms) + : signal_pads(signal_pads), + channel(channel), + rank(rank), + world_size(world_size), + 
timeout_ms(timeout_ms) {} + + private: + uint32_t** signal_pads; + int channel; + int rank; + int world_size; + size_t timeout_ms; +}; + +void barrier_impl_xpu( + uint32_t** signal_pads, + int channel, + int rank, + int world_size, + size_t timeout_ms, + at::xpu::XPUStream& stream) { + int64_t maxNumThreadsPerBlock = syclMaxWorkGroupSize(); + const size_t numThreadsPerBlock = + std::min(maxNumThreadsPerBlock, std::max(32, world_size)); + + if (!(numThreadsPerBlock > 0)) { + return; + } + int64_t numBlocks = 1; + auto global_range = numBlocks * numThreadsPerBlock; + auto local_range = numThreadsPerBlock; + + using Kernel = barrierKernel; + auto kfn = Kernel(signal_pads, channel, rank, world_size, timeout_ms); + + sycl_kernel_submit(global_range, local_range, stream.queue(), kfn); +} + +struct putSignalKernel { + void operator()(sycl::nd_item<1> item) const { + auto thread_id = item.get_local_id(0); + + if (thread_id == 0) { + auto put_success = try_put_signal_device( + signal_pads[dst_rank] + world_size * channel + rank, 10000000); + if (!put_success) { + assert(0); + } + } + } + + putSignalKernel( + uint32_t** signal_pads, + int dst_rank, + int channel, + int rank, + int world_size, + size_t timeout_ms) + : signal_pads(signal_pads), + dst_rank(dst_rank), + channel(channel), + rank(rank), + world_size(world_size), + timeout_ms(timeout_ms) {} + + private: + uint32_t** signal_pads; + int dst_rank; + int channel; + int rank; + int world_size; + size_t timeout_ms; +}; + +void put_signal_impl_xpu( + uint32_t** signal_pads, + int dst_rank, + int channel, + int rank, + int world_size, + size_t timeout_ms, + at::xpu::XPUStream& stream) { + int64_t maxNumThreadsPerBlock = syclMaxWorkGroupSize(); + const size_t numThreadsPerBlock = std::min(maxNumThreadsPerBlock, 32); + + if (!(numThreadsPerBlock > 0)) { + return; + } + + int64_t numBlocks = 1; + auto global_range = numBlocks * numThreadsPerBlock; + auto local_range = numThreadsPerBlock; + + using Kernel = putSignalKernel; + 
auto kfn = + Kernel(signal_pads, dst_rank, channel, rank, world_size, timeout_ms); + + sycl_kernel_submit(global_range, local_range, stream.queue(), kfn); +} + +struct waitSignalKernel { + void operator()(sycl::nd_item<1> item) const { + auto thread_id = item.get_local_id(0); + + if (thread_id == 0) { + auto wait_success = try_wait_signal_device( + signal_pads[rank] + world_size * channel + src_rank, 10000000); + if (!wait_success) { + assert(0); + } + + sycl::atomic_fence(sycl::memory_order_seq_cst, sycl::memory_scope_system); + } + } + + waitSignalKernel( + uint32_t** signal_pads, + int src_rank, + int channel, + int rank, + int world_size, + size_t timeout_ms) + : signal_pads(signal_pads), + src_rank(src_rank), + channel(channel), + rank(rank), + world_size(world_size), + timeout_ms(timeout_ms) {} + + private: + uint32_t** signal_pads; + int src_rank; + int channel; + int rank; + int world_size; + size_t timeout_ms; +}; + +void wait_signal_impl_xpu( + uint32_t** signal_pads, + int src_rank, + int channel, + int rank, + int world_size, + size_t timeout_ms, + at::xpu::XPUStream& stream) { + int64_t maxNumThreadsPerBlock = syclMaxWorkGroupSize(); + const size_t numThreadsPerBlock = std::min(maxNumThreadsPerBlock, 32); + + if (!(numThreadsPerBlock > 0)) { + return; + } + + int64_t numBlocks = 1; + auto global_range = numBlocks * numThreadsPerBlock; + auto local_range = numThreadsPerBlock; + + using Kernel = waitSignalKernel; + auto kfn = + Kernel(signal_pads, src_rank, channel, rank, world_size, timeout_ms); + + sycl_kernel_submit(global_range, local_range, stream.queue(), kfn); +} + +} // namespace c10d::symmetric_memory diff --git a/src/xccl/Signal.hpp b/src/xccl/Signal.hpp new file mode 100644 index 0000000000..876824860a --- /dev/null +++ b/src/xccl/Signal.hpp @@ -0,0 +1,114 @@ +#pragma once + +#include + +#include +#include + +namespace c10d::symmetric_memory { + +using at::native::memory::get_alignment; + +template +uint32_t cas(uint32_t* addr, uint32_t 
compare, uint32_t val) { + sycl::atomic_ref< + uint32_t, + sycl::memory_order::acq_rel, + sycl::memory_scope::system> + ref(*addr); + ref.compare_exchange_strong(compare, val); + return compare; +} + +inline size_t global_timer_ns() { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast( + now.time_since_epoch()) + .count(); +} + +constexpr size_t ns_per_ms = 1e6; + +// Device-compatible version using a simple counter approach +template +bool try_put_signal_device(uint32_t* addr, size_t max_iterations = 1000) { + size_t iterations = 0; + while (cas(addr, 0, 1) != 0) { + if (max_iterations != 0 && iterations++ > max_iterations) { + return false; + } + } + return true; +} + +template +bool try_wait_signal_device(uint32_t* addr, size_t max_iterations = 1000) { + size_t iterations = 0; + while (cas(addr, 1, 0) != 1) { + if (max_iterations != 0 && iterations++ > max_iterations) { + return false; + } + } + return true; +} + +template +bool try_put_signal(uint32_t* addr, size_t timeout_ms) { + size_t deadline = global_timer_ns() + timeout_ms * ns_per_ms; + while (cas(addr, 0, 1) != 0) { + if (timeout_ms != 0 && global_timer_ns() > deadline) { + return false; + } + } + return true; +} + +template +bool try_wait_signal(uint32_t* addr, size_t timeout_ms) { + size_t deadline = global_timer_ns() + timeout_ms * ns_per_ms; + while (cas(addr, 1, 0) != 1) { + if (timeout_ms != 0 && global_timer_ns() > deadline) { + return false; + } + } + return true; +} + +template +void put_signal(uint32_t* addr) { + while (cas(addr, 0, 1) != 0) + ; +} + +template +void wait_signal(uint32_t* addr) { + while (cas(addr, 1, 0) != 1) + ; +} + +void barrier_impl_xpu( + uint32_t** signal_pads, + int channel, + int rank, + int world_size, + size_t timeout_ms, + at::xpu::XPUStream& stream); + +void put_signal_impl_xpu( + uint32_t** signal_pads, + int dst_rank, + int channel, + int rank, + int world_size, + size_t timeout_ms, + at::xpu::XPUStream& stream); + +void 
wait_signal_impl_xpu( + uint32_t** signal_pads, + int src_rank, + int channel, + int rank, + int world_size, + size_t timeout_ms, + at::xpu::XPUStream& stream); +} // namespace c10d::symmetric_memory diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index ae35fa088c..03a526537e 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -19,9 +20,6 @@ namespace c10d { namespace symmetric_memory { -/* Start of XPUSymmetricMemory implementation */ - -// A set of exchange methods with prefix "XPUSymmetricMemory" static StoreExchange storeExchange = StoreExchange("XPUSymmetricMemory"); AllocationRef::AllocationRef( @@ -217,22 +215,55 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); c10::DeviceGuard guard(local_device); + auto stream = at::xpu::getCurrentXPUStream(); - sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + barrier_impl_xpu( + reinterpret_cast(signal_pads_dev_), + channel, + rank_, + world_size_, + timeout_ms, + stream); } void XPUSymmetricMemory::put_signal( int dst_rank, int channel, size_t timeout_ms) { - LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; + check_channel(channel, world_size_); + + c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); + c10::DeviceGuard guard(local_device); + auto stream = at::xpu::getCurrentXPUStream(); + + put_signal_impl_xpu( + reinterpret_cast(signal_pads_dev_), + dst_rank, + channel, + rank_, + world_size_, + timeout_ms, + stream); } void XPUSymmetricMemory::wait_signal( int src_rank, int channel, size_t timeout_ms) { - LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; + check_channel(channel, world_size_); + + c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); + c10::DeviceGuard guard(local_device); + auto stream = at::xpu::getCurrentXPUStream(); + + 
wait_signal_impl_xpu( + reinterpret_cast(signal_pads_dev_), + src_rank, + channel, + rank_, + world_size_, + timeout_ms, + stream); } int XPUSymmetricMemory::get_rank() { @@ -265,24 +296,6 @@ void* XPUSymmetricMemoryAllocator::alloc( size_t block_size = signal_pad_offset + signal_pad_size; sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::context sycl_ctx = current_queue.get_context(); - sycl::device sycl_dev = current_queue.get_device(); - ze_context_handle_t ze_ctx = - sycl::get_native(sycl_ctx); - ze_device_handle_t ze_dev = - sycl::get_native(sycl_dev); - - ze_physical_mem_desc_t phys_desc = { - ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC, nullptr, 0, block_size}; - - ze_physical_mem_handle_t handle = nullptr; - - ze_device_mem_alloc_desc_t default_device_mem_alloc_desc = { - .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, - .pNext = nullptr, - .flags = 0, - .ordinal = 0}; - void* ptr = sycl::malloc_device(block_size, current_queue); current_queue.memset(ptr, 0, block_size); auto alloc_ref = @@ -299,8 +312,6 @@ void* XPUSymmetricMemoryAllocator::alloc( std::unique_lock lock(mutex_); ptr_to_block_.emplace(ptr, std::move(block)); } - // check this ptr copy to sycl buffer - return ptr; } @@ -354,29 +365,6 @@ void validate_rendezvous_requests( } } -static bool check_group_multicast_support( - const std::vector& reqs) { - std::vector ranks_with_multicast_support; - for (size_t r = 0; r < reqs.size(); ++r) { - if (reqs[r].has_multicast_support) { - ranks_with_multicast_support.push_back(r); - } - } - if (ranks_with_multicast_support.size() == reqs.size()) { - return true; - } else { - // We don't expect this to happen. But we want to let the user to know if - // this happens. - if (ranks_with_multicast_support.size() != 0) { - LOG(WARNING) - << "Only a subset of ranks in the group has multicast support: " - << ranks_with_multicast_support << " (world_size=" << reqs.size() - << "). 
Skipping multicast initialization because this is unexpected."; - } - return false; - } -} - c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( void* ptr, const std::optional& group_name) { @@ -417,16 +405,10 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( auto store = group_info.store; int rank = group_info.rank; int world_size = group_info.world_size; - int block_fd; // Step 6: Open IPC handle of remote peer sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - sycl::context ctx = current_queue.get_context(); - auto l0_ctx = sycl::get_native(ctx); - sycl::device dev = current_queue.get_device(); - auto l0_dev = sycl::get_native(dev); - // check with original ones // debug code - // initialize MPI done + allreducer ar; ar.init(current_queue, rank, world_size); @@ -449,8 +431,6 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( // do IPC exchange for all peer ranks ar.exchange_peer_ipc_mem(current_queue, ptr); - // auto imported_fds = ipc_channel.all_gather_fds(rank, pids, block_fd); - std::vector handles(world_size); std::vector buffers(world_size, nullptr); std::vector signal_pads(world_size, nullptr); @@ -471,8 +451,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( HandleType mc_handle{}; void* mc_addr = nullptr; - bool group_has_multicast_support = check_group_multicast_support(reqs); - // todo: not support multicast now + std::vector> alloc_refs; for (int r = 0; r < world_size; ++r) { if (r == rank) { diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 551e12abc5..39c07399b8 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include From f60645e7f8f8f6262b2d31632c54aab804d39845 Mon Sep 17 00:00:00 2001 From: Han Chao Date: Mon, 25 Aug 2025 15:01:04 +0800 Subject: [PATCH 45/58] correct rendezvous --- src/xccl/IPCExchange.hpp | 2 -- 
src/xccl/XPUSymmetricMemory.cpp | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index dc4d3e2d5d..101f424019 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -192,8 +192,6 @@ void un_allgather( unlink(server_name); auto s_listen = server_listen(server_name); - // MPI_Barrier(MPI_COMM_WORLD); - pollfd fdarray[world]; int recv_socks[world - 1]; diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 03a526537e..145c6940e7 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -444,7 +444,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( } else { buffers[r] = ar.buffers[r]; handles[r] = ar.buffers[r]; // ar.ipc_handle[r]; - signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); + signal_pads[r] = + (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } } storeExchange.barrier(store, rank, world_size); From 2b6e84fa58519deeebfa07a798434d250c7f497d Mon Sep 17 00:00:00 2001 From: Han Chao Date: Tue, 2 Sep 2025 13:27:34 +0800 Subject: [PATCH 46/58] add barrier imple by dist allgather --- src/xccl/XPUSymmetricMemory.cpp | 45 ++++++++++++++++++++++++++++----- src/xccl/XPUSymmetricMemory.hpp | 5 ++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 145c6940e7..55366a59b0 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -213,17 +213,47 @@ void check_channel(int channel, int world_size) { void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { check_channel(channel, world_size_); + auto group = c10d::resolve_process_group(group_name_); + if (group == nullptr) { + TORCH_WARN( + "Process group '", + group_name_, + "' not found, falling back to original barrier"); + throw std::runtime_error("Process group not found"); + } + auto* xcclPg = dynamic_cast( + 
group->getBackend(c10::DeviceType::XPU).get()); + c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); c10::DeviceGuard guard(local_device); - auto stream = at::xpu::getCurrentXPUStream(); + auto barrier_input = at::tensor( + {rank_}, at::TensorOptions().dtype(torch::kInt32).device(local_device)); + std::vector> output_tensors(1); + output_tensors[0].reserve(world_size_); + for (int i = 0; i < world_size_; ++i) { + output_tensors[0].emplace_back(at::zeros_like(barrier_input)); + } - barrier_impl_xpu( - reinterpret_cast(signal_pads_dev_), - channel, - rank_, - world_size_, + std::vector input_tensors = {barrier_input}; + auto work = xcclPg->allgather(output_tensors, input_tensors); + bool success = work->wait(std::chrono::milliseconds(timeout_ms)); + TORCH_CHECK( + success, + "Barrier timeout after ", timeout_ms, - stream); + " ms for group '", + group_name_); + // c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); + // c10::DeviceGuard guard(local_device); + // auto stream = at::xpu::getCurrentXPUStream(); + + // barrier_impl_xpu( + // reinterpret_cast(signal_pads_dev_), + // channel, + // rank_, + // world_size_, + // timeout_ms, + // stream); } void XPUSymmetricMemory::put_signal( @@ -473,6 +503,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( block->device_idx, group_info.rank, group_info.world_size); + symm_mem->set_group_name(group_name_); block->symm_mems[group_name_] = symm_mem; return symm_mem; } diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index aa7f1c1660..fe0a88c4e6 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -70,6 +70,10 @@ class XPUSymmetricMemory : public SymmetricMemory { int get_rank() override; int get_world_size() override; + void set_group_name(const std::string& group_name) { + group_name_ = group_name; + } + private: std::vector> alloc_refs_; std::vector buffers_; @@ -82,6 +86,7 @@ class XPUSymmetricMemory : public 
SymmetricMemory { int world_size_; void** buffers_dev_; void** signal_pads_dev_; + std::string group_name_; }; struct Block : public c10::intrusive_ptr_target { From 1df3f9f85b2d3596d175da7dbca4f8181dcc7c9e Mon Sep 17 00:00:00 2001 From: Han Chao Date: Thu, 4 Sep 2025 14:27:23 +0800 Subject: [PATCH 47/58] rename ze_symbol --- src/xccl/XPUSymmetricMemory.cpp | 2 ++ src/xccl/{ze_exception.hpp => ze_symbol.hpp} | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) rename src/xccl/{ze_exception.hpp => ze_symbol.hpp} (99%) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 55366a59b0..e2f72fc10f 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -8,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/src/xccl/ze_exception.hpp b/src/xccl/ze_symbol.hpp similarity index 99% rename from src/xccl/ze_exception.hpp rename to src/xccl/ze_symbol.hpp index 99e4cb6e9e..20af666811 100644 --- a/src/xccl/ze_exception.hpp +++ b/src/xccl/ze_symbol.hpp @@ -156,7 +156,7 @@ inline bool load_level_zero_library() { if (ze_handle != nullptr) { return true; } - const char* lib_names[] = {"/usr/lib/x86_64-linux-gnu/libze_loader.so"}; + const char* lib_names[] = {"libze_loader.so"}; for (const char* lib_name : lib_names) { ze_handle = dlopen(lib_name, RTLD_LAZY); From 4034168e8eaf95d7b8d0434cdedbcbb335e78418 Mon Sep 17 00:00:00 2001 From: Han Chao Date: Mon, 8 Sep 2025 14:24:57 +0800 Subject: [PATCH 48/58] add allreduce_barrier --- src/xccl/XPUSymmetricMemory.cpp | 37 ++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index e2f72fc10f..253061b86f 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -228,23 +228,30 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { c10::Device 
local_device(c10::DeviceType::XPU, local_device_idx_); c10::DeviceGuard guard(local_device); - auto barrier_input = at::tensor( - {rank_}, at::TensorOptions().dtype(torch::kInt32).device(local_device)); - std::vector> output_tensors(1); - output_tensors[0].reserve(world_size_); - for (int i = 0; i < world_size_; ++i) { - output_tensors[0].emplace_back(at::zeros_like(barrier_input)); + + static thread_local at::Tensor barrier_tensor; + if (!barrier_tensor.defined() || barrier_tensor.device() != local_device) { + barrier_tensor = at::zeros( + {1}, at::TensorOptions().device(local_device).dtype(at::kFloat)); + } else { + barrier_tensor.zero_(); } - std::vector input_tensors = {barrier_input}; - auto work = xcclPg->allgather(output_tensors, input_tensors); - bool success = work->wait(std::chrono::milliseconds(timeout_ms)); - TORCH_CHECK( - success, - "Barrier timeout after ", - timeout_ms, - " ms for group '", - group_name_); + c10d::AllreduceOptions arOpts; + arOpts.asyncOp = false; + auto work = + xcclPg->allreduce_impl(barrier_tensor, "xccl:symm_mem_barrier", arOpts); + + if (work) { + bool success = work->wait(std::chrono::milliseconds(timeout_ms)); + TORCH_CHECK( + success, + "Barrier timeout after ", + timeout_ms, + " ms for group '", + group_name_, + "'"); + } // c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); // c10::DeviceGuard guard(local_device); // auto stream = at::xpu::getCurrentXPUStream(); From 1bb3f9b7d973588a5c5a5d8dd779827ed5a56651 Mon Sep 17 00:00:00 2001 From: Han Chao Date: Tue, 9 Sep 2025 15:07:58 +0800 Subject: [PATCH 49/58] correct --- src/xccl/IPCExchange.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IPCExchange.hpp index 101f424019..b80cfeb9c0 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IPCExchange.hpp @@ -8,7 +8,7 @@ #include #include #include -#include "xccl/ze_exception.hpp" +#include "xccl/ze_symbol.hpp" #include From 
34ad35f2fb13f1f6860adaa31c4bb8ea3af7bcfb Mon Sep 17 00:00:00 2001 From: Han Chao Date: Tue, 9 Sep 2025 16:07:10 +0800 Subject: [PATCH 50/58] add test case --- test/xpu/distributed/test_c10d_ops_xccl.py | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/test/xpu/distributed/test_c10d_ops_xccl.py b/test/xpu/distributed/test_c10d_ops_xccl.py index 6ab9c02c35..f9d806e689 100644 --- a/test/xpu/distributed/test_c10d_ops_xccl.py +++ b/test/xpu/distributed/test_c10d_ops_xccl.py @@ -23,6 +23,10 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__))) from test_c10d_xccl import init_multigpu_helper, requires_xccl +from torch.distributed._symmetric_memory import ( + _fused_all_gather_matmul_fallback, + _fused_matmul_reduce_scatter_fallback, +) from torch.testing._internal.common_distributed import MultiProcContinuousTest from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -921,6 +925,60 @@ def test_all_to_all_single_none(self): out.tolist(), list(zip(range(self.world_size), range(self.world_size))) ) + @requires_xccl() + def test_fused_all_gather_matmul(self) -> None: + device = self.rank_to_GPU[self.rank][0] + torch.xpu.set_device(device) + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A_shard = torch.rand(BATCH, M // self.world_size, K, device="xpu") + Bs = [torch.rand(K, N, device="xpu") for _ in range(3)] + + ag_output_0, mm_outputs_0 = _fused_all_gather_matmul_fallback( + A_shard, Bs, gather_dim=0, group_name=group.group_name + ) + ag_output_1, mm_outputs_1 = torch.ops.symm_mem.fused_all_gather_matmul( + A_shard, Bs, gather_dim=0, group_name=group.group_name + ) + + self.assertEqual(ag_output_0, ag_output_1) + self.assertEqual(ag_output_0.stride(), ag_output_1.stride()) + for mm_output_0, mm_output_1 in zip(mm_outputs_0, mm_outputs_1): + self.assertEqual(mm_output_0, mm_output_1) + self.assertEqual(mm_output_0.stride(), 
mm_output_1.stride()) + + @requires_xccl() + @parametrize("scatter_dim", [0, 1]) + def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: + device = self.rank_to_GPU[self.rank][0] + torch.xpu.set_device(device) + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A = torch.rand(BATCH, M, K, device="xpu") + B = torch.rand(K, N, device="xpu") + + output_0 = _fused_matmul_reduce_scatter_fallback( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + output_1 = torch.ops.symm_mem.fused_matmul_reduce_scatter( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + + assert torch.allclose(output_0, output_1) + assert output_0.stride() == output_1.stride() + instantiate_parametrized_tests(ProcessGroupXCCLOpTest) if __name__ == "__main__": From 7f0524a0b278b371df6a18f84222c1ef2d8b4c50 Mon Sep 17 00:00:00 2001 From: Han Chao Date: Tue, 9 Sep 2025 16:34:21 +0800 Subject: [PATCH 51/58] rm signal --- src/xccl/CMakeLists.txt | 2 - src/xccl/Signal.cpp | 201 -------------------------------- src/xccl/Signal.hpp | 114 ------------------ src/xccl/XPUSymmetricMemory.cpp | 42 +------ 4 files changed, 2 insertions(+), 357 deletions(-) delete mode 100644 src/xccl/Signal.cpp delete mode 100644 src/xccl/Signal.hpp diff --git a/src/xccl/CMakeLists.txt b/src/xccl/CMakeLists.txt index dd04a7dfa4..74ece226cc 100644 --- a/src/xccl/CMakeLists.txt +++ b/src/xccl/CMakeLists.txt @@ -3,11 +3,9 @@ file(GLOB xccl_h "*.hpp") file(GLOB xccl_cpp "*.cpp") list(REMOVE_ITEM xccl_cpp "${CMAKE_CURRENT_SOURCE_DIR}/NanCheck_XPU.cpp") -list(REMOVE_ITEM xccl_cpp "${CMAKE_CURRENT_SOURCE_DIR}/Signal.cpp") list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp}) list(APPEND ATen_XPU_SYCL_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/NanCheck_XPU.cpp") -list(APPEND ATen_XPU_SYCL_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/Signal.cpp") set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS 
${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) diff --git a/src/xccl/Signal.cpp b/src/xccl/Signal.cpp deleted file mode 100644 index 451e9c4fc4..0000000000 --- a/src/xccl/Signal.cpp +++ /dev/null @@ -1,201 +0,0 @@ -#include -#include -#include -#include - -namespace c10d::symmetric_memory { - -struct barrierKernel { - void operator()(sycl::nd_item<1> item) const { - auto thread_id = item.get_local_id(0); - - if (thread_id < world_size) { - auto target_rank = thread_id; - if (target_rank == rank) { - return; - } - auto put_success = try_put_signal_device( - signal_pads[target_rank] + world_size * channel + rank, 10000000); - if (!put_success) { - assert(0); - } - - auto wait_success = try_wait_signal_device( - signal_pads[rank] + world_size * channel + target_rank, 10000000); - if (!wait_success) { - assert(0); - } - } - } - - barrierKernel( - uint32_t** signal_pads, - int channel, - int rank, - int world_size, - size_t timeout_ms) - : signal_pads(signal_pads), - channel(channel), - rank(rank), - world_size(world_size), - timeout_ms(timeout_ms) {} - - private: - uint32_t** signal_pads; - int channel; - int rank; - int world_size; - size_t timeout_ms; -}; - -void barrier_impl_xpu( - uint32_t** signal_pads, - int channel, - int rank, - int world_size, - size_t timeout_ms, - at::xpu::XPUStream& stream) { - int64_t maxNumThreadsPerBlock = syclMaxWorkGroupSize(); - const size_t numThreadsPerBlock = - std::min(maxNumThreadsPerBlock, std::max(32, world_size)); - - if (!(numThreadsPerBlock > 0)) { - return; - } - int64_t numBlocks = 1; - auto global_range = numBlocks * numThreadsPerBlock; - auto local_range = numThreadsPerBlock; - - using Kernel = barrierKernel; - auto kfn = Kernel(signal_pads, channel, rank, world_size, timeout_ms); - - sycl_kernel_submit(global_range, local_range, stream.queue(), kfn); -} - -struct putSignalKernel { - void operator()(sycl::nd_item<1> item) const { - auto thread_id = item.get_local_id(0); - - if (thread_id == 0) { - auto put_success = 
try_put_signal_device( - signal_pads[dst_rank] + world_size * channel + rank, 10000000); - if (!put_success) { - assert(0); - } - } - } - - putSignalKernel( - uint32_t** signal_pads, - int dst_rank, - int channel, - int rank, - int world_size, - size_t timeout_ms) - : signal_pads(signal_pads), - dst_rank(dst_rank), - channel(channel), - rank(rank), - world_size(world_size), - timeout_ms(timeout_ms) {} - - private: - uint32_t** signal_pads; - int dst_rank; - int channel; - int rank; - int world_size; - size_t timeout_ms; -}; - -void put_signal_impl_xpu( - uint32_t** signal_pads, - int dst_rank, - int channel, - int rank, - int world_size, - size_t timeout_ms, - at::xpu::XPUStream& stream) { - int64_t maxNumThreadsPerBlock = syclMaxWorkGroupSize(); - const size_t numThreadsPerBlock = std::min(maxNumThreadsPerBlock, 32); - - if (!(numThreadsPerBlock > 0)) { - return; - } - - int64_t numBlocks = 1; - auto global_range = numBlocks * numThreadsPerBlock; - auto local_range = numThreadsPerBlock; - - using Kernel = putSignalKernel; - auto kfn = - Kernel(signal_pads, dst_rank, channel, rank, world_size, timeout_ms); - - sycl_kernel_submit(global_range, local_range, stream.queue(), kfn); -} - -struct waitSignalKernel { - void operator()(sycl::nd_item<1> item) const { - auto thread_id = item.get_local_id(0); - - if (thread_id == 0) { - auto wait_success = try_wait_signal_device( - signal_pads[rank] + world_size * channel + src_rank, 10000000); - if (!wait_success) { - assert(0); - } - - sycl::atomic_fence(sycl::memory_order_seq_cst, sycl::memory_scope_system); - } - } - - waitSignalKernel( - uint32_t** signal_pads, - int src_rank, - int channel, - int rank, - int world_size, - size_t timeout_ms) - : signal_pads(signal_pads), - src_rank(src_rank), - channel(channel), - rank(rank), - world_size(world_size), - timeout_ms(timeout_ms) {} - - private: - uint32_t** signal_pads; - int src_rank; - int channel; - int rank; - int world_size; - size_t timeout_ms; -}; - -void 
wait_signal_impl_xpu( - uint32_t** signal_pads, - int src_rank, - int channel, - int rank, - int world_size, - size_t timeout_ms, - at::xpu::XPUStream& stream) { - int64_t maxNumThreadsPerBlock = syclMaxWorkGroupSize(); - const size_t numThreadsPerBlock = std::min(maxNumThreadsPerBlock, 32); - - if (!(numThreadsPerBlock > 0)) { - return; - } - - int64_t numBlocks = 1; - auto global_range = numBlocks * numThreadsPerBlock; - auto local_range = numThreadsPerBlock; - - using Kernel = waitSignalKernel; - auto kfn = - Kernel(signal_pads, src_rank, channel, rank, world_size, timeout_ms); - - sycl_kernel_submit(global_range, local_range, stream.queue(), kfn); -} - -} // namespace c10d::symmetric_memory diff --git a/src/xccl/Signal.hpp b/src/xccl/Signal.hpp deleted file mode 100644 index 876824860a..0000000000 --- a/src/xccl/Signal.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once - -#include - -#include -#include - -namespace c10d::symmetric_memory { - -using at::native::memory::get_alignment; - -template -uint32_t cas(uint32_t* addr, uint32_t compare, uint32_t val) { - sycl::atomic_ref< - uint32_t, - sycl::memory_order::acq_rel, - sycl::memory_scope::system> - ref(*addr); - ref.compare_exchange_strong(compare, val); - return compare; -} - -inline size_t global_timer_ns() { - auto now = std::chrono::high_resolution_clock::now(); - return std::chrono::duration_cast( - now.time_since_epoch()) - .count(); -} - -constexpr size_t ns_per_ms = 1e6; - -// Device-compatible version using a simple counter approach -template -bool try_put_signal_device(uint32_t* addr, size_t max_iterations = 1000) { - size_t iterations = 0; - while (cas(addr, 0, 1) != 0) { - if (max_iterations != 0 && iterations++ > max_iterations) { - return false; - } - } - return true; -} - -template -bool try_wait_signal_device(uint32_t* addr, size_t max_iterations = 1000) { - size_t iterations = 0; - while (cas(addr, 1, 0) != 1) { - if (max_iterations != 0 && iterations++ > max_iterations) { - return false; - } 
- } - return true; -} - -template -bool try_put_signal(uint32_t* addr, size_t timeout_ms) { - size_t deadline = global_timer_ns() + timeout_ms * ns_per_ms; - while (cas(addr, 0, 1) != 0) { - if (timeout_ms != 0 && global_timer_ns() > deadline) { - return false; - } - } - return true; -} - -template -bool try_wait_signal(uint32_t* addr, size_t timeout_ms) { - size_t deadline = global_timer_ns() + timeout_ms * ns_per_ms; - while (cas(addr, 1, 0) != 1) { - if (timeout_ms != 0 && global_timer_ns() > deadline) { - return false; - } - } - return true; -} - -template -void put_signal(uint32_t* addr) { - while (cas(addr, 0, 1) != 0) - ; -} - -template -void wait_signal(uint32_t* addr) { - while (cas(addr, 1, 0) != 1) - ; -} - -void barrier_impl_xpu( - uint32_t** signal_pads, - int channel, - int rank, - int world_size, - size_t timeout_ms, - at::xpu::XPUStream& stream); - -void put_signal_impl_xpu( - uint32_t** signal_pads, - int dst_rank, - int channel, - int rank, - int world_size, - size_t timeout_ms, - at::xpu::XPUStream& stream); - -void wait_signal_impl_xpu( - uint32_t** signal_pads, - int src_rank, - int channel, - int rank, - int world_size, - size_t timeout_ms, - at::xpu::XPUStream& stream); -} // namespace c10d::symmetric_memory diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 253061b86f..4510609e45 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -252,57 +251,20 @@ void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { group_name_, "'"); } - // c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); - // c10::DeviceGuard guard(local_device); - // auto stream = at::xpu::getCurrentXPUStream(); - - // barrier_impl_xpu( - // reinterpret_cast(signal_pads_dev_), - // channel, - // rank_, - // world_size_, - // timeout_ms, - // stream); } void XPUSymmetricMemory::put_signal( int dst_rank, int channel, size_t 
timeout_ms) { - check_channel(channel, world_size_); - - c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); - c10::DeviceGuard guard(local_device); - auto stream = at::xpu::getCurrentXPUStream(); - - put_signal_impl_xpu( - reinterpret_cast(signal_pads_dev_), - dst_rank, - channel, - rank_, - world_size_, - timeout_ms, - stream); + LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; } void XPUSymmetricMemory::wait_signal( int src_rank, int channel, size_t timeout_ms) { - check_channel(channel, world_size_); - - c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); - c10::DeviceGuard guard(local_device); - auto stream = at::xpu::getCurrentXPUStream(); - - wait_signal_impl_xpu( - reinterpret_cast(signal_pads_dev_), - src_rank, - channel, - rank_, - world_size_, - timeout_ms, - stream); + LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; } int XPUSymmetricMemory::get_rank() { From 3b3ff27508cad5ae162c13ef01da40235e595a88 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Tue, 9 Sep 2025 17:23:58 +0800 Subject: [PATCH 52/58] code refine --- src/xccl/{IPCExchange.hpp => IpcExchange.hpp} | 58 +----- src/xccl/XPUSymmetricMemory.cpp | 83 ++------- src/xccl/XPUSymmetricMemoryUtils.cpp | 174 +----------------- src/xccl/XPUSymmetricMemoryUtils.hpp | 26 --- 4 files changed, 23 insertions(+), 318 deletions(-) rename src/xccl/{IPCExchange.hpp => IpcExchange.hpp} (86%) diff --git a/src/xccl/IPCExchange.hpp b/src/xccl/IpcExchange.hpp similarity index 86% rename from src/xccl/IPCExchange.hpp rename to src/xccl/IpcExchange.hpp index b80cfeb9c0..84589b2c05 100644 --- a/src/xccl/IPCExchange.hpp +++ b/src/xccl/IpcExchange.hpp @@ -15,10 +15,8 @@ #include #include #include -#include // for std::chrono::milliseconds -#include // for std::this_thread::sleep_for - -#define ELE_COUNT 128 +#include +#include struct exchange_contents { // first 4-byte is file descriptor for drmbuf or gem object @@ -289,42 +287,11 @@ void un_allgather( recv_buf[rank] 
= *send_buf; } -template < - typename data_type, - uint32_t max_rank = 8, - uint32_t max_buffer = 1024 /*KB*/> -class allreducer { +class IpcChannel { public: - allreducer() { + IpcChannel() { initialized = false; - size_per_buffer = 0; - buffer_index = 0; - } - allreducer(const allreducer&) = delete; - allreducer& operator=(const allreducer&) = delete; - allreducer(allreducer&& other) noexcept { - *this = std::move(other); - } - allreducer& operator=(allreducer&& other) noexcept { - if (this != &other) { - initialized = other.initialized; - rank = other.rank; - world = other.world; - std::memcpy(buffers, other.buffers, sizeof(buffers)); - std::memcpy(offsets, other.offsets, sizeof(offsets)); - std::memcpy(ipc_handle, other.ipc_handle, sizeof(ipc_handle)); - - other.initialized = false; - } - return *this; - } - ~allreducer() { - if (initialized) { - std::cerr << "Warning: allreducer destroyed without calling release()" - << std::endl; - } } - void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { if (initialized) return; @@ -343,7 +310,6 @@ class allreducer { world = tmp_world; initialized = true; } - void allreduce(sycl::queue& queue, void* inout_buffer, uint32_t size) {} void release(sycl::queue& queue) { if (!initialized) return; @@ -364,22 +330,9 @@ class allreducer { initialized = false; } - void debug_print_buffer(sycl::queue& queue, int* address, int count) { - auto host_ptr = (int*)sycl::malloc_host(count * sizeof(int), queue); - auto tmp_ptr = (int*)sycl::malloc_device(count * sizeof(int), queue); - - queue.memcpy(tmp_ptr, address, count * sizeof(int)); - queue.memcpy(host_ptr, tmp_ptr, count * sizeof(int)); - - queue.wait(); - - for (int i = 0; i < count; i++) { - std::cout << host_ptr[i] << " "; - } - std::cout << std::endl; - } // buffer_size as element size void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) { + if (!initialize) init(); if (!load_level_zero_library()) { throw std::runtime_error("Level Zero not available"); } @@ 
-430,6 +383,7 @@ class allreducer { } bool initialized; + uint32_t max_rank = 16, void* buffers[max_rank]; void* sync_buffer[max_rank]; size_t offsets[max_rank]; diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 4510609e45..7233c11f1f 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -13,11 +13,6 @@ #include #include -// todo: fixed with kernel barrier -#include - -#define MAX_RANK 8 - namespace c10d { namespace symmetric_memory { @@ -40,6 +35,9 @@ AllocationRef::~AllocationRef() { c10::Device local_device(c10::DeviceType::XPU, device_idx); c10::DeviceGuard guard(local_device); c10::xpu::syncStreamsOnDevice(); + // todo: free this buffer if no reference + auto stream = at::xpu::getCurrentXPUStream(); + sycl::free(ptr, stream); } XPUSymmetricMemory::XPUSymmetricMemory( @@ -101,11 +99,11 @@ size_t XPUSymmetricMemory::get_signal_pad_size() { } bool XPUSymmetricMemory::has_multicast_support() { - return mc_addr_ != nullptr; + return false; } void* XPUSymmetricMemory::get_multicast_ptr() { - return mc_addr_; + return nullptr; } void XPUSymmetricMemory::copy_buffer( @@ -155,43 +153,7 @@ at::Tensor XPUSymmetricMemory::get_signal_pad( c10::IntArrayRef sizes, std::optional dtype, int64_t storage_offset) { - // If the dtype is unspecified, default it to UInt32, as it - // is the most common type for signaling purposes. - if (!dtype.has_value()) { - dtype = c10::ScalarType::UInt32; - } - - // If the shape is unspecified, treat the signal pad as a 1d tensor. 
- const auto element_size = c10::elementSize(*dtype); - std::vector shape; - if (sizes.size() != 0) { - shape = sizes.vec(); - } else { - shape.push_back(signal_pad_size / element_size); - } - - const size_t numel = std::accumulate( - shape.begin(), - - shape.end(), - static_cast(1), - std::multiplies()); - const auto req_size = (numel + storage_offset) * element_size; - TORCH_CHECK( - req_size <= signal_pad_size, - "XPUSymmetricMemory::get_signal_pad: the requested size (", - req_size, - " bytes) exceeds the allocated size (", - signal_pad_size, - " bytes)"); - auto data_ptr = reinterpret_cast(signal_pads_[rank]) + - storage_offset * element_size; - auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); - auto options = at::TensorOptions().dtype(*dtype).device(device); - return at::for_blob(data_ptr, shape) - .options(options) - .target_device(device) - .make_tensor(); + LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; } void check_channel(int channel, int world_size) { @@ -214,12 +176,13 @@ void check_channel(int channel, int world_size) { void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { check_channel(channel, world_size_); + // Currently, we leverage oneCCL for barrier. Later, we may move to SYCL implementation. 
auto group = c10d::resolve_process_group(group_name_); if (group == nullptr) { TORCH_WARN( "Process group '", group_name_, - "' not found, falling back to original barrier"); + "' not found, please init process group first before calling SymmetricMemory"); throw std::runtime_error("Process group not found"); } auto* xcclPg = dynamic_cast( @@ -337,7 +300,6 @@ struct RendezvousRequest { size_t buffer_size; size_t signal_pad_offset; bool has_multicast_support; - size_t base_offset; }; void validate_rendezvous_requests( @@ -350,14 +312,6 @@ void validate_rendezvous_requests( for (auto req : reqs) { device_indices.insert(req.device_idx); } - if (!allow_overlapping_devices() && - device_indices.size() < (size_t)world_size) { - TORCH_CHECK( - false, - "XPUSymmetricMemoryAllocator::rendezvous: ", - "detected allocations from overlapping devices ", - "from different ranks."); - } for (int r = 1; r < world_size; ++r) { TORCH_CHECK(reqs[r].block_size == reqs[0].block_size); @@ -399,28 +353,22 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( c10::Device local_device(c10::DeviceType::XPU, block->device_idx); c10::DeviceGuard guard(local_device); - // Currently, IpcChannel is using a file based socket for inter-process - // communication + // IpcChannel is used to do inter-process communication IpcChannel ipc_channel; auto group_info = get_group_info(group_name_); auto store = group_info.store; int rank = group_info.rank; int world_size = group_info.world_size; - - // Step 6: Open IPC handle of remote peer sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - allreducer ar; - ar.init(current_queue, rank, world_size); - auto local_req = RendezvousRequest{ .device_idx = block->device_idx, .pid = getpid(), .block_size = block->block_size, .buffer_size = block->buffer_size, .signal_pad_offset = block->signal_pad_offset, - .has_multicast_support = false, - .base_offset = 0}; + .has_multicast_support = false + }; auto reqs = 
storeExchange.all_gather(store, rank, world_size, local_req); validate_rendezvous_requests(reqs, world_size); @@ -430,8 +378,9 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( } // do IPC exchange for all peer ranks - ar.exchange_peer_ipc_mem(current_queue, ptr); + ipc_channel.exchange_peer_ipc_mem(current_queue, ptr); + // no physical memory handle, so handles and buffers are both for virtual address std::vector handles(world_size); std::vector buffers(world_size, nullptr); std::vector signal_pads(world_size, nullptr); @@ -443,8 +392,8 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); continue; } else { - buffers[r] = ar.buffers[r]; - handles[r] = ar.buffers[r]; // ar.ipc_handle[r]; + buffers[r] = ipc_channel.buffers[r]; + handles[r] = ipc_channel.buffers[r]; signal_pads[r] = (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); } diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp index 39c07399b8..7130fe7b6a 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.cpp +++ b/src/xccl/XPUSymmetricMemoryUtils.cpp @@ -29,179 +29,7 @@ bool device_has_multicast_support(int device_idx) { } bool allow_overlapping_devices() { - return c10::utils::check_env("TORCH_SYMM_MEM_ALLOW_OVERLAPPING_DEVICES") == - true; -} - -IpcChannel::IpcChannel() - : socket_name_(get_socket_name(getpid())), - socket_(socket(AF_UNIX, SOCK_DGRAM, 0)) { - // On success, a file descriptor for the new socket is returned. - // On error, -1 is returned, and errno is set to indicate the error. 
- TORCH_CHECK( - socket_ != -1, "Failed to create socket: ", c10::utils::str_error(errno)); - - struct sockaddr_un addr = {.sun_family = AF_UNIX}; - std::copy(socket_name_.begin(), socket_name_.end(), addr.sun_path); - - TORCH_CHECK( - bind(socket_, (struct sockaddr*)&addr, SUN_LEN(&addr)) == 0, - "Failed to bind socket: ", - c10::utils::str_error(errno)); -} - -IpcChannel::~IpcChannel() { - close(socket_); - unlink(socket_name_.c_str()); -} - -void IpcChannel::send_fd(int dst_pid, int fd) { - // Because file descriptors are process-local kernel objects, and we can’t - // pass them via normal socket payloads (like write() or send()). Unix domain - // sockets provide a mechanism to pass actual FDs via sendmsg()/recvmsg(). - // Define destination socket address - struct sockaddr_un addr = {.sun_family = AF_UNIX}; - auto socket_name = get_socket_name(dst_pid); - std::copy(socket_name.begin(), socket_name.end(), addr.sun_path); - - // Prepare data to send - // Data being sent is "fd", the value of fd will be sent as auxiliary data - // (control message) - struct iovec io = {.iov_base = (void*)("fd"), .iov_len = 2}; - - // Prepare control message data buffer and zero it out - // NOLINTNEXTLINE(*array*) - char cbuf[CMSG_SPACE(sizeof(int))]; - memset(cbuf, 0, sizeof(cbuf)); - - // Create message header - struct msghdr msg { - // destination socket address and size of it - // message content in msg_iov and number of such structs (1 in our case) - // auxiliary data with the value of fd and size of it - .msg_name = (void*)&addr, .msg_namelen = sizeof(struct sockaddr_un), - .msg_iov = &io, .msg_iovlen = 1, .msg_control = cbuf, - .msg_controllen = sizeof(cbuf) - }; - - // This points to the first control message header - // With SCM_RIGHTS we let the kernel know that we are passing file - // descriptors. 
- auto cmsg = CMSG_FIRSTHDR(&msg); - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); - // Specify socket level message - cmsg->cmsg_level = SOL_SOCKET; - // SCM_RIGHTS is the type used to pass file descriptors - cmsg->cmsg_type = SCM_RIGHTS; - - if (fd != -1) { - std::copy( - reinterpret_cast(&fd), - reinterpret_cast(&fd) + sizeof(fd), - reinterpret_cast(CMSG_DATA(cmsg))); - } else { - msg.msg_controllen = 0; - } - - // Finally send the the message - TORCH_CHECK( - sendmsg(socket_, &msg, 0) > 0, - "Failed to send fd: ", - c10::utils::str_error(errno)); -} - -int IpcChannel::recv_fd() { - // Prepare buffer for regular message "fd" - // NOLINTNEXTLINE(*array*) - char buf[2]; - memset(&buf, 0, sizeof(buf)); - struct iovec io = {.iov_base = (void*)buf, .iov_len = sizeof(buf)}; - - // Prepare buffer for control message and zero it out - // NOLINTNEXTLINE(*array*) - char cbuf[CMSG_SPACE(sizeof(int))]; - memset(cbuf, 0, sizeof(cbuf)); - - // Define socket address to receive on: family AF_UNIX means unix domain - // socket - struct sockaddr_un addr = {.sun_family = AF_UNIX}; - std::copy(socket_name_.begin(), socket_name_.end(), addr.sun_path); - - // Prepare message header - struct msghdr msg = { - .msg_name = (void*)&addr, - .msg_namelen = sizeof(struct sockaddr_un), - .msg_iov = &io, - .msg_iovlen = 1, - .msg_control = cbuf, - .msg_controllen = sizeof(cbuf)}; - - // Recieve message on socket_ - TORCH_CHECK( - recvmsg(socket_, &msg, 0) > 0, - "Failed to receive fd: ", - c10::utils::str_error(errno)); - - if (msg.msg_controllen == 0) { - return -1; - } - - // Extract control message and validate its content - auto cmsg = CMSG_FIRSTHDR(&msg); - TORCH_CHECK(cmsg != nullptr); - TORCH_CHECK(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); - TORCH_CHECK(cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS); - return *reinterpret_cast(CMSG_DATA(cmsg)); -} - -std::vector IpcChannel::all_gather_fds( - int rank, - const std::vector& pids, - int fd) { - int world_size = 
(int)pids.size(); - std::vector fds(pids.size()); - fds[rank] = fd; - - int dst_rank = (rank + 1) % world_size; - for (int step = 1; step < world_size; ++step) { - int src_rank = (rank + world_size - step) % world_size; - send_fd(pids[dst_rank], fd); - fd = recv_fd(); - fds[src_rank] = fd; - } - return fds; -} - -int IpcChannel::broadcast_fds( - int rank, - int src_rank, - const std::vector& pids, - int fd) { - int world_size = (int)pids.size(); - - if (rank == src_rank) { - for (int dst_rank = 0; dst_rank < (int)world_size; ++dst_rank) { - if (dst_rank == rank) { - continue; - } - send_fd(pids[dst_rank], fd); - } - return fd; - } - return recv_fd(); -} - -std::string IpcChannel::get_socket_name(int pid) { - const char* tmp_dir = "/tmp"; - for (const char* env_var : {"TMPDIR", "TMP", "TEMP", "TEMPDIR"}) { - if (const char* path = getenv(env_var)) { - tmp_dir = path; - break; - } - } - std::ostringstream oss; - oss << tmp_dir << "/symm_mem-" << pid; - return oss.str(); + return false; } void map_block( diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp index f119928f41..530c511071 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.hpp +++ b/src/xccl/XPUSymmetricMemoryUtils.hpp @@ -12,32 +12,6 @@ bool device_has_multicast_support(int device_idx); bool allow_overlapping_devices(); -class IpcChannel { - public: - IpcChannel(); - ~IpcChannel(); - - void send_fd(int dst_pid, int fd); - int recv_fd(); - - std::vector all_gather_fds( - int rank, - const std::vector& pids, - int fd); - - int broadcast_fds( - int rank, - int src_rank, - const std::vector& pids, - int fd); - - private: - static std::string get_socket_name(int pid); - - std::string socket_name_; - int socket_; -}; - // A set of store-based exchange methods with a preset prefix typically type of // the SymmetricMemory. Most used as static instances at respective // SymmetricMemory implementation files. 
From 0477394a557821de8ed302dd7ec93df85d8b5259 Mon Sep 17 00:00:00 2001 From: Han Chao Date: Wed, 10 Sep 2025 11:29:12 +0800 Subject: [PATCH 53/58] refine --- src/xccl/IpcExchange.hpp | 11 +++++--- src/xccl/XPUSymmetricMemory.cpp | 48 ++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/src/xccl/IpcExchange.hpp b/src/xccl/IpcExchange.hpp index 84589b2c05..e515cd6ce0 100644 --- a/src/xccl/IpcExchange.hpp +++ b/src/xccl/IpcExchange.hpp @@ -331,8 +331,13 @@ class IpcChannel { } // buffer_size as element size - void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr) { - if (!initialize) init(); + void exchange_peer_ipc_mem( + sycl::queue& queue, + void* ptr, + uint32_t rank_in, + uint32_t world_in) { + if (!initialized) + init(queue, rank_in, world_in); if (!load_level_zero_library()) { throw std::runtime_error("Level Zero not available"); } @@ -383,7 +388,7 @@ class IpcChannel { } bool initialized; - uint32_t max_rank = 16, + static constexpr uint32_t max_rank = 16; void* buffers[max_rank]; void* sync_buffer[max_rank]; size_t offsets[max_rank]; diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 7233c11f1f..a4757791e3 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -153,7 +153,42 @@ at::Tensor XPUSymmetricMemory::get_signal_pad( c10::IntArrayRef sizes, std::optional dtype, int64_t storage_offset) { - LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; + // If the dtype is unspecified, default it to UInt32, as it + // is the most common type for signaling purposes. + if (!dtype.has_value()) { + dtype = c10::ScalarType::UInt32; + } + + // If the shape is unspecified, treat the signal pad as a 1d tensor. 
+ const auto element_size = c10::elementSize(*dtype); + std::vector shape; + if (sizes.size() != 0) { + shape = sizes.vec(); + } else { + shape.push_back(signal_pad_size / element_size); + } + + const size_t numel = std::accumulate( + shape.begin(), + shape.end(), + static_cast(1), + std::multiplies()); + const auto req_size = (numel + storage_offset) * element_size; + TORCH_CHECK( + req_size <= signal_pad_size, + "XPUSymmetricMemory::get_signal_pad: the requested size (", + req_size, + " bytes) exceeds the allocated size (", + signal_pad_size, + " bytes)"); + auto data_ptr = reinterpret_cast(signal_pads_[rank]) + + storage_offset * element_size; + auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); + auto options = at::TensorOptions().dtype(*dtype).device(device); + return at::for_blob(data_ptr, shape) + .options(options) + .target_device(device) + .make_tensor(); } void check_channel(int channel, int world_size) { @@ -176,7 +211,8 @@ void check_channel(int channel, int world_size) { void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { check_channel(channel, world_size_); - // Currently, we leverage oneCCL for barrier. Later, we may move to SYCL implementation. + // Currently, we leverage oneCCL for barrier. Later, we may move to SYCL + // implementation. 
auto group = c10d::resolve_process_group(group_name_); if (group == nullptr) { TORCH_WARN( @@ -367,8 +403,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( .block_size = block->block_size, .buffer_size = block->buffer_size, .signal_pad_offset = block->signal_pad_offset, - .has_multicast_support = false - }; + .has_multicast_support = false}; auto reqs = storeExchange.all_gather(store, rank, world_size, local_req); validate_rendezvous_requests(reqs, world_size); @@ -378,9 +413,10 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( } // do IPC exchange for all peer ranks - ipc_channel.exchange_peer_ipc_mem(current_queue, ptr); + ipc_channel.exchange_peer_ipc_mem(current_queue, ptr, rank, world_size); - // no physical memory handle, so handles and buffers are both for virtual address + // no physical memory handle, so handles and buffers are both for virtual + // address std::vector handles(world_size); std::vector buffers(world_size, nullptr); std::vector signal_pads(world_size, nullptr); From 7788d4d9466f67a1a78a334e90ef4080b418d276 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Wed, 10 Sep 2025 14:20:28 +0800 Subject: [PATCH 54/58] remove level-zero header if it's no needed --- src/xccl/XPUSymmetricMemory.cpp | 13 ------------- src/xccl/XPUSymmetricMemoryTypes.hpp | 2 -- src/xccl/XPUSymmetricMemoryUtils.hpp | 4 +++- 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index a4757791e3..f9756ec303 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -35,7 +35,6 @@ AllocationRef::~AllocationRef() { c10::Device local_device(c10::DeviceType::XPU, device_idx); c10::DeviceGuard guard(local_device); c10::xpu::syncStreamsOnDevice(); - // todo: free this buffer if no reference auto stream = at::xpu::getCurrentXPUStream(); sycl::free(ptr, stream); } @@ -106,18 +105,6 @@ void* XPUSymmetricMemory::get_multicast_ptr() { return nullptr; } -void 
XPUSymmetricMemory::copy_buffer( - at::Tensor src, - at::Tensor dst, - size_t size) { - sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); - auto src_ptr = src.data_ptr(); - auto dst_ptr = dst.data_ptr(); - - size_t copy_size = size * c10::elementSize(src.scalar_type()); - - current_queue.memcpy(dst_ptr, src_ptr, copy_size); -} at::Tensor XPUSymmetricMemory::get_buffer( int rank, c10::IntArrayRef sizes, diff --git a/src/xccl/XPUSymmetricMemoryTypes.hpp b/src/xccl/XPUSymmetricMemoryTypes.hpp index 133abd2712..4cab3b81f7 100644 --- a/src/xccl/XPUSymmetricMemoryTypes.hpp +++ b/src/xccl/XPUSymmetricMemoryTypes.hpp @@ -1,7 +1,5 @@ #pragma once -#include - namespace c10d::symmetric_memory { constexpr size_t signal_pad_size = 2048; diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp index 530c511071..69189f45cf 100644 --- a/src/xccl/XPUSymmetricMemoryUtils.hpp +++ b/src/xccl/XPUSymmetricMemoryUtils.hpp @@ -75,8 +75,10 @@ class StoreExchange { size_t seq_id_ = 0; }; -// Teturns a pointer of virtual address that is mapped to the physical memory +// Returns a pointer of virtual address that is mapped to the physical memory // held by the handle. +// todo: will follow such physical memory handle map with virtual address, +// when L0 provides physical handle exchange API and we have multicast support. 
void map_block( void** ptr, ze_physical_mem_handle_t handle, From 62bb559af8c5dafae992d4580052d9573c841a63 Mon Sep 17 00:00:00 2001 From: Han Chao Date: Thu, 11 Sep 2025 15:01:20 +0800 Subject: [PATCH 55/58] fix and add test --- src/xccl/XPUSymmetricMemory.cpp | 2 - test/xpu/distributed/test_c10d_ops_xccl.py | 58 ------------- .../distributed/test_symmetric_memory_xccl.py | 85 +++++++++++++++++++ test/xpu/run_distributed.py | 4 + 4 files changed, 89 insertions(+), 60 deletions(-) create mode 100644 test/xpu/distributed/test_symmetric_memory_xccl.py diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index f9756ec303..013bb03b72 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -35,8 +35,6 @@ AllocationRef::~AllocationRef() { c10::Device local_device(c10::DeviceType::XPU, device_idx); c10::DeviceGuard guard(local_device); c10::xpu::syncStreamsOnDevice(); - auto stream = at::xpu::getCurrentXPUStream(); - sycl::free(ptr, stream); } XPUSymmetricMemory::XPUSymmetricMemory( diff --git a/test/xpu/distributed/test_c10d_ops_xccl.py b/test/xpu/distributed/test_c10d_ops_xccl.py index f9d806e689..6ab9c02c35 100644 --- a/test/xpu/distributed/test_c10d_ops_xccl.py +++ b/test/xpu/distributed/test_c10d_ops_xccl.py @@ -23,10 +23,6 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__))) from test_c10d_xccl import init_multigpu_helper, requires_xccl -from torch.distributed._symmetric_memory import ( - _fused_all_gather_matmul_fallback, - _fused_matmul_reduce_scatter_fallback, -) from torch.testing._internal.common_distributed import MultiProcContinuousTest from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -925,60 +921,6 @@ def test_all_to_all_single_none(self): out.tolist(), list(zip(range(self.world_size), range(self.world_size))) ) - @requires_xccl() - def test_fused_all_gather_matmul(self) -> None: - device = self.rank_to_GPU[self.rank][0] - torch.xpu.set_device(device) - 
BATCH = 8 - M = 64 - N = 16 - K = 32 - group = dist.group.WORLD - rank = self.rank - - torch.manual_seed(42 + rank) - A_shard = torch.rand(BATCH, M // self.world_size, K, device="xpu") - Bs = [torch.rand(K, N, device="xpu") for _ in range(3)] - - ag_output_0, mm_outputs_0 = _fused_all_gather_matmul_fallback( - A_shard, Bs, gather_dim=0, group_name=group.group_name - ) - ag_output_1, mm_outputs_1 = torch.ops.symm_mem.fused_all_gather_matmul( - A_shard, Bs, gather_dim=0, group_name=group.group_name - ) - - self.assertEqual(ag_output_0, ag_output_1) - self.assertEqual(ag_output_0.stride(), ag_output_1.stride()) - for mm_output_0, mm_output_1 in zip(mm_outputs_0, mm_outputs_1): - self.assertEqual(mm_output_0, mm_output_1) - self.assertEqual(mm_output_0.stride(), mm_output_1.stride()) - - @requires_xccl() - @parametrize("scatter_dim", [0, 1]) - def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: - device = self.rank_to_GPU[self.rank][0] - torch.xpu.set_device(device) - BATCH = 8 - M = 64 - N = 16 - K = 32 - group = dist.group.WORLD - rank = self.rank - - torch.manual_seed(42 + rank) - A = torch.rand(BATCH, M, K, device="xpu") - B = torch.rand(K, N, device="xpu") - - output_0 = _fused_matmul_reduce_scatter_fallback( - A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name - ) - output_1 = torch.ops.symm_mem.fused_matmul_reduce_scatter( - A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name - ) - - assert torch.allclose(output_0, output_1) - assert output_0.stride() == output_1.stride() - instantiate_parametrized_tests(ProcessGroupXCCLOpTest) if __name__ == "__main__": diff --git a/test/xpu/distributed/test_symmetric_memory_xccl.py b/test/xpu/distributed/test_symmetric_memory_xccl.py new file mode 100644 index 0000000000..37f5d3e6da --- /dev/null +++ b/test/xpu/distributed/test_symmetric_memory_xccl.py @@ -0,0 +1,85 @@ +import torch +import torch.distributed as dist +from test_c10d_xccl import init_multigpu_helper, requires_xccl 
+from torch.distributed._symmetric_memory import ( + _fused_all_gather_matmul_fallback, + _fused_matmul_reduce_scatter_fallback, +) + +from torch.testing._internal.common_distributed import MultiProcContinuousTest +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests +) + +@instantiate_parametrized_tests +class AsyncTPTest(MultiProcContinuousTest): + @property + def device(self) -> torch.device: + return torch.device("xpu", self.rank) + + def _init_process(self): + torch.xpu.set_device(self.device) + torch.manual_seed(42 + self.rank) + torch.use_deterministic_algorithms(True) + torch.set_deterministic_debug_mode("warn") + torch.utils.deterministic.fill_uninitialized_memory = True + + @requires_xccl() + @parametrize("gather_dim", [0, 1]) + def test_fused_all_gather_matmul(self, gather_dim: int) -> None: + self._init_process() + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A_shard = torch.rand(BATCH, M // self.world_size, K, device="xpu") + Bs = [torch.rand(K, N, device="xpu") for _ in range(3)] + + ag_output_0, mm_outputs_0 = _fused_all_gather_matmul_fallback( + A_shard, Bs, gather_dim=gather_dim, group_name=group.group_name + ) + ag_output_1, mm_outputs_1 = torch.ops.symm_mem.fused_all_gather_matmul( + A_shard, Bs, gather_dim=gather_dim, group_name=group.group_name + ) + + self.assertEqual(ag_output_0, ag_output_1) + self.assertEqual(ag_output_0.stride(), ag_output_1.stride()) + for mm_output_0, mm_output_1 in zip(mm_outputs_0, mm_outputs_1): + self.assertEqual(mm_output_0, mm_output_1) + self.assertEqual(mm_output_0.stride(), mm_output_1.stride()) + + @requires_xccl() + @parametrize("scatter_dim", [0, 1]) + def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: + self._init_process() + + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A = 
torch.rand(BATCH, M, K, device="xpu") + B = torch.rand(K, N, device="xpu") + + output_0 = _fused_matmul_reduce_scatter_fallback( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + output_1 = torch.ops.symm_mem.fused_matmul_reduce_scatter( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + + self.assertEqual(output_0, output_1) + self.assertEqual(output_0.stride(), output_1.stride()) + + +if __name__ == "__main__": + run_tests() diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8a..4965406165 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -26,6 +26,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "distributed/test_c10d_xccl.py"] +res += run(test_command) +test_command = ["python", "distributed/test_symmetric_memory_xccl.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: From 902a61f073dc5d6257d36314c350816f29915bf8 Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Fri, 12 Sep 2025 14:46:45 +0800 Subject: [PATCH 56/58] fix memory release issue --- src/xccl/XPUSymmetricMemory.cpp | 16 ++++++++++++---- src/xccl/XPUSymmetricMemory.hpp | 4 +++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 013bb03b72..c20f521693 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -22,19 +22,27 @@ AllocationRef::AllocationRef( void* ptr, HandleType handle, size_t block_size, - int device_idx) + int device_idx, + bool local_allocation) : ptr(ptr), handle(handle), block_size(block_size), - device_idx(device_idx) {} + device_idx(device_idx), + local_allocation(local_allocation){} AllocationRef::~AllocationRef() { if (is_finalizing()) { return; } + // Currently, we cannot free virtual memory exchanged from other device. 
+ if (!local_allocation) { + return; + } c10::Device local_device(c10::DeviceType::XPU, device_idx); c10::DeviceGuard guard(local_device); c10::xpu::syncStreamsOnDevice(); + auto stream = at::xpu::getCurrentXPUStream(); + sycl::free(ptr, stream); } XPUSymmetricMemory::XPUSymmetricMemory( @@ -284,7 +292,7 @@ void* XPUSymmetricMemoryAllocator::alloc( void* ptr = sycl::malloc_device(block_size, current_queue); current_queue.memset(ptr, 0, block_size); auto alloc_ref = - c10::make_intrusive(ptr, ptr, block_size, device_idx); + c10::make_intrusive(ptr, ptr, block_size, device_idx, true); auto block = c10::make_intrusive( std::move(alloc_ref), device_idx, @@ -431,7 +439,7 @@ c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( continue; } alloc_refs.push_back(c10::make_intrusive( - buffers[r], handles[r], block->block_size, block->device_idx)); + buffers[r], handles[r], block->block_size, block->device_idx, false)); } auto symm_mem = c10::make_intrusive( diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index fe0a88c4e6..a0fbc907a9 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -15,12 +15,14 @@ struct AllocationRef : public c10::intrusive_ptr_target { HandleType handle; size_t block_size; int device_idx; + bool local_allocation; AllocationRef( void* ptr, HandleType handle, size_t block_size, - int device_idx); + int device_idx, + bool local_allocation); ~AllocationRef(); }; From d301bdfd2903e3561cf0e98d337a1222df69d49b Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Fri, 12 Sep 2025 16:13:34 +0800 Subject: [PATCH 57/58] refine according to latest pytorch --- src/xccl/XPUSymmetricMemory.cpp | 47 +++------------------------------ src/xccl/XPUSymmetricMemory.hpp | 10 ++----- 2 files changed, 6 insertions(+), 51 deletions(-) diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index c20f521693..50f4d98cd8 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ 
b/src/xccl/XPUSymmetricMemory.cpp @@ -141,49 +141,6 @@ at::Tensor XPUSymmetricMemory::get_buffer( .make_tensor(); } -at::Tensor XPUSymmetricMemory::get_signal_pad( - int rank, - c10::IntArrayRef sizes, - std::optional dtype, - int64_t storage_offset) { - // If the dtype is unspecified, default it to UInt32, as it - // is the most common type for signaling purposes. - if (!dtype.has_value()) { - dtype = c10::ScalarType::UInt32; - } - - // If the shape is unspecified, treat the signal pad as a 1d tensor. - const auto element_size = c10::elementSize(*dtype); - std::vector shape; - if (sizes.size() != 0) { - shape = sizes.vec(); - } else { - shape.push_back(signal_pad_size / element_size); - } - - const size_t numel = std::accumulate( - shape.begin(), - shape.end(), - static_cast(1), - std::multiplies()); - const auto req_size = (numel + storage_offset) * element_size; - TORCH_CHECK( - req_size <= signal_pad_size, - "XPUSymmetricMemory::get_signal_pad: the requested size (", - req_size, - " bytes) exceeds the allocated size (", - signal_pad_size, - " bytes)"); - auto data_ptr = reinterpret_cast(signal_pads_[rank]) + - storage_offset * element_size; - auto device = c10::Device(c10::DeviceType::XPU, local_device_idx_); - auto options = at::TensorOptions().dtype(*dtype).device(device); - return at::for_blob(data_ptr, shape) - .options(options) - .target_device(device) - .make_tensor(); -} - void check_channel(int channel, int world_size) { TORCH_CHECK( channel >= 0, @@ -267,6 +224,10 @@ int XPUSymmetricMemory::get_world_size() { return world_size_; } +c10::Device XPUSymmetricMemory::get_device() { + return c10::Device(c10::DeviceType::XPU, local_device_idx_); +} + Block::Block( c10::intrusive_ptr alloc_ref, int device_idx, diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp index a0fbc907a9..2daac1114a 100644 --- a/src/xccl/XPUSymmetricMemory.hpp +++ b/src/xccl/XPUSymmetricMemory.hpp @@ -56,21 +56,15 @@ class XPUSymmetricMemory : public 
SymmetricMemory { int rank, c10::IntArrayRef sizes, c10::ScalarType dtype, - int64_t storage_offset) override; - - at::Tensor get_signal_pad( - int rank, - c10::IntArrayRef sizes, - std::optional dtype, - int64_t storage_offset) override; + int64_t storage_offset); void barrier(int channel, size_t timeout_ms) override; void put_signal(int dst_rank, int channel, size_t timeout_ms) override; void wait_signal(int src_rank, int channel, size_t timeout_ms) override; - void copy_buffer(at::Tensor src, at::Tensor dst, size_t size) override; int get_rank() override; int get_world_size() override; + c10::Device get_device() override; void set_group_name(const std::string& group_name) { group_name_ = group_name; From c417b299dc9ed1c07b039fe0397ff7a0cd3e025b Mon Sep 17 00:00:00 2001 From: lzhang2 Date: Fri, 12 Sep 2025 16:41:08 +0800 Subject: [PATCH 58/58] debug about getting ptr device --- src/ATen/native/xpu/Copy.cpp | 2 ++ src/xccl/XPUSymmetricMemory.cpp | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/Copy.cpp b/src/ATen/native/xpu/Copy.cpp index b011baa80b..60a3792761 100644 --- a/src/ATen/native/xpu/Copy.cpp +++ b/src/ATen/native/xpu/Copy.cpp @@ -71,8 +71,10 @@ void memcpyAsync( Device dst_device = iter.device(0); Device src_device = iter.device(1); if (dst_device == src_device) { + std::cout << "zl_debug: go to same device and specialized kernel" << std::endl; copy_kernel(iter); } else { + std::cout << "zl_debug: go to sycl copy kernel" << std::endl; TORCH_INTERNAL_ASSERT(p2p_enabled == true); auto dst = (char*)iter.data_ptr(0); auto src = (char*)iter.data_ptr(1); diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp index 50f4d98cd8..d49d126122 100644 --- a/src/xccl/XPUSymmetricMemory.cpp +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -132,7 +132,9 @@ at::Tensor XPUSymmetricMemory::get_buffer( " bytes)"); auto data_ptr = reinterpret_cast(buffers_[rank]) + storage_offset * element_size; - auto device = 
c10::Device(c10::DeviceType::XPU, local_device_idx_); + // Resolve the device index that actually owns this pointer; a peer rank's + auto ptr_to_device_id = c10::xpu::get_device_idx_from_pointer(data_ptr); + auto device = c10::Device(c10::DeviceType::XPU, ptr_to_device_id); auto options = at::TensorOptions().dtype(dtype).device(device); return at::for_blob(data_ptr, sizes)