-
Notifications
You must be signed in to change notification settings - Fork 913
Description
Background information
Because the MTT (Memory Translation Table) capacity of my RDMA NIC is limited, I cannot run the mpi-benchmarks tests with normal 4 KiB pages, so I use huge pages (2 MiB) to reduce MTT consumption.
What version of Open MPI are you using? (e.g., v4.1.6, v5.0.1, git branch name and hash, etc.)
v4.1.5
Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)
I downloaded openmpi-4.1.5.tar.gz directly from the tag.
Details of the problem
When running mpirun, I add the following MCA parameters:
--mca mpool hugepage \
--mca btl_openib_mpool_hints "mpool=hugepage,page_size=2M" \
The resulting error is caused by ibv_reg_mr failing with -22 (EINVAL, invalid argument). Even stranger, when I add the lines marked with + below, the registration succeeds. In other words, with identical parameters, ibv_reg_mr fails on the first call but succeeds on the second.
=>openmpi-4.1.5/opal/mca/btl/openib/btl_openib_component.c
openib_reg->mr = ibv_reg_mr(device->ib_pd, base, size, access_flag);
+ if (NULL == openib_reg->mr) {
+ openib_reg->mr = ibv_reg_mr(device->ib_pd, base, size, access_flag);
+ }
To determine whether the RDMA NIC itself was at fault, I wrote a small standalone demo program that calls ibv_reg_mr on hugepage-backed memory, and found that the registration succeeds on the first attempt. The demo code is as follows:
=>hugepage_ibv_reg.c
#define _GNU_SOURCE
#include <infiniband/verbs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#define HUGEPAGE_SIZE (2 * 1024 * 1024)
/*
 * Round pointer `ptr` down to the nearest multiple of `align`.
 * `align` must be a power of two. Both arguments are fully parenthesized so
 * that expression arguments such as `1 << 12` expand correctly.
 */
#define ALIGN_DOWN(ptr, align) ((void *)((uintptr_t)(ptr) & ~((uintptr_t)(align) - 1)))
/*
 * Standalone reproducer: create a HUGEPAGE_SIZE file under /dev/hugepages,
 * map it privately, and register a 4 KiB sub-range of the mapping with the
 * RDMA device named "xscale_0" via ibv_reg_mr().
 *
 * Returns 0 when the registration succeeds, -1 when the backing file cannot
 * be set up, and 1 on any later failure.  All resources acquired so far are
 * released on every exit path.
 */
int main(void) {
    struct ibv_device **dev_list = NULL;
    struct ibv_context *ctx = NULL;
    struct ibv_pd *pd = NULL;
    struct ibv_mr *mr = NULL;
    void *raw_buf = MAP_FAILED;
    void *aligned_buf = NULL;
    char *path = NULL;
    const size_t alloc_size = HUGEPAGE_SIZE;
    const int32_t count = 0;
    /* Bits 0-3 of enum ibv_access_flags:
     * LOCAL_WRITE | REMOTE_WRITE | REMOTE_READ | REMOTE_ATOMIC. */
    const int access_flags = 0xf;
    int num_devices = 0;
    int fd = -1;
    int status = 1;

    if (asprintf(&path, "%s/hugepage.openmpi.%d.%d", "/dev/hugepages",
                 (int)getpid(), (int)count) < 0) {
        return -1;
    }

    fd = open(path, O_RDWR | O_CREAT, 0600);
    if (-1 == fd) {
        /* Nothing was created, so there is nothing to unlink. */
        free(path);
        return -1;
    }

    /* From here on the backing file exists: always leave via `cleanup`. */
    if (0 != ftruncate(fd, alloc_size)) {
        status = -1;
        goto cleanup;
    }

    dev_list = ibv_get_device_list(&num_devices);
    if (!dev_list) {
        perror("ibv_get_device_list");
        goto cleanup;
    }

    for (int i = 0; i < num_devices; i++) {
        if (strcmp(ibv_get_device_name(dev_list[i]), "xscale_0") == 0) {
            ctx = ibv_open_device(dev_list[i]);
            break;
        }
    }
    if (!ctx) {
        fprintf(stderr, "Failed to find RDMA device xscale_0\n");
        goto cleanup;
    }

    pd = ibv_alloc_pd(ctx);
    if (!pd) {
        perror("ibv_alloc_pd");
        goto cleanup;
    }

    raw_buf = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (raw_buf == MAP_FAILED) {
        perror("mmap hugepage");
        goto cleanup;
    }

    /* The mapping keeps the memory alive; drop the file and descriptor now. */
    unlink(path);
    free(path);
    path = NULL;
    close(fd);
    fd = -1;

    /* Register a 4 KiB range that starts 4 KiB into the mapping, i.e. a
     * region that is deliberately NOT aligned to the start of the mapping. */
    aligned_buf = (char *)raw_buf + 0x1000;
    printf("raw_buf = %p, aligned_buf = %p\n", raw_buf, aligned_buf);

    mr = ibv_reg_mr(pd, aligned_buf, 0x1000, access_flags);
    if (!mr) {
        perror("ibv_reg_mr");
        goto cleanup;
    }

    printf("Memory registered: addr=%p, lkey=0x%x, rkey=0x%x\n",
           mr->addr, mr->lkey, mr->rkey);
    status = 0;

cleanup:
    /* Release in reverse order of acquisition; skip what was never acquired. */
    if (mr) {
        ibv_dereg_mr(mr);
    }
    if (raw_buf != MAP_FAILED) {
        munmap(raw_buf, alloc_size);
    }
    if (pd) {
        ibv_dealloc_pd(pd);
    }
    if (ctx) {
        ibv_close_device(ctx);
    }
    if (dev_list) {
        ibv_free_device_list(dev_list);
    }
    if (fd >= 0) {
        close(fd);
    }
    if (path) {
        /* Still set only when the backing file was created but not yet removed. */
        unlink(path);
        free(path);
    }
    return status;
}
The output prints as follows:
raw_buf = 0x7f5c86600000, aligned_buf = 0x7f5c86601000
Memory registered: addr=0x7f5c86601000, lkey=0x20069, rkey=0x20069
Based on these results, the problem appears to lie in how Open MPI uses its huge page configuration rather than in the NIC itself. I would be very grateful if you could help me understand what is going wrong!