Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve latency test #112

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 51 additions & 22 deletions tests/copylat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ using namespace gdrcopy::test;
// manually tuned...
int num_write_iters = 10000;
int num_read_iters = 100;
int small_size_iter_factor = 1000;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the intention and its usefulness for small sizes. However, it changes the number of iterations away from what the user specified. Is there a better way to do this, or could you provide an explanatory message? Currently, users need to read the code to find out that small sizes and large sizes use different numbers of iterations.

int warmup = 10;

int main(int argc, char *argv[])
{
Expand All @@ -49,10 +51,11 @@ int main(int argc, char *argv[])
bool do_cumemcpy = false;
struct timespec beg, end;
double lat_us;
double bw;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn’t this redundant with copybw?

If you want to do a shmoo for bandwidth, would it be better to rename the test? “copylat” doesn’t sound right anymore in that case.


while(1) {
int c;
c = getopt(argc, argv, "s:d:w:r:hc");
c = getopt(argc, argv, "s:d:w:r:hcW:");
if (c == -1)
break;

Expand All @@ -69,11 +72,18 @@ int main(int argc, char *argv[])
case 'r':
num_read_iters = strtol(optarg, NULL, 0);
break;
case 'W':
warmup = strtol(optarg, NULL, 0);
break;
case 'c':
do_cumemcpy = true;
break;
case 'h':
printf("syntax: %s -s <buf size> -d <gpu dev id> -w <write iters> -r <read iters> -h[help] -c[do-cuMemcpy]\n", argv[0]);
printf("syntax: %s [-s <buf size>][-d <gpu dev id>][-w <write iters>][-r <read iters>][-h][-c][-w]\n"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The last option should be [-W <# iterations>]. You forgot to capitalize the letter.

"-c benchmark cuMemcpy\n"
"-w <# iterations> modify warmup (default %d)\n",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Capitalize the letter W.

argv[0],
warmup);
exit(EXIT_FAILURE);
break;
default:
Expand Down Expand Up @@ -137,39 +147,48 @@ int main(int argc, char *argv[])

if (do_cumemcpy) {
cout << endl;
cout << "cuMemcpy_H2D num iters for each size: " << num_write_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
cout << "cuMemcpy_H2D num iters for each size: " << small_size_iter_factor * num_write_iters << "/" << num_write_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n");
BEGIN_CHECK {
// cuMemcpy H2D benchmark
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_write_iters; ++iter) {
size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters);
for (iter = 0; iter < num_iters+warmup; ++iter) {
if (iter == warmup)
clock_gettime(MYCLOCK, &beg);
ASSERTDRV(cuMemcpy(d_A, (CUdeviceptr)init_buf, copy_size));
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("cuMemcpy_H2D \t %8zu \t %11.4f\n", copy_size, lat_us);
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0;
lat_us = dt_us / (double)num_iters;
bw = copy_size / lat_us;
printf("cuMemcpy_H2D \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw);
copy_size <<= 1;
}
} END_CHECK;

cout << endl;
cout << "cuMemcpy_D2H num iters for each size: " << num_read_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us)\n");
cout << "cuMemcpy_D2H num iters for each size: " << small_size_iter_factor * num_read_iters << "/" << num_read_iters << endl;
printf("Test \t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n");
BEGIN_CHECK {
// cuMemcpy D2H benchmark
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_read_iters; ++iter) {
size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters);
for (iter = 0; iter < num_iters+warmup; ++iter) {
if (iter == warmup)
clock_gettime(MYCLOCK, &beg);
ASSERTDRV(cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size));
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("cuMemcpy_D2H \t %8zu \t %11.4f\n", copy_size, lat_us);
//lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0;
lat_us = dt_us / (double)num_iters;
bw = copy_size / lat_us;
printf("cuMemcpy_D2H \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw);
copy_size <<= 1;
}
} END_CHECK;
Expand Down Expand Up @@ -216,17 +235,22 @@ int main(int argc, char *argv[])
cout << "WARNING: Measuring the issue overhead as observed by the CPU. Data might not be ordered all the way to the GPU internal visibility." << endl;
// For more information, see
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n");
printf("Test \t\t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n");
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_write_iters; ++iter) {
size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters);
for (iter = 0; iter < num_iters+warmup; ++iter) {
if (iter == warmup)
clock_gettime(MYCLOCK, &beg);
gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size);
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("gdr_copy_to_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0;
lat_us = dt_us / (double)num_iters;
bw = copy_size / lat_us;
printf("gdr_copy_to_mapping \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw);
copy_size <<= 1;
}

Expand All @@ -239,12 +263,17 @@ int main(int argc, char *argv[])
copy_size = 1;
while (copy_size <= size) {
int iter = 0;
clock_gettime(MYCLOCK, &beg);
for (iter = 0; iter < num_read_iters; ++iter)
size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters);
for (iter = 0; iter < num_iters+warmup; ++iter) {
if (iter == warmup)
clock_gettime(MYCLOCK, &beg);
gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size);
}
clock_gettime(MYCLOCK, &end);
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter;
printf("gdr_copy_from_mapping \t %8zu \t %11.4f\n", copy_size, lat_us);
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0;
lat_us = dt_us / (double)num_iters;
bw = copy_size / lat_us;
printf("gdr_copy_from_mapping \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw);
copy_size <<= 1;
}

Expand Down
36 changes: 20 additions & 16 deletions tests/sanity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ using namespace std;
using namespace gdrcopy::test;

volatile bool expecting_exception_signal = false;
int gpu_id = 0;

void exception_signal_handle(int sig)
{
Expand Down Expand Up @@ -168,7 +169,7 @@ BEGIN_GDRCOPY_TEST(basic)
expecting_exception_signal = false;
MB();

init_cuda(0);
init_cuda(gpu_id);

const size_t _size = 256*1024+16;
const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
Expand Down Expand Up @@ -201,7 +202,7 @@ BEGIN_GDRCOPY_TEST(basic_with_tokens)
expecting_exception_signal = false;
MB();

init_cuda(0);
init_cuda(gpu_id);

const size_t _size = 256*1024+16;
const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
Expand Down Expand Up @@ -242,7 +243,7 @@ BEGIN_GDRCOPY_TEST(basic_unaligned_mapping)
expecting_exception_signal = false;
MB();

init_cuda(0);
init_cuda(gpu_id);

// Allocate for a few bytes so that cuMemAlloc returns an unaligned address
// in the next allocation. This behavior is observed in GPU Driver 410 and
Expand Down Expand Up @@ -337,7 +338,7 @@ BEGIN_GDRCOPY_TEST(data_validation)
expecting_exception_signal = false;
MB();

init_cuda(0);
init_cuda(gpu_id);

const size_t _size = 256*1024+16;
const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
Expand Down Expand Up @@ -461,7 +462,7 @@ BEGIN_GDRCOPY_TEST(invalidation_access_after_gdr_close)

int mydata = (rand() % 1000) + 1;

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -537,7 +538,7 @@ BEGIN_GDRCOPY_TEST(invalidation_access_after_cumemfree)

int mydata = (rand() % 1000) + 1;

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -613,7 +614,7 @@ BEGIN_GDRCOPY_TEST(invalidation_two_mappings)

int mydata = (rand() % 1000) + 1;

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A[2];

Expand Down Expand Up @@ -762,7 +763,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_access_after_cumemfree)
if (pid == 0)
mydata += 10;

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -871,7 +872,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_after_gdr_map)
const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
const char *myname;

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -1011,7 +1012,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_child_gdr_map_parent)
const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
const char *myname;

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -1124,7 +1125,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_map_and_free)

int mydata = (rand() % 1000) + 1;

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -1229,7 +1230,7 @@ BEGIN_GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_pin_buffer)

print_dbg("%s: Start\n", myname);

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -1344,7 +1345,7 @@ BEGIN_GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_map)
write_fd = filedes_1[1];
}

init_cuda(0);
init_cuda(gpu_id);

CUdeviceptr d_A;
ASSERTDRV(gpuMemAlloc(&d_A, size));
Expand Down Expand Up @@ -1487,7 +1488,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_child_gdr_pin_parent_with_tokens)
read_fd = filedes_0[0];
write_fd = filedes_1[1];

init_cuda(0);
init_cuda(gpu_id);

ASSERTDRV(gpuMemAlloc(&d_A, size));
ASSERTDRV(cuPointerGetAttribute(&tokens, CU_POINTER_ATTRIBUTE_P2P_TOKENS, d_A));
Expand All @@ -1512,13 +1513,16 @@ int main(int argc, char *argv[])
{
int c;

while ((c = getopt(argc, argv, "h::v::")) != -1) {
while ((c = getopt(argc, argv, "d:h::v::")) != -1) {
switch (c) {
case 'd':
gpu_id = atoi(optarg);
break;
case 'v':
gdrcopy::test::print_dbg_msg = true;
break;
case 'h':
cout << "Usage: " << argv[0] << " [-v] [-h]" << endl;
cout << "Usage: " << argv[0] << " [-d gpuid] [-v] [-h]" << endl;
break;
case '?':
if (isprint(optopt))
Expand Down