From c52f5a3bb8c94043d024dfcdfa456a3766ab73a7 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Wed, 24 Feb 2016 01:07:27 +0900 Subject: [PATCH] refs #27: Add size/alignemt checks for datablock_kernel_arg structs. * It is working as exepcted. ...then what's the problem with CUDA? :( --- include/nba/engines/cuda/test.hh | 7 ++-- include/nba/framework/datablock_shared.hh | 8 ++--- src/engines/cuda/test.cu | 12 +++++++ tests/test_cuda.cc | 44 +++++++++++++++++------ 4 files changed, 54 insertions(+), 17 deletions(-) diff --git a/include/nba/engines/cuda/test.hh b/include/nba/engines/cuda/test.hh index bba638a..681a1b0 100644 --- a/include/nba/engines/cuda/test.hh +++ b/include/nba/engines/cuda/test.hh @@ -1,7 +1,8 @@ namespace nba { -extern void* get_test_kernel_noop(); -extern void* get_test_kernel_shiftedint_size_check(); -extern void* get_test_kernel_shiftedint_value_check(); +extern void *get_test_kernel_noop(); +extern void *get_test_kernel_shiftedint_size_check(); +extern void *get_test_kernel_shiftedint_value_check(); +extern void *get_test_kernel_dbarg_size_check(); } // vim: ts=8 sts=4 sw=4 et diff --git a/include/nba/framework/datablock_shared.hh b/include/nba/framework/datablock_shared.hh index 7c0a487..0e97fb0 100644 --- a/include/nba/framework/datablock_shared.hh +++ b/include/nba/framework/datablock_shared.hh @@ -9,7 +9,7 @@ #include #include -struct datablock_batch_info { +struct alignas(8) datablock_batch_info { void *buffer_bases_in; void *buffer_bases_out; uint32_t item_count_in; @@ -18,14 +18,14 @@ struct datablock_batch_info { uint16_t *item_sizes_out; nba::dev_offset_t *item_offsets_in; nba::dev_offset_t *item_offsets_out; -}; // __cuda_aligned +}; -struct datablock_kernel_arg { +struct alignas(8) datablock_kernel_arg { uint32_t total_item_count_in; uint32_t total_item_count_out; uint16_t item_size_in; // for fixed-size cases uint16_t item_size_out; // for fixed-size cases struct datablock_batch_info batches[0]; -}; // __cuda_aligned +}; #endif diff --git a/src/engines/cuda/test.cu b/src/engines/cuda/test.cu index 910bb30..33b437c 100644 --- a/src/engines/cuda/test.cu +++ b/src/engines/cuda/test.cu @@ -1,5 +1,6 @@ #include #include +#include using namespace std; using namespace nba; @@ -23,10 +24,21 @@ __global__ void shiftedint_value_check *raw_v = v->as_value(); } +__global__ void dbarg_size_check(size_t *sizes, size_t *offsets) +{ + sizes[0] = sizeof(struct datablock_kernel_arg); + offsets[0] = offsetof(struct datablock_kernel_arg, batches); + sizes[1] = sizeof(struct datablock_batch_info); + offsets[1] = offsetof(struct datablock_batch_info, item_offsets_in); +} + void *nba::get_test_kernel_shiftedint_size_check() { return reinterpret_cast (shiftedint_size_check); } void *nba::get_test_kernel_shiftedint_value_check() { return reinterpret_cast (shiftedint_value_check); } +void *nba::get_test_kernel_dbarg_size_check() +{ return reinterpret_cast (dbarg_size_check); } + // vim: ts=8 sts=4 sw=4 et diff --git a/tests/test_cuda.cc b/tests/test_cuda.cc index 4fd8c8e..bd472c5 100644 --- a/tests/test_cuda.cc +++ b/tests/test_cuda.cc @@ -44,12 +44,12 @@ TEST(CUDAStructTest, ShfitedIntSizeCheck) { ASSERT_NE(nullptr, output_d); size_t output_h = 0; void *raw_args[1] = { &output_d }; - EXPECT_EQ(cudaSuccess, cudaLaunchKernel(k, dim3(1), dim3(1), raw_args, 0, 0)); - EXPECT_EQ(cudaSuccess, cudaMemcpy(&output_h, output_d, sizeof(size_t), cudaMemcpyDeviceToHost)); + ASSERT_EQ(cudaSuccess, cudaLaunchKernel(k, dim3(1), dim3(1), raw_args, 0, 0)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(&output_h, output_d, sizeof(size_t), cudaMemcpyDeviceToHost)); EXPECT_EQ(sizeof(nba::dev_offset_t), 2); EXPECT_EQ(sizeof(nba::dev_offset_t), output_h); - EXPECT_EQ(cudaSuccess, cudaFree(output_d)); - EXPECT_EQ(cudaSuccess, cudaDeviceSynchronize()); + ASSERT_EQ(cudaSuccess, cudaFree(output_d)); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); } TEST(CUDAStructTest, ShfitedIntValueCheck) { @@ -64,15 +64,39 @@ TEST(CUDAStructTest, ShfitedIntValueCheck) { EXPECT_EQ(165320, input_h.as_value()); size_t output_h = 0; void *raw_args[2] = { &input_d, &output_d }; - EXPECT_EQ(cudaSuccess, cudaMemcpy(input_d, &input_h, sizeof(nba::dev_offset_t), cudaMemcpyHostToDevice)); - EXPECT_EQ(cudaSuccess, cudaLaunchKernel(k, dim3(1), dim3(1), raw_args, 0, 0)); - EXPECT_EQ(cudaSuccess, cudaMemcpy(&output_h, output_d, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(input_d, &input_h, sizeof(nba::dev_offset_t), cudaMemcpyHostToDevice)); + ASSERT_EQ(cudaSuccess, cudaLaunchKernel(k, dim3(1), dim3(1), raw_args, 0, 0)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(&output_h, output_d, sizeof(uint64_t), cudaMemcpyDeviceToHost)); EXPECT_EQ(165320, output_h); - EXPECT_EQ(cudaSuccess, cudaFree(input_d)); - EXPECT_EQ(cudaSuccess, cudaFree(output_d)); - EXPECT_EQ(cudaSuccess, cudaDeviceSynchronize()); + ASSERT_EQ(cudaSuccess, cudaFree(input_d)); + ASSERT_EQ(cudaSuccess, cudaFree(output_d)); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); } +TEST(CUDAStructTest, DatablockArgSizeAlignCheck) { + void *k = get_test_kernel_dbarg_size_check(); + void *output_sizes_d; + void *output_offsets_d; + ASSERT_EQ(cudaSuccess, cudaMalloc(&output_sizes_d, sizeof(size_t) * 2)); + ASSERT_NE(nullptr, output_sizes_d); + ASSERT_EQ(cudaSuccess, cudaMalloc(&output_offsets_d, sizeof(size_t) * 2)); + ASSERT_NE(nullptr, output_offsets_d); + size_t output_sizes_h[2] = { 0, 0 }; + size_t output_offsets_h[2] = { 0, 0 }; + void *raw_args[2] = { &output_sizes_d, &output_offsets_d }; + ASSERT_EQ(cudaSuccess, cudaLaunchKernel(k, dim3(1), dim3(1), raw_args, 0, 0)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(&output_sizes_h, output_sizes_d, sizeof(size_t) * 2, cudaMemcpyDeviceToHost)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(&output_offsets_h, output_offsets_d, sizeof(size_t) * 2, cudaMemcpyDeviceToHost)); + EXPECT_EQ(sizeof(struct datablock_kernel_arg), output_sizes_h[0]); + EXPECT_EQ(offsetof(struct datablock_kernel_arg, batches), output_offsets_h[0]); + EXPECT_EQ(sizeof(struct datablock_batch_info), output_sizes_h[1]); + EXPECT_EQ(offsetof(struct datablock_batch_info, item_offsets_in), output_offsets_h[1]); + ASSERT_EQ(cudaSuccess, cudaFree(output_sizes_d)); + ASSERT_EQ(cudaSuccess, cudaFree(output_offsets_d)); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); +} + + #else TEST(CUDATest, Noop) {