From e75e280d2973a7f555613f46f14ec45a5d9c1b9c Mon Sep 17 00:00:00 2001
From: Joongi Kim
Date: Mon, 8 Feb 2016 23:04:18 +0900
Subject: [PATCH] refs #6, #27: Refactor memory pools and host/device memory
 types

* Tried to support OpenCL, but confirmed that we need OpenCL 2.0+
  (which is not supported by the current generation of Xeon Phi...).
  - Related code will be rewritten someday using the SVM (shared
    virtual memory) APIs in OpenCL 2.0+.

* Reduced the memory footprint of the batch_ids array passed to the
  device.

* Rolled back the ev_prepare -> ev_check watcher type change (c732a25),
  as it broke CPU-only cross-node IP forwarding scenarios. :(

* TODO: fix IPsec GPU-only mode...
---
 configs/ipv4-router-gpuonly.click           |   1 -
 elements/ip/IPlookup.cc                     | 117 ++++++-----
 elements/ip/IPlookup.hh                     |   4 +-
 elements/ip/IPlookup_kernel.cu              |   2 +-
 elements/ipsec/IPsecAES.cc                  |  27 ++-
 elements/ipsec/IPsecAES.hh                  |   4 +-
 elements/ipsec/IPsecAES_kernel.cu           |   4 +-
 elements/ipsec/IPsecAuthHMACSHA1.cc         |  18 +-
 elements/ipsec/IPsecAuthHMACSHA1.hh         |   4 +-
 elements/ipsec/IPsecAuthHMACSHA1_kernel.cu  |   4 +-
 elements/ipv6/LookupIP6Route.cc             |  78 ++++----
 elements/ipv6/LookupIP6Route.hh             |   4 +-
 elements/ipv6/LookupIP6Route_kernel.cu      |   4 +-
 include/nba/core/mempool.hh                 |  55 +++---
 include/nba/core/offloadtypes.hh            |  23 ++-
 include/nba/element/element.hh              |   2 +-
 include/nba/engines/cuda/computecontext.hh  |  45 ++---
 include/nba/engines/cuda/computedevice.hh   |  12 +-
 include/nba/engines/cuda/mempool.hh         |  74 ++++---
 include/nba/engines/dummy/computecontext.hh |  36 ++--
 include/nba/engines/dummy/computedevice.hh  |  12 +-
 include/nba/engines/dummy/mempool.hh        |  33 ++--
 include/nba/engines/phi/computecontext.hh   |  16 +-
 include/nba/engines/phi/mempool.hh          | 107 +++-------
 include/nba/framework/computecontext.hh     |  28 +--
 include/nba/framework/computedevice.hh      |  14 +-
 include/nba/framework/config.hh             |   2 +
 include/nba/framework/datablock.hh          |  13 +-
 include/nba/framework/offloadtask.hh        |  16 +-
 include/nba/framework/threadcontext.hh      |   2 +-
 src/engines/cuda/computecontext.cc          | 124 +++++++-----
 src/engines/cuda/computedevice.cc           |  24 ++-
 src/engines/dummy/computecontext.cc         | 100 ++++++----
 src/engines/dummy/computedevice.cc          |  24 ++-
 src/engines/phi/computecontext.cc           | 125 ++++++++----
 src/lib/coprocessor.cc                      |   3 +-
 src/lib/datablock.cc                        |  28 +--
 src/lib/elementgraph.cc                     |   2 +-
 src/lib/io.cc                               |  10 +-
 src/lib/offloadtask.cc                      | 206 +++++++++++---------
 tests/test_core_mempool.cc                  |  51 +++++
 41 files changed, 816 insertions(+), 642 deletions(-)
 create mode 100644 tests/test_core_mempool.cc

diff --git a/configs/ipv4-router-gpuonly.click b/configs/ipv4-router-gpuonly.click
index 713923f..4e8be0a 100644
--- a/configs/ipv4-router-gpuonly.click
+++ b/configs/ipv4-router-gpuonly.click
@@ -3,6 +3,5 @@ GPUOnly() ->
 DropBroadcasts() ->
 CheckIPHeader() ->
 IPlookup() ->
-//IPlookup() ->
 DecIPTTL() ->
 ToOutput();
diff --git a/elements/ip/IPlookup.cc b/elements/ip/IPlookup.cc
index 4f920a2..1401d5e 100644
--- a/elements/ip/IPlookup.cc
+++ b/elements/ip/IPlookup.cc
@@ -34,26 +34,12 @@ IPlookup::IPlookup(): OffloadableElement()
     num_tx_ports = 0;
     rr_port = 0;
 
-    p_rwlock_TBL24 = NULL;
-    p_rwlock_TBLlong = NULL;
-    TBL24_h = NULL;
-    TBLlong_h = NULL;
-    TBL24_d = NULL;
-    TBLlong_d = NULL;
-}
-
-int IPlookup::initialize()
-{
-    /* Get routing table pointers from the node-local storage. 
*/ - TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24"); - p_rwlock_TBL24 = ctx->node_local_storage->get_rwlock("TBL24"); - TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong"); - p_rwlock_TBLlong = ctx->node_local_storage->get_rwlock("TBLlong"); - - /* Get device pointers from the node-local storage. */ - TBL24_d = ((memory_t **) ctx->node_local_storage->get_alloc("TBL24_dev_ptr"))[0]; - TBLlong_d = ((memory_t **) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr"))[0]; - return 0; + p_rwlock_TBL24 = nullptr; + p_rwlock_TBLlong = nullptr; + TBL24_h = nullptr; + TBLlong_h = nullptr; + TBL24_d = { nullptr }; + TBLlong_d = { nullptr }; } int IPlookup::initialize_global() @@ -76,8 +62,8 @@ int IPlookup::initialize_per_node() ctx->node_local_storage->alloc("TBL24", sizeof(uint16_t) * ipv4_get_TBL24_size()); ctx->node_local_storage->alloc("TBLlong", sizeof(uint16_t) * ipv4_get_TBLlong_size()); /* Storage for device pointers. */ - ctx->node_local_storage->alloc("TBL24_dev_ptr", sizeof(memory_t)); - ctx->node_local_storage->alloc("TBLlong_dev_ptr", sizeof(memory_t)); + ctx->node_local_storage->alloc("TBL24_dev_ptr", sizeof(dev_mem_t)); + ctx->node_local_storage->alloc("TBLlong_dev_ptr", sizeof(dev_mem_t)); printf("element::IPlookup: Initializing FIB from the global RIB for NUMA node %d...\n", node_idx); ipv4_build_fib(); @@ -85,13 +71,30 @@ int IPlookup::initialize_per_node() return 0; } +int IPlookup::initialize() +{ + /* Get routing table pointers from the node-local storage. */ + TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24"); + TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong"); + assert(TBL24_h != nullptr); + assert(TBLlong_h != nullptr); + p_rwlock_TBL24 = ctx->node_local_storage->get_rwlock("TBL24"); + p_rwlock_TBLlong = ctx->node_local_storage->get_rwlock("TBLlong"); + + /* Get device pointers from the node-local storage. */ + TBL24_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBL24_dev_ptr"); + TBLlong_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr"); + + rr_port = 0; + return 0; +} + int IPlookup::configure(comp_thread_context *ctx, std::vector &args) { Element::configure(ctx, args); num_tx_ports = ctx->num_tx_ports; num_nodes = ctx->num_nodes; node_idx = ctx->loc.node_id; - rr_port = 0; return 0; } @@ -110,9 +113,13 @@ int IPlookup::process(int input_port, Packet *pkt) return 0; } - //unsigned n = (pkt->pkt.in_port <= (num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2); - //rr_port = (rr_port + 1) % (num_tx_ports / 2) + n; + #ifdef NBA_IPFWD_RR_NODE_LOCAL + unsigned iface_in = anno_get(&pkt->anno, NBA_ANNO_IFACE_IN); + unsigned n = (iface_in <= ((unsigned) num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2); + rr_port = (rr_port + 1) % (num_tx_ports / 2) + n; + #else rr_port = (rr_port + 1) % (num_tx_ports); + #endif anno_set(&pkt->anno, NBA_ANNO_IFACE_OUT, rr_port); output(0).push(pkt); return 0; @@ -127,9 +134,13 @@ int IPlookup::postproc(int input_port, void *custom_output, Packet *pkt) return 0; } - //unsigned n = (pkt->pkt.in_port <= (num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2); - //rr_port = (rr_port + 1) % (num_tx_ports / 2) + n; + #ifdef NBA_IPFWD_RR_NODE_LOCAL + unsigned iface_in = anno_get(&pkt->anno, NBA_ANNO_IFACE_IN); + unsigned n = (iface_in <= ((unsigned) num_tx_ports / 2) - 1) ? 
0 : (num_tx_ports / 2);
+    rr_port = (rr_port + 1) % (num_tx_ports / 2) + n;
+    #else
     rr_port = (rr_port + 1) % (num_tx_ports);
+    #endif
     anno_set(&pkt->anno, NBA_ANNO_IFACE_OUT, rr_port);
     output(0).push(pkt);
     return 0;
 }
@@ -149,41 +160,41 @@ size_t IPlookup::get_desired_workgroup_size(const char *device_name) const
 }
 
 #ifdef USE_CUDA
+void IPlookup::cuda_init_handler(ComputeDevice *device)
+{
+    /* Store the device pointers for per-thread element instances. */
+    size_t TBL24_alloc_size = sizeof(uint16_t) * ipv4_get_TBL24_size();
+    size_t TBLlong_alloc_size = sizeof(uint16_t) * ipv4_get_TBLlong_size();
+    // Since this handler runs before initialize() is called, we need to
+    // get the pointers from the node-local storage by ourselves here.
+    uint16_t *_TBL24_h = nullptr;
+    uint16_t *_TBLlong_h = nullptr;
+    _TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24");
+    _TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong");
+    TBL24_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBL24_dev_ptr");
+    TBLlong_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr");
+    *TBL24_d = device->alloc_device_buffer(TBL24_alloc_size);
+    *TBLlong_d = device->alloc_device_buffer(TBLlong_alloc_size);
+    /* Wrap the host-side routing tables as host_mem_t and copy them to the device. */
+    device->memwrite({(void *) _TBL24_h}, *TBL24_d, 0, TBL24_alloc_size);
+    device->memwrite({(void *) _TBLlong_h}, *TBLlong_d, 0, TBLlong_alloc_size);
+}
+
 void IPlookup::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res)
 {
-    //printf("G++ datablock_kernel_arg (%lu)\n", sizeof(struct datablock_kernel_arg));
-    //printf("G++ .total_item_count (%lu)\n", offsetof(struct datablock_kernel_arg, total_item_count));
-    //printf("G++ .buffer_bases (%lu)\n", offsetof(struct datablock_kernel_arg, buffer_bases));
-    //printf("G++ .item_count (%lu)\n", offsetof(struct datablock_kernel_arg, item_count));
-    //printf("G++ .item_size (%lu)\n", offsetof(struct datablock_kernel_arg, item_size));
-    //printf("G++ .item_sizes (%lu)\n", offsetof(struct datablock_kernel_arg, item_sizes));
-
     struct kernel_arg arg;
-    arg = {(void *) &TBL24_d, sizeof(void *), alignof(void *)};
+    void *ptr_args[2];
+    ptr_args[0] = cctx->unwrap_device_buffer(*TBL24_d);
+    arg = {(void *) &ptr_args[0], sizeof(void *), alignof(void *)};
     cctx->push_kernel_arg(arg);
-    arg = {(void *) &TBLlong_d, sizeof(void *), alignof(void *)};
+    ptr_args[1] = cctx->unwrap_device_buffer(*TBLlong_d);
+    arg = {(void *) &ptr_args[1], sizeof(void *), alignof(void *)};
     cctx->push_kernel_arg(arg);
-    kernel_t kern;
+    dev_kernel_t kern;
     kern.ptr = ipv4_route_lookup_get_cuda_kernel();
     cctx->enqueue_kernel_launch(kern, res);
 }
-
-void IPlookup::cuda_init_handler(ComputeDevice *device)
-{
-    memory_t new_TBL24_d = /*(uint16_t *)*/ device->alloc_device_buffer(sizeof(uint16_t) * ipv4_get_TBL24_size(), HOST_TO_DEVICE);
-    memory_t new_TBLlong_d = /*(uint16_t *)*/ device->alloc_device_buffer(sizeof(uint16_t) * ipv4_get_TBLlong_size(), HOST_TO_DEVICE);
-    TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24");
-    TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong");
-    device->memwrite(TBL24_h, new_TBL24_d, 0, sizeof(uint16_t) * ipv4_get_TBL24_size());
-    device->memwrite(TBLlong_h, new_TBLlong_d, 0, sizeof(uint16_t) * ipv4_get_TBLlong_size());
-
-    /* Store the device pointers for per-thread instances. 
*/ - memory_t *TBL24_dev_ptr_storage = (memory_t *) ctx->node_local_storage->get_alloc("TBL24_dev_ptr"); - memory_t *TBLlong_dev_ptr_storage = (memory_t *) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr"); - (TBL24_dev_ptr_storage)[0] = new_TBL24_d; - (TBLlong_dev_ptr_storage)[0] = new_TBLlong_d; -} #endif int IPlookup::ipv4_route_add(uint32_t addr, uint16_t len, uint16_t nexthop) diff --git a/elements/ip/IPlookup.hh b/elements/ip/IPlookup.hh index bfa4a86..75ee6dc 100644 --- a/elements/ip/IPlookup.hh +++ b/elements/ip/IPlookup.hh @@ -83,8 +83,8 @@ protected: uint16_t *TBL24_h; uint16_t *TBLlong_h; - memory_t *TBL24_d; - memory_t *TBLlong_d; + dev_mem_t *TBL24_d; + dev_mem_t *TBLlong_d; }; EXPORT_ELEMENT(IPlookup); diff --git a/elements/ip/IPlookup_kernel.cu b/elements/ip/IPlookup_kernel.cu index 6e06333..11b4947 100644 --- a/elements/ip/IPlookup_kernel.cu +++ b/elements/ip/IPlookup_kernel.cu @@ -33,7 +33,7 @@ __device__ uint32_t ntohl(uint32_t n) /* The GPU kernel. */ __global__ void ipv4_route_lookup_cuda( struct datablock_kernel_arg **datablocks, - uint32_t count, uint16_t *batch_ids, uint16_t *item_ids, + uint32_t count, uint8_t *batch_ids, uint16_t *item_ids, uint8_t *checkbits_d, uint16_t* __restrict__ TBL24_d, uint16_t* __restrict__ TBLlong_d) diff --git a/elements/ipsec/IPsecAES.cc b/elements/ipsec/IPsecAES.cc index 07863a6..cd06e95 100644 --- a/elements/ipsec/IPsecAES.cc +++ b/elements/ipsec/IPsecAES.cc @@ -51,7 +51,7 @@ int IPsecAES::initialize() h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array"); /* Get device pointer from the node local storage. */ - d_key_array_ptr = ((memory_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr"))[0]; + d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr"); if (aes_sa_entry_array != NULL) { free(aes_sa_entry_array); @@ -80,8 +80,8 @@ int IPsecAES::initialize_global() entry->entry_idx = i; rte_memcpy(entry->aes_key, "1234123412341234", AES_BLOCK_SIZE); #ifdef USE_OPENSSL_EVP - // TODO: check if copying globally initialized evpctx works okay. - EVP_CIPHER_CTX_init(&entry->evpctx); + // TODO: check if copying globally initialized evpctx works okay. + EVP_CIPHER_CTX_init(&entry->evpctx); //if (EVP_EncryptInit(&entry->evpctx, EVP_aes_128_ctr(), entry->aes_key, esph->esp_iv) != 1) if (EVP_EncryptInit(&entry->evpctx, EVP_aes_128_ctr(), entry->aes_key, fake_iv) != 1) fprintf(stderr, "IPsecAES: EVP_EncryptInit() - %s\n", ERR_error_string(ERR_get_error(), NULL)); @@ -121,7 +121,7 @@ int IPsecAES::initialize_per_node() rte_memcpy(temp_array, aes_sa_entry_array, size); /* Storage for pointer, which points aes key array in device */ - ctx->node_local_storage->alloc("d_aes_key_array_ptr", sizeof(memory_t)); + ctx->node_local_storage->alloc("d_aes_key_array_ptr", sizeof(dev_mem_t)); return 0; } @@ -182,30 +182,25 @@ int IPsecAES::process(int input_port, Packet *pkt) void IPsecAES::cuda_init_handler(ComputeDevice *device) { // Put key array content to device space. 
- long key_array_size = sizeof(struct aes_sa_entry) * num_tunnels; + size_t key_array_size = sizeof(struct aes_sa_entry) * num_tunnels; h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array"); - memory_t key_array_in_device = device->alloc_device_buffer(key_array_size, 0); - device->memwrite(h_key_array, key_array_in_device, 0, key_array_size); + dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size); + device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size); // Store the device pointer for per-thread instances. - memory_t *p = (memory_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr"); - ((memory_t *) p)[0] = key_array_in_device; + dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr"); + *p = key_array_in_device; } -#endif -#ifdef USE_CUDA void IPsecAES::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res) { struct kernel_arg arg; - arg = {(void *) &d_key_array_ptr.ptr, sizeof(void *), alignof(void *)}; + arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); - kernel_t kern; + dev_kernel_t kern; kern.ptr = ipsec_aes_encryption_get_cuda_kernel(); cctx->enqueue_kernel_launch(kern, res); - - // TODO: data-copy-opt - // ?->inc_dev_ver(dbid_ipsec_...); } #endif diff --git a/elements/ipsec/IPsecAES.hh b/elements/ipsec/IPsecAES.hh index 87b48fa..2d07b1d 100644 --- a/elements/ipsec/IPsecAES.hh +++ b/elements/ipsec/IPsecAES.hh @@ -61,8 +61,8 @@ protected: /* Per-thread pointers, which points to the node local storage variables. */ std::unordered_map *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr. - struct aes_sa_entry *h_key_array = NULL; // used in CPU. - memory_t d_key_array_ptr; // points to the device buffer. + struct aes_sa_entry *h_key_array = nullptr; // used in CPU. + dev_mem_t *d_key_array_ptr; }; EXPORT_ELEMENT(IPsecAES); diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu index b9c5b19..478d312 100644 --- a/elements/ipsec/IPsecAES_kernel.cu +++ b/elements/ipsec/IPsecAES_kernel.cu @@ -681,7 +681,7 @@ __device__ void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out, __global__ void AES_ctr_encrypt_chunk_SharedMem_5( struct datablock_kernel_arg **datablocks, - uint32_t count, uint16_t *batch_ids, uint16_t *item_ids, + uint32_t count, uint8_t *batch_ids, uint16_t *item_ids, uint8_t *checkbits_d, struct aes_sa_entry* flow_info ) @@ -695,7 +695,7 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5( int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count && count != 0) { - const uint16_t batch_idx = batch_ids[idx]; + const uint8_t batch_idx = batch_ids[idx]; const uint16_t item_idx = item_ids[idx]; const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d]; diff --git a/elements/ipsec/IPsecAuthHMACSHA1.cc b/elements/ipsec/IPsecAuthHMACSHA1.cc index 9dce95c..29c73b0 100644 --- a/elements/ipsec/IPsecAuthHMACSHA1.cc +++ b/elements/ipsec/IPsecAuthHMACSHA1.cc @@ -51,7 +51,7 @@ int IPsecAuthHMACSHA1::initialize() h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array"); /* Get device pointer from the node local storage. 
*/ - d_key_array_ptr = ((struct hmac_sa_entry **)ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr"))[0]; + d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr"); if (hmac_sa_entry_array != NULL) { free(hmac_sa_entry_array); @@ -111,7 +111,7 @@ int IPsecAuthHMACSHA1::initialize_per_node() rte_memcpy(temp_array, hmac_sa_entry_array, size); /* Storage for pointer, which points hmac key array in device */ - ctx->node_local_storage->alloc("d_hmac_key_array_ptr", sizeof(void *)); + ctx->node_local_storage->alloc("d_hmac_key_array_ptr", sizeof(dev_mem_t)); return 0; } @@ -177,24 +177,24 @@ int IPsecAuthHMACSHA1::process(int input_port, Packet *pkt) void IPsecAuthHMACSHA1::cuda_init_handler(ComputeDevice *device) { // Put key array content to device space. - long key_array_size = sizeof(struct hmac_sa_entry) * num_tunnels; + size_t key_array_size = sizeof(struct hmac_sa_entry) * num_tunnels; h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array"); - memory_t key_array_in_device = /*(struct hmac_sa_entry *)*/ device->alloc_device_buffer(key_array_size, 0); - device->memwrite(h_key_array, key_array_in_device, 0, key_array_size); + dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size); + device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size); // Store the device pointer for per-thread instances. - void *p = ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr"); - ((memory_t *) p)[0] = key_array_in_device; + dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr"); + *p = key_array_in_device; } void IPsecAuthHMACSHA1::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res) { struct kernel_arg arg; - arg = {(void *) &d_key_array_ptr, sizeof(void *), alignof(void *)}; + arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); - kernel_t kern; + dev_kernel_t kern; kern.ptr = ipsec_hsha1_encryption_get_cuda_kernel(); cctx->enqueue_kernel_launch(kern, res); } diff --git a/elements/ipsec/IPsecAuthHMACSHA1.hh b/elements/ipsec/IPsecAuthHMACSHA1.hh index d667da2..b2fced0 100644 --- a/elements/ipsec/IPsecAuthHMACSHA1.hh +++ b/elements/ipsec/IPsecAuthHMACSHA1.hh @@ -60,8 +60,8 @@ protected: int dummy_index; std::unordered_map *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr. - struct hmac_sa_entry *h_key_array = NULL; // used in CPU. - struct hmac_sa_entry *d_key_array_ptr = NULL; // points to the device buffer. + struct hmac_sa_entry *h_key_array = nullptr; // used in CPU. + dev_mem_t *d_key_array_ptr; // points to the device buffer. 
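+    // (The device buffer itself is created in cuda_init_handler() and
+    //  shared across worker threads via the node-local storage;
+    //  initialize() only fetches this handle.)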
private: const int idx_pkt_offset = 0; diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu index 64b8e6e..ea6bd2b 100644 --- a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu +++ b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu @@ -1239,13 +1239,13 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets, __global__ void computeHMAC_SHA1_3( struct datablock_kernel_arg **datablocks, - uint32_t count, uint16_t *batch_ids, uint16_t *item_ids, + uint32_t count, uint8_t *batch_ids, uint16_t *item_ids, uint8_t *checkbits_d, struct hmac_sa_entry *hmac_key_array) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count && count != 0) { - const uint16_t batch_idx = batch_ids[idx]; + const uint8_t batch_idx = batch_ids[idx]; const uint16_t item_idx = item_ids[idx]; assert(item_idx < 64); const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d]; diff --git a/elements/ipv6/LookupIP6Route.cc b/elements/ipv6/LookupIP6Route.cc index 6d56949..3b6cfd4 100644 --- a/elements/ipv6/LookupIP6Route.cc +++ b/elements/ipv6/LookupIP6Route.cc @@ -51,17 +51,13 @@ LookupIP6Route::LookupIP6Route(): OffloadableElement() d_table_sizes = NULL; } -int LookupIP6Route::initialize() +int LookupIP6Route::configure(comp_thread_context *ctx, std::vector &args) { - // Called after coproc threads are initialized. - - /* Get routing table pointers from the node-local storage. */ - _table_ptr = (RoutingTableV6*)ctx->node_local_storage->get_alloc("ipv6_table"); - _rwlock_ptr = ctx->node_local_storage->get_rwlock("ipv6_table"); - - /* Get GPU device pointers from the node-local storage. */ - d_tables = ((memory_t **) ctx->node_local_storage->get_alloc("dev_tables"))[0]; - d_table_sizes = ((memory_t **) ctx->node_local_storage->get_alloc("dev_table_sizes"))[0]; + Element::configure(ctx, args); + num_tx_ports = ctx->num_tx_ports; + num_nodes = ctx->num_nodes; + node_idx = ctx->loc.node_id; + rr_port = 0; return 0; } @@ -72,7 +68,6 @@ int LookupIP6Route::initialize_global() int count = 200000; _original_table.from_random(seed, count); _original_table.build(); - return 0; } @@ -93,14 +88,17 @@ int LookupIP6Route::initialize_per_node() return 0; } -int LookupIP6Route::configure(comp_thread_context *ctx, std::vector &args) +int LookupIP6Route::initialize() { - Element::configure(ctx, args); - num_tx_ports = ctx->num_tx_ports; - num_nodes = ctx->num_nodes; - node_idx = ctx->loc.node_id; - rr_port = 0; + // Called after coproc threads are initialized. + + /* Get routing table pointers from the node-local storage. */ + _table_ptr = (RoutingTableV6*)ctx->node_local_storage->get_alloc("ipv6_table"); + _rwlock_ptr = ctx->node_local_storage->get_rwlock("ipv6_table"); + /* Get GPU device pointers from the node-local storage. */ + d_tables = (dev_mem_t *) ctx->node_local_storage->get_alloc("dev_tables"); + d_table_sizes = (dev_mem_t *) ctx->node_local_storage->get_alloc("dev_table_sizes"); return 0; } @@ -128,7 +126,13 @@ int LookupIP6Route::process(int input_port, Packet *pkt) return 0; } - rr_port = (rr_port + 1) % num_tx_ports; + #ifdef NBA_IPFWD_RR_NODE_LOCAL + unsigned iface_in = anno_get(&pkt->anno, NBA_ANNO_IFACE_IN); + unsigned n = (iface_in <= ((unsigned) num_tx_ports / 2) - 1) ? 
0 : (num_tx_ports / 2); + rr_port = (rr_port + 1) % (num_tx_ports / 2) + n; + #else + rr_port = (rr_port + 1) % (num_tx_ports); + #endif anno_set(&pkt->anno, NBA_ANNO_IFACE_OUT, rr_port); output(0).push(pkt); return 0; @@ -142,7 +146,13 @@ int LookupIP6Route::postproc(int input_port, void *custom_output, Packet *pkt) pkt->kill(); return 0; } - rr_port = (rr_port + 1) % num_tx_ports; + #ifdef NBA_IPFWD_RR_NODE_LOCAL + unsigned iface_in = anno_get(&pkt->anno, NBA_ANNO_IFACE_IN); + unsigned n = (iface_in <= ((unsigned) num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2); + rr_port = (rr_port + 1) % (num_tx_ports / 2) + n; + #else + rr_port = (rr_port + 1) % (num_tx_ports); + #endif anno_set(&pkt->anno, NBA_ANNO_IFACE_OUT, rr_port); output(0).push(pkt); return 0; @@ -165,11 +175,11 @@ size_t LookupIP6Route::get_desired_workgroup_size(const char *device_name) const void LookupIP6Route::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res) { struct kernel_arg arg; - arg = {(void *) &d_tables, sizeof(void *), alignof(void *)}; + arg = {(void *) &d_tables->ptr, sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); - arg = {(void *) &d_table_sizes, sizeof(void *), alignof(void *)}; + arg = {(void *) &d_table_sizes->ptr, sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); - kernel_t kern; + dev_kernel_t kern; kern.ptr = ipv6_route_lookup_get_cuda_kernel(); cctx->enqueue_kernel_launch(kern, res); } @@ -177,27 +187,27 @@ void LookupIP6Route::cuda_compute_handler(ComputeContext *cctx, struct resource_ void LookupIP6Route::cuda_init_handler(ComputeDevice *device) { size_t table_sizes[128]; - memory_t table_ptrs_in_d[128]; + void *table_ptrs_in_d[128]; - memory_t new_d_tables = /*(Item **)*/ device->alloc_device_buffer(sizeof(memory_t *)*128, HOST_TO_DEVICE); - memory_t new_d_table_sizes = /*(size_t *)*/ device->alloc_device_buffer(sizeof(size_t)*128, HOST_TO_DEVICE); + /* Store the device pointers for per-thread instances. */ + d_tables = (dev_mem_t *) ctx->node_local_storage->get_alloc("dev_tables"); + d_table_sizes = (dev_mem_t *) ctx->node_local_storage->get_alloc("dev_table_sizes"); + *d_tables = device->alloc_device_buffer(sizeof(void *) * 128); + *d_table_sizes = device->alloc_device_buffer(sizeof(size_t) * 128); /* table_ptrs_in_d keeps track of the temporary host-side references to tables in * the device for initialization and copy. * d_tables is the actual device buffer to store pointers in table_ptrs_in_d. */ for (int i = 0; i < 128; i++) { table_sizes[i] = _original_table.m_Tables[i]->m_TableSize; - table_ptrs_in_d[i] = /*(Item *)*/ device->alloc_device_buffer(sizeof(Item) * table_sizes[i] * 2, HOST_TO_DEVICE); - device->memwrite(_original_table.m_Tables[i]->m_Table, table_ptrs_in_d[i], 0, sizeof(Item) * table_sizes[i] * 2); + size_t copy_size = sizeof(Item) * table_sizes[i] * 2; + table_ptrs_in_d[i] = device->alloc_device_buffer(copy_size).ptr; + device->memwrite({ _original_table.m_Tables[i]->m_Table }, {table_ptrs_in_d[i]}, + 0, copy_size); } - device->memwrite(table_ptrs_in_d, new_d_tables, 0, sizeof(Item*) * 128); - device->memwrite(table_sizes, new_d_table_sizes, 0, sizeof(size_t) * 128); + device->memwrite({ table_ptrs_in_d }, *d_tables, 0, sizeof(void *) * 128); + device->memwrite({ table_sizes }, *d_table_sizes, 0, sizeof(size_t) * 128); - /* Store the device pointers for per-thread instances. 
*/
-    d_tables = (memory_t *) ctx->node_local_storage->get_alloc("dev_tables");
-    d_table_sizes = (memory_t *) ctx->node_local_storage->get_alloc("dev_table_sizes");
-    (d_tables)[0] = new_d_tables;
-    (d_table_sizes)[0] = new_d_table_sizes;
 }
 #endif
diff --git a/elements/ipv6/LookupIP6Route.hh b/elements/ipv6/LookupIP6Route.hh
index 6fd1aa3..c7079e3 100644
--- a/elements/ipv6/LookupIP6Route.hh
+++ b/elements/ipv6/LookupIP6Route.hh
@@ -63,8 +63,8 @@ private:
     rte_rwlock_t *_rwlock_ptr;
 
     /* For offloaded methods */
-    memory_t *d_tables;
-    memory_t *d_table_sizes;
+    dev_mem_t *d_tables;
+    dev_mem_t *d_table_sizes;
 };
 
 EXPORT_ELEMENT(LookupIP6Route);
diff --git a/elements/ipv6/LookupIP6Route_kernel.cu b/elements/ipv6/LookupIP6Route_kernel.cu
index 098932c..be47719 100644
--- a/elements/ipv6/LookupIP6Route_kernel.cu
+++ b/elements/ipv6/LookupIP6Route_kernel.cu
@@ -144,7 +144,7 @@ __device__ uint64_t ntohll(uint64_t val)
 
 __global__ void ipv6_route_lookup_cuda(
         struct datablock_kernel_arg **datablocks,
-        uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
+        uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
         uint8_t *checkbits_d,
         Item** __restrict__ tables_d,
         size_t* __restrict__ table_sizes_d)
@@ -152,7 +152,7 @@
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < count) {
-        uint16_t batch_idx = batch_ids[idx];
+        uint8_t batch_idx = batch_ids[idx];
         uint16_t item_idx = item_ids[idx];
         struct datablock_kernel_arg *db_dest_addrs = datablocks[dbid_ipv6_dest_addrs_d];
         struct datablock_kernel_arg *db_results = datablocks[dbid_ipv6_lookup_results_d];
diff --git a/include/nba/core/mempool.hh b/include/nba/core/mempool.hh
index 196f97d..6c1cb5a 100644
--- a/include/nba/core/mempool.hh
+++ b/include/nba/core/mempool.hh
@@ -13,44 +13,55 @@ namespace nba
  * This abstract memory pool class provides a bump allocator
  * in the memory region defined by its subclasses.
  */
+template <typename T>
 class MemoryPool
 {
 public:
-    MemoryPool() : max_size(0), cur_pos(0)
-    {}
+    MemoryPool()
+        : max_size(0), align(CACHE_LINE_SIZE), cur_pos(0)
+    { }
 
-    virtual ~MemoryPool() {}
+    MemoryPool(size_t max_size)
+        : max_size(max_size), align(CACHE_LINE_SIZE), cur_pos(0)
+    { }
 
-    virtual bool init(size_t max_size) = 0;
+    MemoryPool(size_t max_size, size_t align)
+        : max_size(max_size), align(align), cur_pos(0)
+    { }
+
+    virtual ~MemoryPool() { }
+
+    size_t get_alloc_size() const { return cur_pos; }
+
+    virtual bool init() = 0;
+
+    virtual T get_base_ptr() const = 0;
+
+    // Device implementers should provide their own alloc() method,
+    // using the inherited _alloc() method, which computes the next
+    // offset according to the bump-allocator strategy.
+    virtual int alloc(size_t size, T& ptr) = 0;
+
+    virtual void destroy() = 0;
+
+    // We implement a bump allocator.
+    void reset() { cur_pos = 0; }
+
+protected:
     int _alloc(size_t size, size_t *start_offset)
    {
-        if (cur_pos + size > max_size)
+        if (ALIGN_CEIL(cur_pos + size, align) > max_size)
             return -ENOMEM;
         /* IMPORTANT: We need to return the position before adding the new size. */
         if (start_offset != nullptr)
             *start_offset = cur_pos;
         cur_pos += size;
-        cur_pos = ALIGN_CEIL(cur_pos, CACHE_LINE_SIZE);
+        cur_pos = ALIGN_CEIL(cur_pos, align);
         return 0;
     }
 
-    // The device implementer's should provide his own alloc() method. 
- - void reset() - { - cur_pos = 0; - } - - size_t get_alloc_size() const - { - return cur_pos; - } - - virtual void *get_base_ptr() const = 0; - -protected: - size_t max_size; + const size_t max_size; + const size_t align; size_t cur_pos; }; diff --git a/include/nba/core/offloadtypes.hh b/include/nba/core/offloadtypes.hh index 466dc93..6281b2d 100644 --- a/include/nba/core/offloadtypes.hh +++ b/include/nba/core/offloadtypes.hh @@ -11,21 +11,25 @@ #endif /* Common object types */ -typedef union memobj { +typedef union { void *ptr; #ifdef USE_PHI cl_mem clmem; #endif -} memory_t; +} dev_mem_t; -typedef union kernelobj { +typedef union { + void *ptr; +} host_mem_t; + +typedef union { void *ptr; #ifdef USE_PHI cl_kernel clkernel; #endif -} kernel_t; +} dev_kernel_t; -typedef union eventobj { +typedef union { void *ptr; #ifdef USE_CUDA cudaEvent_t cuev; @@ -33,7 +37,7 @@ typedef union eventobj { #ifdef USE_PHI cl_event clev; #endif -} event_t; +} dev_event_t; struct resource_param { uint32_t num_workitems; @@ -47,9 +51,10 @@ struct kernel_arg { size_t align; }; -enum io_direction_hint { - HOST_TO_DEVICE = 0, - DEVICE_TO_HOST = 1, +enum io_direction_hint : int { + AGNOSTIC = 0, + HOST_TO_DEVICE = 1, + DEVICE_TO_HOST = 2, }; diff --git a/include/nba/element/element.hh b/include/nba/element/element.hh index 3b05b5d..66c49b3 100644 --- a/include/nba/element/element.hh +++ b/include/nba/element/element.hh @@ -243,7 +243,7 @@ public: /** Returns the list of supported devices for offloading. */ virtual void get_supported_devices(std::vector &device_names) const = 0; - //virtual size_t get_desired_workgroup_size(const char *device_name) const = 0; + virtual size_t get_desired_workgroup_size(const char *device_name) const = 0; virtual int get_offload_item_counter_dbid() const = 0; virtual size_t get_used_datablocks(int *datablock_ids) = 0; //virtual void get_datablocks(std::vector &datablock_ids){get_used_datablocks(datablock_ids);}; //TODO fill here... 
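The templated pool above is the core of this refactor: every engine-specific pool (CUDA, dummy, OpenCL) now shares the same bump-allocator bookkeeping and only supplies the backing store and the handle type T (dev_mem_t, host_mem_t, cl_mem, or a raw pointer). A minimal malloc-backed specialization illustrates the contract; this is an illustrative sketch only, not part of this patch, and the MallocPool name is made up (the new tests/test_core_mempool.cc carries the actual unit tests):

    #include <nba/core/mempool.hh>
    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    // Hypothetical host-side pool: T = void *, backing store = malloc().
    class MallocPool : public nba::MemoryPool<void *> {
    public:
        explicit MallocPool(size_t max_size)
            : MemoryPool(max_size), base(nullptr) { }
        virtual ~MallocPool() { destroy(); }
        bool init() { base = malloc(max_size); return base != nullptr; }
        void *get_base_ptr() const { return base; }
        int alloc(size_t size, void *&ptr) {
            size_t offset;
            int ret = _alloc(size, &offset);  // bump cur_pos, return the old offset
            if (ret == 0)
                ptr = (void *) ((uintptr_t) base + offset);
            return ret;  // 0 on success, -ENOMEM when the region is exhausted
        }
        void destroy() { free(base); base = nullptr; }
    private:
        void *base;
    };

    int main() {
        MallocPool pool(1 << 20);   // 1 MiB region, CACHE_LINE_SIZE alignment
        bool ok = pool.init();
        assert(ok);
        void *p = nullptr;
        assert(0 == pool.alloc(128, p));
        assert(pool.get_alloc_size() >= 128);  // cur_pos rounds up to the alignment
        pool.reset();   // O(1) bulk reclamation of all allocations at once
        return 0;
    }

Individual allocations are never freed; the per-io_base pools in the compute contexts below are recycled wholesale via reset() when clear_io_buffers() runs after an offload task completes.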
diff --git a/include/nba/engines/cuda/computecontext.hh b/include/nba/engines/cuda/computecontext.hh index 7474d0c..6b180e3 100644 --- a/include/nba/engines/cuda/computecontext.hh +++ b/include/nba/engines/cuda/computecontext.hh @@ -1,10 +1,7 @@ #ifndef __NBA_CUDA_COMPUTECTX_HH__ #define __NBA_CUDA_COMPUTECTX_HH__ -#include - #include -#include #include #include #include @@ -30,10 +27,16 @@ public: virtual ~CUDAComputeContext(); io_base_t alloc_io_base(); - int alloc_input_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem); - int alloc_output_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem); - void get_input_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) const; - void get_output_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) const; + int alloc_input_buffer(io_base_t io_base, size_t size, + host_mem_t &host_ptr, dev_mem_t &dev_ptr); + int alloc_output_buffer(io_base_t io_base, size_t size, + host_mem_t &host_ptr, dev_mem_t &dev_ptr); + void map_input_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const; + void map_output_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const; + void *unwrap_host_buffer(const host_mem_t hbuf) const; + void *unwrap_device_buffer(const dev_mem_t dbuf) const; size_t get_input_size(io_base_t io_base) const; size_t get_output_size(io_base_t io_base) const; void clear_io_buffers(io_base_t io_base); @@ -41,15 +44,13 @@ public: void clear_kernel_args(); void push_kernel_arg(struct kernel_arg &arg); - int enqueue_memwrite_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size); - int enqueue_memread_op(void* host_buf, memory_t dev_buf, size_t offset, size_t size); - int enqueue_kernel_launch(kernel_t kernel, struct resource_param *res); - int enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), void *user_arg); - - cudaStream_t get_stream() - { - return _stream; - } + int enqueue_memwrite_op(const host_mem_t host_buf, const dev_mem_t dev_buf, + size_t offset, size_t size); + int enqueue_memread_op(const host_mem_t host_buf, const dev_mem_t dev_buf, + size_t offset, size_t size); + int enqueue_kernel_launch(dev_kernel_t kernel, struct resource_param *res); + int enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), + void *user_arg); void sync() { @@ -89,16 +90,16 @@ private: uint8_t *checkbits_d; uint8_t *checkbits_h; cudaStream_t _stream; - CUDAMemoryPool _cuda_mempool_in[NBA_MAX_IO_BASES]; - CUDAMemoryPool _cuda_mempool_out[NBA_MAX_IO_BASES]; - CPUMemoryPool _cpu_mempool_in[NBA_MAX_IO_BASES]; - CPUMemoryPool _cpu_mempool_out[NBA_MAX_IO_BASES]; + CUDAMemoryPool *_cuda_mempool_in[NBA_MAX_IO_BASES]; + CUDAMemoryPool *_cuda_mempool_out[NBA_MAX_IO_BASES]; + CPUMemoryPool *_cpu_mempool_in[NBA_MAX_IO_BASES]; + CPUMemoryPool *_cpu_mempool_out[NBA_MAX_IO_BASES]; const struct rte_memzone *reserve_memory(ComputeDevice *mother); const struct rte_memzone *mz; - void *dummy_host_buf; - memory_t dummy_dev_buf; + host_mem_t dummy_host_buf; + dev_mem_t dummy_dev_buf; size_t num_kernel_args; struct kernel_arg kernel_args[CUDA_MAX_KERNEL_ARGS]; diff --git a/include/nba/engines/cuda/computedevice.hh b/include/nba/engines/cuda/computedevice.hh index 9852e2d..0c44485 100644 --- a/include/nba/engines/cuda/computedevice.hh +++ b/include/nba/engines/cuda/computedevice.hh @@ -25,12 +25,12 @@ public: int get_spec(struct compute_device_spec *spec); int 
get_utilization(struct compute_device_util *util); - void *alloc_host_buffer(size_t size, int flags); - memory_t alloc_device_buffer(size_t size, int flags); - void free_host_buffer(void *ptr); - void free_device_buffer(memory_t ptr); - void memwrite(void *host_buf, memory_t dev_buf, size_t offset, size_t size); - void memread(void *host_buf, memory_t dev_buf, size_t offset, size_t size); + host_mem_t alloc_host_buffer(size_t size, int flags); + dev_mem_t alloc_device_buffer(size_t size, int flags); + void free_host_buffer(host_mem_t ptr); + void free_device_buffer(dev_mem_t ptr); + void memwrite(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size); + void memread(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size); private: ComputeContext *_get_available_context(); diff --git a/include/nba/engines/cuda/mempool.hh b/include/nba/engines/cuda/mempool.hh index aa8d4ac..888082e 100644 --- a/include/nba/engines/cuda/mempool.hh +++ b/include/nba/engines/cuda/mempool.hh @@ -3,38 +3,51 @@ #include #include +#include #include #include #include namespace nba { -class CUDAMemoryPool : public MemoryPool +class CUDAMemoryPool : public MemoryPool { public: - CUDAMemoryPool() : MemoryPool(), base(NULL) - { - } + CUDAMemoryPool() + : MemoryPool(), base(nullptr) + { } + + CUDAMemoryPool(size_t max_size) + : MemoryPool(max_size), base(nullptr) + { } + + CUDAMemoryPool(size_t max_size, size_t align) + : MemoryPool(max_size, align), base(nullptr) + { } virtual ~CUDAMemoryPool() { destroy(); } - virtual bool init(size_t max_size) + bool init() { - this->max_size = max_size; cutilSafeCall(cudaMalloc((void **) &base, max_size)); return true; } - void *alloc(size_t size) + dev_mem_t get_base_ptr() const + { + return { base }; + } + + int alloc(size_t size, dev_mem_t &ptr) { size_t offset; int ret = _alloc(size, &offset); if (ret == 0) - return (void *) ((uintptr_t) base + offset); - return NULL; + ptr.ptr = (void *) ((uintptr_t) base + offset); + return ret; } void destroy() @@ -43,55 +56,56 @@ public: cudaFree(base); } - void *get_base_ptr() const - { - return base; - } - private: void *base; }; -class CPUMemoryPool : public MemoryPool +class CPUMemoryPool : public MemoryPool { public: - CPUMemoryPool(int cuda_flags = 0) : MemoryPool(), base(NULL), flags(cuda_flags), use_external(false) - { - } + CPUMemoryPool(int cuda_flags = 0) + : MemoryPool(), base(nullptr), flags(cuda_flags), use_external(false) + { } + + CPUMemoryPool(size_t max_size, int cuda_flags = 0) + : MemoryPool(max_size), base(nullptr), flags(cuda_flags), use_external(false) + { } + + CPUMemoryPool(size_t max_size, size_t align, int cuda_flags = 0) + : MemoryPool(max_size, align), base(nullptr), flags(cuda_flags), use_external(false) + { } virtual ~CPUMemoryPool() { destroy(); } - virtual bool init(unsigned long size) + bool init() { - this->max_size = size; - cutilSafeCall(cudaHostAlloc((void **) &base, size, + cutilSafeCall(cudaHostAlloc((void **) &base, max_size, this->flags)); return true; } - bool init_with_flags(unsigned long size, void *ext_ptr, int flags) + bool init_with_flags(void *ext_ptr, int flags) { - this->max_size = size; if (ext_ptr != nullptr) { base = ext_ptr; use_external = true; } else { - cutilSafeCall(cudaHostAlloc((void **) &base, size, + cutilSafeCall(cudaHostAlloc((void **) &base, max_size, flags)); } return true; } - void *alloc(size_t size) + int alloc(size_t size, host_mem_t &m) { size_t offset; int ret = _alloc(size, &offset); if (ret == 0) - return (void *) ((uintptr_t) base + offset); - 
return NULL; + m.ptr = (void *) ((uintptr_t) base + offset); + return ret; } void destroy() @@ -100,12 +114,12 @@ public: cudaFreeHost(base); } - void *get_base_ptr() const + host_mem_t get_base_ptr() const { - return base; + return { base }; } -protected: +private: void *base; int flags; bool use_external; diff --git a/include/nba/engines/dummy/computecontext.hh b/include/nba/engines/dummy/computecontext.hh index b091ddc..a6b599f 100644 --- a/include/nba/engines/dummy/computecontext.hh +++ b/include/nba/engines/dummy/computecontext.hh @@ -1,10 +1,7 @@ #ifndef __NBA_DUMMY_COMPUTECTX_HH__ #define __NBA_DUMMY_COMPUTECTX_HH__ -#include - #include -#include #include #include #include @@ -24,10 +21,16 @@ public: virtual ~DummyComputeContext(); io_base_t alloc_io_base(); - int alloc_input_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem); - int alloc_output_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem); - void get_input_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) const; - void get_output_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) const; + int alloc_input_buffer(io_base_t io_base, size_t size, + host_mem_t &host_ptr, dev_mem_t &dev_ptr); + int alloc_output_buffer(io_base_t io_base, size_t size, + host_mem_t &host_ptr, dev_mem_t &dev_ptr); + void map_input_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const; + void map_output_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const; + void *unwrap_host_buffer(const host_mem_t hbuf) const; + void *unwrap_device_buffer(const dev_mem_t dbuf) const; size_t get_input_size(io_base_t io_base) const; size_t get_output_size(io_base_t io_base) const; void clear_io_buffers(io_base_t io_base); @@ -35,10 +38,13 @@ public: void clear_kernel_args() { } void push_kernel_arg(struct kernel_arg &arg) { } - int enqueue_memwrite_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size); - int enqueue_memread_op(void* host_buf, memory_t dev_buf, size_t offset, size_t size); - int enqueue_kernel_launch(kernel_t kernel, struct resource_param *res); - int enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), void *user_arg); + int enqueue_memwrite_op(const host_mem_t host_buf, const dev_mem_t dev_buf, + size_t offset, size_t size); + int enqueue_memread_op(const host_mem_t host_buf, const dev_mem_t dev_buf, + size_t offset, size_t size); + int enqueue_kernel_launch(dev_kernel_t kernel, struct resource_param *res); + int enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), + void *user_arg); void sync() { @@ -66,10 +72,10 @@ public: } private: - DummyCPUMemoryPool _dev_mempool_in[NBA_MAX_IO_BASES]; - DummyCPUMemoryPool _dev_mempool_out[NBA_MAX_IO_BASES]; - DummyCPUMemoryPool _cpu_mempool_in[NBA_MAX_IO_BASES]; - DummyCPUMemoryPool _cpu_mempool_out[NBA_MAX_IO_BASES]; + DummyCPUMemoryPool *_dev_mempool_in[NBA_MAX_IO_BASES]; + DummyCPUMemoryPool *_dev_mempool_out[NBA_MAX_IO_BASES]; + DummyCPUMemoryPool *_cpu_mempool_in[NBA_MAX_IO_BASES]; + DummyCPUMemoryPool *_cpu_mempool_out[NBA_MAX_IO_BASES]; FixedRing *io_base_ring; }; diff --git a/include/nba/engines/dummy/computedevice.hh b/include/nba/engines/dummy/computedevice.hh index ba112e3..d4923ac 100644 --- a/include/nba/engines/dummy/computedevice.hh +++ b/include/nba/engines/dummy/computedevice.hh @@ -22,12 +22,12 @@ public: int get_spec(struct compute_device_spec *spec); int 
get_utilization(struct compute_device_util *util); - void *alloc_host_buffer(size_t size, int flags); - memory_t alloc_device_buffer(size_t size, int flags); - void free_host_buffer(void *ptr); - void free_device_buffer(memory_t ptr); - void memwrite(void *host_buf, memory_t dev_buf, size_t offset, size_t size); - void memread(void *host_buf, memory_t dev_buf, size_t offset, size_t size); + host_mem_t alloc_host_buffer(size_t size, int flags); + dev_mem_t alloc_device_buffer(size_t size, int flags); + void free_host_buffer(host_mem_t ptr); + void free_device_buffer(dev_mem_t ptr); + void memwrite(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size); + void memread(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size); private: ComputeContext *_get_available_context(); diff --git a/include/nba/engines/dummy/mempool.hh b/include/nba/engines/dummy/mempool.hh index 1d12093..5344f09 100644 --- a/include/nba/engines/dummy/mempool.hh +++ b/include/nba/engines/dummy/mempool.hh @@ -7,39 +7,46 @@ namespace nba { -class DummyCPUMemoryPool : public MemoryPool +class DummyCPUMemoryPool : public MemoryPool { public: - DummyCPUMemoryPool() : MemoryPool(), base(NULL) - { - } + DummyCPUMemoryPool() + : MemoryPool(), base(nullptr) + { } + + DummyCPUMemoryPool(size_t max_size) + : MemoryPool(max_size), base(nullptr) + { } + + DummyCPUMemoryPool(size_t max_size, size_t align) + : MemoryPool(max_size, align), base(nullptr) + { } virtual ~DummyCPUMemoryPool() { destroy(); } - virtual bool init(unsigned long size) + bool init() { - this->max_size = size; - base = malloc(size); + base = malloc(max_size); return true; } - void *alloc(size_t size) + int alloc(size_t size, void *&ptr) { size_t offset; int ret = _alloc(size, &offset); if (ret == 0) - return (void *) ((uint8_t *) base + offset); - return NULL; + ptr = (void *) ((uint8_t *) base + offset); + return ret; } void destroy() { - if (base != NULL) { + if (base != nullptr) { free(base); - base = NULL; + base = nullptr; } } @@ -48,7 +55,7 @@ public: return base; } -protected: +private: void *base; }; diff --git a/include/nba/engines/phi/computecontext.hh b/include/nba/engines/phi/computecontext.hh index 2edbe9a..bbd8ff3 100644 --- a/include/nba/engines/phi/computecontext.hh +++ b/include/nba/engines/phi/computecontext.hh @@ -41,16 +41,6 @@ public: int enqueue_kernel_launch(kernel_t kernel, struct resource_param *res); int enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), void *user_arg); - void *get_stream() - { - // TODO: implement - return NULL; - } - //cudaStream_t get_stream() - //{ - // return _stream; - //} - void sync() { clFinish(clqueue); @@ -94,10 +84,8 @@ private: cl_command_queue clqueue; cl_event clev; cl_event clev_marker; - PhiMemoryPool *dev_mempool_in; - PhiMemoryPool *dev_mempool_out; - CPUMemoryPool *cpu_mempool_in; - CPUMemoryPool *cpu_mempool_out; + CLMemoryPool *_mempool_in[NBA_MAX_IO_BASES]; + CLMemoryPool *_mempool_out[NBA_MAX_IO_BASES]; size_t num_kernel_args; struct kernel_arg kernel_args[CUDA_MAX_KERNEL_ARGS]; diff --git a/include/nba/engines/phi/mempool.hh b/include/nba/engines/phi/mempool.hh index fe2f505..990fb34 100644 --- a/include/nba/engines/phi/mempool.hh +++ b/include/nba/engines/phi/mempool.hh @@ -1,5 +1,5 @@ -#ifndef __NBA_PHI_MEMPOOL_HH__ -#define __NBA_PHI_MEMPOOL_HH__ +#ifndef __NBA_CL_MEMPOOL_HH__ +#define __NBA_CL_MEMPOOL_HH__ #include #include @@ -10,108 +10,65 @@ namespace nba { -class PhiMemoryPool : public MemoryPool +class CLMemoryPool : public MemoryPool { 
public:
-    PhiMemoryPool(cl_context clctx, cl_command_queue clqueue, int direction_hint)
-        : MemoryPool(), clctx(clctx), clqueue(clqueue), direction_hint(direction_hint)
-    {
-    }
+    CLMemoryPool(size_t max_size, size_t align, cl_context clctx, cl_command_queue clqueue, int direction_hint)
+        : MemoryPool(max_size, align), clctx(clctx), clqueue(clqueue), direction_hint(direction_hint)
+    { }
 
-    virtual ~PhiMemoryPool()
+    virtual ~CLMemoryPool()
     {
         destroy();
     }
 
-    virtual bool init(unsigned long max_size)
+    bool init()
     {
-        this->max_size = max_size;
         cl_int err_ret;
-        clbuf = clCreateBuffer(clctx, CL_MEM_HOST_NO_ACCESS |
-                (direction_hint == HOST_TO_DEVICE ? CL_MEM_READ_ONLY : CL_MEM_WRITE_ONLY),
-                max_size, NULL, &err_ret);
+        // Let the OpenCL runtime allocate both the host-side and the
+        // device-side buffer using its own optimized flags.
+        base_buf = clCreateBuffer(clctx, CL_MEM_ALLOC_HOST_PTR |
+                (direction_hint == HOST_TO_DEVICE
+                 ? (CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY)
+                 : (CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY)),
+                max_size, nullptr, &err_ret);
         if (err_ret != CL_SUCCESS)
             return false;
         return true;
     }
 
-    memory_t alloc(size_t size)
+    cl_mem get_base_ptr() const
+    {
+        return base_buf;
+    }
+
+    int alloc(size_t size, cl_mem &subbuf)
     {
-        memory_t ret;
         size_t offset;
-        int r =_alloc(size, &offset);
-        if (r == 0) {
+        int ret = _alloc(size, &offset);
+        if (ret == 0) {
             cl_buffer_region region = { offset, size };
-            cl_int err_ret;
-            // Create a sub-buffer inheriting all flags from clbuf.
-            ret.clmem = clCreateSubBuffer(clbuf, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err_ret);
-            if (err_ret != CL_SUCCESS) {
-                fprintf(stderr, "clCreateSubBuffer() failed!\n");
-                ret.ptr = NULL;
+            cl_int err;
+            // Create a sub-buffer inheriting all flags from base_buf.
+            subbuf = clCreateSubBuffer(base_buf, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+            if (err != CL_SUCCESS) {
+                ret = -ENOMEM;
             }
-            return ret;
         }
-        ret.ptr = NULL;
         return ret;
     }
 
-    virtual void destroy()
-    {
-        clReleaseMemObject(clbuf);
-    }
-
-    void *get_base_ptr() const
+    void destroy()
     {
-        // TODO: convert clbuf to void*
-        assert(false, "not implemented yet");
-        return nullptr;
+        // TODO: on reset(), release sub-buffer objects as well.
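+        // (Sub-buffers created by alloc() keep a reference to base_buf,
+        //  so OpenCL does not actually free the parent buffer until all
+        //  of them have been released as well.)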
+ clReleaseMemObject(base_buf); } private: cl_context clctx; cl_command_queue clqueue; int direction_hint; - cl_mem clbuf; -}; - -class CPUMemoryPool : public MemoryPool -{ -public: - virtual ~CPUMemoryPool() - { - destroy(); - } - - virtual bool init(size_t max_size) - { - void *ret = NULL; - this->max_size = max_size; - base = (uint8_t*) malloc(max_size); - return ret; - } - - void *alloc(size_t size) - { - size_t offset; - int ret = _alloc(size, &offset); - if (ret == 0) - return (void *) ((uint8_t *) base + offset); - return NULL; - } - - virtual void destroy() - { - if (base) - free(base); - } - - void *get_base_ptr() const - { - return base; - } - -private: - void *base; + cl_mem base_buf; }; } diff --git a/include/nba/framework/computecontext.hh b/include/nba/framework/computecontext.hh index b08b59f..5a6329c 100644 --- a/include/nba/framework/computecontext.hh +++ b/include/nba/framework/computecontext.hh @@ -36,13 +36,15 @@ public: virtual io_base_t alloc_io_base() = 0; virtual int alloc_input_buffer(io_base_t io_base, size_t size, - void **host_ptr, memory_t *dev_mem) = 0; + host_mem_t &hbuf, dev_mem_t &dbuf) = 0; virtual int alloc_output_buffer(io_base_t io_base, size_t size, - void **host_ptr, memory_t *dev_mem) = 0; - virtual void get_input_current_pos(io_base_t io_base, - void **host_ptr, memory_t *dev_mem) const = 0; - virtual void get_output_current_pos(io_base_t io_base, - void **host_ptr, memory_t *dev_mem) const = 0; + host_mem_t &hbuf, dev_mem_t &dbuf) = 0; + virtual void map_input_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const = 0; + virtual void map_output_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const = 0; + virtual void *unwrap_host_buffer(const host_mem_t hbuf) const = 0; + virtual void *unwrap_device_buffer(const dev_mem_t dbuf) const = 0; virtual size_t get_input_size(io_base_t io_base) const = 0; virtual size_t get_output_size(io_base_t io_base) const = 0; virtual void clear_io_buffers(io_base_t io_base) = 0; @@ -50,11 +52,15 @@ public: virtual void clear_kernel_args() = 0; virtual void push_kernel_arg(struct kernel_arg &arg) = 0; - /* All functions must be implemented as non-blocking. */ - virtual int enqueue_memwrite_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) = 0; - virtual int enqueue_memread_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) = 0; - virtual int enqueue_kernel_launch(kernel_t kernel, struct resource_param *res) = 0; - virtual int enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), void *user_arg) = 0; + /* All methods below must be implemented using non-blocking APIs provided + * by device runtimes. 
*/ + virtual int enqueue_memwrite_op(const host_mem_t host_buf, const dev_mem_t dev_buf, + size_t offset, size_t size) = 0; + virtual int enqueue_memread_op(const host_mem_t host_buf, const dev_mem_t dev_buf, + size_t offset, size_t size) = 0; + virtual int enqueue_kernel_launch(dev_kernel_t kernel, struct resource_param *res) = 0; + virtual int enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), + void *user_arg) = 0; virtual uint8_t *get_device_checkbits() = 0; virtual uint8_t *get_host_checkbits() = 0; diff --git a/include/nba/framework/computedevice.hh b/include/nba/framework/computedevice.hh index b174704..d43ef79 100644 --- a/include/nba/framework/computedevice.hh +++ b/include/nba/framework/computedevice.hh @@ -91,14 +91,16 @@ public: return contexts; } - virtual void *alloc_host_buffer(size_t size, int flags) = 0; - virtual memory_t alloc_device_buffer(size_t size, int flags) = 0; - virtual void free_host_buffer(void *ptr) = 0; - virtual void free_device_buffer(memory_t ptr) = 0; + virtual host_mem_t alloc_host_buffer(size_t size, int flags) = 0; + virtual dev_mem_t alloc_device_buffer(size_t size, int flags = AGNOSTIC) = 0; + virtual void free_host_buffer(host_mem_t ptr) = 0; + virtual void free_device_buffer(dev_mem_t ptr) = 0; /* Synchronous versions */ - virtual void memwrite(void *host_buf, memory_t dev_buf, size_t offset, size_t size) = 0; - virtual void memread(void *host_buf, memory_t dev_buf, size_t offset, size_t size) = 0; + virtual void memwrite(host_mem_t host_buf, dev_mem_t dev_buf, + size_t offset, size_t size) = 0; + virtual void memread(host_mem_t host_buf, dev_mem_t dev_buf, + size_t offset, size_t size) = 0; std::string type_name; struct ev_async *input_watcher; diff --git a/include/nba/framework/config.hh b/include/nba/framework/config.hh index f64b0c3..a68c435 100644 --- a/include/nba/framework/config.hh +++ b/include/nba/framework/config.hh @@ -57,6 +57,8 @@ #define NBA_OQ (true) // Use output-queuing semantics when possible. #undef NBA_CPU_MICROBENCH // Enable support for PAPI library for microbenchmarks. +#undef NBA_IPFWD_RR_NODE_LOCAL + #define NBA_REUSE_DATABLOCKS /* If you change below, update HANDLE_ALL_PORTS macro in lib/element.hh as well!! */ diff --git a/include/nba/framework/datablock.hh b/include/nba/framework/datablock.hh index c36a474..6d24e92 100644 --- a/include/nba/framework/datablock.hh +++ b/include/nba/framework/datablock.hh @@ -100,16 +100,17 @@ struct item_size_info { * array. 
*/ struct datablock_tracker { - void *host_in_ptr; - memory_t dev_in_ptr; - void *host_out_ptr; - memory_t dev_out_ptr; + host_mem_t host_in_ptr; + dev_mem_t dev_in_ptr; + host_mem_t host_out_ptr; + dev_mem_t dev_out_ptr; size_t in_size; size_t in_count; size_t out_size; size_t out_count; - struct item_size_info *aligned_item_sizes_h; - memory_t aligned_item_sizes_d; + struct item_size_info *aligned_item_sizes; + host_mem_t aligned_item_sizes_h; + dev_mem_t aligned_item_sizes_d; }; /** diff --git a/include/nba/framework/offloadtask.hh b/include/nba/framework/offloadtask.hh index 021d3e3..47438a0 100644 --- a/include/nba/framework/offloadtask.hh +++ b/include/nba/framework/offloadtask.hh @@ -79,8 +79,8 @@ public: OffloadableElement* elem; int dbid_h2d[NBA_MAX_DATABLOCKS]; - struct datablock_kernel_arg **dbarray_h; - memory_t dbarray_d; + host_mem_t dbarray_h; + dev_mem_t dbarray_d; struct ev_async *completion_watcher __cache_aligned; struct rte_ring *completion_queue __cache_aligned; @@ -89,15 +89,11 @@ public: private: friend class OffloadableElement; - void *host_write_begin; - void *host_read_begin; - memory_t dev_write_begin; - memory_t dev_read_begin; - size_t input_alloc_size_begin; - size_t output_alloc_size_begin; + size_t input_begin; + size_t output_begin; - size_t last_input_size; - size_t last_output_size; + size_t last_input_size; // for debugging + size_t last_output_size; // for debugging }; } diff --git a/include/nba/framework/threadcontext.hh b/include/nba/framework/threadcontext.hh index c2d90b3..5916fd9 100644 --- a/include/nba/framework/threadcontext.hh +++ b/include/nba/framework/threadcontext.hh @@ -185,7 +185,7 @@ public: struct rte_ring *task_completion_queue; /* to receive completed offload tasks */ struct ev_async *task_completion_watcher; - struct ev_prepare *prepare_watcher; + struct ev_check *check_watcher; } __cache_aligned; struct coproc_thread_context { diff --git a/src/engines/cuda/computecontext.cc b/src/engines/cuda/computecontext.cc index 54ad901..7026fb6 100644 --- a/src/engines/cuda/computecontext.cc +++ b/src/engines/cuda/computecontext.cc @@ -13,6 +13,7 @@ struct cuda_event_context { }; #define IO_BASE_SIZE (16 * 1024 * 1024) +#define IO_MEMPOOL_ALIGN (8lu) #undef USE_PHYS_CONT_MEMORY // performance degraded :( CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother) @@ -21,31 +22,32 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother) /* NOTE: Write-combined memory degrades performance to half... 
*/ { type_name = "cuda"; - size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize()); // TODO: read from config + size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize()); cutilSafeCall(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking)); NEW(node_id, io_base_ring, FixedRing, NBA_MAX_IO_BASES, node_id); for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) { io_base_ring->push_back(i); - _cuda_mempool_in[i].init(io_base_size); - _cuda_mempool_out[i].init(io_base_size); + NEW(node_id, _cuda_mempool_in[i], CUDAMemoryPool, io_base_size, IO_MEMPOOL_ALIGN); + NEW(node_id, _cuda_mempool_out[i], CUDAMemoryPool, io_base_size, IO_MEMPOOL_ALIGN); + _cuda_mempool_in[i]->init(); + _cuda_mempool_out[i]->init(); + NEW(node_id, _cpu_mempool_in[i], CPUMemoryPool, io_base_size, IO_MEMPOOL_ALIGN, 0); + NEW(node_id, _cpu_mempool_out[i], CPUMemoryPool, io_base_size, IO_MEMPOOL_ALIGN, 0); #ifdef USE_PHYS_CONT_MEMORY void *base; base = (void *) ((uintptr_t) mz->addr + i * io_base_size); - _cpu_mempool_in[i].init_with_flags(io_base_size, base, 0); + _cpu_mempool_in[i]->init_with_flags(base, 0); base = (void *) ((uintptr_t) mz->addr + i * io_base_size + NBA_MAX_IO_BASES * io_base_size); - _cpu_mempool_out[i].init_with_flags(io_base_size, base, 0); + _cpu_mempool_out[i]->init_with_flags(base, 0); #else - _cpu_mempool_in[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable); - _cpu_mempool_out[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable); + _cpu_mempool_in[i]->init_with_flags(nullptr, cudaHostAllocPortable); + _cpu_mempool_out[i]->init_with_flags(nullptr, cudaHostAllocPortable); #endif } { - void *t; - cutilSafeCall(cudaMalloc((void **) &t, CACHE_LINE_SIZE)); - dummy_dev_buf.ptr = t; - cutilSafeCall(cudaHostAlloc((void **) &t, CACHE_LINE_SIZE, cudaHostAllocPortable)); - dummy_host_buf = t; + cutilSafeCall(cudaMalloc((void **) &dummy_dev_buf.ptr, CACHE_LINE_SIZE)); + cutilSafeCall(cudaHostAlloc((void **) &dummy_host_buf.ptr, CACHE_LINE_SIZE, cudaHostAllocPortable)); } cutilSafeCall(cudaHostAlloc((void **) &checkbits_h, MAX_BLOCKS, cudaHostAllocMapped)); cutilSafeCall(cudaHostGetDevicePointer((void **) &checkbits_d, checkbits_h, 0)); @@ -74,10 +76,10 @@ CUDAComputeContext::~CUDAComputeContext() { cutilSafeCall(cudaStreamDestroy(_stream)); for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) { - _cuda_mempool_in[i].destroy(); - _cuda_mempool_out[i].destroy(); - _cpu_mempool_in[i].destroy(); - _cpu_mempool_out[i].destroy(); + _cuda_mempool_in[i]->destroy(); + _cuda_mempool_out[i]->destroy(); + _cpu_mempool_in[i]->destroy(); + _cpu_mempool_out[i]->destroy(); } if (mz != nullptr) rte_memzone_free(mz); @@ -92,73 +94,93 @@ io_base_t CUDAComputeContext::alloc_io_base() return (io_base_t) i; } -void CUDAComputeContext::get_input_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) const +int CUDAComputeContext::alloc_input_buffer(io_base_t io_base, size_t size, + host_mem_t &host_mem, dev_mem_t &dev_mem) { unsigned i = io_base; - *host_ptr = (char*)_cpu_mempool_in[i].get_base_ptr() + (uintptr_t)_cpu_mempool_in[i].get_alloc_size(); - dev_mem->ptr = (char*)_cuda_mempool_in[i].get_base_ptr() + (uintptr_t)_cuda_mempool_in[i].get_alloc_size(); + assert(0 == _cpu_mempool_in[i]->alloc(size, host_mem)); + assert(0 == _cuda_mempool_in[i]->alloc(size, dev_mem)); + // for debugging + //assert(((uintptr_t)host_mem.ptr & 0xffff) == ((uintptr_t)dev_mem.ptr & 0xffff)); + return 0; } -void CUDAComputeContext::get_output_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) 
const +int CUDAComputeContext::alloc_output_buffer(io_base_t io_base, size_t size, + host_mem_t &host_mem, dev_mem_t &dev_mem) { unsigned i = io_base; - *host_ptr = (char*)_cpu_mempool_out[i].get_base_ptr() + (uintptr_t)_cpu_mempool_out[i].get_alloc_size(); - dev_mem->ptr = (char*)_cuda_mempool_out[i].get_base_ptr() + (uintptr_t)_cuda_mempool_out[i].get_alloc_size(); + assert(0 == _cpu_mempool_out[i]->alloc(size, host_mem)); + assert(0 == _cuda_mempool_out[i]->alloc(size, dev_mem)); + // for debugging + //assert(((uintptr_t)host_mem.ptr & 0xffff) == ((uintptr_t)dev_mem.ptr & 0xffff)); + return 0; } -size_t CUDAComputeContext::get_input_size(io_base_t io_base) const +void CUDAComputeContext::map_input_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const { unsigned i = io_base; - return _cpu_mempool_in[i].get_alloc_size(); + hbuf.ptr = (void *) ((uintptr_t) _cpu_mempool_in[i]->get_base_ptr().ptr + offset); + dbuf.ptr = (void *) ((uintptr_t) _cuda_mempool_in[i]->get_base_ptr().ptr + offset); + // len is ignored. } -size_t CUDAComputeContext::get_output_size(io_base_t io_base) const +void CUDAComputeContext::map_output_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const { unsigned i = io_base; - return _cpu_mempool_out[i].get_alloc_size(); + hbuf.ptr = (void *) ((uintptr_t) _cpu_mempool_out[i]->get_base_ptr().ptr + offset); + dbuf.ptr = (void *) ((uintptr_t) _cuda_mempool_out[i]->get_base_ptr().ptr + offset); + // len is ignored. +} + +void *CUDAComputeContext::unwrap_host_buffer(const host_mem_t hbuf) const +{ + return hbuf.ptr; } -int CUDAComputeContext::alloc_input_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem) +void *CUDAComputeContext::unwrap_device_buffer(const dev_mem_t dbuf) const +{ + return dbuf.ptr; +} + +size_t CUDAComputeContext::get_input_size(io_base_t io_base) const { unsigned i = io_base; - *host_ptr = _cpu_mempool_in[i].alloc(size); - assert(*host_ptr != nullptr); - dev_mem->ptr = _cuda_mempool_in[i].alloc(size); - assert(dev_mem->ptr != nullptr); - return 0; + return _cpu_mempool_in[i]->get_alloc_size(); } -int CUDAComputeContext::alloc_output_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem) +size_t CUDAComputeContext::get_output_size(io_base_t io_base) const { unsigned i = io_base; - *host_ptr = _cpu_mempool_out[i].alloc(size); - assert(*host_ptr != nullptr); - dev_mem->ptr = _cuda_mempool_out[i].alloc(size); - assert(dev_mem->ptr != nullptr); - return 0; + return _cpu_mempool_out[i]->get_alloc_size(); } void CUDAComputeContext::clear_io_buffers(io_base_t io_base) { unsigned i = io_base; - _cpu_mempool_in[i].reset(); - _cpu_mempool_out[i].reset(); - _cuda_mempool_in[i].reset(); - _cuda_mempool_out[i].reset(); + _cpu_mempool_in[i]->reset(); + _cpu_mempool_out[i]->reset(); + _cuda_mempool_in[i]->reset(); + _cuda_mempool_out[i]->reset(); io_base_ring->push_back(i); } -int CUDAComputeContext::enqueue_memwrite_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +int CUDAComputeContext::enqueue_memwrite_op(const host_mem_t host_buf, + const dev_mem_t dev_buf, + size_t offset, size_t size) { - //cutilSafeCall(cudaMemcpyAsync(dummy_dev_buf.ptr, dummy_host_buf, 64, cudaMemcpyHostToDevice, _stream)); - cutilSafeCall(cudaMemcpyAsync(dev_buf.ptr, host_buf, size, cudaMemcpyHostToDevice, _stream)); + cutilSafeCall(cudaMemcpyAsync(dev_buf.ptr, host_buf.ptr, size, + cudaMemcpyHostToDevice, _stream)); return 0; } -int 
CUDAComputeContext::enqueue_memread_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +int CUDAComputeContext::enqueue_memread_op(const host_mem_t host_buf, + const dev_mem_t dev_buf, + size_t offset, size_t size) { - //cutilSafeCall(cudaMemcpyAsync(dummy_host_buf, dummy_dev_buf.ptr, 64, cudaMemcpyDeviceToHost, _stream)); - cutilSafeCall(cudaMemcpyAsync(host_buf, dev_buf.ptr, size, cudaMemcpyDeviceToHost, _stream)); + cutilSafeCall(cudaMemcpyAsync(host_buf.ptr, dev_buf.ptr, size, + cudaMemcpyDeviceToHost, _stream)); return 0; } @@ -173,7 +195,7 @@ void CUDAComputeContext::push_kernel_arg(struct kernel_arg &arg) kernel_args[num_kernel_args ++] = arg; /* Copied to the array. */ } -int CUDAComputeContext::enqueue_kernel_launch(kernel_t kernel, struct resource_param *res) +int CUDAComputeContext::enqueue_kernel_launch(dev_kernel_t kernel, struct resource_param *res) { assert(checkbits_d != nullptr); // TODO: considerations for cudaFuncSetCacheConfig() and @@ -193,7 +215,9 @@ int CUDAComputeContext::enqueue_kernel_launch(kernel_t kernel, struct resource_p return 0; } -int CUDAComputeContext::enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), void *user_arg) +int CUDAComputeContext::enqueue_event_callback( + void (*func_ptr)(ComputeContext *ctx, void *user_arg), + void *user_arg) { auto cb = [](cudaStream_t stream, cudaError_t status, void *user_data) { diff --git a/src/engines/cuda/computedevice.cc b/src/engines/cuda/computedevice.cc index 30b479c..60812a1 100644 --- a/src/engines/cuda/computedevice.cc +++ b/src/engines/cuda/computedevice.cc @@ -88,7 +88,7 @@ void CUDAComputeDevice::_return_context(ComputeContext *cctx) _ready_cond.unlock(); } -void *CUDAComputeDevice::alloc_host_buffer(size_t size, int flags) +host_mem_t CUDAComputeDevice::alloc_host_buffer(size_t size, int flags) { void *ptr; int nvflags = 0; @@ -97,37 +97,35 @@ void *CUDAComputeDevice::alloc_host_buffer(size_t size, int flags) nvflags |= (flags & HOST_WRITECOMBINED) ? 
cudaHostAllocWriteCombined : 0; cutilSafeCall(cudaHostAlloc(&ptr, size, nvflags)); assert(ptr != NULL); - return ptr; + return { ptr }; } -memory_t CUDAComputeDevice::alloc_device_buffer(size_t size, int flags) +dev_mem_t CUDAComputeDevice::alloc_device_buffer(size_t size, int flags) { - memory_t m; void *ptr; cutilSafeCall(cudaMalloc(&ptr, size)); assert(ptr != NULL); - m.ptr = ptr; - return m; + return { ptr }; } -void CUDAComputeDevice::free_host_buffer(void *ptr) +void CUDAComputeDevice::free_host_buffer(host_mem_t m) { - cutilSafeCall(cudaFreeHost(ptr)); + cutilSafeCall(cudaFreeHost(m.ptr)); } -void CUDAComputeDevice::free_device_buffer(memory_t m) +void CUDAComputeDevice::free_device_buffer(dev_mem_t m) { cutilSafeCall(cudaFree(m.ptr)); } -void CUDAComputeDevice::memwrite(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +void CUDAComputeDevice::memwrite(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size) { - cutilSafeCall(cudaMemcpy((uint8_t *) dev_buf.ptr + offset, host_buf, size, cudaMemcpyHostToDevice)); + cutilSafeCall(cudaMemcpy((uint8_t *) dev_buf.ptr + offset, host_buf.ptr, size, cudaMemcpyHostToDevice)); } -void CUDAComputeDevice::memread(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +void CUDAComputeDevice::memread(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size) { - cutilSafeCall(cudaMemcpy(host_buf, (uint8_t *) dev_buf.ptr + offset, size, cudaMemcpyDeviceToHost)); + cutilSafeCall(cudaMemcpy(host_buf.ptr, (uint8_t *) dev_buf.ptr + offset, size, cudaMemcpyDeviceToHost)); } // vim: ts=8 sts=4 sw=4 et diff --git a/src/engines/dummy/computecontext.cc b/src/engines/dummy/computecontext.cc index 6a2eb77..f818bdd 100644 --- a/src/engines/dummy/computecontext.cc +++ b/src/engines/dummy/computecontext.cc @@ -12,20 +12,24 @@ DummyComputeContext::DummyComputeContext(unsigned ctx_id, ComputeDevice *mother_ NEW(node_id, io_base_ring, FixedRing, NBA_MAX_IO_BASES, node_id); for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) { - _dev_mempool_in[i].init(io_base_size); - _dev_mempool_out[i].init(io_base_size); - _cpu_mempool_in[i].init(io_base_size); - _cpu_mempool_out[i].init(io_base_size); + NEW(node_id, _dev_mempool_in[i], DummyCPUMemoryPool, io_base_size, CACHE_LINE_SIZE); + NEW(node_id, _dev_mempool_out[i], DummyCPUMemoryPool, io_base_size, CACHE_LINE_SIZE); + NEW(node_id, _cpu_mempool_in[i], DummyCPUMemoryPool, io_base_size, CACHE_LINE_SIZE); + NEW(node_id, _cpu_mempool_out[i], DummyCPUMemoryPool, io_base_size, CACHE_LINE_SIZE); + _dev_mempool_in[i]->init(); + _dev_mempool_out[i]->init(); + _cpu_mempool_in[i]->init(); + _cpu_mempool_out[i]->init(); } } DummyComputeContext::~DummyComputeContext() { for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) { - _dev_mempool_in[i].destroy(); - _dev_mempool_out[i].destroy(); - _cpu_mempool_in[i].destroy(); - _cpu_mempool_out[i].destroy(); + _dev_mempool_in[i]->destroy(); + _dev_mempool_out[i]->destroy(); + _cpu_mempool_in[i]->destroy(); + _cpu_mempool_out[i]->destroy(); } } @@ -37,79 +41,97 @@ io_base_t DummyComputeContext::alloc_io_base() return (io_base_t) i; } - -void DummyComputeContext::get_input_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) const +int DummyComputeContext::alloc_input_buffer(io_base_t io_base, size_t size, + host_mem_t &host_ptr, dev_mem_t &dev_ptr) { unsigned i = io_base; - *host_ptr = (char*)_cpu_mempool_in[i].get_base_ptr() + (uintptr_t)_cpu_mempool_in[i].get_alloc_size(); - dev_mem->ptr = (char*)_dev_mempool_in[i].get_base_ptr() + 
(uintptr_t)_dev_mempool_in[i].get_alloc_size(); + assert(0 == _cpu_mempool_in[i]->alloc(size, host_ptr.ptr)); + assert(0 == _dev_mempool_in[i]->alloc(size, dev_ptr.ptr)); + return 0; } -void DummyComputeContext::get_output_current_pos(io_base_t io_base, void **host_ptr, memory_t *dev_mem) const +int DummyComputeContext::alloc_output_buffer(io_base_t io_base, size_t size, + host_mem_t &host_ptr, dev_mem_t &dev_ptr) { unsigned i = io_base; - *host_ptr = (char*)_cpu_mempool_out[i].get_base_ptr() + (uintptr_t)_cpu_mempool_out[i].get_alloc_size(); - dev_mem->ptr = (char*)_dev_mempool_out[i].get_base_ptr() + (uintptr_t)_dev_mempool_out[i].get_alloc_size(); + assert(0 == _cpu_mempool_out[i]->alloc(size, host_ptr.ptr)); + assert(0 == _dev_mempool_out[i]->alloc(size, dev_ptr.ptr)); + return 0; } -size_t DummyComputeContext::get_input_size(io_base_t io_base) const +void DummyComputeContext::map_input_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const { unsigned i = io_base; - return _cpu_mempool_in[i].get_alloc_size(); + hbuf.ptr = (void *) ((uintptr_t) _cpu_mempool_in[i]->get_base_ptr() + offset); + dbuf.ptr = (void *) ((uintptr_t) _cpu_mempool_in[i]->get_base_ptr() + offset); + // len is ignored. } -size_t DummyComputeContext::get_output_size(io_base_t io_base) const +void DummyComputeContext::map_output_buffer(io_base_t io_base, size_t offset, size_t len, + host_mem_t &hbuf, dev_mem_t &dbuf) const { unsigned i = io_base; - return _cpu_mempool_out[i].get_alloc_size(); + hbuf.ptr = (void *) ((uintptr_t)_cpu_mempool_out[i]->get_base_ptr() + offset); + dbuf.ptr = (void *) ((uintptr_t)_cpu_mempool_out[i]->get_base_ptr() + offset); + // len is ignored. +} + +void *DummyComputeContext::unwrap_host_buffer(const host_mem_t hbuf) const +{ + return hbuf.ptr; } -int DummyComputeContext::alloc_input_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem) +void *DummyComputeContext::unwrap_device_buffer(const dev_mem_t dbuf) const +{ + return dbuf.ptr; +} + +size_t DummyComputeContext::get_input_size(io_base_t io_base) const { unsigned i = io_base; - *host_ptr = _cpu_mempool_in[i].alloc(size); - assert(*host_ptr != nullptr); - dev_mem->ptr = _dev_mempool_in[i].alloc(size); - assert(dev_mem->ptr != nullptr); - return 0; + return _cpu_mempool_in[i]->get_alloc_size(); } -int DummyComputeContext::alloc_output_buffer(io_base_t io_base, size_t size, void **host_ptr, memory_t *dev_mem) +size_t DummyComputeContext::get_output_size(io_base_t io_base) const { unsigned i = io_base; - *host_ptr = _cpu_mempool_out[i].alloc(size); - assert(*host_ptr != nullptr); - dev_mem->ptr = _dev_mempool_out[i].alloc(size); - assert(dev_mem->ptr != nullptr); - return 0; + return _cpu_mempool_out[i]->get_alloc_size(); } void DummyComputeContext::clear_io_buffers(io_base_t io_base) { unsigned i = io_base; - _cpu_mempool_in[i].reset(); - _cpu_mempool_out[i].reset(); - _dev_mempool_in[i].reset(); - _dev_mempool_out[i].reset(); + _cpu_mempool_in[i]->reset(); + _cpu_mempool_out[i]->reset(); + _dev_mempool_in[i]->reset(); + _dev_mempool_out[i]->reset(); io_base_ring->push_back(i); } -int DummyComputeContext::enqueue_memwrite_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +int DummyComputeContext::enqueue_memwrite_op(const host_mem_t host_buf, + const dev_mem_t dev_buf, + size_t offset, size_t size) { return 0; } -int DummyComputeContext::enqueue_memread_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +int 
DummyComputeContext::enqueue_memread_op(const host_mem_t host_buf, + const dev_mem_t dev_buf, + size_t offset, size_t size) { return 0; } -int DummyComputeContext::enqueue_kernel_launch(kernel_t kernel, struct resource_param *res) +int DummyComputeContext::enqueue_kernel_launch(dev_kernel_t kernel, + struct resource_param *res) { return 0; } -int DummyComputeContext::enqueue_event_callback(void (*func_ptr)(ComputeContext *ctx, void *user_arg), void *user_arg) +int DummyComputeContext::enqueue_event_callback( + void (*func_ptr)(ComputeContext *ctx, void *user_arg), + void *user_arg) { func_ptr(this, user_arg); return 0; diff --git a/src/engines/dummy/computedevice.cc b/src/engines/dummy/computedevice.cc index 0bde526..f943619 100644 --- a/src/engines/dummy/computedevice.cc +++ b/src/engines/dummy/computedevice.cc @@ -77,36 +77,34 @@ void DummyComputeDevice::_return_context(ComputeContext *cctx) _ready_cond.unlock(); } -void *DummyComputeDevice::alloc_host_buffer(size_t size, int flags) +host_mem_t DummyComputeDevice::alloc_host_buffer(size_t size, int flags) { - return malloc(size); + return { malloc(size) }; } -memory_t DummyComputeDevice::alloc_device_buffer(size_t size, int flags) +dev_mem_t DummyComputeDevice::alloc_device_buffer(size_t size, int flags) { - memory_t m; - m.ptr = malloc(size); - return m; + return { malloc(size) }; } -void DummyComputeDevice::free_host_buffer(void *ptr) +void DummyComputeDevice::free_host_buffer(host_mem_t m) { - free(ptr); + free(m.ptr); } -void DummyComputeDevice::free_device_buffer(memory_t m) +void DummyComputeDevice::free_device_buffer(dev_mem_t m) { free(m.ptr); } -void DummyComputeDevice::memwrite(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +void DummyComputeDevice::memwrite(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size) { - memcpy(((uint8_t *) dev_buf.ptr) + offset, host_buf, size); + memcpy(((uint8_t *) dev_buf.ptr) + offset, host_buf.ptr, size); } -void DummyComputeDevice::memread(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +void DummyComputeDevice::memread(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size) { - memcpy(host_buf, ((uint8_t *) dev_buf.ptr) + offset, size); + memcpy(host_buf.ptr, ((uint8_t *) dev_buf.ptr) + offset, size); } // vim: ts=8 sts=4 sw=4 et diff --git a/src/engines/phi/computecontext.cc b/src/engines/phi/computecontext.cc index 06877a9..c2815cf 100644 --- a/src/engines/phi/computecontext.cc +++ b/src/engines/phi/computecontext.cc @@ -1,6 +1,7 @@ #include #include #include +#include using namespace std; using namespace nba; @@ -11,21 +12,31 @@ struct phi_event_context { void *user_arg; }; +#define IO_BASE_SIZE (16 * 1024 * 1024) +#define IO_MEMPOOL_ALIGN (8) + PhiComputeContext::PhiComputeContext(unsigned ctx_id, ComputeDevice *mother_device) : ComputeContext(ctx_id, mother_device) { type_name = "phi"; - size_t mem_size = 8 * 1024 * 1024; // TODO: read from config + size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize()); cl_int err_ret; PhiComputeDevice *modevice = (PhiComputeDevice *) mother_device; clqueue = clCreateCommandQueue(modevice->clctx, modevice->cldevid, 0, &err_ret); if (err_ret != CL_SUCCESS) { rte_panic("clCreateCommandQueue()@PhiComputeContext() failed\n"); } - dev_mempool_in = new PhiMemoryPool(modevice->clctx, clqueue, HOST_TO_DEVICE), - dev_mempool_out = new PhiMemoryPool(modevice->clctx, clqueue, DEVICE_TO_HOST), - cpu_mempool_in = new CPUMemoryPool(); - cpu_mempool_out = new CPUMemoryPool(); + NEW(node_id, io_base_ring, 
FixedRing,
+        NBA_MAX_IO_BASES, node_id);
+    for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) {
+        io_base_ring->push_back(i);
+        NEW(node_id, _mempool_in[i], CLMemoryPool,
+            io_base_size, IO_MEMPOOL_ALIGN, modevice->clctx, clqueue, HOST_TO_DEVICE);
+        _mempool_in[i]->init();
+        NEW(node_id, _mempool_out[i], CLMemoryPool,
+            io_base_size, IO_MEMPOOL_ALIGN, modevice->clctx, clqueue, DEVICE_TO_HOST);
+        _mempool_out[i]->init();
+    }
     checkbits_d.clmem = clCreateBuffer(modevice->clctx, CL_MEM_READ_WRITE,
                                        MAX_BLOCKS, NULL, &err_ret);
     if (err_ret != CL_SUCCESS) {
@@ -42,51 +53,97 @@ PhiComputeContext::PhiComputeContext(unsigned ctx_id, ComputeDevice *mother_devi
 
 PhiComputeContext::~PhiComputeContext()
 {
+    for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) {
+        _mempool_in[i]->destroy();
+        _mempool_out[i]->destroy();
+    }
     clReleaseCommandQueue(clqueue);
-    delete dev_mempool_in;
-    delete dev_mempool_out;
-    delete cpu_mempool_in;
-    delete cpu_mempool_out;
 }
 
-int PhiComputeContext::alloc_input_buffer(size_t size, void **host_ptr, memory_t *dev_mem)
+io_base_t PhiComputeContext::alloc_io_base()
+{
+    if (io_base_ring->empty()) return INVALID_IO_BASE;
+    unsigned i = io_base_ring->front();
+    io_base_ring->pop_front();
+    return (io_base_t) i;
+}
+
+int PhiComputeContext::alloc_input_buffer(io_base_t io_base, size_t size,
+                                          host_mem_t &host_mem, dev_mem_t &dev_mem)
 {
-    *host_ptr = cpu_mempool_in->alloc(size);
-    dev_mem->clmem = dev_mempool_in->alloc(size);
+    unsigned i = io_base;
+    assert(0 == _mempool_in[i]->alloc(size, dev_mem));
+    assert(CL_SUCCESS == clGetMemObjectInfo(dev_mem.clmem, CL_MEM_HOST_PTR,
+                                            sizeof(void *), &host_mem.ptr, nullptr));
     return 0;
 }
 
-int PhiComputeContext::alloc_output_buffer(size_t size, void **host_ptr, memory_t *dev_mem)
+int PhiComputeContext::alloc_output_buffer(io_base_t io_base, size_t size,
+                                           host_mem_t &host_mem, dev_mem_t &dev_mem)
 {
-    *host_ptr = cpu_mempool_in->alloc(size);
-    dev_mem->clmem = dev_mempool_in->alloc(size);
+    unsigned i = io_base;
+    assert(0 == _mempool_out[i]->alloc(size, dev_mem));
+    assert(CL_SUCCESS == clGetMemObjectInfo(dev_mem.clmem, CL_MEM_HOST_PTR,
+                                            sizeof(void *), &host_mem.ptr, nullptr));
     return 0;
 }
 
-void PhiComputeContext::clear_io_buffers()
+void PhiComputeContext::map_input_buffer(io_base_t io_base, size_t offset, size_t len,
+                                         host_mem_t &hbuf, dev_mem_t &dbuf) const
+{
+    unsigned i = io_base;
+    cl_mem base = _mempool_in[i]->get_base_ptr();
+    cl_int err;
+    cl_buffer_region region = { offset, len };
+    dbuf.clmem = clCreateSubBuffer(base, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+    assert(CL_SUCCESS == err);
+    assert(CL_SUCCESS == clGetMemObjectInfo(dbuf.clmem, CL_MEM_HOST_PTR,
+                                            sizeof(void *), &hbuf.ptr, nullptr));
+}
+
+void PhiComputeContext::map_output_buffer(io_base_t io_base, size_t offset, size_t len,
+                                          host_mem_t &hbuf, dev_mem_t &dbuf) const
+{
+    unsigned i = io_base;
+    cl_mem base = _mempool_out[i]->get_base_ptr();
+    cl_int err;
+    cl_buffer_region region = { offset, len };
+    dbuf.clmem = clCreateSubBuffer(base, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+    assert(CL_SUCCESS == err);
+    assert(CL_SUCCESS == clGetMemObjectInfo(dbuf.clmem, CL_MEM_HOST_PTR,
+                                            sizeof(void *), &hbuf.ptr, nullptr));
+}
+
+void *PhiComputeContext::unwrap_host_buffer(host_mem_t hbuf) const
+{
+    return hbuf.ptr;
+}
+
+void *PhiComputeContext::unwrap_device_buffer(dev_mem_t dbuf) const
 {
-    cpu_mempool_in->reset();
-    cpu_mempool_out->reset();
-    dev_mempool_in->reset();
-    dev_mempool_out->reset();
+    void *ptr = nullptr;
+    clGetMemObjectInfo(dbuf.clmem, CL_MEM_HOST_PTR,
sizeof(void *), &ptr, nullptr); + return ptr; } -void *PhiComputeContext::get_host_input_buffer_base() +size_t PhiComputeContext::get_input_size(io_base_t io_base) const { - return cpu_mempool_in->get_base_ptr(); + unsigned i = io_base; + return _mempool_in[i]->get_alloc_size(); } -memory_t PhiComputeContext::get_device_input_buffer_base() +size_t PhiComputeContext::get_output_size(io_base_t io_base) const { - memory_t ret; - ret.clmem = clbuf; - return ret; + unsigned i = io_base; + return _mempool_out[i]->get_alloc_size(); } -size_t PhiComputeContext::get_total_input_buffer_size() +void PhiComputeContext::clear_io_buffers(io_base_t io_base) { - assert(cpu_mempool_in->get_alloc_size() == dev_mempool_in->get_alloc_size()); - return cpu_mempool_in->get_alloc_size(); + unsigned i = io_base; + _mempool_in[i]->reset(); + _mempool_out[i]->reset(); + io_base_ring->push_back(i); } void PhiComputeContext::clear_kernel_args() @@ -100,17 +157,17 @@ void PhiComputeContext::push_kernel_arg(struct kernel_arg &arg) kernel_args[num_kernel_args ++] = arg; /* Copied to the array. */ } -int PhiComputeContext::enqueue_memwrite_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +int PhiComputeContext::enqueue_memwrite_op(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size) { - return (int) clEnqueueWriteBuffer(clqueue, dev_buf.clmem, CL_FALSE, offset, size, host_buf, 0, NULL, &clev); + return (int) clEnqueueWriteBuffer(clqueue, dev_buf.clmem, CL_FALSE, offset, size, host_buf.ptr, 0, NULL, &clev); } -int PhiComputeContext::enqueue_memread_op(void *host_buf, memory_t dev_buf, size_t offset, size_t size) +int PhiComputeContext::enqueue_memread_op(host_mem_t host_buf, dev_mem_t dev_buf, size_t offset, size_t size) { - return (int) clEnqueueReadBuffer(clqueue, dev_buf.clmem, CL_FALSE, offset, size, host_buf, 0, NULL, &clev); + return (int) clEnqueueReadBuffer(clqueue, dev_buf.clmem, CL_FALSE, offset, size, host_buf.ptr, 0, NULL, &clev); } -int PhiComputeContext::enqueue_kernel_launch(kernel_t kernel, struct resource_param *res) +int PhiComputeContext::enqueue_kernel_launch(dev_kernel_t kernel, struct resource_param *res) { for (unsigned i = 0; i < num_kernel_args; i++) { phiSafeCall(clSetKernelArg(kernel.clkernel, 6 + i, kernel_args[i].size, kernel_args[i].ptr)); diff --git a/src/lib/coprocessor.cc b/src/lib/coprocessor.cc index b03d3c1..f8dda8a 100644 --- a/src/lib/coprocessor.cc +++ b/src/lib/coprocessor.cc @@ -60,6 +60,7 @@ static void coproc_task_input_cb(struct ev_loop *loop, struct ev_async *watcher, task->coproc_ctx = ctx; task->copy_h2d(); task->execute(); + //task->cctx->sync(); // for DEBUG /* We separate d2h copy step since CUDA implicitly synchronizes * kernel executions. 
See more details at: * http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#implicit-synchronization */ @@ -88,8 +89,8 @@ static void coproc_task_d2h_cb(struct ev_loop *loop, struct ev_async *watcher, i OffloadTask *task = ctx->d2h_pending_queue->front(); ctx->d2h_pending_queue->pop_front(); if (task->poll_kernel_finished()) { - //task->cctx->sync(); task->copy_d2h(); + //task->cctx->sync(); // for DEBUG ctx->task_done_queue->push_back(task); if (ctx->task_done_queue->size() >= NBA_MAX_KERNEL_OVERLAP || !ev_is_pending(ctx->task_input_watcher)) ev_feed_event(loop, ctx->task_done_watcher, EV_ASYNC); diff --git a/src/lib/datablock.cc b/src/lib/datablock.cc index d00185c..65f01aa 100644 --- a/src/lib/datablock.cc +++ b/src/lib/datablock.cc @@ -63,8 +63,8 @@ tuple DataBlock::calc_read_buffer_size(PacketBatch *batch) num_read_items = batch->count; size_t align = (read_roi.align == 0) ? 2 : read_roi.align; unsigned aligned_len = RTE_ALIGN_CEIL(read_roi.length, align); - t->aligned_item_sizes_h->size = aligned_len; - read_buffer_size = aligned_len * num_read_items; + t->aligned_item_sizes->size = aligned_len; + read_buffer_size = aligned_len * num_read_items; break; } case READ_WHOLE_PACKET: { @@ -79,15 +79,15 @@ tuple DataBlock::calc_read_buffer_size(PacketBatch *batch) #if (NBA_BATCHING_SCHEME == NBA_BATCHING_TRADITIONAL) \ || (NBA_BATCHING_SCHEME == NBA_BATCHING_BITVECTOR) if (IS_PACKET_INVALID(batch, pkt_idx)) { - t->aligned_item_sizes_h->offsets[pkt_idx] = 0; - t->aligned_item_sizes_h->sizes[pkt_idx] = 0; + t->aligned_item_sizes->offsets[pkt_idx] = 0; + t->aligned_item_sizes->sizes[pkt_idx] = 0; } else { #endif unsigned exact_len = rte_pktmbuf_data_len(batch->packets[pkt_idx]) - read_roi.offset + read_roi.length + read_roi.size_delta; unsigned aligned_len = RTE_ALIGN_CEIL(exact_len, align); - t->aligned_item_sizes_h->offsets[pkt_idx] = read_buffer_size; - t->aligned_item_sizes_h->sizes[pkt_idx] = aligned_len; + t->aligned_item_sizes->offsets[pkt_idx] = read_buffer_size; + t->aligned_item_sizes->sizes[pkt_idx] = aligned_len; read_buffer_size += aligned_len; #if (NBA_BATCHING_SCHEME == NBA_BATCHING_TRADITIONAL) \ || (NBA_BATCHING_SCHEME == NBA_BATCHING_BITVECTOR) @@ -181,8 +181,8 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) { case READ_PARTIAL_PACKET: { void *invalid_value = this->get_invalid_value(); FOR_EACH_PACKET_ALL_PREFETCH(batch, 4u) { - uint16_t aligned_elemsz = t->aligned_item_sizes_h->size; - uint32_t offset = t->aligned_item_sizes_h->size * pkt_idx; + uint16_t aligned_elemsz = t->aligned_item_sizes->size; + uint32_t offset = t->aligned_item_sizes->size * pkt_idx; if (IS_PACKET_INVALID(batch, pkt_idx)) { if (invalid_value != nullptr) { rte_memcpy((char *) host_in_buffer + offset, invalid_value, aligned_elemsz); @@ -201,8 +201,8 @@ void DataBlock::preprocess(PacketBatch *batch, void *host_in_buffer) { FOR_EACH_PACKET_ALL_PREFETCH(batch, 4u) { if (IS_PACKET_INVALID(batch, pkt_idx)) continue; - size_t aligned_elemsz = t->aligned_item_sizes_h->sizes[pkt_idx]; - size_t offset = t->aligned_item_sizes_h->offsets[pkt_idx].as_value(); + size_t aligned_elemsz = t->aligned_item_sizes->sizes[pkt_idx]; + size_t offset = t->aligned_item_sizes->offsets[pkt_idx].as_value(); rte_memcpy((char*) host_in_buffer + offset, rte_pktmbuf_mtod(batch->packets[pkt_idx], char*) + read_roi.offset, aligned_elemsz); @@ -246,11 +246,11 @@ void DataBlock::postprocess(OffloadableElement *elem, int input_port, PacketBatc #endif FOR_EACH_PACKET(batch) { size_t elemsz = 
bitselect(write_roi.type == WRITE_PARTIAL_PACKET, - t->aligned_item_sizes_h->size, - t->aligned_item_sizes_h->sizes[pkt_idx]); + t->aligned_item_sizes->size, + t->aligned_item_sizes->sizes[pkt_idx]); size_t offset = bitselect(write_roi.type == WRITE_PARTIAL_PACKET, - t->aligned_item_sizes_h->size * pkt_idx, - t->aligned_item_sizes_h->offsets[pkt_idx].as_value()); + t->aligned_item_sizes->size * pkt_idx, + t->aligned_item_sizes->offsets[pkt_idx].as_value()); rte_memcpy(rte_pktmbuf_mtod(batch->packets[pkt_idx], char*) + write_roi.offset, (char*) host_out_ptr + offset, elemsz); diff --git a/src/lib/elementgraph.cc b/src/lib/elementgraph.cc index 2e5bcc8..97418ac 100644 --- a/src/lib/elementgraph.cc +++ b/src/lib/elementgraph.cc @@ -101,11 +101,11 @@ void ElementGraph::send_offload_task_to_device(OffloadTask *task) /* Allocate the host-device IO buffer pool. */ while (task->io_base == INVALID_IO_BASE) { task->io_base = cctx->alloc_io_base(); + if (unlikely(ctx->io_ctx->loop_broken)) return; if (task->io_base == INVALID_IO_BASE) { /* If not available now, wait. */ ev_run(ctx->io_ctx->loop, 0); } - if (unlikely(ctx->io_ctx->loop_broken)) return; } /* Calculate required buffer sizes, allocate them, and initialize them. diff --git a/src/lib/io.cc b/src/lib/io.cc index 2df0709..8ce2ca6 100644 --- a/src/lib/io.cc +++ b/src/lib/io.cc @@ -95,7 +95,7 @@ static void comp_task_init(struct rte_mempool *mp, void *arg, void *obj, unsigne new (t) OffloadTask(); } -static void comp_prepare_cb(struct ev_loop *loop, struct ev_prepare *watcher, int revents) +static void comp_prepare_cb(struct ev_loop *loop, struct ev_check *watcher, int revents) { /* This routine is called when ev_run() is about to block. * (i.e., there is no outstanding events) @@ -139,8 +139,6 @@ static void comp_offload_task_completion_cb(struct ev_loop *loop, struct ev_asyn for (PacketBatch *batch : task->batches) { if (batch->datablock_states != nullptr) { struct datablock_tracker *t = batch->datablock_states; - t->host_in_ptr = nullptr; - t->host_out_ptr = nullptr; rte_mempool_put(ctx->dbstate_pool, (void *) t); batch->datablock_states = nullptr; } @@ -747,10 +745,10 @@ int io_loop(void *arg) } /* Register per-iteration check event. 
*/
-    ctx->comp_ctx->prepare_watcher = (struct ev_prepare *) rte_malloc_socket(nullptr, sizeof(struct ev_prepare),
+    ctx->comp_ctx->check_watcher = (struct ev_check *) rte_malloc_socket(nullptr, sizeof(struct ev_check),
                                                                          CACHE_LINE_SIZE, ctx->loc.node_id);
-    ev_prepare_init(ctx->comp_ctx->prepare_watcher, comp_prepare_cb);
-    ev_prepare_start(ctx->loop, ctx->comp_ctx->prepare_watcher);
+    ev_check_init(ctx->comp_ctx->check_watcher, comp_prepare_cb);
+    ev_check_start(ctx->loop, ctx->comp_ctx->check_watcher);
 
     /* ==== END_OF_COMP ====*/
diff --git a/src/lib/offloadtask.cc b/src/lib/offloadtask.cc
index 151e256..99d2717 100644
--- a/src/lib/offloadtask.cc
+++ b/src/lib/offloadtask.cc
@@ -48,26 +48,26 @@ OffloadTask::~OffloadTask()
 {
 }
 
-#if DEBUG
+#ifdef DEBUG
 #define _debug_print_inb(tag, batch, dbid) { \
-    void *begin_h; \
-    memory_t begin_d; \
-    cctx->get_input_current_pos(io_base, &begin_h, &begin_d); \
-    size_t len = cctx->get_input_size(io_base) - last_input_size; \
-    last_input_size = cctx->get_input_size(io_base); \
-    printf("task[%lu, %p:%u] alloc_input_buffer (" tag ") %p:%d -> start:%p, end:%p, len:%'lu(0x%lx)\n", \
+    size_t end = cctx->get_input_size(io_base); \
+    size_t len = end - last_input_size; \
+    size_t begin = end - len; \
+    last_input_size = end; \
+    printf("task[%lu, %p:%u] alloc_input_buffer (" tag ") %p:%d " \
+           "-> start:0x%08lx, end:0x%08lx, len:%lu (0x%lx) bytes\n", \
            task_id, cctx, (unsigned) io_base, batch, dbid, \
-           (void *)((uintptr_t)begin_h - len), begin_h, len, len); \
+           begin, end, len, len); \
 }
 #define _debug_print_outb(tag, batch, dbid) { \
-    void *begin_h; \
-    memory_t begin_d; \
-    cctx->get_output_current_pos(io_base, &begin_h, &begin_d); \
-    size_t len = cctx->get_output_size(io_base) - last_output_size; \
-    last_output_size = cctx->get_output_size(io_base); \
-    printf("task[%lu, %p:%u] alloc_output_buffer (" tag ") %p:%d -> start:%p, end:%p, len:%'lu(0x%lx)\n", \
+    size_t end = cctx->get_output_size(io_base); \
+    size_t len = end - last_output_size; \
+    size_t begin = end - len; \
+    last_output_size = end; \
+    printf("task[%lu, %p:%u] alloc_output_buffer (" tag ") %p:%d " \
+           "-> start:0x%08lx, end:0x%08lx, len:%lu (0x%lx) bytes\n", \
            task_id, cctx, (unsigned) io_base, batch, dbid, \
-           (void *)((uintptr_t)begin_h - len), begin_h, len, len); \
+           begin, end, len, len); \
 }
 #else
 #define _debug_print_inb(tag, batch, dbid)
@@ -78,10 +78,8 @@ void OffloadTask::prepare_read_buffer()
 {
     // write: host-to-device input
    // read: device-to-host output
-    cctx->get_input_current_pos(io_base, &host_write_begin, &dev_write_begin);
-    cctx->get_output_current_pos(io_base, &host_read_begin, &dev_read_begin);
-    input_alloc_size_begin = cctx->get_input_size(io_base);
-    output_alloc_size_begin = cctx->get_output_size(io_base);
+    input_begin = cctx->get_input_size(io_base);
+    output_begin = cctx->get_output_size(io_base);
     _debug_print_inb("at-beginning", nullptr, 0);
     _debug_print_outb("at-beginning", nullptr, 0);
 
@@ -94,22 +92,26 @@ void OffloadTask::prepare_read_buffer()
             for (PacketBatch *batch : batches) {
                 struct datablock_tracker *t = &batch->datablock_states[dbid];
                 cctx->alloc_input_buffer(io_base, sizeof(struct item_size_info),
-                                         (void **) &t->aligned_item_sizes_h,
-                                         &t->aligned_item_sizes_d);
+                                         t->aligned_item_sizes_h,
+                                         t->aligned_item_sizes_d);
+                t->aligned_item_sizes = (struct item_size_info *)
+                                        cctx->unwrap_host_buffer(t->aligned_item_sizes_h);
             }
             _debug_print_inb("prepare_read_buffer.WHOLE", nullptr, dbid);
         } else if (rri.type == READ_PARTIAL_PACKET) {
             for (PacketBatch
*batch : batches) { struct datablock_tracker *t = &batch->datablock_states[dbid]; cctx->alloc_input_buffer(io_base, sizeof(uint64_t), - (void **) &t->aligned_item_sizes_h, - &t->aligned_item_sizes_d); + t->aligned_item_sizes_h, + t->aligned_item_sizes_d); + t->aligned_item_sizes = (struct item_size_info *) + cctx->unwrap_host_buffer(t->aligned_item_sizes_h); } _debug_print_inb("prepare_read_buffer.PARTIAL", nullptr, dbid); } else { for (PacketBatch *batch : batches) { struct datablock_tracker *t = &batch->datablock_states[dbid]; - t->aligned_item_sizes_h = nullptr; + //t->aligned_item_sizes_h = nullptr; } } } /* endif(check_preproc) */ @@ -124,8 +126,6 @@ void OffloadTask::prepare_read_buffer() struct datablock_tracker *t = &batch->datablock_states[dbid]; t->in_size = 0; t->in_count = 0; - t->host_in_ptr = nullptr; - t->dev_in_ptr.ptr = nullptr; } } else { for (PacketBatch *batch : batches) { @@ -133,8 +133,9 @@ void OffloadTask::prepare_read_buffer() tie(t->in_size, t->in_count) = db->calc_read_buffer_size(batch); if (t->in_size > 0 && t->in_count > 0) { cctx->alloc_input_buffer(io_base, t->in_size, - (void **) &t->host_in_ptr, &t->dev_in_ptr); - db->preprocess(batch, t->host_in_ptr); + t->host_in_ptr, t->dev_in_ptr); + void *inp = cctx->unwrap_host_buffer(t->host_in_ptr); + db->preprocess(batch, inp); } } _debug_print_inb("prepare_read_buffer.preproc", nullptr, dbid); @@ -157,8 +158,6 @@ void OffloadTask::prepare_write_buffer() struct datablock_tracker *t = &batch->datablock_states[dbid]; t->out_size = 0; t->out_count = 0; - t->host_out_ptr = nullptr; - t->dev_out_ptr.ptr = nullptr; } } else { //if (rri.type == READ_WHOLE_PACKET && wri.type == WRITE_WHOLE_PACKET) { @@ -173,13 +172,11 @@ void OffloadTask::prepare_write_buffer() //} else { for (PacketBatch *batch : batches) { struct datablock_tracker *t = &batch->datablock_states[dbid]; - t->host_out_ptr = nullptr; - t->dev_out_ptr.ptr = nullptr; tie(t->out_size, t->out_count) = db->calc_write_buffer_size(batch); if (t->out_size > 0 && t->out_count > 0) { cctx->alloc_output_buffer(io_base, t->out_size, - (void **) &t->host_out_ptr, - &t->dev_out_ptr); + t->host_out_ptr, + t->dev_out_ptr); } } _debug_print_outb("prepare_write_buffer", nullptr, dbid); @@ -194,10 +191,11 @@ bool OffloadTask::copy_h2d() state = TASK_H2D_COPYING; /* Copy the datablock information for the first kernel argument. 
*/
-    size_t dbarray_size = sizeof(struct datablock_kernel_arg *) * datablocks.size();
-    cctx->alloc_input_buffer(io_base, dbarray_size, (void **) &dbarray_h, &dbarray_d);
+    size_t dbarray_size = sizeof(void *) * datablocks.size();
+    struct datablock_kernel_arg **dbarray;
+    cctx->alloc_input_buffer(io_base, dbarray_size, dbarray_h, dbarray_d);
     _debug_print_inb("copy_h2d.dbarray", nullptr, 0);
-    assert(dbarray_h != nullptr);
+    dbarray = (struct datablock_kernel_arg **) cctx->unwrap_host_buffer(dbarray_h);
 
     for (int dbid : datablocks) {
         int dbid_d = dbid_h2d[dbid];
@@ -208,16 +206,21 @@
         db->get_read_roi(&rri);
         db->get_write_roi(&wri);
 
-        struct datablock_kernel_arg *dbarg_h;
-        memory_t dbarg_d;
+        struct datablock_kernel_arg *dbarg;
+        host_mem_t dbarg_h;
+        dev_mem_t dbarg_d;
         size_t dbarg_size = sizeof(struct datablock_kernel_arg)
                             + batches.size() * sizeof(struct datablock_batch_info);
-        cctx->alloc_input_buffer(io_base, dbarg_size, (void **) &dbarg_h, &dbarg_d);
-        assert(dbarg_h != nullptr);
+        cctx->alloc_input_buffer(io_base, dbarg_size, dbarg_h, dbarg_d);
+        dbarg = (struct datablock_kernel_arg *) cctx->unwrap_host_buffer(dbarg_h);
+        dbarray[dbid_d] = (struct datablock_kernel_arg *) cctx->unwrap_device_buffer(dbarg_d);
+        dbarg->total_item_count_in = 0;
+        dbarg->total_item_count_out = 0;
 
-        dbarray_h[dbid_d] = (struct datablock_kernel_arg *) dbarg_d.ptr;
-        dbarg_h->total_item_count_in = 0;
-        dbarg_h->total_item_count_out = 0;
+        // NOTE: To use our "datablock kernel arg" data structures,
+        //       the underlying kernel language must support generic
+        //       pointer references.
+        //       (e.g., NVIDIA CUDA / OpenCL 2.0+)
 
         for (auto&& p : enumerate(batches)) {
             size_t b = p.first;
@@ -228,34 +231,31 @@
             if (rri.type == READ_WHOLE_PACKET && t->in_count > 0) {
                 /* We need to copy the size array because each item may
                  * have different lengths. */
-                assert(t->aligned_item_sizes_h != nullptr);
-                dbarg_h->batches[b].item_sizes_in = (uint16_t *)
-                    ((char *) t->aligned_item_sizes_d.ptr
-                     + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarg_h->batches[b].item_sizes_out = (uint16_t *)
-                    ((char *) t->aligned_item_sizes_d.ptr
-                     + (uintptr_t) offsetof(struct item_size_info, sizes));
-                dbarg_h->batches[b].item_offsets_in = (dev_offset_t *)
-                    ((char *) t->aligned_item_sizes_d.ptr
-                     + (uintptr_t) offsetof(struct item_size_info, offsets));
-                dbarg_h->batches[b].item_offsets_out = (dev_offset_t *)
-                    ((char *) t->aligned_item_sizes_d.ptr
-                     + (uintptr_t) offsetof(struct item_size_info, offsets));
+                //assert(t->aligned_item_sizes_h != nullptr);
+                uintptr_t base_ptr = (uintptr_t) cctx->unwrap_device_buffer(t->aligned_item_sizes_d);
+                dbarg->batches[b].item_sizes_in = (uint16_t *)
+                        (base_ptr + offsetof(struct item_size_info, sizes));
+                dbarg->batches[b].item_sizes_out = (uint16_t *)
+                        (base_ptr + offsetof(struct item_size_info, sizes));
+                dbarg->batches[b].item_offsets_in = (dev_offset_t *)
+                        (base_ptr + offsetof(struct item_size_info, offsets));
+                dbarg->batches[b].item_offsets_out = (dev_offset_t *)
+                        (base_ptr + offsetof(struct item_size_info, offsets));
             } else {
                 /* Same for all batches.
                  * We assume the module developer knows the fixed length
                  * when writing device kernel codes. */
-                dbarg_h->item_size_in = rri.length;
-                dbarg_h->item_size_out = wri.length;
-                dbarg_h->batches[b].item_offsets_in = nullptr;
-                dbarg_h->batches[b].item_offsets_out = nullptr;
+                dbarg->item_size_in = rri.length;
+                dbarg->item_size_out = wri.length;
+                dbarg->batches[b].item_offsets_in = nullptr;
+                dbarg->batches[b].item_offsets_out = nullptr;
             }
 
-            dbarg_h->batches[b].buffer_bases_in = t->dev_in_ptr.ptr;   // FIXME: generalize to CL?
-            dbarg_h->batches[b].item_count_in = t->in_count;
-            dbarg_h->total_item_count_in += t->in_count;
-            dbarg_h->batches[b].buffer_bases_out = t->dev_out_ptr.ptr; // FIXME: generalize to CL?
-            dbarg_h->batches[b].item_count_out = t->out_count;
-            dbarg_h->total_item_count_out += t->out_count;
+            dbarg->batches[b].buffer_bases_in = cctx->unwrap_device_buffer(t->dev_in_ptr);
+            dbarg->batches[b].item_count_in = t->in_count;
+            dbarg->total_item_count_in += t->in_count;
+            dbarg->batches[b].buffer_bases_out = cctx->unwrap_device_buffer(t->dev_out_ptr);
+            dbarg->batches[b].item_count_out = t->out_count;
+            dbarg->total_item_count_out += t->out_count;
         } /* endfor(batches) */
     } /* endfor(dbid) */
     return true;
@@ -278,10 +278,10 @@ void OffloadTask::execute()
     int dbid = elem->get_offload_item_counter_dbid();
     DataBlock *db = comp_ctx->datablock_registry[dbid];
 
-    uint16_t *batch_ids_h = nullptr;
-    memory_t batch_ids_d;
-    uint16_t *item_ids_h = nullptr;
-    memory_t item_ids_d;
+    host_mem_t batch_ids_h;
+    dev_mem_t batch_ids_d;
+    host_mem_t item_ids_h;
+    dev_mem_t item_ids_d;
 
     for (PacketBatch *batch : batches) {
         struct datablock_tracker *t = &batch->datablock_states[dbid];
@@ -291,37 +291,38 @@ void OffloadTask::execute()
 
     if (all_item_count > 0) {
-        cctx->alloc_input_buffer(io_base, sizeof(uint16_t) * all_item_count,
-                                 (void **) &batch_ids_h, &batch_ids_d);
+        cctx->alloc_input_buffer(io_base, sizeof(uint8_t) * all_item_count,
+                                 batch_ids_h, batch_ids_d);
         _debug_print_inb("execute.batch_ids", nullptr, 0);
-        assert(batch_ids_h != nullptr);
         cctx->alloc_input_buffer(io_base, sizeof(uint16_t) * all_item_count,
-                                 (void **) &item_ids_h, &item_ids_d);
+                                 item_ids_h, item_ids_d);
         _debug_print_inb("execute.item_ids", nullptr, 0);
-        assert(item_ids_h != nullptr);
+        uint8_t *batch_ids = (uint8_t *) cctx->unwrap_host_buffer(batch_ids_h);
+        uint16_t *item_ids = (uint16_t *) cctx->unwrap_host_buffer(item_ids_h);
 
         res.num_workitems = all_item_count;
-        res.num_threads_per_workgroup = 256;
+        res.num_threads_per_workgroup = elem->get_desired_workgroup_size(cctx->type_name.c_str());
         res.num_workgroups = (all_item_count + res.num_threads_per_workgroup - 1)
                              / res.num_threads_per_workgroup;
-        uint16_t batch_id = 0;
+        uint8_t batch_id = 0;
         unsigned global_idx = 0;
         for (PacketBatch *batch : batches) {
             struct datablock_tracker *t = &batch->datablock_states[dbid];
             for (unsigned item_id = 0; item_id < t->in_count; item_id ++) {
-                batch_ids_h[global_idx] = batch_id;
-                item_ids_h[global_idx] = item_id;
+                batch_ids[global_idx] = batch_id;
+                item_ids[global_idx] = item_id;
                 global_idx ++;
            }
             batch_id ++;
         }
-        size_t last_alloc_size = cctx->get_input_size(io_base);
-        //printf("GPU-offload-h2d-size: %'lu bytes\n", last_alloc_size);
+        size_t total_input_size = cctx->get_input_size(io_base) - input_begin;
+        //printf("GPU-offload-h2d-size: %'lu bytes\n", total_input_size);
         // ipv4@64B: 16K ~ 24K
         // ipsec@64B: ~ 5M
-        cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0,
-                                  last_alloc_size - input_alloc_size_begin);
-        //cctx->enqueue_memwrite_op(host_write_begin, dev_write_begin, 0,
-        //                          2097152);
+        host_mem_t host_input;
+        dev_mem_t dev_input;
+        cctx->map_input_buffer(io_base,
input_begin, total_input_size, + host_input, dev_input); + cctx->enqueue_memwrite_op(host_input, dev_input, 0, total_input_size); cctx->clear_checkbits(); cctx->clear_kernel_args(); @@ -333,20 +334,30 @@ void OffloadTask::execute() * (2) the number of batches */ void *checkbits_d = cctx->get_device_checkbits(); + void *ptr_args[3]; // storage for rvalue struct kernel_arg arg; - arg = {(void *) &dbarray_d.ptr, sizeof(void *), alignof(void *)}; + + ptr_args[0] = cctx->unwrap_device_buffer(dbarray_d); + arg = {&ptr_args[0], sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); + arg = {(void *) &all_item_count, sizeof(uint32_t), alignof(uint32_t)}; cctx->push_kernel_arg(arg); - arg = {(void *) &batch_ids_d.ptr, sizeof(void *), alignof(void *)}; + + ptr_args[1] = cctx->unwrap_device_buffer(batch_ids_d); + arg = {&ptr_args[1], sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); - arg = {(void *) &item_ids_d.ptr, sizeof(void *), alignof(void *)}; + + ptr_args[2] = cctx->unwrap_device_buffer(item_ids_d); + arg = {&ptr_args[2], sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); + arg = {(void *) &checkbits_d, sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); offload_compute_handler &handler = elem->offload_compute_handlers[cctx->type_name]; handler(cctx, &res); + /* Skip kernel execution. */ //res.num_workitems = 0; //res.num_threads_per_workgroup = 1; @@ -368,9 +379,12 @@ bool OffloadTask::copy_d2h() state = TASK_D2H_COPYING; /* Coalesced D2H data copy. */ - size_t last_alloc_size = cctx->get_output_size(io_base); - cctx->enqueue_memread_op(host_read_begin, dev_read_begin, - 0, last_alloc_size - output_alloc_size_begin); + size_t total_output_size = cctx->get_output_size(io_base) - output_begin; + host_mem_t host_output; + dev_mem_t dev_output; + cctx->map_output_buffer(io_base, output_begin, total_output_size, + host_output, dev_output); + cctx->enqueue_memread_op(host_output, dev_output, 0, total_output_size); return true; } @@ -409,15 +423,15 @@ void OffloadTask::postprocess() DataBlock *db = comp_ctx->datablock_registry[dbid]; struct write_roi_info wri; db->get_write_roi(&wri); - int b = 0; - for (PacketBatch *batch : batches) { + for (auto&& pair : enumerate(batches)) { + PacketBatch *batch = pair.second; struct datablock_tracker *t = &batch->datablock_states[dbid]; - if (t->host_out_ptr != nullptr) { + if (t->out_size > 0) { // FIXME: let the element to choose the datablock used for postprocessing, // or pass multiple datablocks that have outputs. 
-                db->postprocess(elem, input_ports[b], batch, t->host_out_ptr);
+                void *outp = cctx->unwrap_host_buffer(t->host_out_ptr);
+                db->postprocess(elem, input_ports[pair.first], batch, outp);
             }
-            b++;
         }
     } /* endif(check_postproc) */
 } /* endfor(dbid) */
diff --git a/tests/test_core_mempool.cc b/tests/test_core_mempool.cc
new file mode 100644
index 0000000..353129d
--- /dev/null
+++ b/tests/test_core_mempool.cc
@@ -0,0 +1,51 @@
+#include <gtest/gtest.h>
+#include <nba/core/mempool.hh>
+
+namespace nba {
+
+class DummyMemoryPool : public MemoryPool
+{
+public:
+    DummyMemoryPool() : MemoryPool() { }
+    DummyMemoryPool(size_t max_size) : MemoryPool(max_size) { }
+    DummyMemoryPool(size_t max_size, size_t align) : MemoryPool(max_size, align) { }
+
+    bool init() { return true; /* no-op */ }
+
+    uintptr_t get_base_ptr() const { return 0; }
+
+    int alloc(size_t size, uintptr_t& ptr)
+    {
+        size_t offset;
+        int ret = _alloc(size, &offset);
+        if (ret == 0)
+            ptr = (uintptr_t) offset;
+        return ret;
+    }
+
+    void destroy() { /* no-op */ }
+};
+
+} // endns(nba)
+
+using namespace nba;
+
+TEST(CoreMempoolTest, Alloc) {
+    DummyMemoryPool mp(100lu, 32lu);
+    uintptr_t p;
+    EXPECT_EQ(0, mp.alloc(50, p));
+    EXPECT_EQ(0, p);
+    EXPECT_EQ(64, mp.get_alloc_size());
+    EXPECT_EQ(0, mp.alloc(20, p));
+    EXPECT_EQ(64, p);
+    EXPECT_EQ(96, mp.get_alloc_size());
+    EXPECT_EQ(-ENOMEM, mp.alloc(4, p))
+        << "Even if cur_pos + size <= max_size, "
+           "it should fail when aligned size exceeds max_size.";
+    EXPECT_EQ(64, p) << "Pointer should not change when full.";
+    EXPECT_EQ(96, mp.get_alloc_size()) << "Alloc size should not change when full.";
+    mp.reset();
+    EXPECT_EQ(0, mp.get_alloc_size());
+}
+
+// vim: ts=8 sts=4 sw=4 et
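
--
Editor's notes (appended for review; not part of the diff):

* The new tests/test_core_mempool.cc pins down the MemoryPool alignment
  contract, and the expected values follow directly from the arithmetic:
  with align = 32, every allocation consumes ALIGN_CEIL(size, 32) bytes,
  so alloc(50) consumes 64 bytes at offset 0, alloc(20) consumes another
  32 bytes at offset 64 (96 bytes used in total), and a further alloc(4)
  fails with -ENOMEM because 96 + ALIGN_CEIL(4, 32) = 128 exceeds
  max_size = 100, even though the unaligned sum 96 + 4 would still fit.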
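* The wrapped pointer types host_mem_t and dev_mem_t come from
  include/nba/core/offloadtypes.hh, whose new definition is not shown in
  this diff. The sketch below is only a plausible reconstruction inferred
  from the usage sites above (single-pointer brace initialization, .ptr
  for host/CUDA buffers, .clmem for OpenCL buffers); consult the actual
  header for the authoritative definitions.

    /* Hypothetical sketch -- NOT the verbatim contents of
     * include/nba/core/offloadtypes.hh. */
    #include <CL/cl.h>

    /* Host buffers are always directly addressable by the CPU. */
    struct host_mem_t {
        void *ptr;      /* page-locked (CUDA) or malloc'ed memory */
    };

    /* Device buffers are either raw device pointers (CUDA, dummy)
     * or OpenCL buffer objects (Xeon Phi). */
    union dev_mem_t {
        void *ptr;      /* CUDA device pointer */
        cl_mem clmem;   /* OpenCL memory object */
    };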
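* At a glance, the refactored ComputeContext buffer API replaces the old
  get_*_current_pos()/raw-pointer flow with alloc/map/unwrap calls keyed
  on an io_base. The fragment below is a condensed, hypothetical usage
  walkthrough written against the signatures in this patch; the real call
  sites live in src/lib/offloadtask.cc above, and error handling as well
  as the datablock machinery are omitted.

    #include <cstring>
    #include <nba/framework/computecontext.hh>

    /* Sketch: setting up one coalesced H2D copy with the new API. */
    static void sketch_offload_input(nba::ComputeContext *cctx)
    {
        io_base_t io_base = cctx->alloc_io_base();
        if (io_base == INVALID_IO_BASE)
            return;                  /* all NBA_MAX_IO_BASES are in use */

        /* Remember where this task's region starts inside the pool. */
        size_t input_begin = cctx->get_input_size(io_base);

        /* Allocations return wrapped handles instead of raw pointers. */
        host_mem_t in_h;
        dev_mem_t in_d;
        cctx->alloc_input_buffer(io_base, 2048, in_h, in_d);

        /* CPU-side writes go through unwrap_host_buffer(). */
        void *staging = cctx->unwrap_host_buffer(in_h);
        memset(staging, 0, 2048);

        /* One coalesced H2D copy covering everything allocated above. */
        size_t total = cctx->get_input_size(io_base) - input_begin;
        host_mem_t hseg;
        dev_mem_t dseg;
        cctx->map_input_buffer(io_base, input_begin, total, hseg, dseg);
        cctx->enqueue_memwrite_op(hseg, dseg, 0, total);

        /* ... kernel launch and D2H readback elided ... */
        cctx->clear_io_buffers(io_base); /* recycle pools and io_base */
    }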