Skip to content

Commit

Permalink
refs #6, #27: Refactor memory pools and host/device memory types
Browse files Browse the repository at this point in the history
 * Tried to support OpenCL, but confirmed that we need OpenCL 2.0+.
   (which is not supported by current generation of Xeon Phi...)

   - Related codes will be rewritten someday using SVM (shared virtual
     memory) APIs in OpenCL 2.0+.

 * Reduced memory footprint of batch_ids array passed to the device.

 * Rollbacked ev_prepare -> ev_check watcher type change (c732a25),
   as it has broken CPU-only cross-node IP forwarding scenarios. :(

 * TODO: fix IPsec GPU-only mode...
  • Loading branch information
achimnol committed Feb 8, 2016
1 parent 8b2c10f commit e75e280
Show file tree
Hide file tree
Showing 41 changed files with 816 additions and 642 deletions.
1 change: 0 additions & 1 deletion configs/ipv4-router-gpuonly.click
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@ GPUOnly() ->
DropBroadcasts() ->
CheckIPHeader() ->
IPlookup() ->
//IPlookup() ->
DecIPTTL() ->
ToOutput();
117 changes: 64 additions & 53 deletions elements/ip/IPlookup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,12 @@ IPlookup::IPlookup(): OffloadableElement()

num_tx_ports = 0;
rr_port = 0;
p_rwlock_TBL24 = NULL;
p_rwlock_TBLlong = NULL;
TBL24_h = NULL;
TBLlong_h = NULL;
TBL24_d = NULL;
TBLlong_d = NULL;
}

int IPlookup::initialize()
{
/* Get routing table pointers from the node-local storage. */
TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24");
p_rwlock_TBL24 = ctx->node_local_storage->get_rwlock("TBL24");
TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong");
p_rwlock_TBLlong = ctx->node_local_storage->get_rwlock("TBLlong");

/* Get device pointers from the node-local storage. */
TBL24_d = ((memory_t **) ctx->node_local_storage->get_alloc("TBL24_dev_ptr"))[0];
TBLlong_d = ((memory_t **) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr"))[0];
return 0;
p_rwlock_TBL24 = nullptr;
p_rwlock_TBLlong = nullptr;
TBL24_h = nullptr;
TBLlong_h = nullptr;
TBL24_d = { nullptr };
TBLlong_d = { nullptr };
}

int IPlookup::initialize_global()
Expand All @@ -76,22 +62,39 @@ int IPlookup::initialize_per_node()
ctx->node_local_storage->alloc("TBL24", sizeof(uint16_t) * ipv4_get_TBL24_size());
ctx->node_local_storage->alloc("TBLlong", sizeof(uint16_t) * ipv4_get_TBLlong_size());
/* Storage for device pointers. */
ctx->node_local_storage->alloc("TBL24_dev_ptr", sizeof(memory_t));
ctx->node_local_storage->alloc("TBLlong_dev_ptr", sizeof(memory_t));
ctx->node_local_storage->alloc("TBL24_dev_ptr", sizeof(dev_mem_t));
ctx->node_local_storage->alloc("TBLlong_dev_ptr", sizeof(dev_mem_t));

printf("element::IPlookup: Initializing FIB from the global RIB for NUMA node %d...\n", node_idx);
ipv4_build_fib();

return 0;
}

int IPlookup::initialize()
{
/* Get routing table pointers from the node-local storage. */
TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24");
TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong");
assert(TBL24_h != nullptr);
assert(TBLlong_h != nullptr);
p_rwlock_TBL24 = ctx->node_local_storage->get_rwlock("TBL24");
p_rwlock_TBLlong = ctx->node_local_storage->get_rwlock("TBLlong");

/* Get device pointers from the node-local storage. */
TBL24_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBL24_dev_ptr");
TBLlong_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr");

rr_port = 0;
return 0;
}

int IPlookup::configure(comp_thread_context *ctx, std::vector<std::string> &args)
{
Element::configure(ctx, args);
num_tx_ports = ctx->num_tx_ports;
num_nodes = ctx->num_nodes;
node_idx = ctx->loc.node_id;
rr_port = 0;
return 0;
}

Expand All @@ -110,9 +113,13 @@ int IPlookup::process(int input_port, Packet *pkt)
return 0;
}

//unsigned n = (pkt->pkt.in_port <= (num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2);
//rr_port = (rr_port + 1) % (num_tx_ports / 2) + n;
#ifdef NBA_IPFWD_RR_NODE_LOCAL
unsigned iface_in = anno_get(&pkt->anno, NBA_ANNO_IFACE_IN);
unsigned n = (iface_in <= ((unsigned) num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2);
rr_port = (rr_port + 1) % (num_tx_ports / 2) + n;
#else
rr_port = (rr_port + 1) % (num_tx_ports);
#endif
anno_set(&pkt->anno, NBA_ANNO_IFACE_OUT, rr_port);
output(0).push(pkt);
return 0;
Expand All @@ -127,9 +134,13 @@ int IPlookup::postproc(int input_port, void *custom_output, Packet *pkt)
return 0;
}

//unsigned n = (pkt->pkt.in_port <= (num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2);
//rr_port = (rr_port + 1) % (num_tx_ports / 2) + n;
#ifdef NBA_IPFWD_RR_NODE_LOCAL
unsigned iface_in = anno_get(&pkt->anno, NBA_ANNO_IFACE_IN);
unsigned n = (iface_in <= ((unsigned) num_tx_ports / 2) - 1) ? 0 : (num_tx_ports / 2);
rr_port = (rr_port + 1) % (num_tx_ports / 2) + n;
#else
rr_port = (rr_port + 1) % (num_tx_ports);
#endif
anno_set(&pkt->anno, NBA_ANNO_IFACE_OUT, rr_port);
output(0).push(pkt);
return 0;
Expand All @@ -149,41 +160,41 @@ size_t IPlookup::get_desired_workgroup_size(const char *device_name) const
}

#ifdef USE_CUDA
void IPlookup::cuda_init_handler(ComputeDevice *device)
{
/* Store the device pointers for per-thread element instances. */
size_t TBL24_alloc_size = sizeof(uint16_t) * ipv4_get_TBL24_size();
size_t TBLlong_alloc_size = sizeof(uint16_t) * ipv4_get_TBLlong_size();
// As it is before initialize() is called, we need to get the pointers
// from the node-local storage by ourselves here.
uint16_t *_TBL24_h = nullptr;
uint16_t *_TBLlong_h = nullptr;
_TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24");
_TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong");
TBL24_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBL24_dev_ptr");
TBLlong_d = (dev_mem_t *) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr");
*TBL24_d = device->alloc_device_buffer(TBL24_alloc_size);
*TBLlong_d = device->alloc_device_buffer(TBLlong_alloc_size);
/* Convert host-side routing table to host_mem_t and copy the routing table. */
device->memwrite({(void *) _TBL24_h}, *TBL24_d, 0, TBL24_alloc_size);
device->memwrite({(void *) _TBLlong_h}, *TBLlong_d, 0, TBLlong_alloc_size);
}

void IPlookup::cuda_compute_handler(ComputeContext *cctx,
struct resource_param *res)
{
//printf("G++ datablock_kernel_arg (%lu)\n", sizeof(struct datablock_kernel_arg));
//printf("G++ .total_item_count (%lu)\n", offsetof(struct datablock_kernel_arg, total_item_count));
//printf("G++ .buffer_bases (%lu)\n", offsetof(struct datablock_kernel_arg, buffer_bases));
//printf("G++ .item_count (%lu)\n", offsetof(struct datablock_kernel_arg, item_count));
//printf("G++ .item_size (%lu)\n", offsetof(struct datablock_kernel_arg, item_size));
//printf("G++ .item_sizes (%lu)\n", offsetof(struct datablock_kernel_arg, item_sizes));

struct kernel_arg arg;
arg = {(void *) &TBL24_d, sizeof(void *), alignof(void *)};
void *ptr_args[2];
ptr_args[0] = cctx->unwrap_device_buffer(*TBL24_d);
arg = {(void *) &ptr_args[0], sizeof(void *), alignof(void *)};
cctx->push_kernel_arg(arg);
arg = {(void *) &TBLlong_d, sizeof(void *), alignof(void *)};
ptr_args[1] = cctx->unwrap_device_buffer(*TBLlong_d);
arg = {(void *) &ptr_args[1], sizeof(void *), alignof(void *)};
cctx->push_kernel_arg(arg);
kernel_t kern;
dev_kernel_t kern;
kern.ptr = ipv4_route_lookup_get_cuda_kernel();
cctx->enqueue_kernel_launch(kern, res);
}

void IPlookup::cuda_init_handler(ComputeDevice *device)
{
memory_t new_TBL24_d = /*(uint16_t *)*/ device->alloc_device_buffer(sizeof(uint16_t) * ipv4_get_TBL24_size(), HOST_TO_DEVICE);
memory_t new_TBLlong_d = /*(uint16_t *)*/ device->alloc_device_buffer(sizeof(uint16_t) * ipv4_get_TBLlong_size(), HOST_TO_DEVICE);
TBL24_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBL24");
TBLlong_h = (uint16_t *) ctx->node_local_storage->get_alloc("TBLlong");
device->memwrite(TBL24_h, new_TBL24_d, 0, sizeof(uint16_t) * ipv4_get_TBL24_size());
device->memwrite(TBLlong_h, new_TBLlong_d, 0, sizeof(uint16_t) * ipv4_get_TBLlong_size());

/* Store the device pointers for per-thread instances. */
memory_t *TBL24_dev_ptr_storage = (memory_t *) ctx->node_local_storage->get_alloc("TBL24_dev_ptr");
memory_t *TBLlong_dev_ptr_storage = (memory_t *) ctx->node_local_storage->get_alloc("TBLlong_dev_ptr");
(TBL24_dev_ptr_storage)[0] = new_TBL24_d;
(TBLlong_dev_ptr_storage)[0] = new_TBLlong_d;
}
#endif

int IPlookup::ipv4_route_add(uint32_t addr, uint16_t len, uint16_t nexthop)
Expand Down
4 changes: 2 additions & 2 deletions elements/ip/IPlookup.hh
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ protected:

uint16_t *TBL24_h;
uint16_t *TBLlong_h;
memory_t *TBL24_d;
memory_t *TBLlong_d;
dev_mem_t *TBL24_d;
dev_mem_t *TBLlong_d;
};

EXPORT_ELEMENT(IPlookup);
Expand Down
2 changes: 1 addition & 1 deletion elements/ip/IPlookup_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ __device__ uint32_t ntohl(uint32_t n)
/* The GPU kernel. */
__global__ void ipv4_route_lookup_cuda(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint8_t *checkbits_d,
uint16_t* __restrict__ TBL24_d,
uint16_t* __restrict__ TBLlong_d)
Expand Down
27 changes: 11 additions & 16 deletions elements/ipsec/IPsecAES.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ int IPsecAES::initialize()
h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array");

/* Get device pointer from the node local storage. */
d_key_array_ptr = ((memory_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr"))[0];
d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr");

if (aes_sa_entry_array != NULL) {
free(aes_sa_entry_array);
Expand Down Expand Up @@ -80,8 +80,8 @@ int IPsecAES::initialize_global()
entry->entry_idx = i;
rte_memcpy(entry->aes_key, "1234123412341234", AES_BLOCK_SIZE);
#ifdef USE_OPENSSL_EVP
// TODO: check if copying globally initialized evpctx works okay.
EVP_CIPHER_CTX_init(&entry->evpctx);
// TODO: check if copying globally initialized evpctx works okay.
EVP_CIPHER_CTX_init(&entry->evpctx);
//if (EVP_EncryptInit(&entry->evpctx, EVP_aes_128_ctr(), entry->aes_key, esph->esp_iv) != 1)
if (EVP_EncryptInit(&entry->evpctx, EVP_aes_128_ctr(), entry->aes_key, fake_iv) != 1)
fprintf(stderr, "IPsecAES: EVP_EncryptInit() - %s\n", ERR_error_string(ERR_get_error(), NULL));
Expand Down Expand Up @@ -121,7 +121,7 @@ int IPsecAES::initialize_per_node()
rte_memcpy(temp_array, aes_sa_entry_array, size);

/* Storage for pointer, which points aes key array in device */
ctx->node_local_storage->alloc("d_aes_key_array_ptr", sizeof(memory_t));
ctx->node_local_storage->alloc("d_aes_key_array_ptr", sizeof(dev_mem_t));

return 0;
}
Expand Down Expand Up @@ -182,30 +182,25 @@ int IPsecAES::process(int input_port, Packet *pkt)
void IPsecAES::cuda_init_handler(ComputeDevice *device)
{
// Put key array content to device space.
long key_array_size = sizeof(struct aes_sa_entry) * num_tunnels;
size_t key_array_size = sizeof(struct aes_sa_entry) * num_tunnels;
h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array");
memory_t key_array_in_device = device->alloc_device_buffer(key_array_size, 0);
device->memwrite(h_key_array, key_array_in_device, 0, key_array_size);
dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size);
device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size);

// Store the device pointer for per-thread instances.
memory_t *p = (memory_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr");
((memory_t *) p)[0] = key_array_in_device;
dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr");
*p = key_array_in_device;
}
#endif

#ifdef USE_CUDA
void IPsecAES::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res)
{
struct kernel_arg arg;
arg = {(void *) &d_key_array_ptr.ptr, sizeof(void *), alignof(void *)};
arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)};
cctx->push_kernel_arg(arg);

kernel_t kern;
dev_kernel_t kern;
kern.ptr = ipsec_aes_encryption_get_cuda_kernel();
cctx->enqueue_kernel_launch(kern, res);

// TODO: data-copy-opt
// ?->inc_dev_ver(dbid_ipsec_...);
}
#endif

Expand Down
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAES.hh
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ protected:

/* Per-thread pointers, which points to the node local storage variables. */
std::unordered_map<struct ipaddr_pair, int> *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr.
struct aes_sa_entry *h_key_array = NULL; // used in CPU.
memory_t d_key_array_ptr; // points to the device buffer.
struct aes_sa_entry *h_key_array = nullptr; // used in CPU.
dev_mem_t *d_key_array_ptr;
};

EXPORT_ELEMENT(IPsecAES);
Expand Down
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAES_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,7 @@ __device__ void AES_encrypt_cu_optimized(const uint8_t *in, uint8_t *out,

__global__ void AES_ctr_encrypt_chunk_SharedMem_5(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint8_t *checkbits_d,
struct aes_sa_entry* flow_info
)
Expand All @@ -695,7 +695,7 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < count && count != 0) {

const uint16_t batch_idx = batch_ids[idx];
const uint8_t batch_idx = batch_ids[idx];
const uint16_t item_idx = item_ids[idx];

const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
Expand Down
18 changes: 9 additions & 9 deletions elements/ipsec/IPsecAuthHMACSHA1.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ int IPsecAuthHMACSHA1::initialize()
h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array");

/* Get device pointer from the node local storage. */
d_key_array_ptr = ((struct hmac_sa_entry **)ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr"))[0];
d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr");

if (hmac_sa_entry_array != NULL) {
free(hmac_sa_entry_array);
Expand Down Expand Up @@ -111,7 +111,7 @@ int IPsecAuthHMACSHA1::initialize_per_node()
rte_memcpy(temp_array, hmac_sa_entry_array, size);

/* Storage for pointer, which points hmac key array in device */
ctx->node_local_storage->alloc("d_hmac_key_array_ptr", sizeof(void *));
ctx->node_local_storage->alloc("d_hmac_key_array_ptr", sizeof(dev_mem_t));

return 0;
}
Expand Down Expand Up @@ -177,24 +177,24 @@ int IPsecAuthHMACSHA1::process(int input_port, Packet *pkt)
void IPsecAuthHMACSHA1::cuda_init_handler(ComputeDevice *device)
{
// Put key array content to device space.
long key_array_size = sizeof(struct hmac_sa_entry) * num_tunnels;
size_t key_array_size = sizeof(struct hmac_sa_entry) * num_tunnels;
h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array");
memory_t key_array_in_device = /*(struct hmac_sa_entry *)*/ device->alloc_device_buffer(key_array_size, 0);
device->memwrite(h_key_array, key_array_in_device, 0, key_array_size);
dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size);
device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size);

// Store the device pointer for per-thread instances.
void *p = ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr");
((memory_t *) p)[0] = key_array_in_device;
dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr");
*p = key_array_in_device;
}

void IPsecAuthHMACSHA1::cuda_compute_handler(ComputeContext *cctx,
struct resource_param *res)
{
struct kernel_arg arg;
arg = {(void *) &d_key_array_ptr, sizeof(void *), alignof(void *)};
arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)};
cctx->push_kernel_arg(arg);

kernel_t kern;
dev_kernel_t kern;
kern.ptr = ipsec_hsha1_encryption_get_cuda_kernel();
cctx->enqueue_kernel_launch(kern, res);
}
Expand Down
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAuthHMACSHA1.hh
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ protected:
int dummy_index;

std::unordered_map<struct ipaddr_pair, int> *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr.
struct hmac_sa_entry *h_key_array = NULL; // used in CPU.
struct hmac_sa_entry *d_key_array_ptr = NULL; // points to the device buffer.
struct hmac_sa_entry *h_key_array = nullptr; // used in CPU.
dev_mem_t *d_key_array_ptr; // points to the device buffer.

private:
const int idx_pkt_offset = 0;
Expand Down
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1239,13 +1239,13 @@ __global__ void computeHMAC_SHA1_2(char* buf, char* keys, uint32_t *offsets,

__global__ void computeHMAC_SHA1_3(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint16_t *batch_ids, uint16_t *item_ids,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint8_t *checkbits_d,
struct hmac_sa_entry *hmac_key_array)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < count && count != 0) {
const uint16_t batch_idx = batch_ids[idx];
const uint8_t batch_idx = batch_ids[idx];
const uint16_t item_idx = item_ids[idx];
assert(item_idx < 64);
const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
Expand Down
Loading

0 comments on commit e75e280

Please sign in to comment.