diff --git a/lib/CL/devices/vortex/kernel_args.h b/lib/CL/devices/vortex/kernel_args.h
index 00cb384a0824e5d52c3eed3a00f523e47bfdd573..9612684c42ecda7311faae0ea282a6b1a867dc17 100644
--- a/lib/CL/devices/vortex/kernel_args.h
+++ b/lib/CL/devices/vortex/kernel_args.h
@@ -7,3 +7,12 @@ typedef struct {
   uint32_t global_offset[3];
   uint32_t kernel_id;
 } kernel_args_t;
+
+/* Rounds `offset` up to the next multiple of `alignment`.
+ * The mask arithmetic is only valid when `alignment` is a non-zero power of
+ * two. Declared `static inline` so each C translation unit that includes this
+ * header gets its own definition; a plain C99 `inline` definition emits no
+ * external symbol and fails to link wherever the call is not inlined. */
+static inline uint32_t alignOffset(uint32_t offset, uint32_t alignment) {
+  return (offset + alignment - 1) & ~(alignment - 1);
+}
diff --git a/lib/CL/devices/vortex/kernel_main.c b/lib/CL/devices/vortex/kernel_main.c
index 0fbfa420d9f04c5634d01f936e6db03c3b66ac07..52218213889ddd29142d96ebf1c41fe7f3e985ec 100644
--- a/lib/CL/devices/vortex/kernel_main.c
+++ b/lib/CL/devices/vortex/kernel_main.c
@@ -18,8 +18,8 @@ int main(void) {
   for (int i = 0, n = kargs->work_dim; i < 3; i++) {
     g_global_offset.m[i] = (i < n) ? kargs->global_offset[i] : 0;
   }
-
-  void* arg = (void*)((uint8_t*)kargs + sizeof(kernel_args_t));
+  uint32_t aligned_kernel_args_size = alignOffset(sizeof(kernel_args_t), sizeof(size_t));
+  void* arg = (void*)((uint8_t*)kargs + aligned_kernel_args_size);
   vx_kernel_func_cb kernel_func = (vx_kernel_func_cb)__vx_get_kernel_callback(kargs->kernel_id);
   return vx_spawn_threads(kargs->work_dim, kargs->num_groups, kargs->local_size, kernel_func, arg);
 }
diff --git a/lib/CL/devices/vortex/pocl-vortex.c b/lib/CL/devices/vortex/pocl-vortex.c
index 0ab5cf0ab393fe2ebf17e9da1ffa2dfd9a3d56be..bdd48f0e21c88c07d80e948b95ff4199885b9900 100644
--- a/lib/CL/devices/vortex/pocl-vortex.c
+++ b/lib/CL/devices/vortex/pocl-vortex.c
@@ -179,7 +179,7 @@ pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters)
   dev->llvm_cpu = NULL;

   dev->address_bits = is_64bit ? 64 : 32;
-  dev->llvm_target_triplet = is_64bit ? "riscv64-unknown-unknown" : "riscv32-unknown-unknown";
+  dev->llvm_target_triplet = is_64bit ? "riscv64-unknown-unknown-elf" : "riscv32-unknown-unknown-elf";
   dev->llvm_abi = is_64bit ? "lp64d" : "ilp32f";
   dev->llvm_cpu = is_64bit ? "generic-rv64" : "generic-rv32";
   dev->kernellib_name = is_64bit ? "kernel-riscv64" : "kernel-riscv32";
@@ -424,9 +424,9 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
   struct pocl_context *pc = &cmd->command.run.pc;
   int vx_err;

-  int num_groups = 1;
-  int group_size = 1;
-  for (int i = 0; i < pc->work_dim; ++i) {
+  uint32_t num_groups = 1;
+  uint32_t group_size = 1;
+  for (uint32_t i = 0; i < pc->work_dim; ++i) {
     num_groups *= pc->num_groups[i];
     group_size *= pc->local_size[i];
   }
@@ -436,37 +436,39 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
   assert (data != NULL);
   dd = (vortex_device_data_t *)data;

-  int ptr_size = dd->is_64bit ? 8 : 4;
+  uint32_t ptr_size = dd->is_64bit ? 8 : 4;
+
+  uint32_t aligned_kernel_args_size = alignOffset(sizeof(kernel_args_t), ptr_size);

   // calculate kernel arguments buffer size
-  int local_mem_size = 0;
+  uint32_t local_mem_size = 0;
   size_t abuf_size = 0;

   for (int i = 0; i < meta->num_args; ++i) {
     struct pocl_argument* al = &(cmd->command.run.arguments[i]);
     if (ARG_IS_LOCAL(meta->arg_info[i])) {
       local_mem_size += al->size;
-      abuf_size += 4;
+      abuf_size = alignOffset(abuf_size + 4, ptr_size);
     } else
     if ((meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
      || (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
      || (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)) {
-      abuf_size += ptr_size;
+      abuf_size = alignOffset(abuf_size + ptr_size, ptr_size);
     } else {
       // scalar argument
-      abuf_size += al->size;
+      abuf_size = alignOffset(abuf_size + al->size, ptr_size);
     }
   }

   // local buffers
   for (int i = 0; i < meta->num_locals; ++i) {
     local_mem_size += meta->local_sizes[i];
-    abuf_size += 4;
+    abuf_size = alignOffset(abuf_size + 4, ptr_size);
   }

   // add local size
   if (local_mem_size != 0) {
-    abuf_size += 4;
+    abuf_size = alignOffset(abuf_size + 4, ptr_size);
   }

   // check occupancy
@@ -483,7 +485,7 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
   }

   // allocate arguments host buffer
-  size_t kargs_buffer_size = sizeof(kernel_args_t) + abuf_size;
+  size_t kargs_buffer_size = aligned_kernel_args_size + abuf_size;
   uint8_t* const host_kargs_base_ptr = malloc(kargs_buffer_size);
   assert(host_kargs_base_ptr);

@@ -514,30 +516,31 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {

   // write arguments

-  uint8_t* host_args_ptr = host_kargs_base_ptr + sizeof(kernel_args_t);
-  int local_mem_offset = 0;
+  uint8_t* const host_args_ptr = host_kargs_base_ptr + aligned_kernel_args_size;
+  uint32_t host_args_offset = 0;
+  uint32_t local_mem_offset = 0;

   for (int i = 0; i < meta->num_args; ++i) {
     struct pocl_argument* al = &(cmd->command.run.arguments[i]);
     if (ARG_IS_LOCAL(meta->arg_info[i])) {
       if (local_mem_offset == 0) {
-        memcpy(host_args_ptr, &local_mem_size, 4); // local_size
-        host_args_ptr += 4;
+        memcpy(host_args_ptr + host_args_offset, &local_mem_size, 4); // local_size
+        host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
       }
-      memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset
-      host_args_ptr += 4;
+      memcpy(host_args_ptr + host_args_offset, &local_mem_offset, 4); // arg offset
+      host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
       local_mem_offset += al->size;
     } else
     if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER) {
       if (al->value == NULL) {
-        memset(host_args_ptr, 0, ptr_size); // NULL pointer value
-        host_args_ptr += ptr_size;
+        memset(host_args_ptr + host_args_offset, 0, ptr_size); // NULL pointer value
+        host_args_offset = alignOffset(host_args_offset + ptr_size, ptr_size);
       } else {
         cl_mem m = (*(cl_mem *)(al->value));
         vortex_buffer_data_t* buf_data = (vortex_buffer_data_t *) m->device_ptrs[cmd->device->global_mem_id].mem_ptr;
         uint64_t dev_mem_addr = buf_data->buf_address + al->offset;
-        memcpy(host_args_ptr, &buf_data->buf_address, ptr_size); // pointer value
-        host_args_ptr += ptr_size;
+        memcpy(host_args_ptr + host_args_offset, &buf_data->buf_address, ptr_size); // pointer value
+        host_args_offset = alignOffset(host_args_offset + ptr_size, ptr_size);
       }
     } else
     if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE) {
@@ -547,19 +550,19 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
       POCL_ABORT("POCL_VORTEX_RUN\n");
     } else {
       // scalar argument
-      memcpy(host_args_ptr, al->value, al->size); // scalar value
-      host_args_ptr += al->size;
+      memcpy(host_args_ptr + host_args_offset, al->value, al->size); // scalar value
+      host_args_offset = alignOffset(host_args_offset + al->size, ptr_size);
     }
   }

   // write local arguments
   for (int i = 0; i < meta->num_locals; ++i) {
     if (local_mem_offset == 0) {
-      memcpy(host_args_ptr, &local_mem_size, 4); // local_size
-      host_args_ptr += 4;
+      memcpy(host_args_ptr + host_args_offset, &local_mem_size, 4); // local_size
+      host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
     }
-    memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset
-    host_args_ptr += 4;
+    memcpy(host_args_ptr + host_args_offset, &local_mem_offset, 4); // arg offset
+    host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
     local_mem_offset += meta->local_sizes[i];
   }

diff --git a/lib/CL/devices/vortex/vortex_utils.cc b/lib/CL/devices/vortex/vortex_utils.cc
index 35a81a1e552b33fba911c8a73e299b297ee4ea89..974bd2ca4c4db7ce88e7640ee1f54698b7dd3433 100644
--- a/lib/CL/devices/vortex/vortex_utils.cc
+++ b/lib/CL/devices/vortex/vortex_utils.cc
@@ -46,6 +46,7 @@

 #include "pocl_util.h"
 #include "LLVMUtils.h"
+#include "kernel_args.h"

 static int exec(const char* cmd, std::ostream& out) {
   char buffer[128];
@@ -99,6 +100,9 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
   auto &Context = module->getContext();
   const llvm::DataLayout &DL = module->getDataLayout();

+  std::string TargetTriple = module->getTargetTriple();
+  bool is64Bit = TargetTriple.find("riscv64") != std::string::npos;
+
   auto I32Ty = llvm::Type::getInt32Ty(Context);
   auto I8Ty = llvm::Type::getInt8Ty(Context);
   auto I8PtrTy = I8Ty->getPointerTo();
@@ -124,6 +128,8 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module

   auto MDS = llvm::MDNode::get(Context, llvm::MDString::get(Context, "vortex.uniform"));

+  uint32_t BaseAlignment = is64Bit ? 8 : 4;
+
   for (auto& OldArg : function->args()) {
     auto ArgType = OldArg.getType();
     auto ArgOffset = llvm::ConstantInt::get(I32Ty, arg_offset);
@@ -132,8 +138,8 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
       if (allocated_local_mem == nullptr) {
         // Load __local_size
         auto local_size_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, "__local_size_ptr");
-        arg_offset += 4;
         auto local_size = Builder.CreateLoad(I32Ty, local_size_ptr, "__local_size");
+        arg_offset = alignOffset(arg_offset + 4, BaseAlignment);
         // Call vx_local_alloc(__local_size)
         auto function_type = llvm::FunctionType::get(I8PtrTy, {I32Ty}, false);
         auto vx_local_alloc_func = module->getOrInsertFunction("vx_local_alloc", function_type);
@@ -142,13 +148,13 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
       // Load argument __offset
       auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr");
       auto offset = Builder.CreateLoad(I32Ty, offset_ptr, OldArg.getName() + "_offset");
-      arg_offset += 4;
+      arg_offset = alignOffset(arg_offset + 4, BaseAlignment);
       // Apply pointer offset
       Arg = Builder.CreateGEP(I8PtrTy, allocated_local_mem, offset, OldArg.getName() + "_byte_ptr");
     } else {
       auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr");
       Arg = Builder.CreateLoad(ArgType, offset_ptr, OldArg.getName() + "_loaded");
-      arg_offset += DL.getTypeAllocSize(ArgType);
+      arg_offset = alignOffset(arg_offset + DL.getTypeAllocSize(ArgType), BaseAlignment);
     }
     auto instr = llvm::cast<llvm::Instruction>(Arg);
     assert(instr != nullptr);