diff --git a/lib/CL/devices/vortex/pocl-vortex.c b/lib/CL/devices/vortex/pocl-vortex.c index 356e802d9b949f918a4a47c72939163b7be886ff..91119a766829fd9f22b9f00d09422629ffe6ab52 100644 --- a/lib/CL/devices/vortex/pocl-vortex.c +++ b/lib/CL/devices/vortex/pocl-vortex.c @@ -70,6 +70,10 @@ static cl_bool vortex_available = CL_TRUE; static const char *vortex_native_device_aux_funcs[] = {NULL}; +char* pocl_vortex_init_build(void *data) { + return strdup("-target-feature +m -target-feature +a -target-feature +f -target-feature +d"); +} + void pocl_vortex_init_device_ops(struct pocl_device_ops *ops) { ops->device_name = "vortex"; @@ -95,6 +99,7 @@ void pocl_vortex_init_device_ops(struct pocl_device_ops *ops) { ops->supports_binary = pocl_driver_supports_binary; ops->build_poclbinary = pocl_driver_build_poclbinary; ops->build_builtin = pocl_driver_build_opencl_builtins; + ops->init_build = pocl_vortex_init_build; ops->post_build_program = pocl_vortex_post_build_program; ops->free_program = pocl_vortex_free_program; @@ -175,6 +180,7 @@ pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters) dev->address_bits = VORTEX_XLEN; dev->llvm_target_triplet = is64bit ? "riscv64-unknown-unknown" : "riscv32-unknown-unknown"; dev->llvm_abi = is64bit ? "lp64d" : "ilp32f"; + dev->llvm_cpu = is64bit ? "generic-rv64" : "generic-rv32"; dev->kernellib_name = is64bit ? "kernel-riscv64" : "kernel-riscv32"; dev->kernellib_fallback_name = NULL; dev->kernellib_subdir = "vortex"; @@ -214,9 +220,31 @@ pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters) return CL_DEVICE_NOT_FOUND; } + uint64_t num_warps; + vx_err = vx_dev_caps(vx_device, VX_CAPS_NUM_WARPS, &num_warps); + if (vx_err != 0) { + vx_dev_close(vx_device); + free(dd); + return CL_DEVICE_NOT_FOUND; + } + + uint64_t num_threads; + vx_err = vx_dev_caps(vx_device, VX_CAPS_NUM_THREADS, &num_threads); + if (vx_err != 0) { + vx_dev_close(vx_device); + free(dd); + return CL_DEVICE_NOT_FOUND; + } + + uint64_t max_work_group_size = num_warps * num_threads; + dev->global_mem_size = global_mem_size; dev->max_mem_alloc_size = global_mem_size; dev->local_mem_size = local_mem_size; + dev->max_work_group_size = max_work_group_size; + dev->max_work_item_sizes[0] = max_work_group_size; + dev->max_work_item_sizes[1] = max_work_group_size; + dev->max_work_item_sizes[2] = max_work_group_size; dev->max_compute_units = num_cores; dd->vx_kernel_buffer = NULL; @@ -345,7 +373,7 @@ int pocl_vortex_create_kernel (cl_device_id device, cl_program program, const char* current = pdata->kernel_names; int i = 0; int found = 0; - for (int i = 0; i < pdata->num_kernels; ++i) { + for (; i < pdata->num_kernels; ++i) { if (strcmp(current, kernel->name) == 0) { found = 1; break; @@ -439,7 +467,7 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) { } // check occupancy - if (local_mem_size != 0) { + if (group_size != 1) { int available_localmem; vx_err = vx_check_occupancy(dd->vx_device, group_size, &available_localmem); if (vx_err != 0) { diff --git a/lib/CL/devices/vortex/vortex_utils.cc b/lib/CL/devices/vortex/vortex_utils.cc index 08c77da4109c71f664d81654178fbaae5adfe295..35a81a1e552b33fba911c8a73e299b297ee4ea89 100644 --- a/lib/CL/devices/vortex/vortex_utils.cc +++ b/lib/CL/devices/vortex/vortex_utils.cc @@ -101,10 +101,10 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module auto I32Ty = llvm::Type::getInt32Ty(Context); auto I8Ty = llvm::Type::getInt8Ty(Context); + auto I8PtrTy = I8Ty->getPointerTo(); // Create new function signature - auto ArgBufferType = llvm::PointerType::get(llvm::Type::getInt8Ty(Context), 0); - auto NewFuncType = llvm::FunctionType::get(function->getReturnType(), {ArgBufferType}, false); + auto NewFuncType = llvm::FunctionType::get(function->getReturnType(), {I8PtrTy}, false); auto NewFunc = llvm::Function::Create(NewFuncType, function->getLinkage(), function->getName() + "_vortex"); module->getFunctionList().insert(function->getIterator(), NewFunc); NewFunc->takeName(function); @@ -116,13 +116,14 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module auto ai = NewFunc->arg_begin(); auto ArgBuffer = &*ai++; ArgBuffer->setName("ArgBuffer"); - auto I8PtrTy = I8Ty->getPointerTo(); unsigned arg_idx = 0; unsigned arg_offset = 0; llvm::Value* allocated_local_mem = nullptr; + auto MDS = llvm::MDNode::get(Context, llvm::MDString::get(Context, "vortex.uniform")); + for (auto& OldArg : function->args()) { auto ArgType = OldArg.getType(); auto ArgOffset = llvm::ConstantInt::get(I32Ty, arg_offset); @@ -149,6 +150,10 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module Arg = Builder.CreateLoad(ArgType, offset_ptr, OldArg.getName() + "_loaded"); arg_offset += DL.getTypeAllocSize(ArgType); } + auto instr = llvm::cast<llvm::Instruction>(Arg); + assert(instr != nullptr); + instr->setMetadata("vortex.uniform", MDS); + OldArg.replaceAllUsesWith(Arg); arg_idx += 1; } @@ -186,9 +191,9 @@ static void addKernelSelect(llvm::SmallVector<std::string, 8>& funcNames, llvm:: auto& Context = module->getContext(); auto I32Ty = llvm::Type::getInt32Ty(Context); - auto VoidTy = llvm::Type::getVoidTy(Context); - auto VoidPtrTy = llvm::PointerType::getUnqual(VoidTy); - auto GetKernelCallbackTy = llvm::FunctionType::get(VoidPtrTy, {I32Ty}, false); + auto I8Ty = llvm::Type::getInt8Ty(Context); + auto I8PtrTy = I8Ty->getPointerTo(); + auto GetKernelCallbackTy = llvm::FunctionType::get(I8PtrTy, {I32Ty}, false); auto GetKernelCallbackFunc = llvm::Function::Create( GetKernelCallbackTy, llvm::Function::ExternalLinkage, "__vx_get_kernel_callback", module);