diff --git a/lib/CL/devices/vortex/pocl-vortex.c b/lib/CL/devices/vortex/pocl-vortex.c
index 356e802d9b949f918a4a47c72939163b7be886ff..91119a766829fd9f22b9f00d09422629ffe6ab52 100644
--- a/lib/CL/devices/vortex/pocl-vortex.c
+++ b/lib/CL/devices/vortex/pocl-vortex.c
@@ -70,6 +70,10 @@ static cl_bool vortex_available = CL_TRUE;
 
 static const char *vortex_native_device_aux_funcs[] = {NULL};
 
+char* pocl_vortex_init_build(void *data) { // init_build hook for the vortex device ops; 'data' is not used here
+    return strdup("-target-feature +m -target-feature +a -target-feature +f -target-feature +d"); // heap-allocated copy of the fixed -target-feature flags (+m +a +f +d); NULL if strdup fails
+}
+
 void pocl_vortex_init_device_ops(struct pocl_device_ops *ops) {
 
   ops->device_name = "vortex";
@@ -95,6 +99,7 @@ void pocl_vortex_init_device_ops(struct pocl_device_ops *ops) {
   ops->supports_binary = pocl_driver_supports_binary;
   ops->build_poclbinary = pocl_driver_build_poclbinary;
   ops->build_builtin = pocl_driver_build_opencl_builtins;
+  ops->init_build = pocl_vortex_init_build;
 
   ops->post_build_program = pocl_vortex_post_build_program;
   ops->free_program = pocl_vortex_free_program;
@@ -175,6 +180,7 @@ pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters)
   dev->address_bits = VORTEX_XLEN;
   dev->llvm_target_triplet = is64bit ? "riscv64-unknown-unknown" : "riscv32-unknown-unknown";
   dev->llvm_abi = is64bit ? "lp64d" : "ilp32f";
+  dev->llvm_cpu = is64bit ? "generic-rv64" : "generic-rv32";
   dev->kernellib_name = is64bit ? "kernel-riscv64" : "kernel-riscv32";
   dev->kernellib_fallback_name = NULL;
   dev->kernellib_subdir = "vortex";
@@ -214,9 +220,31 @@ pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters)
     return CL_DEVICE_NOT_FOUND;
   }
 
+  uint64_t num_warps;
+  vx_err = vx_dev_caps(vx_device, VX_CAPS_NUM_WARPS, &num_warps);
+  if (vx_err != 0) {
+    vx_dev_close(vx_device);
+    free(dd);
+    return CL_DEVICE_NOT_FOUND;
+  }
+
+  uint64_t num_threads;
+  vx_err = vx_dev_caps(vx_device, VX_CAPS_NUM_THREADS, &num_threads);
+  if (vx_err != 0) {
+    vx_dev_close(vx_device);
+    free(dd);
+    return CL_DEVICE_NOT_FOUND;
+  }
+
+  uint64_t max_work_group_size = num_warps * num_threads;
+
   dev->global_mem_size = global_mem_size;
   dev->max_mem_alloc_size = global_mem_size;
   dev->local_mem_size = local_mem_size;
+  dev->max_work_group_size    = max_work_group_size;
+  dev->max_work_item_sizes[0] = max_work_group_size;
+  dev->max_work_item_sizes[1] = max_work_group_size;
+  dev->max_work_item_sizes[2] = max_work_group_size;
   dev->max_compute_units = num_cores;
 
   dd->vx_kernel_buffer = NULL;
@@ -345,7 +373,7 @@ int pocl_vortex_create_kernel (cl_device_id device, cl_program program,
     const char* current = pdata->kernel_names;
     int i = 0;
     int found = 0;
-    for (int i = 0; i < pdata->num_kernels; ++i) {
+    for (; i < pdata->num_kernels; ++i) {
       if (strcmp(current, kernel->name) == 0) {
         found = 1;
         break;
@@ -439,7 +467,7 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
   }
 
   // check occupancy
-  if (local_mem_size != 0) {
+  if (group_size != 1) {
     int available_localmem;
     vx_err = vx_check_occupancy(dd->vx_device, group_size, &available_localmem);
     if (vx_err != 0) {
diff --git a/lib/CL/devices/vortex/vortex_utils.cc b/lib/CL/devices/vortex/vortex_utils.cc
index 08c77da4109c71f664d81654178fbaae5adfe295..35a81a1e552b33fba911c8a73e299b297ee4ea89 100644
--- a/lib/CL/devices/vortex/vortex_utils.cc
+++ b/lib/CL/devices/vortex/vortex_utils.cc
@@ -101,10 +101,10 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
 
   auto I32Ty = llvm::Type::getInt32Ty(Context);
   auto I8Ty = llvm::Type::getInt8Ty(Context);
+  auto I8PtrTy = I8Ty->getPointerTo();
 
   // Create new function signature
-  auto ArgBufferType = llvm::PointerType::get(llvm::Type::getInt8Ty(Context), 0);
-  auto NewFuncType = llvm::FunctionType::get(function->getReturnType(), {ArgBufferType}, false);
+  auto NewFuncType = llvm::FunctionType::get(function->getReturnType(), {I8PtrTy}, false);
   auto NewFunc = llvm::Function::Create(NewFuncType, function->getLinkage(), function->getName() + "_vortex");
   module->getFunctionList().insert(function->getIterator(), NewFunc);
   NewFunc->takeName(function);
@@ -116,13 +116,14 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
   auto ai = NewFunc->arg_begin();
   auto ArgBuffer = &*ai++;
   ArgBuffer->setName("ArgBuffer");
-  auto I8PtrTy = I8Ty->getPointerTo();
 
   unsigned arg_idx = 0;
   unsigned arg_offset = 0;
 
   llvm::Value* allocated_local_mem = nullptr;
 
+  auto MDS = llvm::MDNode::get(Context, llvm::MDString::get(Context, "vortex.uniform"));
+
   for (auto& OldArg : function->args()) {
     auto ArgType = OldArg.getType();
     auto ArgOffset = llvm::ConstantInt::get(I32Ty, arg_offset);
@@ -149,6 +150,10 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
       Arg = Builder.CreateLoad(ArgType, offset_ptr, OldArg.getName() + "_loaded");
       arg_offset += DL.getTypeAllocSize(ArgType);
     }
+    auto instr = llvm::cast<llvm::Instruction>(Arg);
+    assert(instr != nullptr);
+    instr->setMetadata("vortex.uniform", MDS);
+
     OldArg.replaceAllUsesWith(Arg);
     arg_idx += 1;
   }
@@ -186,9 +191,9 @@ static void addKernelSelect(llvm::SmallVector<std::string, 8>& funcNames, llvm::
   auto& Context = module->getContext();
 
   auto I32Ty = llvm::Type::getInt32Ty(Context);
-  auto VoidTy = llvm::Type::getVoidTy(Context);
-  auto VoidPtrTy = llvm::PointerType::getUnqual(VoidTy);
-  auto GetKernelCallbackTy = llvm::FunctionType::get(VoidPtrTy, {I32Ty}, false);
+  auto I8Ty = llvm::Type::getInt8Ty(Context);
+  auto I8PtrTy = I8Ty->getPointerTo();
+  auto GetKernelCallbackTy = llvm::FunctionType::get(I8PtrTy, {I32Ty}, false);
 
   auto GetKernelCallbackFunc = llvm::Function::Create(
     GetKernelCallbackTy, llvm::Function::ExternalLinkage, "__vx_get_kernel_callback", module);