diff --git a/lib/CL/devices/vortex/kernel_args.h b/lib/CL/devices/vortex/kernel_args.h
index 00cb384a0824e5d52c3eed3a00f523e47bfdd573..9612684c42ecda7311faae0ea282a6b1a867dc17 100644
--- a/lib/CL/devices/vortex/kernel_args.h
+++ b/lib/CL/devices/vortex/kernel_args.h
@@ -7,3 +7,17 @@ typedef struct {
   uint32_t global_offset[3];
   uint32_t kernel_id;
 } kernel_args_t;
+
+/*
+ * Round `offset` up to the nearest multiple of `alignment`.
+ *
+ * `alignment` must be a non-zero power of two: the mask arithmetic below
+ * is only a valid round-up in that case.
+ *
+ * Declared `static inline` because this header is included from both C and
+ * C++ translation units; a plain `inline` definition in C does not provide
+ * an external definition and can fail to link when the call is not inlined.
+ */
+static inline uint32_t alignOffset(uint32_t offset, uint32_t alignment) {
+  return (offset + alignment - 1) & ~(alignment - 1);
+}
diff --git a/lib/CL/devices/vortex/kernel_main.c b/lib/CL/devices/vortex/kernel_main.c
index 0fbfa420d9f04c5634d01f936e6db03c3b66ac07..52218213889ddd29142d96ebf1c41fe7f3e985ec 100644
--- a/lib/CL/devices/vortex/kernel_main.c
+++ b/lib/CL/devices/vortex/kernel_main.c
@@ -18,8 +18,8 @@ int main(void) {
   for (int i = 0, n = kargs->work_dim; i < 3; i++) {
     g_global_offset.m[i] = (i < n) ? kargs->global_offset[i] : 0;
   }
-
-  void* arg = (void*)((uint8_t*)kargs + sizeof(kernel_args_t));
+  uint32_t aligned_kernel_args_size = alignOffset(sizeof(kernel_args_t), sizeof(size_t));
+  void* arg = (void*)((uint8_t*)kargs + aligned_kernel_args_size);
   vx_kernel_func_cb kernel_func = (vx_kernel_func_cb)__vx_get_kernel_callback(kargs->kernel_id);
   return vx_spawn_threads(kargs->work_dim, kargs->num_groups, kargs->local_size, kernel_func, arg);
 }
diff --git a/lib/CL/devices/vortex/pocl-vortex.c b/lib/CL/devices/vortex/pocl-vortex.c
index 0ab5cf0ab393fe2ebf17e9da1ffa2dfd9a3d56be..bdd48f0e21c88c07d80e948b95ff4199885b9900 100644
--- a/lib/CL/devices/vortex/pocl-vortex.c
+++ b/lib/CL/devices/vortex/pocl-vortex.c
@@ -179,7 +179,7 @@ pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters)
 
   dev->llvm_cpu = NULL;
   dev->address_bits = is_64bit ? 64 : 32;
-  dev->llvm_target_triplet = is_64bit ? "riscv64-unknown-unknown" : "riscv32-unknown-unknown";
+  dev->llvm_target_triplet = is_64bit ? "riscv64-unknown-unknown-elf" : "riscv32-unknown-unknown-elf";
   dev->llvm_abi = is_64bit ? "lp64d" : "ilp32f";
   dev->llvm_cpu = is_64bit ? "generic-rv64" : "generic-rv32";
   dev->kernellib_name = is_64bit ? "kernel-riscv64" : "kernel-riscv32";
@@ -424,9 +424,9 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
   struct pocl_context *pc = &cmd->command.run.pc;
   int vx_err;
 
-  int num_groups = 1;
-  int group_size = 1;
-  for (int i = 0; i < pc->work_dim; ++i) {
+  uint32_t num_groups = 1;
+  uint32_t group_size = 1;
+  for (uint32_t i = 0; i < pc->work_dim; ++i) {
     num_groups *= pc->num_groups[i];
     group_size *= pc->local_size[i];
   }
@@ -436,37 +436,39 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
   assert (data != NULL);
   dd = (vortex_device_data_t *)data;
 
-  int ptr_size = dd->is_64bit ? 8 : 4;
+  uint32_t ptr_size = dd->is_64bit ? 8 : 4;
+
+  uint32_t aligned_kernel_args_size = alignOffset(sizeof(kernel_args_t), ptr_size);
 
   // calculate kernel arguments buffer size
-  int local_mem_size = 0;
+  uint32_t local_mem_size = 0;
   size_t abuf_size = 0;
 
   for (int i = 0; i < meta->num_args; ++i) {
     struct pocl_argument* al = &(cmd->command.run.arguments[i]);
     if (ARG_IS_LOCAL(meta->arg_info[i])) {
       local_mem_size += al->size;
-      abuf_size += 4;
+      abuf_size = alignOffset(abuf_size + 4, ptr_size);
     } else
     if ((meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
      || (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
      || (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)) {
-      abuf_size += ptr_size;
+      abuf_size = alignOffset(abuf_size + ptr_size, ptr_size);
     } else {
       // scalar argument
-      abuf_size += al->size;
+      abuf_size = alignOffset(abuf_size + al->size, ptr_size);
     }
   }
 
   // local buffers
   for (int i = 0; i < meta->num_locals; ++i) {
     local_mem_size += meta->local_sizes[i];
-    abuf_size += 4;
+    abuf_size = alignOffset(abuf_size + 4, ptr_size);
   }
 
   // add local size
   if (local_mem_size != 0) {
-    abuf_size += 4;
+    abuf_size = alignOffset(abuf_size + 4, ptr_size);
   }
 
   // check occupancy
@@ -483,7 +485,7 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
   }
 
   // allocate arguments host buffer
-  size_t kargs_buffer_size = sizeof(kernel_args_t) + abuf_size;
+  size_t kargs_buffer_size = aligned_kernel_args_size + abuf_size;
   uint8_t* const host_kargs_base_ptr = malloc(kargs_buffer_size);
   assert(host_kargs_base_ptr);
 
@@ -514,30 +516,31 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
 
   // write arguments
 
-  uint8_t* host_args_ptr = host_kargs_base_ptr + sizeof(kernel_args_t);
-  int local_mem_offset = 0;
+  uint8_t* const host_args_ptr = host_kargs_base_ptr + aligned_kernel_args_size;
+  uint32_t host_args_offset = 0;
+  uint32_t local_mem_offset = 0;
 
   for (int i = 0; i < meta->num_args; ++i) {
     struct pocl_argument* al = &(cmd->command.run.arguments[i]);
     if (ARG_IS_LOCAL(meta->arg_info[i])) {
       if (local_mem_offset == 0) {
-        memcpy(host_args_ptr, &local_mem_size, 4); // local_size
-        host_args_ptr += 4;
+        memcpy(host_args_ptr + host_args_offset, &local_mem_size, 4); // local_size
+        host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
       }
-      memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset
-      host_args_ptr += 4;
+      memcpy(host_args_ptr + host_args_offset, &local_mem_offset, 4); // arg offset
+      host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
       local_mem_offset += al->size;
     } else
     if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER) {
       if (al->value == NULL) {
-        memset(host_args_ptr, 0, ptr_size); // NULL pointer value
-        host_args_ptr += ptr_size;
+        memset(host_args_ptr + host_args_offset, 0, ptr_size); // NULL pointer value
+        host_args_offset = alignOffset(host_args_offset + ptr_size, ptr_size);
       } else {
         cl_mem m = (*(cl_mem *)(al->value));
         vortex_buffer_data_t* buf_data = (vortex_buffer_data_t *) m->device_ptrs[cmd->device->global_mem_id].mem_ptr;
         uint64_t dev_mem_addr = buf_data->buf_address + al->offset;
-        memcpy(host_args_ptr, &buf_data->buf_address, ptr_size); // pointer value
-        host_args_ptr += ptr_size;
+        memcpy(host_args_ptr + host_args_offset, &buf_data->buf_address, ptr_size); // pointer value
+        host_args_offset = alignOffset(host_args_offset + ptr_size, ptr_size);
       }
     } else
     if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE) {
@@ -547,19 +550,19 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
         POCL_ABORT("POCL_VORTEX_RUN\n");
     } else {
       // scalar argument
-      memcpy(host_args_ptr, al->value, al->size); // scalar value
-      host_args_ptr += al->size;
+      memcpy(host_args_ptr + host_args_offset, al->value, al->size); // scalar value
+      host_args_offset = alignOffset(host_args_offset + al->size, ptr_size);
     }
   }
 
   // write local arguments
   for (int i = 0; i < meta->num_locals; ++i) {
     if (local_mem_offset == 0) {
-      memcpy(host_args_ptr, &local_mem_size, 4); // local_size
-      host_args_ptr += 4;
+      memcpy(host_args_ptr + host_args_offset, &local_mem_size, 4); // local_size
+      host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
     }
-    memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset
-    host_args_ptr += 4;
+    memcpy(host_args_ptr + host_args_offset, &local_mem_offset, 4); // arg offset
+    host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
     local_mem_offset += meta->local_sizes[i];
   }
 
diff --git a/lib/CL/devices/vortex/vortex_utils.cc b/lib/CL/devices/vortex/vortex_utils.cc
index 35a81a1e552b33fba911c8a73e299b297ee4ea89..974bd2ca4c4db7ce88e7640ee1f54698b7dd3433 100644
--- a/lib/CL/devices/vortex/vortex_utils.cc
+++ b/lib/CL/devices/vortex/vortex_utils.cc
@@ -46,6 +46,7 @@
 #include "pocl_util.h"
 
 #include "LLVMUtils.h"
+#include "kernel_args.h"
 
 static int exec(const char* cmd, std::ostream& out) {
   char buffer[128];
@@ -99,6 +100,9 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
   auto &Context = module->getContext();
   const llvm::DataLayout &DL = module->getDataLayout();
 
+  std::string TargetTriple = module->getTargetTriple();
+  bool is64Bit = TargetTriple.find("riscv64") != std::string::npos;
+
   auto I32Ty = llvm::Type::getInt32Ty(Context);
   auto I8Ty = llvm::Type::getInt8Ty(Context);
   auto I8PtrTy = I8Ty->getPointerTo();
@@ -124,6 +128,8 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
 
   auto MDS = llvm::MDNode::get(Context, llvm::MDString::get(Context, "vortex.uniform"));
 
+  uint32_t BaseAlignment = is64Bit ? 8 : 4;
+
   for (auto& OldArg : function->args()) {
     auto ArgType = OldArg.getType();
     auto ArgOffset = llvm::ConstantInt::get(I32Ty, arg_offset);
@@ -132,8 +138,8 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
       if (allocated_local_mem == nullptr) {
         // Load __local_size
         auto local_size_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, "__local_size_ptr");
-        arg_offset += 4;
         auto local_size = Builder.CreateLoad(I32Ty, local_size_ptr, "__local_size");
+        arg_offset = alignOffset(arg_offset + 4, BaseAlignment);
         // Call vx_local_alloc(__local_size)
         auto function_type = llvm::FunctionType::get(I8PtrTy, {I32Ty}, false);
         auto vx_local_alloc_func = module->getOrInsertFunction("vx_local_alloc", function_type);
@@ -142,13 +148,13 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
       // Load argument __offset
       auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr");
       auto offset = Builder.CreateLoad(I32Ty, offset_ptr, OldArg.getName() + "_offset");
-      arg_offset += 4;
+      arg_offset = alignOffset(arg_offset + 4, BaseAlignment);
       // Apply pointer offset
       Arg = Builder.CreateGEP(I8PtrTy, allocated_local_mem, offset, OldArg.getName() + "_byte_ptr");
     } else {
       auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr");
       Arg = Builder.CreateLoad(ArgType, offset_ptr, OldArg.getName() + "_loaded");
-      arg_offset += DL.getTypeAllocSize(ArgType);
+      arg_offset = alignOffset(arg_offset + DL.getTypeAllocSize(ArgType), BaseAlignment);
     }
     auto instr = llvm::cast<llvm::Instruction>(Arg);
     assert(instr != nullptr);