diff --git a/CMakeLists.txt b/CMakeLists.txt index 775c07d2ca33794855f7bbede8760df23aeab860..0b9c206bdfa3b36e415c43cb96fa1be7feca4e0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,8 @@ option(POCL_DEBUG_MESSAGES option(ENABLE_LOADABLE_DRIVERS "Enable drivers to be dlopen()-ed at pocl runtime, instead of being linked into libpocl" ON) +option(ENABLE_VORTEX "Enable the Vortex OpenGPU device driver" OFF) + option(ENABLE_HSA "Enable the HSA base profile runtime device driver" OFF) option(ENABLE_CUDA "Enable the CUDA device driver for NVIDIA devices" OFF) @@ -1156,9 +1158,9 @@ endif() endif() -setup_cached_var(ENABLE_ICD "Using an ICD loader" - "Requested build with icd, but ICD loader not found! some examples will not work.." - "ICD loader found, but requested build without it") + setup_cached_var(ENABLE_ICD "Using an ICD loader" + "Requested build with icd, but ICD loader not found! some examples will not work.." + "ICD loader found, but requested build without it") if(ENABLE_ICD) # only meaningful to link tests with ocl-icd @@ -1860,7 +1862,7 @@ if(ENABLE_CUDA) endif() # FP64 is always enabled for CUDA, it should be available since Compute Capability 1.3 - set(CUDA_DEVICE_EXTENSIONS "${CUDA_DEVICE_EXTENSIONS} cl_khr_fp64") + set(CUDA_DEVICE_EXTENSIONS "${CUDA_DEVICE_EXTENSIONS} cl_khr_fp64") set(CUDA_DEVICE_FEATURES_30 "${CUDA_DEVICE_FEATURES_30} __opencl_c_fp64") if(ENABLE_CUDNN) @@ -1872,6 +1874,25 @@ endif() ########################################################## +if (ENABLE_VORTEX) + set(OCL_DRIVERS "${OCL_DRIVERS} vortex") + set(OCL_TARGETS "${OCL_TARGETS} vortex") + if(NOT DEFINED VORTEX_PREFIX) + message(FATAL_ERROR "should set 'VORTEX_PREFIX' option") + endif() + if(NOT DEFINED VORTEX_XLEN) + message(FATAL_ERROR "should set 'VORTEX_XLEN' option") + endif() + set(BUILD_VORTEX 1) + set(VORTEX_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store") + set(VORTEX_DEVICE_CL_VERSION_MAJOR 1) + set(VORTEX_DEVICE_CL_VERSION_MINOR 2) + set(VORTEX_DEVICE_CL_VERSION "120") + set(VORTEX_DEVICE_CL_STD "1.2") +endif() + +########################################################## + message(STATUS "Building the following device drivers: ${OCL_DRIVERS}") set(BUILDDIR "${CMAKE_BINARY_DIR}") @@ -2371,6 +2392,7 @@ MESSAGE(STATUS "ENABLE_TCEMC: ${ENABLE_TCEMC}") MESSAGE(STATUS "ENABLE_HSA: ${ENABLE_HSA}") MESSAGE(STATUS "ENABLE_ALMAIF_DEVICE: ${ENABLE_ALMAIF_DEVICE}") MESSAGE(STATUS "ENABLE_CUDA: ${ENABLE_CUDA}") +MESSAGE(STATUS "ENABLE_VORTEX: ${ENABLE_VORTEX}") MESSAGE(STATUS "ENABLE_CUDNN: ${ENABLE_CUDNN}") MESSAGE(STATUS "ENABLE_HOST_CPU_DEVICES: ${ENABLE_HOST_CPU_DEVICES}") MESSAGE(STATUS "ENABLE_VULKAN: ${ENABLE_VULKAN}") diff --git a/README.vortex b/README.vortex new file mode 100644 index 0000000000000000000000000000000000000000..ebdbcb1255d0a10729644f8952602bf544237adc --- /dev/null +++ b/README.vortex @@ -0,0 +1,11 @@ +## build POCL runtime +- git clone --branch vortex --recursive https://github.com/vortexgpgpu/pocl +- cd pocl +- mkdir build && cd build +- export POCL_PATH=$TOOLDIR/pocl +- export VORTEX_PREFIX=$HOME/tools/vortex +- export LLVM_PREFIX=$TOOLDIR/llvm-vortex +- cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=$POCL_PATH -DWITH_LLVM_CONFIG=$LLVM_PREFIX/bin/llvm-config -DVORTEX_PREFIX=$VORTEX_PREFIX -DVORTEX_XLEN=32 -DENABLE_VORTEX=ON -DKERNEL_CACHE_DEFAULT=OFF -DENABLE_HOST_CPU_DEVICES=OFF -DENABLE_TESTS=OFF -DPOCL_DEBUG_MESSAGES=ON -DENABLE_ICD=OFF .. +- make -j`nproc` +- make install +- cp -r ../include $POCL_RT_PATH \ No newline at end of file diff --git a/cmake/LLVM.cmake b/cmake/LLVM.cmake index eb3788518192b46578f72cc2855e37f2c4db0bfc..428ec61822a928667dd3e57cc19f455d4f8c4a90 100644 --- a/cmake/LLVM.cmake +++ b/cmake/LLVM.cmake @@ -297,9 +297,13 @@ endif() find_program_or_die(LLVM_OPT "opt" "LLVM optimizer") find_program_or_die(LLVM_LLC "llc" "LLVM static compiler") +find_program_or_die(LLVM_AR "llvm-ar" "LLVM archiver") find_program_or_die(LLVM_AS "llvm-as" "LLVM assembler") +find_program_or_die(LLVM_DIS "llvm-dis" "LLVM disassembler") find_program_or_die(LLVM_LINK "llvm-link" "LLVM IR linker") find_program_or_die(LLVM_LLI "lli" "LLVM interpreter") +find_program_or_die(LLVM_OBJCOPY "llvm-objcopy" "LLVM object copying") +find_program_or_die(LLVM_OBJDUMP "llvm-objdump" "LLVM object dump") if(ENABLE_LLVM_FILECHECKS) if(IS_ABSOLUTE "${LLVM_FILECHECK_BIN}" AND EXISTS "${LLVM_FILECHECK_BIN}") diff --git a/config.h.in.cmake b/config.h.in.cmake index c518bcc62af2120a9794c7db76f23bca58096ece..3c48360f3179d64590bdbae63fde014ea79d1eca 100644 --- a/config.h.in.cmake +++ b/config.h.in.cmake @@ -5,6 +5,7 @@ #cmakedefine BUILD_HSA #cmakedefine BUILD_CUDA +#cmakedefine BUILD_VORTEX #cmakedefine BUILD_BASIC #cmakedefine BUILD_TBB #cmakedefine BUILD_PTHREAD @@ -118,6 +119,14 @@ #endif +#if defined(BUILD_VORTEX) + +#define VORTEX_DEVICE_EXTENSIONS "@VORTEX_DEVICE_EXTENSIONS@" + +#define VORTEX_XLEN @VORTEX_XLEN@ + +#endif + #if defined(ENABLE_HOST_CPU_DEVICES) #cmakedefine ENABLE_HOST_CPU_DEVICES_OPENMP @@ -193,6 +202,16 @@ #define LLVM_LLC "@LLVM_LLC@" +#define LLVM_AR "@LLVM_AR@" + +#define LLVM_DIS "@LLVM_DIS@" + +#define LLVM_OBJCOPY "@LLVM_OBJCOPY@" + +#define LLVM_OBJDUMP "@LLVM_OBJDUMP@" + +#define LLVM_PREFIX "@LLVM_PREFIX@" + #define LLVM_SPIRV "@LLVM_SPIRV@" #define LLVM_OPT "@LLVM_OPT@" @@ -270,6 +289,9 @@ #define CUDA_DEVICE_CL_VERSION_MAJOR @CUDA_DEVICE_CL_VERSION_MAJOR@ #define CUDA_DEVICE_CL_VERSION_MINOR @CUDA_DEVICE_CL_VERSION_MINOR@ +#define VORTEX_DEVICE_CL_VERSION_MAJOR @VORTEX_DEVICE_CL_VERSION_MAJOR@ +#define VORTEX_DEVICE_CL_VERSION_MINOR @VORTEX_DEVICE_CL_VERSION_MINOR@ + #define HOST_DEVICE_CL_VERSION_MAJOR @HOST_DEVICE_CL_VERSION_MAJOR@ #define HOST_DEVICE_CL_VERSION_MINOR @HOST_DEVICE_CL_VERSION_MINOR@ diff --git a/lib/CL/clGetPlatformInfo.c b/lib/CL/clGetPlatformInfo.c index bd9a4c5461ea55677d29cbd169efc932adb84798..f8c24253aa63abf06a3a6d0b905d782886b4b567 100644 --- a/lib/CL/clGetPlatformInfo.c +++ b/lib/CL/clGetPlatformInfo.c @@ -108,6 +108,10 @@ static const char *pocl_version ", CUDA" #endif +#ifdef BUILD_VORTEX + ", VORTEX" +#endif + #ifdef BUILD_HSA ", HSA" #endif diff --git a/lib/CL/devices/CMakeLists.txt b/lib/CL/devices/CMakeLists.txt index a6d40ad43ce56dfa6ba7403b429ca294e0215928..fed894c44bce55717982c8774261e74379c2b9e1 100644 --- a/lib/CL/devices/CMakeLists.txt +++ b/lib/CL/devices/CMakeLists.txt @@ -156,6 +156,15 @@ if(ENABLE_CUDA) endif() endif() +if(ENABLE_VORTEX) + add_subdirectory("vortex") + set(POCL_DEVICES_OBJS "${POCL_DEVICES_OBJS}" + "$<TARGET_OBJECTS:pocl-devices-vortex>") + + if(NOT ENABLE_LOADABLE_DRIVERS) + list(APPEND POCL_DEVICES_LINK_LIST vortex) + endif() +endif() set(POCL_DEVICES_SOURCES devices.h devices.c diff --git a/lib/CL/devices/devices.c b/lib/CL/devices/devices.c index b43ca5146dfad952aa80e105af9c72aa4606ecf2..155b580227302ff7410440a08023d9f8b49f5cf9 100644 --- a/lib/CL/devices/devices.c +++ b/lib/CL/devices/devices.c @@ -87,6 +87,10 @@ #include "cuda/pocl-cuda.h" #endif +#ifdef BUILD_VORTEX +#include "vortex/pocl-vortex.h" +#endif + #if defined(BUILD_ALMAIF) #include "almaif/almaif.h" #endif @@ -174,6 +178,9 @@ static init_device_ops pocl_devices_init_ops[] = { #ifdef BUILD_CUDA INIT_DEV (cuda), #endif +#ifdef BUILD_VORTEX + INIT_DEV (vortex), +#endif #ifdef BUILD_ALMAIF INIT_DEV (almaif), #endif @@ -212,6 +219,9 @@ char pocl_device_types[POCL_NUM_DEVICE_TYPES][33] = { #ifdef BUILD_CUDA "cuda", #endif +#ifdef BUILD_VORTEX + "vortex", +#endif #ifdef BUILD_ALMAIF "almaif", #endif diff --git a/lib/CL/devices/vortex/CMakeLists.txt b/lib/CL/devices/vortex/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc53db46e706d0f7ea91b71c9be1fa1539fa90c6 --- /dev/null +++ b/lib/CL/devices/vortex/CMakeLists.txt @@ -0,0 +1,40 @@ +#============================================================================= +# CMake build system files +# +# Copyright (c) 2014-2021 pocl developers +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +#============================================================================= + +unset(CMAKE_CXX_STANDARD) +unset(CMAKE_CXX_STANDARD_REQUIRED) +set_source_files_properties(vortex_utils.cc vortex_utils.h PROPERTIES COMPILE_FLAGS "${LLVM_CXXFLAGS}") +include_directories(${LLVM_INCLUDE_DIRS} ${VORTEX_PREFIX}/runtime/include ${CMAKE_CURRENT_SOURCE_DIR}/../../../llvmopencl) +set(CMAKE_CXX_STANDARD ${POCL_CMAKE_CXX_STANDARD}) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +add_pocl_device_library("pocl-devices-vortex" pocl-vortex.c pocl-vortex.h vortex_utils.cc vortex_utils.h) + +if(ENABLE_LOADABLE_DRIVERS) + target_link_libraries(pocl-devices-vortex PRIVATE ${VORTEX_PREFIX}/runtime/lib/libvortex.so ${PTHREAD_LIBRARY}) +endif() + +install(FILES "kernel_main.c" "kernel_args.h" + DESTINATION "${POCL_INSTALL_PRIVATE_DATADIR}/vortex" COMPONENT "lib") \ No newline at end of file diff --git a/lib/CL/devices/vortex/kernel_args.h b/lib/CL/devices/vortex/kernel_args.h new file mode 100644 index 0000000000000000000000000000000000000000..00cb384a0824e5d52c3eed3a00f523e47bfdd573 --- /dev/null +++ b/lib/CL/devices/vortex/kernel_args.h @@ -0,0 +1,9 @@ +#include <stdint.h> + +typedef struct { + uint32_t work_dim; + uint32_t num_groups[3]; + uint32_t local_size[3]; + uint32_t global_offset[3]; + uint32_t kernel_id; +} kernel_args_t; diff --git a/lib/CL/devices/vortex/kernel_main.c b/lib/CL/devices/vortex/kernel_main.c new file mode 100644 index 0000000000000000000000000000000000000000..0fbfa420d9f04c5634d01f936e6db03c3b66ac07 --- /dev/null +++ b/lib/CL/devices/vortex/kernel_main.c @@ -0,0 +1,25 @@ +#include <vx_spawn.h> +#include <vx_print.h> +#include "kernel_args.h" + +int g_work_dim; +dim3_t g_global_offset; + +void* vx_local_alloc(uint32_t size) { + return __local_mem(size); +} + +void* __vx_get_kernel_callback(int kernel_id); + +int main(void) { + kernel_args_t* kargs = (kernel_args_t*)csr_read(VX_CSR_MSCRATCH); + + g_work_dim = kargs->work_dim; + for (int i = 0, n = kargs->work_dim; i < 3; i++) { + g_global_offset.m[i] = (i < n) ? kargs->global_offset[i] : 0; + } + + void* arg = (void*)((uint8_t*)kargs + sizeof(kernel_args_t)); + vx_kernel_func_cb kernel_func = (vx_kernel_func_cb)__vx_get_kernel_callback(kargs->kernel_id); + return vx_spawn_threads(kargs->work_dim, kargs->num_groups, kargs->local_size, kernel_func, arg); +} diff --git a/lib/CL/devices/vortex/pocl-vortex.c b/lib/CL/devices/vortex/pocl-vortex.c new file mode 100644 index 0000000000000000000000000000000000000000..356e802d9b949f918a4a47c72939163b7be886ff --- /dev/null +++ b/lib/CL/devices/vortex/pocl-vortex.c @@ -0,0 +1,761 @@ +#include "pocl-vortex.h" +#include "builtin_kernels.hh" +#include "common.h" +#include "config.h" +#include "config2.h" +#include "cpuinfo.h" +#include "devices.h" +#include "pocl_local_size.h" +#include "pocl_util.h" +#include "topology/pocl_topology.h" +#include "utlist.h" + +#include <stdint.h> +#include <assert.h> +#include <limits.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <utlist.h> + +#include "pocl_context.h" +#include "pocl_cache.h" +#include "pocl_file_util.h" +#include "pocl_mem_management.h" +#include "pocl_timing.h" +#include "pocl_workgroup_func.h" + +#include "common_driver.h" +#include "pocl_llvm.h" + +#include "vortex_utils.h" +#include "kernel_args.h" +#include <vortex.h> + +typedef struct { + vx_device_h vx_device; + vx_buffer_h vx_kernel_buffer; + + /* List of commands ready to be executed */ + _cl_command_node *ready_list; + + /* List of commands not yet ready to be executed */ + _cl_command_node *command_list; + + /* Lock for command list related operations */ + pocl_lock_t cq_lock; + + pocl_lock_t compile_lock; + + size_t ctx_refcount; +} vortex_device_data_t; + +typedef struct { + int num_kernels; + char* kernel_names; +} vortex_program_data_t; + +typedef struct { + size_t refcount; + int kernel_id; +} vortex_kernel_data_t; + +typedef struct { + vx_device_h vx_device; + vx_buffer_h vx_buffer; + uint64_t buf_address; +} vortex_buffer_data_t; + +static cl_bool vortex_available = CL_TRUE; + +static const char *vortex_native_device_aux_funcs[] = {NULL}; + +void pocl_vortex_init_device_ops(struct pocl_device_ops *ops) { + + ops->device_name = "vortex"; + ops->build_hash = pocl_vortex_build_hash; + ops->probe = pocl_vortex_probe; + ops->uninit = pocl_vortex_uninit; + ops->init = pocl_vortex_init; + + ops->init_context = pocl_vortex_init_context; + ops->free_context = pocl_vortex_free_context; + + ops->run = pocl_vortex_run; + ops->run_native = NULL; + + ops->alloc_mem_obj = pocl_vortex_alloc_mem_obj; + ops->free = pocl_vortex_free; + + ops->build_source = pocl_driver_build_source; + ops->link_program = pocl_driver_link_program; + ops->build_binary = pocl_driver_build_binary; + ops->free_program = pocl_driver_free_program; + ops->setup_metadata = pocl_driver_setup_metadata; + ops->supports_binary = pocl_driver_supports_binary; + ops->build_poclbinary = pocl_driver_build_poclbinary; + ops->build_builtin = pocl_driver_build_opencl_builtins; + + ops->post_build_program = pocl_vortex_post_build_program; + ops->free_program = pocl_vortex_free_program; + + ops->create_kernel = pocl_vortex_create_kernel; + ops->free_kernel = pocl_vortex_free_kernel; + + ops->submit = pocl_vortex_submit; + ops->join = pocl_vortex_join; + ops->flush = pocl_vortex_flush; + ops->notify = pocl_vortex_notify; + ops->broadcast = pocl_broadcast; + + ops->read = pocl_vortex_read; + ops->write = pocl_vortex_write; + + ops->get_mapping_ptr = pocl_driver_get_mapping_ptr; + ops->free_mapping_ptr = pocl_driver_free_mapping_ptr; +} + +char * pocl_vortex_build_hash (cl_device_id device) +{ + char *res = (char *)calloc(1000, sizeof(char)); + vortex_device_data_t *dd = (vortex_device_data_t *)device->data; +#if (VORTEX_XLEN == 64) + snprintf(res, 1000, "vortex-riscv64-unknown-unknown-elf"); +#else + snprintf(res, 1000, "vortex-riscv32-unknown-unknown-elf"); +#endif + return res; +} + +unsigned int pocl_vortex_probe(struct pocl_device_ops *ops) +{ + return (0 == strcmp(ops->device_name, "vortex")); +} + +cl_int +pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters) +{ + int vx_err; + vortex_device_data_t *dd; + + const char* sz_cflags = pocl_get_string_option("POCL_VORTEX_CFLAGS", ""); + + int is64bit = (VORTEX_XLEN == 64); + + assert (dev->data == NULL); + + pocl_init_default_device_infos(dev, VORTEX_DEVICE_EXTENSIONS); + + SETUP_DEVICE_CL_VERSION (dev, VORTEX_DEVICE_CL_VERSION_MAJOR, + VORTEX_DEVICE_CL_VERSION_MINOR); + + dd = (vortex_device_data_t *)calloc(1, sizeof(vortex_device_data_t)); + if (dd == NULL){ + return CL_OUT_OF_HOST_MEMORY; + } + + dev->vendor = "Vortex Group"; + dev->long_name = "Vortex OpenGPU"; + dev->short_name = "Vortex"; + dev->vendor_id = 0; + dev->type = CL_DEVICE_TYPE_GPU; + + dev->spmd = CL_TRUE; + dev->run_workgroup_pass = CL_FALSE; + dev->execution_capabilities = CL_EXEC_KERNEL; + //dev->global_as_id = VX_ADDR_SPACE_GLOBAL; + //dev->local_as_id = VX_ADDR_SPACE_LOCAL; + //dev->constant_as_id = VX_ADDR_SPACE_CONSTANT; + dev->autolocals_to_args = POCL_AUTOLOCALS_TO_ARGS_ALWAYS; + dev->device_alloca_locals = CL_FALSE; + dev->device_side_printf = 0; + dev->has_64bit_long = is64bit; + + dev->llvm_cpu = NULL; + dev->address_bits = VORTEX_XLEN; + dev->llvm_target_triplet = is64bit ? "riscv64-unknown-unknown" : "riscv32-unknown-unknown"; + dev->llvm_abi = is64bit ? "lp64d" : "ilp32f"; + dev->kernellib_name = is64bit ? "kernel-riscv64" : "kernel-riscv32"; + dev->kernellib_fallback_name = NULL; + dev->kernellib_subdir = "vortex"; + dev->device_aux_functions = vortex_native_device_aux_funcs; + + dev->image_support = CL_FALSE; + + vx_device_h vx_device; + + vx_err = vx_dev_open(&vx_device); + if (vx_err != 0) { + free(dd); + return CL_DEVICE_NOT_FOUND; + } + + uint64_t num_cores; + vx_err = vx_dev_caps(vx_device, VX_CAPS_NUM_CORES, &num_cores); + if (vx_err != 0) { + vx_dev_close(vx_device); + free(dd); + return CL_DEVICE_NOT_FOUND; + } + + uint64_t global_mem_size; + vx_err = vx_dev_caps(vx_device, VX_CAPS_GLOBAL_MEM_SIZE, &global_mem_size); + if (vx_err != 0) { + vx_dev_close(vx_device); + free(dd); + return CL_DEVICE_NOT_FOUND; + } + + uint64_t local_mem_size; + vx_err = vx_dev_caps(vx_device, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size); + if (vx_err != 0) { + vx_dev_close(vx_device); + free(dd); + return CL_DEVICE_NOT_FOUND; + } + + dev->global_mem_size = global_mem_size; + dev->max_mem_alloc_size = global_mem_size; + dev->local_mem_size = local_mem_size; + dev->max_compute_units = num_cores; + + dd->vx_kernel_buffer = NULL; + dd->vx_device = vx_device; + + dd->ctx_refcount = 0; + + POCL_INIT_LOCK(dd->compile_lock); + POCL_INIT_LOCK(dd->cq_lock); + + dev->data = dd; + dev->available = &vortex_available; + + return CL_SUCCESS; +} + +cl_int pocl_vortex_uninit (unsigned j, cl_device_id device) { + vortex_device_data_t *dd = (vortex_device_data_t *)device->data; + if (NULL == dd) + return CL_SUCCESS; + + if (dd->vx_kernel_buffer != NULL) { + vx_mem_free(dd->vx_kernel_buffer); + } + vx_dev_close(dd->vx_device); + + POCL_DESTROY_LOCK (dd->compile_lock); + POCL_DESTROY_LOCK (dd->cq_lock); + POCL_MEM_FREE(dd); + device->data = NULL; + return CL_SUCCESS; +} + +int pocl_vortex_init_context (cl_device_id device, cl_context context) { + vortex_device_data_t *dd = (vortex_device_data_t *)device->data; + if (NULL == dd) + return CL_SUCCESS; + + dd->ctx_refcount++; + + return CL_SUCCESS; +} + +int pocl_vortex_free_context (cl_device_id device, cl_context context) { + vortex_device_data_t *dd = (vortex_device_data_t *)device->data; + if (NULL == dd) + return CL_SUCCESS; + + if (--dd->ctx_refcount == 0) { + pocl_vortex_uninit(0, device); + } + + return CL_SUCCESS; +} + +int pocl_vortex_post_build_program (cl_program program, cl_uint device_i) { + int result; + cl_device_id device = program->devices[device_i]; + vortex_device_data_t *ddata = (vortex_device_data_t *)device->data; + vortex_program_data_t *pdata = NULL; + + POCL_LOCK (ddata->compile_lock); + + do { + result = pocl_llvm_run_passes_on_program (program, device_i); + if (result != 0) + break; + + pdata = (vortex_program_data_t *)calloc (1, sizeof (vortex_program_data_t)); + pdata->kernel_names = NULL; + + char sz_program_bc[POCL_MAX_PATHNAME_LENGTH]; + char sz_program_vxbin[POCL_MAX_PATHNAME_LENGTH]; + + pocl_cache_program_bc_path(sz_program_bc, program, device_i); + remove_extension(sz_program_bc); + + strcpy(sz_program_vxbin, sz_program_bc); + strncat(sz_program_vxbin, ".vxbin", POCL_MAX_PATHNAME_LENGTH - 1); + + result = compile_vortex_program(&pdata->kernel_names, &pdata->num_kernels, + sz_program_vxbin, program->llvm_irs[device_i]); + if (result != 0) + break; + + } while (0); + + program->data[device_i] = pdata; + + POCL_UNLOCK (ddata->compile_lock); + + return result; +} + +int pocl_vortex_free_program (cl_device_id device, cl_program program, + unsigned device_i) { + vortex_device_data_t *ddata = (vortex_device_data_t *)device->data; + vortex_program_data_t *pdata = (vortex_program_data_t *)program->data[device_i]; + if (pdata == NULL) + return CL_SUCCESS; + + pocl_driver_free_program (device, program, device_i); + + POCL_MEM_FREE (pdata->kernel_names); + POCL_MEM_FREE (pdata); + program->data[device_i] = NULL; + + return CL_SUCCESS; +} + +int pocl_vortex_create_kernel (cl_device_id device, cl_program program, + cl_kernel kernel, unsigned device_i) { + int result = CL_SUCCESS; + pocl_kernel_metadata_t *meta = kernel->meta; + assert(meta->data != NULL); + vortex_kernel_data_t *kdata = (vortex_kernel_data_t *)meta->data[device_i]; + if (kdata != NULL) { + ++kdata->refcount; + return CL_SUCCESS; + } + + do { + vortex_program_data_t *pdata = (vortex_program_data_t *)program->data[device_i]; + assert(pdata != NULL); + + const char* current = pdata->kernel_names; + int i = 0; + int found = 0; + for (int i = 0; i < pdata->num_kernels; ++i) { + if (strcmp(current, kernel->name) == 0) { + found = 1; + break; + } + current += strlen(current) + 1; + } + assert(found); + kdata = (void *)calloc (1, sizeof (vortex_kernel_data_t)); + kdata->kernel_id = i; + ++kdata->refcount; + + } while (0); + + meta->data[device_i] = kdata; + + return result; +} + +int pocl_vortex_free_kernel (cl_device_id device, cl_program program, + cl_kernel kernel, unsigned device_i) { + pocl_kernel_metadata_t *meta = kernel->meta; + assert(meta->data != NULL); + vortex_kernel_data_t *kdata = (vortex_kernel_data_t *)meta->data[device_i]; + if (kdata == NULL) + return CL_SUCCESS; + + --kdata->refcount; + if (kdata->refcount == 0) { + POCL_MEM_FREE (kdata); + meta->data[device_i] = NULL; + } + + return CL_SUCCESS; +} + +void pocl_vortex_run (void *data, _cl_command_node *cmd) { + vortex_device_data_t *dd; + struct pocl_argument *al; + cl_uint device_i = cmd->program_device_i; + cl_kernel kernel = cmd->command.run.kernel; + cl_program program = kernel->program; + pocl_kernel_metadata_t *meta = kernel->meta; + vortex_program_data_t *pdata = (vortex_program_data_t *)program->data[device_i]; + vortex_kernel_data_t *kdata = (vortex_kernel_data_t *)meta->data[device_i]; + struct pocl_context *pc = &cmd->command.run.pc; + int vx_err; + + int num_groups = 1; + int group_size = 1; + for (int i = 0; i < pc->work_dim; ++i) { + num_groups *= pc->num_groups[i]; + group_size *= pc->local_size[i]; + } + if (num_groups == 0 || group_size == 0) + return; + + assert (data != NULL); + dd = (vortex_device_data_t *)data; + + int ptr_size = VORTEX_XLEN / 8; + + // calculate kernel arguments buffer size + int local_mem_size = 0; + size_t abuf_size = 0; + + for (int i = 0; i < meta->num_args; ++i) { + struct pocl_argument* al = &(cmd->command.run.arguments[i]); + if (ARG_IS_LOCAL(meta->arg_info[i])) { + local_mem_size += al->size; + abuf_size += 4; + } else + if ((meta->arg_info[i].type == POCL_ARG_TYPE_POINTER) + || (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE) + || (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)) { + abuf_size += ptr_size; + } else { + // scalar argument + abuf_size += al->size; + } + } + + // local buffers + for (int i = 0; i < meta->num_locals; ++i) { + local_mem_size += meta->local_sizes[i]; + abuf_size += 4; + } + + // add local size + if (local_mem_size != 0) { + abuf_size += 4; + } + + // check occupancy + if (local_mem_size != 0) { + int available_localmem; + vx_err = vx_check_occupancy(dd->vx_device, group_size, &available_localmem); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + if (local_mem_size > available_localmem) { + POCL_ABORT("out of local memory: needed=%d bytes, available=%d bytes\n", + local_mem_size, available_localmem); + } + } + + // allocate arguments host buffer + size_t kargs_buffer_size = sizeof(kernel_args_t) + abuf_size; + uint8_t* const host_kargs_base_ptr = malloc(kargs_buffer_size); + assert(host_kargs_base_ptr); + + // allocate kernel arguments buffer + vx_buffer_h vx_kargs_buffer; + vx_err = vx_mem_alloc(dd->vx_device, kargs_buffer_size, VX_MEM_READ, &vx_kargs_buffer); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + + uint64_t dev_kargs_base_addr; + vx_err = vx_mem_address(vx_kargs_buffer, &dev_kargs_base_addr); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + + // write context data + { + kernel_args_t* const kargs = (kernel_args_t*)host_kargs_base_ptr; + kargs->work_dim = pc->work_dim; + for (int i = 0; i < 3; ++i) { + kargs->num_groups[i] = pc->num_groups[i]; + kargs->local_size[i] = pc->local_size[i]; + kargs->global_offset[i] = pc->global_offset[i]; + } + kargs->kernel_id = kdata->kernel_id; + } + + // write arguments + + uint8_t* host_args_ptr = host_kargs_base_ptr + sizeof(kernel_args_t); + int local_mem_offset = 0; + + for (int i = 0; i < meta->num_args; ++i) { + struct pocl_argument* al = &(cmd->command.run.arguments[i]); + if (ARG_IS_LOCAL(meta->arg_info[i])) { + if (local_mem_offset == 0) { + memcpy(host_args_ptr, &local_mem_size, 4); // local_size + host_args_ptr += 4; + } + memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset + host_args_ptr += 4; + local_mem_offset += al->size; + } else + if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER) { + if (al->value == NULL) { + memset(host_args_ptr, 0, ptr_size); // NULL pointer value + host_args_ptr += ptr_size; + } else { + cl_mem m = (*(cl_mem *)(al->value)); + vortex_buffer_data_t* buf_data = (vortex_buffer_data_t *) m->device_ptrs[cmd->device->global_mem_id].mem_ptr; + uint64_t dev_mem_addr = buf_data->buf_address + al->offset; + memcpy(host_args_ptr, &buf_data->buf_address, ptr_size); // pointer value + host_args_ptr += ptr_size; + } + } else + if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } else + if (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } else { + // scalar argument + memcpy(host_args_ptr, al->value, al->size); // scalar value + host_args_ptr += al->size; + } + } + + // write local arguments + for (int i = 0; i < meta->num_locals; ++i) { + if (local_mem_offset == 0) { + memcpy(host_args_ptr, &local_mem_size, 4); // local_size + host_args_ptr += 4; + } + memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset + host_args_ptr += 4; + local_mem_offset += meta->local_sizes[i]; + } + + // upload kernel arguments buffer + vx_err = vx_copy_to_dev(vx_kargs_buffer, host_kargs_base_ptr, 0, kargs_buffer_size); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + + // release argument host buffer + free(host_kargs_base_ptr); + + // upload kernel to device + if (NULL == dd->vx_kernel_buffer) { + char sz_program_bc[POCL_MAX_PATHNAME_LENGTH]; + char sz_program_vxbin[POCL_MAX_PATHNAME_LENGTH]; + + pocl_cache_program_bc_path(sz_program_bc, program, device_i); + remove_extension(sz_program_bc); + + strcpy(sz_program_vxbin, sz_program_bc); + strncat(sz_program_vxbin, ".vxbin", POCL_MAX_PATHNAME_LENGTH - 1); + + vx_err = vx_upload_kernel_file(dd->vx_device, sz_program_vxbin, &dd->vx_kernel_buffer); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + } + + // launch kernel execution + vx_err = vx_start(dd->vx_device, dd->vx_kernel_buffer, vx_kargs_buffer); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + + // wait for the execution to complete + vx_err = vx_ready_wait(dd->vx_device, -1); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + + // release arguments device buffer + vx_mem_free(vx_kargs_buffer); +} + +cl_int pocl_vortex_alloc_mem_obj(cl_device_id device, cl_mem mem_obj, void *host_ptr) { + int vx_err; + pocl_mem_identifier *p = &mem_obj->device_ptrs[device->global_mem_id]; + + /* let other drivers preallocate */ + if ((mem_obj->flags & CL_MEM_ALLOC_HOST_PTR) && (mem_obj->mem_host_ptr == NULL)) + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + + p->extra_ptr = NULL; + p->version = 0; + p->extra = 0; + cl_mem_flags flags = mem_obj->flags; + + if (flags & CL_MEM_USE_HOST_PTR) { + POCL_ABORT("POCL_VORTEX_MALLOC\n"); + } else { + int vx_flags = 0; + if ((flags & CL_MEM_READ_WRITE) != 0) + vx_flags = VX_MEM_READ_WRITE; + if ((flags & CL_MEM_READ_ONLY) != 0) + vx_flags = VX_MEM_READ; + if ((flags & CL_MEM_WRITE_ONLY) != 0) + vx_flags = VX_MEM_WRITE; + + vortex_device_data_t* dd = (vortex_device_data_t *)device->data; + + vx_buffer_h vx_buffer; + vx_err = vx_mem_alloc(dd->vx_device, mem_obj->size, vx_flags, &vx_buffer); + if (vx_err != 0) { + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + uint64_t buf_address; + vx_err = vx_mem_address(vx_buffer, &buf_address); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_RUN\n"); + } + + if (host_ptr && (flags & CL_MEM_COPY_HOST_PTR)) { + vx_err = vx_copy_to_dev(vx_buffer, host_ptr, 0, mem_obj->size); + if (vx_err != 0) { + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + } + + if (flags & CL_MEM_ALLOC_HOST_PTR) { + /* malloc mem_host_ptr then increase refcount */ + pocl_alloc_or_retain_mem_host_ptr (mem_obj); + } + + vortex_buffer_data_t* buf_data = (vortex_buffer_data_t *)malloc(sizeof(vortex_buffer_data_t)); + buf_data->vx_device = dd->vx_device; + buf_data->vx_buffer = vx_buffer; + buf_data->buf_address = buf_address; + + p->mem_ptr = buf_data; + } + + return CL_SUCCESS; +} + +void pocl_vortex_free(cl_device_id device, cl_mem mem_obj) { + pocl_mem_identifier *p = &mem_obj->device_ptrs[device->global_mem_id]; + cl_mem_flags flags = mem_obj->flags; + vortex_buffer_data_t* buf_data = (vortex_buffer_data_t*)p->mem_ptr; + + if (flags & CL_MEM_USE_HOST_PTR) { + POCL_ABORT("POCL_VORTEX_FREE\n"); + } else { + if (flags & CL_MEM_ALLOC_HOST_PTR) { + pocl_release_mem_host_ptr(mem_obj); + } + if (buf_data->vx_buffer) { + vx_mem_free(buf_data->vx_buffer); + } + } + free(buf_data); + p->mem_ptr = NULL; + p->version = 0; +} + +void pocl_vortex_write(void *data, + const void *__restrict__ host_ptr, + pocl_mem_identifier *dst_mem_id, + cl_mem dst_buf, + size_t offset, + size_t size) { + int vx_err; + vortex_buffer_data_t *buf_data = (vortex_buffer_data_t *)dst_mem_id->mem_ptr; + vx_err = vx_copy_to_dev(buf_data->vx_buffer, host_ptr, offset, size); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_WRITE\n"); + } +} + +void pocl_vortex_read(void *data, + void *__restrict__ host_ptr, + pocl_mem_identifier *src_mem_id, + cl_mem src_buf, + size_t offset, + size_t size) { + int vx_err; + vortex_buffer_data_t* buf_data = (vortex_buffer_data_t*)src_mem_id->mem_ptr; + vx_err = vx_copy_from_dev(host_ptr, buf_data->vx_buffer, offset, size); + if (vx_err != 0) { + POCL_ABORT("POCL_VORTEX_READ\n"); + } +} + +static void vortex_command_scheduler (vortex_device_data_t *dd) { + _cl_command_node *node; + + /* execute commands from ready list */ + while ((node = dd->ready_list)) + { + assert (pocl_command_is_ready (node->sync.event.event)); + assert (node->sync.event.event->status == CL_SUBMITTED); + CDL_DELETE (dd->ready_list, node); + POCL_UNLOCK (dd->cq_lock); + pocl_exec_command (node); + POCL_LOCK (dd->cq_lock); + } + + return; +} + +void pocl_vortex_submit (_cl_command_node *node, cl_command_queue cq) { + vortex_device_data_t *dd = (vortex_device_data_t *)node->device->data; + + node->ready = 1; + POCL_LOCK (dd->cq_lock); + pocl_command_push(node, &dd->ready_list, &dd->command_list); + + POCL_UNLOCK_OBJ (node->sync.event.event); + vortex_command_scheduler (dd); + POCL_UNLOCK (dd->cq_lock); + + return; +} + +void pocl_vortex_flush (cl_device_id device, cl_command_queue cq) { + vortex_device_data_t *dd = (vortex_device_data_t *)device->data; + + POCL_LOCK (dd->cq_lock); + vortex_command_scheduler (dd); + POCL_UNLOCK (dd->cq_lock); +} + +void pocl_vortex_join (cl_device_id device, cl_command_queue cq) { + vortex_device_data_t *dd = (vortex_device_data_t *)device->data; + + POCL_LOCK (dd->cq_lock); + vortex_command_scheduler (dd); + POCL_UNLOCK (dd->cq_lock); + + return; +} + +void pocl_vortex_notify (cl_device_id device, cl_event event, cl_event finished) { + vortex_device_data_t *dd = (vortex_device_data_t *)device->data; + _cl_command_node * volatile node = event->command; + + if (finished->status < CL_COMPLETE) + { + pocl_update_event_failed (event); + return; + } + + if (!node->ready) + return; + + if (pocl_command_is_ready (event)) + { + if (event->status == CL_QUEUED) + { + pocl_update_event_submitted (event); + POCL_LOCK (dd->cq_lock); + CDL_DELETE (dd->command_list, node); + CDL_PREPEND (dd->ready_list, node); + vortex_command_scheduler (dd); + POCL_UNLOCK (dd->cq_lock); + } + return; + } +} \ No newline at end of file diff --git a/lib/CL/devices/vortex/pocl-vortex.h b/lib/CL/devices/vortex/pocl-vortex.h new file mode 100644 index 0000000000000000000000000000000000000000..16c5f2e58ce3473f1b8133b50346f79863d3a78e --- /dev/null +++ b/lib/CL/devices/vortex/pocl-vortex.h @@ -0,0 +1,9 @@ +#ifndef POCL_VORTEX_H +#define POCL_VORTEX_H + +#include "pocl_cl.h" + +#include "prototypes.inc" +GEN_PROTOTYPES (vortex) + +#endif /* POCL_VORTEX_H */ diff --git a/lib/CL/devices/vortex/vortex_utils.cc b/lib/CL/devices/vortex/vortex_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..08c77da4109c71f664d81654178fbaae5adfe295 --- /dev/null +++ b/lib/CL/devices/vortex/vortex_utils.cc @@ -0,0 +1,339 @@ +#include <assert.h> +#include <limits.h> +#include <stdlib.h> +#include <unistd.h> +#include <cstdarg> +#include <vector> +#include <cstdio> +#include <ostream> +#include <iostream> +#include <fstream> +#include <sstream> +#include <regex> + +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Cloning.h" + +#include <llvm/Analysis/TargetLibraryInfo.h> +#include <llvm/Analysis/TargetTransformInfo.h> + +#if LLVM_MAJOR >= 17 +#include <llvm/Transforms/IPO/Internalize.h> +#endif + +#include "vortex_utils.h" + +#include <llvm/IR/Module.h> +#include <llvm/Support/raw_ostream.h> +#include <llvm/Support/FileSystem.h> +#include <llvm/Bitcode/BitcodeWriter.h> + +#include "pocl.h" +#include "pocl_file_util.h" +#include "pocl_util.h" + +#include "LLVMUtils.h" + +static int exec(const char* cmd, std::ostream& out) { + char buffer[128]; + auto pipe = popen(cmd, "r"); + if (!pipe) { + //throw std::runtime_error("popen() failed!"); + return -1; + } + while (!feof(pipe)) { + if (fgets(buffer, 128, pipe) != nullptr) + out << buffer; + } + return pclose(pipe); +} + +void remove_extension(char* filename) { + char *last_dot = strrchr(filename, '.'); + if (last_dot != NULL) { + *last_dot = '\0'; + } +} + +static char* convertToCharArray(const llvm::SmallVector<std::string, 8>& names) { + // Calculate the total length required for the buffer + size_t totalLength = 0; + for (const auto& name : names) { + totalLength += name.size() + 1; // +1 for the null terminator + } + + // Allocate buffer + char* buffer = (char*)malloc(totalLength * sizeof(char)); + if (buffer == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return nullptr; + } + + // Copy names into buffer with null separation + size_t position = 0; + for (const auto& name : names) { + std::strcpy(buffer + position, name.c_str()); + position += name.size(); + buffer[position] = '\0'; // Null terminator + position += 1; + } + + return buffer; +} + +// Store function arguments in a single argument buffer. +static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module, llvm::SmallVector<std::string, 8>& funcNames) { + auto &Context = module->getContext(); + const llvm::DataLayout &DL = module->getDataLayout(); + + auto I32Ty = llvm::Type::getInt32Ty(Context); + auto I8Ty = llvm::Type::getInt8Ty(Context); + + // Create new function signature + auto ArgBufferType = llvm::PointerType::get(llvm::Type::getInt8Ty(Context), 0); + auto NewFuncType = llvm::FunctionType::get(function->getReturnType(), {ArgBufferType}, false); + auto NewFunc = llvm::Function::Create(NewFuncType, function->getLinkage(), function->getName() + "_vortex"); + module->getFunctionList().insert(function->getIterator(), NewFunc); + NewFunc->takeName(function); + + auto EntryBlock = llvm::BasicBlock::Create(Context, "entry", NewFunc); + llvm::IRBuilder<> Builder(EntryBlock); + + // Access function arguments + auto ai = NewFunc->arg_begin(); + auto ArgBuffer = &*ai++; + ArgBuffer->setName("ArgBuffer"); + auto I8PtrTy = I8Ty->getPointerTo(); + + unsigned arg_idx = 0; + unsigned arg_offset = 0; + + llvm::Value* allocated_local_mem = nullptr; + + for (auto& OldArg : function->args()) { + auto ArgType = OldArg.getType(); + auto ArgOffset = llvm::ConstantInt::get(I32Ty, arg_offset); + llvm::Value* Arg; + if (pocl::isLocalMemFunctionArg(function, arg_idx)) { + if (allocated_local_mem == nullptr) { + // Load __local_size + auto local_size_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, "__local_size_ptr"); + arg_offset += 4; + auto local_size = Builder.CreateLoad(I32Ty, local_size_ptr, "__local_size"); + // Call vx_local_alloc(__local_size) + auto function_type = llvm::FunctionType::get(I8PtrTy, {I32Ty}, false); + auto vx_local_alloc_func = module->getOrInsertFunction("vx_local_alloc", function_type); + allocated_local_mem = Builder.CreateCall(vx_local_alloc_func, {local_size}, "__local_mem"); + } + // Load argument __offset + auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr"); + auto offset = Builder.CreateLoad(I32Ty, offset_ptr, OldArg.getName() + "_offset"); + arg_offset += 4; + // Apply pointer offset + Arg = Builder.CreateGEP(I8PtrTy, allocated_local_mem, offset, OldArg.getName() + "_byte_ptr"); + } else { + auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr"); + Arg = Builder.CreateLoad(ArgType, offset_ptr, OldArg.getName() + "_loaded"); + arg_offset += DL.getTypeAllocSize(ArgType); + } + OldArg.replaceAllUsesWith(Arg); + arg_idx += 1; + } + + // Move the body of the old function to the new function + NewFunc->splice(NewFunc->end(), function); + + // Connect the entry block to the first block of the old function + for (auto& BB : *NewFunc) { + if (&BB != EntryBlock) { + Builder.CreateBr(&BB); + break; + } + } + + funcNames.push_back(NewFunc->getName().str()); + + return true; +} + +static void processKernels(llvm::SmallVector<std::string, 8>& funcNames, llvm::Module *module) { + llvm::SmallVector<llvm::Function *, 8> functionsToErase; + for (auto& function : module->functions()) { + if (!pocl::isKernelToProcess(function)) + continue; + if (createArgumentsBuffer(&function, module, funcNames)) + functionsToErase.push_back(&function); + } + for (auto function : functionsToErase) { + function->eraseFromParent(); + } +} + +static void addKernelSelect(llvm::SmallVector<std::string, 8>& funcNames, llvm::Module *module) { + auto& Context = module->getContext(); + + auto I32Ty = llvm::Type::getInt32Ty(Context); + auto VoidTy = llvm::Type::getVoidTy(Context); + auto VoidPtrTy = llvm::PointerType::getUnqual(VoidTy); + auto GetKernelCallbackTy = llvm::FunctionType::get(VoidPtrTy, {I32Ty}, false); + + auto GetKernelCallbackFunc = llvm::Function::Create( + GetKernelCallbackTy, llvm::Function::ExternalLinkage, "__vx_get_kernel_callback", module); + + llvm::IRBuilder<> Builder(Context); + + auto EntryBB = llvm::BasicBlock::Create(Context, "entry", GetKernelCallbackFunc); + Builder.SetInsertPoint(EntryBB); + + // Get the function argument (kernel_index) + auto Args = GetKernelCallbackFunc->arg_begin(); + auto KernelIndex = Args++; + KernelIndex->setName("kernel_index"); + + // Prepare the switch instruction + auto Switch = Builder.CreateSwitch(KernelIndex, EntryBB); + + // Iterate through the functions in the module and create cases for the switch + int FunctionIndex = 0; + for (llvm::Function& F : module->functions()) { + if (std::find(funcNames.begin(), funcNames.end(), F.getName().str()) == funcNames.end()) + continue; + // Create a basic block for this function index + auto CaseBB = llvm::BasicBlock::Create(Context, "case_" + std::to_string(FunctionIndex), GetKernelCallbackFunc); + Builder.SetInsertPoint(CaseBB); + // Return the function pointer + Builder.CreateRet(Builder.CreateBitCast(&F, GetKernelCallbackTy->getReturnType())); + // Add the case to the switch statement + Switch->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(Context), FunctionIndex), CaseBB); + ++FunctionIndex; + } + // Create a default case block for out-of-bounds indices + auto DefaultBB = llvm::BasicBlock::Create(Context, "default", GetKernelCallbackFunc); + Builder.SetInsertPoint(DefaultBB); + Builder.CreateRet(llvm::ConstantPointerNull::get(llvm::cast<llvm::PointerType>(GetKernelCallbackTy->getReturnType()))); + Switch->setDefaultDest(DefaultBB); +} + +int compile_vortex_program(char**kernel_names, int* num_kernels, char* sz_program_vxbin, void* llvm_module) { + int err; + + const char* llvm_install_path = getenv("LLVM_PREFIX"); + if (llvm_install_path) { + if (!pocl_exists(llvm_install_path)) { + POCL_MSG_ERR("$LLVM_PREFIX: '%s' doesn't exist\n", llvm_install_path); + return -1; + } + POCL_MSG_PRINT_INFO("using $LLVM_PREFIX=%s!\n", llvm_install_path); + } + + std::string build_cflags = pocl_get_string_option("POCL_VORTEX_CFLAGS", ""); + if (build_cflags == "") { + POCL_MSG_ERR("'POCL_VORTEX_CFLAGS' need to be set\n"); + return -1; + } + + std::string build_ldflags = pocl_get_string_option ("POCL_VORTEX_LDFLAGS", ""); + if(build_ldflags == ""){ + POCL_MSG_ERR("'POCL_VORTEX_LDFLAGS' need to be set\n"); + return -1; + } + + char sz_program_bc[POCL_MAX_PATHNAME_LENGTH + 1]; + err = pocl_mk_tempname(sz_program_bc, "/tmp/pocl_vortex_program", ".bc", nullptr); + if (err != 0) + return err; + + char sz_program_elf[POCL_MAX_PATHNAME_LENGTH + 1]; + err = pocl_mk_tempname(sz_program_elf, "/tmp/pocl_vortex_program", ".elf", nullptr); + if (err != 0) + return err; + + auto module = (llvm::Module *)llvm_module; + llvm::SmallVector<std::string, 8> kernelNames; + processKernels(kernelNames, module); + addKernelSelect(kernelNames, module); + + *num_kernels = kernelNames.size(); + *kernel_names = convertToCharArray(kernelNames); + + { + std::error_code EC; + llvm::raw_fd_ostream file(sz_program_bc, EC, llvm::sys::fs::OF_None); + llvm::WriteBitcodeToFile(*module, file); + file.close(); + } + + if (POCL_DEBUGGING_ON) { + std::error_code EC; + llvm::raw_fd_ostream file("program.ll", EC, llvm::sys::fs::OF_None); + module->print(file, nullptr); + file.close(); + } + + { + std::string clang_path(CLANG); + if (llvm_install_path) { + clang_path.replace(0, strlen(LLVM_PREFIX), llvm_install_path); + } + + char sz_kernel_main[POCL_MAX_PATHNAME_LENGTH]; + pocl_get_srcdir_or_datadir (sz_kernel_main, "/lib/CL/devices", "", "/vortex/kernel_main.c"); + + std::stringstream ss_cmd, ss_out; + ss_cmd << clang_path.c_str() << " " << build_cflags << " " << sz_program_bc << " " << sz_kernel_main << " " << build_ldflags << " -o " << sz_program_elf; + POCL_MSG_PRINT_LLVM("running \"%s\"\n", ss_cmd.str().c_str()); + int err = exec(ss_cmd.str().c_str(), ss_out); + if (err != 0) { + POCL_MSG_ERR("%s\n", ss_out.str().c_str()); + return err; + } + } + + if (POCL_DEBUGGING_ON) { + std::string objdump_path(LLVM_OBJDUMP); + if (llvm_install_path) { + objdump_path.replace(0, strlen(LLVM_PREFIX), llvm_install_path); + } + + std::stringstream ss_cmd, ss_out; + ss_cmd << objdump_path.c_str() << " -D " << sz_program_elf << " > program.dump"; + + POCL_MSG_PRINT_LLVM("running \"%s\"\n", ss_cmd.str().c_str()); + int err = exec(ss_cmd.str().c_str(), ss_out); + if (err != 0) { + POCL_MSG_ERR("%s\n", ss_out.str().c_str()); + return err; + } + } + + { + std::string vxbintool_path = pocl_get_string_option ("POCL_VORTEX_BINTOOL", ""); + if (vxbintool_path == ""){ + POCL_MSG_ERR("'POCL_VORTEX_BINTOOL' need to be set\n"); + return -1; + } + std::stringstream ss_cmd, ss_out; + ss_cmd << vxbintool_path << " " << sz_program_elf << " " << sz_program_vxbin; + POCL_MSG_PRINT_LLVM("running \"%s\"\n", ss_cmd.str().c_str()); + int err = exec(ss_cmd.str().c_str(), ss_out); + if (err != 0) { + POCL_MSG_ERR("%s\n", ss_out.str().c_str()); + return err; + } + } + + return 0; +} \ No newline at end of file diff --git a/lib/CL/devices/vortex/vortex_utils.h b/lib/CL/devices/vortex/vortex_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..ab3c6f2424abb9cd9eccf3adb3a245d43c49a602 --- /dev/null +++ b/lib/CL/devices/vortex/vortex_utils.h @@ -0,0 +1,18 @@ +#ifndef VORTEX_UTILS_H +#define VORTEX_UTILS_H + +#include "config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void remove_extension(char* filename); + +int compile_vortex_program(char** kernel_names, int* num_kernels, char* sz_program_vxbin, void* llvm_module); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/lib/kernel/CMakeLists.txt b/lib/kernel/CMakeLists.txt index 81244fb2b4e7a490b652a2c7983b3c62e254f4fd..7932a7066ad96b93856bcfe7a999518860f3743f 100644 --- a/lib/kernel/CMakeLists.txt +++ b/lib/kernel/CMakeLists.txt @@ -224,6 +224,12 @@ endif() #********************************************************************* +if(OCL_TARGETS MATCHES "vortex") + add_subdirectory("vortex") +endif() + +#********************************************************************* + # "Escape" a list before passing to an external command string(REPLACE ";" "****" KERNEL_BC_LIST_ESCAPED "${KERNEL_BC_LIST}") diff --git a/lib/kernel/vortex/CMakeLists.txt b/lib/kernel/vortex/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6dbf55420da29dcc17a6a0f9348fcfdad9edc31 --- /dev/null +++ b/lib/kernel/vortex/CMakeLists.txt @@ -0,0 +1,79 @@ +#============================================================================= +# CMake build system files +# +# Copyright (c) 2014-2018 pocl developers +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +#============================================================================= + +include("bitcode_rules") + +if( ${VORTEX_XLEN} EQUAL 64 ) + set(LLVM_TARGET riscv64) + set(TARGET_MARCH rv64imafd) +else( ${VORTEX_XLEN} EQUAL 64 ) + set(LLVM_TARGET riscv32) + set(TARGET_MARCH rv32imaf) +endif( ${VORTEX_XLEN} EQUAL 64 ) + +set(KERNEL_SOURCES ${SOURCES_GENERIC}) + +foreach(FILE printf.c print_base.c + get_work_dim.c get_num_groups.c get_local_size.c get_global_offset.c get_global_size.c + get_group_id.c get_local_id.c get_global_id.c get_linear_id.c + barrier.ll + atomics.cl) + list(REMOVE_ITEM KERNEL_SOURCES "${FILE}") +endforeach() + +foreach(FILE workitems.c printf.c barrier.c) + list(REMOVE_ITEM KERNEL_SOURCES "${FILE}") + list(APPEND KERNEL_SOURCES "vortex/${FILE}") +endforeach() + +set(CLANG_FLAGS "-ffreestanding" "-target" "${LLVM_TARGET}" "-march=${TARGET_MARCH}" "-emit-llvm" "-D_CL_DISABLE_HALF" "-I${VORTEX_PREFIX}/kernel/include") + +set(KERNEL_CL_FLAGS "-Xclang" "-cl-std=CL${VORTEX_DEVICE_CL_STD}" "-D__OPENCL_C_VERSION__=${VORTEX_DEVICE_CL_VERSION}" ${KERNEL_CL_FLAGS}) + +set(LLC_FLAGS "") + +set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${VORTEX_DEVICE_CL_VERSION} -DPOCL_DEVICE_ADDRESS_BITS=${VORTEX_XLEN}") + +separate_arguments(VORTEX_DEVICE_EXTENSIONS) +foreach(EXT ${VORTEX_DEVICE_EXTENSIONS}) + set(DEVICE_CL_FLAGS "${DEVICE_CL_FLAGS} -D${EXT}") +endforeach() +separate_arguments(DEVICE_CL_FLAGS) + +make_kernel_bc(KERNEL_BC "${LLVM_TARGET}" "BCs" 0 0 0 ${KERNEL_SOURCES}) + +# just debug +message(STATUS "${LLVM_TARGET} Kernel BC: ${KERNEL_BC}") + +list(APPEND KERNEL_BC_LIST "${KERNEL_BC}") +set(KERNEL_BC_LIST "${KERNEL_BC_LIST}" PARENT_SCOPE) + +# a target is needed... +add_custom_target("kernel_${LLVM_TARGET}" DEPENDS ${KERNEL_BC}) + +list(APPEND KERNEL_TARGET_LIST "kernel_${LLVM_TARGET}") +set(KERNEL_TARGET_LIST "${KERNEL_TARGET_LIST}" PARENT_SCOPE) + +install(FILES "${KERNEL_BC}" DESTINATION "${POCL_INSTALL_PRIVATE_DATADIR}" COMPONENT "lib") diff --git a/lib/kernel/vortex/barrier.c b/lib/kernel/vortex/barrier.c new file mode 100644 index 0000000000000000000000000000000000000000..c6e0ea21ddf8af77a052ee64b63cabe0bbcc91d1 --- /dev/null +++ b/lib/kernel/vortex/barrier.c @@ -0,0 +1,10 @@ +#include <vx_spawn.h> + +#define CLK_GLOBAL_MEM_FENCE 0x02 + +void _Z7barrierj(int flags) { + if (flags & CLK_GLOBAL_MEM_FENCE) { + vx_fence(); + } + vx_barrier(__local_group_id, __warps_per_group); +} diff --git a/lib/kernel/vortex/printf.c b/lib/kernel/vortex/printf.c new file mode 100644 index 0000000000000000000000000000000000000000..5e2065dfdd5cb7c65ba7116320aeac72027d5b78 --- /dev/null +++ b/lib/kernel/vortex/printf.c @@ -0,0 +1,10 @@ +#include <vx_print.h> + +int printf (const char *restrict fmt, ...) { + int ret; + va_list va; + va_start(va, fmt); + ret = vx_vprintf(fmt, va); + va_end(va); + return ret; +} diff --git a/lib/kernel/vortex/workitems.c b/lib/kernel/vortex/workitems.c new file mode 100644 index 0000000000000000000000000000000000000000..5ac4add8da76c9c8a4fc643f64ce181d314a7def --- /dev/null +++ b/lib/kernel/vortex/workitems.c @@ -0,0 +1,84 @@ +#include <vx_spawn.h> + +extern int g_work_dim; +extern dim3_t g_global_offset; + +uint32_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_work_dim (void) { + return g_work_dim; +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_num_groups(uint32_t dimindx) { + switch (dimindx) { + default: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + } +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_local_size(uint32_t dimindx) { + switch (dimindx) { + default: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + } +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_global_offset(uint32_t dimindx) { + switch (dimindx) { + default: return g_global_offset.x; + case 1: return g_global_offset.y; + case 2: return g_global_offset.z; + } +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_group_id(uint32_t dimindx) { + switch (dimindx) { + default: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + } +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_local_id(uint32_t dimindx) { + switch (dimindx) { + default: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + } +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_global_size(uint32_t dimindx) { + switch (dimindx) { + default: return blockDim.x * gridDim.x; + case 1: return blockDim.y * gridDim.y; + case 2: return blockDim.z * gridDim.z; + } +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_global_id(uint32_t dimindx) { + switch (dimindx) { + default: return blockIdx.x * blockDim.x + threadIdx.x + g_global_offset.x; + case 1: return blockIdx.y * blockDim.y + threadIdx.y + g_global_offset.y; + case 2: return blockIdx.z * blockDim.z + threadIdx.z + g_global_offset.z; + } +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_global_linear_id(void) { + return ((blockIdx.z * blockDim.z + threadIdx.z) * blockDim.y * gridDim.y * blockDim.x * gridDim.x) + + ((blockIdx.y * blockDim.y + threadIdx.y) * blockDim.x * gridDim.x) + + ((blockIdx.x * blockDim.z + threadIdx.x)); +} + +size_t _CL_OVERLOADABLE _CL_READNONE _CL_OPTNONE +get_local_linear_id(void) { + return (threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + threadIdx.x; +} \ No newline at end of file diff --git a/lib/llvmopencl/LLVMUtils.h b/lib/llvmopencl/LLVMUtils.h index c9cf1ad87c393391a621c53836da0a0f0f97d3ce..91ebf3c8e07197f21dab0246f10f968de30b2c92 100644 --- a/lib/llvmopencl/LLVMUtils.h +++ b/lib/llvmopencl/LLVMUtils.h @@ -76,6 +76,7 @@ POCL_EXPORT bool isGVarUsedByFunction(llvm::GlobalVariable *GVar, llvm::Function *F); // Checks if the given argument of Func is a local buffer. +POCL_EXPORT bool isLocalMemFunctionArg(llvm::Function *Func, unsigned ArgIndex); // determines if GVar is OpenCL program-scope variable