Skip to content
Snippets Groups Projects
Commit ecd43d1e authored by Pekka Jääskeläinen's avatar Pekka Jääskeläinen
Browse files

cl_intel_required_subgroup_size for CPUs

CL_KERNEL_SPILL_MEM_SIZE_INTEL and
CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL are yet to be implemented.
parent dda05ce4
No related branches found
No related tags found
No related merge requests found
...@@ -9,6 +9,8 @@ Notable User Facing Changes ...@@ -9,6 +9,8 @@ Notable User Facing Changes
- Added support for generic address spaces in the CPU drivers - Added support for generic address spaces in the CPU drivers
- Added basic support for cl_khr_subgroups for CPUs: A single - Added basic support for cl_khr_subgroups for CPUs: A single
subgroup that always executes the whole X-dimension's WIs. subgroup that always executes the whole X-dimension's WIs.
- Added initial (incomplete) support for
cl_intel_required_subgroup_size for CPUs
3.1 December 2022 3.1 December 2022
================= =================
......
...@@ -775,7 +775,8 @@ option(ENABLE_SLEEF "Use SLEEF for kernel library" ON) ...@@ -775,7 +775,8 @@ option(ENABLE_SLEEF "Use SLEEF for kernel library" ON)
option(ENABLE_CONFORMANCE "Enable conformance to OpenCL standard. \ option(ENABLE_CONFORMANCE "Enable conformance to OpenCL standard. \
Enabling this option this does not guarantee conformance (depends on hardware), \ Enabling this option this does not guarantee conformance (depends on hardware), \
but CMake will give errors if options that conflict with conformance are used" OFF) but CMake will give errors if options that conflict with conformance \
are used. It also disables advertising incomplete extensions." OFF)
if(ENABLE_CONFORMANCE AND (NOT ENABLE_SLEEF)) if(ENABLE_CONFORMANCE AND (NOT ENABLE_SLEEF))
message(FATAL_ERROR "conformance needs enabled SLEEF") message(FATAL_ERROR "conformance needs enabled SLEEF")
...@@ -1201,8 +1202,8 @@ endif() ...@@ -1201,8 +1202,8 @@ endif()
set(HOST_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics \ set(HOST_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics \
cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics \ cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics \
cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_command_buffer") cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_command_buffer")
if(LLVM_VERSION VERSION_GREATER_EQUAL 14.0) if(LLVM_VERSION VERSION_GREATER_EQUAL 14.0)
set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroups") set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroups")
...@@ -1211,13 +1212,21 @@ endif() ...@@ -1211,13 +1212,21 @@ endif()
# Extensions that are work-in-progress with known unfinished aspects. # Extensions that are work-in-progress with known unfinished aspects.
# These are not advertised with a conformant build. # These are not advertised with a conformant build.
# #
# * cl_khr_shuffle: Passes the CTS, but only because it doesn't test non-uniform # * cl_khr_subgroup_shuffle: Passes the CTS, but only because it doesn't test
# (lock-step) behavior: https://github.com/KhronosGroup/OpenCL-CTS/issues/1236 # non-uniform(lock-step) behavior, see:
# https://github.com/KhronosGroup/OpenCL-CTS/issues/1236
# #
# * cl_khr_subgroup_ballot: sub_group_ballot() works for uniform calls, the rest # * cl_khr_subgroup_ballot: sub_group_ballot() works for uniform calls, the rest
# are unimplemented. # are unimplemented.
#
# * cl_intel_subgroups: The block reads/writes are unimplemented.
#
# * cl_intel_required_subgroup_size: CL_KERNEL_SPILL_MEM_SIZE_INTEL and
# CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL are yet to be implemented.
#
if(NOT ENABLE_CONFORMANCE) if(NOT ENABLE_CONFORMANCE)
set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroup_ballot cl_khr_subgroup_shuffle") set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroup_ballot \
cl_khr_subgroup_shuffle cl_intel_subgroups cl_intel_required_subgroup_size")
endif() endif()
set(HOST_DEVICE_FEATURES_30 "__opencl_c_3d_image_writes __opencl_c_images \ set(HOST_DEVICE_FEATURES_30 "__opencl_c_3d_image_writes __opencl_c_images \
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
Copyright (c) 2011-2013 Universidad Rey Juan Carlos and Copyright (c) 2011-2013 Universidad Rey Juan Carlos and
2011-2021 Pekka Jääskeläinen 2011-2021 Pekka Jääskeläinen
2023 Pekka Jääskeläinen / Intel Finland Oy
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to of this software and associated documentation files (the "Software"), to
...@@ -118,7 +119,7 @@ pocl_basic_init_device_ops(struct pocl_device_ops *ops) ...@@ -118,7 +119,7 @@ pocl_basic_init_device_ops(struct pocl_device_ops *ops)
ops->build_hash = pocl_basic_build_hash; ops->build_hash = pocl_basic_build_hash;
ops->compute_local_size = pocl_default_local_size_optimizer; ops->compute_local_size = pocl_default_local_size_optimizer;
ops->get_device_info_ext = NULL; ops->get_device_info_ext = pocl_basic_device_info_ext;
ops->svm_free = pocl_basic_svm_free; ops->svm_free = pocl_basic_svm_free;
ops->svm_alloc = pocl_basic_svm_alloc; ops->svm_alloc = pocl_basic_svm_alloc;
...@@ -856,3 +857,21 @@ pocl_basic_svm_copy (cl_device_id dev, void *__restrict__ dst, ...@@ -856,3 +857,21 @@ pocl_basic_svm_copy (cl_device_id dev, void *__restrict__ dst,
{ {
memcpy (dst, src, size); memcpy (dst, src, size);
} }
/* Answers the extension-specific device info queries of the basic CPU driver.
 *
 * Only CL_DEVICE_SUB_GROUP_SIZES_INTEL is handled: it reports the list of
 * sub-group sizes through the usual (param_value_size, param_value,
 * param_value_size_ret) triple. Every other param_name yields
 * CL_INVALID_VALUE.
 *
 * The 'device' argument is currently unused.
 */
cl_int
pocl_basic_device_info_ext (cl_device_id device, cl_device_info param_name,
                            size_t param_value_size, void *param_value,
                            size_t *param_value_size_ret)
{
  switch (param_name)
    {
    case CL_DEVICE_SUB_GROUP_SIZES_INTEL:
      {
        /* The braces are required: a declaration may not directly follow a
           case label in C (before C23), and in C++ the jump to 'default'
           would cross the initialization of 'sizes'. */

        /* We can basically support fixing any WG size with the CPU devices,
           but let's report something semi-sensible here for vectorization
           aid. */
        size_t sizes[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 };
        /* NOTE(review): the macro presumably returns from the function, as
           its name suggests; otherwise control falls through to 'default'
           exactly as in the original code -- confirm. */
        POCL_RETURN_GETINFO_ARRAY (size_t, sizeof (sizes) / sizeof (size_t),
                                   sizes);
      }
    default:
      return CL_INVALID_VALUE;
    }
}
...@@ -41,4 +41,9 @@ ...@@ -41,4 +41,9 @@
#include "prototypes.inc" #include "prototypes.inc"
GEN_PROTOTYPES (basic) GEN_PROTOTYPES (basic)
/* Extension device-info query hook of the basic driver (installed as
   ops->get_device_info_ext). Handles CL_DEVICE_SUB_GROUP_SIZES_INTEL and
   returns CL_INVALID_VALUE for any other param_name. */
cl_int pocl_basic_device_info_ext (cl_device_id device,
cl_device_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret);
#endif /* POCL_BASIC_H */ #endif /* POCL_BASIC_H */
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
Copyright (c) 2011-2013 Universidad Rey Juan Carlos Copyright (c) 2011-2013 Universidad Rey Juan Carlos
2011-2021 Pekka Jääskeläinen 2011-2021 Pekka Jääskeläinen
2022-2023 Pekka Jääskeläinen / Intel Finland Oy
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to of this software and associated documentation files (the "Software"), to
...@@ -1741,7 +1742,9 @@ pocl_setup_opencl_c_with_version (cl_device_id dev, int supports_30) ...@@ -1741,7 +1742,9 @@ pocl_setup_opencl_c_with_version (cl_device_id dev, int supports_30)
} }
static const cl_name_version OPENCL_EXTENSIONS[] static const cl_name_version OPENCL_EXTENSIONS[]
= { { CL_MAKE_VERSION (1, 0, 0), "cl_khr_byte_addressable_store" }, = { { CL_MAKE_VERSION (1, 0, 0), "cl_intel_required_subgroup_size" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_intel_subgroups" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_khr_byte_addressable_store" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_khr_global_int32_base_atomics" }, { CL_MAKE_VERSION (1, 0, 0), "cl_khr_global_int32_base_atomics" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_khr_global_int32_extended_atomics" }, { CL_MAKE_VERSION (1, 0, 0), "cl_khr_global_int32_extended_atomics" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_khr_local_int32_base_atomics" }, { CL_MAKE_VERSION (1, 0, 0), "cl_khr_local_int32_base_atomics" },
......
/* OpenCL built-in library: subgroup basic functionality /* OpenCL built-in library: subgroups functionality
Copyright (c) 2022-2023 Pekka Jääskeläinen / Intel Finland Oy Copyright (c) 2022-2023 Pekka Jääskeläinen / Intel Finland Oy
...@@ -21,17 +21,10 @@ ...@@ -21,17 +21,10 @@
IN THE SOFTWARE. IN THE SOFTWARE.
*/ */
/* The default implementation of subgroups is the simplest possible one of /* The default implementation of subgroups for CPU drivers. It uses work-group
always having one subgroup executing the innermost dimension. sized local buffers for exchanging the data. The subgroup size is by default
the local X dimension side, unless restricted with the
Next, the plan is to allow the default to be changed explicitly by intel_reqd_sub_group_size metadata.
means of the intel_reqd_sub_group_size annotation as described in
https://registry.khronos.org/OpenCL/extensions/intel/
cl_intel_required_subgroup_size.html
This forms a minimal viable feature set sufficient to emulate different
warp sizes for CUDA/HIP execution. Performance via efficient vectorization
is not a priority for now.
*/ */
#include <math.h> #include <math.h>
...@@ -65,11 +58,13 @@ size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx); ...@@ -65,11 +58,13 @@ size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx);
size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx); size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx);
/* Magic variable that is expanded in Workgroup.cc: its uses are replaced with
   the kernel's intel_reqd_sub_group_size constant when that metadata exists,
   and with the local size of the X dimension otherwise. */
extern uint _pocl_sub_group_size;
uint _CL_OVERLOADABLE uint _CL_OVERLOADABLE
get_sub_group_size (void) get_sub_group_size (void)
{ {
/* By default 1 SG per WG_x. */ return _pocl_sub_group_size;
return get_local_size (0);
} }
uint _CL_OVERLOADABLE uint _CL_OVERLOADABLE
...@@ -81,7 +76,8 @@ get_max_sub_group_size (void) ...@@ -81,7 +76,8 @@ get_max_sub_group_size (void)
uint _CL_OVERLOADABLE uint _CL_OVERLOADABLE
get_num_sub_groups (void) get_num_sub_groups (void)
{ {
return (uint)get_local_size (1) * get_local_size (2); return (uint)get_local_size (0) * get_local_size (1) * get_local_size (2)
/ get_max_sub_group_size ();
} }
uint _CL_OVERLOADABLE uint _CL_OVERLOADABLE
...@@ -90,31 +86,44 @@ get_enqueued_num_sub_groups (void) ...@@ -90,31 +86,44 @@ get_enqueued_num_sub_groups (void)
return 1; return 1;
} }
size_t _CL_OVERLOADABLE get_local_linear_id (void);
uint _CL_OVERLOADABLE uint _CL_OVERLOADABLE
get_sub_group_id (void) get_sub_group_id (void)
{ {
return get_local_id (2) * get_local_size (1) + get_local_id (1); return (uint)get_local_linear_id () / get_max_sub_group_size ();
} }
uint _CL_OVERLOADABLE uint _CL_OVERLOADABLE
get_sub_group_local_id (void) get_sub_group_local_id (void)
{ {
return (uint)get_local_id (0); return (uint)get_local_linear_id () % get_max_sub_group_size ();
}
static size_t _CL_OVERLOADABLE
get_first_llid (void)
{
return get_sub_group_id () * get_max_sub_group_size ();
} }
void _CL_OVERLOADABLE sub_group_barrier (cl_mem_fence_flags flags); void _CL_OVERLOADABLE sub_group_barrier (cl_mem_fence_flags flags);
#define SUB_GROUP_SHUFFLE_T(TYPE) \ #define SUB_GROUP_SHUFFLE_PT(PREFIX, TYPE) \
__attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE sub_group_shuffle ( \ __attribute__ ((always_inline)) \
TYPE val, uint index) \ TYPE _CL_OVERLOADABLE PREFIX##sub_group_shuffle (TYPE val, uint index) \
{ \ { \
volatile TYPE *temp_storage \ volatile TYPE *temp_storage \
= __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \ = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \
temp_storage[get_sub_group_local_id ()] = val; \ temp_storage[get_local_linear_id ()] = val; \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
return temp_storage[index % get_sub_group_size ()]; \ return temp_storage[get_first_llid () + index % get_sub_group_size ()]; \
} }
/* Define both the non-prefixed (khr) and Intel-prefixed shuffles, i.e.
   sub_group_shuffle () and intel_sub_group_shuffle (), for TYPE. */
#define SUB_GROUP_SHUFFLE_T(TYPE) \
SUB_GROUP_SHUFFLE_PT (, TYPE) \
SUB_GROUP_SHUFFLE_PT (intel_, TYPE)
SUB_GROUP_SHUFFLE_T (char) SUB_GROUP_SHUFFLE_T (char)
SUB_GROUP_SHUFFLE_T (uchar) SUB_GROUP_SHUFFLE_T (uchar)
SUB_GROUP_SHUFFLE_T (short) SUB_GROUP_SHUFFLE_T (short)
...@@ -126,18 +135,24 @@ SUB_GROUP_SHUFFLE_T (ulong) ...@@ -126,18 +135,24 @@ SUB_GROUP_SHUFFLE_T (ulong)
SUB_GROUP_SHUFFLE_T (float) SUB_GROUP_SHUFFLE_T (float)
SUB_GROUP_SHUFFLE_T (double) SUB_GROUP_SHUFFLE_T (double)
#define SUB_GROUP_SHUFFLE_XOR_T(TYPE) \ #define SUB_GROUP_SHUFFLE_XOR_PT(PREFIX, TYPE) \
__attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \ __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \
sub_group_shuffle_xor (TYPE val, uint mask) \ PREFIX##sub_group_shuffle_xor (TYPE val, uint mask) \
{ \ { \
volatile TYPE *temp_storage \ volatile TYPE *temp_storage \
= __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \ = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \
temp_storage[get_sub_group_local_id ()] = val; \ temp_storage[get_local_linear_id ()] = val; \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
return temp_storage[(get_sub_group_local_id () ^ mask) \ return temp_storage[get_first_llid () \
% get_sub_group_size ()]; \ + (get_sub_group_local_id () ^ mask) \
% get_sub_group_size ()]; \
} }
/* Define both the non-prefixed (khr) and Intel-prefixed XOR shuffles, i.e.
   sub_group_shuffle_xor () and intel_sub_group_shuffle_xor (), for TYPE. */
#define SUB_GROUP_SHUFFLE_XOR_T(TYPE) \
SUB_GROUP_SHUFFLE_XOR_PT (, TYPE) \
SUB_GROUP_SHUFFLE_XOR_PT (intel_, TYPE)
SUB_GROUP_SHUFFLE_XOR_T (char) SUB_GROUP_SHUFFLE_XOR_T (char)
SUB_GROUP_SHUFFLE_XOR_T (uchar) SUB_GROUP_SHUFFLE_XOR_T (uchar)
SUB_GROUP_SHUFFLE_XOR_T (short) SUB_GROUP_SHUFFLE_XOR_T (short)
...@@ -169,18 +184,19 @@ SUB_GROUP_BROADCAST_T (double) ...@@ -169,18 +184,19 @@ SUB_GROUP_BROADCAST_T (double)
{ \ { \
volatile TYPE *temp_storage \ volatile TYPE *temp_storage \
= __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \ = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \
temp_storage[get_sub_group_local_id ()] = val; \ temp_storage[get_local_linear_id ()] = val; \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
if (get_sub_group_local_id () == 0) \ if (get_sub_group_local_id () == 0) \
{ \ { \
for (uint i = 1; i < get_sub_group_size (); ++i) \ for (uint i = 1; i < get_sub_group_size (); ++i) \
{ \ { \
TYPE a = temp_storage[0], b = temp_storage[i]; \ TYPE a = temp_storage[get_first_llid ()], \
temp_storage[0] = OPERATION; \ b = temp_storage[get_first_llid () + i]; \
temp_storage[get_first_llid ()] = OPERATION; \
} \ } \
} \ } \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
return temp_storage[0]; \ return temp_storage[get_first_llid ()]; \
} }
#define SUB_GROUP_REDUCE_T(OPNAME, OPERATION) \ #define SUB_GROUP_REDUCE_T(OPNAME, OPERATION) \
...@@ -201,18 +217,19 @@ SUB_GROUP_REDUCE_T (max, a > b ? a : b) ...@@ -201,18 +217,19 @@ SUB_GROUP_REDUCE_T (max, a > b ? a : b)
{ \ { \
volatile TYPE *data \ volatile TYPE *data \
= __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \ = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \
data[get_sub_group_local_id ()] = val; \ data[get_local_linear_id ()] = val; \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
if (get_sub_group_local_id () == 0) \ if (get_sub_group_local_id () == 0) \
{ \ { \
for (uint i = 1; i < get_sub_group_size (); ++i) \ for (uint i = 1; i < get_sub_group_size (); ++i) \
{ \ { \
TYPE a = data[i - 1], b = data[i]; \ TYPE a = data[get_first_llid () + i - 1], \
data[i] = OPERATION; \ b = data[get_first_llid () + i]; \
data[get_first_llid () + i] = OPERATION; \
} \ } \
} \ } \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
return data[get_sub_group_local_id ()]; \ return data[get_local_linear_id ()]; \
} }
#define SUB_GROUP_SCAN_INCLUSIVE_T(OPNAME, OPERATION) \ #define SUB_GROUP_SCAN_INCLUSIVE_T(OPNAME, OPERATION) \
...@@ -233,19 +250,20 @@ SUB_GROUP_SCAN_INCLUSIVE_T (max, a > b ? a : b) ...@@ -233,19 +250,20 @@ SUB_GROUP_SCAN_INCLUSIVE_T (max, a > b ? a : b)
{ \ { \
volatile TYPE *data \ volatile TYPE *data \
= __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \ = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE)); \
data[get_sub_group_local_id () + 1] = val; \ data[get_local_linear_id () + 1] = val; \
data[0] = ID; \ data[get_first_llid ()] = ID; \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
if (get_sub_group_local_id () == 0) \ if (get_sub_group_local_id () == 0) \
{ \ { \
for (uint i = 1; i < get_sub_group_size (); ++i) \ for (uint i = 1; i < get_sub_group_size (); ++i) \
{ \ { \
TYPE a = data[i - 1], b = data[i]; \ TYPE a = data[get_first_llid () + i - 1], \
data[i] = OPERATION; \ b = data[get_first_llid () + i]; \
data[get_first_llid () + i] = OPERATION; \
} \ } \
} \ } \
sub_group_barrier (CLK_LOCAL_MEM_FENCE); \ sub_group_barrier (CLK_LOCAL_MEM_FENCE); \
return data[get_sub_group_local_id ()]; \ return data[get_local_linear_id ()]; \
} }
SUB_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, int, 0) SUB_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, int, 0)
...@@ -272,19 +290,23 @@ SUB_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, double, -INFINITY) ...@@ -272,19 +290,23 @@ SUB_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, double, -INFINITY)
__attribute__ ((always_inline)) uint4 _CL_OVERLOADABLE __attribute__ ((always_inline)) uint4 _CL_OVERLOADABLE
sub_group_ballot (int predicate) sub_group_ballot (int predicate)
{ {
uint *flags = __pocl_local_mem_alloca (sizeof (uint) * 4, sizeof (uint) * 4); /* TODO: We actually would need only one per SG. */
uint *flags
= __pocl_work_group_alloca (sizeof (uint) * 4, sizeof (uint) * 4);
char *res = __pocl_work_group_alloca (sizeof (char), 4); char *res = __pocl_work_group_alloca (sizeof (char), 4);
if (get_sub_group_local_id () < 128) if (get_local_linear_id () < 128)
res[get_sub_group_local_id ()] = !!predicate; res[get_local_linear_id ()] = !!predicate;
sub_group_barrier (CLK_LOCAL_MEM_FENCE); sub_group_barrier (CLK_LOCAL_MEM_FENCE);
if (get_sub_group_local_id () == 0) if (get_sub_group_local_id () == 0)
{ {
flags[0] = flags[1] = flags[2] = flags[3] = ~0; flags[get_first_llid ()] = flags[get_first_llid () + 1]
= flags[get_first_llid () + 2] = flags[get_first_llid () + 3] = ~0;
for (uint i = 0; i < get_sub_group_size () && i < 128; ++i) for (uint i = 0; i < get_sub_group_size () && i < 128; ++i)
{ {
flags[i / 32] |= res[i] << (i % 32); flags[get_first_llid () * 4 + i / 32]
|= res[get_first_llid () * 4 + i] << (i % 32);
} }
} }
sub_group_barrier (CLK_LOCAL_MEM_FENCE); sub_group_barrier (CLK_LOCAL_MEM_FENCE);
return *(uint4 *)flags; return ((uint4 *)flags)[get_first_llid () * 4];
} }
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
// and parallelized kernel for an OpenCL workgroup. // and parallelized kernel for an OpenCL workgroup.
// //
// Copyright (c) 2011 Universidad Rey Juan Carlos // Copyright (c) 2011 Universidad Rey Juan Carlos
// 2012-2022 Pekka Jääskeläinen // 2012-2022 Pekka Jääskeläinen / Parform Oy
// 2023 Pekka Jääskeläinen / Intel Finland Oy
// //
// Permission is hereby granted, free of charge, to any person obtaining a copy // Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal // of this software and associated documentation files (the "Software"), to deal
...@@ -73,9 +74,6 @@ using namespace std; ...@@ -73,9 +74,6 @@ using namespace std;
using namespace llvm; using namespace llvm;
using namespace pocl; using namespace pocl;
enum PoclContextStructFields { enum PoclContextStructFields {
PC_NUM_GROUPS, PC_NUM_GROUPS,
PC_GLOBAL_OFFSET, PC_GLOBAL_OFFSET,
...@@ -130,8 +128,6 @@ Workgroup::runOnModule(Module &M) { ...@@ -130,8 +128,6 @@ Workgroup::runOnModule(Module &M) {
SizeTWidth = address_bits; SizeTWidth = address_bits;
SizeT = IntegerType::get(*C, SizeTWidth); SizeT = IntegerType::get(*C, SizeTWidth);
// LLVM 8.0 dropped the TypeBuilder API. This is a cleaner version
// anyways as it builds the context type using the SizeT directly.
llvm::Type *Int32T = Type::getInt32Ty(*C); llvm::Type *Int32T = Type::getInt32Ty(*C);
llvm::Type *Int8T = Type::getInt8Ty(*C); llvm::Type *Int8T = Type::getInt8Ty(*C);
PoclContextT = PoclContextT =
...@@ -756,8 +752,8 @@ Workgroup::globalHandlesToContextStructLoads( ...@@ -756,8 +752,8 @@ Workgroup::globalHandlesToContextStructLoads(
return StructLoads; return StructLoads;
} }
// Converts uses of the given variable handles (external global variables) to // Converts uses of the given pseudo variable handles (magic external global
// use the given function-private values instead. // variables) to use the given function-private values instead.
void void
Workgroup::privatizeGlobals(llvm::Function *F, llvm::IRBuilder<> &Builder, Workgroup::privatizeGlobals(llvm::Function *F, llvm::IRBuilder<> &Builder,
const std::vector<std::string> &&GlobalHandleNames, const std::vector<std::string> &&GlobalHandleNames,
...@@ -919,6 +915,17 @@ Workgroup::privatizeContext(Function *F) ...@@ -919,6 +915,17 @@ Workgroup::privatizeContext(Function *F)
Builder, {"_num_groups_x", "_num_groups_y", "_num_groups_z"}, Builder, {"_num_groups_x", "_num_groups_y", "_num_groups_z"},
PC_NUM_GROUPS)); PC_NUM_GROUPS));
// Privatize the subgroup size (for CPUs), if referred.
if (M->getGlobalVariable("_pocl_sub_group_size") != nullptr) {
Value *SGSize = getRequiredSubgroupSize(*F);
if (SGSize == nullptr) {
SGSize = Builder.CreateLoad(LocalSizeAllocas[0]->getAllocatedType(),
LocalSizeAllocas[0]);
}
assert(SGSize != nullptr);
privatizeGlobals(F, Builder, {"_pocl_sub_group_size"}, {SGSize});
}
if (DeviceSidePrintf) { if (DeviceSidePrintf) {
// Privatize _printf_buffer // Privatize _printf_buffer
privatizeGlobals( privatizeGlobals(
...@@ -1652,3 +1659,18 @@ Workgroup::hasWorkgroupBarriers(const Function &F) ...@@ -1652,3 +1659,18 @@ Workgroup::hasWorkgroupBarriers(const Function &F)
} }
return false; return false;
} }
// Looks up the subgroup size a kernel explicitly requests through its
// "intel_reqd_sub_group_size" function metadata and returns it as the
// integer constant stored in the metadata node's first operand.
//
// Returns nullptr when the function carries no such metadata; the caller is
// then expected to pick the default (the local X dimension size for the CPU
// implementations).
llvm::Value *Workgroup::getRequiredSubgroupSize(llvm::Function &F) {
  MDNode *ReqdSizeNode = F.getMetadata("intel_reqd_sub_group_size");
  if (ReqdSizeNode == nullptr)
    return nullptr;

  // Unwrap ConstantAsMetadata -> ConstantInt in one step; like the plain
  // cast<> chain, this asserts on metadata of an unexpected shape.
  return mdconst::extract<ConstantInt>(ReqdSizeNode->getOperand(0));
}
...@@ -2,12 +2,13 @@ ...@@ -2,12 +2,13 @@
// //
// Copyright (c) 2011 Universidad Rey Juan Carlos // Copyright (c) 2011 Universidad Rey Juan Carlos
// 2011-2018 Pekka Jääskeläinen // 2011-2018 Pekka Jääskeläinen
// 2023 Pekka Jääskeläinen / Intel Finland Oy
// //
// Permission is hereby granted, free of charge, to any person obtaining a copy // Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal // of this software and associated documentation files (the "Software"), to
// in the Software without restriction, including without limitation the rights // deal in the Software without restriction, including without limitation the
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// copies of the Software, and to permit persons to whom the Software is // sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions: // furnished to do so, subject to the following conditions:
// //
// The above copyright notice and this permission notice shall be included in // The above copyright notice and this permission notice shall be included in
...@@ -17,9 +18,9 @@ ...@@ -17,9 +18,9 @@
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// THE SOFTWARE. // IN THE SOFTWARE.
#ifndef _POCL_WORKGROUP_H #ifndef _POCL_WORKGROUP_H
#define _POCL_WORKGROUP_H #define _POCL_WORKGROUP_H
...@@ -91,6 +92,8 @@ namespace pocl { ...@@ -91,6 +92,8 @@ namespace pocl {
LLVMContextRef Ctx, LLVMValueRef F, LLVMContextRef Ctx, LLVMValueRef F,
unsigned ParamIndex); unsigned ParamIndex);
llvm::Value *getRequiredSubgroupSize(llvm::Function &F);
llvm::Module *M; llvm::Module *M;
llvm::LLVMContext *C; llvm::LLVMContext *C;
......
...@@ -20,33 +20,9 @@ SCRIPTPATH=$( realpath "$0" ) ...@@ -20,33 +20,9 @@ SCRIPTPATH=$( realpath "$0" )
RELPATH=$(dirname "$SCRIPTPATH") RELPATH=$(dirname "$SCRIPTPATH")
$RELPATH/clang-format-diff.py -regex '(.*(\.hpp$|\.cc$|\.cpp$))|(lib/llvmopencl/.*)' -i -p1 -style LLVM <$PATCHY $RELPATH/clang-format-diff.py -regex '(.*(\.hpp$|\.cc$|\.cpp$))|(lib/llvmopencl/.*)' -i -p1 -style LLVM <$PATCHY
$RELPATH/clang-format-diff.py -regex '.*(\.h$|\.c$|\.cl$)' -i -p1 -style GNU <$PATCHY $RELPATH/clang-format-diff.py -regex '(.*(\.hh$|\.cc$))|(lib/llvmopencl/.*)|(lib/CL/devices/tce/.*)' -i -p1 -style LLVM <$PATCHY
if [ -z "$(git diff)" ]; then if [ -z "$(git diff)" ]; then
echo "No changes." echo "No changes."
exit 0 exit 0
fi fi
git diff
echo "ACCEPT CHANGES ?"
read REPLY
if [ "$REPLY" == "y" ]; then
git add -u
git commit --amend
if [ -d .git/rebase-merge ]; then
git rebase --continue
fi
else
git add -p
fi
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment