cl_intel_required_subgroup_size for CPUs

CL_KERNEL_SPILL_MEM_SIZE_INTEL and CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL are not yet implemented.

cl_intel_required_subgroup_size for CPUs
ecd43d1e · Pekka Jääskeläinen · dda05ce4 · ecd43d1e · ecd43d1e · ecd43d1e
Commit ecd43d1e authored 2 years ago by Pekka Jääskeläinen
--- a/CHANGES
+++ b/CHANGES
@@ -9,6 +9,8 @@ Notable User Facing Changes
 - Added support for generic address spaces in the CPU drivers
 - Added basic support for cl_khr_subgroups for CPUs: A single
  subgroup that always executes the whole X-dimension's WIs.
+- Added initial (incomplete) support for 
+  cl_intel_required_subgroup_size for CPUs
 3.1 December 2022
 =================

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -775,7 +775,8 @@ option(ENABLE_SLEEF "Use SLEEF for kernel library" ON)
 option(ENABLE_CONFORMANCE "Enable conformance to OpenCL standard. \
  Enabling this option this does not guarantee conformance (depends on hardware), \
-  but CMake will give errors if options that conflict with conformance are used" OFF)
+  but CMake will give errors if options that conflict with conformance \
+are used. It also disables advertising incomplete extensions." OFF)
 if(ENABLE_CONFORMANCE AND (NOT ENABLE_SLEEF))
  message(FATAL_ERROR "conformance needs enabled SLEEF")
@@ -1201,8 +1202,8 @@ endif()
 set(HOST_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics \
-  cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics \
+cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics \
-  cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_command_buffer")
+cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_command_buffer")
 if(LLVM_VERSION VERSION_GREATER_EQUAL 14.0)
  set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroups")
@@ -1211,13 +1212,21 @@ endif()
 # Extensions that are work-in-progress with known unfinished aspects.
 # These are not advertised with a conformant build.
 #
-# * cl_khr_shuffle: Passes the CTS, but only because it doesn't test non-uniform
+# * cl_khr_subgroup_shuffle: Passes the CTS, but only because it doesn't test
-# (lock-step) behavior: https://github.com/KhronosGroup/OpenCL-CTS/issues/1236
+#   non-uniform (lock-step) behavior, see:
+#   https://github.com/KhronosGroup/OpenCL-CTS/issues/1236
 #
 # * cl_khr_subgroup_ballot: sub_group_ballot() works for uniform calls, the rest
 # are unimplemented.
+#
+# * cl_intel_subgroups: The block reads/writes are unimplemented.
+#
+# * cl_intel_required_subgroup_size: CL_KERNEL_SPILL_MEM_SIZE_INTEL and
+#   CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL are not yet implemented.
+#
 if(NOT ENABLE_CONFORMANCE)
-  set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroup_ballot cl_khr_subgroup_shuffle")
+  set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroup_ballot \
+cl_khr_subgroup_shuffle cl_intel_subgroups cl_intel_required_subgroup_size")
 endif()
 set(HOST_DEVICE_FEATURES_30 "__opencl_c_3d_image_writes  __opencl_c_images \

--- a/lib/CL/devices/basic/basic.c
+++ b/lib/CL/devices/basic/basic.c
@@ -2,6 +2,7 @@
   Copyright (c) 2011-2013 Universidad Rey Juan Carlos and
                 2011-2021 Pekka Jääskeläinen
+                 2023 Pekka Jääskeläinen / Intel Finland Oy
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to
@@ -118,7 +119,7 @@ pocl_basic_init_device_ops(struct pocl_device_ops *ops)
  ops->build_hash = pocl_basic_build_hash;
  ops->compute_local_size = pocl_default_local_size_optimizer;
-  ops->get_device_info_ext = NULL;
+  ops->get_device_info_ext = pocl_basic_device_info_ext;
  ops->svm_free = pocl_basic_svm_free;
  ops->svm_alloc = pocl_basic_svm_alloc;
@@ -856,3 +857,21 @@ pocl_basic_svm_copy (cl_device_id dev, void *__restrict__ dst,
 {
  memcpy (dst, src, size);
 }
+/* Extension device-info query handler of the basic CPU driver; installed as
+   ops->get_device_info_ext.
+
+   Handles CL_DEVICE_SUB_GROUP_SIZES_INTEL by handing a fixed list of
+   power-of-two sizes to POCL_RETURN_GETINFO_ARRAY (defined elsewhere;
+   presumably it consumes param_value_size, param_value and
+   param_value_size_ret and returns from the function).
+
+   Returns CL_INVALID_VALUE for any other param_name. */
+cl_int
+pocl_basic_device_info_ext (cl_device_id device, cl_device_info param_name,
+                            size_t param_value_size, void *param_value,
+                            size_t *param_value_size_ret)
+{
+  switch (param_name)
+    {
+    case CL_DEVICE_SUB_GROUP_SIZES_INTEL:
+      {
+        /* The compound statement is required: before C23 a declaration may
+           not directly follow a case label.
+
+           We can basically support fixing any WG size with the CPU devices,
+           but let's report something semi-sensible here for vectorization
+           aid. */
+        size_t sizes[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 };
+        POCL_RETURN_GETINFO_ARRAY (size_t, sizeof (sizes) / sizeof (sizes[0]),
+                                   sizes);
+      }
+    default:
+      return CL_INVALID_VALUE;
+    }
+}
--- a/lib/CL/devices/basic/basic.h
+++ b/lib/CL/devices/basic/basic.h
@@ -41,4 +41,9 @@
 #include "prototypes.inc"
 GEN_PROTOTYPES (basic)
+cl_int pocl_basic_device_info_ext (cl_device_id device,
+                                   cl_device_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret);
 #endif /* POCL_BASIC_H */
--- a/lib/CL/devices/common.c
+++ b/lib/CL/devices/common.c
@@ -3,6 +3,7 @@
   Copyright (c) 2011-2013 Universidad Rey Juan Carlos
                 2011-2021 Pekka Jääskeläinen
+                 2022-2023 Pekka Jääskeläinen / Intel Finland Oy
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to
@@ -1741,7 +1742,9 @@ pocl_setup_opencl_c_with_version (cl_device_id dev, int supports_30)
 }
 static const cl_name_version OPENCL_EXTENSIONS[]
-    = { { CL_MAKE_VERSION (1, 0, 0), "cl_khr_byte_addressable_store" },
+    = { { CL_MAKE_VERSION (1, 0, 0), "cl_intel_required_subgroup_size" },
+        { CL_MAKE_VERSION (1, 0, 0), "cl_intel_subgroups" },
+        { CL_MAKE_VERSION (1, 0, 0), "cl_khr_byte_addressable_store" },
        { CL_MAKE_VERSION (1, 0, 0), "cl_khr_global_int32_base_atomics" },
        { CL_MAKE_VERSION (1, 0, 0), "cl_khr_global_int32_extended_atomics" },
        { CL_MAKE_VERSION (1, 0, 0), "cl_khr_local_int32_base_atomics" },

--- a/lib/kernel/subgroups.c
+++ b/lib/kernel/subgroups.c
-/* OpenCL built-in library: subgroup basic functionality
+/* OpenCL built-in library: subgroup functionality
   Copyright (c) 2022-2023 Pekka Jääskeläinen / Intel Finland Oy
@@ -21,17 +21,10 @@
   IN THE SOFTWARE.
 */
-/* The default implementation of subgroups is the simplest possible one of
+/* The default implementation of subgroups for CPU drivers. It uses work-group
-   always having one subgroup executing the innermost dimension.
+   sized local buffers for exchanging the data. The subgroup size is by default
+   the local X dimension size, unless restricted with the
-   Next, the plan is to allow the default to be changed explicitly by
+   intel_reqd_sub_group_size metadata.
-   means of the intel_reqd_sub_group_size annotation as described in
-   https://registry.khronos.org/OpenCL/extensions/intel/
-   cl_intel_required_subgroup_size.html
-   This forms a minimal viable feature set sufficient to emulate different
-   warp sizes for CUDA/HIP execution. Performance via efficient vectorization
-   is not a priority for now.
 */
 #include <math.h>
@@ -65,11 +58,13 @@ size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx);
 size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx);
+/* Magic variable that is expanded in Workgroup.cc */
+extern uint _pocl_sub_group_size;
 uint _CL_OVERLOADABLE
 get_sub_group_size (void)
 {
-  /* By default 1 SG per WG_x. */
+  return _pocl_sub_group_size;
-  return get_local_size (0);
 }
 uint _CL_OVERLOADABLE
@@ -81,7 +76,8 @@ get_max_sub_group_size (void)
 uint _CL_OVERLOADABLE
 get_num_sub_groups (void)
 {
-  return (uint)get_local_size (1) * get_local_size (2);
+  return (uint)get_local_size (0) * get_local_size (1) * get_local_size (2)
+         / get_max_sub_group_size ();
 }
 uint _CL_OVERLOADABLE
@@ -90,31 +86,44 @@ get_enqueued_num_sub_groups (void)
  return 1;
 }
+size_t _CL_OVERLOADABLE get_local_linear_id (void);
 uint _CL_OVERLOADABLE
 get_sub_group_id (void)
 {
-  return get_local_id (2) * get_local_size (1) + get_local_id (1);
+  return (uint)get_local_linear_id () / get_max_sub_group_size ();
 }
 uint _CL_OVERLOADABLE
 get_sub_group_local_id (void)
 {
-  return (uint)get_local_id (0);
+  return (uint)get_local_linear_id () % get_max_sub_group_size ();
+}
+/* Returns get_sub_group_id () multiplied by get_max_sub_group_size ().
+   The subgroup functions in this file use it as the base index of the
+   calling work-item's subgroup in buffers indexed by the local linear id. */
+static size_t _CL_OVERLOADABLE
+get_first_llid (void)
+{
+  return get_sub_group_id () * get_max_sub_group_size ();
 }
 void _CL_OVERLOADABLE sub_group_barrier (cl_mem_fence_flags flags);
-#define SUB_GROUP_SHUFFLE_T(TYPE)                                             \
+#define SUB_GROUP_SHUFFLE_PT(PREFIX, TYPE)                                    \
-  __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE sub_group_shuffle (   \
+  __attribute__ ((always_inline))                                             \
-      TYPE val, uint index)                                                   \
+  TYPE _CL_OVERLOADABLE PREFIX##sub_group_shuffle (TYPE val, uint index)      \
  {                                                                           \
    volatile TYPE *temp_storage                                               \
        = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE));            \
-    temp_storage[get_sub_group_local_id ()] = val;                            \
+    temp_storage[get_local_linear_id ()] = val;                               \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
-    return temp_storage[index % get_sub_group_size ()];                       \
+    return temp_storage[get_first_llid () + index % get_sub_group_size ()];   \
  }
+/* Define both the non-prefixed (khr) and Intel-prefixed shuffles. */
+#define SUB_GROUP_SHUFFLE_T(TYPE)                                             \
+  SUB_GROUP_SHUFFLE_PT (, TYPE)                                               \
+  SUB_GROUP_SHUFFLE_PT (intel_, TYPE)
 SUB_GROUP_SHUFFLE_T (char)
 SUB_GROUP_SHUFFLE_T (uchar)
 SUB_GROUP_SHUFFLE_T (short)
@@ -126,18 +135,24 @@ SUB_GROUP_SHUFFLE_T (ulong)
 SUB_GROUP_SHUFFLE_T (float)
 SUB_GROUP_SHUFFLE_T (double)
-#define SUB_GROUP_SHUFFLE_XOR_T(TYPE)                                         \
+#define SUB_GROUP_SHUFFLE_XOR_PT(PREFIX, TYPE)                                \
  __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE                       \
-  sub_group_shuffle_xor (TYPE val, uint mask)                                 \
+  PREFIX##sub_group_shuffle_xor (TYPE val, uint mask)                         \
  {                                                                           \
    volatile TYPE *temp_storage                                               \
        = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE));            \
-    temp_storage[get_sub_group_local_id ()] = val;                            \
+    temp_storage[get_local_linear_id ()] = val;                               \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
-    return temp_storage[(get_sub_group_local_id () ^ mask)                    \
+    return temp_storage[get_first_llid ()                                     \
-                        % get_sub_group_size ()];                             \
+                        + (get_sub_group_local_id () ^ mask)                  \
+                              % get_sub_group_size ()];                       \
  }
+/* Define both the non-prefixed (khr) and Intel-prefixed shuffles. */
+#define SUB_GROUP_SHUFFLE_XOR_T(TYPE)                                         \
+  SUB_GROUP_SHUFFLE_XOR_PT (, TYPE)                                           \
+  SUB_GROUP_SHUFFLE_XOR_PT (intel_, TYPE)
 SUB_GROUP_SHUFFLE_XOR_T (char)
 SUB_GROUP_SHUFFLE_XOR_T (uchar)
 SUB_GROUP_SHUFFLE_XOR_T (short)
@@ -169,18 +184,19 @@ SUB_GROUP_BROADCAST_T (double)
  {                                                                           \
    volatile TYPE *temp_storage                                               \
        = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE));            \
-    temp_storage[get_sub_group_local_id ()] = val;                            \
+    temp_storage[get_local_linear_id ()] = val;                               \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
    if (get_sub_group_local_id () == 0)                                       \
      {                                                                       \
        for (uint i = 1; i < get_sub_group_size (); ++i)                      \
          {                                                                   \
-            TYPE a = temp_storage[0], b = temp_storage[i];                    \
+            TYPE a = temp_storage[get_first_llid ()],                         \
-            temp_storage[0] = OPERATION;                                      \
+                 b = temp_storage[get_first_llid () + i];                     \
+            temp_storage[get_first_llid ()] = OPERATION;                      \
          }                                                                   \
      }                                                                       \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
-    return temp_storage[0];                                                   \
+    return temp_storage[get_first_llid ()];                                   \
  }
 #define SUB_GROUP_REDUCE_T(OPNAME, OPERATION)                                 \
@@ -201,18 +217,19 @@ SUB_GROUP_REDUCE_T (max, a > b ? a : b)
  {                                                                           \
    volatile TYPE *data                                                       \
        = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE));            \
-    data[get_sub_group_local_id ()] = val;                                    \
+    data[get_local_linear_id ()] = val;                                       \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
    if (get_sub_group_local_id () == 0)                                       \
      {                                                                       \
        for (uint i = 1; i < get_sub_group_size (); ++i)                      \
          {                                                                   \
-            TYPE a = data[i - 1], b = data[i];                                \
+            TYPE a = data[get_first_llid () + i - 1],                         \
-            data[i] = OPERATION;                                              \
+                 b = data[get_first_llid () + i];                             \
+            data[get_first_llid () + i] = OPERATION;                          \
          }                                                                   \
      }                                                                       \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
-    return data[get_sub_group_local_id ()];                                   \
+    return data[get_local_linear_id ()];                                      \
  }
 #define SUB_GROUP_SCAN_INCLUSIVE_T(OPNAME, OPERATION)                         \
@@ -233,19 +250,20 @@ SUB_GROUP_SCAN_INCLUSIVE_T (max, a > b ? a : b)
  {                                                                           \
    volatile TYPE *data                                                       \
        = __pocl_work_group_alloca (sizeof (TYPE), sizeof (TYPE));            \
-    data[get_sub_group_local_id () + 1] = val;                                \
+    data[get_local_linear_id () + 1] = val;                                   \
-    data[0] = ID;                                                             \
+    data[get_first_llid ()] = ID;                                             \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
    if (get_sub_group_local_id () == 0)                                       \
      {                                                                       \
        for (uint i = 1; i < get_sub_group_size (); ++i)                      \
          {                                                                   \
-            TYPE a = data[i - 1], b = data[i];                                \
+            TYPE a = data[get_first_llid () + i - 1],                         \
-            data[i] = OPERATION;                                              \
+                 b = data[get_first_llid () + i];                             \
+            data[get_first_llid () + i] = OPERATION;                          \
          }                                                                   \
      }                                                                       \
    sub_group_barrier (CLK_LOCAL_MEM_FENCE);                                  \
-    return data[get_sub_group_local_id ()];                                   \
+    return data[get_local_linear_id ()];                                      \
  }
 SUB_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, int, 0)
@@ -272,19 +290,23 @@ SUB_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, double, -INFINITY)
 __attribute__ ((always_inline)) uint4 _CL_OVERLOADABLE
 sub_group_ballot (int predicate)
 {
-  uint *flags = __pocl_local_mem_alloca (sizeof (uint) * 4, sizeof (uint) * 4);
+  /* TODO: We actually would need only one per SG. */
+  uint *flags
+      = __pocl_work_group_alloca (sizeof (uint) * 4, sizeof (uint) * 4);
  char *res = __pocl_work_group_alloca (sizeof (char), 4);
-  if (get_sub_group_local_id () < 128)
+  if (get_local_linear_id () < 128)
-    res[get_sub_group_local_id ()] = !!predicate;
+    res[get_local_linear_id ()] = !!predicate;
  sub_group_barrier (CLK_LOCAL_MEM_FENCE);
  if (get_sub_group_local_id () == 0)
    {
-      flags[0] = flags[1] = flags[2] = flags[3] = ~0;
+      flags[get_first_llid ()] = flags[get_first_llid () + 1]
+          = flags[get_first_llid () + 2] = flags[get_first_llid () + 3] = ~0;
      for (uint i = 0; i < get_sub_group_size () && i < 128; ++i)
        {
-          flags[i / 32] |= res[i] << (i % 32);
+          flags[get_first_llid () * 4 + i / 32]
+              |= res[get_first_llid () * 4 + i] << (i % 32);
        }
    }
  sub_group_barrier (CLK_LOCAL_MEM_FENCE);
-  return *(uint4 *)flags;
+  return ((uint4 *)flags)[get_first_llid () * 4];
 }
--- a/lib/llvmopencl/Workgroup.cc
+++ b/lib/llvmopencl/Workgroup.cc
@@ -2,7 +2,8 @@
 // and parallelized kernel for an OpenCL workgroup.
 //
 // Copyright (c) 2011 Universidad Rey Juan Carlos
-//               2012-2022 Pekka Jääskeläinen
+//               2012-2022 Pekka Jääskeläinen / Parform Oy
+//               2023 Pekka Jääskeläinen / Intel Finland Oy
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -73,9 +74,6 @@ using namespace std;
 using namespace llvm;
 using namespace pocl;
 enum PoclContextStructFields {
  PC_NUM_GROUPS,
  PC_GLOBAL_OFFSET,
@@ -130,8 +128,6 @@ Workgroup::runOnModule(Module &M) {
  SizeTWidth = address_bits;
  SizeT = IntegerType::get(*C, SizeTWidth);
-  // LLVM 8.0 dropped the TypeBuilder API. This is a cleaner version
-  // anyways as it builds the context type using the SizeT directly.
  llvm::Type *Int32T = Type::getInt32Ty(*C);
  llvm::Type *Int8T = Type::getInt8Ty(*C);
  PoclContextT =
@@ -756,8 +752,8 @@ Workgroup::globalHandlesToContextStructLoads(
  return StructLoads;
 }
-// Converts uses of the given variable handles (external global variables) to
+// Converts uses of the given pseudo variable handles (magic external global
-// use the given function-private values instead.
+// variables) to use the given function-private values instead.
 void
 Workgroup::privatizeGlobals(llvm::Function *F, llvm::IRBuilder<> &Builder,
                            const std::vector<std::string> &&GlobalHandleNames,
@@ -919,6 +915,17 @@ Workgroup::privatizeContext(Function *F)
      Builder, {"_num_groups_x", "_num_groups_y", "_num_groups_z"},
      PC_NUM_GROUPS));
+  // Privatize the subgroup size (for CPUs), if referred.
+  if (M->getGlobalVariable("_pocl_sub_group_size") != nullptr) {
+    Value *SGSize = getRequiredSubgroupSize(*F);
+    if (SGSize == nullptr) {
+      SGSize = Builder.CreateLoad(LocalSizeAllocas[0]->getAllocatedType(),
+                                  LocalSizeAllocas[0]);
+    }
+    assert(SGSize != nullptr);
+    privatizeGlobals(F, Builder, {"_pocl_sub_group_size"}, {SGSize});
+  }
  if (DeviceSidePrintf) {
    // Privatize _printf_buffer
    privatizeGlobals(
@@ -1652,3 +1659,18 @@ Workgroup::hasWorkgroupBarriers(const Function &F)
  }
  return false;
 }
+// The subgroup size is currently defined for the CPU implementations
+// via the intel_reqd_sub_group_size metadata or the local dimension
+// x size (the default).
+//
+// Returns the constant stored as operand 0 of the function's
+// "intel_reqd_sub_group_size" metadata node, or nullptr when F carries no
+// such metadata. In the nullptr case privatizeContext() substitutes a load
+// of the local x size.
+llvm::Value *Workgroup::getRequiredSubgroupSize(llvm::Function &F) {
+  if (MDNode *SGSizeMD = F.getMetadata("intel_reqd_sub_group_size")) {
+    // Use the constant from the metadata. Operand 0 is expected to be a
+    // ConstantInt wrapped in ConstantAsMetadata.
+    ConstantAsMetadata *ConstMD =
+        cast<ConstantAsMetadata>(SGSizeMD->getOperand(0));
+    ConstantInt *Const = cast<ConstantInt>(ConstMD->getValue());
+    return Const;
+  }
+  return nullptr;
+}
--- a/lib/llvmopencl/Workgroup.h
+++ b/lib/llvmopencl/Workgroup.h
@@ -2,12 +2,13 @@
 //
 // Copyright (c) 2011 Universidad Rey Juan Carlos
 //               2011-2018 Pekka Jääskeläinen
+//               2023 Pekka Jääskeläinen / Intel Finland Oy
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
+// of this software and associated documentation files (the "Software"), to
-// in the Software without restriction, including without limitation the rights
+// deal in the Software without restriction, including without limitation the
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// copies of the Software, and to permit persons to whom the Software is
+// sell copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
@@ -17,9 +18,9 @@
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// THE SOFTWARE.
+// IN THE SOFTWARE.
 #ifndef _POCL_WORKGROUP_H
 #define _POCL_WORKGROUP_H
@@ -91,6 +92,8 @@ namespace pocl {
                                      LLVMContextRef Ctx, LLVMValueRef F,
                                      unsigned ParamIndex);
+    llvm::Value *getRequiredSubgroupSize(llvm::Function &F);
    llvm::Module *M;
    llvm::LLVMContext *C;

--- a/tools/scripts/format-last-commit.sh
+++ b/tools/scripts/format-last-commit.sh
@@ -20,33 +20,9 @@ SCRIPTPATH=$( realpath "$0"  )
 RELPATH=$(dirname "$SCRIPTPATH")
 $RELPATH/clang-format-diff.py -regex '(.*(\.hpp$|\.cc$|\.cpp$))|(lib/llvmopencl/.*)' -i -p1 -style LLVM <$PATCHY
-$RELPATH/clang-format-diff.py -regex '.*(\.h$|\.c$|\.cl$)' -i -p1 -style GNU <$PATCHY
+$RELPATH/clang-format-diff.py -regex '(.*(\.hh$|\.cc$))|(lib/llvmopencl/.*)|(lib/CL/devices/tce/.*)' -i -p1 -style LLVM <$PATCHY
 if [ -z "$(git diff)" ]; then
  echo "No changes."
  exit 0
 fi
-git diff
-echo "ACCEPT CHANGES ?"
-read REPLY
-if [ "$REPLY" == "y" ]; then
-  git add -u
-  git commit --amend
-  if [ -d .git/rebase-merge ]; then
-    git rebase --continue
-  fi
-else
-  git add -p
-fi