Fix barrier and mem_fence scopes (#610)
Fixes #604
* barrier, work_group_barrier and mem_fence now default to workgroup
memory scope
* support OpenCL 2.0 barriers that take an explicit memory scope argument
* support sub_group_barrier
* use AcquireRelease semantics instead of SequentiallyConsistent
diff --git a/lib/ReplaceOpenCLBuiltinPass.cpp b/lib/ReplaceOpenCLBuiltinPass.cpp
index ddc1d40..280030f 100644
--- a/lib/ReplaceOpenCLBuiltinPass.cpp
+++ b/lib/ReplaceOpenCLBuiltinPass.cpp
@@ -121,7 +121,7 @@
bool replaceFmod(Function &F);
bool replaceExp10(Function &F, const std::string &basename);
bool replaceLog10(Function &F, const std::string &basename);
- bool replaceBarrier(Function &F);
+ bool replaceBarrier(Function &F, bool subgroup = false);
bool replaceMemFence(Function &F, uint32_t semantics);
bool replacePrefetch(Function &F);
bool replaceRelational(Function &F, CmpInst::Predicate P, int32_t C);
@@ -232,8 +232,11 @@
case Builtins::kWorkGroupBarrier:
return replaceBarrier(F);
+ case Builtins::kSubGroupBarrier:
+ return replaceBarrier(F, true);
+
case Builtins::kMemFence:
- return replaceMemFence(F, spv::MemorySemanticsSequentiallyConsistentMask);
+ return replaceMemFence(F, spv::MemorySemanticsAcquireReleaseMask);
case Builtins::kReadMemFence:
return replaceMemFence(F, spv::MemorySemanticsAcquireMask);
case Builtins::kWriteMemFence:
@@ -565,11 +568,11 @@
});
}
-bool ReplaceOpenCLBuiltinPass::replaceBarrier(Function &F) {
+bool ReplaceOpenCLBuiltinPass::replaceBarrier(Function &F, bool subgroup) {
enum { CLK_LOCAL_MEM_FENCE = 0x01, CLK_GLOBAL_MEM_FENCE = 0x02 };
- return replaceCallsWithValue(F, [](CallInst *CI) {
+ return replaceCallsWithValue(F, [subgroup](CallInst *CI) {
auto Arg = CI->getOperand(0);
// We need to map the OpenCL constants to the SPIR-V equivalents.
@@ -577,12 +580,14 @@
ConstantInt::get(Arg->getType(), CLK_LOCAL_MEM_FENCE);
const auto GlobalMemFence =
ConstantInt::get(Arg->getType(), CLK_GLOBAL_MEM_FENCE);
- const auto ConstantSequentiallyConsistent = ConstantInt::get(
- Arg->getType(), spv::MemorySemanticsSequentiallyConsistentMask);
+ const auto ConstantAcquireRelease = ConstantInt::get(
+ Arg->getType(), spv::MemorySemanticsAcquireReleaseMask);
const auto ConstantScopeDevice =
ConstantInt::get(Arg->getType(), spv::ScopeDevice);
const auto ConstantScopeWorkgroup =
ConstantInt::get(Arg->getType(), spv::ScopeWorkgroup);
+ const auto ConstantScopeSubgroup =
+ ConstantInt::get(Arg->getType(), spv::ScopeSubgroup);
// Map CLK_LOCAL_MEM_FENCE to MemorySemanticsWorkgroupMemoryMask.
const auto LocalMemFenceMask =
@@ -603,23 +608,44 @@
ConstantInt::get(Arg->getType(), UniformShiftAmount), "", CI);
// And combine the above together, also adding in
- // MemorySemanticsSequentiallyConsistentMask.
+ // MemorySemanticsAcquireReleaseMask.
auto MemorySemantics =
BinaryOperator::Create(Instruction::Or, MemorySemanticsWorkgroup,
- ConstantSequentiallyConsistent, "", CI);
+ ConstantAcquireRelease, "", CI);
MemorySemantics = BinaryOperator::Create(Instruction::Or, MemorySemantics,
MemorySemanticsUniform, "", CI);
- // For Memory Scope if we used CLK_GLOBAL_MEM_FENCE, we need to use
- // Device Scope, otherwise Workgroup Scope.
- const auto Cmp =
- CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, GlobalMemFenceMask,
- GlobalMemFence, "", CI);
- const auto MemoryScope = SelectInst::Create(Cmp, ConstantScopeDevice,
- ConstantScopeWorkgroup, "", CI);
+ // If the memory scope is not specified explicitly, it is either Subgroup
+ // or Workgroup depending on the type of barrier.
+ Value *MemoryScope =
+ subgroup ? ConstantScopeSubgroup : ConstantScopeWorkgroup;
+ if (CI->data_operands_size() > 1) {
+ enum {
+ CL_MEMORY_SCOPE_WORKGROUP = 0x1,
+ CL_MEMORY_SCOPE_DEVICE = 0x2,
+ CL_MEMORY_SCOPE_SUBGROUP = 0x4
+ };
+ // The call was given an explicit memory scope.
+ const auto MemoryScopeSubgroup =
+ ConstantInt::get(Arg->getType(), CL_MEMORY_SCOPE_SUBGROUP);
+ const auto MemoryScopeDevice =
+ ConstantInt::get(Arg->getType(), CL_MEMORY_SCOPE_DEVICE);
- // Lastly, the Execution Scope is always Workgroup Scope.
- const auto ExecutionScope = ConstantScopeWorkgroup;
+ auto Cmp =
+ CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+ MemoryScopeSubgroup, CI->getOperand(1), "", CI);
+ MemoryScope = SelectInst::Create(Cmp, ConstantScopeSubgroup,
+ ConstantScopeWorkgroup, "", CI);
+ Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+ MemoryScopeDevice, CI->getOperand(1), "", CI);
+ MemoryScope =
+ SelectInst::Create(Cmp, ConstantScopeDevice, MemoryScope, "", CI);
+ }
+
+ // Lastly, the Execution Scope is either Workgroup or Subgroup depending on
+ // the type of barrier;
+ const auto ExecutionScope =
+ subgroup ? ConstantScopeSubgroup : ConstantScopeWorkgroup;
return clspv::InsertSPIRVOp(CI, spv::OpControlBarrier,
{Attribute::NoDuplicate}, CI->getType(),
@@ -642,8 +668,8 @@
ConstantInt::get(Arg->getType(), CLK_GLOBAL_MEM_FENCE);
const auto ConstantMemorySemantics =
ConstantInt::get(Arg->getType(), semantics);
- const auto ConstantScopeDevice =
- ConstantInt::get(Arg->getType(), spv::ScopeDevice);
+ const auto ConstantScopeWorkgroup =
+ ConstantInt::get(Arg->getType(), spv::ScopeWorkgroup);
// Map CLK_LOCAL_MEM_FENCE to MemorySemanticsWorkgroupMemoryMask.
const auto LocalMemFenceMask =
@@ -671,8 +697,8 @@
MemorySemantics = BinaryOperator::Create(Instruction::Or, MemorySemantics,
MemorySemanticsUniform, "", CI);
- // Memory Scope is always device.
- const auto MemoryScope = ConstantScopeDevice;
+ // Memory Scope is always workgroup.
+ const auto MemoryScope = ConstantScopeWorkgroup;
return clspv::InsertSPIRVOp(CI, spv::OpMemoryBarrier, {}, CI->getType(),
{MemoryScope, MemorySemantics});