HACK: Support vstore_half with aligned i32* and no SPV_KHR_16bit_storage

The base pointer argument is assumed to be aligned to 32 bits.

We use PackHalf2x16 to generate a 32-bit integer, then use an atomic xor
to atomically write only the relevant 16 bits (the lower or upper half,
selected by the parity of the index) of the target 32-bit word.
We use Relaxed consistency.

Includes a test for vstore_half to OpenCL global memory (UniformMemory).

Adds compiler option -f16bit_storage for the old behaviour, which
assumes the SPV_KHR_16bit_storage extension is available.

TODO: test vstore_half on Workgroup memory (OpenCL local memory).
diff --git a/lib/ReplaceOpenCLBuiltinPass.cpp b/lib/ReplaceOpenCLBuiltinPass.cpp
index 88b3589..5a27e81 100644
--- a/lib/ReplaceOpenCLBuiltinPass.cpp
+++ b/lib/ReplaceOpenCLBuiltinPass.cpp
@@ -16,6 +16,7 @@
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Pass.h>
+#include <llvm/Support/CommandLine.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 
@@ -25,6 +26,10 @@
 
 #define DEBUG_TYPE "ReplaceOpenCLBuiltin"
 
+static llvm::cl::opt<bool> f16bit_storage(
+    "f16bit_storage", llvm::cl::init(false),
+    llvm::cl::desc("Assume the target supports SPV_KHR_16bit_storage"));
+
 namespace {
 uint32_t clz(uint32_t v) {
   uint32_t r;
@@ -1129,12 +1134,10 @@
           // The pointer argument from vstore_half.
           auto Arg2 = CI->getOperand(2);
 
-          auto ShortTy = Type::getInt16Ty(M.getContext());
           auto IntTy = Type::getInt32Ty(M.getContext());
           auto Float2Ty = VectorType::get(Type::getFloatTy(M.getContext()), 2);
-          auto NewPointerTy = PointerType::get(
-              ShortTy, Arg2->getType()->getPointerAddressSpace());
           auto NewFType = FunctionType::get(IntTy, Float2Ty, false);
+          auto One = ConstantInt::get(IntTy, 1);
 
           // Our intrinsic to pack a float2 to an int.
           auto SPIRVIntrinsic = "spirv.pack.v2f16";
@@ -1142,24 +1145,100 @@
           auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
 
           // Insert our value into a float2 so that we can pack it.
-          auto TempVec = InsertElementInst::Create(UndefValue::get(Float2Ty), Arg0, ConstantInt::get(IntTy, 0), "", CI);
+          auto TempVec =
+              InsertElementInst::Create(UndefValue::get(Float2Ty), Arg0,
+                                        ConstantInt::get(IntTy, 0), "", CI);
 
           // Pack the float2 -> half2 (in an int).
           auto X = CallInst::Create(NewF, TempVec, "", CI);
 
-          // Truncate our i32 to an i16.
-          auto Trunc = CastInst::CreateTruncOrBitCast(X, ShortTy, "", CI);
+          if (f16bit_storage) {
+            auto ShortTy = Type::getInt16Ty(M.getContext());
+            auto ShortPointerTy = PointerType::get(
+                ShortTy, Arg2->getType()->getPointerAddressSpace());
 
-          // Cast the half* pointer to short*.
-          auto Cast = CastInst::CreatePointerCast(Arg2, NewPointerTy, "", CI);
+            // Truncate our i32 to an i16.
+            auto Trunc = CastInst::CreateTruncOrBitCast(X, ShortTy, "", CI);
 
-          // Index into the correct address of the casted pointer.
-          auto Index = GetElementPtrInst::Create(ShortTy, Cast, Arg1, "", CI);
+            // Cast the half* pointer to short*.
+            auto Cast = CastInst::CreatePointerCast(Arg2, ShortPointerTy, "", CI);
 
-          // Store to the int* we casted to.
-          auto Store = new StoreInst(Trunc, Index, CI);
+            // Index into the correct address of the casted pointer.
+            auto Index = GetElementPtrInst::Create(ShortTy, Cast, Arg1, "", CI);
 
-          CI->replaceAllUsesWith(Store);
+            // Store to the short* we cast to.
+            auto Store = new StoreInst(Trunc, Index, CI);
+
+            CI->replaceAllUsesWith(Store);
+          } else {
+            // We can only write to 32-bit aligned words.
+            //
+            // Assuming base is aligned to 32-bits, replace the equivalent of
+            //   vstore_half(value, index, base)
+            // with:
+            //   uint32_t* target_ptr = (uint32_t*)(base) + index / 2;
+            //   uint32_t write_to_upper_half = index & 1u;
+            //   uint32_t shift = write_to_upper_half << 4;
+            //
+            //   // Pack the float value as a half number in bottom 16 bits
+            //   // of an i32.
+            //   uint32_t packed = spirv.pack.v2f16((float2)(value, undef));
+            //
+            //   uint32_t xor_value =   (*target_ptr & (0xffff << shift))
+            //                        ^ ((packed & 0xffff) << shift)
+            //   // We only need relaxed consistency, but OpenCL 1.2 only has
+            //   // sequentially consistent atomics.
+            //   // TODO(dneto): Use relaxed consistency.
+            //   atomic_xor(target_ptr, xor_value)
+            auto IntPointerTy = PointerType::get(
+                IntTy, Arg2->getType()->getPointerAddressSpace());
+
+            auto Four = ConstantInt::get(IntTy, 4);
+            auto FFFF = ConstantInt::get(IntTy, 0xffff);
+
+            auto IndexIsOdd = BinaryOperator::CreateAnd(Arg1, One, "index_is_odd_i32", CI);
+            // Compute index / 2
+            auto IndexIntoI32 = BinaryOperator::CreateLShr(Arg1, One, "index_into_i32", CI);
+            auto BaseI32Ptr = CastInst::CreatePointerCast(Arg2, IntPointerTy, "base_i32_ptr", CI);
+            auto OutPtr = GetElementPtrInst::Create(IntTy, BaseI32Ptr, IndexIntoI32, "base_i32_ptr", CI);
+            auto CurrentValue = new LoadInst(OutPtr, "current_value", CI);
+            auto Shift = BinaryOperator::CreateShl(IndexIsOdd, Four, "shift", CI);
+            auto MaskBitsToWrite = BinaryOperator::CreateShl(FFFF, Shift, "mask_bits_to_write", CI);
+            auto MaskedCurrent = BinaryOperator::CreateAnd(MaskBitsToWrite, CurrentValue, "masked_current", CI);
+
+            auto XLowerBits = BinaryOperator::CreateAnd(X, FFFF, "lower_bits_of_packed", CI);
+            auto NewBitsToWrite = BinaryOperator::CreateShl(XLowerBits, Shift, "new_bits_to_write", CI);
+            auto ValueToXor = BinaryOperator::CreateXor(MaskedCurrent, NewBitsToWrite, "value_to_xor", CI);
+
+            // Generate the call to spirv.atomic_xor.
+            SmallVector<Type *, 5> ParamTypes;
+            // The pointer type.
+            ParamTypes.push_back(IntPointerTy);
+            // The Types for memory scope, semantics, and value.
+            ParamTypes.push_back(IntTy);
+            ParamTypes.push_back(IntTy);
+            ParamTypes.push_back(IntTy);
+            auto NewFType = FunctionType::get(IntTy, ParamTypes, false);
+            auto NewF = M.getOrInsertFunction("spirv.atomic_xor", NewFType);
+
+            const auto ConstantScopeDevice =
+                ConstantInt::get(IntTy, spv::ScopeDevice);
+            // Assume the pointee is in OpenCL global (SPIR-V Uniform) or local
+            // (SPIR-V Workgroup).
+            const auto AddrSpaceSemanticsBits =
+                IntPointerTy->getPointerAddressSpace() == 1
+                    ? spv::MemorySemanticsUniformMemoryMask
+                    : spv::MemorySemanticsWorkgroupMemoryMask;
+
+            // We're using relaxed consistency here.
+            const auto ConstantMemorySemantics =
+                ConstantInt::get(IntTy, spv::MemorySemanticsUniformMemoryMask |
+                                            AddrSpaceSemanticsBits);
+
+            SmallVector<Value *, 5> Params{OutPtr, ConstantScopeDevice,
+                                           ConstantMemorySemantics, ValueToXor};
+            CallInst::Create(NewF, Params, "store_halfword_xor_trick", CI);
+          }
 
           // Lastly, remember to remove the user.
           ToRemoves.push_back(CI);