half: add support for all v(store|load)[a]_halfn fcts (#844)

* half: add support for all v(store|load)[a]_halfn fcts

This commit make OpenCL-CTS 'test_half roundTrip' test pass.

This commit contains 2 fixes in PointBitcastPass.
Both of them have a dedicated test added.

* fix clspv-opt arg in run command of new tests
diff --git a/lib/Compiler.cpp b/lib/Compiler.cpp
index ff3a607..1b6cbe7 100644
--- a/lib/Compiler.cpp
+++ b/lib/Compiler.cpp
@@ -662,6 +662,8 @@
     pm.addPass(clspv::UndoByvalPass());
     pm.addPass(clspv::UndoSRetPass());
     pm.addPass(clspv::ClusterPodKernelArgumentsPass());
+    // ReplaceOpenCLBuiltinPass can generate vec8 and vec16 elements. It needs
+    // to be before the potential LongVectorLoweringPass pass.
     pm.addPass(clspv::ReplaceOpenCLBuiltinPass());
     pm.addPass(clspv::ThreeElementVectorLoweringPass());
 
diff --git a/lib/ReplaceOpenCLBuiltinPass.cpp b/lib/ReplaceOpenCLBuiltinPass.cpp
index e622e20..159b1cd 100644
--- a/lib/ReplaceOpenCLBuiltinPass.cpp
+++ b/lib/ReplaceOpenCLBuiltinPass.cpp
@@ -504,15 +504,19 @@
     return replaceVload(F);
 
   case Builtins::kVloadaHalf:
+    return replaceVloadHalf(F, FI.getName(), FI.getParameter(0).vector_size,
+                            true);
   case Builtins::kVloadHalf:
-    return replaceVloadHalf(F, FI.getName(), FI.getParameter(0).vector_size);
+    return replaceVloadHalf(F, FI.getName(), FI.getParameter(0).vector_size,
+                            false);
 
   case Builtins::kVstore:
     return replaceVstore(F);
 
-  case Builtins::kVstoreHalf:
   case Builtins::kVstoreaHalf:
-    return replaceVstoreHalf(F, FI.getParameter(0).vector_size);
+    return replaceVstoreHalf(F, FI.getParameter(0).vector_size, true);
+  case Builtins::kVstoreHalf:
+    return replaceVstoreHalf(F, FI.getParameter(0).vector_size, false);
 
   case Builtins::kSmoothstep: {
     int vec_size = FI.getLastParameter().vector_size;
@@ -1973,27 +1977,41 @@
 
 bool ReplaceOpenCLBuiltinPass::replaceVloadHalf(Function &F,
                                                 const std::string &name,
-                                                int vec_size) {
+                                                int vec_size, bool aligned) {
   bool is_clspv_version = !name.compare(0, 8, "__clspv_");
   if (!vec_size) {
-    // deduce vec_size from last character of name (e.g. vload_half4)
-    vec_size = std::atoi(&name.back());
+    // deduce vec_size from last characters of name (e.g. vload_half4)
+    std::string half = "half";
+    vec_size = std::atoi(
+        name.substr(name.find(half) + half.size(), std::string::npos).c_str());
   }
   switch (vec_size) {
   case 2:
     return is_clspv_version ? replaceClspvVloadaHalf2(F) : replaceVloadHalf2(F);
+  case 3:
+    if (!is_clspv_version) {
+      return aligned ? replaceVloadaHalf3(F) : replaceVloadHalf3(F);
+    }
+    break;
   case 4:
     return is_clspv_version ? replaceClspvVloadaHalf4(F) : replaceVloadHalf4(F);
+  case 8:
+    if (!is_clspv_version) {
+      return replaceVloadHalf8(F);
+    }
+    break;
+  case 16:
+    if (!is_clspv_version) {
+      return replaceVloadHalf16(F);
+    }
+    break;
   case 0:
     if (!is_clspv_version) {
       return replaceVloadHalf(F);
     }
-    // Fall-through
-  default:
-    llvm_unreachable("Unsupported vload_half vector size");
     break;
   }
-  return false;
+  llvm_unreachable("Unsupported vload_half vector size");
 }
 
 bool ReplaceOpenCLBuiltinPass::replaceVloadHalf(Function &F) {
@@ -2137,6 +2155,131 @@
   });
 }
 
+bool ReplaceOpenCLBuiltinPass::replaceVloadHalf3(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The index argument from vload_half.
+    auto Arg0 = CI->getOperand(0);
+
+    // The pointer argument from vload_half.
+    auto Arg1 = CI->getOperand(1);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto ShortTy = Type::getInt16Ty(M.getContext());
+    auto FloatTy = Type::getFloatTy(M.getContext());
+    auto Float2Ty = FixedVectorType::get(FloatTy, 2);
+    auto Float3Ty = FixedVectorType::get(FloatTy, 3);
+    auto NewPointerTy =
+        PointerType::get(ShortTy, Arg1->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(Float2Ty, IntTy, false);
+
+    auto Int0 = ConstantInt::get(IntTy, 0);
+    auto Int1 = ConstantInt::get(IntTy, 1);
+    auto Int2 = ConstantInt::get(IntTy, 2);
+
+    // Cast the half* pointer to short*.
+    auto Cast = CastInst::CreatePointerCast(Arg1, NewPointerTy, "", CI);
+
+    // Load the first element
+    auto Index0 = BinaryOperator::Create(
+        Instruction::Add,
+        BinaryOperator::Create(Instruction::Shl, Arg0, Int1, "", CI), Arg0, "",
+        CI);
+    auto GEP0 = GetElementPtrInst::Create(ShortTy, Cast, Index0, "", CI);
+    auto Load0 = new LoadInst(ShortTy, GEP0, "", CI);
+
+    // Load the second element
+    auto Index1 =
+        BinaryOperator::Create(Instruction::Add, Index0, Int1, "", CI);
+    auto GEP1 = GetElementPtrInst::Create(ShortTy, Cast, Index1, "", CI);
+    auto Load1 = new LoadInst(ShortTy, GEP1, "", CI);
+
+    // Load the third element
+    auto Index2 =
+        BinaryOperator::Create(Instruction::Add, Index1, Int1, "", CI);
+    auto GEP2 = GetElementPtrInst::Create(ShortTy, Cast, Index2, "", CI);
+    auto Load2 = new LoadInst(ShortTy, GEP2, "", CI);
+
+    // Extend each short to int.
+    auto X0 = CastInst::Create(Instruction::ZExt, Load0, IntTy, "", CI);
+    auto X1 = CastInst::Create(Instruction::ZExt, Load1, IntTy, "", CI);
+    auto X2 = CastInst::Create(Instruction::ZExt, Load2, IntTy, "", CI);
+
+    // Our intrinsic to unpack a float2 from an int.
+    auto SPIRVIntrinsic = clspv::UnpackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    // Convert int to float2 and extract the uniq meaningful float
+    auto Y0 = ExtractElementInst::Create(CallInst::Create(NewF, X0, "", CI),
+                                         Int0, "", CI);
+    auto Y1 = ExtractElementInst::Create(CallInst::Create(NewF, X1, "", CI),
+                                         Int0, "", CI);
+    auto Y2 = ExtractElementInst::Create(CallInst::Create(NewF, X2, "", CI),
+                                         Int0, "", CI);
+
+    // Create the final float3 to be returned
+    auto Combine =
+        InsertElementInst::Create(UndefValue::get(Float3Ty), Y0, Int0, "", CI);
+    Combine = InsertElementInst::Create(Combine, Y1, Int1, "", CI);
+    Combine = InsertElementInst::Create(Combine, Y2, Int2, "", CI);
+
+    return Combine;
+  });
+}
+
+bool ReplaceOpenCLBuiltinPass::replaceVloadaHalf3(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The index argument from vload_half.
+    auto Arg0 = CI->getOperand(0);
+
+    // The pointer argument from vload_half.
+    auto Arg1 = CI->getOperand(1);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto Int2Ty = FixedVectorType::get(IntTy, 2);
+    auto Float2Ty = FixedVectorType::get(Type::getFloatTy(M.getContext()), 2);
+    auto NewPointerTy =
+        PointerType::get(Int2Ty, Arg1->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(Float2Ty, IntTy, false);
+
+    // Cast the half* pointer to int2*.
+    auto Cast = CastInst::CreatePointerCast(Arg1, NewPointerTy, "", CI);
+
+    // Index into the correct address of the casted pointer.
+    auto Index = GetElementPtrInst::Create(Int2Ty, Cast, Arg0, "", CI);
+
+    // Load from the int2* we casted to.
+    auto Load = new LoadInst(Int2Ty, Index, "", CI);
+
+    // Extract each element from the loaded int2.
+    auto X =
+        ExtractElementInst::Create(Load, ConstantInt::get(IntTy, 0), "", CI);
+    auto Y =
+        ExtractElementInst::Create(Load, ConstantInt::get(IntTy, 1), "", CI);
+
+    // Our intrinsic to unpack a float2 from an int.
+    auto SPIRVIntrinsic = clspv::UnpackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    // Get the lower (x & y) components of our final float4.
+    auto Lo = CallInst::Create(NewF, X, "", CI);
+
+    // Get the higher (z & w) components of our final float4.
+    auto Hi = CallInst::Create(NewF, Y, "", CI);
+
+    Constant *ShuffleMask[3] = {ConstantInt::get(IntTy, 0),
+                                ConstantInt::get(IntTy, 1),
+                                ConstantInt::get(IntTy, 2)};
+
+    // Combine our two float2's into one float4.
+    return new ShuffleVectorInst(Lo, Hi, ConstantVector::get(ShuffleMask), "",
+                                 CI);
+  });
+}
+
 bool ReplaceOpenCLBuiltinPass::replaceVloadHalf4(Function &F) {
   Module &M = *F.getParent();
   return replaceCallsWithValue(F, [&](CallInst *CI) {
@@ -2189,13 +2332,182 @@
   });
 }
 
+bool ReplaceOpenCLBuiltinPass::replaceVloadHalf8(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The index argument from vload_half.
+    auto Arg0 = CI->getOperand(0);
+
+    // The pointer argument from vload_half.
+    auto Arg1 = CI->getOperand(1);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto Int4Ty = FixedVectorType::get(IntTy, 4);
+    auto Float2Ty = FixedVectorType::get(Type::getFloatTy(M.getContext()), 2);
+    auto NewPointerTy =
+        PointerType::get(Int4Ty, Arg1->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(Float2Ty, IntTy, false);
+
+    // Cast the half* pointer to int4*.
+    auto Cast = CastInst::CreatePointerCast(Arg1, NewPointerTy, "", CI);
+
+    // Index into the correct address of the casted pointer.
+    auto Index = GetElementPtrInst::Create(Int4Ty, Cast, Arg0, "", CI);
+
+    // Load from the int4* we casted to.
+    auto Load = new LoadInst(Int4Ty, Index, "", CI);
+
+    // Extract each element from the loaded int4.
+    auto X1 =
+        ExtractElementInst::Create(Load, ConstantInt::get(IntTy, 0), "", CI);
+    auto X2 =
+        ExtractElementInst::Create(Load, ConstantInt::get(IntTy, 1), "", CI);
+    auto X3 =
+        ExtractElementInst::Create(Load, ConstantInt::get(IntTy, 2), "", CI);
+    auto X4 =
+        ExtractElementInst::Create(Load, ConstantInt::get(IntTy, 3), "", CI);
+
+    // Our intrinsic to unpack a float2 from an int.
+    auto SPIRVIntrinsic = clspv::UnpackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    // Convert the 4 int into 4 float2
+    auto Y1 = CallInst::Create(NewF, X1, "", CI);
+    auto Y2 = CallInst::Create(NewF, X2, "", CI);
+    auto Y3 = CallInst::Create(NewF, X3, "", CI);
+    auto Y4 = CallInst::Create(NewF, X4, "", CI);
+
+    Constant *ShuffleMask4[4] = {
+        ConstantInt::get(IntTy, 0), ConstantInt::get(IntTy, 1),
+        ConstantInt::get(IntTy, 2), ConstantInt::get(IntTy, 3)};
+
+    // Combine our two float2's into one float4.
+    auto Z1 = new ShuffleVectorInst(Y1, Y2, ConstantVector::get(ShuffleMask4),
+                                    "", CI);
+    auto Z2 = new ShuffleVectorInst(Y3, Y4, ConstantVector::get(ShuffleMask4),
+                                    "", CI);
+
+    Constant *ShuffleMask8[8] = {
+        ConstantInt::get(IntTy, 0), ConstantInt::get(IntTy, 1),
+        ConstantInt::get(IntTy, 2), ConstantInt::get(IntTy, 3),
+        ConstantInt::get(IntTy, 4), ConstantInt::get(IntTy, 5),
+        ConstantInt::get(IntTy, 6), ConstantInt::get(IntTy, 7)};
+
+    // Combine our two float4's into one float8.
+    return new ShuffleVectorInst(Z1, Z2, ConstantVector::get(ShuffleMask8), "",
+                                 CI);
+  });
+}
+
+bool ReplaceOpenCLBuiltinPass::replaceVloadHalf16(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The index argument from vload_half.
+    auto Arg0 = CI->getOperand(0);
+
+    // The pointer argument from vload_half.
+    auto Arg1 = CI->getOperand(1);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto Int4Ty = FixedVectorType::get(IntTy, 4);
+    auto Float2Ty = FixedVectorType::get(Type::getFloatTy(M.getContext()), 2);
+    auto NewPointerTy =
+        PointerType::get(Int4Ty, Arg1->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(Float2Ty, IntTy, false);
+
+    // Cast the half* pointer to int4*.
+    auto Cast = CastInst::CreatePointerCast(Arg1, NewPointerTy, "", CI);
+
+    // Index into the correct address of the casted pointer.
+    auto Arg0x2 = BinaryOperator::Create(Instruction::Shl, Arg0, ConstantInt::get(IntTy, 1), "", CI);
+    auto Index1 = GetElementPtrInst::Create(Int4Ty, Cast, Arg0x2, "", CI);
+    auto Arg0x2p1 = BinaryOperator::Create(Instruction::Add, Arg0x2, ConstantInt::get(IntTy, 1), "", CI);
+    auto Index2 = GetElementPtrInst::Create(Int4Ty, Cast, Arg0x2p1, "", CI);
+
+    // Load from the int4* we casted to.
+    auto Load1 = new LoadInst(Int4Ty, Index1, "", CI);
+    auto Load2 = new LoadInst(Int4Ty, Index2, "", CI);
+
+    // Extract each element from the two loaded int4.
+    auto X1 =
+        ExtractElementInst::Create(Load1, ConstantInt::get(IntTy, 0), "", CI);
+    auto X2 =
+        ExtractElementInst::Create(Load1, ConstantInt::get(IntTy, 1), "", CI);
+    auto X3 =
+        ExtractElementInst::Create(Load1, ConstantInt::get(IntTy, 2), "", CI);
+    auto X4 =
+        ExtractElementInst::Create(Load1, ConstantInt::get(IntTy, 3), "", CI);
+    auto X5 =
+        ExtractElementInst::Create(Load2, ConstantInt::get(IntTy, 0), "", CI);
+    auto X6 =
+        ExtractElementInst::Create(Load2, ConstantInt::get(IntTy, 1), "", CI);
+    auto X7 =
+        ExtractElementInst::Create(Load2, ConstantInt::get(IntTy, 2), "", CI);
+    auto X8 =
+        ExtractElementInst::Create(Load2, ConstantInt::get(IntTy, 3), "", CI);
+
+    // Our intrinsic to unpack a float2 from an int.
+    auto SPIRVIntrinsic = clspv::UnpackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    // Convert the eight int into float2
+    auto Y1 = CallInst::Create(NewF, X1, "", CI);
+    auto Y2 = CallInst::Create(NewF, X2, "", CI);
+    auto Y3 = CallInst::Create(NewF, X3, "", CI);
+    auto Y4 = CallInst::Create(NewF, X4, "", CI);
+    auto Y5 = CallInst::Create(NewF, X5, "", CI);
+    auto Y6 = CallInst::Create(NewF, X6, "", CI);
+    auto Y7 = CallInst::Create(NewF, X7, "", CI);
+    auto Y8 = CallInst::Create(NewF, X8, "", CI);
+
+    Constant *ShuffleMask4[4] = {
+        ConstantInt::get(IntTy, 0), ConstantInt::get(IntTy, 1),
+        ConstantInt::get(IntTy, 2), ConstantInt::get(IntTy, 3)};
+
+    // Combine our two float2's into one float4.
+    auto Z1 = new ShuffleVectorInst(Y1, Y2, ConstantVector::get(ShuffleMask4),
+                                    "", CI);
+    auto Z2 = new ShuffleVectorInst(Y3, Y4, ConstantVector::get(ShuffleMask4),
+                                    "", CI);
+    auto Z3 = new ShuffleVectorInst(Y5, Y6, ConstantVector::get(ShuffleMask4),
+                                    "", CI);
+    auto Z4 = new ShuffleVectorInst(Y7, Y8, ConstantVector::get(ShuffleMask4),
+                                    "", CI);
+
+    Constant *ShuffleMask8[8] = {
+        ConstantInt::get(IntTy, 0), ConstantInt::get(IntTy, 1),
+        ConstantInt::get(IntTy, 2), ConstantInt::get(IntTy, 3),
+        ConstantInt::get(IntTy, 4), ConstantInt::get(IntTy, 5),
+        ConstantInt::get(IntTy, 6), ConstantInt::get(IntTy, 7)};
+
+    // Combine our two float4's into one float8.
+    auto Z5 = new ShuffleVectorInst(Z1, Z2, ConstantVector::get(ShuffleMask8),
+                                    "", CI);
+    auto Z6 = new ShuffleVectorInst(Z3, Z4, ConstantVector::get(ShuffleMask8),
+                                    "", CI);
+    Constant *ShuffleMask16[16] = {
+        ConstantInt::get(IntTy, 0),  ConstantInt::get(IntTy, 1),
+        ConstantInt::get(IntTy, 2),  ConstantInt::get(IntTy, 3),
+        ConstantInt::get(IntTy, 4),  ConstantInt::get(IntTy, 5),
+        ConstantInt::get(IntTy, 6),  ConstantInt::get(IntTy, 7),
+        ConstantInt::get(IntTy, 8),  ConstantInt::get(IntTy, 9),
+        ConstantInt::get(IntTy, 10), ConstantInt::get(IntTy, 11),
+        ConstantInt::get(IntTy, 12), ConstantInt::get(IntTy, 13),
+        ConstantInt::get(IntTy, 14), ConstantInt::get(IntTy, 15)};
+    // Combine our two float8's into one float16.
+    return new ShuffleVectorInst(Z5, Z6, ConstantVector::get(ShuffleMask16), "",
+                                 CI);
+  });
+}
+
 bool ReplaceOpenCLBuiltinPass::replaceClspvVloadaHalf2(Function &F) {
 
   // Replace __clspv_vloada_half2(uint Index, global uint* Ptr) with:
   //
   //    %u = load i32 %ptr
-  //    %fxy = call <2 x float> Unpack2xHalf(u)
-  //    %result = shufflevector %fxy %fzw <4 x i32> <0, 1, 2, 3>
+  //    %result = call <2 x float> Unpack2xHalf(u)
   Module &M = *F.getParent();
   return replaceCallsWithValue(F, [&](CallInst *CI) {
     auto Index = CI->getOperand(0);
@@ -2227,7 +2539,7 @@
   //    %u2zw = extractelement %u2, 1
   //    %fxy = call <2 x float> Unpack2xHalf(uint)
   //    %fzw = call <2 x float> Unpack2xHalf(uint)
-  //    %result = shufflevector %fxy %fzw <4 x i32> <0, 1, 2, 3>
+  //    %result = shufflevector %fxy %fzw <4 x float> <0, 1, 2, 3>
   Module &M = *F.getParent();
   return replaceCallsWithValue(F, [&](CallInst *CI) {
     auto Index = CI->getOperand(0);
@@ -2268,14 +2580,20 @@
   });
 }
 
-bool ReplaceOpenCLBuiltinPass::replaceVstoreHalf(Function &F, int vec_size) {
+bool ReplaceOpenCLBuiltinPass::replaceVstoreHalf(Function &F, int vec_size, bool aligned) {
   switch (vec_size) {
   case 0:
     return replaceVstoreHalf(F);
   case 2:
     return replaceVstoreHalf2(F);
+  case 3:
+    return aligned ? replaceVstoreaHalf3(F) : replaceVstoreHalf3(F);
   case 4:
     return replaceVstoreHalf4(F);
+  case 8:
+    return replaceVstoreHalf8(F);
+  case 16:
+    return replaceVstoreHalf16(F);
   default:
     llvm_unreachable("Unsupported vstore_half vector size");
     break;
@@ -2473,6 +2791,149 @@
   });
 }
 
+bool ReplaceOpenCLBuiltinPass::replaceVstoreHalf3(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The value to store.
+    auto Arg0 = CI->getOperand(0);
+
+    // The index argument from vstore_half.
+    auto Arg1 = CI->getOperand(1);
+
+    // The pointer argument from vstore_half.
+    auto Arg2 = CI->getOperand(2);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto ShortTy = Type::getInt16Ty(M.getContext());
+    auto FloatTy = Type::getFloatTy(M.getContext());
+    auto Float2Ty = FixedVectorType::get(FloatTy, 2);
+    auto NewPointerTy =
+        PointerType::get(ShortTy, Arg2->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(IntTy, Float2Ty, false);
+
+    auto Int0 = ConstantInt::get(IntTy, 0);
+    auto Int1 = ConstantInt::get(IntTy, 1);
+    auto Int2 = ConstantInt::get(IntTy, 2);
+
+    auto X0 = InsertElementInst::Create(
+        UndefValue::get(Float2Ty),
+        ExtractElementInst::Create(Arg0, Int0, "", CI), Int0, "", CI);
+    auto X1 = InsertElementInst::Create(
+        UndefValue::get(Float2Ty),
+        ExtractElementInst::Create(Arg0, Int1, "", CI), Int0, "", CI);
+    auto X2 = InsertElementInst::Create(
+        UndefValue::get(Float2Ty),
+        ExtractElementInst::Create(Arg0, Int2, "", CI), Int0, "", CI);
+
+    // Our intrinsic to pack a float2 to an int.
+    auto SPIRVIntrinsic = clspv::PackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    // Convert float2 into int and trunc to short to keep only the meaningful
+    // part of it
+    auto Y0 =
+        CastInst::Create(Instruction::Trunc, CallInst::Create(NewF, X0, "", CI),
+                         ShortTy, "", CI);
+    auto Y1 =
+        CastInst::Create(Instruction::Trunc, CallInst::Create(NewF, X1, "", CI),
+                         ShortTy, "", CI);
+    auto Y2 =
+        CastInst::Create(Instruction::Trunc, CallInst::Create(NewF, X2, "", CI),
+                         ShortTy, "", CI);
+
+    // Cast the half* pointer to short*.
+    auto Cast = CastInst::CreatePointerCast(Arg2, NewPointerTy, "", CI);
+
+    auto Index0 = BinaryOperator::Create(
+        Instruction::Add,
+        BinaryOperator::Create(Instruction::Shl, Arg1, Int1, "", CI), Arg1, "",
+        CI);
+    auto GEP0 = GetElementPtrInst::Create(ShortTy, Cast, Index0, "", CI);
+    new StoreInst(Y0, GEP0, CI);
+
+    auto Index1 =
+        BinaryOperator::Create(Instruction::Add, Index0, Int1, "", CI);
+    auto GEP1 = GetElementPtrInst::Create(ShortTy, Cast, Index1, "", CI);
+    new StoreInst(Y1, GEP1, CI);
+
+    auto Index2 =
+        BinaryOperator::Create(Instruction::Add, Index1, Int1, "", CI);
+    auto GEP2 = GetElementPtrInst::Create(ShortTy, Cast, Index2, "", CI);
+    return new StoreInst(Y2, GEP2, CI);
+  });
+}
+
+bool ReplaceOpenCLBuiltinPass::replaceVstoreaHalf3(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The value to store.
+    auto Arg0 = CI->getOperand(0);
+
+    // The index argument from vstore_half.
+    auto Arg1 = CI->getOperand(1);
+
+    // The pointer argument from vstore_half.
+    auto Arg2 = CI->getOperand(2);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto ShortTy = Type::getInt16Ty(M.getContext());
+    auto FloatTy = Type::getFloatTy(M.getContext());
+    auto Float2Ty = FixedVectorType::get(FloatTy, 2);
+    auto NewPointerTy =
+        PointerType::get(ShortTy, Arg2->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(IntTy, Float2Ty, false);
+
+    auto Int0 = ConstantInt::get(IntTy, 0);
+    auto Int1 = ConstantInt::get(IntTy, 1);
+    auto Int2 = ConstantInt::get(IntTy, 2);
+
+    auto X0 = InsertElementInst::Create(
+        UndefValue::get(Float2Ty),
+        ExtractElementInst::Create(Arg0, Int0, "", CI), Int0, "", CI);
+    auto X1 = InsertElementInst::Create(
+        UndefValue::get(Float2Ty),
+        ExtractElementInst::Create(Arg0, Int1, "", CI), Int0, "", CI);
+    auto X2 = InsertElementInst::Create(
+        UndefValue::get(Float2Ty),
+        ExtractElementInst::Create(Arg0, Int2, "", CI), Int0, "", CI);
+
+    // Our intrinsic to pack a float2 to an int.
+    auto SPIRVIntrinsic = clspv::PackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    // Convert float2 into int and trunc to short to keep only the meaningful
+    // part of it
+    auto Y0 =
+        CastInst::Create(Instruction::Trunc, CallInst::Create(NewF, X0, "", CI),
+                         ShortTy, "", CI);
+    auto Y1 =
+        CastInst::Create(Instruction::Trunc, CallInst::Create(NewF, X1, "", CI),
+                         ShortTy, "", CI);
+    auto Y2 =
+        CastInst::Create(Instruction::Trunc, CallInst::Create(NewF, X2, "", CI),
+                         ShortTy, "", CI);
+
+    // Cast the half* pointer to short*.
+    auto Cast = CastInst::CreatePointerCast(Arg2, NewPointerTy, "", CI);
+
+    auto Index0 = BinaryOperator::Create(Instruction::Shl, Arg1, Int2, "", CI);
+    auto GEP0 = GetElementPtrInst::Create(ShortTy, Cast, Index0, "", CI);
+    new StoreInst(Y0, GEP0, CI);
+
+    auto Index1 =
+        BinaryOperator::Create(Instruction::Add, Index0, Int1, "", CI);
+    auto GEP1 = GetElementPtrInst::Create(ShortTy, Cast, Index1, "", CI);
+    new StoreInst(Y1, GEP1, CI);
+
+    auto Index2 =
+        BinaryOperator::Create(Instruction::Add, Index1, Int1, "", CI);
+    auto GEP2 = GetElementPtrInst::Create(ShortTy, Cast, Index2, "", CI);
+    return new StoreInst(Y2, GEP2, CI);
+  });
+}
+
 bool ReplaceOpenCLBuiltinPass::replaceVstoreHalf4(Function &F) {
   Module &M = *F.getParent();
   return replaceCallsWithValue(F, [&](CallInst *CI) {
@@ -2533,6 +2994,181 @@
   });
 }
 
+bool ReplaceOpenCLBuiltinPass::replaceVstoreHalf8(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The value to store.
+    auto Arg0 = CI->getOperand(0);
+
+    // The index argument from vstore_half.
+    auto Arg1 = CI->getOperand(1);
+
+    // The pointer argument from vstore_half.
+    auto Arg2 = CI->getOperand(2);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto Int4Ty = FixedVectorType::get(IntTy, 4);
+    auto Float2Ty = FixedVectorType::get(Type::getFloatTy(M.getContext()), 2);
+    auto NewPointerTy =
+        PointerType::get(Int4Ty, Arg2->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(IntTy, Float2Ty, false);
+
+    Constant *ShuffleMask01[2] = {ConstantInt::get(IntTy, 0),
+                                  ConstantInt::get(IntTy, 1)};
+    auto X01 =
+        new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                              ConstantVector::get(ShuffleMask01), "", CI);
+    Constant *ShuffleMask23[2] = {ConstantInt::get(IntTy, 2),
+                                  ConstantInt::get(IntTy, 3)};
+    auto X23 =
+        new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                              ConstantVector::get(ShuffleMask23), "", CI);
+    Constant *ShuffleMask45[2] = {ConstantInt::get(IntTy, 4),
+                                  ConstantInt::get(IntTy, 5)};
+    auto X45 =
+        new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                              ConstantVector::get(ShuffleMask45), "", CI);
+    Constant *ShuffleMask67[2] = {ConstantInt::get(IntTy, 6),
+                                  ConstantInt::get(IntTy, 7)};
+    auto X67 =
+        new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                              ConstantVector::get(ShuffleMask67), "", CI);
+
+    // Our intrinsic to pack a float2 to an int.
+    auto SPIRVIntrinsic = clspv::PackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    auto Y01 = CallInst::Create(NewF, X01, "", CI);
+    auto Y23 = CallInst::Create(NewF, X23, "", CI);
+    auto Y45 = CallInst::Create(NewF, X45, "", CI);
+    auto Y67 = CallInst::Create(NewF, X67, "", CI);
+
+    auto Combine = InsertElementInst::Create(
+        UndefValue::get(Int4Ty), Y01, ConstantInt::get(IntTy, 0), "", CI);
+    Combine = InsertElementInst::Create(Combine, Y23,
+                                        ConstantInt::get(IntTy, 1), "", CI);
+    Combine = InsertElementInst::Create(Combine, Y45,
+                                        ConstantInt::get(IntTy, 2), "", CI);
+    Combine = InsertElementInst::Create(Combine, Y67,
+                                        ConstantInt::get(IntTy, 3), "", CI);
+
+    // Cast the half* pointer to int4*.
+    auto Cast = CastInst::CreatePointerCast(Arg2, NewPointerTy, "", CI);
+
+    // Index into the correct address of the casted pointer.
+    auto Index = GetElementPtrInst::Create(Int4Ty, Cast, Arg1, "", CI);
+
+    // Store to the int4* we casted to.
+    return new StoreInst(Combine, Index, CI);
+  });
+}
+
+bool ReplaceOpenCLBuiltinPass::replaceVstoreHalf16(Function &F) {
+  Module &M = *F.getParent();
+  return replaceCallsWithValue(F, [&](CallInst *CI) {
+    // The value to store.
+    auto Arg0 = CI->getOperand(0);
+
+    // The index argument from vstore_half.
+    auto Arg1 = CI->getOperand(1);
+
+    // The pointer argument from vstore_half.
+    auto Arg2 = CI->getOperand(2);
+
+    auto IntTy = Type::getInt32Ty(M.getContext());
+    auto Int4Ty = FixedVectorType::get(IntTy, 4);
+    auto Float2Ty = FixedVectorType::get(Type::getFloatTy(M.getContext()), 2);
+    auto NewPointerTy =
+        PointerType::get(Int4Ty, Arg2->getType()->getPointerAddressSpace());
+    auto NewFType = FunctionType::get(IntTy, Float2Ty, false);
+
+    Constant *ShuffleMask0[2] = {ConstantInt::get(IntTy, 0),
+                                 ConstantInt::get(IntTy, 1)};
+    auto X0 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask0), "", CI);
+    Constant *ShuffleMask1[2] = {ConstantInt::get(IntTy, 2),
+                                 ConstantInt::get(IntTy, 3)};
+    auto X1 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask1), "", CI);
+    Constant *ShuffleMask2[2] = {ConstantInt::get(IntTy, 4),
+                                 ConstantInt::get(IntTy, 5)};
+    auto X2 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask2), "", CI);
+    Constant *ShuffleMask3[2] = {ConstantInt::get(IntTy, 6),
+                                 ConstantInt::get(IntTy, 7)};
+    auto X3 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask3), "", CI);
+    Constant *ShuffleMask4[2] = {ConstantInt::get(IntTy, 8),
+                                 ConstantInt::get(IntTy, 9)};
+    auto X4 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask4), "", CI);
+    Constant *ShuffleMask5[2] = {ConstantInt::get(IntTy, 10),
+                                 ConstantInt::get(IntTy, 11)};
+    auto X5 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask5), "", CI);
+    Constant *ShuffleMask6[2] = {ConstantInt::get(IntTy, 12),
+                                 ConstantInt::get(IntTy, 13)};
+    auto X6 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask6), "", CI);
+    Constant *ShuffleMask7[2] = {ConstantInt::get(IntTy, 14),
+                                 ConstantInt::get(IntTy, 15)};
+    auto X7 = new ShuffleVectorInst(Arg0, UndefValue::get(Arg0->getType()),
+                                    ConstantVector::get(ShuffleMask7), "", CI);
+
+    // Our intrinsic to pack a float2 to an int.
+    auto SPIRVIntrinsic = clspv::PackFunction();
+
+    auto NewF = M.getOrInsertFunction(SPIRVIntrinsic, NewFType);
+
+    auto Y0 = CallInst::Create(NewF, X0, "", CI);
+    auto Y1 = CallInst::Create(NewF, X1, "", CI);
+    auto Y2 = CallInst::Create(NewF, X2, "", CI);
+    auto Y3 = CallInst::Create(NewF, X3, "", CI);
+    auto Y4 = CallInst::Create(NewF, X4, "", CI);
+    auto Y5 = CallInst::Create(NewF, X5, "", CI);
+    auto Y6 = CallInst::Create(NewF, X6, "", CI);
+    auto Y7 = CallInst::Create(NewF, X7, "", CI);
+
+    auto Combine1 = InsertElementInst::Create(
+        UndefValue::get(Int4Ty), Y0, ConstantInt::get(IntTy, 0), "", CI);
+    Combine1 = InsertElementInst::Create(Combine1, Y1,
+                                         ConstantInt::get(IntTy, 1), "", CI);
+    Combine1 = InsertElementInst::Create(Combine1, Y2,
+                                         ConstantInt::get(IntTy, 2), "", CI);
+    Combine1 = InsertElementInst::Create(Combine1, Y3,
+                                         ConstantInt::get(IntTy, 3), "", CI);
+
+    auto Combine2 = InsertElementInst::Create(
+        UndefValue::get(Int4Ty), Y4, ConstantInt::get(IntTy, 0), "", CI);
+    Combine2 = InsertElementInst::Create(Combine2, Y5,
+                                         ConstantInt::get(IntTy, 1), "", CI);
+    Combine2 = InsertElementInst::Create(Combine2, Y6,
+                                         ConstantInt::get(IntTy, 2), "", CI);
+    Combine2 = InsertElementInst::Create(Combine2, Y7,
+                                         ConstantInt::get(IntTy, 3), "", CI);
+
+    // Cast the half* pointer to int4*.
+    auto Cast = CastInst::CreatePointerCast(Arg2, NewPointerTy, "", CI);
+
+    // Index into the correct address of the casted pointer.
+    auto Arg1x2 = BinaryOperator::Create(Instruction::Shl, Arg1,
+                                         ConstantInt::get(IntTy, 1), "", CI);
+    auto Index1 = GetElementPtrInst::Create(Int4Ty, Cast, Arg1x2, "", CI);
+
+    // Store to the int4* we casted to.
+    new StoreInst(Combine1, Index1, CI);
+
+    // Index into the correct address of the casted pointer.
+    auto Arg1Plus1 = BinaryOperator::Create(Instruction::Add, Arg1x2,
+                                            ConstantInt::get(IntTy, 1), "", CI);
+    auto Index2 = GetElementPtrInst::Create(Int4Ty, Cast, Arg1Plus1, "", CI);
+
+    // Store to the int4* we casted to.
+    return new StoreInst(Combine2, Index2, CI);
+  });
+}
+
 bool ReplaceOpenCLBuiltinPass::replaceHalfReadImage(Function &F) {
   // convert half to float
   Module &M = *F.getParent();
diff --git a/lib/ReplaceOpenCLBuiltinPass.h b/lib/ReplaceOpenCLBuiltinPass.h
index e184207..3baf203 100644
--- a/lib/ReplaceOpenCLBuiltinPass.h
+++ b/lib/ReplaceOpenCLBuiltinPass.h
@@ -59,16 +59,24 @@
   bool replaceSignbit(llvm::Function &F, bool is_vec);
   bool replaceMul(llvm::Function &F, bool is_float, bool is_mad);
   bool replaceVloadHalf(llvm::Function &F, const std::string &name,
-                        int vec_size);
+                        int vec_size, bool aligned);
   bool replaceVloadHalf(llvm::Function &F);
   bool replaceVloadHalf2(llvm::Function &F);
+  bool replaceVloadHalf3(llvm::Function &F);
+  bool replaceVloadaHalf3(llvm::Function &F);
   bool replaceVloadHalf4(llvm::Function &F);
+  bool replaceVloadHalf8(llvm::Function &F);
+  bool replaceVloadHalf16(llvm::Function &F);
   bool replaceClspvVloadaHalf2(llvm::Function &F);
   bool replaceClspvVloadaHalf4(llvm::Function &F);
-  bool replaceVstoreHalf(llvm::Function &F, int vec_size);
+  bool replaceVstoreHalf(llvm::Function &F, int vec_size, bool aligned);
   bool replaceVstoreHalf(llvm::Function &F);
   bool replaceVstoreHalf2(llvm::Function &F);
+  bool replaceVstoreHalf3(llvm::Function &F);
+  bool replaceVstoreaHalf3(llvm::Function &F);
   bool replaceVstoreHalf4(llvm::Function &F);
+  bool replaceVstoreHalf8(llvm::Function &F);
+  bool replaceVstoreHalf16(llvm::Function &F);
   bool replaceHalfReadImage(llvm::Function &F);
   bool replaceHalfWriteImage(llvm::Function &F);
   bool replaceSampledReadImageWithIntCoords(llvm::Function &F);
diff --git a/lib/ReplacePointerBitcastPass.cpp b/lib/ReplacePointerBitcastPass.cpp
index 59963bc..cb438d1 100644
--- a/lib/ReplacePointerBitcastPass.cpp
+++ b/lib/ReplacePointerBitcastPass.cpp
@@ -504,6 +504,10 @@
               // Generate stores.
               Value *SrcAddrIdx = NewAddrIdx;
               Value *BaseAddr = BitCastSrc;
+              if (VIdx != 0) {
+                SrcAddrIdx =
+                    Builder.CreateAdd(SrcAddrIdx, Builder.getInt32(4 * VIdx));
+              }
               for (unsigned i = 0; i < NumElement; i++) {
                 // Calculate store address.
                 Value *DstAddr =
@@ -1135,7 +1139,13 @@
               Value *TmpVal = Builder.CreateBitCast(STVal, TmpTy);
               TmpVal = Builder.CreateLShr(
                   TmpVal, Builder.getIntN(DstTyBitWidth, i * SrcTyBitWidth));
-              TmpVal = Builder.CreateTrunc(TmpVal, SrcTy);
+              if (SrcTy->isHalfTy()) {
+                auto Tmpi16 = Builder.CreateTrunc(
+                    TmpVal, Type::getInt16Ty(M.getContext()));
+                TmpVal = Builder.CreateBitCast(Tmpi16, SrcTy);
+              } else {
+                  TmpVal = Builder.CreateTrunc(TmpVal, SrcTy);
+              }
               STValues.push_back(TmpVal);
             }
 
diff --git a/test/HalfStorage/vload_half.cl b/test/HalfStorage/vload_half.cl
new file mode 100644
index 0000000..c247491
--- /dev/null
+++ b/test/HalfStorage/vload_half.cl
@@ -0,0 +1,29 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, int b, __global float *dst) {
+    *dst = vload_half(b, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[ushort:%[^ ]+]] = OpTypeInt 16 0
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint0:%[^ ]+]] = OpConstant [[uint]] 0
+
+// CHECK-DAG: [[half_array:%[^ ]+]] = OpTypeRuntimeArray [[half]]
+// CHECK-DAG: [[half_ptr:%[^ ]+]] = OpTypeStruct [[half_array]]
+// CHECK-DAG: [[global_half_ptr:%[^ ]+]] = OpTypePointer StorageBuffer [[half_ptr]]
+
+// CHECK: [[a:%[^ ]+]] = OpVariable [[global_half_ptr]] StorageBuffer
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 0
+// CHECK: [[addr:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint0]] [[b]]
+// CHECK: [[valh:%[^ ]+]] = OpLoad [[half]] [[addr]]
+// CHECK: [[vali16:%[^ ]+]] = OpBitcast [[ushort]] [[valh]]
+// CHECK: [[vali32:%[^ ]+]] = OpUConvert [[uint]] [[vali16]]
+// CHECK: [[valf2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[vali32]]
+// CHECK: [[val:%[^ ]+]] = OpCompositeExtract [[float]] [[valf2]] 0
+// CHECK: OpStore {{.*}} [[val]]
diff --git a/test/HalfStorage/vload_half.ll b/test/HalfStorage/vload_half.ll
new file mode 100644
index 0000000..71c02a2
--- /dev/null
+++ b/test/HalfStorage/vload_half.ll
@@ -0,0 +1,20 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define float @foo(half addrspace(1)* %a, i32 %b) {
+entry:
+  %0 = call spir_func float @_Z10vload_halfjPU3AS1KDh(i32 %b, half addrspace(1)* %a)
+  ret float %0
+}
+
+declare spir_func float @_Z10vload_halfjPU3AS1KDh(i32, half addrspace(1)*)
+
+; CHECK:  [[ai16:%[^ ]+]] = bitcast half addrspace(1)* %a to i16 addrspace(1)*
+; CHECK:  [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 %b
+; CHECK:  [[reti16:%[^ ]+]] = load i16, i16 addrspace(1)* [[gep]], align 2
+; CHECK:  [[reti32:%[^ ]+]] = zext i16 [[reti16]] to i32
+; CHECK:  [[retf2:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[reti32]])
+; CHECK:  [[ret:%[^ ]+]] = extractelement <2 x float> [[retf2]], i32 0
diff --git a/test/HalfStorage/vload_half16.cl b/test/HalfStorage/vload_half16.cl
new file mode 100644
index 0000000..25b8af5
--- /dev/null
+++ b/test/HalfStorage/vload_half16.cl
@@ -0,0 +1,183 @@
+// RUN: clspv %s -o %t.spv -long-vector
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, int b, __global float16 *dst) {
+    *dst = vload_half16(b, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[half4:%[^ ]+]] = OpTypeVector [[half]] 4
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint_16:%[^ ]+]] = OpConstant [[uint]] 16
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+// CHECK-DAG: [[uint_4:%[^ ]+]] = OpConstant [[uint]] 4
+// CHECK-DAG: [[uint_5:%[^ ]+]] = OpConstant [[uint]] 5
+// CHECK-DAG: [[uint_6:%[^ ]+]] = OpConstant [[uint]] 6
+// CHECK-DAG: [[uint_7:%[^ ]+]] = OpConstant [[uint]] 7
+// CHECK-DAG: [[uint_8:%[^ ]+]] = OpConstant [[uint]] 8
+// CHECK-DAG: [[uint_9:%[^ ]+]] = OpConstant [[uint]] 9
+// CHECK-DAG: [[uint_10:%[^ ]+]] = OpConstant [[uint]] 10
+// CHECK-DAG: [[uint_11:%[^ ]+]] = OpConstant [[uint]] 11
+// CHECK-DAG: [[uint_12:%[^ ]+]] = OpConstant [[uint]] 12
+// CHECK-DAG: [[uint_13:%[^ ]+]] = OpConstant [[uint]] 13
+// CHECK-DAG: [[uint_14:%[^ ]+]] = OpConstant [[uint]] 14
+// CHECK-DAG: [[uint_15:%[^ ]+]] = OpConstant [[uint]] 15
+
+// CHECK-DAG: [[half_array:%[^ ]+]] = OpTypeRuntimeArray [[half]]
+// CHECK-DAG: [[half_ptr:%[^ ]+]] = OpTypeStruct [[half_array]]
+// CHECK-DAG: [[global_half_ptr:%[^ ]+]] = OpTypePointer StorageBuffer [[half_ptr]]
+
+// CHECK: [[a:%[^ ]+]] = OpVariable [[global_half_ptr]] StorageBuffer
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 0
+
+// CHECK: [[bx16:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[b]] [[uint_4]]
+// CHECK: [[idx8:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] [[uint_8]]
+
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[bx16]]
+// CHECK: [[val0:%[^ ]+]] = OpLoad [[half]] [[addr0]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] [[uint_1]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx1]]
+// CHECK: [[val1:%[^ ]+]] = OpLoad [[half]] [[addr1]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] [[uint_2]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx2]]
+// CHECK: [[val2:%[^ ]+]] = OpLoad [[half]] [[addr2]]
+
+// CHECK: [[idx3:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_3
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx3]]
+// CHECK: [[val3:%[^ ]+]] = OpLoad [[half]] [[addr3]]
+
+// CHECK: [[idx4:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_4
+// CHECK: [[addr4:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx4]]
+// CHECK: [[val4:%[^ ]+]] = OpLoad [[half]] [[addr4]]
+
+// CHECK: [[idx5:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_5
+// CHECK: [[addr5:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx5]]
+// CHECK: [[val5:%[^ ]+]] = OpLoad [[half]] [[addr5]]
+
+// CHECK: [[idx6:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_6
+// CHECK: [[addr6:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx6]]
+// CHECK: [[val6:%[^ ]+]] = OpLoad [[half]] [[addr6]]
+
+// CHECK: [[idx7:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_7
+// CHECK: [[addr7:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx7]]
+// CHECK: [[val7:%[^ ]+]] = OpLoad [[half]] [[addr7]]
+
+// CHECK: [[val0h4:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val0]] [[val1]] [[val2]] [[val3]]
+// CHECK: [[val1h4:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val4]] [[val5]] [[val6]] [[val7]]
+// CHECK: [[val0i32:%[^ ]+]] = OpBitcast %v2uint [[val0h4]]
+// CHECK: [[val1i32:%[^ ]+]] = OpBitcast %v2uint [[val1h4]]
+
+// CHECK: [[addr8:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx8]]
+// CHECK: [[val8:%[^ ]+]] = OpLoad [[half]] [[addr8]]
+
+// CHECK: [[idx9:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] [[uint_9]]
+// CHECK: [[addr9:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx9]]
+// CHECK: [[val9:%[^ ]+]] = OpLoad [[half]] [[addr9]]
+
+// CHECK: [[idx10:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] [[uint_10]]
+// CHECK: [[addr10:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx10]]
+// CHECK: [[val10:%[^ ]+]] = OpLoad [[half]] [[addr10]]
+
+// CHECK: [[idx11:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_11
+// CHECK: [[addr11:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx11]]
+// CHECK: [[val11:%[^ ]+]] = OpLoad [[half]] [[addr11]]
+
+// CHECK: [[idx12:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_12
+// CHECK: [[addr12:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx12]]
+// CHECK: [[val12:%[^ ]+]] = OpLoad [[half]] [[addr12]]
+
+// CHECK: [[idx13:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_13
+// CHECK: [[addr13:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx13]]
+// CHECK: [[val13:%[^ ]+]] = OpLoad [[half]] [[addr13]]
+
+// CHECK: [[idx14:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_14
+// CHECK: [[addr14:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx14]]
+// CHECK: [[val14:%[^ ]+]] = OpLoad [[half]] [[addr14]]
+
+// CHECK: [[idx15:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx16]] %uint_15
+// CHECK: [[addr15:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx15]]
+// CHECK: [[val15:%[^ ]+]] = OpLoad [[half]] [[addr15]]
+
+// CHECK: [[val2h4:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val8]] [[val9]] [[val10]] [[val11]]
+// CHECK: [[val3h4:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val12]] [[val13]] [[val14]] [[val15]]
+// CHECK: [[val2i32:%[^ ]+]] = OpBitcast %v2uint [[val2h4]]
+// CHECK: [[val3i32:%[^ ]+]] = OpBitcast %v2uint [[val3h4]]
+
+
+// CHECK: [[val00i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val0i32]] 0
+// CHECK: [[val01i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val0i32]] 1
+// CHECK: [[val10i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val1i32]] 0
+// CHECK: [[val11i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val1i32]] 1
+
+// CHECK: [[val20i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val2i32]] 0
+// CHECK: [[val21i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val2i32]] 1
+// CHECK: [[val30i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val3i32]] 0
+// CHECK: [[val31i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[val3i32]] 1
+
+// CHECK: [[val0f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val00i32]]
+// CHECK: [[val1f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val01i32]]
+// CHECK: [[val2f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val10i32]]
+// CHECK: [[val3f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val11i32]]
+// CHECK: [[val4f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val20i32]]
+// CHECK: [[val5f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val21i32]]
+// CHECK: [[val6f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val30i32]]
+// CHECK: [[val7f2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val31i32]]
+
+// CHECK: [[val0:%[^ ]+]] = OpCompositeExtract [[float]] [[val0f2]] 0
+// CHECK: [[val1:%[^ ]+]] = OpCompositeExtract [[float]] [[val0f2]] 1
+// CHECK: [[val2:%[^ ]+]] = OpCompositeExtract [[float]] [[val1f2]] 0
+// CHECK: [[val3:%[^ ]+]] = OpCompositeExtract [[float]] [[val1f2]] 1
+// CHECK: [[val4:%[^ ]+]] = OpCompositeExtract [[float]] [[val2f2]] 0
+// CHECK: [[val5:%[^ ]+]] = OpCompositeExtract [[float]] [[val2f2]] 1
+// CHECK: [[val6:%[^ ]+]] = OpCompositeExtract [[float]] [[val3f2]] 0
+// CHECK: [[val7:%[^ ]+]] = OpCompositeExtract [[float]] [[val3f2]] 1
+// CHECK: [[val8:%[^ ]+]] = OpCompositeExtract [[float]] [[val4f2]] 0
+// CHECK: [[val9:%[^ ]+]] = OpCompositeExtract [[float]] [[val4f2]] 1
+// CHECK: [[val10:%[^ ]+]] = OpCompositeExtract [[float]] [[val5f2]] 0
+// CHECK: [[val11:%[^ ]+]] = OpCompositeExtract [[float]] [[val5f2]] 1
+// CHECK: [[val12:%[^ ]+]] = OpCompositeExtract [[float]] [[val6f2]] 0
+// CHECK: [[val13:%[^ ]+]] = OpCompositeExtract [[float]] [[val6f2]] 1
+// CHECK: [[val14:%[^ ]+]] = OpCompositeExtract [[float]] [[val7f2]] 0
+// CHECK: [[val15:%[^ ]+]] = OpCompositeExtract [[float]] [[val7f2]] 1
+
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_0]]
+// CHECK: OpStore [[addr0]] [[val0]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_1]]
+// CHECK: OpStore [[addr1]] [[val1]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_2]]
+// CHECK: OpStore [[addr2]] [[val2]]
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_3]]
+// CHECK: OpStore [[addr3]] [[val3]]
+// CHECK: [[addr4:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_4]]
+// CHECK: OpStore [[addr4]] [[val4]]
+// CHECK: [[addr5:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_5]]
+// CHECK: OpStore [[addr5]] [[val5]]
+// CHECK: [[addr6:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_6]]
+// CHECK: OpStore [[addr6]] [[val6]]
+// CHECK: [[addr7:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_7]]
+// CHECK: OpStore [[addr7]] [[val7]]
+// CHECK: [[addr8:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_8]]
+// CHECK: OpStore [[addr8]] [[val8]]
+// CHECK: [[addr9:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_9]]
+// CHECK: OpStore [[addr9]] [[val9]]
+// CHECK: [[addr10:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_10]]
+// CHECK: OpStore [[addr10]] [[val10]]
+// CHECK: [[addr11:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_11]]
+// CHECK: OpStore [[addr11]] [[val11]]
+// CHECK: [[addr12:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_12]]
+// CHECK: OpStore [[addr12]] [[val12]]
+// CHECK: [[addr13:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_13]]
+// CHECK: OpStore [[addr13]] [[val13]]
+// CHECK: [[addr14:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_14]]
+// CHECK: OpStore [[addr14]] [[val14]]
+// CHECK: [[addr15:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_15]]
+// CHECK: OpStore [[addr15]] [[val15]]
diff --git a/test/HalfStorage/vload_half16.ll b/test/HalfStorage/vload_half16.ll
new file mode 100644
index 0000000..27431ab
--- /dev/null
+++ b/test/HalfStorage/vload_half16.ll
@@ -0,0 +1,44 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define <16 x float> @foo(half addrspace(1)* %a, i32 %b) {
+entry:
+  %0 = call spir_func <16 x float> @_Z12vload_half16jPU3AS1KDh(i32 %b, half addrspace(1)* %a)
+  ret <16 x float> %0
+}
+
+declare spir_func <16 x float> @_Z12vload_half16jPU3AS1KDh(i32, half addrspace(1)*)
+
+; CHECK:  [[a4i32:%[^ ]+]] = bitcast half addrspace(1)* %a to <4 x i32> addrspace(1)*
+; CHECK:  [[bx2:%[^ ]+]] = shl i32 %b, 1
+; CHECK:  [[gep0:%[^ ]+]] = getelementptr <4 x i32>, <4 x i32> addrspace(1)* [[a4i32]], i32 [[bx2]]
+; CHECK:  [[idx1:%[^ ]+]] = add i32 [[bx2]], 1
+; CHECK:  [[gep1:%[^ ]+]] = getelementptr <4 x i32>, <4 x i32> addrspace(1)* [[a4i32]], i32 [[idx1]]
+; CHECK:  [[load0:%[^ ]+]] = load <4 x i32>, <4 x i32> addrspace(1)* [[gep0]], align 16
+; CHECK:  [[load1:%[^ ]+]] = load <4 x i32>, <4 x i32> addrspace(1)* [[gep1]], align 16
+; CHECK:  [[val0:%[^ ]+]] = extractelement <4 x i32> [[load0]], i32 0
+; CHECK:  [[val1:%[^ ]+]] = extractelement <4 x i32> [[load0]], i32 1
+; CHECK:  [[val2:%[^ ]+]] = extractelement <4 x i32> [[load0]], i32 2
+; CHECK:  [[val3:%[^ ]+]] = extractelement <4 x i32> [[load0]], i32 3
+; CHECK:  [[val4:%[^ ]+]] = extractelement <4 x i32> [[load1]], i32 0
+; CHECK:  [[val5:%[^ ]+]] = extractelement <4 x i32> [[load1]], i32 1
+; CHECK:  [[val6:%[^ ]+]] = extractelement <4 x i32> [[load1]], i32 2
+; CHECK:  [[val7:%[^ ]+]] = extractelement <4 x i32> [[load1]], i32 3
+; CHECK:  [[val0f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val0]])
+; CHECK:  [[val1f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val1]])
+; CHECK:  [[val2f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val2]])
+; CHECK:  [[val3f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val3]])
+; CHECK:  [[val4f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val4]])
+; CHECK:  [[val5f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val5]])
+; CHECK:  [[val6f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val6]])
+; CHECK:  [[val7f:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val7]])
+; CHECK:  [[ret01:%[^ ]+]] = shufflevector <2 x float> [[val0f]], <2 x float> [[val1f]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK:  [[ret23:%[^ ]+]] = shufflevector <2 x float> [[val2f]], <2 x float> [[val3f]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK:  [[ret45:%[^ ]+]] = shufflevector <2 x float> [[val4f]], <2 x float> [[val5f]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK:  [[ret67:%[^ ]+]] = shufflevector <2 x float> [[val6f]], <2 x float> [[val7f]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK:  [[ret0123:%[^ ]+]] = shufflevector <4 x float> [[ret01]], <4 x float> [[ret23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK:  [[ret4567:%[^ ]+]] = shufflevector <4 x float> [[ret45]], <4 x float> [[ret67]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK:  [[ret:%[^ ]+]] = shufflevector <8 x float> [[ret0123]], <8 x float> [[ret4567]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/HalfStorage/vload_half2.cl b/test/HalfStorage/vload_half2.cl
new file mode 100644
index 0000000..484959b
--- /dev/null
+++ b/test/HalfStorage/vload_half2.cl
@@ -0,0 +1,42 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, int b, __global float2 *dst) {
+    *dst = vload_half2(b, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[ushort:%[^ ]+]] = OpTypeInt 16 0
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_16:%[^ ]+]] = OpConstant [[uint]] 16
+
+// CHECK-DAG: [[half_array:%[^ ]+]] = OpTypeRuntimeArray [[half]]
+// CHECK-DAG: [[half_ptr:%[^ ]+]] = OpTypeStruct [[half_array]]
+// CHECK-DAG: [[global_half_ptr:%[^ ]+]] = OpTypePointer StorageBuffer [[half_ptr]]
+
+// CHECK: [[a:%[^ ]+]] = OpVariable [[global_half_ptr]] StorageBuffer
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 0
+
+// CHECK: [[bx2:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[b]]
+// CHECK: [[addr_low:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[bx2]]
+// CHECK: [[val_low:%[^ ]+]] = OpLoad [[half]] [[addr_low]]
+// CHECK: [[val_lowi16:%[^ ]+]] = OpBitcast [[ushort]] [[val_low]]
+
+// CHECK: [[bx2p1:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx2]] [[uint_1]]
+// CHECK: [[addr_high:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[bx2p1]]
+// CHECK: [[val_high:%[^ ]+]] = OpLoad [[half]] [[addr_high]]
+// CHECK: [[val_highi16:%[^ ]+]] = OpBitcast [[ushort]] [[val_high]]
+
+// CHECK: [[val_lowi32:%[^ ]+]] = OpUConvert [[uint]] [[val_lowi16]]
+// CHECK: [[val_highi32:%[^ ]+]] = OpUConvert [[uint]] [[val_highi16]]
+// CHECK: [[val_highshift:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[val_highi32]] [[uint_16]]
+// CHECK: [[vali32:%[^ ]+]] = OpBitwiseOr [[uint]] [[val_highshift]] [[val_lowi32]]
+
+// CHECK: [[val:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[vali32]]
+// CHECK: OpStore {{.*}} [[val]]
diff --git a/test/HalfStorage/vload_half2.ll b/test/HalfStorage/vload_half2.ll
new file mode 100644
index 0000000..78c44b4
--- /dev/null
+++ b/test/HalfStorage/vload_half2.ll
@@ -0,0 +1,18 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define <2 x float> @foo(half addrspace(1)* %a, i32 %b) {
+entry:
+  %0 = call spir_func <2 x float> @_Z11vload_half2jPU3AS1KDh(i32 %b, half addrspace(1)* %a)
+  ret <2 x float> %0
+}
+
+declare spir_func <2 x float> @_Z11vload_half2jPU3AS1KDh(i32, half addrspace(1)*)
+
+; CHECK:  [[ai32:%[^ ]+]] = bitcast half addrspace(1)* %a to i32 addrspace(1)*
+; CHECK:  [[gep:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[ai32]], i32 %b
+; CHECK:  [[reti32:%[^ ]+]] = load i32, i32 addrspace(1)* [[gep]], align 4
+; CHECK:  [[ret:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[reti32]])
diff --git a/test/HalfStorage/vload_half3.cl b/test/HalfStorage/vload_half3.cl
new file mode 100644
index 0000000..5554edd
--- /dev/null
+++ b/test/HalfStorage/vload_half3.cl
@@ -0,0 +1,56 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, int b, __global float3 *dst) {
+    *dst = vload_half3(b, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[float3:%[^ ]+]] = OpTypeVector [[float]] 3
+// CHECK-DAG: [[undef_float2:%[^ ]+]] = OpUndef [[float2]]
+// CHECK-DAG: [[ushort:%[^ ]+]] = OpTypeInt 16 0
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+
+// CHECK-DAG: [[half_array:%[^ ]+]] = OpTypeRuntimeArray [[half]]
+// CHECK-DAG: [[half_ptr:%[^ ]+]] = OpTypeStruct [[half_array]]
+// CHECK-DAG: [[global_half_ptr:%[^ ]+]] = OpTypePointer StorageBuffer [[half_ptr]]
+
+// CHECK: [[a:%[^ ]+]] = OpVariable [[global_half_ptr]] StorageBuffer
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 0
+
+// CHECK: [[bx3:%[^ ]+]] = OpIMul [[uint]] [[b]] [[uint_3]]
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[bx3]]
+// CHECK: [[val0h:%[^ ]+]] = OpLoad [[half]] [[addr0]]
+// CHECK: [[val0i16:%[^ ]+]] = OpBitcast [[ushort]] [[val0h]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpIAdd [[uint]] [[bx3]] [[uint_1]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx1]]
+// CHECK: [[val1h:%[^ ]+]] = OpLoad [[half]] [[addr1]]
+// CHECK: [[val1i16:%[^ ]+]] = OpBitcast [[ushort]] [[val1h]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpIAdd [[uint]] [[bx3]] [[uint_2]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx2]]
+// CHECK: [[val2h:%[^ ]+]] = OpLoad [[half]] [[addr2]]
+// CHECK: [[val2i16:%[^ ]+]] = OpBitcast [[ushort]] [[val2h]]
+
+// CHECK: [[val0i32:%[^ ]+]] = OpUConvert [[uint]] [[val0i16]]
+// CHECK: [[val1i32:%[^ ]+]] = OpUConvert [[uint]] [[val1i16]]
+// CHECK: [[val2i32:%[^ ]+]] = OpUConvert [[uint]] [[val2i16]]
+
+// CHECK: [[val0:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val0i32]]
+// CHECK: [[val1:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val1i32]]
+// CHECK: [[val2:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val2i32]]
+
+// CHECK: [[val0f3:%[^ ]+]] = OpVectorShuffle [[float3]] [[val2]] [[undef_float2]] 0 4294967295 4294967295
+// CHECK: [[val1f3:%[^ ]+]] = OpVectorShuffle [[float3]] [[val0]] [[val1]] 0 2 4294967295
+// CHECK: [[val:%[^ ]+]] = OpVectorShuffle [[float3]] [[val1f3]] [[val0f3]] 0 1 3
+
+// CHECK: OpStore {{.*}} [[val]]
diff --git a/test/HalfStorage/vload_half3.ll b/test/HalfStorage/vload_half3.ll
new file mode 100644
index 0000000..fc74255
--- /dev/null
+++ b/test/HalfStorage/vload_half3.ll
@@ -0,0 +1,37 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define <3 x float> @foo(half addrspace(1)* %a, i32 %b) {
+entry:
+  %0 = call spir_func <3 x float> @_Z11vload_half3jPU3AS1KDh(i32 %b, half addrspace(1)* %a)
+  ret <3 x float> %0
+}
+
+declare spir_func <3 x float> @_Z11vload_half3jPU3AS1KDh(i32, half addrspace(1)*)
+
+; CHECK:  [[ai16:%[^ ]+]] = bitcast half addrspace(1)* %a to i16 addrspace(1)*
+; CHECK:  [[bx2:%[^ ]+]] = shl i32 %b, 1
+; CHECK:  [[idx0:%[^ ]+]] = add i32 [[bx2]], %b
+; CHECK:  [[gep0:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[idx0]]
+; CHECK:  [[val0i16:%[^ ]+]] = load i16, i16 addrspace(1)* [[gep0]], align 2
+; CHECK:  [[idx1:%[^ ]+]] = add i32 [[idx0]], 1
+; CHECK:  [[gep1:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[idx1]]
+; CHECK:  [[val1i16:%[^ ]+]] = load i16, i16 addrspace(1)* [[gep1]], align 2
+; CHECK:  [[idx2:%[^ ]+]] = add i32 [[idx1]], 1
+; CHECK:  [[gep2:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[idx2]]
+; CHECK:  [[val2i16:%[^ ]+]] = load i16, i16 addrspace(1)* [[gep2]], align 2
+; CHECK:  [[val0i32:%[^ ]+]] = zext i16 [[val0i16]] to i32
+; CHECK:  [[val1i32:%[^ ]+]] = zext i16 [[val1i16]] to i32
+; CHECK:  [[val2i32:%[^ ]+]] = zext i16 [[val2i16]] to i32
+; CHECK:  [[val2f0:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val0i32]])
+; CHECK:  [[val0:%[^ ]+]] = extractelement <2 x float> [[val2f0]], i32 0
+; CHECK:  [[val2f1:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val1i32]])
+; CHECK:  [[val1:%[^ ]+]] = extractelement <2 x float> [[val2f1]], i32 0
+; CHECK:  [[val2f2:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val2i32]])
+; CHECK:  [[val2:%[^ ]+]] = extractelement <2 x float> [[val2f2]], i32 0
+; CHECK:  [[ret0:%[^ ]+]] = insertelement <3 x float> undef, float [[val0]], i32 0
+; CHECK:  [[ret01:%[^ ]+]] = insertelement <3 x float> [[ret0]], float [[val1]], i32 1
+; CHECK:  [[ret:%[^ ]+]] = insertelement <3 x float> [[ret01]], float [[val2]], i32 2
diff --git a/test/HalfStorage/vload_half4.cl b/test/HalfStorage/vload_half4.cl
new file mode 100644
index 0000000..a4bffa5
--- /dev/null
+++ b/test/HalfStorage/vload_half4.cl
@@ -0,0 +1,53 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, int b, __global float4 *dst) {
+    *dst = vload_half4(b, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[half4:%[^ ]+]] = OpTypeVector [[half]] 4
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[float4:%[^ ]+]] = OpTypeVector [[float]] 4
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+
+// CHECK-DAG: [[half_array:%[^ ]+]] = OpTypeRuntimeArray [[half]]
+// CHECK-DAG: [[half_ptr:%[^ ]+]] = OpTypeStruct [[half_array]]
+// CHECK-DAG: [[global_half_ptr:%[^ ]+]] = OpTypePointer StorageBuffer [[half_ptr]]
+
+// CHECK: [[a:%[^ ]+]] = OpVariable [[global_half_ptr]] StorageBuffer
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 0
+
+// CHECK: [[bx4:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[b]] [[uint_2]]
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[bx4]]
+// CHECK: [[val0:%[^ ]+]] = OpLoad [[half]] [[addr0]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx4]] [[uint_1]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx1]]
+// CHECK: [[val1:%[^ ]+]] = OpLoad [[half]] [[addr1]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx4]] [[uint_2]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx2]]
+// CHECK: [[val2:%[^ ]+]] = OpLoad [[half]] [[addr2]]
+
+// CHECK: [[idx3:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx4]] %uint_3
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx3]]
+// CHECK: [[val3:%[^ ]+]] = OpLoad [[half]] [[addr3]]
+
+// CHECK: [[valh4:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val0]] [[val1]] [[val2]] [[val3]]
+// CHECK: [[vali32:%[^ ]+]] = OpBitcast %v2uint [[valh4]]
+// CHECK: [[val01i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32]] 0
+// CHECK: [[val23i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32]] 1
+
+// CHECK: [[val01:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val01i32]]
+// CHECK: [[val23:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val23i32]]
+
+// CHECK: [[val:%[^ ]+]] = OpVectorShuffle [[float4]] [[val01]] [[val23]] 0 1 2 3
+// CHECK:       OpStore {{.*}} [[val]]
diff --git a/test/HalfStorage/vload_half4.ll b/test/HalfStorage/vload_half4.ll
new file mode 100644
index 0000000..f917f87
--- /dev/null
+++ b/test/HalfStorage/vload_half4.ll
@@ -0,0 +1,22 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define <4 x float> @foo(half addrspace(1)* %a, i32 %b) {
+entry:
+  %0 = call spir_func <4 x float> @_Z11vload_half4jPU3AS1KDh(i32 %b, half addrspace(1)* %a)
+  ret <4 x float> %0
+}
+
+declare spir_func <4 x float> @_Z11vload_half4jPU3AS1KDh(i32, half addrspace(1)*)
+
+; CHECK:  [[a2i32:%[^ ]+]] = bitcast half addrspace(1)* %a to <2 x i32> addrspace(1)*
+; CHECK:  [[gep:%[^ ]+]] = getelementptr <2 x i32>, <2 x i32> addrspace(1)* [[a2i32]], i32 %b
+; CHECK:  [[vali32:%[^ ]+]] = load <2 x i32>, <2 x i32> addrspace(1)* [[gep]], align 8
+; CHECK:  [[val01i32:%[^ ]+]] = extractelement <2 x i32> [[vali32]], i32 0
+; CHECK:  [[val23i32:%[^ ]+]] = extractelement <2 x i32> [[vali32]], i32 1
+; CHECK:  [[val01:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val01i32]])
+; CHECK:  [[val23:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val23i32]])
+; CHECK:  [[ret:%[^ ]+]] = shufflevector <2 x float> [[val01]], <2 x float> [[val23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/HalfStorage/vload_half8.cl b/test/HalfStorage/vload_half8.cl
new file mode 100644
index 0000000..a784484
--- /dev/null
+++ b/test/HalfStorage/vload_half8.cl
@@ -0,0 +1,101 @@
+// RUN: clspv %s -o %t.spv -long-vector
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, int b, __global float8 *dst) {
+    *dst = vload_half8(b, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[half4:%[^ ]+]] = OpTypeVector [[half]] 4
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+// CHECK-DAG: [[uint_4:%[^ ]+]] = OpConstant [[uint]] 4
+// CHECK-DAG: [[uint_5:%[^ ]+]] = OpConstant [[uint]] 5
+// CHECK-DAG: [[uint_6:%[^ ]+]] = OpConstant [[uint]] 6
+// CHECK-DAG: [[uint_7:%[^ ]+]] = OpConstant [[uint]] 7
+
+// CHECK-DAG: [[half_array:%[^ ]+]] = OpTypeRuntimeArray [[half]]
+// CHECK-DAG: [[half_ptr:%[^ ]+]] = OpTypeStruct [[half_array]]
+// CHECK-DAG: [[global_half_ptr:%[^ ]+]] = OpTypePointer StorageBuffer [[half_ptr]]
+
+// CHECK: [[a:%[^ ]+]] = OpVariable [[global_half_ptr]] StorageBuffer
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 0
+
+// CHECK: [[bx8:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[b]] [[uint_3]]
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[bx8]]
+// CHECK: [[val0:%[^ ]+]] = OpLoad [[half]] [[addr0]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx8]] [[uint_1]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx1]]
+// CHECK: [[val1:%[^ ]+]] = OpLoad [[half]] [[addr1]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx8]] [[uint_2]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx2]]
+// CHECK: [[val2:%[^ ]+]] = OpLoad [[half]] [[addr2]]
+
+// CHECK: [[idx3:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx8]] %uint_3
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx3]]
+// CHECK: [[val3:%[^ ]+]] = OpLoad [[half]] [[addr3]]
+
+// CHECK: [[idx4:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx8]] %uint_4
+// CHECK: [[addr4:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx4]]
+// CHECK: [[val4:%[^ ]+]] = OpLoad [[half]] [[addr4]]
+
+// CHECK: [[idx5:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx8]] %uint_5
+// CHECK: [[addr5:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx5]]
+// CHECK: [[val5:%[^ ]+]] = OpLoad [[half]] [[addr5]]
+
+// CHECK: [[idx6:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx8]] %uint_6
+// CHECK: [[addr6:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx6]]
+// CHECK: [[val6:%[^ ]+]] = OpLoad [[half]] [[addr6]]
+
+// CHECK: [[idx7:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx8]] %uint_7
+// CHECK: [[addr7:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx7]]
+// CHECK: [[val7:%[^ ]+]] = OpLoad [[half]] [[addr7]]
+
+// CHECK: [[valh4l:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val0]] [[val1]] [[val2]] [[val3]]
+// CHECK: [[valh4h:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val4]] [[val5]] [[val6]] [[val7]]
+// CHECK: [[vali32l:%[^ ]+]] = OpBitcast %v2uint [[valh4l]]
+// CHECK: [[vali32h:%[^ ]+]] = OpBitcast %v2uint [[valh4h]]
+// CHECK: [[val01i32l:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32l]] 0
+// CHECK: [[val23i32l:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32l]] 1
+// CHECK: [[val01i32h:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32h]] 0
+// CHECK: [[val23i32h:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32h]] 1
+
+// CHECK: [[val01:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val01i32l]]
+// CHECK: [[val23:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val23i32l]]
+// CHECK: [[val45:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val01i32h]]
+// CHECK: [[val67:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val23i32h]]
+
+// CHECK: [[val0:%[^ ]+]] = OpCompositeExtract [[float]] [[val01]] 0
+// CHECK: [[val1:%[^ ]+]] = OpCompositeExtract [[float]] [[val01]] 1
+// CHECK: [[val2:%[^ ]+]] = OpCompositeExtract [[float]] [[val23]] 0
+// CHECK: [[val3:%[^ ]+]] = OpCompositeExtract [[float]] [[val23]] 1
+// CHECK: [[val4:%[^ ]+]] = OpCompositeExtract [[float]] [[val45]] 0
+// CHECK: [[val5:%[^ ]+]] = OpCompositeExtract [[float]] [[val45]] 1
+// CHECK: [[val6:%[^ ]+]] = OpCompositeExtract [[float]] [[val67]] 0
+// CHECK: [[val7:%[^ ]+]] = OpCompositeExtract [[float]] [[val67]] 1
+
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_0]]
+// CHECK: OpStore [[addr0]] [[val0]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_1]]
+// CHECK: OpStore [[addr1]] [[val1]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_2]]
+// CHECK: OpStore [[addr2]] [[val2]]
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_3]]
+// CHECK: OpStore [[addr3]] [[val3]]
+// CHECK: [[addr4:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_4]]
+// CHECK: OpStore [[addr4]] [[val4]]
+// CHECK: [[addr5:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_5]]
+// CHECK: OpStore [[addr5]] [[val5]]
+// CHECK: [[addr6:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_6]]
+// CHECK: OpStore [[addr6]] [[val6]]
+// CHECK: [[addr7:%[^ ]+]] = OpAccessChain {{.*}} {{.*}} [[uint_0]] [[uint_0]] [[uint_7]]
+// CHECK: OpStore [[addr7]] [[val7]]
diff --git a/test/HalfStorage/vload_half8.ll b/test/HalfStorage/vload_half8.ll
new file mode 100644
index 0000000..f2c8309
--- /dev/null
+++ b/test/HalfStorage/vload_half8.ll
@@ -0,0 +1,28 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define <8 x float> @foo(half addrspace(1)* %a, i32 %b) {
+entry:
+  %0 = call spir_func <8 x float> @_Z11vload_half8jPU3AS1KDh(i32 %b, half addrspace(1)* %a)
+  ret <8 x float> %0
+}
+
+declare spir_func <8 x float> @_Z11vload_half8jPU3AS1KDh(i32, half addrspace(1)*)
+
+; CHECK:  [[a4i32:%[^ ]+]] = bitcast half addrspace(1)* %a to <4 x i32> addrspace(1)*
+; CHECK:  [[gep:%[^ ]+]] = getelementptr <4 x i32>, <4 x i32> addrspace(1)* [[a4i32]], i32 %b
+; CHECK:  [[vali32:%[^ ]+]] = load <4 x i32>, <4 x i32> addrspace(1)* [[gep]], align 16
+; CHECK:  [[val01i32:%[^ ]+]] = extractelement <4 x i32> [[vali32]], i32 0
+; CHECK:  [[val23i32:%[^ ]+]] = extractelement <4 x i32> [[vali32]], i32 1
+; CHECK:  [[val45i32:%[^ ]+]] = extractelement <4 x i32> [[vali32]], i32 2
+; CHECK:  [[val67i32:%[^ ]+]] = extractelement <4 x i32> [[vali32]], i32 3
+; CHECK:  [[val01:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val01i32]])
+; CHECK:  [[val23:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val23i32]])
+; CHECK:  [[val45:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val45i32]])
+; CHECK:  [[val67:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val67i32]])
+; CHECK:  [[ret0123:%[^ ]+]] = shufflevector <2 x float> [[val01]], <2 x float> [[val23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK:  [[ret4567:%[^ ]+]] = shufflevector <2 x float> [[val45]], <2 x float> [[val67]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK:  [[ret:%[^ ]+]] = shufflevector <4 x float> [[ret0123]], <4 x float> [[ret4567]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/HalfStorage/vloada_half3.cl b/test/HalfStorage/vloada_half3.cl
new file mode 100644
index 0000000..e60135d
--- /dev/null
+++ b/test/HalfStorage/vloada_half3.cl
@@ -0,0 +1,53 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, int b, __global float3 *dst) {
+    *dst = vloada_half3(b, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[half4:%[^ ]+]] = OpTypeVector [[half]] 4
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[float3:%[^ ]+]] = OpTypeVector [[float]] 3
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+
+// CHECK-DAG: [[half_array:%[^ ]+]] = OpTypeRuntimeArray [[half]]
+// CHECK-DAG: [[half_ptr:%[^ ]+]] = OpTypeStruct [[half_array]]
+// CHECK-DAG: [[global_half_ptr:%[^ ]+]] = OpTypePointer StorageBuffer [[half_ptr]]
+
+// CHECK: [[a:%[^ ]+]] = OpVariable [[global_half_ptr]] StorageBuffer
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 0
+
+// CHECK: [[bx4:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[b]] [[uint_2]]
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[bx4]]
+// CHECK: [[val0:%[^ ]+]] = OpLoad [[half]] [[addr0]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx4]] [[uint_1]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx1]]
+// CHECK: [[val1:%[^ ]+]] = OpLoad [[half]] [[addr1]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx4]] [[uint_2]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx2]]
+// CHECK: [[val2:%[^ ]+]] = OpLoad [[half]] [[addr2]]
+
+// CHECK: [[idx3:%[^ ]+]] = OpBitwiseOr [[uint]] [[bx4]] %uint_3
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain {{.*}} [[a]] [[uint_0]] [[idx3]]
+// CHECK: [[val3:%[^ ]+]] = OpLoad [[half]] [[addr3]]
+
+// CHECK: [[valh4:%[^ ]+]] = OpCompositeConstruct [[half4]] [[val0]] [[val1]] [[val2]] [[val3]]
+// CHECK: [[vali32:%[^ ]+]] = OpBitcast %v2uint [[valh4]]
+// CHECK: [[val01i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32]] 0
+// CHECK: [[val23i32:%[^ ]+]] = OpCompositeExtract [[uint]] [[vali32]] 1
+
+// CHECK: [[val01:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val01i32]]
+// CHECK: [[val23:%[^ ]+]] = OpExtInst [[float2]] {{.*}} UnpackHalf2x16 [[val23i32]]
+
+// CHECK: [[val:%[^ ]+]] = OpVectorShuffle [[float3]] [[val01]] [[val23]] 0 1 2
+// CHECK:       OpStore {{.*}} [[val]]
diff --git a/test/HalfStorage/vloada_half3.ll b/test/HalfStorage/vloada_half3.ll
new file mode 100644
index 0000000..9339787
--- /dev/null
+++ b/test/HalfStorage/vloada_half3.ll
@@ -0,0 +1,22 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define <3 x float> @foo(half addrspace(1)* %a, i32 %b) {
+entry:
+  %0 = call spir_func <3 x float> @_Z12vloada_half3jPU3AS1KDh(i32 %b, half addrspace(1)* %a)
+  ret <3 x float> %0
+}
+
+declare spir_func <3 x float> @_Z12vloada_half3jPU3AS1KDh(i32, half addrspace(1)*)
+
+; CHECK:  [[a2i32:%[^ ]+]] = bitcast half addrspace(1)* %a to <2 x i32> addrspace(1)*
+; CHECK:  [[gep:%[^ ]+]] = getelementptr <2 x i32>, <2 x i32> addrspace(1)* [[a2i32]], i32 %b
+; CHECK:  [[vali32:%[^ ]+]] = load <2 x i32>, <2 x i32> addrspace(1)* [[gep]], align 8
+; CHECK:  [[val01i32:%[^ ]+]] = extractelement <2 x i32> [[vali32]], i32 0
+; CHECK:  [[val23i32:%[^ ]+]] = extractelement <2 x i32> [[vali32]], i32 1
+; CHECK:  [[val01:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val01i32]])
+; CHECK:  [[val23:%[^ ]+]] = call <2 x float> @_Z18spirv.unpack.v2f16(i32 [[val23i32]])
+; CHECK:  [[ret:%[^ ]+]] = shufflevector <2 x float> [[val01]], <2 x float> [[val23]], <3 x i32> <i32 0, i32 1, i32 2>
diff --git a/test/HalfStorage/vstore_half.cl b/test/HalfStorage/vstore_half.cl
new file mode 100644
index 0000000..ac2439a
--- /dev/null
+++ b/test/HalfStorage/vstore_half.cl
@@ -0,0 +1,25 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, float b, int c) {
+    vstore_half(b, c, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[undef_float2:%[^ ]+]] = OpUndef [[float2]]
+// CHECK-DAG: [[ushort:%[^ ]+]] = OpTypeInt 16 0
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint0:%[^ ]+]] = OpConstant [[uint]] 0
+
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 0
+// CHECK: [[c:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 1
+// CHECK: [[val2f32:%[^ ]+]] = OpCompositeInsert [[float2]] [[b]] [[undef_float2]] 0
+// CHECK: [[vali32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val2f32]]
+// CHECK: [[vali16:%[^ ]+]] = OpUConvert [[ushort]] [[vali32]]
+// CHECK: [[val:%[^ ]+]] = OpBitcast [[half]] [[vali16]]
+// CHECK: [[addr:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[c]]
+// CHECK: OpStore [[addr]] [[val]]
diff --git a/test/HalfStorage/vstore_half.ll b/test/HalfStorage/vstore_half.ll
new file mode 100644
index 0000000..befd34e
--- /dev/null
+++ b/test/HalfStorage/vstore_half.ll
@@ -0,0 +1,20 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @foo(half addrspace(1)* %a, float %b, i32 %c) {
+entry:
+  call spir_func void @_Z11vstore_halffjPU3AS1Dh(float %b, i32 %c, half addrspace(1)* %a)
+  ret void
+}
+
+declare spir_func void @_Z11vstore_halffjPU3AS1Dh(float, i32, half addrspace(1)*)
+
+; CHECK: [[float2:%[^ ]+]] = insertelement <2 x float> undef, float %b, i32 0
+; CHECK: [[half2:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[float2]])
+; CHECK: [[half:%[^ ]+]] = trunc i32 [[half2]] to i16
+; CHECK: [[a_cast_i16:%[^ ]+]] = bitcast half addrspace(1)* %a to i16 addrspace(1)*
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[a_cast_i16]], i32 %c
+; CHECK: store i16 [[half]], i16 addrspace(1)* [[gep]], align 2
diff --git a/test/HalfStorage/vstore_half16.cl b/test/HalfStorage/vstore_half16.cl
new file mode 100644
index 0000000..ad7f012
--- /dev/null
+++ b/test/HalfStorage/vstore_half16.cl
@@ -0,0 +1,154 @@
+// RUN: clspv %s -o %t.spv -long-vector
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, float16 b, int c) {
+    vstore_half16(b, c, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[half4:%[^ ]+]] = OpTypeVector [[half]] 4
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint2:%[^ ]+]] = OpTypeVector [[uint]] 2
+// CHECK-DAG: [[uint_16:%[^ ]+]] = OpConstant [[uint]] 16
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+// CHECK-DAG: [[uint_4:%[^ ]+]] = OpConstant [[uint]] 4
+// CHECK-DAG: [[uint_5:%[^ ]+]] = OpConstant [[uint]] 5
+// CHECK-DAG: [[uint_6:%[^ ]+]] = OpConstant [[uint]] 6
+// CHECK-DAG: [[uint_7:%[^ ]+]] = OpConstant [[uint]] 7
+// CHECK-DAG: [[uint_8:%[^ ]+]] = OpConstant [[uint]] 8
+// CHECK-DAG: [[uint_9:%[^ ]+]] = OpConstant [[uint]] 9
+// CHECK-DAG: [[uint_10:%[^ ]+]] = OpConstant [[uint]] 10
+// CHECK-DAG: [[uint_11:%[^ ]+]] = OpConstant [[uint]] 11
+// CHECK-DAG: [[uint_12:%[^ ]+]] = OpConstant [[uint]] 12
+// CHECK-DAG: [[uint_13:%[^ ]+]] = OpConstant [[uint]] 13
+// CHECK-DAG: [[uint_14:%[^ ]+]] = OpConstant [[uint]] 14
+// CHECK-DAG: [[uint_15:%[^ ]+]] = OpConstant [[uint]] 15
+
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract {{.*}} {{.*}} 0
+
+// CHECK: [[b0:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 0
+// CHECK: [[b1:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 1
+// CHECK: [[b2:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 2
+// CHECK: [[b3:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 3
+// CHECK: [[b4:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 4
+// CHECK: [[b5:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 5
+// CHECK: [[b6:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 6
+// CHECK: [[b7:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 7
+// CHECK: [[b8:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 8
+// CHECK: [[b9:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 9
+// CHECK: [[b10:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 10
+// CHECK: [[b11:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 11
+// CHECK: [[b12:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 12
+// CHECK: [[b13:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 13
+// CHECK: [[b14:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 14
+// CHECK: [[b15:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 15
+
+// CHECK: [[c:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 1
+
+// CHECK: [[b01:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b0]] [[b1]]
+// CHECK: [[b23:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b2]] [[b3]]
+// CHECK: [[b45:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b4]] [[b5]]
+// CHECK: [[b67:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b6]] [[b7]]
+// CHECK: [[b89:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b8]] [[b9]]
+// CHECK: [[b1011:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b10]] [[b11]]
+// CHECK: [[b1213:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b12]] [[b13]]
+// CHECK: [[b1415:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b14]] [[b15]]
+
+// CHECK: [[b01f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b01]]
+// CHECK: [[b23f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b23]]
+// CHECK: [[b45f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b45]]
+// CHECK: [[b67f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b67]]
+// CHECK: [[b89f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b89]]
+// CHECK: [[b1011f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b1011]]
+// CHECK: [[b1213f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b1213]]
+// CHECK: [[b1415f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b1415]]
+
+// CHECK: [[b0123f:%[^ ]+]] = OpCompositeConstruct [[uint2]] [[b01f]] [[b23f]]
+// CHECK: [[b4567f:%[^ ]+]] = OpCompositeConstruct [[uint2]] [[b45f]] [[b67f]]
+// CHECK: [[b891011f:%[^ ]+]] = OpCompositeConstruct [[uint2]] [[b89f]] [[b1011f]]
+// CHECK: [[b12131415f:%[^ ]+]] = OpCompositeConstruct [[uint2]] [[b1213f]] [[b1415f]]
+
+// CHECK: [[cx16:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[c]] [[uint_4]]
+
+// CHECK: [[b0123h:%[^ ]+]] = OpBitcast [[half4]] [[b0123f]]
+// CHECK: [[b0h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 0
+// CHECK: [[b1h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 1
+// CHECK: [[b2h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 2
+// CHECK: [[b3h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 3
+
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[cx16]]
+// CHECK: OpStore [[addr0]] [[b0h]]
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_1]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx1]]
+// CHECK: OpStore [[addr1]] [[b1h]]
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_2]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx2]]
+// CHECK: OpStore [[addr2]] [[b2h]]
+// CHECK: [[idx3:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_3]]
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx3]]
+// CHECK: OpStore [[addr3]] [[b3h]]
+
+// CHECK: [[b4567h:%[^ ]+]] = OpBitcast [[half4]] [[b4567f]]
+// CHECK: [[b4h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 0
+// CHECK: [[b5h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 1
+// CHECK: [[b6h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 2
+// CHECK: [[b7h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 3
+
+// CHECK: [[idx4:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_4]]
+// CHECK: [[addr4:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx4]]
+// CHECK: OpStore [[addr4]] [[b4h]]
+// CHECK: [[idx5:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_5]]
+// CHECK: [[addr5:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx5]]
+// CHECK: OpStore [[addr5]] [[b5h]]
+// CHECK: [[idx6:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_6]]
+// CHECK: [[addr6:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx6]]
+// CHECK: OpStore [[addr6]] [[b6h]]
+// CHECK: [[idx7:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_7]]
+// CHECK: [[addr7:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx7]]
+// CHECK: OpStore [[addr7]] [[b7h]]
+
+// CHECK: [[idx8:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_8]]
+
+// CHECK: [[b891011h:%[^ ]+]] = OpBitcast [[half4]] [[b891011f]]
+// CHECK: [[b8h:%[^ ]+]] = OpCompositeExtract [[half]] [[b891011h]] 0
+// CHECK: [[b9h:%[^ ]+]] = OpCompositeExtract [[half]] [[b891011h]] 1
+// CHECK: [[b10h:%[^ ]+]] = OpCompositeExtract [[half]] [[b891011h]] 2
+// CHECK: [[b11h:%[^ ]+]] = OpCompositeExtract [[half]] [[b891011h]] 3
+
+// CHECK: [[addr8:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx8]]
+// CHECK: OpStore [[addr8]] [[b8h]]
+// CHECK: [[idx9:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_9]]
+// CHECK: [[addr9:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx9]]
+// CHECK: OpStore [[addr9]] [[b9h]]
+// CHECK: [[idx10:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_10]]
+// CHECK: [[addr10:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx10]]
+// CHECK: OpStore [[addr10]] [[b10h]]
+// CHECK: [[idx11:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_11]]
+// CHECK: [[addr11:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx11]]
+// CHECK: OpStore [[addr11]] [[b11h]]
+
+// CHECK: [[b12131415h:%[^ ]+]] = OpBitcast [[half4]] [[b12131415f]]
+// CHECK: [[b12h:%[^ ]+]] = OpCompositeExtract [[half]] [[b12131415h]] 0
+// CHECK: [[b13h:%[^ ]+]] = OpCompositeExtract [[half]] [[b12131415h]] 1
+// CHECK: [[b14h:%[^ ]+]] = OpCompositeExtract [[half]] [[b12131415h]] 2
+// CHECK: [[b15h:%[^ ]+]] = OpCompositeExtract [[half]] [[b12131415h]] 3
+
+// CHECK: [[idx12:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_12]]
+// CHECK: [[addr12:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx12]]
+// CHECK: OpStore [[addr12]] [[b12h]]
+// CHECK: [[idx13:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_13]]
+// CHECK: [[addr13:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx13]]
+// CHECK: OpStore [[addr13]] [[b13h]]
+// CHECK: [[idx14:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_14]]
+// CHECK: [[addr14:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx14]]
+// CHECK: OpStore [[addr14]] [[b14h]]
+// CHECK: [[idx15:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx16]] [[uint_15]]
+// CHECK: [[addr15:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx15]]
+// CHECK: OpStore [[addr15]] [[b15h]]
diff --git a/test/HalfStorage/vstore_half16.ll b/test/HalfStorage/vstore_half16.ll
new file mode 100644
index 0000000..d18dc2c
--- /dev/null
+++ b/test/HalfStorage/vstore_half16.ll
@@ -0,0 +1,45 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @foo(half addrspace(1)* %a, <16 x float> %b, i32 %c) {
+entry:
+  call spir_func void @_Z13vstore_half16Dv16_fjPU3AS1Dh(<16 x float> %b, i32 %c, half addrspace(1)* %a)
+  ret void
+}
+
+declare spir_func void @_Z13vstore_half16Dv16_fjPU3AS1Dh(<16 x float>, i32, half addrspace(1)*)
+
+; CHECK:  [[b0:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK:  [[b1:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK:  [[b2:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 4, i32 5>
+; CHECK:  [[b3:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 6, i32 7>
+; CHECK:  [[b4:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 8, i32 9>
+; CHECK:  [[b5:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 10, i32 11>
+; CHECK:  [[b6:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 12, i32 13>
+; CHECK:  [[b7:%[^ ]+]] = shufflevector <16 x float> %b, <16 x float> undef, <2 x i32> <i32 14, i32 15>
+; CHECK:  [[b0i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b0]])
+; CHECK:  [[b1i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b1]])
+; CHECK:  [[b2i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b2]])
+; CHECK:  [[b3i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b3]])
+; CHECK:  [[b4i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b4]])
+; CHECK:  [[b5i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b5]])
+; CHECK:  [[b6i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b6]])
+; CHECK:  [[b7i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b7]])
+; CHECK:  [[b00:%[^ ]+]] = insertelement <4 x i32> undef, i32 [[b0i32]], i32 0
+; CHECK:  [[b01:%[^ ]+]] = insertelement <4 x i32> [[b00]], i32 [[b1i32]], i32 1
+; CHECK:  [[b02:%[^ ]+]] = insertelement <4 x i32> [[b01]], i32 [[b2i32]], i32 2
+; CHECK:  [[b03:%[^ ]+]] = insertelement <4 x i32> [[b02]], i32 [[b3i32]], i32 3
+; CHECK:  [[b10:%[^ ]+]] = insertelement <4 x i32> undef, i32 [[b4i32]], i32 0
+; CHECK:  [[b11:%[^ ]+]] = insertelement <4 x i32> [[b10]], i32 [[b5i32]], i32 1
+; CHECK:  [[b12:%[^ ]+]] = insertelement <4 x i32> [[b11]], i32 [[b6i32]], i32 2
+; CHECK:  [[b13:%[^ ]+]] = insertelement <4 x i32> [[b12]], i32 [[b7i32]], i32 3
+; CHECK:  [[av4i32:%[^ ]+]] = bitcast half addrspace(1)* %a to <4 x i32> addrspace(1)*
+; CHECK:  [[cx2:%[^ ]+]] = shl i32 %c, 1
+; CHECK:  [[gep0:%[^ ]+]] = getelementptr <4 x i32>, <4 x i32> addrspace(1)* [[av4i32]], i32 [[cx2]]
+; CHECK:  store <4 x i32> [[b03]], <4 x i32> addrspace(1)* [[gep0]], align 16
+; CHECK:  [[cx2p1:%[^ ]+]] = add i32 [[cx2]], 1
+; CHECK:  [[gep1:%[^ ]+]] = getelementptr <4 x i32>, <4 x i32> addrspace(1)* [[av4i32]], i32 [[cx2p1]]
+; CHECK:  store <4 x i32> [[b13]], <4 x i32> addrspace(1)* [[gep1]], align 16
diff --git a/test/HalfStorage/vstore_half2.cl b/test/HalfStorage/vstore_half2.cl
new file mode 100644
index 0000000..55d3d76
--- /dev/null
+++ b/test/HalfStorage/vstore_half2.cl
@@ -0,0 +1,35 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, float2 b, int c) {
+    vstore_half2(b, c, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[ushort:%[^ ]+]] = OpTypeInt 16 0
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint16:%[^ ]+]] = OpConstant [[uint]] 16
+// CHECK-DAG: [[uint0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint1:%[^ ]+]] = OpConstant [[uint]] 1
+
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[float2]] {{.*}} 0
+// CHECK: [[c:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 1
+
+// CHECK: [[vali32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b]]
+// CHECK: [[idx0:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[c]] [[uint1]]
+// CHECK: [[val1i16:%[^ ]+]] = OpUConvert [[ushort]] [[vali32]]
+// CHECK: [[val2i32:%[^ ]+]] = OpShiftRightLogical [[uint]] [[vali32]] [[uint16]]
+// CHECK: [[val2i16:%[^ ]+]] = OpUConvert [[ushort]] [[val2i32]]
+
+// CHECK: [[val1:%[^ ]+]] = OpBitcast [[half]] [[val1i16]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[idx0]]
+// CHECK: OpStore [[addr1]] [[val1]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[idx0]] [[uint1]]
+// CHECK: [[val2:%[^ ]+]] = OpBitcast [[half]] [[val2i16]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[idx1]]
+// CHECK: OpStore [[addr2]] [[val2]]
diff --git a/test/HalfStorage/vstore_half2.ll b/test/HalfStorage/vstore_half2.ll
new file mode 100644
index 0000000..93a16ac
--- /dev/null
+++ b/test/HalfStorage/vstore_half2.ll
@@ -0,0 +1,19 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @foo(half addrspace(1)* %a, <2 x float> %b, i32 %c) {
+entry:
+  call spir_func void @_Z12vstore_half2Dv2_fjPU3AS1Dh(<2 x float> %b, i32 %c, half addrspace(1)* %a)
+  ret void
+}
+
+declare spir_func void @_Z12vstore_half2Dv2_fjPU3AS1Dh(<2 x float>, i32, half addrspace(1)*)
+
+
+; CHECK: [[half2:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> %b)
+; CHECK: [[a_cast_i32:%[^ ]+]] = bitcast half addrspace(1)* %a to i32 addrspace(1)*
+; CHECK: [[gep:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[a_cast_i32]], i32 %c
+; CHECK: store i32 [[half2]], i32 addrspace(1)* [[gep]], align 4
diff --git a/test/HalfStorage/vstore_half3.cl b/test/HalfStorage/vstore_half3.cl
new file mode 100644
index 0000000..026bdf1
--- /dev/null
+++ b/test/HalfStorage/vstore_half3.cl
@@ -0,0 +1,49 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, float3 b, int c) {
+    vstore_half3(b, c, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[float4:%[^ ]+]] = OpTypeVector [[float]] 4
+// CHECK-DAG: [[ushort:%[^ ]+]] = OpTypeInt 16 0
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint3:%[^ ]+]] = OpConstant [[uint]] 3
+
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[float4]] {{.*}} 0
+// CHECK: [[c:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 1
+
+// CHECK: [[val1_2f32:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 0 4294967295
+// CHECK: [[val2_2f32:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 1 4294967295
+// CHECK: [[val3_2f32:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 2 4294967295
+
+// CHECK: [[val1i32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val1_2f32]]
+// CHECK: [[val1i16:%[^ ]+]] = OpUConvert [[ushort]] [[val1i32]]
+// CHECK: [[val2i32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val2_2f32]]
+// CHECK: [[val2i16:%[^ ]+]] = OpUConvert [[ushort]] [[val2i32]]
+// CHECK: [[val3i32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val3_2f32]]
+// CHECK: [[val3i16:%[^ ]+]] = OpUConvert [[ushort]] [[val3i32]]
+
+// CHECK: [[cx3:%[^ ]+]] = OpIMul [[uint]] [[c]] [[uint3]]
+
+// CHECK: [[val1:%[^ ]+]] = OpBitcast [[half]] [[val1i16]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[cx3]]
+// CHECK: OpStore [[addr1]] [[val1]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpIAdd [[uint]] [[cx3]] [[uint1]]
+// CHECK: [[val2:%[^ ]+]] = OpBitcast [[half]] [[val2i16]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[idx1]]
+// CHECK: OpStore [[addr2]] [[val2]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpIAdd [[uint]] [[cx3]] [[uint2]]
+// CHECK: [[val3:%[^ ]+]] = OpBitcast [[half]] [[val3i16]]
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[idx2]]
+// CHECK: OpStore [[addr3]] [[val3]]
diff --git a/test/HalfStorage/vstore_half3.ll b/test/HalfStorage/vstore_half3.ll
new file mode 100644
index 0000000..7745e4f
--- /dev/null
+++ b/test/HalfStorage/vstore_half3.ll
@@ -0,0 +1,38 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @foo(half addrspace(1)* %a, <3 x float> %b, i32 %c) {
+entry:
+  call spir_func void @_Z12vstore_half3Dv3_fjPU3AS1Dh(<3 x float> %b, i32 %c, half addrspace(1)* %a)
+  ret void
+}
+
+declare spir_func void @_Z12vstore_half3Dv3_fjPU3AS1Dh(<3 x float>, i32, half addrspace(1)*)
+
+
+; CHECK:  [[b0:%[^ ]+]] = extractelement <3 x float> %b, i32 0
+; CHECK:  [[b0f2:%[^ ]+]] = insertelement <2 x float> undef, float [[b0]], i32 0
+; CHECK:  [[b1:%[^ ]+]] = extractelement <3 x float> %b, i32 1
+; CHECK:  [[b1f2:%[^ ]+]] = insertelement <2 x float> undef, float [[b1]], i32 0
+; CHECK:  [[b2:%[^ ]+]] = extractelement <3 x float> %b, i32 2
+; CHECK:  [[b2f2:%[^ ]+]] = insertelement <2 x float> undef, float [[b2]], i32 0
+; CHECK:  [[b0i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b0f2]])
+; CHECK:  [[b0i16:%[^ ]+]] = trunc i32 [[b0i32]] to i16
+; CHECK:  [[b1i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b1f2]])
+; CHECK:  [[b1i16:%[^ ]+]] = trunc i32 [[b1i32]] to i16
+; CHECK:  [[b2i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b2f2]])
+; CHECK:  [[b2i16:%[^ ]+]] = trunc i32 [[b2i32]] to i16
+; CHECK:  [[ai16:%[^ ]+]] = bitcast half addrspace(1)* %a to i16 addrspace(1)*
+; CHECK:  [[cx2:%[^ ]+]] = shl i32 %c, 1
+; CHECK:  [[cx3:%[^ ]+]] = add i32 [[cx2]], %c
+; CHECK:  [[gep0:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[cx3]]
+; CHECK:  store i16 [[b0i16]], i16 addrspace(1)* [[gep0]], align 2
+; CHECK:  [[idx1:%[^ ]+]] = add i32 [[cx3]], 1
+; CHECK:  [[gep1:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[idx1]]
+; CHECK:  store i16 [[b1i16]], i16 addrspace(1)* [[gep1]], align 2
+; CHECK:  [[idx2:%[^ ]+]] = add i32 [[idx1]], 1
+; CHECK:  [[gep2:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[idx2]]
+; CHECK:  store i16 [[b2i16]], i16 addrspace(1)* [[gep2]], align 2
diff --git a/test/HalfStorage/vstore_half4.cl b/test/HalfStorage/vstore_half4.cl
new file mode 100644
index 0000000..ce73a35
--- /dev/null
+++ b/test/HalfStorage/vstore_half4.cl
@@ -0,0 +1,53 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, float4 b, int c) {
+    vstore_half4(b, c, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[half4:%[^ ]+]] = OpTypeVector [[half]] 4
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[float4:%[^ ]+]] = OpTypeVector [[float]] 4
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint2:%[^ ]+]] = OpTypeVector [[uint]] 2
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[float4]] {{.*}} 0
+// CHECK: [[c:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 1
+
+// CHECK: [[val_float01:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 0 1
+// CHECK: [[val_float23:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 2 3
+
+// CHECK: [[val1_int:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val_float01]]
+// CHECK: [[val2_int:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val_float23]]
+
+// CHECK: [[val_int2:%[^ ]+]] = OpCompositeConstruct [[uint2]] [[val1_int]] [[val2_int]]
+// CHECK: [[cx4:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[c]] [[uint_2]]
+// CHECK: [[val_half4:%[^ ]+]] = OpBitcast [[half4]] [[val_int2]]
+
+// CHECK: [[val1:%[^ ]+]] = OpCompositeExtract [[half]] [[val_half4]] 0
+// CHECK: [[val2:%[^ ]+]] = OpCompositeExtract [[half]] [[val_half4]] 1
+// CHECK: [[val3:%[^ ]+]] = OpCompositeExtract [[half]] [[val_half4]] 2
+// CHECK: [[val4:%[^ ]+]] = OpCompositeExtract [[half]] [[val_half4]] 3
+
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[cx4]]
+// CHECK: OpStore [[addr1]] [[val1]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx4]] [[uint_1]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx1]]
+// CHECK: OpStore [[addr2]] [[val2]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx4]] [[uint_2]]
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx2]]
+// CHECK: OpStore [[addr3]] [[val3]]
+
+// CHECK: [[idx3:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx4]] [[uint_3]]
+// CHECK: [[addr4:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx3]]
+// CHECK: OpStore [[addr4]] [[val4]]
diff --git a/test/HalfStorage/vstore_half4.ll b/test/HalfStorage/vstore_half4.ll
new file mode 100644
index 0000000..ae913b4
--- /dev/null
+++ b/test/HalfStorage/vstore_half4.ll
@@ -0,0 +1,23 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @foo(half addrspace(1)* %a, <4 x float> %b, i32 %c) {
+entry:
+  call spir_func void @_Z12vstore_half4Dv4_fjPU3AS1Dh(<4 x float> %b, i32 %c, half addrspace(1)* %a)
+  ret void
+}
+
+declare spir_func void @_Z12vstore_half4Dv4_fjPU3AS1Dh(<4 x float>, i32, half addrspace(1)*)
+
+; CHECK:  [[b01:%[^ ]+]] = shufflevector <4 x float> %b, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK:  [[b23:%[^ ]+]] = shufflevector <4 x float> %b, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK:  [[b01i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b01]])
+; CHECK:  [[b23i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b23]])
+; CHECK:  [[b01v2i32:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[b01i32]], i32 0
+; CHECK:  [[b0123v2i32:%[^ ]+]] = insertelement <2 x i32> [[b01v2i32]], i32 [[b23i32]], i32 1
+; CHECK:  [[av2i32:%[^ ]+]] = bitcast half addrspace(1)* %a to <2 x i32> addrspace(1)*
+; CHECK:  [[gep:%[^ ]+]] = getelementptr <2 x i32>, <2 x i32> addrspace(1)* [[av2i32]], i32 %c
+; CHECK:  store <2 x i32> [[b0123v2i32]], <2 x i32> addrspace(1)* [[gep]], align 8
diff --git a/test/HalfStorage/vstore_half8.cl b/test/HalfStorage/vstore_half8.cl
new file mode 100644
index 0000000..0abad79
--- /dev/null
+++ b/test/HalfStorage/vstore_half8.cl
@@ -0,0 +1,88 @@
+// RUN: clspv %s -o %t.spv -long-vector
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, float8 b, int c) {
+    vstore_half8(b, c, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[half4:%[^ ]+]] = OpTypeVector [[half]] 4
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint2:%[^ ]+]] = OpTypeVector [[uint]] 2
+// CHECK-DAG: [[uint_0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint_1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint_2:%[^ ]+]] = OpConstant [[uint]] 2
+// CHECK-DAG: [[uint_3:%[^ ]+]] = OpConstant [[uint]] 3
+// CHECK-DAG: [[uint_4:%[^ ]+]] = OpConstant [[uint]] 4
+// CHECK-DAG: [[uint_5:%[^ ]+]] = OpConstant [[uint]] 5
+// CHECK-DAG: [[uint_6:%[^ ]+]] = OpConstant [[uint]] 6
+// CHECK-DAG: [[uint_7:%[^ ]+]] = OpConstant [[uint]] 7
+
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract {{.*}} {{.*}} 0
+
+// CHECK: [[b0:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 0
+// CHECK: [[b1:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 1
+// CHECK: [[b2:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 2
+// CHECK: [[b3:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 3
+// CHECK: [[b4:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 4
+// CHECK: [[b5:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 5
+// CHECK: [[b6:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 6
+// CHECK: [[b7:%[^ ]+]] = OpCompositeExtract [[float]] {{.*}} 7
+
+// CHECK: [[c:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 1
+
+// CHECK: [[b01:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b0]] [[b1]]
+// CHECK: [[b23:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b2]] [[b3]]
+// CHECK: [[b45:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b4]] [[b5]]
+// CHECK: [[b67:%[^ ]+]] = OpCompositeConstruct [[float2]] [[b6]] [[b7]]
+
+// CHECK: [[b01f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b01]]
+// CHECK: [[b23f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b23]]
+// CHECK: [[b45f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b45]]
+// CHECK: [[b67f:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[b67]]
+
+// CHECK: [[b0123f:%[^ ]+]] = OpCompositeConstruct [[uint2]] [[b01f]] [[b23f]]
+// CHECK: [[b4567f:%[^ ]+]] = OpCompositeConstruct [[uint2]] [[b45f]] [[b67f]]
+
+// CHECK: [[cx8:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[c]] [[uint_3]]
+
+// CHECK: [[b0123h:%[^ ]+]] = OpBitcast [[half4]] [[b0123f]]
+// CHECK: [[b0h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 0
+// CHECK: [[b1h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 1
+// CHECK: [[b2h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 2
+// CHECK: [[b3h:%[^ ]+]] = OpCompositeExtract [[half]] [[b0123h]] 3
+
+// CHECK: [[addr0:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[cx8]]
+// CHECK: OpStore [[addr0]] [[b0h]]
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx8]] [[uint_1]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx1]]
+// CHECK: OpStore [[addr1]] [[b1h]]
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx8]] [[uint_2]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx2]]
+// CHECK: OpStore [[addr2]] [[b2h]]
+// CHECK: [[idx3:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx8]] [[uint_3]]
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx3]]
+// CHECK: OpStore [[addr3]] [[b3h]]
+
+// CHECK: [[b4567h:%[^ ]+]] = OpBitcast [[half4]] [[b4567f]]
+// CHECK: [[b4h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 0
+// CHECK: [[b5h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 1
+// CHECK: [[b6h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 2
+// CHECK: [[b7h:%[^ ]+]] = OpCompositeExtract [[half]] [[b4567h]] 3
+
+// CHECK: [[idx4:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx8]] [[uint_4]]
+// CHECK: [[addr4:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx4]]
+// CHECK: OpStore [[addr4]] [[b4h]]
+// CHECK: [[idx5:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx8]] [[uint_5]]
+// CHECK: [[addr5:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx5]]
+// CHECK: OpStore [[addr5]] [[b5h]]
+// CHECK: [[idx6:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx8]] [[uint_6]]
+// CHECK: [[addr6:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx6]]
+// CHECK: OpStore [[addr6]] [[b6h]]
+// CHECK: [[idx7:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx8]] [[uint_7]]
+// CHECK: [[addr7:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint_0]] [[idx7]]
+// CHECK: OpStore [[addr7]] [[b7h]]
diff --git a/test/HalfStorage/vstore_half8.ll b/test/HalfStorage/vstore_half8.ll
new file mode 100644
index 0000000..0c3ec72
--- /dev/null
+++ b/test/HalfStorage/vstore_half8.ll
@@ -0,0 +1,29 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @foo(half addrspace(1)* %a, <8 x float> %b, i32 %c) {
+entry:
+  call spir_func void @_Z12vstore_half8Dv8_fjPU3AS1Dh(<8 x float> %b, i32 %c, half addrspace(1)* %a)
+  ret void
+}
+
+declare spir_func void @_Z12vstore_half8Dv8_fjPU3AS1Dh(<8 x float>, i32, half addrspace(1)*)
+
+; CHECK:  [[b01:%[^ ]+]] = shufflevector <8 x float> %b, <8 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK:  [[b23:%[^ ]+]] = shufflevector <8 x float> %b, <8 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK:  [[b45:%[^ ]+]] = shufflevector <8 x float> %b, <8 x float> undef, <2 x i32> <i32 4, i32 5>
+; CHECK:  [[b67:%[^ ]+]] = shufflevector <8 x float> %b, <8 x float> undef, <2 x i32> <i32 6, i32 7>
+; CHECK:  [[b01i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b01]])
+; CHECK:  [[b23i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b23]])
+; CHECK:  [[b45i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b45]])
+; CHECK:  [[b67i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b67]])
+; CHECK:  [[bv0:%[^ ]+]] = insertelement <4 x i32> undef, i32 [[b01i32]], i32 0
+; CHECK:  [[bv1:%[^ ]+]] = insertelement <4 x i32> [[bv0]], i32 [[b23i32]], i32 1
+; CHECK:  [[bv2:%[^ ]+]] = insertelement <4 x i32> [[bv1]], i32 [[b45i32]], i32 2
+; CHECK:  [[bv3:%[^ ]+]] = insertelement <4 x i32> [[bv2]], i32 [[b67i32]], i32 3
+; CHECK:  [[av4i32:%[^ ]+]] = bitcast half addrspace(1)* %a to <4 x i32> addrspace(1)*
+; CHECK:  [[gep:%[^ ]+]] = getelementptr <4 x i32>, <4 x i32> addrspace(1)* [[av4i32]], i32 %c
+; CHECK:  store <4 x i32> [[bv3]], <4 x i32> addrspace(1)* [[gep]], align 16
diff --git a/test/HalfStorage/vstorea_half3.cl b/test/HalfStorage/vstorea_half3.cl
new file mode 100644
index 0000000..5f0b5ed
--- /dev/null
+++ b/test/HalfStorage/vstorea_half3.cl
@@ -0,0 +1,48 @@
+// RUN: clspv %s -o %t.spv
+// RUN: spirv-dis -o %t2.spvasm %t.spv
+// RUN: FileCheck %s < %t2.spvasm
+// RUN: spirv-val --target-env vulkan1.0 %t.spv
+
+__kernel void test(__global half *a, float3 b, int c) {
+    vstorea_half3(b, c, a);
+}
+
+// CHECK-DAG: [[half:%[^ ]+]] = OpTypeFloat 16
+// CHECK-DAG: [[float:%[^ ]+]] = OpTypeFloat 32
+// CHECK-DAG: [[float2:%[^ ]+]] = OpTypeVector [[float]] 2
+// CHECK-DAG: [[float4:%[^ ]+]] = OpTypeVector [[float]] 4
+// CHECK-DAG: [[ushort:%[^ ]+]] = OpTypeInt 16 0
+// CHECK-DAG: [[uint:%[^ ]+]] = OpTypeInt 32 0
+// CHECK-DAG: [[uint0:%[^ ]+]] = OpConstant [[uint]] 0
+// CHECK-DAG: [[uint1:%[^ ]+]] = OpConstant [[uint]] 1
+// CHECK-DAG: [[uint2:%[^ ]+]] = OpConstant [[uint]] 2
+
+// CHECK: [[b:%[^ ]+]] = OpCompositeExtract [[float4]] {{.*}} 0
+// CHECK: [[c:%[^ ]+]] = OpCompositeExtract [[uint]] {{.*}} 1
+
+// CHECK: [[val1_2f32:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 0 4294967295
+// CHECK: [[val2_2f32:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 1 4294967295
+// CHECK: [[val3_2f32:%[^ ]+]] = OpVectorShuffle [[float2]] [[b]] {{.*}} 2 4294967295
+
+// CHECK: [[val1i32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val1_2f32]]
+// CHECK: [[val1i16:%[^ ]+]] = OpUConvert [[ushort]] [[val1i32]]
+// CHECK: [[val2i32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val2_2f32]]
+// CHECK: [[val2i16:%[^ ]+]] = OpUConvert [[ushort]] [[val2i32]]
+// CHECK: [[val3i32:%[^ ]+]] = OpExtInst [[uint]] {{.*}} PackHalf2x16 [[val3_2f32]]
+// CHECK: [[val3i16:%[^ ]+]] = OpUConvert [[ushort]] [[val3i32]]
+
+// CHECK: [[cx4:%[^ ]+]] = OpShiftLeftLogical [[uint]] [[c]] [[uint2]]
+
+// CHECK: [[val1:%[^ ]+]] = OpBitcast [[half]] [[val1i16]]
+// CHECK: [[addr1:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[cx4]]
+// CHECK: OpStore [[addr1]] [[val1]]
+
+// CHECK: [[idx1:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx4]] [[uint1]]
+// CHECK: [[val2:%[^ ]+]] = OpBitcast [[half]] [[val2i16]]
+// CHECK: [[addr2:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[idx1]]
+// CHECK: OpStore [[addr2]] [[val2]]
+
+// CHECK: [[idx2:%[^ ]+]] = OpBitwiseOr [[uint]] [[cx4]] [[uint2]]
+// CHECK: [[val3:%[^ ]+]] = OpBitcast [[half]] [[val3i16]]
+// CHECK: [[addr3:%[^ ]+]] = OpAccessChain %{{.*}} %{{.*}} [[uint0]] [[idx2]]
+// CHECK: OpStore [[addr3]] [[val3]]
diff --git a/test/HalfStorage/vstorea_half3.ll b/test/HalfStorage/vstorea_half3.ll
new file mode 100644
index 0000000..edd6c6e
--- /dev/null
+++ b/test/HalfStorage/vstorea_half3.ll
@@ -0,0 +1,37 @@
+; RUN: clspv-opt %s -o %t --passes=replace-opencl-builtin
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @foo(half addrspace(1)* %a, <3 x float> %b, i32 %c) {
+entry:
+  call spir_func void @_Z13vstorea_half3Dv3_fjPU3AS1Dh(<3 x float> %b, i32 %c, half addrspace(1)* %a)
+  ret void
+}
+
+declare spir_func void @_Z13vstorea_half3Dv3_fjPU3AS1Dh(<3 x float>, i32, half addrspace(1)*)
+
+
+; CHECK:  [[b0:%[^ ]+]] = extractelement <3 x float> %b, i32 0
+; CHECK:  [[b0f2:%[^ ]+]] = insertelement <2 x float> undef, float [[b0]], i32 0
+; CHECK:  [[b1:%[^ ]+]] = extractelement <3 x float> %b, i32 1
+; CHECK:  [[b1f2:%[^ ]+]] = insertelement <2 x float> undef, float [[b1]], i32 0
+; CHECK:  [[b2:%[^ ]+]] = extractelement <3 x float> %b, i32 2
+; CHECK:  [[b2f2:%[^ ]+]] = insertelement <2 x float> undef, float [[b2]], i32 0
+; CHECK:  [[b0i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b0f2]])
+; CHECK:  [[b0i16:%[^ ]+]] = trunc i32 [[b0i32]] to i16
+; CHECK:  [[b1i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b1f2]])
+; CHECK:  [[b1i16:%[^ ]+]] = trunc i32 [[b1i32]] to i16
+; CHECK:  [[b2i32:%[^ ]+]] = call i32 @_Z16spirv.pack.v2f16(<2 x float> [[b2f2]])
+; CHECK:  [[b2i16:%[^ ]+]] = trunc i32 [[b2i32]] to i16
+; CHECK:  [[ai16:%[^ ]+]] = bitcast half addrspace(1)* %a to i16 addrspace(1)*
+; CHECK:  [[cx4:%[^ ]+]] = shl i32 %c, 2
+; CHECK:  [[gep0:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[cx4]]
+; CHECK:  store i16 [[b0i16]], i16 addrspace(1)* [[gep0]], align 2
+; CHECK:  [[idx1:%[^ ]+]] = add i32 [[cx4]], 1
+; CHECK:  [[gep1:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[idx1]]
+; CHECK:  store i16 [[b1i16]], i16 addrspace(1)* [[gep1]], align 2
+; CHECK:  [[idx2:%[^ ]+]] = add i32 [[idx1]], 1
+; CHECK:  [[gep2:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* [[ai16]], i32 [[idx2]]
+; CHECK:  store i16 [[b2i16]], i16 addrspace(1)* [[gep2]], align 2
diff --git a/test/PointerCasts/store_cast_half_to_int.ll b/test/PointerCasts/store_cast_half_to_int.ll
new file mode 100644
index 0000000..4a02efe
--- /dev/null
+++ b/test/PointerCasts/store_cast_half_to_int.ll
@@ -0,0 +1,23 @@
+; RUN: clspv-opt %s -o %t --passes=replace-pointer-bitcast
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+; CHECK: [[lshr0:%[a-zA-Z0-9_.]+]] = lshr i32 %s, 0
+; CHECK: [[trunc0:%[a-zA-Z0-9_.]+]] = trunc i32 [[lshr0]] to i16
+; CHECK: [[cast0:%[a-zA-Z0-9_.]+]] = bitcast i16 [[trunc0]] to half
+; CHECK: [[lshr1:%[a-zA-Z0-9_.]+]] = lshr i32 %s, 16
+; CHECK: [[trunc1:%[a-zA-Z0-9_.]+]] = trunc i32 [[lshr1]] to i16
+; CHECK: [[cast1:%[a-zA-Z0-9_.]+]] = bitcast i16 [[trunc1]] to half
+; CHECK: [[gep0:%[a-zA-Z0-9_.]+]] = getelementptr half, half addrspace(1)* %a, i32 0
+; CHECK: store half [[cast0]], half addrspace(1)* [[gep0]]
+; CHECK: [[gep1:%[a-zA-Z0-9_.]+]] = getelementptr half, half addrspace(1)* %a, i32 1
+; CHECK: store half [[cast1]], half addrspace(1)* [[gep1]]
+define spir_kernel void @foo(half addrspace(1)* %a, i32 %s) {
+entry:
+  %0 = bitcast half addrspace(1)* %a to i32 addrspace(1)*
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0
+  store i32 %s, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
diff --git a/test/PointerCasts/store_cast_short_to_int4.ll b/test/PointerCasts/store_cast_short_to_int4.ll
new file mode 100644
index 0000000..b34afe2
--- /dev/null
+++ b/test/PointerCasts/store_cast_short_to_int4.ll
@@ -0,0 +1,52 @@
+; RUN: clspv-opt %s -o %t --passes=replace-pointer-bitcast
+; RUN: FileCheck %s < %t
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+; CHECK: [[shl:%[^ ]+]] = shl i32 %i, 3
+; CHECK: [[shuffle:%[^ ]+]] = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+; CHECK: [[bitcast:%[^ ]+]] = bitcast <2 x i32> [[shuffle]] to <4 x i16>
+; CHECK: [[elem0:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 0
+; CHECK: [[elem1:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 1
+; CHECK: [[elem2:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 2
+; CHECK: [[elem3:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 3
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[shl]]
+; CHECK: store i16 [[elem0]], i16 addrspace(1)* [[gep]], align 2
+; CHECK: [[add:%[^ ]+]] = add i32 [[shl]], 1
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[add]]
+; CHECK: store i16 [[elem1]], i16 addrspace(1)* [[gep]], align 2
+; CHECK: [[add2:%[^ ]+]] = add i32 [[add]], 1
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[add2]]
+; CHECK: store i16 [[elem2]], i16 addrspace(1)* [[gep]], align 2
+; CHECK: [[add3:%[^ ]+]] = add i32 [[add2]], 1
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[add3]]
+; CHECK: store i16 [[elem3]], i16 addrspace(1)* [[gep]], align 2
+
+; CHECK: [[shuffle:%[^ ]+]] = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK: [[bitcast:%[^ ]+]] = bitcast <2 x i32> [[shuffle]] to <4 x i16>
+; CHECK: [[elem0:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 0
+; CHECK: [[elem1:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 1
+; CHECK: [[elem2:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 2
+; CHECK: [[elem3:%[^ ]+]] = extractelement <4 x i16> [[bitcast]], i32 3
+; CHECK: [[add:%[^ ]+]] = add i32 [[shl]], 4
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[add]]
+; CHECK: store i16 [[elem0]], i16 addrspace(1)* [[gep]], align 2
+; CHECK: [[add1:%[^ ]+]] = add i32 [[add]], 1
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[add1]]
+; CHECK: store i16 [[elem1]], i16 addrspace(1)* [[gep]], align 2
+; CHECK: [[add2:%[^ ]+]] = add i32 [[add1]], 1
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[add2]]
+; CHECK: store i16 [[elem2]], i16 addrspace(1)* [[gep]], align 2
+; CHECK: [[add3:%[^ ]+]] = add i32 [[add2]], 1
+; CHECK: [[gep:%[^ ]+]] = getelementptr i16, i16 addrspace(1)* %a, i32 [[add3]]
+; CHECK: store i16 [[elem3]], i16 addrspace(1)* [[gep]], align 2
+
+define spir_kernel void @foo(i16 addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %i) {
+entry:
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
+  %1 = bitcast i16 addrspace(1)* %a to <4 x i32> addrspace(1)*
+  %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %1, i32 %i
+  store <4 x i32> %0, <4 x i32> addrspace(1)* %arrayidx, align 16
+  ret void
+}