cborinternal_p.h: Fix AVX2 build with MSVC

MSVC (and, I think, ICC too) lacks the simpler scalar intrinsics
(_cvtss_sh and _cvtsh_ss) that convert from single precision to half
precision and back. Instead, we need to use the packed-data intrinsics
(_mm_cvtps_ph and _mm_cvtph_ps).

Fixes #192.

Signed-off-by: Thiago Macieira <thiago.macieira@intel.com>
diff --git a/.appveyor.yml b/.appveyor.yml
index 453fa12..eb70441 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -13,12 +13,12 @@
 
     if /i "%APPVEYOR_BUILD_WORKER_IMAGE%"=="Visual Studio 2017" (call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64) & (set QTDIR=C:\Qt\5.12\msvc2017_64)
 
-    if /i "%APPVEYOR_BUILD_WORKER_IMAGE%"=="Visual Studio 2019" (call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64) & (set QTDIR=C:\Qt\5.15\msvc2019_64)
+    if /i "%APPVEYOR_BUILD_WORKER_IMAGE%"=="Visual Studio 2019" (call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64) & (set QTDIR=C:\Qt\5.15\msvc2019_64) & set CFLAGS=/arch:AVX2
 
     set path=%PATH%;%QTDIR%\bin
 build_script:
 - cmd: >-
-    nmake -f Makefile.nmake -nologo CFLAGS="-W3 -Os -MDd"
+    nmake -f Makefile.nmake -nologo CFLAGS="%CFLAGS% -W3 -Os -MDd"
 
     cd tests
 
diff --git a/src/cborinternal_p.h b/src/cborinternal_p.h
index a85a929..35b7a79 100644
--- a/src/cborinternal_p.h
+++ b/src/cborinternal_p.h
@@ -37,15 +37,17 @@
 #endif
 
 #ifndef CBOR_NO_HALF_FLOAT_TYPE
-#  ifdef __F16C__
+#  if defined(__F16C__) || defined(__AVX2__)
 #    include <immintrin.h>
-static inline unsigned short encode_half(double val)
+static inline unsigned short encode_half(float val)
 {
-    return _cvtss_sh((float)val, 3);
+    __m128i m = _mm_cvtps_ph(_mm_set_ss(val), _MM_FROUND_CUR_DIRECTION);
+    return _mm_extract_epi16(m, 0);
 }
-static inline double decode_half(unsigned short half)
+static inline float decode_half(unsigned short half)
 {
-    return _cvtsh_ss(half);
+    __m128i m = _mm_cvtsi32_si128(half);
+    return _mm_cvtss_f32(_mm_cvtph_ps(m));
 }
 #  else
 /* software implementation of float-to-fp16 conversions */