QS16->QS8 VCVT for SSE2: replace shufd with unpckldq/unpckhdq to zero-extend int32 for PMULUDQ
PiperOrigin-RevId: 508574443
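
The SSE2 QS16->QS8 VCVT kernels previously built the even/odd operands for PMULUDQ with SHUFD, then re-interleaved the 64-bit fixed-point results with a right shift on one half, a left shift on the other, and a (3, 1, 2, 0) SHUFPS. The new code instead unpacks the zero-extended inputs against zero, so each 64-bit lane holds a single uint32, both halves take the same 16-bit right shift, and a (2, 0, 2, 0) SHUFPS recombines them. A minimal standalone sketch of the new layout under stated assumptions (the helper name is hypothetical; vmultiplier must hold the 32-bit multiplier in its even lanes, since PMULUDQ reads only lanes 0 and 2; vbias is a replicated 64-bit rounding bias):

    #include <emmintrin.h>  // SSE2

    // Hypothetical helper, not part of XNNPACK: given four uint32 values in
    // vu, return bits [47:16] of each (u * multiplier + bias) product, in
    // the original lane order.
    static inline __m128i xnn_sketch_scale_q16(__m128i vu, __m128i vmultiplier, __m128i vbias)
    {
      const __m128i vzero = _mm_setzero_si128();
      __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);  // { u0, u1 } zero-extended to uint64
      __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);  // { u2, u3 } zero-extended to uint64
      vacclo = _mm_mul_epu32(vacclo, vmultiplier);     // 32x32 -> 64-bit products (even lanes)
      vacchi = _mm_mul_epu32(vacchi, vmultiplier);
      vacclo = _mm_add_epi64(vacclo, vbias);
      vacchi = _mm_add_epi64(vacchi, vbias);
      vacclo = _mm_srli_epi64(vacclo, 16);             // identical shift on both halves...
      vacchi = _mm_srli_epi64(vacchi, 16);
      return _mm_castps_si128(_mm_shuffle_ps(          // ...so one SHUFPS re-interleaves them
          _mm_castsi128_ps(vacclo), _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
    }

Compared with the old SHUFD form, which replicated the upper pair into an "odd" vector and needed the asymmetric left/right shifts before a (3, 1, 2, 0) SHUFPS, both halves here follow the same instruction sequence, which is what lets the template emit one uniform shift per accumulator.
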
diff --git a/src/amalgam/gen/avx.c b/src/amalgam/gen/avx.c
index 2bd1124..c7a6071 100644
--- a/src/amalgam/gen/avx.c
+++ b/src/amalgam/gen/avx.c
@@ -9512,18 +9512,17 @@
__m128i vy = _mm_packs_epi16(vacc, vacc);
if (batch & (4 * sizeof(int8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(int8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(int8_t))) {
- *output = (int8_t) vy_lo;
+ *output = (int8_t) _mm_extract_epi8(vy, 0);
}
}
}
@@ -12270,18 +12269,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/amalgam/gen/avx2.c b/src/amalgam/gen/avx2.c
index 352df39..7b540db 100644
--- a/src/amalgam/gen/avx2.c
+++ b/src/amalgam/gen/avx2.c
@@ -5927,18 +5927,17 @@
output += 8;
}
if (batch & (4 * sizeof(int8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(int8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(int8_t))) {
- *output = (int8_t) vy_lo;
+ *output = (int8_t) _mm_extract_epi8(vy, 0);
}
}
}
@@ -7927,18 +7926,17 @@
output += 8;
}
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/amalgam/gen/sse41.c b/src/amalgam/gen/sse41.c
index 94fed0b..4b57b10 100644
--- a/src/amalgam/gen/sse41.c
+++ b/src/amalgam/gen/sse41.c
@@ -5348,18 +5348,17 @@
__m128i vy = _mm_packs_epi16(vacc, vacc);
if (batch & (4 * sizeof(int8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(int8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(int8_t))) {
- *output = (int8_t) vy_lo;
+ *output = (int8_t) _mm_extract_epi8(vy, 0);
}
}
}
@@ -8283,18 +8282,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x16.c b/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x16.c
index c052483..ee1c51a 100644
--- a/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x16.c
+++ b/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x16.c
@@ -44,54 +44,54 @@
const __m128i vu2 = _mm_unpacklo_epi16(vx2, vzero);
const __m128i vu3 = _mm_unpackhi_epi16(vx2, vzero);
- __m128i vacco0 = _mm_shuffle_epi32(vu0, _MM_SHUFFLE(3, 3, 2, 2)); // high
- __m128i vacce0 = _mm_shuffle_epi32(vu0, _MM_SHUFFLE(3, 1, 2, 0)); // low
- __m128i vacco1 = _mm_shuffle_epi32(vu1, _MM_SHUFFLE(3, 3, 2, 2)); // high
- __m128i vacce1 = _mm_shuffle_epi32(vu1, _MM_SHUFFLE(3, 1, 2, 0)); // low
- __m128i vacco2 = _mm_shuffle_epi32(vu2, _MM_SHUFFLE(3, 3, 2, 2)); // high
- __m128i vacce2 = _mm_shuffle_epi32(vu2, _MM_SHUFFLE(3, 1, 2, 0)); // low
- __m128i vacco3 = _mm_shuffle_epi32(vu3, _MM_SHUFFLE(3, 3, 2, 2)); // high
- __m128i vacce3 = _mm_shuffle_epi32(vu3, _MM_SHUFFLE(3, 1, 2, 0)); // low
+ __m128i vacc0lo = _mm_unpacklo_epi32(vu0, vzero); // low
+ __m128i vacc0hi = _mm_unpackhi_epi32(vu0, vzero); // high
+ __m128i vacc1lo = _mm_unpacklo_epi32(vu1, vzero); // low
+ __m128i vacc1hi = _mm_unpackhi_epi32(vu1, vzero); // high
+ __m128i vacc2lo = _mm_unpacklo_epi32(vu2, vzero); // low
+ __m128i vacc2hi = _mm_unpackhi_epi32(vu2, vzero); // high
+ __m128i vacc3lo = _mm_unpacklo_epi32(vu3, vzero); // low
+ __m128i vacc3hi = _mm_unpackhi_epi32(vu3, vzero); // high
- vacce0 = _mm_mul_epu32(vacce0, vmultiplier);
- vacco0 = _mm_mul_epu32(vacco0, vmultiplier);
- vacce1 = _mm_mul_epu32(vacce1, vmultiplier);
- vacco1 = _mm_mul_epu32(vacco1, vmultiplier);
- vacce2 = _mm_mul_epu32(vacce2, vmultiplier);
- vacco2 = _mm_mul_epu32(vacco2, vmultiplier);
- vacce3 = _mm_mul_epu32(vacce3, vmultiplier);
- vacco3 = _mm_mul_epu32(vacco3, vmultiplier);
+ vacc0lo = _mm_mul_epu32(vacc0lo, vmultiplier);
+ vacc0hi = _mm_mul_epu32(vacc0hi, vmultiplier);
+ vacc1lo = _mm_mul_epu32(vacc1lo, vmultiplier);
+ vacc1hi = _mm_mul_epu32(vacc1hi, vmultiplier);
+ vacc2lo = _mm_mul_epu32(vacc2lo, vmultiplier);
+ vacc2hi = _mm_mul_epu32(vacc2hi, vmultiplier);
+ vacc3lo = _mm_mul_epu32(vacc3lo, vmultiplier);
+ vacc3hi = _mm_mul_epu32(vacc3hi, vmultiplier);
- vacce0 = _mm_add_epi64(vacce0, vbias);
- vacco0 = _mm_add_epi64(vacco0, vbias);
- vacce1 = _mm_add_epi64(vacce1, vbias);
- vacco1 = _mm_add_epi64(vacco1, vbias);
- vacce2 = _mm_add_epi64(vacce2, vbias);
- vacco2 = _mm_add_epi64(vacco2, vbias);
- vacce3 = _mm_add_epi64(vacce3, vbias);
- vacco3 = _mm_add_epi64(vacco3, vbias);
+ vacc0lo = _mm_add_epi64(vacc0lo, vbias);
+ vacc0hi = _mm_add_epi64(vacc0hi, vbias);
+ vacc1lo = _mm_add_epi64(vacc1lo, vbias);
+ vacc1hi = _mm_add_epi64(vacc1hi, vbias);
+ vacc2lo = _mm_add_epi64(vacc2lo, vbias);
+ vacc2hi = _mm_add_epi64(vacc2hi, vbias);
+ vacc3lo = _mm_add_epi64(vacc3lo, vbias);
+ vacc3hi = _mm_add_epi64(vacc3hi, vbias);
- vacce0 = _mm_srli_epi64(vacce0, 16);
- vacco0 = _mm_slli_epi64(vacco0, 16);
- vacce1 = _mm_srli_epi64(vacce1, 16);
- vacco1 = _mm_slli_epi64(vacco1, 16);
- vacce2 = _mm_srli_epi64(vacce2, 16);
- vacco2 = _mm_slli_epi64(vacco2, 16);
- vacce3 = _mm_srli_epi64(vacce3, 16);
- vacco3 = _mm_slli_epi64(vacco3, 16);
+ vacc0lo = _mm_srli_epi64(vacc0lo, 16);
+ vacc0hi = _mm_srli_epi64(vacc0hi, 16);
+ vacc1lo = _mm_srli_epi64(vacc1lo, 16);
+ vacc1hi = _mm_srli_epi64(vacc1hi, 16);
+ vacc2lo = _mm_srli_epi64(vacc2lo, 16);
+ vacc2hi = _mm_srli_epi64(vacc2hi, 16);
+ vacc3lo = _mm_srli_epi64(vacc3lo, 16);
+ vacc3hi = _mm_srli_epi64(vacc3hi, 16);
- __m128i vacc0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce0),
- _mm_castsi128_ps(vacco0),
- _MM_SHUFFLE(3, 1, 2, 0)));
- __m128i vacc1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce1),
- _mm_castsi128_ps(vacco1),
- _MM_SHUFFLE(3, 1, 2, 0)));
- __m128i vacc2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce2),
- _mm_castsi128_ps(vacco2),
- _MM_SHUFFLE(3, 1, 2, 0)));
- __m128i vacc3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce3),
- _mm_castsi128_ps(vacco3),
- _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacc0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacc0lo),
+ _mm_castsi128_ps(vacc0hi),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i vacc1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacc1lo),
+ _mm_castsi128_ps(vacc1hi),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i vacc2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacc2lo),
+ _mm_castsi128_ps(vacc2hi),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i vacc3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacc3lo),
+ _mm_castsi128_ps(vacc3hi),
+ _MM_SHUFFLE(2, 0, 2, 0)));
// Pack 8 ints into 8 shorts
vacc0 = _mm_packs_epi32(vacc0, vacc1);
@@ -107,16 +107,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input); input += 4;
vx = _mm_xor_si128(vx, vinput_bias); // Convert signed inputs to unsigned.
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
@@ -129,16 +129,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input);
vx = _mm_xor_si128(vx, vinput_bias);
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
diff --git a/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x4.c b/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x4.c
index 9915650..313daaf 100644
--- a/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x4.c
+++ b/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x4.c
@@ -36,16 +36,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input); input += 4;
vx = _mm_xor_si128(vx, vinput_bias); // Convert signed inputs to unsigned.
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
@@ -58,16 +58,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input);
vx = _mm_xor_si128(vx, vinput_bias);
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
diff --git a/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x8.c b/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x8.c
index 5e1dea4..b3b4c05 100644
--- a/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x8.c
+++ b/src/qs16-qs8-vcvt/gen/qs16-qs8-vcvt-sse2-x8.c
@@ -40,32 +40,32 @@
const __m128i vu0 = _mm_unpacklo_epi16(vx0, vzero);
const __m128i vu1 = _mm_unpackhi_epi16(vx0, vzero);
- __m128i vacco0 = _mm_shuffle_epi32(vu0, _MM_SHUFFLE(3, 3, 2, 2)); // high
- __m128i vacce0 = _mm_shuffle_epi32(vu0, _MM_SHUFFLE(3, 1, 2, 0)); // low
- __m128i vacco1 = _mm_shuffle_epi32(vu1, _MM_SHUFFLE(3, 3, 2, 2)); // high
- __m128i vacce1 = _mm_shuffle_epi32(vu1, _MM_SHUFFLE(3, 1, 2, 0)); // low
+ __m128i vacc0lo = _mm_unpacklo_epi32(vu0, vzero); // low
+ __m128i vacc0hi = _mm_unpackhi_epi32(vu0, vzero); // high
+ __m128i vacc1lo = _mm_unpacklo_epi32(vu1, vzero); // low
+ __m128i vacc1hi = _mm_unpackhi_epi32(vu1, vzero); // high
- vacce0 = _mm_mul_epu32(vacce0, vmultiplier);
- vacco0 = _mm_mul_epu32(vacco0, vmultiplier);
- vacce1 = _mm_mul_epu32(vacce1, vmultiplier);
- vacco1 = _mm_mul_epu32(vacco1, vmultiplier);
+ vacc0lo = _mm_mul_epu32(vacc0lo, vmultiplier);
+ vacc0hi = _mm_mul_epu32(vacc0hi, vmultiplier);
+ vacc1lo = _mm_mul_epu32(vacc1lo, vmultiplier);
+ vacc1hi = _mm_mul_epu32(vacc1hi, vmultiplier);
- vacce0 = _mm_add_epi64(vacce0, vbias);
- vacco0 = _mm_add_epi64(vacco0, vbias);
- vacce1 = _mm_add_epi64(vacce1, vbias);
- vacco1 = _mm_add_epi64(vacco1, vbias);
+ vacc0lo = _mm_add_epi64(vacc0lo, vbias);
+ vacc0hi = _mm_add_epi64(vacc0hi, vbias);
+ vacc1lo = _mm_add_epi64(vacc1lo, vbias);
+ vacc1hi = _mm_add_epi64(vacc1hi, vbias);
- vacce0 = _mm_srli_epi64(vacce0, 16);
- vacco0 = _mm_slli_epi64(vacco0, 16);
- vacce1 = _mm_srli_epi64(vacce1, 16);
- vacco1 = _mm_slli_epi64(vacco1, 16);
+ vacc0lo = _mm_srli_epi64(vacc0lo, 16);
+ vacc0hi = _mm_srli_epi64(vacc0hi, 16);
+ vacc1lo = _mm_srli_epi64(vacc1lo, 16);
+ vacc1hi = _mm_srli_epi64(vacc1hi, 16);
- __m128i vacc0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce0),
- _mm_castsi128_ps(vacco0),
- _MM_SHUFFLE(3, 1, 2, 0)));
- __m128i vacc1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce1),
- _mm_castsi128_ps(vacco1),
- _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacc0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacc0lo),
+ _mm_castsi128_ps(vacc0hi),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i vacc1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacc1lo),
+ _mm_castsi128_ps(vacc1hi),
+ _MM_SHUFFLE(2, 0, 2, 0)));
// Pack 8 ints into 8 shorts
vacc0 = _mm_packs_epi32(vacc0, vacc1);
@@ -80,16 +80,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input); input += 4;
vx = _mm_xor_si128(vx, vinput_bias); // Convert signed inputs to unsigned.
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
@@ -102,16 +102,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input);
vx = _mm_xor_si128(vx, vinput_bias);
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
diff --git a/src/qs16-qs8-vcvt/sse2.c.in b/src/qs16-qs8-vcvt/sse2.c.in
index 08f59ba..cf42571 100644
--- a/src/qs16-qs8-vcvt/sse2.c.in
+++ b/src/qs16-qs8-vcvt/sse2.c.in
@@ -45,25 +45,25 @@
const __m128i vu${ABC[N+1]} = _mm_unpackhi_epi16(vx${ABC[N]}, vzero);
$for N in range(SIMD_TILE):
- __m128i vacco${ABC[N]} = _mm_shuffle_epi32(vu${ABC[N]}, _MM_SHUFFLE(3, 3, 2, 2)); // high
- __m128i vacce${ABC[N]} = _mm_shuffle_epi32(vu${ABC[N]}, _MM_SHUFFLE(3, 1, 2, 0)); // low
+ __m128i vacc${ABC[N]}lo = _mm_unpacklo_epi32(vu${ABC[N]}, vzero); // low
+ __m128i vacc${ABC[N]}hi = _mm_unpackhi_epi32(vu${ABC[N]}, vzero); // high
$for N in range(SIMD_TILE):
- vacce${ABC[N]} = _mm_mul_epu32(vacce${ABC[N]}, vmultiplier);
- vacco${ABC[N]} = _mm_mul_epu32(vacco${ABC[N]}, vmultiplier);
+ vacc${ABC[N]}lo = _mm_mul_epu32(vacc${ABC[N]}lo, vmultiplier);
+ vacc${ABC[N]}hi = _mm_mul_epu32(vacc${ABC[N]}hi, vmultiplier);
$for N in range(SIMD_TILE):
- vacce${ABC[N]} = _mm_add_epi64(vacce${ABC[N]}, vbias);
- vacco${ABC[N]} = _mm_add_epi64(vacco${ABC[N]}, vbias);
+ vacc${ABC[N]}lo = _mm_add_epi64(vacc${ABC[N]}lo, vbias);
+ vacc${ABC[N]}hi = _mm_add_epi64(vacc${ABC[N]}hi, vbias);
$for N in range(SIMD_TILE):
- vacce${ABC[N]} = _mm_srli_epi64(vacce${ABC[N]}, 16);
- vacco${ABC[N]} = _mm_slli_epi64(vacco${ABC[N]}, 16);
+ vacc${ABC[N]}lo = _mm_srli_epi64(vacc${ABC[N]}lo, 16);
+ vacc${ABC[N]}hi = _mm_srli_epi64(vacc${ABC[N]}hi, 16);
$for N in range(SIMD_TILE):
- __m128i vacc${ABC[N]} = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce${ABC[N]}),
- _mm_castsi128_ps(vacco${ABC[N]}),
- _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacc${ABC[N]} = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacc${ABC[N]}lo),
+ _mm_castsi128_ps(vacc${ABC[N]}hi),
+ _MM_SHUFFLE(2, 0, 2, 0)));
// Pack 8 ints into 8 shorts
$for N in range(0, SIMD_TILE, 2):
@@ -89,16 +89,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input); input += 4;
vx = _mm_xor_si128(vx, vinput_bias); // Convert signed inputs to unsigned.
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
@@ -111,16 +111,16 @@
__m128i vx = _mm_loadl_epi64((const __m128i*) input);
vx = _mm_xor_si128(vx, vinput_bias);
const __m128i vu = _mm_unpacklo_epi16(vx, vzero);
- __m128i vacco = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 3, 2, 2));
- __m128i vacce = _mm_shuffle_epi32(vu, _MM_SHUFFLE(3, 1, 2, 0));
- vacce = _mm_mul_epu32(vacce, vmultiplier);
- vacco = _mm_mul_epu32(vacco, vmultiplier);
- vacce = _mm_add_epi64(vacce, vbias);
- vacco = _mm_add_epi64(vacco, vbias);
- vacce = _mm_srli_epi64(vacce, 16);
- vacco = _mm_slli_epi64(vacco, 16);
- __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacce),
- _mm_castsi128_ps(vacco), _MM_SHUFFLE(3, 1, 2, 0)));
+ __m128i vacclo = _mm_unpacklo_epi32(vu, vzero);
+ __m128i vacchi = _mm_unpackhi_epi32(vu, vzero);
+ vacclo = _mm_mul_epu32(vacclo, vmultiplier);
+ vacchi = _mm_mul_epu32(vacchi, vmultiplier);
+ vacclo = _mm_add_epi64(vacclo, vbias);
+ vacchi = _mm_add_epi64(vacchi, vbias);
+ vacclo = _mm_srli_epi64(vacclo, 16);
+ vacchi = _mm_srli_epi64(vacchi, 16);
+ __m128i vacc = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vacclo),
+ _mm_castsi128_ps(vacchi), _MM_SHUFFLE(2, 0, 2, 0)));
vacc = _mm_packs_epi32(vacc, vacc);
const __m128i vy = _mm_packs_epi16(vacc, vacc);
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-avx-x16.c b/src/qu8-vcvt/gen/qu8-vcvt-avx-x16.c
index 3e96306..869b9d5 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-avx-x16.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-avx-x16.c
@@ -76,18 +76,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-avx-x32.c b/src/qu8-vcvt/gen/qu8-vcvt-avx-x32.c
index 9216e27..cce1a7d 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-avx-x32.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-avx-x32.c
@@ -88,18 +88,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-avx-x8.c b/src/qu8-vcvt/gen/qu8-vcvt-avx-x8.c
index 2f1b752..aae3fa4 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-avx-x8.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-avx-x8.c
@@ -54,18 +54,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-avx2-x16.c b/src/qu8-vcvt/gen/qu8-vcvt-avx2-x16.c
index 6e71f86..5fa5743 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-avx2-x16.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-avx2-x16.c
@@ -61,18 +61,17 @@
output += 8;
}
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-avx2-x32.c b/src/qu8-vcvt/gen/qu8-vcvt-avx2-x32.c
index 24688ed..3c3a164 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-avx2-x32.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-avx2-x32.c
@@ -85,18 +85,17 @@
output += 8;
}
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-avx2-x64.c b/src/qu8-vcvt/gen/qu8-vcvt-avx2-x64.c
index f0481a4..237a5fb 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-avx2-x64.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-avx2-x64.c
@@ -98,18 +98,17 @@
output += 8;
}
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-sse41-x16.c b/src/qu8-vcvt/gen/qu8-vcvt-sse41-x16.c
index 9b7c447..6a2f8ee 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-sse41-x16.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-sse41-x16.c
@@ -76,18 +76,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-sse41-x32.c b/src/qu8-vcvt/gen/qu8-vcvt-sse41-x32.c
index a5ea3ba..e3738c7 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-sse41-x32.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-sse41-x32.c
@@ -88,18 +88,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
diff --git a/src/qu8-vcvt/gen/qu8-vcvt-sse41-x8.c b/src/qu8-vcvt/gen/qu8-vcvt-sse41-x8.c
index fcdb270..87cd29e 100644
--- a/src/qu8-vcvt/gen/qu8-vcvt-sse41-x8.c
+++ b/src/qu8-vcvt/gen/qu8-vcvt-sse41-x8.c
@@ -54,18 +54,17 @@
__m128i vy = _mm_packus_epi16(vacc, vacc);
if (batch & (4 * sizeof(uint8_t))) {
- unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy));
+ _mm_storeu_si32(output, vy);
vy = _mm_srli_epi64(vy, 32);
output += 4;
}
- uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
if (batch & (2 * sizeof(uint8_t))) {
- unaligned_store_u16(output, (uint16_t) vy_lo);
- vy_lo >>= 16;
+ _mm_storeu_si16(output, vy);
+ vy = _mm_srli_epi32(vy, 16);
output += 2;
}
if (batch & (1 * sizeof(uint8_t))) {
- *output = (uint8_t) vy_lo;
+ *output = (uint8_t) _mm_extract_epi8(vy, 0);
}
}
}
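
Note on the other half of the change: the SSE4.1/AVX/AVX2 VCVT tails above stop bouncing the low dword through a scalar register and instead store directly from the vector with _mm_storeu_si32 / _mm_storeu_si16, finishing with a PEXTRB for the last byte. A minimal sketch of that tail pattern (the helper name is hypothetical; requires SSE4.1 for _mm_extract_epi8 and a compiler recent enough to provide _mm_storeu_si32/_mm_storeu_si16):

    #include <smmintrin.h>  // SSE4.1
    #include <stddef.h>
    #include <stdint.h>

    // Hypothetical helper: store the low n (n < 8) bytes of vy to output
    // using the tail pattern adopted in the kernels above.
    static inline void xnn_sketch_store_tail(uint8_t* output, __m128i vy, size_t n)
    {
      if (n & 4) {
        _mm_storeu_si32(output, vy);                  // low 4 bytes (MOVD to memory)
        vy = _mm_srli_epi64(vy, 32);
        output += 4;
      }
      if (n & 2) {
        _mm_storeu_si16(output, vy);                  // low 2 bytes
        vy = _mm_srli_epi32(vy, 16);
        output += 2;
      }
      if (n & 1) {
        *output = (uint8_t) _mm_extract_epi8(vy, 0);  // final byte via PEXTRB
      }
    }
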