Add ReverseBlocks
PiperOrigin-RevId: 411776293
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index eba62ac..45c03fe 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -921,6 +921,9 @@
index `2*i` and `2*i+1` are swapped. Results are undefined for vectors with
less than two blocks; callers must first check that via `Lanes`.
+* <code>V **ReverseBlocks**(D, V v)</code>: returns a vector with blocks in
+ reversed order. Vectors consisting of a single block are returned unchanged.
+
* `V`: `{u,i,f}{32,64}` \
<code>V **TableLookupLanes**(V a, unspecified)</code> returns a vector of
`a[indices[i]]`, where `unspecified` is the return value of
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index 7b315c4..b75bb66 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -3906,6 +3906,14 @@
return v;
}
+// ------------------------------ ReverseBlocks
+
+// Single block: a full 128-bit vector is exactly one block, so v is returned.
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+ return v;
+}
+
// ------------------------------ ReorderDemote2To (OddEven)
template <size_t N>
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 1f03e66..5012775 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -1625,6 +1625,13 @@
return Shuffle2301(Shuffle1032(v));
}
+// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
+template <class D, class V = VFromD<D>>
+HWY_API V ReverseBlocks(D d, V v) {
+  // Work on 64-bit lanes: each block then consists of exactly two lanes.
+  const Repartition<uint64_t, D> du64;
+  const auto as_u64 = BitCast(du64, v);
+  // Reversing every u64 lane puts the blocks in reverse order, but also
+  // exchanges the two halves inside each block.
+  const auto lanes_reversed = Reverse(du64, as_u64);
+  // Shuffle01 exchanges the two u64 of each block again, restoring the
+  // original order within blocks.
+  const auto blocks_reversed = Shuffle01(lanes_reversed);
+  return BitCast(d, blocks_reversed);
+}
+
// ------------------------------ TableLookupBytes
template <class V, class VI>
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
index 64260e7..db873c4 100644
--- a/hwy/ops/rvv-inl.h
+++ b/hwy/ops/rvv-inl.h
@@ -513,6 +513,8 @@
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf)
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx)  // scalar - v
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf)  // scalar - v
} // namespace detail
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add)
@@ -1593,10 +1595,23 @@
const RebindToUnsigned<D> du;
using TU = TFromD<decltype(du)>;
const size_t N = Lanes(du);
- const auto idx = Sub(Set(du, static_cast<TU>(N - 1)), detail::Iota0(du));
+ const auto idx =
+ detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
return TableLookupLanes(v, idx);
}
+// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
+template <class D, class V = VFromD<D>>
+HWY_API V ReverseBlocks(D d, V v) {
+  // Treat the input as 64-bit lanes so that every block is a pair of lanes.
+  const Repartition<uint64_t, D> du64;
+  const size_t num_u64 = Lanes(du64);
+  const uint64_t last = static_cast<uint64_t>(num_u64 - 1);
+  // Descending indices: result lane i would read input lane (last - i).
+  const auto descending = detail::ReverseSubS(detail::Iota0(du64), last);
+  // Toggling bit 0 of each index exchanges the lower/upper u64 of every block,
+  // so only the block order ends up reversed.
+  const auto block_idx = detail::XorS(descending, 1);
+  const auto permuted = TableLookupLanes(BitCast(du64, v), block_idx);
+  return BitCast(d, permuted);
+}
+
// ------------------------------ Compress
#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
@@ -2051,13 +2066,13 @@
template <class D, HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
- return Add(detail::Iota0(d), Set(d, first));
+ return detail::AddS(detail::Iota0(d), first);  // vector + scalar, no Set
}
template <class D, HWY_IF_SIGNED_D(D)>
HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
 const RebindToUnsigned<D> du;  // Iota0 is requested for the unsigned type
- return Add(BitCast(d, detail::Iota0(du)), Set(d, first));
+ return detail::AddS(BitCast(d, detail::Iota0(du)), first);  // vector + scalar
}
template <class D, HWY_IF_FLOAT_D(D)>
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index 3e7758f..a4ce5a8 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -1125,6 +1125,14 @@
return v;
}
+// ------------------------------ ReverseBlocks
+
+// Single (partial) block: nothing to reorder, so v is returned unchanged.
+template <typename T>
+HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
+ return v;
+}
+
// ------------------------------ Reverse
template <typename T>
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 69707fd..1463def 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -2424,6 +2424,14 @@
return v;
}
+// ------------------------------ ReverseBlocks
+
+// Single block: a full 128-bit vector has no other block to swap with.
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+ return v;
+}
+
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h
index b52d5c7..802b80e 100644
--- a/hwy/ops/wasm_256-inl.h
+++ b/hwy/ops/wasm_256-inl.h
@@ -2298,6 +2298,13 @@
return v;
}
+// ------------------------------ ReverseBlocks
+// NOTE(review): Vec256 presumably has two blocks - confirm no swap is intended.
+template <typename T>
+HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, const Vec256<T> v) {
+ return v;
+}
+
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index cfec2a1..7de1bd8 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -3403,6 +3403,14 @@
#endif
}
+// ------------------------------ ReverseBlocks
+
+// Single block: one 128-bit vector is one block, so the input is the result.
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+ return v;
+}
+
// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
// Single lane: no change
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index 0492a1b..9caa18c 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -3140,6 +3140,13 @@
return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
}
+// ------------------------------ ReverseBlocks (ConcatLowerUpper)
+// Two blocks: v's lower half becomes the upper half and vice versa.
+template <typename T>
+HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, Vec256<T> v) {
+ return ConcatLowerUpper(v, v);
+}
+
// ------------------------------ TableLookupBytes (ZeroExtendVector)
// Both full
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index 11d0e71..fe93d56 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -2718,6 +2718,22 @@
_mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}
+// ------------------------------ ReverseBlocks
+// _MM_SHUFFLE(0, 1, 2, 3) with both inputs v: result block i = v block (3 - i).
+template <typename T>
+HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) {
+ return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
+}
+HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */, Vec512<float> v) {
+ return Vec512<float>{
+ _mm512_shuffle_f32x4(v.raw, v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
+}
+HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */,
+ Vec512<double> v) {
+ return Vec512<double>{
+ _mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
+}
+
// ------------------------------ TableLookupBytes (ZeroExtendVector)
// Both full
diff --git a/hwy/tests/swizzle_test.cc b/hwy/tests/swizzle_test.cc
index ea14514..da328f1 100644
--- a/hwy/tests/swizzle_test.cc
+++ b/hwy/tests/swizzle_test.cc
@@ -203,6 +203,34 @@
ForUIF163264(ForPartialVectors<TestReverse>());
}
+// Verifies ReverseBlocks against a reference permutation computed in memory.
+struct TestReverseBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t num_lanes = Lanes(d);
+    // Iota is unavailable for float16_t, so generate 1, 2, 3, ... as unsigned
+    // lanes and reinterpret them as T.
+    const RebindToUnsigned<D> du;
+    const auto input = BitCast(d, Iota(du, 1));
+    auto expected = AllocateAligned<T>(num_lanes);
+
+    // A block is 16 bytes.
+    constexpr size_t kBlockLanes = 16 / sizeof(T);
+    const size_t block_count = num_lanes / kBlockLanes;
+    HWY_ASSERT(block_count != 0);
+
+    // float16_t lanes cannot be assigned from a value, hence the expected
+    // output is assembled by copying lanes of the stored input.
+    auto input_lanes = AllocateAligned<T>(num_lanes);
+    Store(input, d, input_lanes.get());
+    for (size_t lane = 0; lane < num_lanes; ++lane) {
+      // Block `dst_block` of the output mirrors block `src_block` of the
+      // input; the position inside the block is unchanged.
+      const size_t dst_block = lane / kBlockLanes;
+      const size_t src_block = block_count - 1 - dst_block;
+      const size_t offset_in_block = lane % kBlockLanes;
+      expected[lane] = input_lanes[src_block * kBlockLanes + offset_in_block];
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, input));
+  }
+};
+
+HWY_NOINLINE void TestAllReverseBlocks() {
+  // TestReverseBlocks asserts that at least one whole 16-byte block is
+  // present, hence the ForGE128Vectors wrapper.
+  const ForGE128Vectors<TestReverseBlocks> test;
+  ForAllTypes(test);
+}
+
class TestCompress {
template <typename T, typename TI, size_t N>
void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos,
@@ -487,6 +515,7 @@
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverseBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress);
} // namespace hwy