Add ReverseBlocks

PiperOrigin-RevId: 411776293
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index eba62ac..45c03fe 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -921,6 +921,9 @@
     index `2*i` and `2*i+1` are swapped. Results are undefined for vectors with
     less than two blocks; callers must first check that via `Lanes`.
 
+*   <code>V **ReverseBlocks**(D, V v)</code>: returns a vector with the 128-bit
+    blocks of `v` in reversed order.
+
 *   `V`: `{u,i,f}{32,64}` \
     <code>V **TableLookupLanes**(V a, unspecified)</code> returns a vector of
     `a[indices[i]]`, where `unspecified` is the return value of
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index 7b315c4..b75bb66 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -3906,6 +3906,14 @@
   return v;
 }
 
+// ------------------------------ ReverseBlocks
+
+// A 128-bit vector holds exactly one block, so the block order is unchanged.
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+  return v;
+}
+
 // ------------------------------ ReorderDemote2To (OddEven)
 
 template <size_t N>
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 1f03e66..5012775 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -1625,6 +1625,13 @@
   return Shuffle2301(Shuffle1032(v));
 }
 
+// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
+template <class D, class V = VFromD<D>>
+HWY_API V ReverseBlocks(D d, V v) {
+  const Repartition<uint64_t, D> du64;  // two u64 lanes per 16-byte block
+  return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
+}
+
 // ------------------------------ TableLookupBytes
 
 template <class V, class VI>
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
index 64260e7..db873c4 100644
--- a/hwy/ops/rvv-inl.h
+++ b/hwy/ops/rvv-inl.h
@@ -513,6 +513,8 @@
 namespace detail {
 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx)
 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf)
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx)  // scalar - v
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf)  // scalar - v
 }  // namespace detail
 
 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add)
@@ -1593,10 +1595,23 @@
   const RebindToUnsigned<D> du;
   using TU = TFromD<decltype(du)>;
   const size_t N = Lanes(du);
-  const auto idx = Sub(Set(du, static_cast<TU>(N - 1)), detail::Iota0(du));
+  const auto idx =
+      detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
   return TableLookupLanes(v, idx);
 }
 
+// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
+template <class D, class V = VFromD<D>>
+HWY_API V ReverseBlocks(D d, V v) {
+  const Repartition<uint64_t, D> du64;  // two u64 lanes per 16-byte block
+  const size_t N = Lanes(du64);  // rev[i] = N - 1 - i
+  const auto rev =
+      detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
+  // rev reverses all u64 lanes; XOR 1 re-swaps lo/hi u64 within each block.
+  const auto idx = detail::XorS(rev, 1);
+  return BitCast(d, TableLookupLanes(BitCast(du64, v), idx));
+}
+
 // ------------------------------ Compress
 
 #define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME,  \
@@ -2051,13 +2066,13 @@
 
 template <class D, HWY_IF_UNSIGNED_D(D)>
 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
-  return Add(detail::Iota0(d), Set(d, first));
+  return detail::AddS(detail::Iota0(d), first);  // add_vx: no Set() needed
 }
 
 template <class D, HWY_IF_SIGNED_D(D)>
 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
   const RebindToUnsigned<D> du;
-  return Add(BitCast(d, detail::Iota0(du)), Set(d, first));
+  return detail::AddS(BitCast(d, detail::Iota0(du)), first);  // scalar add
 }
 
 template <class D, HWY_IF_FLOAT_D(D)>
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index 3e7758f..a4ce5a8 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -1125,6 +1125,14 @@
   return v;
 }
 
+// ------------------------------ ReverseBlocks
+
+// Only a single block exists here, so the input is returned unchanged.
+template <typename T>
+HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
+  return v;
+}
+
 // ------------------------------ Reverse
 
 template <typename T>
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 69707fd..1463def 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -2424,6 +2424,14 @@
   return v;
 }
 
+// ------------------------------ ReverseBlocks
+
+// A 128-bit vector holds exactly one block, so the block order is unchanged.
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+  return v;
+}
+
 // ================================================== CONVERT
 
 // ------------------------------ Promotions (part w/ narrow lanes -> full)
diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h
index b52d5c7..802b80e 100644
--- a/hwy/ops/wasm_256-inl.h
+++ b/hwy/ops/wasm_256-inl.h
@@ -2298,6 +2298,13 @@
   return v;
 }
 
+// ------------------------------ ReverseBlocks
+
+template <typename T>
+HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, const Vec256<T> v) {
+  return v;  // NOTE(review): identity, but x86_256 swaps halves; confirm
+}
+
 // ================================================== CONVERT
 
 // ------------------------------ Promotions (part w/ narrow lanes -> full)
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index cfec2a1..7de1bd8 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -3403,6 +3403,14 @@
 #endif
 }
 
+// ------------------------------ ReverseBlocks
+
+// A 128-bit vector holds exactly one block, so the block order is unchanged.
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+  return v;
+}
+
 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
 
 // Single lane: no change
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index 0492a1b..9caa18c 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -3140,6 +3140,13 @@
   return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
 }
 
+// ------------------------------ ReverseBlocks (ConcatLowerUpper)
+
+template <typename T>
+HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, Vec256<T> v) {
+  return ConcatLowerUpper(v, v);  // two blocks, so reversing = swapping halves
+}
+
 // ------------------------------ TableLookupBytes (ZeroExtendVector)
 
 // Both full
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index 11d0e71..fe93d56 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -2718,6 +2718,22 @@
       _mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
 }
 
+// ------------------------------ ReverseBlocks
+
+template <typename T>  // _MM_SHUFFLE(0, 1, 2, 3): out block i = in block 3-i
+HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) {
+  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
+}
+HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */, Vec512<float> v) {
+  return Vec512<float>{  // same selector via the f32x4 shuffle
+      _mm512_shuffle_f32x4(v.raw, v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
+}
+HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */,
+                                     Vec512<double> v) {
+  return Vec512<double>{  // same selector via the f64x2 shuffle
+      _mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
+}
+
 // ------------------------------ TableLookupBytes (ZeroExtendVector)
 
 // Both full
diff --git a/hwy/tests/swizzle_test.cc b/hwy/tests/swizzle_test.cc
index ea14514..da328f1 100644
--- a/hwy/tests/swizzle_test.cc
+++ b/hwy/tests/swizzle_test.cc
@@ -203,6 +203,34 @@
   ForUIF163264(ForPartialVectors<TestReverse>());
 }
 
+struct TestReverseBlocks {  // compares ReverseBlocks to a scalar permutation
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
+    const auto v = BitCast(d, Iota(du, 1));
+    auto expected = AllocateAligned<T>(N);
+
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);  // 16-byte blocks
+    const size_t num_blocks = N / kLanesPerBlock;
+    HWY_ASSERT(num_blocks != 0);
+
+    // float16_t lanes cannot be assigned directly, so permute a stored copy.
+    auto copy = AllocateAligned<T>(N);
+    Store(v, d, copy.get());
+    for (size_t i = 0; i < N; ++i) {
+      const size_t idx_block = i / kLanesPerBlock;  // output block index
+      const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock;
+      expected[i] = copy[base + (i % kLanesPerBlock)];  // same in-block lane
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v));
+  }
+};
+
+HWY_NOINLINE void TestAllReverseBlocks() {
+  ForAllTypes(ForGE128Vectors<TestReverseBlocks>());  // needs >= 1 block
+}
+
 class TestCompress {
   template <typename T, typename TI, size_t N>
   void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos,
@@ -487,6 +515,7 @@
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverseBlocks);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress);
 }  // namespace hwy