Avoid using cpuinfo_get_max_cache_size() function

This function is missing from upstream cpuinfo, which causes build failures in
the open-source (OSS) XNNPACK build. Callers now use
benchmark::utils::GetMaxCacheSize() instead.

PiperOrigin-RevId: 272270763
diff --git a/bench/convolution.cc b/bench/convolution.cc
index 863e35c..0857b01 100644
--- a/bench/convolution.cc
+++ b/bench/convolution.cc
@@ -87,7 +87,7 @@
     return;
   }
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
   std::vector<uint8_t> output(output_elements * num_buffers);
 
@@ -204,7 +204,7 @@
     return;
   }
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (kernel.size() + bias.size() + output_elements));
   std::vector<float> output(output_elements * num_buffers);
 
diff --git a/bench/deconvolution.cc b/bench/deconvolution.cc
index 423c7a6..4acfffe 100644
--- a/bench/deconvolution.cc
+++ b/bench/deconvolution.cc
@@ -75,7 +75,7 @@
     return;
   }
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (kernel.size() + bias.size() + output_elements));
   std::vector<uint8_t> output(output_elements * num_buffers);
 
@@ -192,7 +192,7 @@
     return;
   }
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (kernel.size() + bias.size() + output_elements));
   std::vector<float> output(output_elements * num_buffers);
 
diff --git a/bench/f16-gemm.cc b/bench/f16-gemm.cc
index e107813..15c8a92 100644
--- a/bench/f16-gemm.cc
+++ b/bench/f16-gemm.cc
@@ -58,7 +58,7 @@
   const size_t w_elements = nc_stride * kc_stride + nc_stride;
   const size_t c_elements = mc * nc;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(uint16_t) * (w_elements + c_elements));
 
   std::vector<uint16_t, AlignedAllocator<uint16_t, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-conv-hwc.cc b/bench/f32-conv-hwc.cc
index 8535b71..1b2687a 100644
--- a/bench/f32-conv-hwc.cc
+++ b/bench/f32-conv-hwc.cc
@@ -60,7 +60,7 @@
     benchmark::utils::roundUp<size_t>(output_channels, output_channels_tile);
   const size_t output_elements = output_height * output_width * output_channels;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (weights_elements + output_elements));
 
   std::vector<float, AlignedAllocator<float, 32>> packed_weights(weights_elements * num_buffers);
diff --git a/bench/f32-dwconv-spchw.cc b/bench/f32-dwconv-spchw.cc
index fd44b78..cc81527 100644
--- a/bench/f32-dwconv-spchw.cc
+++ b/bench/f32-dwconv-spchw.cc
@@ -91,7 +91,7 @@
   const size_t w_elements = (kernel_size + 1) * channels;
   const size_t o_elements = output_size * channels;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + o_elements));
 
   std::vector<float, AlignedAllocator<float, 32>> packed_weights(w_elements * num_buffers);
@@ -208,7 +208,7 @@
   const size_t w_elements = (kernel_size + 1) * channels;
   const size_t o_elements = output_height * benchmark::utils::roundUp<size_t>(output_width, ot) * channels;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + o_elements));
 
   std::vector<float, AlignedAllocator<float, 32>> packed_weights(w_elements * num_buffers);
diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc
index d3967b0..d21b1c3 100644
--- a/bench/f32-dwconv.cc
+++ b/bench/f32-dwconv.cc
@@ -79,7 +79,7 @@
   const size_t i_elements = output_height * step_height;
   const size_t c_elements = output_size * channels;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);
 
   std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 9425971..882fc2b 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -61,7 +61,7 @@
   const size_t w_elements = nc_stride * kc_stride + nc_stride;
   const size_t c_elements = mc * nc;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + c_elements));
 
   std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
@@ -132,7 +132,7 @@
   const size_t w_elements = nc_stride * kc + nc_stride;
   const size_t c_elements = mc * nc;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + c_elements));
 
   std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
@@ -205,7 +205,7 @@
   const size_t w_elements = nc_stride * kc + nc_stride;
   const size_t c_elements = mc * nc;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + c_elements));
 
   std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
@@ -259,7 +259,7 @@
   const size_t kc = state.range(2);
 
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (nc * (mc + kc + 1)));
 
   std::vector<float> a(mc * kc);
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 0e371a6..e62983b 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -76,7 +76,7 @@
   const size_t i_elements = mc_stride * kernel_size;
   const size_t c_elements = output_height * output_width * output_pixel_stride;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);
 
   std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-im2col-gemm.cc b/bench/f32-im2col-gemm.cc
index be4444a..d9100f8 100644
--- a/bench/f32-im2col-gemm.cc
+++ b/bench/f32-im2col-gemm.cc
@@ -70,7 +70,7 @@
   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
   const size_t c_elements = output_size * group_output_channels;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + c_elements));
 
   std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-spmm.cc b/bench/f32-spmm.cc
index 4d2a9da..19a78cd 100644
--- a/bench/f32-spmm.cc
+++ b/bench/f32-spmm.cc
@@ -54,7 +54,7 @@
   const size_t dmap_elements = num_nonzeroes / nr;
   const size_t nmap_elements = nc;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(float) * (w_elements + c_elements) + sizeof(uint32_t) * (dmap_elements + nmap_elements));
 
   // Micro-kernel can access one element beyond w and dmap for software pipelining.
diff --git a/bench/q8-gemm.cc b/bench/q8-gemm.cc
index 9a15006..1538d13 100644
--- a/bench/q8-gemm.cc
+++ b/bench/q8-gemm.cc
@@ -61,7 +61,7 @@
   const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
   const size_t c_elements = mc * nc;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       sizeof(uint8_t) * (w_elements + c_elements));
 
   std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> w(w_elements * num_buffers);
@@ -159,7 +159,7 @@
   const size_t bElements = nc;
   const size_t c_elements = mc * nc;
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
 
   std::vector<uint8_t> k(kElements * num_buffers);
@@ -209,7 +209,7 @@
   auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
 
   const size_t num_buffers = 1 +
-    benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+    benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
       nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
 
   std::vector<uint8_t> a(mc * kc);
diff --git a/bench/utils.cc b/bench/utils.cc
index eb02e21..87ae24c 100644
--- a/bench/utils.cc
+++ b/bench/utils.cc
@@ -25,7 +25,7 @@
   // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
   wipe_buffer_size = 128 * 1024 * 1024;
   if (cpuinfo_initialize()) {
-    wipe_buffer_size = cpuinfo_get_max_cache_size();
+    wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
   }
 #if defined(__ANDROID__)
   // memalign is obsolete, but it is the only option on Android until API level 17.
@@ -83,5 +83,123 @@
   return 0;
 }
 
+size_t GetMaxCacheSize() {
+  if (!cpuinfo_initialize()) {  // cpuinfo failed to initialize: use a fixed per-architecture maximum.
+    #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+      // DynamIQ max: 4 MB
+      return 4 * 1024 * 1024;
+    #else
+      // Intel eDRAM max: 128 MB
+      return 128 * 1024 * 1024;
+    #endif
+  }
+  const cpuinfo_processor* processor = cpuinfo_get_processor(0);  // Only logical processor 0 is queried.
+  #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+    // There is no precise way to detect cache size on ARM/ARM64, and the cache size reported by cpuinfo
+    // may underestimate the actual cache size. Thus, we use a microarchitecture-specific maximum.
+    switch (processor->core->uarch) {
+      case cpuinfo_uarch_xscale:
+      case cpuinfo_uarch_arm11:
+      case cpuinfo_uarch_scorpion:
+      case cpuinfo_uarch_krait:
+      case cpuinfo_uarch_kryo:
+      case cpuinfo_uarch_exynos_m1:
+      case cpuinfo_uarch_exynos_m2:
+      case cpuinfo_uarch_exynos_m3:
+        // cpuinfo-detected cache size is always correct: use the generic lookup after the switch.
+        break;
+      case cpuinfo_uarch_cortex_a5:
+        // Max observed (NXP Vybrid SoC)
+        return 512 * 1024;
+      case cpuinfo_uarch_cortex_a7:
+        // Cortex-A7 MPCore Technical Reference Manual:
+        // 7.1. About the L2 Memory system
+        //   The L2 memory system consists of an:
+        //    - Optional tightly-coupled L2 cache that includes:
+        //      - Configurable L2 cache size of 128KB, 256KB, 512KB, and 1MB.
+        return 1024 * 1024;
+      case cpuinfo_uarch_cortex_a8:
+        // Cortex-A8 Technical Reference Manual:
+        // 8.1. About the L2 memory system
+        //   The key features of the L2 memory system include:
+        //    - configurable cache size of 0KB, 128KB, 256KB, 512KB, and 1MB
+        return 1024 * 1024;
+      case cpuinfo_uarch_cortex_a9:
+        // Max observed (e.g. Exynos 4212)
+        return 1024 * 1024;
+      case cpuinfo_uarch_cortex_a12:
+      case cpuinfo_uarch_cortex_a17:
+        // ARM Cortex-A17 MPCore Processor Technical Reference Manual:
+        // 7.1. About the L2 Memory system
+        //   The key features of the L2 memory system include:
+        //    - An integrated L2 cache:
+        //      - The cache size is implemented as either 256KB, 512KB, 1MB, 2MB, 4MB or 8MB.
+        return 8 * 1024 * 1024;
+      case cpuinfo_uarch_cortex_a15:
+        // ARM Cortex-A15 MPCore Processor Technical Reference Manual:
+        // 7.1. About the L2 memory system
+        //   The features of the L2 memory system include:
+        //    - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
+        return 4 * 1024 * 1024;
+      case cpuinfo_uarch_cortex_a35:
+        // ARM Cortex‑A35 Processor Technical Reference Manual:
+        // 7.1 About the L2 memory system
+        //   L2 cache
+        //    - Further features of the L2 cache are:
+        //      - Configurable size of 128KB, 256KB, 512KB, and 1MB.
+        return 1024 * 1024;
+      case cpuinfo_uarch_cortex_a53:
+        // ARM Cortex-A53 MPCore Processor Technical Reference Manual:
+        // 7.1. About the L2 memory system
+        //   The L2 memory system consists of an:
+        //    - Optional tightly-coupled L2 cache that includes:
+        //      - Configurable L2 cache size of 128KB, 256KB, 512KB, 1MB and 2MB.
+        return 2 * 1024 * 1024;
+      case cpuinfo_uarch_cortex_a57:
+        // ARM Cortex-A57 MPCore Processor Technical Reference Manual:
+        // 7.1 About the L2 memory system
+        //   The features of the L2 memory system include:
+        //    - Configurable L2 cache size of 512KB, 1MB, and 2MB.
+        return 2 * 1024 * 1024;
+      case cpuinfo_uarch_cortex_a72:
+        // ARM Cortex-A72 MPCore Processor Technical Reference Manual:
+        // 7.1 About the L2 memory system
+        //   The features of the L2 memory system include:
+        //    - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
+        return 4 * 1024 * 1024;
+      case cpuinfo_uarch_cortex_a73:
+        // ARM Cortex‑A73 MPCore Processor Technical Reference Manual
+        // 7.1 About the L2 memory system
+        //   The L2 memory system consists of:
+        //    - A tightly-integrated L2 cache with:
+        //       - A configurable size of 256KB, 512KB, 1MB, 2MB, 4MB, or 8MB.
+        return 8 * 1024 * 1024;
+      default:
+        // ARM DynamIQ Shared Unit Technical Reference Manual
+        // 1.3 Implementation options
+        //   L3_CACHE_SIZE
+        //    - 256KB
+        //    - 512KB
+        //    - 1024KB
+        //    - 1536KB
+        //    - 2048KB
+        //    - 3072KB
+        //    - 4096KB
+        return 4 * 1024 * 1024;
+    }
+  #endif
+  if (processor->cache.l4 != NULL) {  // Report the outermost cache level present: L4, then L3, L2, L1D.
+    return processor->cache.l4->size;
+  } else if (processor->cache.l3 != NULL) {
+    return processor->cache.l3->size;
+  } else if (processor->cache.l2 != NULL) {
+    return processor->cache.l2->size;
+  } else if (processor->cache.l1d != NULL) {
+    return processor->cache.l1d->size;
+  } else {
+    return 0;  // No cache level is described for this processor.
+  }
+}
+
 }  // namespace utils
 }  // namespace benchmark
diff --git a/bench/utils.h b/bench/utils.h
index 87f235b..9775af2 100644
--- a/bench/utils.h
+++ b/bench/utils.h
@@ -13,7 +13,13 @@
 
 uint32_t wipeCache();
 uint32_t prefetchToL1(const void* ptr, size_t size);
-uint64_t GetCurrentCpuFrequency();  // Return clockrate of current cpu
+
+// Return clock rate, in Hz, for the currently used logical processor.
+uint64_t GetCurrentCpuFrequency();
+
+// Return the maximum (across all cores/clusters/sockets) last-level cache (LLC) size, in bytes.
+// May overestimate, but should not underestimate, the LLC size.
+size_t GetMaxCacheSize();
 
 template <class T>
 inline T divideRoundUp(T x, T q) {