Avoid using cpuinfo_get_max_cache_size() function
This function is missing in upstream cpuinfo, and causes build failures in the
open-source XNNPACK build.
PiperOrigin-RevId: 272270763
diff --git a/bench/convolution.cc b/bench/convolution.cc
index 863e35c..0857b01 100644
--- a/bench/convolution.cc
+++ b/bench/convolution.cc
@@ -87,7 +87,7 @@
return;
}
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
std::vector<uint8_t> output(output_elements * num_buffers);
@@ -204,7 +204,7 @@
return;
}
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (kernel.size() + bias.size() + output_elements));
std::vector<float> output(output_elements * num_buffers);
diff --git a/bench/deconvolution.cc b/bench/deconvolution.cc
index 423c7a6..4acfffe 100644
--- a/bench/deconvolution.cc
+++ b/bench/deconvolution.cc
@@ -75,7 +75,7 @@
return;
}
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (kernel.size() + bias.size() + output_elements));
std::vector<uint8_t> output(output_elements * num_buffers);
@@ -192,7 +192,7 @@
return;
}
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (kernel.size() + bias.size() + output_elements));
std::vector<float> output(output_elements * num_buffers);
diff --git a/bench/f16-gemm.cc b/bench/f16-gemm.cc
index e107813..15c8a92 100644
--- a/bench/f16-gemm.cc
+++ b/bench/f16-gemm.cc
@@ -58,7 +58,7 @@
const size_t w_elements = nc_stride * kc_stride + nc_stride;
const size_t c_elements = mc * nc;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(uint16_t) * (w_elements + c_elements));
std::vector<uint16_t, AlignedAllocator<uint16_t, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-conv-hwc.cc b/bench/f32-conv-hwc.cc
index 8535b71..1b2687a 100644
--- a/bench/f32-conv-hwc.cc
+++ b/bench/f32-conv-hwc.cc
@@ -60,7 +60,7 @@
benchmark::utils::roundUp<size_t>(output_channels, output_channels_tile);
const size_t output_elements = output_height * output_width * output_channels;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (weights_elements + output_elements));
std::vector<float, AlignedAllocator<float, 32>> packed_weights(weights_elements * num_buffers);
diff --git a/bench/f32-dwconv-spchw.cc b/bench/f32-dwconv-spchw.cc
index fd44b78..cc81527 100644
--- a/bench/f32-dwconv-spchw.cc
+++ b/bench/f32-dwconv-spchw.cc
@@ -91,7 +91,7 @@
const size_t w_elements = (kernel_size + 1) * channels;
const size_t o_elements = output_size * channels;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + o_elements));
std::vector<float, AlignedAllocator<float, 32>> packed_weights(w_elements * num_buffers);
@@ -208,7 +208,7 @@
const size_t w_elements = (kernel_size + 1) * channels;
const size_t o_elements = output_height * benchmark::utils::roundUp<size_t>(output_width, ot) * channels;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + o_elements));
std::vector<float, AlignedAllocator<float, 32>> packed_weights(w_elements * num_buffers);
diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc
index d3967b0..d21b1c3 100644
--- a/bench/f32-dwconv.cc
+++ b/bench/f32-dwconv.cc
@@ -79,7 +79,7 @@
const size_t i_elements = output_height * step_height;
const size_t c_elements = output_size * channels;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);
std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 9425971..882fc2b 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -61,7 +61,7 @@
const size_t w_elements = nc_stride * kc_stride + nc_stride;
const size_t c_elements = mc * nc;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + c_elements));
std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
@@ -132,7 +132,7 @@
const size_t w_elements = nc_stride * kc + nc_stride;
const size_t c_elements = mc * nc;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + c_elements));
std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
@@ -205,7 +205,7 @@
const size_t w_elements = nc_stride * kc + nc_stride;
const size_t c_elements = mc * nc;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + c_elements));
std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
@@ -259,7 +259,7 @@
const size_t kc = state.range(2);
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (nc * (mc + kc + 1)));
std::vector<float> a(mc * kc);
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 0e371a6..e62983b 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -76,7 +76,7 @@
const size_t i_elements = mc_stride * kernel_size;
const size_t c_elements = output_height * output_width * output_pixel_stride;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);
std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-im2col-gemm.cc b/bench/f32-im2col-gemm.cc
index be4444a..d9100f8 100644
--- a/bench/f32-im2col-gemm.cc
+++ b/bench/f32-im2col-gemm.cc
@@ -70,7 +70,7 @@
const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
const size_t c_elements = output_size * group_output_channels;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + c_elements));
std::vector<float, AlignedAllocator<float, 32>> w(w_elements * num_buffers);
diff --git a/bench/f32-spmm.cc b/bench/f32-spmm.cc
index 4d2a9da..19a78cd 100644
--- a/bench/f32-spmm.cc
+++ b/bench/f32-spmm.cc
@@ -54,7 +54,7 @@
const size_t dmap_elements = num_nonzeroes / nr;
const size_t nmap_elements = nc;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(float) * (w_elements + c_elements) + sizeof(uint32_t) * (dmap_elements + nmap_elements));
// Micro-kernel can access one element beyond w and dmap for software pipelining.
diff --git a/bench/q8-gemm.cc b/bench/q8-gemm.cc
index 9a15006..1538d13 100644
--- a/bench/q8-gemm.cc
+++ b/bench/q8-gemm.cc
@@ -61,7 +61,7 @@
const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
const size_t c_elements = mc * nc;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
sizeof(uint8_t) * (w_elements + c_elements));
std::vector<uint8_t, AlignedAllocator<uint8_t, 32>> w(w_elements * num_buffers);
@@ -159,7 +159,7 @@
const size_t bElements = nc;
const size_t c_elements = mc * nc;
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
std::vector<uint8_t> k(kElements * num_buffers);
@@ -209,7 +209,7 @@
auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
const size_t num_buffers = 1 +
- benchmark::utils::divideRoundUp<size_t>(cpuinfo_get_max_cache_size(),
+ benchmark::utils::divideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
std::vector<uint8_t> a(mc * kc);
diff --git a/bench/utils.cc b/bench/utils.cc
index eb02e21..87ae24c 100644
--- a/bench/utils.cc
+++ b/bench/utils.cc
@@ -25,7 +25,7 @@
// Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
wipe_buffer_size = 128 * 1024 * 1024;
if (cpuinfo_initialize()) {
- wipe_buffer_size = cpuinfo_get_max_cache_size();
+ wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
}
#if defined(__ANDROID__)
// memalign is obsolete, but it is the only option on Android until API level 17.
@@ -83,5 +83,123 @@
return 0;
}
+size_t GetMaxCacheSize() {
+ if (!cpuinfo_initialize()) {
+ #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ // DynamIQ max: 4 MB
+ return 4 * 1024 * 1024;
+ #else
+ // Intel eDRAM max: 128 MB
+ return 128 * 1024 * 1024;
+ #endif
+ }
+ const cpuinfo_processor* processor = cpuinfo_get_processor(0);
+ #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
+ // There is no precise way to detect cache size on ARM/ARM64, and cache size reported by cpuinfo
+ // may underestimate the actual cache size. Thus, we use microarchitecture-specific maximum.
+ switch (processor->core->uarch) {
+ case cpuinfo_uarch_xscale:
+ case cpuinfo_uarch_arm11:
+ case cpuinfo_uarch_scorpion:
+ case cpuinfo_uarch_krait:
+ case cpuinfo_uarch_kryo:
+ case cpuinfo_uarch_exynos_m1:
+ case cpuinfo_uarch_exynos_m2:
+ case cpuinfo_uarch_exynos_m3:
+ // cpuinfo-detected cache size always correct.
+ break;
+ case cpuinfo_uarch_cortex_a5:
+ // Max observed (NXP Vybrid SoC)
+ return 512 * 1024;
+ case cpuinfo_uarch_cortex_a7:
+ // Cortex-A7 MPCore Technical Reference Manual:
+ // 7.1. About the L2 Memory system
+ // The L2 memory system consists of an:
+ // - Optional tightly-coupled L2 cache that includes:
+ // - Configurable L2 cache size of 128KB, 256KB, 512KB, and 1MB.
+ return 1024 * 1024;
+ case cpuinfo_uarch_cortex_a8:
+ // Cortex-A8 Technical Reference Manual:
+ // 8.1. About the L2 memory system
+ // The key features of the L2 memory system include:
+ // - configurable cache size of 0KB, 128KB, 256KB, 512KB, and 1MB
+ return 1024 * 1024;
+ case cpuinfo_uarch_cortex_a9:
+ // Max observed (e.g. Exynos 4212)
+ return 1024 * 1024;
+ case cpuinfo_uarch_cortex_a12:
+ case cpuinfo_uarch_cortex_a17:
+ // ARM Cortex-A17 MPCore Processor Technical Reference Manual:
+ // 7.1. About the L2 Memory system
+ // The key features of the L2 memory system include:
+ // - An integrated L2 cache:
+ // - The cache size is implemented as either 256KB, 512KB, 1MB, 2MB, 4MB or 8MB.
+ return 8 * 1024 * 1024;
+ case cpuinfo_uarch_cortex_a15:
+ // ARM Cortex-A15 MPCore Processor Technical Reference Manual:
+ // 7.1. About the L2 memory system
+ // The features of the L2 memory system include:
+ // - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
+ return 4 * 1024 * 1024;
+ case cpuinfo_uarch_cortex_a35:
+ // ARM Cortex‑A35 Processor Technical Reference Manual:
+ // 7.1 About the L2 memory system
+ // L2 cache
+ // - Further features of the L2 cache are:
+ // - Configurable size of 128KB, 256KB, 512KB, and 1MB.
+ return 1024 * 1024;
+ case cpuinfo_uarch_cortex_a53:
+ // ARM Cortex-A53 MPCore Processor Technical Reference Manual:
+ // 7.1. About the L2 memory system
+ // The L2 memory system consists of an:
+ // - Optional tightly-coupled L2 cache that includes:
+ // - Configurable L2 cache size of 128KB, 256KB, 512KB, 1MB and 2MB.
+ return 2 * 1024 * 1024;
+ case cpuinfo_uarch_cortex_a57:
+ // ARM Cortex-A57 MPCore Processor Technical Reference Manual:
+ // 7.1 About the L2 memory system
+ // The features of the L2 memory system include:
+ // - Configurable L2 cache size of 512KB, 1MB, and 2MB.
+ return 2 * 1024 * 1024;
+ case cpuinfo_uarch_cortex_a72:
+ // ARM Cortex-A72 MPCore Processor Technical Reference Manual:
+ // 7.1 About the L2 memory system
+ // The features of the L2 memory system include:
+ // - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
+ return 4 * 1024 * 1024;
+ case cpuinfo_uarch_cortex_a73:
+ // ARM Cortex‑A73 MPCore Processor Technical Reference Manual
+ // 7.1 About the L2 memory system
+ // The L2 memory system consists of:
+ // - A tightly-integrated L2 cache with:
+ // - A configurable size of 256KB, 512KB, 1MB, 2MB, 4MB, or 8MB.
+ return 8 * 1024 * 1024;
+ default:
+ // ARM DynamIQ Shared Unit Technical Reference Manual
+ // 1.3 Implementation options
+ // L3_CACHE_SIZE
+ // - 256KB
+ // - 512KB
+ // - 1024KB
+ // - 1536KB
+ // - 2048KB
+ // - 3072KB
+ // - 4096KB
+ return 4 * 1024 * 1024;
+ }
+ #endif
+ if (processor->cache.l4 != NULL) {
+ return processor->cache.l4->size;
+ } else if (processor->cache.l3 != NULL) {
+ return processor->cache.l3->size;
+ } else if (processor->cache.l2 != NULL) {
+ return processor->cache.l2->size;
+ } else if (processor->cache.l1d != NULL) {
+ return processor->cache.l1d->size;
+ } else {
+ return 0;
+ }
+}
+
} // namespace utils
} // namespace benchmark
diff --git a/bench/utils.h b/bench/utils.h
index 87f235b..9775af2 100644
--- a/bench/utils.h
+++ b/bench/utils.h
@@ -13,7 +13,13 @@
uint32_t wipeCache();
uint32_t prefetchToL1(const void* ptr, size_t size);
-uint64_t GetCurrentCpuFrequency(); // Return clockrate of current cpu
+
+// Return clock rate, in Hz, for the currently used logical processor.
+uint64_t GetCurrentCpuFrequency();
+
+// Return maximum (across all cores/clusters/sockets) last level cache size.
+// Can overestimate, but not underestimate LLC size.
+size_t GetMaxCacheSize();
template <class T>
inline T divideRoundUp(T x, T q) {