| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_BASE_H_ |
| #define HIGHWAY_HWY_BASE_H_ |
| |
| // For SIMD module implementations and their callers, target-independent. |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include <atomic> |
| |
| // Add to #if conditions to prevent IDE from graying out code. |
| #if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \ |
| (defined Q_CREATOR_RUN) || (defined(__CLANGD__)) |
| #define HWY_IDE 1 |
| #else |
| #define HWY_IDE 0 |
| #endif |
| |
| //------------------------------------------------------------------------------ |
| // Detect compiler using predefined macros |
| |
| #ifdef _MSC_VER |
| #define HWY_COMPILER_MSVC _MSC_VER |
| #else |
| #define HWY_COMPILER_MSVC 0 |
| #endif |
| |
| #ifdef __INTEL_COMPILER |
| #define HWY_COMPILER_ICC __INTEL_COMPILER |
| #else |
| #define HWY_COMPILER_ICC 0 |
| #endif |
| |
| #ifdef __GNUC__ |
| #define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__) |
| #else |
| #define HWY_COMPILER_GCC 0 |
| #endif |
| |
| // Clang can masquerade as MSVC/GCC, in which case both are set. |
| #ifdef __clang__ |
| #define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__) |
| #else |
| #define HWY_COMPILER_CLANG 0 |
| #endif |
| |
| // More than one may be nonzero, but we want at least one. |
| #if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \ |
| !HWY_COMPILER_CLANG |
| #error "Unsupported compiler" |
| #endif |
| |
| //------------------------------------------------------------------------------ |
| // Compiler-specific definitions |
| |
| #define HWY_STR_IMPL(macro) #macro |
| #define HWY_STR(macro) HWY_STR_IMPL(macro) |
| |
| #if HWY_COMPILER_MSVC |
| |
| #include <intrin.h> |
| |
| #define HWY_RESTRICT __restrict |
| #define HWY_INLINE __forceinline |
| #define HWY_NOINLINE __declspec(noinline) |
| #define HWY_FLATTEN |
| #define HWY_NORETURN __declspec(noreturn) |
| #define HWY_LIKELY(expr) (expr) |
| #define HWY_UNLIKELY(expr) (expr) |
| #define HWY_PRAGMA(tokens) __pragma(tokens) |
| #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens)) |
| #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc) |
| #define HWY_MAYBE_UNUSED |
| #define HWY_HAS_ASSUME_ALIGNED 0 |
| #if (_MSC_VER >= 1700) |
| #define HWY_MUST_USE_RESULT _Check_return_ |
| #else |
| #define HWY_MUST_USE_RESULT |
| #endif |
| |
| #else |
| |
| #define HWY_RESTRICT __restrict__ |
| #define HWY_INLINE inline __attribute__((always_inline)) |
| #define HWY_NOINLINE __attribute__((noinline)) |
| #define HWY_FLATTEN __attribute__((flatten)) |
| #define HWY_NORETURN __attribute__((noreturn)) |
| #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1) |
| #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) |
| #define HWY_PRAGMA(tokens) _Pragma(#tokens) |
| #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens) |
| #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc) |
| // Encountered "attribute list cannot appear here" when using the C++17 |
| // [[maybe_unused]], so only use the old style attribute for now. |
| #define HWY_MAYBE_UNUSED __attribute__((unused)) |
| #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result)) |
| |
| #endif // !HWY_COMPILER_MSVC |
| |
| //------------------------------------------------------------------------------ |
| // Builtin/attributes |
| |
| #ifdef __has_builtin |
| #define HWY_HAS_BUILTIN(name) __has_builtin(name) |
| #else |
| #define HWY_HAS_BUILTIN(name) 0 |
| #endif |
| |
| #ifdef __has_attribute |
| #define HWY_HAS_ATTRIBUTE(name) __has_attribute(name) |
| #else |
| #define HWY_HAS_ATTRIBUTE(name) 0 |
| #endif |
| |
| // Enables error-checking of format strings. |
| #if HWY_HAS_ATTRIBUTE(__format__) |
| #define HWY_FORMAT(idx_fmt, idx_arg) \ |
| __attribute__((__format__(__printf__, idx_fmt, idx_arg))) |
| #else |
| #define HWY_FORMAT(idx_fmt, idx_arg) |
| #endif |
| |
| // Returns a void* pointer which the compiler then assumes is N-byte aligned. |
| // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32); |
| // |
| // The assignment semantics are required by GCC/Clang. ICC provides an in-place |
| // __assume_aligned, whereas MSVC's __assume appears unsuitable. |
| #if HWY_HAS_BUILTIN(__builtin_assume_aligned) |
| #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align)) |
| #else |
| #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */ |
| #endif |
| |
| // Clang and GCC require attributes on each function into which SIMD intrinsics |
| // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and |
| // automatic annotation via pragmas. |
| #if HWY_COMPILER_CLANG |
| #define HWY_PUSH_ATTRIBUTES(targets_str) \ |
| HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \ |
| apply_to = function)) |
| #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop) |
| #elif HWY_COMPILER_GCC |
| #define HWY_PUSH_ATTRIBUTES(targets_str) \ |
| HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str) |
| #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options) |
| #else |
| #define HWY_PUSH_ATTRIBUTES(targets_str) |
| #define HWY_POP_ATTRIBUTES |
| #endif |
| |
| //------------------------------------------------------------------------------ |
| // Detect architecture using predefined macros |
| |
| #if defined(__i386__) || defined(_M_IX86) |
| #define HWY_ARCH_X86_32 1 |
| #else |
| #define HWY_ARCH_X86_32 0 |
| #endif |
| |
| #if defined(__x86_64__) || defined(_M_X64) |
| #define HWY_ARCH_X86_64 1 |
| #else |
| #define HWY_ARCH_X86_64 0 |
| #endif |
| |
| #if HWY_ARCH_X86_32 || HWY_ARCH_X86_64 |
| #define HWY_ARCH_X86 1 |
| #else |
| #define HWY_ARCH_X86 0 |
| #endif |
| |
| #if defined(__powerpc64__) || defined(_M_PPC) |
| #define HWY_ARCH_PPC 1 |
| #else |
| #define HWY_ARCH_PPC 0 |
| #endif |
| |
| #if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) |
| #define HWY_ARCH_ARM 1 |
| #else |
| #define HWY_ARCH_ARM 0 |
| #endif |
| |
| // There isn't yet a standard __wasm or __wasm__. |
| #ifdef __EMSCRIPTEN__ |
| #define HWY_ARCH_WASM 1 |
| #else |
| #define HWY_ARCH_WASM 0 |
| #endif |
| |
| #ifdef __riscv |
| #define HWY_ARCH_RVV 1 |
| #else |
| #define HWY_ARCH_RVV 0 |
| #endif |
| |
| #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \ |
| HWY_ARCH_RVV) != 1 |
| #error "Must detect exactly one platform" |
| #endif |
| |
| //------------------------------------------------------------------------------ |
| // Macros |
| |
| #define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED |
| |
| #define HWY_CONCAT_IMPL(a, b) a##b |
| #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b) |
| |
| #define HWY_MIN(a, b) ((a) < (b) ? (a) : (b)) |
| #define HWY_MAX(a, b) ((a) < (b) ? (b) : (a)) |
| |
| // Alternative for asm volatile("" : : : "memory"), which has no effect. |
| #define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel) |
| |
| // 4 instances of a given literal value, useful as input to LoadDup128. |
| #define HWY_REP4(literal) literal, literal, literal, literal |
| |
| #define HWY_ABORT(format, ...) \ |
| ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__) |
| |
| // Always enabled. |
| #define HWY_ASSERT(condition) \ |
| do { \ |
| if (!(condition)) { \ |
| HWY_ABORT("Assert %s", #condition); \ |
| } \ |
| } while (0) |
| |
| // Only for "debug" builds |
| #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \ |
| defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER) |
| #define HWY_DASSERT(condition) HWY_ASSERT(condition) |
| #else |
| #define HWY_DASSERT(condition) \ |
| do { \ |
| } while (0) |
| #endif |
| |
| //------------------------------------------------------------------------------ |
| |
| namespace hwy { |
| |
| // See also HWY_ALIGNMENT - aligned_allocator aligns to the larger of that and |
| // the vector size, whose upper bound is specified here. |
| |
| #if HWY_ARCH_X86 |
| static constexpr size_t kMaxVectorSize = 64; // AVX-512 |
| #define HWY_ALIGN_MAX alignas(64) |
| #else |
| static constexpr size_t kMaxVectorSize = 16; |
| #define HWY_ALIGN_MAX alignas(16) |
| #endif |
| |
| HWY_NORETURN void HWY_FORMAT(3, 4) |
| Abort(const char* file, int line, const char* format, ...); |
| |
| template <typename T> |
| constexpr bool IsFloat() { |
| return T(1.25) != T(1); |
| } |
| |
| template <typename T> |
| constexpr bool IsSigned() { |
| return T(0) > T(-1); |
| } |
| |
| // Largest/smallest representable integer values. |
| template <typename T> |
| constexpr T LimitsMax() { |
| return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1) |
| : static_cast<T>(~0ull); |
| } |
| template <typename T> |
| constexpr T LimitsMin() { |
| return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0); |
| } |
| |
| // Manual control of overload resolution (SFINAE). |
| template <bool Condition, class T> |
| struct EnableIfT {}; |
| template <class T> |
| struct EnableIfT<true, T> { |
| using type = T; |
| }; |
| |
| template <bool Condition, class T = void> |
| using EnableIf = typename EnableIfT<Condition, T>::type; |
| |
| template <typename T1, typename T2> |
| constexpr inline T1 DivCeil(T1 a, T2 b) { |
| return (a + b - 1) / b; |
| } |
| |
| // Works for any `align`; if a power of two, compiler emits ADD+AND. |
| constexpr inline size_t RoundUpTo(size_t what, size_t align) { |
| return DivCeil(what, align) * align; |
| } |
| |
| // Undefined results for x == 0. |
| HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) { |
| #ifdef _MSC_VER |
| unsigned long index; // NOLINT |
| _BitScanForward(&index, x); |
| return index; |
| #else |
| return static_cast<size_t>(__builtin_ctz(x)); |
| #endif |
| } |
| |
| HWY_API size_t PopCount(uint64_t x) { |
| #if HWY_COMPILER_CLANG || HWY_COMPILER_GCC |
| return static_cast<size_t>(__builtin_popcountll(x)); |
| #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 |
| return _mm_popcnt_u64(x); |
| #elif HWY_COMPILER_MSVC |
| return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32)); |
| #else |
| x -= ((x >> 1) & 0x55555555U); |
| x = (((x >> 2) & 0x33333333U) + (x & 0x33333333U)); |
| x = (((x >> 4) + x) & 0x0F0F0F0FU); |
| x += (x >> 8); |
| x += (x >> 16); |
| x += (x >> 32); |
| x = x & 0x0000007FU; |
| return (unsigned int)x; |
| #endif |
| } |
| |
| // The source/destination must not overlap/alias. |
| template <size_t kBytes, typename From, typename To> |
| HWY_API void CopyBytes(const From* from, To* to) { |
| #if HWY_COMPILER_MSVC |
| const uint8_t* HWY_RESTRICT from_bytes = |
| reinterpret_cast<const uint8_t*>(from); |
| uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to); |
| for (size_t i = 0; i < kBytes; ++i) { |
| to_bytes[i] = from_bytes[i]; |
| } |
| #else |
| // Avoids horrible codegen on Clang (series of PINSRB) |
| __builtin_memcpy(to, from, kBytes); |
| #endif |
| } |
| |
| } // namespace hwy |
| |
| #endif // HIGHWAY_HWY_BASE_H_ |