Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 1 | // Copyright 2020 Google LLC |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
Jan Wassenberg | b97d18f | 2020-11-11 17:53:22 +0100 | [diff] [blame] | 15 | // This include guard is checked by foreach_target, so avoid the usual _H_ |
| 16 | // suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included |
| 17 | // after/outside this include guard. |
| 18 | #ifndef HWY_HIGHWAY_INCLUDED |
| 19 | #define HWY_HIGHWAY_INCLUDED |
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 20 | |
Jan Wassenberg | b97d18f | 2020-11-11 17:53:22 +0100 | [diff] [blame] | 21 | // Main header required before using vector types. |
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 22 | |
| 23 | #include "hwy/targets.h" |
| 24 | |
| 25 | namespace hwy { |
| 26 | |
// Version of the library API as MAJOR.MINOR.PATCH, following semantic
// versioning (https://semver.org/).
#define HWY_MAJOR 0
#define HWY_MINOR 7
#define HWY_PATCH 0
| 31 | |
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 32 | //------------------------------------------------------------------------------ |
| 33 | // Shorthand for descriptors (defined in shared-inl.h) used to select overloads. |
| 34 | |
| 35 | // Because Highway functions take descriptor and/or vector arguments, ADL finds |
| 36 | // these functions without requiring users in project::HWY_NAMESPACE to |
| 37 | // qualify Highway functions with hwy::HWY_NAMESPACE. However, ADL rules for |
| 38 | // templates require `using hwy::HWY_NAMESPACE::ShiftLeft;` etc. declarations. |
| 39 | |
// Descriptor for a full (native-width) vector of T, i.e. one with
// HWY_LANES(T) lanes.
#define HWY_FULL(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
| 42 | |
// Descriptor for a vector of T with up to MAX_N lanes: the lane count is
// HWY_MIN(MAX_N, HWY_LANES(T)).
#define HWY_CAPPED(T, MAX_N) \
  hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>
| 46 | |
| 47 | //------------------------------------------------------------------------------ |
| 48 | // Export user functions for static/dynamic dispatch |
| 49 | |
// HWY_ONCE evaluates to 0 inside a translation unit while it is generating
// anything other than the static target (the last one when multiple targets
// are enabled); it is used to prevent redefinitions of HWY_EXPORT. Without
// foreach_target.h the code is only compiled once anyway, so default to 1
// when nothing has defined it yet.
#if !defined(HWY_ONCE)
#define HWY_ONCE 1
#endif
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 57 | |
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
//
// Each known static target maps to its N_<TARGET> namespace. RVV is included
// so that the macro is also available when RVV is the static target, matching
// HWY_CHOOSE_RVV below. If HWY_STATIC_TARGET matches none of the branches, the
// macro is left undefined.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#endif
| 76 | |
| 77 | // Dynamic dispatch declarations. |
| 78 | |
| 79 | template <typename RetType, typename... Args> |
| 80 | struct FunctionCache { |
| 81 | public: |
| 82 | typedef RetType(FunctionType)(Args...); |
| 83 | |
| 84 | // A template function that when instantiated has the same signature as the |
| 85 | // function being called. This function initializes the global cache of the |
| 86 | // current supported targets mask used for dynamic dispatch and calls the |
| 87 | // appropriate function. Since this mask used for dynamic dispatch is a |
| 88 | // global cache, all the highway exported functions, even those exposed by |
| 89 | // different modules, will be initialized after this function runs for any one |
| 90 | // of those exported functions. |
| 91 | template <FunctionType* const table[]> |
| 92 | static RetType ChooseAndCall(Args... args) { |
| 93 | // If we are running here it means we need to update the chosen target. |
| 94 | chosen_target.Update(); |
| 95 | return (table[chosen_target.GetIndex()])(args...); |
| 96 | } |
| 97 | }; |
| 98 | |
| 99 | // Factory function only used to infer the template parameters RetType and Args |
| 100 | // from a function passed to the factory. |
| 101 | template <typename RetType, typename... Args> |
| 102 | FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) { |
| 103 | return FunctionCache<RetType, Args...>(); |
| 104 | } |
| 105 | |
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target, or
// nullptr if that target was not compiled.

// Scalar is the exception: when it is not among the compiled targets but we
// still try to use it because other targets were disabled at runtime, fall
// back to the baseline via HWY_STATIC_DISPATCH() rather than nullptr.
#if !(HWY_TARGETS & HWY_SCALAR)
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#else
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &N_SCALAR::FUNC_NAME
#endif
| 116 | |
// WASM: pointer to the N_WASM implementation if that target is compiled,
// otherwise nullptr.
#if !(HWY_TARGETS & HWY_WASM)
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#endif
| 122 | |
// RVV: pointer to the N_RVV implementation if that target is compiled,
// otherwise nullptr.
#if !(HWY_TARGETS & HWY_RVV)
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#endif
| 128 | |
// NEON: pointer to the N_NEON implementation if that target is compiled,
// otherwise nullptr.
#if !(HWY_TARGETS & HWY_NEON)
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#endif
| 134 | |
// PPC8: pointer to the N_PPC8 implementation if that target is compiled,
// otherwise nullptr. Both branches must define the same macro name; the
// enabled branch previously defined the misspelled HWY_CHOOSE_PCC8, leaving
// HWY_CHOOSE_PPC8 undefined exactly when PPC8 was compiled.
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif
| 140 | |
// SSE4: pointer to the N_SSE4 implementation if that target is compiled,
// otherwise nullptr.
#if !(HWY_TARGETS & HWY_SSE4)
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#endif
| 146 | |
// AVX2: pointer to the N_AVX2 implementation if that target is compiled,
// otherwise nullptr.
#if !(HWY_TARGETS & HWY_AVX2)
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#endif
| 152 | |
// AVX3: pointer to the N_AVX3 implementation if that target is compiled,
// otherwise nullptr.
#if !(HWY_TARGETS & HWY_AVX3)
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#endif
| 158 | |
// Identifier of the dispatch table belonging to FUNC_NAME, built by passing
// FUNC_NAME and the suffix HighwayDispatchTable to HWY_CONCAT (presumably
// token concatenation; HWY_CONCAT is defined elsewhere).
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
| 161 | |
| 162 | // HWY_EXPORT(FUNC_NAME); expands to a static array that is used by |
| 163 | // HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This |
| 164 | // static array must be defined at the same namespace level as the function |
| 165 | // it is exporting. |
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
| 169 | // |
| 170 | // #include "hwy/highway.h" |
| 171 | // HWY_BEFORE_NAMESPACE(); |
| 172 | // namespace skeleton { |
| 173 | // namespace HWY_NAMESPACE { |
| 174 | // |
| 175 | // void MyFunction(int a, char b, const char* c) { ... } |
| 176 | // |
| 177 | // // NOLINTNEXTLINE(google-readability-namespace-comments) |
| 178 | // } // namespace HWY_NAMESPACE |
| 179 | // } // namespace skeleton |
| 180 | // HWY_AFTER_NAMESPACE(); |
| 181 | // |
| 182 | // namespace skeleton { |
| 183 | // HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope. |
| 184 | // |
| 185 | // void MyFunction(int a, char b, const char* c) { |
| 186 | // return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c); |
| 187 | // } |
| 188 | // } // namespace skeleton |
| 189 | // |
| 190 | |
// (HWY_TARGETS & (HWY_TARGETS - 1)) clears the lowest set bit of HWY_TARGETS,
// so it is zero exactly when at most one target bit is set.
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// IDE or single-target build. A table is still used, although it has just one
// element, so that the same compile errors are triggered as in the
// multi-target case. HWY_DYNAMIC_DISPATCH always calls through that element.
#define HWY_EXPORT(FUNC_NAME) \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
      HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) (*(HWY_DISPATCH_TABLE(FUNC_NAME)[0]))

#else

// Multi-target build. The table holds HWY_MAX_DYNAMIC_TARGETS + 2 entries: the
// initialization wrapper first, then the entries produced by
// HWY_CHOOSE_TARGET_LIST, and finally the scalar entry. HWY_DYNAMIC_DISPATCH
// calls the entry selected by hwy::chosen_target.GetIndex().
#define HWY_EXPORT(FUNC_NAME) \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
      HWY_DISPATCH_TABLE(FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
          /* First slot: wrapper that initializes the global cache and then \
           * calls the appropriate function. */ \
          &decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH( \
              FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
          HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
          HWY_CHOOSE_SCALAR(FUNC_NAME), \
  }
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))

#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
| 220 | |
| 221 | } // namespace hwy |
| 222 | |
Jan Wassenberg | b97d18f | 2020-11-11 17:53:22 +0100 | [diff] [blame] | 223 | #endif // HWY_HIGHWAY_INCLUDED |
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 224 | |
| 225 | //------------------------------------------------------------------------------ |
| 226 | |
| 227 | // NOTE: ops/*.h cannot use regular include guards because their definitions |
| 228 | // depend on HWY_TARGET, e.g. enabling AVX3 instructions on 128-bit vectors, so |
| 229 | // we want to include them once per target. However, each *-inl.h includes |
| 230 | // highway.h, so we still need an external per-target include guard. |
| 231 | #if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE) |
// Flip the per-target guard. The enclosing #if is only entered while this
// macro's defined-ness equals that of HWY_TARGET_TOGGLE, so toggling it here
// keeps the section below from being processed again until HWY_TARGET_TOGGLE
// itself changes.
#if defined(HWY_HIGHWAY_PER_TARGET)
#undef HWY_HIGHWAY_PER_TARGET
#else
#define HWY_HIGHWAY_PER_TARGET
#endif
| 237 | |
| 238 | // These define ops inside namespace hwy::HWY_NAMESPACE. |
| 239 | #if HWY_TARGET == HWY_SSE4 |
| 240 | #include "hwy/ops/x86_128-inl.h" |
| 241 | #elif HWY_TARGET == HWY_AVX2 |
| 242 | #include "hwy/ops/x86_256-inl.h" |
| 243 | #elif HWY_TARGET == HWY_AVX3 |
| 244 | #include "hwy/ops/x86_512-inl.h" |
| 245 | #elif HWY_TARGET == HWY_PPC8 |
| 246 | #elif HWY_TARGET == HWY_NEON |
| 247 | #include "hwy/ops/arm_neon-inl.h" |
| 248 | #elif HWY_TARGET == HWY_WASM |
| 249 | #include "hwy/ops/wasm_128-inl.h" |
Jan Wassenberg | 0034dac | 2021-01-07 01:18:02 -0800 | [diff] [blame] | 250 | #elif HWY_TARGET == HWY_RVV |
| 251 | // TODO(janwas): header |
| 252 | #include "hwy/ops/shared-inl.h" |
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 253 | #elif HWY_TARGET == HWY_SCALAR |
| 254 | #include "hwy/ops/scalar-inl.h" |
| 255 | #else |
| 256 | #pragma message("HWY_TARGET does not match any known target") |
| 257 | #endif // HWY_TARGET |
| 258 | |
Jan Wassenberg | b97d18f | 2020-11-11 17:53:22 +0100 | [diff] [blame] | 259 | // Commonly used functions/types that must come after ops are defined. |
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 260 | HWY_BEFORE_NAMESPACE(); |
| 261 | namespace hwy { |
| 262 | namespace HWY_NAMESPACE { |
| 263 | |
// Type of a single lane of vector type V (e.g. float for
// Vec<Simd<float, 4>>), obtained as the return type of GetLane.
template <typename V>
using LaneType = decltype(GetLane(V()));
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 267 | |
// Vector type associated with descriptor D (e.g. Vec128<float> for
// Simd<float, 4>), obtained as the return type of Zero(D()). Use it as the
// return type of functions without a vector argument, as an argument type when
// the only template parameter is D, or wherever an explicit type name is
// preferred over auto. This may be a built-in type.
template <typename D>
using Vec = decltype(Zero(D()));
| 274 | |
// Mask type associated with descriptor D, obtained as the return type of
// MaskFromVec applied to Zero(D()). Use it as the return type of functions
// without a mask argument, as an argument type when the only template
// parameter is D, or wherever an explicit type name is preferred over auto.
template <typename D>
using Mask = decltype(MaskFromVec(Zero(D())));
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 280 | |
Jan Wassenberg | bc4a79e | 2020-12-02 06:46:11 -0800 | [diff] [blame] | 281 | // Returns the closest value to v within [lo, hi]. |
| 282 | template <class V> |
| 283 | HWY_API V Clamp(const V v, const V lo, const V hi) { |
| 284 | return Min(Max(lo, v), hi); |
| 285 | } |
| 286 | |
// CombineShiftRightBytes (and hence the ..Lanes variant below) does not exist
// for the scalar target.
// TODO(janwas): implement for RVV
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

// Lane-granularity wrapper around CombineShiftRightBytes: converts the lane
// count kLanes into a byte count using the size of one lane of V.
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
  constexpr size_t kShiftBytes = kLanes * sizeof(LaneType<V>);
  return CombineShiftRightBytes<kShiftBytes>(hi, lo);
}

#endif
| 297 | |
Jan Wassenberg | 6a88083 | 2020-12-03 08:41:16 -0800 | [diff] [blame] | 298 | // Returns lanes with the most significant bit set and all other bits zero. |
| 299 | template <class D> |
| 300 | HWY_API Vec<D> SignBit(D d) { |
| 301 | using Unsigned = MakeUnsigned<typename D::T>; |
| 302 | const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1); |
| 303 | return BitCast(d, Set(Rebind<Unsigned, D>(), bit)); |
| 304 | } |
| 305 | |
Jan Wassenberg | 94a72d0 | 2020-10-29 18:04:03 +0100 | [diff] [blame] | 306 | // NOLINTNEXTLINE(google-readability-namespace-comments) |
| 307 | } // namespace HWY_NAMESPACE |
| 308 | } // namespace hwy |
| 309 | HWY_AFTER_NAMESPACE(); |
| 310 | |
| 311 | #endif // HWY_HIGHWAY_PER_TARGET |