blob: bf2665aead689c3873c65b2b5daaefc6f2a9b795 [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
JPEG XL226f84e2020-11-11 20:22:00 +010015#ifndef HIGHWAY_HWY_TARGETS_H_
16#define HIGHWAY_HWY_TARGETS_H_
Jan Wassenberg447210e2019-09-06 18:09:39 +020017
#include <stddef.h>
#include <stdint.h>

#include <atomic>
#include <vector>

// For SIMD module implementations and their callers. Defines which targets to
// generate and call.

#include "hwy/base.h"
24
25//------------------------------------------------------------------------------
26// Optional configuration
27
28// See ../quick_reference.md for documentation of these macros.
29
30// Uncomment to override the default baseline determined from predefined macros:
31// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
32
33// Uncomment to override the default blocklist:
34// #define HWY_BROKEN_TARGETS HWY_AVX3
35
36// Uncomment to definitely avoid generating those target(s):
37// #define HWY_DISABLED_TARGETS HWY_SSE4
38
39// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
40// AVX2 target for VMs which support AVX2 but not the other instruction sets)
41// #define HWY_DISABLE_BMI2_FMA
42
43//------------------------------------------------------------------------------
44// Targets
45
46// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
47// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
48//
49// All values are unconditionally defined so we can test HWY_TARGETS without
50// first checking the HWY_ARCH_*.
51//
52// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
53// can use 32-bit literals.
54
55// 1,2,4: reserved
56#define HWY_AVX3 8
57#define HWY_AVX2 16
58// 32: reserved for AVX
59#define HWY_SSE4 64
60// 0x80, 0x100, 0x200: reserved for SSSE3, SSE3, SSE2
61
// The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower than or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86), and together they can occupy at most
// HWY_MAX_DYNAMIC_TARGETS bits.
66#define HWY_HIGHEST_TARGET_BIT_X86 9
67
68// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium
69#define HWY_NEON 0x2000
70
71#define HWY_HIGHEST_TARGET_BIT_ARM 13
72
73// 0x4000, 0x8000 reserved
74#define HWY_PPC8 0x10000 // v2.07 or 3
75// 0x20000, 0x40000 reserved for prior VSX/AltiVec
76
77#define HWY_HIGHEST_TARGET_BIT_PPC 18
78
79// 0x80000 reserved
80#define HWY_WASM 0x100000
81
82#define HWY_HIGHEST_TARGET_BIT_WASM 20
83
Jan Wassenberg0034dac2021-01-07 01:18:02 -080084// 0x200000, 0x400000, 0x800000 reserved
85
86#define HWY_RVV 0x1000000
87
88#define HWY_HIGHEST_TARGET_BIT_RVV 24
89
90// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
Jan Wassenberg94a72d02020-10-29 18:04:03 +010091
92#define HWY_SCALAR 0x20000000
93// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
94
95//------------------------------------------------------------------------------
96// Set default blocklists
97
98// Disabled means excluded from enabled at user's request. A separate config
99// macro allows disabling without deactivating the blocklist below.
100#ifndef HWY_DISABLED_TARGETS
101#define HWY_DISABLED_TARGETS 0
102#endif
103
104// Broken means excluded from enabled due to known compiler issues. Allow the
105// user to override this blocklist without any guarantee of success.
106#ifndef HWY_BROKEN_TARGETS
107
108// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
109// SSE4 codegen (msan failure), so disable all those targets.
110#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
111// TODO: Disable all non-scalar targets for every build target once we have
112// clang-7 enabled in our builders.
113#ifdef MEMORY_SANITIZER
114#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
115#else
116#define HWY_BROKEN_TARGETS 0
117#endif
118// This entails a major speed reduction, so warn unless the user explicitly
119// opts in to scalar-only.
120#if !defined(HWY_COMPILE_ONLY_SCALAR)
121#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
122#endif
123
124// MSVC, or 32-bit may fail to compile AVX2/3.
125#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32
126#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)
127#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds")
128
129#else
130#define HWY_BROKEN_TARGETS 0
131#endif
132
133#endif // HWY_BROKEN_TARGETS
134
135// Enabled means not disabled nor blocklisted.
136#define HWY_ENABLED(targets) \
137 ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
138
139//------------------------------------------------------------------------------
140// Detect baseline targets using predefined macros
141
142// Baseline means the targets for which the compiler is allowed to generate
143// instructions, implying the target CPU would have to support them. Do not use
144// this directly because it does not take the blocklist into account. Allow the
145// user to override this without any guarantee of success.
146#ifndef HWY_BASELINE_TARGETS
147
148#ifdef __wasm_simd128__
149#define HWY_BASELINE_WASM HWY_WASM
150#else
151#define HWY_BASELINE_WASM 0
152#endif
153
154#ifdef __VSX__
155#define HWY_BASELINE_PPC8 HWY_PPC8
156#else
157#define HWY_BASELINE_PPC8 0
158#endif
159
160// GCC 4.5.4 only defines the former; 5.4 defines both.
161#if defined(__ARM_NEON__) || defined(__ARM_NEON)
162#define HWY_BASELINE_NEON HWY_NEON
163#else
164#define HWY_BASELINE_NEON 0
165#endif
166
167#ifdef __SSE4_1__
168#define HWY_BASELINE_SSE4 HWY_SSE4
169#else
170#define HWY_BASELINE_SSE4 0
171#endif
172
173#ifdef __AVX2__
174#define HWY_BASELINE_AVX2 HWY_AVX2
175#else
176#define HWY_BASELINE_AVX2 0
177#endif
178
179#ifdef __AVX512F__
180#define HWY_BASELINE_AVX3 HWY_AVX3
181#else
182#define HWY_BASELINE_AVX3 0
183#endif
184
Jan Wassenberg0034dac2021-01-07 01:18:02 -0800185#ifdef __riscv_vector
186#define HWY_BASELINE_RVV HWY_RVV
187#else
188#define HWY_BASELINE_RVV 0
189#endif
190
Jan Wassenberg94a72d02020-10-29 18:04:03 +0100191#define HWY_BASELINE_TARGETS \
192 (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \
Jan Wassenberg0034dac2021-01-07 01:18:02 -0800193 HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
194 HWY_BASELINE_RVV)
Jan Wassenberg94a72d02020-10-29 18:04:03 +0100195
196#endif // HWY_BASELINE_TARGETS
197
198//------------------------------------------------------------------------------
199// Choose target for static dispatch
200
201#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
202#if HWY_ENABLED_BASELINE == 0
203#error "At least one baseline target must be defined and enabled"
204#endif
205
206// Best baseline, used for static dispatch. This is the least-significant 1-bit
207// within HWY_ENABLED_BASELINE and lower bit values imply "better".
208#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
209
210// Start by assuming static dispatch. If we later use dynamic dispatch, this
211// will be defined to other targets during the multiple-inclusion, and finally
212// return to the initial value. Defining this outside begin/end_target ensures
213// inl headers successfully compile by themselves (required by Bazel).
214#define HWY_TARGET HWY_STATIC_TARGET
215
216//------------------------------------------------------------------------------
217// Choose targets for dynamic dispatch according to one of four policies
218
219#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
220 defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
221#error "Invalid config: can only define a single policy for targets"
222#endif
223
224// Attainable means enabled and the compiler allows intrinsics (even when not
225// allowed to autovectorize). Used in 3 and 4.
226#if HWY_ARCH_X86
227#define HWY_ATTAINABLE_TARGETS \
228 HWY_ENABLED(HWY_SCALAR | HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
229#else
230#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
231#endif
232
233// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
234// to ~HWY_SCALAR, but this is more explicit).
235#if defined(HWY_COMPILE_ONLY_SCALAR)
Jan Wassenberg71b77d42020-11-30 13:26:09 +0100236#undef HWY_STATIC_TARGET
237#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
Jan Wassenberg94a72d02020-10-29 18:04:03 +0100238#define HWY_TARGETS HWY_SCALAR
239
240// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
241#elif defined(HWY_COMPILE_ONLY_STATIC)
242#define HWY_TARGETS HWY_STATIC_TARGET
243
244// 3) For tests: include all attainable targets (in particular: scalar)
245#elif defined(HWY_COMPILE_ALL_ATTAINABLE)
246#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
247
248// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
249// excluding superseded targets, in particular scalar.
250#else
251
252#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
253
254#endif // target policy
255
256// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
257// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
258// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
259#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
260#error "Logic error: best baseline should be included in dynamic targets"
261#endif
262
263//------------------------------------------------------------------------------
264
265namespace hwy {
266
267// Returns (cached) bitfield of enabled targets that are supported on this CPU.
268// Implemented in supported_targets.cc; unconditionally compiled to support the
269// use case of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may
270// allow eliding calls to this function.
271uint32_t SupportedTargets();
272
// Removes the compiled-in targets given by the `disabled_targets` mask from
// runtime dispatch. Targets that were not enabled at compile time are ignored.
// This is useful for disabling a target that the CPU supports but which is
// known to have bugs, or when a lower target is desired. For this reason,
// attempts to disable targets which are in HWY_ENABLED_BASELINE have no effect,
// so SupportedTargets() always returns at least the baseline target.
279void DisableTargets(uint32_t disabled_targets);
280
281// Single target: reduce code size by eliding the call and conditional branches
282// inside Choose*() functions.
283#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
284#define HWY_SUPPORTED_TARGETS HWY_TARGETS
285#else
286#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
287#endif
288
289// Set the mock mask of CPU supported targets instead of the actual CPU
290// supported targets computed in SupportedTargets(). The return value of
// SupportedTargets() will still be affected by the DisableTargets() mask
292// regardless of this mock, to prevent accidentally adding targets that are
293// known to be buggy in the current CPU. Call with a mask of 0 to disable the
294// mock and use the actual CPU supported targets instead.
295void SetSupportedTargetsForTest(uint32_t targets);
296
297// Returns whether the SupportedTargets() function was called since the last
298// SetSupportedTargetsForTest() call.
299bool SupportedTargetsCalledForTest();
300
301// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
302// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
303// is affected by the current SetSupportedTargetsForTest() mock if any.
304HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
305 std::vector<uint32_t> ret;
306 for (uint32_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
307 targets = targets & (targets - 1)) {
308 uint32_t current_target = targets & ~(targets - 1);
309 ret.push_back(current_target);
310 }
311 return ret;
312}
313
314static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
315 switch (target) {
316#if HWY_ARCH_X86
317 case HWY_SSE4:
318 return "SSE4";
319 case HWY_AVX2:
320 return "AVX2";
321 case HWY_AVX3:
322 return "AVX3";
323#endif
324
325#if HWY_ARCH_ARM
326 case HWY_NEON:
327 return "Neon";
328#endif
329
330#if HWY_ARCH_PPC
331 case HWY_PPC8:
332 return "Power8";
333#endif
334
335#if HWY_ARCH_WASM
336 case HWY_WASM:
337 return "Wasm";
338#endif
339
Jan Wassenberg0034dac2021-01-07 01:18:02 -0800340#if HWY_ARCH_RVV
341 case HWY_RVV:
342 return "RVV";
343#endif
344
Jan Wassenberg94a72d02020-10-29 18:04:03 +0100345 case HWY_SCALAR:
346 return "Scalar";
347
348 default:
349 return "?";
350 }
351}
352
353// The maximum number of dynamic targets on any architecture is defined by
354// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
355
356// For the ChosenTarget mask and index we use a different bit arrangement than
357// in the HWY_TARGETS mask. Only the targets involved in the current
358// architecture are used in this mask, and therefore only the least significant
359// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
360// significant bit is set when the mask is not initialized, the next
361// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
362// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
363// that position and the next more significant bit is used for the scalar
364// target. Because of this we need to define equivalent values for HWY_TARGETS
365// in this representation.
// This mask representation allows using ctz() on the mask to obtain a small
// number that serves as an index into the dynamic dispatch table. This way,
// the first entry is used when the mask is uninitialized, the following
// HWY_MAX_DYNAMIC_TARGETS entries are for dynamic dispatch and the last one is
// for scalar.
371
372// The HWY_SCALAR bit in the ChosenTarget mask format.
373#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
374
375// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
376// current architecture.
377#define HWY_CHOSEN_TARGET_SHIFT(X) \
378 ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
379 ((1u << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
380 << 1)
381
382// The HWY_TARGETS mask in the ChosenTarget mask format.
383#define HWY_CHOSEN_TARGET_MASK_TARGETS \
384 (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
385
386#if HWY_ARCH_X86
// Maximum number of dynamic targets. Changing this value is an
// ABI-incompatible change.
389#define HWY_MAX_DYNAMIC_TARGETS 10
390#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// These must match the order in which the HWY_TARGETS are defined, starting
// from the least significant bit considered, i.e. bit
// (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS). This list must
// contain exactly HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR.
// The first entry corresponds to the best target. Don't include a "," at the
// end of the list.
396#define HWY_CHOOSE_TARGET_LIST(func_name) \
397 nullptr, /* reserved */ \
398 nullptr, /* reserved */ \
399 nullptr, /* reserved */ \
400 HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
401 HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
402 nullptr, /* AVX */ \
403 HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
404 nullptr, /* SSSE3 */ \
405 nullptr, /* SSE3 */ \
406 nullptr /* SSE2 */
407
408#endif // HWY_ARCH_X86
409
410#if HWY_ARCH_ARM
411// See HWY_ARCH_X86 above for details.
412#define HWY_MAX_DYNAMIC_TARGETS 4
413#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
414#define HWY_CHOOSE_TARGET_LIST(func_name) \
415 nullptr, /* reserved */ \
416 nullptr, /* reserved */ \
417 nullptr, /* reserved */ \
418 HWY_CHOOSE_NEON(func_name) /* NEON */
419
420#endif // HWY_ARCH_ARM
421
422#if HWY_ARCH_PPC
423// See HWY_ARCH_X86 above for details.
424#define HWY_MAX_DYNAMIC_TARGETS 5
425#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
426#define HWY_CHOOSE_TARGET_LIST(func_name) \
427 nullptr, /* reserved */ \
428 nullptr, /* reserved */ \
429 HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
430 nullptr, /* VSX */ \
431 nullptr /* AltiVec */
432
433#endif // HWY_ARCH_PPC
434
435#if HWY_ARCH_WASM
436// See HWY_ARCH_X86 above for details.
437#define HWY_MAX_DYNAMIC_TARGETS 4
438#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
439#define HWY_CHOOSE_TARGET_LIST(func_name) \
440 nullptr, /* reserved */ \
441 nullptr, /* reserved */ \
442 nullptr, /* reserved */ \
443 HWY_CHOOSE_WASM(func_name) /* WASM */
444
445#endif // HWY_ARCH_WASM
446
Jan Wassenberg0034dac2021-01-07 01:18:02 -0800447#if HWY_ARCH_RVV
448// See HWY_ARCH_X86 above for details.
449#define HWY_MAX_DYNAMIC_TARGETS 4
450#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
451#define HWY_CHOOSE_TARGET_LIST(func_name) \
452 nullptr, /* reserved */ \
453 nullptr, /* reserved */ \
454 nullptr, /* reserved */ \
455 HWY_CHOOSE_RVV(func_name) /* RVV */
456
457#endif // HWY_ARCH_RVV
458
Jan Wassenberg94a72d02020-10-29 18:04:03 +0100459struct ChosenTarget {
460 public:
461 // Update the ChosenTarget mask based on the current CPU supported
462 // targets.
463 void Update();
464
465 // Reset the ChosenTarget to the uninitialized state.
466 void DeInit() { mask_.store(1); }
467
468 // Whether the ChosenTarget was initialized. This is useful to know whether
469 // any HWY_DYNAMIC_DISPATCH function was called.
470 bool IsInitialized() const { return mask_.load() != 1; }
471
472 // Return the index in the dynamic dispatch table to be used by the current
473 // CPU. Note that this method must be in the header file so it uses the value
474 // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
475 // calls it, which may be different from others. This allows to only consider
476 // those targets that were actually compiled in this module.
Highwayf999d0e2020-11-13 13:17:34 +0100477 size_t HWY_INLINE GetIndex() const {
Jan Wassenberg94a72d02020-10-29 18:04:03 +0100478 return hwy::Num0BitsBelowLS1Bit_Nonzero32(mask_.load() &
479 HWY_CHOSEN_TARGET_MASK_TARGETS);
480 }
481
482 private:
483 // Initialized to 1 so GetChosenTargetIndex() returns 0.
484 std::atomic<uint32_t> mask_{1};
485};
486
487extern ChosenTarget chosen_target;
488
489} // namespace hwy
Jan Wassenberg447210e2019-09-06 18:09:39 +0200490
JPEG XL226f84e2020-11-11 20:22:00 +0100491#endif // HIGHWAY_HWY_TARGETS_H_