Support ARM64EC ABI
diff --git a/src/threadpool-atomics.h b/src/threadpool-atomics.h
index 44772e2..eaa0707 100644
--- a/src/threadpool-atomics.h
+++ b/src/threadpool-atomics.h
@@ -5,7 +5,7 @@
 #include <stdint.h>
 
 /* SSE-specific headers */
-#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
 	#include <xmmintrin.h>
 #endif
 
@@ -369,365 +369,6 @@
 	static inline void pthreadpool_fence_release() {
 		__sync_synchronize();
 	}
-#elif defined(_MSC_VER) && defined(_M_X64)
-	typedef volatile uint32_t pthreadpool_atomic_uint32_t;
-	typedef volatile size_t   pthreadpool_atomic_size_t;
-	typedef void *volatile    pthreadpool_atomic_void_p;
-
-	static inline uint32_t pthreadpool_load_relaxed_uint32_t(
-		pthreadpool_atomic_uint32_t* address)
-	{
-		return *address;
-	}
-
-	static inline size_t pthreadpool_load_relaxed_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return *address;
-	}
-
-	static inline void* pthreadpool_load_relaxed_void_p(
-		pthreadpool_atomic_void_p* address)
-	{
-		return *address;
-	}
-
-	static inline uint32_t pthreadpool_load_acquire_uint32_t(
-		pthreadpool_atomic_uint32_t* address)
-	{
-		/* x86-64 loads always have acquire semantics; use only a compiler barrier */
-		const uint32_t value = *address;
-		_ReadBarrier();
-		return value;
-	}
-
-	static inline size_t pthreadpool_load_acquire_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		/* x86-64 loads always have acquire semantics; use only a compiler barrier */
-		const size_t value = *address;
-		_ReadBarrier();
-		return value;
-	}
-
-	static inline void pthreadpool_store_relaxed_uint32_t(
-		pthreadpool_atomic_uint32_t* address,
-		uint32_t value)
-	{
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_relaxed_size_t(
-		pthreadpool_atomic_size_t* address,
-		size_t value)
-	{
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_relaxed_void_p(
-		pthreadpool_atomic_void_p* address,
-		void* value)
-	{
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_release_uint32_t(
-		pthreadpool_atomic_uint32_t* address,
-		uint32_t value)
-	{
-		/* x86-64 stores always have release semantics; use only a compiler barrier */
-		_WriteBarrier();
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_release_size_t(
-		pthreadpool_atomic_size_t* address,
-		size_t value)
-	{
-		/* x86-64 stores always have release semantics; use only a compiler barrier */
-		_WriteBarrier();
-		*address = value;
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_relaxed_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_release_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
-	}
-
-	static inline bool pthreadpool_try_decrement_relaxed_size_t(
-		pthreadpool_atomic_size_t* value)
-	{
-		size_t actual_value = *value;
-		while (actual_value != 0) {
-			const size_t new_value = actual_value - 1;
-			const size_t expected_value = actual_value;
-			actual_value = _InterlockedCompareExchange64(
-				(volatile __int64*) value, (__int64) new_value, (__int64) expected_value);
-			if (actual_value == expected_value) {
-				return true;
-			}
-		}
-		return false;
-	}
-
-	static inline void pthreadpool_fence_acquire() {
-		_mm_lfence();
-		_ReadBarrier();
-	}
-
-	static inline void pthreadpool_fence_release() {
-		_WriteBarrier();
-		_mm_sfence();
-	}
-#elif defined(_MSC_VER) && defined(_M_IX86)
-	typedef volatile uint32_t pthreadpool_atomic_uint32_t;
-	typedef volatile size_t   pthreadpool_atomic_size_t;
-	typedef void *volatile    pthreadpool_atomic_void_p;
-
-	static inline uint32_t pthreadpool_load_relaxed_uint32_t(
-		pthreadpool_atomic_uint32_t* address)
-	{
-		return *address;
-	}
-
-	static inline size_t pthreadpool_load_relaxed_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return *address;
-	}
-
-	static inline void* pthreadpool_load_relaxed_void_p(
-		pthreadpool_atomic_void_p* address)
-	{
-		return *address;
-	}
-
-	static inline uint32_t pthreadpool_load_acquire_uint32_t(
-		pthreadpool_atomic_uint32_t* address)
-	{
-		/* x86 loads always have acquire semantics; use only a compiler barrier */
-		const uint32_t value = *address;
-		_ReadBarrier();
-		return value;
-	}
-
-	static inline size_t pthreadpool_load_acquire_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		/* x86 loads always have acquire semantics; use only a compiler barrier */
-		const size_t value = *address;
-		_ReadBarrier();
-		return value;
-	}
-
-	static inline void pthreadpool_store_relaxed_uint32_t(
-		pthreadpool_atomic_uint32_t* address,
-		uint32_t value)
-	{
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_relaxed_size_t(
-		pthreadpool_atomic_size_t* address,
-		size_t value)
-	{
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_relaxed_void_p(
-		pthreadpool_atomic_void_p* address,
-		void* value)
-	{
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_release_uint32_t(
-		pthreadpool_atomic_uint32_t* address,
-		uint32_t value)
-	{
-		/* x86 stores always have release semantics; use only a compiler barrier */
-		_WriteBarrier();
-		*address = value;
-	}
-
-	static inline void pthreadpool_store_release_size_t(
-		pthreadpool_atomic_size_t* address,
-		size_t value)
-	{
-		/* x86 stores always have release semantics; use only a compiler barrier */
-		_WriteBarrier();
-		*address = value;
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_relaxed_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement((volatile long*) address);
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_release_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement((volatile long*) address);
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement((volatile long*) address);
-	}
-
-	static inline bool pthreadpool_try_decrement_relaxed_size_t(
-		pthreadpool_atomic_size_t* value)
-	{
-		size_t actual_value = *value;
-		while (actual_value != 0) {
-			const size_t new_value = actual_value - 1;
-			const size_t expected_value = actual_value;
-			actual_value = _InterlockedCompareExchange(
-				(volatile long*) value, (long) new_value, (long) expected_value);
-			if (actual_value == expected_value) {
-				return true;
-			}
-		}
-		return false;
-	}
-
-	static inline void pthreadpool_fence_acquire() {
-		_mm_lfence();
-	}
-
-	static inline void pthreadpool_fence_release() {
-		_mm_sfence();
-	}
-#elif defined(_MSC_VER) && defined(_M_ARM64)
-	typedef volatile uint32_t pthreadpool_atomic_uint32_t;
-	typedef volatile size_t   pthreadpool_atomic_size_t;
-	typedef void *volatile    pthreadpool_atomic_void_p;
-
-	static inline uint32_t pthreadpool_load_relaxed_uint32_t(
-		pthreadpool_atomic_uint32_t* address)
-	{
-		return (uint32_t) __iso_volatile_load32((const volatile __int32*) address);
-	}
-
-	static inline size_t pthreadpool_load_relaxed_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) __iso_volatile_load64((const volatile __int64*) address);
-	}
-
-	static inline void* pthreadpool_load_relaxed_void_p(
-		pthreadpool_atomic_void_p* address)
-	{
-		return (void*) __iso_volatile_load64((const volatile __int64*) address);
-	}
-
-	static inline uint32_t pthreadpool_load_acquire_uint32_t(
-		pthreadpool_atomic_uint32_t* address)
-	{
-		return (uint32_t) __ldar32((volatile unsigned __int32*) address);
-	}
-
-	static inline size_t pthreadpool_load_acquire_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) __ldar64((volatile unsigned __int64*) address);
-	}
-
-	static inline void pthreadpool_store_relaxed_uint32_t(
-		pthreadpool_atomic_uint32_t* address,
-		uint32_t value)
-	{
-		__iso_volatile_store32((volatile __int32*) address, (__int32) value);
-	}
-
-	static inline void pthreadpool_store_relaxed_size_t(
-		pthreadpool_atomic_size_t* address,
-		size_t value)
-	{
-		__iso_volatile_store64((volatile __int64*) address, (__int64) value);
-	}
-
-	static inline void pthreadpool_store_relaxed_void_p(
-		pthreadpool_atomic_void_p* address,
-		void* value)
-	{
-		__iso_volatile_store64((volatile __int64*) address, (__int64) value);
-	}
-
-	static inline void pthreadpool_store_release_uint32_t(
-		pthreadpool_atomic_uint32_t* address,
-		uint32_t value)
-	{
-		_WriteBarrier();
-		__stlr32((unsigned __int32 volatile*) address, (unsigned __int32) value);
-	}
-
-	static inline void pthreadpool_store_release_size_t(
-		pthreadpool_atomic_size_t* address,
-		size_t value)
-	{
-		_WriteBarrier();
-		__stlr64((unsigned __int64 volatile*) address, (unsigned __int64) value);
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_relaxed_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement64_nf((volatile __int64*) address);
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_release_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement64_rel((volatile __int64*) address);
-	}
-
-	static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t(
-		pthreadpool_atomic_size_t* address)
-	{
-		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
-	}
-
-	static inline bool pthreadpool_try_decrement_relaxed_size_t(
-		pthreadpool_atomic_size_t* value)
-	{
-		size_t actual_value = (size_t) __iso_volatile_load64((const volatile __int64*) value);
-		while (actual_value != 0) {
-			const size_t new_value = actual_value - 1;
-			const size_t expected_value = actual_value;
-			actual_value = _InterlockedCompareExchange64_nf(
-				(volatile __int64*) value, (__int64) new_value, (__int64) expected_value);
-			if (actual_value == expected_value) {
-				return true;
-			}
-		}
-		return false;
-	}
-
-	static inline void pthreadpool_fence_acquire() {
-		__dmb(_ARM64_BARRIER_ISHLD);
-		_ReadBarrier();
-	}
-
-	static inline void pthreadpool_fence_release() {
-		_WriteBarrier();
-		__dmb(_ARM64_BARRIER_ISH);
-	}
 #elif defined(_MSC_VER) && defined(_M_ARM)
 	typedef volatile uint32_t pthreadpool_atomic_uint32_t;
 	typedef volatile size_t   pthreadpool_atomic_size_t;
@@ -851,15 +492,370 @@
 		_WriteBarrier();
 		__dmb(_ARM_BARRIER_ISH);
 	}
+#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
+	typedef volatile uint32_t pthreadpool_atomic_uint32_t;
+	typedef volatile size_t   pthreadpool_atomic_size_t;
+	typedef void *volatile    pthreadpool_atomic_void_p;
+
+	static inline uint32_t pthreadpool_load_relaxed_uint32_t(
+		pthreadpool_atomic_uint32_t* address)
+	{
+		return (uint32_t) __iso_volatile_load32((const volatile __int32*) address);
+	}
+
+	static inline size_t pthreadpool_load_relaxed_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) __iso_volatile_load64((const volatile __int64*) address);
+	}
+
+	static inline void* pthreadpool_load_relaxed_void_p(
+		pthreadpool_atomic_void_p* address)
+	{
+		return (void*) __iso_volatile_load64((const volatile __int64*) address);
+	}
+
+	static inline uint32_t pthreadpool_load_acquire_uint32_t(
+		pthreadpool_atomic_uint32_t* address)
+	{
+		return (uint32_t) __ldar32((volatile unsigned __int32*) address);
+	}
+
+	static inline size_t pthreadpool_load_acquire_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) __ldar64((volatile unsigned __int64*) address);
+	}
+
+	static inline void pthreadpool_store_relaxed_uint32_t(
+		pthreadpool_atomic_uint32_t* address,
+		uint32_t value)
+	{
+		__iso_volatile_store32((volatile __int32*) address, (__int32) value);
+	}
+
+	static inline void pthreadpool_store_relaxed_size_t(
+		pthreadpool_atomic_size_t* address,
+		size_t value)
+	{
+		__iso_volatile_store64((volatile __int64*) address, (__int64) value);
+	}
+
+	static inline void pthreadpool_store_relaxed_void_p(
+		pthreadpool_atomic_void_p* address,
+		void* value)
+	{
+		__iso_volatile_store64((volatile __int64*) address, (__int64) value);
+	}
+
+	static inline void pthreadpool_store_release_uint32_t(
+		pthreadpool_atomic_uint32_t* address,
+		uint32_t value)
+	{
+		_WriteBarrier();
+		__stlr32((unsigned __int32 volatile*) address, (unsigned __int32) value);
+	}
+
+	static inline void pthreadpool_store_release_size_t(
+		pthreadpool_atomic_size_t* address,
+		size_t value)
+	{
+		_WriteBarrier();
+		__stlr64((unsigned __int64 volatile*) address, (unsigned __int64) value);
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_relaxed_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement64_nf((volatile __int64*) address);
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_release_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement64_rel((volatile __int64*) address);
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
+	}
+
+	static inline bool pthreadpool_try_decrement_relaxed_size_t(
+		pthreadpool_atomic_size_t* value)
+	{
+		size_t actual_value = (size_t) __iso_volatile_load64((const volatile __int64*) value);
+		while (actual_value != 0) {
+			const size_t new_value = actual_value - 1;
+			const size_t expected_value = actual_value;
+			actual_value = _InterlockedCompareExchange64_nf(
+				(volatile __int64*) value, (__int64) new_value, (__int64) expected_value);
+			if (actual_value == expected_value) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	static inline void pthreadpool_fence_acquire() {
+		__dmb(_ARM64_BARRIER_ISHLD);
+		_ReadBarrier();
+	}
+
+	static inline void pthreadpool_fence_release() {
+		_WriteBarrier();
+		__dmb(_ARM64_BARRIER_ISH);
+	}
+#elif defined(_MSC_VER) && defined(_M_IX86)
+	typedef volatile uint32_t pthreadpool_atomic_uint32_t;
+	typedef volatile size_t   pthreadpool_atomic_size_t;
+	typedef void *volatile    pthreadpool_atomic_void_p;
+
+	static inline uint32_t pthreadpool_load_relaxed_uint32_t(
+		pthreadpool_atomic_uint32_t* address)
+	{
+		return *address;
+	}
+
+	static inline size_t pthreadpool_load_relaxed_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return *address;
+	}
+
+	static inline void* pthreadpool_load_relaxed_void_p(
+		pthreadpool_atomic_void_p* address)
+	{
+		return *address;
+	}
+
+	static inline uint32_t pthreadpool_load_acquire_uint32_t(
+		pthreadpool_atomic_uint32_t* address)
+	{
+		/* x86 loads always have acquire semantics; use only a compiler barrier */
+		const uint32_t value = *address;
+		_ReadBarrier();
+		return value;
+	}
+
+	static inline size_t pthreadpool_load_acquire_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		/* x86 loads always have acquire semantics; use only a compiler barrier */
+		const size_t value = *address;
+		_ReadBarrier();
+		return value;
+	}
+
+	static inline void pthreadpool_store_relaxed_uint32_t(
+		pthreadpool_atomic_uint32_t* address,
+		uint32_t value)
+	{
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_relaxed_size_t(
+		pthreadpool_atomic_size_t* address,
+		size_t value)
+	{
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_relaxed_void_p(
+		pthreadpool_atomic_void_p* address,
+		void* value)
+	{
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_release_uint32_t(
+		pthreadpool_atomic_uint32_t* address,
+		uint32_t value)
+	{
+		/* x86 stores always have release semantics; use only a compiler barrier */
+		_WriteBarrier();
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_release_size_t(
+		pthreadpool_atomic_size_t* address,
+		size_t value)
+	{
+		/* x86 stores always have release semantics; use only a compiler barrier */
+		_WriteBarrier();
+		*address = value;
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_relaxed_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement((volatile long*) address);
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_release_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement((volatile long*) address);
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement((volatile long*) address);
+	}
+
+	static inline bool pthreadpool_try_decrement_relaxed_size_t(
+		pthreadpool_atomic_size_t* value)
+	{
+		size_t actual_value = *value;
+		while (actual_value != 0) {
+			const size_t new_value = actual_value - 1;
+			const size_t expected_value = actual_value;
+			actual_value = _InterlockedCompareExchange(
+				(volatile long*) value, (long) new_value, (long) expected_value);
+			if (actual_value == expected_value) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	static inline void pthreadpool_fence_acquire() {
+		_mm_lfence();
+	}
+
+	static inline void pthreadpool_fence_release() {
+		_mm_sfence();
+	}
+#elif defined(_MSC_VER) && defined(_M_X64)
+	typedef volatile uint32_t pthreadpool_atomic_uint32_t;
+	typedef volatile size_t   pthreadpool_atomic_size_t;
+	typedef void *volatile    pthreadpool_atomic_void_p;
+
+	static inline uint32_t pthreadpool_load_relaxed_uint32_t(
+		pthreadpool_atomic_uint32_t* address)
+	{
+		return *address;
+	}
+
+	static inline size_t pthreadpool_load_relaxed_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return *address;
+	}
+
+	static inline void* pthreadpool_load_relaxed_void_p(
+		pthreadpool_atomic_void_p* address)
+	{
+		return *address;
+	}
+
+	static inline uint32_t pthreadpool_load_acquire_uint32_t(
+		pthreadpool_atomic_uint32_t* address)
+	{
+		/* x86-64 loads always have acquire semantics; use only a compiler barrier */
+		const uint32_t value = *address;
+		_ReadBarrier();
+		return value;
+	}
+
+	static inline size_t pthreadpool_load_acquire_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		/* x86-64 loads always have acquire semantics; use only a compiler barrier */
+		const size_t value = *address;
+		_ReadBarrier();
+		return value;
+	}
+
+	static inline void pthreadpool_store_relaxed_uint32_t(
+		pthreadpool_atomic_uint32_t* address,
+		uint32_t value)
+	{
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_relaxed_size_t(
+		pthreadpool_atomic_size_t* address,
+		size_t value)
+	{
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_relaxed_void_p(
+		pthreadpool_atomic_void_p* address,
+		void* value)
+	{
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_release_uint32_t(
+		pthreadpool_atomic_uint32_t* address,
+		uint32_t value)
+	{
+		/* x86-64 stores always have release semantics; use only a compiler barrier */
+		_WriteBarrier();
+		*address = value;
+	}
+
+	static inline void pthreadpool_store_release_size_t(
+		pthreadpool_atomic_size_t* address,
+		size_t value)
+	{
+		/* x86-64 stores always have release semantics; use only a compiler barrier */
+		_WriteBarrier();
+		*address = value;
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_relaxed_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_release_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
+	}
+
+	static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t(
+		pthreadpool_atomic_size_t* address)
+	{
+		return (size_t) _InterlockedDecrement64((volatile __int64*) address);
+	}
+
+	static inline bool pthreadpool_try_decrement_relaxed_size_t(
+		pthreadpool_atomic_size_t* value)
+	{
+		size_t actual_value = *value;
+		while (actual_value != 0) {
+			const size_t new_value = actual_value - 1;
+			const size_t expected_value = actual_value;
+			actual_value = _InterlockedCompareExchange64(
+				(volatile __int64*) value, (__int64) new_value, (__int64) expected_value);
+			if (actual_value == expected_value) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	static inline void pthreadpool_fence_acquire() {
+		_mm_lfence();
+		_ReadBarrier();
+	}
+
+	static inline void pthreadpool_fence_release() {
+		_WriteBarrier();
+		_mm_sfence();
+	}
 #else
 	#error "Platform-specific implementation of threadpool-atomics.h required"
 #endif
 
-#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-	static inline void pthreadpool_yield() {
-		_mm_pause();
-	}
-#elif defined(__ARM_ACLE) || defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#if defined(__ARM_ACLE) || defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC))
 	static inline void pthreadpool_yield() {
 		__yield();
 	}
@@ -867,6 +863,10 @@
 	static inline void pthreadpool_yield() {
 		__asm__ __volatile__("yield");
 	}
+#elif defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+	static inline void pthreadpool_yield() {
+		_mm_pause();
+	}
 #else
 	static inline void pthreadpool_yield() {
 		pthreadpool_fence_acquire();
diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h
index 91e2445..0b81360 100644
--- a/src/threadpool-utils.h
+++ b/src/threadpool-utils.h
@@ -4,7 +4,7 @@
 #include <stddef.h>
 
 /* SSE-specific headers */
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#if defined(__SSE__) || defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
 	#include <xmmintrin.h>
 #endif
 
@@ -15,12 +15,12 @@
 
 
 struct fpu_state {
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
-	uint32_t mxcsr;
-#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) || defined(_MSC_VER) && defined(_M_ARM)
+#if defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) || defined(_MSC_VER) && defined(_M_ARM)
 	uint32_t fpscr;
-#elif defined(__GNUC__) && defined(__aarch64__) || defined(_MSC_VER) && defined(_M_ARM64)
+#elif defined(__GNUC__) && defined(__aarch64__) || defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
 	uint64_t fpcr;
+#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+	uint32_t mxcsr;
 #else
 	char unused;
 #endif
@@ -28,12 +28,12 @@
 
 static inline struct fpu_state get_fpu_state() {
 	struct fpu_state state = { 0 };
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
-	state.mxcsr = (uint32_t) _mm_getcsr();
-#elif defined(_MSC_VER) && defined(_M_ARM)
+#if defined(_MSC_VER) && defined(_M_ARM)
 	state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0);
-#elif defined(_MSC_VER) && defined(_M_ARM64)
+#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
 	state.fpcr = (uint64_t) _ReadStatusReg(0x5A20);
+#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+	state.mxcsr = (uint32_t) _mm_getcsr();
 #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
 	__asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr));
 #elif defined(__GNUC__) && defined(__aarch64__)
@@ -43,27 +43,25 @@
 }
 
 static inline void set_fpu_state(const struct fpu_state state) {
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
-	_mm_setcsr((unsigned int) state.mxcsr);
-#elif defined(_MSC_VER) && defined(_M_ARM)
+#if defined(_MSC_VER) && defined(_M_ARM)
 	_MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0);
-#elif defined(_MSC_VER) && defined(_M_ARM64)
+#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
 	_WriteStatusReg(0x5A20, (__int64) state.fpcr);
 #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
 	__asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr));
 #elif defined(__GNUC__) && defined(__aarch64__)
 	__asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr));
+#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+	_mm_setcsr((unsigned int) state.mxcsr);
 #endif
 }
 
 static inline void disable_fpu_denormals() {
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
-	_mm_setcsr(_mm_getcsr() | 0x8040);
-#elif defined(_MSC_VER) && defined(_M_ARM)
+#if defined(_MSC_VER) && defined(_M_ARM)
 	int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
 	fpscr |= 0x1000000;
 	_MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
-#elif defined(_MSC_VER) && defined(_M_ARM64)
+#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
 	__int64 fpcr = _ReadStatusReg(0x5A20);
 	fpcr |= 0x1080000;
 	_WriteStatusReg(0x5A20, fpcr);
@@ -92,6 +90,8 @@
 			"ORR %w[fpcr], %w[fpcr], 0x80000\n"
 			"MSR fpcr, %[fpcr]\n"
 		: [fpcr] "=r" (fpcr));
+#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+	_mm_setcsr(_mm_getcsr() | 0x8040);
 #endif
 }