Blame - modules/audio_processing/aec3/vector_math.h - webrtc.googlesource.com/src

blob: e4d1381ae1769031451b3c334a12b68629518cb1 [file] [log] [blame]

peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
Mirko Bonadei	92ea95e	2017-09-15 06:47:31 +0200	[diff] [blame]	11	#ifndef MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_
				12	#define MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	13
Niels Möller	a12c42a	2018-07-25 16:05:48 +0200	[diff] [blame]	14	// Defines WEBRTC_ARCH_X86_FAMILY, used below.
				15	#include "rtc_base/system/arch.h"
				16
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	17	#if defined(WEBRTC_HAS_NEON)
				18	#include <arm_neon.h>
				19	#endif
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	20	#if defined(WEBRTC_ARCH_X86_FAMILY)
				21	#include <emmintrin.h>
				22	#endif
				23	#include <math.h>
Jonas Olsson	a4d8737	2019-07-05 19:08:33 +0200	[diff] [blame]	24
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	25	#include <algorithm>
				26	#include <array>
				27	#include <functional>
				28
Mirko Bonadei	92ea95e	2017-09-15 06:47:31 +0200	[diff] [blame]	29	#include "api/array_view.h"
				30	#include "modules/audio_processing/aec3/aec3_common.h"
				31	#include "rtc_base/checks.h"
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	32
				33	namespace webrtc {
				34	namespace aec3 {
				35
				36	// Provides optimizations for mathematical operations based on vectors.
				37	class VectorMath {
				38	public:
				39	explicit VectorMath(Aec3Optimization optimization)
				40	: optimization_(optimization) {}
				41
				42	// Elementwise square root.
Zhaoliang Ma	e537e9c	2020-08-31 10:20:47 +0800	[diff] [blame]	43	void SqrtAVX2(rtc::ArrayView<float> x);
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	44	void Sqrt(rtc::ArrayView<float> x) {
				45	switch (optimization_) {
				46	#if defined(WEBRTC_ARCH_X86_FAMILY)
				47	case Aec3Optimization::kSse2: {
				48	const int x_size = static_cast<int>(x.size());
				49	const int vector_limit = x_size >> 2;
				50
				51	int j = 0;
				52	for (; j < vector_limit * 4; j += 4) {
				53	__m128 g = _mm_loadu_ps(&x[j]);
				54	g = _mm_sqrt_ps(g);
				55	_mm_storeu_ps(&x[j], g);
				56	}
				57
				58	for (; j < x_size; ++j) {
				59	x[j] = sqrtf(x[j]);
				60	}
				61	} break;
Zhaoliang Ma	e537e9c	2020-08-31 10:20:47 +0800	[diff] [blame]	62	case Aec3Optimization::kAvx2:
				63	SqrtAVX2(x);
				64	break;
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	65	#endif
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	66	#if defined(WEBRTC_HAS_NEON)
				67	case Aec3Optimization::kNeon: {
				68	const int x_size = static_cast<int>(x.size());
				69	const int vector_limit = x_size >> 2;
				70
				71	int j = 0;
				72	for (; j < vector_limit * 4; j += 4) {
				73	float32x4_t g = vld1q_f32(&x[j]);
				74	#if !defined(WEBRTC_ARCH_ARM64)
				75	float32x4_t y = vrsqrteq_f32(g);
				76
				77	// Code to handle sqrt(0).
				78	// If the input to sqrtf() is zero, a zero will be returned.
				79	// If the input to vrsqrteq_f32() is zero, positive infinity is
				80	// returned.
				81	const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
				82	// check for divide by zero
				83	const uint32x4_t div_by_zero =
				84	vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(y));
				85	// zero out the positive infinity results
				86	y = vreinterpretq_f32_u32(
				87	vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(y)));
				88	// from arm documentation
				89	// The Newton-Raphson iteration:
				90	// y[n+1] = y[n] * (3 - d * (y[n] * y[n])) / 2)
				91	// converges to (1/√d) if y0 is the result of VRSQRTE applied to d.
				92	//
				93	// Note: The precision did not improve after 2 iterations.
				94	for (int i = 0; i < 2; i++) {
				95	y = vmulq_f32(vrsqrtsq_f32(vmulq_f32(y, y), g), y);
				96	}
				97	// sqrt(g) = g * 1/sqrt(g)
				98	g = vmulq_f32(g, y);
				99	#else
				100	g = vsqrtq_f32(g);
				101	#endif
				102	vst1q_f32(&x[j], g);
				103	}
				104
				105	for (; j < x_size; ++j) {
				106	x[j] = sqrtf(x[j]);
				107	}
				108	}
				109	#endif
				110	break;
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	111	default:
				112	std::for_each(x.begin(), x.end(), [](float& a) { a = sqrtf(a); });
				113	}
				114	}
				115
				116	// Elementwise vector multiplication z = x * y.
Zhaoliang Ma	e537e9c	2020-08-31 10:20:47 +0800	[diff] [blame]	117	void MultiplyAVX2(rtc::ArrayView<const float> x,
				118	rtc::ArrayView<const float> y,
				119	rtc::ArrayView<float> z);
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	120	void Multiply(rtc::ArrayView<const float> x,
				121	rtc::ArrayView<const float> y,
				122	rtc::ArrayView<float> z) {
				123	RTC_DCHECK_EQ(z.size(), x.size());
				124	RTC_DCHECK_EQ(z.size(), y.size());
				125	switch (optimization_) {
				126	#if defined(WEBRTC_ARCH_X86_FAMILY)
				127	case Aec3Optimization::kSse2: {
				128	const int x_size = static_cast<int>(x.size());
				129	const int vector_limit = x_size >> 2;
				130
				131	int j = 0;
				132	for (; j < vector_limit * 4; j += 4) {
				133	const __m128 x_j = _mm_loadu_ps(&x[j]);
				134	const __m128 y_j = _mm_loadu_ps(&y[j]);
				135	const __m128 z_j = _mm_mul_ps(x_j, y_j);
				136	_mm_storeu_ps(&z[j], z_j);
				137	}
				138
				139	for (; j < x_size; ++j) {
				140	z[j] = x[j] * y[j];
				141	}
				142	} break;
Zhaoliang Ma	e537e9c	2020-08-31 10:20:47 +0800	[diff] [blame]	143	case Aec3Optimization::kAvx2:
				144	MultiplyAVX2(x, y, z);
				145	break;
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	146	#endif
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	147	#if defined(WEBRTC_HAS_NEON)
				148	case Aec3Optimization::kNeon: {
				149	const int x_size = static_cast<int>(x.size());
				150	const int vector_limit = x_size >> 2;
				151
				152	int j = 0;
				153	for (; j < vector_limit * 4; j += 4) {
				154	const float32x4_t x_j = vld1q_f32(&x[j]);
				155	const float32x4_t y_j = vld1q_f32(&y[j]);
				156	const float32x4_t z_j = vmulq_f32(x_j, y_j);
				157	vst1q_f32(&z[j], z_j);
				158	}
				159
				160	for (; j < x_size; ++j) {
				161	z[j] = x[j] * y[j];
				162	}
				163	} break;
				164	#endif
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	165	default:
				166	std::transform(x.begin(), x.end(), y.begin(), z.begin(),
				167	std::multiplies<float>());
				168	}
				169	}
				170
				171	// Elementwise vector accumulation z += x.
Zhaoliang Ma	e537e9c	2020-08-31 10:20:47 +0800	[diff] [blame]	172	void AccumulateAVX2(rtc::ArrayView<const float> x, rtc::ArrayView<float> z);
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	173	void Accumulate(rtc::ArrayView<const float> x, rtc::ArrayView<float> z) {
				174	RTC_DCHECK_EQ(z.size(), x.size());
				175	switch (optimization_) {
				176	#if defined(WEBRTC_ARCH_X86_FAMILY)
				177	case Aec3Optimization::kSse2: {
				178	const int x_size = static_cast<int>(x.size());
				179	const int vector_limit = x_size >> 2;
				180
				181	int j = 0;
				182	for (; j < vector_limit * 4; j += 4) {
				183	const __m128 x_j = _mm_loadu_ps(&x[j]);
				184	__m128 z_j = _mm_loadu_ps(&z[j]);
				185	z_j = _mm_add_ps(x_j, z_j);
				186	_mm_storeu_ps(&z[j], z_j);
				187	}
				188
				189	for (; j < x_size; ++j) {
				190	z[j] += x[j];
				191	}
				192	} break;
Zhaoliang Ma	e537e9c	2020-08-31 10:20:47 +0800	[diff] [blame]	193	case Aec3Optimization::kAvx2:
				194	AccumulateAVX2(x, z);
				195	break;
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	196	#endif
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	197	#if defined(WEBRTC_HAS_NEON)
				198	case Aec3Optimization::kNeon: {
				199	const int x_size = static_cast<int>(x.size());
				200	const int vector_limit = x_size >> 2;
				201
				202	int j = 0;
				203	for (; j < vector_limit * 4; j += 4) {
				204	const float32x4_t x_j = vld1q_f32(&x[j]);
				205	float32x4_t z_j = vld1q_f32(&z[j]);
				206	z_j = vaddq_f32(z_j, x_j);
				207	vst1q_f32(&z[j], z_j);
				208	}
				209
				210	for (; j < x_size; ++j) {
				211	z[j] += x[j];
				212	}
				213	} break;
				214	#endif
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	215	default:
				216	std::transform(x.begin(), x.end(), z.begin(), z.begin(),
				217	std::plus<float>());
				218	}
				219	}
				220
				221	private:
				222	Aec3Optimization optimization_;
				223	};
				224
				225	} // namespace aec3
				226
				227	} // namespace webrtc
				228
Mirko Bonadei	92ea95e	2017-09-15 06:47:31 +0200	[diff] [blame]	229	#endif // MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_