Blame - modules/audio_processing/aec3/vector_math.h - webrtc.googlesource.com/src

blob: 883cd95fdd8308cdf3d7a7c11f517e66bb257fab [file] [log] [blame]

peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
Mirko Bonadei	92ea95e	2017-09-15 06:47:31 +0200	[diff] [blame]	11	#ifndef MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_
				12	#define MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	13
Niels Möller	a12c42a	2018-07-25 16:05:48 +0200	[diff] [blame]	14	// Defines WEBRTC_ARCH_X86_FAMILY, used below.
				15	#include "rtc_base/system/arch.h"
				16
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	17	#if defined(WEBRTC_HAS_NEON)
				18	#include <arm_neon.h>
				19	#endif
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	20	#if defined(WEBRTC_ARCH_X86_FAMILY)
				21	#include <emmintrin.h>
				22	#endif
				23	#include <math.h>
Jonas Olsson	a4d8737	2019-07-05 19:08:33 +0200	[diff] [blame]	24
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	25	#include <algorithm>
				26	#include <array>
				27	#include <functional>
				28
Mirko Bonadei	92ea95e	2017-09-15 06:47:31 +0200	[diff] [blame]	29	#include "api/array_view.h"
				30	#include "modules/audio_processing/aec3/aec3_common.h"
				31	#include "rtc_base/checks.h"
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	32
				33	namespace webrtc {
				34	namespace aec3 {
				35
				36	// Provides optimizations for mathematical operations based on vectors.
				37	class VectorMath {
				38	public:
				39	explicit VectorMath(Aec3Optimization optimization)
				40	: optimization_(optimization) {}
				41
				42	// Elementwise square root.
				43	void Sqrt(rtc::ArrayView<float> x) {
				44	switch (optimization_) {
				45	#if defined(WEBRTC_ARCH_X86_FAMILY)
				46	case Aec3Optimization::kSse2: {
				47	const int x_size = static_cast<int>(x.size());
				48	const int vector_limit = x_size >> 2;
				49
				50	int j = 0;
				51	for (; j < vector_limit * 4; j += 4) {
				52	__m128 g = _mm_loadu_ps(&x[j]);
				53	g = _mm_sqrt_ps(g);
				54	_mm_storeu_ps(&x[j], g);
				55	}
				56
				57	for (; j < x_size; ++j) {
				58	x[j] = sqrtf(x[j]);
				59	}
				60	} break;
				61	#endif
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	62	#if defined(WEBRTC_HAS_NEON)
				63	case Aec3Optimization::kNeon: {
				64	const int x_size = static_cast<int>(x.size());
				65	const int vector_limit = x_size >> 2;
				66
				67	int j = 0;
				68	for (; j < vector_limit * 4; j += 4) {
				69	float32x4_t g = vld1q_f32(&x[j]);
				70	#if !defined(WEBRTC_ARCH_ARM64)
				71	float32x4_t y = vrsqrteq_f32(g);
				72
				73	// Code to handle sqrt(0).
				74	// If the input to sqrtf() is zero, a zero will be returned.
				75	// If the input to vrsqrteq_f32() is zero, positive infinity is
				76	// returned.
				77	const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
				78	// check for divide by zero
				79	const uint32x4_t div_by_zero =
				80	vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(y));
				81	// zero out the positive infinity results
				82	y = vreinterpretq_f32_u32(
				83	vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(y)));
				84	// from arm documentation
				85	// The Newton-Raphson iteration:
				86	// y[n+1] = y[n] * (3 - d * (y[n] * y[n])) / 2)
				87	// converges to (1/√d) if y0 is the result of VRSQRTE applied to d.
				88	//
				89	// Note: The precision did not improve after 2 iterations.
				90	for (int i = 0; i < 2; i++) {
				91	y = vmulq_f32(vrsqrtsq_f32(vmulq_f32(y, y), g), y);
				92	}
				93	// sqrt(g) = g * 1/sqrt(g)
				94	g = vmulq_f32(g, y);
				95	#else
				96	g = vsqrtq_f32(g);
				97	#endif
				98	vst1q_f32(&x[j], g);
				99	}
				100
				101	for (; j < x_size; ++j) {
				102	x[j] = sqrtf(x[j]);
				103	}
				104	}
				105	#endif
				106	break;
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	107	default:
				108	std::for_each(x.begin(), x.end(), [](float& a) { a = sqrtf(a); });
				109	}
				110	}
				111
				112	// Elementwise vector multiplication z = x * y.
				113	void Multiply(rtc::ArrayView<const float> x,
				114	rtc::ArrayView<const float> y,
				115	rtc::ArrayView<float> z) {
				116	RTC_DCHECK_EQ(z.size(), x.size());
				117	RTC_DCHECK_EQ(z.size(), y.size());
				118	switch (optimization_) {
				119	#if defined(WEBRTC_ARCH_X86_FAMILY)
				120	case Aec3Optimization::kSse2: {
				121	const int x_size = static_cast<int>(x.size());
				122	const int vector_limit = x_size >> 2;
				123
				124	int j = 0;
				125	for (; j < vector_limit * 4; j += 4) {
				126	const __m128 x_j = _mm_loadu_ps(&x[j]);
				127	const __m128 y_j = _mm_loadu_ps(&y[j]);
				128	const __m128 z_j = _mm_mul_ps(x_j, y_j);
				129	_mm_storeu_ps(&z[j], z_j);
				130	}
				131
				132	for (; j < x_size; ++j) {
				133	z[j] = x[j] * y[j];
				134	}
				135	} break;
				136	#endif
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	137	#if defined(WEBRTC_HAS_NEON)
				138	case Aec3Optimization::kNeon: {
				139	const int x_size = static_cast<int>(x.size());
				140	const int vector_limit = x_size >> 2;
				141
				142	int j = 0;
				143	for (; j < vector_limit * 4; j += 4) {
				144	const float32x4_t x_j = vld1q_f32(&x[j]);
				145	const float32x4_t y_j = vld1q_f32(&y[j]);
				146	const float32x4_t z_j = vmulq_f32(x_j, y_j);
				147	vst1q_f32(&z[j], z_j);
				148	}
				149
				150	for (; j < x_size; ++j) {
				151	z[j] = x[j] * y[j];
				152	}
				153	} break;
				154	#endif
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	155	default:
				156	std::transform(x.begin(), x.end(), y.begin(), z.begin(),
				157	std::multiplies<float>());
				158	}
				159	}
				160
				161	// Elementwise vector accumulation z += x.
				162	void Accumulate(rtc::ArrayView<const float> x, rtc::ArrayView<float> z) {
				163	RTC_DCHECK_EQ(z.size(), x.size());
				164	switch (optimization_) {
				165	#if defined(WEBRTC_ARCH_X86_FAMILY)
				166	case Aec3Optimization::kSse2: {
				167	const int x_size = static_cast<int>(x.size());
				168	const int vector_limit = x_size >> 2;
				169
				170	int j = 0;
				171	for (; j < vector_limit * 4; j += 4) {
				172	const __m128 x_j = _mm_loadu_ps(&x[j]);
				173	__m128 z_j = _mm_loadu_ps(&z[j]);
				174	z_j = _mm_add_ps(x_j, z_j);
				175	_mm_storeu_ps(&z[j], z_j);
				176	}
				177
				178	for (; j < x_size; ++j) {
				179	z[j] += x[j];
				180	}
				181	} break;
				182	#endif
peah	5d153c7	2017-05-03 06:45:44 -0700	[diff] [blame]	183	#if defined(WEBRTC_HAS_NEON)
				184	case Aec3Optimization::kNeon: {
				185	const int x_size = static_cast<int>(x.size());
				186	const int vector_limit = x_size >> 2;
				187
				188	int j = 0;
				189	for (; j < vector_limit * 4; j += 4) {
				190	const float32x4_t x_j = vld1q_f32(&x[j]);
				191	float32x4_t z_j = vld1q_f32(&z[j]);
				192	z_j = vaddq_f32(z_j, x_j);
				193	vst1q_f32(&z[j], z_j);
				194	}
				195
				196	for (; j < x_size; ++j) {
				197	z[j] += x[j];
				198	}
				199	} break;
				200	#endif
peah	5e79b29	2017-04-12 01:20:45 -0700	[diff] [blame]	201	default:
				202	std::transform(x.begin(), x.end(), z.begin(), z.begin(),
				203	std::plus<float>());
				204	}
				205	}
				206
				207	private:
				208	Aec3Optimization optimization_;
				209	};
				210
				211	} // namespace aec3
				212
				213	} // namespace webrtc
				214
Mirko Bonadei	92ea95e	2017-09-15 06:47:31 +0200	[diff] [blame]	215	#endif // MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_