/*
 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "common_audio/signal_processing/include/signal_processing_library.h"

#include <arm_neon.h>

// NEON intrinsics version of WebRtcSpl_DownsampleFast()
// for ARM 32-bit/64-bit platforms.
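// Each output sample is a Q12 FIR filtering of the input, decimated by
// `factor`:
//   data_out[n] = sat16((2048 + sum_j coefficients[j] *
//                        data_in[delay + n * factor - j]) >> 12)
// where 2048 is the Q12 rounding constant (0.5) and sat16 saturates to
// int16_t range.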
int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
                                 size_t data_in_length,
                                 int16_t* data_out,
                                 size_t data_out_length,
                                 const int16_t* __restrict coefficients,
                                 size_t coefficients_length,
                                 int factor,
                                 size_t delay) {
  size_t i = 0;
  size_t j = 0;
  int32_t out_s32 = 0;
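  // endpos is one past the last input index read for the first filter tap
  // (j == 0); res is the number of output samples left over after the
  // 8-samples-per-iteration vector loops below, and endpos1 is where those
  // loops stop.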
  size_t endpos = delay + factor * (data_out_length - 1) + 1;
  size_t res = data_out_length & 0x7;
  size_t endpos1 = endpos - factor * res;

  // Return an error if any of the preconditions is not met.
  if (data_out_length == 0 || coefficients_length == 0
      || data_in_length < endpos) {
    return -1;
  }

  // First part: unroll the output loop 8 times, with 3 subcases
  // (factor == 2, 4, others).
  switch (factor) {
    case 2: {
      for (i = delay; i < endpos1; i += 16) {
        // Round value, 0.5 in Q12.
        int32x4_t out32x4_0 = vdupq_n_s32(2048);
        int32x4_t out32x4_1 = vdupq_n_s32(2048);

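        // With factor == 2, vld2q_s16() de-interleaves 16 consecutive input
        // samples into even/odd vectors, so each pass of the j-loops below
        // accumulates a filter tap into 8 output samples at once.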
#if defined(WEBRTC_ARCH_ARM64)
        // Unroll the loop 2 times.
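        // Each pass consumes two adjacent coefficients: they are loaded as a
        // single 32-bit value and reinterpreted as an int16x4 so that both
        // taps can be applied with vmlal_lane_s16().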
        for (j = 0; j < coefficients_length - 1; j += 2) {
          int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
          int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);

          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
          int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);

          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 incurs extra cost on ARM64. It could be
          // replaced by vmlal_high_lane_s16, but gcc 4.9 has a bug in the
          // vmlal_high_lane_s16 interface. This issue needs to be tracked
          // in the future.
          int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
          int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
        }

        for (; j < coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);

          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);

          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 incurs extra cost on ARM64. It could be
          // replaced by vmlal_high_lane_s16, but gcc 4.9 has a bug in the
          // vmlal_high_lane_s16 interface. This issue needs to be tracked
          // in the future.
          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }
#else
        // On ARMv7, unrolling the loop 2 times results in a performance
        // regression.
        for (j = 0; j < coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);

          // Mul and accumulate.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }
#endif

        // Saturate and store the output.
        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
        data_out += 8;
      }
      break;
    }
    case 4: {
      for (i = delay; i < endpos1; i += 32) {
        // Round value, 0.5 in Q12.
        int32x4_t out32x4_0 = vdupq_n_s32(2048);
        int32x4_t out32x4_1 = vdupq_n_s32(2048);

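        // With factor == 4, vld4q_s16() de-interleaves 32 consecutive input
        // samples into four stride-4 vectors, so each pass of the j-loops
        // below accumulates the loaded taps into 8 output samples at once.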
        // Unroll the loop 4 times.
        for (j = 0; j < coefficients_length - 3; j += 4) {
          int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);

          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
          int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
          int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
          int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);

          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 incurs extra cost on ARM64. It could be
          // replaced by vmlal_high_lane_s16, but gcc 4.9 has a bug in the
          // vmlal_high_lane_s16 interface. This issue needs to be tracked
          // in the future.
          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
          int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
          int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
          int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
        }

        for (; j < coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);

          // Mul and accumulate low 64-bit data.
          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);

          // Mul and accumulate high 64-bit data.
          // TODO: vget_high_s16 incurs extra cost on ARM64. It could be
          // replaced by vmlal_high_lane_s16, but gcc 4.9 has a bug in the
          // vmlal_high_lane_s16 interface. This issue needs to be tracked
          // in the future.
          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }

        // Saturate and store the output.
        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
        data_out += 8;
      }
      break;
    }
    default: {
      for (i = delay; i < endpos1; i += factor * 8) {
        // Round value, 0.5 in Q12.
        int32x4_t out32x4_0 = vdupq_n_s32(2048);
        int32x4_t out32x4_1 = vdupq_n_s32(2048);

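        // For a generic factor there is no de-interleaving vector load, so
        // for each coefficient the 8 input samples (spaced `factor` apart)
        // are gathered with per-lane loads.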
        for (j = 0; j < coefficients_length; j++) {
          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
          int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
          in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
          int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);

          // Mul and accumulate.
          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
        }

        // Saturate and store the output.
        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
        data_out += 8;
      }
      break;
    }
  }

  // Second part: do the remaining iterations (if any).
  for (; i < endpos; i += factor) {
    out_s32 = 2048;  // Round value, 0.5 in Q12.

    for (j = 0; j < coefficients_length; j++) {
      out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
    }

    // Saturate and store the output.
    out_s32 >>= 12;
    *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
  }

  return 0;
}