/*
 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
| 10 | |
// Modified from the Chromium original:
// src/media/base/simd/sinc_resampler_sse.cc
| 13 | |
| 14 | #include "webrtc/common_audio/resampler/sinc_resampler.h" |
| 15 | |
| 16 | #include <xmmintrin.h> |
| 17 | |
| 18 | namespace webrtc { |
| 19 | |
| 20 | float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, |
| 21 | const float* k2, |
| 22 | double kernel_interpolation_factor) { |
| 23 | __m128 m_input; |
| 24 | __m128 m_sums1 = _mm_setzero_ps(); |
| 25 | __m128 m_sums2 = _mm_setzero_ps(); |
| 26 | |
| 27 | // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling |
| 28 | // these loops hurt performance in local testing. |
| 29 | if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { |
Peter Kasting | dce40cf | 2015-08-24 14:52:23 -0700 | [diff] [blame] | 30 | for (size_t i = 0; i < kKernelSize; i += 4) { |
andrew@webrtc.org | c6a3755 | 2013-05-08 20:35:43 +0000 | [diff] [blame] | 31 | m_input = _mm_loadu_ps(input_ptr + i); |
| 32 | m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 33 | m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 34 | } |
| 35 | } else { |
Peter Kasting | dce40cf | 2015-08-24 14:52:23 -0700 | [diff] [blame] | 36 | for (size_t i = 0; i < kKernelSize; i += 4) { |
andrew@webrtc.org | c6a3755 | 2013-05-08 20:35:43 +0000 | [diff] [blame] | 37 | m_input = _mm_load_ps(input_ptr + i); |
| 38 | m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 39 | m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 40 | } |
| 41 | } |
| 42 | |
| 43 | // Linearly interpolate the two "convolutions". |
bjornv@webrtc.org | 3cbd6c2 | 2014-09-04 13:21:44 +0000 | [diff] [blame] | 44 | m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( |
| 45 | static_cast<float>(1.0 - kernel_interpolation_factor))); |
| 46 | m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( |
| 47 | static_cast<float>(kernel_interpolation_factor))); |
andrew@webrtc.org | c6a3755 | 2013-05-08 20:35:43 +0000 | [diff] [blame] | 48 | m_sums1 = _mm_add_ps(m_sums1, m_sums2); |
| 49 | |
| 50 | // Sum components together. |
| 51 | float result; |
| 52 | m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 53 | _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
| 54 | m_sums2, m_sums2, 1))); |
| 55 | |
| 56 | return result; |
| 57 | } |
| 58 | |
| 59 | } // namespace webrtc |