Replace asm NEON functions with intrinsics implementations on ARMv7
Successfully built isac_neon and modules_unittests on Android ARMv7.
Passed modules_unittests with the following filters:
--gtest_filter=FiltersTest*
--gtest_filter=LpcMaskingModelTest*
--gtest_filter=TransformTest*
--gtest_filter=FilterBanksTest*
WebRtcIsacfix_CalculateResidualEnergyNeon is removed; see Issue 4224 for
details.
The old review URL is: https://webrtc-codereview.appspot.com/37259004/
BUG=4002
R=andrew@webrtc.org, jridges@masque.com, kjellander@webrtc.org
Review URL: https://webrtc-codereview.appspot.com/48319005
Patch from Zhongwei Yao <zhongwei.yao@arm.com>.
Change-Id: I4c16e15930f1b3449d67b67bf023fac28121dff8
Cr-Commit-Position: refs/heads/master@{#9140}
diff --git a/webrtc/modules/audio_coding/BUILD.gn b/webrtc/modules/audio_coding/BUILD.gn
index 9a2bf79..df565d0 100644
--- a/webrtc/modules/audio_coding/BUILD.gn
+++ b/webrtc/modules/audio_coding/BUILD.gn
@@ -591,17 +591,14 @@
if (rtc_build_armv7_neon || current_cpu == "arm64") {
source_set("isac_neon") {
- sources = [ "codecs/isac/fix/source/entropy_coding_neon.c" ]
+ sources = [
+ "codecs/isac/fix/source/entropy_coding_neon.c",
+ "codecs/isac/fix/source/filters_neon.c",
+ "codecs/isac/fix/source/lattice_neon.c",
+ "codecs/isac/fix/source/transform_neon.c",
+ ]
if (rtc_build_armv7_neon) {
- sources += [
- "codecs/isac/fix/source/filterbanks_neon.S",
- "codecs/isac/fix/source/filters_neon.S",
- "codecs/isac/fix/source/lattice_neon.S",
- "codecs/isac/fix/source/lpc_masking_model_neon.S",
- "codecs/isac/fix/source/transform_neon.S",
- ]
-
# Enable compilation for the ARM v7 Neon instruction set. This is needed
# since //build/config/arm.gni only enables Neon for iOS, not Android.
# This provides the same functionality as webrtc/build/arm_neon.gypi.
@@ -614,18 +611,11 @@
]
}
- if (current_cpu == "arm64") {
- sources += [
- "codecs/isac/fix/source/filters_neon.c",
- "codecs/isac/fix/source/lattice_neon.c",
- "codecs/isac/fix/source/transform_neon.c",
- ]
+ if (current_cpu != "arm64" || !is_clang) {
# Disable AllpassFilter2FixDec16Neon function due to a clang bug.
# Refer more details at:
# https://code.google.com/p/webrtc/issues/detail?id=4567
- if (!is_clang) {
sources += [ "codecs/isac/fix/source/filterbanks_neon.c", ]
- }
}
# Disable LTO in audio_processing_neon target due to compiler bug.
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
deleted file mode 100644
index 0a43551..0000000
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
+++ /dev/null
@@ -1,270 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ Contains a function for WebRtcIsacfix_AllpassFilter2FixDec16Neon()
-@ in iSAC codec, optimized for ARM Neon platform. Bit exact with function
-@ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype
-@ C code is at end of this file.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
-.align 2
-
-@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
-@ int16_t *data_ch1, // Input and output in channel 1, in Q0
-@ int16_t *data_ch2, // Input and output in channel 2, in Q0
-@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
-@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
-@ const int length, // Length of the data buffers
-@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
-@ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
-
-DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
- push {r4 - r7}
-
- ldr r5, [sp, #24] @ filter_state_ch2
- ldr r6, [sp, #20] @ filter_state_ch1
-
- @ Initialize the Neon registers.
- vld1.16 d0[0], [r0]! @ data_ch1[0]
- vld1.16 d0[2], [r1]! @ data_ch2[0]
- vld1.32 d30[0], [r2] @ factor_ch1[0], factor_ch1[1]
- vld1.32 d30[1], [r3] @ factor_ch2[0], factor_ch2[1]
- vld1.32 d16[0], [r6]! @ filter_state_ch1[0]
- vld1.32 d17[0], [r5]! @ filter_state_ch2[0]
- vneg.s16 d31, d30
-
- ldr r3, [sp, #16] @ length
- mov r4, #4 @ Post offset value for the loop
- mov r2, #-2 @ Post offset value for the loop
- sub r3, #2 @ Loop counter
-
- @ Loop unrolling pre-processing.
- vqdmull.s16 q1, d30, d0
- vshll.s16 q0, d0, #16
- vqadd.s32 q2, q1, q8
- vshrn.i32 d6, q2, #16
- vmull.s16 q1, d31, d6
- vshl.s32 q1, #1
- vqadd.s32 q8, q1, q0
- vld1.32 d16[1], [r6] @ filter_state_ch1[1]
- vld1.32 d17[1], [r5] @ filter_state_ch2[1]
- sub r6, #4 @ &filter_state_ch1[0]
- sub r5, #4 @ &filter_state_ch2[0]
- vld1.16 d6[1], [r0], r2 @ data_ch1[1]
- vld1.16 d6[3], [r1], r2 @ data_ch2[1]
- vrev32.16 d0, d6
-
-FOR_LOOP:
- vqdmull.s16 q1, d30, d0
- vshll.s16 q0, d0, #16
- vqadd.s32 q2, q1, q8
- vshrn.i32 d4, q2, #16
- vmull.s16 q1, d31, d4
- vst1.16 d4[1], [r0], r4 @ Store data_ch1[n]
- vst1.16 d4[3], [r1], r4 @ Store data_ch2[n]
- vshl.s32 q1, #1
- vld1.16 d4[1], [r0], r2 @ Load data_ch1[n + 2]
- vld1.16 d4[3], [r1], r2 @ Load data_ch2[n + 2]
- vqadd.s32 q8, q1, q0
- vrev32.16 d0, d4
- vqdmull.s16 q1, d30, d0
- subs r3, #2
- vqadd.s32 q2, q1, q8
- vshrn.i32 d6, q2, #16
- vmull.s16 q1, d31, d6
- vshll.s16 q0, d0, #16
- vst1.16 d6[1], [r0], r4 @ Store data_ch1[n + 1]
- vst1.16 d6[3], [r1], r4 @ Store data_ch2[n + 1]
- vshl.s32 q1, #1
- vld1.16 d6[1], [r0], r2 @ Load data_ch1[n + 3]
- vld1.16 d6[3], [r1], r2 @ Load data_ch2[n + 3]
- vqadd.s32 q8, q1, q0
- vrev32.16 d0, d6
- bgt FOR_LOOP
-
- @ Loop unrolling post-processing.
- vqdmull.s16 q1, d30, d0
- vshll.s16 q0, d0, #16
- vqadd.s32 q2, q1, q8
- vshrn.i32 d4, q2, #16
- vmull.s16 q1, d31, d4
- vst1.16 d4[1], [r0]! @ Store data_ch1[n]
- vst1.16 d4[3], [r1]! @ Store data_ch2[n]
- vshl.s32 q1, #1
- vqadd.s32 q8, q1, q0
- vrev32.16 d0, d4
- vqdmull.s16 q1, d30, d0
- vshll.s16 q0, d0, #16
- vqadd.s32 q2, q1, q8
- vshrn.i32 d6, q2, #16
- vmull.s16 q1, d31, d6
- vst1.16 d6[1], [r0] @ Store data_ch1[n + 1]
- vst1.16 d6[3], [r1] @ Store data_ch2[n + 1]
- vshl.s32 q1, #1
- vst1.32 d16[0], [r6]! @ Store filter_state_ch1[0]
- vqadd.s32 q9, q1, q0
- vst1.32 d17[0], [r5]! @ Store filter_state_ch1[1]
- vst1.32 d18[1], [r6] @ Store filter_state_ch2[0]
- vst1.32 d19[1], [r5] @ Store filter_state_ch2[1]
-
- pop {r4 - r7}
- bx lr
-
-@void AllpassFilter2FixDec16BothChannels(
-@ int16_t *data_ch1, // Input and output in channel 1, in Q0
-@ int16_t *data_ch2, // Input and output in channel 2, in Q0
-@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
-@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
-@ const int length, // Length of the data buffers
-@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
-@ int32_t *filter_state_ch2) { // Filter state for channel 2, in Q16
-@ int n = 0;
-@ int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
-@ int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
-@ int16_t sample0_ch1 = 0, sample0_ch2 = 0;
-@ int16_t sample1_ch1 = 0, sample1_ch2 = 0;
-@ int32_t a0_ch1 = 0, a0_ch2 = 0;
-@ int32_t b0_ch1 = 0, b0_ch2 = 0;
-@
-@ int32_t a1_ch1 = 0, a1_ch2 = 0;
-@ int32_t b1_ch1 = 0, b1_ch2 = 0;
-@ int32_t b2_ch1 = 0, b2_ch2 = 0;
-@
-@ // Loop unrolling preprocessing.
-@
-@ sample0_ch1 = data_ch1[n];
-@ sample0_ch2 = data_ch2[n];
-@
-@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
-@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
-@
-@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
-@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
-@
-@ a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16);
-@ a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16);
-@
-@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
-@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
-@
-@ sample1_ch1 = data_ch1[n + 1];
-@ sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
-@ sample1_ch2 = data_ch2[n + 1];
-@ sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
-@
-@
-@ for (n = 0; n < length - 2; n += 2) {
-@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
-@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
-@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
-@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
-@
-@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
-@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16
-@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16
-@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16
-@
-@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
-@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
-@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
-@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
-@
-@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
-@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
-@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
-@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
-@
-@ sample0_ch1 = data_ch1[n + 2];
-@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@ sample0_ch2 = data_ch2[n + 2];
-@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
-@
-@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
-@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
-@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
-@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
-@
-@ b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
-@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
-@ b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
-@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
-@
-@ a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16);
-@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
-@ a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16);
-@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
-@
-@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
-@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
-@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
-@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
-@
-@
-@ sample1_ch1 = data_ch1[n + 3];
-@ sample0_ch1 = (int16_t) (b2_ch1 >> 16); //Save as Q0
-@ sample1_ch2 = data_ch2[n + 3];
-@ sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
-@
-@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
-@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@ data_ch2[n] = (int16_t) (b0_ch2 >> 16);
-@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
-@ }
-@
-@ // Loop unrolling post-processing.
-@
-@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
-@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
-@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
-@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
-@
-@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
-@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1);
-@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2);
-@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2);
-@
-@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
-@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
-@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
-@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
-@
-@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
-@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
-@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
-@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
-@
-@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
-@ data_ch2[n] = (int16_t) (b0_ch2 >> 16);
-@
-@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
-@
-@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
-@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
-@
-@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
-@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
-@
-@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
-@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
-@
-@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
-@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
-@
-@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
-@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
-@
-@ filter_state_ch1[0] = state0_ch1;
-@ filter_state_ch1[1] = state1_ch1;
-@ filter_state_ch2[0] = state0_ch2;
-@ filter_state_ch2[1] = state1_ch2;
-@}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
deleted file mode 100644
index 3c5ac64..0000000
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S
+++ /dev/null
@@ -1,145 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-@ Reference code in filters.c. Output is bit-exact.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
-.align 2
-
-@ int WebRtcIsacfix_AutocorrNeon(
-@ int32_t* __restrict r,
-@ const int16_t* __restrict x,
-@ int16_t N,
-@ int16_t order,
-@ int16_t* __restrict scale);
-
-DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
- push {r3 - r12}
-
- @ Constant initializations
- mov r4, #33
- vmov.i32 d0, #0
- vmov.i32 q8, #0
- vmov.i32 d29, #0 @ Initialize (-scale).
- vmov.u8 d30, #255 @ Initialize d30 as -1.
- vmov.i32 d0[0], r4 @ d0: 00000033 (low), 00000000 (high)
- vmov.i32 d25, #32
-
- mov r5, r1 @ x
- mov r6, r2 @ N
-
-@ Generate the first coefficient r0.
-LOOP_R0:
- vld1.16 {d18}, [r5]! @ x[]
- subs r6, r6, #4
- vmull.s16 q9, d18, d18
- vpadal.s32 q8, q9
- bgt LOOP_R0
-
- vadd.i64 d16, d16, d17
-
- @ Calculate scaling (the value of shifting).
- vmov d17, d16
-
- @ Check overflow and determine the value for 'scale'.
- @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
- @ lower 32-bit words. Note that we don't care about the value of the upper
- @ word in d17.
-
- @ Check the case of 1 bit overflow. If it occurs store the results for
- @ scale and r[0] in d17 and d29.
-
- vshr.u64 d3, d16, #1
- vclt.s32 d1, d16, #0 @ < 0 ?
- vbit d17, d3, d1 @ For r[0]
- vbit d29, d30, d1 @ -scale = -1
-
- @ For the case of more than 1 bit overflow. If it occurs overwrite the
- @ results for scale and r[0] in d17 and d29.
- vclz.s32 d5, d16 @ Leading zeros of the two 32 bit words.
- vshr.s64 d26, d5, #32 @ Keep only the upper 32 bits.
- vsub.i64 d31, d26, d0 @ zeros - 33
- vshl.i64 d27, d26, #32
- vorr d27, d26 @ Duplicate the high word with its low one.
- vshl.u64 d2, d16, d31 @ Shift by (-scale).
- vclt.s32 d1, d27, d25 @ < 32 ?
- vbit d17, d2, d1 @ For r[0]
- vbit d29, d31, d1 @ -scale
-
- vst1.32 d17[0], [r0]! @ r[0]
- mov r5, #1 @ outer loop counter
-
-@ Generate rest of the coefficients
-LOOP_R:
- vmov.i32 q8, #0 @ Initialize the accumulation result.
- vmov.i32 q9, #0 @ Initialize the accumulation result.
- mov r7, r1 @ &x[0]
- add r6, r7, r5, lsl #1 @ x[i]
- sub r12, r2, r5 @ N - i
- lsr r8, r12, #3 @ inner loop counter
- sub r12, r8, lsl #3 @ Leftover samples to be processed
-
-LOOP_8X_SAMPLES: @ Multiple of 8 samples
- vld1.16 {d20, d21}, [r7]! @ x[0, ...]
- vld1.16 {d22, d23}, [r6]! @ x[i, ...]
- vmull.s16 q12, d20, d22
- vmull.s16 q13, d21, d23
- subs r8, #1
- vpadal.s32 q8, q12
- vpadal.s32 q9, q13
- bgt LOOP_8X_SAMPLES
-
- cmp r12, #4
- blt REST_SAMPLES
-
-Four_SAMPLES:
- vld1.16 d20, [r7]!
- vld1.16 d22, [r6]!
- vmull.s16 q12, d20, d22
- vpadal.s32 q8, q12
- sub r12, #4
-
-REST_SAMPLES:
- mov r8, #0 @ Initialize lower word of the accumulation.
- mov r4, #0 @ Initialize upper word of the accumulation.
- cmp r12, #0
- ble SUMUP
-
-LOOP_REST_SAMPLES:
- ldrh r9, [r7], #2 @ x[0, ...]
- ldrh r10, [r6], #2 @ x[i, ...]
- smulbb r11, r9, r10
- adds r8, r8, r11 @ lower word of the accumulation.
- adc r4, r4, r11, asr #31 @ upper word of the accumulation.
- subs r12, #1
- bgt LOOP_REST_SAMPLES
-
-@ Added the multiplication results together and do a shift.
-SUMUP:
- vadd.i64 d16, d17
- vadd.i64 d18, d19
- vadd.i64 d18, d16
- vmov d17, r8, r4
- vadd.i64 d18, d17
- vshl.s64 d18, d29 @ Shift left by (-scale).
- vst1.32 d18[0], [r0]! @ r[i]
-
- add r5, #1
- cmp r5, r3
- ble LOOP_R
-
- vneg.s32 d29, d29 @ Get value for 'scale'.
- ldr r2, [sp, #40] @ &scale
- add r0, r3, #1 @ return (order + 1)
- vst1.s16 d29[0], [r2] @ Store 'scale'
-
- pop {r3 - r12}
- bx lr
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
index c211162..2fba3e6 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
@@ -205,10 +205,6 @@
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeNeon;
WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecNeon;
-#if !(defined WEBRTC_ARCH_ARM64_NEON)
- WebRtcIsacfix_CalculateResidualEnergy =
- WebRtcIsacfix_CalculateResidualEnergyNeon;
-#endif
// Disable AllpassFilter2FixDec16Neon function due to a clang bug.
// Refer more details at:
// https://code.google.com/p/webrtc/issues/detail?id=4567
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S
deleted file mode 100644
index f31a32d..0000000
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.S
+++ /dev/null
@@ -1,146 +0,0 @@
-@
-@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ lattice_neon.s
-@
-@ Contains a function for the core loop in the normalized lattice MA
-@ filter routine for iSAC codec, optimized for ARM Neon platform.
-@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
-@ int16_t input1,
-@ int32_t input2,
-@ int32_t* ptr0,
-@ int32_t* ptr1,
-@ int32_t* __restrict ptr2);
-@ It calculates
-@ *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
-@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
-@ in Q15 domain.
-@
-@ Reference code in lattice.c.
-@ Output is not bit-exact with the reference C code, due to the replacement
-@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
-@ instructions, smulwb, and smull. Speech quality was not degraded by
-@ testing speech and tone vectors.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-#include "settings.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
-.align 2
-DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
- push {r4-r8}
-
- vdup.32 d28, r0 @ Initialize Neon register with input0
- vdup.32 d29, r1 @ Initialize Neon register with input1
- vdup.32 d30, r2 @ Initialize Neon register with input2
- ldr r4, [sp, #20] @ ptr1
- ldr r12, [sp, #24] @ ptr2
-
- @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
- @ Leftover samples after the loop, in r6:
- @ r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2
- mov r6, #HALF_SUBFRAMELEN
- sub r6, #1
- lsr r5, r6, #2
- sub r6, r5, lsl #2
-
- @ First r5 iterations in a loop.
-
-LOOP:
- vld1.32 {d0, d1}, [r3]! @ *ptr0
-
- vmull.s32 q10, d0, d28 @ tmp32a = input0 * (*ptr0)
- vmull.s32 q11, d1, d28 @ tmp32a = input0 * (*ptr0)
- vmull.s32 q12, d0, d29 @ input1 * (*ptr0)
- vmull.s32 q13, d1, d29 @ input1 * (*ptr0)
-
- vrshrn.i64 d4, q10, #15
- vrshrn.i64 d5, q11, #15
-
- vld1.32 {d2, d3}, [r12] @ *ptr2
- vadd.i32 q3, q2, q1 @ tmp32b = *ptr2 + tmp32a
-
- vrshrn.i64 d0, q12, #15
-
- vmull.s32 q10, d6, d30 @ input2 * (*ptr2 + tmp32b)
- vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32b)
-
- vrshrn.i64 d16, q10, #16
- vrshrn.i64 d17, q11, #16
-
- vmull.s32 q10, d16, d28 @ input0 * (*ptr2)
- vmull.s32 q11, d17, d28 @ input0 * (*ptr2)
-
- vrshrn.i64 d1, q13, #15
- vrshrn.i64 d18, q10, #15
- vrshrn.i64 d19, q11, #15
-
- vst1.32 {d16, d17}, [r12]! @ *ptr2
-
- vadd.i32 q9, q0, q9
- subs r5, #1
- vst1.32 {d18, d19}, [r4]! @ *ptr1
-
- bgt LOOP
-
- @ Check how many samples still need to be processed.
- subs r6, #2
- blt LAST_SAMPLE
-
- @ Process two more samples:
- vld1.32 d0, [r3]! @ *ptr0
-
- vmull.s32 q11, d0, d28 @ tmp32a = input0 * (*ptr0)
- vmull.s32 q13, d0, d29 @ input1 * (*ptr0)
-
- vld1.32 d18, [r12] @ *ptr2
- vrshrn.i64 d4, q11, #15
-
- vadd.i32 d7, d4, d18 @ tmp32b = *ptr2 + tmp32a
- vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32b)
- vrshrn.i64 d16, q11, #16
-
- vmull.s32 q11, d16, d28 @ input0 * (*ptr2)
- vst1.32 d16, [r12]! @ *ptr2
-
- vrshrn.i64 d0, q13, #15
- vrshrn.i64 d19, q11, #15
- vadd.i32 d19, d0, d19
-
- vst1.32 d19, [r4]! @ *ptr1
-
- @ If there's still one more sample, process it here.
-LAST_SAMPLE:
- cmp r6, #1
- bne END
-
- @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
-
- ldr r7, [r3] @ *ptr0
- ldr r8, [r12] @ *ptr2
-
- smulwb r5, r7, r0 @ tmp32a = *ptr0 * input0 >> 16
- add r8, r8, r5, lsl #1 @ tmp32b = *ptr2 + (tmp32a << 1)
- smull r5, r6, r8, r2 @ tmp32b * input2, in 64 bits
- lsl r6, #16
- add r6, r5, lsr #16 @ Only take the middle 32 bits
- str r6, [r12] @ Output (*ptr2, as 32 bits)
-
- @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
-
- smulwb r5, r7, r1 @ tmp32a = *ptr0 * input1 >> 16
- smulwb r6, r6, r0 @ tmp32b = *ptr2 * input0 >> 16
- lsl r5, r5, #1
- add r5, r6, lsl #1
- str r5, [r4] @ Output (*ptr1)
-
-END:
- pop {r4-r8}
- bx lr
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h
index 1270c14..aac9275 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h
@@ -53,15 +53,6 @@
int32_t* corr_coeffs,
int* q_val_residual_energy);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
-int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
- int32_t q_val_corr,
- int q_val_polynomial,
- int16_t* a_polynomial,
- int32_t* corr_coeffs,
- int* q_val_residual_energy);
-#endif
-
#if defined(MIPS_DSP_R2_LE)
int32_t WebRtcIsacfix_CalculateResidualEnergyMIPS(int lpc_order,
int32_t q_val_corr,
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S
deleted file mode 100644
index a5955c2..0000000
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_neon.S
+++ /dev/null
@@ -1,173 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ Contains a function for WebRtcIsacfix_CalculateResidualEnergyNeon() in
-@ iSAC codec, optimized for ARM Neon platform. Reference code in
-@ lpc_masking_model.c.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
-.align 2
-
-@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
-@ int32_t q_val_corr,
-@ int q_val_polynomial,
-@ int16_t* a_polynomial,
-@ int32_t* corr_coeffs,
-@ int* q_val_residual_energy);
-DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
- push {r4-r11}
-
- sub r13, r13, #16
- str r1, [r13, #8]
- str r2, [r13, #12]
-
- mov r4, #1
- vmov.s64 q11, #0 @ Initialize shift_internal.
- vmov.s64 q13, #0 @ Initialize sum64.
- vmov.s64 q10, #0
- vmov.u8 d20[0], r4 @ Set q10 to 1.
-
- cmp r0, #0
- blt POST_LOOP_I
-
- add r9, r3, r0, asl #1 @ &a_polynomial[lpc_order]
- mov r6, #0 @ Loop counter i.
- ldr r11, [r13, #48]
- sub r10, r0, #1
- mov r7, r3 @ &a_polynomial[0]
- str r9, [r13, #4]
-
-LOOP_I:
- ldr r2, [r11], #4 @ corr_coeffs[i]
- vmov.s64 q15, #0 @ Initialize the sum64_tmp.
- vdup.s32 d25, r2
-
- cmp r0, r6 @ Compare lpc_order to i.
- movle r2, r6
- ble POST_LOOP_J
-
- mov r1, r6 @ j = i;
- mov r12, r7 @ &a_polynomial[i]
- mov r4, r3 @ &a_polynomial[j - i]
-
-LOOP_J:
- ldr r8, [r12], #4
- ldr r5, [r4], #4
- vmov.u32 d0[0], r8
- vmov.u32 d1[0], r5
- vmull.s16 q0, d0, d1
- vmull.s32 q0, d0, d25
- cmp r6, #0 @ i == 0?
- vshl.s64 q0, q11
- beq SUM1
- vshl.s64 q0, #1
-
-SUM1:
- vqadd.s64 q14, q0, q15 @ Sum and test overflow.
- add r1, r1, #2
- bvc MOV1 @ Skip the shift if there's no overflow.
- vshr.s64 q0, #1
- vshr.s64 q15, #1
- vadd.s64 q14, q0, q15
- vsub.s64 q11, q10
-
-MOV1:
- cmp r0, r1 @ Compare lpc_order to j.
- vmov.s64 q15, q14
- bgt LOOP_J
-
- bic r1, r10, #1
- add r2, r6, #2
- add r2, r1, r2
-
-POST_LOOP_J:
- vqadd.s64 q0, q13, q15 @ Sum and test overflow.
- bvc MOV2 @ Skip the shift if there's no overflow.
- vshr.s64 q13, #1
- vshr.s64 q15, #1
- vadd.s64 q0, q13, q15
- vsub.s64 q11, q10
-
-MOV2:
- vmov.s64 q13, q0 @ update sum64.
- cmp r2, r0
- bne CHECK_LOOP_CONDITION
-
- @ Last sample in the inner loop.
- ldr r4, [r13, #4]
- ldrsh r8, [r4]
- ldrsh r12, [r9]
- mul r8, r8, r12
- vmov.s32 d0[0], r8
- vmull.s32 q0, d0, d25
- cmp r6, #0 @ i == 0?
- vshl.s64 q0, q11
- beq SUM2
- vshl.s64 q0, #1
-
-SUM2:
- vqadd.s64 d1, d0, d26 @ Sum and test overflow.
- bvc MOV3 @ Skip the shift if there's no overflow.
- vshr.s64 q13, #1
- vshr.s64 d0, #1
- vadd.s64 d1, d0, d26
- vsub.s64 q11, q10
-
-MOV3:
- vmov.s64 d26, d1 @ update sum64.
-
-CHECK_LOOP_CONDITION:
- add r6, r6, #1
- sub r9, r9, #2
- cmp r0, r6 @ Compare i to lpc_order.
- sub r10, r10, #1
- add r7, r7, #2
- bge LOOP_I
-
-POST_LOOP_I:
- mov r3, #0
- vqadd.s64 d0, d26, d27 @ Sum and test overflow.
- bvc GET_SHIFT_NORM @ Skip the shift if there's no overflow.
- vshr.s64 q13, #1
- vadd.s64 d0, d26, d27
- vsub.s64 q11, q10
-
-GET_SHIFT_NORM:
- vcls.s32 d1, d0 @ Count leading extra sign bits.
- vmov.32 r2, d1[1] @ Store # of sign bits of only the 32 MSBs.
- vmovl.s32 q1, d1
- vshl.s64 d0, d3 @ d3 contains # of sign bits of the 32 MSBs.
-
- vcls.s32 d1, d0 @ Count again the leading extra sign bits.
- vmov.s32 r1, d1[1] @ Store # of sign bits of only the 32 MSBs.
- vmovl.s32 q1, d1
- vshl.s64 d0, d3 @ d3 contains # of sign bits of the 32 MSBs.
-
- vmov.s32 r0, d0[1] @ residual_energy
- vmov.s32 r3, d22[0] @ shift_internal
-
- @ Calculate the value for q_val_residual_energy.
- ldr r4, [r13, #8] @ q_val_corr
- ldr r5, [r13, #12] @ q_val_polynomial
- sub r12, r4, #32
- add r12, r12, r5, asl #1
- add r1, r12, r1 @ add 1st part of shift_internal.
- add r12, r1, r2 @ add 2nd part of shift_internal.
- ldr r2, [r13, #52]
- add r3, r12, r3 @ value for q_val_residual_energy.
- str r3, [r2, #0]
-
- add r13, r13, #16
- pop {r4-r11}
- bx r14
-
-
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc
index aaeff2c..0d32ff8 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model_unittest.cc
@@ -58,11 +58,4 @@
TEST_F(LpcMaskingModelTest, CalculateResidualEnergyTest) {
CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyC);
-#ifdef WEBRTC_DETECT_ARM_NEON
- if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
- CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon);
- }
-#elif defined(WEBRTC_ARCH_ARM_NEON)
- CalculateResidualEnergyTester(WebRtcIsacfix_CalculateResidualEnergyNeon);
-#endif
}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S
deleted file mode 100644
index 98ce389..0000000
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_neon.S
+++ /dev/null
@@ -1,645 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-@ Reference code in transform.c. Bit not exact due to how rounding is
-@ done in C code and ARM instructions, but quality by assembly code is
-@ not worse.
-
-#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcIsacfix_Spec2TimeNeon
-GLOBAL_FUNCTION WebRtcIsacfix_Time2SpecNeon
-GLOBAL_LABEL WebRtcIsacfix_kSinTab1
-GLOBAL_LABEL WebRtcIsacfix_kCosTab1
-GLOBAL_LABEL WebRtcIsacfix_kSinTab2
-
-@ void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9,
-@ int16_t* inre2Q9,
-@ int16_t* outreQ7,
-@ int16_t* outimQ7);
-
-DEFINE_FUNCTION WebRtcIsacfix_Time2SpecNeon
-.align 2
- push {r3-r11,lr} @ need to push r4-r11, but push r3 too to keep
- @ stack 8-byte aligned
- sub sp, sp, #(16 + FRAMESAMPLES * 4)
-
- str r0, [sp] @ inre1Q9
- str r1, [sp, #4] @ inre2Q9
- str r2, [sp, #8] @ outreQ7
- str r3, [sp, #12] @ outimQ7
-
- mov r8, #(FRAMESAMPLES - 16)
- add r12, r0, r8 @ &inreQ7[FRAMESAMPLES / 2 - 4]
- add r11, r1, r8 @ &inimQ7[FRAMESAMPLES / 2 - 4]
- add r4, sp, #16 @ tmpreQ16;
- add r5, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16;
-
- adr r9, WebRtcIsacfix_kCosTab1
-#if defined(__APPLE__)
- mov r6, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#else
- mov r6, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#endif
- add r10, r9, r6 @ WebRtcIsacfix_kSinTab1
-
- vmov.u32 q14, #0 @ Initialize the maximum values for tmpInIm.
- vmov.u32 q15, #0 @ Initialize the maximum values for tmpInRe.
- movw r6, #16921 @ 0.5 / sqrt(240) in Q19
- lsl r6, #5 @ Together with vqdmulh, net effect is ">> 26".
- mov r8, #(FRAMESAMPLES / 2) @ loop counter
- vdup.s32 q11, r6
-
-Time2Spec_TransformAndFindMax:
-@ Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
-
- subs r8, #8
-
- vld1.16 {q0}, [r9, :64]! @ WebRtcIsacfix_kCosTab1[]
- vld1.16 {q2}, [r0]! @ inre1Q9[]
- vmull.s16 q8, d0, d4 @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
- vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab1[]
- vmull.s16 q9, d1, d5 @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
- vld1.16 {q3}, [r1]! @ inre2Q9[]
- vmlal.s16 q8, d2, d6 @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
- vmlal.s16 q9, d3, d7 @ WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
- vmull.s16 q12, d0, d6 @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
- vmull.s16 q13, d1, d7 @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
- vmlsl.s16 q12, d2, d4 @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
- vmlsl.s16 q13, d3, d5 @ WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
-
- vqdmulh.s32 q0, q8, q11 @ xrQ16 * factQ19
- vqdmulh.s32 q1, q9, q11 @ xrQ16 * factQ19
- vqdmulh.s32 q2, q12, q11 @ xrQ16 * factQ19
- vqdmulh.s32 q3, q13, q11 @ xrQ16 * factQ19
-
- @ Find the absolute maximum in the vectors and store them.
- vabs.s32 q8, q0
- vabs.s32 q9, q1
- vabs.s32 q12, q2
- vst1.32 {q0, q1}, [r4]! @ tmpreQ16[k]
- vabs.s32 q13, q3
- vmax.u32 q14, q8 @ Use u32 so we don't lose the value 0x80000000.
- vmax.u32 q15, q12
- vst1.32 {q2, q3}, [r5]! @ tmpimQ16[k]
- vmax.u32 q15, q13
- vmax.u32 q14, q9 @ Maximum for outre1Q16[].
-
- bgt Time2Spec_TransformAndFindMax
-
- @ Find the maximum value in the Neon registers
- vmax.u32 d28, d29
- vmax.u32 d30, d31
- vpmax.u32 d28, d28, d28 @ Both 32 bits words hold the same value tmpInIm.
- vpmax.u32 d30, d30, d30 @ Both 32 bits words hold the same value tmpInRe.
- vmax.s32 d30, d28, d30 @ if (yrQ16 > xrQ16) {xrQ16 = yrQ16};
-
- ldr r4, [sp] @ inre1Q9
- vcls.s32 d31, d30 @ sh = WebRtcSpl_NormW32(tmpInRe);
- ldr r5, [sp, #4] @ inre2Q9
- vmov.i32 d30, #24
- add r6, sp, #16 @ tmpreQ16;
- vsub.s32 d31, d31, d30 @ sh = sh - 24;
- add r7, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16;
- vdup.s32 q8, d31[0] @ sh
-
- mov r8, #(FRAMESAMPLES / 2) @ loop counter
-
-Time2Spec_PreFftShift:
- subs r8, #16
-
- vld1.32 {q0, q1}, [r6]! @ tmpreQ16[]
- vrshl.s32 q0, q0, q8
- vld1.32 {q2, q3}, [r6]! @ tmpreQ16[]
- vrshl.s32 q1, q1, q8
- vld1.32 {q10, q11}, [r7]! @ tmpimQ16[]
- vrshl.s32 q2, q2, q8
- vld1.32 {q12, q13}, [r7]! @ tmpimQ16[]
- vrshl.s32 q3, q3, q8
- vrshl.s32 q10, q10, q8
- vrshl.s32 q11, q11, q8
- vrshl.s32 q12, q12, q8
- vrshl.s32 q13, q13, q8
-
- vmovn.s32 d0, q0
- vmovn.s32 d1, q1
- vmovn.s32 d2, q2
- vmovn.s32 d3, q3
- vmovn.s32 d4, q10
- vmovn.s32 d5, q11
- vmovn.s32 d6, q12
- vmovn.s32 d7, q13
-
- vst1.16 {q0, q1}, [r4]! @ inre1Q9[]
- vst1.16 {q2, q3}, [r5]! @ inre2Q9[]
-
- bgt Time2Spec_PreFftShift
-
- vmov.s32 r10, d16[0] @ Store value of sh.
- ldr r0, [sp] @ inre1Q9
- ldr r1, [sp, #4] @ inre2Q9
- mov r2, #-1
- CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
-
- vdup.s32 q8, r10 @ sh
- mov r8, #(FRAMESAMPLES - 8)
- ldr r2, [sp, #8] @ outreQ7
- ldr r3, [sp, #12] @ outimQ7
- add r11, r2, r8 @ &outRe1Q16[FRAMESAMPLES / 2 - 4]
- add r12, r3, r8 @ &outim2Q16[FRAMESAMPLES / 2 - 4]
- ldr r6, [sp] @ inre1Q9
- ldr r7, [sp, #4] @ inre2Q9
- add r4, r6, r8 @ &inre1Q9[FRAMESAMPLES / 2 - 4]
- add r5, r7, r8 @ &inre2Q9[FRAMESAMPLES / 2 - 4]
- adr r10, WebRtcIsacfix_kSinTab2
-
- add r9, r10, #(120*2 - 8) @ &WebRtcIsacfix_kSinTab2[119 - 4]
-
- vneg.s32 q15, q8 @ -sh
- vmov.i32 q0, #23
- vsub.s32 q15, q15, q0 @ -sh - 23
-
- mov r8, #(FRAMESAMPLES / 4) @ loop counter
-
- @ Pre-load variables.
- vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
- vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
- vld1.16 {d0}, [r6]! @ inre1Q9
- vld1.16 {d1}, [r7]! @ inre2Q9
-
-Time2Spec_PostFftTransform:
-@ By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
-@ ">> 14" and then ">> 9" as in the C code.
-
- vld1.16 {d6}, [r9, :64] @ kCosTab2[]
- vneg.s16 d6, d6
- vld1.16 {d7}, [r10, :64]! @ WebRtcIsacfix_kSinTab2[]
- vrev64.16 q1, q1 @ Reverse samples in 2nd half of xrQ16[].
- vqadd.s16 d4, d0, d2 @ xrQ16
- vqsub.s16 d5, d1, d3 @ xiQ16
- vrev64.16 d6, d6
-
- sub r9, #8 @ Update pointers for kCosTab2[].
- sub r4, #8 @ Update pointers for inre1Q9[].
- sub r5, #8 @ Update pointers for inr22Q9[].
- subs r8, #4 @ Update loop counter.
-
- vqadd.s16 d1, d1, d3 @ yrQ16
- vqsub.s16 d0, d2, d0 @ yiQ16
-
- vmull.s16 q12, d6, d4 @ kCosTab2[k] * xrQ16
- vmlsl.s16 q12, d7, d5 @ WebRtcIsacfix_kSinTab2[k] * xiQ16
- vmull.s16 q13, d7, d4 @ WebRtcIsacfix_kSinTab2[k] * xrQ16
- vmlal.s16 q13, d6, d5 @ kCosTab2[k] * xiQ16
- vmull.s16 q9, d7, d1 @ WebRtcIsacfix_kSinTab2[k] * yrQ16
- vmlal.s16 q9, d6, d0 @ kCosTab2[k] * yiQ16
- vmull.s16 q10, d7, d0 @ WebRtcIsacfix_kSinTab2[k] * yiQ16
- vmlsl.s16 q10, d6, d1 @ kCosTab2[k] * yrQ16
-
- vshl.s32 q12, q12, q15
- vshl.s32 q13, q13, q15
- vshl.s32 q9, q9, q15
- vshl.s32 q10, q10, q15
-
- vneg.s32 q8, q9
- vld1.16 {d0}, [r6]! @ inre1Q9
- vmovn.s32 d24, q12
- vld1.16 {d1}, [r7]! @ inre2Q9
- vmovn.s32 d25, q13
- vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
- vmovn.s32 d5, q10
- vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
- vmovn.s32 d4, q8
- vst1.16 {d24}, [r2]! @ outreQ7[k]
- vrev64.16 q2, q2 @ Reverse the order of the samples.
- vst1.16 {d25}, [r3]! @ outimQ7[k]
- vst1.16 {d4}, [r11] @ outreQ7[FRAMESAMPLES / 2 - 1 - k]
- vst1.16 {d5}, [r12] @ outimQ7[FRAMESAMPLES / 2 - 1 - k]
- sub r11, #8 @ Update pointers for outreQ7[].
- sub r12, #8 @ Update pointers for outimQ7[].
-
- bgt Time2Spec_PostFftTransform
-
- add sp, sp, #(16 + FRAMESAMPLES * 4)
- pop {r3-r11,pc}
-
-.align 8
-@ Cosine table 1 in Q14
-WebRtcIsacfix_kCosTab1:
-_WebRtcIsacfix_kCosTab1: @ Label for iOS
- .short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
- .short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
- .short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
- .short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
- .short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
- .short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
- .short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
- .short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
- .short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
- .short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
- .short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
- .short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
- .short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
- .short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
- .short 1713, 1499, 1285, 1072, 857, 643, 429, 214
- .short 0, -214, -429, -643, -857, -1072, -1285, -1499
- .short -1713, -1926, -2139, -2351, -2563, -2775, -2986, -3196
- .short -3406, -3616, -3825, -4033, -4240, -4447, -4653, -4859
- .short -5063, -5266, -5469, -5671, -5872, -6071, -6270, -6467
- .short -6664, -6859, -7053, -7246, -7438, -7629, -7818, -8006
- .short -8192, -8377, -8561, -8743, -8923, -9102, -9280, -9456
- .short -9630, -9803, -9974, -10143, -10311, -10477, -10641, -10803
- .short -10963, -11121, -11278, -11433, -11585, -11736, -11885, -12031
- .short -12176, -12318, -12458, -12597, -12733, -12867, -12998, -13128
- .short -13255, -13380, -13502, -13623, -13741, -13856, -13970, -14081
- .short -14189, -14295, -14399, -14500, -14598, -14694, -14788, -14879
- .short -14968, -15053, -15137, -15218, -15296, -15371, -15444, -15515
- .short -15582, -15647, -15709, -15769, -15826, -15880, -15931, -15980
- .short -16026, -16069, -16110, -16147, -16182, -16214, -16244, -16270
- .short -16294, -16315, -16333, -16349, -16362, -16371, -16378, -16383
-
-.align 8
-@ Sine table 2 in Q14
-WebRtcIsacfix_kSinTab2:
-_WebRtcIsacfix_kSinTab2: @ Label for iOS
- .short 16384, -16381, 16375, -16367, 16356, -16342, 16325, -16305
- .short 16283, -16257, 16229, -16199, 16165, -16129, 16090, -16048
- .short 16003, -15956, 15906, -15853, 15798, -15739, 15679, -15615
- .short 15549, -15480, 15408, -15334, 15257, -15178, 15095, -15011
- .short 14924, -14834, 14741, -14647, 14549, -14449, 14347, -14242
- .short 14135, -14025, 13913, -13799, 13682, -13563, 13441, -13318
- .short 13192, -13063, 12933, -12800, 12665, -12528, 12389, -12247
- .short 12104, -11958, 11810, -11661, 11509, -11356, 11200, -11042
- .short 10883, -10722, 10559, -10394, 10227, -10059, 9889, -9717
- .short 9543, -9368, 9191, -9013, 8833, -8652, 8469, -8285
- .short 8099, -7912, 7723, -7534, 7342, -7150, 6957, -6762
- .short 6566, -6369, 6171, -5971, 5771, -5570, 5368, -5165
- .short 4961, -4756, 4550, -4344, 4137, -3929, 3720, -3511
- .short 3301, -3091, 2880, -2669, 2457, -2245, 2032, -1819
- .short 1606, -1392, 1179, -965, 750, -536, 322, -107
-
-@ Table kCosTab2 was removed since its data is redundant with kSinTab2.
-
-.align 8
-@ Sine table 1 in Q14
-WebRtcIsacfix_kSinTab1:
-_WebRtcIsacfix_kSinTab1: @ Label for iOS
- .short 0, 214, 429, 643, 857, 1072, 1285, 1499
- .short 1713, 1926, 2139, 2351, 2563, 2775, 2986, 3196
- .short 3406, 3616, 3825, 4033, 4240, 4447, 4653, 4859
- .short 5063, 5266, 5469, 5671, 5872, 6071, 6270, 6467
- .short 6664, 6859, 7053, 7246, 7438, 7629, 7818, 8006
- .short 8192, 8377, 8561, 8743, 8923, 9102, 9280, 9456
- .short 9630, 9803, 9974, 10143, 10311, 10477, 10641, 10803
- .short 10963, 11121, 11278, 11433, 11585, 11736, 11885, 12031
- .short 12176, 12318, 12458, 12597, 12733, 12867, 12998, 13128
- .short 13255, 13380, 13502, 13623, 13741, 13856, 13970, 14081
- .short 14189, 14295, 14399, 14500, 14598, 14694, 14788, 14879
- .short 14968, 15053, 15137, 15218, 15296, 15371, 15444, 15515
- .short 15582, 15647, 15709, 15769, 15826, 15880, 15931, 15980
- .short 16026, 16069, 16110, 16147, 16182, 16214, 16244, 16270
- .short 16294, 16315, 16333, 16349, 16362, 16371, 16378, 16383
- .short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
- .short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
- .short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
- .short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
- .short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
- .short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
- .short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
- .short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
- .short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
- .short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
- .short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
- .short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
- .short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
- .short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
- .short 1713, 1499, 1285, 1072, 857, 643, 429, 214
-
-@ void WebRtcIsacfix_Spec2TimeNeon(int16_t *inreQ7,
-@ int16_t *inimQ7,
-@ int32_t *outre1Q16,
-@ int32_t *outre2Q16);
-
-DEFINE_FUNCTION WebRtcIsacfix_Spec2TimeNeon
-.align 2
- push {r3-r11,lr} @ need to push r4-r11, but push r3 too to keep
- @ stack 8-byte aligned
-
- sub sp, sp, #16
- str r0, [sp] @ inreQ7
- str r1, [sp, #4] @ inimQ7
- str r2, [sp, #8] @ outre1Q16
- str r3, [sp, #12] @ outre2Q16
-
- mov r8, #(FRAMESAMPLES - 16)
- add r12, r0, r8 @ &inreQ7[FRAMESAMPLES / 2 - 8]
- add r11, r1, r8 @ &inimQ7[FRAMESAMPLES / 2 - 8]
- add r4, r2, r8, lsl #1 @ &outRe1Q16[FRAMESAMPLES / 2 - 8]
- add r6, r3, r8, lsl #1 @ &outRe2Q16[FRAMESAMPLES / 2 - 8]
-
- mov r8, #(FRAMESAMPLES / 2) @ loop counter
- adr r10, WebRtcIsacfix_kSinTab2
- add r9, r10, #(120*2 - 16) @ &WebRtcIsacfix_kSinTab2[119 - 8]
-
- vpush {q4-q7}
-
- mov r5, #-32
- mov r7, #-16
- vmov.u32 q6, #0 @ Initialize the maximum values for tmpInIm.
- vmov.u32 q7, #0 @ Initialize the maximum values for tmpInRe.
-
-TransformAndFindMax:
-@ Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
-@ Bit-exact.
-
- subs r8, #16
-
- vld1.16 {q0}, [r9, :64] @ kCosTab2[]
- sub r9, #16
- vld1.16 {q2}, [r0]! @ inreQ7[]
- vneg.s16 q0, q0
- vld1.16 {q3}, [r1]! @ inimQ7[]
- vrev64.16 d0, d0
- vrev64.16 d1, d1
- vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab2[]
- vswp d0, d1
-
- vmull.s16 q8, d2, d6 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
- vmull.s16 q9, d3, d7 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
- vmlal.s16 q8, d0, d4 @ kCosTab2[k] * inreQ7[k]
- vmlal.s16 q9, d1, d5 @ kCosTab2[k] * inreQ7[k]
- vmull.s16 q12, d0, d6 @ kCosTab2[k] * inimQ7[k]
- vmull.s16 q13, d1, d7 @ kCosTab2[k] * inimQ7[k]
- vmlsl.s16 q12, d2, d4 @ WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
- vmlsl.s16 q13, d3, d5 @ WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
-
- vld1.16 {q2}, [r11], r7 @ inimQ7[FRAMESAMPLES / 2 - 8 + i]
- vld1.16 {q3}, [r12], r7 @ inreQ7[FRAMESAMPLES / 2 - 8 + i]
-
- vrev64.16 q2, q2 @ Reverse the order of the samples
- vrev64.16 q3, q3 @ Reverse the order of the samples
-
- vmull.s16 q14, d2, d5 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
- vmull.s16 q15, d3, d4 @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
- vmlsl.s16 q14, d0, d7 @ q14 -= kCosTab2[k] * inreQ7[k]
- vmlsl.s16 q15, d1, d6 @ q15 -= kCosTab2[k] * inreQ7[k]
-
- vmull.s16 q10, d0, d5 @ kCosTab2[k] * inimQ7[]
- vmull.s16 q11, d1, d4 @ kCosTab2[k] * inimQ7[]
- vmlal.s16 q10, d2, d7 @ q10 += WebRtcIsacfix_kSinTab2[k] * inreQ7[]
- vmlal.s16 q11, d3, d6 @ q11 += WebRtcIsacfix_kSinTab2[k] * inreQ7[]
-
- vshr.s32 q8, q8, #5 @ xrQ16
- vshr.s32 q9, q9, #5 @ xrQ16
- vshr.s32 q12, q12, #5 @ xiQ16
- vshr.s32 q13, q13, #5 @ xiQ16
- vshr.s32 q14, q14, #5 @ yiQ16
- vshr.s32 q15, q15, #5 @ yiQ16
-
- vneg.s32 q10, q10
- vneg.s32 q11, q11
-
- @ xrQ16 - yiQ16
- vsub.s32 q0, q8, q14
- vsub.s32 q1, q9, q15
-
- vshr.s32 q10, q10, #5 @ yrQ16
- vshr.s32 q11, q11, #5 @ yrQ16
-
- @ xrQ16 + yiQ16
- vadd.s32 q3, q8, q14
- vadd.s32 q2, q9, q15
-
- @ yrQ16 + xiQ16
- vadd.s32 q4, q10, q12
- vadd.s32 q5, q11, q13
-
- @ yrQ16 - xiQ16
- vsub.s32 q8, q11, q13
- vsub.s32 q9, q10, q12
-
- @ Reverse the order of the samples
- vrev64.32 q2, q2
- vrev64.32 q3, q3
- vrev64.32 q8, q8
- vrev64.32 q9, q9
- vswp d4, d5
- vswp d6, d7
-
- vst1.32 {q0, q1}, [r2]! @ outre1Q16[k]
- vswp d16, d17
- vswp d18, d19
- vst1.32 {q2, q3}, [r4], r5 @ outre1Q16[FRAMESAMPLES / 2 - 1 - k]
-
- @ Find the absolute maximum in the vectors and store them in q6 and q7.
- vabs.s32 q10, q0
- vabs.s32 q14, q4
- vabs.s32 q11, q1
- vabs.s32 q15, q5
- vabs.s32 q12, q2
- vmax.u32 q6, q10 @ Use u32 so we don't lose the value 0x80000000.
- vmax.u32 q7, q14 @ Maximum for outre2Q16[].
- vabs.s32 q0, q8
- vmax.u32 q6, q11 @ Maximum for outre1Q16[].
- vmax.u32 q7, q15
- vabs.s32 q13, q3
- vmax.u32 q6, q12
- vmax.u32 q7, q0
- vabs.s32 q1, q9
- vst1.32 {q4, q5}, [r3]! @ outre2Q16[k]
- vst1.32 {q8, q9}, [r6], r5 @ outre2Q16[FRAMESAMPLES / 2 - 1 - k]
- vmax.u32 q6, q13
- vmax.u32 q7, q1
-
- bgt TransformAndFindMax
-
- adr r10, WebRtcIsacfix_kSinTab1
-#if defined(__APPLE__)
- mov r2, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#else
- mov r2, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
-#endif
-
- sub r11, r10, r2 @ WebRtcIsacfix_kCosTab1
-
- @ Find the maximum value in the Neon registers
- vmax.u32 d12, d13
- vmax.u32 d14, d15
- vpmax.u32 d12, d12, d12 @ Both 32 bits words hold the same value tmpInIm.
- vpmax.u32 d14, d14, d14 @ Both 32 bits words hold the same value tmpInRe.
- vmax.s32 d0, d12, d14 @ if (tmpInIm>tmpInRe) tmpInRe = tmpInIm;
-
- vpop {q4-q7}
-
- ldr r4, [sp] @ inreQ7
- vcls.s32 d1, d0 @ sh = WebRtcSpl_NormW32(tmpInRe);
- ldr r5, [sp, #4] @ inimQ7
- vmov.i32 d0, #24 @ sh = sh-24;
- ldr r6, [sp, #8] @ outre1Q16
- vsub.s32 d1, d1, d0
- ldr r7, [sp, #12] @ outre2Q16
- vdup.s32 q8, d1[0] @ sh
-
- mov r8, #(FRAMESAMPLES / 2)
-
-PreFftShift:
- subs r8, #16
- vld1.32 {q0, q1}, [r6]! @ outre1Q16[]
- vld1.32 {q2, q3}, [r6]! @ outre1Q16[]
- vrshl.s32 q0, q0, q8
- vrshl.s32 q1, q1, q8
- vrshl.s32 q2, q2, q8
- vrshl.s32 q3, q3, q8
- vld1.32 {q10, q11}, [r7]! @ outre2Q16[]
- vld1.32 {q12, q13}, [r7]! @ outre2Q16[]
- vrshl.s32 q10, q10, q8
- vrshl.s32 q11, q11, q8
- vrshl.s32 q12, q12, q8
- vrshl.s32 q13, q13, q8
-
- vmovn.s32 d0, q0
- vmovn.s32 d1, q1
- vmovn.s32 d2, q2
- vmovn.s32 d3, q3
- vmovn.s32 d4, q10
- vmovn.s32 d5, q11
- vmovn.s32 d6, q12
- vmovn.s32 d7, q13
-
- vst1.16 {q0, q1}, [r4]! @ inreQ7[]
- vst1.16 {q2, q3}, [r5]! @ inimQ7[]
-
- bgt PreFftShift
-
- vmov.s32 r8, d16[0] @ Store value of sh.
- ldr r0, [sp] @ inreQ7
- ldr r1, [sp, #4] @ inimQ7
- mov r2, #1
- CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
-
- vdup.s32 q8, r8 @ sh
- mov r9, r11 @ WebRtcIsacfix_kCosTab1
- ldr r4, [sp] @ inreQ7
- ldr r5, [sp, #4] @ inimQ7
- ldr r6, [sp, #8] @ outre1Q16
- ldr r7, [sp, #12] @ outre2Q16
- mov r8, #(FRAMESAMPLES / 2)
- vneg.s32 q15, q8 @ -sh
- movw r0, #273
- lsl r0, #15 @ Together with vqdmulh, net effect is ">> 16".
- vdup.s32 q14, r0
-
-PostFftShiftDivide:
- subs r8, #16
-
- vld1.16 {q0, q1}, [r4]! @ inreQ7
- vmovl.s16 q10, d0
- vmovl.s16 q11, d1
- vld1.16 {q2, q3}, [r5]! @ inimQ7
- vmovl.s16 q8, d2
- vmovl.s16 q9, d3
-
- vshl.s32 q10, q10, q15
- vshl.s32 q11, q11, q15
- vshl.s32 q8, q8, q15
- vshl.s32 q9, q9, q15
-
- vqdmulh.s32 q10, q10, q14
- vqdmulh.s32 q11, q11, q14
- vqdmulh.s32 q8, q8, q14
- vqdmulh.s32 q9, q9, q14
-
- vmovl.s16 q0, d4
- vmovl.s16 q1, d5
- vmovl.s16 q2, d6
- vmovl.s16 q3, d7
-
- vshl.s32 q0, q0, q15
- vshl.s32 q1, q1, q15
- vshl.s32 q2, q2, q15
- vshl.s32 q3, q3, q15
-
- @ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre2Q16[k])
- vqdmulh.s32 q0, q0, q14
- vqdmulh.s32 q1, q1, q14
- vst1.32 {q10, q11}, [r6]! @ outre1Q16[]
- vqdmulh.s32 q2, q2, q14
- vqdmulh.s32 q3, q3, q14
- vst1.32 {q8, q9}, [r6]! @ outre1Q16[]
- vst1.32 {q0, q1}, [r7]! @ outre2Q16[]
- vst1.32 {q2, q3}, [r7]! @ outre2Q16[]
-
- bgt PostFftShiftDivide
-
- mov r8, #(FRAMESAMPLES / 2)
- ldr r2, [sp, #8] @ outre1Q16
- ldr r3, [sp, #12] @ outre2Q16
- movw r0, #31727
- lsl r0, #16 @ With vqdmulh and vrshrn, net effect is ">> 25".
-
-DemodulateAndSeparate:
- subs r8, #8
-
- vld1.16 {q0}, [r9, :64]! @ WebRtcIsacfix_kCosTab1[]
- vmovl.s16 q10, d0 @ WebRtcIsacfix_kCosTab1[]
- vld1.16 {q1}, [r10, :64]! @ WebRtcIsacfix_kSinTab1[]
- vmovl.s16 q11, d1 @ WebRtcIsacfix_kCosTab1[]
- vld1.32 {q2, q3}, [r2] @ outre1Q16
- vmovl.s16 q12, d2 @ WebRtcIsacfix_kSinTab1[]
- vld1.32 {q14, q15}, [r3] @ outre2Q16
- vmovl.s16 q13, d3 @ WebRtcIsacfix_kSinTab1[]
-
- vmull.s32 q0, d20, d4 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
- vmull.s32 q1, d21, d5 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
- vmull.s32 q8, d22, d6 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
- vmull.s32 q9, d23, d7 @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
-
- vmlsl.s32 q0, d24, d28 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
- vmlsl.s32 q1, d25, d29 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
- vmlsl.s32 q8, d26, d30 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
- vmlsl.s32 q9, d27, d31 @ += WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
-
- vrshrn.s64 d0, q0, #10 @ xrQ16
- vrshrn.s64 d1, q1, #10 @ xrQ16
- vrshrn.s64 d2, q8, #10 @ xrQ16
- vrshrn.s64 d3, q9, #10 @ xrQ16
-
- vmull.s32 q8, d20, d28 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
- vmull.s32 q9, d21, d29 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
- vmull.s32 q14, d22, d30 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
- vmull.s32 q15, d23, d31 @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
-
- vmlal.s32 q8, d24, d4 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
- vmlal.s32 q9, d25, d5 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
- vmlal.s32 q14, d26, d6 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
- vmlal.s32 q15, d27, d7 @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
-
- vdup.s32 q11, r0 @ generic -> Neon doesn't cost extra cycles.
-
- vrshrn.s64 d24, q8, #10 @ xiQ16
- vrshrn.s64 d25, q9, #10 @ xiQ16
- vqdmulh.s32 q0, q0, q11
- vrshrn.s64 d26, q14, #10 @ xiQ16
- vrshrn.s64 d27, q15, #10 @ xiQ16
-
- @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xrQ16)
- @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xiQ16)
-
- vqdmulh.s32 q1, q1, q11
- vqdmulh.s32 q2, q12, q11
- vqdmulh.s32 q3, q13, q11
-
- vst1.16 {q0, q1}, [r2]! @ outre1Q16[]
- vst1.16 {q2, q3}, [r3]! @ outre2Q16[]
-
- bgt DemodulateAndSeparate
-
- add sp, sp, #16
- pop {r3-r11,pc}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c
index ee96b8e..8f89fb8 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/transform_tables.c
@@ -16,7 +16,6 @@
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
#include "webrtc/typedefs.h"
-#if !(defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
/* Cosine table 1 in Q14. */
const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2] = {
16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315, 16294, 16270,
@@ -90,7 +89,6 @@
4137, -3929, 3720, -3511, 3301, -3091, 2880, -2669, 2457, -2245,
2032, -1819, 1606, -1392, 1179, -965, 750, -536, 322, -107
};
-#endif
#if defined(MIPS32_LE)
/* Cosine table 2 in Q14. Used only on MIPS platforms. */
diff --git a/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi b/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi
index cc2af97..285583c 100644
--- a/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi
+++ b/webrtc/modules/audio_coding/codecs/isac/isacfix.gypi
@@ -142,11 +142,9 @@
],
'sources': [
'fix/source/entropy_coding_neon.c',
- 'fix/source/filterbanks_neon.S',
- 'fix/source/filters_neon.S',
- 'fix/source/lattice_neon.S',
- 'fix/source/lpc_masking_model_neon.S',
- 'fix/source/transform_neon.S',
+ 'fix/source/filters_neon.c',
+ 'fix/source/lattice_neon.c',
+ 'fix/source/transform_neon.c',
],
'conditions': [
# Disable LTO in isac_neon target due to compiler bug
@@ -156,27 +154,11 @@
'-ffat-lto-objects',
],
}],
- ['target_arch=="arm64"', {
- 'sources!': [
- 'fix/source/filterbanks_neon.S',
- 'fix/source/filters_neon.S',
- 'fix/source/lattice_neon.S',
- 'fix/source/lpc_masking_model_neon.S',
- 'fix/source/transform_neon.S',
- ],
- 'sources': [
- 'fix/source/filters_neon.c',
- 'fix/source/lattice_neon.c',
- 'fix/source/transform_neon.c',
- ],
- 'conditions': [
- # Disable AllpassFilter2FixDec16Neon function due to a clang
- # bug. Refer more details at:
- # https://code.google.com/p/webrtc/issues/detail?id=4567
- ['clang==0', {
+ # Disable AllpassFilter2FixDec16Neon function due to a clang
+ # bug. Refer more details at:
+ # https://code.google.com/p/webrtc/issues/detail?id=4567
+ ['target_arch!="arm64" or clang==0', {
'sources': ['fix/source/filterbanks_neon.c',],
- }],
- ],
}]
],
},