/*
 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
 * Copyright Takuya OOURA, 1996-2001
 *
 * You may use, copy, modify and distribute this code for any purpose (include
 * commercial use) and without fee. Please refer to this package when you modify
 * this code.
 *
 * Changes by the WebRTC authors:
 *    - Trivial type modifications.
 *    - Minimal code subset to do rdft of length 128.
 *    - Optimizations because of known length.
 *
 *  All changes are covered by the WebRTC license and IP grant:
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
21
pbos@webrtc.org7fad4b82013-05-28 08:11:59 +000022#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
ajm@google.comce7c2a22011-08-04 01:50:00 +000023
niklase@google.com470e71d2011-07-07 08:21:25 +000024#include <math.h>
25
pbos@webrtc.org7fad4b82013-05-28 08:11:59 +000026#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
27#include "webrtc/typedefs.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000028
cduvivier@google.com0e07d822011-07-25 23:54:20 +000029// constants shared by all paths (C, SSE2).
niklase@google.com470e71d2011-07-07 08:21:25 +000030float rdft_w[64];
cduvivier@google.com0e07d822011-07-25 23:54:20 +000031// constants used by the C path.
32float rdft_wk3ri_first[32];
33float rdft_wk3ri_second[32];
34// constants used by SSE2 but initialized in C path.
35ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
36ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
37ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
38ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
39ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
40ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
cduvivier@google.com288c8692011-08-22 21:55:33 +000041ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
cduvivier@google.com0e07d822011-07-25 23:54:20 +000042
niklase@google.com470e71d2011-07-07 08:21:25 +000043static int ip[16];
44
// Reorders, in place, the 32 floats of |a| (treated as 16 interleaved
// re/im pairs) by swapping pairs according to an index table built in |ip|.
//
// ip: scratch buffer; entries ip[0] and ip[1] are overwritten (for n == 32
//     the table-building loop runs once and leaves m == 2).
// a:  array of at least 32 floats; every swap moves two consecutive floats.
static void bitrv2_32(int* ip, float* a) {
  const int n = 32;
  int j, j1, k, k1, m, m2;
  float xr, xi, yr, yi;

  // Build the offset table: each pass halves l and doubles the number of
  // valid entries in ip.
  ip[0] = 0;
  {
    int l = n;
    m = 1;
    while ((m << 3) < l) {
      l >>= 1;
      for (j = 0; j < m; j++) {
        ip[m + j] = ip[j] + l;
      }
      m <<= 1;
    }
  }
  m2 = 2 * m;
  for (k = 0; k < m; k++) {
    // Off-diagonal part: four pair swaps for every (j, k) with j < k.
    for (j = 0; j < k; j++) {
      j1 = 2 * j + ip[k];
      k1 = 2 * k + ip[j];
      xr = a[j1];
      xi = a[j1 + 1];
      yr = a[k1];
      yi = a[k1 + 1];
      a[j1] = yr;
      a[j1 + 1] = yi;
      a[k1] = xr;
      a[k1 + 1] = xi;
      j1 += m2;
      k1 += 2 * m2;
      xr = a[j1];
      xi = a[j1 + 1];
      yr = a[k1];
      yi = a[k1 + 1];
      a[j1] = yr;
      a[j1 + 1] = yi;
      a[k1] = xr;
      a[k1 + 1] = xi;
      j1 += m2;
      k1 -= m2;
      xr = a[j1];
      xi = a[j1 + 1];
      yr = a[k1];
      yi = a[k1 + 1];
      a[j1] = yr;
      a[j1 + 1] = yi;
      a[k1] = xr;
      a[k1 + 1] = xi;
      j1 += m2;
      k1 += 2 * m2;
      xr = a[j1];
      xi = a[j1 + 1];
      yr = a[k1];
      yi = a[k1 + 1];
      a[j1] = yr;
      a[j1 + 1] = yi;
      a[k1] = xr;
      a[k1 + 1] = xi;
    }
    // Diagonal part: one pair swap per k.
    j1 = 2 * k + m2 + ip[k];
    k1 = j1 + m2;
    xr = a[j1];
    xi = a[j1 + 1];
    yr = a[k1];
    yi = a[k1 + 1];
    a[j1] = yr;
    a[j1 + 1] = yi;
    a[k1] = xr;
    a[k1 + 1] = xi;
  }
}
118
// Reorders, in place, the 128 floats of |a| (treated as 64 interleaved
// re/im pairs) using a fixed four-entry offset table. Every swap exchanges
// two consecutive floats with two other consecutive floats.
//
// a: array of at least 128 floats.
static void bitrv2_128_C(float* a) {
  /*
      Following things have been attempted but are no faster:
      (a) Storing the swap indexes in a LUT (index calculations are done
          for 'free' while waiting on memory/L1).
      (b) Consolidate the load/store of two consecutive floats by a 64 bit
          integer (execution is memory/L1 bound).
      (c) Do a mix of floats and 64 bit integer to maximize register
          utilization (execution is memory/L1 bound).
      (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
      (e) Hard-coding of the offsets to completely eliminates index
          calculations.
  */

  unsigned int j, j1, k, k1;
  float xr, xi, yr, yi;

  // Precomputed offset table for the 128-float case.
  static const int ip[4] = {0, 64, 32, 96};
  for (k = 0; k < 4; k++) {
    // Off-diagonal part: four pair swaps for every (j, k) with j < k.
    for (j = 0; j < k; j++) {
      j1 = 2 * j + ip[k];
      k1 = 2 * k + ip[j];
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
      j1 += 8;
      k1 += 16;
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
      j1 += 8;
      k1 -= 8;
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
      j1 += 8;
      k1 += 16;
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
    }
    // Diagonal part: one pair swap per k.
    j1 = 2 * k + 8 + ip[k];
    k1 = j1 + 8;
    xr = a[j1 + 0];
    xi = a[j1 + 1];
    yr = a[k1 + 0];
    yi = a[k1 + 1];
    a[j1 + 0] = yr;
    a[j1 + 1] = yi;
    a[k1 + 0] = xr;
    a[k1 + 1] = xi;
  }
}
192
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000193static void makewt_32(void) {
niklase@google.com470e71d2011-07-07 08:21:25 +0000194 const int nw = 32;
195 int j, nwh;
196 float delta, x, y;
197
198 ip[0] = nw;
199 ip[1] = 1;
200 nwh = nw >> 1;
201 delta = atanf(1.0f) / nwh;
202 rdft_w[0] = 1;
203 rdft_w[1] = 0;
204 rdft_w[nwh] = cosf(delta * nwh);
205 rdft_w[nwh + 1] = rdft_w[nwh];
206 for (j = 2; j < nwh; j += 2) {
207 x = cosf(delta * j);
208 y = sinf(delta * j);
209 rdft_w[j] = x;
210 rdft_w[j + 1] = y;
211 rdft_w[nw - j] = y;
212 rdft_w[nw - j + 1] = x;
213 }
cd@webrtc.org85b4a1b2012-04-10 21:25:17 +0000214 bitrv2_32(ip + 2, rdft_w);
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000215
cduvivier@google.com288c8692011-08-22 21:55:33 +0000216 // pre-calculate constants used by cft1st_128 and cftmdl_128...
217 cftmdl_wk1r[0] = rdft_w[2];
218 cftmdl_wk1r[1] = rdft_w[2];
219 cftmdl_wk1r[2] = rdft_w[2];
220 cftmdl_wk1r[3] = -rdft_w[2];
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000221 {
222 int k1;
223
224 for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) {
225 const int k2 = 2 * k1;
226 const float wk2r = rdft_w[k1 + 0];
227 const float wk2i = rdft_w[k1 + 1];
228 float wk1r, wk1i;
229 // ... scalar version.
230 wk1r = rdft_w[k2 + 0];
231 wk1i = rdft_w[k2 + 1];
232 rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i;
233 rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i;
234 wk1r = rdft_w[k2 + 2];
235 wk1i = rdft_w[k2 + 3];
236 rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i;
237 rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i;
238 // ... vector version.
239 rdft_wk1r[k2 + 0] = rdft_w[k2 + 0];
240 rdft_wk1r[k2 + 1] = rdft_w[k2 + 0];
241 rdft_wk1r[k2 + 2] = rdft_w[k2 + 2];
242 rdft_wk1r[k2 + 3] = rdft_w[k2 + 2];
243 rdft_wk2r[k2 + 0] = rdft_w[k1 + 0];
244 rdft_wk2r[k2 + 1] = rdft_w[k1 + 0];
245 rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1];
246 rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1];
247 rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0];
248 rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0];
249 rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0];
250 rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0];
251 rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1];
252 rdft_wk1i[k2 + 1] = rdft_w[k2 + 1];
253 rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3];
254 rdft_wk1i[k2 + 3] = rdft_w[k2 + 3];
255 rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1];
256 rdft_wk2i[k2 + 1] = rdft_w[k1 + 1];
257 rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0];
258 rdft_wk2i[k2 + 3] = rdft_w[k1 + 0];
259 rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1];
260 rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1];
261 rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1];
262 rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1];
263 }
264 }
niklase@google.com470e71d2011-07-07 08:21:25 +0000265}
266
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000267static void makect_32(void) {
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000268 float* c = rdft_w + 32;
niklase@google.com470e71d2011-07-07 08:21:25 +0000269 const int nc = 32;
270 int j, nch;
271 float delta;
272
273 ip[1] = nc;
274 nch = nc >> 1;
275 delta = atanf(1.0f) / nch;
276 c[0] = cosf(delta * nch);
277 c[nch] = 0.5f * c[0];
278 for (j = 1; j < nch; j++) {
279 c[j] = 0.5f * cosf(delta * j);
280 c[nc - j] = 0.5f * sinf(delta * j);
281 }
282}
283
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000284static void cft1st_128_C(float* a) {
niklase@google.com470e71d2011-07-07 08:21:25 +0000285 const int n = 128;
286 int j, k1, k2;
287 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
288 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
289
bjornv@webrtc.orgcd9b90a2014-06-30 12:05:18 +0000290 // The processing of the first set of elements was simplified in C to avoid
291 // some operations (multiplication by zero or one, addition of two elements
292 // multiplied by the same weight, ...).
niklase@google.com470e71d2011-07-07 08:21:25 +0000293 x0r = a[0] + a[2];
294 x0i = a[1] + a[3];
295 x1r = a[0] - a[2];
296 x1i = a[1] - a[3];
297 x2r = a[4] + a[6];
298 x2i = a[5] + a[7];
299 x3r = a[4] - a[6];
300 x3i = a[5] - a[7];
301 a[0] = x0r + x2r;
302 a[1] = x0i + x2i;
303 a[4] = x0r - x2r;
304 a[5] = x0i - x2i;
305 a[2] = x1r - x3i;
306 a[3] = x1i + x3r;
307 a[6] = x1r + x3i;
308 a[7] = x1i - x3r;
309 wk1r = rdft_w[2];
310 x0r = a[8] + a[10];
311 x0i = a[9] + a[11];
312 x1r = a[8] - a[10];
313 x1i = a[9] - a[11];
314 x2r = a[12] + a[14];
315 x2i = a[13] + a[15];
316 x3r = a[12] - a[14];
317 x3i = a[13] - a[15];
318 a[8] = x0r + x2r;
319 a[9] = x0i + x2i;
320 a[12] = x2i - x0i;
321 a[13] = x0r - x2r;
322 x0r = x1r - x3i;
323 x0i = x1i + x3r;
324 a[10] = wk1r * (x0r - x0i);
325 a[11] = wk1r * (x0r + x0i);
326 x0r = x3i + x1r;
327 x0i = x3r - x1i;
328 a[14] = wk1r * (x0i - x0r);
329 a[15] = wk1r * (x0i + x0r);
330 k1 = 0;
331 for (j = 16; j < n; j += 16) {
332 k1 += 2;
333 k2 = 2 * k1;
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000334 wk2r = rdft_w[k1 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000335 wk2i = rdft_w[k1 + 1];
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000336 wk1r = rdft_w[k2 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000337 wk1i = rdft_w[k2 + 1];
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000338 wk3r = rdft_wk3ri_first[k1 + 0];
339 wk3i = rdft_wk3ri_first[k1 + 1];
340 x0r = a[j + 0] + a[j + 2];
niklase@google.com470e71d2011-07-07 08:21:25 +0000341 x0i = a[j + 1] + a[j + 3];
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000342 x1r = a[j + 0] - a[j + 2];
niklase@google.com470e71d2011-07-07 08:21:25 +0000343 x1i = a[j + 1] - a[j + 3];
344 x2r = a[j + 4] + a[j + 6];
345 x2i = a[j + 5] + a[j + 7];
346 x3r = a[j + 4] - a[j + 6];
347 x3i = a[j + 5] - a[j + 7];
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000348 a[j + 0] = x0r + x2r;
niklase@google.com470e71d2011-07-07 08:21:25 +0000349 a[j + 1] = x0i + x2i;
350 x0r -= x2r;
351 x0i -= x2i;
352 a[j + 4] = wk2r * x0r - wk2i * x0i;
353 a[j + 5] = wk2r * x0i + wk2i * x0r;
354 x0r = x1r - x3i;
355 x0i = x1i + x3r;
356 a[j + 2] = wk1r * x0r - wk1i * x0i;
357 a[j + 3] = wk1r * x0i + wk1i * x0r;
358 x0r = x1r + x3i;
359 x0i = x1i - x3r;
360 a[j + 6] = wk3r * x0r - wk3i * x0i;
361 a[j + 7] = wk3r * x0i + wk3i * x0r;
362 wk1r = rdft_w[k2 + 2];
363 wk1i = rdft_w[k2 + 3];
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000364 wk3r = rdft_wk3ri_second[k1 + 0];
365 wk3i = rdft_wk3ri_second[k1 + 1];
niklase@google.com470e71d2011-07-07 08:21:25 +0000366 x0r = a[j + 8] + a[j + 10];
367 x0i = a[j + 9] + a[j + 11];
368 x1r = a[j + 8] - a[j + 10];
369 x1i = a[j + 9] - a[j + 11];
370 x2r = a[j + 12] + a[j + 14];
371 x2i = a[j + 13] + a[j + 15];
372 x3r = a[j + 12] - a[j + 14];
373 x3i = a[j + 13] - a[j + 15];
374 a[j + 8] = x0r + x2r;
375 a[j + 9] = x0i + x2i;
376 x0r -= x2r;
377 x0i -= x2i;
378 a[j + 12] = -wk2i * x0r - wk2r * x0i;
379 a[j + 13] = -wk2i * x0i + wk2r * x0r;
380 x0r = x1r - x3i;
381 x0i = x1i + x3r;
382 a[j + 10] = wk1r * x0r - wk1i * x0i;
383 a[j + 11] = wk1r * x0i + wk1i * x0r;
384 x0r = x1r + x3i;
385 x0i = x1i - x3r;
386 a[j + 14] = wk3r * x0r - wk3i * x0i;
387 a[j + 15] = wk3r * x0i + wk3i * x0r;
388 }
389}
390
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000391static void cftmdl_128_C(float* a) {
cduvivier@google.com288c8692011-08-22 21:55:33 +0000392 const int l = 8;
niklase@google.com470e71d2011-07-07 08:21:25 +0000393 const int n = 128;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000394 const int m = 32;
395 int j0, j1, j2, j3, k, k1, k2, m2;
niklase@google.com470e71d2011-07-07 08:21:25 +0000396 float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
397 float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
398
cduvivier@google.com288c8692011-08-22 21:55:33 +0000399 for (j0 = 0; j0 < l; j0 += 2) {
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000400 j1 = j0 + 8;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000401 j2 = j0 + 16;
402 j3 = j0 + 24;
403 x0r = a[j0 + 0] + a[j1 + 0];
404 x0i = a[j0 + 1] + a[j1 + 1];
405 x1r = a[j0 + 0] - a[j1 + 0];
406 x1i = a[j0 + 1] - a[j1 + 1];
407 x2r = a[j2 + 0] + a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000408 x2i = a[j2 + 1] + a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000409 x3r = a[j2 + 0] - a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000410 x3i = a[j2 + 1] - a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000411 a[j0 + 0] = x0r + x2r;
412 a[j0 + 1] = x0i + x2i;
413 a[j2 + 0] = x0r - x2r;
niklase@google.com470e71d2011-07-07 08:21:25 +0000414 a[j2 + 1] = x0i - x2i;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000415 a[j1 + 0] = x1r - x3i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000416 a[j1 + 1] = x1i + x3r;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000417 a[j3 + 0] = x1r + x3i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000418 a[j3 + 1] = x1i - x3r;
419 }
420 wk1r = rdft_w[2];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000421 for (j0 = m; j0 < l + m; j0 += 2) {
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000422 j1 = j0 + 8;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000423 j2 = j0 + 16;
424 j3 = j0 + 24;
425 x0r = a[j0 + 0] + a[j1 + 0];
426 x0i = a[j0 + 1] + a[j1 + 1];
427 x1r = a[j0 + 0] - a[j1 + 0];
428 x1i = a[j0 + 1] - a[j1 + 1];
429 x2r = a[j2 + 0] + a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000430 x2i = a[j2 + 1] + a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000431 x3r = a[j2 + 0] - a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000432 x3i = a[j2 + 1] - a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000433 a[j0 + 0] = x0r + x2r;
434 a[j0 + 1] = x0i + x2i;
435 a[j2 + 0] = x2i - x0i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000436 a[j2 + 1] = x0r - x2r;
437 x0r = x1r - x3i;
438 x0i = x1i + x3r;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000439 a[j1 + 0] = wk1r * (x0r - x0i);
niklase@google.com470e71d2011-07-07 08:21:25 +0000440 a[j1 + 1] = wk1r * (x0r + x0i);
441 x0r = x3i + x1r;
442 x0i = x3r - x1i;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000443 a[j3 + 0] = wk1r * (x0i - x0r);
niklase@google.com470e71d2011-07-07 08:21:25 +0000444 a[j3 + 1] = wk1r * (x0i + x0r);
445 }
446 k1 = 0;
447 m2 = 2 * m;
448 for (k = m2; k < n; k += m2) {
449 k1 += 2;
450 k2 = 2 * k1;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000451 wk2r = rdft_w[k1 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000452 wk2i = rdft_w[k1 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000453 wk1r = rdft_w[k2 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000454 wk1i = rdft_w[k2 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000455 wk3r = rdft_wk3ri_first[k1 + 0];
456 wk3i = rdft_wk3ri_first[k1 + 1];
457 for (j0 = k; j0 < l + k; j0 += 2) {
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000458 j1 = j0 + 8;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000459 j2 = j0 + 16;
460 j3 = j0 + 24;
461 x0r = a[j0 + 0] + a[j1 + 0];
462 x0i = a[j0 + 1] + a[j1 + 1];
463 x1r = a[j0 + 0] - a[j1 + 0];
464 x1i = a[j0 + 1] - a[j1 + 1];
465 x2r = a[j2 + 0] + a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000466 x2i = a[j2 + 1] + a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000467 x3r = a[j2 + 0] - a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000468 x3i = a[j2 + 1] - a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000469 a[j0 + 0] = x0r + x2r;
470 a[j0 + 1] = x0i + x2i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000471 x0r -= x2r;
472 x0i -= x2i;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000473 a[j2 + 0] = wk2r * x0r - wk2i * x0i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000474 a[j2 + 1] = wk2r * x0i + wk2i * x0r;
475 x0r = x1r - x3i;
476 x0i = x1i + x3r;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000477 a[j1 + 0] = wk1r * x0r - wk1i * x0i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000478 a[j1 + 1] = wk1r * x0i + wk1i * x0r;
479 x0r = x1r + x3i;
480 x0i = x1i - x3r;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000481 a[j3 + 0] = wk3r * x0r - wk3i * x0i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000482 a[j3 + 1] = wk3r * x0i + wk3i * x0r;
483 }
484 wk1r = rdft_w[k2 + 2];
485 wk1i = rdft_w[k2 + 3];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000486 wk3r = rdft_wk3ri_second[k1 + 0];
487 wk3i = rdft_wk3ri_second[k1 + 1];
488 for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000489 j1 = j0 + 8;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000490 j2 = j0 + 16;
491 j3 = j0 + 24;
492 x0r = a[j0 + 0] + a[j1 + 0];
493 x0i = a[j0 + 1] + a[j1 + 1];
494 x1r = a[j0 + 0] - a[j1 + 0];
495 x1i = a[j0 + 1] - a[j1 + 1];
496 x2r = a[j2 + 0] + a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000497 x2i = a[j2 + 1] + a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000498 x3r = a[j2 + 0] - a[j3 + 0];
niklase@google.com470e71d2011-07-07 08:21:25 +0000499 x3i = a[j2 + 1] - a[j3 + 1];
cduvivier@google.com288c8692011-08-22 21:55:33 +0000500 a[j0 + 0] = x0r + x2r;
501 a[j0 + 1] = x0i + x2i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000502 x0r -= x2r;
503 x0i -= x2i;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000504 a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000505 a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
506 x0r = x1r - x3i;
507 x0i = x1i + x3r;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000508 a[j1 + 0] = wk1r * x0r - wk1i * x0i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000509 a[j1 + 1] = wk1r * x0i + wk1i * x0r;
510 x0r = x1r + x3i;
511 x0i = x1i - x3r;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000512 a[j3 + 0] = wk3r * x0r - wk3i * x0i;
niklase@google.com470e71d2011-07-07 08:21:25 +0000513 a[j3 + 1] = wk3r * x0i + wk3i * x0r;
514 }
515 }
516}
517
// C implementation of cftfsub_128: runs the cft1st_128 and cftmdl_128 stages
// through their function pointers, then performs a final in-place butterfly
// pass that combines the four 32-float quarters of |a|.
//
// a: array of at least 128 floats (interleaved re/im pairs).
static void cftfsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;
  for (j = 0; j < l; j += 2) {
    // j, j1, j2, j3 address the same pair position in each quarter.
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = a[j + 1] + a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = a[j + 1] - a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i + x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i - x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i + x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i - x3r;
  }
}
547
// C implementation of cftbsub_128: runs the cft1st_128 and cftmdl_128 stages
// through their function pointers, then performs a final in-place butterfly
// pass over the four 32-float quarters of |a|. Compared with cftfsub_128_C,
// the odd-index (second-of-pair) terms enter with opposite sign.
//
// a: array of at least 128 floats (interleaved re/im pairs).
static void cftbsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;

  for (j = 0; j < l; j += 2) {
    // j, j1, j2, j3 address the same pair position in each quarter.
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = -a[j + 1] - a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = -a[j + 1] + a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i - x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i + x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i - x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i + x3r;
  }
}
578
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000579static void rftfsub_128_C(float* a) {
580 const float* c = rdft_w + 32;
niklase@google.com470e71d2011-07-07 08:21:25 +0000581 int j1, j2, k1, k2;
582 float wkr, wki, xr, xi, yr, yi;
583
584 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
585 k2 = 128 - j2;
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000586 k1 = 32 - j1;
niklase@google.com470e71d2011-07-07 08:21:25 +0000587 wkr = 0.5f - c[k1];
588 wki = c[j1];
589 xr = a[j2 + 0] - a[k2 + 0];
590 xi = a[j2 + 1] + a[k2 + 1];
591 yr = wkr * xr - wki * xi;
592 yi = wkr * xi + wki * xr;
593 a[j2 + 0] -= yr;
594 a[j2 + 1] -= yi;
595 a[k2 + 0] += yr;
596 a[k2 + 1] -= yi;
597 }
598}
599
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000600static void rftbsub_128_C(float* a) {
601 const float* c = rdft_w + 32;
niklase@google.com470e71d2011-07-07 08:21:25 +0000602 int j1, j2, k1, k2;
603 float wkr, wki, xr, xi, yr, yi;
604
605 a[1] = -a[1];
606 for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
607 k2 = 128 - j2;
andrew@webrtc.org13b2d462013-10-08 23:41:42 +0000608 k1 = 32 - j1;
niklase@google.com470e71d2011-07-07 08:21:25 +0000609 wkr = 0.5f - c[k1];
610 wki = c[j1];
611 xr = a[j2 + 0] - a[k2 + 0];
612 xi = a[j2 + 1] + a[k2 + 1];
613 yr = wkr * xr + wki * xi;
614 yi = wkr * xi - wki * xr;
615 a[j2 + 0] = a[j2 + 0] - yr;
616 a[j2 + 1] = yi - a[j2 + 1];
617 a[k2 + 0] = yr + a[k2 + 0];
618 a[k2 + 1] = yi - a[k2 + 1];
619 }
620 a[65] = -a[65];
621}
622
// Forward real DFT of length 128, computed in place on |a|.
// Runs the reordering, complex-FFT and real-FFT post-processing stages via
// the function pointers selected by aec_rdft_init(), then replaces a[0] and
// a[1] with their sum and difference respectively.
//
// a: array of at least 128 floats; overwritten with the result.
void aec_rdft_forward_128(float* a) {
  float xi;
  bitrv2_128(a);
  cftfsub_128(a);
  rftfsub_128(a);
  xi = a[0] - a[1];
  a[0] += a[1];
  a[1] = xi;
}
632
// Inverse real DFT of length 128, computed in place on |a|.
// First replaces a[1] with half the difference of a[0] and a[1] and subtracts
// that from a[0], then runs the real-FFT pre-processing, reordering and
// complex-FFT stages via the function pointers selected by aec_rdft_init().
// No output scaling is applied in this function.
//
// a: array of at least 128 floats; overwritten with the result.
void aec_rdft_inverse_128(float* a) {
  a[1] = 0.5f * (a[0] - a[1]);
  a[0] -= a[1];
  rftbsub_128(a);
  bitrv2_128(a);
  cftbsub_128(a);
}
640
641// code path selection
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000642rft_sub_128_t cft1st_128;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000643rft_sub_128_t cftmdl_128;
niklase@google.com470e71d2011-07-07 08:21:25 +0000644rft_sub_128_t rftfsub_128;
645rft_sub_128_t rftbsub_128;
andrew@webrtc.orgc0907ef2014-02-21 00:13:31 +0000646rft_sub_128_t cftfsub_128;
647rft_sub_128_t cftbsub_128;
648rft_sub_128_t bitrv2_128;
niklase@google.com470e71d2011-07-07 08:21:25 +0000649
650void aec_rdft_init(void) {
cduvivier@google.com0e07d822011-07-25 23:54:20 +0000651 cft1st_128 = cft1st_128_C;
cduvivier@google.com288c8692011-08-22 21:55:33 +0000652 cftmdl_128 = cftmdl_128_C;
niklase@google.com470e71d2011-07-07 08:21:25 +0000653 rftfsub_128 = rftfsub_128_C;
654 rftbsub_128 = rftbsub_128_C;
andrew@webrtc.orgc0907ef2014-02-21 00:13:31 +0000655 cftfsub_128 = cftfsub_128_C;
656 cftbsub_128 = cftbsub_128_C;
657 bitrv2_128 = bitrv2_128_C;
andrew@webrtc.orgc8d012f2012-01-13 19:43:09 +0000658#if defined(WEBRTC_ARCH_X86_FAMILY)
niklase@google.com470e71d2011-07-07 08:21:25 +0000659 if (WebRtc_GetCPUInfo(kSSE2)) {
niklase@google.com470e71d2011-07-07 08:21:25 +0000660 aec_rdft_init_sse2();
niklase@google.com470e71d2011-07-07 08:21:25 +0000661 }
andrew@webrtc.orgc8d012f2012-01-13 19:43:09 +0000662#endif
andrew@webrtc.orgc0907ef2014-02-21 00:13:31 +0000663#if defined(MIPS_FPU_LE)
664 aec_rdft_init_mips();
665#endif
bjornv@webrtc.orgcd9b90a2014-06-30 12:05:18 +0000666#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
667 aec_rdft_init_neon();
668#endif
niklase@google.com470e71d2011-07-07 08:21:25 +0000669 // init library constants.
670 makewt_32();
671 makect_32();
672}