/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
/*
 * This file contains the Accelerate algorithm that is used to reduce
 * the delay by removing a part of the audio stream.
 */
15
16#include "dsp.h"
17
18#include "signal_processing_library.h"
19
20#include "dsp_helpfunctions.h"
21#include "neteq_error_codes.h"
22
/*
 * Analysis parameters. The correlation search is carried out on a copy of the
 * decoded signal that has been down-sampled to 4 kHz, so lags and lengths
 * below are expressed in 4 kHz samples.
 */

/* Number of elements in the correlation vectors (pw32_corr / pw16_corr) */
#define ACCELERATE_CORR_LEN 50
/* Smallest lag included in the correlation search */
#define ACCELERATE_MIN_LAG 10
/* Largest lag included in the correlation search */
#define ACCELERATE_MAX_LAG 60
/* Number of down-sampled samples kept for the search (110) */
#define ACCELERATE_DOWNSAMPLED_LEN (ACCELERATE_CORR_LEN + ACCELERATE_MAX_LAG)

/* Scratch usage (sizes and positions counted in WebRtc_Word16 elements):

 Type           Name                   size     startpos   endpos
 WebRtc_Word16  pw16_downSampSpeech    110      0          109
 WebRtc_Word32  pw32_corr              2*50     110        209
 WebRtc_Word16  pw16_corr              50       0          49

 Total: 110+2*50

 pw16_corr deliberately starts at the same position as pw16_downSampSpeech.
 */

/* Offsets into the scratch vector; each is added to a WebRtc_Word16 pointer */
#define SCRATCH_PW16_DS_SPEECH 0
#define SCRATCH_PW32_CORR ACCELERATE_DOWNSAMPLED_LEN
#define SCRATCH_PW16_CORR 0
41
42/****************************************************************************
43 * WebRtcNetEQ_Accelerate(...)
44 *
45 * This function tries to shorten the audio data by removing one or several
46 * pitch periods. The operation is only carried out if the correlation is
47 * strong or if the signal energy is very low.
48 *
49 * Input:
50 * - inst : NetEQ DSP instance
51 * - scratchPtr : Pointer to scratch vector.
52 * - decoded : Pointer to newly decoded speech.
53 * - len : Length of decoded speech.
54 * - BGNonly : If non-zero, Accelerate will only remove the last
55 * DEFAULT_TIME_ADJUST seconds of the input.
56 * No signal matching is done.
57 *
58 * Output:
59 * - inst : Updated instance
60 * - outData : Pointer to a memory space where the output data
61 * should be stored
62 * - pw16_len : Number of samples written to outData.
63 *
64 * Return value : 0 - Ok
65 * <0 - Error
66 */
67
68int WebRtcNetEQ_Accelerate(DSPInst_t *inst,
69#ifdef SCRATCH
70 WebRtc_Word16 *pw16_scratchPtr,
71#endif
72 const WebRtc_Word16 *pw16_decoded, int len,
73 WebRtc_Word16 *pw16_outData, WebRtc_Word16 *pw16_len,
74 WebRtc_Word16 BGNonly)
75{
76
77#ifdef SCRATCH
78 /* Use scratch memory for internal temporary vectors */
79 WebRtc_Word16 *pw16_downSampSpeech = pw16_scratchPtr + SCRATCH_PW16_DS_SPEECH;
80 WebRtc_Word32 *pw32_corr = (WebRtc_Word32*) (pw16_scratchPtr + SCRATCH_PW32_CORR);
81 WebRtc_Word16 *pw16_corr = pw16_scratchPtr + SCRATCH_PW16_CORR;
82#else
83 /* Allocate memory for temporary vectors */
84 WebRtc_Word16 pw16_downSampSpeech[ACCELERATE_DOWNSAMPLED_LEN];
85 WebRtc_Word32 pw32_corr[ACCELERATE_CORR_LEN];
86 WebRtc_Word16 pw16_corr[ACCELERATE_CORR_LEN];
87#endif
88 WebRtc_Word16 w16_decodedMax = 0;
89 WebRtc_Word16 w16_tmp;
90 WebRtc_Word16 w16_tmp2;
91 WebRtc_Word32 w32_tmp;
92 WebRtc_Word32 w32_tmp2;
93
94 const WebRtc_Word16 w16_startLag = ACCELERATE_MIN_LAG;
95 const WebRtc_Word16 w16_endLag = ACCELERATE_MAX_LAG;
96 const WebRtc_Word16 w16_corrLen = ACCELERATE_CORR_LEN;
97 const WebRtc_Word16 *pw16_vec1, *pw16_vec2;
98 WebRtc_Word16 *pw16_vectmp;
99 WebRtc_Word16 w16_inc, w16_startfact;
100 WebRtc_Word16 w16_bestIndex, w16_bestVal;
101 WebRtc_Word16 w16_VAD = 1;
102 WebRtc_Word16 fsMult;
103 WebRtc_Word16 fsMult120;
104 WebRtc_Word32 w32_en1, w32_en2, w32_cc;
105 WebRtc_Word16 w16_en1, w16_en2;
106 WebRtc_Word16 w16_en1Scale, w16_en2Scale;
107 WebRtc_Word16 w16_sqrtEn1En2;
108 WebRtc_Word16 w16_bestCorr = 0;
109 int ok;
110
111#ifdef NETEQ_STEREO
112 MasterSlaveInfo *msInfo = inst->msInfo;
113#endif
114
115 fsMult = WebRtcNetEQ_CalcFsMult(inst->fs); /* Calculate fs/8000 */
116
117 /* Pre-calculate common multiplication with fsMult */
118 fsMult120 = (WebRtc_Word16) WEBRTC_SPL_MUL_16_16(fsMult, 120); /* 15 ms */
119
120 inst->ExpandInst.w16_consecExp = 0; /* Last was not expand any more */
121
122 /* Sanity check for len variable; must be (almost) 30 ms
123 (120*fsMult + max(bestIndex)) */
124 if (len < (WebRtc_Word16) WEBRTC_SPL_MUL_16_16((120 + 119), fsMult))
125 {
126 /* Length of decoded data too short */
127 inst->w16_mode = MODE_UNSUCCESS_ACCELERATE;
128 *pw16_len = len;
129
130 /* simply move all data from decoded to outData */
131 WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
132
133 return NETEQ_OTHER_ERROR;
134 }
135
136 /***********************************/
137 /* Special operations for BGN only */
138 /***********************************/
139
140 /* Check if "background noise only" flag is set */
141 if (BGNonly)
142 {
143 /* special operation for BGN only; simply remove a chunk of data */
144 w16_bestIndex = DEFAULT_TIME_ADJUST * WEBRTC_SPL_LSHIFT_W16(fsMult, 3); /* X*fs/1000 */
145
146 /* Sanity check for bestIndex */
147 if (w16_bestIndex > len)
148 { /* not good, do nothing instead */
149 inst->w16_mode = MODE_UNSUCCESS_ACCELERATE;
150 *pw16_len = len;
151
152 /* simply move all data from decoded to outData */
153 WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
154
155 return NETEQ_OTHER_ERROR;
156 }
157
158 /* set length parameter */
159 *pw16_len = len - w16_bestIndex; /* we remove bestIndex samples */
160
161 /* copy to output */
162 WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, *pw16_len);
163
164 /* set mode */
165 inst->w16_mode = MODE_LOWEN_ACCELERATE;
166
167 /* update statistics */
168 inst->statInst.accelerateLength += w16_bestIndex;
169
170 return 0;
171 } /* end of special code for BGN mode */
172
173#ifdef NETEQ_STEREO
174
175 /* Sanity for msInfo */
176 if (msInfo == NULL)
177 {
178 /* this should not happen here */
179 return MASTER_SLAVE_ERROR;
180 }
181
182 if (msInfo->msMode != NETEQ_SLAVE)
183 {
184 /* Find correlation lag only for non-slave instances */
185
186#endif
187
188 /****************************************************************/
189 /* Find the strongest correlation lag by downsampling to 4 kHz, */
190 /* calculating correlation for downsampled signal and finding */
191 /* the strongest correlation peak. */
192 /****************************************************************/
193
194 /* find maximum absolute value */
195 w16_decodedMax = WebRtcSpl_MaxAbsValueW16(pw16_decoded, (WebRtc_Word16) len);
196
197 /* downsample the decoded speech to 4 kHz */
198 ok = WebRtcNetEQ_DownSampleTo4kHz(pw16_decoded, len, inst->fs, pw16_downSampSpeech,
199 ACCELERATE_DOWNSAMPLED_LEN, 1 /* compensate delay*/);
200 if (ok != 0)
201 {
202 /* error */
203 inst->w16_mode = MODE_UNSUCCESS_ACCELERATE;
204 *pw16_len = len;
205 /* simply move all data from decoded to outData */
206 WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
207 return NETEQ_OTHER_ERROR;
208 }
209
210 /*
211 * Set scaling factor for cross correlation to protect against overflow
212 * (log2(50) => 6)
213 */
214 w16_tmp = 6 - WebRtcSpl_NormW32(WEBRTC_SPL_MUL_16_16(w16_decodedMax, w16_decodedMax));
215 w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);
216
217 /* Perform correlation from lag 10 to lag 60 in 4 kHz domain */
218 WebRtcNetEQ_CrossCorr(
219 pw32_corr, &pw16_downSampSpeech[w16_endLag],
220 &pw16_downSampSpeech[w16_endLag - w16_startLag], w16_corrLen,
221 (WebRtc_Word16) (w16_endLag - w16_startLag), w16_tmp, -1);
222
223 /* Normalize correlation to 14 bits and put in a WebRtc_Word16 vector */
224 w32_tmp = WebRtcSpl_MaxAbsValueW32(pw32_corr, w16_corrLen);
225 w16_tmp = 17 - WebRtcSpl_NormW32(w32_tmp);
226 w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);
227
228 WebRtcSpl_VectorBitShiftW32ToW16(pw16_corr, w16_corrLen, pw32_corr, w16_tmp);
229
230#ifdef NETEQ_STEREO
231 } /* end if (msInfo->msMode != NETEQ_SLAVE) */
232
233 if ((msInfo->msMode == NETEQ_MASTER) || (msInfo->msMode == NETEQ_MONO))
234 {
235 /* Find the strongest correlation peak by using the parabolic fit method */
236 WebRtcNetEQ_PeakDetection(pw16_corr, (WebRtc_Word16) w16_corrLen, 1, fsMult,
237 &w16_bestIndex, &w16_bestVal);
238 /* 0 <= bestIndex <= (2*corrLen - 1)*fsMult = 99*fsMult */
239
240 /* Compensate bestIndex for displaced starting position */
241 w16_bestIndex = w16_bestIndex + w16_startLag * WEBRTC_SPL_LSHIFT_W16(fsMult, 1);
242 /* 20*fsMult <= bestIndex <= 119*fsMult */
243
244 msInfo->bestIndex = w16_bestIndex;
245 }
246 else if (msInfo->msMode == NETEQ_SLAVE)
247 {
248 if (msInfo->extraInfo == ACC_FAIL)
249 {
250 /* Master has signaled an unsuccessful accelerate */
251 w16_bestIndex = 0;
252 }
253 else
254 {
255 /* Get best index from master */
256 w16_bestIndex = msInfo->bestIndex;
257 }
258 }
259 else
260 {
261 /* Invalid mode */
262 return MASTER_SLAVE_ERROR;
263 }
264
265#else /* NETEQ_STEREO */
266
267 /* Find the strongest correlation peak by using the parabolic fit method */
268 WebRtcNetEQ_PeakDetection(pw16_corr, (WebRtc_Word16) w16_corrLen, 1, fsMult,
269 &w16_bestIndex, &w16_bestVal);
270 /* 0 <= bestIndex <= (2*corrLen - 1)*fsMult = 99*fsMult */
271
272 /* Compensate bestIndex for displaced starting position */
273 w16_bestIndex = w16_bestIndex + w16_startLag * WEBRTC_SPL_LSHIFT_W16(fsMult, 1);
274 /* 20*fsMult <= bestIndex <= 119*fsMult */
275
276#endif /* NETEQ_STEREO */
277
278#ifdef NETEQ_STEREO
279
280 if (msInfo->msMode != NETEQ_SLAVE)
281 {
282 /* Calculate correlation only for non-slave instances */
283
284#endif /* NETEQ_STEREO */
285
286 /*****************************************************/
287 /* Calculate correlation bestCorr for the found lag. */
288 /* Also do a simple VAD decision. */
289 /*****************************************************/
290
291 /*
292 * Calculate scaling to ensure that bestIndex samples can be square-summed
293 * without overflowing
294 */
295 w16_tmp = (31
296 - WebRtcSpl_NormW32(WEBRTC_SPL_MUL_16_16(w16_decodedMax, w16_decodedMax)));
297 w16_tmp += (31 - WebRtcSpl_NormW32(w16_bestIndex));
298 w16_tmp -= 31;
299 w16_tmp = WEBRTC_SPL_MAX(0, w16_tmp);
300
301 /* vec1 starts at 15 ms minus one pitch period */
302 pw16_vec1 = &pw16_decoded[fsMult120 - w16_bestIndex];
303 /* vec2 start at 15 ms */
304 pw16_vec2 = &pw16_decoded[fsMult120];
305
306 /* Calculate energies for vec1 and vec2 */
307 w32_en1 = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec1,
308 (WebRtc_Word16*) pw16_vec1, w16_bestIndex, w16_tmp);
309 w32_en2 = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec2,
310 (WebRtc_Word16*) pw16_vec2, w16_bestIndex, w16_tmp);
311
312 /* Calculate cross-correlation at the found lag */
313 w32_cc = WebRtcNetEQ_DotW16W16((WebRtc_Word16*) pw16_vec1, (WebRtc_Word16*) pw16_vec2,
314 w16_bestIndex, w16_tmp);
315
316 /* Check VAD constraint
317 ((en1+en2)/(2*bestIndex)) <= 8*inst->BGNInst.energy */
318 w32_tmp = WEBRTC_SPL_RSHIFT_W32(w32_en1 + w32_en2, 4); /* (en1+en2)/(2*8) */
319 if (inst->BGNInst.w16_initialized == 1)
320 {
321 w32_tmp2 = inst->BGNInst.w32_energy;
322 }
323 else
324 {
325 /* if BGN parameters have not been estimated, use a fixed threshold */
326 w32_tmp2 = 75000;
327 }
328 w16_tmp2 = 16 - WebRtcSpl_NormW32(w32_tmp2);
329 w16_tmp2 = WEBRTC_SPL_MAX(0, w16_tmp2);
330 w32_tmp = WEBRTC_SPL_RSHIFT_W32(w32_tmp, w16_tmp2);
331 w16_tmp2 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_tmp2, w16_tmp2);
332 w32_tmp2 = WEBRTC_SPL_MUL_16_16(w16_bestIndex, w16_tmp2);
333
334 /* Scale w32_tmp properly before comparing with w32_tmp2 */
335 /* (w16_tmp is scaling before energy calculation, thus 2*w16_tmp) */
336 if (WebRtcSpl_NormW32(w32_tmp) < WEBRTC_SPL_LSHIFT_W32(w16_tmp,1))
337 {
338 /* Cannot scale only w32_tmp, must scale w32_temp2 too */
339 WebRtc_Word16 tempshift = WebRtcSpl_NormW32(w32_tmp);
340 w32_tmp = WEBRTC_SPL_LSHIFT_W32(w32_tmp, tempshift);
341 w32_tmp2 = WEBRTC_SPL_RSHIFT_W32(w32_tmp2,
342 WEBRTC_SPL_LSHIFT_W32(w16_tmp,1) - tempshift);
343 }
344 else
345 {
346 w32_tmp = WEBRTC_SPL_LSHIFT_W32(w32_tmp,
347 WEBRTC_SPL_LSHIFT_W32(w16_tmp,1));
348 }
349
350 if (w32_tmp <= w32_tmp2) /*((en1+en2)/(2*bestIndex)) <= 8*inst->BGNInst.energy */
351 {
352 /* The signal seems to be passive speech */
353 w16_VAD = 0;
354 w16_bestCorr = 0; /* Correlation does not matter */
355 }
356 else
357 {
358 /* The signal is active speech */
359 w16_VAD = 1;
360
361 /* Calculate correlation (cc/sqrt(en1*en2)) */
362
363 /* Start with calculating scale values */
364 w16_en1Scale = 16 - WebRtcSpl_NormW32(w32_en1);
365 w16_en1Scale = WEBRTC_SPL_MAX(0, w16_en1Scale);
366 w16_en2Scale = 16 - WebRtcSpl_NormW32(w32_en2);
367 w16_en2Scale = WEBRTC_SPL_MAX(0, w16_en2Scale);
368
369 /* Make sure total scaling is even (to simplify scale factor after sqrt) */
370 if ((w16_en1Scale + w16_en2Scale) & 1)
371 {
372 w16_en1Scale += 1;
373 }
374
375 /* Convert energies to WebRtc_Word16 */
376 w16_en1 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_en1, w16_en1Scale);
377 w16_en2 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(w32_en2, w16_en2Scale);
378
379 /* Calculate energy product */
380 w32_tmp = WEBRTC_SPL_MUL_16_16(w16_en1, w16_en2);
381
382 /* Calculate square-root of energy product */
henrik.lundin@webrtc.org9f710d02011-09-12 16:44:37 +0000383 w16_sqrtEn1En2 = (WebRtc_Word16) WebRtcSpl_SqrtFloor(w32_tmp);
niklase@google.com470e71d2011-07-07 08:21:25 +0000384
385 /* Calculate cc/sqrt(en1*en2) in Q14 */
386 w16_tmp = 14 - WEBRTC_SPL_RSHIFT_W16(w16_en1Scale+w16_en2Scale, 1);
387 w32_cc = WEBRTC_SPL_SHIFT_W32(w32_cc, w16_tmp);
388 w32_cc = WEBRTC_SPL_MAX(0, w32_cc); /* Don't divide with negative number */
389 w16_bestCorr = (WebRtc_Word16) WebRtcSpl_DivW32W16(w32_cc, w16_sqrtEn1En2);
390 w16_bestCorr = WEBRTC_SPL_MIN(16384, w16_bestCorr); /* set maximum to 1.0 */
391 }
392
393#ifdef NETEQ_STEREO
394
395 } /* end if (msInfo->msMode != NETEQ_SLAVE) */
396
397#endif /* NETEQ_STEREO */
398
399 /************************************************/
400 /* Check accelerate criteria and remove samples */
401 /************************************************/
402
403 /* Check for strong correlation (>0.9) or passive speech */
404#ifdef NETEQ_STEREO
405 if ((((w16_bestCorr > 14746) || (w16_VAD == 0)) && (msInfo->msMode != NETEQ_SLAVE))
406 || ((msInfo->msMode == NETEQ_SLAVE) && (msInfo->extraInfo != ACC_FAIL)))
407#else
408 if ((w16_bestCorr > 14746) || (w16_VAD == 0))
409#endif
410 {
411 /* Do accelerate operation by overlap add */
412
413 /*
414 * Calculate cross-fading slope so that the fading factor goes from
415 * 1 (16384 in Q14) to 0 in one pitch period (bestIndex).
416 */
417 w16_inc = (WebRtc_Word16) WebRtcSpl_DivW32W16((WebRtc_Word32) 16384,
418 (WebRtc_Word16) (w16_bestIndex + 1)); /* in Q14 */
419
420 /* Initiate fading factor */
421 w16_startfact = 16384 - w16_inc;
422
423 /* vec1 starts at 15 ms minus one pitch period */
424 pw16_vec1 = &pw16_decoded[fsMult120 - w16_bestIndex];
425 /* vec2 start at 15 ms */
426 pw16_vec2 = &pw16_decoded[fsMult120];
427
428 /* Copy unmodified part [0 to 15 ms minus 1 pitch period] */
429 w16_tmp = (fsMult120 - w16_bestIndex);
430 WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, w16_tmp);
431
432 /* Generate interpolated part of length bestIndex (1 pitch period) */
433 pw16_vectmp = pw16_outData + w16_tmp; /* start of interpolation output */
434 /* Reuse mixing function from Expand */
435 WebRtcNetEQ_MixVoiceUnvoice(pw16_vectmp, (WebRtc_Word16*) pw16_vec1,
436 (WebRtc_Word16*) pw16_vec2, &w16_startfact, w16_inc, w16_bestIndex);
437
438 /* Move the last part (also unmodified) */
439 /* Take from decoded at 15 ms + 1 pitch period */
440 pw16_vec2 = &pw16_decoded[fsMult120 + w16_bestIndex];
441 WEBRTC_SPL_MEMMOVE_W16(&pw16_outData[fsMult120], pw16_vec2,
442 (WebRtc_Word16) (len - fsMult120 - w16_bestIndex));
443
444 /* Set the mode flag */
445 if (w16_VAD)
446 {
447 inst->w16_mode = MODE_SUCCESS_ACCELERATE;
448 }
449 else
450 {
451 inst->w16_mode = MODE_LOWEN_ACCELERATE;
452 }
453
454 /* Calculate resulting length = original length - pitch period */
455 *pw16_len = len - w16_bestIndex;
456
457 /* Update in-call statistics */
458 inst->statInst.accelerateLength += w16_bestIndex;
459
460 return 0;
461 }
462 else
463 {
464 /* Accelerate not allowed */
465
466#ifdef NETEQ_STEREO
467 /* Signal to slave(s) that this was unsuccessful */
468 if (msInfo->msMode == NETEQ_MASTER)
469 {
470 msInfo->extraInfo = ACC_FAIL;
471 }
472#endif
473
474 /* Set mode flag to unsuccessful accelerate */
475 inst->w16_mode = MODE_UNSUCCESS_ACCELERATE;
476
477 /* Length is unmodified */
478 *pw16_len = len;
479
480 /* Simply move all data from decoded to outData */
481 WEBRTC_SPL_MEMMOVE_W16(pw16_outData, pw16_decoded, (WebRtc_Word16) len);
482
483 return 0;
484 }
485}
486
487#undef SCRATCH_PW16_DS_SPEECH
488#undef SCRATCH_PW32_CORR
489#undef SCRATCH_PW16_CORR