blob: b7680292bd8d88f4c2f80aeb33c155894604f3b8 [file] [log] [blame]
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020011#include "modules/audio_coding/neteq/time_stretch.h"
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000012
13#include <algorithm> // min, max
kwiberg2d0c3322016-02-14 09:28:33 -080014#include <memory>
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000015
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020016#include "common_audio/signal_processing/include/signal_processing_library.h"
17#include "modules/audio_coding/neteq/background_noise.h"
18#include "modules/audio_coding/neteq/cross_correlation.h"
19#include "modules/audio_coding/neteq/dsp_helper.h"
Karl Wiberge40468b2017-11-22 10:42:26 +010020#include "rtc_base/numerics/safe_conversions.h"
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000021
22namespace webrtc {
23
Henrik Lundincf808d22015-05-27 14:33:29 +020024TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
25 size_t input_len,
26 bool fast_mode,
27 AudioMultiVector* output,
Peter Kastingdce40cf2015-08-24 14:52:23 -070028 size_t* length_change_samples) {
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000029 // Pre-calculate common multiplication with |fs_mult_|.
Peter Kastingdce40cf2015-08-24 14:52:23 -070030 size_t fs_mult_120 =
31 static_cast<size_t>(fs_mult_ * 120); // Corresponds to 15 ms.
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000032
33 const int16_t* signal;
kwiberg2d0c3322016-02-14 09:28:33 -080034 std::unique_ptr<int16_t[]> signal_array;
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000035 size_t signal_len;
36 if (num_channels_ == 1) {
37 signal = input;
38 signal_len = input_len;
39 } else {
40 // We want |signal| to be only the first channel of |input|, which is
41 // interleaved. Thus, we take the first sample, skip forward |num_channels|
42 // samples, and continue like that.
43 signal_len = input_len / num_channels_;
44 signal_array.reset(new int16_t[signal_len]);
45 signal = signal_array.get();
Henrik Lundin11b6f682020-06-29 12:17:42 +020046 size_t j = kRefChannel;
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000047 for (size_t i = 0; i < signal_len; ++i) {
48 signal_array[i] = input[j];
49 j += num_channels_;
50 }
51 }
52
53 // Find maximum absolute value of input signal.
Peter Kastingdce40cf2015-08-24 14:52:23 -070054 max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000055
56 // Downsample to 4 kHz sample rate and calculate auto-correlation.
57 DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
58 sample_rate_hz_, true /* compensate delay*/,
59 downsampled_input_);
60 AutoCorrelation();
61
62 // Find the strongest correlation peak.
Peter Kastingdce40cf2015-08-24 14:52:23 -070063 static const size_t kNumPeaks = 1;
64 size_t peak_index;
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000065 int16_t peak_value;
66 DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
67 fs_mult_, &peak_index, &peak_value);
68 // Assert that |peak_index| stays within boundaries.
Mirko Bonadei25ab3222021-07-08 20:08:20 +020069 RTC_DCHECK_LE(peak_index, (2 * kCorrelationLen - 1) * fs_mult_);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000070
71 // Compensate peak_index for displaced starting position. The displacement
72 // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
73 // domain, while the |peak_index| is in the original sample rate; hence, the
74 // multiplication by fs_mult_ * 2.
75 peak_index += kMinLag * fs_mult_ * 2;
76 // Assert that |peak_index| stays within boundaries.
Mirko Bonadei25ab3222021-07-08 20:08:20 +020077 RTC_DCHECK_GE(peak_index, static_cast<size_t>(20 * fs_mult_));
78 RTC_DCHECK_LE(peak_index,
79 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000080
81 // Calculate scaling to ensure that |peak_index| samples can be square-summed
82 // without overflowing.
83 int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
Yves Gerey665174f2018-06-19 15:03:05 +020084 WebRtcSpl_NormW32(static_cast<int32_t>(peak_index));
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +000085 scaling = std::max(0, scaling);
86
87 // |vec1| starts at 15 ms minus one pitch period.
88 const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
89 // |vec2| start at 15 ms.
90 const int16_t* vec2 = &signal[fs_mult_120];
91 // Calculate energies for |vec1| and |vec2|, assuming they both contain
92 // |peak_index| samples.
93 int32_t vec1_energy =
94 WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
95 int32_t vec2_energy =
96 WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);
97
98 // Calculate cross-correlation between |vec1| and |vec2|.
99 int32_t cross_corr =
100 WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);
101
102 // Check if the signal seems to be active speech or not (simple VAD).
Yves Gerey665174f2018-06-19 15:03:05 +0200103 bool active_speech =
104 SpeechDetection(vec1_energy, vec2_energy, peak_index, scaling);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000105
106 int16_t best_correlation;
107 if (!active_speech) {
108 SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
109 } else {
110 // Calculate correlation:
111 // cross_corr / sqrt(vec1_energy * vec2_energy).
112
113 // Start with calculating scale values.
114 int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
115 int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));
116
117 // Make sure total scaling is even (to simplify scale factor after sqrt).
118 if ((energy1_scale + energy2_scale) & 1) {
119 // The sum is odd.
120 energy1_scale += 1;
121 }
122
123 // Scale energies to int16_t.
124 int16_t vec1_energy_int16 =
125 static_cast<int16_t>(vec1_energy >> energy1_scale);
126 int16_t vec2_energy_int16 =
127 static_cast<int16_t>(vec2_energy >> energy2_scale);
128
129 // Calculate square-root of energy product.
Yves Gerey665174f2018-06-19 15:03:05 +0200130 int16_t sqrt_energy_prod =
131 WebRtcSpl_SqrtFloor(vec1_energy_int16 * vec2_energy_int16);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000132
133 // Calculate cross_corr / sqrt(en1*en2) in Q14.
134 int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
135 cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
136 cross_corr = std::max(0, cross_corr); // Don't use if negative.
137 best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
138 // Make sure |best_correlation| is no larger than 1 in Q14.
139 best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
140 }
141
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000142 // Check accelerate criteria and stretch the signal.
Henrik Lundincf808d22015-05-27 14:33:29 +0200143 ReturnCodes return_value =
144 CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
145 active_speech, fast_mode, output);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000146 switch (return_value) {
147 case kSuccess:
148 *length_change_samples = peak_index;
149 break;
150 case kSuccessLowEnergy:
151 *length_change_samples = peak_index;
152 break;
153 case kNoStretch:
154 case kError:
155 *length_change_samples = 0;
156 break;
157 }
158 return return_value;
159}
160
161void TimeStretch::AutoCorrelation() {
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000162 // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
163 int32_t auto_corr[kCorrelationLen];
minyue53ff70f2016-05-02 01:50:30 -0700164 CrossCorrelationWithAutoShift(
165 &downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],
166 kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000167
168 // Normalize correlation to 14 bits and write to |auto_correlation_|.
169 int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
minyue53ff70f2016-05-02 01:50:30 -0700170 int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000171 WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
172 auto_corr, scaling);
173}
174
Yves Gerey665174f2018-06-19 15:03:05 +0200175bool TimeStretch::SpeechDetection(int32_t vec1_energy,
176 int32_t vec2_energy,
177 size_t peak_index,
178 int scaling) const {
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000179 // Check if the signal seems to be active speech or not (simple VAD).
180 // If (vec1_energy + vec2_energy) / (2 * peak_index) <=
181 // 8 * background_noise_energy, then we say that the signal contains no
182 // active speech.
183 // Rewrite the inequality as:
184 // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
185 // The two sides of the inequality will be denoted |left_side| and
186 // |right_side|.
Henrik Lundinb1629cf2017-02-28 14:58:30 +0100187 int32_t left_side = rtc::saturated_cast<int32_t>(
188 (static_cast<int64_t>(vec1_energy) + vec2_energy) / 16);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000189 int32_t right_side;
190 if (background_noise_.initialized()) {
Henrik Lundin11b6f682020-06-29 12:17:42 +0200191 right_side = background_noise_.Energy(kRefChannel);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000192 } else {
193 // If noise parameters have not been estimated, use a fixed threshold.
194 right_side = 75000;
195 }
196 int right_scale = 16 - WebRtcSpl_NormW32(right_side);
197 right_scale = std::max(0, right_scale);
198 left_side = left_side >> right_scale;
Peter Kastingdce40cf2015-08-24 14:52:23 -0700199 right_side =
kwibergd3edd772017-03-01 18:52:48 -0800200 rtc::dchecked_cast<int32_t>(peak_index) * (right_side >> right_scale);
henrik.lundin@webrtc.orgd94659d2013-01-29 12:09:21 +0000201
202 // Scale |left_side| properly before comparing with |right_side|.
203 // (|scaling| is the scale factor before energy calculation, thus the scale
204 // factor for the energy is 2 * scaling.)
205 if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
206 // Cannot scale only |left_side|, must scale |right_side| too.
207 int temp_scale = WebRtcSpl_NormW32(left_side);
208 left_side = left_side << temp_scale;
209 right_side = right_side >> (2 * scaling - temp_scale);
210 } else {
211 left_side = left_side << 2 * scaling;
212 }
213 return left_side > right_side;
214}
215
216} // namespace webrtc