blob: d5f6b3bcd7715d6104b0cba8b20d5c0151c13438 [file] [log] [blame]
pbos@webrtc.org788acd12014-12-15 09:41:24 +00001/*
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
aluebsecf6b812015-06-25 12:28:48 -070011#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
pbos@webrtc.org788acd12014-12-15 09:41:24 +000012
13#include <math.h>
14#include <stdio.h>
15
andrew@webrtc.org04c50982015-03-19 20:06:29 +000016#include "webrtc/common_audio/fft4g.h"
aluebsecf6b812015-06-25 12:28:48 -070017#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
18#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
Edward Lemurc20978e2017-07-06 19:44:34 +020019#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
20#include "webrtc/rtc_base/checks.h"
pbos@webrtc.org788acd12014-12-15 09:41:24 +000021extern "C" {
22#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
23#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
24#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
25#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
pbos@webrtc.org788acd12014-12-15 09:41:24 +000026}
Henrik Kjellanderff761fb2015-11-04 08:31:52 +010027#include "webrtc/modules/include/module_common_types.h"
pbos@webrtc.org788acd12014-12-15 09:41:24 +000028
29namespace webrtc {
30
31// The following structures are declared anonymous in iSAC's structs.h. To
32// forward declare them, we use this derived class trick.
aluebsecf6b812015-06-25 12:28:48 -070033struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
34struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
pbos@webrtc.org788acd12014-12-15 09:41:24 +000035
brucedawsonfde21162017-06-20 10:57:09 -070036static constexpr float kFrequencyResolution =
aluebsecf6b812015-06-25 12:28:48 -070037 kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
brucedawsonfde21162017-06-20 10:57:09 -070038static constexpr int kSilenceRms = 5;
pbos@webrtc.org788acd12014-12-15 09:41:24 +000039
aluebsecf6b812015-06-25 12:28:48 -070040// TODO(turajs): Make a Create or Init for VadAudioProc.
41VadAudioProc::VadAudioProc()
pbos@webrtc.org788acd12014-12-15 09:41:24 +000042 : audio_buffer_(),
43 num_buffer_samples_(kNumPastSignalSamples),
44 log_old_gain_(-2),
45 old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).
46 pitch_analysis_handle_(new PitchAnalysisStruct),
47 pre_filter_handle_(new PreFiltBankstr),
aluebsecf6b812015-06-25 12:28:48 -070048 high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
49 kFilterOrder,
50 kCoeffDenominator,
51 kFilterOrder)) {
kwiberg@webrtc.org2ebfac52015-01-14 10:51:54 +000052 static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
53 sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
54 "lpc analysis window incorrect size");
55 static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
56 "correlation weight incorrect size");
pbos@webrtc.org788acd12014-12-15 09:41:24 +000057
58 // TODO(turajs): Are we doing too much in the constructor?
59 float data[kDftSize];
60 // Make FFT to initialize.
61 ip_[0] = 0;
62 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
63 // TODO(turajs): Need to initialize high-pass filter.
64
65 // Initialize iSAC components.
66 WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
67 WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
68}
69
aluebsecf6b812015-06-25 12:28:48 -070070VadAudioProc::~VadAudioProc() {
71}
pbos@webrtc.org788acd12014-12-15 09:41:24 +000072
aluebsecf6b812015-06-25 12:28:48 -070073void VadAudioProc::ResetBuffer() {
pbos@webrtc.org788acd12014-12-15 09:41:24 +000074 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
75 sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
76 num_buffer_samples_ = kNumPastSignalSamples;
77}
78
aluebsecf6b812015-06-25 12:28:48 -070079int VadAudioProc::ExtractFeatures(const int16_t* frame,
Peter Kastingdce40cf2015-08-24 14:52:23 -070080 size_t length,
pbos@webrtc.org788acd12014-12-15 09:41:24 +000081 AudioFeatures* features) {
82 features->num_frames = 0;
83 if (length != kNumSubframeSamples) {
84 return -1;
85 }
86
87 // High-pass filter to remove the DC component and very low frequency content.
88 // We have experienced that this high-pass filtering improves voice/non-voiced
89 // classification.
90 if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
91 &audio_buffer_[num_buffer_samples_]) != 0) {
aluebsecf6b812015-06-25 12:28:48 -070092 return -1;
pbos@webrtc.org788acd12014-12-15 09:41:24 +000093 }
94
95 num_buffer_samples_ += kNumSubframeSamples;
96 if (num_buffer_samples_ < kBufferLength) {
97 return 0;
98 }
kwiberg9e2be5f2016-09-14 05:23:22 -070099 RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000100 features->num_frames = kNum10msSubframes;
101 features->silence = false;
102
103 Rms(features->rms, kMaxNumFrames);
Peter Kastingdce40cf2015-08-24 14:52:23 -0700104 for (size_t i = 0; i < kNum10msSubframes; ++i) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000105 if (features->rms[i] < kSilenceRms) {
106 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
107 // Bail out here instead.
108 features->silence = true;
109 ResetBuffer();
110 return 0;
111 }
112 }
113
114 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
115 kMaxNumFrames);
116 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
117 ResetBuffer();
118 return 0;
119}
120
121// Computes |kLpcOrder + 1| correlation coefficients.
aluebsecf6b812015-06-25 12:28:48 -0700122void VadAudioProc::SubframeCorrelation(double* corr,
Peter Kastingdce40cf2015-08-24 14:52:23 -0700123 size_t length_corr,
124 size_t subframe_index) {
kwiberg9e2be5f2016-09-14 05:23:22 -0700125 RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000126 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
Peter Kastingdce40cf2015-08-24 14:52:23 -0700127 size_t buffer_index = subframe_index * kNumSubframeSamples;
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000128
Peter Kastingdce40cf2015-08-24 14:52:23 -0700129 for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000130 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
131
aluebsecf6b812015-06-25 12:28:48 -0700132 WebRtcIsac_AutoCorr(corr, windowed_audio,
133 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000134}
135
136// Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
137// The analysis window is 15 ms long and it is centered on the first half of
138// each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
139// first half of each 10 ms subframe.
Peter Kastingdce40cf2015-08-24 14:52:23 -0700140void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
kwiberg9e2be5f2016-09-14 05:23:22 -0700141 RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000142 double corr[kLpcOrder + 1];
143 double reflec_coeff[kLpcOrder];
Peter Kastingdce40cf2015-08-24 14:52:23 -0700144 for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
aluebsecf6b812015-06-25 12:28:48 -0700145 i++, offset_lpc += kLpcOrder + 1) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000146 SubframeCorrelation(corr, kLpcOrder + 1, i);
147 corr[0] *= 1.0001;
148 // This makes Lev-Durb a bit more stable.
Peter Kastingdce40cf2015-08-24 14:52:23 -0700149 for (size_t k = 0; k < kLpcOrder + 1; k++) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000150 corr[k] *= kCorrWeight[k];
151 }
152 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
153 }
154}
155
156// Fit a second order curve to these 3 points and find the location of the
157// extremum. The points are inverted before curve fitting.
aluebsecf6b812015-06-25 12:28:48 -0700158static float QuadraticInterpolation(float prev_val,
159 float curr_val,
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000160 float next_val) {
161 // Doing the interpolation in |1 / A(z)|^2.
162 float fractional_index = 0;
163 next_val = 1.0f / next_val;
164 prev_val = 1.0f / prev_val;
165 curr_val = 1.0f / curr_val;
166
aluebsecf6b812015-06-25 12:28:48 -0700167 fractional_index =
168 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
kwiberg9e2be5f2016-09-14 05:23:22 -0700169 RTC_DCHECK_LT(fabs(fractional_index), 1);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000170 return fractional_index;
171}
172
173// 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
174// of the input signal. The local maximum of the spectral envelope corresponds
175// with the local minimum of A(z). It saves complexity, as we save one
176// inversion. Furthermore, we find the first local maximum of magnitude squared,
177// to save on one square root.
Peter Kastingdce40cf2015-08-24 14:52:23 -0700178void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
179 size_t length_f_peak) {
kwiberg9e2be5f2016-09-14 05:23:22 -0700180 RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000181 double lpc[kNum10msSubframes * (kLpcOrder + 1)];
182 // For all sub-frames.
183 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
184
Peter Kastingdce40cf2015-08-24 14:52:23 -0700185 const size_t kNumDftCoefficients = kDftSize / 2 + 1;
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000186 float data[kDftSize];
187
Peter Kastingdce40cf2015-08-24 14:52:23 -0700188 for (size_t i = 0; i < kNum10msSubframes; i++) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000189 // Convert to float with zero pad.
190 memset(data, 0, sizeof(data));
Peter Kastingdce40cf2015-08-24 14:52:23 -0700191 for (size_t n = 0; n < kLpcOrder + 1; n++) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000192 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
193 }
194 // Transform to frequency domain.
195 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
196
Peter Kastingdce40cf2015-08-24 14:52:23 -0700197 size_t index_peak = 0;
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000198 float prev_magn_sqr = data[0] * data[0];
199 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
200 float next_magn_sqr;
201 bool found_peak = false;
Peter Kastingdce40cf2015-08-24 14:52:23 -0700202 for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
aluebsecf6b812015-06-25 12:28:48 -0700203 next_magn_sqr =
204 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000205 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
206 found_peak = true;
207 index_peak = n - 1;
208 break;
209 }
210 prev_magn_sqr = curr_magn_sqr;
211 curr_magn_sqr = next_magn_sqr;
212 }
213 float fractional_index = 0;
214 if (!found_peak) {
215 // Checking if |kNumDftCoefficients - 1| is the local minimum.
216 next_magn_sqr = data[1] * data[1];
217 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
218 index_peak = kNumDftCoefficients - 1;
219 }
220 } else {
221 // A peak is found, do a simple quadratic interpolation to get a more
222 // accurate estimate of the peak location.
aluebsecf6b812015-06-25 12:28:48 -0700223 fractional_index =
224 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000225 }
226 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
227 }
228}
229
230// Using iSAC functions to estimate pitch gains & lags.
aluebsecf6b812015-06-25 12:28:48 -0700231void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
232 double* pitch_lags_hz,
Peter Kastingdce40cf2015-08-24 14:52:23 -0700233 size_t length) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000234 // TODO(turajs): This can be "imported" from iSAC & and the next two
235 // constants.
kwiberg9e2be5f2016-09-14 05:23:22 -0700236 RTC_DCHECK_GE(length, kNum10msSubframes);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000237 const int kNumPitchSubframes = 4;
238 double gains[kNumPitchSubframes];
239 double lags[kNumPitchSubframes];
240
241 const int kNumSubbandFrameSamples = 240;
242 const int kNumLookaheadSamples = 24;
243
244 float lower[kNumSubbandFrameSamples];
245 float upper[kNumSubbandFrameSamples];
246 double lower_lookahead[kNumSubbandFrameSamples];
247 double upper_lookahead[kNumSubbandFrameSamples];
248 double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
249 kNumLookaheadSamples];
250
251 // Split signal to lower and upper bands
aluebsecf6b812015-06-25 12:28:48 -0700252 WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
253 upper, lower_lookahead, upper_lookahead,
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000254 pre_filter_handle_.get());
255 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
256 pitch_analysis_handle_.get(), lags, gains);
257
258 // Lags are computed on lower-band signal with sampling rate half of the
259 // input signal.
aluebsecf6b812015-06-25 12:28:48 -0700260 GetSubframesPitchParameters(
261 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
262 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000263}
264
Peter Kastingdce40cf2015-08-24 14:52:23 -0700265void VadAudioProc::Rms(double* rms, size_t length_rms) {
kwiberg9e2be5f2016-09-14 05:23:22 -0700266 RTC_DCHECK_GE(length_rms, kNum10msSubframes);
Peter Kastingdce40cf2015-08-24 14:52:23 -0700267 size_t offset = kNumPastSignalSamples;
268 for (size_t i = 0; i < kNum10msSubframes; i++) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000269 rms[i] = 0;
Peter Kastingdce40cf2015-08-24 14:52:23 -0700270 for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000271 rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
aluebsecf6b812015-06-25 12:28:48 -0700272 rms[i] = sqrt(rms[i] / kNumSubframeSamples);
pbos@webrtc.org788acd12014-12-15 09:41:24 +0000273 }
274}
275
276} // namespace webrtc