Blame - modules/audio_processing/vad/vad_audio_proc.cc - webrtc.googlesource.com/src

blob: d5f6b3bcd7715d6104b0cba8b20d5c0151c13438 [file] [log] [blame]

pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	11	#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	12
				13	#include <math.h>
				14	#include <stdio.h>
				15
andrew@webrtc.org	04c5098	2015-03-19 20:06:29 +0000	[diff] [blame]	16	#include "webrtc/common_audio/fft4g.h"
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	17	#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
				18	#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
Edward Lemur	c20978e	2017-07-06 19:44:34 +0200	[diff] [blame]	19	#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
				20	#include "webrtc/rtc_base/checks.h"
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	21	extern "C" {
				22	#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
				23	#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
				24	#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
				25	#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	26	}
Henrik Kjellander	ff761fb	2015-11-04 08:31:52 +0100	[diff] [blame]	27	#include "webrtc/modules/include/module_common_types.h"
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	28
				29	namespace webrtc {
				30
				31	// The following structures are declared anonymous in iSAC's structs.h. To
				32	// forward declare them, we use this derived class trick.
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	33	struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
				34	struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	35
brucedawson	fde2116	2017-06-20 10:57:09 -0700	[diff] [blame]	36	static constexpr float kFrequencyResolution =
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	37	kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
brucedawson	fde2116	2017-06-20 10:57:09 -0700	[diff] [blame]	38	static constexpr int kSilenceRms = 5;
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	39
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	40	// TODO(turajs): Make a Create or Init for VadAudioProc.
				41	VadAudioProc::VadAudioProc()
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	42	: audio_buffer_(),
				43	num_buffer_samples_(kNumPastSignalSamples),
				44	log_old_gain_(-2),
				45	old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).
				46	pitch_analysis_handle_(new PitchAnalysisStruct),
				47	pre_filter_handle_(new PreFiltBankstr),
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	48	high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
				49	kFilterOrder,
				50	kCoeffDenominator,
				51	kFilterOrder)) {
kwiberg@webrtc.org	2ebfac5	2015-01-14 10:51:54 +0000	[diff] [blame]	52	static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
				53	sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
				54	"lpc analysis window incorrect size");
				55	static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
				56	"correlation weight incorrect size");
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	57
				58	// TODO(turajs): Are we doing too much in the constructor?
				59	float data[kDftSize];
				60	// Make FFT to initialize.
				61	ip_[0] = 0;
				62	WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
				63	// TODO(turajs): Need to initialize high-pass filter.
				64
				65	// Initialize iSAC components.
				66	WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
				67	WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
				68	}
				69
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	70	VadAudioProc::~VadAudioProc() {
				71	}
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	72
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	73	void VadAudioProc::ResetBuffer() {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	74	memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
				75	sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
				76	num_buffer_samples_ = kNumPastSignalSamples;
				77	}
				78
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	79	int VadAudioProc::ExtractFeatures(const int16_t* frame,
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	80	size_t length,
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	81	AudioFeatures* features) {
				82	features->num_frames = 0;
				83	if (length != kNumSubframeSamples) {
				84	return -1;
				85	}
				86
				87	// High-pass filter to remove the DC component and very low frequency content.
				88	// We have experienced that this high-pass filtering improves voice/non-voiced
				89	// classification.
				90	if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
				91	&audio_buffer_[num_buffer_samples_]) != 0) {
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	92	return -1;
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	93	}
				94
				95	num_buffer_samples_ += kNumSubframeSamples;
				96	if (num_buffer_samples_ < kBufferLength) {
				97	return 0;
				98	}
kwiberg	9e2be5f	2016-09-14 05:23:22 -0700	[diff] [blame]	99	RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	100	features->num_frames = kNum10msSubframes;
				101	features->silence = false;
				102
				103	Rms(features->rms, kMaxNumFrames);
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	104	for (size_t i = 0; i < kNum10msSubframes; ++i) {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	105	if (features->rms[i] < kSilenceRms) {
				106	// PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
				107	// Bail out here instead.
				108	features->silence = true;
				109	ResetBuffer();
				110	return 0;
				111	}
				112	}
				113
				114	PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
				115	kMaxNumFrames);
				116	FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
				117	ResetBuffer();
				118	return 0;
				119	}
				120
				121	// Computes \|kLpcOrder + 1\| correlation coefficients.
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	122	void VadAudioProc::SubframeCorrelation(double* corr,
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	123	size_t length_corr,
				124	size_t subframe_index) {
kwiberg	9e2be5f	2016-09-14 05:23:22 -0700	[diff] [blame]	125	RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	126	double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	127	size_t buffer_index = subframe_index * kNumSubframeSamples;
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	128
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	129	for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	130	windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
				131
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	132	WebRtcIsac_AutoCorr(corr, windowed_audio,
				133	kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	134	}
				135
				136	// Compute \|kNum10msSubframes\| sets of LPC coefficients, one per 10 ms input.
				137	// The analysis window is 15 ms long and it is centered on the first half of
				138	// each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
				139	// first half of each 10 ms subframe.
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	140	void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
kwiberg	9e2be5f	2016-09-14 05:23:22 -0700	[diff] [blame]	141	RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	142	double corr[kLpcOrder + 1];
				143	double reflec_coeff[kLpcOrder];
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	144	for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	145	i++, offset_lpc += kLpcOrder + 1) {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	146	SubframeCorrelation(corr, kLpcOrder + 1, i);
				147	corr[0] *= 1.0001;
				148	// This makes Lev-Durb a bit more stable.
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	149	for (size_t k = 0; k < kLpcOrder + 1; k++) {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	150	corr[k] *= kCorrWeight[k];
				151	}
				152	WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
				153	}
				154	}
				155
				156	// Fit a second order curve to these 3 points and find the location of the
				157	// extremum. The points are inverted before curve fitting.
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	158	static float QuadraticInterpolation(float prev_val,
				159	float curr_val,
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	160	float next_val) {
				161	// Doing the interpolation in \|1 / A(z)\|^2.
				162	float fractional_index = 0;
				163	next_val = 1.0f / next_val;
				164	prev_val = 1.0f / prev_val;
				165	curr_val = 1.0f / curr_val;
				166
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	167	fractional_index =
				168	-(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
kwiberg	9e2be5f	2016-09-14 05:23:22 -0700	[diff] [blame]	169	RTC_DCHECK_LT(fabs(fractional_index), 1);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	170	return fractional_index;
				171	}
				172
				173	// 1 / A(z), where A(z) is defined by \|lpc\| is a model of the spectral envelope
				174	// of the input signal. The local maximum of the spectral envelope corresponds
				175	// with the local minimum of A(z). It saves complexity, as we save one
				176	// inversion. Furthermore, we find the first local maximum of magnitude squared,
				177	// to save on one square root.
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	178	void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
				179	size_t length_f_peak) {
kwiberg	9e2be5f	2016-09-14 05:23:22 -0700	[diff] [blame]	180	RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	181	double lpc[kNum10msSubframes * (kLpcOrder + 1)];
				182	// For all sub-frames.
				183	GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
				184
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	185	const size_t kNumDftCoefficients = kDftSize / 2 + 1;
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	186	float data[kDftSize];
				187
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	188	for (size_t i = 0; i < kNum10msSubframes; i++) {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	189	// Convert to float with zero pad.
				190	memset(data, 0, sizeof(data));
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	191	for (size_t n = 0; n < kLpcOrder + 1; n++) {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	192	data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
				193	}
				194	// Transform to frequency domain.
				195	WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
				196
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	197	size_t index_peak = 0;
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	198	float prev_magn_sqr = data[0] * data[0];
				199	float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
				200	float next_magn_sqr;
				201	bool found_peak = false;
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	202	for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	203	next_magn_sqr =
				204	data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	205	if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
				206	found_peak = true;
				207	index_peak = n - 1;
				208	break;
				209	}
				210	prev_magn_sqr = curr_magn_sqr;
				211	curr_magn_sqr = next_magn_sqr;
				212	}
				213	float fractional_index = 0;
				214	if (!found_peak) {
				215	// Checking if \|kNumDftCoefficients - 1\| is the local minimum.
				216	next_magn_sqr = data[1] * data[1];
				217	if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
				218	index_peak = kNumDftCoefficients - 1;
				219	}
				220	} else {
				221	// A peak is found, do a simple quadratic interpolation to get a more
				222	// accurate estimate of the peak location.
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	223	fractional_index =
				224	QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	225	}
				226	f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
				227	}
				228	}
				229
				230	// Using iSAC functions to estimate pitch gains & lags.
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	231	void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
				232	double* pitch_lags_hz,
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	233	size_t length) {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	234	// TODO(turajs): This can be "imported" from iSAC & and the next two
				235	// constants.
kwiberg	9e2be5f	2016-09-14 05:23:22 -0700	[diff] [blame]	236	RTC_DCHECK_GE(length, kNum10msSubframes);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	237	const int kNumPitchSubframes = 4;
				238	double gains[kNumPitchSubframes];
				239	double lags[kNumPitchSubframes];
				240
				241	const int kNumSubbandFrameSamples = 240;
				242	const int kNumLookaheadSamples = 24;
				243
				244	float lower[kNumSubbandFrameSamples];
				245	float upper[kNumSubbandFrameSamples];
				246	double lower_lookahead[kNumSubbandFrameSamples];
				247	double upper_lookahead[kNumSubbandFrameSamples];
				248	double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
				249	kNumLookaheadSamples];
				250
				251	// Split signal to lower and upper bands
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	252	WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
				253	upper, lower_lookahead, upper_lookahead,
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	254	pre_filter_handle_.get());
				255	WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
				256	pitch_analysis_handle_.get(), lags, gains);
				257
				258	// Lags are computed on lower-band signal with sampling rate half of the
				259	// input signal.
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	260	GetSubframesPitchParameters(
				261	kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
				262	&log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	263	}
				264
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	265	void VadAudioProc::Rms(double* rms, size_t length_rms) {
kwiberg	9e2be5f	2016-09-14 05:23:22 -0700	[diff] [blame]	266	RTC_DCHECK_GE(length_rms, kNum10msSubframes);
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	267	size_t offset = kNumPastSignalSamples;
				268	for (size_t i = 0; i < kNum10msSubframes; i++) {
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	269	rms[i] = 0;
Peter Kasting	dce40cf	2015-08-24 14:52:23 -0700	[diff] [blame]	270	for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	271	rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
aluebs	ecf6b81	2015-06-25 12:28:48 -0700	[diff] [blame]	272	rms[i] = sqrt(rms[i] / kNumSubframeSamples);
pbos@webrtc.org	788acd1	2014-12-15 09:41:24 +0000	[diff] [blame]	273	}
				274	}
				275
				276	} // namespace webrtc