Blame - webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc - webrtc.googlesource.com/src

blob: 932eff1091b689d5752ea228eb746a5bd05d3597 [file] [log] [blame]

ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
#include "webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h"

#include <cmath>
#include <cstdlib>
#include <cstring>

#include <algorithm>
#include <numeric>

#include "webrtc/base/checks.h"
#include "webrtc/common_audio/vad/include/webrtc_vad.h"
#include "webrtc/common_audio/window_generator.h"
				21
				22	using std::complex;
				23	using std::max;
				24	using std::min;
				25
				26	namespace webrtc {
				27
				28	const int IntelligibilityEnhancer::kErbResolution = 2;
				29	const int IntelligibilityEnhancer::kWindowSizeMs = 2;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	30	// The size of the chunk provided by APM, in milliseconds.
				31	const int IntelligibilityEnhancer::kChunkSizeMs = 10;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	32	const int IntelligibilityEnhancer::kAnalyzeRate = 800;
				33	const int IntelligibilityEnhancer::kVarianceRate = 2;
				34	const float IntelligibilityEnhancer::kClipFreq = 200.0f;
				35	const float IntelligibilityEnhancer::kConfigRho = 0.02f;
				36	const float IntelligibilityEnhancer::kKbdAlpha = 1.5f;
				37	const float IntelligibilityEnhancer::kGainChangeLimit = 0.0125f;
				38
				39	using VarianceType = intelligibility::VarianceArray::StepType;
				40
				41	IntelligibilityEnhancer::TransformCallback::TransformCallback(
				42	IntelligibilityEnhancer* parent,
				43	IntelligibilityEnhancer::AudioSource source)
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	44	: parent_(parent),
				45	source_(source) {}
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	46
				47	void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
				48	const complex<float>* const* in_block,
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	49	int in_channels, int frames, int /* out_channels */,
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	50	complex<float>* const* out_block) {
				51	DCHECK_EQ(parent_->freqs_, frames);
				52	for (int i = 0; i < in_channels; ++i) {
				53	parent_->DispatchAudio(source_, in_block[i], out_block[i]);
				54	}
				55	}
				56
				57	IntelligibilityEnhancer::IntelligibilityEnhancer(int erb_resolution,
				58	int sample_rate_hz,
				59	int channels,
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	60	int cv_type, float cv_alpha,
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	61	int cv_win,
				62	int analysis_rate,
				63	int variance_rate,
				64	float gain_limit)
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	65	: freqs_(RealFourier::ComplexLength(RealFourier::FftOrder(
				66	sample_rate_hz * kWindowSizeMs / 1000))),
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	67	window_size_(1 << RealFourier::FftOrder(freqs_)),
				68	chunk_length_(sample_rate_hz * kChunkSizeMs / 1000),
				69	bank_size_(GetBankSize(sample_rate_hz, erb_resolution)),
				70	sample_rate_hz_(sample_rate_hz),
				71	erb_resolution_(erb_resolution),
				72	channels_(channels),
				73	analysis_rate_(analysis_rate),
				74	variance_rate_(variance_rate),
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	75	clear_variance_(freqs_, static_cast<VarianceType>(cv_type), cv_win,
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	76	cv_alpha),
				77	noise_variance_(freqs_, VarianceType::kStepInfinite, 475, 0.01f),
				78	filtered_clear_var_(new float[bank_size_]),
				79	filtered_noise_var_(new float[bank_size_]),
				80	filter_bank_(nullptr),
				81	center_freqs_(new float[bank_size_]),
				82	rho_(new float[bank_size_]),
				83	gains_eq_(new float[bank_size_]),
				84	gain_applier_(freqs_, gain_limit),
				85	temp_out_buffer_(nullptr),
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	86	input_audio_(new float*[channels]),
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	87	kbd_window_(new float[window_size_]),
				88	render_callback_(this, AudioSource::kRenderStream),
				89	capture_callback_(this, AudioSource::kCaptureStream),
				90	block_count_(0),
				91	analysis_step_(0),
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	92	vad_high_(nullptr),
				93	vad_low_(nullptr),
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	94	vad_tmp_buffer_(new int16_t[chunk_length_]) {
				95	DCHECK_LE(kConfigRho, 1.0f);
				96
				97	CreateErbBank();
				98
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	99	WebRtcVad_Create(&vad_high_);
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	100	WebRtcVad_Init(vad_high_);
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	101	WebRtcVad_set_mode(vad_high_, 0); // high likelihood of speech
				102	WebRtcVad_Create(&vad_low_);
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	103	WebRtcVad_Init(vad_low_);
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	104	WebRtcVad_set_mode(vad_low_, 3); // low likelihood of speech
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	105
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	106	temp_out_buffer_ = static_cast<float**>(malloc(
				107	sizeof(temp_out_buffer_) channels_ +
				108	sizeof(*temp_out_buffer_) chunk_length_ * channels_));
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	109	for (int i = 0; i < channels_; ++i) {
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	110	temp_out_buffer_[i] = reinterpret_cast<float*>(temp_out_buffer_ + channels_)
				111	+ chunk_length_ * i;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	112	}
				113
				114	for (int i = 0; i < bank_size_; ++i) {
				115	rho_[i] = kConfigRho * kConfigRho;
				116	}
				117
				118	float freqs_khz = kClipFreq / 1000.0f;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	119	int erb_index = static_cast<int>(ceilf(11.17f * logf((freqs_khz + 0.312f) /
				120	(freqs_khz + 14.6575f))
				121	+ 43.0f));
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	122	start_freq_ = max(1, erb_index * kErbResolution);
				123
				124	WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
				125	kbd_window_.get());
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	126	render_mangler_.reset(new LappedTransform(channels_, channels_,
				127	chunk_length_,
				128	kbd_window_.get(),
				129	window_size_,
				130	window_size_ / 2,
				131	&render_callback_));
				132	capture_mangler_.reset(new LappedTransform(channels_, channels_,
				133	chunk_length_,
				134	kbd_window_.get(),
				135	window_size_,
				136	window_size_ / 2,
				137	&capture_callback_));
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	138	}
				139
				140	IntelligibilityEnhancer::~IntelligibilityEnhancer() {
				141	WebRtcVad_Free(vad_low_);
				142	WebRtcVad_Free(vad_high_);
				143	free(filter_bank_);
				144	}
				145
				146	void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio) {
				147	for (int i = 0; i < chunk_length_; ++i) {
				148	vad_tmp_buffer_[i] = (int16_t)audio[0][i];
				149	}
				150	has_voice_low_ = WebRtcVad_Process(vad_low_, sample_rate_hz_,
				151	vad_tmp_buffer_.get(), chunk_length_) == 1;
				152
				153	render_mangler_->ProcessChunk(audio, temp_out_buffer_);
				154	for (int i = 0; i < channels_; ++i) {
				155	memcpy(audio[i], temp_out_buffer_[i],
				156	chunk_length_ * sizeof(**temp_out_buffer_));
				157	}
				158	}
				159
				160	void IntelligibilityEnhancer::ProcessCaptureAudio(float* const* audio) {
				161	for (int i = 0; i < chunk_length_; ++i) {
				162	vad_tmp_buffer_[i] = (int16_t)audio[0][i];
				163	}
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	164	// TODO(bercic): the VAD was always detecting voice in the noise stream,
				165	// no matter what the aggressiveness, so it was temporarily disabled here
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	166
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	167	//if (WebRtcVad_Process(vad_high_, sample_rate_hz_, vad_tmp_buffer_.get(),
				168	// chunk_length_) == 1) {
				169	// printf("capture HAS speech\n");
				170	// return;
				171	//}
				172	//printf("capture NO speech\n");
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	173	capture_mangler_->ProcessChunk(audio, temp_out_buffer_);
				174	}
				175
				176	void IntelligibilityEnhancer::DispatchAudio(
				177	IntelligibilityEnhancer::AudioSource source,
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	178	const complex<float>* in_block, complex<float>* out_block) {
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	179	switch (source) {
				180	case kRenderStream:
				181	ProcessClearBlock(in_block, out_block);
				182	break;
				183	case kCaptureStream:
				184	ProcessNoiseBlock(in_block, out_block);
				185	break;
				186	}
				187	}
				188
				189	void IntelligibilityEnhancer::ProcessClearBlock(const complex<float>* in_block,
				190	complex<float>* out_block) {
				191	float power_target;
				192
				193	if (block_count_ < 2) {
				194	memset(out_block, 0, freqs_ * sizeof(*out_block));
				195	++block_count_;
				196	return;
				197	}
				198
				199	if (has_voice_low_ \|\| true) {
				200	clear_variance_.Step(in_block, false);
				201	power_target = std::accumulate(clear_variance_.variance(),
				202	clear_variance_.variance() + freqs_, 0.0f);
				203
				204	if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {
				205	AnalyzeClearBlock(power_target);
				206	++analysis_step_;
				207	if (analysis_step_ == variance_rate_) {
				208	analysis_step_ = 0;
				209	clear_variance_.Clear();
				210	noise_variance_.Clear();
				211	}
				212	}
				213	++block_count_;
				214	}
				215
				216	/* efidata(n,:) = sqrt(b(n)) * fidata(n,:) */
				217	gain_applier_.Apply(in_block, out_block);
				218	}
				219
				220	void IntelligibilityEnhancer::AnalyzeClearBlock(float power_target) {
				221	FilterVariance(clear_variance_.variance(), filtered_clear_var_.get());
				222	FilterVariance(noise_variance_.variance(), filtered_noise_var_.get());
				223
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	224	/* lambda binary search */
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	225
				226	float lambda_bot = -1.0f, lambda_top = -10e-18f, lambda;
				227	float power_bot, power_top, power;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	228	SolveEquation14(lambda_top, start_freq_, gains_eq_.get());
				229	power_top = DotProduct(gains_eq_.get(), filtered_clear_var_.get(),
				230	bank_size_);
				231	SolveEquation14(lambda_bot, start_freq_, gains_eq_.get());
				232	power_bot = DotProduct(gains_eq_.get(), filtered_clear_var_.get(),
				233	bank_size_);
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	234	DCHECK(power_target >= power_bot && power_target <= power_top);
				235
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	236	float power_ratio = 2.0f;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	237	int iters = 0;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	238	while (fabs(power_ratio - 1.0f) > 0.001f && iters <= 100) {
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	239	lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	240	SolveEquation14(lambda, start_freq_, gains_eq_.get());
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	241	power = DotProduct(gains_eq_.get(), filtered_clear_var_.get(), bank_size_);
				242	if (power < power_target) {
				243	lambda_bot = lambda;
				244	} else {
				245	lambda_top = lambda;
				246	}
				247	power_ratio = fabs(power / power_target);
				248	++iters;
				249	}
				250
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	251	/* b = filterbank' * b */
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	252	float* gains = gain_applier_.target();
				253	for (int i = 0; i < freqs_; ++i) {
				254	gains[i] = 0.0f;
				255	for (int j = 0; j < bank_size_; ++j) {
				256	gains[i] = fmaf(filter_bank_[j][i], gains_eq_[j], gains[i]);
				257	}
				258	}
				259	}
				260
				261	void IntelligibilityEnhancer::ProcessNoiseBlock(const complex<float>* in_block,
				262	complex<float>* /out_block/) {
				263	noise_variance_.Step(in_block);
				264	}
				265
				266	int IntelligibilityEnhancer::GetBankSize(int sample_rate, int erb_resolution) {
				267	float freq_limit = sample_rate / 2000.0f;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	268	int erb_scale = ceilf(11.17f * logf((freq_limit + 0.312f) /
				269	(freq_limit + 14.6575f)) + 43.0f);
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	270	return erb_scale * erb_resolution;
				271	}
				272
				273	void IntelligibilityEnhancer::CreateErbBank() {
				274	int lf = 1, rf = 4;
				275
				276	for (int i = 0; i < bank_size_; ++i) {
				277	float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));
				278	center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));
				279	center_freqs_[i] -= 14678.49f;
				280	}
				281	float last_center_freq = center_freqs_[bank_size_ - 1];
				282	for (int i = 0; i < bank_size_; ++i) {
				283	center_freqs_[i] = 0.5f sample_rate_hz_ / last_center_freq;
				284	}
				285
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	286	filter_bank_ = static_cast<float**>(malloc(
				287	sizeof(filter_bank_) bank_size_ +
				288	sizeof(*filter_bank_) freqs_ * bank_size_));
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	289	for (int i = 0; i < bank_size_; ++i) {
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	290	filter_bank_[i] = reinterpret_cast<float*>(filter_bank_ + bank_size_) +
				291	freqs_ * i;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	292	}
				293
				294	for (int i = 1; i <= bank_size_; ++i) {
				295	int lll, ll, rr, rrr;
				296	lll = round(center_freqs_[max(1, i - lf) - 1] * freqs_ /
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	297	(0.5f * sample_rate_hz_));
				298	ll = round(center_freqs_[max(1, i ) - 1] * freqs_ /
				299	(0.5f * sample_rate_hz_));
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	300	lll = min(freqs_, max(lll, 1)) - 1;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	301	ll = min(freqs_, max(ll, 1)) - 1;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	302
				303	rrr = round(center_freqs_[min(bank_size_, i + rf) - 1] * freqs_ /
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	304	(0.5f * sample_rate_hz_));
				305	rr = round(center_freqs_[min(bank_size_, i + 1) - 1] * freqs_ /
				306	(0.5f * sample_rate_hz_));
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	307	rrr = min(freqs_, max(rrr, 1)) - 1;
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	308	rr = min(freqs_, max(rr, 1)) - 1;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	309
				310	float step, element;
				311
				312	step = 1.0f / (ll - lll);
				313	element = 0.0f;
				314	for (int j = lll; j <= ll; ++j) {
				315	filter_bank_[i - 1][j] = element;
				316	element += step;
				317	}
				318	step = 1.0f / (rrr - rr);
				319	element = 1.0f;
				320	for (int j = rr; j <= rrr; ++j) {
				321	filter_bank_[i - 1][j] = element;
				322	element -= step;
				323	}
				324	for (int j = ll; j <= rr; ++j) {
				325	filter_bank_[i - 1][j] = 1.0f;
				326	}
				327	}
				328
				329	float sum;
				330	for (int i = 0; i < freqs_; ++i) {
				331	sum = 0.0f;
				332	for (int j = 0; j < bank_size_; ++j) {
				333	sum += filter_bank_[j][i];
				334	}
				335	for (int j = 0; j < bank_size_; ++j) {
				336	filter_bank_[j][i] /= sum;
				337	}
				338	}
				339	}
				340
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	341	void IntelligibilityEnhancer::SolveEquation14(float lambda, int start_freq,
				342	float* sols) {
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	343	bool quadratic = (kConfigRho < 1.0f);
				344	const float* var_x0 = filtered_clear_var_.get();
				345	const float* var_n0 = filtered_noise_var_.get();
				346
				347	for (int n = 0; n < start_freq; ++n) {
				348	sols[n] = 1.0f;
				349	}
				350	for (int n = start_freq - 1; n < bank_size_; ++n) {
				351	float alpha0, beta0, gamma0;
				352	gamma0 = 0.5f * rho_[n] * var_x0[n] * var_n0[n] +
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	353	lambda * var_x0[n] * var_n0[n] * var_n0[n];
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	354	beta0 = lambda * var_x0[n] * (2 - rho_[n]) * var_x0[n] * var_n0[n];
				355	if (quadratic) {
				356	alpha0 = lambda * var_x0[n] * (1 - rho_[n]) * var_x0[n] * var_x0[n];
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	357	sols[n] = (-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0))
				358	/ (2 * alpha0);
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	359	} else {
				360	sols[n] = -gamma0 / beta0;
				361	}
				362	sols[n] = fmax(0, sols[n]);
				363	}
				364	}
				365
				366	void IntelligibilityEnhancer::FilterVariance(const float* var, float* result) {
				367	for (int i = 0; i < bank_size_; ++i) {
				368	result[i] = DotProduct(filter_bank_[i], var, freqs_);
				369	}
				370	}
				371
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	372	float IntelligibilityEnhancer::DotProduct(const float* a, const float* b,
				373	int length) {
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	374	float ret = 0.0f;
				375
				376	for (int i = 0; i < length; ++i) {
				377	ret = fmaf(a[i], b[i], ret);
				378	}
				379	return ret;
				380	}
				381
				382	} // namespace webrtc
aluebs	c555b99	2015-06-16 20:26:16 -0700	[diff] [blame^]	383