Blame - webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h - webrtc.googlesource.com/src

blob: 8125707f120981c40817152a80bcbea43f2e3006 [file] [log] [blame]

ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	11	//
				12	// Specifies core class for intelligibility enhancement.
				13	//
				14
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	15	#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_
				16	#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_
				17
				18	#include <complex>
				19
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	20	#include "webrtc/base/scoped_ptr.h"
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	21	#include "webrtc/common_audio/lapped_transform.h"
				22	#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	23
				24	struct WebRtcVadInst;
				25	typedef struct WebRtcVadInst VadInst;
				26
				27	namespace webrtc {
				28
				29	// Speech intelligibility enhancement module. Reads render and capture
				30	// audio streams and modifies the render stream with a set of gains per
				31	// frequency bin to enhance speech against the noise background.
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	32	// Note: assumes speech and noise streams are already separated.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	33	class IntelligibilityEnhancer {
				34	public:
				35	// Constructs a new instance with the given filter bank resolution,
				36	// sampling rate, number of channels and analysis rates.
				37	// |analysis_rate| sets the number of input blocks (containing speech!)
				38	// to elapse before a new gain computation is made. |variance_rate| specifies
				39	// the number of gain recomputations after which the variances are reset.
				40	// |cv_*| are parameters for the VarianceArray constructor for the
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	41	// clear speech stream.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	42	// TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should
				43	// probably go away once fine tuning is done. They override the internal
				44	// constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate).
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	45	IntelligibilityEnhancer(int erb_resolution,
				46	int sample_rate_hz,
				47	int channels,
				48	int cv_type,
				49	float cv_alpha,
				50	int cv_win,
				51	int analysis_rate,
				52	int variance_rate,
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	53	float gain_limit);
				54	~IntelligibilityEnhancer();
				55
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	56	// Reads and processes a chunk of the noise stream in the time domain.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	57	void ProcessCaptureAudio(float* const* audio);
				58
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	59	// Reads a chunk of speech in the time domain and updates it with the modified signal.
				60	void ProcessRenderAudio(float* const* audio);
				61
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	62	private:
				63	enum AudioSource {
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	64	kRenderStream = 0, // Clear speech stream.
				65	kCaptureStream, // Noise stream.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	66	};
				67
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	68	// Provides access point to the frequency domain.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	69	class TransformCallback : public LappedTransform::Callback {
				70	public:
				71	TransformCallback(IntelligibilityEnhancer* parent, AudioSource source);
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	72
				73	// All in frequency domain, receives input |in_block|, applies
				74	// intelligibility enhancement, and writes result to |out_block|.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	75	virtual void ProcessAudioBlock(const std::complex<float>* const* in_block,
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	76	int in_channels,
				77	int frames,
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	78	int out_channels,
				79	std::complex<float>* const* out_block);
				80
				81	private:
				82	IntelligibilityEnhancer* parent_;
				83	AudioSource source_;
				84	};
				85	friend class TransformCallback; // Grants TransformCallback access to private members.
				86
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	87	// Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source.
				88	void DispatchAudio(AudioSource source,
				89	const std::complex<float>* in_block,
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	90	std::complex<float>* out_block);
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	91
				92	// Updates variance computation and analysis with |in_block|,
				93	// and writes modified speech to |out_block|.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	94	void ProcessClearBlock(const std::complex<float>* in_block,
				95	std::complex<float>* out_block);
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	96
				97	// Computes and sets modified gains.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	98	void AnalyzeClearBlock(float power_target);
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	99
				100	// Updates variance calculation for noise input with |in_block|.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	101	void ProcessNoiseBlock(const std::complex<float>* in_block,
				102	std::complex<float>* out_block);
				103
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	104	// Returns number of ERB filters.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	105	static int GetBankSize(int sample_rate, int erb_resolution);
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	106
				107	// Initializes ERB filterbank.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	108	void CreateErbBank();
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	109
				110	// Analytically solves quadratic for optimal gains given |lambda|.
				111	// Negative gains are set to 0. Stores the results in |sols|.
				112	void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols);
				113
				114	// Computes variance across ERB filters from freq variance |var|.
				115	// Stores in |result|.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	116	void FilterVariance(const float* var, float* result);
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	117
				118	// Returns dot product of vectors specified by size |length| arrays |a|, |b|.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	119	static float DotProduct(const float* a, const float* b, int length);
				120
				121	static const int kErbResolution;
				122	static const int kWindowSizeMs;
				123	static const int kChunkSizeMs;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	124	static const int kAnalyzeRate; // Default for |analysis_rate_|.
				125	static const int kVarianceRate; // Default for |variance_rate_|.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	126	static const float kClipFreq;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	127	static const float kConfigRho; // Default production and interpretation SNR.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	128	static const float kKbdAlpha;
				129	static const float kGainChangeLimit;
				130
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	131	const int freqs_; // Num frequencies in frequency domain.
				132	const int window_size_; // Window size in samples; also the block size.
				133	const int chunk_length_; // Chunk size in samples.
				134	const int bank_size_; // Num ERB filters.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	135	const int sample_rate_hz_;
				136	const int erb_resolution_;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	137	const int channels_; // Num channels.
				138	const int analysis_rate_; // Num blocks before gains recalculated.
				139	const int variance_rate_; // Num recalculations before history is cleared.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	140
				141	intelligibility::VarianceArray clear_variance_;
				142	intelligibility::VarianceArray noise_variance_;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	143	rtc::scoped_ptr<float[]> filtered_clear_var_;
				144	rtc::scoped_ptr<float[]> filtered_noise_var_;
				145	float** filter_bank_; // TODO(ekmeyerson): Switch to using ChannelBuffer.
				146	rtc::scoped_ptr<float[]> center_freqs_;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	147	int start_freq_;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	148	rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR
				149	// for each ERB band.
				150	rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	151	intelligibility::GainApplier gain_applier_;
				152
				153	// Destination buffer used to reassemble blocked chunks before overwriting
				154	// the original input array with modifications.
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	155	// TODO(ekmeyerson): Switch to using ChannelBuffer.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	156	float** temp_out_buffer_;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	157
				158	rtc::scoped_ptr<float* []> input_audio_;
				159	rtc::scoped_ptr<float[]> kbd_window_;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	160	TransformCallback render_callback_;
				161	TransformCallback capture_callback_;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	162	rtc::scoped_ptr<LappedTransform> render_mangler_;
				163	rtc::scoped_ptr<LappedTransform> capture_mangler_;
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	164	int block_count_;
				165	int analysis_step_;
				166
				167	// TODO(bercic): Quick stopgap measure for voice detection in the clear
				168	// and noise streams.
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	169	// Note: VAD currently does not affect anything in IntelligibilityEnhancer.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	170	VadInst* vad_high_;
				171	VadInst* vad_low_;
ekm	b7553df	2015-06-16 18:57:32 -0700	[diff] [blame^]	172	rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_;
				173	bool has_voice_low_; // Whether voice is detected in the speech stream.
ekm	030249d	2015-06-15 13:02:24 -0700	[diff] [blame]	174	};
				175
				176	} // namespace webrtc
				177
				178	#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_