Use VAD to get a better speech power estimation in the IntelligibilityEnhancer
R=henrik.lundin@webrtc.org, turaj@webrtc.org
Review URL: https://codereview.webrtc.org/1693823004 .
Cr-Commit-Position: refs/heads/master@{#11713}
diff --git a/webrtc/modules/audio_processing/audio_processing_impl.cc b/webrtc/modules/audio_processing/audio_processing_impl.cc
index e155171..bb746ee 100644
--- a/webrtc/modules/audio_processing/audio_processing_impl.cc
+++ b/webrtc/modules/audio_processing/audio_processing_impl.cc
@@ -1184,8 +1184,7 @@
}
bool AudioProcessingImpl::is_rev_processed() const {
- return constants_.intelligibility_enabled &&
- public_submodules_->intelligibility_enhancer->active();
+ return constants_.intelligibility_enabled;
}
bool AudioProcessingImpl::render_check_rev_conversion_needed() const {
@@ -1236,12 +1235,9 @@
void AudioProcessingImpl::InitializeIntelligibility() {
if (constants_.intelligibility_enabled) {
- IntelligibilityEnhancer::Config config;
- config.sample_rate_hz = capture_nonlocked_.split_rate;
- config.num_capture_channels = capture_.capture_audio->num_channels();
- config.num_render_channels = render_.render_audio->num_channels();
public_submodules_->intelligibility_enhancer.reset(
- new IntelligibilityEnhancer(config));
+ new IntelligibilityEnhancer(capture_nonlocked_.split_rate,
+ render_.render_audio->num_channels()));
}
}
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
index f0050a2..8f0e7bf 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
@@ -27,11 +27,16 @@
const size_t kErbResolution = 2;
const int kWindowSizeMs = 16;
const int kChunkSizeMs = 10; // Size provided by APM.
-const float kClipFreq = 200.0f;
-const float kConfigRho = 0.02f; // Default production and interpretation SNR.
+const float kClipFreqKhz = 0.2f;
const float kKbdAlpha = 1.5f;
const float kLambdaBot = -1.0f; // Extreme values in bisection
const float kLambdaTop = -10e-18f; // search for lamda.
+const float kVoiceProbabilityThreshold = 0.02f;
+// Number of chunks after voice activity that are still considered speech.
+const size_t kSpeechOffsetDelay = 80;
+const float kDecayRate = 0.98f; // Power estimation decay rate.
+const float kMaxRelativeGainChange = 0.04f; // Maximum relative change in gain.
+const float kRho = 0.0004f; // Default production and interpretation SNR.
// Returns dot product of vectors |a| and |b| with size |length|.
float DotProduct(const float* a, const float* b, size_t length) {
@@ -72,61 +77,46 @@
}
}
-IntelligibilityEnhancer::IntelligibilityEnhancer()
- : IntelligibilityEnhancer(IntelligibilityEnhancer::Config()) {
-}
-
-IntelligibilityEnhancer::IntelligibilityEnhancer(const Config& config)
+IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
+ size_t num_render_channels)
: freqs_(RealFourier::ComplexLength(
- RealFourier::FftOrder(config.sample_rate_hz * kWindowSizeMs / 1000))),
- window_size_(static_cast<size_t>(1 << RealFourier::FftOrder(freqs_))),
- chunk_length_(
- static_cast<size_t>(config.sample_rate_hz * kChunkSizeMs / 1000)),
- bank_size_(GetBankSize(config.sample_rate_hz, kErbResolution)),
- sample_rate_hz_(config.sample_rate_hz),
- erb_resolution_(kErbResolution),
- num_capture_channels_(config.num_capture_channels),
- num_render_channels_(config.num_render_channels),
- analysis_rate_(config.analysis_rate),
- active_(true),
- clear_power_(freqs_, config.decay_rate),
- noise_power_(freqs_, 0.f),
+ RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
+ chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
+ bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
+ sample_rate_hz_(sample_rate_hz),
+ num_render_channels_(num_render_channels),
+ clear_power_estimator_(freqs_, kDecayRate),
+ noise_power_estimator_(
+ new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
filtered_clear_pow_(new float[bank_size_]),
filtered_noise_pow_(new float[bank_size_]),
center_freqs_(new float[bank_size_]),
render_filter_bank_(CreateErbBank(freqs_)),
- rho_(new float[bank_size_]),
gains_eq_(new float[bank_size_]),
- gain_applier_(freqs_, config.gain_change_limit),
+ gain_applier_(freqs_, kMaxRelativeGainChange),
temp_render_out_buffer_(chunk_length_, num_render_channels_),
- kbd_window_(new float[window_size_]),
render_callback_(this),
- block_count_(0),
- analysis_step_(0) {
- RTC_DCHECK_LE(config.rho, 1.0f);
+ audio_s16_(chunk_length_),
+ chunks_since_voice_(kSpeechOffsetDelay),
+ is_speech_(false) {
+ RTC_DCHECK_LE(kRho, 1.f);
- memset(filtered_clear_pow_.get(),
- 0,
+ memset(filtered_clear_pow_.get(), 0,
bank_size_ * sizeof(filtered_clear_pow_[0]));
- memset(filtered_noise_pow_.get(),
- 0,
+ memset(filtered_noise_pow_.get(), 0,
bank_size_ * sizeof(filtered_noise_pow_[0]));
- // Assumes all rho equal.
- for (size_t i = 0; i < bank_size_; ++i) {
- rho_[i] = config.rho * config.rho;
- }
+ const size_t erb_index = static_cast<size_t>(
+ ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
+ 43.f));
+ start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution);
- float freqs_khz = kClipFreq / 1000.0f;
- size_t erb_index = static_cast<size_t>(ceilf(
- 11.17f * logf((freqs_khz + 0.312f) / (freqs_khz + 14.6575f)) + 43.0f));
- start_freq_ = std::max(static_cast<size_t>(1), erb_index * erb_resolution_);
-
- WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size_,
- kbd_window_.get());
+ size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
+ std::vector<float> kbd_window(window_size);
+ WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
render_mangler_.reset(new LappedTransform(
- num_render_channels_, num_render_channels_, chunk_length_,
- kbd_window_.get(), window_size_, window_size_ / 2, &render_callback_));
+ num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],
+ window_size, window_size / 2, &render_callback_));
}
void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
@@ -134,13 +124,10 @@
if (capture_filter_bank_.size() != bank_size_ ||
capture_filter_bank_[0].size() != noise.size()) {
capture_filter_bank_ = CreateErbBank(noise.size());
+ noise_power_estimator_.reset(
+ new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
}
- if (noise.size() != noise_power_.size()) {
- noise_power_.resize(noise.size());
- }
- for (size_t i = 0; i < noise.size(); ++i) {
- noise_power_[i] = noise[i] * noise[i];
- }
+ noise_power_estimator_->Step(&noise[0]);
}
void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
@@ -148,54 +135,29 @@
size_t num_channels) {
RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
RTC_CHECK_EQ(num_render_channels_, num_channels);
-
- if (active_) {
- render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
- }
-
- if (active_) {
- for (size_t i = 0; i < num_render_channels_; ++i) {
- memcpy(audio[i], temp_render_out_buffer_.channels()[i],
- chunk_length_ * sizeof(**audio));
- }
+ is_speech_ = IsSpeech(audio[0]);
+ render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
+ for (size_t i = 0; i < num_render_channels_; ++i) {
+ memcpy(audio[i], temp_render_out_buffer_.channels()[i],
+ chunk_length_ * sizeof(**audio));
}
}
void IntelligibilityEnhancer::ProcessClearBlock(
const std::complex<float>* in_block,
std::complex<float>* out_block) {
- if (block_count_ < 2) {
- memset(out_block, 0, freqs_ * sizeof(*out_block));
- ++block_count_;
- return;
+ if (is_speech_) {
+ clear_power_estimator_.Step(in_block);
}
-
- // TODO(ekm): Use VAD to |Step| and |AnalyzeClearBlock| only if necessary.
- if (true) {
- clear_power_.Step(in_block);
- if (block_count_ % analysis_rate_ == analysis_rate_ - 1) {
- AnalyzeClearBlock();
- ++analysis_step_;
- }
- ++block_count_;
- }
-
- if (active_) {
- gain_applier_.Apply(in_block, out_block);
- }
-}
-
-void IntelligibilityEnhancer::AnalyzeClearBlock() {
- const float* clear_power = clear_power_.Power();
- MapToErbBands(clear_power,
- render_filter_bank_,
+ const std::vector<float>& clear_power = clear_power_estimator_.power();
+ const std::vector<float>& noise_power = noise_power_estimator_->power();
+ MapToErbBands(&clear_power[0], render_filter_bank_,
filtered_clear_pow_.get());
- MapToErbBands(&noise_power_[0],
- capture_filter_bank_,
+ MapToErbBands(&noise_power[0], capture_filter_bank_,
filtered_noise_pow_.get());
SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());
- const float power_target = std::accumulate(
- clear_power, clear_power + freqs_, 0.f);
+ const float power_target =
+ std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);
const float power_top =
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());
@@ -205,6 +167,7 @@
SolveForLambda(power_target, power_bot, power_top);
UpdateErbGains();
} // Else experiencing power underflow, so do nothing.
+ gain_applier_.Apply(in_block, out_block);
}
void IntelligibilityEnhancer::SolveForLambda(float power_target,
@@ -217,11 +180,10 @@
1.f / (power_target + std::numeric_limits<float>::epsilon());
float lambda_bot = kLambdaBot;
float lambda_top = kLambdaTop;
- float power_ratio = 2.0f; // Ratio of achieved power to target power.
+ float power_ratio = 2.f; // Ratio of achieved power to target power.
int iters = 0;
- while (std::fabs(power_ratio - 1.0f) > kConvergeThresh &&
- iters <= kMaxIters) {
- const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.0f;
+ while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
+ const float lambda = lambda_bot + (lambda_top - lambda_bot) / 2.f;
SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());
const float power =
DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
@@ -239,7 +201,7 @@
// (ERB gain) = filterbank' * (freq gain)
float* gains = gain_applier_.target();
for (size_t i = 0; i < freqs_; ++i) {
- gains[i] = 0.0f;
+ gains[i] = 0.f;
for (size_t j = 0; j < bank_size_; ++j) {
gains[i] = fmaf(render_filter_bank_[j][i], gains_eq_[j], gains[i]);
}
@@ -248,9 +210,9 @@
size_t IntelligibilityEnhancer::GetBankSize(int sample_rate,
size_t erb_resolution) {
- float freq_limit = sample_rate / 2000.0f;
+ float freq_limit = sample_rate / 2000.f;
size_t erb_scale = static_cast<size_t>(ceilf(
- 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.0f));
+ 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f));
return erb_scale * erb_resolution;
}
@@ -260,7 +222,7 @@
size_t lf = 1, rf = 4;
for (size_t i = 0; i < bank_size_; ++i) {
- float abs_temp = fabsf((i + 1.0f) / static_cast<float>(erb_resolution_));
+ float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution));
center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp));
center_freqs_[i] -= 14678.49f;
}
@@ -274,48 +236,43 @@
}
for (size_t i = 1; i <= bank_size_; ++i) {
- size_t lll, ll, rr, rrr;
static const size_t kOne = 1; // Avoids repeated static_cast<>s below.
- lll = static_cast<size_t>(round(
- center_freqs_[std::max(kOne, i - lf) - 1] * num_freqs /
- (0.5f * sample_rate_hz_)));
- ll = static_cast<size_t>(round(
- center_freqs_[std::max(kOne, i) - 1] * num_freqs /
- (0.5f * sample_rate_hz_)));
+ size_t lll =
+ static_cast<size_t>(round(center_freqs_[std::max(kOne, i - lf) - 1] *
+ num_freqs / (0.5f * sample_rate_hz_)));
+ size_t ll = static_cast<size_t>(round(center_freqs_[std::max(kOne, i) - 1] *
+ num_freqs / (0.5f * sample_rate_hz_)));
lll = std::min(num_freqs, std::max(lll, kOne)) - 1;
ll = std::min(num_freqs, std::max(ll, kOne)) - 1;
- rrr = static_cast<size_t>(round(
- center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
- (0.5f * sample_rate_hz_)));
- rr = static_cast<size_t>(round(
- center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
- (0.5f * sample_rate_hz_)));
+ size_t rrr = static_cast<size_t>(
+ round(center_freqs_[std::min(bank_size_, i + rf) - 1] * num_freqs /
+ (0.5f * sample_rate_hz_)));
+ size_t rr = static_cast<size_t>(
+ round(center_freqs_[std::min(bank_size_, i + 1) - 1] * num_freqs /
+ (0.5f * sample_rate_hz_)));
rrr = std::min(num_freqs, std::max(rrr, kOne)) - 1;
rr = std::min(num_freqs, std::max(rr, kOne)) - 1;
- float step, element;
-
- step = ll == lll ? 0.f : 1.f / (ll - lll);
- element = 0.0f;
+ float step = ll == lll ? 0.f : 1.f / (ll - lll);
+ float element = 0.f;
for (size_t j = lll; j <= ll; ++j) {
filter_bank[i - 1][j] = element;
element += step;
}
step = rr == rrr ? 0.f : 1.f / (rrr - rr);
- element = 1.0f;
+ element = 1.f;
for (size_t j = rr; j <= rrr; ++j) {
filter_bank[i - 1][j] = element;
element -= step;
}
for (size_t j = ll; j <= rr; ++j) {
- filter_bank[i - 1][j] = 1.0f;
+ filter_bank[i - 1][j] = 1.f;
}
}
- float sum;
for (size_t i = 0; i < num_freqs; ++i) {
- sum = 0.0f;
+ float sum = 0.f;
for (size_t j = 0; j < bank_size_; ++j) {
sum += filter_bank[j][i];
}
@@ -329,22 +286,22 @@
void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda,
size_t start_freq,
float* sols) {
- bool quadratic = (kConfigRho < 1.0f);
+ bool quadratic = (kRho < 1.f);
const float* pow_x0 = filtered_clear_pow_.get();
const float* pow_n0 = filtered_noise_pow_.get();
for (size_t n = 0; n < start_freq; ++n) {
- sols[n] = 1.0f;
+ sols[n] = 1.f;
}
// Analytic solution for optimal gains. See paper for derivation.
for (size_t n = start_freq - 1; n < bank_size_; ++n) {
float alpha0, beta0, gamma0;
- gamma0 = 0.5f * rho_[n] * pow_x0[n] * pow_n0[n] +
+ gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] +
lambda * pow_x0[n] * pow_n0[n] * pow_n0[n];
- beta0 = lambda * pow_x0[n] * (2 - rho_[n]) * pow_x0[n] * pow_n0[n];
+ beta0 = lambda * pow_x0[n] * (2 - kRho) * pow_x0[n] * pow_n0[n];
if (quadratic) {
- alpha0 = lambda * pow_x0[n] * (1 - rho_[n]) * pow_x0[n] * pow_x0[n];
+ alpha0 = lambda * pow_x0[n] * (1 - kRho) * pow_x0[n] * pow_x0[n];
sols[n] =
(-beta0 - sqrtf(beta0 * beta0 - 4 * alpha0 * gamma0)) /
(2 * alpha0 + std::numeric_limits<float>::epsilon());
@@ -355,8 +312,15 @@
}
}
-bool IntelligibilityEnhancer::active() const {
- return active_;
+bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
+ FloatToS16(audio, chunk_length_, &audio_s16_[0]);
+ vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
+ if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
+ chunks_since_voice_ = 0;
+ } else if (chunks_since_voice_ < kSpeechOffsetDelay) {
+ ++chunks_since_voice_;
+ }
+ return chunks_since_voice_ < kSpeechOffsetDelay;
}
} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
index 2deb4d2..c18bac0 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
@@ -18,6 +18,7 @@
#include "webrtc/common_audio/lapped_transform.h"
#include "webrtc/common_audio/channel_buffer.h"
#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
+#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
namespace webrtc {
@@ -28,28 +29,7 @@
// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
class IntelligibilityEnhancer {
public:
- struct Config {
- // TODO(bercic): the |decay_rate|, |analysis_rate| and |gain_limit|
- // parameters should probably go away once fine tuning is done.
- Config()
- : sample_rate_hz(16000),
- num_capture_channels(1),
- num_render_channels(1),
- decay_rate(0.9f),
- analysis_rate(60),
- gain_change_limit(0.1f),
- rho(0.02f) {}
- int sample_rate_hz;
- size_t num_capture_channels;
- size_t num_render_channels;
- float decay_rate;
- int analysis_rate;
- float gain_change_limit;
- float rho;
- };
-
- explicit IntelligibilityEnhancer(const Config& config);
- IntelligibilityEnhancer(); // Initialize with default config.
+ IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);
// Sets the capture noise magnitude spectrum estimate.
void SetCaptureNoiseEstimate(std::vector<float> noise);
@@ -86,9 +66,6 @@
void ProcessClearBlock(const std::complex<float>* in_block,
std::complex<float>* out_block);
- // Computes and sets modified gains.
- void AnalyzeClearBlock();
-
// Bisection search for optimal |lambda|.
void SolveForLambda(float power_target, float power_bot, float power_top);
@@ -105,29 +82,25 @@
// Negative gains are set to 0. Stores the results in |sols|.
void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);
+ // Returns true if the audio is speech.
+ bool IsSpeech(const float* audio);
+
const size_t freqs_; // Num frequencies in frequency domain.
- const size_t window_size_; // Window size in samples; also the block size.
const size_t chunk_length_; // Chunk size in samples.
const size_t bank_size_; // Num ERB filters.
const int sample_rate_hz_;
- const int erb_resolution_;
- const size_t num_capture_channels_;
const size_t num_render_channels_;
- const int analysis_rate_; // Num blocks before gains recalculated.
- const bool active_; // Whether render gains are being updated.
- // TODO(ekm): Add logic for updating |active_|.
-
- intelligibility::PowerEstimator clear_power_;
- std::vector<float> noise_power_;
+ intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
+ std::unique_ptr<intelligibility::PowerEstimator<float>>
+ noise_power_estimator_;
std::unique_ptr<float[]> filtered_clear_pow_;
std::unique_ptr<float[]> filtered_noise_pow_;
std::unique_ptr<float[]> center_freqs_;
std::vector<std::vector<float>> capture_filter_bank_;
std::vector<std::vector<float>> render_filter_bank_;
size_t start_freq_;
- std::unique_ptr<float[]> rho_; // Production and interpretation SNR.
- // for each ERB band.
+
std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.
intelligibility::GainApplier gain_applier_;
@@ -135,11 +108,13 @@
// the original input array with modifications.
ChannelBuffer<float> temp_render_out_buffer_;
- std::unique_ptr<float[]> kbd_window_;
TransformCallback render_callback_;
std::unique_ptr<LappedTransform> render_mangler_;
- int block_count_;
- int analysis_step_;
+
+ VoiceActivityDetector vad_;
+ std::vector<int16_t> audio_s16_;
+ size_t chunks_since_voice_;
+ bool is_speech_;
};
} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
index b0f94ec..b59ae36 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
@@ -26,54 +26,184 @@
// Target output for ERB create test. Generated with matlab.
const float kTestCenterFreqs[] = {
- 13.169f, 26.965f, 41.423f, 56.577f, 72.461f, 89.113f, 106.57f, 124.88f,
- 144.08f, 164.21f, 185.34f, 207.5f, 230.75f, 255.16f, 280.77f, 307.66f,
- 335.9f, 365.56f, 396.71f, 429.44f, 463.84f, 500.f};
-const float kTestFilterBank[][9] = {
- {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.f},
- {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.5f},
- {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.5f}};
+ 14.5213f, 29.735f, 45.6781f, 62.3884f, 79.9058f, 98.2691f, 117.521f,
+ 137.708f, 158.879f, 181.084f, 204.378f, 228.816f, 254.459f, 281.371f,
+ 309.618f, 339.273f, 370.411f, 403.115f, 437.469f, 473.564f, 511.497f,
+ 551.371f, 593.293f, 637.386f, 683.77f, 732.581f, 783.96f, 838.06f,
+ 895.046f, 955.09f, 1018.38f, 1085.13f, 1155.54f, 1229.85f, 1308.32f,
+ 1391.22f, 1478.83f, 1571.5f, 1669.55f, 1773.37f, 1883.37f, 2000.f};
+const float kTestFilterBank[][33] = {
+ {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.157895f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.210526f, 0.117647f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.315789f, 0.176471f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.315789f, 0.352941f, 0.142857f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.352941f, 0.285714f,
+ 0.157895f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f,
+ 0.210526f, 0.111111f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.285714f, 0.315789f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.315789f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f,
+ 0.108108f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f,
+ 0.243243f, 0.153846f, 0.0833333f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f,
+ 0.324324f, 0.230769f, 0.166667f, 0.0909091f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.324324f, 0.307692f, 0.25f, 0.181818f, 0.0833333f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.307692f, 0.333333f,
+ 0.363636f, 0.25f, 0.151515f, 0.0793651f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.166667f, 0.363636f, 0.333333f, 0.242424f,
+ 0.190476f, 0.133333f, 0.0689655f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.30303f, 0.253968f, 0.2f, 0.137931f,
+ 0.0714286f, 0.f, 0.f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.30303f, 0.31746f, 0.333333f, 0.275862f, 0.214286f,
+ 0.125f, 0.0655738f, 0.f, 0.f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.15873f, 0.333333f, 0.344828f, 0.357143f,
+ 0.25f, 0.196721f, 0.137931f, 0.0816327f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.172414f, 0.357143f,
+ 0.3125f, 0.245902f, 0.172414f, 0.102041f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.3125f, 0.327869f, 0.344828f, 0.204082f, 0.f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.163934f, 0.344828f, 0.408163f, 0.5f},
+ {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.204082f, 0.5f}};
static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestFilterBank),
"Test filterbank badly initialized.");
// Target output for gain solving test. Generated with matlab.
const size_t kTestStartFreq = 12; // Lowest integral frequency for ERBs.
-const float kTestZeroVar[] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f,
- 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f,
- 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+const float kTestZeroVar[] = {
+ 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0};
static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestZeroVar),
"Power test data badly initialized.");
const float kTestNonZeroVarLambdaTop[] = {
- 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f,
- 1.f, 1.f, 1.f, 0.f, 0.f, 0.0351f, 0.0636f, 0.0863f,
- 0.1037f, 0.1162f, 0.1236f, 0.1251f, 0.1189f, 0.0993f};
+ 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+ 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0};
static_assert(arraysize(kTestCenterFreqs) ==
arraysize(kTestNonZeroVarLambdaTop),
"Power test data badly initialized.");
const float kMaxTestError = 0.005f;
// Enhancer initialization parameters.
-const int kSamples = 2000;
-const int kSampleRate = 1000;
+const int kSamples = 1000;
+const int kSampleRate = 4000;
const int kNumChannels = 1;
const int kFragmentSize = kSampleRate / 100;
@@ -83,13 +213,11 @@
protected:
IntelligibilityEnhancerTest()
: clear_data_(kSamples), noise_data_(kSamples), orig_data_(kSamples) {
- config_.sample_rate_hz = kSampleRate;
- enh_.reset(new IntelligibilityEnhancer(config_));
+ enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
}
bool CheckUpdate() {
- config_.sample_rate_hz = kSampleRate;
- enh_.reset(new IntelligibilityEnhancer(config_));
+ enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
float* clear_cursor = &clear_data_[0];
float* noise_cursor = &noise_data_[0];
for (int i = 0; i < kSamples; i += kFragmentSize) {
@@ -105,7 +233,6 @@
return false;
}
- IntelligibilityEnhancer::Config config_;
std::unique_ptr<IntelligibilityEnhancer> enh_;
std::vector<float> clear_data_;
std::vector<float> noise_data_;
@@ -115,9 +242,9 @@
// For each class of generated data, tests that render stream is updated when
// it should be.
TEST_F(IntelligibilityEnhancerTest, TestRenderUpdate) {
- std::fill(noise_data_.begin(), noise_data_.end(), 0.0f);
- std::fill(orig_data_.begin(), orig_data_.end(), 0.0f);
- std::fill(clear_data_.begin(), clear_data_.end(), 0.0f);
+ std::fill(noise_data_.begin(), noise_data_.end(), 0.f);
+ std::fill(orig_data_.begin(), orig_data_.end(), 0.f);
+ std::fill(clear_data_.begin(), clear_data_.end(), 0.f);
EXPECT_FALSE(CheckUpdate());
std::srand(1);
auto float_rand = []() { return std::rand() * 2.f / RAND_MAX - 1; };
@@ -148,9 +275,8 @@
std::vector<float> sols(enh_->bank_size_);
float lambda = -0.001f;
for (size_t i = 0; i < enh_->bank_size_; i++) {
- enh_->filtered_clear_pow_[i] = 0.0f;
- enh_->filtered_noise_pow_[i] = 0.0f;
- enh_->rho_[i] = 0.02f;
+ enh_->filtered_clear_pow_[i] = 0.f;
+ enh_->filtered_noise_pow_[i] = 0.f;
}
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
for (size_t i = 0; i < enh_->bank_size_; i++) {
@@ -164,7 +290,7 @@
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
}
- lambda = -1.0;
+ lambda = -1.f;
enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestZeroVar[i], sols[i], kMaxTestError);
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
index 6c44415..6d37199 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
@@ -14,6 +14,7 @@
#include <stdlib.h>
#include <string.h>
#include <algorithm>
+#include <limits>
namespace webrtc {
@@ -21,45 +22,38 @@
namespace {
-// Return |current| changed towards |target|, with the change being at most
-// |limit|.
+// Returns |current| scaled towards |target|, with the scale factor clamped to
+// [1 - |limit|, 1 + |limit|]. Epsilon is added to the divisor and the result.
float UpdateFactor(float target, float current, float limit) {
- float delta = fabsf(target - current);
- float sign = copysign(1.f, target - current);
- return current + sign * fminf(delta, limit);
+ float gain = target / (current + std::numeric_limits<float>::epsilon());
+ if (gain < 1.f - limit) {
+ gain = 1.f - limit;
+ } else if (gain > 1.f + limit) {
+ gain = 1.f + limit;
+ }
+ return current * gain + std::numeric_limits<float>::epsilon();
}
} // namespace
-PowerEstimator::PowerEstimator(size_t num_freqs,
- float decay)
- : magnitude_(new float[num_freqs]()),
- power_(new float[num_freqs]()),
- num_freqs_(num_freqs),
- decay_(decay) {
- memset(magnitude_.get(), 0, sizeof(*magnitude_.get()) * num_freqs_);
- memset(power_.get(), 0, sizeof(*power_.get()) * num_freqs_);
-}
+template<typename T>
+PowerEstimator<T>::PowerEstimator(size_t num_freqs, float decay)
+ : power_(num_freqs, 0.f), decay_(decay) {}  // All powers start at zero.
-// Compute the magnitude from the beginning, with exponential decaying of the
-// series data.
-void PowerEstimator::Step(const std::complex<float>* data) {
- for (size_t i = 0; i < num_freqs_; ++i) {
- magnitude_[i] = decay_ * magnitude_[i] +
- (1.f - decay_) * std::abs(data[i]);
+// Folds one new input array into the running estimate. Each power_[i] becomes
+// decay_ * power_[i] + (1 - decay_) * |data[i]|^2.
+// |data| must hold at least power_.size() elements.
+template<typename T>
+void PowerEstimator<T>::Step(const T* data) {
+  for (size_t i = 0; i < power_.size(); ++i) {
+    // Take the magnitude once; for complex input std::abs is a library call.
+    const float magnitude = std::abs(data[i]);
+    power_[i] = decay_ * power_[i] + (1.f - decay_) * magnitude * magnitude;
}
}
-const float* PowerEstimator::Power() {
- for (size_t i = 0; i < num_freqs_; ++i) {
- power_[i] = magnitude_[i] * magnitude_[i];
- }
- return &power_[0];
-}
+template class PowerEstimator<float>;
+template class PowerEstimator<std::complex<float>>;
-GainApplier::GainApplier(size_t freqs, float change_limit)
+GainApplier::GainApplier(size_t freqs, float relative_change_limit)
: num_freqs_(freqs),
- change_limit_(change_limit),
+ relative_change_limit_(relative_change_limit),
target_(new float[freqs]()),
current_(new float[freqs]()) {
for (size_t i = 0; i < freqs; ++i) {
@@ -71,12 +65,8 @@
void GainApplier::Apply(const std::complex<float>* in_block,
std::complex<float>* out_block) {
for (size_t i = 0; i < num_freqs_; ++i) {
- float factor = sqrtf(fabsf(current_[i]));
- if (!std::isnormal(factor)) {
- factor = 1.f;
- }
- out_block[i] = factor * in_block[i];
- current_[i] = UpdateFactor(target_[i], current_[i], change_limit_);
+ current_[i] = UpdateFactor(target_[i], current_[i], relative_change_limit_);
+ out_block[i] = sqrtf(fabsf(current_[i])) * in_block[i];
}
}
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
index 8858cff..3805a0c 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
@@ -13,6 +13,7 @@
#include <complex>
#include <memory>
+#include <vector>
namespace webrtc {
@@ -21,6 +22,7 @@
// Internal helper for computing the power of a stream of arrays.
// The result is an array of power per position: the i-th power is the power of
// the stream of data on the i-th positions in the input arrays.
+template <typename T>
class PowerEstimator {
public:
// Construct an instance for the given input array length (|freqs|), with the
@@ -28,31 +30,24 @@
PowerEstimator(size_t freqs, float decay);
// Add a new data point to the series.
- void Step(const std::complex<float>* data);
+ void Step(const T* data);
// The current power array.
- const float* Power();
+  const std::vector<float>& power() const { return power_; }
private:
- // TODO(ekmeyerson): Switch the following running means
- // and histories from std::unique_ptr to std::vector.
- std::unique_ptr<std::complex<float>[]> running_mean_sq_;
-
- // The current magnitude array.
- std::unique_ptr<float[]> magnitude_;
// The current power array.
- std::unique_ptr<float[]> power_;
+ std::vector<float> power_;
- const size_t num_freqs_;
const float decay_;
};
// Helper class for smoothing gain changes. On each application step, the
// currently used gains are changed towards a set of settable target gains,
-// constrained by a limit on the magnitude of the changes.
+// constrained by a limit on the relative changes.
class GainApplier {
public:
- GainApplier(size_t freqs, float change_limit);
+ GainApplier(size_t freqs, float relative_change_limit);
// Copy |in_block| to |out_block|, multiplied by the current set of gains,
// and step the current set of gains towards the target set.
@@ -64,7 +59,7 @@
private:
const size_t num_freqs_;
- const float change_limit_;
+ const float relative_change_limit_;
std::unique_ptr<float[]> target_;
std::unique_ptr<float[]> current_;
};
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
index 43ad9a7..28957bb 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
@@ -39,17 +39,16 @@
const float kDecay = 0.5f;
const std::vector<std::vector<std::complex<float>>> test_data(
GenerateTestData(kFreqs, kSamples));
- PowerEstimator power_estimator(kFreqs, kDecay);
- EXPECT_EQ(0, power_estimator.Power()[0]);
+ PowerEstimator<std::complex<float>> power_estimator(kFreqs, kDecay);
+ EXPECT_EQ(0, power_estimator.power()[0]);
// Makes sure Step is doing something.
power_estimator.Step(&test_data[0][0]);
for (size_t i = 1; i < kSamples; ++i) {
power_estimator.Step(&test_data[i][0]);
for (size_t j = 0; j < kFreqs; ++j) {
- const float* power = power_estimator.Power();
- EXPECT_GE(power[j], 0.f);
- EXPECT_LE(power[j], 1.f);
+ EXPECT_GE(power_estimator.power()[j], 0.f);
+ EXPECT_LE(power_estimator.power()[j], 1.f);
}
}
}
@@ -62,8 +61,8 @@
GainApplier gain_applier(kFreqs, kChangeLimit);
const std::vector<std::vector<std::complex<float>>> in_data(
GenerateTestData(kFreqs, kSamples));
- std::vector<std::vector<std::complex<float>>> out_data(GenerateTestData(
- kFreqs, kSamples));
+ std::vector<std::vector<std::complex<float>>> out_data(
+ GenerateTestData(kFreqs, kSamples));
for (size_t i = 0; i < kSamples; ++i) {
gain_applier.Apply(&in_data[i][0], &out_data[i][0]);
for (size_t j = 0; j < kFreqs; ++j) {
diff --git a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
index 1ec85f0..ab8524b 100644
--- a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
+++ b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
@@ -30,44 +30,24 @@
namespace webrtc {
namespace {
-DEFINE_double(clear_alpha, 0.9, "Power decay factor for clear data.");
-DEFINE_int32(sample_rate,
- 16000,
- "Audio sample rate used in the input and output files.");
-DEFINE_int32(ana_rate,
- 60,
- "Analysis rate; gains recalculated every N blocks.");
-DEFINE_double(gain_limit, 1000.0, "Maximum gain change in one block.");
-
DEFINE_string(clear_file, "speech.wav", "Input file with clear speech.");
DEFINE_string(noise_file, "noise.wav", "Input file with noise data.");
DEFINE_string(out_file, "proc_enhanced.wav", "Enhanced output file.");
-const size_t kNumChannels = 1;
-
// void function for gtest
void void_main(int argc, char* argv[]) {
google::SetUsageMessage(
"\n\nInput files must be little-endian 16-bit signed raw PCM.\n");
google::ParseCommandLineFlags(&argc, &argv, true);
- size_t samples; // Number of samples in input PCM file
- size_t fragment_size; // Number of samples to process at a time
- // to simulate APM stream processing
-
// Load settings and wav input.
-
- fragment_size = FLAGS_sample_rate / 100; // Mirror real time APM chunk size.
- // Duplicates chunk_length_ in
- // IntelligibilityEnhancer.
-
struct stat in_stat, noise_stat;
ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0)
<< "Empty speech file.";
ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0)
<< "Empty noise file.";
- samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
+ const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
WavReader in_file(FLAGS_clear_file);
std::vector<float> in_fpcm(samples);
@@ -80,23 +60,19 @@
FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]);
// Run intelligibility enhancement.
- IntelligibilityEnhancer::Config config;
- config.sample_rate_hz = FLAGS_sample_rate;
- config.decay_rate = static_cast<float>(FLAGS_clear_alpha);
- config.analysis_rate = FLAGS_ana_rate;
- config.gain_change_limit = FLAGS_gain_limit;
- IntelligibilityEnhancer enh(config);
+ IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());
rtc::CriticalSection crit;
NoiseSuppressionImpl ns(&crit);
- ns.Initialize(kNumChannels, FLAGS_sample_rate);
+ ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
ns.Enable(true);
- AudioBuffer capture_audio(fragment_size,
- kNumChannels,
- fragment_size,
- kNumChannels,
+ // Mirror real time APM chunk size. Duplicates chunk_length_ in
+ // IntelligibilityEnhancer.
+ size_t fragment_size = in_file.sample_rate() / 100;
+ AudioBuffer capture_audio(fragment_size, noise_file.num_channels(),
+ fragment_size, noise_file.num_channels(),
fragment_size);
- StreamConfig stream_config(FLAGS_sample_rate, kNumChannels);
+ StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels());
// Slice the input into smaller chunks, as the APM would do, and feed them
// through the enhancer.
@@ -108,14 +84,17 @@
ns.AnalyzeCaptureAudio(&capture_audio);
ns.ProcessCaptureAudio(&capture_audio);
enh.SetCaptureNoiseEstimate(ns.NoiseEstimate());
- enh.ProcessRenderAudio(&clear_cursor, FLAGS_sample_rate, kNumChannels);
+ enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(),
+ in_file.num_channels());
clear_cursor += fragment_size;
noise_cursor += fragment_size;
}
FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]);
- WavWriter out_file(FLAGS_out_file, FLAGS_sample_rate, kNumChannels);
+ WavWriter out_file(FLAGS_out_file,
+ in_file.sample_rate(),
+ in_file.num_channels());
out_file.WriteSamples(&in_fpcm[0], samples);
}
diff --git a/webrtc/modules/audio_processing/noise_suppression_impl.cc b/webrtc/modules/audio_processing/noise_suppression_impl.cc
index 076f1ba..7f19005 100644
--- a/webrtc/modules/audio_processing/noise_suppression_impl.cc
+++ b/webrtc/modules/audio_processing/noise_suppression_impl.cc
@@ -182,8 +182,8 @@
for (auto& suppressor : suppressors_) {
const float* noise = WebRtcNs_noise_estimate(suppressor->state());
for (size_t i = 0; i < noise_estimate.size(); ++i) {
- noise_estimate[i] += kNormalizationFactor *
- noise[i] / suppressors_.size();
+ noise_estimate[i] +=
+ kNormalizationFactor * noise[i] / suppressors_.size();
}
}
#elif defined(WEBRTC_NS_FIXED)