Fix stereo support in IntelligibilityEnhancer
Review URL: https://codereview.webrtc.org/1729753003
Cr-Commit-Position: refs/heads/master@{#11795}
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
index 38a7ea3..d8f95ed 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
@@ -54,29 +54,12 @@
float* result) {
for (size_t i = 0; i < filter_bank.size(); ++i) {
RTC_DCHECK_GT(filter_bank[i].size(), 0u);
- result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size());
+ result[i] = DotProduct(filter_bank[i].data(), pow, filter_bank[i].size());
}
}
} // namespace
-IntelligibilityEnhancer::TransformCallback::TransformCallback(
- IntelligibilityEnhancer* parent)
- : parent_(parent) {
-}
-
-void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock(
- const std::complex<float>* const* in_block,
- size_t in_channels,
- size_t frames,
- size_t /* out_channels */,
- std::complex<float>* const* out_block) {
- RTC_DCHECK_EQ(parent_->freqs_, frames);
- for (size_t i = 0; i < in_channels; ++i) {
- parent_->ProcessClearBlock(in_block[i], out_block[i]);
- }
-}
-
IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
size_t num_render_channels)
: freqs_(RealFourier::ComplexLength(
@@ -88,24 +71,17 @@
clear_power_estimator_(freqs_, kDecayRate),
noise_power_estimator_(
new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
- filtered_clear_pow_(new float[bank_size_]),
- filtered_noise_pow_(new float[bank_size_]),
- center_freqs_(new float[bank_size_]),
+ filtered_clear_pow_(bank_size_, 0.f),
+ filtered_noise_pow_(bank_size_, 0.f),
+ center_freqs_(bank_size_),
render_filter_bank_(CreateErbBank(freqs_)),
- gains_eq_(new float[bank_size_]),
+ gains_eq_(bank_size_),
gain_applier_(freqs_, kMaxRelativeGainChange),
- temp_render_out_buffer_(chunk_length_, num_render_channels_),
- render_callback_(this),
audio_s16_(chunk_length_),
chunks_since_voice_(kSpeechOffsetDelay),
is_speech_(false) {
RTC_DCHECK_LE(kRho, 1.f);
- memset(filtered_clear_pow_.get(), 0,
- bank_size_ * sizeof(filtered_clear_pow_[0]));
- memset(filtered_noise_pow_.get(), 0,
- bank_size_ * sizeof(filtered_noise_pow_[0]));
-
const size_t erb_index = static_cast<size_t>(
ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) +
43.f));
@@ -113,10 +89,11 @@
size_t window_size = static_cast<size_t>(1 << RealFourier::FftOrder(freqs_));
std::vector<float> kbd_window(window_size);
- WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]);
+ WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size,
+ kbd_window.data());
render_mangler_.reset(new LappedTransform(
- num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0],
- window_size, window_size / 2, &render_callback_));
+ num_render_channels_, num_render_channels_, chunk_length_,
+ kbd_window.data(), window_size, window_size / 2, this));
}
void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
@@ -127,7 +104,7 @@
noise_power_estimator_.reset(
new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
}
- noise_power_estimator_->Step(&noise[0]);
+ noise_power_estimator_->Step(noise.data());
}
void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
@@ -136,38 +113,40 @@
RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
RTC_CHECK_EQ(num_render_channels_, num_channels);
is_speech_ = IsSpeech(audio[0]);
- render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels());
- for (size_t i = 0; i < num_render_channels_; ++i) {
- memcpy(audio[i], temp_render_out_buffer_.channels()[i],
- chunk_length_ * sizeof(**audio));
- }
+ render_mangler_->ProcessChunk(audio, audio);
}
-void IntelligibilityEnhancer::ProcessClearBlock(
- const std::complex<float>* in_block,
- std::complex<float>* out_block) {
+void IntelligibilityEnhancer::ProcessAudioBlock(
+ const std::complex<float>* const* in_block,
+ size_t in_channels,
+ size_t frames,
+ size_t /* out_channels */,
+ std::complex<float>* const* out_block) {
+ RTC_DCHECK_EQ(freqs_, frames);
if (is_speech_) {
- clear_power_estimator_.Step(in_block);
+ clear_power_estimator_.Step(in_block[0]);
}
const std::vector<float>& clear_power = clear_power_estimator_.power();
const std::vector<float>& noise_power = noise_power_estimator_->power();
- MapToErbBands(&clear_power[0], render_filter_bank_,
- filtered_clear_pow_.get());
- MapToErbBands(&noise_power[0], capture_filter_bank_,
- filtered_noise_pow_.get());
- SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get());
+ MapToErbBands(clear_power.data(), render_filter_bank_,
+ filtered_clear_pow_.data());
+ MapToErbBands(noise_power.data(), capture_filter_bank_,
+ filtered_noise_pow_.data());
+ SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());
const float power_target =
- std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f);
+ std::accumulate(clear_power.data(), clear_power.data() + freqs_, 0.f);
const float power_top =
- DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
- SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get());
+ DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
+ SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());
const float power_bot =
- DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
+ DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
if (power_target >= power_bot && power_target <= power_top) {
SolveForLambda(power_target);
UpdateErbGains();
} // Else experiencing power underflow, so do nothing.
- gain_applier_.Apply(in_block, out_block);
+ for (size_t i = 0; i < in_channels; ++i) {
+ gain_applier_.Apply(in_block[i], out_block[i]);
+ }
}
void IntelligibilityEnhancer::SolveForLambda(float power_target) {
@@ -182,9 +161,9 @@
int iters = 0;
while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) {
const float lambda = (lambda_bot + lambda_top) / 2.f;
- SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get());
+ SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data());
const float power =
- DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_);
+ DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
if (power < power_target) {
lambda_bot = lambda;
} else {
@@ -286,8 +265,8 @@
float* sols) {
const float kMinPower = 1e-5f;
- const float* pow_x0 = filtered_clear_pow_.get();
- const float* pow_n0 = filtered_noise_pow_.get();
+ const float* pow_x0 = filtered_clear_pow_.data();
+ const float* pow_n0 = filtered_noise_pow_.data();
for (size_t n = 0; n < start_freq; ++n) {
sols[n] = 1.f;
@@ -316,8 +295,8 @@
}
bool IntelligibilityEnhancer::IsSpeech(const float* audio) {
- FloatToS16(audio, chunk_length_, &audio_s16_[0]);
- vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_);
+ FloatToS16(audio, chunk_length_, audio_s16_.data());
+ vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_);
if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) {
chunks_since_voice_ = 0;
} else if (chunks_since_voice_ < kSpeechOffsetDelay) {
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
index 22a3eab..3b46d16 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
@@ -27,7 +27,7 @@
// frequency bin to enhance speech against the noise background.
// Details of the model and algorithm can be found in the original paper:
// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
-class IntelligibilityEnhancer {
+class IntelligibilityEnhancer : public LappedTransform::Callback {
public:
IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);
@@ -40,32 +40,19 @@
size_t num_channels);
bool active() const;
+ protected:
+ // All in frequency domain, receives input |in_block|, applies
+ // intelligibility enhancement, and writes result to |out_block|.
+ void ProcessAudioBlock(const std::complex<float>* const* in_block,
+ size_t in_channels,
+ size_t frames,
+ size_t out_channels,
+ std::complex<float>* const* out_block) override;
+
private:
- // Provides access point to the frequency domain.
- class TransformCallback : public LappedTransform::Callback {
- public:
- TransformCallback(IntelligibilityEnhancer* parent);
-
- // All in frequency domain, receives input |in_block|, applies
- // intelligibility enhancement, and writes result to |out_block|.
- void ProcessAudioBlock(const std::complex<float>* const* in_block,
- size_t in_channels,
- size_t frames,
- size_t out_channels,
- std::complex<float>* const* out_block) override;
-
- private:
- IntelligibilityEnhancer* parent_;
- };
- friend class TransformCallback;
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);
- // Updates power computation and analysis with |in_block_|,
- // and writes modified speech to |out_block|.
- void ProcessClearBlock(const std::complex<float>* in_block,
- std::complex<float>* out_block);
-
// Bisection search for optimal |lambda|.
void SolveForLambda(float power_target);
@@ -94,21 +81,16 @@
intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
std::unique_ptr<intelligibility::PowerEstimator<float>>
noise_power_estimator_;
- std::unique_ptr<float[]> filtered_clear_pow_;
- std::unique_ptr<float[]> filtered_noise_pow_;
- std::unique_ptr<float[]> center_freqs_;
+ std::vector<float> filtered_clear_pow_;
+ std::vector<float> filtered_noise_pow_;
+ std::vector<float> center_freqs_;
std::vector<std::vector<float>> capture_filter_bank_;
std::vector<std::vector<float>> render_filter_bank_;
size_t start_freq_;
- std::unique_ptr<float[]> gains_eq_; // Pre-filter modified gains.
+ std::vector<float> gains_eq_; // Pre-filter modified gains.
intelligibility::GainApplier gain_applier_;
- // Destination buffers used to reassemble blocked chunks before overwriting
- // the original input array with modifications.
- ChannelBuffer<float> temp_render_out_buffer_;
-
- TransformCallback render_callback_;
std::unique_ptr<LappedTransform> render_mangler_;
VoiceActivityDetector vad_;
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
index ebfb67a..dd5b681 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
@@ -213,8 +213,8 @@
bool CheckUpdate() {
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
- float* clear_cursor = &clear_data_[0];
- float* noise_cursor = &noise_data_[0];
+ float* clear_cursor = clear_data_.data();
+ float* noise_cursor = noise_data_.data();
for (int i = 0; i < kSamples; i += kFragmentSize) {
enh_->ProcessRenderAudio(&clear_cursor, kSampleRate, kNumChannels);
clear_cursor += kFragmentSize;
@@ -273,7 +273,7 @@
enh_->filtered_clear_pow_[i] = 0.f;
enh_->filtered_noise_pow_[i] = 0.f;
}
- enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
+ enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestZeroVar, sols[i], kMaxTestError);
}
@@ -281,12 +281,12 @@
enh_->filtered_clear_pow_[i] = static_cast<float>(i + 1);
enh_->filtered_noise_pow_[i] = static_cast<float>(enh_->bank_size_ - i);
}
- enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
+ enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
}
lambda = -1.f;
- enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]);
+ enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
for (size_t i = 0; i < enh_->bank_size_; i++) {
EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
}
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
index 6d37199..3a9433b 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
@@ -54,13 +54,8 @@
GainApplier::GainApplier(size_t freqs, float relative_change_limit)
: num_freqs_(freqs),
relative_change_limit_(relative_change_limit),
- target_(new float[freqs]()),
- current_(new float[freqs]()) {
- for (size_t i = 0; i < freqs; ++i) {
- target_[i] = 1.f;
- current_[i] = 1.f;
- }
-}
+ target_(freqs, 1.f),
+ current_(freqs, 1.f) {}
void GainApplier::Apply(const std::complex<float>* in_block,
std::complex<float>* out_block) {
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
index 3805a0c..11b9e49 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
@@ -12,7 +12,6 @@
#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_
#include <complex>
-#include <memory>
#include <vector>
namespace webrtc {
@@ -55,13 +54,13 @@
std::complex<float>* out_block);
// Return the current target gain set. Modify this array to set the targets.
- float* target() const { return target_.get(); }
+ float* target() { return target_.data(); }
private:
const size_t num_freqs_;
const float relative_change_limit_;
- std::unique_ptr<float[]> target_;
- std::unique_ptr<float[]> current_;
+ std::vector<float> target_;
+ std::vector<float> current_;
};
} // namespace intelligibility
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
index 28957bb..08e8368 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
@@ -43,9 +43,9 @@
EXPECT_EQ(0, power_estimator.power()[0]);
// Makes sure Step is doing something.
- power_estimator.Step(&test_data[0][0]);
+ power_estimator.Step(test_data[0].data());
for (size_t i = 1; i < kSamples; ++i) {
- power_estimator.Step(&test_data[i][0]);
+ power_estimator.Step(test_data[i].data());
for (size_t j = 0; j < kFreqs; ++j) {
EXPECT_GE(power_estimator.power()[j], 0.f);
EXPECT_LE(power_estimator.power()[j], 1.f);
@@ -64,7 +64,7 @@
std::vector<std::vector<std::complex<float>>> out_data(
GenerateTestData(kFreqs, kSamples));
for (size_t i = 0; i < kSamples; ++i) {
- gain_applier.Apply(&in_data[i][0], &out_data[i][0]);
+ gain_applier.Apply(in_data[i].data(), out_data[i].data());
for (size_t j = 0; j < kFreqs; ++j) {
EXPECT_GT(out_data[i][j].real(), 0.f);
EXPECT_LT(out_data[i][j].real(), 1.f);
diff --git a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
index ab8524b..e196e29 100644
--- a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
+++ b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
@@ -8,17 +8,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-//
-// Command line tool for speech intelligibility enhancement. Provides for
-// running and testing intelligibility_enhancer as an independent process.
-// Use --help for options.
-//
-
-#include <sys/stat.h>
-
#include "gflags/gflags.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "webrtc/base/criticalsection.h"
+#include "webrtc/common_audio/channel_buffer.h"
#include "webrtc/common_audio/include/audio_util.h"
#include "webrtc/common_audio/wav_file.h"
#include "webrtc/modules/audio_processing/audio_buffer.h"
@@ -40,62 +33,45 @@
"\n\nInput files must be little-endian 16-bit signed raw PCM.\n");
google::ParseCommandLineFlags(&argc, &argv, true);
- // Load settings and wav input.
- struct stat in_stat, noise_stat;
- ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0)
- << "Empty speech file.";
- ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0)
- << "Empty noise file.";
-
- const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2;
-
WavReader in_file(FLAGS_clear_file);
- std::vector<float> in_fpcm(samples);
- in_file.ReadSamples(samples, &in_fpcm[0]);
- FloatS16ToFloat(&in_fpcm[0], samples, &in_fpcm[0]);
-
WavReader noise_file(FLAGS_noise_file);
- std::vector<float> noise_fpcm(samples);
- noise_file.ReadSamples(samples, &noise_fpcm[0]);
- FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]);
-
- // Run intelligibility enhancement.
+ WavWriter out_file(FLAGS_out_file, in_file.sample_rate(),
+ in_file.num_channels());
IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());
rtc::CriticalSection crit;
NoiseSuppressionImpl ns(&crit);
ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
ns.Enable(true);
-
- // Mirror real time APM chunk size. Duplicates chunk_length_ in
- // IntelligibilityEnhancer.
- size_t fragment_size = in_file.sample_rate() / 100;
- AudioBuffer capture_audio(fragment_size, noise_file.num_channels(),
- fragment_size, noise_file.num_channels(),
- fragment_size);
- StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels());
-
- // Slice the input into smaller chunks, as the APM would do, and feed them
- // through the enhancer.
- float* clear_cursor = &in_fpcm[0];
- float* noise_cursor = &noise_fpcm[0];
-
- for (size_t i = 0; i < samples; i += fragment_size) {
- capture_audio.CopyFrom(&noise_cursor, stream_config);
+ const size_t in_samples = noise_file.sample_rate() / 100;
+ const size_t noise_samples = noise_file.sample_rate() / 100;
+ std::vector<float> in(in_samples * in_file.num_channels());
+ std::vector<float> noise(noise_samples * noise_file.num_channels());
+ ChannelBuffer<float> in_buf(in_samples, in_file.num_channels());
+ ChannelBuffer<float> noise_buf(noise_samples, noise_file.num_channels());
+ AudioBuffer capture_audio(noise_samples, noise_file.num_channels(),
+ noise_samples, noise_file.num_channels(),
+ noise_samples);
+ StreamConfig stream_config(noise_file.sample_rate(),
+ noise_file.num_channels());
+ while (in_file.ReadSamples(in.size(), in.data()) == in.size() &&
+ noise_file.ReadSamples(noise.size(), noise.data()) == noise.size()) {
+ FloatS16ToFloat(in.data(), in.size(), in.data());
+ FloatS16ToFloat(noise.data(), noise.size(), noise.data());
+ Deinterleave(in.data(), in_buf.num_frames(), in_buf.num_channels(),
+ in_buf.channels());
+ Deinterleave(noise.data(), noise_buf.num_frames(), noise_buf.num_channels(),
+ noise_buf.channels());
+ capture_audio.CopyFrom(noise_buf.channels(), stream_config);
ns.AnalyzeCaptureAudio(&capture_audio);
ns.ProcessCaptureAudio(&capture_audio);
enh.SetCaptureNoiseEstimate(ns.NoiseEstimate());
- enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(),
+ enh.ProcessRenderAudio(in_buf.channels(), in_file.sample_rate(),
in_file.num_channels());
- clear_cursor += fragment_size;
- noise_cursor += fragment_size;
+ Interleave(in_buf.channels(), in_buf.num_frames(), in_buf.num_channels(),
+ in.data());
+ FloatToFloatS16(in.data(), in.size(), in.data());
+ out_file.WriteSamples(in.data(), in.size());
}
-
- FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]);
-
- WavWriter out_file(FLAGS_out_file,
- in_file.sample_rate(),
- in_file.num_channels());
- out_file.WriteSamples(&in_fpcm[0], samples);
}
} // namespace