AudioProcessingImpl: Add a VAD submodule
Add a VoiceActivityDetectorWrapper submodule in AudioProcessingImpl
and enable injecting speech probability into GainController2.
Bug: webrtc:13663
Change-Id: I05e13b737d085b45ac8ce76660191867c56834c2
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/265166
Commit-Queue: Hanna Silen <silen@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#37275}
diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index fa5e929..5714d6b 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@@ -162,6 +162,7 @@
bool noise_suppressor_enabled,
bool adaptive_gain_controller_enabled,
bool gain_controller2_enabled,
+ bool voice_activity_detector_enabled,
bool gain_adjustment_enabled,
bool echo_controller_enabled,
bool transient_suppressor_enabled) {
@@ -173,6 +174,8 @@
changed |=
(adaptive_gain_controller_enabled != adaptive_gain_controller_enabled_);
changed |= (gain_controller2_enabled != gain_controller2_enabled_);
+ changed |=
+ (voice_activity_detector_enabled != voice_activity_detector_enabled_);
changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
changed |= (echo_controller_enabled != echo_controller_enabled_);
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
@@ -182,6 +185,7 @@
noise_suppressor_enabled_ = noise_suppressor_enabled;
adaptive_gain_controller_enabled_ = adaptive_gain_controller_enabled;
gain_controller2_enabled_ = gain_controller2_enabled;
+ voice_activity_detector_enabled_ = voice_activity_detector_enabled;
gain_adjustment_enabled_ = gain_adjustment_enabled;
echo_controller_enabled_ = echo_controller_enabled;
transient_suppressor_enabled_ = transient_suppressor_enabled;
@@ -395,6 +399,7 @@
InitializeResidualEchoDetector();
InitializeEchoController();
InitializeGainController2(/*config_has_changed=*/true);
+ InitializeVoiceActivityDetector(/*config_has_changed=*/true);
InitializeNoiseSuppressor();
InitializeAnalyzer();
InitializePostProcessor();
@@ -569,6 +574,7 @@
}
InitializeGainController2(agc2_config_changed);
+ InitializeVoiceActivityDetector(agc2_config_changed);
if (pre_amplifier_config_changed || gain_adjustment_config_changed) {
InitializeCaptureLevelsAdjuster();
@@ -1297,10 +1303,19 @@
submodules_.capture_analyzer->Analyze(capture_buffer);
}
+ absl::optional<float> voice_activity_probability = absl::nullopt;
if (submodules_.gain_controller2) {
submodules_.gain_controller2->NotifyAnalogLevel(
recommended_stream_analog_level_locked());
- submodules_.gain_controller2->Process(capture_buffer);
+ if (submodules_.voice_activity_detector) {
+ voice_activity_probability =
+ submodules_.voice_activity_detector->Analyze(
+ AudioFrameView<const float>(capture_buffer->channels(),
+ capture_buffer->num_channels(),
+ capture_buffer->num_frames()));
+ }
+ submodules_.gain_controller2->Process(voice_activity_probability,
+ capture_buffer);
}
if (submodules_.capture_post_processor) {
@@ -1692,7 +1707,7 @@
return submodule_states_.Update(
config_.high_pass_filter.enabled, !!submodules_.echo_control_mobile,
!!submodules_.noise_suppressor, !!submodules_.gain_control,
- !!submodules_.gain_controller2,
+ !!submodules_.gain_controller2, !!submodules_.voice_activity_detector,
config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
capture_nonlocked_.echo_controller_enabled,
!!submodules_.transient_suppressor);
@@ -1900,9 +1915,35 @@
return;
}
if (!submodules_.gain_controller2 || config_has_changed) {
+ const bool use_internal_vad =
+ transient_suppressor_vad_mode_ != TransientSuppressor::VadMode::kRnnVad;
submodules_.gain_controller2 = std::make_unique<GainController2>(
config_.gain_controller2, proc_fullband_sample_rate_hz(),
- num_input_channels());
+ num_input_channels(), use_internal_vad);
+ }
+}
+
+void AudioProcessingImpl::InitializeVoiceActivityDetector(
+ bool config_has_changed) {  // Creates, re-creates or releases `submodules_.voice_activity_detector`.
+ if (!config_has_changed) {  // Unchanged config: leave the submodule as it is.
+ return;
+ }
+ const bool use_vad =  // True only when all three conditions below hold.
+ transient_suppressor_vad_mode_ == TransientSuppressor::VadMode::kRnnVad &&
+ config_.gain_controller2.enabled &&
+ config_.gain_controller2.adaptive_digital.enabled;
+ if (!use_vad) {  // Detector not wanted: release any existing instance.
+ submodules_.voice_activity_detector.reset();
+ return;
+ }
+ if (!submodules_.voice_activity_detector || config_has_changed) {  // NOTE(review): always true here, since `config_has_changed` is true past the early return above.
+ RTC_DCHECK(!!submodules_.gain_controller2);  // Dereferenced below for GetCpuFeatures().
+ // TODO(bugs.webrtc.org/13663): Cache CPU features in APM and use here.
+ submodules_.voice_activity_detector =
+ std::make_unique<VoiceActivityDetectorWrapper>(
+ config_.gain_controller2.adaptive_digital.vad_reset_period_ms,
+ submodules_.gain_controller2->GetCpuFeatures(),
+ proc_fullband_sample_rate_hz());
}
}