AudioProcessingImpl: Add a VAD submodule

Add a VoiceActivityDetectorWrapper submodule in AudioProcessingImpl
and enable injecting speech probability into GainController2.

Bug: webrtc:13663
Change-Id: I05e13b737d085b45ac8ce76660191867c56834c2
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/265166
Commit-Queue: Hanna Silen <silen@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#37275}
diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index fa5e929..5714d6b 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@@ -162,6 +162,7 @@
     bool noise_suppressor_enabled,
     bool adaptive_gain_controller_enabled,
     bool gain_controller2_enabled,
+    bool voice_activity_detector_enabled,
     bool gain_adjustment_enabled,
     bool echo_controller_enabled,
     bool transient_suppressor_enabled) {
@@ -173,6 +174,8 @@
   changed |=
       (adaptive_gain_controller_enabled != adaptive_gain_controller_enabled_);
   changed |= (gain_controller2_enabled != gain_controller2_enabled_);
+  changed |=
+      (voice_activity_detector_enabled != voice_activity_detector_enabled_);
   changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
   changed |= (echo_controller_enabled != echo_controller_enabled_);
   changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
@@ -182,6 +185,7 @@
     noise_suppressor_enabled_ = noise_suppressor_enabled;
     adaptive_gain_controller_enabled_ = adaptive_gain_controller_enabled;
     gain_controller2_enabled_ = gain_controller2_enabled;
+    voice_activity_detector_enabled_ = voice_activity_detector_enabled;
     gain_adjustment_enabled_ = gain_adjustment_enabled;
     echo_controller_enabled_ = echo_controller_enabled;
     transient_suppressor_enabled_ = transient_suppressor_enabled;
@@ -395,6 +399,7 @@
   InitializeResidualEchoDetector();
   InitializeEchoController();
   InitializeGainController2(/*config_has_changed=*/true);
+  InitializeVoiceActivityDetector(/*config_has_changed=*/true);
   InitializeNoiseSuppressor();
   InitializeAnalyzer();
   InitializePostProcessor();
@@ -569,6 +574,7 @@
   }
 
   InitializeGainController2(agc2_config_changed);
+  InitializeVoiceActivityDetector(agc2_config_changed);
 
   if (pre_amplifier_config_changed || gain_adjustment_config_changed) {
     InitializeCaptureLevelsAdjuster();
@@ -1297,10 +1303,19 @@
       submodules_.capture_analyzer->Analyze(capture_buffer);
     }
 
+    absl::optional<float> voice_activity_probability = absl::nullopt;
     if (submodules_.gain_controller2) {
       submodules_.gain_controller2->NotifyAnalogLevel(
           recommended_stream_analog_level_locked());
-      submodules_.gain_controller2->Process(capture_buffer);
+      if (submodules_.voice_activity_detector) {
+        voice_activity_probability =
+            submodules_.voice_activity_detector->Analyze(
+                AudioFrameView<const float>(capture_buffer->channels(),
+                                            capture_buffer->num_channels(),
+                                            capture_buffer->num_frames()));
+      }
+      submodules_.gain_controller2->Process(voice_activity_probability,
+                                            capture_buffer);
     }
 
     if (submodules_.capture_post_processor) {
@@ -1692,7 +1707,7 @@
   return submodule_states_.Update(
       config_.high_pass_filter.enabled, !!submodules_.echo_control_mobile,
       !!submodules_.noise_suppressor, !!submodules_.gain_control,
-      !!submodules_.gain_controller2,
+      !!submodules_.gain_controller2, !!submodules_.voice_activity_detector,
       config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
       capture_nonlocked_.echo_controller_enabled,
       !!submodules_.transient_suppressor);
@@ -1900,9 +1915,35 @@
     return;
   }
   if (!submodules_.gain_controller2 || config_has_changed) {
+    const bool use_internal_vad =
+        transient_suppressor_vad_mode_ != TransientSuppressor::VadMode::kRnnVad;
     submodules_.gain_controller2 = std::make_unique<GainController2>(
         config_.gain_controller2, proc_fullband_sample_rate_hz(),
-        num_input_channels());
+        num_input_channels(), use_internal_vad);
+  }
+}
+
+void AudioProcessingImpl::InitializeVoiceActivityDetector(
+    bool config_has_changed) {
+  if (!config_has_changed) {
+    return;
+  }
+  const bool use_vad =
+      transient_suppressor_vad_mode_ == TransientSuppressor::VadMode::kRnnVad &&
+      config_.gain_controller2.enabled &&
+      config_.gain_controller2.adaptive_digital.enabled;
+  if (!use_vad) {
+    submodules_.voice_activity_detector.reset();
+    return;
+  }
+  if (!submodules_.voice_activity_detector || config_has_changed) {
+    RTC_DCHECK(!!submodules_.gain_controller2);
+    // TODO(bugs.webrtc.org/13663): Cache CPU features in APM and use here.
+    submodules_.voice_activity_detector =
+        std::make_unique<VoiceActivityDetectorWrapper>(
+            config_.gain_controller2.adaptive_digital.vad_reset_period_ms,
+            submodules_.gain_controller2->GetCpuFeatures(),
+            proc_fullband_sample_rate_hz());
   }
 }