Improve buffer level estimation with DTX and add CNG time stretching.

The functionality is hidden behind a field trial
(WebRTC-Audio-NetEqDecisionLogicSettings) for experimentation.

Bug: webrtc:10736
Change-Id: I1daf60966717c3ea43bf6ee16d190290ab740ce7
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/144059
Commit-Queue: Jakob Ivarsson <jakobi@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#28474}
diff --git a/modules/audio_coding/BUILD.gn b/modules/audio_coding/BUILD.gn
index 3976600..4d3e36f 100644
--- a/modules/audio_coding/BUILD.gn
+++ b/modules/audio_coding/BUILD.gn
@@ -1029,6 +1029,7 @@
     "../../rtc_base:rtc_base_approved",
     "../../rtc_base:safe_minmax",
     "../../rtc_base:sanitizer",
+    "../../rtc_base/experiments:field_trial_parser",
     "../../rtc_base/system:fallthrough",
     "../../system_wrappers",
     "../../system_wrappers:field_trial",
diff --git a/modules/audio_coding/neteq/decision_logic.cc b/modules/audio_coding/neteq/decision_logic.cc
index f9f420a..fc255e5 100644
--- a/modules/audio_coding/neteq/decision_logic.cc
+++ b/modules/audio_coding/neteq/decision_logic.cc
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <string>
 
+#include "absl/types/optional.h"
 #include "modules/audio_coding/neteq/buffer_level_filter.h"
 #include "modules/audio_coding/neteq/decoder_database.h"
 #include "modules/audio_coding/neteq/delay_manager.h"
@@ -21,12 +22,15 @@
 #include "modules/audio_coding/neteq/packet_buffer.h"
 #include "modules/audio_coding/neteq/sync_buffer.h"
 #include "rtc_base/checks.h"
+#include "rtc_base/experiments/field_trial_parser.h"
 #include "rtc_base/logging.h"
 #include "rtc_base/numerics/safe_conversions.h"
+#include "system_wrappers/include/field_trial.h"
 
 namespace {
 
 constexpr int kPostponeDecodingLevel = 50;
+constexpr int kDefaultTargetLevelWindowMs = 100;
 
 }  // namespace
 
@@ -65,8 +69,24 @@
       disallow_time_stretching_(disallow_time_stretching),
       timescale_countdown_(
           tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1)),
-      num_consecutive_expands_(0) {
+      num_consecutive_expands_(0),
+      time_stretched_cn_samples_(0),
+      estimate_dtx_delay_("estimate_dtx_delay", false),
+      time_stretch_cn_("time_stretch_cn", false),
+      target_level_window_ms_("target_level_window",
+                              kDefaultTargetLevelWindowMs,
+                              0,
+                              absl::nullopt) {
   SetSampleRate(fs_hz, output_size_samples);
+  const std::string field_trial_name =
+      field_trial::FindFullName("WebRTC-Audio-NetEqDecisionLogicSettings");
+  ParseFieldTrial(
+      {&estimate_dtx_delay_, &time_stretch_cn_, &target_level_window_ms_},
+      field_trial_name);
+  RTC_LOG(LS_INFO) << "NetEq decision logic settings:"
+                   << " estimate_dtx_delay=" << estimate_dtx_delay_
+                   << " time_stretch_cn=" << time_stretch_cn_
+                   << " target_level_window_ms=" << target_level_window_ms_;
 }
 
 DecisionLogic::~DecisionLogic() = default;
@@ -79,6 +99,7 @@
   prev_time_scale_ = false;
   timescale_countdown_.reset();
   num_consecutive_expands_ = 0;
+  time_stretched_cn_samples_ = 0;
 }
 
 void DecisionLogic::SoftReset() {
@@ -87,12 +108,13 @@
   prev_time_scale_ = false;
   timescale_countdown_ =
       tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1);
+  time_stretched_cn_samples_ = 0;
 }
 
 void DecisionLogic::SetSampleRate(int fs_hz, size_t output_size_samples) {
   // TODO(hlundin): Change to an enumerator and skip assert.
   assert(fs_hz == 8000 || fs_hz == 16000 || fs_hz == 32000 || fs_hz == 48000);
-  fs_mult_ = fs_hz / 8000;
+  sample_rate_ = fs_hz;
   output_size_samples_ = output_size_samples;
 }
 
@@ -113,9 +135,11 @@
     cng_state_ = kCngInternalOn;
   }
 
-  // TODO(jakobi): Use buffer span instead of num samples.
-  const size_t cur_size_samples =
-      packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
+  size_t cur_size_samples =
+      estimate_dtx_delay_
+          ? packet_buffer_.GetSpanSamples(decoder_frame_length, sample_rate_,
+                                          true)
+          : packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
 
   prev_time_scale_ =
       prev_time_scale_ && (prev_mode == kModeAccelerateSuccess ||
@@ -125,9 +149,9 @@
 
   // Do not update buffer history if currently playing CNG since it will bias
   // the filtered buffer level.
-  if ((prev_mode != kModeRfc3389Cng) && (prev_mode != kModeCodecInternalCng) &&
+  if (prev_mode != kModeRfc3389Cng && prev_mode != kModeCodecInternalCng &&
       !(next_packet && next_packet->frame &&
-        next_packet->frame->IsDtxPacket())) {
+        next_packet->frame->IsDtxPacket() && !estimate_dtx_delay_)) {
     FilterBufferLevel(cur_size_samples);
   }
 
@@ -173,7 +197,8 @@
   // if the mute factor is low enough (otherwise the expansion was short enough
   // to not be noticable).
   // Note that the MuteFactor is in Q14, so a value of 16384 corresponds to 1.
-  size_t current_span = packet_buffer_.GetSpanSamples(decoder_frame_length);
+  size_t current_span = packet_buffer_.GetSpanSamples(
+      decoder_frame_length, sample_rate_, estimate_dtx_delay_);
   if ((prev_mode == kModeExpand || prev_mode == kModeCodecPlc) &&
       expand.MuteFactor(0) < 16384 / 2 &&
       current_span < static_cast<size_t>(delay_manager_->TargetLevel() *
@@ -183,8 +208,7 @@
     return kExpand;
   }
 
-  const uint32_t five_seconds_samples =
-      static_cast<uint32_t>(5 * 8000 * fs_mult_);
+  const uint32_t five_seconds_samples = static_cast<uint32_t>(5 * sample_rate_);
   // Check if the required packet is available.
   if (target_timestamp == available_timestamp) {
     return ExpectedPacketAvailable(prev_mode, play_dtmf);
@@ -212,14 +236,15 @@
   buffer_level_filter_->SetTargetBufferLevel(
       delay_manager_->base_target_level());
 
-  int sample_memory_local = 0;
+  int time_stretched_samples = time_stretched_cn_samples_;
   if (prev_time_scale_) {
-    sample_memory_local = sample_memory_;
+    time_stretched_samples += sample_memory_;
     timescale_countdown_ = tick_timer_->GetNewCountdown(kMinTimescaleInterval);
   }
 
-  buffer_level_filter_->Update(buffer_size_samples, sample_memory_local);
+  buffer_level_filter_->Update(buffer_size_samples, time_stretched_samples);
   prev_time_scale_ = false;
+  time_stretched_cn_samples_ = 0;
 }
 
 Operations DecisionLogic::CngOperation(Modes prev_mode,
@@ -323,30 +348,53 @@
     return kNormal;
   }
 
-  const size_t cur_size_samples =
-      packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
-
   // If previous was comfort noise, then no merge is needed.
   if (prev_mode == kModeRfc3389Cng || prev_mode == kModeCodecInternalCng) {
-    // Keep the same delay as before the CNG, but make sure that the number of
-    // samples in buffer is no higher than 4 times the optimal level. (Note that
-    // TargetLevel() is in Q8.)
-    if (static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
-            available_timestamp ||
-        cur_size_samples >
-            ((delay_manager_->TargetLevel() * packet_length_samples_) >> 8) *
-                4) {
-      // Time to play this new packet.
-      return kNormal;
+    const size_t cur_size_samples =
+        estimate_dtx_delay_
+            ? packet_buffer_.GetSpanSamples(decoder_frame_length, sample_rate_,
+                                            true)
+            : packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
+    // Target level is in number of packets in Q8.
+    const size_t target_level_samples =
+        (delay_manager_->TargetLevel() * packet_length_samples_) >> 8;
+    const bool generated_enough_noise =
+        static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
+        available_timestamp;
+
+    if (time_stretch_cn_) {
+      const size_t target_threshold_samples =
+          target_level_window_ms_ / 2 * (sample_rate_ / 1000);
+      const bool above_target_window =
+          cur_size_samples > target_level_samples + target_threshold_samples;
+      const bool below_target_window =
+          target_level_samples > target_threshold_samples &&
+          cur_size_samples < target_level_samples - target_threshold_samples;
+      // Keep the same delay as before CNG, but make sure that it is within the
+      // target window.
+      if ((generated_enough_noise && !below_target_window) ||
+          above_target_window) {
+        time_stretched_cn_samples_ = timestamp_leap - generated_noise_samples;
+        return kNormal;
+      }
     } else {
-      // Too early to play this new packet; keep on playing comfort noise.
-      if (prev_mode == kModeRfc3389Cng) {
-        return kRfc3389CngNoPacket;
-      } else {  // prevPlayMode == kModeCodecInternalCng.
-        return kCodecInternalCng;
+      // Keep the same delay as before the CNG, but make sure that the number of
+      // samples in buffer is no higher than 4 times the optimal level.
+      if (generated_enough_noise ||
+          cur_size_samples > target_level_samples * 4) {
+        // Time to play this new packet.
+        return kNormal;
       }
     }
+
+    // Too early to play this new packet; keep on playing comfort noise.
+    if (prev_mode == kModeRfc3389Cng) {
+      return kRfc3389CngNoPacket;
+    }
+    // prev_mode == kModeCodecInternalCng.
+    return kCodecInternalCng;
   }
+
   // Do not merge unless we have done an expand before.
   if (prev_mode == kModeExpand) {
     return kMerge;
diff --git a/modules/audio_coding/neteq/decision_logic.h b/modules/audio_coding/neteq/decision_logic.h
index 49020b0..5a9bffb 100644
--- a/modules/audio_coding/neteq/decision_logic.h
+++ b/modules/audio_coding/neteq/decision_logic.h
@@ -14,6 +14,7 @@
 #include "modules/audio_coding/neteq/defines.h"
 #include "modules/audio_coding/neteq/tick_timer.h"
 #include "rtc_base/constructor_magic.h"
+#include "rtc_base/experiments/field_trial_parser.h"
 
 namespace webrtc {
 
@@ -167,7 +168,7 @@
   DelayManager* delay_manager_;
   BufferLevelFilter* buffer_level_filter_;
   const TickTimer* tick_timer_;
-  int fs_mult_;
+  int sample_rate_;
   size_t output_size_samples_;
   CngState cng_state_;  // Remember if comfort noise is interrupted by other
                         // event (e.g., DTMF).
@@ -178,6 +179,10 @@
   bool disallow_time_stretching_;
   std::unique_ptr<TickTimer::Countdown> timescale_countdown_;
   int num_consecutive_expands_;
+  int time_stretched_cn_samples_;
+  FieldTrialParameter<bool> estimate_dtx_delay_;
+  FieldTrialParameter<bool> time_stretch_cn_;
+  FieldTrialConstrained<int> target_level_window_ms_;
 
   RTC_DISALLOW_COPY_AND_ASSIGN(DecisionLogic);
 };
diff --git a/modules/audio_coding/neteq/packet_buffer.cc b/modules/audio_coding/neteq/packet_buffer.cc
index e90fadc..540d279 100644
--- a/modules/audio_coding/neteq/packet_buffer.cc
+++ b/modules/audio_coding/neteq/packet_buffer.cc
@@ -26,6 +26,7 @@
 #include "modules/audio_coding/neteq/tick_timer.h"
 #include "rtc_base/checks.h"
 #include "rtc_base/logging.h"
+#include "rtc_base/numerics/safe_conversions.h"
 
 namespace webrtc {
 namespace {
@@ -287,14 +288,22 @@
   return num_samples;
 }
 
-size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length) const {
+size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length,
+                                    size_t sample_rate,
+                                    bool count_dtx_waiting_time) const {
   if (buffer_.size() == 0) {
     return 0;
   }
 
   size_t span = buffer_.back().timestamp - buffer_.front().timestamp;
   if (buffer_.back().frame && buffer_.back().frame->Duration() > 0) {
-    span += buffer_.back().frame->Duration();
+    size_t duration = buffer_.back().frame->Duration();
+    if (count_dtx_waiting_time && buffer_.back().frame->IsDtxPacket()) {
+      size_t waiting_time_samples = rtc::dchecked_cast<size_t>(
+          buffer_.back().waiting_time->ElapsedMs() * (sample_rate / 1000));
+      duration = std::max(duration, waiting_time_samples);
+    }
+    span += duration;
   } else {
     span += last_decoded_length;
   }
diff --git a/modules/audio_coding/neteq/packet_buffer.h b/modules/audio_coding/neteq/packet_buffer.h
index 0837027..c00db29 100644
--- a/modules/audio_coding/neteq/packet_buffer.h
+++ b/modules/audio_coding/neteq/packet_buffer.h
@@ -123,7 +123,9 @@
 
   // Returns the total duration in samples that the packets in the buffer spans
   // across.
-  virtual size_t GetSpanSamples(size_t last_decoded_length) const;
+  virtual size_t GetSpanSamples(size_t last_decoded_length,
+                                size_t sample_rate,
+                                bool count_dtx_waiting_time) const;
 
   // Returns true if the packet buffer contains any DTX or CNG packets.
   virtual bool ContainsDtxOrCngPacket(