Adding a delay line to NetEq's output

This change adds an optional delay to NetEq's output. Note, this is not
equivalent to increasing the jitter buffer with the same extra length.

Bug: b/156734419
Change-Id: I8b70b6b3bffcfd3da296ccf29853864baa03d6bb
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/175110
Commit-Queue: Henrik Lundin <henrik.lundin@webrtc.org>
Reviewed-by: Karl Wiberg <kwiberg@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#31343}
diff --git a/modules/audio_coding/neteq/neteq_impl.cc b/modules/audio_coding/neteq/neteq_impl.cc
index 0b7510d..163a287 100644
--- a/modules/audio_coding/neteq/neteq_impl.cc
+++ b/modules/audio_coding/neteq/neteq_impl.cc
@@ -140,7 +140,10 @@
                                 10,  // Report once every 10 s.
                                 tick_timer_.get()),
       no_time_stretching_(config.for_test_no_time_stretching),
-      enable_rtx_handling_(config.enable_rtx_handling) {
+      enable_rtx_handling_(config.enable_rtx_handling),
+      output_delay_chain_(
+          rtc::CheckedDivExact(config.extra_output_delay_ms, 10)),
+      output_delay_chain_ms_(config.extra_output_delay_ms) {
   RTC_LOG(LS_INFO) << "NetEq config: " << config.ToString();
   int fs = config.sample_rate_hz;
   if (fs != 8000 && fs != 16000 && fs != 32000 && fs != 48000) {
@@ -255,6 +258,25 @@
              last_output_sample_rate_hz_ == 32000 ||
              last_output_sample_rate_hz_ == 48000)
       << "Unexpected sample rate " << last_output_sample_rate_hz_;
+
+  if (!output_delay_chain_.empty()) {
+    if (output_delay_chain_empty_) {
+      for (auto& f : output_delay_chain_) {
+        f.CopyFrom(*audio_frame);
+      }
+      output_delay_chain_empty_ = false;
+      delayed_last_output_sample_rate_hz_ = last_output_sample_rate_hz_;
+    } else {
+      RTC_DCHECK_GE(output_delay_chain_ix_, 0);
+      RTC_DCHECK_LT(output_delay_chain_ix_, output_delay_chain_.size());
+      swap(output_delay_chain_[output_delay_chain_ix_], *audio_frame);
+      *muted = audio_frame->muted();
+      output_delay_chain_ix_ =
+          (output_delay_chain_ix_ + 1) % output_delay_chain_.size();
+      delayed_last_output_sample_rate_hz_ = audio_frame->sample_rate_hz();
+    }
+  }
+
   return kOK;
 }
 
@@ -297,7 +319,8 @@
   rtc::CritScope lock(&crit_sect_);
   if (delay_ms >= 0 && delay_ms <= 10000) {
     assert(controller_.get());
-    return controller_->SetMinimumDelay(delay_ms);
+    return controller_->SetMinimumDelay(
+        std::max(delay_ms - output_delay_chain_ms_, 0));
   }
   return false;
 }
@@ -306,7 +329,8 @@
   rtc::CritScope lock(&crit_sect_);
   if (delay_ms >= 0 && delay_ms <= 10000) {
     assert(controller_.get());
-    return controller_->SetMaximumDelay(delay_ms);
+    return controller_->SetMaximumDelay(
+        std::max(delay_ms - output_delay_chain_ms_, 0));
   }
   return false;
 }
@@ -327,7 +351,7 @@
 int NetEqImpl::TargetDelayMs() const {
   rtc::CritScope lock(&crit_sect_);
   RTC_DCHECK(controller_.get());
-  return controller_->TargetLevelMs();
+  return controller_->TargetLevelMs() + output_delay_chain_ms_;
 }
 
 int NetEqImpl::FilteredCurrentDelayMs() const {
@@ -337,7 +361,8 @@
   const int delay_samples =
       controller_->GetFilteredBufferLevel() + sync_buffer_->FutureLength();
   // The division below will truncate. The return value is in ms.
-  return delay_samples / rtc::CheckedDivExact(fs_hz_, 1000);
+  return delay_samples / rtc::CheckedDivExact(fs_hz_, 1000) +
+         output_delay_chain_ms_;
 }
 
 int NetEqImpl::NetworkStatistics(NetEqNetworkStatistics* stats) {
@@ -351,6 +376,13 @@
   stats->jitter_peaks_found = controller_->PeakFound();
   stats_->GetNetworkStatistics(fs_hz_, total_samples_in_buffers,
                                decoder_frame_length_, stats);
+  // Compensate for output delay chain.
+  stats->current_buffer_size_ms += output_delay_chain_ms_;
+  stats->preferred_buffer_size_ms += output_delay_chain_ms_;
+  stats->mean_waiting_time_ms += output_delay_chain_ms_;
+  stats->median_waiting_time_ms += output_delay_chain_ms_;
+  stats->min_waiting_time_ms += output_delay_chain_ms_;
+  stats->max_waiting_time_ms += output_delay_chain_ms_;
   return 0;
 }
 
@@ -394,12 +426,19 @@
     // which is indicated by returning an empty value.
     return absl::nullopt;
   }
-  return timestamp_scaler_->ToExternal(playout_timestamp_);
+  size_t sum_samples_in_output_delay_chain = 0;
+  for (const auto& audio_frame : output_delay_chain_) {
+    sum_samples_in_output_delay_chain += audio_frame.samples_per_channel();
+  }
+  return timestamp_scaler_->ToExternal(
+      playout_timestamp_ -
+      static_cast<uint32_t>(sum_samples_in_output_delay_chain));
 }
 
 int NetEqImpl::last_output_sample_rate_hz() const {
   rtc::CritScope lock(&crit_sect_);
-  return last_output_sample_rate_hz_;
+  return delayed_last_output_sample_rate_hz_.value_or(
+      last_output_sample_rate_hz_);
 }
 
 absl::optional<NetEq::DecoderFormat> NetEqImpl::GetDecoderFormat(
@@ -1988,8 +2027,9 @@
     extracted_samples = packet->timestamp - first_timestamp + packet_duration;
 
     RTC_DCHECK(controller_);
-    stats_->JitterBufferDelay(packet_duration, waiting_time_ms,
-                              controller_->TargetLevelMs());
+    stats_->JitterBufferDelay(
+        packet_duration, waiting_time_ms + output_delay_chain_ms_,
+        controller_->TargetLevelMs() + output_delay_chain_ms_);
 
     packet_list->push_back(std::move(*packet));  // Store packet in list.
     packet = absl::nullopt;  // Ensure it's never used after the move.