Improve buffer level estimation with DTX and add CNG time stretching.
The functionality is hidden behind a field trial for experimentation.
Bug: webrtc:10736
Change-Id: I1daf60966717c3ea43bf6ee16d190290ab740ce7
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/144059
Commit-Queue: Jakob Ivarsson <jakobi@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#28474}
diff --git a/modules/audio_coding/BUILD.gn b/modules/audio_coding/BUILD.gn
index 3976600..4d3e36f 100644
--- a/modules/audio_coding/BUILD.gn
+++ b/modules/audio_coding/BUILD.gn
@@ -1029,6 +1029,7 @@
"../../rtc_base:rtc_base_approved",
"../../rtc_base:safe_minmax",
"../../rtc_base:sanitizer",
+ "../../rtc_base/experiments:field_trial_parser",
"../../rtc_base/system:fallthrough",
"../../system_wrappers",
"../../system_wrappers:field_trial",
diff --git a/modules/audio_coding/neteq/decision_logic.cc b/modules/audio_coding/neteq/decision_logic.cc
index f9f420a..fc255e5 100644
--- a/modules/audio_coding/neteq/decision_logic.cc
+++ b/modules/audio_coding/neteq/decision_logic.cc
@@ -14,6 +14,7 @@
#include <stdio.h>
#include <string>
+#include "absl/types/optional.h"
#include "modules/audio_coding/neteq/buffer_level_filter.h"
#include "modules/audio_coding/neteq/decoder_database.h"
#include "modules/audio_coding/neteq/delay_manager.h"
@@ -21,12 +22,15 @@
#include "modules/audio_coding/neteq/packet_buffer.h"
#include "modules/audio_coding/neteq/sync_buffer.h"
#include "rtc_base/checks.h"
+#include "rtc_base/experiments/field_trial_parser.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/safe_conversions.h"
+#include "system_wrappers/include/field_trial.h"
namespace {
constexpr int kPostponeDecodingLevel = 50;
+constexpr int kDefaultTargetLevelWindowMs = 100;
} // namespace
@@ -65,8 +69,24 @@
disallow_time_stretching_(disallow_time_stretching),
timescale_countdown_(
tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1)),
- num_consecutive_expands_(0) {
+ num_consecutive_expands_(0),
+ time_stretched_cn_samples_(0),
+ estimate_dtx_delay_("estimate_dtx_delay", false),
+ time_stretch_cn_("time_stretch_cn", false),
+ target_level_window_ms_("target_level_window",
+ kDefaultTargetLevelWindowMs,
+ 0,
+ absl::nullopt) {
SetSampleRate(fs_hz, output_size_samples);
+ const std::string field_trial_name =
+ field_trial::FindFullName("WebRTC-Audio-NetEqDecisionLogicSettings");
+ ParseFieldTrial(
+ {&estimate_dtx_delay_, &time_stretch_cn_, &target_level_window_ms_},
+ field_trial_name);
+ RTC_LOG(LS_INFO) << "NetEq decision logic settings:"
+ << " estimate_dtx_delay=" << estimate_dtx_delay_
+ << " time_stretch_cn=" << time_stretch_cn_
+ << " target_level_window_ms=" << target_level_window_ms_;
}
DecisionLogic::~DecisionLogic() = default;
@@ -79,6 +99,7 @@
prev_time_scale_ = false;
timescale_countdown_.reset();
num_consecutive_expands_ = 0;
+ time_stretched_cn_samples_ = 0;
}
void DecisionLogic::SoftReset() {
@@ -87,12 +108,13 @@
prev_time_scale_ = false;
timescale_countdown_ =
tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1);
+ time_stretched_cn_samples_ = 0;
}
void DecisionLogic::SetSampleRate(int fs_hz, size_t output_size_samples) {
// TODO(hlundin): Change to an enumerator and skip assert.
assert(fs_hz == 8000 || fs_hz == 16000 || fs_hz == 32000 || fs_hz == 48000);
- fs_mult_ = fs_hz / 8000;
+ sample_rate_ = fs_hz;
output_size_samples_ = output_size_samples;
}
@@ -113,9 +135,11 @@
cng_state_ = kCngInternalOn;
}
- // TODO(jakobi): Use buffer span instead of num samples.
- const size_t cur_size_samples =
- packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
+ size_t cur_size_samples =
+ estimate_dtx_delay_
+ ? packet_buffer_.GetSpanSamples(decoder_frame_length, sample_rate_,
+ true)
+ : packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
prev_time_scale_ =
prev_time_scale_ && (prev_mode == kModeAccelerateSuccess ||
@@ -125,9 +149,9 @@
// Do not update buffer history if currently playing CNG since it will bias
// the filtered buffer level.
- if ((prev_mode != kModeRfc3389Cng) && (prev_mode != kModeCodecInternalCng) &&
+ if (prev_mode != kModeRfc3389Cng && prev_mode != kModeCodecInternalCng &&
!(next_packet && next_packet->frame &&
- next_packet->frame->IsDtxPacket())) {
+ next_packet->frame->IsDtxPacket() && !estimate_dtx_delay_)) {
FilterBufferLevel(cur_size_samples);
}
@@ -173,7 +197,8 @@
// if the mute factor is low enough (otherwise the expansion was short enough
// to not be noticable).
// Note that the MuteFactor is in Q14, so a value of 16384 corresponds to 1.
- size_t current_span = packet_buffer_.GetSpanSamples(decoder_frame_length);
+ size_t current_span = packet_buffer_.GetSpanSamples(
+ decoder_frame_length, sample_rate_, estimate_dtx_delay_);
if ((prev_mode == kModeExpand || prev_mode == kModeCodecPlc) &&
expand.MuteFactor(0) < 16384 / 2 &&
current_span < static_cast<size_t>(delay_manager_->TargetLevel() *
@@ -183,8 +208,7 @@
return kExpand;
}
- const uint32_t five_seconds_samples =
- static_cast<uint32_t>(5 * 8000 * fs_mult_);
+ const uint32_t five_seconds_samples = static_cast<uint32_t>(5 * sample_rate_);
// Check if the required packet is available.
if (target_timestamp == available_timestamp) {
return ExpectedPacketAvailable(prev_mode, play_dtmf);
@@ -212,14 +236,15 @@
buffer_level_filter_->SetTargetBufferLevel(
delay_manager_->base_target_level());
- int sample_memory_local = 0;
+ int time_stretched_samples = time_stretched_cn_samples_;
if (prev_time_scale_) {
- sample_memory_local = sample_memory_;
+ time_stretched_samples += sample_memory_;
timescale_countdown_ = tick_timer_->GetNewCountdown(kMinTimescaleInterval);
}
- buffer_level_filter_->Update(buffer_size_samples, sample_memory_local);
+ buffer_level_filter_->Update(buffer_size_samples, time_stretched_samples);
prev_time_scale_ = false;
+ time_stretched_cn_samples_ = 0;
}
Operations DecisionLogic::CngOperation(Modes prev_mode,
@@ -323,30 +348,53 @@
return kNormal;
}
- const size_t cur_size_samples =
- packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
-
// If previous was comfort noise, then no merge is needed.
if (prev_mode == kModeRfc3389Cng || prev_mode == kModeCodecInternalCng) {
- // Keep the same delay as before the CNG, but make sure that the number of
- // samples in buffer is no higher than 4 times the optimal level. (Note that
- // TargetLevel() is in Q8.)
- if (static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
- available_timestamp ||
- cur_size_samples >
- ((delay_manager_->TargetLevel() * packet_length_samples_) >> 8) *
- 4) {
- // Time to play this new packet.
- return kNormal;
+ // With DTX delay estimation, use the buffer span incl. DTX waiting time.
+ const size_t cur_size_samples =
+ estimate_dtx_delay_ ? packet_buffer_.GetSpanSamples(
+ decoder_frame_length, sample_rate_, true)
+ : packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
+ // Target level is in number of packets in Q8.
+ const size_t target_level_samples =
+ (delay_manager_->TargetLevel() * packet_length_samples_) >> 8;
+ const bool generated_enough_noise =
+ static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
+ available_timestamp;
+
+ if (time_stretch_cn_) {
+ const size_t target_threshold_samples =
+ target_level_window_ms_ / 2 * (sample_rate_ / 1000);
+ const bool above_target_window =
+ cur_size_samples > target_level_samples + target_threshold_samples;
+ const bool below_target_window =
+ target_level_samples > target_threshold_samples &&
+ cur_size_samples < target_level_samples - target_threshold_samples;
+ // Keep the delay same as before CNG, but make sure that it is within the
+ // target window.
+ if ((generated_enough_noise && !below_target_window) ||
+ above_target_window) {
+ time_stretched_cn_samples_ = timestamp_leap - generated_noise_samples;
+ return kNormal;
+ }
} else {
- // Too early to play this new packet; keep on playing comfort noise.
- if (prev_mode == kModeRfc3389Cng) {
- return kRfc3389CngNoPacket;
- } else { // prevPlayMode == kModeCodecInternalCng.
- return kCodecInternalCng;
+ // Keep the same delay as before the CNG, but make sure that the number of
+ // samples in buffer is no higher than 4 times the optimal level.
+ if (generated_enough_noise ||
+ cur_size_samples > target_level_samples * 4) {
+ // Time to play this new packet.
+ return kNormal;
}
}
+
+ // Too early to play this new packet; keep on playing comfort noise.
+ if (prev_mode == kModeRfc3389Cng) {
+ return kRfc3389CngNoPacket;
+ }
+ // prev_mode == kModeCodecInternalCng.
+ return kCodecInternalCng;
}
+
// Do not merge unless we have done an expand before.
if (prev_mode == kModeExpand) {
return kMerge;
diff --git a/modules/audio_coding/neteq/decision_logic.h b/modules/audio_coding/neteq/decision_logic.h
index 49020b0..5a9bffb 100644
--- a/modules/audio_coding/neteq/decision_logic.h
+++ b/modules/audio_coding/neteq/decision_logic.h
@@ -14,6 +14,7 @@
#include "modules/audio_coding/neteq/defines.h"
#include "modules/audio_coding/neteq/tick_timer.h"
#include "rtc_base/constructor_magic.h"
+#include "rtc_base/experiments/field_trial_parser.h"
namespace webrtc {
@@ -167,7 +168,7 @@
DelayManager* delay_manager_;
BufferLevelFilter* buffer_level_filter_;
const TickTimer* tick_timer_;
- int fs_mult_;
+ int sample_rate_;
size_t output_size_samples_;
CngState cng_state_; // Remember if comfort noise is interrupted by other
// event (e.g., DTMF).
@@ -178,6 +179,10 @@
bool disallow_time_stretching_;
std::unique_ptr<TickTimer::Countdown> timescale_countdown_;
int num_consecutive_expands_;
+ int time_stretched_cn_samples_;
+ FieldTrialParameter<bool> estimate_dtx_delay_;
+ FieldTrialParameter<bool> time_stretch_cn_;
+ FieldTrialConstrained<int> target_level_window_ms_;
RTC_DISALLOW_COPY_AND_ASSIGN(DecisionLogic);
};
diff --git a/modules/audio_coding/neteq/packet_buffer.cc b/modules/audio_coding/neteq/packet_buffer.cc
index e90fadc..540d279 100644
--- a/modules/audio_coding/neteq/packet_buffer.cc
+++ b/modules/audio_coding/neteq/packet_buffer.cc
@@ -26,6 +26,7 @@
#include "modules/audio_coding/neteq/tick_timer.h"
#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
+#include "rtc_base/numerics/safe_conversions.h"
namespace webrtc {
namespace {
@@ -287,14 +288,22 @@
return num_samples;
}
-size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length) const {
+size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length,
+ size_t sample_rate,
+ bool count_dtx_waiting_time) const {
if (buffer_.size() == 0) {
return 0;
}
size_t span = buffer_.back().timestamp - buffer_.front().timestamp;
if (buffer_.back().frame && buffer_.back().frame->Duration() > 0) {
- span += buffer_.back().frame->Duration();
+ size_t duration = buffer_.back().frame->Duration();
+ if (count_dtx_waiting_time && buffer_.back().frame->IsDtxPacket()) {
+ size_t waiting_time_samples = rtc::dchecked_cast<size_t>(
+ buffer_.back().waiting_time->ElapsedMs() * (sample_rate / 1000));
+ duration = std::max(duration, waiting_time_samples);
+ }
+ span += duration;
} else {
span += last_decoded_length;
}
diff --git a/modules/audio_coding/neteq/packet_buffer.h b/modules/audio_coding/neteq/packet_buffer.h
index 0837027..c00db29 100644
--- a/modules/audio_coding/neteq/packet_buffer.h
+++ b/modules/audio_coding/neteq/packet_buffer.h
@@ -123,7 +123,9 @@
// Returns the total duration in samples that the packets in the buffer spans
// across.
- virtual size_t GetSpanSamples(size_t last_decoded_length) const;
+ virtual size_t GetSpanSamples(size_t last_decoded_length,
+ size_t sample_rate,
+ bool count_dtx_waiting_time) const;
// Returns true if the packet buffer contains any DTX or CNG packets.
virtual bool ContainsDtxOrCngPacket(