Add new fast mode for NetEq's Accelerate operation. This change introduces a mode where the Accelerate operation will be more aggressive. When enabled, it will allow acceleration at lower correlation levels, and possibly remove multiple pitch periods at once. The feature is enabled through NetEq::Config, and is off by default. This means that bit-exactness tests are currently not affected. A unit test was added for the Accelerate class, with and without fast mode enabled. BUG=4691 R=minyue@webrtc.org Review URL: https://webrtc-codereview.appspot.com/50039004 Cr-Commit-Position: refs/heads/master@{#9295}

commit: cf808d2366e58b33540931d182f36800d9a15b0d [log] [tgz]
author: Henrik Lundin <henrik.lundin@webrtc.org> Wed May 27 14:33:29 2015 +0200
committer: Henrik Lundin <henrik.lundin@webrtc.org> Wed May 27 12:33:39 2015 +0000
tree: 821aa7a6bd6ecd9052e4305f43f5f0a981dd5f27
parent: cbe408aa118e46e1f1dd28d201378968f00b60ea [diff]
diff --git a/webrtc/modules/audio_coding/neteq/accelerate.cc b/webrtc/modules/audio_coding/neteq/accelerate.cc
index 6acd778..ad74238 100644
--- a/webrtc/modules/audio_coding/neteq/accelerate.cc
+++ b/webrtc/modules/audio_coding/neteq/accelerate.cc

@@ -14,11 +14,11 @@
 
 namespace webrtc {
 
-Accelerate::ReturnCodes Accelerate::Process(
-    const int16_t* input,
-    size_t input_length,
-    AudioMultiVector* output,
-    int16_t* length_change_samples) {
+Accelerate::ReturnCodes Accelerate::Process(const int16_t* input,
+                                            size_t input_length,
+                                            bool fast_accelerate,
+                                            AudioMultiVector* output,
+                                            int16_t* length_change_samples) {
   // Input length must be (almost) 30 ms.
   static const int k15ms = 120;  // 15 ms = 120 samples at 8 kHz sample rate.
   if (num_channels_ == 0 || static_cast<int>(input_length) / num_channels_ <
@@ -28,7 +28,7 @@
     output->PushBackInterleaved(input, input_length);
     return kError;
   }
-  return TimeStretch::Process(input, input_length, output,
+  return TimeStretch::Process(input, input_length, fast_accelerate, output,
                               length_change_samples);
 }
 
@@ -41,17 +41,30 @@
 }
 
 Accelerate::ReturnCodes Accelerate::CheckCriteriaAndStretch(
-    const int16_t* input, size_t input_length, size_t peak_index,
-    int16_t best_correlation, bool active_speech,
+    const int16_t* input,
+    size_t input_length,
+    size_t peak_index,
+    int16_t best_correlation,
+    bool active_speech,
+    bool fast_mode,
     AudioMultiVector* output) const {
   // Check for strong correlation or passive speech.
-  if ((best_correlation > kCorrelationThreshold) || !active_speech) {
+  // Use 8192 (0.5 in Q14) in fast mode.
+  const int correlation_threshold = fast_mode ? 8192 : kCorrelationThreshold;
+  if ((best_correlation > correlation_threshold) || !active_speech) {
     // Do accelerate operation by overlap add.
 
     // Pre-calculate common multiplication with |fs_mult_|.
     // 120 corresponds to 15 ms.
     size_t fs_mult_120 = fs_mult_ * 120;
 
+    if (fast_mode) {
+      // Fit as many multiples of |peak_index| as possible in fs_mult_120.
+      // TODO(henrik.lundin) Consider finding multiple correlation peaks and
+      // pick the one with the longest correlation lag in this case.
+      peak_index = (fs_mult_120 / peak_index) * peak_index;
+    }
+
     assert(fs_mult_120 >= peak_index);  // Should be handled in Process().
     // Copy first part; 0 to 15 ms.
     output->PushBackInterleaved(input, fs_mult_120 * num_channels_);

diff --git a/webrtc/modules/audio_coding/neteq/accelerate.h b/webrtc/modules/audio_coding/neteq/accelerate.h
index 36bc094..684f74b 100644
--- a/webrtc/modules/audio_coding/neteq/accelerate.h
+++ b/webrtc/modules/audio_coding/neteq/accelerate.h

@@ -38,9 +38,12 @@
   // read from |input|, of length |input_length| elements, and are written to
   // |output|. The number of samples removed through time-stretching is
   // is provided in the output |length_change_samples|. The method returns
-  // the outcome of the operation as an enumerator value.
+  // the outcome of the operation as an enumerator value. If |fast_accelerate|
+  // is true, the algorithm will relax the requirements on finding strong
+  // correlations, and may remove multiple pitch periods if possible.
   ReturnCodes Process(const int16_t* input,
                       size_t input_length,
+                      bool fast_accelerate,
                       AudioMultiVector* output,
                       int16_t* length_change_samples);
 
@@ -58,6 +61,7 @@
                                       size_t peak_index,
                                       int16_t best_correlation,
                                       bool active_speech,
+                                      bool fast_mode,
                                       AudioMultiVector* output) const override;
 
  private:

diff --git a/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc b/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc
index f238284..89fdb51 100644
--- a/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc
+++ b/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc

@@ -132,15 +132,13 @@
     // Check criterion for time-stretching.
     int low_limit, high_limit;
     delay_manager_->BufferLimits(&low_limit, &high_limit);
-    if ((buffer_level_filter_->filtered_current_level() >= high_limit &&
-        TimescaleAllowed()) ||
-        buffer_level_filter_->filtered_current_level() >= high_limit << 2) {
-      // Buffer level higher than limit and time-scaling allowed,
-      // or buffer level really high.
-      return kAccelerate;
-    } else if ((buffer_level_filter_->filtered_current_level() < low_limit)
-        && TimescaleAllowed()) {
-      return kPreemptiveExpand;
+    if (buffer_level_filter_->filtered_current_level() >= high_limit << 2)
+      return kFastAccelerate;
+    if (TimescaleAllowed()) {
+      if (buffer_level_filter_->filtered_current_level() >= high_limit)
+        return kAccelerate;
+      if (buffer_level_filter_->filtered_current_level() < low_limit)
+        return kPreemptiveExpand;
     }
   }
   return kNormal;

diff --git a/webrtc/modules/audio_coding/neteq/defines.h b/webrtc/modules/audio_coding/neteq/defines.h
index 33d1bd9..3ed6b61 100644
--- a/webrtc/modules/audio_coding/neteq/defines.h
+++ b/webrtc/modules/audio_coding/neteq/defines.h

@@ -18,6 +18,7 @@
   kMerge,
   kExpand,
   kAccelerate,
+  kFastAccelerate,
   kPreemptiveExpand,
   kRfc3389Cng,
   kRfc3389CngNoPacket,

diff --git a/webrtc/modules/audio_coding/neteq/interface/neteq.h b/webrtc/modules/audio_coding/neteq/interface/neteq.h
index 439f049..88bf208 100644
--- a/webrtc/modules/audio_coding/neteq/interface/neteq.h
+++ b/webrtc/modules/audio_coding/neteq/interface/neteq.h

@@ -80,7 +80,8 @@
           // |max_delay_ms| has the same effect as calling SetMaximumDelay().
           max_delay_ms(2000),
           background_noise_mode(kBgnOff),
-          playout_mode(kPlayoutOn) {}
+          playout_mode(kPlayoutOn),
+          enable_fast_accelerate(false) {}
 
     std::string ToString() const;
 
@@ -90,6 +91,7 @@
     int max_delay_ms;
     BackgroundNoiseMode background_noise_mode;
     NetEqPlayoutMode playout_mode;
+    bool enable_fast_accelerate;
   };
 
   enum ReturnCodes {

diff --git a/webrtc/modules/audio_coding/neteq/neteq.cc b/webrtc/modules/audio_coding/neteq/neteq.cc
index ea10069..c8c4c46 100644
--- a/webrtc/modules/audio_coding/neteq/neteq.cc
+++ b/webrtc/modules/audio_coding/neteq/neteq.cc

@@ -34,7 +34,8 @@
      << (enable_audio_classifier ? "true" : "false")
      << ", max_packets_in_buffer=" << max_packets_in_buffer
      << ", background_noise_mode=" << background_noise_mode
-     << ", playout_mode=" << playout_mode;
+     << ", playout_mode=" << playout_mode
+     << ", enable_fast_accelerate=" << enable_fast_accelerate;
   return ss.str();
 }
 

diff --git a/webrtc/modules/audio_coding/neteq/neteq_impl.cc b/webrtc/modules/audio_coding/neteq/neteq_impl.cc
index fe078fd..1351e66 100644
--- a/webrtc/modules/audio_coding/neteq/neteq_impl.cc
+++ b/webrtc/modules/audio_coding/neteq/neteq_impl.cc

@@ -92,6 +92,7 @@
       decoder_error_code_(0),
       background_noise_mode_(config.background_noise_mode),
       playout_mode_(config.playout_mode),
+      enable_fast_accelerate_(config.enable_fast_accelerate),
       decoded_packet_sequence_number_(-1),
       decoded_packet_timestamp_(0) {
   LOG(LS_INFO) << "NetEq config: " << config.ToString();
@@ -745,9 +746,12 @@
       return_value = DoExpand(play_dtmf);
       break;
     }
-    case kAccelerate: {
+    case kAccelerate:
+    case kFastAccelerate: {
+      const bool fast_accelerate =
+          enable_fast_accelerate_ && (operation == kFastAccelerate);
       return_value = DoAccelerate(decoded_buffer_.get(), length, speech_type,
-                                  play_dtmf);
+                                  play_dtmf, fast_accelerate);
       break;
     }
     case kPreemptiveExpand: {
@@ -956,9 +960,8 @@
   // Check if we already have enough samples in the |sync_buffer_|. If so,
   // change decision to normal, unless the decision was merge, accelerate, or
   // preemptive expand.
-  if (samples_left >= output_size_samples_ &&
-      *operation != kMerge &&
-      *operation != kAccelerate &&
+  if (samples_left >= output_size_samples_ && *operation != kMerge &&
+      *operation != kAccelerate && *operation != kFastAccelerate &&
       *operation != kPreemptiveExpand) {
     *operation = kNormal;
     return 0;
@@ -1034,8 +1037,9 @@
       decision_logic_->set_generated_noise_samples(0);
       return 0;
     }
-    case kAccelerate: {
-      // In order to do a accelerate we need at least 30 ms of audio data.
+    case kAccelerate:
+    case kFastAccelerate: {
+      // In order to do an accelerate we need at least 30 ms of audio data.
       if (samples_left >= samples_30_ms) {
         // Already have enough data, so we do not need to extract any more.
         decision_logic_->set_sample_memory(samples_left);
@@ -1124,13 +1128,13 @@
     }
   }
 
-  if (*operation == kAccelerate ||
+  if (*operation == kAccelerate || *operation == kFastAccelerate ||
       *operation == kPreemptiveExpand) {
     decision_logic_->set_sample_memory(samples_left + extracted_samples);
     decision_logic_->set_prev_time_scale(true);
   }
 
-  if (*operation == kAccelerate) {
+  if (*operation == kAccelerate || *operation == kFastAccelerate) {
     // Check that we have enough data (30ms) to do accelerate.
     if (extracted_samples + samples_left < samples_30_ms) {
       // TODO(hlundin): Write test for this.
@@ -1263,7 +1267,8 @@
     assert(sync_buffer_->Channels() == decoder->Channels());
     assert(decoded_buffer_length_ >= kMaxFrameSize * decoder->Channels());
     assert(*operation == kNormal || *operation == kAccelerate ||
-           *operation == kMerge || *operation == kPreemptiveExpand);
+           *operation == kFastAccelerate || *operation == kMerge ||
+           *operation == kPreemptiveExpand);
     packet_list->pop_front();
     size_t payload_length = packet->payload_length;
     int16_t decode_length;
@@ -1427,9 +1432,11 @@
   return 0;
 }
 
-int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, size_t decoded_length,
+int NetEqImpl::DoAccelerate(int16_t* decoded_buffer,
+                            size_t decoded_length,
                             AudioDecoder::SpeechType speech_type,
-                            bool play_dtmf) {
+                            bool play_dtmf,
+                            bool fast_accelerate) {
   const size_t required_samples = 240 * fs_mult_;  // Must have 30 ms.
   size_t borrowed_samples_per_channel = 0;
   size_t num_channels = algorithm_buffer_->Channels();
@@ -1447,9 +1454,9 @@
   }
 
   int16_t samples_removed;
-  Accelerate::ReturnCodes return_code = accelerate_->Process(
-      decoded_buffer, decoded_length, algorithm_buffer_.get(),
-      &samples_removed);
+  Accelerate::ReturnCodes return_code =
+      accelerate_->Process(decoded_buffer, decoded_length, fast_accelerate,
+                           algorithm_buffer_.get(), &samples_removed);
   stats_.AcceleratedSamples(samples_removed);
   switch (return_code) {
     case Accelerate::kSuccess:

diff --git a/webrtc/modules/audio_coding/neteq/neteq_impl.h b/webrtc/modules/audio_coding/neteq/neteq_impl.h
index 248071f..55ba067 100644
--- a/webrtc/modules/audio_coding/neteq/neteq_impl.h
+++ b/webrtc/modules/audio_coding/neteq/neteq_impl.h

@@ -278,7 +278,8 @@
   int DoAccelerate(int16_t* decoded_buffer,
                    size_t decoded_length,
                    AudioDecoder::SpeechType speech_type,
-                   bool play_dtmf) EXCLUSIVE_LOCKS_REQUIRED(crit_sect_);
+                   bool play_dtmf,
+                   bool fast_accelerate) EXCLUSIVE_LOCKS_REQUIRED(crit_sect_);
 
   // Sub-method which calls the PreemptiveExpand class to perform the
   // preemtive expand operation.
@@ -392,6 +393,7 @@
   int decoder_error_code_ GUARDED_BY(crit_sect_);
   const BackgroundNoiseMode background_noise_mode_ GUARDED_BY(crit_sect_);
   NetEqPlayoutMode playout_mode_ GUARDED_BY(crit_sect_);
+  bool enable_fast_accelerate_ GUARDED_BY(crit_sect_);
 
   // These values are used by NACK module to estimate time-to-play of
   // a missing packet. Occasionally, NetEq might decide to decode more

diff --git a/webrtc/modules/audio_coding/neteq/preemptive_expand.cc b/webrtc/modules/audio_coding/neteq/preemptive_expand.cc
index b2dc3e6..6a3f8ec 100644
--- a/webrtc/modules/audio_coding/neteq/preemptive_expand.cc
+++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.cc

@@ -34,7 +34,8 @@
     output->PushBackInterleaved(input, input_length);
     return kError;
   }
-  return TimeStretch::Process(input, input_length, output,
+  const bool kFastMode = false;  // Fast mode is not available for PE Expand.
+  return TimeStretch::Process(input, input_length, kFastMode, output,
                               length_change_samples);
 }
 
@@ -54,8 +55,12 @@
 }
 
 PreemptiveExpand::ReturnCodes PreemptiveExpand::CheckCriteriaAndStretch(
-    const int16_t *input, size_t input_length, size_t peak_index,
-    int16_t best_correlation, bool active_speech,
+    const int16_t* input,
+    size_t input_length,
+    size_t peak_index,
+    int16_t best_correlation,
+    bool active_speech,
+    bool /*fast_mode*/,
     AudioMultiVector* output) const {
   // Pre-calculate common multiplication with |fs_mult_|.
   // 120 corresponds to 15 ms.

diff --git a/webrtc/modules/audio_coding/neteq/preemptive_expand.h b/webrtc/modules/audio_coding/neteq/preemptive_expand.h
index 750c16b..c583a48 100644
--- a/webrtc/modules/audio_coding/neteq/preemptive_expand.h
+++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.h

@@ -58,11 +58,12 @@
 
   // Checks the criteria for performing the time-stretching operation and,
   // if possible, performs the time-stretching.
-  ReturnCodes CheckCriteriaAndStretch(const int16_t* pw16_decoded,
-                                      size_t len,
-                                      size_t w16_bestIndex,
-                                      int16_t w16_bestCorr,
-                                      bool w16_VAD,
+  ReturnCodes CheckCriteriaAndStretch(const int16_t* input,
+                                      size_t input_length,
+                                      size_t peak_index,
+                                      int16_t best_correlation,
+                                      bool active_speech,
+                                      bool /*fast_mode*/,
                                       AudioMultiVector* output) const override;
 
  private:

diff --git a/webrtc/modules/audio_coding/neteq/time_stretch.cc b/webrtc/modules/audio_coding/neteq/time_stretch.cc
index 02305c8..5577cd2 100644
--- a/webrtc/modules/audio_coding/neteq/time_stretch.cc
+++ b/webrtc/modules/audio_coding/neteq/time_stretch.cc

@@ -19,12 +19,11 @@
 
 namespace webrtc {
 
-TimeStretch::ReturnCodes TimeStretch::Process(
-    const int16_t* input,
-    size_t input_len,
-    AudioMultiVector* output,
-    int16_t* length_change_samples) {
-
+TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
+                                              size_t input_len,
+                                              bool fast_mode,
+                                              AudioMultiVector* output,
+                                              int16_t* length_change_samples) {
   // Pre-calculate common multiplication with |fs_mult_|.
   int fs_mult_120 = fs_mult_ * 120;  // Corresponds to 15 ms.
 
@@ -140,8 +139,9 @@
 
 
   // Check accelerate criteria and stretch the signal.
-  ReturnCodes return_value = CheckCriteriaAndStretch(
-      input, input_len, peak_index, best_correlation, active_speech, output);
+  ReturnCodes return_value =
+      CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
+                              active_speech, fast_mode, output);
   switch (return_value) {
     case kSuccess:
       *length_change_samples = peak_index;

diff --git a/webrtc/modules/audio_coding/neteq/time_stretch.h b/webrtc/modules/audio_coding/neteq/time_stretch.h
index 9396d8f..7c84e1a 100644
--- a/webrtc/modules/audio_coding/neteq/time_stretch.h
+++ b/webrtc/modules/audio_coding/neteq/time_stretch.h

@@ -58,6 +58,7 @@
   // PreemptiveExpand.
   ReturnCodes Process(const int16_t* input,
                       size_t input_len,
+                      bool fast_mode,
                       AudioMultiVector* output,
                       int16_t* length_change_samples);
 
@@ -73,8 +74,12 @@
   // if possible, performs the time-stretching. This method must be implemented
   // by the sub-classes.
   virtual ReturnCodes CheckCriteriaAndStretch(
-      const int16_t* input, size_t input_length, size_t peak_index,
-      int16_t best_correlation, bool active_speech,
+      const int16_t* input,
+      size_t input_length,
+      size_t peak_index,
+      int16_t best_correlation,
+      bool active_speech,
+      bool fast_mode,
       AudioMultiVector* output) const = 0;
 
   static const int kCorrelationLen = 50;

diff --git a/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc b/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc
index 3d1e069..05385a1 100644
--- a/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc
+++ b/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc

@@ -13,14 +13,24 @@
 #include "webrtc/modules/audio_coding/neteq/accelerate.h"
 #include "webrtc/modules/audio_coding/neteq/preemptive_expand.h"
 
+#include <map>
+
 #include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/base/checks.h"
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
 #include "webrtc/modules/audio_coding/neteq/background_noise.h"
+#include "webrtc/modules/audio_coding/neteq/tools/input_audio_file.h"
+#include "webrtc/test/testsupport/fileutils.h"
 
 namespace webrtc {
 
+namespace {
+const size_t kNumChannels = 1;
+}
+
 TEST(TimeStretch, CreateAndDestroy) {
   const int kSampleRate = 8000;
-  const size_t kNumChannels = 1;
   const int kOverlapSamples = 5 * kSampleRate / 8000;
   BackgroundNoise bgn(kNumChannels);
   Accelerate accelerate(kSampleRate, kNumChannels, bgn);
@@ -30,7 +40,6 @@
 
 TEST(TimeStretch, CreateUsingFactory) {
   const int kSampleRate = 8000;
-  const size_t kNumChannels = 1;
   const int kOverlapSamples = 5 * kSampleRate / 8000;
   BackgroundNoise bgn(kNumChannels);
 
@@ -47,6 +56,72 @@
   delete preemptive_expand;
 }
 
-// TODO(hlundin): Write more tests.
+class TimeStretchTest : public ::testing::Test {
+ protected:
+  TimeStretchTest()
+      : input_file_(new test::InputAudioFile(
+            test::ResourcePath("audio_coding/testfile32kHz", "pcm"))),
+        sample_rate_hz_(32000),
+        block_size_(30 * sample_rate_hz_ / 1000),  // 30 ms
+        audio_(new int16_t[block_size_]),
+        background_noise_(kNumChannels) {
+    WebRtcSpl_Init();
+  }
+
+  const int16_t* Next30Ms() {
+    CHECK(input_file_->Read(block_size_, audio_.get()));
+    return audio_.get();
+  }
+
+  // Returns the total length change (in samples) that the accelerate operation
+  // resulted in during the run.
+  int TestAccelerate(int loops, bool fast_mode) {
+    Accelerate accelerate(sample_rate_hz_, kNumChannels, background_noise_);
+    int total_length_change = 0;
+    for (int i = 0; i < loops; ++i) {
+      AudioMultiVector output(kNumChannels);
+      int16_t length_change;
+      UpdateReturnStats(accelerate.Process(Next30Ms(), block_size_, fast_mode,
+                                           &output, &length_change));
+      total_length_change += length_change;
+    }
+    return total_length_change;
+  }
+
+  void UpdateReturnStats(TimeStretch::ReturnCodes ret) {
+    switch (ret) {
+      case TimeStretch::kSuccess:
+      case TimeStretch::kSuccessLowEnergy:
+      case TimeStretch::kNoStretch:
+        ++return_stats_[ret];
+        break;
+      case TimeStretch::kError:
+        FAIL() << "Process returned an error";
+    }
+  }
+
+  rtc::scoped_ptr<test::InputAudioFile> input_file_;
+  const int sample_rate_hz_;
+  const size_t block_size_;
+  rtc::scoped_ptr<int16_t[]> audio_;
+  std::map<TimeStretch::ReturnCodes, int> return_stats_;
+  BackgroundNoise background_noise_;
+};
+
+TEST_F(TimeStretchTest, Accelerate) {
+  // TestAccelerate returns the total length change in samples.
+  EXPECT_EQ(15268, TestAccelerate(100, false));
+  EXPECT_EQ(9, return_stats_[TimeStretch::kSuccess]);
+  EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]);
+  EXPECT_EQ(33, return_stats_[TimeStretch::kNoStretch]);
+}
+
+TEST_F(TimeStretchTest, AccelerateFastMode) {
+  // TestAccelerate returns the total length change in samples.
+  EXPECT_EQ(21400, TestAccelerate(100, true));
+  EXPECT_EQ(31, return_stats_[TimeStretch::kSuccess]);
+  EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]);
+  EXPECT_EQ(11, return_stats_[TimeStretch::kNoStretch]);
+}
 
 }  // namespace webrtc
commit	cf808d2366e58b33540931d182f36800d9a15b0d	[log] [tgz]
author	Henrik Lundin <henrik.lundin@webrtc.org>	Wed May 27 14:33:29 2015 +0200
committer	Henrik Lundin <henrik.lundin@webrtc.org>	Wed May 27 12:33:39 2015 +0000
tree	821aa7a6bd6ecd9052e4305f43f5f0a981dd5f27
parent	cbe408aa118e46e1f1dd28d201378968f00b60ea [diff]