Clarify and extend test support for certain sample rates in audio processing

Sample rates not divisible by 100, in particular 11025 Hz and 22050 Hz, have long been used with APM in Chrome, but the support has never been stated explicitly.

This CL makes minor modifications to the APM API to clarify how rates are handled when 10 ms does not correspond to an integer number of samples. Unit tests are also extended to cover this case better.

This CL does not update every reference to 10 ms or every implicit floor(sample_rate/100) computation, but it does at least take us closer to a correct API.

Note that not all code needs to support these sample rates. For example, audio processing submodules only need to operate on the native APM rates 16000, 32000, and 48000 Hz.
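
As an illustration (not part of the change itself), here is a minimal
sketch of how a caller would size per-channel buffers against the
clarified API. The helper MakeMonoFrame is hypothetical; it assumes only
the AudioProcessing::GetFrameSize() added below and the usual WebRTC
include path:

  #include <vector>

  #include "modules/audio_processing/include/audio_processing.h"

  // Hypothetical helper: allocates one channel of a ~10 ms APM frame.
  std::vector<float> MakeMonoFrame(int sample_rate_hz) {
    // 48000 Hz -> 480 samples (exactly 10 ms);
    // 22050 Hz -> floor(22050 / 100) = 220 samples (~9.98 ms per frame).
    return std::vector<float>(
        webrtc::AudioProcessing::GetFrameSize(sample_rate_hz));
  }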

Bug: chromium:1332484
Change-Id: I1dad15468f6ccb9c0d4d09c5819fe87f8388d5b8
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/268769
Reviewed-by: Henrik Andersson <henrika@webrtc.org>
Commit-Queue: Sam Zackrisson <saza@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#37682}
diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc
index e0512c9..32cf9a8 100644
--- a/modules/audio_processing/audio_processing_unittest.cc
+++ b/modules/audio_processing/audio_processing_unittest.cc
@@ -66,18 +66,9 @@
 namespace webrtc {
 namespace {
 
-// TODO(ekmeyerson): Switch to using StreamConfig and ProcessingConfig where
-// applicable.
-
-const int32_t kChannels[] = {1, 2};
-const int kSampleRates[] = {8000, 16000, 32000, 48000};
-
-#if defined(WEBRTC_AUDIOPROC_FIXED_PROFILE)
-// Android doesn't support 48kHz.
-const int kProcessSampleRates[] = {8000, 16000, 32000};
-#elif defined(WEBRTC_AUDIOPROC_FLOAT_PROFILE)
-const int kProcessSampleRates[] = {8000, 16000, 32000, 48000};
-#endif
+// All sample rates used by APM internally during processing. Other input /
+// output rates are resampled to / from one of these.
+const int kProcessSampleRates[] = {16000, 32000, 48000};
 
 enum StreamDirection { kForward = 0, kReverse };
 
@@ -300,10 +291,11 @@
   fclose(file);
 }
 
-// Reads a 10 ms chunk of int16 interleaved audio from the given (assumed
-// stereo) file, converts to deinterleaved float (optionally downmixing) and
-// returns the result in `cb`. Returns false if the file ended (or on error) and
-// true otherwise.
+// Reads a 10 ms chunk (actually AudioProcessing::GetFrameSize() samples per
+// channel) of int16 interleaved audio from the given (assumed stereo) file,
+// converts to deinterleaved float (optionally downmixing) and returns the
+// result in `cb`. Returns false if the file ended (or on error) and true
+// otherwise.
 //
 // `int_data` and `float_data` are just temporary space that must be
 // sufficiently large to hold the 10 ms chunk.
@@ -1150,8 +1142,9 @@
 // Verifies that despite volume slider quantization, the AGC can continue to
 // increase its volume.
 TEST_F(ApmTest, QuantizedVolumeDoesNotGetStuck) {
-  for (size_t i = 0; i < arraysize(kSampleRates); ++i) {
-    RunQuantizedVolumeDoesNotGetStuckTest(kSampleRates[i]);
+  for (size_t sample_rate_hz : kProcessSampleRates) {
+    SCOPED_TRACE(::testing::Message() << "sample_rate_hz=" << sample_rate_hz);
+    RunQuantizedVolumeDoesNotGetStuckTest(sample_rate_hz);
   }
 }
 
@@ -1205,8 +1198,9 @@
 }
 
 TEST_F(ApmTest, ManualVolumeChangeIsPossible) {
-  for (size_t i = 0; i < arraysize(kSampleRates); ++i) {
-    RunManualVolumeChangeIsPossibleTest(kSampleRates[i]);
+  for (size_t sample_rate_hz : kProcessSampleRates) {
+    SCOPED_TRACE(::testing::Message() << "sample_rate_hz=" << sample_rate_hz);
+    RunManualVolumeChangeIsPossibleTest(sample_rate_hz);
   }
 }
 
@@ -1227,9 +1221,18 @@
   EXPECT_FALSE(config.noise_suppression.enabled);
 }
 
-TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
-  for (size_t i = 0; i < arraysize(kSampleRates); i++) {
-    Init(kSampleRates[i], kSampleRates[i], kSampleRates[i], 2, 2, 2, false);
+TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabledInt) {
+  // Test that ProcessStream simply copies input to output when all components
+  // are disabled.
+  // Runs over all processing rates, and some particularly common or special
+  // rates.
+  // - 8000 Hz: lowest sample rate seen in Chrome metrics,
+  // - 22050 Hz: APM input/output frames are not exactly 10 ms,
+  // - 44100 Hz: very common desktop sample rate.
+  constexpr int kSampleRatesHz[] = {8000, 16000, 22050, 32000, 44100, 48000};
+  for (size_t sample_rate_hz : kSampleRatesHz) {
+    SCOPED_TRACE(::testing::Message() << "sample_rate_hz=" << sample_rate_hz);
+    Init(sample_rate_hz, sample_rate_hz, sample_rate_hz, 2, 2, 2, false);
     SetFrameTo(&frame_, 1000, 2000);
     Int16FrameData frame_copy;
     frame_copy.CopyFrom(frame_);
@@ -1253,7 +1256,8 @@
 }
 
 TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabledFloat) {
-  // Test that ProcessStream copies input to output even with no processing.
+  // Test that ProcessStream simply copies input to output when all components
+  // are disabled.
   const size_t kSamples = 160;
   const int sample_rate = 16000;
   const float src[kSamples] = {-1.0f, 0.0f, 1.0f};
@@ -1644,15 +1648,16 @@
   if (!absl::GetFlag(FLAGS_write_apm_ref_data)) {
     OpenFileAndReadMessage(ref_filename_, &ref_data);
   } else {
+    const int kChannels[] = {1, 2};
     // Write the desired tests to the protobuf reference file.
     for (size_t i = 0; i < arraysize(kChannels); i++) {
       for (size_t j = 0; j < arraysize(kChannels); j++) {
-        for (size_t l = 0; l < arraysize(kProcessSampleRates); l++) {
+        for (int sample_rate_hz : AudioProcessing::kNativeSampleRatesHz) {
           audioproc::Test* test = ref_data.add_test();
           test->set_num_reverse_channels(kChannels[i]);
           test->set_num_input_channels(kChannels[j]);
           test->set_num_output_channels(kChannels[j]);
-          test->set_sample_rate(kProcessSampleRates[l]);
+          test->set_sample_rate(sample_rate_hz);
           test->set_use_aec_extended_filter(false);
         }
       }
@@ -1821,10 +1826,12 @@
                    int expected_delay,
                    double* variance_acc,
                    double* sq_error_acc) {
+  RTC_CHECK_LT(expected_delay, length)
+      << "delay greater than signal length, cannot compute SNR";
   double best_snr = std::numeric_limits<double>::min();
   double best_variance = 0;
   double best_sq_error = 0;
-  // Search over a region of eight samples around the expected delay.
+  // Search over a region of nine samples around the expected delay.
   for (int delay = std::max(expected_delay - 4, 0); delay <= expected_delay + 4;
        ++delay) {
     double sq_error = 0;
@@ -1879,15 +1886,15 @@
 
   static void SetUpTestSuite() {
     // Create all needed output reference files.
-    const int kNativeRates[] = {8000, 16000, 32000, 48000};
     const size_t kNumChannels[] = {1, 2};
-    for (size_t i = 0; i < arraysize(kNativeRates); ++i) {
+    for (size_t i = 0; i < arraysize(kProcessSampleRates); ++i) {
       for (size_t j = 0; j < arraysize(kNumChannels); ++j) {
         for (size_t k = 0; k < arraysize(kNumChannels); ++k) {
           // The reference files always have matching input and output channels.
-          ProcessFormat(kNativeRates[i], kNativeRates[i], kNativeRates[i],
-                        kNativeRates[i], kNumChannels[j], kNumChannels[j],
-                        kNumChannels[k], kNumChannels[k], "ref");
+          ProcessFormat(kProcessSampleRates[i], kProcessSampleRates[i],
+                        kProcessSampleRates[i], kProcessSampleRates[i],
+                        kNumChannels[j], kNumChannels[j], kNumChannels[k],
+                        kNumChannels[k], "ref");
         }
       }
     }
@@ -1912,11 +1919,10 @@
                             size_t num_reverse_input_channels,
                             size_t num_reverse_output_channels,
                             const std::string& output_file_prefix) {
-    rtc::scoped_refptr<AudioProcessing> ap =
-        AudioProcessingBuilderForTesting().Create();
-    AudioProcessing::Config apm_config = ap->GetConfig();
+    AudioProcessing::Config apm_config;
     apm_config.gain_controller1.analog_gain_controller.enabled = false;
-    ap->ApplyConfig(apm_config);
+    rtc::scoped_refptr<AudioProcessing> ap =
+        AudioProcessingBuilderForTesting().SetConfig(apm_config).Create();
 
     EnableAllAPComponents(ap.get());
 
@@ -1949,14 +1955,16 @@
     ASSERT_TRUE(out_file != NULL);
     ASSERT_TRUE(rev_out_file != NULL);
 
-    ChannelBuffer<float> fwd_cb(SamplesFromRate(input_rate),
+    ChannelBuffer<float> fwd_cb(AudioProcessing::GetFrameSize(input_rate),
                                 num_input_channels);
-    ChannelBuffer<float> rev_cb(SamplesFromRate(reverse_input_rate),
-                                num_reverse_input_channels);
-    ChannelBuffer<float> out_cb(SamplesFromRate(output_rate),
+    ChannelBuffer<float> rev_cb(
+        AudioProcessing::GetFrameSize(reverse_input_rate),
+        num_reverse_input_channels);
+    ChannelBuffer<float> out_cb(AudioProcessing::GetFrameSize(output_rate),
                                 num_output_channels);
-    ChannelBuffer<float> rev_out_cb(SamplesFromRate(reverse_output_rate),
-                                    num_reverse_output_channels);
+    ChannelBuffer<float> rev_out_cb(
+        AudioProcessing::GetFrameSize(reverse_output_rate),
+        num_reverse_output_channels);
 
     // Temporary buffers.
     const int max_length =
@@ -2044,15 +2052,12 @@
 
       const int min_ref_rate = std::min(in_rate, out_rate);
       int ref_rate;
-
       if (min_ref_rate > 32000) {
         ref_rate = 48000;
       } else if (min_ref_rate > 16000) {
         ref_rate = 32000;
-      } else if (min_ref_rate > 8000) {
-        ref_rate = 16000;
       } else {
-        ref_rate = 8000;
+        ref_rate = 16000;
       }
 
       FILE* out_file = fopen(
@@ -2073,8 +2078,10 @@
       ASSERT_TRUE(out_file != NULL);
       ASSERT_TRUE(ref_file != NULL);
 
-      const size_t ref_length = SamplesFromRate(ref_rate) * out_num;
-      const size_t out_length = SamplesFromRate(out_rate) * out_num;
+      const size_t ref_length =
+          AudioProcessing::GetFrameSize(ref_rate) * out_num;
+      const size_t out_length =
+          AudioProcessing::GetFrameSize(out_rate) * out_num;
       // Data from the reference file.
       std::unique_ptr<float[]> ref_data(new float[ref_length]);
       // Data from the output file.
@@ -2103,6 +2110,9 @@
         expected_delay_sec +=
             PushSincResampler::AlgorithmicDelaySeconds(out_rate);
       }
+      // The delay is multiplied by the number of channels because
+      // UpdateBestSNR() computes the SNR over interleaved data without taking
+      // channels into account.
       int expected_delay =
           std::floor(expected_delay_sec * ref_rate + 0.5f) * out_num;
 
@@ -2113,7 +2123,7 @@
         float* out_ptr = out_data.get();
         if (out_rate != ref_rate) {
           // Resample the output back to its internal processing rate if
-          // necssary.
+          // necessary.
           ASSERT_EQ(ref_length,
                     static_cast<size_t>(resampler.Resample(
                         out_ptr, out_length, cmp_data.get(), ref_length)));
@@ -2150,6 +2160,8 @@
 INSTANTIATE_TEST_SUITE_P(
     CommonFormats,
     AudioProcessingTest,
+    // Internal processing rates and the particularly common sample rate 44100
+    // Hz are tested in a grid of combinations (capture in, render in, out).
     ::testing::Values(std::make_tuple(48000, 48000, 48000, 48000, 0, 0),
                       std::make_tuple(48000, 48000, 32000, 48000, 40, 30),
                       std::make_tuple(48000, 48000, 16000, 48000, 40, 20),
@@ -2200,7 +2212,21 @@
                       std::make_tuple(16000, 32000, 16000, 32000, 25, 20),
                       std::make_tuple(16000, 16000, 48000, 16000, 39, 20),
                       std::make_tuple(16000, 16000, 32000, 16000, 39, 20),
-                      std::make_tuple(16000, 16000, 16000, 16000, 0, 0)));
+                      std::make_tuple(16000, 16000, 16000, 16000, 0, 0),
+
+                      // Other sample rates are not tested exhaustively, to keep
+                      // the test runtime manageable.
+                      //
+                      // Testing most other sample rates logged by Chrome UMA:
+                      //  - WebRTC.AudioInputSampleRate
+                      //  - WebRTC.AudioOutputSampleRate
+                      // ApmConfiguration.HandlingOfRateCombinations covers
+                      // remaining sample rates.
+                      std::make_tuple(192000, 192000, 48000, 192000, 20, 40),
+                      std::make_tuple(176400, 176400, 48000, 176400, 20, 35),
+                      std::make_tuple(96000, 96000, 48000, 96000, 20, 40),
+                      std::make_tuple(88200, 88200, 48000, 88200, 20, 20),
+                      std::make_tuple(44100, 44100, 48000, 44100, 20, 20)));
 
 #elif defined(WEBRTC_AUDIOPROC_FIXED_PROFILE)
 INSTANTIATE_TEST_SUITE_P(
@@ -2256,7 +2282,13 @@
                       std::make_tuple(16000, 32000, 16000, 32000, 25, 20),
                       std::make_tuple(16000, 16000, 48000, 16000, 28, 20),
                       std::make_tuple(16000, 16000, 32000, 16000, 28, 20),
-                      std::make_tuple(16000, 16000, 16000, 16000, 0, 0)));
+                      std::make_tuple(16000, 16000, 16000, 16000, 0, 0),
+
+                      std::make_tuple(192000, 192000, 48000, 192000, 20, 40),
+                      std::make_tuple(176400, 176400, 48000, 176400, 20, 35),
+                      std::make_tuple(96000, 96000, 48000, 96000, 20, 40),
+                      std::make_tuple(88200, 88200, 48000, 88200, 20, 20),
+                      std::make_tuple(44100, 44100, 48000, 44100, 20, 20)));
 #endif
 
 // Produces a scoped trace debug output.
@@ -2297,11 +2329,12 @@
     rtc::ArrayView<const int> sample_rates_hz,
     rtc::ArrayView<const int> render_channel_counts,
     rtc::ArrayView<const int> capture_channel_counts) {
-  rtc::scoped_refptr<AudioProcessing> apm =
-      AudioProcessingBuilderForTesting().Create();
   webrtc::AudioProcessing::Config apm_config;
+  apm_config.pipeline.multi_channel_render = true;
+  apm_config.pipeline.multi_channel_capture = true;
   apm_config.echo_canceller.enabled = true;
-  apm->ApplyConfig(apm_config);
+  rtc::scoped_refptr<AudioProcessing> apm =
+      AudioProcessingBuilderForTesting().SetConfig(apm_config).Create();
 
   StreamConfig render_input_stream_config;
   StreamConfig render_output_stream_config;
@@ -2333,7 +2366,8 @@
                 cfg->set_sample_rate_hz(sample_rate_hz);
                 cfg->set_num_channels(num_channels);
 
-                size_t max_frame_size = ceil(sample_rate_hz / 100.f);
+                size_t max_frame_size =
+                    AudioProcessing::GetFrameSize(sample_rate_hz);
                 channels_data->resize(num_channels * max_frame_size);
                 std::fill(channels_data->begin(), channels_data->end(), 0.5f);
                 frame_data->resize(num_channels);
@@ -2821,8 +2855,13 @@
 }
 
 TEST(ApmConfiguration, HandlingOfRateCombinations) {
-  std::array<int, 9> sample_rates_hz = {8000,  11025, 16000,  22050, 32000,
-                                        48000, 96000, 192000, 384000};
+  // Test rates <= 96000 logged by Chrome UMA:
+  //  - WebRTC.AudioInputSampleRate
+  //  - WebRTC.AudioOutputSampleRate
+  // Higher rates are tested in AudioProcessingTest.Format, to keep the number
+  // of combinations in this test manageable.
+  std::array<int, 9> sample_rates_hz = {8000,  11025, 16000, 22050, 32000,
+                                        44100, 48000, 88200, 96000};
   std::array<int, 1> render_channel_counts = {2};
   std::array<int, 1> capture_channel_counts = {2};
   RunApmRateAndChannelTest(sample_rates_hz, render_channel_counts,
diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h
index a49067b..72791bd 100644
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h
@@ -93,9 +93,9 @@
 //   2. Parameter getters are never called concurrently with the corresponding
 //      setter.
 //
-// APM accepts only linear PCM audio data in chunks of 10 ms. The int16
-// interfaces use interleaved data, while the float interfaces use deinterleaved
-// data.
+// APM accepts only linear PCM audio data in chunks of ~10 ms (see
+// AudioProcessing::GetFrameSize() for details). The int16 interfaces use
+// interleaved data, while the float interfaces use deinterleaved data.
 //
 // Usage example, omitting error checking:
 // AudioProcessing* apm = AudioProcessingBuilder().Create();
@@ -536,7 +536,7 @@
   // enqueueing was successfull.
   virtual bool PostRuntimeSetting(RuntimeSetting setting) = 0;
 
-  // Accepts and produces a 10 ms frame interleaved 16 bit integer audio as
+  // Accepts and produces a ~10 ms frame of interleaved 16 bit integer audio as
   // specified in `input_config` and `output_config`. `src` and `dest` may use
   // the same memory, if desired.
   virtual int ProcessStream(const int16_t* const src,
@@ -556,7 +556,7 @@
                             const StreamConfig& output_config,
                             float* const* dest) = 0;
 
-  // Accepts and produces a 10 ms frame of interleaved 16 bit integer audio for
+  // Accepts and produces a ~10 ms frame of interleaved 16 bit integer audio for
   // the reverse direction audio stream as specified in `input_config` and
   // `output_config`. `src` and `dest` may use the same memory, if desired.
   virtual int ProcessReverseStream(const int16_t* const src,
@@ -577,10 +577,10 @@
   virtual int AnalyzeReverseStream(const float* const* data,
                                    const StreamConfig& reverse_config) = 0;
 
-  // Returns the most recently produced 10 ms of the linear AEC output at a rate
-  // of 16 kHz. If there is more than one capture channel, a mono representation
-  // of the input is returned. Returns true/false to indicate whether an output
-  // returned.
+  // Returns the most recently produced ~10 ms of the linear AEC output at a
+  // rate of 16 kHz. If there is more than one capture channel, a mono
+  // representation of the input is returned. Returns true/false to indicate
+  // whether an output returned.
   virtual bool GetLinearAecOutput(
       rtc::ArrayView<std::array<float, 160>> linear_output) const = 0;
 
@@ -706,7 +706,29 @@
   static constexpr int kMaxNativeSampleRateHz =
       kNativeSampleRatesHz[kNumNativeSampleRates - 1];
 
+  // APM processes audio in chunks of about 10 ms. See GetFrameSize() for
+  // details.
   static constexpr int kChunkSizeMs = 10;
+
+  // Returns floor(sample_rate_hz/100): the number of samples per channel used
+  // as input and output to the audio processing module in calls to
+  // ProcessStream, ProcessReverseStream, AnalyzeReverseStream, and
+  // GetLinearAecOutput.
+  //
+  // This is exactly 10 ms for sample rates divisible by 100. For example:
+  //  - 48000 Hz (480 samples per channel),
+  //  - 44100 Hz (441 samples per channel),
+  //  - 16000 Hz (160 samples per channel).
+  //
+  // Sample rates not divisible by 100 are received/produced in frames of
+  // approximately 10 ms. For example:
+  //  - 22050 Hz (220 samples per channel, or ~9.98 ms per frame),
+  //  - 11025 Hz (110 samples per channel, or ~9.98 ms per frame).
+  // These nondivisible sample rates yield lower audio quality compared to
+  // multiples of 100. Internal resampling to 10 ms frames causes a simulated
+  // clock drift effect which impacts the performance of (for example) echo
+  // cancellation.
+  static int GetFrameSize(int sample_rate_hz) { return sample_rate_hz / 100; }
 };
 
 class RTC_EXPORT AudioProcessingBuilder {
@@ -804,8 +826,7 @@
 
  private:
   static size_t calculate_frames(int sample_rate_hz) {
-    return static_cast<size_t>(AudioProcessing::kChunkSizeMs * sample_rate_hz /
-                               1000);
+    return static_cast<size_t>(AudioProcessing::GetFrameSize(sample_rate_hz));
   }
 
   int sample_rate_hz_;
diff --git a/modules/audio_processing/test/test_utils.cc b/modules/audio_processing/test/test_utils.cc
index c041a68..dda2c3b 100644
--- a/modules/audio_processing/test/test_utils.cc
+++ b/modules/audio_processing/test/test_utils.cc
@@ -77,10 +77,6 @@
   return file;
 }
 
-size_t SamplesFromRate(int rate) {
-  return static_cast<size_t>(AudioProcessing::kChunkSizeMs * rate / 1000);
-}
-
 void SetFrameSampleRate(Int16FrameData* frame, int sample_rate_hz) {
   frame->sample_rate_hz = sample_rate_hz;
   frame->samples_per_channel =
diff --git a/modules/audio_processing/test/test_utils.h b/modules/audio_processing/test/test_utils.h
index 218052f..063ce87 100644
--- a/modules/audio_processing/test/test_utils.h
+++ b/modules/audio_processing/test/test_utils.h
@@ -114,8 +114,6 @@
 // Exits on failure; do not use in unit tests.
 FILE* OpenFile(const std::string& filename, const char* mode);
 
-size_t SamplesFromRate(int rate);
-
 void SetFrameSampleRate(Int16FrameData* frame, int sample_rate_hz);
 
 template <typename T>