Add plumbing of RtpPacketInfos to each AudioFrame as input for SourceTracker.

This change adds the plumbing of RtpPacketInfo from ChannelReceive::OnRtpPacket() to ChannelReceive::GetAudioFrameWithInfo() for audio. It is a step towards replacing the non-spec compliant ContributingSources that updates itself at packet-receive time, with the spec-compliant SourceTracker that will update itself at frame-delivery-to-track time.

Bug: webrtc:10668
Change-Id: I03385d6865bbc7bfbef7634f88de820a934f787a
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/139890
Reviewed-by: Stefan Holmer <stefan@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Commit-Queue: Chen Xing <chxg@google.com>
Cr-Commit-Position: refs/heads/master@{#28434}
diff --git a/modules/audio_coding/neteq/neteq_impl_unittest.cc b/modules/audio_coding/neteq/neteq_impl_unittest.cc
index 2b0d583..0c7c090 100644
--- a/modules/audio_coding/neteq/neteq_impl_unittest.cc
+++ b/modules/audio_coding/neteq/neteq_impl_unittest.cc
@@ -9,6 +9,8 @@
  */
 
 #include <memory>
+#include <utility>
+#include <vector>
 
 #include "absl/memory/memory.h"
 #include "api/audio_codecs/builtin_audio_decoder_factory.h"
@@ -30,6 +32,7 @@
 #include "modules/audio_coding/neteq/sync_buffer.h"
 #include "modules/audio_coding/neteq/timestamp_scaler.h"
 #include "rtc_base/numerics/safe_conversions.h"
+#include "system_wrappers/include/clock.h"
 #include "test/audio_decoder_proxy_factory.h"
 #include "test/function_audio_decoder_factory.h"
 #include "test/gmock.h"
@@ -40,14 +43,17 @@
 using ::testing::_;
 using ::testing::AtLeast;
 using ::testing::DoAll;
+using ::testing::ElementsAre;
 using ::testing::InSequence;
 using ::testing::Invoke;
+using ::testing::IsEmpty;
 using ::testing::IsNull;
 using ::testing::Pointee;
 using ::testing::Return;
 using ::testing::ReturnNull;
 using ::testing::SetArgPointee;
 using ::testing::SetArrayArgument;
+using ::testing::SizeIs;
 using ::testing::WithArg;
 
 namespace webrtc {
@@ -62,12 +68,12 @@
 
 class NetEqImplTest : public ::testing::Test {
  protected:
-  NetEqImplTest() { config_.sample_rate_hz = 8000; }
+  NetEqImplTest() : clock_(0) { config_.sample_rate_hz = 8000; }
 
   void CreateInstance(
       const rtc::scoped_refptr<AudioDecoderFactory>& decoder_factory) {
     ASSERT_TRUE(decoder_factory);
-    NetEqImpl::Dependencies deps(config_, decoder_factory);
+    NetEqImpl::Dependencies deps(config_, &clock_, decoder_factory);
 
     // Get a local pointer to NetEq's TickTimer object.
     tick_timer_ = deps.tick_timer.get();
@@ -217,6 +223,10 @@
     EXPECT_EQ(1u, output.num_channels_);
     EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_);
 
+    // DTMF packets are immediately consumed by |InsertPacket()| and won't be
+    // returned by |GetAudio()|.
+    EXPECT_THAT(output.packet_infos_, IsEmpty());
+
     // Verify first 64 samples of actual output.
     const std::vector<int16_t> kOutput({
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1578, -2816, -3460, -3403, -2709, -1594,
@@ -231,6 +241,7 @@
 
   std::unique_ptr<NetEqImpl> neteq_;
   NetEq::Config config_;
+  SimulatedClock clock_;
   TickTimer* tick_timer_ = nullptr;
   MockBufferLevelFilter* mock_buffer_level_filter_ = nullptr;
   BufferLevelFilter* buffer_level_filter_ = nullptr;
@@ -263,7 +274,9 @@
 // TODO(hlundin): Move to separate file?
 TEST(NetEq, CreateAndDestroy) {
   NetEq::Config config;
-  NetEq* neteq = NetEq::Create(config, CreateBuiltinAudioDecoderFactory());
+  SimulatedClock clock(0);
+  NetEq* neteq =
+      NetEq::Create(config, &clock, CreateBuiltinAudioDecoderFactory());
   delete neteq;
 }
 
@@ -458,6 +471,10 @@
   rtp_header.sequenceNumber = 0x1234;
   rtp_header.timestamp = 0x12345678;
   rtp_header.ssrc = 0x87654321;
+  rtp_header.numCSRCs = 3;
+  rtp_header.arrOfCSRCs[0] = 43;
+  rtp_header.arrOfCSRCs[1] = 65;
+  rtp_header.arrOfCSRCs[2] = 17;
 
   // This is a dummy decoder that produces as many output samples as the input
   // has bytes. The output is an increasing series, starting at 1 for the first
@@ -501,6 +518,8 @@
                                           SdpAudioFormat("L16", 8000, 1)));
 
   // Insert one packet.
+  clock_.AdvanceTimeMilliseconds(123456);
+  int64_t expected_receive_time_ms = clock_.TimeInMilliseconds();
   EXPECT_EQ(NetEq::kOK,
             neteq_->InsertPacket(rtp_header, payload, kReceiveTime));
 
@@ -514,6 +533,17 @@
   EXPECT_EQ(1u, output.num_channels_);
   EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_);
 
+  // Verify |output.packet_infos_|.
+  ASSERT_THAT(output.packet_infos_, SizeIs(1));
+  {
+    const auto& packet_info = output.packet_infos_[0];
+    EXPECT_EQ(packet_info.ssrc(), rtp_header.ssrc);
+    EXPECT_THAT(packet_info.csrcs(), ElementsAre(43, 65, 17));
+    EXPECT_EQ(packet_info.rtp_timestamp(), rtp_header.timestamp);
+    EXPECT_FALSE(packet_info.audio_level().has_value());
+    EXPECT_EQ(packet_info.receive_time_ms(), expected_receive_time_ms);
+  }
+
   // Start with a simple check that the fake decoder is behaving as expected.
   EXPECT_EQ(kPayloadLengthSamples,
             static_cast<size_t>(decoder_.next_value() - 1));
@@ -561,6 +591,8 @@
   rtp_header.sequenceNumber = 0x1234;
   rtp_header.timestamp = 0x12345678;
   rtp_header.ssrc = 0x87654321;
+  rtp_header.extension.hasAudioLevel = true;
+  rtp_header.extension.audioLevel = 42;
 
   EXPECT_CALL(mock_decoder, Reset()).WillRepeatedly(Return());
   EXPECT_CALL(mock_decoder, SampleRateHz())
@@ -583,6 +615,8 @@
                                           SdpAudioFormat("L16", 8000, 1)));
 
   // Insert one packet.
+  clock_.AdvanceTimeMilliseconds(123456);
+  int64_t expected_receive_time_ms = clock_.TimeInMilliseconds();
   EXPECT_EQ(NetEq::kOK,
             neteq_->InsertPacket(rtp_header, payload, kReceiveTime));
 
@@ -595,16 +629,32 @@
   EXPECT_EQ(1u, output.num_channels_);
   EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_);
 
+  // Verify |output.packet_infos_|.
+  ASSERT_THAT(output.packet_infos_, SizeIs(1));
+  {
+    const auto& packet_info = output.packet_infos_[0];
+    EXPECT_EQ(packet_info.ssrc(), rtp_header.ssrc);
+    EXPECT_THAT(packet_info.csrcs(), IsEmpty());
+    EXPECT_EQ(packet_info.rtp_timestamp(), rtp_header.timestamp);
+    EXPECT_EQ(packet_info.audio_level(), rtp_header.extension.audioLevel);
+    EXPECT_EQ(packet_info.receive_time_ms(), expected_receive_time_ms);
+  }
+
   // Insert two more packets. The first one is out of order, and is already too
   // old, the second one is the expected next packet.
   rtp_header.sequenceNumber -= 1;
   rtp_header.timestamp -= kPayloadLengthSamples;
+  rtp_header.extension.audioLevel = 1;
   payload[0] = 1;
+  clock_.AdvanceTimeMilliseconds(1000);
   EXPECT_EQ(NetEq::kOK,
             neteq_->InsertPacket(rtp_header, payload, kReceiveTime));
   rtp_header.sequenceNumber += 2;
   rtp_header.timestamp += 2 * kPayloadLengthSamples;
+  rtp_header.extension.audioLevel = 2;
   payload[0] = 2;
+  clock_.AdvanceTimeMilliseconds(2000);
+  expected_receive_time_ms = clock_.TimeInMilliseconds();
   EXPECT_EQ(NetEq::kOK,
             neteq_->InsertPacket(rtp_header, payload, kReceiveTime));
 
@@ -627,6 +677,17 @@
   // out-of-order packet should have been discarded.
   EXPECT_TRUE(packet_buffer_->Empty());
 
+  // Verify |output.packet_infos_|. Expect to only see the second packet.
+  ASSERT_THAT(output.packet_infos_, SizeIs(1));
+  {
+    const auto& packet_info = output.packet_infos_[0];
+    EXPECT_EQ(packet_info.ssrc(), rtp_header.ssrc);
+    EXPECT_THAT(packet_info.csrcs(), IsEmpty());
+    EXPECT_EQ(packet_info.rtp_timestamp(), rtp_header.timestamp);
+    EXPECT_EQ(packet_info.audio_level(), rtp_header.extension.audioLevel);
+    EXPECT_EQ(packet_info.receive_time_ms(), expected_receive_time_ms);
+  }
+
   EXPECT_CALL(mock_decoder, Die());
 }
 
@@ -663,6 +724,7 @@
   EXPECT_EQ(kMaxOutputSize, output.samples_per_channel_);
   EXPECT_EQ(1u, output.num_channels_);
   EXPECT_EQ(AudioFrame::kPLC, output.speech_type_);
+  EXPECT_THAT(output.packet_infos_, IsEmpty());
 
   // Register the payload type.
   EXPECT_TRUE(neteq_->RegisterPayloadType(kPayloadType,
@@ -685,6 +747,7 @@
     EXPECT_EQ(1u, output.num_channels_);
     EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_)
         << "NetEq did not decode the packets as expected.";
+    EXPECT_THAT(output.packet_infos_, SizeIs(1));
   }
 }
 
@@ -722,6 +785,7 @@
     EXPECT_EQ(kMaxOutputSize, output.samples_per_channel_);
     EXPECT_EQ(1u, output.num_channels_);
     EXPECT_NE(AudioFrame::kNormalSpeech, output.speech_type_);
+    EXPECT_THAT(output.packet_infos_, IsEmpty());
   }
 
   // Insert 10 packets.
@@ -741,6 +805,7 @@
     EXPECT_EQ(1u, output.num_channels_);
     EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_)
         << "NetEq did not decode the packets as expected.";
+    EXPECT_THAT(output.packet_infos_, SizeIs(1));
   }
 
   auto lifetime_stats = neteq_->GetLifetimeStatistics();
@@ -975,12 +1040,14 @@
   const size_t kExpectedOutputSize = 10 * (kSampleRateHz / 1000) * kChannels;
   EXPECT_EQ(kExpectedOutputSize, output.samples_per_channel_ * kChannels);
   EXPECT_EQ(kChannels, output.num_channels_);
+  EXPECT_THAT(output.packet_infos_, IsEmpty());
 
   // Second call to GetAudio will decode the packet that is ok. No errors are
   // expected.
   EXPECT_EQ(NetEq::kOK, neteq_->GetAudio(&output, &muted));
   EXPECT_EQ(kExpectedOutputSize, output.samples_per_channel_ * kChannels);
   EXPECT_EQ(kChannels, output.num_channels_);
+  EXPECT_THAT(output.packet_infos_, SizeIs(1));
 
   // Die isn't called through NiceMock (since it's called by the
   // MockAudioDecoder constructor), so it needs to be mocked explicitly.
@@ -1082,6 +1149,7 @@
   ASSERT_EQ(kMaxOutputSize, output.samples_per_channel_);
   EXPECT_EQ(1u, output.num_channels_);
   EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_);
+  EXPECT_THAT(output.packet_infos_, SizeIs(1));
 
   EXPECT_CALL(mock_decoder, Die());
 }
@@ -1178,6 +1246,7 @@
   EXPECT_EQ(kMaxOutputSize, output.samples_per_channel_);
   EXPECT_EQ(1u, output.num_channels_);
   EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_);
+  EXPECT_THAT(output.packet_infos_, SizeIs(2));  // 5 ms packets vs 10 ms output
 
   // Pull audio again. Decoder fails.
   EXPECT_EQ(NetEq::kFail, neteq_->GetAudio(&output, &muted));
@@ -1191,12 +1260,14 @@
   EXPECT_EQ(kMaxOutputSize, output.samples_per_channel_);
   EXPECT_EQ(1u, output.num_channels_);
   EXPECT_EQ(AudioFrame::kPLC, output.speech_type_);
+  EXPECT_THAT(output.packet_infos_, IsEmpty());
 
   // Pull audio again, should behave normal.
   EXPECT_EQ(NetEq::kOK, neteq_->GetAudio(&output, &muted));
   EXPECT_EQ(kMaxOutputSize, output.samples_per_channel_);
   EXPECT_EQ(1u, output.num_channels_);
   EXPECT_EQ(AudioFrame::kNormalSpeech, output.speech_type_);
+  EXPECT_THAT(output.packet_infos_, SizeIs(2));  // 5 ms packets vs 10 ms output
 
   EXPECT_CALL(mock_decoder, Die());
 }
@@ -1625,4 +1696,4 @@
   EXPECT_EQ(kAccelerate, neteq_->last_operation_for_test());
 }
 
-}// namespace webrtc
+}  // namespace webrtc