Added RTCMediaStreamTrackStats.jitterBufferDelay for audio

Description of this stat can be found here:
https://w3c.github.io/webrtc-stats/#dom-rtcmediastreamtrackstats-jitterbufferdelay

Bug: webrtc:8281
Change-Id: Ib2e8174f3449e68ad419ae2d58d5565fc9854023
Reviewed-on: https://webrtc-review.googlesource.com/3381
Commit-Queue: Gustaf Ullberg <gustaf@webrtc.org>
Reviewed-by: Henrik Boström <hbos@webrtc.org>
Reviewed-by: Taylor Brandstetter <deadbeef@webrtc.org>
Reviewed-by: Fredrik Solenberg <solenberg@webrtc.org>
Reviewed-by: Henrik Lundin <henrik.lundin@webrtc.org>
Reviewed-by: Henrik Andreassson <henrika@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#20069}
diff --git a/api/stats/rtcstats_objects.h b/api/stats/rtcstats_objects.h
index 903d266..97a7d3c 100644
--- a/api/stats/rtcstats_objects.h
+++ b/api/stats/rtcstats_objects.h
@@ -255,6 +255,10 @@
   RTCStatsMember<bool> detached;
   // See |RTCMediaStreamTrackKind| for valid values.
   RTCStatsMember<std::string> kind;
+  // TODO(gustaf): Implement jitter_buffer_delay for video (currently
+  // implemented for audio only).
+  // https://crbug.com/webrtc/8318
+  RTCStatsMember<double> jitter_buffer_delay;
   // Video-only members
   RTCStatsMember<uint32_t> frame_width;
   RTCStatsMember<uint32_t> frame_height;
diff --git a/api/statstypes.cc b/api/statstypes.cc
index 37e8aac..f1a1130 100644
--- a/api/statstypes.cc
+++ b/api/statstypes.cc
@@ -403,6 +403,8 @@
       return "framesDecoded";
     case kStatsValueNameFramesEncoded:
       return "framesEncoded";
+    case kStatsValueNameJitterBufferDelay:
+      return "jitterBufferDelay";
     case kStatsValueNameCodecImplementationName:
       return "codecImplementationName";
     case kStatsValueNameMediaType:
diff --git a/api/statstypes.h b/api/statstypes.h
index 7f69b02..9e7f08c 100644
--- a/api/statstypes.h
+++ b/api/statstypes.h
@@ -109,6 +109,7 @@
     kStatsValueNameDataChannelId,
     kStatsValueNameFramesDecoded,
     kStatsValueNameFramesEncoded,
+    kStatsValueNameJitterBufferDelay,
     kStatsValueNameMediaType,
     kStatsValueNamePacketsLost,
     kStatsValueNamePacketsReceived,
diff --git a/audio/audio_receive_stream.cc b/audio/audio_receive_stream.cc
index 704c86e..2a57551 100644
--- a/audio/audio_receive_stream.cc
+++ b/audio/audio_receive_stream.cc
@@ -197,6 +197,9 @@
   stats.total_samples_received = ns.totalSamplesReceived;
   stats.concealed_samples = ns.concealedSamples;
   stats.concealment_events = ns.concealmentEvents;
+  stats.jitter_buffer_delay_seconds =
+      static_cast<double>(ns.jitterBufferDelayMs) /
+      static_cast<double>(rtc::kNumMillisecsPerSec);
   stats.expand_rate = Q14ToFloat(ns.currentExpandRate);
   stats.speech_expand_rate = Q14ToFloat(ns.currentSpeechExpandRate);
   stats.secondary_decoded_rate = Q14ToFloat(ns.currentSecondaryDecodedRate);
diff --git a/audio/audio_receive_stream_unittest.cc b/audio/audio_receive_stream_unittest.cc
index 1ceaaab..4fdb68c 100644
--- a/audio/audio_receive_stream_unittest.cc
+++ b/audio/audio_receive_stream_unittest.cc
@@ -64,9 +64,9 @@
     345,  678,  901, 234, -12, 3456, 7890, 567, 890, 123};
 const CodecInst kCodecInst = {
     123, "codec_name_recv", 96000, -187, 0, -103};
-const NetworkStatistics kNetworkStats = {123, 456, false, 789012, 3456, 123, 0,
-                                         {},  789, 12,    345,    678,  901, 0,
-                                         -1,  -1,  -1,    -1,     -1,   0};
+const NetworkStatistics kNetworkStats = {
+    123, 456, false, 789012, 3456, 123, 456, 0,  {}, 789, 12,
+    345, 678, 901,   0,      -1,   -1,  -1,  -1, -1, 0};
 const AudioDecodingCallStats kAudioDecodeStats = MakeAudioDecodeStatsForTest();
 
 struct ConfigHelper {
@@ -316,6 +316,9 @@
   EXPECT_EQ(kTotalOutputDuration, stats.total_output_duration);
   EXPECT_EQ(kNetworkStats.concealedSamples, stats.concealed_samples);
   EXPECT_EQ(kNetworkStats.concealmentEvents, stats.concealment_events);
+  EXPECT_EQ(static_cast<double>(kNetworkStats.jitterBufferDelayMs) /
+                static_cast<double>(rtc::kNumMillisecsPerSec),
+            stats.jitter_buffer_delay_seconds);
   EXPECT_EQ(Q14ToFloat(kNetworkStats.currentExpandRate), stats.expand_rate);
   EXPECT_EQ(Q14ToFloat(kNetworkStats.currentSpeechExpandRate),
             stats.speech_expand_rate);
diff --git a/call/audio_receive_stream.h b/call/audio_receive_stream.h
index baf2b67..44f093c 100644
--- a/call/audio_receive_stream.h
+++ b/call/audio_receive_stream.h
@@ -57,6 +57,7 @@
     double total_output_duration = 0.0;
     uint64_t concealed_samples = 0;
     uint64_t concealment_events = 0;
+    double jitter_buffer_delay_seconds = 0.0;
     // Stats below DO NOT correspond directly to anything in the WebRTC stats
     float expand_rate = 0.0f;
     float speech_expand_rate = 0.0f;
diff --git a/common_types.h b/common_types.h
index 69fc761..207c81e 100644
--- a/common_types.h
+++ b/common_types.h
@@ -368,17 +368,13 @@
   uint16_t preferredBufferSize;
   // adding extra delay due to "peaky jitter"
   bool jitterPeaksFound;
-  // Total number of audio samples received, including synthesized samples.
-  // https://w3c.github.io/webrtc-stats/#dom-rtcmediastreamtrackstats-totalsamplesreceived
+  // Stats below correspond to similarly-named fields in the WebRTC stats spec.
+  // https://w3c.github.io/webrtc-stats/#dom-rtcmediastreamtrackstats
   uint64_t totalSamplesReceived;
-  // Total number of inbound audio samples that are based on synthesized data to
-  // conceal packet loss.
-  // https://w3c.github.io/webrtc-stats/#dom-rtcmediastreamtrackstats-concealedsamples
   uint64_t concealedSamples;
-  // Number of times a concealed sample is synthesized after a non-concealed
-  // sample.
-  // https://w3c.github.io/webrtc-stats/#dom-rtcmediastreamtrackstats-concealmentevents
   uint64_t concealmentEvents;
+  uint64_t jitterBufferDelayMs;
+  // Stats below DO NOT correspond directly to anything in the WebRTC stats
   // Loss rate (network + late); fraction between 0 and 1, scaled to Q14.
   uint16_t currentPacketLossRate;
   // Late loss rate; fraction between 0 and 1, scaled to Q14.
diff --git a/media/base/mediachannel.h b/media/base/mediachannel.h
index 103240e..06766a8 100644
--- a/media/base/mediachannel.h
+++ b/media/base/mediachannel.h
@@ -658,6 +658,7 @@
         total_output_duration(0.0),
         concealed_samples(0),
         concealment_events(0),
+        jitter_buffer_delay_seconds(0),
         expand_rate(0),
         speech_expand_rate(0),
         secondary_decoded_rate(0),
@@ -686,6 +687,7 @@
   double total_output_duration;
   uint64_t concealed_samples;
   uint64_t concealment_events;
+  double jitter_buffer_delay_seconds;
   // Stats below DO NOT correspond directly to anything in the WebRTC stats
   // fraction of synthesized audio inserted through expansion.
   float expand_rate;
diff --git a/media/engine/webrtcvoiceengine.cc b/media/engine/webrtcvoiceengine.cc
index 881e8ec..467e38d 100644
--- a/media/engine/webrtcvoiceengine.cc
+++ b/media/engine/webrtcvoiceengine.cc
@@ -2302,6 +2302,7 @@
     rinfo.total_output_duration = stats.total_output_duration;
     rinfo.concealed_samples = stats.concealed_samples;
     rinfo.concealment_events = stats.concealment_events;
+    rinfo.jitter_buffer_delay_seconds = stats.jitter_buffer_delay_seconds;
     rinfo.expand_rate = stats.expand_rate;
     rinfo.speech_expand_rate = stats.speech_expand_rate;
     rinfo.secondary_decoded_rate = stats.secondary_decoded_rate;
diff --git a/media/engine/webrtcvoiceengine_unittest.cc b/media/engine/webrtcvoiceengine_unittest.cc
index 4e80788..30396d9 100644
--- a/media/engine/webrtcvoiceengine_unittest.cc
+++ b/media/engine/webrtcvoiceengine_unittest.cc
@@ -623,6 +623,7 @@
     stats.total_samples_received = 5678901;
     stats.concealed_samples = 234;
     stats.concealment_events = 12;
+    stats.jitter_buffer_delay_seconds = 34;
     stats.expand_rate = 5.67f;
     stats.speech_expand_rate = 8.90f;
     stats.secondary_decoded_rate = 1.23f;
@@ -663,6 +664,8 @@
     EXPECT_EQ(info.total_samples_received, stats.total_samples_received);
     EXPECT_EQ(info.concealed_samples, stats.concealed_samples);
     EXPECT_EQ(info.concealment_events, stats.concealment_events);
+    EXPECT_EQ(info.jitter_buffer_delay_seconds,
+              stats.jitter_buffer_delay_seconds);
     EXPECT_EQ(info.expand_rate, stats.expand_rate);
     EXPECT_EQ(info.speech_expand_rate, stats.speech_expand_rate);
     EXPECT_EQ(info.secondary_decoded_rate, stats.secondary_decoded_rate);
diff --git a/modules/audio_coding/acm2/acm_receiver.cc b/modules/audio_coding/acm2/acm_receiver.cc
index d999df0..085e77a 100644
--- a/modules/audio_coding/acm2/acm_receiver.cc
+++ b/modules/audio_coding/acm2/acm_receiver.cc
@@ -337,6 +337,7 @@
   acm_stat->totalSamplesReceived = neteq_lifetime_stat.total_samples_received;
   acm_stat->concealedSamples = neteq_lifetime_stat.concealed_samples;
   acm_stat->concealmentEvents = neteq_lifetime_stat.concealment_events;
+  acm_stat->jitterBufferDelayMs = neteq_lifetime_stat.jitter_buffer_delay_ms;
 }
 
 int AcmReceiver::DecoderByPayloadType(uint8_t payload_type,
diff --git a/modules/audio_coding/neteq/include/neteq.h b/modules/audio_coding/neteq/include/neteq.h
index b349f20..e6cafa8 100644
--- a/modules/audio_coding/neteq/include/neteq.h
+++ b/modules/audio_coding/neteq/include/neteq.h
@@ -66,6 +66,7 @@
   uint64_t total_samples_received = 0;
   uint64_t concealed_samples = 0;
   uint64_t concealment_events = 0;
+  uint64_t jitter_buffer_delay_ms = 0;
 };
 
 enum NetEqPlayoutMode {
diff --git a/modules/audio_coding/neteq/neteq_impl.cc b/modules/audio_coding/neteq/neteq_impl.cc
index 2d50225..36d6b27 100644
--- a/modules/audio_coding/neteq/neteq_impl.cc
+++ b/modules/audio_coding/neteq/neteq_impl.cc
@@ -1950,7 +1950,8 @@
       assert(false);  // Should always be able to extract a packet here.
       return -1;
     }
-    stats_.StoreWaitingTime(packet->waiting_time->ElapsedMs());
+    const uint64_t waiting_time_ms = packet->waiting_time->ElapsedMs();
+    stats_.StoreWaitingTime(waiting_time_ms);
     RTC_DCHECK(!packet->empty());
 
     if (first_packet) {
@@ -1990,6 +1991,8 @@
     }
     extracted_samples = packet->timestamp - first_timestamp + packet_duration;
 
+    stats_.JitterBufferDelay(extracted_samples, waiting_time_ms);
+
     packet_list->push_back(std::move(*packet));  // Store packet in list.
     packet = rtc::Optional<Packet>();  // Ensure it's never used after the move.
 
diff --git a/modules/audio_coding/neteq/neteq_unittest.cc b/modules/audio_coding/neteq/neteq_unittest.cc
index 5b92217..9dd60eb 100644
--- a/modules/audio_coding/neteq/neteq_unittest.cc
+++ b/modules/audio_coding/neteq/neteq_unittest.cc
@@ -522,6 +522,7 @@
   NetEqDecodingTestFaxMode() : NetEqDecodingTest() {
     config_.playout_mode = kPlayoutFax;
   }
+  void TestJitterBufferDelay(bool apply_packet_loss);
 };
 
 TEST_F(NetEqDecodingTestFaxMode, TestFrameWaitingTimeStatistics) {
@@ -1684,4 +1685,64 @@
   EXPECT_EQ(kNumConcealmentEvents, static_cast<int>(stats.concealment_events));
 }
 
+// Test that the jitter buffer delay stat is computed correctly.
+void NetEqDecodingTestFaxMode::TestJitterBufferDelay(bool apply_packet_loss) {
+  const int kNumPackets = 10;
+  const int kDelayInNumPackets = 2;
+  const int kPacketLenMs = 10;  // All packets are of 10 ms size.
+  const size_t kSamples = kPacketLenMs * 16;
+  const size_t kPayloadBytes = kSamples * 2;
+  RTPHeader rtp_info;
+  rtp_info.ssrc = 0x1234;     // Just an arbitrary SSRC.
+  rtp_info.payloadType = 94;  // PCM16b WB codec.
+  rtp_info.markerBit = 0;
+  const uint8_t payload[kPayloadBytes] = {0};
+  bool muted;
+  int packets_sent = 0;
+  int packets_received = 0;
+  int expected_delay = 0;
+  while (packets_received < kNumPackets) {
+    // Insert packet.
+    if (packets_sent < kNumPackets) {
+      rtp_info.sequenceNumber = packets_sent++;
+      rtp_info.timestamp = rtp_info.sequenceNumber * kSamples;
+      neteq_->InsertPacket(rtp_info, payload, 0);
+    }
+
+    // Get packet.
+    if (packets_sent > kDelayInNumPackets) {
+      neteq_->GetAudio(&out_frame_, &muted);
+      packets_received++;
+
+      // The delay reported by the jitter buffer never exceeds
+      // the number of samples previously fetched with GetAudio
+      // (hence the min()).
+      int packets_delay = std::min(packets_received, kDelayInNumPackets + 1);
+
+      // The increase of the expected delay is the product of
+      // the current delay of the jitter buffer in ms * the
+      // number of samples that are sent for play out.
+      int current_delay_ms = packets_delay * kPacketLenMs;
+      expected_delay += current_delay_ms * kSamples;
+    }
+  }
+
+  if (apply_packet_loss) {
+    // Extra call to GetAudio to cause concealment.
+    neteq_->GetAudio(&out_frame_, &muted);
+  }
+
+  // Check jitter buffer delay.
+  NetEqLifetimeStatistics stats = neteq_->GetLifetimeStatistics();
+  EXPECT_EQ(expected_delay, static_cast<int>(stats.jitter_buffer_delay_ms));
+}
+
+TEST_F(NetEqDecodingTestFaxMode, TestJitterBufferDelayWithoutLoss) {
+  TestJitterBufferDelay(false);
+}
+
+TEST_F(NetEqDecodingTestFaxMode, TestJitterBufferDelayWithLoss) {
+  TestJitterBufferDelay(true);
+}
+
 }  // namespace webrtc
diff --git a/modules/audio_coding/neteq/statistics_calculator.cc b/modules/audio_coding/neteq/statistics_calculator.cc
index 4e034e6..70a15ae 100644
--- a/modules/audio_coding/neteq/statistics_calculator.cc
+++ b/modules/audio_coding/neteq/statistics_calculator.cc
@@ -229,6 +229,11 @@
   lifetime_stats_.total_samples_received += num_samples;
 }
 
+void StatisticsCalculator::JitterBufferDelay(size_t num_samples,
+                                             uint64_t waiting_time_ms) {
+  lifetime_stats_.jitter_buffer_delay_ms += waiting_time_ms * num_samples;
+}
+
 void StatisticsCalculator::SecondaryDecodedSamples(int num_samples) {
   secondary_decoded_samples_ += num_samples;
 }
diff --git a/modules/audio_coding/neteq/statistics_calculator.h b/modules/audio_coding/neteq/statistics_calculator.h
index 5c2fbf3..c3d5c86 100644
--- a/modules/audio_coding/neteq/statistics_calculator.h
+++ b/modules/audio_coding/neteq/statistics_calculator.h
@@ -75,6 +75,9 @@
   // time is increasing.
   void IncreaseCounter(size_t num_samples, int fs_hz);
 
+  // Update jitter buffer delay counter.
+  void JitterBufferDelay(size_t num_samples, uint64_t waiting_time_ms);
+
   // Stores new packet waiting time in waiting time statistics.
   void StoreWaitingTime(int waiting_time_ms);
 
diff --git a/pc/rtcstats_integrationtest.cc b/pc/rtcstats_integrationtest.cc
index 62d316d..e0fb577 100644
--- a/pc/rtcstats_integrationtest.cc
+++ b/pc/rtcstats_integrationtest.cc
@@ -562,8 +562,11 @@
     }
     // totalSamplesReceived, concealedSamples and concealmentEvents are only
     // present on inbound audio tracks.
+    // jitterBufferDelay is currently only implemented for audio.
     if (*media_stream_track.kind == RTCMediaStreamTrackKind::kAudio &&
         *media_stream_track.remote_source) {
+      verifier.TestMemberIsNonNegative<double>(
+          media_stream_track.jitter_buffer_delay);
       verifier.TestMemberIsNonNegative<uint64_t>(
           media_stream_track.total_samples_received);
       verifier.TestMemberIsNonNegative<uint64_t>(
@@ -571,6 +574,7 @@
       verifier.TestMemberIsNonNegative<uint64_t>(
           media_stream_track.concealment_events);
     } else {
+      verifier.TestMemberIsUndefined(media_stream_track.jitter_buffer_delay);
       verifier.TestMemberIsUndefined(media_stream_track.total_samples_received);
       verifier.TestMemberIsUndefined(media_stream_track.concealed_samples);
       verifier.TestMemberIsUndefined(media_stream_track.concealment_events);
diff --git a/pc/rtcstatscollector.cc b/pc/rtcstatscollector.cc
index 161d224..2fcb754 100644
--- a/pc/rtcstatscollector.cc
+++ b/pc/rtcstatscollector.cc
@@ -410,6 +410,8 @@
     audio_track_stats->audio_level = DoubleAudioLevelFromIntAudioLevel(
         voice_receiver_info.audio_level);
   }
+  audio_track_stats->jitter_buffer_delay =
+      voice_receiver_info.jitter_buffer_delay_seconds;
   audio_track_stats->total_audio_energy =
       voice_receiver_info.total_output_energy;
   audio_track_stats->total_samples_received =
diff --git a/pc/rtcstatscollector_unittest.cc b/pc/rtcstatscollector_unittest.cc
index 14f669c..0e573b1 100644
--- a/pc/rtcstatscollector_unittest.cc
+++ b/pc/rtcstatscollector_unittest.cc
@@ -1556,6 +1556,7 @@
   voice_receiver_info.total_output_duration = 0.25;
   voice_receiver_info.concealed_samples = 123;
   voice_receiver_info.concealment_events = 12;
+  voice_receiver_info.jitter_buffer_delay_seconds = 3456;
 
   test_->CreateMockRtpSendersReceiversAndChannels(
       { std::make_pair(local_audio_track.get(), voice_sender_info_ssrc1),
@@ -1633,6 +1634,7 @@
   expected_remote_audio_track.total_samples_duration = 0.25;
   expected_remote_audio_track.concealed_samples = 123;
   expected_remote_audio_track.concealment_events = 12;
+  expected_remote_audio_track.jitter_buffer_delay = 3456;
   ASSERT_TRUE(report->Get(expected_remote_audio_track.id()));
   EXPECT_EQ(expected_remote_audio_track,
             report->Get(expected_remote_audio_track.id())->cast_to<
diff --git a/stats/rtcstats_objects.cc b/stats/rtcstats_objects.cc
index e643e12..b1698cf 100644
--- a/stats/rtcstats_objects.cc
+++ b/stats/rtcstats_objects.cc
@@ -367,6 +367,7 @@
                      &ended,
                      &detached,
                      &kind,
+                     &jitter_buffer_delay,
                      &frame_width,
                      &frame_height,
                      &frames_per_second,
@@ -401,6 +402,7 @@
       ended("ended"),
       detached("detached"),
       kind("kind", kind),
+      jitter_buffer_delay("jitterBufferDelay"),
       frame_width("frameWidth"),
       frame_height("frameHeight"),
       frames_per_second("framesPerSecond"),
@@ -431,6 +433,7 @@
       ended(other.ended),
       detached(other.detached),
       kind(other.kind),
+      jitter_buffer_delay(other.jitter_buffer_delay),
       frame_width(other.frame_width),
       frame_height(other.frame_height),
       frames_per_second(other.frames_per_second),