philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
| 11 | #include "modules/video_coding/rtp_vp9_ref_finder.h" |
| 12 | |
| 13 | #include <algorithm> |
| 14 | #include <utility> |
| 15 | |
| 16 | #include "rtc_base/logging.h" |
| 17 | |
| 18 | namespace webrtc { |
| 19 | namespace video_coding { |
| 20 | |
| 21 | RtpFrameReferenceFinder::ReturnVector RtpVp9RefFinder::ManageFrame( |
| 22 | std::unique_ptr<RtpFrameObject> frame) { |
| 23 | FrameDecision decision = ManageFrameInternal(frame.get()); |
| 24 | |
| 25 | RtpFrameReferenceFinder::ReturnVector res; |
| 26 | switch (decision) { |
| 27 | case kStash: |
| 28 | if (stashed_frames_.size() > kMaxStashedFrames) |
| 29 | stashed_frames_.pop_back(); |
| 30 | stashed_frames_.push_front(std::move(frame)); |
| 31 | return res; |
| 32 | case kHandOff: |
| 33 | res.push_back(std::move(frame)); |
| 34 | RetryStashedFrames(res); |
| 35 | return res; |
| 36 | case kDrop: |
| 37 | return res; |
| 38 | } |
| 39 | |
| 40 | return res; |
| 41 | } |
| 42 | |
| 43 | RtpVp9RefFinder::FrameDecision RtpVp9RefFinder::ManageFrameInternal( |
| 44 | RtpFrameObject* frame) { |
| 45 | const RTPVideoHeader& video_header = frame->GetRtpVideoHeader(); |
| 46 | const RTPVideoHeaderVP9& codec_header = |
| 47 | absl::get<RTPVideoHeaderVP9>(video_header.video_type_header); |
| 48 | |
| 49 | // Protect against corrupted packets with arbitrary large temporal idx. |
| 50 | if (codec_header.temporal_idx >= kMaxTemporalLayers || |
| 51 | codec_header.spatial_idx >= kMaxSpatialLayers) |
| 52 | return kDrop; |
| 53 | |
philipel | 0cb7326 | 2020-12-08 17:36:53 +0100 | [diff] [blame] | 54 | frame->SetSpatialIndex(codec_header.spatial_idx); |
philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 55 | frame->id.picture_id = codec_header.picture_id & (kFrameIdLength - 1); |
| 56 | |
| 57 | if (last_picture_id_ == -1) |
| 58 | last_picture_id_ = frame->id.picture_id; |
| 59 | |
| 60 | if (codec_header.flexible_mode) { |
| 61 | if (codec_header.num_ref_pics > EncodedFrame::kMaxFrameReferences) { |
| 62 | return kDrop; |
| 63 | } |
| 64 | frame->num_references = codec_header.num_ref_pics; |
| 65 | for (size_t i = 0; i < frame->num_references; ++i) { |
| 66 | frame->references[i] = Subtract<kFrameIdLength>(frame->id.picture_id, |
| 67 | codec_header.pid_diff[i]); |
| 68 | } |
| 69 | |
philipel | 0cb7326 | 2020-12-08 17:36:53 +0100 | [diff] [blame] | 70 | FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted); |
philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 71 | return kHandOff; |
| 72 | } |
| 73 | |
| 74 | if (codec_header.tl0_pic_idx == kNoTl0PicIdx) { |
| 75 | RTC_LOG(LS_WARNING) << "TL0PICIDX is expected to be present in " |
| 76 | "non-flexible mode."; |
| 77 | return kDrop; |
| 78 | } |
| 79 | |
| 80 | GofInfo* info; |
| 81 | int64_t unwrapped_tl0 = |
| 82 | tl0_unwrapper_.Unwrap(codec_header.tl0_pic_idx & 0xFF); |
| 83 | if (codec_header.ss_data_available) { |
| 84 | if (codec_header.temporal_idx != 0) { |
| 85 | RTC_LOG(LS_WARNING) << "Received scalability structure on a non base " |
| 86 | "layer frame. Scalability structure ignored."; |
| 87 | } else { |
| 88 | if (codec_header.gof.num_frames_in_gof > kMaxVp9FramesInGof) { |
| 89 | return kDrop; |
| 90 | } |
| 91 | |
| 92 | for (size_t i = 0; i < codec_header.gof.num_frames_in_gof; ++i) { |
| 93 | if (codec_header.gof.num_ref_pics[i] > kMaxVp9RefPics) { |
| 94 | return kDrop; |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | GofInfoVP9 gof = codec_header.gof; |
| 99 | if (gof.num_frames_in_gof == 0) { |
| 100 | RTC_LOG(LS_WARNING) << "Number of frames in GOF is zero. Assume " |
| 101 | "that stream has only one temporal layer."; |
| 102 | gof.SetGofInfoVP9(kTemporalStructureMode1); |
| 103 | } |
| 104 | |
| 105 | current_ss_idx_ = Add<kMaxGofSaved>(current_ss_idx_, 1); |
| 106 | scalability_structures_[current_ss_idx_] = gof; |
| 107 | scalability_structures_[current_ss_idx_].pid_start = frame->id.picture_id; |
| 108 | gof_info_.emplace(unwrapped_tl0, |
| 109 | GofInfo(&scalability_structures_[current_ss_idx_], |
| 110 | frame->id.picture_id)); |
| 111 | } |
| 112 | |
| 113 | const auto gof_info_it = gof_info_.find(unwrapped_tl0); |
| 114 | if (gof_info_it == gof_info_.end()) |
| 115 | return kStash; |
| 116 | |
| 117 | info = &gof_info_it->second; |
| 118 | |
| 119 | if (frame->frame_type() == VideoFrameType::kVideoFrameKey) { |
| 120 | frame->num_references = 0; |
| 121 | FrameReceivedVp9(frame->id.picture_id, info); |
philipel | 0cb7326 | 2020-12-08 17:36:53 +0100 | [diff] [blame] | 122 | FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted); |
philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 123 | return kHandOff; |
| 124 | } |
| 125 | } else if (frame->frame_type() == VideoFrameType::kVideoFrameKey) { |
philipel | 0cb7326 | 2020-12-08 17:36:53 +0100 | [diff] [blame] | 126 | if (frame->SpatialIndex() == 0) { |
philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 127 | RTC_LOG(LS_WARNING) << "Received keyframe without scalability structure"; |
| 128 | return kDrop; |
| 129 | } |
| 130 | const auto gof_info_it = gof_info_.find(unwrapped_tl0); |
| 131 | if (gof_info_it == gof_info_.end()) |
| 132 | return kStash; |
| 133 | |
| 134 | info = &gof_info_it->second; |
| 135 | |
philipel | 0cb7326 | 2020-12-08 17:36:53 +0100 | [diff] [blame] | 136 | frame->num_references = 0; |
| 137 | FrameReceivedVp9(frame->id.picture_id, info); |
| 138 | FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted); |
| 139 | return kHandOff; |
philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 140 | } else { |
| 141 | auto gof_info_it = gof_info_.find( |
| 142 | (codec_header.temporal_idx == 0) ? unwrapped_tl0 - 1 : unwrapped_tl0); |
| 143 | |
| 144 | // Gof info for this frame is not available yet, stash this frame. |
| 145 | if (gof_info_it == gof_info_.end()) |
| 146 | return kStash; |
| 147 | |
| 148 | if (codec_header.temporal_idx == 0) { |
| 149 | gof_info_it = gof_info_ |
| 150 | .emplace(unwrapped_tl0, GofInfo(gof_info_it->second.gof, |
| 151 | frame->id.picture_id)) |
| 152 | .first; |
| 153 | } |
| 154 | |
| 155 | info = &gof_info_it->second; |
| 156 | } |
| 157 | |
| 158 | // Clean up info for base layers that are too old. |
| 159 | int64_t old_tl0_pic_idx = unwrapped_tl0 - kMaxGofSaved; |
| 160 | auto clean_gof_info_to = gof_info_.lower_bound(old_tl0_pic_idx); |
| 161 | gof_info_.erase(gof_info_.begin(), clean_gof_info_to); |
| 162 | |
| 163 | FrameReceivedVp9(frame->id.picture_id, info); |
| 164 | |
| 165 | // Make sure we don't miss any frame that could potentially have the |
| 166 | // up switch flag set. |
| 167 | if (MissingRequiredFrameVp9(frame->id.picture_id, *info)) |
| 168 | return kStash; |
| 169 | |
| 170 | if (codec_header.temporal_up_switch) |
| 171 | up_switch_.emplace(frame->id.picture_id, codec_header.temporal_idx); |
| 172 | |
| 173 | // Clean out old info about up switch frames. |
| 174 | uint16_t old_picture_id = Subtract<kFrameIdLength>(frame->id.picture_id, 50); |
| 175 | auto up_switch_erase_to = up_switch_.lower_bound(old_picture_id); |
| 176 | up_switch_.erase(up_switch_.begin(), up_switch_erase_to); |
| 177 | |
| 178 | size_t diff = ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start, |
| 179 | frame->id.picture_id); |
| 180 | size_t gof_idx = diff % info->gof->num_frames_in_gof; |
| 181 | |
| 182 | if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) { |
| 183 | return kDrop; |
| 184 | } |
| 185 | // Populate references according to the scalability structure. |
| 186 | frame->num_references = info->gof->num_ref_pics[gof_idx]; |
| 187 | for (size_t i = 0; i < frame->num_references; ++i) { |
| 188 | frame->references[i] = Subtract<kFrameIdLength>( |
| 189 | frame->id.picture_id, info->gof->pid_diff[gof_idx][i]); |
| 190 | |
| 191 | // If this is a reference to a frame earlier than the last up switch point, |
| 192 | // then ignore this reference. |
| 193 | if (UpSwitchInIntervalVp9(frame->id.picture_id, codec_header.temporal_idx, |
| 194 | frame->references[i])) { |
| 195 | --frame->num_references; |
| 196 | } |
| 197 | } |
| 198 | |
| 199 | // Override GOF references. |
| 200 | if (!codec_header.inter_pic_predicted) { |
| 201 | frame->num_references = 0; |
| 202 | } |
| 203 | |
philipel | 0cb7326 | 2020-12-08 17:36:53 +0100 | [diff] [blame] | 204 | FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted); |
philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 205 | return kHandOff; |
| 206 | } |
| 207 | |
| 208 | bool RtpVp9RefFinder::MissingRequiredFrameVp9(uint16_t picture_id, |
| 209 | const GofInfo& info) { |
| 210 | size_t diff = |
| 211 | ForwardDiff<uint16_t, kFrameIdLength>(info.gof->pid_start, picture_id); |
| 212 | size_t gof_idx = diff % info.gof->num_frames_in_gof; |
| 213 | size_t temporal_idx = info.gof->temporal_idx[gof_idx]; |
| 214 | |
| 215 | if (temporal_idx >= kMaxTemporalLayers) { |
| 216 | RTC_LOG(LS_WARNING) << "At most " << kMaxTemporalLayers |
| 217 | << " temporal " |
| 218 | "layers are supported."; |
| 219 | return true; |
| 220 | } |
| 221 | |
| 222 | // For every reference this frame has, check if there is a frame missing in |
| 223 | // the interval (|ref_pid|, |picture_id|) in any of the lower temporal |
| 224 | // layers. If so, we are missing a required frame. |
| 225 | uint8_t num_references = info.gof->num_ref_pics[gof_idx]; |
| 226 | for (size_t i = 0; i < num_references; ++i) { |
| 227 | uint16_t ref_pid = |
| 228 | Subtract<kFrameIdLength>(picture_id, info.gof->pid_diff[gof_idx][i]); |
| 229 | for (size_t l = 0; l < temporal_idx; ++l) { |
| 230 | auto missing_frame_it = missing_frames_for_layer_[l].lower_bound(ref_pid); |
| 231 | if (missing_frame_it != missing_frames_for_layer_[l].end() && |
| 232 | AheadOf<uint16_t, kFrameIdLength>(picture_id, *missing_frame_it)) { |
| 233 | return true; |
| 234 | } |
| 235 | } |
| 236 | } |
| 237 | return false; |
| 238 | } |
| 239 | |
| 240 | void RtpVp9RefFinder::FrameReceivedVp9(uint16_t picture_id, GofInfo* info) { |
| 241 | int last_picture_id = info->last_picture_id; |
| 242 | size_t gof_size = std::min(info->gof->num_frames_in_gof, kMaxVp9FramesInGof); |
| 243 | |
| 244 | // If there is a gap, find which temporal layer the missing frames |
| 245 | // belong to and add the frame as missing for that temporal layer. |
| 246 | // Otherwise, remove this frame from the set of missing frames. |
| 247 | if (AheadOf<uint16_t, kFrameIdLength>(picture_id, last_picture_id)) { |
| 248 | size_t diff = ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start, |
| 249 | last_picture_id); |
| 250 | size_t gof_idx = diff % gof_size; |
| 251 | |
| 252 | last_picture_id = Add<kFrameIdLength>(last_picture_id, 1); |
| 253 | while (last_picture_id != picture_id) { |
| 254 | gof_idx = (gof_idx + 1) % gof_size; |
| 255 | RTC_CHECK(gof_idx < kMaxVp9FramesInGof); |
| 256 | |
| 257 | size_t temporal_idx = info->gof->temporal_idx[gof_idx]; |
| 258 | if (temporal_idx >= kMaxTemporalLayers) { |
| 259 | RTC_LOG(LS_WARNING) << "At most " << kMaxTemporalLayers |
| 260 | << " temporal " |
| 261 | "layers are supported."; |
| 262 | return; |
| 263 | } |
| 264 | |
| 265 | missing_frames_for_layer_[temporal_idx].insert(last_picture_id); |
| 266 | last_picture_id = Add<kFrameIdLength>(last_picture_id, 1); |
| 267 | } |
| 268 | |
| 269 | info->last_picture_id = last_picture_id; |
| 270 | } else { |
| 271 | size_t diff = |
| 272 | ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start, picture_id); |
| 273 | size_t gof_idx = diff % gof_size; |
| 274 | RTC_CHECK(gof_idx < kMaxVp9FramesInGof); |
| 275 | |
| 276 | size_t temporal_idx = info->gof->temporal_idx[gof_idx]; |
| 277 | if (temporal_idx >= kMaxTemporalLayers) { |
| 278 | RTC_LOG(LS_WARNING) << "At most " << kMaxTemporalLayers |
| 279 | << " temporal " |
| 280 | "layers are supported."; |
| 281 | return; |
| 282 | } |
| 283 | |
| 284 | missing_frames_for_layer_[temporal_idx].erase(picture_id); |
| 285 | } |
| 286 | } |
| 287 | |
| 288 | bool RtpVp9RefFinder::UpSwitchInIntervalVp9(uint16_t picture_id, |
| 289 | uint8_t temporal_idx, |
| 290 | uint16_t pid_ref) { |
| 291 | for (auto up_switch_it = up_switch_.upper_bound(pid_ref); |
| 292 | up_switch_it != up_switch_.end() && |
| 293 | AheadOf<uint16_t, kFrameIdLength>(picture_id, up_switch_it->first); |
| 294 | ++up_switch_it) { |
| 295 | if (up_switch_it->second < temporal_idx) |
| 296 | return true; |
| 297 | } |
| 298 | |
| 299 | return false; |
| 300 | } |
| 301 | |
| 302 | void RtpVp9RefFinder::RetryStashedFrames( |
| 303 | RtpFrameReferenceFinder::ReturnVector& res) { |
| 304 | bool complete_frame = false; |
| 305 | do { |
| 306 | complete_frame = false; |
| 307 | for (auto frame_it = stashed_frames_.begin(); |
| 308 | frame_it != stashed_frames_.end();) { |
| 309 | FrameDecision decision = ManageFrameInternal(frame_it->get()); |
| 310 | |
| 311 | switch (decision) { |
| 312 | case kStash: |
| 313 | ++frame_it; |
| 314 | break; |
| 315 | case kHandOff: |
| 316 | complete_frame = true; |
| 317 | res.push_back(std::move(*frame_it)); |
| 318 | ABSL_FALLTHROUGH_INTENDED; |
| 319 | case kDrop: |
| 320 | frame_it = stashed_frames_.erase(frame_it); |
| 321 | } |
| 322 | } |
| 323 | } while (complete_frame); |
| 324 | } |
| 325 | |
philipel | 0cb7326 | 2020-12-08 17:36:53 +0100 | [diff] [blame] | 326 | void RtpVp9RefFinder::FlattenFrameIdAndRefs(RtpFrameObject* frame, |
| 327 | bool inter_layer_predicted) { |
| 328 | for (size_t i = 0; i < frame->num_references; ++i) { |
| 329 | frame->references[i] = |
| 330 | unwrapper_.Unwrap(frame->references[i]) * kMaxSpatialLayers + |
| 331 | *frame->SpatialIndex(); |
| 332 | } |
| 333 | frame->id.picture_id = |
| 334 | unwrapper_.Unwrap(frame->id.picture_id) * kMaxSpatialLayers + |
| 335 | *frame->SpatialIndex(); |
| 336 | |
| 337 | if (inter_layer_predicted && |
| 338 | frame->num_references + 1 <= EncodedFrame::kMaxFrameReferences) { |
| 339 | frame->references[frame->num_references] = frame->id.picture_id - 1; |
| 340 | ++frame->num_references; |
| 341 | } |
philipel | 4e70216 | 2020-11-27 17:56:37 +0100 | [diff] [blame] | 342 | } |
| 343 | |
| 344 | void RtpVp9RefFinder::ClearTo(uint16_t seq_num) { |
| 345 | auto it = stashed_frames_.begin(); |
| 346 | while (it != stashed_frames_.end()) { |
| 347 | if (AheadOf<uint16_t>(seq_num, (*it)->first_seq_num())) { |
| 348 | it = stashed_frames_.erase(it); |
| 349 | } else { |
| 350 | ++it; |
| 351 | } |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | } // namespace video_coding |
| 356 | } // namespace webrtc |