Reland "VP9 encoder: handle disabled layers correctly"

VP9 screenshare now enables new layers as soon as they are requested and forces all spatial layers to be present on the next frame, even if they would otherwise be dropped because of frame-rate limiting.

This might cause the frame-rate limit to be exceeded if a layer is toggled on and off very often, but that situation is bad in itself; e.g. in realtime video it would cause too many key-frames.

SvcRateAllocator and VP9EncoderImpl are now aware that there may be some skipped layers before the first enabled one. The key-frame and ss_info triggering logic is also updated.

(This is a reland without changes after updates to downstream projects)
Original-Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/153483

Bug: webrtc:10977
Change-Id: I02459c5982da2e0542a837514f5753c5f96401c6
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/154355
Reviewed-by: Ilya Nikolaevskiy <ilnik@webrtc.org>
Reviewed-by: Sergey Silkin <ssilkin@webrtc.org>
Commit-Queue: Ilya Nikolaevskiy <ilnik@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29330}
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc
index 42ab4f7..b379e79 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@@ -15,6 +15,7 @@
 
 #include <algorithm>
 #include <limits>
+#include <utility>
 #include <vector>
 
 #include "absl/memory/memory.h"
@@ -137,15 +138,19 @@
   return ColorSpace(primaries, transfer, matrix, range);
 }
 
-bool MoreLayersEnabled(const VideoBitrateAllocation& first,
-                       const VideoBitrateAllocation& second) {
+std::pair<size_t, size_t> GetActiveLayers(
+    const VideoBitrateAllocation& allocation) {
   for (size_t sl_idx = 0; sl_idx < kMaxSpatialLayers; ++sl_idx) {
-    if (first.GetSpatialLayerSum(sl_idx) > 0 &&
-        second.GetSpatialLayerSum(sl_idx) == 0) {
-      return true;
+    if (allocation.GetSpatialLayerSum(sl_idx) > 0) {
+      size_t last_layer = sl_idx + 1;
+      while (last_layer < kMaxSpatialLayers &&
+             allocation.GetSpatialLayerSum(last_layer) > 0) {
+        ++last_layer;
+      }
+      return std::make_pair(sl_idx, last_layer);
     }
   }
-  return false;
+  return {0, 0};
 }
 
 uint32_t Interpolate(uint32_t low,
@@ -224,6 +229,7 @@
       num_temporal_layers_(0),
       num_spatial_layers_(0),
       num_active_spatial_layers_(0),
+      first_active_layer_(0),
       layer_deactivation_requires_key_frame_(
           field_trial::IsEnabled("WebRTC-Vp9IssueKeyFrameOnLayerDeactivation")),
       is_svc_(false),
@@ -237,6 +243,7 @@
       full_superframe_drop_(true),
       first_frame_in_picture_(true),
       ss_info_needed_(false),
+      force_all_active_layers_(false),
       is_flexible_mode_(false),
       variable_framerate_experiment_(ParseVariableFramerateConfig(
           "WebRTC-VP9VariableFramerateScreenshare")),
@@ -289,13 +296,31 @@
 
 bool VP9EncoderImpl::SetSvcRates(
     const VideoBitrateAllocation& bitrate_allocation) {
+  std::pair<size_t, size_t> current_layers =
+      GetActiveLayers(current_bitrate_allocation_);
+  std::pair<size_t, size_t> new_layers = GetActiveLayers(bitrate_allocation);
+
+  const bool layer_activation_requires_key_frame =
+      inter_layer_pred_ == InterLayerPredMode::kOff ||
+      inter_layer_pred_ == InterLayerPredMode::kOnKeyPic;
+  const bool lower_layers_enabled = new_layers.first < current_layers.first;
+  const bool higher_layers_enabled = new_layers.second > current_layers.second;
+  const bool disabled_layers = new_layers.first > current_layers.first ||
+                               new_layers.second < current_layers.second;
+
+  if (lower_layers_enabled ||
+      (higher_layers_enabled && layer_activation_requires_key_frame) ||
+      (disabled_layers && layer_deactivation_requires_key_frame_)) {
+    force_key_frame_ = true;
+  }
+
+  if (current_layers != new_layers) {
+    ss_info_needed_ = true;
+  }
+
   config_->rc_target_bitrate = bitrate_allocation.get_sum_kbps();
 
   if (ExplicitlyConfiguredSpatialLayers()) {
-    const bool layer_activation_requires_key_frame =
-        inter_layer_pred_ == InterLayerPredMode::kOff ||
-        inter_layer_pred_ == InterLayerPredMode::kOnKeyPic;
-
     for (size_t sl_idx = 0; sl_idx < num_spatial_layers_; ++sl_idx) {
       const bool was_layer_active = (config_->ss_target_bitrate[sl_idx] > 0);
       config_->ss_target_bitrate[sl_idx] =
@@ -306,15 +331,6 @@
             bitrate_allocation.GetTemporalLayerSum(sl_idx, tl_idx) / 1000;
       }
 
-      const bool is_active_layer = (config_->ss_target_bitrate[sl_idx] > 0);
-      if (!was_layer_active && is_active_layer &&
-          layer_activation_requires_key_frame) {
-        force_key_frame_ = true;
-      } else if (was_layer_active && !is_active_layer &&
-                 layer_deactivation_requires_key_frame_) {
-        force_key_frame_ = true;
-      }
-
       if (!was_layer_active) {
         // Reset frame rate controller if layer is resumed after pause.
         framerate_controller_[sl_idx].Reset();
@@ -367,13 +383,34 @@
   }
 
   num_active_spatial_layers_ = 0;
+  first_active_layer_ = 0;
+  bool seen_active_layer = false;
+  bool expect_no_more_active_layers = false;
   for (int i = 0; i < num_spatial_layers_; ++i) {
     if (config_->ss_target_bitrate[i] > 0) {
-      ++num_active_spatial_layers_;
+      RTC_DCHECK(!expect_no_more_active_layers) << "Only middle layer is "
+                                                   "deactivated.";
+      if (!seen_active_layer) {
+        first_active_layer_ = i;
+      }
+      num_active_spatial_layers_ = i + 1;
+      seen_active_layer = true;
+    } else {
+      expect_no_more_active_layers = seen_active_layer;
     }
   }
   RTC_DCHECK_GT(num_active_spatial_layers_, 0);
 
+  if (higher_layers_enabled && !force_key_frame_) {
+    // Prohibit drop of all layers for the next frame, so newly enabled
+    // layer would have a valid spatial reference.
+    for (size_t i = 0; i < num_spatial_layers_; ++i) {
+      svc_drop_frame_.framedrop_thresh[i] = 0;
+    }
+    force_all_active_layers_ = true;
+  }
+
+  current_bitrate_allocation_ = bitrate_allocation;
   return true;
 }
 
@@ -393,7 +430,16 @@
   }
 
   codec_.maxFramerate = static_cast<uint32_t>(parameters.framerate_fps + 0.5);
-  requested_rate_settings_ = parameters;
+
+  if (dynamic_rate_settings_) {
+    // Tweak rate control settings based on available network headroom.
+    UpdateRateSettings(
+        config_, GetRateSettings(parameters.bandwidth_allocation.bps<double>() /
+                                 parameters.bitrate.get_sum_bps()));
+  }
+
+  bool res = SetSvcRates(parameters.bitrate);
+  RTC_DCHECK(res) << "Failed to set new bitrate allocation";
 }
 
 // TODO(eladalon): s/inst/codec_settings/g.
@@ -830,6 +876,10 @@
           num_steady_state_frames_ >=
               variable_framerate_experiment_.frames_before_steady_state;
 
+      // Need to check all frame limiters, even if lower layers are disabled,
+      // because variable frame-rate limiter should be checked after the first
+      // layer. It's easier to overwrite active layers afterwards than to check
+      // all cases.
       for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
         const float layer_fps =
             framerate_controller_[layer_id.spatial_layer_id].GetTargetRate();
@@ -856,6 +906,11 @@
       }
     }
 
+    if (force_all_active_layers_) {
+      layer_id.spatial_layer_id = first_active_layer_;
+      force_all_active_layers_ = false;
+    }
+
     RTC_DCHECK_LE(layer_id.spatial_layer_id, num_active_spatial_layers_);
     if (layer_id.spatial_layer_id >= num_active_spatial_layers_) {
       // Drop entire picture.
@@ -867,50 +922,12 @@
     layer_id.temporal_layer_id_per_spatial[sl_idx] = layer_id.temporal_layer_id;
   }
 
-  vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
-
-  if (requested_rate_settings_) {
-    if (dynamic_rate_settings_) {
-      // Tweak rate control settings based on available network headroom.
-      UpdateRateSettings(
-          config_,
-          GetRateSettings(
-              requested_rate_settings_->bandwidth_allocation.bps<double>() /
-              requested_rate_settings_->bitrate.get_sum_bps()));
-    }
-
-    bool more_layers_requested = MoreLayersEnabled(
-        requested_rate_settings_->bitrate, current_bitrate_allocation_);
-    bool less_layers_requested = MoreLayersEnabled(
-        current_bitrate_allocation_, requested_rate_settings_->bitrate);
-    // In SVC can enable new layers only if all lower layers are encoded and at
-    // the base temporal layer.
-    // This will delay rate allocation change until the next frame on the base
-    // spatial layer.
-    // In KSVC or simulcast modes KF will be generated for a new layer, so can
-    // update allocation any time.
-    bool can_upswitch =
-        inter_layer_pred_ != InterLayerPredMode::kOn ||
-        (layer_id.spatial_layer_id == 0 && layer_id.temporal_layer_id == 0);
-    if (!more_layers_requested || can_upswitch) {
-      current_bitrate_allocation_ = requested_rate_settings_->bitrate;
-      requested_rate_settings_ = absl::nullopt;
-      if (!SetSvcRates(current_bitrate_allocation_)) {
-        return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
-      }
-      if (less_layers_requested || more_layers_requested) {
-        ss_info_needed_ = true;
-      }
-      if (more_layers_requested && !force_key_frame_) {
-        // Prohibit drop of all layers for the next frame, so newly enabled
-        // layer would have a valid spatial reference.
-        for (size_t i = 0; i < num_spatial_layers_; ++i) {
-          svc_drop_frame_.framedrop_thresh[i] = 0;
-        }
-      }
-    }
+  if (layer_id.spatial_layer_id < first_active_layer_) {
+    layer_id.spatial_layer_id = first_active_layer_;
   }
 
+  vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
+
   if (num_spatial_layers_ > 1) {
     // Update frame dropping settings as they may change on per-frame basis.
     vpx_codec_control(encoder_, VP9E_SET_SVC_FRAME_DROP_LAYER,
@@ -1117,10 +1134,15 @@
   // of key picture (inter-layer prediction is enabled).
   const bool is_key_frame = is_key_pic && !vp9_info->inter_layer_predicted;
   if (is_key_frame || (ss_info_needed_ && layer_id.temporal_layer_id == 0 &&
-                       layer_id.spatial_layer_id == 0)) {
+                       layer_id.spatial_layer_id == first_active_layer_)) {
     vp9_info->ss_data_available = true;
     vp9_info->spatial_layer_resolution_present = true;
-    for (size_t i = 0; i < num_active_spatial_layers_; ++i) {
+    // Signal disabled layers.
+    for (size_t i = 0; i < first_active_layer_; ++i) {
+      vp9_info->width[i] = 0;
+      vp9_info->height[i] = 0;
+    }
+    for (size_t i = first_active_layer_; i < num_active_spatial_layers_; ++i) {
       vp9_info->width[i] = codec_.width * svc_params_.scaling_factor_num[i] /
                            svc_params_.scaling_factor_den[i];
       vp9_info->height[i] = codec_.height * svc_params_.scaling_factor_num[i] /