External VNR speed improvement.

Improved visual quality with a 3x speed-up.
Change list:
 1. Remove second chance filter in temporal denoising filter to mitigate trailing artifact.
 2. Add swap buffer to save one whole-frame memcpy.
 3. Do noise estimation once every N blocks.
 4. Adopt a faster moving object detection algorithm (change the structure).
 5. Refactor the for loops and PositionCheck().
 6. Refactor the function ReduceFalseDetection (RFD).
 7. Fix a bug in TrailingBlock() which causes a mismatch.
 8. Change unit test to support swap buffer test.
 9. Remove CopyMem8x8; use memcpy to copy the U/V planes, which can be optimized in the future.
 10. Remove DenoiseMetrics.

Review URL: https://codereview.webrtc.org/1871853003

Cr-Commit-Position: refs/heads/master@{#12340}
diff --git a/webrtc/modules/video_processing/frame_preprocessor.cc b/webrtc/modules/video_processing/frame_preprocessor.cc
index fd0d0ef..c204214 100644
--- a/webrtc/modules/video_processing/frame_preprocessor.cc
+++ b/webrtc/modules/video_processing/frame_preprocessor.cc
@@ -23,6 +23,7 @@
   ca_ = new VPMContentAnalysis(true);
   vd_ = new VPMVideoDecimator();
   EnableDenosing(false);
+  denoised_frame_toggle_ = 0;
 }
 
 VPMFramePreprocessor::~VPMFramePreprocessor() {
@@ -116,9 +117,18 @@
 
   const VideoFrame* current_frame = &frame;
   if (denoiser_) {
-    denoiser_->DenoiseFrame(*current_frame, &denoised_frame_,
-                            &denoised_frame_prev_, 0);
-    current_frame = &denoised_frame_;
+    VideoFrame* denoised_frame = &denoised_frame_[0];
+    VideoFrame* denoised_frame_prev = &denoised_frame_[1];
+    // Swap the buffer to save one memcpy in DenoiseFrame.
+    if (denoised_frame_toggle_) {
+      denoised_frame = &denoised_frame_[1];
+      denoised_frame_prev = &denoised_frame_[0];
+    }
+    // Invert the flag.
+    denoised_frame_toggle_ ^= 1;
+    denoiser_->DenoiseFrame(*current_frame, denoised_frame, denoised_frame_prev,
+                            true);
+    current_frame = denoised_frame;
   }
 
   if (spatial_resampler_->ApplyResample(current_frame->width(),
diff --git a/webrtc/modules/video_processing/frame_preprocessor.h b/webrtc/modules/video_processing/frame_preprocessor.h
index c35dd0d..270fbc2 100644
--- a/webrtc/modules/video_processing/frame_preprocessor.h
+++ b/webrtc/modules/video_processing/frame_preprocessor.h
@@ -70,14 +70,14 @@
   enum { kSkipFrameCA = 2 };
 
   VideoContentMetrics* content_metrics_;
-  VideoFrame denoised_frame_;
-  VideoFrame denoised_frame_prev_;
+  VideoFrame denoised_frame_[2];
   VideoFrame resampled_frame_;
   VPMSpatialResampler* spatial_resampler_;
   VPMContentAnalysis* ca_;
   VPMVideoDecimator* vd_;
   std::unique_ptr<VideoDenoiser> denoiser_;
   bool enable_ca_;
+  uint8_t denoised_frame_toggle_;
   uint32_t frame_cnt_;
 };
 
diff --git a/webrtc/modules/video_processing/test/denoiser_test.cc b/webrtc/modules/video_processing/test/denoiser_test.cc
index a45f933..4c13a05 100644
--- a/webrtc/modules/video_processing/test/denoiser_test.cc
+++ b/webrtc/modules/video_processing/test/denoiser_test.cc
@@ -31,18 +31,10 @@
     }
   }
 
-  memset(dst, 0, 8 * 8);
-  df_c->CopyMem8x8(src, 8, dst, 8);
-  EXPECT_EQ(0, memcmp(src, dst, 8 * 8));
-
   memset(dst, 0, 16 * 16);
   df_c->CopyMem16x16(src, 16, dst, 16);
   EXPECT_EQ(0, memcmp(src, dst, 16 * 16));
 
-  memset(dst, 0, 8 * 8);
-  df_sse_neon->CopyMem16x16(src, 8, dst, 8);
-  EXPECT_EQ(0, memcmp(src, dst, 8 * 8));
-
   memset(dst, 0, 16 * 16);
   df_sse_neon->CopyMem16x16(src, 16, dst, 16);
   EXPECT_EQ(0, memcmp(src, dst, 16 * 16));
@@ -87,10 +79,9 @@
     }
   }
   memset(dst, 0, 16 * 16);
-  df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1, false);
+  df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1);
   memset(dst_sse_neon, 0, 16 * 16);
-  df_sse_neon->MbDenoise(running_src, 16, dst_sse_neon, 16, src, 16, 0, 1,
-                         false);
+  df_sse_neon->MbDenoise(running_src, 16, dst_sse_neon, 16, src, 16, 0, 1);
   EXPECT_EQ(0, memcmp(dst, dst_sse_neon, 16 * 16));
 
   // Test case: |diff| >= |4 + shift_inc1|
@@ -101,10 +92,9 @@
     }
   }
   memset(dst, 0, 16 * 16);
-  df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1, false);
+  df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1);
   memset(dst_sse_neon, 0, 16 * 16);
-  df_sse_neon->MbDenoise(running_src, 16, dst_sse_neon, 16, src, 16, 0, 1,
-                         false);
+  df_sse_neon->MbDenoise(running_src, 16, dst_sse_neon, 16, src, 16, 0, 1);
   EXPECT_EQ(0, memcmp(dst, dst_sse_neon, 16 * 16));
 
   // Test case: |diff| >= 8
@@ -115,10 +105,9 @@
     }
   }
   memset(dst, 0, 16 * 16);
-  df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1, false);
+  df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1);
   memset(dst_sse_neon, 0, 16 * 16);
-  df_sse_neon->MbDenoise(running_src, 16, dst_sse_neon, 16, src, 16, 0, 1,
-                         false);
+  df_sse_neon->MbDenoise(running_src, 16, dst_sse_neon, 16, src, 16, 0, 1);
   EXPECT_EQ(0, memcmp(dst, dst_sse_neon, 16 * 16));
 
   // Test case: |diff| > 15
@@ -130,22 +119,23 @@
   }
   memset(dst, 0, 16 * 16);
   DenoiserDecision decision =
-      df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1, false);
+      df_c->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1);
   EXPECT_EQ(COPY_BLOCK, decision);
-  decision =
-      df_sse_neon->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1, false);
+  decision = df_sse_neon->MbDenoise(running_src, 16, dst, 16, src, 16, 0, 1);
   EXPECT_EQ(COPY_BLOCK, decision);
 }
 
 TEST_F(VideoProcessingTest, Denoiser) {
+  // Used in swap buffer.
+  int denoised_frame_toggle = 0;
   // Create pure C denoiser.
   VideoDenoiser denoiser_c(false);
   // Create SSE or NEON denoiser.
   VideoDenoiser denoiser_sse_neon(true);
   VideoFrame denoised_frame_c;
-  VideoFrame denoised_frame_track_c;
+  VideoFrame denoised_frame_prev_c;
   VideoFrame denoised_frame_sse_neon;
-  VideoFrame denoised_frame_track_sse_neon;
+  VideoFrame denoised_frame_prev_sse_neon;
 
   std::unique_ptr<uint8_t[]> video_buffer(new uint8_t[frame_length_]);
   while (fread(video_buffer.get(), 1, frame_length_, source_file_) ==
@@ -154,13 +144,25 @@
     EXPECT_EQ(0, ConvertToI420(kI420, video_buffer.get(), 0, 0, width_, height_,
                                0, kVideoRotation_0, &video_frame_));
 
-    denoiser_c.DenoiseFrame(video_frame_, &denoised_frame_c,
-                            &denoised_frame_track_c, -1);
-    denoiser_sse_neon.DenoiseFrame(video_frame_, &denoised_frame_sse_neon,
-                                   &denoised_frame_track_sse_neon, -1);
-
+    VideoFrame* p_denoised_c = &denoised_frame_c;
+    VideoFrame* p_denoised_prev_c = &denoised_frame_prev_c;
+    VideoFrame* p_denoised_sse_neon = &denoised_frame_sse_neon;
+    VideoFrame* p_denoised_prev_sse_neon = &denoised_frame_prev_sse_neon;
+    // Swap the buffer to save one memcpy in DenoiseFrame.
+    if (denoised_frame_toggle) {
+      p_denoised_c = &denoised_frame_prev_c;
+      p_denoised_prev_c = &denoised_frame_c;
+      p_denoised_sse_neon = &denoised_frame_prev_sse_neon;
+      p_denoised_prev_sse_neon = &denoised_frame_sse_neon;
+    }
+    denoiser_c.DenoiseFrame(video_frame_, p_denoised_c, p_denoised_prev_c,
+                            false);
+    denoiser_sse_neon.DenoiseFrame(video_frame_, p_denoised_sse_neon,
+                                   p_denoised_prev_sse_neon, false);
+    // Invert the flag.
+    denoised_frame_toggle ^= 1;
     // Denoising results should be the same for C and SSE/NEON denoiser.
-    ASSERT_TRUE(test::FramesEqual(denoised_frame_c, denoised_frame_sse_neon));
+    ASSERT_TRUE(test::FramesEqual(*p_denoised_c, *p_denoised_sse_neon));
   }
   ASSERT_NE(0, feof(source_file_)) << "Error reading source file";
 }
diff --git a/webrtc/modules/video_processing/util/denoiser_filter.h b/webrtc/modules/video_processing/util/denoiser_filter.h
index f2c7570..1254a88 100644
--- a/webrtc/modules/video_processing/util/denoiser_filter.h
+++ b/webrtc/modules/video_processing/util/denoiser_filter.h
@@ -25,12 +25,6 @@
 
 enum DenoiserDecision { COPY_BLOCK, FILTER_BLOCK };
 enum CpuType { CPU_NEON, CPU_NOT_NEON };
-struct DenoiseMetrics {
-  uint32_t var;
-  uint32_t sad;
-  uint8_t denoise;
-  bool is_skin;
-};
 
 class DenoiserFilter {
  public:
@@ -43,10 +37,6 @@
                             int src_stride,
                             uint8_t* dst,
                             int dst_stride) = 0;
-  virtual void CopyMem8x8(const uint8_t* src,
-                          int src_stride,
-                          uint8_t* dst,
-                          int dst_stride) = 0;
   virtual uint32_t Variance16x8(const uint8_t* a,
                                 int a_stride,
                                 const uint8_t* b,
@@ -59,8 +49,7 @@
                                      const uint8_t* sig,
                                      int sig_stride,
                                      uint8_t motion_magnitude,
-                                     int increase_denoising,
-                                     bool denoise_always) = 0;
+                                     int increase_denoising) = 0;
 };
 
 }  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_c.cc b/webrtc/modules/video_processing/util/denoiser_filter_c.cc
index 8c84f49..1b3c0b7 100644
--- a/webrtc/modules/video_processing/util/denoiser_filter_c.cc
+++ b/webrtc/modules/video_processing/util/denoiser_filter_c.cc
@@ -25,17 +25,6 @@
   }
 }
 
-void DenoiserFilterC::CopyMem8x8(const uint8_t* src,
-                                 int src_stride,
-                                 uint8_t* dst,
-                                 int dst_stride) {
-  for (int i = 0; i < 8; i++) {
-    memcpy(dst, src, 8);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 uint32_t DenoiserFilterC::Variance16x8(const uint8_t* a,
                                        int a_stride,
                                        const uint8_t* b,
@@ -66,8 +55,7 @@
                                             const uint8_t* sig,
                                             int sig_stride,
                                             uint8_t motion_magnitude,
-                                            int increase_denoising,
-                                            bool denoise_always) {
+                                            int increase_denoising) {
   int sum_diff_thresh = 0;
   int sum_diff = 0;
   int adj_val[3] = {3, 4, 6};
@@ -137,60 +125,10 @@
     sum_diff += col_sum[c];
   }
 
-  if (denoise_always)
-    sum_diff_thresh = INT_MAX;
-  else if (increase_denoising)
-    sum_diff_thresh = kSumDiffThresholdHigh;
-  else
-    sum_diff_thresh = kSumDiffThreshold;
-  if (abs(sum_diff) > sum_diff_thresh) {
-    int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
-    // Only apply the adjustment for max delta up to 3.
-    if (delta < 4) {
-      sig -= sig_stride * 16;
-      mc_running_avg_y -= mc_avg_y_stride * 16;
-      running_avg_y -= avg_y_stride * 16;
-      for (int r = 0; r < 16; ++r) {
-        for (int c = 0; c < 16; ++c) {
-          int diff = mc_running_avg_y[c] - sig[c];
-          int adjustment = abs(diff);
-          if (adjustment > delta)
-            adjustment = delta;
-          if (diff > 0) {
-            // Bring denoised signal down.
-            if (running_avg_y[c] - adjustment < 0)
-              running_avg_y[c] = 0;
-            else
-              running_avg_y[c] = running_avg_y[c] - adjustment;
-            col_sum[c] -= adjustment;
-          } else if (diff < 0) {
-            // Bring denoised signal up.
-            if (running_avg_y[c] + adjustment > 255)
-              running_avg_y[c] = 255;
-            else
-              running_avg_y[c] = running_avg_y[c] + adjustment;
-            col_sum[c] += adjustment;
-          }
-        }
-        sig += sig_stride;
-        mc_running_avg_y += mc_avg_y_stride;
-        running_avg_y += avg_y_stride;
-      }
-
-      sum_diff = 0;
-      for (int c = 0; c < 16; ++c) {
-        if (col_sum[c] >= 128) {
-          col_sum[c] = 127;
-        }
-        sum_diff += col_sum[c];
-      }
-
-      if (abs(sum_diff) > sum_diff_thresh)
-        return COPY_BLOCK;
-    } else {
-      return COPY_BLOCK;
-    }
-  }
+  sum_diff_thresh =
+      increase_denoising ? kSumDiffThresholdHigh : kSumDiffThreshold;
+  if (abs(sum_diff) > sum_diff_thresh)
+    return COPY_BLOCK;
 
   return FILTER_BLOCK;
 }
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_c.h b/webrtc/modules/video_processing/util/denoiser_filter_c.h
index 3e52c3e..d8b6c5e 100644
--- a/webrtc/modules/video_processing/util/denoiser_filter_c.h
+++ b/webrtc/modules/video_processing/util/denoiser_filter_c.h
@@ -22,10 +22,6 @@
                     int src_stride,
                     uint8_t* dst,
                     int dst_stride) override;
-  void CopyMem8x8(const uint8_t* src,
-                  int src_stride,
-                  uint8_t* dst,
-                  int dst_stride) override;
   uint32_t Variance16x8(const uint8_t* a,
                         int a_stride,
                         const uint8_t* b,
@@ -38,8 +34,7 @@
                              const uint8_t* sig,
                              int sig_stride,
                              uint8_t motion_magnitude,
-                             int increase_denoising,
-                             bool denoise_always) override;
+                             int increase_denoising) override;
 };
 
 }  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_neon.cc b/webrtc/modules/video_processing/util/denoiser_filter_neon.cc
index 2920305..195b985 100644
--- a/webrtc/modules/video_processing/util/denoiser_filter_neon.cc
+++ b/webrtc/modules/video_processing/util/denoiser_filter_neon.cc
@@ -75,20 +75,6 @@
   }
 }
 
-void DenoiserFilterNEON::CopyMem8x8(const uint8_t* src,
-                                    int src_stride,
-                                    uint8_t* dst,
-                                    int dst_stride) {
-  uint8x8_t vtmp;
-
-  for (int r = 0; r < 8; r++) {
-    vtmp = vld1_u8(src);
-    vst1_u8(dst, vtmp);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 uint32_t DenoiserFilterNEON::Variance16x8(const uint8_t* a,
                                           int a_stride,
                                           const uint8_t* b,
@@ -106,8 +92,7 @@
                                                const uint8_t* sig,
                                                int sig_stride,
                                                uint8_t motion_magnitude,
-                                               int increase_denoising,
-                                               bool denoise_always) {
+                                               int increase_denoising) {
   // If motion_magnitude is small, making the denoiser more aggressive by
   // increasing the adjustment for each level, level1 adjustment is
   // increased, the deltas stay the same.
@@ -190,92 +175,13 @@
   }
 
   // Too much adjustments => copy block.
-  {
-    int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
-                            vget_low_s64(v_sum_diff_total));
-    int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
-    if (denoise_always)
-      sum_diff_thresh = INT_MAX;
-    else if (increase_denoising)
-      sum_diff_thresh = kSumDiffThresholdHigh;
-    else
-      sum_diff_thresh = kSumDiffThreshold;
-    if (sum_diff > sum_diff_thresh) {
-      // Before returning to copy the block (i.e., apply no denoising),
-      // checK if we can still apply some (weaker) temporal filtering to
-      // this block, that would otherwise not be denoised at all. Simplest
-      // is to apply an additional adjustment to running_avg_y to bring it
-      // closer to sig. The adjustment is capped by a maximum delta, and
-      // chosen such that in most cases the resulting sum_diff will be
-      // within the accceptable range given by sum_diff_thresh.
-
-      // The delta is set by the excess of absolute pixel diff over the
-      // threshold.
-      int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
-      // Only apply the adjustment for max delta up to 3.
-      if (delta < 4) {
-        const uint8x16_t k_delta = vmovq_n_u8(delta);
-        sig -= sig_stride * 16;
-        mc_running_avg_y -= mc_running_avg_y_stride * 16;
-        running_avg_y -= running_avg_y_stride * 16;
-        for (int r = 0; r < 16; ++r) {
-          uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
-          const uint8x16_t v_sig = vld1q_u8(sig);
-          const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
-
-          // Calculate absolute difference and sign masks.
-          const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
-          const uint8x16_t v_diff_pos_mask =
-              vcltq_u8(v_sig, v_mc_running_avg_y);
-          const uint8x16_t v_diff_neg_mask =
-              vcgtq_u8(v_sig, v_mc_running_avg_y);
-          // Clamp absolute difference to delta to get the adjustment.
-          const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
-
-          const uint8x16_t v_pos_adjustment =
-              vandq_u8(v_diff_pos_mask, v_abs_adjustment);
-          const uint8x16_t v_neg_adjustment =
-              vandq_u8(v_diff_neg_mask, v_abs_adjustment);
-
-          v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
-          v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
-
-          // Store results.
-          vst1q_u8(running_avg_y, v_running_avg_y);
-
-          {
-            const int8x16_t v_sum_diff =
-                vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
-                          vreinterpretq_s8_u8(v_pos_adjustment));
-
-            const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
-            const int32x4_t fedc_ba98_7654_3210 =
-                vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
-            const int64x2_t fedcba98_76543210 =
-                vpaddlq_s32(fedc_ba98_7654_3210);
-
-            v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
-          }
-          // Update pointers for next iteration.
-          sig += sig_stride;
-          mc_running_avg_y += mc_running_avg_y_stride;
-          running_avg_y += running_avg_y_stride;
-        }
-        {
-          // Update the sum of all pixel differences of this MB.
-          x = vqadd_s64(vget_high_s64(v_sum_diff_total),
-                        vget_low_s64(v_sum_diff_total));
-          sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
-
-          if (sum_diff > sum_diff_thresh) {
-            return COPY_BLOCK;
-          }
-        }
-      } else {
-        return COPY_BLOCK;
-      }
-    }
-  }
+  int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                          vget_low_s64(v_sum_diff_total));
+  int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+  sum_diff_thresh =
+      increase_denoising ? kSumDiffThresholdHigh : kSumDiffThreshold;
+  if (sum_diff > sum_diff_thresh)
+    return COPY_BLOCK;
 
   // Tell above level that block was filtered.
   running_avg_y -= running_avg_y_stride * 16;
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_neon.h b/webrtc/modules/video_processing/util/denoiser_filter_neon.h
index 2e3ea26..55850bd 100644
--- a/webrtc/modules/video_processing/util/denoiser_filter_neon.h
+++ b/webrtc/modules/video_processing/util/denoiser_filter_neon.h
@@ -22,10 +22,6 @@
                     int src_stride,
                     uint8_t* dst,
                     int dst_stride) override;
-  void CopyMem8x8(const uint8_t* src,
-                  int src_stride,
-                  uint8_t* dst,
-                  int dst_stride) override;
   uint32_t Variance16x8(const uint8_t* a,
                         int a_stride,
                         const uint8_t* b,
@@ -38,8 +34,7 @@
                              const uint8_t* sig,
                              int sig_stride,
                              uint8_t motion_magnitude,
-                             int increase_denoising,
-                             bool denoise_always) override;
+                             int increase_denoising) override;
 };
 
 }  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_sse2.cc b/webrtc/modules/video_processing/util/denoiser_filter_sse2.cc
index 614b6c9..0545a97 100644
--- a/webrtc/modules/video_processing/util/denoiser_filter_sse2.cc
+++ b/webrtc/modules/video_processing/util/denoiser_filter_sse2.cc
@@ -9,7 +9,6 @@
  */
 
 #include <emmintrin.h>
-
 #include "webrtc/modules/video_processing/util/denoiser_filter_sse2.h"
 
 namespace webrtc {
@@ -110,18 +109,6 @@
   }
 }
 
-// TODO(jackychen): Optimize this function using SSE2.
-void DenoiserFilterSSE2::CopyMem8x8(const uint8_t* src,
-                                    int src_stride,
-                                    uint8_t* dst,
-                                    int dst_stride) {
-  for (int i = 0; i < 8; i++) {
-    memcpy(dst, src, 8);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 uint32_t DenoiserFilterSSE2::Variance16x8(const uint8_t* src,
                                           int src_stride,
                                           const uint8_t* ref,
@@ -139,8 +126,8 @@
                                                const uint8_t* sig,
                                                int sig_stride,
                                                uint8_t motion_magnitude,
-                                               int increase_denoising,
-                                               bool denoise_always) {
+                                               int increase_denoising) {
+  DenoiserDecision decision = FILTER_BLOCK;
   unsigned int sum_diff_thresh = 0;
   int shift_inc =
       (increase_denoising && motion_magnitude <= kMotionMagnitudeThreshold) ? 1
@@ -210,76 +197,13 @@
     running_avg_y += avg_y_stride;
   }
 
-  {
-    // Compute the sum of all pixel differences of this MB.
-    unsigned int abs_sum_diff = AbsSumDiff16x1(acc_diff);
-    if (denoise_always)
-      sum_diff_thresh = INT_MAX;
-    else if (increase_denoising)
-      sum_diff_thresh = kSumDiffThresholdHigh;
-    else
-      sum_diff_thresh = kSumDiffThreshold;
-    if (abs_sum_diff > sum_diff_thresh) {
-      // Before returning to copy the block (i.e., apply no denoising),
-      // check if we can still apply some (weaker) temporal filtering to
-      // this block, that would otherwise not be denoised at all. Simplest
-      // is to apply an additional adjustment to running_avg_y to bring it
-      // closer to sig. The adjustment is capped by a maximum delta, and
-      // chosen such that in most cases the resulting sum_diff will be
-      // within the acceptable range given by sum_diff_thresh.
-
-      // The delta is set by the excess of absolute pixel diff over the
-      // threshold.
-      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
-      // Only apply the adjustment for max delta up to 3.
-      if (delta < 4) {
-        const __m128i k_delta = _mm_set1_epi8(delta);
-        sig -= sig_stride * 16;
-        mc_running_avg_y -= mc_avg_y_stride * 16;
-        running_avg_y -= avg_y_stride * 16;
-        for (int r = 0; r < 16; ++r) {
-          __m128i v_running_avg_y =
-              _mm_loadu_si128(reinterpret_cast<__m128i*>(&running_avg_y[0]));
-          // Calculate differences.
-          const __m128i v_sig =
-              _mm_loadu_si128(reinterpret_cast<const __m128i*>(&sig[0]));
-          const __m128i v_mc_running_avg_y =
-              _mm_loadu_si128(reinterpret_cast<__m128i*>(&mc_running_avg_y[0]));
-          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
-          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
-          // Obtain the sign. FF if diff is negative.
-          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
-          // Clamp absolute difference to delta to get the adjustment.
-          const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
-          // Restore the sign and get positive and negative adjustments.
-          __m128i padj, nadj;
-          padj = _mm_andnot_si128(diff_sign, adj);
-          nadj = _mm_and_si128(diff_sign, adj);
-          // Calculate filtered value.
-          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
-          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
-          _mm_storeu_si128(reinterpret_cast<__m128i*>(running_avg_y),
-                           v_running_avg_y);
-
-          // Accumulate the adjustments.
-          acc_diff = _mm_subs_epi8(acc_diff, padj);
-          acc_diff = _mm_adds_epi8(acc_diff, nadj);
-
-          // Update pointers for next iteration.
-          sig += sig_stride;
-          mc_running_avg_y += mc_avg_y_stride;
-          running_avg_y += avg_y_stride;
-        }
-        abs_sum_diff = AbsSumDiff16x1(acc_diff);
-        if (abs_sum_diff > sum_diff_thresh) {
-          return COPY_BLOCK;
-        }
-      } else {
-        return COPY_BLOCK;
-      }
-    }
-  }
-  return FILTER_BLOCK;
+  // Compute the sum of all pixel differences of this MB.
+  unsigned int abs_sum_diff = AbsSumDiff16x1(acc_diff);
+  sum_diff_thresh =
+      increase_denoising ? kSumDiffThresholdHigh : kSumDiffThreshold;
+  if (abs_sum_diff > sum_diff_thresh)
+    decision = COPY_BLOCK;
+  return decision;
 }
 
 }  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_sse2.h b/webrtc/modules/video_processing/util/denoiser_filter_sse2.h
index 395fa10..731344c 100644
--- a/webrtc/modules/video_processing/util/denoiser_filter_sse2.h
+++ b/webrtc/modules/video_processing/util/denoiser_filter_sse2.h
@@ -22,10 +22,6 @@
                     int src_stride,
                     uint8_t* dst,
                     int dst_stride) override;
-  void CopyMem8x8(const uint8_t* src,
-                  int src_stride,
-                  uint8_t* dst,
-                  int dst_stride) override;
   uint32_t Variance16x8(const uint8_t* a,
                         int a_stride,
                         const uint8_t* b,
@@ -38,8 +34,7 @@
                              const uint8_t* sig,
                              int sig_stride,
                              uint8_t motion_magnitude,
-                             int increase_denoising,
-                             bool denoise_always) override;
+                             int increase_denoising) override;
 };
 
 }  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/noise_estimation.cc b/webrtc/modules/video_processing/util/noise_estimation.cc
index 87beac3..a0ae2c4 100644
--- a/webrtc/modules/video_processing/util/noise_estimation.cc
+++ b/webrtc/modules/video_processing/util/noise_estimation.cc
@@ -27,10 +27,10 @@
   consec_low_var_[mb_index]++;
   num_static_block_++;
   if (consec_low_var_[mb_index] >= kConsecLowVarFrame &&
-      (luma >> 8) < kAverageLumaMax && (luma >> 8) > kAverageLumaMin) {
+      (luma >> 6) < kAverageLumaMax && (luma >> 6) > kAverageLumaMin) {
     // Normalized var by the average luma value, this gives more weight to
     // darker blocks.
-    int nor_var = var / (luma >> 12);
+    int nor_var = var / (luma >> 10);
     noise_var_ +=
         nor_var > kBlockSelectionVarMax ? kBlockSelectionVarMax : nor_var;
     num_noisy_block_++;
@@ -46,25 +46,28 @@
   // condition more reasonable.
   // No enough samples implies the motion of the camera or too many moving
   // objects in the frame.
-  if (num_static_block_ < (0.65 * mb_cols_ * mb_rows_) || !num_noisy_block_) {
+  if (num_static_block_ <
+          (0.65 * mb_cols_ * mb_rows_ / NOISE_SUBSAMPLE_INTERVAL) ||
+      !num_noisy_block_) {
+#if DISPLAY
+    printf("Not enough samples. %d \n", num_static_block_);
+#endif
     noise_var_ = 0;
     noise_var_accum_ = 0;
-    num_static_block_ = 0;
     num_noisy_block_ = 0;
-#if DISPLAY
-    printf("Not enough samples.\n");
-#endif
+    num_static_block_ = 0;
     return;
   } else {
+#if DISPLAY
+    printf("%d %d fraction = %.3f\n", num_static_block_,
+           mb_cols_ * mb_rows_ / NOISE_SUBSAMPLE_INTERVAL,
+           percent_static_block_);
+#endif
     // Normalized by the number of noisy blocks.
     noise_var_ /= num_noisy_block_;
     // Get the percentage of static blocks.
-    percent_static_block_ =
-        static_cast<double>(num_static_block_) / (mb_cols_ * mb_rows_);
-#if DISPLAY
-    printf("%d %d fraction = %.3f\n", num_static_block_, mb_cols_ * mb_rows_,
-           percent_static_block_);
-#endif
+    percent_static_block_ = static_cast<double>(num_static_block_) /
+                            (mb_cols_ * mb_rows_ / NOISE_SUBSAMPLE_INTERVAL);
     num_noisy_block_ = 0;
     num_static_block_ = 0;
   }
@@ -75,12 +78,12 @@
   } else {
     noise_var_accum_ = (noise_var_accum_ * 15 + noise_var_) / 16;
   }
-  // Reset noise_var_ for the next frame.
-  noise_var_ = 0;
 #if DISPLAY
   printf("noise_var_accum_ = %.1f, noise_var_ = %d.\n", noise_var_accum_,
          noise_var_);
 #endif
+  // Reset noise_var_ for the next frame.
+  noise_var_ = 0;
 }
 
 uint8_t NoiseEstimation::GetNoiseLevel() {
diff --git a/webrtc/modules/video_processing/util/noise_estimation.h b/webrtc/modules/video_processing/util/noise_estimation.h
index ca5cc23..24d44ca 100644
--- a/webrtc/modules/video_processing/util/noise_estimation.h
+++ b/webrtc/modules/video_processing/util/noise_estimation.h
@@ -18,7 +18,6 @@
 
 namespace webrtc {
 
-#define EXPERIMENTAL 0
 #define DISPLAY 0
 
 const int kNoiseThreshold = 200;
@@ -28,11 +27,18 @@
 const int kAverageLumaMax = 220;
 const int kBlockSelectionVarMax = kNoiseThreshold << 1;
 
+// TODO(jackychen): To test different sampling strategy.
+// Collect noise data every NOISE_SUBSAMPLE_INTERVAL blocks.
+#define NOISE_SUBSAMPLE_INTERVAL 41
+
 class NoiseEstimation {
  public:
   void Init(int width, int height, CpuType cpu_type);
+  // Collect noise data from one qualified block.
   void GetNoise(int mb_index, uint32_t var, uint32_t luma);
+  // Reset the counter for consecutive low-var blocks.
   void ResetConsecLowVar(int mb_index);
+  // Update noise level for current frame.
   void UpdateNoiseLevel();
   // 0: low noise, 1: high noise
   uint8_t GetNoiseLevel();
@@ -42,13 +48,13 @@
   int height_;
   int mb_rows_;
   int mb_cols_;
+  int num_noisy_block_;
+  int num_static_block_;
   CpuType cpu_type_;
   uint32_t noise_var_;
   double noise_var_accum_;
-  int num_noisy_block_;
-  int num_static_block_;
   double percent_static_block_;
-  rtc::scoped_ptr<uint32_t[]> consec_low_var_;
+  std::unique_ptr<uint32_t[]> consec_low_var_;
 };
 
 }  // namespace webrtc
diff --git a/webrtc/modules/video_processing/video_denoiser.cc b/webrtc/modules/video_processing/video_denoiser.cc
index b00da5c..4eef6d6 100644
--- a/webrtc/modules/video_processing/video_denoiser.cc
+++ b/webrtc/modules/video_processing/video_denoiser.cc
@@ -7,10 +7,65 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #include "webrtc/common_video/libyuv/include/scaler.h"
 #include "webrtc/common_video/libyuv/include/webrtc_libyuv.h"
 #include "webrtc/modules/video_processing/video_denoiser.h"
 
+#if DISPLAY  // Rectangle diagnostics
+static void CopyMem8x8(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride) {
+  for (int i = 0; i < 8; i++) {
+    memcpy(dst, src, 8);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void ShowRect(const std::unique_ptr<DenoiserFilter>& filter,
+                     const std::unique_ptr<uint8_t[]>& d_status,
+                     const std::unique_ptr<uint8_t[]>& moving_edge_red,
+                     const std::unique_ptr<uint8_t[]>& x_density,
+                     const std::unique_ptr<uint8_t[]>& y_density,
+                     const uint8_t* u_src,
+                     const uint8_t* v_src,
+                     uint8_t* u_dst,
+                     uint8_t* v_dst,
+                     int mb_rows_,
+                     int mb_cols_,
+                     int stride_u_,
+                     int stride_v_) {
+  for (int mb_row = 0; mb_row < mb_rows_; ++mb_row) {
+    for (int mb_col = 0; mb_col < mb_cols_; ++mb_col) {
+      int mb_index = mb_row * mb_cols_ + mb_col;
+      const uint8_t* mb_src_u =
+          u_src + (mb_row << 3) * stride_u_ + (mb_col << 3);
+      const uint8_t* mb_src_v =
+          v_src + (mb_row << 3) * stride_v_ + (mb_col << 3);
+      uint8_t* mb_dst_u = u_dst + (mb_row << 3) * stride_u_ + (mb_col << 3);
+      uint8_t* mb_dst_v = v_dst + (mb_row << 3) * stride_v_ + (mb_col << 3);
+      uint8_t uv_tmp[8 * 8];
+      memset(uv_tmp, 200, 8 * 8);
+      if (d_status[mb_index] == 1) {
+        // Paint to red.
+        CopyMem8x8(mb_src_u, stride_u_, mb_dst_u, stride_u_);
+        CopyMem8x8(uv_tmp, 8, mb_dst_v, stride_v_);
+      } else if (moving_edge_red[mb_row * mb_cols_ + mb_col] &&
+                 x_density[mb_col] * y_density[mb_row]) {
+        // Paint to blue.
+        CopyMem8x8(uv_tmp, 8, mb_dst_u, stride_u_);
+        CopyMem8x8(mb_src_v, stride_v_, mb_dst_v, stride_v_);
+      } else {
+        CopyMem8x8(mb_src_u, stride_u_, mb_dst_u, stride_u_);
+        CopyMem8x8(mb_src_v, stride_v_, mb_dst_v, stride_v_);
+      }
+    }
+  }
+}
+#endif
+
 namespace webrtc {
 
 VideoDenoiser::VideoDenoiser(bool runtime_cpu_detection)
@@ -19,293 +74,255 @@
       filter_(DenoiserFilter::Create(runtime_cpu_detection, &cpu_type_)),
       ne_(new NoiseEstimation()) {}
 
-#if EXPERIMENTAL
-// Check the mb position(1: close to the center, 3: close to the border).
-static int PositionCheck(int mb_row, int mb_col, int mb_rows, int mb_cols) {
-  if ((mb_row >= (mb_rows >> 3)) && (mb_row <= (7 * mb_rows >> 3)) &&
-      (mb_col >= (mb_cols >> 3)) && (mb_col <= (7 * mb_cols >> 3)))
+void VideoDenoiser::DenoiserReset(const VideoFrame& frame,
+                                  VideoFrame* denoised_frame,
+                                  VideoFrame* denoised_frame_prev) {
+  width_ = frame.width();
+  height_ = frame.height();
+  mb_cols_ = width_ >> 4;
+  mb_rows_ = height_ >> 4;
+  stride_y_ = frame.stride(kYPlane);
+  stride_u_ = frame.stride(kUPlane);
+  stride_v_ = frame.stride(kVPlane);
+
+  // Allocate an empty buffer for denoised_frame_prev.
+  denoised_frame_prev->CreateEmptyFrame(width_, height_, stride_y_, stride_u_,
+                                        stride_v_);
+  // Allocate and initialize denoised_frame with key frame.
+  denoised_frame->CreateFrame(frame.buffer(kYPlane), frame.buffer(kUPlane),
+                              frame.buffer(kVPlane), width_, height_, stride_y_,
+                              stride_u_, stride_v_, kVideoRotation_0);
+  // Set time parameters to the output frame.
+  denoised_frame->set_timestamp(frame.timestamp());
+  denoised_frame->set_render_time_ms(frame.render_time_ms());
+
+  // Init noise estimator and allocate buffers.
+  ne_->Init(width_, height_, cpu_type_);
+  moving_edge_.reset(new uint8_t[mb_cols_ * mb_rows_]);
+  mb_filter_decision_.reset(new DenoiserDecision[mb_cols_ * mb_rows_]);
+  x_density_.reset(new uint8_t[mb_cols_]);
+  y_density_.reset(new uint8_t[mb_rows_]);
+  moving_object_.reset(new uint8_t[mb_cols_ * mb_rows_]);
+}
+
+int VideoDenoiser::PositionCheck(int mb_row, int mb_col, int noise_level) {
+  if (noise_level == 0)
     return 1;
-  else if ((mb_row >= (mb_rows >> 4)) && (mb_row <= (15 * mb_rows >> 4)) &&
-           (mb_col >= (mb_cols >> 4)) && (mb_col <= (15 * mb_cols >> 4)))
+  if ((mb_row <= (mb_rows_ >> 4)) || (mb_col <= (mb_cols_ >> 4)) ||
+      (mb_col >= (15 * mb_cols_ >> 4)))
+    return 3;
+  else if ((mb_row <= (mb_rows_ >> 3)) || (mb_col <= (mb_cols_ >> 3)) ||
+           (mb_col >= (7 * mb_cols_ >> 3)))
     return 2;
   else
-    return 3;
+    return 1;
 }
 
-static void ReduceFalseDetection(const std::unique_ptr<uint8_t[]>& d_status,
-                                 std::unique_ptr<uint8_t[]>* d_status_tmp1,
-                                 std::unique_ptr<uint8_t[]>* d_status_tmp2,
-                                 int noise_level,
-                                 int mb_rows,
-                                 int mb_cols) {
-  // Draft. This can be optimized. This code block is to reduce false detection
-  // in moving object detection.
-  int mb_row_min = noise_level ? mb_rows >> 3 : 1;
-  int mb_col_min = noise_level ? mb_cols >> 3 : 1;
-  int mb_row_max = noise_level ? (7 * mb_rows >> 3) : mb_rows - 2;
-  int mb_col_max = noise_level ? (7 * mb_cols >> 3) : mb_cols - 2;
-  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
-  // Up left.
-  for (int mb_row = mb_row_min; mb_row <= mb_row_max; ++mb_row) {
-    for (int mb_col = mb_col_min; mb_col <= mb_col_max; ++mb_col) {
-      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
-          ((*d_status_tmp1)[(mb_row - 1) * mb_cols + mb_col] |
-           (*d_status_tmp1)[mb_row * mb_cols + mb_col - 1]);
+void VideoDenoiser::ReduceFalseDetection(
+    const std::unique_ptr<uint8_t[]>& d_status,
+    std::unique_ptr<uint8_t[]>* moving_edge_red,
+    int noise_level) {
+  // From up left corner.
+  int mb_col_stop = mb_cols_ - 1;
+  for (int mb_row = 0; mb_row <= mb_rows_ - 1; ++mb_row) {
+    for (int mb_col = 0; mb_col <= mb_col_stop; ++mb_col) {
+      if (d_status[mb_row * mb_cols_ + mb_col]) {
+        mb_col_stop = mb_col - 1;
+        break;
+      }
+      (*moving_edge_red)[mb_row * mb_cols_ + mb_col] = 0;
     }
   }
-  memcpy((*d_status_tmp2).get(), (*d_status_tmp1).get(), mb_rows * mb_cols);
-  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
-  // Bottom left.
-  for (int mb_row = mb_row_max; mb_row >= mb_row_min; --mb_row) {
-    for (int mb_col = mb_col_min; mb_col <= mb_col_max; ++mb_col) {
-      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
-          ((*d_status_tmp1)[(mb_row + 1) * mb_cols + mb_col] |
-           (*d_status_tmp1)[mb_row * mb_cols + mb_col - 1]);
-      (*d_status_tmp2)[mb_row * mb_cols + mb_col] &=
-          (*d_status_tmp1)[mb_row * mb_cols + mb_col];
+  // From bottom left corner.
+  mb_col_stop = mb_cols_ - 1;
+  for (int mb_row = mb_rows_ - 1; mb_row >= 0; --mb_row) {
+    for (int mb_col = 0; mb_col <= mb_col_stop; ++mb_col) {
+      if (d_status[mb_row * mb_cols_ + mb_col]) {
+        mb_col_stop = mb_col - 1;
+        break;
+      }
+      (*moving_edge_red)[mb_row * mb_cols_ + mb_col] = 0;
     }
   }
-  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
-  // Up right.
-  for (int mb_row = mb_row_min; mb_row <= mb_row_max; ++mb_row) {
-    for (int mb_col = mb_col_max; mb_col >= mb_col_min; --mb_col) {
-      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
-          ((*d_status_tmp1)[(mb_row - 1) * mb_cols + mb_col] |
-           (*d_status_tmp1)[mb_row * mb_cols + mb_col + 1]);
-      (*d_status_tmp2)[mb_row * mb_cols + mb_col] &=
-          (*d_status_tmp1)[mb_row * mb_cols + mb_col];
+  // From up right corner.
+  mb_col_stop = 0;
+  for (int mb_row = 0; mb_row <= mb_rows_ - 1; ++mb_row) {
+    for (int mb_col = mb_cols_ - 1; mb_col >= mb_col_stop; --mb_col) {
+      if (d_status[mb_row * mb_cols_ + mb_col]) {
+        mb_col_stop = mb_col + 1;
+        break;
+      }
+      (*moving_edge_red)[mb_row * mb_cols_ + mb_col] = 0;
     }
   }
-  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
-  // Bottom right.
-  for (int mb_row = mb_row_max; mb_row >= mb_row_min; --mb_row) {
-    for (int mb_col = mb_col_max; mb_col >= mb_col_min; --mb_col) {
-      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
-          ((*d_status_tmp1)[(mb_row + 1) * mb_cols + mb_col] |
-           (*d_status_tmp1)[mb_row * mb_cols + mb_col + 1]);
-      (*d_status_tmp2)[mb_row * mb_cols + mb_col] &=
-          (*d_status_tmp1)[mb_row * mb_cols + mb_col];
+  // From bottom right corner.
+  mb_col_stop = 0;
+  for (int mb_row = mb_rows_ - 1; mb_row >= 0; --mb_row) {
+    for (int mb_col = mb_cols_ - 1; mb_col >= mb_col_stop; --mb_col) {
+      if (d_status[mb_row * mb_cols_ + mb_col]) {
+        mb_col_stop = mb_col + 1;
+        break;
+      }
+      (*moving_edge_red)[mb_row * mb_cols_ + mb_col] = 0;
     }
   }
 }
 
-static bool TrailingBlock(const std::unique_ptr<uint8_t[]>& d_status,
-                          int mb_row,
-                          int mb_col,
-                          int mb_rows,
-                          int mb_cols) {
-  int mb_index = mb_row * mb_cols + mb_col;
-  if (!mb_row || !mb_col || mb_row == mb_rows - 1 || mb_col == mb_cols - 1)
-    return false;
-  return d_status[mb_index + 1] || d_status[mb_index - 1] ||
-         d_status[mb_index + mb_cols] || d_status[mb_index - mb_cols];
+bool VideoDenoiser::IsTrailingBlock(const std::unique_ptr<uint8_t[]>& d_status,
+                                    int mb_row,
+                                    int mb_col) {
+  bool ret = false;
+  int mb_index = mb_row * mb_cols_ + mb_col;
+  if (!mb_row || !mb_col || mb_row == mb_rows_ - 1 || mb_col == mb_cols_ - 1)
+    ret = false;
+  else
+    ret = d_status[mb_index + 1] || d_status[mb_index - 1] ||
+          d_status[mb_index + mb_cols_] || d_status[mb_index - mb_cols_];
+  return ret;
 }
-#endif
 
-#if DISPLAY
-void ShowRect(const std::unique_ptr<DenoiserFilter>& filter,
-              const std::unique_ptr<uint8_t[]>& d_status,
-              const std::unique_ptr<uint8_t[]>& d_status_tmp2,
-              const std::unique_ptr<uint8_t[]>& x_density,
-              const std::unique_ptr<uint8_t[]>& y_density,
-              const uint8_t* u_src,
-              const uint8_t* v_src,
-              uint8_t* u_dst,
-              uint8_t* v_dst,
-              int mb_rows,
-              int mb_cols,
-              int stride_u,
-              int stride_v) {
-  for (int mb_row = 0; mb_row < mb_rows; ++mb_row) {
-    for (int mb_col = 0; mb_col < mb_cols; ++mb_col) {
-      int mb_index = mb_row * mb_cols + mb_col;
-      const uint8_t* mb_src_u =
-          u_src + (mb_row << 3) * stride_u + (mb_col << 3);
-      const uint8_t* mb_src_v =
-          v_src + (mb_row << 3) * stride_v + (mb_col << 3);
-      uint8_t* mb_dst_u = u_dst + (mb_row << 3) * stride_u + (mb_col << 3);
-      uint8_t* mb_dst_v = v_dst + (mb_row << 3) * stride_v + (mb_col << 3);
-      uint8_t y_tmp_255[8 * 8];
-      memset(y_tmp_255, 200, 8 * 8);
-      // x_density_[mb_col] * y_density_[mb_row]
-      if (d_status[mb_index] == 1) {
-        // Paint to red.
-        filter->CopyMem8x8(mb_src_u, stride_u, mb_dst_u, stride_u);
-        filter->CopyMem8x8(y_tmp_255, 8, mb_dst_v, stride_v);
-#if EXPERIMENTAL
-      } else if (d_status_tmp2[mb_row * mb_cols + mb_col] &&
-                 x_density[mb_col] * y_density[mb_row]) {
-#else
-      } else if (x_density[mb_col] * y_density[mb_row]) {
-#endif
-        // Paint to blue.
-        filter->CopyMem8x8(y_tmp_255, 8, mb_dst_u, stride_u);
-        filter->CopyMem8x8(mb_src_v, stride_v, mb_dst_v, stride_v);
-      } else {
-        filter->CopyMem8x8(mb_src_u, stride_u, mb_dst_u, stride_u);
-        filter->CopyMem8x8(mb_src_v, stride_v, mb_dst_v, stride_v);
+void VideoDenoiser::CopySrcOnMOB(const uint8_t* y_src, uint8_t* y_dst) {
+  // Loop over to copy src block if the block is marked as moving object block
+  // or if the block may cause trailing artifacts.
+  for (int mb_row = 0; mb_row < mb_rows_; ++mb_row) {
+    const int mb_index_base = mb_row * mb_cols_;
+    const int offset_base = (mb_row << 4) * stride_y_;
+    const uint8_t* mb_src_base = y_src + offset_base;
+    uint8_t* mb_dst_base = y_dst + offset_base;
+    for (int mb_col = 0; mb_col < mb_cols_; ++mb_col) {
+      const int mb_index = mb_index_base + mb_col;
+      const uint32_t offset_col = mb_col << 4;
+      const uint8_t* mb_src = mb_src_base + offset_col;
+      uint8_t* mb_dst = mb_dst_base + offset_col;
+      // Check if the block is a moving object block or may cause trailing
+      // artifacts.
+      if (mb_filter_decision_[mb_index] != FILTER_BLOCK ||
+          IsTrailingBlock(moving_edge_, mb_row, mb_col) ||
+          (x_density_[mb_col] * y_density_[mb_row] &&
+           moving_object_[mb_row * mb_cols_ + mb_col])) {
+        // Copy y source.
+        filter_->CopyMem16x16(mb_src, stride_y_, mb_dst, stride_y_);
       }
     }
   }
 }
-#endif
 
 void VideoDenoiser::DenoiseFrame(const VideoFrame& frame,
                                  VideoFrame* denoised_frame,
                                  VideoFrame* denoised_frame_prev,
-                                 int noise_level_prev) {
-  int stride_y = frame.stride(kYPlane);
-  int stride_u = frame.stride(kUPlane);
-  int stride_v = frame.stride(kVPlane);
-  // If previous width and height are different from current frame's, then no
-  // denoising for the current frame.
+                                 bool noise_estimation_enabled) {
+  // If the width or height differs from the previous frame's, reallocate the
+  // buffers and skip denoising for the current frame.
   if (width_ != frame.width() || height_ != frame.height()) {
-    width_ = frame.width();
-    height_ = frame.height();
-    denoised_frame->CreateFrame(frame.buffer(kYPlane), frame.buffer(kUPlane),
-                                frame.buffer(kVPlane), width_, height_,
-                                stride_y, stride_u, stride_v, kVideoRotation_0);
-    denoised_frame_prev->CreateFrame(
-        frame.buffer(kYPlane), frame.buffer(kUPlane), frame.buffer(kVPlane),
-        width_, height_, stride_y, stride_u, stride_v, kVideoRotation_0);
-    // Setting time parameters to the output frame.
-    denoised_frame->set_timestamp(frame.timestamp());
-    denoised_frame->set_render_time_ms(frame.render_time_ms());
-    ne_->Init(width_, height_, cpu_type_);
+    DenoiserReset(frame, denoised_frame, denoised_frame_prev);
     return;
   }
-  // For 16x16 block.
-  int mb_cols = width_ >> 4;
-  int mb_rows = height_ >> 4;
-  if (metrics_.get() == nullptr)
-    metrics_.reset(new DenoiseMetrics[mb_cols * mb_rows]());
-  if (d_status_.get() == nullptr) {
-    d_status_.reset(new uint8_t[mb_cols * mb_rows]());
-#if EXPERIMENTAL
-    d_status_tmp1_.reset(new uint8_t[mb_cols * mb_rows]());
-    d_status_tmp2_.reset(new uint8_t[mb_cols * mb_rows]());
-#endif
-    x_density_.reset(new uint8_t[mb_cols]());
-    y_density_.reset(new uint8_t[mb_rows]());
-  }
 
-  // Denoise on Y plane.
+  // Set buffer pointers.
+  const uint8_t* y_src = frame.buffer(kYPlane);
+  const uint8_t* u_src = frame.buffer(kUPlane);
+  const uint8_t* v_src = frame.buffer(kVPlane);
   uint8_t* y_dst = denoised_frame->buffer(kYPlane);
   uint8_t* u_dst = denoised_frame->buffer(kUPlane);
   uint8_t* v_dst = denoised_frame->buffer(kVPlane);
   uint8_t* y_dst_prev = denoised_frame_prev->buffer(kYPlane);
-  const uint8_t* y_src = frame.buffer(kYPlane);
-  const uint8_t* u_src = frame.buffer(kUPlane);
-  const uint8_t* v_src = frame.buffer(kVPlane);
-  uint8_t noise_level = noise_level_prev == -1 ? 0 : ne_->GetNoiseLevel();
-  // Temporary buffer to store denoising result.
-  uint8_t y_tmp[16 * 16] = {0};
-  memset(x_density_.get(), 0, mb_cols);
-  memset(y_density_.get(), 0, mb_rows);
+  memset(x_density_.get(), 0, mb_cols_);
+  memset(y_density_.get(), 0, mb_rows_);
+  memset(moving_object_.get(), 1, mb_cols_ * mb_rows_);
 
+  uint8_t noise_level = noise_estimation_enabled ? ne_->GetNoiseLevel() : 0;
+  int thr_var_base = 16 * 16 * 5;
   // Loop over blocks to accumulate/extract noise level and update x/y_density
   // factors for moving object detection.
-  for (int mb_row = 0; mb_row < mb_rows; ++mb_row) {
-    for (int mb_col = 0; mb_col < mb_cols; ++mb_col) {
-      const uint8_t* mb_src = y_src + (mb_row << 4) * stride_y + (mb_col << 4);
-      uint8_t* mb_dst_prev =
-          y_dst_prev + (mb_row << 4) * stride_y + (mb_col << 4);
-      int mb_index = mb_row * mb_cols + mb_col;
-#if EXPERIMENTAL
-      int pos_factor = PositionCheck(mb_row, mb_col, mb_rows, mb_cols);
-      uint32_t thr_var_adp = 16 * 16 * 5 * (noise_level ? pos_factor : 1);
-#else
-      uint32_t thr_var_adp = 16 * 16 * 5;
-#endif
-      int brightness = 0;
-      for (int i = 0; i < 16; ++i) {
-        for (int j = 0; j < 16; ++j) {
-          brightness += mb_src[i * stride_y + j];
+  for (int mb_row = 0; mb_row < mb_rows_; ++mb_row) {
+    const int mb_index_base = mb_row * mb_cols_;
+    const int offset_base = (mb_row << 4) * stride_y_;
+    const uint8_t* mb_src_base = y_src + offset_base;
+    uint8_t* mb_dst_base = y_dst + offset_base;
+    uint8_t* mb_dst_prev_base = y_dst_prev + offset_base;
+    for (int mb_col = 0; mb_col < mb_cols_; ++mb_col) {
+      const int mb_index = mb_index_base + mb_col;
+      const bool ne_enable = (mb_index % NOISE_SUBSAMPLE_INTERVAL == 0);
+      const int pos_factor = PositionCheck(mb_row, mb_col, noise_level);
+      const uint32_t thr_var_adp = thr_var_base * pos_factor;
+      const uint32_t offset_col = mb_col << 4;
+      const uint8_t* mb_src = mb_src_base + offset_col;
+      uint8_t* mb_dst = mb_dst_base + offset_col;
+      uint8_t* mb_dst_prev = mb_dst_prev_base + offset_col;
+
+      // TODO(jackychen): Need SSE2/NEON opt.
+      int luma = 0;
+      if (ne_enable) {
+        for (int i = 4; i < 12; ++i) {
+          for (int j = 4; j < 12; ++j) {
+            luma += mb_src[i * stride_y_ + j];
+          }
         }
       }
 
-      // Get the denoised block.
-      filter_->MbDenoise(mb_dst_prev, stride_y, y_tmp, 16, mb_src, stride_y, 0,
-                         1, true);
-      // The variance is based on the denoised blocks in time T and T-1.
-      metrics_[mb_index].var = filter_->Variance16x8(
-          mb_dst_prev, stride_y, y_tmp, 16, &metrics_[mb_index].sad);
+      // Get the filtered block and filter_decision.
+      mb_filter_decision_[mb_index] =
+          filter_->MbDenoise(mb_dst_prev, stride_y_, mb_dst, stride_y_, mb_src,
+                             stride_y_, 0, noise_level);
 
-      if (metrics_[mb_index].var > thr_var_adp) {
-        ne_->ResetConsecLowVar(mb_index);
-        d_status_[mb_index] = 1;
-#if EXPERIMENTAL
-        if (noise_level == 0 || pos_factor < 3) {
-          x_density_[mb_col] += 1;
-          y_density_[mb_row] += 1;
+      // If filter decision is FILTER_BLOCK, no need to check moving edge.
+      // It is unlikely for a moving edge block to be filtered in current
+      // setting.
+      if (mb_filter_decision_[mb_index] == FILTER_BLOCK) {
+        uint32_t sse_t = 0;
+        if (ne_enable) {
+          // The variance used in noise estimation is based on the src block in
+          // time t (mb_src) and filtered block in time t-1 (mb_dst_prev).
+          uint32_t noise_var = filter_->Variance16x8(mb_dst_prev, stride_y_,
+                                                     mb_src, stride_y_, &sse_t);
+          ne_->GetNoise(mb_index, noise_var, luma);
         }
-#else
-        x_density_[mb_col] += 1;
-        y_density_[mb_row] += 1;
-#endif
+        moving_edge_[mb_index] = 0;  // Not a moving edge block.
       } else {
         uint32_t sse_t = 0;
-        // The variance is based on the src blocks in time T and denoised block
-        // in time T-1.
-        uint32_t noise_var = filter_->Variance16x8(mb_dst_prev, stride_y,
-                                                   mb_src, stride_y, &sse_t);
-        ne_->GetNoise(mb_index, noise_var, brightness);
-        d_status_[mb_index] = 0;
-      }
-      // Track denoised frame.
-      filter_->CopyMem16x16(y_tmp, 16, mb_dst_prev, stride_y);
-    }
-  }
-
-#if EXPERIMENTAL
-  ReduceFalseDetection(d_status_, &d_status_tmp1_, &d_status_tmp2_, noise_level,
-                       mb_rows, mb_cols);
-#endif
-
-  // Denoise each MB based on the results of moving objects detection.
-  for (int mb_row = 0; mb_row < mb_rows; ++mb_row) {
-    for (int mb_col = 0; mb_col < mb_cols; ++mb_col) {
-      const uint8_t* mb_src = y_src + (mb_row << 4) * stride_y + (mb_col << 4);
-      uint8_t* mb_dst = y_dst + (mb_row << 4) * stride_y + (mb_col << 4);
-      const uint8_t* mb_src_u =
-          u_src + (mb_row << 3) * stride_u + (mb_col << 3);
-      const uint8_t* mb_src_v =
-          v_src + (mb_row << 3) * stride_v + (mb_col << 3);
-      uint8_t* mb_dst_u = u_dst + (mb_row << 3) * stride_u + (mb_col << 3);
-      uint8_t* mb_dst_v = v_dst + (mb_row << 3) * stride_v + (mb_col << 3);
-#if EXPERIMENTAL
-      if ((!d_status_tmp2_[mb_row * mb_cols + mb_col] ||
-           x_density_[mb_col] * y_density_[mb_row] == 0) &&
-          !TrailingBlock(d_status_, mb_row, mb_col, mb_rows, mb_cols)) {
-#else
-      if (x_density_[mb_col] * y_density_[mb_row] == 0) {
-#endif
-        if (filter_->MbDenoise(mb_dst, stride_y, y_tmp, 16, mb_src, stride_y, 0,
-                               noise_level, false) == FILTER_BLOCK) {
-          filter_->CopyMem16x16(y_tmp, 16, mb_dst, stride_y);
+        // The variance used in MOD is based on the filtered blocks in time
+        // T (mb_dst) and T-1 (mb_dst_prev).
+        uint32_t noise_var = filter_->Variance16x8(mb_dst_prev, stride_y_,
+                                                   mb_dst, stride_y_, &sse_t);
+        if (noise_var > thr_var_adp) {  // Moving edge checking.
+          if (ne_enable) {
+            ne_->ResetConsecLowVar(mb_index);
+          }
+          moving_edge_[mb_index] = 1;  // Mark as moving edge block.
+          x_density_[mb_col] += (pos_factor < 3);
+          y_density_[mb_row] += (pos_factor < 3);
         } else {
-          // Copy y source.
-          filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
+          moving_edge_[mb_index] = 0;
+          if (ne_enable) {
+            // The variance used in noise estimation is based on the src block
+            // in time t (mb_src) and filtered block in time t-1 (mb_dst_prev).
+            uint32_t noise_var = filter_->Variance16x8(
+                mb_dst_prev, stride_y_, mb_src, stride_y_, &sse_t);
+            ne_->GetNoise(mb_index, noise_var, luma);
+          }
         }
-      } else {
-        // Copy y source.
-        filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
       }
-      filter_->CopyMem8x8(mb_src_u, stride_u, mb_dst_u, stride_u);
-      filter_->CopyMem8x8(mb_src_v, stride_v, mb_dst_v, stride_v);
-    }
-  }
+    }  // End of for loop
+  }    // End of for loop
+
+  ReduceFalseDetection(moving_edge_, &moving_object_, noise_level);
+
+  CopySrcOnMOB(y_src, y_dst);
+
+  // TODO(jackychen): Need SSE2/NEON opt.
+  // Copy u/v planes.
+  memcpy(u_dst, u_src, (height_ >> 1) * stride_u_);
+  memcpy(v_dst, v_src, (height_ >> 1) * stride_v_);
+
+  // Set time parameters to the output frame.
+  denoised_frame->set_timestamp(frame.timestamp());
+  denoised_frame->set_render_time_ms(frame.render_time_ms());
 
 #if DISPLAY  // Rectangle diagnostics
   // Show rectangular region
-  ShowRect(filter_, d_status_, d_status_tmp2_, x_density_, y_density_, u_src,
-           v_src, u_dst, v_dst, mb_rows, mb_cols, stride_u, stride_v);
+  ShowRect(filter_, moving_edge_, moving_object_, x_density_, y_density_, u_src,
+           v_src, u_dst, v_dst, mb_rows_, mb_cols_, stride_u_, stride_v_);
 #endif
-
-  // Setting time parameters to the output frame.
-  denoised_frame->set_timestamp(frame.timestamp());
-  denoised_frame->set_render_time_ms(frame.render_time_ms());
-  return;
 }
 
 }  // namespace webrtc
diff --git a/webrtc/modules/video_processing/video_denoiser.h b/webrtc/modules/video_processing/video_denoiser.h
index 03b30d9..319845b 100644
--- a/webrtc/modules/video_processing/video_denoiser.h
+++ b/webrtc/modules/video_processing/video_denoiser.h
@@ -22,25 +22,55 @@
 class VideoDenoiser {
  public:
   explicit VideoDenoiser(bool runtime_cpu_detection);
+
   void DenoiseFrame(const VideoFrame& frame,
                     VideoFrame* denoised_frame,
-                    VideoFrame* denoised_frame_track,
-                    int noise_level_prev);
+                    VideoFrame* denoised_frame_prev,
+                    bool noise_estimation_enabled);
 
  private:
+  void DenoiserReset(const VideoFrame& frame,
+                     VideoFrame* denoised_frame,
+                     VideoFrame* denoised_frame_prev);
+
+  // Check the mb position, return 1: noise_level is 0 or close to the center
+  // (beyond 1/8 from top/left/right), 3: close to the border (within 1/16 of
+  // top/left/right), 2: in between. The bottom border is not checked.
+  int PositionCheck(int mb_row, int mb_col, int noise_level);
+
+  // To reduce false detection in moving object detection (MOD).
+  void ReduceFalseDetection(const std::unique_ptr<uint8_t[]>& d_status,
+                            std::unique_ptr<uint8_t[]>* d_status_red,
+                            int noise_level);
+
+  // Return whether a block might cause trailing artifact by checking if one of
+  // its neighbor blocks is a moving edge block.
+  bool IsTrailingBlock(const std::unique_ptr<uint8_t[]>& d_status,
+                       int mb_row,
+                       int mb_col);
+
+  // Copy input blocks to dst buffer on moving object blocks (MOB).
+  void CopySrcOnMOB(const uint8_t* y_src, uint8_t* y_dst);
+
   int width_;
   int height_;
+  int mb_rows_;
+  int mb_cols_;
+  int stride_y_;
+  int stride_u_;
+  int stride_v_;
   CpuType cpu_type_;
-  std::unique_ptr<DenoiseMetrics[]> metrics_;
   std::unique_ptr<DenoiserFilter> filter_;
   std::unique_ptr<NoiseEstimation> ne_;
-  std::unique_ptr<uint8_t[]> d_status_;
-#if EXPERIMENTAL
-  std::unique_ptr<uint8_t[]> d_status_tmp1_;
-  std::unique_ptr<uint8_t[]> d_status_tmp2_;
-#endif
+  // 1 for moving edge block, 0 for static block.
+  std::unique_ptr<uint8_t[]> moving_edge_;
+  // 1 for moving object block, 0 for static block.
+  std::unique_ptr<uint8_t[]> moving_object_;
+  // x_density_ and y_density_ are used in MOD process.
   std::unique_ptr<uint8_t[]> x_density_;
   std::unique_ptr<uint8_t[]> y_density_;
+  // Save the return values by MbDenoise for each block.
+  std::unique_ptr<DenoiserDecision[]> mb_filter_decision_;
 };
 
 }  // namespace webrtc