blob: 9747ca237020a68392e8dc68e53ea944285e0ec0 [file] [log] [blame]
/*
 *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include "modules/audio_processing/agc2/vad_with_level.h"
12
13#include <algorithm>
Yves Gerey988cc082018-10-23 12:03:01 +020014#include <array>
15#include <cmath>
Alex Loikodb6af362018-06-20 14:14:18 +020016
Yves Gerey988cc082018-10-23 12:03:01 +020017#include "api/array_view.h"
Alex Loikodb6af362018-06-20 14:14:18 +020018#include "common_audio/include/audio_util.h"
Alessio Bazzica530781d2020-09-25 13:24:36 +020019#include "common_audio/resampler/include/push_resampler.h"
Alessio Bazzicac1ece012020-09-25 14:31:17 +020020#include "modules/audio_processing/agc2/agc2_common.h"
Alex Loikodb6af362018-06-20 14:14:18 +020021#include "modules/audio_processing/agc2/rnn_vad/common.h"
Alessio Bazzica530781d2020-09-25 13:24:36 +020022#include "modules/audio_processing/agc2/rnn_vad/features_extraction.h"
23#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
24#include "rtc_base/checks.h"
Alex Loikodb6af362018-06-20 14:14:18 +020025
26namespace webrtc {
Alex Loikodb6af362018-06-20 14:14:18 +020027namespace {
Alex Loikodb6af362018-06-20 14:14:18 +020028
Alessio Bazzica530781d2020-09-25 13:24:36 +020029using VoiceActivityDetector = VadLevelAnalyzer::VoiceActivityDetector;
30
31// Default VAD that combines a resampler and the RNN VAD.
32// Computes the speech probability on the first channel.
33class Vad : public VoiceActivityDetector {
34 public:
Alessio Bazzica253f8362020-11-27 16:02:38 +010035 explicit Vad(const AvailableCpuFeatures& cpu_features)
36 : features_extractor_(cpu_features), rnn_vad_(cpu_features) {}
Alessio Bazzica530781d2020-09-25 13:24:36 +020037 Vad(const Vad&) = delete;
38 Vad& operator=(const Vad&) = delete;
39 ~Vad() = default;
40
Alessio Bazzica841d74e2021-03-31 15:04:03 +020041 void Reset() override { rnn_vad_.Reset(); }
42
Alessio Bazzica530781d2020-09-25 13:24:36 +020043 float ComputeProbability(AudioFrameView<const float> frame) override {
44 // The source number of channels is 1, because we always use the 1st
45 // channel.
46 resampler_.InitializeIfNeeded(
47 /*sample_rate_hz=*/static_cast<int>(frame.samples_per_channel() * 100),
48 rnn_vad::kSampleRate24kHz,
49 /*num_channels=*/1);
50
51 std::array<float, rnn_vad::kFrameSize10ms24kHz> work_frame;
52 // Feed the 1st channel to the resampler.
53 resampler_.Resample(frame.channel(0).data(), frame.samples_per_channel(),
54 work_frame.data(), rnn_vad::kFrameSize10ms24kHz);
55
56 std::array<float, rnn_vad::kFeatureVectorSize> feature_vector;
57 const bool is_silence = features_extractor_.CheckSilenceComputeFeatures(
58 work_frame, feature_vector);
59 return rnn_vad_.ComputeVadProbability(feature_vector, is_silence);
Alex Loikodb6af362018-06-20 14:14:18 +020060 }
Alessio Bazzica530781d2020-09-25 13:24:36 +020061
62 private:
63 PushResampler<float> resampler_;
64 rnn_vad::FeaturesExtractor features_extractor_;
Alessio Bazzica812dc072020-12-03 16:54:38 +010065 rnn_vad::RnnVad rnn_vad_;
Alessio Bazzica530781d2020-09-25 13:24:36 +020066};
67
Alex Loikodb6af362018-06-20 14:14:18 +020068} // namespace
69
Alessio Bazzica841d74e2021-03-31 15:04:03 +020070VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms,
Alessio Bazzica253f8362020-11-27 16:02:38 +010071 const AvailableCpuFeatures& cpu_features)
Alessio Bazzica841d74e2021-03-31 15:04:03 +020072 : VadLevelAnalyzer(vad_reset_period_ms,
Alessio Bazzica253f8362020-11-27 16:02:38 +010073 std::make_unique<Vad>(cpu_features)) {}
Alessio Bazzicac1ece012020-09-25 14:31:17 +020074
Alessio Bazzica841d74e2021-03-31 15:04:03 +020075VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms,
Alessio Bazzicac1ece012020-09-25 14:31:17 +020076 std::unique_ptr<VoiceActivityDetector> vad)
Alessio Bazzica841d74e2021-03-31 15:04:03 +020077 : vad_(std::move(vad)),
78 vad_reset_period_frames_(
79 rtc::CheckedDivExact(vad_reset_period_ms, kFrameDurationMs)),
Alessio Bazzica980c4602021-04-14 19:09:17 +020080 time_to_vad_reset_(vad_reset_period_frames_) {
Alessio Bazzica530781d2020-09-25 13:24:36 +020081 RTC_DCHECK(vad_);
Alessio Bazzica841d74e2021-03-31 15:04:03 +020082 RTC_DCHECK_GT(vad_reset_period_frames_, 1);
Alex Loikodb6af362018-06-20 14:14:18 +020083}
84
Alessio Bazzica530781d2020-09-25 13:24:36 +020085VadLevelAnalyzer::~VadLevelAnalyzer() = default;
86
87VadLevelAnalyzer::Result VadLevelAnalyzer::AnalyzeFrame(
88 AudioFrameView<const float> frame) {
Alessio Bazzica841d74e2021-03-31 15:04:03 +020089 // Periodically reset the VAD.
90 time_to_vad_reset_--;
91 if (time_to_vad_reset_ <= 0) {
92 vad_->Reset();
93 time_to_vad_reset_ = vad_reset_period_frames_;
94 }
Alessio Bazzicac1ece012020-09-25 14:31:17 +020095 // Compute levels.
Alessio Bazzica841d74e2021-03-31 15:04:03 +020096 float peak = 0.0f;
97 float rms = 0.0f;
Alessio Bazzica530781d2020-09-25 13:24:36 +020098 for (const auto& x : frame.channel(0)) {
99 peak = std::max(std::fabs(x), peak);
100 rms += x * x;
101 }
Alessio Bazzica980c4602021-04-14 19:09:17 +0200102 return {vad_->ComputeProbability(frame),
Alessio Bazzica530781d2020-09-25 13:24:36 +0200103 FloatS16ToDbfs(std::sqrt(rms / frame.samples_per_channel())),
104 FloatS16ToDbfs(peak)};
Alex Loikodb6af362018-06-20 14:14:18 +0200105}
106
107} // namespace webrtc