Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
| 11 | #include "modules/audio_processing/agc2/vad_with_level.h" |
| 12 | |
| 13 | #include <algorithm> |
Yves Gerey | 988cc08 | 2018-10-23 12:03:01 +0200 | [diff] [blame] | 14 | #include <array> |
| 15 | #include <cmath> |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 16 | |
Yves Gerey | 988cc08 | 2018-10-23 12:03:01 +0200 | [diff] [blame] | 17 | #include "api/array_view.h" |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 18 | #include "common_audio/include/audio_util.h" |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 19 | #include "common_audio/resampler/include/push_resampler.h" |
Alessio Bazzica | c1ece01 | 2020-09-25 14:31:17 +0200 | [diff] [blame] | 20 | #include "modules/audio_processing/agc2/agc2_common.h" |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 21 | #include "modules/audio_processing/agc2/rnn_vad/common.h" |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 22 | #include "modules/audio_processing/agc2/rnn_vad/features_extraction.h" |
| 23 | #include "modules/audio_processing/agc2/rnn_vad/rnn.h" |
| 24 | #include "rtc_base/checks.h" |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 25 | |
| 26 | namespace webrtc { |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 27 | namespace { |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 28 | |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 29 | using VoiceActivityDetector = VadLevelAnalyzer::VoiceActivityDetector; |
| 30 | |
| 31 | // Default VAD that combines a resampler and the RNN VAD. |
| 32 | // Computes the speech probability on the first channel. |
| 33 | class Vad : public VoiceActivityDetector { |
| 34 | public: |
Alessio Bazzica | 253f836 | 2020-11-27 16:02:38 +0100 | [diff] [blame] | 35 | explicit Vad(const AvailableCpuFeatures& cpu_features) |
| 36 | : features_extractor_(cpu_features), rnn_vad_(cpu_features) {} |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 37 | Vad(const Vad&) = delete; |
| 38 | Vad& operator=(const Vad&) = delete; |
| 39 | ~Vad() = default; |
| 40 | |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 41 | void Reset() override { rnn_vad_.Reset(); } |
| 42 | |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 43 | float ComputeProbability(AudioFrameView<const float> frame) override { |
| 44 | // The source number of channels is 1, because we always use the 1st |
| 45 | // channel. |
| 46 | resampler_.InitializeIfNeeded( |
| 47 | /*sample_rate_hz=*/static_cast<int>(frame.samples_per_channel() * 100), |
| 48 | rnn_vad::kSampleRate24kHz, |
| 49 | /*num_channels=*/1); |
| 50 | |
| 51 | std::array<float, rnn_vad::kFrameSize10ms24kHz> work_frame; |
| 52 | // Feed the 1st channel to the resampler. |
| 53 | resampler_.Resample(frame.channel(0).data(), frame.samples_per_channel(), |
| 54 | work_frame.data(), rnn_vad::kFrameSize10ms24kHz); |
| 55 | |
| 56 | std::array<float, rnn_vad::kFeatureVectorSize> feature_vector; |
| 57 | const bool is_silence = features_extractor_.CheckSilenceComputeFeatures( |
| 58 | work_frame, feature_vector); |
| 59 | return rnn_vad_.ComputeVadProbability(feature_vector, is_silence); |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 60 | } |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 61 | |
| 62 | private: |
| 63 | PushResampler<float> resampler_; |
| 64 | rnn_vad::FeaturesExtractor features_extractor_; |
Alessio Bazzica | 812dc07 | 2020-12-03 16:54:38 +0100 | [diff] [blame] | 65 | rnn_vad::RnnVad rnn_vad_; |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 66 | }; |
| 67 | |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 68 | } // namespace |
| 69 | |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 70 | VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms, |
Alessio Bazzica | 253f836 | 2020-11-27 16:02:38 +0100 | [diff] [blame] | 71 | const AvailableCpuFeatures& cpu_features) |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 72 | : VadLevelAnalyzer(vad_reset_period_ms, |
Alessio Bazzica | 253f836 | 2020-11-27 16:02:38 +0100 | [diff] [blame] | 73 | std::make_unique<Vad>(cpu_features)) {} |
Alessio Bazzica | c1ece01 | 2020-09-25 14:31:17 +0200 | [diff] [blame] | 74 | |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 75 | VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms, |
Alessio Bazzica | c1ece01 | 2020-09-25 14:31:17 +0200 | [diff] [blame] | 76 | std::unique_ptr<VoiceActivityDetector> vad) |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 77 | : vad_(std::move(vad)), |
| 78 | vad_reset_period_frames_( |
| 79 | rtc::CheckedDivExact(vad_reset_period_ms, kFrameDurationMs)), |
Alessio Bazzica | 980c460 | 2021-04-14 19:09:17 +0200 | [diff] [blame] | 80 | time_to_vad_reset_(vad_reset_period_frames_) { |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 81 | RTC_DCHECK(vad_); |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 82 | RTC_DCHECK_GT(vad_reset_period_frames_, 1); |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 83 | } |
| 84 | |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 85 | VadLevelAnalyzer::~VadLevelAnalyzer() = default; |
| 86 | |
| 87 | VadLevelAnalyzer::Result VadLevelAnalyzer::AnalyzeFrame( |
| 88 | AudioFrameView<const float> frame) { |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 89 | // Periodically reset the VAD. |
| 90 | time_to_vad_reset_--; |
| 91 | if (time_to_vad_reset_ <= 0) { |
| 92 | vad_->Reset(); |
| 93 | time_to_vad_reset_ = vad_reset_period_frames_; |
| 94 | } |
Alessio Bazzica | c1ece01 | 2020-09-25 14:31:17 +0200 | [diff] [blame] | 95 | // Compute levels. |
Alessio Bazzica | 841d74e | 2021-03-31 15:04:03 +0200 | [diff] [blame] | 96 | float peak = 0.0f; |
| 97 | float rms = 0.0f; |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 98 | for (const auto& x : frame.channel(0)) { |
| 99 | peak = std::max(std::fabs(x), peak); |
| 100 | rms += x * x; |
| 101 | } |
Alessio Bazzica | 980c460 | 2021-04-14 19:09:17 +0200 | [diff] [blame] | 102 | return {vad_->ComputeProbability(frame), |
Alessio Bazzica | 530781d | 2020-09-25 13:24:36 +0200 | [diff] [blame] | 103 | FloatS16ToDbfs(std::sqrt(rms / frame.samples_per_channel())), |
| 104 | FloatS16ToDbfs(peak)}; |
Alex Loiko | db6af36 | 2018-06-20 14:14:18 +0200 | [diff] [blame] | 105 | } |
| 106 | |
| 107 | } // namespace webrtc |