niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 1 | /* |
bjornv@webrtc.org | f4b77fd | 2012-01-25 12:40:00 +0000 | [diff] [blame] | 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
Mirko Bonadei | 92ea95e | 2017-09-15 06:47:31 +0200 | [diff] [blame] | 11 | #include "modules/audio_processing/voice_detection_impl.h" |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 12 | |
Yves Gerey | 988cc08 | 2018-10-23 12:03:01 +0200 | [diff] [blame] | 13 | #include "api/audio/audio_frame.h" |
Mirko Bonadei | 92ea95e | 2017-09-15 06:47:31 +0200 | [diff] [blame] | 14 | #include "common_audio/vad/include/webrtc_vad.h" |
| 15 | #include "modules/audio_processing/audio_buffer.h" |
Yves Gerey | 988cc08 | 2018-10-23 12:03:01 +0200 | [diff] [blame] | 16 | #include "rtc_base/checks.h" |
Steve Anton | 10542f2 | 2019-01-11 09:11:00 -0800 | [diff] [blame] | 17 | #include "rtc_base/constructor_magic.h" |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 18 | |
| 19 | namespace webrtc { |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 20 | class VoiceDetectionImpl::Vad { |
| 21 | public: |
| 22 | Vad() { |
| 23 | state_ = WebRtcVad_Create(); |
| 24 | RTC_CHECK(state_); |
| 25 | int error = WebRtcVad_Init(state_); |
| 26 | RTC_DCHECK_EQ(0, error); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 27 | } |
Yves Gerey | 665174f | 2018-06-19 15:03:05 +0200 | [diff] [blame] | 28 | ~Vad() { WebRtcVad_Free(state_); } |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 29 | VadInst* state() { return state_; } |
Yves Gerey | 665174f | 2018-06-19 15:03:05 +0200 | [diff] [blame] | 30 | |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 31 | private: |
| 32 | VadInst* state_ = nullptr; |
| 33 | RTC_DISALLOW_COPY_AND_ASSIGN(Vad); |
| 34 | }; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 35 | |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 36 | VoiceDetectionImpl::VoiceDetectionImpl(rtc::CriticalSection* crit) |
| 37 | : crit_(crit) { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 38 | RTC_DCHECK(crit); |
| 39 | } |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 40 | |
| 41 | VoiceDetectionImpl::~VoiceDetectionImpl() {} |
| 42 | |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 43 | void VoiceDetectionImpl::Initialize(int sample_rate_hz) { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 44 | rtc::CritScope cs(crit_); |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 45 | sample_rate_hz_ = sample_rate_hz; |
kwiberg | 88788ad | 2016-02-19 07:04:49 -0800 | [diff] [blame] | 46 | std::unique_ptr<Vad> new_vad; |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 47 | if (enabled_) { |
| 48 | new_vad.reset(new Vad()); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 49 | } |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 50 | vad_.swap(new_vad); |
| 51 | using_external_vad_ = false; |
| 52 | frame_size_samples_ = |
| 53 | static_cast<size_t>(frame_size_ms_ * sample_rate_hz_) / 1000; |
| 54 | set_likelihood(likelihood_); |
| 55 | } |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 56 | |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 57 | bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) { |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 58 | rtc::CritScope cs(crit_); |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 59 | RTC_DCHECK(enabled_); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 60 | |
Per Åhgren | 928146f | 2019-08-20 09:19:21 +0200 | [diff] [blame] | 61 | RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength, |
| 62 | audio->num_frames_per_band()); |
| 63 | std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data; |
| 64 | rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(), |
| 65 | audio->num_frames_per_band()); |
Steve Anton | f254e9e | 2019-08-21 17:52:28 +0000 | [diff] [blame^] | 66 | if (audio->num_proc_channels() == 1) { |
| 67 | FloatS16ToS16(audio->split_bands_const_f(0)[kBand0To8kHz], |
Per Åhgren | 928146f | 2019-08-20 09:19:21 +0200 | [diff] [blame] | 68 | audio->num_frames_per_band(), mixed_low_pass_data.data()); |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 69 | } else { |
| 70 | const int num_channels = static_cast<int>(audio->num_channels()); |
| 71 | for (size_t i = 0; i < audio->num_frames_per_band(); ++i) { |
Per Åhgren | 928146f | 2019-08-20 09:19:21 +0200 | [diff] [blame] | 72 | int32_t value = |
Steve Anton | f254e9e | 2019-08-21 17:52:28 +0000 | [diff] [blame^] | 73 | FloatS16ToS16(audio->split_channels_const_f(kBand0To8kHz)[0][i]); |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 74 | for (int j = 1; j < num_channels; ++j) { |
Steve Anton | f254e9e | 2019-08-21 17:52:28 +0000 | [diff] [blame^] | 75 | value += |
| 76 | FloatS16ToS16(audio->split_channels_const_f(kBand0To8kHz)[j][i]); |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 77 | } |
| 78 | mixed_low_pass_data[i] = value / num_channels; |
| 79 | } |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 80 | } |
| 81 | |
| 82 | int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_, |
| 83 | mixed_low_pass.data(), frame_size_samples_); |
andrew@webrtc.org | ed083d4 | 2011-09-19 15:28:51 +0000 | [diff] [blame] | 84 | if (vad_ret == 0) { |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 85 | stream_has_voice_ = false; |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 86 | return false; |
andrew@webrtc.org | ed083d4 | 2011-09-19 15:28:51 +0000 | [diff] [blame] | 87 | } else if (vad_ret == 1) { |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 88 | stream_has_voice_ = true; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 89 | } else { |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 90 | RTC_NOTREACHED(); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 91 | } |
Per Åhgren | a135127 | 2019-08-15 12:15:46 +0200 | [diff] [blame] | 92 | |
| 93 | return stream_has_voice_; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 94 | } |
| 95 | |
| 96 | int VoiceDetectionImpl::Enable(bool enable) { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 97 | rtc::CritScope cs(crit_); |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 98 | if (enabled_ != enable) { |
| 99 | enabled_ = enable; |
| 100 | Initialize(sample_rate_hz_); |
| 101 | } |
| 102 | return AudioProcessing::kNoError; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 103 | } |
| 104 | |
| 105 | bool VoiceDetectionImpl::is_enabled() const { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 106 | rtc::CritScope cs(crit_); |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 107 | return enabled_; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 108 | } |
| 109 | |
| 110 | int VoiceDetectionImpl::set_stream_has_voice(bool has_voice) { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 111 | rtc::CritScope cs(crit_); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 112 | using_external_vad_ = true; |
| 113 | stream_has_voice_ = has_voice; |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 114 | return AudioProcessing::kNoError; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 115 | } |
| 116 | |
| 117 | bool VoiceDetectionImpl::stream_has_voice() const { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 118 | rtc::CritScope cs(crit_); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 119 | // TODO(ajm): enable this assertion? |
Yves Gerey | 665174f | 2018-06-19 15:03:05 +0200 | [diff] [blame] | 120 | // RTC_DCHECK(using_external_vad_ || is_component_enabled()); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 121 | return stream_has_voice_; |
| 122 | } |
| 123 | |
| 124 | int VoiceDetectionImpl::set_likelihood(VoiceDetection::Likelihood likelihood) { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 125 | rtc::CritScope cs(crit_); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 126 | likelihood_ = likelihood; |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 127 | if (enabled_) { |
| 128 | int mode = 2; |
| 129 | switch (likelihood) { |
| 130 | case VoiceDetection::kVeryLowLikelihood: |
| 131 | mode = 3; |
| 132 | break; |
| 133 | case VoiceDetection::kLowLikelihood: |
| 134 | mode = 2; |
| 135 | break; |
| 136 | case VoiceDetection::kModerateLikelihood: |
| 137 | mode = 1; |
| 138 | break; |
| 139 | case VoiceDetection::kHighLikelihood: |
| 140 | mode = 0; |
| 141 | break; |
| 142 | default: |
| 143 | RTC_NOTREACHED(); |
| 144 | break; |
| 145 | } |
| 146 | int error = WebRtcVad_set_mode(vad_->state(), mode); |
| 147 | RTC_DCHECK_EQ(0, error); |
| 148 | } |
| 149 | return AudioProcessing::kNoError; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 150 | } |
| 151 | |
| 152 | VoiceDetection::Likelihood VoiceDetectionImpl::likelihood() const { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 153 | rtc::CritScope cs(crit_); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 154 | return likelihood_; |
| 155 | } |
| 156 | |
| 157 | int VoiceDetectionImpl::set_frame_size_ms(int size) { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 158 | rtc::CritScope cs(crit_); |
Yves Gerey | 665174f | 2018-06-19 15:03:05 +0200 | [diff] [blame] | 159 | RTC_DCHECK_EQ(10, size); // TODO(ajm): remove when supported. |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 160 | frame_size_ms_ = size; |
solenberg | a29386c | 2015-12-16 03:31:12 -0800 | [diff] [blame] | 161 | Initialize(sample_rate_hz_); |
| 162 | return AudioProcessing::kNoError; |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 163 | } |
| 164 | |
| 165 | int VoiceDetectionImpl::frame_size_ms() const { |
peah | df3efa8 | 2015-11-28 12:35:15 -0800 | [diff] [blame] | 166 | rtc::CritScope cs(crit_); |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 167 | return frame_size_ms_; |
| 168 | } |
niklase@google.com | 470e71d | 2011-07-07 08:21:25 +0000 | [diff] [blame] | 169 | } // namespace webrtc |