blob: 80b633cbc2b2ae7e2753753fa7fa9302c6a5c48f [file] [log] [blame]
/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020011#include "modules/audio_processing/voice_detection_impl.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000012
Yves Gerey988cc082018-10-23 12:03:01 +020013#include "api/audio/audio_frame.h"
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020014#include "common_audio/vad/include/webrtc_vad.h"
15#include "modules/audio_processing/audio_buffer.h"
Yves Gerey988cc082018-10-23 12:03:01 +020016#include "rtc_base/checks.h"
Steve Anton10542f22019-01-11 09:11:00 -080017#include "rtc_base/constructor_magic.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000018
19namespace webrtc {
solenberga29386c2015-12-16 03:31:12 -080020class VoiceDetectionImpl::Vad {
21 public:
22 Vad() {
23 state_ = WebRtcVad_Create();
24 RTC_CHECK(state_);
25 int error = WebRtcVad_Init(state_);
26 RTC_DCHECK_EQ(0, error);
niklase@google.com470e71d2011-07-07 08:21:25 +000027 }
Yves Gerey665174f2018-06-19 15:03:05 +020028 ~Vad() { WebRtcVad_Free(state_); }
solenberga29386c2015-12-16 03:31:12 -080029 VadInst* state() { return state_; }
Yves Gerey665174f2018-06-19 15:03:05 +020030
solenberga29386c2015-12-16 03:31:12 -080031 private:
32 VadInst* state_ = nullptr;
33 RTC_DISALLOW_COPY_AND_ASSIGN(Vad);
34};
niklase@google.com470e71d2011-07-07 08:21:25 +000035
solenberga29386c2015-12-16 03:31:12 -080036VoiceDetectionImpl::VoiceDetectionImpl(rtc::CriticalSection* crit)
37 : crit_(crit) {
peahdf3efa82015-11-28 12:35:15 -080038 RTC_DCHECK(crit);
39}
niklase@google.com470e71d2011-07-07 08:21:25 +000040
41VoiceDetectionImpl::~VoiceDetectionImpl() {}
42
solenberga29386c2015-12-16 03:31:12 -080043void VoiceDetectionImpl::Initialize(int sample_rate_hz) {
peahdf3efa82015-11-28 12:35:15 -080044 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080045 sample_rate_hz_ = sample_rate_hz;
kwiberg88788ad2016-02-19 07:04:49 -080046 std::unique_ptr<Vad> new_vad;
solenberga29386c2015-12-16 03:31:12 -080047 if (enabled_) {
48 new_vad.reset(new Vad());
niklase@google.com470e71d2011-07-07 08:21:25 +000049 }
solenberga29386c2015-12-16 03:31:12 -080050 vad_.swap(new_vad);
51 using_external_vad_ = false;
52 frame_size_samples_ =
53 static_cast<size_t>(frame_size_ms_ * sample_rate_hz_) / 1000;
54 set_likelihood(likelihood_);
55}
niklase@google.com470e71d2011-07-07 08:21:25 +000056
Per Åhgrena1351272019-08-15 12:15:46 +020057bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
solenberga29386c2015-12-16 03:31:12 -080058 rtc::CritScope cs(crit_);
Per Åhgrena1351272019-08-15 12:15:46 +020059 RTC_DCHECK(enabled_);
niklase@google.com470e71d2011-07-07 08:21:25 +000060
Per Åhgren928146f2019-08-20 09:19:21 +020061 RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
62 audio->num_frames_per_band());
63 std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
64 rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
65 audio->num_frames_per_band());
Per Åhgrend47941e2019-08-22 11:51:13 +020066 if (audio->num_channels() == 1) {
67 FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz],
Per Åhgren928146f2019-08-20 09:19:21 +020068 audio->num_frames_per_band(), mixed_low_pass_data.data());
Per Åhgrena1351272019-08-15 12:15:46 +020069 } else {
70 const int num_channels = static_cast<int>(audio->num_channels());
71 for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
Per Åhgren928146f2019-08-20 09:19:21 +020072 int32_t value =
Per Åhgrend47941e2019-08-22 11:51:13 +020073 FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]);
Per Åhgrena1351272019-08-15 12:15:46 +020074 for (int j = 1; j < num_channels; ++j) {
Per Åhgrend47941e2019-08-22 11:51:13 +020075 value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]);
Per Åhgrena1351272019-08-15 12:15:46 +020076 }
77 mixed_low_pass_data[i] = value / num_channels;
78 }
Per Åhgrena1351272019-08-15 12:15:46 +020079 }
80
81 int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
82 mixed_low_pass.data(), frame_size_samples_);
andrew@webrtc.orged083d42011-09-19 15:28:51 +000083 if (vad_ret == 0) {
niklase@google.com470e71d2011-07-07 08:21:25 +000084 stream_has_voice_ = false;
Per Åhgrena1351272019-08-15 12:15:46 +020085 return false;
andrew@webrtc.orged083d42011-09-19 15:28:51 +000086 } else if (vad_ret == 1) {
niklase@google.com470e71d2011-07-07 08:21:25 +000087 stream_has_voice_ = true;
niklase@google.com470e71d2011-07-07 08:21:25 +000088 } else {
solenberga29386c2015-12-16 03:31:12 -080089 RTC_NOTREACHED();
niklase@google.com470e71d2011-07-07 08:21:25 +000090 }
Per Åhgrena1351272019-08-15 12:15:46 +020091
92 return stream_has_voice_;
niklase@google.com470e71d2011-07-07 08:21:25 +000093}
94
95int VoiceDetectionImpl::Enable(bool enable) {
peahdf3efa82015-11-28 12:35:15 -080096 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080097 if (enabled_ != enable) {
98 enabled_ = enable;
99 Initialize(sample_rate_hz_);
100 }
101 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000102}
103
104bool VoiceDetectionImpl::is_enabled() const {
peahdf3efa82015-11-28 12:35:15 -0800105 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -0800106 return enabled_;
niklase@google.com470e71d2011-07-07 08:21:25 +0000107}
108
109int VoiceDetectionImpl::set_stream_has_voice(bool has_voice) {
peahdf3efa82015-11-28 12:35:15 -0800110 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000111 using_external_vad_ = true;
112 stream_has_voice_ = has_voice;
solenberga29386c2015-12-16 03:31:12 -0800113 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000114}
115
116bool VoiceDetectionImpl::stream_has_voice() const {
peahdf3efa82015-11-28 12:35:15 -0800117 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000118 // TODO(ajm): enable this assertion?
Yves Gerey665174f2018-06-19 15:03:05 +0200119 // RTC_DCHECK(using_external_vad_ || is_component_enabled());
niklase@google.com470e71d2011-07-07 08:21:25 +0000120 return stream_has_voice_;
121}
122
123int VoiceDetectionImpl::set_likelihood(VoiceDetection::Likelihood likelihood) {
peahdf3efa82015-11-28 12:35:15 -0800124 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000125 likelihood_ = likelihood;
solenberga29386c2015-12-16 03:31:12 -0800126 if (enabled_) {
127 int mode = 2;
128 switch (likelihood) {
129 case VoiceDetection::kVeryLowLikelihood:
130 mode = 3;
131 break;
132 case VoiceDetection::kLowLikelihood:
133 mode = 2;
134 break;
135 case VoiceDetection::kModerateLikelihood:
136 mode = 1;
137 break;
138 case VoiceDetection::kHighLikelihood:
139 mode = 0;
140 break;
141 default:
142 RTC_NOTREACHED();
143 break;
144 }
145 int error = WebRtcVad_set_mode(vad_->state(), mode);
146 RTC_DCHECK_EQ(0, error);
147 }
148 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000149}
150
151VoiceDetection::Likelihood VoiceDetectionImpl::likelihood() const {
peahdf3efa82015-11-28 12:35:15 -0800152 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000153 return likelihood_;
154}
155
156int VoiceDetectionImpl::set_frame_size_ms(int size) {
peahdf3efa82015-11-28 12:35:15 -0800157 rtc::CritScope cs(crit_);
Yves Gerey665174f2018-06-19 15:03:05 +0200158 RTC_DCHECK_EQ(10, size); // TODO(ajm): remove when supported.
niklase@google.com470e71d2011-07-07 08:21:25 +0000159 frame_size_ms_ = size;
solenberga29386c2015-12-16 03:31:12 -0800160 Initialize(sample_rate_hz_);
161 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000162}
163
164int VoiceDetectionImpl::frame_size_ms() const {
peahdf3efa82015-11-28 12:35:15 -0800165 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000166 return frame_size_ms_;
167}
niklase@google.com470e71d2011-07-07 08:21:25 +0000168} // namespace webrtc