blob: 3b0eb7c7ca72b5c6e805e0fed44071817faa5d47 [file] [log] [blame]
niklase@google.com470e71d2011-07-07 08:21:25 +00001/*
bjornv@webrtc.orgf4b77fd2012-01-25 12:40:00 +00002 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
niklase@google.com470e71d2011-07-07 08:21:25 +00003 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020011#include "modules/audio_processing/voice_detection_impl.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000012
Yves Gerey988cc082018-10-23 12:03:01 +020013#include "api/audio/audio_frame.h"
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020014#include "common_audio/vad/include/webrtc_vad.h"
15#include "modules/audio_processing/audio_buffer.h"
Yves Gerey988cc082018-10-23 12:03:01 +020016#include "rtc_base/checks.h"
Steve Anton10542f22019-01-11 09:11:00 -080017#include "rtc_base/constructor_magic.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000018
19namespace webrtc {
solenberga29386c2015-12-16 03:31:12 -080020class VoiceDetectionImpl::Vad {
21 public:
22 Vad() {
23 state_ = WebRtcVad_Create();
24 RTC_CHECK(state_);
25 int error = WebRtcVad_Init(state_);
26 RTC_DCHECK_EQ(0, error);
niklase@google.com470e71d2011-07-07 08:21:25 +000027 }
Yves Gerey665174f2018-06-19 15:03:05 +020028 ~Vad() { WebRtcVad_Free(state_); }
solenberga29386c2015-12-16 03:31:12 -080029 VadInst* state() { return state_; }
Yves Gerey665174f2018-06-19 15:03:05 +020030
solenberga29386c2015-12-16 03:31:12 -080031 private:
32 VadInst* state_ = nullptr;
33 RTC_DISALLOW_COPY_AND_ASSIGN(Vad);
34};
niklase@google.com470e71d2011-07-07 08:21:25 +000035
solenberga29386c2015-12-16 03:31:12 -080036VoiceDetectionImpl::VoiceDetectionImpl(rtc::CriticalSection* crit)
37 : crit_(crit) {
peahdf3efa82015-11-28 12:35:15 -080038 RTC_DCHECK(crit);
39}
niklase@google.com470e71d2011-07-07 08:21:25 +000040
41VoiceDetectionImpl::~VoiceDetectionImpl() {}
42
solenberga29386c2015-12-16 03:31:12 -080043void VoiceDetectionImpl::Initialize(int sample_rate_hz) {
peahdf3efa82015-11-28 12:35:15 -080044 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080045 sample_rate_hz_ = sample_rate_hz;
kwiberg88788ad2016-02-19 07:04:49 -080046 std::unique_ptr<Vad> new_vad;
solenberga29386c2015-12-16 03:31:12 -080047 if (enabled_) {
48 new_vad.reset(new Vad());
niklase@google.com470e71d2011-07-07 08:21:25 +000049 }
solenberga29386c2015-12-16 03:31:12 -080050 vad_.swap(new_vad);
51 using_external_vad_ = false;
52 frame_size_samples_ =
53 static_cast<size_t>(frame_size_ms_ * sample_rate_hz_) / 1000;
54 set_likelihood(likelihood_);
55}
niklase@google.com470e71d2011-07-07 08:21:25 +000056
Per Åhgrena1351272019-08-15 12:15:46 +020057bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
solenberga29386c2015-12-16 03:31:12 -080058 rtc::CritScope cs(crit_);
Per Åhgrena1351272019-08-15 12:15:46 +020059 RTC_DCHECK(enabled_);
niklase@google.com470e71d2011-07-07 08:21:25 +000060
Per Åhgren928146f2019-08-20 09:19:21 +020061 RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
62 audio->num_frames_per_band());
63 std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
64 rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
65 audio->num_frames_per_band());
Per Åhgrena1351272019-08-15 12:15:46 +020066 if (audio->num_proc_channels() == 1) {
Per Åhgren928146f2019-08-20 09:19:21 +020067 FloatS16ToS16(audio->split_bands_const_f(0)[kBand0To8kHz],
68 audio->num_frames_per_band(), mixed_low_pass_data.data());
Per Åhgrena1351272019-08-15 12:15:46 +020069 } else {
70 const int num_channels = static_cast<int>(audio->num_channels());
71 for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
Per Åhgren928146f2019-08-20 09:19:21 +020072 int32_t value =
73 FloatS16ToS16(audio->split_channels_const_f(kBand0To8kHz)[0][i]);
Per Åhgrena1351272019-08-15 12:15:46 +020074 for (int j = 1; j < num_channels; ++j) {
Per Åhgren928146f2019-08-20 09:19:21 +020075 value +=
76 FloatS16ToS16(audio->split_channels_const_f(kBand0To8kHz)[j][i]);
Per Åhgrena1351272019-08-15 12:15:46 +020077 }
78 mixed_low_pass_data[i] = value / num_channels;
79 }
Per Åhgrena1351272019-08-15 12:15:46 +020080 }
81
82 int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
83 mixed_low_pass.data(), frame_size_samples_);
andrew@webrtc.orged083d42011-09-19 15:28:51 +000084 if (vad_ret == 0) {
niklase@google.com470e71d2011-07-07 08:21:25 +000085 stream_has_voice_ = false;
Per Åhgrena1351272019-08-15 12:15:46 +020086 return false;
andrew@webrtc.orged083d42011-09-19 15:28:51 +000087 } else if (vad_ret == 1) {
niklase@google.com470e71d2011-07-07 08:21:25 +000088 stream_has_voice_ = true;
niklase@google.com470e71d2011-07-07 08:21:25 +000089 } else {
solenberga29386c2015-12-16 03:31:12 -080090 RTC_NOTREACHED();
niklase@google.com470e71d2011-07-07 08:21:25 +000091 }
Per Åhgrena1351272019-08-15 12:15:46 +020092
93 return stream_has_voice_;
niklase@google.com470e71d2011-07-07 08:21:25 +000094}
95
96int VoiceDetectionImpl::Enable(bool enable) {
peahdf3efa82015-11-28 12:35:15 -080097 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080098 if (enabled_ != enable) {
99 enabled_ = enable;
100 Initialize(sample_rate_hz_);
101 }
102 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000103}
104
105bool VoiceDetectionImpl::is_enabled() const {
peahdf3efa82015-11-28 12:35:15 -0800106 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -0800107 return enabled_;
niklase@google.com470e71d2011-07-07 08:21:25 +0000108}
109
110int VoiceDetectionImpl::set_stream_has_voice(bool has_voice) {
peahdf3efa82015-11-28 12:35:15 -0800111 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000112 using_external_vad_ = true;
113 stream_has_voice_ = has_voice;
solenberga29386c2015-12-16 03:31:12 -0800114 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000115}
116
117bool VoiceDetectionImpl::stream_has_voice() const {
peahdf3efa82015-11-28 12:35:15 -0800118 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000119 // TODO(ajm): enable this assertion?
Yves Gerey665174f2018-06-19 15:03:05 +0200120 // RTC_DCHECK(using_external_vad_ || is_component_enabled());
niklase@google.com470e71d2011-07-07 08:21:25 +0000121 return stream_has_voice_;
122}
123
124int VoiceDetectionImpl::set_likelihood(VoiceDetection::Likelihood likelihood) {
peahdf3efa82015-11-28 12:35:15 -0800125 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000126 likelihood_ = likelihood;
solenberga29386c2015-12-16 03:31:12 -0800127 if (enabled_) {
128 int mode = 2;
129 switch (likelihood) {
130 case VoiceDetection::kVeryLowLikelihood:
131 mode = 3;
132 break;
133 case VoiceDetection::kLowLikelihood:
134 mode = 2;
135 break;
136 case VoiceDetection::kModerateLikelihood:
137 mode = 1;
138 break;
139 case VoiceDetection::kHighLikelihood:
140 mode = 0;
141 break;
142 default:
143 RTC_NOTREACHED();
144 break;
145 }
146 int error = WebRtcVad_set_mode(vad_->state(), mode);
147 RTC_DCHECK_EQ(0, error);
148 }
149 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000150}
151
152VoiceDetection::Likelihood VoiceDetectionImpl::likelihood() const {
peahdf3efa82015-11-28 12:35:15 -0800153 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000154 return likelihood_;
155}
156
157int VoiceDetectionImpl::set_frame_size_ms(int size) {
peahdf3efa82015-11-28 12:35:15 -0800158 rtc::CritScope cs(crit_);
Yves Gerey665174f2018-06-19 15:03:05 +0200159 RTC_DCHECK_EQ(10, size); // TODO(ajm): remove when supported.
niklase@google.com470e71d2011-07-07 08:21:25 +0000160 frame_size_ms_ = size;
solenberga29386c2015-12-16 03:31:12 -0800161 Initialize(sample_rate_hz_);
162 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000163}
164
165int VoiceDetectionImpl::frame_size_ms() const {
peahdf3efa82015-11-28 12:35:15 -0800166 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000167 return frame_size_ms_;
168}
niklase@google.com470e71d2011-07-07 08:21:25 +0000169} // namespace webrtc