blob: 9280be1ef9d0c24f5eea8f18500ac880e2eaa12f [file] [log] [blame]
/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020011#include "modules/audio_processing/voice_detection_impl.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000012
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020013#include "common_audio/vad/include/webrtc_vad.h"
14#include "modules/audio_processing/audio_buffer.h"
15#include "rtc_base/constructormagic.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000016
17namespace webrtc {
solenberga29386c2015-12-16 03:31:12 -080018class VoiceDetectionImpl::Vad {
19 public:
20 Vad() {
21 state_ = WebRtcVad_Create();
22 RTC_CHECK(state_);
23 int error = WebRtcVad_Init(state_);
24 RTC_DCHECK_EQ(0, error);
niklase@google.com470e71d2011-07-07 08:21:25 +000025 }
Yves Gerey665174f2018-06-19 15:03:05 +020026 ~Vad() { WebRtcVad_Free(state_); }
solenberga29386c2015-12-16 03:31:12 -080027 VadInst* state() { return state_; }
Yves Gerey665174f2018-06-19 15:03:05 +020028
solenberga29386c2015-12-16 03:31:12 -080029 private:
30 VadInst* state_ = nullptr;
31 RTC_DISALLOW_COPY_AND_ASSIGN(Vad);
32};
niklase@google.com470e71d2011-07-07 08:21:25 +000033
solenberga29386c2015-12-16 03:31:12 -080034VoiceDetectionImpl::VoiceDetectionImpl(rtc::CriticalSection* crit)
35 : crit_(crit) {
peahdf3efa82015-11-28 12:35:15 -080036 RTC_DCHECK(crit);
37}
niklase@google.com470e71d2011-07-07 08:21:25 +000038
39VoiceDetectionImpl::~VoiceDetectionImpl() {}
40
solenberga29386c2015-12-16 03:31:12 -080041void VoiceDetectionImpl::Initialize(int sample_rate_hz) {
peahdf3efa82015-11-28 12:35:15 -080042 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080043 sample_rate_hz_ = sample_rate_hz;
kwiberg88788ad2016-02-19 07:04:49 -080044 std::unique_ptr<Vad> new_vad;
solenberga29386c2015-12-16 03:31:12 -080045 if (enabled_) {
46 new_vad.reset(new Vad());
niklase@google.com470e71d2011-07-07 08:21:25 +000047 }
solenberga29386c2015-12-16 03:31:12 -080048 vad_.swap(new_vad);
49 using_external_vad_ = false;
50 frame_size_samples_ =
51 static_cast<size_t>(frame_size_ms_ * sample_rate_hz_) / 1000;
52 set_likelihood(likelihood_);
53}
niklase@google.com470e71d2011-07-07 08:21:25 +000054
solenberga29386c2015-12-16 03:31:12 -080055void VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
56 rtc::CritScope cs(crit_);
57 if (!enabled_) {
58 return;
59 }
niklase@google.com470e71d2011-07-07 08:21:25 +000060 if (using_external_vad_) {
61 using_external_vad_ = false;
solenberga29386c2015-12-16 03:31:12 -080062 return;
niklase@google.com470e71d2011-07-07 08:21:25 +000063 }
niklase@google.com470e71d2011-07-07 08:21:25 +000064
kwibergaf476c72016-11-28 15:21:39 -080065 RTC_DCHECK_GE(160, audio->num_frames_per_band());
niklase@google.com470e71d2011-07-07 08:21:25 +000066 // TODO(ajm): concatenate data in frame buffer here.
Yves Gerey665174f2018-06-19 15:03:05 +020067 int vad_ret =
68 WebRtcVad_Process(vad_->state(), sample_rate_hz_,
69 audio->mixed_low_pass_data(), frame_size_samples_);
andrew@webrtc.orged083d42011-09-19 15:28:51 +000070 if (vad_ret == 0) {
niklase@google.com470e71d2011-07-07 08:21:25 +000071 stream_has_voice_ = false;
andrew@webrtc.orged083d42011-09-19 15:28:51 +000072 audio->set_activity(AudioFrame::kVadPassive);
73 } else if (vad_ret == 1) {
niklase@google.com470e71d2011-07-07 08:21:25 +000074 stream_has_voice_ = true;
andrew@webrtc.orged083d42011-09-19 15:28:51 +000075 audio->set_activity(AudioFrame::kVadActive);
niklase@google.com470e71d2011-07-07 08:21:25 +000076 } else {
solenberga29386c2015-12-16 03:31:12 -080077 RTC_NOTREACHED();
niklase@google.com470e71d2011-07-07 08:21:25 +000078 }
niklase@google.com470e71d2011-07-07 08:21:25 +000079}
80
81int VoiceDetectionImpl::Enable(bool enable) {
peahdf3efa82015-11-28 12:35:15 -080082 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080083 if (enabled_ != enable) {
84 enabled_ = enable;
85 Initialize(sample_rate_hz_);
86 }
87 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +000088}
89
90bool VoiceDetectionImpl::is_enabled() const {
peahdf3efa82015-11-28 12:35:15 -080091 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080092 return enabled_;
niklase@google.com470e71d2011-07-07 08:21:25 +000093}
94
95int VoiceDetectionImpl::set_stream_has_voice(bool has_voice) {
peahdf3efa82015-11-28 12:35:15 -080096 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +000097 using_external_vad_ = true;
98 stream_has_voice_ = has_voice;
solenberga29386c2015-12-16 03:31:12 -080099 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000100}
101
102bool VoiceDetectionImpl::stream_has_voice() const {
peahdf3efa82015-11-28 12:35:15 -0800103 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000104 // TODO(ajm): enable this assertion?
Yves Gerey665174f2018-06-19 15:03:05 +0200105 // RTC_DCHECK(using_external_vad_ || is_component_enabled());
niklase@google.com470e71d2011-07-07 08:21:25 +0000106 return stream_has_voice_;
107}
108
109int VoiceDetectionImpl::set_likelihood(VoiceDetection::Likelihood likelihood) {
peahdf3efa82015-11-28 12:35:15 -0800110 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000111 likelihood_ = likelihood;
solenberga29386c2015-12-16 03:31:12 -0800112 if (enabled_) {
113 int mode = 2;
114 switch (likelihood) {
115 case VoiceDetection::kVeryLowLikelihood:
116 mode = 3;
117 break;
118 case VoiceDetection::kLowLikelihood:
119 mode = 2;
120 break;
121 case VoiceDetection::kModerateLikelihood:
122 mode = 1;
123 break;
124 case VoiceDetection::kHighLikelihood:
125 mode = 0;
126 break;
127 default:
128 RTC_NOTREACHED();
129 break;
130 }
131 int error = WebRtcVad_set_mode(vad_->state(), mode);
132 RTC_DCHECK_EQ(0, error);
133 }
134 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000135}
136
137VoiceDetection::Likelihood VoiceDetectionImpl::likelihood() const {
peahdf3efa82015-11-28 12:35:15 -0800138 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000139 return likelihood_;
140}
141
142int VoiceDetectionImpl::set_frame_size_ms(int size) {
peahdf3efa82015-11-28 12:35:15 -0800143 rtc::CritScope cs(crit_);
Yves Gerey665174f2018-06-19 15:03:05 +0200144 RTC_DCHECK_EQ(10, size); // TODO(ajm): remove when supported.
niklase@google.com470e71d2011-07-07 08:21:25 +0000145 frame_size_ms_ = size;
solenberga29386c2015-12-16 03:31:12 -0800146 Initialize(sample_rate_hz_);
147 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000148}
149
150int VoiceDetectionImpl::frame_size_ms() const {
peahdf3efa82015-11-28 12:35:15 -0800151 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000152 return frame_size_ms_;
153}
niklase@google.com470e71d2011-07-07 08:21:25 +0000154} // namespace webrtc