blob: 7bf6c4a29cbdf3a41692df9f06c2fe5fb51e31ce [file] [log] [blame]
niklase@google.com470e71d2011-07-07 08:21:25 +00001/*
bjornv@webrtc.orgf4b77fd2012-01-25 12:40:00 +00002 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
niklase@google.com470e71d2011-07-07 08:21:25 +00003 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020011#include "modules/audio_processing/voice_detection_impl.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000012
Yves Gerey988cc082018-10-23 12:03:01 +020013#include "api/audio/audio_frame.h"
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020014#include "common_audio/vad/include/webrtc_vad.h"
15#include "modules/audio_processing/audio_buffer.h"
Yves Gerey988cc082018-10-23 12:03:01 +020016#include "rtc_base/checks.h"
Steve Anton10542f22019-01-11 09:11:00 -080017#include "rtc_base/constructor_magic.h"
niklase@google.com470e71d2011-07-07 08:21:25 +000018
19namespace webrtc {
solenberga29386c2015-12-16 03:31:12 -080020class VoiceDetectionImpl::Vad {
21 public:
22 Vad() {
23 state_ = WebRtcVad_Create();
24 RTC_CHECK(state_);
25 int error = WebRtcVad_Init(state_);
26 RTC_DCHECK_EQ(0, error);
niklase@google.com470e71d2011-07-07 08:21:25 +000027 }
Yves Gerey665174f2018-06-19 15:03:05 +020028 ~Vad() { WebRtcVad_Free(state_); }
solenberga29386c2015-12-16 03:31:12 -080029 VadInst* state() { return state_; }
Yves Gerey665174f2018-06-19 15:03:05 +020030
solenberga29386c2015-12-16 03:31:12 -080031 private:
32 VadInst* state_ = nullptr;
33 RTC_DISALLOW_COPY_AND_ASSIGN(Vad);
34};
niklase@google.com470e71d2011-07-07 08:21:25 +000035
solenberga29386c2015-12-16 03:31:12 -080036VoiceDetectionImpl::VoiceDetectionImpl(rtc::CriticalSection* crit)
37 : crit_(crit) {
peahdf3efa82015-11-28 12:35:15 -080038 RTC_DCHECK(crit);
39}
niklase@google.com470e71d2011-07-07 08:21:25 +000040
41VoiceDetectionImpl::~VoiceDetectionImpl() {}
42
solenberga29386c2015-12-16 03:31:12 -080043void VoiceDetectionImpl::Initialize(int sample_rate_hz) {
peahdf3efa82015-11-28 12:35:15 -080044 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080045 sample_rate_hz_ = sample_rate_hz;
kwiberg88788ad2016-02-19 07:04:49 -080046 std::unique_ptr<Vad> new_vad;
solenberga29386c2015-12-16 03:31:12 -080047 if (enabled_) {
48 new_vad.reset(new Vad());
niklase@google.com470e71d2011-07-07 08:21:25 +000049 }
solenberga29386c2015-12-16 03:31:12 -080050 vad_.swap(new_vad);
51 using_external_vad_ = false;
52 frame_size_samples_ =
53 static_cast<size_t>(frame_size_ms_ * sample_rate_hz_) / 1000;
54 set_likelihood(likelihood_);
55}
niklase@google.com470e71d2011-07-07 08:21:25 +000056
solenberga29386c2015-12-16 03:31:12 -080057void VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
58 rtc::CritScope cs(crit_);
59 if (!enabled_) {
60 return;
61 }
niklase@google.com470e71d2011-07-07 08:21:25 +000062 if (using_external_vad_) {
63 using_external_vad_ = false;
solenberga29386c2015-12-16 03:31:12 -080064 return;
niklase@google.com470e71d2011-07-07 08:21:25 +000065 }
niklase@google.com470e71d2011-07-07 08:21:25 +000066
kwibergaf476c72016-11-28 15:21:39 -080067 RTC_DCHECK_GE(160, audio->num_frames_per_band());
niklase@google.com470e71d2011-07-07 08:21:25 +000068 // TODO(ajm): concatenate data in frame buffer here.
Yves Gerey665174f2018-06-19 15:03:05 +020069 int vad_ret =
70 WebRtcVad_Process(vad_->state(), sample_rate_hz_,
71 audio->mixed_low_pass_data(), frame_size_samples_);
andrew@webrtc.orged083d42011-09-19 15:28:51 +000072 if (vad_ret == 0) {
niklase@google.com470e71d2011-07-07 08:21:25 +000073 stream_has_voice_ = false;
andrew@webrtc.orged083d42011-09-19 15:28:51 +000074 audio->set_activity(AudioFrame::kVadPassive);
75 } else if (vad_ret == 1) {
niklase@google.com470e71d2011-07-07 08:21:25 +000076 stream_has_voice_ = true;
andrew@webrtc.orged083d42011-09-19 15:28:51 +000077 audio->set_activity(AudioFrame::kVadActive);
niklase@google.com470e71d2011-07-07 08:21:25 +000078 } else {
solenberga29386c2015-12-16 03:31:12 -080079 RTC_NOTREACHED();
niklase@google.com470e71d2011-07-07 08:21:25 +000080 }
niklase@google.com470e71d2011-07-07 08:21:25 +000081}
82
83int VoiceDetectionImpl::Enable(bool enable) {
peahdf3efa82015-11-28 12:35:15 -080084 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080085 if (enabled_ != enable) {
86 enabled_ = enable;
87 Initialize(sample_rate_hz_);
88 }
89 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +000090}
91
92bool VoiceDetectionImpl::is_enabled() const {
peahdf3efa82015-11-28 12:35:15 -080093 rtc::CritScope cs(crit_);
solenberga29386c2015-12-16 03:31:12 -080094 return enabled_;
niklase@google.com470e71d2011-07-07 08:21:25 +000095}
96
97int VoiceDetectionImpl::set_stream_has_voice(bool has_voice) {
peahdf3efa82015-11-28 12:35:15 -080098 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +000099 using_external_vad_ = true;
100 stream_has_voice_ = has_voice;
solenberga29386c2015-12-16 03:31:12 -0800101 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000102}
103
104bool VoiceDetectionImpl::stream_has_voice() const {
peahdf3efa82015-11-28 12:35:15 -0800105 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000106 // TODO(ajm): enable this assertion?
Yves Gerey665174f2018-06-19 15:03:05 +0200107 // RTC_DCHECK(using_external_vad_ || is_component_enabled());
niklase@google.com470e71d2011-07-07 08:21:25 +0000108 return stream_has_voice_;
109}
110
111int VoiceDetectionImpl::set_likelihood(VoiceDetection::Likelihood likelihood) {
peahdf3efa82015-11-28 12:35:15 -0800112 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000113 likelihood_ = likelihood;
solenberga29386c2015-12-16 03:31:12 -0800114 if (enabled_) {
115 int mode = 2;
116 switch (likelihood) {
117 case VoiceDetection::kVeryLowLikelihood:
118 mode = 3;
119 break;
120 case VoiceDetection::kLowLikelihood:
121 mode = 2;
122 break;
123 case VoiceDetection::kModerateLikelihood:
124 mode = 1;
125 break;
126 case VoiceDetection::kHighLikelihood:
127 mode = 0;
128 break;
129 default:
130 RTC_NOTREACHED();
131 break;
132 }
133 int error = WebRtcVad_set_mode(vad_->state(), mode);
134 RTC_DCHECK_EQ(0, error);
135 }
136 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000137}
138
139VoiceDetection::Likelihood VoiceDetectionImpl::likelihood() const {
peahdf3efa82015-11-28 12:35:15 -0800140 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000141 return likelihood_;
142}
143
144int VoiceDetectionImpl::set_frame_size_ms(int size) {
peahdf3efa82015-11-28 12:35:15 -0800145 rtc::CritScope cs(crit_);
Yves Gerey665174f2018-06-19 15:03:05 +0200146 RTC_DCHECK_EQ(10, size); // TODO(ajm): remove when supported.
niklase@google.com470e71d2011-07-07 08:21:25 +0000147 frame_size_ms_ = size;
solenberga29386c2015-12-16 03:31:12 -0800148 Initialize(sample_rate_hz_);
149 return AudioProcessing::kNoError;
niklase@google.com470e71d2011-07-07 08:21:25 +0000150}
151
152int VoiceDetectionImpl::frame_size_ms() const {
peahdf3efa82015-11-28 12:35:15 -0800153 rtc::CritScope cs(crit_);
niklase@google.com470e71d2011-07-07 08:21:25 +0000154 return frame_size_ms_;
155}
niklase@google.com470e71d2011-07-07 08:21:25 +0000156} // namespace webrtc