blob: 025ef2059d912b36b8e87f36b210734ef47a4c93 [file] [log] [blame]
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020011#include "modules/audio_processing/vad/pitch_based_vad.h"
pbos@webrtc.org788acd12014-12-15 09:41:24 +000012
pbos@webrtc.org788acd12014-12-15 09:41:24 +000013#include <math.h>
14#include <string.h>
15
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020016#include "modules/audio_processing/vad/common.h"
17#include "modules/audio_processing/vad/noise_gmm_tables.h"
Yves Gerey665174f2018-06-19 15:03:05 +020018#include "modules/audio_processing/vad/vad_circular_buffer.h"
Mirko Bonadei92ea95e2017-09-15 06:47:31 +020019#include "modules/audio_processing/vad/voice_gmm_tables.h"
pbos@webrtc.org788acd12014-12-15 09:41:24 +000020
21namespace webrtc {
22
kwiberg@webrtc.org2ebfac52015-01-14 10:51:54 +000023static_assert(kNoiseGmmDim == kVoiceGmmDim,
24 "noise and voice gmm dimension not equal");
pbos@webrtc.org788acd12014-12-15 09:41:24 +000025
// These values should match MATLAB counterparts for unit-tests to pass.
// Number of posterior probabilities kept in the history buffer.
static const int kPosteriorHistorySize = 500;  // 5 sec of 10 ms frames.
// Prior voicing probability used before any history has been accumulated.
static const double kInitialPriorProbability = 0.3;
// Width and probability thresholds handed to the history buffer's
// RemoveTransient() when the prior is updated.
static const int kTransientWidthThreshold = 7;
static const double kLowProbabilityThreshold = 0.2;
31
// Clamps `p` to the closed interval [0.01, 0.99] and returns the result.
// Values already inside the interval are returned unchanged. A NaN input
// fails both comparisons and is therefore returned as-is.
static double LimitProbability(double p) {
  const double kLimHigh = 0.99;
  const double kLimLow = 0.01;

  if (p > kLimHigh)
    return kLimHigh;
  if (p < kLimLow)
    return kLimLow;
  return p;
}
42
43PitchBasedVad::PitchBasedVad()
44 : p_prior_(kInitialPriorProbability),
aluebsecf6b812015-06-25 12:28:48 -070045 circular_buffer_(VadCircularBuffer::Create(kPosteriorHistorySize)) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +000046 // Setup noise GMM.
47 noise_gmm_.dimension = kNoiseGmmDim;
48 noise_gmm_.num_mixtures = kNoiseGmmNumMixtures;
49 noise_gmm_.weight = kNoiseGmmWeights;
50 noise_gmm_.mean = &kNoiseGmmMean[0][0];
51 noise_gmm_.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];
52
53 // Setup voice GMM.
54 voice_gmm_.dimension = kVoiceGmmDim;
55 voice_gmm_.num_mixtures = kVoiceGmmNumMixtures;
56 voice_gmm_.weight = kVoiceGmmWeights;
57 voice_gmm_.mean = &kVoiceGmmMean[0][0];
58 voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];
59}
60
Yves Gerey665174f2018-06-19 15:03:05 +020061PitchBasedVad::~PitchBasedVad() {}
pbos@webrtc.org788acd12014-12-15 09:41:24 +000062
63int PitchBasedVad::VoicingProbability(const AudioFeatures& features,
64 double* p_combined) {
65 double p;
66 double gmm_features[3];
67 double pdf_features_given_voice;
68 double pdf_features_given_noise;
69 // These limits are the same in matlab implementation 'VoicingProbGMM().'
70 const double kLimLowLogPitchGain = -2.0;
71 const double kLimHighLogPitchGain = -0.9;
72 const double kLimLowSpectralPeak = 200;
73 const double kLimHighSpectralPeak = 2000;
74 const double kEps = 1e-12;
Peter Kastingdce40cf2015-08-24 14:52:23 -070075 for (size_t n = 0; n < features.num_frames; n++) {
pbos@webrtc.org788acd12014-12-15 09:41:24 +000076 gmm_features[0] = features.log_pitch_gain[n];
77 gmm_features[1] = features.spectral_peak[n];
78 gmm_features[2] = features.pitch_lag_hz[n];
79
80 pdf_features_given_voice = EvaluateGmm(gmm_features, voice_gmm_);
81 pdf_features_given_noise = EvaluateGmm(gmm_features, noise_gmm_);
82
83 if (features.spectral_peak[n] < kLimLowSpectralPeak ||
84 features.spectral_peak[n] > kLimHighSpectralPeak ||
85 features.log_pitch_gain[n] < kLimLowLogPitchGain) {
86 pdf_features_given_voice = kEps * pdf_features_given_noise;
87 } else if (features.log_pitch_gain[n] > kLimHighLogPitchGain) {
88 pdf_features_given_noise = kEps * pdf_features_given_voice;
89 }
90
aluebsecf6b812015-06-25 12:28:48 -070091 p = p_prior_ * pdf_features_given_voice /
92 (pdf_features_given_voice * p_prior_ +
93 pdf_features_given_noise * (1 - p_prior_));
pbos@webrtc.org788acd12014-12-15 09:41:24 +000094
95 p = LimitProbability(p);
96
97 // Combine pitch-based probability with standalone probability, before
98 // updating prior probabilities.
99 double prod_active = p * p_combined[n];
100 double prod_inactive = (1 - p) * (1 - p_combined[n]);
101 p_combined[n] = prod_active / (prod_active + prod_inactive);
102
103 if (UpdatePrior(p_combined[n]) < 0)
104 return -1;
105 // Limit prior probability. With a zero prior probability the posterior
106 // probability is always zero.
107 p_prior_ = LimitProbability(p_prior_);
108 }
109 return 0;
110}
111
112int PitchBasedVad::UpdatePrior(double p) {
113 circular_buffer_->Insert(p);
114 if (circular_buffer_->RemoveTransient(kTransientWidthThreshold,
115 kLowProbabilityThreshold) < 0)
116 return -1;
117 p_prior_ = circular_buffer_->Mean();
118 return 0;
119}
120
121} // namespace webrtc