blob: 2cca66b00622d06f4b3aaf7600f798948a151754 [file] [log] [blame]
pbos@webrtc.orga7f77722014-12-15 16:33:16 +00001/*
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include <math.h>
13#include <stdio.h>
14#include <stdlib.h>
15
16#include <algorithm>
kwibergbfefb032016-05-01 14:53:46 -070017#include <memory>
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000018
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000019#include "webrtc/modules/audio_processing/agc/agc.h"
peahbbe42332016-06-08 06:42:02 -070020#include "webrtc/modules/audio_processing/agc/loudness_histogram.h"
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000021#include "webrtc/modules/audio_processing/agc/utility.h"
aluebsecf6b812015-06-25 12:28:48 -070022#include "webrtc/modules/audio_processing/vad/common.h"
23#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
24#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
kwibergac9f8762016-09-30 22:29:43 -070025#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
Henrik Kjellanderff761fb2015-11-04 08:31:52 +010026#include "webrtc/modules/include/module_common_types.h"
Edward Lemurc20978e2017-07-06 19:44:34 +020027#include "webrtc/rtc_base/flags.h"
28#include "webrtc/rtc_base/safe_minmax.h"
kwibergac9f8762016-09-30 22:29:43 -070029#include "webrtc/test/gtest.h"
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000030
31static const int kAgcAnalWindowSamples = 100;
kjellander4fa5be42017-05-16 00:01:23 -070032static const float kDefaultActivityThreshold = 0.3f;
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000033
34DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
35DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
36 " format");
37DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
38 " probabilities) in double format. One activity per 10ms is"
39 " required. If no file is given the video information is not"
40 " incorporated. Negative activity is interpreted as video is"
41 " not adapted and the statistics are not computed during"
42 " the learning phase. Note that the negative video activities"
43 " are ONLY allowed at the beginning.");
44DEFINE_string(result, "", "name of a file to write the results. The results"
45 " will be appended to the end of the file. This is optional.");
46DEFINE_string(audio_content, "", "name of a file where audio content is written"
47 " to, in double format.");
kjellander4fa5be42017-05-16 00:01:23 -070048DEFINE_float(activity_threshold, kDefaultActivityThreshold,
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000049 "Activity threshold");
kjellander4fa5be42017-05-16 00:01:23 -070050DEFINE_bool(help, false, "prints this message");
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000051
52namespace webrtc {
53
54// TODO(turajs) A new CL will be committed soon where ExtractFeatures will
55// notify the caller of "silence" input, instead of bailing out. We would not
56// need the following function when such a change is made.
57
58// Add some dither to quiet frames. This avoids the ExtractFeatures skip a
59// silence frame. Otherwise true VAD would drift with respect to the audio.
60// We only consider mono inputs.
61static void DitherSilence(AudioFrame* frame) {
Peter Kasting69558702016-01-12 16:26:35 -080062 ASSERT_EQ(1u, frame->num_channels_);
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000063 const double kRmsSilence = 5;
64 const double sum_squared_silence = kRmsSilence * kRmsSilence *
65 frame->samples_per_channel_;
66 double sum_squared = 0;
yujo36b1a5f2017-06-12 12:45:32 -070067 int16_t* frame_data = frame->mutable_data();
Peter Kastingdce40cf2015-08-24 14:52:23 -070068 for (size_t n = 0; n < frame->samples_per_channel_; n++)
yujo36b1a5f2017-06-12 12:45:32 -070069 sum_squared += frame_data[n] * frame_data[n];
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000070 if (sum_squared <= sum_squared_silence) {
Peter Kastingdce40cf2015-08-24 14:52:23 -070071 for (size_t n = 0; n < frame->samples_per_channel_; n++)
yujo36b1a5f2017-06-12 12:45:32 -070072 frame_data[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe.
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000073 }
74}
75
76class AgcStat {
77 public:
78 AgcStat()
79 : video_index_(0),
80 activity_threshold_(kDefaultActivityThreshold),
peahbbe42332016-06-08 06:42:02 -070081 audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)),
aluebsecf6b812015-06-25 12:28:48 -070082 audio_processing_(new VadAudioProc()),
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000083 vad_(new PitchBasedVad()),
84 standalone_vad_(StandaloneVad::Create()),
85 audio_content_fid_(NULL) {
Peter Kastingdce40cf2015-08-24 14:52:23 -070086 for (size_t n = 0; n < kMaxNumFrames; n++)
pbos@webrtc.orga7f77722014-12-15 16:33:16 +000087 video_vad_[n] = 0.5;
88 }
89
90 ~AgcStat() {
91 if (audio_content_fid_ != NULL) {
92 fclose(audio_content_fid_);
93 }
94 }
95
96 void set_audio_content_file(FILE* audio_content_fid) {
97 audio_content_fid_ = audio_content_fid;
98 }
99
100 int AddAudio(const AudioFrame& frame, double p_video,
101 int* combined_vad) {
102 if (frame.num_channels_ != 1 ||
103 frame.samples_per_channel_ !=
104 kSampleRateHz / 100 ||
105 frame.sample_rate_hz_ != kSampleRateHz)
106 return -1;
107 video_vad_[video_index_++] = p_video;
108 AudioFeatures features;
yujo36b1a5f2017-06-12 12:45:32 -0700109 const int16_t* frame_data = frame.data();
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000110 audio_processing_->ExtractFeatures(
yujo36b1a5f2017-06-12 12:45:32 -0700111 frame_data, frame.samples_per_channel_, &features);
kjellander4fa5be42017-05-16 00:01:23 -0700112 if (FLAG_standalone_vad) {
yujo36b1a5f2017-06-12 12:45:32 -0700113 standalone_vad_->AddAudio(frame_data,
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000114 frame.samples_per_channel_);
115 }
116 if (features.num_frames > 0) {
117 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5};
kjellander4fa5be42017-05-16 00:01:23 -0700118 if (FLAG_standalone_vad) {
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000119 standalone_vad_->GetActivity(p, kMaxNumFrames);
120 }
121 // TODO(turajs) combining and limiting are used in the source files as
122 // well they can be moved to utility.
123 // Combine Video and stand-alone VAD.
Peter Kastingdce40cf2015-08-24 14:52:23 -0700124 for (size_t n = 0; n < features.num_frames; n++) {
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000125 double p_active = p[n] * video_vad_[n];
126 double p_passive = (1 - p[n]) * (1 - video_vad_[n]);
kwiberg07038562017-06-12 11:40:47 -0700127 p[n] = rtc::SafeClamp(p_active / (p_active + p_passive), 0.01, 0.99);
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000128 }
129 if (vad_->VoicingProbability(features, p) < 0)
130 return -1;
Peter Kastingdce40cf2015-08-24 14:52:23 -0700131 for (size_t n = 0; n < features.num_frames; n++) {
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000132 audio_content_->Update(features.rms[n], p[n]);
133 double ac = audio_content_->AudioContent();
134 if (audio_content_fid_ != NULL) {
135 fwrite(&ac, sizeof(ac), 1, audio_content_fid_);
136 }
137 if (ac > kAgcAnalWindowSamples * activity_threshold_) {
138 combined_vad[n] = 1;
139 } else {
140 combined_vad[n] = 0;
141 }
142 }
143 video_index_ = 0;
144 }
Peter Kastingdce40cf2015-08-24 14:52:23 -0700145 return static_cast<int>(features.num_frames);
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000146 }
147
148 void Reset() {
149 audio_content_->Reset();
150 }
151
152 void SetActivityThreshold(double activity_threshold) {
153 activity_threshold_ = activity_threshold;
154 }
155
156 private:
157 int video_index_;
158 double activity_threshold_;
159 double video_vad_[kMaxNumFrames];
peahbbe42332016-06-08 06:42:02 -0700160 std::unique_ptr<LoudnessHistogram> audio_content_;
kwibergbfefb032016-05-01 14:53:46 -0700161 std::unique_ptr<VadAudioProc> audio_processing_;
162 std::unique_ptr<PitchBasedVad> vad_;
163 std::unique_ptr<StandaloneVad> standalone_vad_;
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000164
165 FILE* audio_content_fid_;
166};
167
168
169void void_main(int argc, char* argv[]) {
170 webrtc::AgcStat agc_stat;
171
172 FILE* pcm_fid = fopen(argv[1], "rb");
173 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1];
174
175 if (argc < 2) {
176 fprintf(stderr, "\nNot Enough arguments\n");
177 }
178
179 FILE* true_vad_fid = NULL;
kjellander4fa5be42017-05-16 00:01:23 -0700180 ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true "
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000181 "VADs using --true_vad flag.";
kjellander4fa5be42017-05-16 00:01:23 -0700182 true_vad_fid = fopen(FLAG_true_vad, "rb");
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000183 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " <<
kjellander4fa5be42017-05-16 00:01:23 -0700184 FLAG_true_vad;
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000185
186 FILE* results_fid = NULL;
kjellander4fa5be42017-05-16 00:01:23 -0700187 if (strlen(FLAG_result) > 0) {
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000188 // True if this is the first time writing to this function and we add a
189 // header to the beginning of the file.
190 bool write_header;
191 // Open in the read mode. If it fails, the file doesn't exist and has to
192 // write a header for it. Otherwise no need to write a header.
kjellander4fa5be42017-05-16 00:01:23 -0700193 results_fid = fopen(FLAG_result, "r");
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000194 if (results_fid == NULL) {
195 write_header = true;
196 } else {
197 fclose(results_fid);
198 write_header = false;
199 }
200 // Open in append mode.
kjellander4fa5be42017-05-16 00:01:23 -0700201 results_fid = fopen(FLAG_result, "a");
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000202 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " <<
kjellander4fa5be42017-05-16 00:01:23 -0700203 FLAG_result << ", to write the results.";
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000204 // Write the header if required.
205 if (write_header) {
206 fprintf(results_fid, "%% Total Active, Misdetection, "
207 "Total inactive, False Positive, On-sets, Missed segments, "
208 "Average response\n");
209 }
210 }
211
212 FILE* video_vad_fid = NULL;
kjellander4fa5be42017-05-16 00:01:23 -0700213 if (strlen(FLAG_video_vad) > 0) {
214 video_vad_fid = fopen(FLAG_video_vad, "rb");
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000215 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " <<
kjellander4fa5be42017-05-16 00:01:23 -0700216 FLAG_video_vad << " to read video-based VAD decisions.\n";
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000217 }
218
219 // AgsStat will be the owner of this file and will close it at its
220 // destructor.
221 FILE* audio_content_fid = NULL;
kjellander4fa5be42017-05-16 00:01:23 -0700222 if (strlen(FLAG_audio_content) > 0) {
223 audio_content_fid = fopen(FLAG_audio_content, "wb");
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000224 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " <<
kjellander4fa5be42017-05-16 00:01:23 -0700225 FLAG_audio_content << " to write audio-content.\n";
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000226 agc_stat.set_audio_content_file(audio_content_fid);
227 }
228
229 webrtc::AudioFrame frame;
230 frame.num_channels_ = 1;
231 frame.sample_rate_hz_ = 16000;
232 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100;
233 const size_t kSamplesToRead = frame.num_channels_ *
234 frame.samples_per_channel_;
235
kjellander4fa5be42017-05-16 00:01:23 -0700236 agc_stat.SetActivityThreshold(FLAG_activity_threshold);
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000237
238 int ret_val = 0;
239 int num_frames = 0;
240 int agc_vad[kMaxNumFrames];
241 uint8_t true_vad[kMaxNumFrames];
242 double p_video = 0.5;
243 int total_active = 0;
244 int total_passive = 0;
245 int total_false_positive = 0;
246 int total_missed_detection = 0;
247 int onset_adaptation = 0;
248 int num_onsets = 0;
249 bool onset = false;
250 uint8_t previous_true_vad = 0;
251 int num_not_adapted = 0;
Peter Kastingdce40cf2015-08-24 14:52:23 -0700252 size_t true_vad_index = 0;
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000253 bool in_false_positive_region = false;
254 int total_false_positive_duration = 0;
255 bool video_adapted = false;
yujo36b1a5f2017-06-12 12:45:32 -0700256 while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t),
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000257 kSamplesToRead, pcm_fid)) {
258 assert(true_vad_index < kMaxNumFrames);
259 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1,
260 true_vad_fid))
261 << "Size mismatch between True-VAD and the PCM file.\n";
262 if (video_vad_fid != NULL) {
263 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) <<
264 "Not enough video-based VAD probabilities.";
265 }
266
267 // Negative video activity indicates that the video-based VAD is not yet
268 // adapted. Disregards the learning phase in statistics.
269 if (p_video < 0) {
270 if (video_adapted) {
271 fprintf(stderr, "Negative video probabilities ONLY allowed at the "
272 "beginning of the sequence, not in the middle.\n");
273 exit(1);
274 }
275 continue;
276 } else {
277 video_adapted = true;
278 }
279
280 num_frames++;
281 uint8_t last_true_vad;
282 if (true_vad_index == 0) {
283 last_true_vad = previous_true_vad;
284 } else {
285 last_true_vad = true_vad[true_vad_index - 1];
286 }
287 if (last_true_vad == 1 && true_vad[true_vad_index] == 0) {
288 agc_stat.Reset();
289 }
290 true_vad_index++;
291
292 DitherSilence(&frame);
293
294 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad);
295 ASSERT_GE(ret_val, 0);
296
297 if (ret_val > 0) {
Peter Kastingdce40cf2015-08-24 14:52:23 -0700298 ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val));
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000299 for (int n = 0; n < ret_val; n++) {
300 if (true_vad[n] == 1) {
301 total_active++;
302 if (previous_true_vad == 0) {
303 num_onsets++;
304 onset = true;
305 }
306 if (agc_vad[n] == 0) {
307 total_missed_detection++;
308 if (onset)
309 onset_adaptation++;
310 } else {
311 in_false_positive_region = false;
312 onset = false;
313 }
314 } else if (true_vad[n] == 0) {
315 // Check if |on_set| flag is still up. If so it means that we totally
316 // missed an active region
317 if (onset)
318 num_not_adapted++;
319 onset = false;
320
321 total_passive++;
322 if (agc_vad[n] == 1) {
323 total_false_positive++;
324 in_false_positive_region = true;
325 }
326 if (in_false_positive_region) {
327 total_false_positive_duration++;
328 }
329 } else {
330 ASSERT_TRUE(false) << "Invalid value for true-VAD.\n";
331 }
332 previous_true_vad = true_vad[n];
333 }
334 true_vad_index = 0;
335 }
336 }
337
338 if (results_fid != NULL) {
339 fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
340 total_active,
341 total_missed_detection,
342 total_passive,
343 total_false_positive,
344 num_onsets,
345 num_not_adapted,
346 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
347 static_cast<float>(total_false_positive_duration) /
348 (total_passive + 1e-12));
349 }
350 fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
351 total_active,
352 total_missed_detection,
353 total_passive,
354 total_false_positive,
355 num_onsets,
356 num_not_adapted,
357 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
358 static_cast<float>(total_false_positive_duration) /
359 (total_passive + 1e-12));
360
361 fclose(true_vad_fid);
362 fclose(pcm_fid);
363 if (video_vad_fid != NULL) {
364 fclose(video_vad_fid);
365 }
366 if (results_fid != NULL) {
367 fclose(results_fid);
368 }
369}
370
371} // namespace webrtc
372
373int main(int argc, char* argv[]) {
kjellander4fa5be42017-05-16 00:01:23 -0700374 if (argc == 1) {
375 // Print usage information.
376 std::cout <<
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000377 "\nCompute the number of misdetected and false-positive frames. Not\n"
378 " that for each frame of audio (10 ms) there should be one true\n"
379 " activity. If any video-based activity is given, there should also be\n"
380 " one probability per frame.\n"
kjellander4fa5be42017-05-16 00:01:23 -0700381 "Run with --help for more details on available flags.\n"
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000382 "\nUsage:\n\n"
383 "activity_metric input_pcm [options]\n"
384 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits "
385 "format.\n\n";
kjellander4fa5be42017-05-16 00:01:23 -0700386 return 0;
387 }
388 rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true);
389 if (FLAG_help) {
390 rtc::FlagList::Print(nullptr, false);
391 return 0;
392 }
pbos@webrtc.orga7f77722014-12-15 16:33:16 +0000393 webrtc::void_main(argc, argv);
394 return 0;
395}