/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_coding/neteq/time_stretch.h"

#include <algorithm>  // min, max
#include <memory>

#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "modules/audio_coding/neteq/background_noise.h"
#include "modules/audio_coding/neteq/cross_correlation.h"
#include "modules/audio_coding/neteq/dsp_helper.h"
#include "rtc_base/safe_conversions.h"

namespace webrtc {

TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
                                              size_t input_len,
                                              bool fast_mode,
                                              AudioMultiVector* output,
                                              size_t* length_change_samples) {
  // Pre-calculate common multiplication with |fs_mult_|.
  size_t fs_mult_120 =
      static_cast<size_t>(fs_mult_ * 120);  // Corresponds to 15 ms.

  const int16_t* signal;
  std::unique_ptr<int16_t[]> signal_array;
  size_t signal_len;
  if (num_channels_ == 1) {
    signal = input;
    signal_len = input_len;
  } else {
    // We want |signal| to be only the first channel of |input|, which is
    // interleaved. Thus, we take the first sample, skip forward
    // |num_channels_| samples, and continue like that.
    signal_len = input_len / num_channels_;
    signal_array.reset(new int16_t[signal_len]);
    signal = signal_array.get();
    size_t j = master_channel_;
    for (size_t i = 0; i < signal_len; ++i) {
      signal_array[i] = input[j];
      j += num_channels_;
    }
  }

  // Find maximum absolute value of input signal.
  max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);

  // Downsample to 4 kHz sample rate and calculate auto-correlation.
  DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
                              sample_rate_hz_, true /* compensate delay */,
                              downsampled_input_);
  AutoCorrelation();

  // Find the strongest correlation peak.
  static const size_t kNumPeaks = 1;
  size_t peak_index;
  int16_t peak_value;
  DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
                           fs_mult_, &peak_index, &peak_value);
  // Assert that |peak_index| stays within boundaries.
  assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);

  // Compensate peak_index for displaced starting position. The displacement
  // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
  // domain, while the |peak_index| is in the original sample rate; hence, the
  // multiplication by fs_mult_ * 2.
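  // (|fs_mult_| is the sample rate divided by 8000 Hz, so one sample in the
  // 4 kHz domain corresponds to 2 * fs_mult_ samples at the original rate.)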
  peak_index += kMinLag * fs_mult_ * 2;
  // Assert that |peak_index| stays within boundaries.
  assert(peak_index >= static_cast<size_t>(20 * fs_mult_));
  assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);

  // Calculate scaling to ensure that |peak_index| samples can be square-summed
  // without overflowing.
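  // (Worst case, the sum is |peak_index| * |max_input_value_|^2; shifting each
  // product right by |scaling| bits keeps the accumulated sum within 32 bits.)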
| 82 | int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) - |
Peter Kasting | dce40cf | 2015-08-24 14:52:23 -0700 | [diff] [blame] | 83 | WebRtcSpl_NormW32(static_cast<int32_t>(peak_index)); |
henrik.lundin@webrtc.org | d94659d | 2013-01-29 12:09:21 +0000 | [diff] [blame] | 84 | scaling = std::max(0, scaling); |
| 85 | |
| 86 | // |vec1| starts at 15 ms minus one pitch period. |
| 87 | const int16_t* vec1 = &signal[fs_mult_120 - peak_index]; |
| 88 | // |vec2| start at 15 ms. |
| 89 | const int16_t* vec2 = &signal[fs_mult_120]; |
  // Calculate energies for |vec1| and |vec2|, assuming they both contain
  // |peak_index| samples.
  int32_t vec1_energy =
      WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
  int32_t vec2_energy =
      WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);

  // Calculate cross-correlation between |vec1| and |vec2|.
  int32_t cross_corr =
      WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);

  // Check if the signal seems to be active speech or not (simple VAD).
  bool active_speech =
      SpeechDetection(vec1_energy, vec2_energy, peak_index, scaling);

  int16_t best_correlation;
  if (!active_speech) {
    SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
  } else {
    // Calculate correlation:
    // cross_corr / sqrt(vec1_energy * vec2_energy).

    // Start with calculating scale values.
    int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
    int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));

    // Make sure total scaling is even (to simplify scale factor after sqrt).
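    // (The square root halves the combined energy scale, so an even sum makes
    // that halving exact.)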
    if ((energy1_scale + energy2_scale) & 1) {
      // The sum is odd.
      energy1_scale += 1;
    }

    // Scale energies to int16_t.
    int16_t vec1_energy_int16 =
        static_cast<int16_t>(vec1_energy >> energy1_scale);
    int16_t vec2_energy_int16 =
        static_cast<int16_t>(vec2_energy >> energy2_scale);

    // Calculate square-root of energy product.
    int16_t sqrt_energy_prod =
        WebRtcSpl_SqrtFloor(vec1_energy_int16 * vec2_energy_int16);

    // Calculate cross_corr / sqrt(en1*en2) in Q14.
    int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
    cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
    cross_corr = std::max(0, cross_corr);  // Don't use if negative.
    best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
    // Make sure |best_correlation| is no larger than 1 in Q14.
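    // (16384 == 1 << 14, i.e., 1.0 in Q14.)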
    best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
  }

  // Check accelerate criteria and stretch the signal.
  ReturnCodes return_value =
      CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
                              active_speech, fast_mode, output);
  switch (return_value) {
    case kSuccess:
      *length_change_samples = peak_index;
      break;
    case kSuccessLowEnergy:
      *length_change_samples = peak_index;
      break;
    case kNoStretch:
    case kError:
      *length_change_samples = 0;
      break;
  }
  return return_value;
}

void TimeStretch::AutoCorrelation() {
  // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
  int32_t auto_corr[kCorrelationLen];
  CrossCorrelationWithAutoShift(
      &downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],
      kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);

  // Normalize correlation to 14 bits and write to |auto_correlation_|.
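  // (Right-shifting by 17 - WebRtcSpl_NormW32(max_corr) bits leaves at most
  // 14 significant bits in the largest correlation value.)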
  int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
  int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
  WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
                                   auto_corr, scaling);
}

bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy,
                                  size_t peak_index, int scaling) const {
  // Check if the signal seems to be active speech or not (simple VAD).
  // If (vec1_energy + vec2_energy) / (2 * peak_index) <=
  // 8 * background_noise_energy, then we say that the signal contains no
  // active speech.
  // Rewrite the inequality as:
  // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
  // The two sides of the inequality will be denoted |left_side| and
  // |right_side|.
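  // (Multiplying both sides by 2 * peak_index and dividing by 16 gives the
  // second form.)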
  int32_t left_side = rtc::saturated_cast<int32_t>(
      (static_cast<int64_t>(vec1_energy) + vec2_energy) / 16);
  int32_t right_side;
  if (background_noise_.initialized()) {
    right_side = background_noise_.Energy(master_channel_);
  } else {
    // If noise parameters have not been estimated, use a fixed threshold.
    right_side = 75000;
  }
  int right_scale = 16 - WebRtcSpl_NormW32(right_side);
  right_scale = std::max(0, right_scale);
  left_side = left_side >> right_scale;
  right_side =
      rtc::dchecked_cast<int32_t>(peak_index) * (right_side >> right_scale);

  // Scale |left_side| properly before comparing with |right_side|.
  // (|scaling| is the scale factor before energy calculation, thus the scale
  // factor for the energy is 2 * scaling.)
  if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
    // Cannot scale only |left_side|, must scale |right_side| too.
    int temp_scale = WebRtcSpl_NormW32(left_side);
    left_side = left_side << temp_scale;
    right_side = right_side >> (2 * scaling - temp_scale);
  } else {
    left_side = left_side << 2 * scaling;
  }
  return left_side > right_side;
}

}  // namespace webrtc