EQ FIR: HiFi3 optimized version

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
diff --git a/src/audio/fir_hifi3.c b/src/audio/fir_hifi3.c
new file mode 100644
index 0000000..dad79e0
--- /dev/null
+++ b/src/audio/fir_hifi3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of the Intel Corporation nor the
+ *     names of its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <errno.h>
+#include <sof/audio/component.h>
+#include <sof/audio/format.h>
+#include <uapi/eq.h>
+#include "fir_config.h"
+
+#if FIR_HIFI3
+
+#include <xtensa/config/defs.h>
+#include <xtensa/tie/xt_hifi3.h>
+#include "fir_hifi3.h"
+
+/*
+ * EQ FIR algorithm code
+ */
+
+void fir_reset(struct fir_state_32x16 *fir)
+{
+	/* Leave the filter muted, without coefficients or a delay line.
+	 * fir->delay is cleared too; taps and in_shift are left untouched.
+	 */
+	fir->coef = NULL;
+	fir->delay = NULL;
+	fir->delay_end = NULL;
+	fir->rwp = NULL;
+	fir->out_shift = 0;
+	fir->length = 0;
+	fir->mute = 1;
+}
+
+int fir_init_coef(struct fir_state_32x16 *fir, int16_t config[])
+{
+	struct sof_eq_fir_coef_data *cfg = (void *)config;
+
+	/* These fields are written before the tap count is checked, so they
+	 * change also on -EINVAL. fir_init_delay() assigns the delay line.
+	 */
+	fir->taps = (int)cfg->length;
+	fir->out_shift = (int)cfg->out_shift;
+	fir->coef = (ae_f16x4 *)&cfg->coef[0];
+	fir->mute = 0;
+	fir->rwp = NULL;
+	fir->delay = NULL;
+	fir->delay_end = NULL;
+
+	/* Delay line length is taps plus two as the 2x filter computes two
+	 * samples per call. Plus one would do, but the length must be even
+	 * for the 64 bit loads of 32 bit samples. It is returned on success.
+	 */
+	fir->length = fir->taps + 2;
+
+	/* Only 4..SOF_EQ_FIR_MAX_LENGTH taps in multiples of four are valid */
+	if (fir->taps < 4 || fir->taps > SOF_EQ_FIR_MAX_LENGTH ||
+	    (fir->taps & 3))
+		return -EINVAL;
+
+	return fir->length;
+}
+
+void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data)
+{
+	fir->delay = (ae_int32 *)*data; /* Delay line begins at *data */
+	fir->delay_end = fir->delay + fir->length;
+	fir->rwp = fir->delay_end - 1; /* Start from the last sample slot */
+	*data = *data + fir->length; /* Leave *data at next delay line start */
+}
+
+void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
+		      int *rshift)
+{
+	if (fir->mute) { /* Muted: no left shift, 31 bit right shift */
+		*lshift = 0;
+		*rshift = 31;
+		return;
+	}
+	*lshift = (fir->out_shift >= 0) ? 0 : -fir->out_shift; /* < 0: left */
+	*rshift = (fir->out_shift <= 0) ? 0 : fir->out_shift; /* > 0: right */
+}
+
+/* Filter nch interleaved channels two frames at a time. The inner loop
+ * runs frames >> 1 times, so an odd last frame would not be processed.
+ */
+void eq_fir_2x_s32_hifi3(struct fir_state_32x16 fir[],
+			 struct comp_buffer *source, struct comp_buffer *sink,
+			 int frames, int nch)
+{
+	int32_t *in_base = (int32_t *)source->r_ptr;
+	int32_t *out_base = (int32_t *)sink->w_ptr;
+	const int stride = nch << 1; /* Samples in two frames */
+	const int pairs = frames >> 1;
+	struct fir_state_32x16 *state;
+	int32_t *in;
+	int32_t *out;
+	int left;
+	int right;
+	int ch;
+	int n;
+
+	for (ch = 0; ch < nch; ch++) {
+		state = &fir[ch];
+
+		/* Mute is applied via the shifts. The filter gets the net
+		 * shift left - right where a positive value shifts left.
+		 */
+		fir_get_lrshifts(state, &left, &right);
+
+		/* Setup circular buffer for this channel's FIR delay line */
+		fir_hifi3_setup_circular(state);
+
+		/* Samples are accessed directly via r_ptr and w_ptr, the
+		 * sample of channel ch is at offset ch in each frame.
+		 */
+		in = in_base + ch;
+		out = out_base + ch;
+		for (n = 0; n < pairs; n++) {
+			fir_32x16_2x_hifi3(state, in, in + nch, out, out + nch,
+					   left - right);
+			in += stride;
+			out += stride;
+		}
+	}
+}
+
+/* Filter nch interleaved channels one frame at a time, any frame count */
+void eq_fir_s32_hifi3(struct fir_state_32x16 fir[], struct comp_buffer *source,
+		      struct comp_buffer *sink, int frames, int nch)
+{
+	int32_t *in_base = (int32_t *)source->r_ptr;
+	int32_t *out_base = (int32_t *)sink->w_ptr;
+	struct fir_state_32x16 *state;
+	int32_t *in;
+	int32_t *out;
+	int left;
+	int right;
+	int ch;
+	int n;
+
+	for (ch = 0; ch < nch; ch++) {
+		state = &fir[ch];
+
+		/* Mute is applied via the shifts. The filter gets the net
+		 * shift left - right where a positive value shifts left.
+		 */
+		fir_get_lrshifts(state, &left, &right);
+
+		/* Setup circular buffer for this channel's FIR delay line */
+		fir_hifi3_setup_circular(state);
+
+		/* Channel ch is the sample at offset ch of each frame */
+		in = in_base + ch;
+		out = out_base + ch;
+		for (n = 0; n < frames; n++) {
+			fir_32x16_hifi3(state, in, out, left - right);
+			in += nch;
+			out += nch;
+		}
+	}
+}
+
+#endif
diff --git a/src/audio/fir_hifi3.h b/src/audio/fir_hifi3.h
new file mode 100644
index 0000000..20896f0
--- /dev/null
+++ b/src/audio/fir_hifi3.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of the Intel Corporation nor the
+ *     names of its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+ */
+
+#ifndef FIR_HIFI3_H
+#define FIR_HIFI3_H
+
+#include "fir_config.h"
+
+#if FIR_HIFI3
+
+#include <xtensa/config/defs.h>
+#include <xtensa/tie/xt_hifi2.h>
+#include <sof/audio/format.h>
+
+struct fir_state_32x16 { /* State of one FIR, one instance per channel */
+	ae_int32 *rwp; /* Circular read and write pointer to delay line */
+	ae_int32 *delay; /* Pointer to FIR delay line start */
+	ae_int32 *delay_end; /* Pointer to delay + length, one past the end */
+	ae_f16x4 *coef; /* Pointer to FIR coefficients, 16 bit values */
+	int mute; /* Set to 1 to mute EQ output, 0 otherwise */
+	int taps; /* Number of FIR taps, a multiple of four from 4 up */
+	int length; /* Delay line length in samples, taps + 2 (even) */
+	int in_shift; /* Right shifts at input, not referenced by code here */
+	int out_shift; /* Right shifts at output, negative shifts left */
+};
+
+void fir_reset(struct fir_state_32x16 *fir);
+
+int fir_init_coef(struct fir_state_32x16 *fir, int16_t config[]);
+
+void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data);
+
+void eq_fir_2x_s32_hifi3(struct fir_state_32x16 fir[],
+			 struct comp_buffer *source, struct comp_buffer *sink,
+			 int frames, int nch);
+
+void eq_fir_s32_hifi3(struct fir_state_32x16 fir[], struct comp_buffer *source,
+		      struct comp_buffer *sink, int frames, int nch);
+
+/* The next trivial functions are inlined */
+
+static inline void fir_mute(struct fir_state_32x16 *fir)
+{
+	fir->mute = 1; /* fir_get_lrshifts() then gives a 31 bit right shift */
+}
+
+static inline void fir_unmute(struct fir_state_32x16 *fir)
+{
+	fir->mute = 0; /* fir_get_lrshifts() then follows fir->out_shift */
+}
+
+/* Set circular buffer begin and end to this FIR's delay line bounds */
+static inline void fir_hifi3_setup_circular(struct fir_state_32x16 *fir)
+{
+	AE_SETCBEGIN0(fir->delay);
+	AE_SETCEND0(fir->delay_end); /* delay_end is delay + length */
+}
+
+void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
+		      int *rshift);
+
+/* The next functions are inlined to optimize execution speed */
+
+/* HiFi EP has the following number of registers that should not be exceeded
+ * 4x 56 bit registers in register file Q
+ * 8x 48 bit registers in register file P
+ */
+
+static inline void fir_32x16_hifi3(struct fir_state_32x16 *fir, int32_t *x,
+				   int32_t *y, int shift)
+{
+	/* Filter one sample. *x is stored to the delay line at fir->rwp,
+	 * the products are accumulated to a with four taps per loop
+	 * round, and the shifted and rounded sum is stored to *y.
+	 * A positive shift is a left shift, a negative is a right shift.
+	 * fir->taps is used as taps >> 2, so a multiple of 4 is expected.
+	 */
+	ae_f64 a;
+	ae_valign u;
+	ae_f32x2 data2;
+	ae_f16x4 coefs;
+	ae_f32x2 d0;
+	ae_f32x2 d1;
+	int i;
+	ae_int32 *dp = fir->rwp;
+	ae_int16x4 *coefp = (ae_int16x4 *)fir->coef;
+	const int taps_div_4 = fir->taps >> 2;
+	const int inc = sizeof(int32_t);
+
+	/* Write sample to delay line, rwp steps back one sample circularly */
+	AE_S32_L_XC((ae_int32)*x, fir->rwp, -sizeof(int32_t));
+
+	/* Prime the coefficients stream */
+	u = AE_LA64_PP(coefp);
+
+	/* Note: fir_32x16_2x_hifi3() computes two samples per call and
+	 * loads the delay line data as sample pairs with AE_L32X2_XC().
+	 * Here the samples are loaded one by one. Clear the accumulator.
+	 */
+	a = AE_ZEROQ56();
+	for (i = 0; i < taps_div_4; i++) {
+		/* Load four coefficients. Coefs_3 contains tap h[n],
+		 * coefs_2 contains h[n+1], coefs_1 contains h[n+2], and
+		 * coefs_0 contains h[n+3]. The load also advances coefp.
+		 */
+		AE_LA16X4_IP(coefs, u, coefp);
+
+		/* Load two data samples and pack d0 to data2_h and d1
+		 * to data2_l. The first d0 is the sample written above.
+		 */
+		AE_L32_XC(d0, dp, inc);
+		AE_L32_XC(d1, dp, inc);
+		data2 = AE_SEL32_LL(d0, d1);
+
+		/* Accumulate
+		 * a += data2_h * coefs_3 + data2_l * coefs_2. The data is
+		 * presumably Q1.31 and coefficients Q1.15. NOTE(review):
+		 * the old remark of 24 bit Q1.23 use needs to be confirmed.
+		 */
+		AE_MULAAFD32X16_H3_L2(a, data2, coefs);
+
+		/* Repeat the same for the next two taps,
+		 * a += data2_h * coefs_1 + data2_l * coefs_0.
+		 */
+		AE_L32_XC(d0, dp, inc);
+		AE_L32_XC(d1, dp, inc);
+		data2 = AE_SEL32_LL(d0, d1);
+		AE_MULAAFD32X16_H1_L0(a, data2, coefs);
+		/* No coefp update here, AE_LA16X4_IP() did it already */
+	}
+
+	/* Do the scaling shift, round to 32 bits, and store sample to *y. */
+	a = AE_SLAA64S(a, shift);
+	AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y, 0);
+}
+
+/* HiFi EP has the following number of registers that should not be exceeded
+ * 4x 56 bit registers in register file Q
+ * 8x 48 bit registers in register file P
+ */
+
+static inline void fir_32x16_2x_hifi3(struct fir_state_32x16 *fir, int32_t *x0,
+				      int32_t *x1, int32_t *y0, int32_t *y1,
+				      int shift)
+{
+	/* Filter two sequential samples. *x0 and then *x1 are stored to
+	 * the delay line, two sums are accumulated with four taps per
+	 * loop round, and the shifted and rounded sums are stored to *y0
+	 * and *y1. The delay line is read as 64 bit sample pairs; this
+	 * presumably needs an 8 byte aligned pair start - TODO confirm.
+	 */
+	ae_f64 a;
+	ae_f64 b;
+	ae_valign u;
+	ae_f32x2 d0;
+	ae_f32x2 d1;
+	ae_f16x4 coefs;
+	int i;
+	ae_f32x2 *dp;
+	ae_f16x4 *coefp = fir->coef;
+	const int taps_div_4 = fir->taps >> 2;
+	const int inc = 2 * sizeof(int32_t);
+
+	/* Write samples to delay line, dp is set to where x1 gets stored */
+	AE_S32_L_XC((ae_int32)*x0, fir->rwp, -sizeof(int32_t));
+	dp = (ae_f32x2 *)fir->rwp;
+	AE_S32_L_XC((ae_int32)*x1, fir->rwp, -sizeof(int32_t));
+
+	/* Clear the accumulators. With the products described in the
+	 * comments of the loop below, b becomes the output for x1 that
+	 * is stored to y1, and a the output for x0 that is stored to y0.
+	 */
+	a = AE_ZERO64();
+	b = AE_ZERO64();
+
+	/* Prime the coefficients stream */
+	u = AE_LA64_PP(coefp);
+
+	/* Load the first pair of data samples to d0. It starts from the
+	 * address where x1 was just stored.
+	 */
+	AE_L32X2_XC(d0, dp, inc);
+	for (i = 0; i < taps_div_4; i++) {
+		/* Load four coefficients. Coefs_3 contains tap h[n],
+		 * coefs_2 contains h[n+1], coefs_1 contains h[n+2], and
+		 * coefs_0 contains h[n+3]. The load also advances coefp.
+		 */
+		AE_LA16X4_IP(coefs, u, coefp);
+
+		/* Load the next two data samples to d1. Upper part d1_h is
+		 * x[n+1] and lower part d1_l is x[n].
+		 */
+		AE_L32X2_XC(d1, dp, inc);
+
+		/* Quad MAC (HH)
+		 * b += d0_h * coefs_3 + d0_l * coefs_2
+		 * a += d0_l * coefs_3 + d1_h * coefs_2
+		 */
+		AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs);
+		d0 = d1;
+
+		/* Load the next two data samples for the next two taps. */
+		AE_L32X2_XC(d1, dp, inc);
+
+		/* Quad MAC (HL)
+		 * b += d0_h * coefs_1 + d0_l * coefs_0
+		 * a += d0_l * coefs_1 + d1_h * coefs_0
+		 */
+		AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs);
+		d0 = d1;
+	}
+
+	/* Do scaling shifts, round, and store b to *y1 and a to *y0. */
+	b = AE_SLAA64S(b, shift);
+	a = AE_SLAA64S(a, shift);
+	AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0);
+	AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0);
+}
+
+#endif
+#endif