Blame - lib/raid6/recov_neon_inner.c - github.com/raspberrypi/raspberrypi-kernel

blob: 7d00c31a654706763eb29da6952bb7033e635eee [file] [log] [blame]

Ard Biesheuvel	6ec4e251	2017-07-13 18:16:01 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Intel Corporation
				3	* Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public License
				7	* as published by the Free Software Foundation; version 2
				8	* of the License.
				9	*/
				10
				11	#include <arm_neon.h>
				12
Ard Biesheuvel	6ec4e251	2017-07-13 18:16:01 +0100	[diff] [blame]	13	#ifdef CONFIG_ARM
				14	/*
				15	* AArch32 does not provide this intrinsic natively because it does not
				16	* implement the underlying instruction. AArch32 only provides a 64-bit
				17	* wide vtbl.8 instruction, so use that instead.
				18	*/
				19	static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
				20	{
				21	union {
				22	uint8x16_t val;
				23	uint8x8x2_t pair;
				24	} __a = { a };
				25
				26	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
				27	vtbl2_u8(__a.pair, vget_high_u8(b)));
				28	}
				29	#endif
				30
				31	void __raid6_2data_recov_neon(int bytes, uint8_t p, uint8_t q, uint8_t *dp,
				32	uint8_t dq, const uint8_t pbmul,
				33	const uint8_t *qmul)
				34	{
				35	uint8x16_t pm0 = vld1q_u8(pbmul);
				36	uint8x16_t pm1 = vld1q_u8(pbmul + 16);
				37	uint8x16_t qm0 = vld1q_u8(qmul);
				38	uint8x16_t qm1 = vld1q_u8(qmul + 16);
ndesaulniers@google.com	1ad3935	2019-02-25 20:03:42 -0800	[diff] [blame^]	39	uint8x16_t x0f = vdupq_n_u8(0x0f);
Ard Biesheuvel	6ec4e251	2017-07-13 18:16:01 +0100	[diff] [blame]	40
				41	/*
				42	* while ( bytes-- ) {
				43	* uint8_t px, qx, db;
				44	*
				45	* px = p ^ dp;
				46	* qx = qmul[q ^ dq];
				47	* *dq++ = db = pbmul[px] ^ qx;
				48	* *dp++ = db ^ px;
				49	* p++; q++;
				50	* }
				51	*/
				52
				53	while (bytes) {
				54	uint8x16_t vx, vy, px, qx, db;
				55
				56	px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
				57	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
				58
				59	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
				60	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
				61	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
				62	qx = veorq_u8(vx, vy);
				63
				64	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
				65	vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
				66	vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
				67	vx = veorq_u8(vx, vy);
				68	db = veorq_u8(vx, qx);
				69
				70	vst1q_u8(dq, db);
				71	vst1q_u8(dp, veorq_u8(db, px));
				72
				73	bytes -= 16;
				74	p += 16;
				75	q += 16;
				76	dp += 16;
				77	dq += 16;
				78	}
				79	}
				80
				81	void __raid6_datap_recov_neon(int bytes, uint8_t p, uint8_t q, uint8_t *dq,
				82	const uint8_t *qmul)
				83	{
				84	uint8x16_t qm0 = vld1q_u8(qmul);
				85	uint8x16_t qm1 = vld1q_u8(qmul + 16);
ndesaulniers@google.com	1ad3935	2019-02-25 20:03:42 -0800	[diff] [blame^]	86	uint8x16_t x0f = vdupq_n_u8(0x0f);
Ard Biesheuvel	6ec4e251	2017-07-13 18:16:01 +0100	[diff] [blame]	87
				88	/*
				89	* while (bytes--) {
				90	* p++ ^= dq = qmul[q ^ dq];
				91	* q++; dq++;
				92	* }
				93	*/
				94
				95	while (bytes) {
				96	uint8x16_t vx, vy;
				97
				98	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
				99
				100	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
				101	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
				102	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
				103	vx = veorq_u8(vx, vy);
				104	vy = veorq_u8(vx, vld1q_u8(p));
				105
				106	vst1q_u8(dq, vx);
				107	vst1q_u8(p, vy);
				108
				109	bytes -= 16;
				110	p += 16;
				111	q += 16;
				112	dq += 16;
				113	}
				114	}