CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/FilteringFunctions/arm_conv_partial_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_conv_partial_fast_q15.c
xorjoep	1:24714b45cd1b	4	* Description: Fast Q15 Partial convolution
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupFilters
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup PartialConv
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40	/**
xorjoep	1:24714b45cd1b	41	* @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
xorjoep	1:24714b45cd1b	42	* @param[in] *pSrcA points to the first input sequence.
xorjoep	1:24714b45cd1b	43	* @param[in] srcALen length of the first input sequence.
xorjoep	1:24714b45cd1b	44	* @param[in] *pSrcB points to the second input sequence.
xorjoep	1:24714b45cd1b	45	* @param[in] srcBLen length of the second input sequence.
xorjoep	1:24714b45cd1b	46	* @param[out] *pDst points to the location where the output result is written.
xorjoep	1:24714b45cd1b	47	* @param[in] firstIndex is the first output sample to start with.
xorjoep	1:24714b45cd1b	48	* @param[in] numPoints is the number of output points to be computed.
xorjoep	1:24714b45cd1b	49	* @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
xorjoep	1:24714b45cd1b	50	*
xorjoep	1:24714b45cd1b	51	* See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
xorjoep	1:24714b45cd1b	52	*/
xorjoep	1:24714b45cd1b	53
xorjoep	1:24714b45cd1b	54
xorjoep	1:24714b45cd1b	55	arm_status arm_conv_partial_fast_q15(
xorjoep	1:24714b45cd1b	56	q15_t * pSrcA,
xorjoep	1:24714b45cd1b	57	uint32_t srcALen,
xorjoep	1:24714b45cd1b	58	q15_t * pSrcB,
xorjoep	1:24714b45cd1b	59	uint32_t srcBLen,
xorjoep	1:24714b45cd1b	60	q15_t * pDst,
xorjoep	1:24714b45cd1b	61	uint32_t firstIndex,
xorjoep	1:24714b45cd1b	62	uint32_t numPoints)
xorjoep	1:24714b45cd1b	63	{
xorjoep	1:24714b45cd1b	64	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	65
xorjoep	1:24714b45cd1b	66	q15_t pIn1; / inputA pointer */
xorjoep	1:24714b45cd1b	67	q15_t pIn2; / inputB pointer */
xorjoep	1:24714b45cd1b	68	q15_t pOut = pDst; / output pointer */
xorjoep	1:24714b45cd1b	69	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep	1:24714b45cd1b	70	q15_t px; / Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	71	q15_t py; / Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	72	q15_t pSrc1, pSrc2; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	73	q31_t x0, x1, x2, x3, c0;
xorjoep	1:24714b45cd1b	74	uint32_t j, k, count, check, blkCnt;
xorjoep	1:24714b45cd1b	75	int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
xorjoep	1:24714b45cd1b	76	arm_status status; /* status of Partial convolution */
xorjoep	1:24714b45cd1b	77
xorjoep	1:24714b45cd1b	78	/* Check for range of output samples to be calculated */
xorjoep	1:24714b45cd1b	79	if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep	1:24714b45cd1b	80	{
xorjoep	1:24714b45cd1b	81	/* Set status as ARM_MATH_ARGUMENT_ERROR */
xorjoep	1:24714b45cd1b	82	status = ARM_MATH_ARGUMENT_ERROR;
xorjoep	1:24714b45cd1b	83	}
xorjoep	1:24714b45cd1b	84	else
xorjoep	1:24714b45cd1b	85	{
xorjoep	1:24714b45cd1b	86
xorjoep	1:24714b45cd1b	87	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	88	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	89	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	90	if (srcALen >=srcBLen)
xorjoep	1:24714b45cd1b	91	{
xorjoep	1:24714b45cd1b	92	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	93	pIn1 = pSrcA;
xorjoep	1:24714b45cd1b	94
xorjoep	1:24714b45cd1b	95	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	96	pIn2 = pSrcB;
xorjoep	1:24714b45cd1b	97	}
xorjoep	1:24714b45cd1b	98	else
xorjoep	1:24714b45cd1b	99	{
xorjoep	1:24714b45cd1b	100	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	101	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	102
xorjoep	1:24714b45cd1b	103	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	104	pIn2 = pSrcA;
xorjoep	1:24714b45cd1b	105
xorjoep	1:24714b45cd1b	106	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	107	j = srcBLen;
xorjoep	1:24714b45cd1b	108	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	109	srcALen = j;
xorjoep	1:24714b45cd1b	110	}
xorjoep	1:24714b45cd1b	111
xorjoep	1:24714b45cd1b	112	/* Conditions to check which loopCounter holds
xorjoep	1:24714b45cd1b	113	* the first and last indices of the output samples to be calculated. */
xorjoep	1:24714b45cd1b	114	check = firstIndex + numPoints;
xorjoep	1:24714b45cd1b	115	blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
xorjoep	1:24714b45cd1b	116	blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
xorjoep	1:24714b45cd1b	117	blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
xorjoep	1:24714b45cd1b	118	blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
xorjoep	1:24714b45cd1b	119	(int32_t) numPoints) : 0;
xorjoep	1:24714b45cd1b	120	blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
xorjoep	1:24714b45cd1b	121	(int32_t) firstIndex);
xorjoep	1:24714b45cd1b	122	blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
xorjoep	1:24714b45cd1b	123
xorjoep	1:24714b45cd1b	124	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep	1:24714b45cd1b	125	/* The function is internally
xorjoep	1:24714b45cd1b	126	* divided into three stages according to the number of multiplications that has to be
xorjoep	1:24714b45cd1b	127	* taken place between inputA samples and inputB samples. In the first stage of the
xorjoep	1:24714b45cd1b	128	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	129	* In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	130	* In the third stage of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	131	* for every iteration. */
xorjoep	1:24714b45cd1b	132
xorjoep	1:24714b45cd1b	133	/* Set the output pointer to point to the firstIndex
xorjoep	1:24714b45cd1b	134	* of the output sample to be calculated. */
xorjoep	1:24714b45cd1b	135	pOut = pDst + firstIndex;
xorjoep	1:24714b45cd1b	136
xorjoep	1:24714b45cd1b	137	/* --------------------------
xorjoep	1:24714b45cd1b	138	* Initializations of stage1
xorjoep	1:24714b45cd1b	139	* -------------------------*/
xorjoep	1:24714b45cd1b	140
xorjoep	1:24714b45cd1b	141	/* sum = x[0] * y[0]
xorjoep	1:24714b45cd1b	142	* sum = x[0] * y[1] + x[1] * y[0]
xorjoep	1:24714b45cd1b	143	* ....
xorjoep	1:24714b45cd1b	144	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep	1:24714b45cd1b	145	*/
xorjoep	1:24714b45cd1b	146
xorjoep	1:24714b45cd1b	147	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	148	The count variable holds the number of MAC operations performed.
xorjoep	1:24714b45cd1b	149	Since the partial convolution starts from firstIndex
xorjoep	1:24714b45cd1b	150	Number of Macs to be performed is firstIndex + 1 */
xorjoep	1:24714b45cd1b	151	count = 1U + firstIndex;
xorjoep	1:24714b45cd1b	152
xorjoep	1:24714b45cd1b	153	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	154	px = pIn1;
xorjoep	1:24714b45cd1b	155
xorjoep	1:24714b45cd1b	156	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	157	pSrc2 = pIn2 + firstIndex;
xorjoep	1:24714b45cd1b	158	py = pSrc2;
xorjoep	1:24714b45cd1b	159
xorjoep	1:24714b45cd1b	160	/* ------------------------
xorjoep	1:24714b45cd1b	161	* Stage1 process
xorjoep	1:24714b45cd1b	162	* ----------------------*/
xorjoep	1:24714b45cd1b	163
xorjoep	1:24714b45cd1b	164	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	165	/* First part of this stage computes the MAC operations less than 4 */
xorjoep	1:24714b45cd1b	166	/* Second part of this stage computes the MAC operations greater than or equal to 4 */
xorjoep	1:24714b45cd1b	167
xorjoep	1:24714b45cd1b	168	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	169	while ((count < 4U) && (blockSize1 > 0))
xorjoep	1:24714b45cd1b	170	{
xorjoep	1:24714b45cd1b	171	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	172	sum = 0;
xorjoep	1:24714b45cd1b	173
xorjoep	1:24714b45cd1b	174	/* Loop over number of MAC operations between
xorjoep	1:24714b45cd1b	175	* inputA samples and inputB samples */
xorjoep	1:24714b45cd1b	176	k = count;
xorjoep	1:24714b45cd1b	177
xorjoep	1:24714b45cd1b	178	while (k > 0U)
xorjoep	1:24714b45cd1b	179	{
xorjoep	1:24714b45cd1b	180	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	181	sum = __SMLAD(px++, py--, sum);
xorjoep	1:24714b45cd1b	182
xorjoep	1:24714b45cd1b	183	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	184	k--;
xorjoep	1:24714b45cd1b	185	}
xorjoep	1:24714b45cd1b	186
xorjoep	1:24714b45cd1b	187	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	188	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	189
xorjoep	1:24714b45cd1b	190	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	191	py = ++pSrc2;
xorjoep	1:24714b45cd1b	192	px = pIn1;
xorjoep	1:24714b45cd1b	193
xorjoep	1:24714b45cd1b	194	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	195	count++;
xorjoep	1:24714b45cd1b	196
xorjoep	1:24714b45cd1b	197	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	198	blockSize1--;
xorjoep	1:24714b45cd1b	199	}
xorjoep	1:24714b45cd1b	200
xorjoep	1:24714b45cd1b	201	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	202	/* The internal loop, over count, is unrolled by 4 */
xorjoep	1:24714b45cd1b	203	/* To read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	204	* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
xorjoep	1:24714b45cd1b	205	py = py - 1;
xorjoep	1:24714b45cd1b	206
xorjoep	1:24714b45cd1b	207	while (blockSize1 > 0)
xorjoep	1:24714b45cd1b	208	{
xorjoep	1:24714b45cd1b	209	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	210	sum = 0;
xorjoep	1:24714b45cd1b	211
xorjoep	1:24714b45cd1b	212	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	213	k = count >> 2U;
xorjoep	1:24714b45cd1b	214
xorjoep	1:24714b45cd1b	215	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	216	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	217	while (k > 0U)
xorjoep	1:24714b45cd1b	218	{
xorjoep	1:24714b45cd1b	219	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	220	/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep	1:24714b45cd1b	221	sum = __SMLADX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	222	/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep	1:24714b45cd1b	223	sum = __SMLADX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	224
xorjoep	1:24714b45cd1b	225	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	226	k--;
xorjoep	1:24714b45cd1b	227	}
xorjoep	1:24714b45cd1b	228
xorjoep	1:24714b45cd1b	229	/* For the next MAC operations, the pointer py is used without SIMD
xorjoep	1:24714b45cd1b	230	* So, py is incremented by 1 */
xorjoep	1:24714b45cd1b	231	py = py + 1U;
xorjoep	1:24714b45cd1b	232
xorjoep	1:24714b45cd1b	233	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	234	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	235	k = count % 0x4U;
xorjoep	1:24714b45cd1b	236
xorjoep	1:24714b45cd1b	237	while (k > 0U)
xorjoep	1:24714b45cd1b	238	{
xorjoep	1:24714b45cd1b	239	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	240	sum = __SMLAD(px++, py--, sum);
xorjoep	1:24714b45cd1b	241
xorjoep	1:24714b45cd1b	242	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	243	k--;
xorjoep	1:24714b45cd1b	244	}
xorjoep	1:24714b45cd1b	245
xorjoep	1:24714b45cd1b	246	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	247	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	248
xorjoep	1:24714b45cd1b	249	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	250	py = ++pSrc2 - 1U;
xorjoep	1:24714b45cd1b	251	px = pIn1;
xorjoep	1:24714b45cd1b	252
xorjoep	1:24714b45cd1b	253	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	254	count++;
xorjoep	1:24714b45cd1b	255
xorjoep	1:24714b45cd1b	256	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	257	blockSize1--;
xorjoep	1:24714b45cd1b	258	}
xorjoep	1:24714b45cd1b	259
xorjoep	1:24714b45cd1b	260	/* --------------------------
xorjoep	1:24714b45cd1b	261	* Initializations of stage2
xorjoep	1:24714b45cd1b	262	* ------------------------*/
xorjoep	1:24714b45cd1b	263
xorjoep	1:24714b45cd1b	264	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep	1:24714b45cd1b	265	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep	1:24714b45cd1b	266	* ....
xorjoep	1:24714b45cd1b	267	* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	268	*/
xorjoep	1:24714b45cd1b	269
xorjoep	1:24714b45cd1b	270	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	271	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	272	{
xorjoep	1:24714b45cd1b	273	px = pIn1 + firstIndex - srcBLen + 1;
xorjoep	1:24714b45cd1b	274	}
xorjoep	1:24714b45cd1b	275	else
xorjoep	1:24714b45cd1b	276	{
xorjoep	1:24714b45cd1b	277	px = pIn1;
xorjoep	1:24714b45cd1b	278	}
xorjoep	1:24714b45cd1b	279
xorjoep	1:24714b45cd1b	280	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	281	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	282	py = pSrc2;
xorjoep	1:24714b45cd1b	283
xorjoep	1:24714b45cd1b	284	/* count is the index by which the pointer pIn1 to be incremented */
xorjoep	1:24714b45cd1b	285	count = 0U;
xorjoep	1:24714b45cd1b	286
xorjoep	1:24714b45cd1b	287
xorjoep	1:24714b45cd1b	288	/* --------------------
xorjoep	1:24714b45cd1b	289	* Stage2 process
xorjoep	1:24714b45cd1b	290	* -------------------*/
xorjoep	1:24714b45cd1b	291
xorjoep	1:24714b45cd1b	292	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	293	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	294	* srcBLen should be greater than or equal to 4 */
xorjoep	1:24714b45cd1b	295	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	296	{
xorjoep	1:24714b45cd1b	297	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	298	blkCnt = ((uint32_t) blockSize2 >> 2U);
xorjoep	1:24714b45cd1b	299
xorjoep	1:24714b45cd1b	300	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	301	{
xorjoep	1:24714b45cd1b	302	py = py - 1U;
xorjoep	1:24714b45cd1b	303
xorjoep	1:24714b45cd1b	304	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	305	acc0 = 0;
xorjoep	1:24714b45cd1b	306	acc1 = 0;
xorjoep	1:24714b45cd1b	307	acc2 = 0;
xorjoep	1:24714b45cd1b	308	acc3 = 0;
xorjoep	1:24714b45cd1b	309
xorjoep	1:24714b45cd1b	310
xorjoep	1:24714b45cd1b	311	/* read x[0], x[1] samples */
xorjoep	1:24714b45cd1b	312	x0 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	313	/* read x[1], x[2] samples */
xorjoep	1:24714b45cd1b	314	x1 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	315	px+= 2U;
xorjoep	1:24714b45cd1b	316
xorjoep	1:24714b45cd1b	317
xorjoep	1:24714b45cd1b	318	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	319	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	320
xorjoep	1:24714b45cd1b	321	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	322	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	323	do
xorjoep	1:24714b45cd1b	324	{
xorjoep	1:24714b45cd1b	325	/* Read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	326	* y[srcBLen - 1] and y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	327	c0 = *__SIMD32(py)--;
xorjoep	1:24714b45cd1b	328
xorjoep	1:24714b45cd1b	329	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	330	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	331
xorjoep	1:24714b45cd1b	332	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	333	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	334
xorjoep	1:24714b45cd1b	335	/* Read x[2], x[3] */
xorjoep	1:24714b45cd1b	336	x2 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	337
xorjoep	1:24714b45cd1b	338	/* Read x[3], x[4] */
xorjoep	1:24714b45cd1b	339	x3 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	340
xorjoep	1:24714b45cd1b	341	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	342	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	343
xorjoep	1:24714b45cd1b	344	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	345	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	346
xorjoep	1:24714b45cd1b	347	/* Read y[srcBLen - 3] and y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	348	c0 = *__SIMD32(py)--;
xorjoep	1:24714b45cd1b	349
xorjoep	1:24714b45cd1b	350	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	351	acc0 = __SMLADX(x2, c0, acc0);
xorjoep	1:24714b45cd1b	352
xorjoep	1:24714b45cd1b	353	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	354	acc1 = __SMLADX(x3, c0, acc1);
xorjoep	1:24714b45cd1b	355
xorjoep	1:24714b45cd1b	356	/* Read x[4], x[5] */
xorjoep	1:24714b45cd1b	357	x0 = _SIMD32_OFFSET(px+2);
xorjoep	1:24714b45cd1b	358
xorjoep	1:24714b45cd1b	359	/* Read x[5], x[6] */
xorjoep	1:24714b45cd1b	360	x1 = _SIMD32_OFFSET(px+3);
xorjoep	1:24714b45cd1b	361	px += 4U;
xorjoep	1:24714b45cd1b	362
xorjoep	1:24714b45cd1b	363	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	364	acc2 = __SMLADX(x0, c0, acc2);
xorjoep	1:24714b45cd1b	365
xorjoep	1:24714b45cd1b	366	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	367	acc3 = __SMLADX(x1, c0, acc3);
xorjoep	1:24714b45cd1b	368
xorjoep	1:24714b45cd1b	369	} while (--k);
xorjoep	1:24714b45cd1b	370
xorjoep	1:24714b45cd1b	371	/* For the next MAC operations, SIMD is not used.
xorjoep	1:24714b45cd1b	372	* So, the 16 bit pointer of inputB, py, is updated */
xorjoep	1:24714b45cd1b	373
xorjoep	1:24714b45cd1b	374	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	375	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	376	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	377
xorjoep	1:24714b45cd1b	378	if (k == 1U)
xorjoep	1:24714b45cd1b	379	{
xorjoep	1:24714b45cd1b	380	/* Read y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	381	c0 = *(py+1);
xorjoep	1:24714b45cd1b	382	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	383
xorjoep	1:24714b45cd1b	384	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	385
xorjoep	1:24714b45cd1b	386	#else
xorjoep	1:24714b45cd1b	387
xorjoep	1:24714b45cd1b	388	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	389
xorjoep	1:24714b45cd1b	390	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	391
xorjoep	1:24714b45cd1b	392	/* Read x[7] */
xorjoep	1:24714b45cd1b	393	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	394	px++;
xorjoep	1:24714b45cd1b	395
xorjoep	1:24714b45cd1b	396	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	397	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	398	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	399	acc2 = __SMLADX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	400	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	401	}
xorjoep	1:24714b45cd1b	402
xorjoep	1:24714b45cd1b	403	if (k == 2U)
xorjoep	1:24714b45cd1b	404	{
xorjoep	1:24714b45cd1b	405	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	406	c0 = _SIMD32_OFFSET(py);
xorjoep	1:24714b45cd1b	407
xorjoep	1:24714b45cd1b	408	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	409	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	410
xorjoep	1:24714b45cd1b	411	/* Read x[9] */
xorjoep	1:24714b45cd1b	412	x2 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	413	px += 2U;
xorjoep	1:24714b45cd1b	414
xorjoep	1:24714b45cd1b	415	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	416	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	417	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	418	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	419	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	420	}
xorjoep	1:24714b45cd1b	421
xorjoep	1:24714b45cd1b	422	if (k == 3U)
xorjoep	1:24714b45cd1b	423	{
xorjoep	1:24714b45cd1b	424	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	425	c0 = _SIMD32_OFFSET(py);
xorjoep	1:24714b45cd1b	426
xorjoep	1:24714b45cd1b	427	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	428	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	429
xorjoep	1:24714b45cd1b	430	/* Read x[9] */
xorjoep	1:24714b45cd1b	431	x2 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	432
xorjoep	1:24714b45cd1b	433	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	434	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	435	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	436	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	437	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	438
xorjoep	1:24714b45cd1b	439	c0 = *(py-1);
xorjoep	1:24714b45cd1b	440	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	441
xorjoep	1:24714b45cd1b	442	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	443	#else
xorjoep	1:24714b45cd1b	444
xorjoep	1:24714b45cd1b	445	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	446	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	447
xorjoep	1:24714b45cd1b	448	/* Read x[10] */
xorjoep	1:24714b45cd1b	449	x3 = _SIMD32_OFFSET(px+2);
xorjoep	1:24714b45cd1b	450	px += 3U;
xorjoep	1:24714b45cd1b	451
xorjoep	1:24714b45cd1b	452	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	453	acc0 = __SMLADX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	454	acc1 = __SMLAD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	455	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	456	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	457	}
xorjoep	1:24714b45cd1b	458
xorjoep	1:24714b45cd1b	459	/* Store the results in the accumulators in the destination buffer. */
xorjoep	1:24714b45cd1b	460	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	461
xorjoep	1:24714b45cd1b	462	*__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
xorjoep	1:24714b45cd1b	463	*__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
xorjoep	1:24714b45cd1b	464
xorjoep	1:24714b45cd1b	465	#else
xorjoep	1:24714b45cd1b	466
xorjoep	1:24714b45cd1b	467	*__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
xorjoep	1:24714b45cd1b	468	*__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
xorjoep	1:24714b45cd1b	469
xorjoep	1:24714b45cd1b	470	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	471
xorjoep	1:24714b45cd1b	472	/* Increment the pointer pIn1 index, count by 4 */
xorjoep	1:24714b45cd1b	473	count += 4U;
xorjoep	1:24714b45cd1b	474
xorjoep	1:24714b45cd1b	475	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	476	px = pIn1 + count;
xorjoep	1:24714b45cd1b	477	py = pSrc2;
xorjoep	1:24714b45cd1b	478
xorjoep	1:24714b45cd1b	479	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	480	blkCnt--;
xorjoep	1:24714b45cd1b	481	}
xorjoep	1:24714b45cd1b	482
xorjoep	1:24714b45cd1b	483	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	484	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	485	blkCnt = (uint32_t) blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	486
xorjoep	1:24714b45cd1b	487	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	488	{
xorjoep	1:24714b45cd1b	489	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	490	sum = 0;
xorjoep	1:24714b45cd1b	491
xorjoep	1:24714b45cd1b	492	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	493	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	494
xorjoep	1:24714b45cd1b	495	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	496	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	497	while (k > 0U)
xorjoep	1:24714b45cd1b	498	{
xorjoep	1:24714b45cd1b	499	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	500	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	501	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	502	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	503	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	504
xorjoep	1:24714b45cd1b	505	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	506	k--;
xorjoep	1:24714b45cd1b	507	}
xorjoep	1:24714b45cd1b	508
xorjoep	1:24714b45cd1b	509	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	510	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	511	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	512
xorjoep	1:24714b45cd1b	513	while (k > 0U)
xorjoep	1:24714b45cd1b	514	{
xorjoep	1:24714b45cd1b	515	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	516	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	517
xorjoep	1:24714b45cd1b	518	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	519	k--;
xorjoep	1:24714b45cd1b	520	}
xorjoep	1:24714b45cd1b	521
xorjoep	1:24714b45cd1b	522	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	523	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	524
xorjoep	1:24714b45cd1b	525	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	526	count++;
xorjoep	1:24714b45cd1b	527
xorjoep	1:24714b45cd1b	528	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	529	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	530	{
xorjoep	1:24714b45cd1b	531	px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep	1:24714b45cd1b	532	}
xorjoep	1:24714b45cd1b	533	else
xorjoep	1:24714b45cd1b	534	{
xorjoep	1:24714b45cd1b	535	px = pIn1 + count;
xorjoep	1:24714b45cd1b	536	}
xorjoep	1:24714b45cd1b	537	py = pSrc2;
xorjoep	1:24714b45cd1b	538
xorjoep	1:24714b45cd1b	539	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	540	blkCnt--;
xorjoep	1:24714b45cd1b	541	}
xorjoep	1:24714b45cd1b	542	}
xorjoep	1:24714b45cd1b	543	else
xorjoep	1:24714b45cd1b	544	{
xorjoep	1:24714b45cd1b	545	/* If srcBLen is less than 4,
xorjoep	1:24714b45cd1b	546	* the blockSize2 loop cannot be unrolled by 4 */
xorjoep	1:24714b45cd1b	547	blkCnt = (uint32_t) blockSize2;
xorjoep	1:24714b45cd1b	548
xorjoep	1:24714b45cd1b	549	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	550	{
xorjoep	1:24714b45cd1b	551	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	552	sum = 0;
xorjoep	1:24714b45cd1b	553
xorjoep	1:24714b45cd1b	554	/* srcBLen number of MACS should be performed */
xorjoep	1:24714b45cd1b	555	k = srcBLen;
xorjoep	1:24714b45cd1b	556
xorjoep	1:24714b45cd1b	557	while (k > 0U)
xorjoep	1:24714b45cd1b	558	{
xorjoep	1:24714b45cd1b	559	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	560	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	561
xorjoep	1:24714b45cd1b	562	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	563	k--;
xorjoep	1:24714b45cd1b	564	}
xorjoep	1:24714b45cd1b	565
xorjoep	1:24714b45cd1b	566	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	567	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	568
xorjoep	1:24714b45cd1b	569	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	570	count++;
xorjoep	1:24714b45cd1b	571
xorjoep	1:24714b45cd1b	572	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	573	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	574	{
xorjoep	1:24714b45cd1b	575	px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep	1:24714b45cd1b	576	}
xorjoep	1:24714b45cd1b	577	else
xorjoep	1:24714b45cd1b	578	{
xorjoep	1:24714b45cd1b	579	px = pIn1 + count;
xorjoep	1:24714b45cd1b	580	}
xorjoep	1:24714b45cd1b	581	py = pSrc2;
xorjoep	1:24714b45cd1b	582
xorjoep	1:24714b45cd1b	583	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	584	blkCnt--;
xorjoep	1:24714b45cd1b	585	}
xorjoep	1:24714b45cd1b	586	}
xorjoep	1:24714b45cd1b	587
xorjoep	1:24714b45cd1b	588
xorjoep	1:24714b45cd1b	589	/* --------------------------
xorjoep	1:24714b45cd1b	590	* Initializations of stage3
xorjoep	1:24714b45cd1b	591	* -------------------------*/
xorjoep	1:24714b45cd1b	592
xorjoep	1:24714b45cd1b	593	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	594	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep	1:24714b45cd1b	595	* ....
xorjoep	1:24714b45cd1b	596	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep	1:24714b45cd1b	597	* sum += x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	598	*/
xorjoep	1:24714b45cd1b	599
xorjoep	1:24714b45cd1b	600	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	601	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	602	count = srcBLen - 1U;
xorjoep	1:24714b45cd1b	603
xorjoep	1:24714b45cd1b	604	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	605	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	606	px = pSrc1;
xorjoep	1:24714b45cd1b	607
xorjoep	1:24714b45cd1b	608	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	609	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	610	pIn2 = pSrc2 - 1U;
xorjoep	1:24714b45cd1b	611	py = pIn2;
xorjoep	1:24714b45cd1b	612
xorjoep	1:24714b45cd1b	613	/* -------------------
xorjoep	1:24714b45cd1b	614	* Stage3 process
xorjoep	1:24714b45cd1b	615	* ------------------*/
xorjoep	1:24714b45cd1b	616
xorjoep	1:24714b45cd1b	617	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	618	/* First part of this stage computes the MAC operations greater than 4 */
xorjoep	1:24714b45cd1b	619	/* Second part of this stage computes the MAC operations less than or equal to 4 */
xorjoep	1:24714b45cd1b	620
xorjoep	1:24714b45cd1b	621	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	622	j = count >> 2U;
xorjoep	1:24714b45cd1b	623
xorjoep	1:24714b45cd1b	624	while ((j > 0U) && (blockSize3 > 0))
xorjoep	1:24714b45cd1b	625	{
xorjoep	1:24714b45cd1b	626	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	627	sum = 0;
xorjoep	1:24714b45cd1b	628
xorjoep	1:24714b45cd1b	629	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	630	k = count >> 2U;
xorjoep	1:24714b45cd1b	631
xorjoep	1:24714b45cd1b	632	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	633	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	634	while (k > 0U)
xorjoep	1:24714b45cd1b	635	{
xorjoep	1:24714b45cd1b	636	/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
xorjoep	1:24714b45cd1b	637	* with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep	1:24714b45cd1b	638	sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	639	/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
xorjoep	1:24714b45cd1b	640	* with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep	1:24714b45cd1b	641	sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	642
xorjoep	1:24714b45cd1b	643	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	644	k--;
xorjoep	1:24714b45cd1b	645	}
xorjoep	1:24714b45cd1b	646
xorjoep	1:24714b45cd1b	647	/* For the next MAC operations, the pointer py is used without SIMD
xorjoep	1:24714b45cd1b	648	* So, py is incremented by 1 */
xorjoep	1:24714b45cd1b	649	py = py + 1U;
xorjoep	1:24714b45cd1b	650
xorjoep	1:24714b45cd1b	651	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	652	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	653	k = count % 0x4U;
xorjoep	1:24714b45cd1b	654
xorjoep	1:24714b45cd1b	655	while (k > 0U)
xorjoep	1:24714b45cd1b	656	{
xorjoep	1:24714b45cd1b	657	/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	658	sum = __SMLAD(*px++, *py--, sum);
xorjoep	1:24714b45cd1b	659
xorjoep	1:24714b45cd1b	660	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	661	k--;
xorjoep	1:24714b45cd1b	662	}
xorjoep	1:24714b45cd1b	663
xorjoep	1:24714b45cd1b	664	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	665	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	666
xorjoep	1:24714b45cd1b	667	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	668	px = ++pSrc1;
xorjoep	1:24714b45cd1b	669	py = pIn2;
xorjoep	1:24714b45cd1b	670
xorjoep	1:24714b45cd1b	671	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	672	count--;
xorjoep	1:24714b45cd1b	673
xorjoep	1:24714b45cd1b	674	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	675	blockSize3--;
xorjoep	1:24714b45cd1b	676
xorjoep	1:24714b45cd1b	677	j--;
xorjoep	1:24714b45cd1b	678	}
xorjoep	1:24714b45cd1b	679
xorjoep	1:24714b45cd1b	680	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	681	/* SIMD is not used for the next MAC operations,
xorjoep	1:24714b45cd1b	682	* so pointer py is updated to read only one sample at a time */
xorjoep	1:24714b45cd1b	683	py = py + 1U;
xorjoep	1:24714b45cd1b	684
xorjoep	1:24714b45cd1b	685	while (blockSize3 > 0)
xorjoep	1:24714b45cd1b	686	{
xorjoep	1:24714b45cd1b	687	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	688	sum = 0;
xorjoep	1:24714b45cd1b	689
xorjoep	1:24714b45cd1b	690	/* Loop over the remaining MAC operations one at a time; no loop unrolling is used here. */
xorjoep	1:24714b45cd1b	691	k = count;
xorjoep	1:24714b45cd1b	692
xorjoep	1:24714b45cd1b	693	while (k > 0U)
xorjoep	1:24714b45cd1b	694	{
xorjoep	1:24714b45cd1b	695	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	696	/* sum += x[srcALen-1] * y[srcBLen-1] */
xorjoep	1:24714b45cd1b	697	sum = __SMLAD(*px++, *py--, sum);
xorjoep	1:24714b45cd1b	698
xorjoep	1:24714b45cd1b	699	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	700	k--;
xorjoep	1:24714b45cd1b	701	}
xorjoep	1:24714b45cd1b	702
xorjoep	1:24714b45cd1b	703	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	704	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	705
xorjoep	1:24714b45cd1b	706	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	707	px = ++pSrc1;
xorjoep	1:24714b45cd1b	708	py = pSrc2;
xorjoep	1:24714b45cd1b	709
xorjoep	1:24714b45cd1b	710	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	711	count--;
xorjoep	1:24714b45cd1b	712
xorjoep	1:24714b45cd1b	713	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	714	blockSize3--;
xorjoep	1:24714b45cd1b	715	}
xorjoep	1:24714b45cd1b	716
xorjoep	1:24714b45cd1b	717	/* set status as ARM_MATH_SUCCESS */
xorjoep	1:24714b45cd1b	718	status = ARM_MATH_SUCCESS;
xorjoep	1:24714b45cd1b	719	}
xorjoep	1:24714b45cd1b	720
xorjoep	1:24714b45cd1b	721	/* Return to application */
xorjoep	1:24714b45cd1b	722	return (status);
xorjoep	1:24714b45cd1b	723
xorjoep	1:24714b45cd1b	724	#else
xorjoep	1:24714b45cd1b	725
xorjoep	1:24714b45cd1b	726	q15_t *pIn1; /* inputA pointer */
xorjoep	1:24714b45cd1b	727	q15_t *pIn2; /* inputB pointer */
xorjoep	1:24714b45cd1b	728	q15_t *pOut = pDst; /* output pointer */
xorjoep	1:24714b45cd1b	729	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep	1:24714b45cd1b	730	q15_t *px; /* Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	731	q15_t *py; /* Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	732	q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	733	q31_t x0, x1, x2, x3, c0;
xorjoep	1:24714b45cd1b	734	uint32_t j, k, count, check, blkCnt;
xorjoep	1:24714b45cd1b	735	int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
xorjoep	1:24714b45cd1b	736	arm_status status; /* status of Partial convolution */
xorjoep	1:24714b45cd1b	737	q15_t a, b;
xorjoep	1:24714b45cd1b	738
xorjoep	1:24714b45cd1b	739	/* Check for range of output samples to be calculated */
xorjoep	1:24714b45cd1b	740	if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep	1:24714b45cd1b	741	{
xorjoep	1:24714b45cd1b	742	/* Set status as ARM_MATH_ARGUMENT_ERROR */
xorjoep	1:24714b45cd1b	743	status = ARM_MATH_ARGUMENT_ERROR;
xorjoep	1:24714b45cd1b	744	}
xorjoep	1:24714b45cd1b	745	else
xorjoep	1:24714b45cd1b	746	{
xorjoep	1:24714b45cd1b	747
xorjoep	1:24714b45cd1b	748	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	749	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	750	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	751	if (srcALen >=srcBLen)
xorjoep	1:24714b45cd1b	752	{
xorjoep	1:24714b45cd1b	753	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	754	pIn1 = pSrcA;
xorjoep	1:24714b45cd1b	755
xorjoep	1:24714b45cd1b	756	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	757	pIn2 = pSrcB;
xorjoep	1:24714b45cd1b	758	}
xorjoep	1:24714b45cd1b	759	else
xorjoep	1:24714b45cd1b	760	{
xorjoep	1:24714b45cd1b	761	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	762	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	763
xorjoep	1:24714b45cd1b	764	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	765	pIn2 = pSrcA;
xorjoep	1:24714b45cd1b	766
xorjoep	1:24714b45cd1b	767	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	768	j = srcBLen;
xorjoep	1:24714b45cd1b	769	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	770	srcALen = j;
xorjoep	1:24714b45cd1b	771	}
xorjoep	1:24714b45cd1b	772
xorjoep	1:24714b45cd1b	773	/* Conditions to check which loopCounter holds
xorjoep	1:24714b45cd1b	774	* the first and last indices of the output samples to be calculated. */
xorjoep	1:24714b45cd1b	775	check = firstIndex + numPoints;
xorjoep	1:24714b45cd1b	776	blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
xorjoep	1:24714b45cd1b	777	blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
xorjoep	1:24714b45cd1b	778	blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
xorjoep	1:24714b45cd1b	779	blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
xorjoep	1:24714b45cd1b	780	(int32_t) numPoints) : 0;
xorjoep	1:24714b45cd1b	781	blockSize2 = ((int32_t) check - blockSize3) -
xorjoep	1:24714b45cd1b	782	(blockSize1 + (int32_t) firstIndex);
xorjoep	1:24714b45cd1b	783	blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
xorjoep	1:24714b45cd1b	784
xorjoep	1:24714b45cd1b	785	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep	1:24714b45cd1b	786	/* The function is internally
xorjoep	1:24714b45cd1b	787	* divided into three stages according to the number of multiplications that has to be
xorjoep	1:24714b45cd1b	788	* taken place between inputA samples and inputB samples. In the first stage of the
xorjoep	1:24714b45cd1b	789	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	790	* In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	791	* In the third stage of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	792	* for every iteration. */
xorjoep	1:24714b45cd1b	793
xorjoep	1:24714b45cd1b	794	/* Set the output pointer to point to the firstIndex
xorjoep	1:24714b45cd1b	795	* of the output sample to be calculated. */
xorjoep	1:24714b45cd1b	796	pOut = pDst + firstIndex;
xorjoep	1:24714b45cd1b	797
xorjoep	1:24714b45cd1b	798	/* --------------------------
xorjoep	1:24714b45cd1b	799	* Initializations of stage1
xorjoep	1:24714b45cd1b	800	* -------------------------*/
xorjoep	1:24714b45cd1b	801
xorjoep	1:24714b45cd1b	802	/* sum = x[0] * y[0]
xorjoep	1:24714b45cd1b	803	* sum = x[0] * y[1] + x[1] * y[0]
xorjoep	1:24714b45cd1b	804	* ....
xorjoep	1:24714b45cd1b	805	* sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
xorjoep	1:24714b45cd1b	806	*/
xorjoep	1:24714b45cd1b	807
xorjoep	1:24714b45cd1b	808	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	809	The count variable holds the number of MAC operations performed.
xorjoep	1:24714b45cd1b	810	Since the partial convolution starts from firstIndex
xorjoep	1:24714b45cd1b	811	Number of Macs to be performed is firstIndex + 1 */
xorjoep	1:24714b45cd1b	812	count = 1U + firstIndex;
xorjoep	1:24714b45cd1b	813
xorjoep	1:24714b45cd1b	814	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	815	px = pIn1;
xorjoep	1:24714b45cd1b	816
xorjoep	1:24714b45cd1b	817	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	818	pSrc2 = pIn2 + firstIndex;
xorjoep	1:24714b45cd1b	819	py = pSrc2;
xorjoep	1:24714b45cd1b	820
xorjoep	1:24714b45cd1b	821	/* ------------------------
xorjoep	1:24714b45cd1b	822	* Stage1 process
xorjoep	1:24714b45cd1b	823	* ----------------------*/
xorjoep	1:24714b45cd1b	824
xorjoep	1:24714b45cd1b	825	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	826	/* First part of this stage computes the MAC operations less than 4 */
xorjoep	1:24714b45cd1b	827	/* Second part of this stage computes the MAC operations greater than or equal to 4 */
xorjoep	1:24714b45cd1b	828
xorjoep	1:24714b45cd1b	829	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	830	while ((count < 4U) && (blockSize1 > 0))
xorjoep	1:24714b45cd1b	831	{
xorjoep	1:24714b45cd1b	832	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	833	sum = 0;
xorjoep	1:24714b45cd1b	834
xorjoep	1:24714b45cd1b	835	/* Loop over number of MAC operations between
xorjoep	1:24714b45cd1b	836	* inputA samples and inputB samples */
xorjoep	1:24714b45cd1b	837	k = count;
xorjoep	1:24714b45cd1b	838
xorjoep	1:24714b45cd1b	839	while (k > 0U)
xorjoep	1:24714b45cd1b	840	{
xorjoep	1:24714b45cd1b	841	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	842	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	843
xorjoep	1:24714b45cd1b	844	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	845	k--;
xorjoep	1:24714b45cd1b	846	}
xorjoep	1:24714b45cd1b	847
xorjoep	1:24714b45cd1b	848	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	849	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	850
xorjoep	1:24714b45cd1b	851	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	852	py = ++pSrc2;
xorjoep	1:24714b45cd1b	853	px = pIn1;
xorjoep	1:24714b45cd1b	854
xorjoep	1:24714b45cd1b	855	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	856	count++;
xorjoep	1:24714b45cd1b	857
xorjoep	1:24714b45cd1b	858	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	859	blockSize1--;
xorjoep	1:24714b45cd1b	860	}
xorjoep	1:24714b45cd1b	861
xorjoep	1:24714b45cd1b	862	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	863	/* The internal loop, over count, is unrolled by 4 */
xorjoep	1:24714b45cd1b	864	/* To read the last two inputB samples using SIMD (the y[srcBLen] and y[srcBLen-1]
xorjoep	1:24714b45cd1b	865	* coefficients), py is decremented by 1; the py++ at the top of the loop below undoes this */
xorjoep	1:24714b45cd1b	866	py = py - 1;
xorjoep	1:24714b45cd1b	867
xorjoep	1:24714b45cd1b	868	while (blockSize1 > 0)
xorjoep	1:24714b45cd1b	869	{
xorjoep	1:24714b45cd1b	870	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	871	sum = 0;
xorjoep	1:24714b45cd1b	872
xorjoep	1:24714b45cd1b	873	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	874	k = count >> 2U;
xorjoep	1:24714b45cd1b	875
xorjoep	1:24714b45cd1b	876	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	877	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	878	py++;
xorjoep	1:24714b45cd1b	879
xorjoep	1:24714b45cd1b	880	while (k > 0U)
xorjoep	1:24714b45cd1b	881	{
xorjoep	1:24714b45cd1b	882	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	883	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	884	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	885	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	886	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	887
xorjoep	1:24714b45cd1b	888	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	889	k--;
xorjoep	1:24714b45cd1b	890	}
xorjoep	1:24714b45cd1b	891
xorjoep	1:24714b45cd1b	892	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	893	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	894	k = count % 0x4U;
xorjoep	1:24714b45cd1b	895
xorjoep	1:24714b45cd1b	896	while (k > 0U)
xorjoep	1:24714b45cd1b	897	{
xorjoep	1:24714b45cd1b	898	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	899	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	900
xorjoep	1:24714b45cd1b	901	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	902	k--;
xorjoep	1:24714b45cd1b	903	}
xorjoep	1:24714b45cd1b	904
xorjoep	1:24714b45cd1b	905	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	906	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	907
xorjoep	1:24714b45cd1b	908	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	909	py = ++pSrc2 - 1U;
xorjoep	1:24714b45cd1b	910	px = pIn1;
xorjoep	1:24714b45cd1b	911
xorjoep	1:24714b45cd1b	912	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	913	count++;
xorjoep	1:24714b45cd1b	914
xorjoep	1:24714b45cd1b	915	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	916	blockSize1--;
xorjoep	1:24714b45cd1b	917	}
xorjoep	1:24714b45cd1b	918
xorjoep	1:24714b45cd1b	919	/* --------------------------
xorjoep	1:24714b45cd1b	920	* Initializations of stage2
xorjoep	1:24714b45cd1b	921	* ------------------------*/
xorjoep	1:24714b45cd1b	922
xorjoep	1:24714b45cd1b	923	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep	1:24714b45cd1b	924	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep	1:24714b45cd1b	925	* ....
xorjoep	1:24714b45cd1b	926	* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	927	*/
xorjoep	1:24714b45cd1b	928
xorjoep	1:24714b45cd1b	929	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	930	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	931	{
xorjoep	1:24714b45cd1b	932	px = pIn1 + firstIndex - srcBLen + 1;
xorjoep	1:24714b45cd1b	933	}
xorjoep	1:24714b45cd1b	934	else
xorjoep	1:24714b45cd1b	935	{
xorjoep	1:24714b45cd1b	936	px = pIn1;
xorjoep	1:24714b45cd1b	937	}
xorjoep	1:24714b45cd1b	938
xorjoep	1:24714b45cd1b	939	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	940	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	941	py = pSrc2;
xorjoep	1:24714b45cd1b	942
xorjoep	1:24714b45cd1b	943	/* count is the index by which the pointer pIn1 to be incremented */
xorjoep	1:24714b45cd1b	944	count = 0U;
xorjoep	1:24714b45cd1b	945
xorjoep	1:24714b45cd1b	946
xorjoep	1:24714b45cd1b	947	/* --------------------
xorjoep	1:24714b45cd1b	948	* Stage2 process
xorjoep	1:24714b45cd1b	949	* -------------------*/
xorjoep	1:24714b45cd1b	950
xorjoep	1:24714b45cd1b	951	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	952	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	953	* srcBLen should be greater than or equal to 4 */
xorjoep	1:24714b45cd1b	954	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	955	{
xorjoep	1:24714b45cd1b	956	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	957	blkCnt = ((uint32_t) blockSize2 >> 2U);
xorjoep	1:24714b45cd1b	958
xorjoep	1:24714b45cd1b	959	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	960	{
xorjoep	1:24714b45cd1b	961	py = py - 1U;
xorjoep	1:24714b45cd1b	962
xorjoep	1:24714b45cd1b	963	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	964	acc0 = 0;
xorjoep	1:24714b45cd1b	965	acc1 = 0;
xorjoep	1:24714b45cd1b	966	acc2 = 0;
xorjoep	1:24714b45cd1b	967	acc3 = 0;
xorjoep	1:24714b45cd1b	968
xorjoep	1:24714b45cd1b	969	/* read x[0], x[1] samples */
xorjoep	1:24714b45cd1b	970	a = *px++;
xorjoep	1:24714b45cd1b	971	b = *px++;
xorjoep	1:24714b45cd1b	972
xorjoep	1:24714b45cd1b	973	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	974
xorjoep	1:24714b45cd1b	975	x0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	976	a = *px;
xorjoep	1:24714b45cd1b	977	x1 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	978
xorjoep	1:24714b45cd1b	979	#else
xorjoep	1:24714b45cd1b	980
xorjoep	1:24714b45cd1b	981	x0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	982	a = *px;
xorjoep	1:24714b45cd1b	983	x1 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	984
xorjoep	1:24714b45cd1b	985	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	986
xorjoep	1:24714b45cd1b	987	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	988	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	989
xorjoep	1:24714b45cd1b	990	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	991	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	992	do
xorjoep	1:24714b45cd1b	993	{
xorjoep	1:24714b45cd1b	994	/* Read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	995	* y[srcBLen - 1] and y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	996	a = *py;
xorjoep	1:24714b45cd1b	997	b = *(py+1);
xorjoep	1:24714b45cd1b	998	py -= 2;
xorjoep	1:24714b45cd1b	999
xorjoep	1:24714b45cd1b	1000	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1001
xorjoep	1:24714b45cd1b	1002	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1003
xorjoep	1:24714b45cd1b	1004	#else
xorjoep	1:24714b45cd1b	1005
xorjoep	1:24714b45cd1b	1006	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1007
xorjoep	1:24714b45cd1b	1008	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1009
xorjoep	1:24714b45cd1b	1010	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	1011	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1012
xorjoep	1:24714b45cd1b	1013	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	1014	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1015
xorjoep	1:24714b45cd1b	1016	a = *px;
xorjoep	1:24714b45cd1b	1017	b = *(px + 1);
xorjoep	1:24714b45cd1b	1018
xorjoep	1:24714b45cd1b	1019	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1020
xorjoep	1:24714b45cd1b	1021	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1022	a = *(px + 2);
xorjoep	1:24714b45cd1b	1023	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1024
xorjoep	1:24714b45cd1b	1025	#else
xorjoep	1:24714b45cd1b	1026
xorjoep	1:24714b45cd1b	1027	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1028	a = *(px + 2);
xorjoep	1:24714b45cd1b	1029	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1030
xorjoep	1:24714b45cd1b	1031	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1032
xorjoep	1:24714b45cd1b	1033	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	1034	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	1035
xorjoep	1:24714b45cd1b	1036	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	1037	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	1038
xorjoep	1:24714b45cd1b	1039	/* Read y[srcBLen - 3] and y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	1040	a = *py;
xorjoep	1:24714b45cd1b	1041	b = *(py+1);
xorjoep	1:24714b45cd1b	1042	py -= 2;
xorjoep	1:24714b45cd1b	1043
xorjoep	1:24714b45cd1b	1044	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1045
xorjoep	1:24714b45cd1b	1046	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1047
xorjoep	1:24714b45cd1b	1048	#else
xorjoep	1:24714b45cd1b	1049
xorjoep	1:24714b45cd1b	1050	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1051
xorjoep	1:24714b45cd1b	1052	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1053
xorjoep	1:24714b45cd1b	1054	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	1055	acc0 = __SMLADX(x2, c0, acc0);
xorjoep	1:24714b45cd1b	1056
xorjoep	1:24714b45cd1b	1057	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	1058	acc1 = __SMLADX(x3, c0, acc1);
xorjoep	1:24714b45cd1b	1059
xorjoep	1:24714b45cd1b	1060	/* Read x[4], x[5], x[6] */
xorjoep	1:24714b45cd1b	1061	a = *(px + 2);
xorjoep	1:24714b45cd1b	1062	b = *(px + 3);
xorjoep	1:24714b45cd1b	1063
xorjoep	1:24714b45cd1b	1064	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1065
xorjoep	1:24714b45cd1b	1066	x0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1067	a = *(px + 4);
xorjoep	1:24714b45cd1b	1068	x1 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1069
xorjoep	1:24714b45cd1b	1070	#else
xorjoep	1:24714b45cd1b	1071
xorjoep	1:24714b45cd1b	1072	x0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1073	a = *(px + 4);
xorjoep	1:24714b45cd1b	1074	x1 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1075
xorjoep	1:24714b45cd1b	1076	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1077
xorjoep	1:24714b45cd1b	1078	px += 4U;
xorjoep	1:24714b45cd1b	1079
xorjoep	1:24714b45cd1b	1080	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	1081	acc2 = __SMLADX(x0, c0, acc2);
xorjoep	1:24714b45cd1b	1082
xorjoep	1:24714b45cd1b	1083	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	1084	acc3 = __SMLADX(x1, c0, acc3);
xorjoep	1:24714b45cd1b	1085
xorjoep	1:24714b45cd1b	1086	} while (--k);
xorjoep	1:24714b45cd1b	1087
xorjoep	1:24714b45cd1b	1088	/* For the next MAC operations, SIMD is not used.
xorjoep	1:24714b45cd1b	1089	* So, the 16 bit pointer of inputB, py is updated */
xorjoep	1:24714b45cd1b	1090
xorjoep	1:24714b45cd1b	1091	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1092	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1093	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	1094
xorjoep	1:24714b45cd1b	1095	if (k == 1U)
xorjoep	1:24714b45cd1b	1096	{
xorjoep	1:24714b45cd1b	1097	/* Read y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	1098	c0 = *(py+1);
xorjoep	1:24714b45cd1b	1099
xorjoep	1:24714b45cd1b	1100	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1101
xorjoep	1:24714b45cd1b	1102	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	1103
xorjoep	1:24714b45cd1b	1104	#else
xorjoep	1:24714b45cd1b	1105
xorjoep	1:24714b45cd1b	1106	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	1107
xorjoep	1:24714b45cd1b	1108	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1109
xorjoep	1:24714b45cd1b	1110	/* Read x[7] */
xorjoep	1:24714b45cd1b	1111	a = *px;
xorjoep	1:24714b45cd1b	1112	b = *(px+1);
xorjoep	1:24714b45cd1b	1113	px++;
xorjoep	1:24714b45cd1b	1114
xorjoep	1:24714b45cd1b	1115	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1116
xorjoep	1:24714b45cd1b	1117	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1118
xorjoep	1:24714b45cd1b	1119	#else
xorjoep	1:24714b45cd1b	1120
xorjoep	1:24714b45cd1b	1121	x3 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1122
xorjoep	1:24714b45cd1b	1123	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1124
xorjoep	1:24714b45cd1b	1125
xorjoep	1:24714b45cd1b	1126	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1127	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1128	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1129	acc2 = __SMLADX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	1130	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	1131	}
xorjoep	1:24714b45cd1b	1132
xorjoep	1:24714b45cd1b	1133	if (k == 2U)
xorjoep	1:24714b45cd1b	1134	{
xorjoep	1:24714b45cd1b	1135	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	1136	a = *py;
xorjoep	1:24714b45cd1b	1137	b = *(py+1);
xorjoep	1:24714b45cd1b	1138
xorjoep	1:24714b45cd1b	1139	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1140
xorjoep	1:24714b45cd1b	1141	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1142
xorjoep	1:24714b45cd1b	1143	#else
xorjoep	1:24714b45cd1b	1144
xorjoep	1:24714b45cd1b	1145	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1146
xorjoep	1:24714b45cd1b	1147	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1148
xorjoep	1:24714b45cd1b	1149	/* Read x[7], x[8], x[9] */
xorjoep	1:24714b45cd1b	1150	a = *px;
xorjoep	1:24714b45cd1b	1151	b = *(px + 1);
xorjoep	1:24714b45cd1b	1152
xorjoep	1:24714b45cd1b	1153	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1154
xorjoep	1:24714b45cd1b	1155	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1156	a = *(px + 2);
xorjoep	1:24714b45cd1b	1157	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1158
xorjoep	1:24714b45cd1b	1159	#else
xorjoep	1:24714b45cd1b	1160
xorjoep	1:24714b45cd1b	1161	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1162	a = *(px + 2);
xorjoep	1:24714b45cd1b	1163	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1164
xorjoep	1:24714b45cd1b	1165	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1166	px += 2U;
xorjoep	1:24714b45cd1b	1167
xorjoep	1:24714b45cd1b	1168	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1169	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1170	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1171	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	1172	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	1173	}
xorjoep	1:24714b45cd1b	1174
xorjoep	1:24714b45cd1b	1175	if (k == 3U)
xorjoep	1:24714b45cd1b	1176	{
xorjoep	1:24714b45cd1b	1177	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	1178	a = *py;
xorjoep	1:24714b45cd1b	1179	b = *(py+1);
xorjoep	1:24714b45cd1b	1180
xorjoep	1:24714b45cd1b	1181	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1182
xorjoep	1:24714b45cd1b	1183	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1184
xorjoep	1:24714b45cd1b	1185	#else
xorjoep	1:24714b45cd1b	1186
xorjoep	1:24714b45cd1b	1187	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1188
xorjoep	1:24714b45cd1b	1189	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1190
xorjoep	1:24714b45cd1b	1191	/* Read x[7], x[8], x[9] */
xorjoep	1:24714b45cd1b	1192	a = *px;
xorjoep	1:24714b45cd1b	1193	b = *(px + 1);
xorjoep	1:24714b45cd1b	1194
xorjoep	1:24714b45cd1b	1195	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1196
xorjoep	1:24714b45cd1b	1197	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1198	a = *(px + 2);
xorjoep	1:24714b45cd1b	1199	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1200
xorjoep	1:24714b45cd1b	1201	#else
xorjoep	1:24714b45cd1b	1202
xorjoep	1:24714b45cd1b	1203	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1204	a = *(px + 2);
xorjoep	1:24714b45cd1b	1205	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1206
xorjoep	1:24714b45cd1b	1207	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1208
xorjoep	1:24714b45cd1b	1209	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1210	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1211	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1212	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	1213	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	1214
xorjoep	1:24714b45cd1b	1215	/* Read y[srcBLen - 7] */
xorjoep	1:24714b45cd1b	1216	c0 = *(py-1);
xorjoep	1:24714b45cd1b	1217	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1218
xorjoep	1:24714b45cd1b	1219	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	1220	#else
xorjoep	1:24714b45cd1b	1221
xorjoep	1:24714b45cd1b	1222	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	1223	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1224
xorjoep	1:24714b45cd1b	1225	/* Read x[10] */
xorjoep	1:24714b45cd1b	1226	a = *(px+2);
xorjoep	1:24714b45cd1b	1227	b = *(px+3);
xorjoep	1:24714b45cd1b	1228
xorjoep	1:24714b45cd1b	1229	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1230
xorjoep	1:24714b45cd1b	1231	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1232
xorjoep	1:24714b45cd1b	1233	#else
xorjoep	1:24714b45cd1b	1234
xorjoep	1:24714b45cd1b	1235	x3 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1236
xorjoep	1:24714b45cd1b	1237	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1238
xorjoep	1:24714b45cd1b	1239	px += 3U;
xorjoep	1:24714b45cd1b	1240
xorjoep	1:24714b45cd1b	1241	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1242	acc0 = __SMLADX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	1243	acc1 = __SMLAD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	1244	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	1245	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	1246	}
xorjoep	1:24714b45cd1b	1247
xorjoep	1:24714b45cd1b	1248	/* Store the results in the accumulators in the destination buffer. */
xorjoep	1:24714b45cd1b	1249	*pOut++ = (q15_t)(acc0 >> 15);
xorjoep	1:24714b45cd1b	1250	*pOut++ = (q15_t)(acc1 >> 15);
xorjoep	1:24714b45cd1b	1251	*pOut++ = (q15_t)(acc2 >> 15);
xorjoep	1:24714b45cd1b	1252	*pOut++ = (q15_t)(acc3 >> 15);
xorjoep	1:24714b45cd1b	1253
xorjoep	1:24714b45cd1b	1254	/* Advance count, the offset into pIn1, by 4 (one per output sample just stored) */
xorjoep	1:24714b45cd1b	1255	count += 4U;
xorjoep	1:24714b45cd1b	1256
xorjoep	1:24714b45cd1b	1257	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1258	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1259	py = pSrc2;
xorjoep	1:24714b45cd1b	1260
xorjoep	1:24714b45cd1b	1261	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1262	blkCnt--;
xorjoep	1:24714b45cd1b	1263	}
xorjoep	1:24714b45cd1b	1264
xorjoep	1:24714b45cd1b	1265	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	1266	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1267	blkCnt = (uint32_t) blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	1268
xorjoep	1:24714b45cd1b	1269	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	1270	{
xorjoep	1:24714b45cd1b	1271	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1272	sum = 0;
xorjoep	1:24714b45cd1b	1273
xorjoep	1:24714b45cd1b	1274	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	1275	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	1276
xorjoep	1:24714b45cd1b	1277	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	1278	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	1279	while (k > 0U)
xorjoep	1:24714b45cd1b	1280	{
xorjoep	1:24714b45cd1b	1281	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1282	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1283	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1284	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1285	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1286
xorjoep	1:24714b45cd1b	1287	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1288	k--;
xorjoep	1:24714b45cd1b	1289	}
xorjoep	1:24714b45cd1b	1290
xorjoep	1:24714b45cd1b	1291	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1292	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1293	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	1294
xorjoep	1:24714b45cd1b	1295	while (k > 0U)
xorjoep	1:24714b45cd1b	1296	{
xorjoep	1:24714b45cd1b	1297	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1298	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1299
xorjoep	1:24714b45cd1b	1300	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1301	k--;
xorjoep	1:24714b45cd1b	1302	}
xorjoep	1:24714b45cd1b	1303
xorjoep	1:24714b45cd1b	1304	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1305	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1306
xorjoep	1:24714b45cd1b	1307	/* Advance count, the offset into pIn1, by 1 */
xorjoep	1:24714b45cd1b	1308	count++;
xorjoep	1:24714b45cd1b	1309
xorjoep	1:24714b45cd1b	1310	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1311	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1312	py = pSrc2;
xorjoep	1:24714b45cd1b	1313
xorjoep	1:24714b45cd1b	1314	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1315	blkCnt--;
xorjoep	1:24714b45cd1b	1316	}
xorjoep	1:24714b45cd1b	1317	}
xorjoep	1:24714b45cd1b	1318	else
xorjoep	1:24714b45cd1b	1319	{
xorjoep	1:24714b45cd1b	1320	/* If the srcBLen is not a multiple of 4,
xorjoep	1:24714b45cd1b	1321	* the blockSize2 loop cannot be unrolled by 4 */
xorjoep	1:24714b45cd1b	1322	blkCnt = (uint32_t) blockSize2;
xorjoep	1:24714b45cd1b	1323
xorjoep	1:24714b45cd1b	1324	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	1325	{
xorjoep	1:24714b45cd1b	1326	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1327	sum = 0;
xorjoep	1:24714b45cd1b	1328
xorjoep	1:24714b45cd1b	1329	/* srcBLen number of MACS should be performed */
xorjoep	1:24714b45cd1b	1330	k = srcBLen;
xorjoep	1:24714b45cd1b	1331
xorjoep	1:24714b45cd1b	1332	while (k > 0U)
xorjoep	1:24714b45cd1b	1333	{
xorjoep	1:24714b45cd1b	1334	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	1335	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1336
xorjoep	1:24714b45cd1b	1337	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1338	k--;
xorjoep	1:24714b45cd1b	1339	}
xorjoep	1:24714b45cd1b	1340
xorjoep	1:24714b45cd1b	1341	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1342	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1343
xorjoep	1:24714b45cd1b	1344	/* Advance count, the offset into pIn1, by 1 */
xorjoep	1:24714b45cd1b	1345	count++;
xorjoep	1:24714b45cd1b	1346
xorjoep	1:24714b45cd1b	1347	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1348	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1349	py = pSrc2;
xorjoep	1:24714b45cd1b	1350
xorjoep	1:24714b45cd1b	1351	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1352	blkCnt--;
xorjoep	1:24714b45cd1b	1353	}
xorjoep	1:24714b45cd1b	1354	}
xorjoep	1:24714b45cd1b	1355
xorjoep	1:24714b45cd1b	1356
xorjoep	1:24714b45cd1b	1357	/* --------------------------
xorjoep	1:24714b45cd1b	1358	* Initializations of stage3
xorjoep	1:24714b45cd1b	1359	* -------------------------*/
xorjoep	1:24714b45cd1b	1360
xorjoep	1:24714b45cd1b	1361	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	1362	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep	1:24714b45cd1b	1363	* ....
xorjoep	1:24714b45cd1b	1364	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep	1:24714b45cd1b	1365	* sum += x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	1366	*/
xorjoep	1:24714b45cd1b	1367
xorjoep	1:24714b45cd1b	1368	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	1369	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	1370	count = srcBLen - 1U;
xorjoep	1:24714b45cd1b	1371
xorjoep	1:24714b45cd1b	1372	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	1373	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	1374	px = pSrc1;
xorjoep	1:24714b45cd1b	1375
xorjoep	1:24714b45cd1b	1376	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	1377	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	1378	pIn2 = pSrc2 - 1U;
xorjoep	1:24714b45cd1b	1379	py = pIn2;
xorjoep	1:24714b45cd1b	1380
xorjoep	1:24714b45cd1b	1381	/* -------------------
xorjoep	1:24714b45cd1b	1382	* Stage3 process
xorjoep	1:24714b45cd1b	1383	* ------------------*/
xorjoep	1:24714b45cd1b	1384
xorjoep	1:24714b45cd1b	1385	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	1386	/* First part of this stage computes the MAC operations greater than 4 */
xorjoep	1:24714b45cd1b	1387	/* Second part of this stage computes the MAC operations less than or equal to 4 */
xorjoep	1:24714b45cd1b	1388
xorjoep	1:24714b45cd1b	1389	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	1390	j = count >> 2U;
xorjoep	1:24714b45cd1b	1391
xorjoep	1:24714b45cd1b	1392	while ((j > 0U) && (blockSize3 > 0))
xorjoep	1:24714b45cd1b	1393	{
xorjoep	1:24714b45cd1b	1394	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1395	sum = 0;
xorjoep	1:24714b45cd1b	1396
xorjoep	1:24714b45cd1b	1397	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	1398	k = count >> 2U;
xorjoep	1:24714b45cd1b	1399
xorjoep	1:24714b45cd1b	1400	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	1401	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	1402	py++;
xorjoep	1:24714b45cd1b	1403
xorjoep	1:24714b45cd1b	1404	while (k > 0U)
xorjoep	1:24714b45cd1b	1405	{
xorjoep	1:24714b45cd1b	1406	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1407	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1408	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1409	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1410	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1411	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1412	k--;
xorjoep	1:24714b45cd1b	1413	}
xorjoep	1:24714b45cd1b	1414
xorjoep	1:24714b45cd1b	1415
xorjoep	1:24714b45cd1b	1416	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1417	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1418	k = count % 0x4U;
xorjoep	1:24714b45cd1b	1419
xorjoep	1:24714b45cd1b	1420	while (k > 0U)
xorjoep	1:24714b45cd1b	1421	{
xorjoep	1:24714b45cd1b	1422	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1423	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1424
xorjoep	1:24714b45cd1b	1425	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1426	k--;
xorjoep	1:24714b45cd1b	1427	}
xorjoep	1:24714b45cd1b	1428
xorjoep	1:24714b45cd1b	1429	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1430	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1431
xorjoep	1:24714b45cd1b	1432	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1433	px = ++pSrc1;
xorjoep	1:24714b45cd1b	1434	py = pIn2;
xorjoep	1:24714b45cd1b	1435
xorjoep	1:24714b45cd1b	1436	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	1437	count--;
xorjoep	1:24714b45cd1b	1438
xorjoep	1:24714b45cd1b	1439	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1440	blockSize3--;
xorjoep	1:24714b45cd1b	1441
xorjoep	1:24714b45cd1b	1442	j--;
xorjoep	1:24714b45cd1b	1443	}
xorjoep	1:24714b45cd1b	1444
xorjoep	1:24714b45cd1b	1445	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	1446	/* SIMD is not used for the next MAC operations,
xorjoep	1:24714b45cd1b	1447	* so pointer py is updated to read only one sample at a time */
xorjoep	1:24714b45cd1b	1448	py = py + 1U;
xorjoep	1:24714b45cd1b	1449
xorjoep	1:24714b45cd1b	1450	while (blockSize3 > 0)
xorjoep	1:24714b45cd1b	1451	{
xorjoep	1:24714b45cd1b	1452	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1453	sum = 0;
xorjoep	1:24714b45cd1b	1454
xorjoep	1:24714b45cd1b	1455	/* No loop unrolling here: count MACs are performed one at a time */
xorjoep	1:24714b45cd1b	1456	k = count;
xorjoep	1:24714b45cd1b	1457
xorjoep	1:24714b45cd1b	1458	while (k > 0U)
xorjoep	1:24714b45cd1b	1459	{
xorjoep	1:24714b45cd1b	1460	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1461	/* sum += x[srcALen-1] * y[srcBLen-1] */
xorjoep	1:24714b45cd1b	1462	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1463
xorjoep	1:24714b45cd1b	1464	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1465	k--;
xorjoep	1:24714b45cd1b	1466	}
xorjoep	1:24714b45cd1b	1467
xorjoep	1:24714b45cd1b	1468	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1469	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1470
xorjoep	1:24714b45cd1b	1471	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1472	px = ++pSrc1;
xorjoep	1:24714b45cd1b	1473	py = pSrc2;
xorjoep	1:24714b45cd1b	1474
xorjoep	1:24714b45cd1b	1475	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	1476	count--;
xorjoep	1:24714b45cd1b	1477
xorjoep	1:24714b45cd1b	1478	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1479	blockSize3--;
xorjoep	1:24714b45cd1b	1480	}
xorjoep	1:24714b45cd1b	1481
xorjoep	1:24714b45cd1b	1482	/* set status as ARM_MATH_SUCCESS */
xorjoep	1:24714b45cd1b	1483	status = ARM_MATH_SUCCESS;
xorjoep	1:24714b45cd1b	1484	}
xorjoep	1:24714b45cd1b	1485
xorjoep	1:24714b45cd1b	1486	/* Return to application */
xorjoep	1:24714b45cd1b	1487	return (status);
xorjoep	1:24714b45cd1b	1488
xorjoep	1:24714b45cd1b	1489	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
xorjoep	1:24714b45cd1b	1490	}
xorjoep	1:24714b45cd1b	1491
xorjoep	1:24714b45cd1b	1492	/**
xorjoep	1:24714b45cd1b	1493	* @} end of PartialConv group
xorjoep	1:24714b45cd1b	1494	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/FilteringFunctions/arm_conv_partial_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning