CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/FilteringFunctions/arm_conv_partial_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_conv_partial_q15.c
xorjoep	1:24714b45cd1b	4	* Description: Partial convolution of Q15 sequences
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupFilters
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup PartialConv
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40	/**
xorjoep	1:24714b45cd1b	41	* @brief Partial convolution of Q15 sequences.
xorjoep	1:24714b45cd1b	42	* @param[in] *pSrcA points to the first input sequence.
xorjoep	1:24714b45cd1b	43	* @param[in] srcALen length of the first input sequence.
xorjoep	1:24714b45cd1b	44	* @param[in] *pSrcB points to the second input sequence.
xorjoep	1:24714b45cd1b	45	* @param[in] srcBLen length of the second input sequence.
xorjoep	1:24714b45cd1b	46	* @param[out] *pDst points to the location where the output result is written.
xorjoep	1:24714b45cd1b	47	* @param[in] firstIndex is the first output sample to start with.
xorjoep	1:24714b45cd1b	48	* @param[in] numPoints is the number of output points to be computed.
xorjoep	1:24714b45cd1b	49	* @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
xorjoep	1:24714b45cd1b	50	*
xorjoep	1:24714b45cd1b	51	* Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
xorjoep	1:24714b45cd1b	52	*
xorjoep	1:24714b45cd1b	53	* \par
xorjoep	1:24714b45cd1b	54	* Refer to the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
xorjoep	1:24714b45cd1b	55	*
xorjoep	1:24714b45cd1b	56	*/
xorjoep	1:24714b45cd1b	57
xorjoep	1:24714b45cd1b	58	arm_status arm_conv_partial_q15(
xorjoep	1:24714b45cd1b	59	q15_t * pSrcA,
xorjoep	1:24714b45cd1b	60	uint32_t srcALen,
xorjoep	1:24714b45cd1b	61	q15_t * pSrcB,
xorjoep	1:24714b45cd1b	62	uint32_t srcBLen,
xorjoep	1:24714b45cd1b	63	q15_t * pDst,
xorjoep	1:24714b45cd1b	64	uint32_t firstIndex,
xorjoep	1:24714b45cd1b	65	uint32_t numPoints)
xorjoep	1:24714b45cd1b	66	{
xorjoep	1:24714b45cd1b	67
xorjoep	1:24714b45cd1b	68
xorjoep	1:24714b45cd1b	69	#if (defined(ARM_MATH_CM7) \|\| defined(ARM_MATH_CM4) \|\| defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
xorjoep	1:24714b45cd1b	70
xorjoep	1:24714b45cd1b	71	/* Run the below code for Cortex-M7, Cortex-M4 and Cortex-M3 when unaligned access is supported */
xorjoep	1:24714b45cd1b	72
xorjoep	1:24714b45cd1b	73	q15_t pIn1; / inputA pointer */
xorjoep	1:24714b45cd1b	74	q15_t pIn2; / inputB pointer */
xorjoep	1:24714b45cd1b	75	q15_t pOut = pDst; / output pointer */
xorjoep	1:24714b45cd1b	76	q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep	1:24714b45cd1b	77	q15_t px; / Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	78	q15_t py; / Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	79	q15_t pSrc1, pSrc2; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	80	q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
xorjoep	1:24714b45cd1b	81	uint32_t j, k, count, check, blkCnt;
xorjoep	1:24714b45cd1b	82	int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
xorjoep	1:24714b45cd1b	83	arm_status status; /* status of Partial convolution */
xorjoep	1:24714b45cd1b	84
xorjoep	1:24714b45cd1b	85	/* Check for range of output samples to be calculated */
xorjoep	1:24714b45cd1b	86	if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep	1:24714b45cd1b	87	{
xorjoep	1:24714b45cd1b	88	/* Set status as ARM_MATH_ARGUMENT_ERROR */
xorjoep	1:24714b45cd1b	89	status = ARM_MATH_ARGUMENT_ERROR;
xorjoep	1:24714b45cd1b	90	}
xorjoep	1:24714b45cd1b	91	else
xorjoep	1:24714b45cd1b	92	{
xorjoep	1:24714b45cd1b	93
xorjoep	1:24714b45cd1b	94	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	95	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	96	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	97	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	98	{
xorjoep	1:24714b45cd1b	99	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	100	pIn1 = pSrcA;
xorjoep	1:24714b45cd1b	101
xorjoep	1:24714b45cd1b	102	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	103	pIn2 = pSrcB;
xorjoep	1:24714b45cd1b	104	}
xorjoep	1:24714b45cd1b	105	else
xorjoep	1:24714b45cd1b	106	{
xorjoep	1:24714b45cd1b	107	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	108	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	109
xorjoep	1:24714b45cd1b	110	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	111	pIn2 = pSrcA;
xorjoep	1:24714b45cd1b	112
xorjoep	1:24714b45cd1b	113	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	114	j = srcBLen;
xorjoep	1:24714b45cd1b	115	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	116	srcALen = j;
xorjoep	1:24714b45cd1b	117	}
xorjoep	1:24714b45cd1b	118
xorjoep	1:24714b45cd1b	119	/* Conditions to check which loopCounter holds
xorjoep	1:24714b45cd1b	120	* the first and last indices of the output samples to be calculated. */
xorjoep	1:24714b45cd1b	121	check = firstIndex + numPoints;
xorjoep	1:24714b45cd1b	122	blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
xorjoep	1:24714b45cd1b	123	blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
xorjoep	1:24714b45cd1b	124	blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
xorjoep	1:24714b45cd1b	125	blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
xorjoep	1:24714b45cd1b	126	(int32_t) numPoints) : 0;
xorjoep	1:24714b45cd1b	127	blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
xorjoep	1:24714b45cd1b	128	(int32_t) firstIndex);
xorjoep	1:24714b45cd1b	129	blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
xorjoep	1:24714b45cd1b	130
xorjoep	1:24714b45cd1b	131	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep	1:24714b45cd1b	132	/* The function is internally
xorjoep	1:24714b45cd1b	133	* divided into three stages according to the number of multiplications that have to
xorjoep	1:24714b45cd1b	134	* take place between inputA samples and inputB samples. In the first stage of the
xorjoep	1:24714b45cd1b	135	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	136	* In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	137	* In the third stage of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	138	* for every iteration. */
xorjoep	1:24714b45cd1b	139
xorjoep	1:24714b45cd1b	140	/* Set the output pointer to point to the firstIndex
xorjoep	1:24714b45cd1b	141	* of the output sample to be calculated. */
xorjoep	1:24714b45cd1b	142	pOut = pDst + firstIndex;
xorjoep	1:24714b45cd1b	143
xorjoep	1:24714b45cd1b	144	/* --------------------------
xorjoep	1:24714b45cd1b	145	* Initializations of stage1
xorjoep	1:24714b45cd1b	146	* -------------------------*/
xorjoep	1:24714b45cd1b	147
xorjoep	1:24714b45cd1b	148	/* sum = x[0] * y[0]
xorjoep	1:24714b45cd1b	149	* sum = x[0] * y[1] + x[1] * y[0]
xorjoep	1:24714b45cd1b	150	* ....
xorjoep	1:24714b45cd1b	151	* sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep	1:24714b45cd1b	152	*/
xorjoep	1:24714b45cd1b	153
xorjoep	1:24714b45cd1b	154	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	155	The count variable holds the number of MAC operations performed.
xorjoep	1:24714b45cd1b	156	Since the partial convolution starts from firstIndex,
xorjoep	1:24714b45cd1b	157	the number of MACs to be performed is firstIndex + 1. */
xorjoep	1:24714b45cd1b	158	count = 1U + firstIndex;
xorjoep	1:24714b45cd1b	159
xorjoep	1:24714b45cd1b	160	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	161	px = pIn1;
xorjoep	1:24714b45cd1b	162
xorjoep	1:24714b45cd1b	163	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	164	pSrc2 = pIn2 + firstIndex;
xorjoep	1:24714b45cd1b	165	py = pSrc2;
xorjoep	1:24714b45cd1b	166
xorjoep	1:24714b45cd1b	167	/* ------------------------
xorjoep	1:24714b45cd1b	168	* Stage1 process
xorjoep	1:24714b45cd1b	169	* ----------------------*/
xorjoep	1:24714b45cd1b	170
xorjoep	1:24714b45cd1b	171	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	172	/* First part of this stage computes the MAC operations less than 4 */
xorjoep	1:24714b45cd1b	173	/* Second part of this stage computes the MAC operations greater than or equal to 4 */
xorjoep	1:24714b45cd1b	174
xorjoep	1:24714b45cd1b	175	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	176	while ((count < 4U) && (blockSize1 > 0))
xorjoep	1:24714b45cd1b	177	{
xorjoep	1:24714b45cd1b	178	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	179	sum = 0;
xorjoep	1:24714b45cd1b	180
xorjoep	1:24714b45cd1b	181	/* Loop over number of MAC operations between
xorjoep	1:24714b45cd1b	182	* inputA samples and inputB samples */
xorjoep	1:24714b45cd1b	183	k = count;
xorjoep	1:24714b45cd1b	184
xorjoep	1:24714b45cd1b	185	while (k > 0U)
xorjoep	1:24714b45cd1b	186	{
xorjoep	1:24714b45cd1b	187	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	188	sum = __SMLALD(px++, py--, sum);
xorjoep	1:24714b45cd1b	189
xorjoep	1:24714b45cd1b	190	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	191	k--;
xorjoep	1:24714b45cd1b	192	}
xorjoep	1:24714b45cd1b	193
xorjoep	1:24714b45cd1b	194	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	195	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep	1:24714b45cd1b	196
xorjoep	1:24714b45cd1b	197	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	198	py = ++pSrc2;
xorjoep	1:24714b45cd1b	199	px = pIn1;
xorjoep	1:24714b45cd1b	200
xorjoep	1:24714b45cd1b	201	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	202	count++;
xorjoep	1:24714b45cd1b	203
xorjoep	1:24714b45cd1b	204	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	205	blockSize1--;
xorjoep	1:24714b45cd1b	206	}
xorjoep	1:24714b45cd1b	207
xorjoep	1:24714b45cd1b	208	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	209	/* The internal loop, over count, is unrolled by 4 */
xorjoep	1:24714b45cd1b	210	/* To read two adjacent inputB samples with a single SIMD access
xorjoep	1:24714b45cd1b	211	* (y[count - 1] and y[count - 2]), py is decremented by 1 */
xorjoep	1:24714b45cd1b	212	py = py - 1;
xorjoep	1:24714b45cd1b	213
xorjoep	1:24714b45cd1b	214	while (blockSize1 > 0)
xorjoep	1:24714b45cd1b	215	{
xorjoep	1:24714b45cd1b	216	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	217	sum = 0;
xorjoep	1:24714b45cd1b	218
xorjoep	1:24714b45cd1b	219	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	220	k = count >> 2U;
xorjoep	1:24714b45cd1b	221
xorjoep	1:24714b45cd1b	222	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	223	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	224	while (k > 0U)
xorjoep	1:24714b45cd1b	225	{
xorjoep	1:24714b45cd1b	226	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	227	/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep	1:24714b45cd1b	228	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	229	/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep	1:24714b45cd1b	230	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	231
xorjoep	1:24714b45cd1b	232	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	233	k--;
xorjoep	1:24714b45cd1b	234	}
xorjoep	1:24714b45cd1b	235
xorjoep	1:24714b45cd1b	236	/* For the next MAC operations, the pointer py is used without SIMD
xorjoep	1:24714b45cd1b	237	* So, py is incremented by 1 */
xorjoep	1:24714b45cd1b	238	py = py + 1U;
xorjoep	1:24714b45cd1b	239
xorjoep	1:24714b45cd1b	240	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	241	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	242	k = count % 0x4U;
xorjoep	1:24714b45cd1b	243
xorjoep	1:24714b45cd1b	244	while (k > 0U)
xorjoep	1:24714b45cd1b	245	{
xorjoep	1:24714b45cd1b	246	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	247	sum = __SMLALD(px++, py--, sum);
xorjoep	1:24714b45cd1b	248
xorjoep	1:24714b45cd1b	249	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	250	k--;
xorjoep	1:24714b45cd1b	251	}
xorjoep	1:24714b45cd1b	252
xorjoep	1:24714b45cd1b	253	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	254	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep	1:24714b45cd1b	255
xorjoep	1:24714b45cd1b	256	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	257	py = ++pSrc2 - 1U;
xorjoep	1:24714b45cd1b	258	px = pIn1;
xorjoep	1:24714b45cd1b	259
xorjoep	1:24714b45cd1b	260	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	261	count++;
xorjoep	1:24714b45cd1b	262
xorjoep	1:24714b45cd1b	263	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	264	blockSize1--;
xorjoep	1:24714b45cd1b	265	}
xorjoep	1:24714b45cd1b	266
xorjoep	1:24714b45cd1b	267	/* --------------------------
xorjoep	1:24714b45cd1b	268	* Initializations of stage2
xorjoep	1:24714b45cd1b	269	* ------------------------*/
xorjoep	1:24714b45cd1b	270
xorjoep	1:24714b45cd1b	271	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep	1:24714b45cd1b	272	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep	1:24714b45cd1b	273	* ....
xorjoep	1:24714b45cd1b	274	* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	275	*/
xorjoep	1:24714b45cd1b	276
xorjoep	1:24714b45cd1b	277	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	278	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	279	{
xorjoep	1:24714b45cd1b	280	px = pIn1 + firstIndex - srcBLen + 1;
xorjoep	1:24714b45cd1b	281	}
xorjoep	1:24714b45cd1b	282	else
xorjoep	1:24714b45cd1b	283	{
xorjoep	1:24714b45cd1b	284	px = pIn1;
xorjoep	1:24714b45cd1b	285	}
xorjoep	1:24714b45cd1b	286
xorjoep	1:24714b45cd1b	287	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	288	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	289	py = pSrc2;
xorjoep	1:24714b45cd1b	290
xorjoep	1:24714b45cd1b	291	/* count is the index by which the pointer pIn1 is to be incremented */
xorjoep	1:24714b45cd1b	292	count = 0U;
xorjoep	1:24714b45cd1b	293
xorjoep	1:24714b45cd1b	294
xorjoep	1:24714b45cd1b	295	/* --------------------
xorjoep	1:24714b45cd1b	296	* Stage2 process
xorjoep	1:24714b45cd1b	297	* -------------------*/
xorjoep	1:24714b45cd1b	298
xorjoep	1:24714b45cd1b	299	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	300	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	301	* srcBLen should be greater than or equal to 4 */
xorjoep	1:24714b45cd1b	302	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	303	{
xorjoep	1:24714b45cd1b	304	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	305	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	306
xorjoep	1:24714b45cd1b	307	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	308	{
xorjoep	1:24714b45cd1b	309	py = py - 1U;
xorjoep	1:24714b45cd1b	310
xorjoep	1:24714b45cd1b	311	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	312	acc0 = 0;
xorjoep	1:24714b45cd1b	313	acc1 = 0;
xorjoep	1:24714b45cd1b	314	acc2 = 0;
xorjoep	1:24714b45cd1b	315	acc3 = 0;
xorjoep	1:24714b45cd1b	316
xorjoep	1:24714b45cd1b	317
xorjoep	1:24714b45cd1b	318	/* read x[0], x[1] samples */
xorjoep	1:24714b45cd1b	319	x0 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	320	/* read x[1], x[2] samples */
xorjoep	1:24714b45cd1b	321	x1 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	322	px+= 2U;
xorjoep	1:24714b45cd1b	323
xorjoep	1:24714b45cd1b	324
xorjoep	1:24714b45cd1b	325	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	326	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	327
xorjoep	1:24714b45cd1b	328	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	329	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	330	do
xorjoep	1:24714b45cd1b	331	{
xorjoep	1:24714b45cd1b	332	/* Read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	333	* y[srcBLen - 1] and y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	334	c0 = *__SIMD32(py)--;
xorjoep	1:24714b45cd1b	335
xorjoep	1:24714b45cd1b	336	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	337	acc0 = __SMLALDX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	338
xorjoep	1:24714b45cd1b	339	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	340	acc1 = __SMLALDX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	341
xorjoep	1:24714b45cd1b	342	/* Read x[2], x[3] */
xorjoep	1:24714b45cd1b	343	x2 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	344
xorjoep	1:24714b45cd1b	345	/* Read x[3], x[4] */
xorjoep	1:24714b45cd1b	346	x3 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	347
xorjoep	1:24714b45cd1b	348	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	349	acc2 = __SMLALDX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	350
xorjoep	1:24714b45cd1b	351	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	352	acc3 = __SMLALDX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	353
xorjoep	1:24714b45cd1b	354	/* Read y[srcBLen - 3] and y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	355	c0 = *__SIMD32(py)--;
xorjoep	1:24714b45cd1b	356
xorjoep	1:24714b45cd1b	357	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	358	acc0 = __SMLALDX(x2, c0, acc0);
xorjoep	1:24714b45cd1b	359
xorjoep	1:24714b45cd1b	360	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	361	acc1 = __SMLALDX(x3, c0, acc1);
xorjoep	1:24714b45cd1b	362
xorjoep	1:24714b45cd1b	363	/* Read x[4], x[5] */
xorjoep	1:24714b45cd1b	364	x0 = _SIMD32_OFFSET(px+2);
xorjoep	1:24714b45cd1b	365
xorjoep	1:24714b45cd1b	366	/* Read x[5], x[6] */
xorjoep	1:24714b45cd1b	367	x1 = _SIMD32_OFFSET(px+3);
xorjoep	1:24714b45cd1b	368	px += 4U;
xorjoep	1:24714b45cd1b	369
xorjoep	1:24714b45cd1b	370	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	371	acc2 = __SMLALDX(x0, c0, acc2);
xorjoep	1:24714b45cd1b	372
xorjoep	1:24714b45cd1b	373	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	374	acc3 = __SMLALDX(x1, c0, acc3);
xorjoep	1:24714b45cd1b	375
xorjoep	1:24714b45cd1b	376	} while (--k);
xorjoep	1:24714b45cd1b	377
xorjoep	1:24714b45cd1b	378	/* For the next MAC operations, SIMD is not used
xorjoep	1:24714b45cd1b	379	* So, the 16 bit pointer of inputB, py is updated */
xorjoep	1:24714b45cd1b	380
xorjoep	1:24714b45cd1b	381	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	382	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	383	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	384
xorjoep	1:24714b45cd1b	385	if (k == 1U)
xorjoep	1:24714b45cd1b	386	{
xorjoep	1:24714b45cd1b	387	/* Read y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	388	c0 = *(py+1);
xorjoep	1:24714b45cd1b	389
xorjoep	1:24714b45cd1b	390	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	391
xorjoep	1:24714b45cd1b	392	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	393
xorjoep	1:24714b45cd1b	394	#else
xorjoep	1:24714b45cd1b	395
xorjoep	1:24714b45cd1b	396	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	397
xorjoep	1:24714b45cd1b	398	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	399
xorjoep	1:24714b45cd1b	400	/* Read x[7] */
xorjoep	1:24714b45cd1b	401	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	402	px++;
xorjoep	1:24714b45cd1b	403
xorjoep	1:24714b45cd1b	404	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	405	acc0 = __SMLALD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	406	acc1 = __SMLALD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	407	acc2 = __SMLALDX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	408	acc3 = __SMLALDX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	409	}
xorjoep	1:24714b45cd1b	410
xorjoep	1:24714b45cd1b	411	if (k == 2U)
xorjoep	1:24714b45cd1b	412	{
xorjoep	1:24714b45cd1b	413	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	414	c0 = _SIMD32_OFFSET(py);
xorjoep	1:24714b45cd1b	415
xorjoep	1:24714b45cd1b	416	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	417	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	418
xorjoep	1:24714b45cd1b	419	/* Read x[9] */
xorjoep	1:24714b45cd1b	420	x2 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	421	px += 2U;
xorjoep	1:24714b45cd1b	422
xorjoep	1:24714b45cd1b	423	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	424	acc0 = __SMLALDX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	425	acc1 = __SMLALDX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	426	acc2 = __SMLALDX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	427	acc3 = __SMLALDX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	428	}
xorjoep	1:24714b45cd1b	429
xorjoep	1:24714b45cd1b	430	if (k == 3U)
xorjoep	1:24714b45cd1b	431	{
xorjoep	1:24714b45cd1b	432	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	433	c0 = _SIMD32_OFFSET(py);
xorjoep	1:24714b45cd1b	434
xorjoep	1:24714b45cd1b	435	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	436	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	437
xorjoep	1:24714b45cd1b	438	/* Read x[9] */
xorjoep	1:24714b45cd1b	439	x2 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	440
xorjoep	1:24714b45cd1b	441	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	442	acc0 = __SMLALDX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	443	acc1 = __SMLALDX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	444	acc2 = __SMLALDX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	445	acc3 = __SMLALDX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	446
xorjoep	1:24714b45cd1b	447	c0 = *(py-1);
xorjoep	1:24714b45cd1b	448
xorjoep	1:24714b45cd1b	449	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	450
xorjoep	1:24714b45cd1b	451	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	452	#else
xorjoep	1:24714b45cd1b	453
xorjoep	1:24714b45cd1b	454	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	455	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	456
xorjoep	1:24714b45cd1b	457	/* Read x[10] */
xorjoep	1:24714b45cd1b	458	x3 = _SIMD32_OFFSET(px+2);
xorjoep	1:24714b45cd1b	459	px += 3U;
xorjoep	1:24714b45cd1b	460
xorjoep	1:24714b45cd1b	461	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	462	acc0 = __SMLALDX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	463	acc1 = __SMLALD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	464	acc2 = __SMLALDX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	465	acc3 = __SMLALDX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	466	}
xorjoep	1:24714b45cd1b	467
xorjoep	1:24714b45cd1b	468
xorjoep	1:24714b45cd1b	469	/* Store the results in the accumulators in the destination buffer. */
xorjoep	1:24714b45cd1b	470
xorjoep	1:24714b45cd1b	471	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	472
xorjoep	1:24714b45cd1b	473	*__SIMD32(pOut)++ =
xorjoep	1:24714b45cd1b	474	__PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
xorjoep	1:24714b45cd1b	475	*__SIMD32(pOut)++ =
xorjoep	1:24714b45cd1b	476	__PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
xorjoep	1:24714b45cd1b	477
xorjoep	1:24714b45cd1b	478	#else
xorjoep	1:24714b45cd1b	479
xorjoep	1:24714b45cd1b	480	*__SIMD32(pOut)++ =
xorjoep	1:24714b45cd1b	481	__PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
xorjoep	1:24714b45cd1b	482	*__SIMD32(pOut)++ =
xorjoep	1:24714b45cd1b	483	__PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
xorjoep	1:24714b45cd1b	484
xorjoep	1:24714b45cd1b	485	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	486
xorjoep	1:24714b45cd1b	487	/* Increment the pointer pIn1 index, count by 4 */
xorjoep	1:24714b45cd1b	488	count += 4U;
xorjoep	1:24714b45cd1b	489
xorjoep	1:24714b45cd1b	490	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	491	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	492	{
xorjoep	1:24714b45cd1b	493	px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep	1:24714b45cd1b	494	}
xorjoep	1:24714b45cd1b	495	else
xorjoep	1:24714b45cd1b	496	{
xorjoep	1:24714b45cd1b	497	px = pIn1 + count;
xorjoep	1:24714b45cd1b	498	}
xorjoep	1:24714b45cd1b	499	py = pSrc2;
xorjoep	1:24714b45cd1b	500
xorjoep	1:24714b45cd1b	501	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	502	blkCnt--;
xorjoep	1:24714b45cd1b	503	}
xorjoep	1:24714b45cd1b	504
xorjoep	1:24714b45cd1b	505	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	506	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	507	blkCnt = (uint32_t) blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	508
xorjoep	1:24714b45cd1b	509	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	510	{
xorjoep	1:24714b45cd1b	511	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	512	sum = 0;
xorjoep	1:24714b45cd1b	513
xorjoep	1:24714b45cd1b	514	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	515	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	516
xorjoep	1:24714b45cd1b	517	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	518	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	519	while (k > 0U)
xorjoep	1:24714b45cd1b	520	{
xorjoep	1:24714b45cd1b	521	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	522	sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	523	sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	524	sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	525	sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	526
xorjoep	1:24714b45cd1b	527	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	528	k--;
xorjoep	1:24714b45cd1b	529	}
xorjoep	1:24714b45cd1b	530
xorjoep	1:24714b45cd1b	531	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	532	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	533	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	534
xorjoep	1:24714b45cd1b	535	while (k > 0U)
xorjoep	1:24714b45cd1b	536	{
xorjoep	1:24714b45cd1b	537	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	538	sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	539
xorjoep	1:24714b45cd1b	540	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	541	k--;
xorjoep	1:24714b45cd1b	542	}
xorjoep	1:24714b45cd1b	543
xorjoep	1:24714b45cd1b	544	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	545	*pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
xorjoep	1:24714b45cd1b	546
xorjoep	1:24714b45cd1b	547	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	548	count++;
xorjoep	1:24714b45cd1b	549
xorjoep	1:24714b45cd1b	550	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	551	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	552	{
xorjoep	1:24714b45cd1b	553	px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep	1:24714b45cd1b	554	}
xorjoep	1:24714b45cd1b	555	else
xorjoep	1:24714b45cd1b	556	{
xorjoep	1:24714b45cd1b	557	px = pIn1 + count;
xorjoep	1:24714b45cd1b	558	}
xorjoep	1:24714b45cd1b	559	py = pSrc2;
xorjoep	1:24714b45cd1b	560
xorjoep	1:24714b45cd1b	561	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	562	blkCnt--;
xorjoep	1:24714b45cd1b	563	}
xorjoep	1:24714b45cd1b	564	}
xorjoep	1:24714b45cd1b	565	else
xorjoep	1:24714b45cd1b	566	{
xorjoep	1:24714b45cd1b	567	/* If the srcBLen is less than 4,
xorjoep	1:24714b45cd1b	568	* the blockSize2 loop is not unrolled by 4 */
xorjoep	1:24714b45cd1b	569	blkCnt = (uint32_t) blockSize2;
xorjoep	1:24714b45cd1b	570
xorjoep	1:24714b45cd1b	571	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	572	{
xorjoep	1:24714b45cd1b	573	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	574	sum = 0;
xorjoep	1:24714b45cd1b	575
xorjoep	1:24714b45cd1b	576	/* srcBLen number of MACS should be performed */
xorjoep	1:24714b45cd1b	577	k = srcBLen;
xorjoep	1:24714b45cd1b	578
xorjoep	1:24714b45cd1b	579	while (k > 0U)
xorjoep	1:24714b45cd1b	580	{
xorjoep	1:24714b45cd1b	581	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	582	sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	583
xorjoep	1:24714b45cd1b	584	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	585	k--;
xorjoep	1:24714b45cd1b	586	}
xorjoep	1:24714b45cd1b	587
xorjoep	1:24714b45cd1b	588	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	589	*pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
xorjoep	1:24714b45cd1b	590
xorjoep	1:24714b45cd1b	591	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	592	count++;
xorjoep	1:24714b45cd1b	593
xorjoep	1:24714b45cd1b	594	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	595	if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep	1:24714b45cd1b	596	{
xorjoep	1:24714b45cd1b	597	px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep	1:24714b45cd1b	598	}
xorjoep	1:24714b45cd1b	599	else
xorjoep	1:24714b45cd1b	600	{
xorjoep	1:24714b45cd1b	601	px = pIn1 + count;
xorjoep	1:24714b45cd1b	602	}
xorjoep	1:24714b45cd1b	603	py = pSrc2;
xorjoep	1:24714b45cd1b	604
xorjoep	1:24714b45cd1b	605	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	606	blkCnt--;
xorjoep	1:24714b45cd1b	607	}
xorjoep	1:24714b45cd1b	608	}
xorjoep	1:24714b45cd1b	609
xorjoep	1:24714b45cd1b	610
xorjoep	1:24714b45cd1b	611	/* --------------------------
xorjoep	1:24714b45cd1b	612	* Initializations of stage3
xorjoep	1:24714b45cd1b	613	* -------------------------*/
xorjoep	1:24714b45cd1b	614
xorjoep	1:24714b45cd1b	615	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	616	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep	1:24714b45cd1b	617	* ....
xorjoep	1:24714b45cd1b	618	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep	1:24714b45cd1b	619	* sum += x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	620	*/
xorjoep	1:24714b45cd1b	621
xorjoep	1:24714b45cd1b	622	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	623	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	624	count = srcBLen - 1U;
xorjoep	1:24714b45cd1b	625
xorjoep	1:24714b45cd1b	626	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	627	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	628	px = pSrc1;
xorjoep	1:24714b45cd1b	629
xorjoep	1:24714b45cd1b	630	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	631	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	632	pIn2 = pSrc2 - 1U;
xorjoep	1:24714b45cd1b	633	py = pIn2;
xorjoep	1:24714b45cd1b	634
xorjoep	1:24714b45cd1b	635	/* -------------------
xorjoep	1:24714b45cd1b	636	* Stage3 process
xorjoep	1:24714b45cd1b	637	* ------------------*/
xorjoep	1:24714b45cd1b	638
xorjoep	1:24714b45cd1b	639	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	640	/* First part of this stage computes the MAC operations greater than 4 */
xorjoep	1:24714b45cd1b	641	/* Second part of this stage computes the MAC operations less than or equal to 4 */
xorjoep	1:24714b45cd1b	642
xorjoep	1:24714b45cd1b	643	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	644	j = count >> 2U;
xorjoep	1:24714b45cd1b	645
xorjoep	1:24714b45cd1b	646	while ((j > 0U) && (blockSize3 > 0))
xorjoep	1:24714b45cd1b	647	{
xorjoep	1:24714b45cd1b	648	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	649	sum = 0;
xorjoep	1:24714b45cd1b	650
xorjoep	1:24714b45cd1b	651	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	652	k = count >> 2U;
xorjoep	1:24714b45cd1b	653
xorjoep	1:24714b45cd1b	654	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	655	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	656	while (k > 0U)
xorjoep	1:24714b45cd1b	657	{
xorjoep	1:24714b45cd1b	658	/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
xorjoep	1:24714b45cd1b	659	* with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep	1:24714b45cd1b	660	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	661	/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
xorjoep	1:24714b45cd1b	662	* with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep	1:24714b45cd1b	663	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	664
xorjoep	1:24714b45cd1b	665	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	666	k--;
xorjoep	1:24714b45cd1b	667	}
xorjoep	1:24714b45cd1b	668
xorjoep	1:24714b45cd1b	669	/* For the next MAC operations, the pointer py is used without SIMD
xorjoep	1:24714b45cd1b	670	* So, py is incremented by 1 */
xorjoep	1:24714b45cd1b	671	py = py + 1U;
xorjoep	1:24714b45cd1b	672
xorjoep	1:24714b45cd1b	673	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	674	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	675	k = count % 0x4U;
xorjoep	1:24714b45cd1b	676
xorjoep	1:24714b45cd1b	677	while (k > 0U)
xorjoep	1:24714b45cd1b	678	{
xorjoep	1:24714b45cd1b	679	/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	680	sum = __SMLALD(px++, py--, sum);
xorjoep	1:24714b45cd1b	681
xorjoep	1:24714b45cd1b	682	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	683	k--;
xorjoep	1:24714b45cd1b	684	}
xorjoep	1:24714b45cd1b	685
xorjoep	1:24714b45cd1b	686	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	687	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep	1:24714b45cd1b	688
xorjoep	1:24714b45cd1b	689	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	690	px = ++pSrc1;
xorjoep	1:24714b45cd1b	691	py = pIn2;
xorjoep	1:24714b45cd1b	692
xorjoep	1:24714b45cd1b	693	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	694	count--;
xorjoep	1:24714b45cd1b	695
xorjoep	1:24714b45cd1b	696	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	697	blockSize3--;
xorjoep	1:24714b45cd1b	698
xorjoep	1:24714b45cd1b	699	j--;
xorjoep	1:24714b45cd1b	700	}
xorjoep	1:24714b45cd1b	701
xorjoep	1:24714b45cd1b	702	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	703	/* SIMD is not used for the next MAC operations,
xorjoep	1:24714b45cd1b	704	* so pointer py is updated to read only one sample at a time */
xorjoep	1:24714b45cd1b	705	py = py + 1U;
xorjoep	1:24714b45cd1b	706
xorjoep	1:24714b45cd1b	707	while (blockSize3 > 0)
xorjoep	1:24714b45cd1b	708	{
xorjoep	1:24714b45cd1b	709	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	710	sum = 0;
xorjoep	1:24714b45cd1b	711
xorjoep	1:24714b45cd1b	712	/* Perform the remaining count MACs one at a time; no loop unrolling is used here. */
xorjoep	1:24714b45cd1b	713	k = count;
xorjoep	1:24714b45cd1b	714
xorjoep	1:24714b45cd1b	715	while (k > 0U)
xorjoep	1:24714b45cd1b	716	{
xorjoep	1:24714b45cd1b	717	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	718	/* sum += x[srcALen-1] * y[srcBLen-1] */
xorjoep	1:24714b45cd1b	719	sum = __SMLALD(px++, py--, sum);
xorjoep	1:24714b45cd1b	720
xorjoep	1:24714b45cd1b	721	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	722	k--;
xorjoep	1:24714b45cd1b	723	}
xorjoep	1:24714b45cd1b	724
xorjoep	1:24714b45cd1b	725	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	726	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep	1:24714b45cd1b	727
xorjoep	1:24714b45cd1b	728	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	729	px = ++pSrc1;
xorjoep	1:24714b45cd1b	730	py = pSrc2;
xorjoep	1:24714b45cd1b	731
xorjoep	1:24714b45cd1b	732	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	733	count--;
xorjoep	1:24714b45cd1b	734
xorjoep	1:24714b45cd1b	735	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	736	blockSize3--;
xorjoep	1:24714b45cd1b	737	}
xorjoep	1:24714b45cd1b	738
xorjoep	1:24714b45cd1b	739	/* set status as ARM_MATH_SUCCESS */
xorjoep	1:24714b45cd1b	740	status = ARM_MATH_SUCCESS;
xorjoep	1:24714b45cd1b	741	}
xorjoep	1:24714b45cd1b	742
xorjoep	1:24714b45cd1b	743	/* Return to application */
xorjoep	1:24714b45cd1b	744	return (status);
xorjoep	1:24714b45cd1b	745
xorjoep	1:24714b45cd1b	746	#else
xorjoep	1:24714b45cd1b	747
xorjoep	1:24714b45cd1b	748	/* Run the below code for Cortex-M0 */
xorjoep	1:24714b45cd1b	749
xorjoep	1:24714b45cd1b	750	q15_t *pIn1 = pSrcA; /* inputA pointer */
xorjoep	1:24714b45cd1b	751	q15_t *pIn2 = pSrcB; /* inputB pointer */
xorjoep	1:24714b45cd1b	752	q63_t sum; /* Accumulator */
xorjoep	1:24714b45cd1b	753	uint32_t i, j; /* loop counters */
xorjoep	1:24714b45cd1b	754	arm_status status; /* status of Partial convolution */
xorjoep	1:24714b45cd1b	755
xorjoep	1:24714b45cd1b	756	/* Check for range of output samples to be calculated */
xorjoep	1:24714b45cd1b	757	if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep	1:24714b45cd1b	758	{
xorjoep	1:24714b45cd1b	759	/* Set status as ARM_MATH_ARGUMENT_ERROR */
xorjoep	1:24714b45cd1b	760	status = ARM_MATH_ARGUMENT_ERROR;
xorjoep	1:24714b45cd1b	761	}
xorjoep	1:24714b45cd1b	762	else
xorjoep	1:24714b45cd1b	763	{
xorjoep	1:24714b45cd1b	764	/* Loop to calculate convolution for output length number of values */
xorjoep	1:24714b45cd1b	765	for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
xorjoep	1:24714b45cd1b	766	{
xorjoep	1:24714b45cd1b	767	/* Initialize sum with zero to carry on MAC operations */
xorjoep	1:24714b45cd1b	768	sum = 0;
xorjoep	1:24714b45cd1b	769
xorjoep	1:24714b45cd1b	770	/* Loop to perform MAC operations according to convolution equation */
xorjoep	1:24714b45cd1b	771	for (j = 0; j <= i; j++)
xorjoep	1:24714b45cd1b	772	{
xorjoep	1:24714b45cd1b	773	/* Check the array limitations */
xorjoep	1:24714b45cd1b	774	if (((i - j) < srcBLen) && (j < srcALen))
xorjoep	1:24714b45cd1b	775	{
xorjoep	1:24714b45cd1b	776	/* z[i] += x[j] * y[i-j] */
xorjoep	1:24714b45cd1b	777	sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
xorjoep	1:24714b45cd1b	778	}
xorjoep	1:24714b45cd1b	779	}
xorjoep	1:24714b45cd1b	780
xorjoep	1:24714b45cd1b	781	/* Store the output in the destination buffer */
xorjoep	1:24714b45cd1b	782	pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
xorjoep	1:24714b45cd1b	783	}
xorjoep	1:24714b45cd1b	784	/* Set status as ARM_MATH_SUCCESS as there are no argument errors */
xorjoep	1:24714b45cd1b	785	status = ARM_MATH_SUCCESS;
xorjoep	1:24714b45cd1b	786	}
xorjoep	1:24714b45cd1b	787	return (status);
xorjoep	1:24714b45cd1b	788
xorjoep	1:24714b45cd1b	789	#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
xorjoep	1:24714b45cd1b	790
xorjoep	1:24714b45cd1b	791	}
xorjoep	1:24714b45cd1b	792
xorjoep	1:24714b45cd1b	793	/**
xorjoep	1:24714b45cd1b	794	* @} end of PartialConv group
xorjoep	1:24714b45cd1b	795	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/FilteringFunctions/arm_conv_partial_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning