CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/FilteringFunctions/arm_conv_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_conv_fast_q15.c
xorjoep	1:24714b45cd1b	4	* Description: Fast Q15 Convolution
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupFilters
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup Conv
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40	/**
xorjoep	1:24714b45cd1b	41	* @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
xorjoep	1:24714b45cd1b	42	* @param[in] *pSrcA points to the first input sequence.
xorjoep	1:24714b45cd1b	43	* @param[in] srcALen length of the first input sequence.
xorjoep	1:24714b45cd1b	44	* @param[in] *pSrcB points to the second input sequence.
xorjoep	1:24714b45cd1b	45	* @param[in] srcBLen length of the second input sequence.
xorjoep	1:24714b45cd1b	46	* @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
xorjoep	1:24714b45cd1b	47	* @return none.
xorjoep	1:24714b45cd1b	48	*
xorjoep	1:24714b45cd1b	49	* <b>Scaling and Overflow Behavior:</b>
xorjoep	1:24714b45cd1b	50	*
xorjoep	1:24714b45cd1b	51	* \par
xorjoep	1:24714b45cd1b	52	* This fast version uses a 32-bit accumulator with 2.30 format.
xorjoep	1:24714b45cd1b	53	* The accumulator maintains full precision of the intermediate multiplication results
xorjoep	1:24714b45cd1b	54	* but provides only a single guard bit. There is no saturation on intermediate additions.
xorjoep	1:24714b45cd1b	55	* Thus, if the accumulator overflows it wraps around and distorts the result.
xorjoep	1:24714b45cd1b	56	* The input signals should be scaled down to avoid intermediate overflows.
xorjoep	1:24714b45cd1b	57	* Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to base 2) to avoid overflows,
xorjoep	1:24714b45cd1b	58	* since at most min(srcALen, srcBLen) additions are carried out internally.
xorjoep	1:24714b45cd1b	59	* The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
xorjoep	1:24714b45cd1b	60	*
xorjoep	1:24714b45cd1b	61	* \par
xorjoep	1:24714b45cd1b	62	* See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
xorjoep	1:24714b45cd1b	63	*/
xorjoep	1:24714b45cd1b	64
xorjoep	1:24714b45cd1b	65	void arm_conv_fast_q15(
xorjoep	1:24714b45cd1b	66	q15_t * pSrcA,
xorjoep	1:24714b45cd1b	67	uint32_t srcALen,
xorjoep	1:24714b45cd1b	68	q15_t * pSrcB,
xorjoep	1:24714b45cd1b	69	uint32_t srcBLen,
xorjoep	1:24714b45cd1b	70	q15_t * pDst)
xorjoep	1:24714b45cd1b	71	{
xorjoep	1:24714b45cd1b	72	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	73	q15_t pIn1; / inputA pointer */
xorjoep	1:24714b45cd1b	74	q15_t pIn2; / inputB pointer */
xorjoep	1:24714b45cd1b	75	q15_t pOut = pDst; / output pointer */
xorjoep	1:24714b45cd1b	76	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep	1:24714b45cd1b	77	q15_t px; / Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	78	q15_t py; / Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	79	q15_t pSrc1, pSrc2; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	80	q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
xorjoep	1:24714b45cd1b	81	uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
xorjoep	1:24714b45cd1b	82
xorjoep	1:24714b45cd1b	83	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	84	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	85	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	86	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	87	{
xorjoep	1:24714b45cd1b	88	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	89	pIn1 = pSrcA;
xorjoep	1:24714b45cd1b	90
xorjoep	1:24714b45cd1b	91	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	92	pIn2 = pSrcB;
xorjoep	1:24714b45cd1b	93	}
xorjoep	1:24714b45cd1b	94	else
xorjoep	1:24714b45cd1b	95	{
xorjoep	1:24714b45cd1b	96	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	97	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	98
xorjoep	1:24714b45cd1b	99	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	100	pIn2 = pSrcA;
xorjoep	1:24714b45cd1b	101
xorjoep	1:24714b45cd1b	102	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	103	j = srcBLen;
xorjoep	1:24714b45cd1b	104	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	105	srcALen = j;
xorjoep	1:24714b45cd1b	106	}
xorjoep	1:24714b45cd1b	107
xorjoep	1:24714b45cd1b	108	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep	1:24714b45cd1b	109	/* The function is internally
xorjoep	1:24714b45cd1b	110	* divided into three stages according to the number of multiplications that must
xorjoep	1:24714b45cd1b	111	* take place between inputA samples and inputB samples. In the first stage of the
xorjoep	1:24714b45cd1b	112	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	113	* In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	114	* In the third stage of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	115	* for every iteration. */
xorjoep	1:24714b45cd1b	116
xorjoep	1:24714b45cd1b	117	/* The algorithm is implemented in three stages.
xorjoep	1:24714b45cd1b	118	The loop counters of each stage are initialized here. */
xorjoep	1:24714b45cd1b	119	blockSize1 = srcBLen - 1U;
xorjoep	1:24714b45cd1b	120	blockSize2 = srcALen - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	121	blockSize3 = blockSize1;
xorjoep	1:24714b45cd1b	122
xorjoep	1:24714b45cd1b	123	/* --------------------------
xorjoep	1:24714b45cd1b	124	* Initializations of stage1
xorjoep	1:24714b45cd1b	125	* -------------------------*/
xorjoep	1:24714b45cd1b	126
xorjoep	1:24714b45cd1b	127	/* sum = x[0] * y[0]
xorjoep	1:24714b45cd1b	128	* sum = x[0] * y[1] + x[1] * y[0]
xorjoep	1:24714b45cd1b	129	* ....
xorjoep	1:24714b45cd1b	130	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep	1:24714b45cd1b	131	*/
xorjoep	1:24714b45cd1b	132
xorjoep	1:24714b45cd1b	133	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	134	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	135	count = 1U;
xorjoep	1:24714b45cd1b	136
xorjoep	1:24714b45cd1b	137	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	138	px = pIn1;
xorjoep	1:24714b45cd1b	139
xorjoep	1:24714b45cd1b	140	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	141	py = pIn2;
xorjoep	1:24714b45cd1b	142
xorjoep	1:24714b45cd1b	143
xorjoep	1:24714b45cd1b	144	/* ------------------------
xorjoep	1:24714b45cd1b	145	* Stage1 process
xorjoep	1:24714b45cd1b	146	* ----------------------*/
xorjoep	1:24714b45cd1b	147
xorjoep	1:24714b45cd1b	148	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	149	/* First part of this stage computes the MAC operations less than 4 */
xorjoep	1:24714b45cd1b	150	/* Second part of this stage computes the MAC operations greater than or equal to 4 */
xorjoep	1:24714b45cd1b	151
xorjoep	1:24714b45cd1b	152	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	153	while ((count < 4U) && (blockSize1 > 0U))
xorjoep	1:24714b45cd1b	154	{
xorjoep	1:24714b45cd1b	155	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	156	sum = 0;
xorjoep	1:24714b45cd1b	157
xorjoep	1:24714b45cd1b	158	/* Loop over number of MAC operations between
xorjoep	1:24714b45cd1b	159	* inputA samples and inputB samples */
xorjoep	1:24714b45cd1b	160	k = count;
xorjoep	1:24714b45cd1b	161
xorjoep	1:24714b45cd1b	162	while (k > 0U)
xorjoep	1:24714b45cd1b	163	{
xorjoep	1:24714b45cd1b	164	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	165	sum = __SMLAD(px++, py--, sum);
xorjoep	1:24714b45cd1b	166
xorjoep	1:24714b45cd1b	167	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	168	k--;
xorjoep	1:24714b45cd1b	169	}
xorjoep	1:24714b45cd1b	170
xorjoep	1:24714b45cd1b	171	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	172	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	173
xorjoep	1:24714b45cd1b	174	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	175	py = pIn2 + count;
xorjoep	1:24714b45cd1b	176	px = pIn1;
xorjoep	1:24714b45cd1b	177
xorjoep	1:24714b45cd1b	178	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	179	count++;
xorjoep	1:24714b45cd1b	180
xorjoep	1:24714b45cd1b	181	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	182	blockSize1--;
xorjoep	1:24714b45cd1b	183	}
xorjoep	1:24714b45cd1b	184
xorjoep	1:24714b45cd1b	185	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	186	/* The internal loop, over count, is unrolled by 4 */
xorjoep	1:24714b45cd1b	187	/* To, read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	188	* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
xorjoep	1:24714b45cd1b	189	py = py - 1;
xorjoep	1:24714b45cd1b	190
xorjoep	1:24714b45cd1b	191	while (blockSize1 > 0U)
xorjoep	1:24714b45cd1b	192	{
xorjoep	1:24714b45cd1b	193	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	194	sum = 0;
xorjoep	1:24714b45cd1b	195
xorjoep	1:24714b45cd1b	196	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	197	k = count >> 2U;
xorjoep	1:24714b45cd1b	198
xorjoep	1:24714b45cd1b	199	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	200	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	201	while (k > 0U)
xorjoep	1:24714b45cd1b	202	{
xorjoep	1:24714b45cd1b	203	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	204	/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep	1:24714b45cd1b	205	sum = __SMLADX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	206	/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep	1:24714b45cd1b	207	sum = __SMLADX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	208
xorjoep	1:24714b45cd1b	209	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	210	k--;
xorjoep	1:24714b45cd1b	211	}
xorjoep	1:24714b45cd1b	212
xorjoep	1:24714b45cd1b	213	/* For the next MAC operations, the pointer py is used without SIMD
xorjoep	1:24714b45cd1b	214	* So, py is incremented by 1 */
xorjoep	1:24714b45cd1b	215	py = py + 1U;
xorjoep	1:24714b45cd1b	216
xorjoep	1:24714b45cd1b	217	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	218	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	219	k = count % 0x4U;
xorjoep	1:24714b45cd1b	220
xorjoep	1:24714b45cd1b	221	while (k > 0U)
xorjoep	1:24714b45cd1b	222	{
xorjoep	1:24714b45cd1b	223	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	224	sum = __SMLAD(px++, py--, sum);
xorjoep	1:24714b45cd1b	225
xorjoep	1:24714b45cd1b	226	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	227	k--;
xorjoep	1:24714b45cd1b	228	}
xorjoep	1:24714b45cd1b	229
xorjoep	1:24714b45cd1b	230	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	231	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	232
xorjoep	1:24714b45cd1b	233	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	234	py = pIn2 + (count - 1U);
xorjoep	1:24714b45cd1b	235	px = pIn1;
xorjoep	1:24714b45cd1b	236
xorjoep	1:24714b45cd1b	237	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	238	count++;
xorjoep	1:24714b45cd1b	239
xorjoep	1:24714b45cd1b	240	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	241	blockSize1--;
xorjoep	1:24714b45cd1b	242	}
xorjoep	1:24714b45cd1b	243
xorjoep	1:24714b45cd1b	244	/* --------------------------
xorjoep	1:24714b45cd1b	245	* Initializations of stage2
xorjoep	1:24714b45cd1b	246	* ------------------------*/
xorjoep	1:24714b45cd1b	247
xorjoep	1:24714b45cd1b	248	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep	1:24714b45cd1b	249	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep	1:24714b45cd1b	250	* ....
xorjoep	1:24714b45cd1b	251	* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	252	*/
xorjoep	1:24714b45cd1b	253
xorjoep	1:24714b45cd1b	254	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	255	px = pIn1;
xorjoep	1:24714b45cd1b	256
xorjoep	1:24714b45cd1b	257	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	258	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	259	py = pSrc2;
xorjoep	1:24714b45cd1b	260
xorjoep	1:24714b45cd1b	261	/* count is the index by which the pointer pIn1 to be incremented */
xorjoep	1:24714b45cd1b	262	count = 0U;
xorjoep	1:24714b45cd1b	263
xorjoep	1:24714b45cd1b	264
xorjoep	1:24714b45cd1b	265	/* --------------------
xorjoep	1:24714b45cd1b	266	* Stage2 process
xorjoep	1:24714b45cd1b	267	* -------------------*/
xorjoep	1:24714b45cd1b	268
xorjoep	1:24714b45cd1b	269	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	270	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	271	* srcBLen should be greater than or equal to 4 */
xorjoep	1:24714b45cd1b	272	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	273	{
xorjoep	1:24714b45cd1b	274	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	275	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	276
xorjoep	1:24714b45cd1b	277	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	278	{
xorjoep	1:24714b45cd1b	279	py = py - 1U;
xorjoep	1:24714b45cd1b	280
xorjoep	1:24714b45cd1b	281	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	282	acc0 = 0;
xorjoep	1:24714b45cd1b	283	acc1 = 0;
xorjoep	1:24714b45cd1b	284	acc2 = 0;
xorjoep	1:24714b45cd1b	285	acc3 = 0;
xorjoep	1:24714b45cd1b	286
xorjoep	1:24714b45cd1b	287
xorjoep	1:24714b45cd1b	288	/* read x[0], x[1] samples */
xorjoep	1:24714b45cd1b	289	x0 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	290	/* read x[1], x[2] samples */
xorjoep	1:24714b45cd1b	291	x1 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	292	px+= 2U;
xorjoep	1:24714b45cd1b	293
xorjoep	1:24714b45cd1b	294
xorjoep	1:24714b45cd1b	295	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	296	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	297
xorjoep	1:24714b45cd1b	298	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	299	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	300	do
xorjoep	1:24714b45cd1b	301	{
xorjoep	1:24714b45cd1b	302	/* Read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	303	* y[srcBLen - 1] and y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	304	c0 = *__SIMD32(py)--;
xorjoep	1:24714b45cd1b	305
xorjoep	1:24714b45cd1b	306	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	307	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	308
xorjoep	1:24714b45cd1b	309	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	310	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	311
xorjoep	1:24714b45cd1b	312	/* Read x[2], x[3] */
xorjoep	1:24714b45cd1b	313	x2 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	314
xorjoep	1:24714b45cd1b	315	/* Read x[3], x[4] */
xorjoep	1:24714b45cd1b	316	x3 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	317
xorjoep	1:24714b45cd1b	318	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	319	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	320
xorjoep	1:24714b45cd1b	321	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	322	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	323
xorjoep	1:24714b45cd1b	324	/* Read y[srcBLen - 3] and y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	325	c0 = *__SIMD32(py)--;
xorjoep	1:24714b45cd1b	326
xorjoep	1:24714b45cd1b	327	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	328	acc0 = __SMLADX(x2, c0, acc0);
xorjoep	1:24714b45cd1b	329
xorjoep	1:24714b45cd1b	330	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	331	acc1 = __SMLADX(x3, c0, acc1);
xorjoep	1:24714b45cd1b	332
xorjoep	1:24714b45cd1b	333	/* Read x[4], x[5] */
xorjoep	1:24714b45cd1b	334	x0 = _SIMD32_OFFSET(px+2);
xorjoep	1:24714b45cd1b	335
xorjoep	1:24714b45cd1b	336	/* Read x[5], x[6] */
xorjoep	1:24714b45cd1b	337	x1 = _SIMD32_OFFSET(px+3);
xorjoep	1:24714b45cd1b	338	px += 4U;
xorjoep	1:24714b45cd1b	339
xorjoep	1:24714b45cd1b	340	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	341	acc2 = __SMLADX(x0, c0, acc2);
xorjoep	1:24714b45cd1b	342
xorjoep	1:24714b45cd1b	343	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	344	acc3 = __SMLADX(x1, c0, acc3);
xorjoep	1:24714b45cd1b	345
xorjoep	1:24714b45cd1b	346	} while (--k);
xorjoep	1:24714b45cd1b	347
xorjoep	1:24714b45cd1b	348	/* For the next MAC operations, SIMD is not used
xorjoep	1:24714b45cd1b	349	* So, the 16-bit pointer of inputB, py, is updated */
xorjoep	1:24714b45cd1b	350
xorjoep	1:24714b45cd1b	351	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	352	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	353	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	354
xorjoep	1:24714b45cd1b	355	if (k == 1U)
xorjoep	1:24714b45cd1b	356	{
xorjoep	1:24714b45cd1b	357	/* Read y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	358	c0 = *(py+1);
xorjoep	1:24714b45cd1b	359
xorjoep	1:24714b45cd1b	360	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	361
xorjoep	1:24714b45cd1b	362	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	363
xorjoep	1:24714b45cd1b	364	#else
xorjoep	1:24714b45cd1b	365
xorjoep	1:24714b45cd1b	366	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	367
xorjoep	1:24714b45cd1b	368	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	369
xorjoep	1:24714b45cd1b	370	/* Read x[7] */
xorjoep	1:24714b45cd1b	371	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	372	px++;
xorjoep	1:24714b45cd1b	373
xorjoep	1:24714b45cd1b	374	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	375	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	376	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	377	acc2 = __SMLADX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	378	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	379	}
xorjoep	1:24714b45cd1b	380
xorjoep	1:24714b45cd1b	381	if (k == 2U)
xorjoep	1:24714b45cd1b	382	{
xorjoep	1:24714b45cd1b	383	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	384	c0 = _SIMD32_OFFSET(py);
xorjoep	1:24714b45cd1b	385
xorjoep	1:24714b45cd1b	386	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	387	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	388
xorjoep	1:24714b45cd1b	389	/* Read x[9] */
xorjoep	1:24714b45cd1b	390	x2 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	391	px += 2U;
xorjoep	1:24714b45cd1b	392
xorjoep	1:24714b45cd1b	393	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	394	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	395	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	396	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	397	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	398	}
xorjoep	1:24714b45cd1b	399
xorjoep	1:24714b45cd1b	400	if (k == 3U)
xorjoep	1:24714b45cd1b	401	{
xorjoep	1:24714b45cd1b	402	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	403	c0 = _SIMD32_OFFSET(py);
xorjoep	1:24714b45cd1b	404
xorjoep	1:24714b45cd1b	405	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	406	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	407
xorjoep	1:24714b45cd1b	408	/* Read x[9] */
xorjoep	1:24714b45cd1b	409	x2 = _SIMD32_OFFSET(px+1);
xorjoep	1:24714b45cd1b	410
xorjoep	1:24714b45cd1b	411	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	412	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	413	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	414	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	415	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	416
xorjoep	1:24714b45cd1b	417	/* Read y[srcBLen - 7] */
xorjoep	1:24714b45cd1b	418	c0 = *(py-1);
xorjoep	1:24714b45cd1b	419	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	420
xorjoep	1:24714b45cd1b	421	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	422	#else
xorjoep	1:24714b45cd1b	423
xorjoep	1:24714b45cd1b	424	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	425	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	426
xorjoep	1:24714b45cd1b	427	/* Read x[10] */
xorjoep	1:24714b45cd1b	428	x3 = _SIMD32_OFFSET(px+2);
xorjoep	1:24714b45cd1b	429	px += 3U;
xorjoep	1:24714b45cd1b	430
xorjoep	1:24714b45cd1b	431	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	432	acc0 = __SMLADX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	433	acc1 = __SMLAD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	434	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	435	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	436	}
xorjoep	1:24714b45cd1b	437
xorjoep	1:24714b45cd1b	438	/* Store the results in the accumulators in the destination buffer. */
xorjoep	1:24714b45cd1b	439	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	440
xorjoep	1:24714b45cd1b	441	*__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
xorjoep	1:24714b45cd1b	442	*__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
xorjoep	1:24714b45cd1b	443
xorjoep	1:24714b45cd1b	444	#else
xorjoep	1:24714b45cd1b	445
xorjoep	1:24714b45cd1b	446	*__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
xorjoep	1:24714b45cd1b	447	*__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
xorjoep	1:24714b45cd1b	448
xorjoep	1:24714b45cd1b	449	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	450
xorjoep	1:24714b45cd1b	451	/* Increment the pointer pIn1 index, count by 4 */
xorjoep	1:24714b45cd1b	452	count += 4U;
xorjoep	1:24714b45cd1b	453
xorjoep	1:24714b45cd1b	454	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	455	px = pIn1 + count;
xorjoep	1:24714b45cd1b	456	py = pSrc2;
xorjoep	1:24714b45cd1b	457
xorjoep	1:24714b45cd1b	458	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	459	blkCnt--;
xorjoep	1:24714b45cd1b	460	}
xorjoep	1:24714b45cd1b	461
xorjoep	1:24714b45cd1b	462	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	463	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	464	blkCnt = blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	465
xorjoep	1:24714b45cd1b	466	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	467	{
xorjoep	1:24714b45cd1b	468	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	469	sum = 0;
xorjoep	1:24714b45cd1b	470
xorjoep	1:24714b45cd1b	471	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	472	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	473
xorjoep	1:24714b45cd1b	474	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	475	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	476	while (k > 0U)
xorjoep	1:24714b45cd1b	477	{
xorjoep	1:24714b45cd1b	478	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	479	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	480	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	481	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	482	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	483
xorjoep	1:24714b45cd1b	484	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	485	k--;
xorjoep	1:24714b45cd1b	486	}
xorjoep	1:24714b45cd1b	487
xorjoep	1:24714b45cd1b	488	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	489	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	490	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	491
xorjoep	1:24714b45cd1b	492	while (k > 0U)
xorjoep	1:24714b45cd1b	493	{
xorjoep	1:24714b45cd1b	494	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	495	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	496
xorjoep	1:24714b45cd1b	497	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	498	k--;
xorjoep	1:24714b45cd1b	499	}
xorjoep	1:24714b45cd1b	500
xorjoep	1:24714b45cd1b	501	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	502	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	503
xorjoep	1:24714b45cd1b	504	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	505	count++;
xorjoep	1:24714b45cd1b	506
xorjoep	1:24714b45cd1b	507	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	508	px = pIn1 + count;
xorjoep	1:24714b45cd1b	509	py = pSrc2;
xorjoep	1:24714b45cd1b	510
xorjoep	1:24714b45cd1b	511	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	512	blkCnt--;
xorjoep	1:24714b45cd1b	513	}
xorjoep	1:24714b45cd1b	514	}
xorjoep	1:24714b45cd1b	515	else
xorjoep	1:24714b45cd1b	516	{
xorjoep	1:24714b45cd1b	517	/* If srcBLen is less than 4,
xorjoep	1:24714b45cd1b	518	* the blockSize2 loop is not unrolled by 4 */
xorjoep	1:24714b45cd1b	519	blkCnt = blockSize2;
xorjoep	1:24714b45cd1b	520
xorjoep	1:24714b45cd1b	521	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	522	{
xorjoep	1:24714b45cd1b	523	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	524	sum = 0;
xorjoep	1:24714b45cd1b	525
xorjoep	1:24714b45cd1b	526	/* srcBLen number of MACS should be performed */
xorjoep	1:24714b45cd1b	527	k = srcBLen;
xorjoep	1:24714b45cd1b	528
xorjoep	1:24714b45cd1b	529	while (k > 0U)
xorjoep	1:24714b45cd1b	530	{
xorjoep	1:24714b45cd1b	531	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	532	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	533
xorjoep	1:24714b45cd1b	534	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	535	k--;
xorjoep	1:24714b45cd1b	536	}
xorjoep	1:24714b45cd1b	537
xorjoep	1:24714b45cd1b	538	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	539	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	540
xorjoep	1:24714b45cd1b	541	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	542	count++;
xorjoep	1:24714b45cd1b	543
xorjoep	1:24714b45cd1b	544	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	545	px = pIn1 + count;
xorjoep	1:24714b45cd1b	546	py = pSrc2;
xorjoep	1:24714b45cd1b	547
xorjoep	1:24714b45cd1b	548	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	549	blkCnt--;
xorjoep	1:24714b45cd1b	550	}
xorjoep	1:24714b45cd1b	551	}
xorjoep	1:24714b45cd1b	552
xorjoep	1:24714b45cd1b	553
xorjoep	1:24714b45cd1b	554	/* --------------------------
xorjoep	1:24714b45cd1b	555	* Initializations of stage3
xorjoep	1:24714b45cd1b	556	* -------------------------*/
xorjoep	1:24714b45cd1b	557
xorjoep	1:24714b45cd1b	558	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	559	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep	1:24714b45cd1b	560	* ....
xorjoep	1:24714b45cd1b	561	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep	1:24714b45cd1b	562	* sum += x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	563	*/
xorjoep	1:24714b45cd1b	564
xorjoep	1:24714b45cd1b	565	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	566	The blockSize3 variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	567
xorjoep	1:24714b45cd1b	568	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	569	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	570	px = pSrc1;
xorjoep	1:24714b45cd1b	571
xorjoep	1:24714b45cd1b	572	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	573	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	574	pIn2 = pSrc2 - 1U;
xorjoep	1:24714b45cd1b	575	py = pIn2;
xorjoep	1:24714b45cd1b	576
xorjoep	1:24714b45cd1b	577	/* -------------------
xorjoep	1:24714b45cd1b	578	* Stage3 process
xorjoep	1:24714b45cd1b	579	* ------------------*/
xorjoep	1:24714b45cd1b	580
xorjoep	1:24714b45cd1b	581	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	582	/* First part of this stage computes the MAC operations greater than 4 */
xorjoep	1:24714b45cd1b	583	/* Second part of this stage computes the MAC operations less than or equal to 4 */
xorjoep	1:24714b45cd1b	584
xorjoep	1:24714b45cd1b	585	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	586	j = blockSize3 >> 2U;
xorjoep	1:24714b45cd1b	587
xorjoep	1:24714b45cd1b	588	while ((j > 0U) && (blockSize3 > 0U))
xorjoep	1:24714b45cd1b	589	{
xorjoep	1:24714b45cd1b	590	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	591	sum = 0;
xorjoep	1:24714b45cd1b	592
xorjoep	1:24714b45cd1b	593	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	594	k = blockSize3 >> 2U;
xorjoep	1:24714b45cd1b	595
xorjoep	1:24714b45cd1b	596	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	597	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	598	while (k > 0U)
xorjoep	1:24714b45cd1b	599	{
xorjoep	1:24714b45cd1b	600	/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
xorjoep	1:24714b45cd1b	601	* with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep	1:24714b45cd1b	602	sum = __SMLADX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	603	/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
xorjoep	1:24714b45cd1b	604	* with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep	1:24714b45cd1b	605	sum = __SMLADX(__SIMD32(px)++, __SIMD32(py)--, sum);
xorjoep	1:24714b45cd1b	606
xorjoep	1:24714b45cd1b	607	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	608	k--;
xorjoep	1:24714b45cd1b	609	}
xorjoep	1:24714b45cd1b	610
xorjoep	1:24714b45cd1b	611	/* For the next MAC operations, the pointer py is used without SIMD
xorjoep	1:24714b45cd1b	612	* So, py is incremented by 1 */
xorjoep	1:24714b45cd1b	613	py = py + 1U;
xorjoep	1:24714b45cd1b	614
xorjoep	1:24714b45cd1b	615	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	616	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	617	k = blockSize3 % 0x4U;
xorjoep	1:24714b45cd1b	618
xorjoep	1:24714b45cd1b	619	while (k > 0U)
xorjoep	1:24714b45cd1b	620	{
xorjoep	1:24714b45cd1b	621	/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	622	sum = __SMLAD(*px++, *py--, sum);
xorjoep	1:24714b45cd1b	623
xorjoep	1:24714b45cd1b	624	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	625	k--;
xorjoep	1:24714b45cd1b	626	}
xorjoep	1:24714b45cd1b	627
xorjoep	1:24714b45cd1b	628	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	629	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	630
xorjoep	1:24714b45cd1b	631	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	632	px = ++pSrc1;
xorjoep	1:24714b45cd1b	633	py = pIn2;
xorjoep	1:24714b45cd1b	634
xorjoep	1:24714b45cd1b	635	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	636	blockSize3--;
xorjoep	1:24714b45cd1b	637
xorjoep	1:24714b45cd1b	638	j--;
xorjoep	1:24714b45cd1b	639	}
xorjoep	1:24714b45cd1b	640
xorjoep	1:24714b45cd1b	641	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	642	/* SIMD is not used for the next MAC operations,
xorjoep	1:24714b45cd1b	643	* so pointer py is updated to read only one sample at a time */
xorjoep	1:24714b45cd1b	644	py = py + 1U;
xorjoep	1:24714b45cd1b	645
xorjoep	1:24714b45cd1b	646	while (blockSize3 > 0U)
xorjoep	1:24714b45cd1b	647	{
xorjoep	1:24714b45cd1b	648	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	649	sum = 0;
xorjoep	1:24714b45cd1b	650
xorjoep	1:24714b45cd1b	651	/* No loop unrolling here: perform the blockSize3 MACs one at a time. */
xorjoep	1:24714b45cd1b	652	k = blockSize3;
xorjoep	1:24714b45cd1b	653
xorjoep	1:24714b45cd1b	654	while (k > 0U)
xorjoep	1:24714b45cd1b	655	{
xorjoep	1:24714b45cd1b	656	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	657	/* sum += x[srcALen-1] * y[srcBLen-1] */
xorjoep	1:24714b45cd1b	658	sum = __SMLAD(*px++, *py--, sum);
xorjoep	1:24714b45cd1b	659
xorjoep	1:24714b45cd1b	660	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	661	k--;
xorjoep	1:24714b45cd1b	662	}
xorjoep	1:24714b45cd1b	663
xorjoep	1:24714b45cd1b	664	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	665	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	666
xorjoep	1:24714b45cd1b	667	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	668	px = ++pSrc1;
xorjoep	1:24714b45cd1b	669	py = pSrc2;
xorjoep	1:24714b45cd1b	670
xorjoep	1:24714b45cd1b	671	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	672	blockSize3--;
xorjoep	1:24714b45cd1b	673	}
xorjoep	1:24714b45cd1b	674
xorjoep	1:24714b45cd1b	675	#else
xorjoep	1:24714b45cd1b	676	q15_t *pIn1; /* inputA pointer */
xorjoep	1:24714b45cd1b	677	q15_t *pIn2; /* inputB pointer */
xorjoep	1:24714b45cd1b	678	q15_t *pOut = pDst; /* output pointer */
xorjoep	1:24714b45cd1b	679	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep	1:24714b45cd1b	680	q15_t *px; /* Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	681	q15_t *py; /* Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	682	q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	683	q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
xorjoep	1:24714b45cd1b	684	uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
xorjoep	1:24714b45cd1b	685	q15_t a, b;
xorjoep	1:24714b45cd1b	686
xorjoep	1:24714b45cd1b	687	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	688	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	689	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	690	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	691	{
xorjoep	1:24714b45cd1b	692	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	693	pIn1 = pSrcA;
xorjoep	1:24714b45cd1b	694
xorjoep	1:24714b45cd1b	695	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	696	pIn2 = pSrcB;
xorjoep	1:24714b45cd1b	697	}
xorjoep	1:24714b45cd1b	698	else
xorjoep	1:24714b45cd1b	699	{
xorjoep	1:24714b45cd1b	700	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	701	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	702
xorjoep	1:24714b45cd1b	703	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	704	pIn2 = pSrcA;
xorjoep	1:24714b45cd1b	705
xorjoep	1:24714b45cd1b	706	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	707	j = srcBLen;
xorjoep	1:24714b45cd1b	708	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	709	srcALen = j;
xorjoep	1:24714b45cd1b	710	}
xorjoep	1:24714b45cd1b	711
xorjoep	1:24714b45cd1b	712	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep	1:24714b45cd1b	713	/* The function is internally
xorjoep	1:24714b45cd1b	714	* divided into three stages according to the number of multiplications that have to
xorjoep	1:24714b45cd1b	715	* take place between inputA samples and inputB samples. In the first stage of the
xorjoep	1:24714b45cd1b	716	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	717	* In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	718	* In the third stage of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	719	* for every iteration. */
xorjoep	1:24714b45cd1b	720
xorjoep	1:24714b45cd1b	721	/* The algorithm is implemented in three stages.
xorjoep	1:24714b45cd1b	722	The loop counters of each stage are initialized here. */
xorjoep	1:24714b45cd1b	723	blockSize1 = srcBLen - 1U;
xorjoep	1:24714b45cd1b	724	blockSize2 = srcALen - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	725	blockSize3 = blockSize1;
xorjoep	1:24714b45cd1b	726
xorjoep	1:24714b45cd1b	727	/* --------------------------
xorjoep	1:24714b45cd1b	728	* Initializations of stage1
xorjoep	1:24714b45cd1b	729	* -------------------------*/
xorjoep	1:24714b45cd1b	730
xorjoep	1:24714b45cd1b	731	/* sum = x[0] * y[0]
xorjoep	1:24714b45cd1b	732	* sum = x[0] * y[1] + x[1] * y[0]
xorjoep	1:24714b45cd1b	733	* ....
xorjoep	1:24714b45cd1b	734	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep	1:24714b45cd1b	735	*/
xorjoep	1:24714b45cd1b	736
xorjoep	1:24714b45cd1b	737	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	738	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	739	count = 1U;
xorjoep	1:24714b45cd1b	740
xorjoep	1:24714b45cd1b	741	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	742	px = pIn1;
xorjoep	1:24714b45cd1b	743
xorjoep	1:24714b45cd1b	744	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	745	py = pIn2;
xorjoep	1:24714b45cd1b	746
xorjoep	1:24714b45cd1b	747
xorjoep	1:24714b45cd1b	748	/* ------------------------
xorjoep	1:24714b45cd1b	749	* Stage1 process
xorjoep	1:24714b45cd1b	750	* ----------------------*/
xorjoep	1:24714b45cd1b	751
xorjoep	1:24714b45cd1b	752	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	753	/* First part of this stage computes the MAC operations less than 4 */
xorjoep	1:24714b45cd1b	754	/* Second part of this stage computes the MAC operations greater than or equal to 4 */
xorjoep	1:24714b45cd1b	755
xorjoep	1:24714b45cd1b	756	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	757	while ((count < 4U) && (blockSize1 > 0U))
xorjoep	1:24714b45cd1b	758	{
xorjoep	1:24714b45cd1b	759	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	760	sum = 0;
xorjoep	1:24714b45cd1b	761
xorjoep	1:24714b45cd1b	762	/* Loop over number of MAC operations between
xorjoep	1:24714b45cd1b	763	* inputA samples and inputB samples */
xorjoep	1:24714b45cd1b	764	k = count;
xorjoep	1:24714b45cd1b	765
xorjoep	1:24714b45cd1b	766	while (k > 0U)
xorjoep	1:24714b45cd1b	767	{
xorjoep	1:24714b45cd1b	768	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	769	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	770
xorjoep	1:24714b45cd1b	771	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	772	k--;
xorjoep	1:24714b45cd1b	773	}
xorjoep	1:24714b45cd1b	774
xorjoep	1:24714b45cd1b	775	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	776	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	777
xorjoep	1:24714b45cd1b	778	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	779	py = pIn2 + count;
xorjoep	1:24714b45cd1b	780	px = pIn1;
xorjoep	1:24714b45cd1b	781
xorjoep	1:24714b45cd1b	782	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	783	count++;
xorjoep	1:24714b45cd1b	784
xorjoep	1:24714b45cd1b	785	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	786	blockSize1--;
xorjoep	1:24714b45cd1b	787	}
xorjoep	1:24714b45cd1b	788
xorjoep	1:24714b45cd1b	789	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	790	/* The internal loop, over count, is unrolled by 4 */
xorjoep	1:24714b45cd1b	791	/* To, read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	792	* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
xorjoep	1:24714b45cd1b	793	py = py - 1;
xorjoep	1:24714b45cd1b	794
xorjoep	1:24714b45cd1b	795	while (blockSize1 > 0U)
xorjoep	1:24714b45cd1b	796	{
xorjoep	1:24714b45cd1b	797	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	798	sum = 0;
xorjoep	1:24714b45cd1b	799
xorjoep	1:24714b45cd1b	800	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	801	k = count >> 2U;
xorjoep	1:24714b45cd1b	802
xorjoep	1:24714b45cd1b	803	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	804	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	805	py++;
xorjoep	1:24714b45cd1b	806
xorjoep	1:24714b45cd1b	807	while (k > 0U)
xorjoep	1:24714b45cd1b	808	{
xorjoep	1:24714b45cd1b	809	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	810	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	811	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	812	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	813	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	814
xorjoep	1:24714b45cd1b	815	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	816	k--;
xorjoep	1:24714b45cd1b	817	}
xorjoep	1:24714b45cd1b	818
xorjoep	1:24714b45cd1b	819	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	820	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	821	k = count % 0x4U;
xorjoep	1:24714b45cd1b	822
xorjoep	1:24714b45cd1b	823	while (k > 0U)
xorjoep	1:24714b45cd1b	824	{
xorjoep	1:24714b45cd1b	825	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	826	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	827
xorjoep	1:24714b45cd1b	828	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	829	k--;
xorjoep	1:24714b45cd1b	830	}
xorjoep	1:24714b45cd1b	831
xorjoep	1:24714b45cd1b	832	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	833	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	834
xorjoep	1:24714b45cd1b	835	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	836	py = pIn2 + (count - 1U);
xorjoep	1:24714b45cd1b	837	px = pIn1;
xorjoep	1:24714b45cd1b	838
xorjoep	1:24714b45cd1b	839	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	840	count++;
xorjoep	1:24714b45cd1b	841
xorjoep	1:24714b45cd1b	842	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	843	blockSize1--;
xorjoep	1:24714b45cd1b	844	}
xorjoep	1:24714b45cd1b	845
xorjoep	1:24714b45cd1b	846	/* --------------------------
xorjoep	1:24714b45cd1b	847	* Initializations of stage2
xorjoep	1:24714b45cd1b	848	* ------------------------*/
xorjoep	1:24714b45cd1b	849
xorjoep	1:24714b45cd1b	850	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep	1:24714b45cd1b	851	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep	1:24714b45cd1b	852	* ....
xorjoep	1:24714b45cd1b	853	* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	854	*/
xorjoep	1:24714b45cd1b	855
xorjoep	1:24714b45cd1b	856	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	857	px = pIn1;
xorjoep	1:24714b45cd1b	858
xorjoep	1:24714b45cd1b	859	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	860	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	861	py = pSrc2;
xorjoep	1:24714b45cd1b	862
xorjoep	1:24714b45cd1b	863	/* count is the index by which the pointer pIn1 is to be incremented */
xorjoep	1:24714b45cd1b	864	count = 0U;
xorjoep	1:24714b45cd1b	865
xorjoep	1:24714b45cd1b	866
xorjoep	1:24714b45cd1b	867	/* --------------------
xorjoep	1:24714b45cd1b	868	* Stage2 process
xorjoep	1:24714b45cd1b	869	* -------------------*/
xorjoep	1:24714b45cd1b	870
xorjoep	1:24714b45cd1b	871	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	872	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	873	* srcBLen should be greater than or equal to 4 */
xorjoep	1:24714b45cd1b	874	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	875	{
xorjoep	1:24714b45cd1b	876	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	877	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	878
xorjoep	1:24714b45cd1b	879	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	880	{
xorjoep	1:24714b45cd1b	881	py = py - 1U;
xorjoep	1:24714b45cd1b	882
xorjoep	1:24714b45cd1b	883	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	884	acc0 = 0;
xorjoep	1:24714b45cd1b	885	acc1 = 0;
xorjoep	1:24714b45cd1b	886	acc2 = 0;
xorjoep	1:24714b45cd1b	887	acc3 = 0;
xorjoep	1:24714b45cd1b	888
xorjoep	1:24714b45cd1b	889	/* read x[0], x[1] samples */
xorjoep	1:24714b45cd1b	890	a = *px++;
xorjoep	1:24714b45cd1b	891	b = *px++;
xorjoep	1:24714b45cd1b	892
xorjoep	1:24714b45cd1b	893	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	894
xorjoep	1:24714b45cd1b	895	x0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	896	a = *px;
xorjoep	1:24714b45cd1b	897	x1 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	898
xorjoep	1:24714b45cd1b	899	#else
xorjoep	1:24714b45cd1b	900
xorjoep	1:24714b45cd1b	901	x0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	902	a = *px;
xorjoep	1:24714b45cd1b	903	x1 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	904
xorjoep	1:24714b45cd1b	905	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	906
xorjoep	1:24714b45cd1b	907	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	908	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	909
xorjoep	1:24714b45cd1b	910	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	911	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	912	do
xorjoep	1:24714b45cd1b	913	{
xorjoep	1:24714b45cd1b	914	/* Read the last two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	915	* y[srcBLen - 1] and y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	916	a = *py;
xorjoep	1:24714b45cd1b	917	b = *(py+1);
xorjoep	1:24714b45cd1b	918	py -= 2;
xorjoep	1:24714b45cd1b	919
xorjoep	1:24714b45cd1b	920	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	921
xorjoep	1:24714b45cd1b	922	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	923
xorjoep	1:24714b45cd1b	924	#else
xorjoep	1:24714b45cd1b	925
xorjoep	1:24714b45cd1b	926	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	927
xorjoep	1:24714b45cd1b	928	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	929
xorjoep	1:24714b45cd1b	930	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	931	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	932
xorjoep	1:24714b45cd1b	933	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	934	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	935
xorjoep	1:24714b45cd1b	936	a = *px;
xorjoep	1:24714b45cd1b	937	b = *(px + 1);
xorjoep	1:24714b45cd1b	938
xorjoep	1:24714b45cd1b	939	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	940
xorjoep	1:24714b45cd1b	941	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	942	a = *(px + 2);
xorjoep	1:24714b45cd1b	943	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	944
xorjoep	1:24714b45cd1b	945	#else
xorjoep	1:24714b45cd1b	946
xorjoep	1:24714b45cd1b	947	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	948	a = *(px + 2);
xorjoep	1:24714b45cd1b	949	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	950
xorjoep	1:24714b45cd1b	951	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	952
xorjoep	1:24714b45cd1b	953	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	954	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	955
xorjoep	1:24714b45cd1b	956	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	957	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	958
xorjoep	1:24714b45cd1b	959	/* Read y[srcBLen - 3] and y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	960	a = *py;
xorjoep	1:24714b45cd1b	961	b = *(py+1);
xorjoep	1:24714b45cd1b	962	py -= 2;
xorjoep	1:24714b45cd1b	963
xorjoep	1:24714b45cd1b	964	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	965
xorjoep	1:24714b45cd1b	966	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	967
xorjoep	1:24714b45cd1b	968	#else
xorjoep	1:24714b45cd1b	969
xorjoep	1:24714b45cd1b	970	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	971
xorjoep	1:24714b45cd1b	972	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	973
xorjoep	1:24714b45cd1b	974	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	975	acc0 = __SMLADX(x2, c0, acc0);
xorjoep	1:24714b45cd1b	976
xorjoep	1:24714b45cd1b	977	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	978	acc1 = __SMLADX(x3, c0, acc1);
xorjoep	1:24714b45cd1b	979
xorjoep	1:24714b45cd1b	980	/* Read x[4], x[5], x[6] */
xorjoep	1:24714b45cd1b	981	a = *(px + 2);
xorjoep	1:24714b45cd1b	982	b = *(px + 3);
xorjoep	1:24714b45cd1b	983
xorjoep	1:24714b45cd1b	984	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	985
xorjoep	1:24714b45cd1b	986	x0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	987	a = *(px + 4);
xorjoep	1:24714b45cd1b	988	x1 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	989
xorjoep	1:24714b45cd1b	990	#else
xorjoep	1:24714b45cd1b	991
xorjoep	1:24714b45cd1b	992	x0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	993	a = *(px + 4);
xorjoep	1:24714b45cd1b	994	x1 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	995
xorjoep	1:24714b45cd1b	996	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	997
xorjoep	1:24714b45cd1b	998	px += 4U;
xorjoep	1:24714b45cd1b	999
xorjoep	1:24714b45cd1b	1000	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	1001	acc2 = __SMLADX(x0, c0, acc2);
xorjoep	1:24714b45cd1b	1002
xorjoep	1:24714b45cd1b	1003	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	1004	acc3 = __SMLADX(x1, c0, acc3);
xorjoep	1:24714b45cd1b	1005
xorjoep	1:24714b45cd1b	1006	} while (--k);
xorjoep	1:24714b45cd1b	1007
xorjoep	1:24714b45cd1b	1008	/* For the next MAC operations, SIMD is not used
xorjoep	1:24714b45cd1b	1009	* So, the 16 bit pointer of inputB, py is updated */
xorjoep	1:24714b45cd1b	1010
xorjoep	1:24714b45cd1b	1011	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1012	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1013	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	1014
xorjoep	1:24714b45cd1b	1015	if (k == 1U)
xorjoep	1:24714b45cd1b	1016	{
xorjoep	1:24714b45cd1b	1017	/* Read y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	1018	c0 = *(py+1);
xorjoep	1:24714b45cd1b	1019
xorjoep	1:24714b45cd1b	1020	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1021
xorjoep	1:24714b45cd1b	1022	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	1023
xorjoep	1:24714b45cd1b	1024	#else
xorjoep	1:24714b45cd1b	1025
xorjoep	1:24714b45cd1b	1026	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	1027
xorjoep	1:24714b45cd1b	1028	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1029
xorjoep	1:24714b45cd1b	1030	/* Read x[7] */
xorjoep	1:24714b45cd1b	1031	a = *px;
xorjoep	1:24714b45cd1b	1032	b = *(px+1);
xorjoep	1:24714b45cd1b	1033	px++;
xorjoep	1:24714b45cd1b	1034
xorjoep	1:24714b45cd1b	1035	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1036
xorjoep	1:24714b45cd1b	1037	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1038
xorjoep	1:24714b45cd1b	1039	#else
xorjoep	1:24714b45cd1b	1040
xorjoep	1:24714b45cd1b	1041	x3 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1042
xorjoep	1:24714b45cd1b	1043	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1044
xorjoep	1:24714b45cd1b	1045
xorjoep	1:24714b45cd1b	1046	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1047	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1048	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1049	acc2 = __SMLADX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	1050	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	1051	}
xorjoep	1:24714b45cd1b	1052
xorjoep	1:24714b45cd1b	1053	if (k == 2U)
xorjoep	1:24714b45cd1b	1054	{
xorjoep	1:24714b45cd1b	1055	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	1056	a = *py;
xorjoep	1:24714b45cd1b	1057	b = *(py+1);
xorjoep	1:24714b45cd1b	1058
xorjoep	1:24714b45cd1b	1059	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1060
xorjoep	1:24714b45cd1b	1061	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1062
xorjoep	1:24714b45cd1b	1063	#else
xorjoep	1:24714b45cd1b	1064
xorjoep	1:24714b45cd1b	1065	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1066
xorjoep	1:24714b45cd1b	1067	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1068
xorjoep	1:24714b45cd1b	1069	/* Read x[7], x[8], x[9] */
xorjoep	1:24714b45cd1b	1070	a = *px;
xorjoep	1:24714b45cd1b	1071	b = *(px + 1);
xorjoep	1:24714b45cd1b	1072
xorjoep	1:24714b45cd1b	1073	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1074
xorjoep	1:24714b45cd1b	1075	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1076	a = *(px + 2);
xorjoep	1:24714b45cd1b	1077	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1078
xorjoep	1:24714b45cd1b	1079	#else
xorjoep	1:24714b45cd1b	1080
xorjoep	1:24714b45cd1b	1081	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1082	a = *(px + 2);
xorjoep	1:24714b45cd1b	1083	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1084
xorjoep	1:24714b45cd1b	1085	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1086	px += 2U;
xorjoep	1:24714b45cd1b	1087
xorjoep	1:24714b45cd1b	1088	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1089	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1090	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1091	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	1092	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	1093	}
xorjoep	1:24714b45cd1b	1094
xorjoep	1:24714b45cd1b	1095	if (k == 3U)
xorjoep	1:24714b45cd1b	1096	{
xorjoep	1:24714b45cd1b	1097	/* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep	1:24714b45cd1b	1098	a = *py;
xorjoep	1:24714b45cd1b	1099	b = *(py+1);
xorjoep	1:24714b45cd1b	1100
xorjoep	1:24714b45cd1b	1101	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1102
xorjoep	1:24714b45cd1b	1103	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1104
xorjoep	1:24714b45cd1b	1105	#else
xorjoep	1:24714b45cd1b	1106
xorjoep	1:24714b45cd1b	1107	c0 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1108
xorjoep	1:24714b45cd1b	1109	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1110
xorjoep	1:24714b45cd1b	1111	/* Read x[7], x[8], x[9] */
xorjoep	1:24714b45cd1b	1112	a = *px;
xorjoep	1:24714b45cd1b	1113	b = *(px + 1);
xorjoep	1:24714b45cd1b	1114
xorjoep	1:24714b45cd1b	1115	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1116
xorjoep	1:24714b45cd1b	1117	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1118	a = *(px + 2);
xorjoep	1:24714b45cd1b	1119	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1120
xorjoep	1:24714b45cd1b	1121	#else
xorjoep	1:24714b45cd1b	1122
xorjoep	1:24714b45cd1b	1123	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1124	a = *(px + 2);
xorjoep	1:24714b45cd1b	1125	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1126
xorjoep	1:24714b45cd1b	1127	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1128
xorjoep	1:24714b45cd1b	1129	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1130	acc0 = __SMLADX(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1131	acc1 = __SMLADX(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1132	acc2 = __SMLADX(x3, c0, acc2);
xorjoep	1:24714b45cd1b	1133	acc3 = __SMLADX(x2, c0, acc3);
xorjoep	1:24714b45cd1b	1134
xorjoep	1:24714b45cd1b	1135	/* Read y[srcBLen - 7] */
xorjoep	1:24714b45cd1b	1136	c0 = *(py-1);
xorjoep	1:24714b45cd1b	1137	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1138
xorjoep	1:24714b45cd1b	1139	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	1140	#else
xorjoep	1:24714b45cd1b	1141
xorjoep	1:24714b45cd1b	1142	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	1143	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1144
xorjoep	1:24714b45cd1b	1145	/* Read x[10] */
xorjoep	1:24714b45cd1b	1146	a = *(px+2);
xorjoep	1:24714b45cd1b	1147	b = *(px+3);
xorjoep	1:24714b45cd1b	1148
xorjoep	1:24714b45cd1b	1149	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1150
xorjoep	1:24714b45cd1b	1151	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1152
xorjoep	1:24714b45cd1b	1153	#else
xorjoep	1:24714b45cd1b	1154
xorjoep	1:24714b45cd1b	1155	x3 = __PKHBT(b, a, 16);;
xorjoep	1:24714b45cd1b	1156
xorjoep	1:24714b45cd1b	1157	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1158
xorjoep	1:24714b45cd1b	1159	px += 3U;
xorjoep	1:24714b45cd1b	1160
xorjoep	1:24714b45cd1b	1161	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1162	acc0 = __SMLADX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	1163	acc1 = __SMLAD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	1164	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	1165	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	1166	}
xorjoep	1:24714b45cd1b	1167
xorjoep	1:24714b45cd1b	1168	/* Store the results in the accumulators in the destination buffer. */
xorjoep	1:24714b45cd1b	1169	*pOut++ = (q15_t)(acc0 >> 15);
xorjoep	1:24714b45cd1b	1170	*pOut++ = (q15_t)(acc1 >> 15);
xorjoep	1:24714b45cd1b	1171	*pOut++ = (q15_t)(acc2 >> 15);
xorjoep	1:24714b45cd1b	1172	*pOut++ = (q15_t)(acc3 >> 15);
xorjoep	1:24714b45cd1b	1173
xorjoep	1:24714b45cd1b	1174	/* Increment the pointer pIn1 index, count by 4 */
xorjoep	1:24714b45cd1b	1175	count += 4U;
xorjoep	1:24714b45cd1b	1176
xorjoep	1:24714b45cd1b	1177	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1178	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1179	py = pSrc2;
xorjoep	1:24714b45cd1b	1180
xorjoep	1:24714b45cd1b	1181	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1182	blkCnt--;
xorjoep	1:24714b45cd1b	1183	}
xorjoep	1:24714b45cd1b	1184
xorjoep	1:24714b45cd1b	1185	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	1186	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1187	blkCnt = blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	1188
xorjoep	1:24714b45cd1b	1189	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	1190	{
xorjoep	1:24714b45cd1b	1191	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1192	sum = 0;
xorjoep	1:24714b45cd1b	1193
xorjoep	1:24714b45cd1b	1194	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	1195	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	1196
xorjoep	1:24714b45cd1b	1197	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	1198	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	1199	while (k > 0U)
xorjoep	1:24714b45cd1b	1200	{
xorjoep	1:24714b45cd1b	1201	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1202	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1203	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1204	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1205	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1206
xorjoep	1:24714b45cd1b	1207	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1208	k--;
xorjoep	1:24714b45cd1b	1209	}
xorjoep	1:24714b45cd1b	1210
xorjoep	1:24714b45cd1b	1211	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1212	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1213	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	1214
xorjoep	1:24714b45cd1b	1215	while (k > 0U)
xorjoep	1:24714b45cd1b	1216	{
xorjoep	1:24714b45cd1b	1217	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1218	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1219
xorjoep	1:24714b45cd1b	1220	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1221	k--;
xorjoep	1:24714b45cd1b	1222	}
xorjoep	1:24714b45cd1b	1223
xorjoep	1:24714b45cd1b	1224	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1225	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1226
xorjoep	1:24714b45cd1b	1227	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	1228	count++;
xorjoep	1:24714b45cd1b	1229
xorjoep	1:24714b45cd1b	1230	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1231	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1232	py = pSrc2;
xorjoep	1:24714b45cd1b	1233
xorjoep	1:24714b45cd1b	1234	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1235	blkCnt--;
xorjoep	1:24714b45cd1b	1236	}
xorjoep	1:24714b45cd1b	1237	}
xorjoep	1:24714b45cd1b	1238	else
xorjoep	1:24714b45cd1b	1239	{
xorjoep	1:24714b45cd1b	1240	/* If srcBLen is less than 4,
xorjoep	1:24714b45cd1b	1241	* the blockSize2 loop is not unrolled by 4 */
xorjoep	1:24714b45cd1b	1242	blkCnt = blockSize2;
xorjoep	1:24714b45cd1b	1243
xorjoep	1:24714b45cd1b	1244	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	1245	{
xorjoep	1:24714b45cd1b	1246	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1247	sum = 0;
xorjoep	1:24714b45cd1b	1248
xorjoep	1:24714b45cd1b	1249	/* srcBLen number of MACS should be performed */
xorjoep	1:24714b45cd1b	1250	k = srcBLen;
xorjoep	1:24714b45cd1b	1251
xorjoep	1:24714b45cd1b	1252	while (k > 0U)
xorjoep	1:24714b45cd1b	1253	{
xorjoep	1:24714b45cd1b	1254	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	1255	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1256
xorjoep	1:24714b45cd1b	1257	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1258	k--;
xorjoep	1:24714b45cd1b	1259	}
xorjoep	1:24714b45cd1b	1260
xorjoep	1:24714b45cd1b	1261	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1262	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1263
xorjoep	1:24714b45cd1b	1264	/* Increment the pIn1 offset (count) by 1 so the next output starts one sample later */
xorjoep	1:24714b45cd1b	1265	count++;
xorjoep	1:24714b45cd1b	1266
xorjoep	1:24714b45cd1b	1267	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1268	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1269	py = pSrc2;
xorjoep	1:24714b45cd1b	1270
xorjoep	1:24714b45cd1b	1271	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1272	blkCnt--;
xorjoep	1:24714b45cd1b	1273	}
xorjoep	1:24714b45cd1b	1274	}
xorjoep	1:24714b45cd1b	1275
xorjoep	1:24714b45cd1b	1276
xorjoep	1:24714b45cd1b	1277	/* --------------------------
xorjoep	1:24714b45cd1b	1278	* Initializations of stage3
xorjoep	1:24714b45cd1b	1279	* -------------------------*/
xorjoep	1:24714b45cd1b	1280
xorjoep	1:24714b45cd1b	1281	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	1282	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep	1:24714b45cd1b	1283	* ....
xorjoep	1:24714b45cd1b	1284	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep	1:24714b45cd1b	1285	* sum += x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	1286	*/
xorjoep	1:24714b45cd1b	1287
xorjoep	1:24714b45cd1b	1288	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	1289	The blockSize3 variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	1290
xorjoep	1:24714b45cd1b	1291	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	1292	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	1293	px = pSrc1;
xorjoep	1:24714b45cd1b	1294
xorjoep	1:24714b45cd1b	1295	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	1296	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	1297	pIn2 = pSrc2 - 1U;
xorjoep	1:24714b45cd1b	1298	py = pIn2;
xorjoep	1:24714b45cd1b	1299
xorjoep	1:24714b45cd1b	1300	/* -------------------
xorjoep	1:24714b45cd1b	1301	* Stage3 process
xorjoep	1:24714b45cd1b	1302	* ------------------*/
xorjoep	1:24714b45cd1b	1303
xorjoep	1:24714b45cd1b	1304	/* For loop unrolling by 4, this stage is divided into two. */
xorjoep	1:24714b45cd1b	1305	/* First part of this stage computes the MAC operations greater than 4 */
xorjoep	1:24714b45cd1b	1306	/* Second part of this stage computes the MAC operations less than or equal to 4 */
xorjoep	1:24714b45cd1b	1307
xorjoep	1:24714b45cd1b	1308	/* The first part of the stage starts here */
xorjoep	1:24714b45cd1b	1309	j = blockSize3 >> 2U;
xorjoep	1:24714b45cd1b	1310
xorjoep	1:24714b45cd1b	1311	while ((j > 0U) && (blockSize3 > 0U))
xorjoep	1:24714b45cd1b	1312	{
xorjoep	1:24714b45cd1b	1313	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1314	sum = 0;
xorjoep	1:24714b45cd1b	1315
xorjoep	1:24714b45cd1b	1316	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	1317	k = blockSize3 >> 2U;
xorjoep	1:24714b45cd1b	1318
xorjoep	1:24714b45cd1b	1319	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	1320	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	1321	py++;
xorjoep	1:24714b45cd1b	1322
xorjoep	1:24714b45cd1b	1323	while (k > 0U)
xorjoep	1:24714b45cd1b	1324	{
xorjoep	1:24714b45cd1b	1325	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1326	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1327	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1328	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1329	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1330	k--;
xorjoep	1:24714b45cd1b	1331	}
xorjoep	1:24714b45cd1b	1332
xorjoep	1:24714b45cd1b	1333	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1334	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1335	k = blockSize3 % 0x4U;
xorjoep	1:24714b45cd1b	1336
xorjoep	1:24714b45cd1b	1337	while (k > 0U)
xorjoep	1:24714b45cd1b	1338	{
xorjoep	1:24714b45cd1b	1339	/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	1340	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1341
xorjoep	1:24714b45cd1b	1342	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1343	k--;
xorjoep	1:24714b45cd1b	1344	}
xorjoep	1:24714b45cd1b	1345
xorjoep	1:24714b45cd1b	1346	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1347	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1348
xorjoep	1:24714b45cd1b	1349	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1350	px = ++pSrc1;
xorjoep	1:24714b45cd1b	1351	py = pIn2;
xorjoep	1:24714b45cd1b	1352
xorjoep	1:24714b45cd1b	1353	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1354	blockSize3--;
xorjoep	1:24714b45cd1b	1355
xorjoep	1:24714b45cd1b	1356	j--;
xorjoep	1:24714b45cd1b	1357	}
xorjoep	1:24714b45cd1b	1358
xorjoep	1:24714b45cd1b	1359	/* The second part of the stage starts here */
xorjoep	1:24714b45cd1b	1360	/* SIMD is not used for the next MAC operations,
xorjoep	1:24714b45cd1b	1361	* so pointer py is updated to read only one sample at a time */
xorjoep	1:24714b45cd1b	1362	py = py + 1U;
xorjoep	1:24714b45cd1b	1363
xorjoep	1:24714b45cd1b	1364	while (blockSize3 > 0U)
xorjoep	1:24714b45cd1b	1365	{
xorjoep	1:24714b45cd1b	1366	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1367	sum = 0;
xorjoep	1:24714b45cd1b	1368
xorjoep	1:24714b45cd1b	1369	/* No loop unrolling here: the remaining blockSize3 MACs are computed one at a time. */
xorjoep	1:24714b45cd1b	1370	k = blockSize3;
xorjoep	1:24714b45cd1b	1371
xorjoep	1:24714b45cd1b	1372	while (k > 0U)
xorjoep	1:24714b45cd1b	1373	{
xorjoep	1:24714b45cd1b	1374	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1375	/* sum += x[srcALen-1] * y[srcBLen-1] */
xorjoep	1:24714b45cd1b	1376	sum += ((q31_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	1377
xorjoep	1:24714b45cd1b	1378	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1379	k--;
xorjoep	1:24714b45cd1b	1380	}
xorjoep	1:24714b45cd1b	1381
xorjoep	1:24714b45cd1b	1382	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1383	*pOut++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1384
xorjoep	1:24714b45cd1b	1385	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1386	px = ++pSrc1;
xorjoep	1:24714b45cd1b	1387	py = pSrc2;
xorjoep	1:24714b45cd1b	1388
xorjoep	1:24714b45cd1b	1389	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1390	blockSize3--;
xorjoep	1:24714b45cd1b	1391	}
xorjoep	1:24714b45cd1b	1392
xorjoep	1:24714b45cd1b	1393	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
xorjoep	1:24714b45cd1b	1394	}
xorjoep	1:24714b45cd1b	1395
xorjoep	1:24714b45cd1b	1396	/**
xorjoep	1:24714b45cd1b	1397	* @} end of Conv group
xorjoep	1:24714b45cd1b	1398	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/FilteringFunctions/arm_conv_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning