CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/FilteringFunctions/arm_conv_q7.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_conv_q7.c
xorjoep	1:24714b45cd1b	4	* Description: Convolution of Q7 sequences
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupFilters
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup Conv
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40	/**
xorjoep	1:24714b45cd1b	41	* @brief Convolution of Q7 sequences.
xorjoep	1:24714b45cd1b	42	* @param[in] *pSrcA points to the first input sequence.
xorjoep	1:24714b45cd1b	43	* @param[in] srcALen length of the first input sequence.
xorjoep	1:24714b45cd1b	44	* @param[in] *pSrcB points to the second input sequence.
xorjoep	1:24714b45cd1b	45	* @param[in] srcBLen length of the second input sequence.
xorjoep	1:24714b45cd1b	46	* @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
xorjoep	1:24714b45cd1b	47	* @return none.
xorjoep	1:24714b45cd1b	48	*
xorjoep	1:24714b45cd1b	49	* @details
xorjoep	1:24714b45cd1b	50	* <b>Scaling and Overflow Behavior:</b>
xorjoep	1:24714b45cd1b	51	*
xorjoep	1:24714b45cd1b	52	* \par
xorjoep	1:24714b45cd1b	53	* The function is implemented using a 32-bit internal accumulator.
xorjoep	1:24714b45cd1b	54	* Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
xorjoep	1:24714b45cd1b	55	* The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
xorjoep	1:24714b45cd1b	56	* This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
xorjoep	1:24714b45cd1b	57	* The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
xorjoep	1:24714b45cd1b	58	*
xorjoep	1:24714b45cd1b	59	* \par
xorjoep	1:24714b45cd1b	60	* Refer to the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
xorjoep	1:24714b45cd1b	61	*
xorjoep	1:24714b45cd1b	62	*/
xorjoep	1:24714b45cd1b	63
xorjoep	1:24714b45cd1b	64	void arm_conv_q7(
xorjoep	1:24714b45cd1b	65	q7_t * pSrcA,
xorjoep	1:24714b45cd1b	66	uint32_t srcALen,
xorjoep	1:24714b45cd1b	67	q7_t * pSrcB,
xorjoep	1:24714b45cd1b	68	uint32_t srcBLen,
xorjoep	1:24714b45cd1b	69	q7_t * pDst)
xorjoep	1:24714b45cd1b	70	{
xorjoep	1:24714b45cd1b	71
xorjoep	1:24714b45cd1b	72
xorjoep	1:24714b45cd1b	73	#if defined (ARM_MATH_DSP)
xorjoep	1:24714b45cd1b	74
xorjoep	1:24714b45cd1b	75	/* Run the below code for Cortex-M4 and Cortex-M3 */
xorjoep	1:24714b45cd1b	76
xorjoep	1:24714b45cd1b	77	q7_t pIn1; / inputA pointer */
xorjoep	1:24714b45cd1b	78	q7_t pIn2; / inputB pointer */
xorjoep	1:24714b45cd1b	79	q7_t pOut = pDst; / output pointer */
xorjoep	1:24714b45cd1b	80	q7_t px; / Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	81	q7_t py; / Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	82	q7_t pSrc1, pSrc2; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	83	q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
xorjoep	1:24714b45cd1b	84	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep	1:24714b45cd1b	85	q31_t input1, input2; /* Temporary input variables */
xorjoep	1:24714b45cd1b	86	q15_t in1, in2; /* Temporary input variables */
xorjoep	1:24714b45cd1b	87	uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
xorjoep	1:24714b45cd1b	88
xorjoep	1:24714b45cd1b	89	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	90	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	91	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	92	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	93	{
xorjoep	1:24714b45cd1b	94	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	95	pIn1 = pSrcA;
xorjoep	1:24714b45cd1b	96
xorjoep	1:24714b45cd1b	97	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	98	pIn2 = pSrcB;
xorjoep	1:24714b45cd1b	99	}
xorjoep	1:24714b45cd1b	100	else
xorjoep	1:24714b45cd1b	101	{
xorjoep	1:24714b45cd1b	102	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	103	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	104
xorjoep	1:24714b45cd1b	105	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	106	pIn2 = pSrcA;
xorjoep	1:24714b45cd1b	107
xorjoep	1:24714b45cd1b	108	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	109	j = srcBLen;
xorjoep	1:24714b45cd1b	110	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	111	srcALen = j;
xorjoep	1:24714b45cd1b	112	}
xorjoep	1:24714b45cd1b	113
xorjoep	1:24714b45cd1b	114	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep	1:24714b45cd1b	115	/* The function is internally
xorjoep	1:24714b45cd1b	116	* divided into three stages according to the number of multiplications that has to be
xorjoep	1:24714b45cd1b	117	* taken place between inputA samples and inputB samples. In the first stage of the
xorjoep	1:24714b45cd1b	118	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	119	* In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	120	* In the third stage of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	121	* for every iteration. */
xorjoep	1:24714b45cd1b	122
xorjoep	1:24714b45cd1b	123	/* The algorithm is implemented in three stages.
xorjoep	1:24714b45cd1b	124	The loop counters of each stage is initiated here. */
xorjoep	1:24714b45cd1b	125	blockSize1 = srcBLen - 1U;
xorjoep	1:24714b45cd1b	126	blockSize2 = (srcALen - srcBLen) + 1U;
xorjoep	1:24714b45cd1b	127	blockSize3 = blockSize1;
xorjoep	1:24714b45cd1b	128
xorjoep	1:24714b45cd1b	129	/* --------------------------
xorjoep	1:24714b45cd1b	130	* Initializations of stage1
xorjoep	1:24714b45cd1b	131	* -------------------------*/
xorjoep	1:24714b45cd1b	132
xorjoep	1:24714b45cd1b	133	/* sum = x[0] * y[0]
xorjoep	1:24714b45cd1b	134	* sum = x[0] * y[1] + x[1] * y[0]
xorjoep	1:24714b45cd1b	135	* ....
xorjoep	1:24714b45cd1b	136	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep	1:24714b45cd1b	137	*/
xorjoep	1:24714b45cd1b	138
xorjoep	1:24714b45cd1b	139	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	140	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	141	count = 1U;
xorjoep	1:24714b45cd1b	142
xorjoep	1:24714b45cd1b	143	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	144	px = pIn1;
xorjoep	1:24714b45cd1b	145
xorjoep	1:24714b45cd1b	146	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	147	py = pIn2;
xorjoep	1:24714b45cd1b	148
xorjoep	1:24714b45cd1b	149
xorjoep	1:24714b45cd1b	150	/* ------------------------
xorjoep	1:24714b45cd1b	151	* Stage1 process
xorjoep	1:24714b45cd1b	152	* ----------------------*/
xorjoep	1:24714b45cd1b	153
xorjoep	1:24714b45cd1b	154	/* The first stage starts here */
xorjoep	1:24714b45cd1b	155	while (blockSize1 > 0U)
xorjoep	1:24714b45cd1b	156	{
xorjoep	1:24714b45cd1b	157	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	158	sum = 0;
xorjoep	1:24714b45cd1b	159
xorjoep	1:24714b45cd1b	160	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	161	k = count >> 2U;
xorjoep	1:24714b45cd1b	162
xorjoep	1:24714b45cd1b	163	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	164	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	165	while (k > 0U)
xorjoep	1:24714b45cd1b	166	{
xorjoep	1:24714b45cd1b	167	/* x[0] , x[1] */
xorjoep	1:24714b45cd1b	168	in1 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	169	in2 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	170	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	171
xorjoep	1:24714b45cd1b	172	/* y[srcBLen - 1] , y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	173	in1 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	174	in2 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	175	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	176
xorjoep	1:24714b45cd1b	177	/* x[0] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	178	/* x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	179	sum = __SMLAD(input1, input2, sum);
xorjoep	1:24714b45cd1b	180
xorjoep	1:24714b45cd1b	181	/* x[2] , x[3] */
xorjoep	1:24714b45cd1b	182	in1 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	183	in2 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	184	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	185
xorjoep	1:24714b45cd1b	186	/* y[srcBLen - 3] , y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	187	in1 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	188	in2 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	189	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	190
xorjoep	1:24714b45cd1b	191	/* x[2] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	192	/* x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	193	sum = __SMLAD(input1, input2, sum);
xorjoep	1:24714b45cd1b	194
xorjoep	1:24714b45cd1b	195	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	196	k--;
xorjoep	1:24714b45cd1b	197	}
xorjoep	1:24714b45cd1b	198
xorjoep	1:24714b45cd1b	199	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	200	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	201	k = count % 0x4U;
xorjoep	1:24714b45cd1b	202
xorjoep	1:24714b45cd1b	203	while (k > 0U)
xorjoep	1:24714b45cd1b	204	{
xorjoep	1:24714b45cd1b	205	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	206	sum += ((q15_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	207
xorjoep	1:24714b45cd1b	208	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	209	k--;
xorjoep	1:24714b45cd1b	210	}
xorjoep	1:24714b45cd1b	211
xorjoep	1:24714b45cd1b	212	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	213	*pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
xorjoep	1:24714b45cd1b	214
xorjoep	1:24714b45cd1b	215	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	216	py = pIn2 + count;
xorjoep	1:24714b45cd1b	217	px = pIn1;
xorjoep	1:24714b45cd1b	218
xorjoep	1:24714b45cd1b	219	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	220	count++;
xorjoep	1:24714b45cd1b	221
xorjoep	1:24714b45cd1b	222	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	223	blockSize1--;
xorjoep	1:24714b45cd1b	224	}
xorjoep	1:24714b45cd1b	225
xorjoep	1:24714b45cd1b	226	/* --------------------------
xorjoep	1:24714b45cd1b	227	* Initializations of stage2
xorjoep	1:24714b45cd1b	228	* ------------------------*/
xorjoep	1:24714b45cd1b	229
xorjoep	1:24714b45cd1b	230	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep	1:24714b45cd1b	231	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep	1:24714b45cd1b	232	* ....
xorjoep	1:24714b45cd1b	233	* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	234	*/
xorjoep	1:24714b45cd1b	235
xorjoep	1:24714b45cd1b	236	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	237	px = pIn1;
xorjoep	1:24714b45cd1b	238
xorjoep	1:24714b45cd1b	239	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	240	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	241	py = pSrc2;
xorjoep	1:24714b45cd1b	242
xorjoep	1:24714b45cd1b	243	/* count is index by which the pointer pIn1 to be incremented */
xorjoep	1:24714b45cd1b	244	count = 0U;
xorjoep	1:24714b45cd1b	245
xorjoep	1:24714b45cd1b	246	/* -------------------
xorjoep	1:24714b45cd1b	247	* Stage2 process
xorjoep	1:24714b45cd1b	248	* ------------------*/
xorjoep	1:24714b45cd1b	249
xorjoep	1:24714b45cd1b	250	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	251	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	252	* srcBLen should be greater than or equal to 4 */
xorjoep	1:24714b45cd1b	253	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	254	{
xorjoep	1:24714b45cd1b	255	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	256	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	257
xorjoep	1:24714b45cd1b	258	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	259	{
xorjoep	1:24714b45cd1b	260	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	261	acc0 = 0;
xorjoep	1:24714b45cd1b	262	acc1 = 0;
xorjoep	1:24714b45cd1b	263	acc2 = 0;
xorjoep	1:24714b45cd1b	264	acc3 = 0;
xorjoep	1:24714b45cd1b	265
xorjoep	1:24714b45cd1b	266	/* read x[0], x[1], x[2] samples */
xorjoep	1:24714b45cd1b	267	x0 = *(px++);
xorjoep	1:24714b45cd1b	268	x1 = *(px++);
xorjoep	1:24714b45cd1b	269	x2 = *(px++);
xorjoep	1:24714b45cd1b	270
xorjoep	1:24714b45cd1b	271	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	272	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	273
xorjoep	1:24714b45cd1b	274	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	275	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	276	do
xorjoep	1:24714b45cd1b	277	{
xorjoep	1:24714b45cd1b	278	/* Read y[srcBLen - 1] sample */
xorjoep	1:24714b45cd1b	279	c0 = *(py--);
xorjoep	1:24714b45cd1b	280	/* Read y[srcBLen - 2] sample */
xorjoep	1:24714b45cd1b	281	c1 = *(py--);
xorjoep	1:24714b45cd1b	282
xorjoep	1:24714b45cd1b	283	/* Read x[3] sample */
xorjoep	1:24714b45cd1b	284	x3 = *(px++);
xorjoep	1:24714b45cd1b	285
xorjoep	1:24714b45cd1b	286	/* x[0] and x[1] are packed */
xorjoep	1:24714b45cd1b	287	in1 = (q15_t) x0;
xorjoep	1:24714b45cd1b	288	in2 = (q15_t) x1;
xorjoep	1:24714b45cd1b	289
xorjoep	1:24714b45cd1b	290	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	291
xorjoep	1:24714b45cd1b	292	/* y[srcBLen - 1] and y[srcBLen - 2] are packed */
xorjoep	1:24714b45cd1b	293	in1 = (q15_t) c0;
xorjoep	1:24714b45cd1b	294	in2 = (q15_t) c1;
xorjoep	1:24714b45cd1b	295
xorjoep	1:24714b45cd1b	296	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	297
xorjoep	1:24714b45cd1b	298	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	299	acc0 = __SMLAD(input1, input2, acc0);
xorjoep	1:24714b45cd1b	300
xorjoep	1:24714b45cd1b	301	/* x[1] and x[2] are packed */
xorjoep	1:24714b45cd1b	302	in1 = (q15_t) x1;
xorjoep	1:24714b45cd1b	303	in2 = (q15_t) x2;
xorjoep	1:24714b45cd1b	304
xorjoep	1:24714b45cd1b	305	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	306
xorjoep	1:24714b45cd1b	307	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	308	acc1 = __SMLAD(input1, input2, acc1);
xorjoep	1:24714b45cd1b	309
xorjoep	1:24714b45cd1b	310	/* x[2] and x[3] are packed */
xorjoep	1:24714b45cd1b	311	in1 = (q15_t) x2;
xorjoep	1:24714b45cd1b	312	in2 = (q15_t) x3;
xorjoep	1:24714b45cd1b	313
xorjoep	1:24714b45cd1b	314	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	315
xorjoep	1:24714b45cd1b	316	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	317	acc2 = __SMLAD(input1, input2, acc2);
xorjoep	1:24714b45cd1b	318
xorjoep	1:24714b45cd1b	319	/* Read x[4] sample */
xorjoep	1:24714b45cd1b	320	x0 = *(px++);
xorjoep	1:24714b45cd1b	321
xorjoep	1:24714b45cd1b	322	/* x[3] and x[4] are packed */
xorjoep	1:24714b45cd1b	323	in1 = (q15_t) x3;
xorjoep	1:24714b45cd1b	324	in2 = (q15_t) x0;
xorjoep	1:24714b45cd1b	325
xorjoep	1:24714b45cd1b	326	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	327
xorjoep	1:24714b45cd1b	328	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	329	acc3 = __SMLAD(input1, input2, acc3);
xorjoep	1:24714b45cd1b	330
xorjoep	1:24714b45cd1b	331	/* Read y[srcBLen - 3] sample */
xorjoep	1:24714b45cd1b	332	c0 = *(py--);
xorjoep	1:24714b45cd1b	333	/* Read y[srcBLen - 4] sample */
xorjoep	1:24714b45cd1b	334	c1 = *(py--);
xorjoep	1:24714b45cd1b	335
xorjoep	1:24714b45cd1b	336	/* Read x[5] sample */
xorjoep	1:24714b45cd1b	337	x1 = *(px++);
xorjoep	1:24714b45cd1b	338
xorjoep	1:24714b45cd1b	339	/* x[2] and x[3] are packed */
xorjoep	1:24714b45cd1b	340	in1 = (q15_t) x2;
xorjoep	1:24714b45cd1b	341	in2 = (q15_t) x3;
xorjoep	1:24714b45cd1b	342
xorjoep	1:24714b45cd1b	343	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	344
xorjoep	1:24714b45cd1b	345	/* y[srcBLen - 3] and y[srcBLen - 4] are packed */
xorjoep	1:24714b45cd1b	346	in1 = (q15_t) c0;
xorjoep	1:24714b45cd1b	347	in2 = (q15_t) c1;
xorjoep	1:24714b45cd1b	348
xorjoep	1:24714b45cd1b	349	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	350
xorjoep	1:24714b45cd1b	351	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	352	acc0 = __SMLAD(input1, input2, acc0);
xorjoep	1:24714b45cd1b	353
xorjoep	1:24714b45cd1b	354	/* x[3] and x[4] are packed */
xorjoep	1:24714b45cd1b	355	in1 = (q15_t) x3;
xorjoep	1:24714b45cd1b	356	in2 = (q15_t) x0;
xorjoep	1:24714b45cd1b	357
xorjoep	1:24714b45cd1b	358	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	359
xorjoep	1:24714b45cd1b	360	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	361	acc1 = __SMLAD(input1, input2, acc1);
xorjoep	1:24714b45cd1b	362
xorjoep	1:24714b45cd1b	363	/* x[4] and x[5] are packed */
xorjoep	1:24714b45cd1b	364	in1 = (q15_t) x0;
xorjoep	1:24714b45cd1b	365	in2 = (q15_t) x1;
xorjoep	1:24714b45cd1b	366
xorjoep	1:24714b45cd1b	367	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	368
xorjoep	1:24714b45cd1b	369	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	370	acc2 = __SMLAD(input1, input2, acc2);
xorjoep	1:24714b45cd1b	371
xorjoep	1:24714b45cd1b	372	/* Read x[6] sample */
xorjoep	1:24714b45cd1b	373	x2 = *(px++);
xorjoep	1:24714b45cd1b	374
xorjoep	1:24714b45cd1b	375	/* x[5] and x[6] are packed */
xorjoep	1:24714b45cd1b	376	in1 = (q15_t) x1;
xorjoep	1:24714b45cd1b	377	in2 = (q15_t) x2;
xorjoep	1:24714b45cd1b	378
xorjoep	1:24714b45cd1b	379	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	380
xorjoep	1:24714b45cd1b	381	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	382	acc3 = __SMLAD(input1, input2, acc3);
xorjoep	1:24714b45cd1b	383
xorjoep	1:24714b45cd1b	384	} while (--k);
xorjoep	1:24714b45cd1b	385
xorjoep	1:24714b45cd1b	386	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	387	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	388	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	389
xorjoep	1:24714b45cd1b	390	while (k > 0U)
xorjoep	1:24714b45cd1b	391	{
xorjoep	1:24714b45cd1b	392	/* Read y[srcBLen - 5] sample */
xorjoep	1:24714b45cd1b	393	c0 = *(py--);
xorjoep	1:24714b45cd1b	394
xorjoep	1:24714b45cd1b	395	/* Read x[7] sample */
xorjoep	1:24714b45cd1b	396	x3 = *(px++);
xorjoep	1:24714b45cd1b	397
xorjoep	1:24714b45cd1b	398	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	399	/* acc0 += x[4] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	400	acc0 += ((q15_t) x0 * c0);
xorjoep	1:24714b45cd1b	401	/* acc1 += x[5] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	402	acc1 += ((q15_t) x1 * c0);
xorjoep	1:24714b45cd1b	403	/* acc2 += x[6] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	404	acc2 += ((q15_t) x2 * c0);
xorjoep	1:24714b45cd1b	405	/* acc3 += x[7] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	406	acc3 += ((q15_t) x3 * c0);
xorjoep	1:24714b45cd1b	407
xorjoep	1:24714b45cd1b	408	/* Reuse the present samples for the next MAC */
xorjoep	1:24714b45cd1b	409	x0 = x1;
xorjoep	1:24714b45cd1b	410	x1 = x2;
xorjoep	1:24714b45cd1b	411	x2 = x3;
xorjoep	1:24714b45cd1b	412
xorjoep	1:24714b45cd1b	413	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	414	k--;
xorjoep	1:24714b45cd1b	415	}
xorjoep	1:24714b45cd1b	416
xorjoep	1:24714b45cd1b	417
xorjoep	1:24714b45cd1b	418	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	419	*pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
xorjoep	1:24714b45cd1b	420	*pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
xorjoep	1:24714b45cd1b	421	*pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
xorjoep	1:24714b45cd1b	422	*pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
xorjoep	1:24714b45cd1b	423
xorjoep	1:24714b45cd1b	424	/* Increment the pointer pIn1 index, count by 4 */
xorjoep	1:24714b45cd1b	425	count += 4U;
xorjoep	1:24714b45cd1b	426
xorjoep	1:24714b45cd1b	427	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	428	px = pIn1 + count;
xorjoep	1:24714b45cd1b	429	py = pSrc2;
xorjoep	1:24714b45cd1b	430
xorjoep	1:24714b45cd1b	431	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	432	blkCnt--;
xorjoep	1:24714b45cd1b	433	}
xorjoep	1:24714b45cd1b	434
xorjoep	1:24714b45cd1b	435	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	436	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	437	blkCnt = blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	438
xorjoep	1:24714b45cd1b	439	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	440	{
xorjoep	1:24714b45cd1b	441	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	442	sum = 0;
xorjoep	1:24714b45cd1b	443
xorjoep	1:24714b45cd1b	444	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	445	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	446
xorjoep	1:24714b45cd1b	447	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	448	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	449	while (k > 0U)
xorjoep	1:24714b45cd1b	450	{
xorjoep	1:24714b45cd1b	451
xorjoep	1:24714b45cd1b	452	/* Reading two inputs of SrcA buffer and packing */
xorjoep	1:24714b45cd1b	453	in1 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	454	in2 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	455	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	456
xorjoep	1:24714b45cd1b	457	/* Reading two inputs of SrcB buffer and packing */
xorjoep	1:24714b45cd1b	458	in1 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	459	in2 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	460	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	461
xorjoep	1:24714b45cd1b	462	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	463	sum = __SMLAD(input1, input2, sum);
xorjoep	1:24714b45cd1b	464
xorjoep	1:24714b45cd1b	465	/* Reading two inputs of SrcA buffer and packing */
xorjoep	1:24714b45cd1b	466	in1 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	467	in2 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	468	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	469
xorjoep	1:24714b45cd1b	470	/* Reading two inputs of SrcB buffer and packing */
xorjoep	1:24714b45cd1b	471	in1 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	472	in2 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	473	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	474
xorjoep	1:24714b45cd1b	475	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	476	sum = __SMLAD(input1, input2, sum);
xorjoep	1:24714b45cd1b	477
xorjoep	1:24714b45cd1b	478	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	479	k--;
xorjoep	1:24714b45cd1b	480	}
xorjoep	1:24714b45cd1b	481
xorjoep	1:24714b45cd1b	482	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	483	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	484	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	485
xorjoep	1:24714b45cd1b	486	while (k > 0U)
xorjoep	1:24714b45cd1b	487	{
xorjoep	1:24714b45cd1b	488	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	489	sum += ((q15_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	490
xorjoep	1:24714b45cd1b	491	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	492	k--;
xorjoep	1:24714b45cd1b	493	}
xorjoep	1:24714b45cd1b	494
xorjoep	1:24714b45cd1b	495	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	496	*pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
xorjoep	1:24714b45cd1b	497
xorjoep	1:24714b45cd1b	498	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	499	count++;
xorjoep	1:24714b45cd1b	500
xorjoep	1:24714b45cd1b	501	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	502	px = pIn1 + count;
xorjoep	1:24714b45cd1b	503	py = pSrc2;
xorjoep	1:24714b45cd1b	504
xorjoep	1:24714b45cd1b	505	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	506	blkCnt--;
xorjoep	1:24714b45cd1b	507	}
xorjoep	1:24714b45cd1b	508	}
xorjoep	1:24714b45cd1b	509	else
xorjoep	1:24714b45cd1b	510	{
xorjoep	1:24714b45cd1b	511	/* If srcBLen is less than 4,
xorjoep	1:24714b45cd1b	512	* the blockSize2 loop cannot be unrolled by 4 */
xorjoep	1:24714b45cd1b	513	blkCnt = blockSize2;
xorjoep	1:24714b45cd1b	514
xorjoep	1:24714b45cd1b	515	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	516	{
xorjoep	1:24714b45cd1b	517	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	518	sum = 0;
xorjoep	1:24714b45cd1b	519
xorjoep	1:24714b45cd1b	520	/* srcBLen number of MACS should be performed */
xorjoep	1:24714b45cd1b	521	k = srcBLen;
xorjoep	1:24714b45cd1b	522
xorjoep	1:24714b45cd1b	523	while (k > 0U)
xorjoep	1:24714b45cd1b	524	{
xorjoep	1:24714b45cd1b	525	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	526	sum += ((q15_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	527
xorjoep	1:24714b45cd1b	528	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	529	k--;
xorjoep	1:24714b45cd1b	530	}
xorjoep	1:24714b45cd1b	531
xorjoep	1:24714b45cd1b	532	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	533	*pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
xorjoep	1:24714b45cd1b	534
xorjoep	1:24714b45cd1b	535	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	536	count++;
xorjoep	1:24714b45cd1b	537
xorjoep	1:24714b45cd1b	538	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	539	px = pIn1 + count;
xorjoep	1:24714b45cd1b	540	py = pSrc2;
xorjoep	1:24714b45cd1b	541
xorjoep	1:24714b45cd1b	542	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	543	blkCnt--;
xorjoep	1:24714b45cd1b	544	}
xorjoep	1:24714b45cd1b	545	}
xorjoep	1:24714b45cd1b	546
xorjoep	1:24714b45cd1b	547
xorjoep	1:24714b45cd1b	548	/* --------------------------
xorjoep	1:24714b45cd1b	549	* Initializations of stage3
xorjoep	1:24714b45cd1b	550	* -------------------------*/
xorjoep	1:24714b45cd1b	551
xorjoep	1:24714b45cd1b	552	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	553	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep	1:24714b45cd1b	554	* ....
xorjoep	1:24714b45cd1b	555	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep	1:24714b45cd1b	556	* sum += x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	557	*/
xorjoep	1:24714b45cd1b	558
xorjoep	1:24714b45cd1b	559	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	560	The blockSize3 variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	561
xorjoep	1:24714b45cd1b	562	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	563	pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
xorjoep	1:24714b45cd1b	564	px = pSrc1;
xorjoep	1:24714b45cd1b	565
xorjoep	1:24714b45cd1b	566	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	567	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	568	py = pSrc2;
xorjoep	1:24714b45cd1b	569
xorjoep	1:24714b45cd1b	570	/* -------------------
xorjoep	1:24714b45cd1b	571	* Stage3 process
xorjoep	1:24714b45cd1b	572	* ------------------*/
xorjoep	1:24714b45cd1b	573
xorjoep	1:24714b45cd1b	574	while (blockSize3 > 0U)
xorjoep	1:24714b45cd1b	575	{
xorjoep	1:24714b45cd1b	576	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	577	sum = 0;
xorjoep	1:24714b45cd1b	578
xorjoep	1:24714b45cd1b	579	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	580	k = blockSize3 >> 2U;
xorjoep	1:24714b45cd1b	581
xorjoep	1:24714b45cd1b	582	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	583	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	584	while (k > 0U)
xorjoep	1:24714b45cd1b	585	{
xorjoep	1:24714b45cd1b	586	/* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
xorjoep	1:24714b45cd1b	587	in1 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	588	in2 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	589	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	590
xorjoep	1:24714b45cd1b	591	/* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
xorjoep	1:24714b45cd1b	592	in1 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	593	in2 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	594	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	595
xorjoep	1:24714b45cd1b	596	/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	597	/* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	598	sum = __SMLAD(input1, input2, sum);
xorjoep	1:24714b45cd1b	599
xorjoep	1:24714b45cd1b	600	/* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
xorjoep	1:24714b45cd1b	601	in1 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	602	in2 = (q15_t) * px++;
xorjoep	1:24714b45cd1b	603	input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	604
xorjoep	1:24714b45cd1b	605	/* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
xorjoep	1:24714b45cd1b	606	in1 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	607	in2 = (q15_t) * py--;
xorjoep	1:24714b45cd1b	608	input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
xorjoep	1:24714b45cd1b	609
xorjoep	1:24714b45cd1b	610	/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	611	/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	612	sum = __SMLAD(input1, input2, sum);
xorjoep	1:24714b45cd1b	613
xorjoep	1:24714b45cd1b	614	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	615	k--;
xorjoep	1:24714b45cd1b	616	}
xorjoep	1:24714b45cd1b	617
xorjoep	1:24714b45cd1b	618	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	619	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	620	k = blockSize3 % 0x4U;
xorjoep	1:24714b45cd1b	621
xorjoep	1:24714b45cd1b	622	while (k > 0U)
xorjoep	1:24714b45cd1b	623	{
xorjoep	1:24714b45cd1b	624	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	625	sum += ((q15_t) * px++ * *py--);
xorjoep	1:24714b45cd1b	626
xorjoep	1:24714b45cd1b	627	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	628	k--;
xorjoep	1:24714b45cd1b	629	}
xorjoep	1:24714b45cd1b	630
xorjoep	1:24714b45cd1b	631	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	632	*pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
xorjoep	1:24714b45cd1b	633
xorjoep	1:24714b45cd1b	634	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	635	px = ++pSrc1;
xorjoep	1:24714b45cd1b	636	py = pSrc2;
xorjoep	1:24714b45cd1b	637
xorjoep	1:24714b45cd1b	638	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	639	blockSize3--;
xorjoep	1:24714b45cd1b	640	}
xorjoep	1:24714b45cd1b	641
xorjoep	1:24714b45cd1b	642	#else
xorjoep	1:24714b45cd1b	643
xorjoep	1:24714b45cd1b	644	/* Run the below code for Cortex-M0 */
xorjoep	1:24714b45cd1b	645
xorjoep	1:24714b45cd1b	646	q7_t *pIn1 = pSrcA; /* input pointer */
xorjoep	1:24714b45cd1b	647	q7_t *pIn2 = pSrcB; /* coefficient pointer */
xorjoep	1:24714b45cd1b	648	q31_t sum; /* Accumulator */
xorjoep	1:24714b45cd1b	649	uint32_t i, j; /* loop counter */
xorjoep	1:24714b45cd1b	650
xorjoep	1:24714b45cd1b	651	/* Loop to calculate output of convolution for output length number of times */
xorjoep	1:24714b45cd1b	652	for (i = 0; i < (srcALen + srcBLen - 1); i++)
xorjoep	1:24714b45cd1b	653	{
xorjoep	1:24714b45cd1b	654	/* Initialize sum with zero to carry on MAC operations */
xorjoep	1:24714b45cd1b	655	sum = 0;
xorjoep	1:24714b45cd1b	656
xorjoep	1:24714b45cd1b	657	/* Loop to perform MAC operations according to convolution equation */
xorjoep	1:24714b45cd1b	658	for (j = 0; j <= i; j++)
xorjoep	1:24714b45cd1b	659	{
xorjoep	1:24714b45cd1b	660	/* Check the array limitations */
xorjoep	1:24714b45cd1b	661	if (((i - j) < srcBLen) && (j < srcALen))
xorjoep	1:24714b45cd1b	662	{
xorjoep	1:24714b45cd1b	663	/* z[i] += x[j] * y[i-j] */
xorjoep	1:24714b45cd1b	664	sum += (q15_t) pIn1[j] * (pIn2[i - j]);
xorjoep	1:24714b45cd1b	665	}
xorjoep	1:24714b45cd1b	666	}
xorjoep	1:24714b45cd1b	667
xorjoep	1:24714b45cd1b	668	/* Store the output in the destination buffer */
xorjoep	1:24714b45cd1b	669	pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
xorjoep	1:24714b45cd1b	670	}
xorjoep	1:24714b45cd1b	671
xorjoep	1:24714b45cd1b	672	#endif /* #if defined (ARM_MATH_DSP) */
xorjoep	1:24714b45cd1b	673
xorjoep	1:24714b45cd1b	674	}
xorjoep	1:24714b45cd1b	675
xorjoep	1:24714b45cd1b	676	/**
xorjoep	1:24714b45cd1b	677	* @} end of Conv group
xorjoep	1:24714b45cd1b	678	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/FilteringFunctions/arm_conv_q7.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning