CMSIS_DSP_401 - V4.0.1 of the ARM CMSIS DSP libraries. Note that…

Users » emh203 » Code » CMSIS_DSP_401

V4.0.1 of the ARM CMSIS DSP libraries. Note that arm_bitreversal2.s, arm_cfft_f32.c and arm_rfft_fast_f32.c had to be removed, because arm_bitreversal2.s will not assemble with the online tools. As a result, the fast f32 FFT functions are not yet available. All the other FFT functions are available.

Dependents: MPU9150_Example fir_f32 fir_f32 MPU9150_nucleo_noni2cdev ... more

FilteringFunctions/arm_conv_q7.c@0:3d9c67d97d6f, 2014-07-28 (annotated)

Committer:: emh203
Date:: Mon Jul 28 15:03:15 2014 +0000
Revision:: 0:3d9c67d97d6f

1st working commit. Had to remove arm_bitreversal2.s, arm_cfft_f32.c and arm_rfft_fast_f32.c. The .s file will not assemble. For now I removed these functions so we could at least have a library for the other functions.

Who changed what in which revision?

User	Revision	Line number	New contents of line
emh203	0:3d9c67d97d6f	1	/* ----------------------------------------------------------------------
emh203	0:3d9c67d97d6f	2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
emh203	0:3d9c67d97d6f	3	*
emh203	0:3d9c67d97d6f	4	* $Date: 12. March 2014
emh203	0:3d9c67d97d6f	5	* $Revision: V1.4.3
emh203	0:3d9c67d97d6f	6	*
emh203	0:3d9c67d97d6f	7	* Project: CMSIS DSP Library
emh203	0:3d9c67d97d6f	8	* Title: arm_conv_q7.c
emh203	0:3d9c67d97d6f	9	*
emh203	0:3d9c67d97d6f	10	* Description: Convolution of Q7 sequences.
emh203	0:3d9c67d97d6f	11	*
emh203	0:3d9c67d97d6f	12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
emh203	0:3d9c67d97d6f	13	*
emh203	0:3d9c67d97d6f	14	* Redistribution and use in source and binary forms, with or without
emh203	0:3d9c67d97d6f	15	* modification, are permitted provided that the following conditions
emh203	0:3d9c67d97d6f	16	* are met:
emh203	0:3d9c67d97d6f	17	* - Redistributions of source code must retain the above copyright
emh203	0:3d9c67d97d6f	18	* notice, this list of conditions and the following disclaimer.
emh203	0:3d9c67d97d6f	19	* - Redistributions in binary form must reproduce the above copyright
emh203	0:3d9c67d97d6f	20	* notice, this list of conditions and the following disclaimer in
emh203	0:3d9c67d97d6f	21	* the documentation and/or other materials provided with the
emh203	0:3d9c67d97d6f	22	* distribution.
emh203	0:3d9c67d97d6f	23	* - Neither the name of ARM LIMITED nor the names of its contributors
emh203	0:3d9c67d97d6f	24	* may be used to endorse or promote products derived from this
emh203	0:3d9c67d97d6f	25	* software without specific prior written permission.
emh203	0:3d9c67d97d6f	26	*
emh203	0:3d9c67d97d6f	27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
emh203	0:3d9c67d97d6f	28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
emh203	0:3d9c67d97d6f	29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
emh203	0:3d9c67d97d6f	30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
emh203	0:3d9c67d97d6f	31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
emh203	0:3d9c67d97d6f	32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
emh203	0:3d9c67d97d6f	33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
emh203	0:3d9c67d97d6f	34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
emh203	0:3d9c67d97d6f	35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
emh203	0:3d9c67d97d6f	36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
emh203	0:3d9c67d97d6f	37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
emh203	0:3d9c67d97d6f	38	* POSSIBILITY OF SUCH DAMAGE.
emh203	0:3d9c67d97d6f	39	* -------------------------------------------------------------------- */
emh203	0:3d9c67d97d6f	40
emh203	0:3d9c67d97d6f	41	#include "arm_math.h"
emh203	0:3d9c67d97d6f	42
emh203	0:3d9c67d97d6f	43	/**
emh203	0:3d9c67d97d6f	44	* @ingroup groupFilters
emh203	0:3d9c67d97d6f	45	*/
emh203	0:3d9c67d97d6f	46
emh203	0:3d9c67d97d6f	47	/**
emh203	0:3d9c67d97d6f	48	* @addtogroup Conv
emh203	0:3d9c67d97d6f	49	* @{
emh203	0:3d9c67d97d6f	50	*/
emh203	0:3d9c67d97d6f	51
emh203	0:3d9c67d97d6f	52	/**
emh203	0:3d9c67d97d6f	53	* @brief Convolution of Q7 sequences.
emh203	0:3d9c67d97d6f	54	* @param[in] *pSrcA points to the first input sequence.
emh203	0:3d9c67d97d6f	55	* @param[in] srcALen length of the first input sequence.
emh203	0:3d9c67d97d6f	56	* @param[in] *pSrcB points to the second input sequence.
emh203	0:3d9c67d97d6f	57	* @param[in] srcBLen length of the second input sequence.
emh203	0:3d9c67d97d6f	58	* @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
emh203	0:3d9c67d97d6f	59	* @return none.
emh203	0:3d9c67d97d6f	60	*
emh203	0:3d9c67d97d6f	61	* @details
emh203	0:3d9c67d97d6f	62	* <b>Scaling and Overflow Behavior:</b>
emh203	0:3d9c67d97d6f	63	*
emh203	0:3d9c67d97d6f	64	* \par
emh203	0:3d9c67d97d6f	65	* The function is implemented using a 32-bit internal accumulator.
emh203	0:3d9c67d97d6f	66	* Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
emh203	0:3d9c67d97d6f	67	* The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
emh203	0:3d9c67d97d6f	68	* This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
emh203	0:3d9c67d97d6f	69	* The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
emh203	0:3d9c67d97d6f	70	*
emh203	0:3d9c67d97d6f	71	* \par
emh203	0:3d9c67d97d6f	72	* Refer to the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
emh203	0:3d9c67d97d6f	73	*
emh203	0:3d9c67d97d6f	74	*/
emh203	0:3d9c67d97d6f	75
emh203	0:3d9c67d97d6f	76	void arm_conv_q7(
emh203	0:3d9c67d97d6f	77	q7_t * pSrcA,
emh203	0:3d9c67d97d6f	78	uint32_t srcALen,
emh203	0:3d9c67d97d6f	79	q7_t * pSrcB,
emh203	0:3d9c67d97d6f	80	uint32_t srcBLen,
emh203	0:3d9c67d97d6f	81	q7_t * pDst)
emh203	0:3d9c67d97d6f	82	{
emh203	0:3d9c67d97d6f	83
emh203	0:3d9c67d97d6f	84
emh203	0:3d9c67d97d6f	85	#ifndef ARM_MATH_CM0_FAMILY
emh203	0:3d9c67d97d6f	86
emh203	0:3d9c67d97d6f	87	/* Run the below code for Cortex-M4 and Cortex-M3 */
emh203	0:3d9c67d97d6f	88
emh203	0:3d9c67d97d6f	89	q7_t pIn1; / inputA pointer */
emh203	0:3d9c67d97d6f	90	q7_t pIn2; / inputB pointer */
emh203	0:3d9c67d97d6f	91	q7_t pOut = pDst; / output pointer */
emh203	0:3d9c67d97d6f	92	q7_t px; / Intermediate inputA pointer */
emh203	0:3d9c67d97d6f	93	q7_t py; / Intermediate inputB pointer */
emh203	0:3d9c67d97d6f	94	q7_t pSrc1, pSrc2; /* Intermediate pointers */
emh203	0:3d9c67d97d6f	95	q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
emh203	0:3d9c67d97d6f	96	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
emh203	0:3d9c67d97d6f	97	q31_t input1, input2; /* Temporary input variables */
emh203	0:3d9c67d97d6f	98	q15_t in1, in2; /* Temporary input variables */
emh203	0:3d9c67d97d6f	99	uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
emh203	0:3d9c67d97d6f	100
emh203	0:3d9c67d97d6f	101	/* The algorithm implementation is based on the lengths of the inputs. */
emh203	0:3d9c67d97d6f	102	/* srcB is always made to slide across srcA. */
emh203	0:3d9c67d97d6f	103	/* So srcBLen is always considered as shorter or equal to srcALen */
emh203	0:3d9c67d97d6f	104	if(srcALen >= srcBLen)
emh203	0:3d9c67d97d6f	105	{
emh203	0:3d9c67d97d6f	106	/* Initialization of inputA pointer */
emh203	0:3d9c67d97d6f	107	pIn1 = pSrcA;
emh203	0:3d9c67d97d6f	108
emh203	0:3d9c67d97d6f	109	/* Initialization of inputB pointer */
emh203	0:3d9c67d97d6f	110	pIn2 = pSrcB;
emh203	0:3d9c67d97d6f	111	}
emh203	0:3d9c67d97d6f	112	else
emh203	0:3d9c67d97d6f	113	{
emh203	0:3d9c67d97d6f	114	/* Initialization of inputA pointer */
emh203	0:3d9c67d97d6f	115	pIn1 = pSrcB;
emh203	0:3d9c67d97d6f	116
emh203	0:3d9c67d97d6f	117	/* Initialization of inputB pointer */
emh203	0:3d9c67d97d6f	118	pIn2 = pSrcA;
emh203	0:3d9c67d97d6f	119
emh203	0:3d9c67d97d6f	120	/* srcBLen is always considered as shorter or equal to srcALen */
emh203	0:3d9c67d97d6f	121	j = srcBLen;
emh203	0:3d9c67d97d6f	122	srcBLen = srcALen;
emh203	0:3d9c67d97d6f	123	srcALen = j;
emh203	0:3d9c67d97d6f	124	}
emh203	0:3d9c67d97d6f	125
emh203	0:3d9c67d97d6f	126	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
emh203	0:3d9c67d97d6f	127	/* The function is internally
emh203	0:3d9c67d97d6f	128	* divided into three stages according to the number of multiplications that has to be
emh203	0:3d9c67d97d6f	129	* taken place between inputA samples and inputB samples. In the first stage of the
emh203	0:3d9c67d97d6f	130	* algorithm, the multiplications increase by one for every iteration.
emh203	0:3d9c67d97d6f	131	* In the second stage of the algorithm, srcBLen number of multiplications are done.
emh203	0:3d9c67d97d6f	132	* In the third stage of the algorithm, the multiplications decrease by one
emh203	0:3d9c67d97d6f	133	* for every iteration. */
emh203	0:3d9c67d97d6f	134
emh203	0:3d9c67d97d6f	135	/* The algorithm is implemented in three stages.
emh203	0:3d9c67d97d6f	136	The loop counters of each stage is initiated here. */
emh203	0:3d9c67d97d6f	137	blockSize1 = srcBLen - 1u;
emh203	0:3d9c67d97d6f	138	blockSize2 = (srcALen - srcBLen) + 1u;
emh203	0:3d9c67d97d6f	139	blockSize3 = blockSize1;
emh203	0:3d9c67d97d6f	140
emh203	0:3d9c67d97d6f	141	/* --------------------------
emh203	0:3d9c67d97d6f	142	* Initializations of stage1
emh203	0:3d9c67d97d6f	143	* -------------------------*/
emh203	0:3d9c67d97d6f	144
emh203	0:3d9c67d97d6f	145	/* sum = x[0] * y[0]
emh203	0:3d9c67d97d6f	146	* sum = x[0] * y[1] + x[1] * y[0]
emh203	0:3d9c67d97d6f	147	* ....
emh203	0:3d9c67d97d6f	148	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
emh203	0:3d9c67d97d6f	149	*/
emh203	0:3d9c67d97d6f	150
emh203	0:3d9c67d97d6f	151	/* In this stage the MAC operations are increased by 1 for every iteration.
emh203	0:3d9c67d97d6f	152	The count variable holds the number of MAC operations performed */
emh203	0:3d9c67d97d6f	153	count = 1u;
emh203	0:3d9c67d97d6f	154
emh203	0:3d9c67d97d6f	155	/* Working pointer of inputA */
emh203	0:3d9c67d97d6f	156	px = pIn1;
emh203	0:3d9c67d97d6f	157
emh203	0:3d9c67d97d6f	158	/* Working pointer of inputB */
emh203	0:3d9c67d97d6f	159	py = pIn2;
emh203	0:3d9c67d97d6f	160
emh203	0:3d9c67d97d6f	161
emh203	0:3d9c67d97d6f	162	/* ------------------------
emh203	0:3d9c67d97d6f	163	* Stage1 process
emh203	0:3d9c67d97d6f	164	* ----------------------*/
emh203	0:3d9c67d97d6f	165
emh203	0:3d9c67d97d6f	166	/* The first stage starts here */
emh203	0:3d9c67d97d6f	167	while(blockSize1 > 0u)
emh203	0:3d9c67d97d6f	168	{
emh203	0:3d9c67d97d6f	169	/* Accumulator is made zero for every iteration */
emh203	0:3d9c67d97d6f	170	sum = 0;
emh203	0:3d9c67d97d6f	171
emh203	0:3d9c67d97d6f	172	/* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203	0:3d9c67d97d6f	173	k = count >> 2u;
emh203	0:3d9c67d97d6f	174
emh203	0:3d9c67d97d6f	175	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203	0:3d9c67d97d6f	176	** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203	0:3d9c67d97d6f	177	while(k > 0u)
emh203	0:3d9c67d97d6f	178	{
emh203	0:3d9c67d97d6f	179	/* x[0] , x[1] */
emh203	0:3d9c67d97d6f	180	in1 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	181	in2 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	182	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	183
emh203	0:3d9c67d97d6f	184	/* y[srcBLen - 1] , y[srcBLen - 2] */
emh203	0:3d9c67d97d6f	185	in1 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	186	in2 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	187	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	188
emh203	0:3d9c67d97d6f	189	/* x[0] * y[srcBLen - 1] */
emh203	0:3d9c67d97d6f	190	/* x[1] * y[srcBLen - 2] */
emh203	0:3d9c67d97d6f	191	sum = __SMLAD(input1, input2, sum);
emh203	0:3d9c67d97d6f	192
emh203	0:3d9c67d97d6f	193	/* x[2] , x[3] */
emh203	0:3d9c67d97d6f	194	in1 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	195	in2 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	196	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	197
emh203	0:3d9c67d97d6f	198	/* y[srcBLen - 3] , y[srcBLen - 4] */
emh203	0:3d9c67d97d6f	199	in1 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	200	in2 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	201	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	202
emh203	0:3d9c67d97d6f	203	/* x[2] * y[srcBLen - 3] */
emh203	0:3d9c67d97d6f	204	/* x[3] * y[srcBLen - 4] */
emh203	0:3d9c67d97d6f	205	sum = __SMLAD(input1, input2, sum);
emh203	0:3d9c67d97d6f	206
emh203	0:3d9c67d97d6f	207	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	208	k--;
emh203	0:3d9c67d97d6f	209	}
emh203	0:3d9c67d97d6f	210
emh203	0:3d9c67d97d6f	211	/* If the count is not a multiple of 4, compute any remaining MACs here.
emh203	0:3d9c67d97d6f	212	** No loop unrolling is used. */
emh203	0:3d9c67d97d6f	213	k = count % 0x4u;
emh203	0:3d9c67d97d6f	214
emh203	0:3d9c67d97d6f	215	while(k > 0u)
emh203	0:3d9c67d97d6f	216	{
emh203	0:3d9c67d97d6f	217	/* Perform the multiply-accumulates */
emh203	0:3d9c67d97d6f	218	sum += ((q15_t) * px++ * *py--);
emh203	0:3d9c67d97d6f	219
emh203	0:3d9c67d97d6f	220	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	221	k--;
emh203	0:3d9c67d97d6f	222	}
emh203	0:3d9c67d97d6f	223
emh203	0:3d9c67d97d6f	224	/* Store the result in the accumulator in the destination buffer. */
emh203	0:3d9c67d97d6f	225	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
emh203	0:3d9c67d97d6f	226
emh203	0:3d9c67d97d6f	227	/* Update the inputA and inputB pointers for next MAC calculation */
emh203	0:3d9c67d97d6f	228	py = pIn2 + count;
emh203	0:3d9c67d97d6f	229	px = pIn1;
emh203	0:3d9c67d97d6f	230
emh203	0:3d9c67d97d6f	231	/* Increment the MAC count */
emh203	0:3d9c67d97d6f	232	count++;
emh203	0:3d9c67d97d6f	233
emh203	0:3d9c67d97d6f	234	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	235	blockSize1--;
emh203	0:3d9c67d97d6f	236	}
emh203	0:3d9c67d97d6f	237
emh203	0:3d9c67d97d6f	238	/* --------------------------
emh203	0:3d9c67d97d6f	239	* Initializations of stage2
emh203	0:3d9c67d97d6f	240	* ------------------------*/
emh203	0:3d9c67d97d6f	241
emh203	0:3d9c67d97d6f	242	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
emh203	0:3d9c67d97d6f	243	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
emh203	0:3d9c67d97d6f	244	* ....
emh203	0:3d9c67d97d6f	245	* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
emh203	0:3d9c67d97d6f	246	*/
emh203	0:3d9c67d97d6f	247
emh203	0:3d9c67d97d6f	248	/* Working pointer of inputA */
emh203	0:3d9c67d97d6f	249	px = pIn1;
emh203	0:3d9c67d97d6f	250
emh203	0:3d9c67d97d6f	251	/* Working pointer of inputB */
emh203	0:3d9c67d97d6f	252	pSrc2 = pIn2 + (srcBLen - 1u);
emh203	0:3d9c67d97d6f	253	py = pSrc2;
emh203	0:3d9c67d97d6f	254
emh203	0:3d9c67d97d6f	255	/* count is index by which the pointer pIn1 to be incremented */
emh203	0:3d9c67d97d6f	256	count = 0u;
emh203	0:3d9c67d97d6f	257
emh203	0:3d9c67d97d6f	258	/* -------------------
emh203	0:3d9c67d97d6f	259	* Stage2 process
emh203	0:3d9c67d97d6f	260	* ------------------*/
emh203	0:3d9c67d97d6f	261
emh203	0:3d9c67d97d6f	262	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
emh203	0:3d9c67d97d6f	263	* So, to loop unroll over blockSize2,
emh203	0:3d9c67d97d6f	264	* srcBLen should be greater than or equal to 4 */
emh203	0:3d9c67d97d6f	265	if(srcBLen >= 4u)
emh203	0:3d9c67d97d6f	266	{
emh203	0:3d9c67d97d6f	267	/* Loop unroll over blockSize2, by 4 */
emh203	0:3d9c67d97d6f	268	blkCnt = blockSize2 >> 2u;
emh203	0:3d9c67d97d6f	269
emh203	0:3d9c67d97d6f	270	while(blkCnt > 0u)
emh203	0:3d9c67d97d6f	271	{
emh203	0:3d9c67d97d6f	272	/* Set all accumulators to zero */
emh203	0:3d9c67d97d6f	273	acc0 = 0;
emh203	0:3d9c67d97d6f	274	acc1 = 0;
emh203	0:3d9c67d97d6f	275	acc2 = 0;
emh203	0:3d9c67d97d6f	276	acc3 = 0;
emh203	0:3d9c67d97d6f	277
emh203	0:3d9c67d97d6f	278	/* read x[0], x[1], x[2] samples */
emh203	0:3d9c67d97d6f	279	x0 = *(px++);
emh203	0:3d9c67d97d6f	280	x1 = *(px++);
emh203	0:3d9c67d97d6f	281	x2 = *(px++);
emh203	0:3d9c67d97d6f	282
emh203	0:3d9c67d97d6f	283	/* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203	0:3d9c67d97d6f	284	k = srcBLen >> 2u;
emh203	0:3d9c67d97d6f	285
emh203	0:3d9c67d97d6f	286	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203	0:3d9c67d97d6f	287	** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203	0:3d9c67d97d6f	288	do
emh203	0:3d9c67d97d6f	289	{
emh203	0:3d9c67d97d6f	290	/* Read y[srcBLen - 1] sample */
emh203	0:3d9c67d97d6f	291	c0 = *(py--);
emh203	0:3d9c67d97d6f	292	/* Read y[srcBLen - 2] sample */
emh203	0:3d9c67d97d6f	293	c1 = *(py--);
emh203	0:3d9c67d97d6f	294
emh203	0:3d9c67d97d6f	295	/* Read x[3] sample */
emh203	0:3d9c67d97d6f	296	x3 = *(px++);
emh203	0:3d9c67d97d6f	297
emh203	0:3d9c67d97d6f	298	/* x[0] and x[1] are packed */
emh203	0:3d9c67d97d6f	299	in1 = (q15_t) x0;
emh203	0:3d9c67d97d6f	300	in2 = (q15_t) x1;
emh203	0:3d9c67d97d6f	301
emh203	0:3d9c67d97d6f	302	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	303
emh203	0:3d9c67d97d6f	304	/* y[srcBLen - 1] and y[srcBLen - 2] are packed */
emh203	0:3d9c67d97d6f	305	in1 = (q15_t) c0;
emh203	0:3d9c67d97d6f	306	in2 = (q15_t) c1;
emh203	0:3d9c67d97d6f	307
emh203	0:3d9c67d97d6f	308	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	309
emh203	0:3d9c67d97d6f	310	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
emh203	0:3d9c67d97d6f	311	acc0 = __SMLAD(input1, input2, acc0);
emh203	0:3d9c67d97d6f	312
emh203	0:3d9c67d97d6f	313	/* x[1] and x[2] are packed */
emh203	0:3d9c67d97d6f	314	in1 = (q15_t) x1;
emh203	0:3d9c67d97d6f	315	in2 = (q15_t) x2;
emh203	0:3d9c67d97d6f	316
emh203	0:3d9c67d97d6f	317	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	318
emh203	0:3d9c67d97d6f	319	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
emh203	0:3d9c67d97d6f	320	acc1 = __SMLAD(input1, input2, acc1);
emh203	0:3d9c67d97d6f	321
emh203	0:3d9c67d97d6f	322	/* x[2] and x[3] are packed */
emh203	0:3d9c67d97d6f	323	in1 = (q15_t) x2;
emh203	0:3d9c67d97d6f	324	in2 = (q15_t) x3;
emh203	0:3d9c67d97d6f	325
emh203	0:3d9c67d97d6f	326	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	327
emh203	0:3d9c67d97d6f	328	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
emh203	0:3d9c67d97d6f	329	acc2 = __SMLAD(input1, input2, acc2);
emh203	0:3d9c67d97d6f	330
emh203	0:3d9c67d97d6f	331	/* Read x[4] sample */
emh203	0:3d9c67d97d6f	332	x0 = *(px++);
emh203	0:3d9c67d97d6f	333
emh203	0:3d9c67d97d6f	334	/* x[3] and x[4] are packed */
emh203	0:3d9c67d97d6f	335	in1 = (q15_t) x3;
emh203	0:3d9c67d97d6f	336	in2 = (q15_t) x0;
emh203	0:3d9c67d97d6f	337
emh203	0:3d9c67d97d6f	338	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	339
emh203	0:3d9c67d97d6f	340	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
emh203	0:3d9c67d97d6f	341	acc3 = __SMLAD(input1, input2, acc3);
emh203	0:3d9c67d97d6f	342
emh203	0:3d9c67d97d6f	343	/* Read y[srcBLen - 3] sample */
emh203	0:3d9c67d97d6f	344	c0 = *(py--);
emh203	0:3d9c67d97d6f	345	/* Read y[srcBLen - 4] sample */
emh203	0:3d9c67d97d6f	346	c1 = *(py--);
emh203	0:3d9c67d97d6f	347
emh203	0:3d9c67d97d6f	348	/* Read x[5] sample */
emh203	0:3d9c67d97d6f	349	x1 = *(px++);
emh203	0:3d9c67d97d6f	350
emh203	0:3d9c67d97d6f	351	/* x[2] and x[3] are packed */
emh203	0:3d9c67d97d6f	352	in1 = (q15_t) x2;
emh203	0:3d9c67d97d6f	353	in2 = (q15_t) x3;
emh203	0:3d9c67d97d6f	354
emh203	0:3d9c67d97d6f	355	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	356
emh203	0:3d9c67d97d6f	357	/* y[srcBLen - 3] and y[srcBLen - 4] are packed */
emh203	0:3d9c67d97d6f	358	in1 = (q15_t) c0;
emh203	0:3d9c67d97d6f	359	in2 = (q15_t) c1;
emh203	0:3d9c67d97d6f	360
emh203	0:3d9c67d97d6f	361	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	362
emh203	0:3d9c67d97d6f	363	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
emh203	0:3d9c67d97d6f	364	acc0 = __SMLAD(input1, input2, acc0);
emh203	0:3d9c67d97d6f	365
emh203	0:3d9c67d97d6f	366	/* x[3] and x[4] are packed */
emh203	0:3d9c67d97d6f	367	in1 = (q15_t) x3;
emh203	0:3d9c67d97d6f	368	in2 = (q15_t) x0;
emh203	0:3d9c67d97d6f	369
emh203	0:3d9c67d97d6f	370	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	371
emh203	0:3d9c67d97d6f	372	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
emh203	0:3d9c67d97d6f	373	acc1 = __SMLAD(input1, input2, acc1);
emh203	0:3d9c67d97d6f	374
emh203	0:3d9c67d97d6f	375	/* x[4] and x[5] are packed */
emh203	0:3d9c67d97d6f	376	in1 = (q15_t) x0;
emh203	0:3d9c67d97d6f	377	in2 = (q15_t) x1;
emh203	0:3d9c67d97d6f	378
emh203	0:3d9c67d97d6f	379	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	380
emh203	0:3d9c67d97d6f	381	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
emh203	0:3d9c67d97d6f	382	acc2 = __SMLAD(input1, input2, acc2);
emh203	0:3d9c67d97d6f	383
emh203	0:3d9c67d97d6f	384	/* Read x[6] sample */
emh203	0:3d9c67d97d6f	385	x2 = *(px++);
emh203	0:3d9c67d97d6f	386
emh203	0:3d9c67d97d6f	387	/* x[5] and x[6] are packed */
emh203	0:3d9c67d97d6f	388	in1 = (q15_t) x1;
emh203	0:3d9c67d97d6f	389	in2 = (q15_t) x2;
emh203	0:3d9c67d97d6f	390
emh203	0:3d9c67d97d6f	391	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	392
emh203	0:3d9c67d97d6f	393	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
emh203	0:3d9c67d97d6f	394	acc3 = __SMLAD(input1, input2, acc3);
emh203	0:3d9c67d97d6f	395
emh203	0:3d9c67d97d6f	396	} while(--k);
emh203	0:3d9c67d97d6f	397
emh203	0:3d9c67d97d6f	398	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
emh203	0:3d9c67d97d6f	399	** No loop unrolling is used. */
emh203	0:3d9c67d97d6f	400	k = srcBLen % 0x4u;
emh203	0:3d9c67d97d6f	401
emh203	0:3d9c67d97d6f	402	while(k > 0u)
emh203	0:3d9c67d97d6f	403	{
emh203	0:3d9c67d97d6f	404	/* Read y[srcBLen - 5] sample */
emh203	0:3d9c67d97d6f	405	c0 = *(py--);
emh203	0:3d9c67d97d6f	406
emh203	0:3d9c67d97d6f	407	/* Read x[7] sample */
emh203	0:3d9c67d97d6f	408	x3 = *(px++);
emh203	0:3d9c67d97d6f	409
emh203	0:3d9c67d97d6f	410	/* Perform the multiply-accumulates */
emh203	0:3d9c67d97d6f	411	/* acc0 += x[4] * y[srcBLen - 5] */
emh203	0:3d9c67d97d6f	412	acc0 += ((q15_t) x0 * c0);
emh203	0:3d9c67d97d6f	413	/* acc1 += x[5] * y[srcBLen - 5] */
emh203	0:3d9c67d97d6f	414	acc1 += ((q15_t) x1 * c0);
emh203	0:3d9c67d97d6f	415	/* acc2 += x[6] * y[srcBLen - 5] */
emh203	0:3d9c67d97d6f	416	acc2 += ((q15_t) x2 * c0);
emh203	0:3d9c67d97d6f	417	/* acc3 += x[7] * y[srcBLen - 5] */
emh203	0:3d9c67d97d6f	418	acc3 += ((q15_t) x3 * c0);
emh203	0:3d9c67d97d6f	419
emh203	0:3d9c67d97d6f	420	/* Reuse the present samples for the next MAC */
emh203	0:3d9c67d97d6f	421	x0 = x1;
emh203	0:3d9c67d97d6f	422	x1 = x2;
emh203	0:3d9c67d97d6f	423	x2 = x3;
emh203	0:3d9c67d97d6f	424
emh203	0:3d9c67d97d6f	425	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	426	k--;
emh203	0:3d9c67d97d6f	427	}
emh203	0:3d9c67d97d6f	428
emh203	0:3d9c67d97d6f	429
emh203	0:3d9c67d97d6f	430	/* Store the result in the accumulator in the destination buffer. */
emh203	0:3d9c67d97d6f	431	*pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
emh203	0:3d9c67d97d6f	432	*pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
emh203	0:3d9c67d97d6f	433	*pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
emh203	0:3d9c67d97d6f	434	*pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
emh203	0:3d9c67d97d6f	435
emh203	0:3d9c67d97d6f	436	/* Increment the pointer pIn1 index, count by 4 */
emh203	0:3d9c67d97d6f	437	count += 4u;
emh203	0:3d9c67d97d6f	438
emh203	0:3d9c67d97d6f	439	/* Update the inputA and inputB pointers for next MAC calculation */
emh203	0:3d9c67d97d6f	440	px = pIn1 + count;
emh203	0:3d9c67d97d6f	441	py = pSrc2;
emh203	0:3d9c67d97d6f	442
emh203	0:3d9c67d97d6f	443	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	444	blkCnt--;
emh203	0:3d9c67d97d6f	445	}
emh203	0:3d9c67d97d6f	446
emh203	0:3d9c67d97d6f	447	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
emh203	0:3d9c67d97d6f	448	** No loop unrolling is used. */
emh203	0:3d9c67d97d6f	449	blkCnt = blockSize2 % 0x4u;
emh203	0:3d9c67d97d6f	450
emh203	0:3d9c67d97d6f	451	while(blkCnt > 0u)
emh203	0:3d9c67d97d6f	452	{
emh203	0:3d9c67d97d6f	453	/* Accumulator is made zero for every iteration */
emh203	0:3d9c67d97d6f	454	sum = 0;
emh203	0:3d9c67d97d6f	455
emh203	0:3d9c67d97d6f	456	/* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203	0:3d9c67d97d6f	457	k = srcBLen >> 2u;
emh203	0:3d9c67d97d6f	458
emh203	0:3d9c67d97d6f	459	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203	0:3d9c67d97d6f	460	** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203	0:3d9c67d97d6f	461	while(k > 0u)
emh203	0:3d9c67d97d6f	462	{
emh203	0:3d9c67d97d6f	463
emh203	0:3d9c67d97d6f	464	/* Reading two inputs of SrcA buffer and packing */
emh203	0:3d9c67d97d6f	465	in1 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	466	in2 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	467	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	468
emh203	0:3d9c67d97d6f	469	/* Reading two inputs of SrcB buffer and packing */
emh203	0:3d9c67d97d6f	470	in1 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	471	in2 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	472	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	473
emh203	0:3d9c67d97d6f	474	/* Perform the multiply-accumulates */
emh203	0:3d9c67d97d6f	475	sum = __SMLAD(input1, input2, sum);
emh203	0:3d9c67d97d6f	476
emh203	0:3d9c67d97d6f	477	/* Reading two inputs of SrcA buffer and packing */
emh203	0:3d9c67d97d6f	478	in1 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	479	in2 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	480	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	481
emh203	0:3d9c67d97d6f	482	/* Reading two inputs of SrcB buffer and packing */
emh203	0:3d9c67d97d6f	483	in1 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	484	in2 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	485	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	486
emh203	0:3d9c67d97d6f	487	/* Perform the multiply-accumulates */
emh203	0:3d9c67d97d6f	488	sum = __SMLAD(input1, input2, sum);
emh203	0:3d9c67d97d6f	489
emh203	0:3d9c67d97d6f	490	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	491	k--;
emh203	0:3d9c67d97d6f	492	}
emh203	0:3d9c67d97d6f	493
emh203	0:3d9c67d97d6f	494	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
emh203	0:3d9c67d97d6f	495	** No loop unrolling is used. */
emh203	0:3d9c67d97d6f	496	k = srcBLen % 0x4u;
emh203	0:3d9c67d97d6f	497
emh203	0:3d9c67d97d6f	498	while(k > 0u)
emh203	0:3d9c67d97d6f	499	{
emh203	0:3d9c67d97d6f	500	/* Perform the multiply-accumulates */
emh203	0:3d9c67d97d6f	501	sum += ((q15_t) * px++ * *py--);
emh203	0:3d9c67d97d6f	502
emh203	0:3d9c67d97d6f	503	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	504	k--;
emh203	0:3d9c67d97d6f	505	}
emh203	0:3d9c67d97d6f	506
emh203	0:3d9c67d97d6f	507	/* Store the result in the accumulator in the destination buffer. */
emh203	0:3d9c67d97d6f	508	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
emh203	0:3d9c67d97d6f	509
emh203	0:3d9c67d97d6f	510	/* Increment the pointer pIn1 index, count by 1 */
emh203	0:3d9c67d97d6f	511	count++;
emh203	0:3d9c67d97d6f	512
emh203	0:3d9c67d97d6f	513	/* Update the inputA and inputB pointers for next MAC calculation */
emh203	0:3d9c67d97d6f	514	px = pIn1 + count;
emh203	0:3d9c67d97d6f	515	py = pSrc2;
emh203	0:3d9c67d97d6f	516
emh203	0:3d9c67d97d6f	517	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	518	blkCnt--;
emh203	0:3d9c67d97d6f	519	}
emh203	0:3d9c67d97d6f	520	}
emh203	0:3d9c67d97d6f	521	else
emh203	0:3d9c67d97d6f	522	{
emh203	0:3d9c67d97d6f	523	/* If the srcBLen is not a multiple of 4,
emh203	0:3d9c67d97d6f	524	* the blockSize2 loop cannot be unrolled by 4 */
emh203	0:3d9c67d97d6f	525	blkCnt = blockSize2;
emh203	0:3d9c67d97d6f	526
emh203	0:3d9c67d97d6f	527	while(blkCnt > 0u)
emh203	0:3d9c67d97d6f	528	{
emh203	0:3d9c67d97d6f	529	/* Accumulator is made zero for every iteration */
emh203	0:3d9c67d97d6f	530	sum = 0;
emh203	0:3d9c67d97d6f	531
emh203	0:3d9c67d97d6f	532	/* srcBLen number of MACS should be performed */
emh203	0:3d9c67d97d6f	533	k = srcBLen;
emh203	0:3d9c67d97d6f	534
emh203	0:3d9c67d97d6f	535	while(k > 0u)
emh203	0:3d9c67d97d6f	536	{
emh203	0:3d9c67d97d6f	537	/* Perform the multiply-accumulate */
emh203	0:3d9c67d97d6f	538	sum += ((q15_t) * px++ * *py--);
emh203	0:3d9c67d97d6f	539
emh203	0:3d9c67d97d6f	540	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	541	k--;
emh203	0:3d9c67d97d6f	542	}
emh203	0:3d9c67d97d6f	543
emh203	0:3d9c67d97d6f	544	/* Store the result in the accumulator in the destination buffer. */
emh203	0:3d9c67d97d6f	545	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
emh203	0:3d9c67d97d6f	546
emh203	0:3d9c67d97d6f	547	/* Increment the MAC count */
emh203	0:3d9c67d97d6f	548	count++;
emh203	0:3d9c67d97d6f	549
emh203	0:3d9c67d97d6f	550	/* Update the inputA and inputB pointers for next MAC calculation */
emh203	0:3d9c67d97d6f	551	px = pIn1 + count;
emh203	0:3d9c67d97d6f	552	py = pSrc2;
emh203	0:3d9c67d97d6f	553
emh203	0:3d9c67d97d6f	554	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	555	blkCnt--;
emh203	0:3d9c67d97d6f	556	}
emh203	0:3d9c67d97d6f	557	}
emh203	0:3d9c67d97d6f	558
emh203	0:3d9c67d97d6f	559
emh203	0:3d9c67d97d6f	560	/* --------------------------
emh203	0:3d9c67d97d6f	561	* Initializations of stage3
emh203	0:3d9c67d97d6f	562	* -------------------------*/
emh203	0:3d9c67d97d6f	563
emh203	0:3d9c67d97d6f	564	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
emh203	0:3d9c67d97d6f	565	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
emh203	0:3d9c67d97d6f	566	* ....
emh203	0:3d9c67d97d6f	567	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
emh203	0:3d9c67d97d6f	568	* sum += x[srcALen-1] * y[srcBLen-1]
emh203	0:3d9c67d97d6f	569	*/
emh203	0:3d9c67d97d6f	570
emh203	0:3d9c67d97d6f	571	/* In this stage the MAC operations are decreased by 1 for every iteration.
emh203	0:3d9c67d97d6f	572	The blockSize3 variable holds the number of MAC operations performed */
emh203	0:3d9c67d97d6f	573
emh203	0:3d9c67d97d6f	574	/* Working pointer of inputA */
emh203	0:3d9c67d97d6f	575	pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
emh203	0:3d9c67d97d6f	576	px = pSrc1;
emh203	0:3d9c67d97d6f	577
emh203	0:3d9c67d97d6f	578	/* Working pointer of inputB */
emh203	0:3d9c67d97d6f	579	pSrc2 = pIn2 + (srcBLen - 1u);
emh203	0:3d9c67d97d6f	580	py = pSrc2;
emh203	0:3d9c67d97d6f	581
emh203	0:3d9c67d97d6f	582	/* -------------------
emh203	0:3d9c67d97d6f	583	* Stage3 process
emh203	0:3d9c67d97d6f	584	* ------------------*/
emh203	0:3d9c67d97d6f	585
emh203	0:3d9c67d97d6f	586	while(blockSize3 > 0u)
emh203	0:3d9c67d97d6f	587	{
emh203	0:3d9c67d97d6f	588	/* Accumulator is made zero for every iteration */
emh203	0:3d9c67d97d6f	589	sum = 0;
emh203	0:3d9c67d97d6f	590
emh203	0:3d9c67d97d6f	591	/* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203	0:3d9c67d97d6f	592	k = blockSize3 >> 2u;
emh203	0:3d9c67d97d6f	593
emh203	0:3d9c67d97d6f	594	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203	0:3d9c67d97d6f	595	** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203	0:3d9c67d97d6f	596	while(k > 0u)
emh203	0:3d9c67d97d6f	597	{
emh203	0:3d9c67d97d6f	598	/* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
emh203	0:3d9c67d97d6f	599	in1 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	600	in2 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	601	input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	602
emh203	0:3d9c67d97d6f	603	/* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
emh203	0:3d9c67d97d6f	604	in1 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	605	in2 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	606	input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	607
emh203	0:3d9c67d97d6f	608	/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
emh203	0:3d9c67d97d6f	609	/* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
emh203	0:3d9c67d97d6f	610	sum = __SMLAD(input1, input2, sum);
emh203	0:3d9c67d97d6f	611
emh203	0:3d9c67d97d6f	612	/* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
emh203	0:3d9c67d97d6f	613	in1 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	614	in2 = (q15_t) * px++;
emh203	0:3d9c67d97d6f	615	input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	616
emh203	0:3d9c67d97d6f	617	/* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
emh203	0:3d9c67d97d6f	618	in1 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	619	in2 = (q15_t) * py--;
emh203	0:3d9c67d97d6f	620	input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
emh203	0:3d9c67d97d6f	621
emh203	0:3d9c67d97d6f	622	/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
emh203	0:3d9c67d97d6f	623	/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
emh203	0:3d9c67d97d6f	624	sum = __SMLAD(input1, input2, sum);
emh203	0:3d9c67d97d6f	625
emh203	0:3d9c67d97d6f	626	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	627	k--;
emh203	0:3d9c67d97d6f	628	}
emh203	0:3d9c67d97d6f	629
emh203	0:3d9c67d97d6f	630	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
emh203	0:3d9c67d97d6f	631	** No loop unrolling is used. */
emh203	0:3d9c67d97d6f	632	k = blockSize3 % 0x4u;
emh203	0:3d9c67d97d6f	633
emh203	0:3d9c67d97d6f	634	while(k > 0u)
emh203	0:3d9c67d97d6f	635	{
emh203	0:3d9c67d97d6f	636	/* Perform the multiply-accumulates */
emh203	0:3d9c67d97d6f	637	sum += ((q15_t) * px++ * *py--);
emh203	0:3d9c67d97d6f	638
emh203	0:3d9c67d97d6f	639	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	640	k--;
emh203	0:3d9c67d97d6f	641	}
emh203	0:3d9c67d97d6f	642
emh203	0:3d9c67d97d6f	643	/* Store the result in the accumulator in the destination buffer. */
emh203	0:3d9c67d97d6f	644	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
emh203	0:3d9c67d97d6f	645
emh203	0:3d9c67d97d6f	646	/* Update the inputA and inputB pointers for next MAC calculation */
emh203	0:3d9c67d97d6f	647	px = ++pSrc1;
emh203	0:3d9c67d97d6f	648	py = pSrc2;
emh203	0:3d9c67d97d6f	649
emh203	0:3d9c67d97d6f	650	/* Decrement the loop counter */
emh203	0:3d9c67d97d6f	651	blockSize3--;
emh203	0:3d9c67d97d6f	652	}
emh203	0:3d9c67d97d6f	653
emh203	0:3d9c67d97d6f	654	#else
emh203	0:3d9c67d97d6f	655
emh203	0:3d9c67d97d6f	656	/* Run the below code for Cortex-M0 */
emh203	0:3d9c67d97d6f	657
emh203	0:3d9c67d97d6f	658	q7_t *pIn1 = pSrcA; /* input pointer */
emh203	0:3d9c67d97d6f	659	q7_t *pIn2 = pSrcB; /* coefficient pointer */
emh203	0:3d9c67d97d6f	660	q31_t sum; /* Accumulator */
emh203	0:3d9c67d97d6f	661	uint32_t i, j; /* loop counter */
emh203	0:3d9c67d97d6f	662
emh203	0:3d9c67d97d6f	663	/* Loop to calculate output of convolution for output length number of times */
emh203	0:3d9c67d97d6f	664	for (i = 0; i < (srcALen + srcBLen - 1); i++)
emh203	0:3d9c67d97d6f	665	{
emh203	0:3d9c67d97d6f	666	/* Initialize sum with zero to carry on MAC operations */
emh203	0:3d9c67d97d6f	667	sum = 0;
emh203	0:3d9c67d97d6f	668
emh203	0:3d9c67d97d6f	669	/* Loop to perform MAC operations according to convolution equation */
emh203	0:3d9c67d97d6f	670	for (j = 0; j <= i; j++)
emh203	0:3d9c67d97d6f	671	{
emh203	0:3d9c67d97d6f	672	/* Check the array limitations */
emh203	0:3d9c67d97d6f	673	if(((i - j) < srcBLen) && (j < srcALen))
emh203	0:3d9c67d97d6f	674	{
emh203	0:3d9c67d97d6f	675	/* z[i] += x[j] * y[i-j] */
emh203	0:3d9c67d97d6f	676	sum += (q15_t) pIn1[j] * (pIn2[i - j]);
emh203	0:3d9c67d97d6f	677	}
emh203	0:3d9c67d97d6f	678	}
emh203	0:3d9c67d97d6f	679
emh203	0:3d9c67d97d6f	680	/* Store the output in the destination buffer */
emh203	0:3d9c67d97d6f	681	pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
emh203	0:3d9c67d97d6f	682	}
emh203	0:3d9c67d97d6f	683
emh203	0:3d9c67d97d6f	684	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
emh203	0:3d9c67d97d6f	685
emh203	0:3d9c67d97d6f	686	}
emh203	0:3d9c67d97d6f	687
emh203	0:3d9c67d97d6f	688	/**
emh203	0:3d9c67d97d6f	689	* @} end of Conv group
emh203	0:3d9c67d97d6f	690	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	28 Jul 2014
Imports:	1167
Forks:	0
Commits:	1
Dependents:	15
Dependencies:	0
Followers:	39

FilteringFunctions/arm_conv_q7.c@0:3d9c67d97d6f, 2014-07-28 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning