CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/FilteringFunctions/arm_conv_fast_q31.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_conv_fast_q31.c
xorjoep	1:24714b45cd1b	4	* Description: Fast Q31 Convolution
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupFilters
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup Conv
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40	/**
xorjoep	1:24714b45cd1b	41	* @param[in] *pSrcA points to the first input sequence.
xorjoep	1:24714b45cd1b	42	* @param[in] srcALen length of the first input sequence.
xorjoep	1:24714b45cd1b	43	* @param[in] *pSrcB points to the second input sequence.
xorjoep	1:24714b45cd1b	44	* @param[in] srcBLen length of the second input sequence.
xorjoep	1:24714b45cd1b	45	* @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
xorjoep	1:24714b45cd1b	46	* @return none.
xorjoep	1:24714b45cd1b	47	*
xorjoep	1:24714b45cd1b	48	* @details
xorjoep	1:24714b45cd1b	49	* <b>Scaling and Overflow Behavior:</b>
xorjoep	1:24714b45cd1b	50	*
xorjoep	1:24714b45cd1b	51	* \par
xorjoep	1:24714b45cd1b	52	* This function is optimized for speed at the expense of fixed-point precision and overflow protection.
xorjoep	1:24714b45cd1b	53	* The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
xorjoep	1:24714b45cd1b	54	* These intermediate results are accumulated in a 32-bit register in 2.30 format.
xorjoep	1:24714b45cd1b	55	* Finally, the accumulator is saturated and converted to a 1.31 result.
xorjoep	1:24714b45cd1b	56	*
xorjoep	1:24714b45cd1b	57	* \par
xorjoep	1:24714b45cd1b	58	* The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
xorjoep	1:24714b45cd1b	59	* In order to avoid overflows completely the input signals must be scaled down.
xorjoep	1:24714b45cd1b	60	* Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
xorjoep	1:24714b45cd1b	61	* as maximum of min(srcALen, srcBLen) number of additions are carried internally.
xorjoep	1:24714b45cd1b	62	*
xorjoep	1:24714b45cd1b	63	* \par
xorjoep	1:24714b45cd1b	64	* See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
xorjoep	1:24714b45cd1b	65	*/
xorjoep	1:24714b45cd1b	66
xorjoep	1:24714b45cd1b	67	void arm_conv_fast_q31(
xorjoep	1:24714b45cd1b	68	q31_t * pSrcA,
xorjoep	1:24714b45cd1b	69	uint32_t srcALen,
xorjoep	1:24714b45cd1b	70	q31_t * pSrcB,
xorjoep	1:24714b45cd1b	71	uint32_t srcBLen,
xorjoep	1:24714b45cd1b	72	q31_t * pDst)
xorjoep	1:24714b45cd1b	73	{
xorjoep	1:24714b45cd1b	74	q31_t pIn1; / inputA pointer */
xorjoep	1:24714b45cd1b	75	q31_t pIn2; / inputB pointer */
xorjoep	1:24714b45cd1b	76	q31_t pOut = pDst; / output pointer */
xorjoep	1:24714b45cd1b	77	q31_t px; / Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	78	q31_t py; / Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	79	q31_t pSrc1, pSrc2; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	80	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep	1:24714b45cd1b	81	q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
xorjoep	1:24714b45cd1b	82	uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
xorjoep	1:24714b45cd1b	83
xorjoep	1:24714b45cd1b	84	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	85	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	86	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	87	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	88	{
xorjoep	1:24714b45cd1b	89	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	90	pIn1 = pSrcA;
xorjoep	1:24714b45cd1b	91
xorjoep	1:24714b45cd1b	92	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	93	pIn2 = pSrcB;
xorjoep	1:24714b45cd1b	94	}
xorjoep	1:24714b45cd1b	95	else
xorjoep	1:24714b45cd1b	96	{
xorjoep	1:24714b45cd1b	97	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	98	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	99
xorjoep	1:24714b45cd1b	100	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	101	pIn2 = pSrcA;
xorjoep	1:24714b45cd1b	102
xorjoep	1:24714b45cd1b	103	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	104	j = srcBLen;
xorjoep	1:24714b45cd1b	105	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	106	srcALen = j;
xorjoep	1:24714b45cd1b	107	}
xorjoep	1:24714b45cd1b	108
xorjoep	1:24714b45cd1b	109	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep	1:24714b45cd1b	110	/* The function is internally
xorjoep	1:24714b45cd1b	111	* divided into three stages according to the number of multiplications that has to be
xorjoep	1:24714b45cd1b	112	* taken place between inputA samples and inputB samples. In the first stage of the
xorjoep	1:24714b45cd1b	113	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	114	* In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	115	* In the third stage of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	116	* for every iteration. */
xorjoep	1:24714b45cd1b	117
xorjoep	1:24714b45cd1b	118	/* The algorithm is implemented in three stages.
xorjoep	1:24714b45cd1b	119	The loop counters of each stage is initiated here. */
xorjoep	1:24714b45cd1b	120	blockSize1 = srcBLen - 1U;
xorjoep	1:24714b45cd1b	121	blockSize2 = srcALen - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	122	blockSize3 = blockSize1;
xorjoep	1:24714b45cd1b	123
xorjoep	1:24714b45cd1b	124	/* --------------------------
xorjoep	1:24714b45cd1b	125	* Initializations of stage1
xorjoep	1:24714b45cd1b	126	* -------------------------*/
xorjoep	1:24714b45cd1b	127
xorjoep	1:24714b45cd1b	128	/* sum = x[0] * y[0]
xorjoep	1:24714b45cd1b	129	* sum = x[0] * y[1] + x[1] * y[0]
xorjoep	1:24714b45cd1b	130	* ....
xorjoep	1:24714b45cd1b	131	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep	1:24714b45cd1b	132	*/
xorjoep	1:24714b45cd1b	133
xorjoep	1:24714b45cd1b	134	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	135	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	136	count = 1U;
xorjoep	1:24714b45cd1b	137
xorjoep	1:24714b45cd1b	138	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	139	px = pIn1;
xorjoep	1:24714b45cd1b	140
xorjoep	1:24714b45cd1b	141	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	142	py = pIn2;
xorjoep	1:24714b45cd1b	143
xorjoep	1:24714b45cd1b	144
xorjoep	1:24714b45cd1b	145	/* ------------------------
xorjoep	1:24714b45cd1b	146	* Stage1 process
xorjoep	1:24714b45cd1b	147	* ----------------------*/
xorjoep	1:24714b45cd1b	148
xorjoep	1:24714b45cd1b	149	/* The first stage starts here */
xorjoep	1:24714b45cd1b	150	while (blockSize1 > 0U)
xorjoep	1:24714b45cd1b	151	{
xorjoep	1:24714b45cd1b	152	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	153	sum = 0;
xorjoep	1:24714b45cd1b	154
xorjoep	1:24714b45cd1b	155	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	156	k = count >> 2U;
xorjoep	1:24714b45cd1b	157
xorjoep	1:24714b45cd1b	158	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	159	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	160	while (k > 0U)
xorjoep	1:24714b45cd1b	161	{
xorjoep	1:24714b45cd1b	162	/* x[0] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	163	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	164	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	165
xorjoep	1:24714b45cd1b	166	/* x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	167	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	168	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	169
xorjoep	1:24714b45cd1b	170	/* x[2] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	171	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	172	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	173
xorjoep	1:24714b45cd1b	174	/* x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	175	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	176	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	177
xorjoep	1:24714b45cd1b	178	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	179	k--;
xorjoep	1:24714b45cd1b	180	}
xorjoep	1:24714b45cd1b	181
xorjoep	1:24714b45cd1b	182	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	183	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	184	k = count % 0x4U;
xorjoep	1:24714b45cd1b	185
xorjoep	1:24714b45cd1b	186	while (k > 0U)
xorjoep	1:24714b45cd1b	187	{
xorjoep	1:24714b45cd1b	188	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	189	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	190	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	191
xorjoep	1:24714b45cd1b	192	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	193	k--;
xorjoep	1:24714b45cd1b	194	}
xorjoep	1:24714b45cd1b	195
xorjoep	1:24714b45cd1b	196	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	197	*pOut++ = sum << 1;
xorjoep	1:24714b45cd1b	198
xorjoep	1:24714b45cd1b	199	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	200	py = pIn2 + count;
xorjoep	1:24714b45cd1b	201	px = pIn1;
xorjoep	1:24714b45cd1b	202
xorjoep	1:24714b45cd1b	203	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	204	count++;
xorjoep	1:24714b45cd1b	205
xorjoep	1:24714b45cd1b	206	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	207	blockSize1--;
xorjoep	1:24714b45cd1b	208	}
xorjoep	1:24714b45cd1b	209
xorjoep	1:24714b45cd1b	210	/* --------------------------
xorjoep	1:24714b45cd1b	211	* Initializations of stage2
xorjoep	1:24714b45cd1b	212	* ------------------------*/
xorjoep	1:24714b45cd1b	213
xorjoep	1:24714b45cd1b	214	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep	1:24714b45cd1b	215	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep	1:24714b45cd1b	216	* ....
xorjoep	1:24714b45cd1b	217	* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	218	*/
xorjoep	1:24714b45cd1b	219
xorjoep	1:24714b45cd1b	220	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	221	px = pIn1;
xorjoep	1:24714b45cd1b	222
xorjoep	1:24714b45cd1b	223	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	224	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	225	py = pSrc2;
xorjoep	1:24714b45cd1b	226
xorjoep	1:24714b45cd1b	227	/* count is index by which the pointer pIn1 to be incremented */
xorjoep	1:24714b45cd1b	228	count = 0U;
xorjoep	1:24714b45cd1b	229
xorjoep	1:24714b45cd1b	230	/* -------------------
xorjoep	1:24714b45cd1b	231	* Stage2 process
xorjoep	1:24714b45cd1b	232	* ------------------*/
xorjoep	1:24714b45cd1b	233
xorjoep	1:24714b45cd1b	234	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	235	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	236	* srcBLen should be greater than or equal to 4 */
xorjoep	1:24714b45cd1b	237	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	238	{
xorjoep	1:24714b45cd1b	239	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	240	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	241
xorjoep	1:24714b45cd1b	242	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	243	{
xorjoep	1:24714b45cd1b	244	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	245	acc0 = 0;
xorjoep	1:24714b45cd1b	246	acc1 = 0;
xorjoep	1:24714b45cd1b	247	acc2 = 0;
xorjoep	1:24714b45cd1b	248	acc3 = 0;
xorjoep	1:24714b45cd1b	249
xorjoep	1:24714b45cd1b	250	/* read x[0], x[1], x[2] samples */
xorjoep	1:24714b45cd1b	251	x0 = *(px++);
xorjoep	1:24714b45cd1b	252	x1 = *(px++);
xorjoep	1:24714b45cd1b	253	x2 = *(px++);
xorjoep	1:24714b45cd1b	254
xorjoep	1:24714b45cd1b	255	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	256	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	257
xorjoep	1:24714b45cd1b	258	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	259	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	260	do
xorjoep	1:24714b45cd1b	261	{
xorjoep	1:24714b45cd1b	262	/* Read y[srcBLen - 1] sample */
xorjoep	1:24714b45cd1b	263	c0 = *(py--);
xorjoep	1:24714b45cd1b	264
xorjoep	1:24714b45cd1b	265	/* Read x[3] sample */
xorjoep	1:24714b45cd1b	266	x3 = *(px++);
xorjoep	1:24714b45cd1b	267
xorjoep	1:24714b45cd1b	268	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	269	/* acc0 += x[0] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	270	acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
xorjoep	1:24714b45cd1b	271
xorjoep	1:24714b45cd1b	272	/* acc1 += x[1] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	273	acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
xorjoep	1:24714b45cd1b	274
xorjoep	1:24714b45cd1b	275	/* acc2 += x[2] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	276	acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
xorjoep	1:24714b45cd1b	277
xorjoep	1:24714b45cd1b	278	/* acc3 += x[3] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	279	acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
xorjoep	1:24714b45cd1b	280
xorjoep	1:24714b45cd1b	281	/* Read y[srcBLen - 2] sample */
xorjoep	1:24714b45cd1b	282	c0 = *(py--);
xorjoep	1:24714b45cd1b	283
xorjoep	1:24714b45cd1b	284	/* Read x[4] sample */
xorjoep	1:24714b45cd1b	285	x0 = *(px++);
xorjoep	1:24714b45cd1b	286
xorjoep	1:24714b45cd1b	287	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	288	/* acc0 += x[1] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	289	acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
xorjoep	1:24714b45cd1b	290	/* acc1 += x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	291	acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
xorjoep	1:24714b45cd1b	292	/* acc2 += x[3] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	293	acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
xorjoep	1:24714b45cd1b	294	/* acc3 += x[4] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	295	acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
xorjoep	1:24714b45cd1b	296
xorjoep	1:24714b45cd1b	297	/* Read y[srcBLen - 3] sample */
xorjoep	1:24714b45cd1b	298	c0 = *(py--);
xorjoep	1:24714b45cd1b	299
xorjoep	1:24714b45cd1b	300	/* Read x[5] sample */
xorjoep	1:24714b45cd1b	301	x1 = *(px++);
xorjoep	1:24714b45cd1b	302
xorjoep	1:24714b45cd1b	303	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	304	/* acc0 += x[2] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	305	acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
xorjoep	1:24714b45cd1b	306	/* acc1 += x[3] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	307	acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
xorjoep	1:24714b45cd1b	308	/* acc2 += x[4] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	309	acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
xorjoep	1:24714b45cd1b	310	/* acc3 += x[5] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	311	acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
xorjoep	1:24714b45cd1b	312
xorjoep	1:24714b45cd1b	313	/* Read y[srcBLen - 4] sample */
xorjoep	1:24714b45cd1b	314	c0 = *(py--);
xorjoep	1:24714b45cd1b	315
xorjoep	1:24714b45cd1b	316	/* Read x[6] sample */
xorjoep	1:24714b45cd1b	317	x2 = *(px++);
xorjoep	1:24714b45cd1b	318
xorjoep	1:24714b45cd1b	319	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	320	/* acc0 += x[3] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	321	acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
xorjoep	1:24714b45cd1b	322	/* acc1 += x[4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	323	acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
xorjoep	1:24714b45cd1b	324	/* acc2 += x[5] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	325	acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
xorjoep	1:24714b45cd1b	326	/* acc3 += x[6] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	327	acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
xorjoep	1:24714b45cd1b	328
xorjoep	1:24714b45cd1b	329
xorjoep	1:24714b45cd1b	330	} while (--k);
xorjoep	1:24714b45cd1b	331
xorjoep	1:24714b45cd1b	332	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	333	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	334	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	335
xorjoep	1:24714b45cd1b	336	while (k > 0U)
xorjoep	1:24714b45cd1b	337	{
xorjoep	1:24714b45cd1b	338	/* Read y[srcBLen - 5] sample */
xorjoep	1:24714b45cd1b	339	c0 = *(py--);
xorjoep	1:24714b45cd1b	340
xorjoep	1:24714b45cd1b	341	/* Read x[7] sample */
xorjoep	1:24714b45cd1b	342	x3 = *(px++);
xorjoep	1:24714b45cd1b	343
xorjoep	1:24714b45cd1b	344	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	345	/* acc0 += x[4] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	346	acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
xorjoep	1:24714b45cd1b	347	/* acc1 += x[5] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	348	acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
xorjoep	1:24714b45cd1b	349	/* acc2 += x[6] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	350	acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
xorjoep	1:24714b45cd1b	351	/* acc3 += x[7] * y[srcBLen - 5] */
xorjoep	1:24714b45cd1b	352	acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
xorjoep	1:24714b45cd1b	353
xorjoep	1:24714b45cd1b	354	/* Reuse the present samples for the next MAC */
xorjoep	1:24714b45cd1b	355	x0 = x1;
xorjoep	1:24714b45cd1b	356	x1 = x2;
xorjoep	1:24714b45cd1b	357	x2 = x3;
xorjoep	1:24714b45cd1b	358
xorjoep	1:24714b45cd1b	359	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	360	k--;
xorjoep	1:24714b45cd1b	361	}
xorjoep	1:24714b45cd1b	362
xorjoep	1:24714b45cd1b	363	/* Store the results in the accumulators in the destination buffer. */
xorjoep	1:24714b45cd1b	364	*pOut++ = (q31_t) (acc0 << 1);
xorjoep	1:24714b45cd1b	365	*pOut++ = (q31_t) (acc1 << 1);
xorjoep	1:24714b45cd1b	366	*pOut++ = (q31_t) (acc2 << 1);
xorjoep	1:24714b45cd1b	367	*pOut++ = (q31_t) (acc3 << 1);
xorjoep	1:24714b45cd1b	368
xorjoep	1:24714b45cd1b	369	/* Increment the pointer pIn1 index, count by 4 */
xorjoep	1:24714b45cd1b	370	count += 4U;
xorjoep	1:24714b45cd1b	371
xorjoep	1:24714b45cd1b	372	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	373	px = pIn1 + count;
xorjoep	1:24714b45cd1b	374	py = pSrc2;
xorjoep	1:24714b45cd1b	375
xorjoep	1:24714b45cd1b	376	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	377	blkCnt--;
xorjoep	1:24714b45cd1b	378	}
xorjoep	1:24714b45cd1b	379
xorjoep	1:24714b45cd1b	380	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	381	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	382	blkCnt = blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	383
xorjoep	1:24714b45cd1b	384	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	385	{
xorjoep	1:24714b45cd1b	386	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	387	sum = 0;
xorjoep	1:24714b45cd1b	388
xorjoep	1:24714b45cd1b	389	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	390	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	391
xorjoep	1:24714b45cd1b	392	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	393	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	394	while (k > 0U)
xorjoep	1:24714b45cd1b	395	{
xorjoep	1:24714b45cd1b	396	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	397	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	398	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	399	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	400	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	401	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	402	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	403	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	404	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	405
xorjoep	1:24714b45cd1b	406	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	407	k--;
xorjoep	1:24714b45cd1b	408	}
xorjoep	1:24714b45cd1b	409
xorjoep	1:24714b45cd1b	410	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	411	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	412	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	413
xorjoep	1:24714b45cd1b	414	while (k > 0U)
xorjoep	1:24714b45cd1b	415	{
xorjoep	1:24714b45cd1b	416	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	417	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	418	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	419
xorjoep	1:24714b45cd1b	420	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	421	k--;
xorjoep	1:24714b45cd1b	422	}
xorjoep	1:24714b45cd1b	423
xorjoep	1:24714b45cd1b	424	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	425	*pOut++ = sum << 1;
xorjoep	1:24714b45cd1b	426
xorjoep	1:24714b45cd1b	427	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	428	count++;
xorjoep	1:24714b45cd1b	429
xorjoep	1:24714b45cd1b	430	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	431	px = pIn1 + count;
xorjoep	1:24714b45cd1b	432	py = pSrc2;
xorjoep	1:24714b45cd1b	433
xorjoep	1:24714b45cd1b	434	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	435	blkCnt--;
xorjoep	1:24714b45cd1b	436	}
xorjoep	1:24714b45cd1b	437	}
xorjoep	1:24714b45cd1b	438	else
xorjoep	1:24714b45cd1b	439	{
xorjoep	1:24714b45cd1b	440	/* If the srcBLen is not a multiple of 4,
xorjoep	1:24714b45cd1b	441	* the blockSize2 loop cannot be unrolled by 4 */
xorjoep	1:24714b45cd1b	442	blkCnt = blockSize2;
xorjoep	1:24714b45cd1b	443
xorjoep	1:24714b45cd1b	444	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	445	{
xorjoep	1:24714b45cd1b	446	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	447	sum = 0;
xorjoep	1:24714b45cd1b	448
xorjoep	1:24714b45cd1b	449	/* srcBLen number of MACS should be performed */
xorjoep	1:24714b45cd1b	450	k = srcBLen;
xorjoep	1:24714b45cd1b	451
xorjoep	1:24714b45cd1b	452	while (k > 0U)
xorjoep	1:24714b45cd1b	453	{
xorjoep	1:24714b45cd1b	454	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	455	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	456	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	457
xorjoep	1:24714b45cd1b	458	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	459	k--;
xorjoep	1:24714b45cd1b	460	}
xorjoep	1:24714b45cd1b	461
xorjoep	1:24714b45cd1b	462	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	463	*pOut++ = sum << 1;
xorjoep	1:24714b45cd1b	464
xorjoep	1:24714b45cd1b	465	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	466	count++;
xorjoep	1:24714b45cd1b	467
xorjoep	1:24714b45cd1b	468	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	469	px = pIn1 + count;
xorjoep	1:24714b45cd1b	470	py = pSrc2;
xorjoep	1:24714b45cd1b	471
xorjoep	1:24714b45cd1b	472	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	473	blkCnt--;
xorjoep	1:24714b45cd1b	474	}
xorjoep	1:24714b45cd1b	475	}
xorjoep	1:24714b45cd1b	476
xorjoep	1:24714b45cd1b	477
xorjoep	1:24714b45cd1b	478	/* --------------------------
xorjoep	1:24714b45cd1b	479	* Initializations of stage3
xorjoep	1:24714b45cd1b	480	* -------------------------*/
xorjoep	1:24714b45cd1b	481
xorjoep	1:24714b45cd1b	482	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	483	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep	1:24714b45cd1b	484	* ....
xorjoep	1:24714b45cd1b	485	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep	1:24714b45cd1b	486	* sum += x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	487	*/
xorjoep	1:24714b45cd1b	488
xorjoep	1:24714b45cd1b	489	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	490	The blockSize3 variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	491
xorjoep	1:24714b45cd1b	492	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	493	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	494	px = pSrc1;
xorjoep	1:24714b45cd1b	495
xorjoep	1:24714b45cd1b	496	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	497	pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	498	py = pSrc2;
xorjoep	1:24714b45cd1b	499
xorjoep	1:24714b45cd1b	500	/* -------------------
xorjoep	1:24714b45cd1b	501	* Stage3 process
xorjoep	1:24714b45cd1b	502	* ------------------*/
xorjoep	1:24714b45cd1b	503
xorjoep	1:24714b45cd1b	504	while (blockSize3 > 0U)
xorjoep	1:24714b45cd1b	505	{
xorjoep	1:24714b45cd1b	506	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	507	sum = 0;
xorjoep	1:24714b45cd1b	508
xorjoep	1:24714b45cd1b	509	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	510	k = blockSize3 >> 2U;
xorjoep	1:24714b45cd1b	511
xorjoep	1:24714b45cd1b	512	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	513	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	514	while (k > 0U)
xorjoep	1:24714b45cd1b	515	{
xorjoep	1:24714b45cd1b	516	/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	517	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	518	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	519
xorjoep	1:24714b45cd1b	520	/* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	521	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	522	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	523
xorjoep	1:24714b45cd1b	524	/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	525	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	526	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	527
xorjoep	1:24714b45cd1b	528	/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
xorjoep	1:24714b45cd1b	529	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	530	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	531
xorjoep	1:24714b45cd1b	532	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	533	k--;
xorjoep	1:24714b45cd1b	534	}
xorjoep	1:24714b45cd1b	535
xorjoep	1:24714b45cd1b	536	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	537	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	538	k = blockSize3 % 0x4U;
xorjoep	1:24714b45cd1b	539
xorjoep	1:24714b45cd1b	540	while (k > 0U)
xorjoep	1:24714b45cd1b	541	{
xorjoep	1:24714b45cd1b	542	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	543	sum = (q31_t) ((((q63_t) sum << 32) +
xorjoep	1:24714b45cd1b	544	((q63_t) * px++ * (*py--))) >> 32);
xorjoep	1:24714b45cd1b	545
xorjoep	1:24714b45cd1b	546	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	547	k--;
xorjoep	1:24714b45cd1b	548	}
xorjoep	1:24714b45cd1b	549
xorjoep	1:24714b45cd1b	550	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	551	*pOut++ = sum << 1;
xorjoep	1:24714b45cd1b	552
xorjoep	1:24714b45cd1b	553	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	554	px = ++pSrc1;
xorjoep	1:24714b45cd1b	555	py = pSrc2;
xorjoep	1:24714b45cd1b	556
xorjoep	1:24714b45cd1b	557	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	558	blockSize3--;
xorjoep	1:24714b45cd1b	559	}
xorjoep	1:24714b45cd1b	560
xorjoep	1:24714b45cd1b	561	}
xorjoep	1:24714b45cd1b	562
xorjoep	1:24714b45cd1b	563	/**
xorjoep	1:24714b45cd1b	564	* @} end of Conv group
xorjoep	1:24714b45cd1b	565	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/FilteringFunctions/arm_conv_fast_q31.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning