CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/FilteringFunctions/arm_correlate_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_correlate_q15.c
xorjoep	1:24714b45cd1b	4	* Description: Correlation of Q15 sequences
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupFilters
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup Corr
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40	/**
xorjoep	1:24714b45cd1b	41	* @brief Correlation of Q15 sequences.
xorjoep	1:24714b45cd1b	42	* @param[in] *pSrcA points to the first input sequence.
xorjoep	1:24714b45cd1b	43	* @param[in] srcALen length of the first input sequence.
xorjoep	1:24714b45cd1b	44	* @param[in] *pSrcB points to the second input sequence.
xorjoep	1:24714b45cd1b	45	* @param[in] srcBLen length of the second input sequence.
xorjoep	1:24714b45cd1b	46	* @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
xorjoep	1:24714b45cd1b	47	* @return none.
xorjoep	1:24714b45cd1b	48	*
xorjoep	1:24714b45cd1b	49	* @details
xorjoep	1:24714b45cd1b	50	* <b>Scaling and Overflow Behavior:</b>
xorjoep	1:24714b45cd1b	51	*
xorjoep	1:24714b45cd1b	52	* \par
xorjoep	1:24714b45cd1b	53	* The function is implemented using a 64-bit internal accumulator.
xorjoep	1:24714b45cd1b	54	* Both inputs are in 1.15 format and multiplications yield a 2.30 result.
xorjoep	1:24714b45cd1b	55	* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
xorjoep	1:24714b45cd1b	56	* This approach provides 33 guard bits and there is no risk of overflow.
xorjoep	1:24714b45cd1b	57	* The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
xorjoep	1:24714b45cd1b	58	*
xorjoep	1:24714b45cd1b	59	* \par
xorjoep	1:24714b45cd1b	60	* Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
xorjoep	1:24714b45cd1b	61	*
xorjoep	1:24714b45cd1b	62	* \par
xorjoep	1:24714b45cd1b	63	* Refer to the function <code>arm_correlate_opt_q15()</code> for a faster implementation of this function using scratch buffers.
xorjoep	1:24714b45cd1b	64	*
xorjoep	1:24714b45cd1b	65	*/
xorjoep	1:24714b45cd1b	66
xorjoep	1:24714b45cd1b	67	void arm_correlate_q15(
xorjoep	1:24714b45cd1b	68	q15_t * pSrcA,
xorjoep	1:24714b45cd1b	69	uint32_t srcALen,
xorjoep	1:24714b45cd1b	70	q15_t * pSrcB,
xorjoep	1:24714b45cd1b	71	uint32_t srcBLen,
xorjoep	1:24714b45cd1b	72	q15_t * pDst)
xorjoep	1:24714b45cd1b	73	{
xorjoep	1:24714b45cd1b	74
xorjoep	1:24714b45cd1b	75	#if (defined(ARM_MATH_CM7) \|\| defined(ARM_MATH_CM4) \|\| defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
xorjoep	1:24714b45cd1b	76
xorjoep	1:24714b45cd1b	77	/* Run the below code for Cortex-M4 and Cortex-M3 */
xorjoep	1:24714b45cd1b	78
xorjoep	1:24714b45cd1b	79	q15_t pIn1; / inputA pointer */
xorjoep	1:24714b45cd1b	80	q15_t pIn2; / inputB pointer */
xorjoep	1:24714b45cd1b	81	q15_t pOut = pDst; / output pointer */
xorjoep	1:24714b45cd1b	82	q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
xorjoep	1:24714b45cd1b	83	q15_t px; / Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	84	q15_t py; / Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	85	q15_t pSrc1; / Intermediate pointers */
xorjoep	1:24714b45cd1b	86	q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
xorjoep	1:24714b45cd1b	87	uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
xorjoep	1:24714b45cd1b	88	int32_t inc = 1; /* Destination address modifier */
xorjoep	1:24714b45cd1b	89
xorjoep	1:24714b45cd1b	90
xorjoep	1:24714b45cd1b	91	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	92	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	93	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	94	/* But CORR(x, y) is reverse of CORR(y, x) */
xorjoep	1:24714b45cd1b	95	/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
xorjoep	1:24714b45cd1b	96	/* and the destination pointer modifier, inc is set to -1 */
xorjoep	1:24714b45cd1b	97	/* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
xorjoep	1:24714b45cd1b	98	/* But to improve the performance,
xorjoep	1:24714b45cd1b	99	* we include zeroes in the output instead of zero padding either of the inputs */
xorjoep	1:24714b45cd1b	100	/* If srcALen > srcBLen,
xorjoep	1:24714b45cd1b	101	* (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
xorjoep	1:24714b45cd1b	102	/* If srcALen < srcBLen,
xorjoep	1:24714b45cd1b	103	* (srcALen - srcBLen) zeroes have to be included at the end of the output buffer */
xorjoep	1:24714b45cd1b	104	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	105	{
xorjoep	1:24714b45cd1b	106	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	107	pIn1 = (pSrcA);
xorjoep	1:24714b45cd1b	108
xorjoep	1:24714b45cd1b	109	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	110	pIn2 = (pSrcB);
xorjoep	1:24714b45cd1b	111
xorjoep	1:24714b45cd1b	112	/* Number of output samples is calculated */
xorjoep	1:24714b45cd1b	113	outBlockSize = (2U * srcALen) - 1U;
xorjoep	1:24714b45cd1b	114
xorjoep	1:24714b45cd1b	115	/* When srcALen > srcBLen, zero padding is done to srcB
xorjoep	1:24714b45cd1b	116	* to make their lengths equal.
xorjoep	1:24714b45cd1b	117	* Instead, (outBlockSize - (srcALen + srcBLen - 1))
xorjoep	1:24714b45cd1b	118	* number of output samples are made zero */
xorjoep	1:24714b45cd1b	119	j = outBlockSize - (srcALen + (srcBLen - 1U));
xorjoep	1:24714b45cd1b	120
xorjoep	1:24714b45cd1b	121	/* Updating the pointer position to non zero value */
xorjoep	1:24714b45cd1b	122	pOut += j;
xorjoep	1:24714b45cd1b	123
xorjoep	1:24714b45cd1b	124	}
xorjoep	1:24714b45cd1b	125	else
xorjoep	1:24714b45cd1b	126	{
xorjoep	1:24714b45cd1b	127	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	128	pIn1 = (pSrcB);
xorjoep	1:24714b45cd1b	129
xorjoep	1:24714b45cd1b	130	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	131	pIn2 = (pSrcA);
xorjoep	1:24714b45cd1b	132
xorjoep	1:24714b45cd1b	133	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	134	j = srcBLen;
xorjoep	1:24714b45cd1b	135	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	136	srcALen = j;
xorjoep	1:24714b45cd1b	137
xorjoep	1:24714b45cd1b	138	/* CORR(x, y) = Reverse order(CORR(y, x)) */
xorjoep	1:24714b45cd1b	139	/* Hence set the destination pointer to point to the last output sample */
xorjoep	1:24714b45cd1b	140	pOut = pDst + ((srcALen + srcBLen) - 2U);
xorjoep	1:24714b45cd1b	141
xorjoep	1:24714b45cd1b	142	/* Destination address modifier is set to -1 */
xorjoep	1:24714b45cd1b	143	inc = -1;
xorjoep	1:24714b45cd1b	144
xorjoep	1:24714b45cd1b	145	}
xorjoep	1:24714b45cd1b	146
xorjoep	1:24714b45cd1b	147	/* The function is internally
xorjoep	1:24714b45cd1b	148	* divided into three parts according to the number of multiplications that have to
xorjoep	1:24714b45cd1b	149	* take place between inputA samples and inputB samples. In the first part of the
xorjoep	1:24714b45cd1b	150	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	151	* In the second part of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	152	* In the third part of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	153	* for every iteration. */
xorjoep	1:24714b45cd1b	154	/* The algorithm is implemented in three stages.
xorjoep	1:24714b45cd1b	155	* The loop counters of each stage are initialized here. */
xorjoep	1:24714b45cd1b	156	blockSize1 = srcBLen - 1U;
xorjoep	1:24714b45cd1b	157	blockSize2 = srcALen - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	158	blockSize3 = blockSize1;
xorjoep	1:24714b45cd1b	159
xorjoep	1:24714b45cd1b	160	/* --------------------------
xorjoep	1:24714b45cd1b	161	* Initializations of stage1
xorjoep	1:24714b45cd1b	162	* -------------------------*/
xorjoep	1:24714b45cd1b	163
xorjoep	1:24714b45cd1b	164	/* sum = x[0] * y[srcBlen - 1]
xorjoep	1:24714b45cd1b	165	* sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
xorjoep	1:24714b45cd1b	166	* ....
xorjoep	1:24714b45cd1b	167	* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
xorjoep	1:24714b45cd1b	168	*/
xorjoep	1:24714b45cd1b	169
xorjoep	1:24714b45cd1b	170	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	171	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	172	count = 1U;
xorjoep	1:24714b45cd1b	173
xorjoep	1:24714b45cd1b	174	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	175	px = pIn1;
xorjoep	1:24714b45cd1b	176
xorjoep	1:24714b45cd1b	177	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	178	pSrc1 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	179	py = pSrc1;
xorjoep	1:24714b45cd1b	180
xorjoep	1:24714b45cd1b	181	/* ------------------------
xorjoep	1:24714b45cd1b	182	* Stage1 process
xorjoep	1:24714b45cd1b	183	* ----------------------*/
xorjoep	1:24714b45cd1b	184
xorjoep	1:24714b45cd1b	185	/* The first loop starts here */
xorjoep	1:24714b45cd1b	186	while (blockSize1 > 0U)
xorjoep	1:24714b45cd1b	187	{
xorjoep	1:24714b45cd1b	188	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	189	sum = 0;
xorjoep	1:24714b45cd1b	190
xorjoep	1:24714b45cd1b	191	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	192	k = count >> 2;
xorjoep	1:24714b45cd1b	193
xorjoep	1:24714b45cd1b	194	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	195	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	196	while (k > 0U)
xorjoep	1:24714b45cd1b	197	{
xorjoep	1:24714b45cd1b	198	/* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	199	sum = __SMLALD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	200	/* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	201	sum = __SMLALD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	202
xorjoep	1:24714b45cd1b	203	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	204	k--;
xorjoep	1:24714b45cd1b	205	}
xorjoep	1:24714b45cd1b	206
xorjoep	1:24714b45cd1b	207	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	208	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	209	k = count % 0x4U;
xorjoep	1:24714b45cd1b	210
xorjoep	1:24714b45cd1b	211	while (k > 0U)
xorjoep	1:24714b45cd1b	212	{
xorjoep	1:24714b45cd1b	213	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	214	/* x[0] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	215	sum = __SMLALD(px++, py++, sum);
xorjoep	1:24714b45cd1b	216
xorjoep	1:24714b45cd1b	217	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	218	k--;
xorjoep	1:24714b45cd1b	219	}
xorjoep	1:24714b45cd1b	220
xorjoep	1:24714b45cd1b	221	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	222	*pOut = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep	1:24714b45cd1b	223	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	224	pOut += inc;
xorjoep	1:24714b45cd1b	225
xorjoep	1:24714b45cd1b	226	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	227	py = pSrc1 - count;
xorjoep	1:24714b45cd1b	228	px = pIn1;
xorjoep	1:24714b45cd1b	229
xorjoep	1:24714b45cd1b	230	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	231	count++;
xorjoep	1:24714b45cd1b	232
xorjoep	1:24714b45cd1b	233	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	234	blockSize1--;
xorjoep	1:24714b45cd1b	235	}
xorjoep	1:24714b45cd1b	236
xorjoep	1:24714b45cd1b	237	/* --------------------------
xorjoep	1:24714b45cd1b	238	* Initializations of stage2
xorjoep	1:24714b45cd1b	239	* ------------------------*/
xorjoep	1:24714b45cd1b	240
xorjoep	1:24714b45cd1b	241	/* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	242	* sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	243	* ....
xorjoep	1:24714b45cd1b	244	* sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	245	*/
xorjoep	1:24714b45cd1b	246
xorjoep	1:24714b45cd1b	247	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	248	px = pIn1;
xorjoep	1:24714b45cd1b	249
xorjoep	1:24714b45cd1b	250	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	251	py = pIn2;
xorjoep	1:24714b45cd1b	252
xorjoep	1:24714b45cd1b	253	/* count is index by which the pointer pIn1 to be incremented */
xorjoep	1:24714b45cd1b	254	count = 0U;
xorjoep	1:24714b45cd1b	255
xorjoep	1:24714b45cd1b	256	/* -------------------
xorjoep	1:24714b45cd1b	257	* Stage2 process
xorjoep	1:24714b45cd1b	258	* ------------------*/
xorjoep	1:24714b45cd1b	259
xorjoep	1:24714b45cd1b	260	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	261	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	262	* srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
xorjoep	1:24714b45cd1b	263	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	264	{
xorjoep	1:24714b45cd1b	265	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	266	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	267
xorjoep	1:24714b45cd1b	268	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	269	{
xorjoep	1:24714b45cd1b	270	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	271	acc0 = 0;
xorjoep	1:24714b45cd1b	272	acc1 = 0;
xorjoep	1:24714b45cd1b	273	acc2 = 0;
xorjoep	1:24714b45cd1b	274	acc3 = 0;
xorjoep	1:24714b45cd1b	275
xorjoep	1:24714b45cd1b	276	/* read x[0], x[1] samples */
xorjoep	1:24714b45cd1b	277	x0 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	278	/* read x[1], x[2] samples */
xorjoep	1:24714b45cd1b	279	x1 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	280	px += 2U;
xorjoep	1:24714b45cd1b	281
xorjoep	1:24714b45cd1b	282	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	283	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	284
xorjoep	1:24714b45cd1b	285	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	286	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	287	do
xorjoep	1:24714b45cd1b	288	{
xorjoep	1:24714b45cd1b	289	/* Read the first two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	290	* y[0] and y[1] */
xorjoep	1:24714b45cd1b	291	c0 = *__SIMD32(py)++;
xorjoep	1:24714b45cd1b	292
xorjoep	1:24714b45cd1b	293	/* acc0 += x[0] * y[0] + x[1] * y[1] */
xorjoep	1:24714b45cd1b	294	acc0 = __SMLALD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	295
xorjoep	1:24714b45cd1b	296	/* acc1 += x[1] * y[0] + x[2] * y[1] */
xorjoep	1:24714b45cd1b	297	acc1 = __SMLALD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	298
xorjoep	1:24714b45cd1b	299	/* Read x[2], x[3] */
xorjoep	1:24714b45cd1b	300	x2 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	301
xorjoep	1:24714b45cd1b	302	/* Read x[3], x[4] */
xorjoep	1:24714b45cd1b	303	x3 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	304
xorjoep	1:24714b45cd1b	305	/* acc2 += x[2] * y[0] + x[3] * y[1] */
xorjoep	1:24714b45cd1b	306	acc2 = __SMLALD(x2, c0, acc2);
xorjoep	1:24714b45cd1b	307
xorjoep	1:24714b45cd1b	308	/* acc3 += x[3] * y[0] + x[4] * y[1] */
xorjoep	1:24714b45cd1b	309	acc3 = __SMLALD(x3, c0, acc3);
xorjoep	1:24714b45cd1b	310
xorjoep	1:24714b45cd1b	311	/* Read y[2] and y[3] */
xorjoep	1:24714b45cd1b	312	c0 = *__SIMD32(py)++;
xorjoep	1:24714b45cd1b	313
xorjoep	1:24714b45cd1b	314	/* acc0 += x[2] * y[2] + x[3] * y[3] */
xorjoep	1:24714b45cd1b	315	acc0 = __SMLALD(x2, c0, acc0);
xorjoep	1:24714b45cd1b	316
xorjoep	1:24714b45cd1b	317	/* acc1 += x[3] * y[2] + x[4] * y[3] */
xorjoep	1:24714b45cd1b	318	acc1 = __SMLALD(x3, c0, acc1);
xorjoep	1:24714b45cd1b	319
xorjoep	1:24714b45cd1b	320	/* Read x[4], x[5] */
xorjoep	1:24714b45cd1b	321	x0 = _SIMD32_OFFSET(px + 2);
xorjoep	1:24714b45cd1b	322
xorjoep	1:24714b45cd1b	323	/* Read x[5], x[6] */
xorjoep	1:24714b45cd1b	324	x1 = _SIMD32_OFFSET(px + 3);
xorjoep	1:24714b45cd1b	325
xorjoep	1:24714b45cd1b	326	px += 4U;
xorjoep	1:24714b45cd1b	327
xorjoep	1:24714b45cd1b	328	/* acc2 += x[4] * y[2] + x[5] * y[3] */
xorjoep	1:24714b45cd1b	329	acc2 = __SMLALD(x0, c0, acc2);
xorjoep	1:24714b45cd1b	330
xorjoep	1:24714b45cd1b	331	/* acc3 += x[5] * y[2] + x[6] * y[3] */
xorjoep	1:24714b45cd1b	332	acc3 = __SMLALD(x1, c0, acc3);
xorjoep	1:24714b45cd1b	333
xorjoep	1:24714b45cd1b	334	} while (--k);
xorjoep	1:24714b45cd1b	335
xorjoep	1:24714b45cd1b	336	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	337	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	338	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	339
xorjoep	1:24714b45cd1b	340	if (k == 1U)
xorjoep	1:24714b45cd1b	341	{
xorjoep	1:24714b45cd1b	342	/* Read y[4] */
xorjoep	1:24714b45cd1b	343	c0 = *py;
xorjoep	1:24714b45cd1b	344	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	345
xorjoep	1:24714b45cd1b	346	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	347
xorjoep	1:24714b45cd1b	348	#else
xorjoep	1:24714b45cd1b	349
xorjoep	1:24714b45cd1b	350	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	351
xorjoep	1:24714b45cd1b	352	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	353	/* Read x[7] */
xorjoep	1:24714b45cd1b	354	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	355	px++;
xorjoep	1:24714b45cd1b	356
xorjoep	1:24714b45cd1b	357	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	358	acc0 = __SMLALD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	359	acc1 = __SMLALD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	360	acc2 = __SMLALDX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	361	acc3 = __SMLALDX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	362	}
xorjoep	1:24714b45cd1b	363
xorjoep	1:24714b45cd1b	364	if (k == 2U)
xorjoep	1:24714b45cd1b	365	{
xorjoep	1:24714b45cd1b	366	/* Read y[4], y[5] */
xorjoep	1:24714b45cd1b	367	c0 = *__SIMD32(py);
xorjoep	1:24714b45cd1b	368
xorjoep	1:24714b45cd1b	369	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	370	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	371
xorjoep	1:24714b45cd1b	372	/* Read x[9] */
xorjoep	1:24714b45cd1b	373	x2 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	374	px += 2U;
xorjoep	1:24714b45cd1b	375
xorjoep	1:24714b45cd1b	376	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	377	acc0 = __SMLALD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	378	acc1 = __SMLALD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	379	acc2 = __SMLALD(x3, c0, acc2);
xorjoep	1:24714b45cd1b	380	acc3 = __SMLALD(x2, c0, acc3);
xorjoep	1:24714b45cd1b	381	}
xorjoep	1:24714b45cd1b	382
xorjoep	1:24714b45cd1b	383	if (k == 3U)
xorjoep	1:24714b45cd1b	384	{
xorjoep	1:24714b45cd1b	385	/* Read y[4], y[5] */
xorjoep	1:24714b45cd1b	386	c0 = *__SIMD32(py)++;
xorjoep	1:24714b45cd1b	387
xorjoep	1:24714b45cd1b	388	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	389	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	390
xorjoep	1:24714b45cd1b	391	/* Read x[9] */
xorjoep	1:24714b45cd1b	392	x2 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	393
xorjoep	1:24714b45cd1b	394	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	395	acc0 = __SMLALD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	396	acc1 = __SMLALD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	397	acc2 = __SMLALD(x3, c0, acc2);
xorjoep	1:24714b45cd1b	398	acc3 = __SMLALD(x2, c0, acc3);
xorjoep	1:24714b45cd1b	399
xorjoep	1:24714b45cd1b	400	c0 = (*py);
xorjoep	1:24714b45cd1b	401
xorjoep	1:24714b45cd1b	402	/* Read y[6] */
xorjoep	1:24714b45cd1b	403	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	404
xorjoep	1:24714b45cd1b	405	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	406	#else
xorjoep	1:24714b45cd1b	407
xorjoep	1:24714b45cd1b	408	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	409	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	410	/* Read x[10] */
xorjoep	1:24714b45cd1b	411	x3 = _SIMD32_OFFSET(px + 2);
xorjoep	1:24714b45cd1b	412	px += 3U;
xorjoep	1:24714b45cd1b	413
xorjoep	1:24714b45cd1b	414	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	415	acc0 = __SMLALDX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	416	acc1 = __SMLALD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	417	acc2 = __SMLALDX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	418	acc3 = __SMLALDX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	419	}
xorjoep	1:24714b45cd1b	420
xorjoep	1:24714b45cd1b	421	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	422	*pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
xorjoep	1:24714b45cd1b	423	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	424	pOut += inc;
xorjoep	1:24714b45cd1b	425
xorjoep	1:24714b45cd1b	426	*pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
xorjoep	1:24714b45cd1b	427	pOut += inc;
xorjoep	1:24714b45cd1b	428
xorjoep	1:24714b45cd1b	429	*pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
xorjoep	1:24714b45cd1b	430	pOut += inc;
xorjoep	1:24714b45cd1b	431
xorjoep	1:24714b45cd1b	432	*pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
xorjoep	1:24714b45cd1b	433	pOut += inc;
xorjoep	1:24714b45cd1b	434
xorjoep	1:24714b45cd1b	435	/* Increment the count by 4 as 4 output values are computed */
xorjoep	1:24714b45cd1b	436	count += 4U;
xorjoep	1:24714b45cd1b	437
xorjoep	1:24714b45cd1b	438	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	439	px = pIn1 + count;
xorjoep	1:24714b45cd1b	440	py = pIn2;
xorjoep	1:24714b45cd1b	441
xorjoep	1:24714b45cd1b	442	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	443	blkCnt--;
xorjoep	1:24714b45cd1b	444	}
xorjoep	1:24714b45cd1b	445
xorjoep	1:24714b45cd1b	446	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	447	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	448	blkCnt = blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	449
xorjoep	1:24714b45cd1b	450	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	451	{
xorjoep	1:24714b45cd1b	452	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	453	sum = 0;
xorjoep	1:24714b45cd1b	454
xorjoep	1:24714b45cd1b	455	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	456	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	457
xorjoep	1:24714b45cd1b	458	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	459	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	460	while (k > 0U)
xorjoep	1:24714b45cd1b	461	{
xorjoep	1:24714b45cd1b	462	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	463	sum += ((q63_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	464	sum += ((q63_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	465	sum += ((q63_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	466	sum += ((q63_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	467
xorjoep	1:24714b45cd1b	468	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	469	k--;
xorjoep	1:24714b45cd1b	470	}
xorjoep	1:24714b45cd1b	471
xorjoep	1:24714b45cd1b	472	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	473	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	474	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	475
xorjoep	1:24714b45cd1b	476	while (k > 0U)
xorjoep	1:24714b45cd1b	477	{
xorjoep	1:24714b45cd1b	478	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	479	sum += ((q63_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	480
xorjoep	1:24714b45cd1b	481	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	482	k--;
xorjoep	1:24714b45cd1b	483	}
xorjoep	1:24714b45cd1b	484
xorjoep	1:24714b45cd1b	485	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	486	*pOut = (q15_t) (__SSAT(sum >> 15, 16));
xorjoep	1:24714b45cd1b	487	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	488	pOut += inc;
xorjoep	1:24714b45cd1b	489
xorjoep	1:24714b45cd1b	490	/* Increment count by 1, as one output value is computed */
xorjoep	1:24714b45cd1b	491	count++;
xorjoep	1:24714b45cd1b	492
xorjoep	1:24714b45cd1b	493	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	494	px = pIn1 + count;
xorjoep	1:24714b45cd1b	495	py = pIn2;
xorjoep	1:24714b45cd1b	496
xorjoep	1:24714b45cd1b	497	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	498	blkCnt--;
xorjoep	1:24714b45cd1b	499	}
xorjoep	1:24714b45cd1b	500	}
xorjoep	1:24714b45cd1b	501	else
xorjoep	1:24714b45cd1b	502	{
xorjoep	1:24714b45cd1b	503	/* If the srcBLen is not a multiple of 4,
xorjoep	1:24714b45cd1b	504	* the blockSize2 loop cannot be unrolled by 4 */
xorjoep	1:24714b45cd1b	505	blkCnt = blockSize2;
xorjoep	1:24714b45cd1b	506
xorjoep	1:24714b45cd1b	507	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	508	{
xorjoep	1:24714b45cd1b	509	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	510	sum = 0;
xorjoep	1:24714b45cd1b	511
xorjoep	1:24714b45cd1b	512	/* Loop over srcBLen */
xorjoep	1:24714b45cd1b	513	k = srcBLen;
xorjoep	1:24714b45cd1b	514
xorjoep	1:24714b45cd1b	515	while (k > 0U)
xorjoep	1:24714b45cd1b	516	{
xorjoep	1:24714b45cd1b	517	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	518	sum += ((q63_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	519
xorjoep	1:24714b45cd1b	520	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	521	k--;
xorjoep	1:24714b45cd1b	522	}
xorjoep	1:24714b45cd1b	523
xorjoep	1:24714b45cd1b	524	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	525	*pOut = (q15_t) (__SSAT(sum >> 15, 16));
xorjoep	1:24714b45cd1b	526	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	527	pOut += inc;
xorjoep	1:24714b45cd1b	528
xorjoep	1:24714b45cd1b	529	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	530	count++;
xorjoep	1:24714b45cd1b	531
xorjoep	1:24714b45cd1b	532	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	533	px = pIn1 + count;
xorjoep	1:24714b45cd1b	534	py = pIn2;
xorjoep	1:24714b45cd1b	535
xorjoep	1:24714b45cd1b	536	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	537	blkCnt--;
xorjoep	1:24714b45cd1b	538	}
xorjoep	1:24714b45cd1b	539	}
xorjoep	1:24714b45cd1b	540
xorjoep	1:24714b45cd1b	541	/* --------------------------
xorjoep	1:24714b45cd1b	542	* Initializations of stage3
xorjoep	1:24714b45cd1b	543	* -------------------------*/
xorjoep	1:24714b45cd1b	544
xorjoep	1:24714b45cd1b	545	/* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	546	* sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	547	* ....
xorjoep	1:24714b45cd1b	548	* sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	549	* sum += x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	550	*/
xorjoep	1:24714b45cd1b	551
xorjoep	1:24714b45cd1b	552	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	553	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	554	count = srcBLen - 1U;
xorjoep	1:24714b45cd1b	555
xorjoep	1:24714b45cd1b	556	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	557	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	558	px = pSrc1;
xorjoep	1:24714b45cd1b	559
xorjoep	1:24714b45cd1b	560	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	561	py = pIn2;
xorjoep	1:24714b45cd1b	562
xorjoep	1:24714b45cd1b	563	/* -------------------
xorjoep	1:24714b45cd1b	564	* Stage3 process
xorjoep	1:24714b45cd1b	565	* ------------------*/
xorjoep	1:24714b45cd1b	566
xorjoep	1:24714b45cd1b	567	while (blockSize3 > 0U)
xorjoep	1:24714b45cd1b	568	{
xorjoep	1:24714b45cd1b	569	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	570	sum = 0;
xorjoep	1:24714b45cd1b	571
xorjoep	1:24714b45cd1b	572	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	573	k = count >> 2U;
xorjoep	1:24714b45cd1b	574
xorjoep	1:24714b45cd1b	575	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	576	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	577	while (k > 0U)
xorjoep	1:24714b45cd1b	578	{
xorjoep	1:24714b45cd1b	579	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	580	/* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
xorjoep	1:24714b45cd1b	581	sum = __SMLALD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	582	/* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
xorjoep	1:24714b45cd1b	583	sum = __SMLALD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	584
xorjoep	1:24714b45cd1b	585	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	586	k--;
xorjoep	1:24714b45cd1b	587	}
xorjoep	1:24714b45cd1b	588
xorjoep	1:24714b45cd1b	589	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	590	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	591	k = count % 0x4U;
xorjoep	1:24714b45cd1b	592
xorjoep	1:24714b45cd1b	593	while (k > 0U)
xorjoep	1:24714b45cd1b	594	{
xorjoep	1:24714b45cd1b	595	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	596	sum = __SMLALD(px++, py++, sum);
xorjoep	1:24714b45cd1b	597
xorjoep	1:24714b45cd1b	598	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	599	k--;
xorjoep	1:24714b45cd1b	600	}
xorjoep	1:24714b45cd1b	601
xorjoep	1:24714b45cd1b	602	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	603	*pOut = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep	1:24714b45cd1b	604	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	605	pOut += inc;
xorjoep	1:24714b45cd1b	606
xorjoep	1:24714b45cd1b	607	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	608	px = ++pSrc1;
xorjoep	1:24714b45cd1b	609	py = pIn2;
xorjoep	1:24714b45cd1b	610
xorjoep	1:24714b45cd1b	611	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	612	count--;
xorjoep	1:24714b45cd1b	613
xorjoep	1:24714b45cd1b	614	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	615	blockSize3--;
xorjoep	1:24714b45cd1b	616	}
xorjoep	1:24714b45cd1b	617
xorjoep	1:24714b45cd1b	618	#else
xorjoep	1:24714b45cd1b	619
xorjoep	1:24714b45cd1b	620	/* Run the below code for Cortex-M0 */
xorjoep	1:24714b45cd1b	621
xorjoep	1:24714b45cd1b	622	q15_t *pIn1 = pSrcA; /* inputA pointer */
xorjoep	1:24714b45cd1b	623	q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
xorjoep	1:24714b45cd1b	624	q63_t sum; /* Accumulators */
xorjoep	1:24714b45cd1b	625	uint32_t i = 0U, j; /* loop counters */
xorjoep	1:24714b45cd1b	626	uint32_t inv = 0U; /* Reverse order flag */
xorjoep	1:24714b45cd1b	627	uint32_t tot = 0U; /* Length */
xorjoep	1:24714b45cd1b	628
xorjoep	1:24714b45cd1b	629	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	630	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	631	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	632	/* But CORR(x, y) is reverse of CORR(y, x) */
xorjoep	1:24714b45cd1b	633	/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
xorjoep	1:24714b45cd1b	634	/* and a variable, inv, is set to 1 */
xorjoep	1:24714b45cd1b	635	/* If the lengths are not equal then zero padding has to be done to make the two
xorjoep	1:24714b45cd1b	636	* inputs the same length. But to improve the performance, we include zeroes
xorjoep	1:24714b45cd1b	637	* in the output instead of zero padding either of the inputs */
xorjoep	1:24714b45cd1b	638	/* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included at the
xorjoep	1:24714b45cd1b	639	* start of the output buffer */
xorjoep	1:24714b45cd1b	640	/* If srcALen < srcBLen, (srcBLen - srcALen) zeroes have to be included at the
xorjoep	1:24714b45cd1b	641	* end of the output buffer */
xorjoep	1:24714b45cd1b	642	/* Once the zero padding is done, the remainder of the output is calculated
xorjoep	1:24714b45cd1b	643	* using convolution but with the shorter signal time shifted. */
xorjoep	1:24714b45cd1b	644
xorjoep	1:24714b45cd1b	645	/* Calculate the length of the remaining sequence */
xorjoep	1:24714b45cd1b	646	tot = ((srcALen + srcBLen) - 2U);
xorjoep	1:24714b45cd1b	647
xorjoep	1:24714b45cd1b	648	if (srcALen > srcBLen)
xorjoep	1:24714b45cd1b	649	{
xorjoep	1:24714b45cd1b	650	/* Calculating the number of zeros to be padded to the output */
xorjoep	1:24714b45cd1b	651	j = srcALen - srcBLen;
xorjoep	1:24714b45cd1b	652
xorjoep	1:24714b45cd1b	653	/* Initialise the pointer after zero padding */
xorjoep	1:24714b45cd1b	654	pDst += j;
xorjoep	1:24714b45cd1b	655	}
xorjoep	1:24714b45cd1b	656
xorjoep	1:24714b45cd1b	657	else if (srcALen < srcBLen)
xorjoep	1:24714b45cd1b	658	{
xorjoep	1:24714b45cd1b	659	/* Initialization to inputB pointer */
xorjoep	1:24714b45cd1b	660	pIn1 = pSrcB;
xorjoep	1:24714b45cd1b	661
xorjoep	1:24714b45cd1b	662	/* Initialization to the end of inputA pointer */
xorjoep	1:24714b45cd1b	663	pIn2 = pSrcA + (srcALen - 1U);
xorjoep	1:24714b45cd1b	664
xorjoep	1:24714b45cd1b	665	/* Initialisation of the pointer after zero padding */
xorjoep	1:24714b45cd1b	666	pDst = pDst + tot;
xorjoep	1:24714b45cd1b	667
xorjoep	1:24714b45cd1b	668	/* Swapping the lengths */
xorjoep	1:24714b45cd1b	669	j = srcALen;
xorjoep	1:24714b45cd1b	670	srcALen = srcBLen;
xorjoep	1:24714b45cd1b	671	srcBLen = j;
xorjoep	1:24714b45cd1b	672
xorjoep	1:24714b45cd1b	673	/* Setting the reverse flag */
xorjoep	1:24714b45cd1b	674	inv = 1;
xorjoep	1:24714b45cd1b	675
xorjoep	1:24714b45cd1b	676	}
xorjoep	1:24714b45cd1b	677
xorjoep	1:24714b45cd1b	678	/* Loop to calculate convolution for output length number of times */
xorjoep	1:24714b45cd1b	679	for (i = 0U; i <= tot; i++)
xorjoep	1:24714b45cd1b	680	{
xorjoep	1:24714b45cd1b	681	/* Initialize sum with zero to carry on MAC operations */
xorjoep	1:24714b45cd1b	682	sum = 0;
xorjoep	1:24714b45cd1b	683
xorjoep	1:24714b45cd1b	684	/* Loop to perform MAC operations according to convolution equation */
xorjoep	1:24714b45cd1b	685	for (j = 0U; j <= i; j++)
xorjoep	1:24714b45cd1b	686	{
xorjoep	1:24714b45cd1b	687	/* Check the array limitations */
xorjoep	1:24714b45cd1b	688	if ((((i - j) < srcBLen) && (j < srcALen)))
xorjoep	1:24714b45cd1b	689	{
xorjoep	1:24714b45cd1b	690	/* z[i] += x[i-j] * y[j] */
xorjoep	1:24714b45cd1b	691	sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
xorjoep	1:24714b45cd1b	692	}
xorjoep	1:24714b45cd1b	693	}
xorjoep	1:24714b45cd1b	694	/* Store the output in the destination buffer */
xorjoep	1:24714b45cd1b	695	if (inv == 1)
xorjoep	1:24714b45cd1b	696	*pDst-- = (q15_t) __SSAT((sum >> 15U), 16U);
xorjoep	1:24714b45cd1b	697	else
xorjoep	1:24714b45cd1b	698	*pDst++ = (q15_t) __SSAT((sum >> 15U), 16U);
xorjoep	1:24714b45cd1b	699	}
xorjoep	1:24714b45cd1b	700
xorjoep	1:24714b45cd1b	701	#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
xorjoep	1:24714b45cd1b	702
xorjoep	1:24714b45cd1b	703	}
xorjoep	1:24714b45cd1b	704
xorjoep	1:24714b45cd1b	705	/**
xorjoep	1:24714b45cd1b	706	* @} end of Corr group
xorjoep	1:24714b45cd1b	707	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/FilteringFunctions/arm_correlate_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning