CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/FilteringFunctions/arm_correlate_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_correlate_fast_q15.c
xorjoep	1:24714b45cd1b	4	* Description: Fast Q15 Correlation
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupFilters
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup Corr
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40	/**
xorjoep	1:24714b45cd1b	41	* @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
xorjoep	1:24714b45cd1b	42	* @param[in] *pSrcA points to the first input sequence.
xorjoep	1:24714b45cd1b	43	* @param[in] srcALen length of the first input sequence.
xorjoep	1:24714b45cd1b	44	* @param[in] *pSrcB points to the second input sequence.
xorjoep	1:24714b45cd1b	45	* @param[in] srcBLen length of the second input sequence.
xorjoep	1:24714b45cd1b	46	* @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
xorjoep	1:24714b45cd1b	47	* @return none.
xorjoep	1:24714b45cd1b	48	*
xorjoep	1:24714b45cd1b	49	* <b>Scaling and Overflow Behavior:</b>
xorjoep	1:24714b45cd1b	50	*
xorjoep	1:24714b45cd1b	51	* \par
xorjoep	1:24714b45cd1b	52	* This fast version uses a 32-bit accumulator with 2.30 format.
xorjoep	1:24714b45cd1b	53	* The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
xorjoep	1:24714b45cd1b	54	* There is no saturation on intermediate additions.
xorjoep	1:24714b45cd1b	55	* Thus, if the accumulator overflows it wraps around and distorts the result.
xorjoep	1:24714b45cd1b	56	* The input signals should be scaled down to avoid intermediate overflows.
xorjoep	1:24714b45cd1b	57	* Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
xorjoep	1:24714b45cd1b	58	* maximum of min(srcALen, srcBLen) number of additions is carried internally.
xorjoep	1:24714b45cd1b	59	* The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
xorjoep	1:24714b45cd1b	60	*
xorjoep	1:24714b45cd1b	61	* \par
xorjoep	1:24714b45cd1b	62	* See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
xorjoep	1:24714b45cd1b	63	*/
xorjoep	1:24714b45cd1b	64
xorjoep	1:24714b45cd1b	65	void arm_correlate_fast_q15(
xorjoep	1:24714b45cd1b	66	q15_t * pSrcA,
xorjoep	1:24714b45cd1b	67	uint32_t srcALen,
xorjoep	1:24714b45cd1b	68	q15_t * pSrcB,
xorjoep	1:24714b45cd1b	69	uint32_t srcBLen,
xorjoep	1:24714b45cd1b	70	q15_t * pDst)
xorjoep	1:24714b45cd1b	71	{
xorjoep	1:24714b45cd1b	72	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	73
xorjoep	1:24714b45cd1b	74	q15_t pIn1; / inputA pointer */
xorjoep	1:24714b45cd1b	75	q15_t pIn2; / inputB pointer */
xorjoep	1:24714b45cd1b	76	q15_t pOut = pDst; / output pointer */
xorjoep	1:24714b45cd1b	77	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
xorjoep	1:24714b45cd1b	78	q15_t px; / Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	79	q15_t py; / Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	80	q15_t pSrc1; / Intermediate pointers */
xorjoep	1:24714b45cd1b	81	q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
xorjoep	1:24714b45cd1b	82	uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
xorjoep	1:24714b45cd1b	83	int32_t inc = 1; /* Destination address modifier */
xorjoep	1:24714b45cd1b	84
xorjoep	1:24714b45cd1b	85
xorjoep	1:24714b45cd1b	86	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	87	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	88	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	89	/* But CORR(x, y) is reverse of CORR(y, x) */
xorjoep	1:24714b45cd1b	90	/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
xorjoep	1:24714b45cd1b	91	/* and the destination pointer modifier, inc is set to -1 */
xorjoep	1:24714b45cd1b	92	/* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
xorjoep	1:24714b45cd1b	93	/* But to improve the performance,
xorjoep	1:24714b45cd1b	94	* we include zeroes in the output instead of zero padding either of the the inputs*/
xorjoep	1:24714b45cd1b	95	/* If srcALen > srcBLen,
xorjoep	1:24714b45cd1b	96	* (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
xorjoep	1:24714b45cd1b	97	/* If srcALen < srcBLen,
xorjoep	1:24714b45cd1b	98	* (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
xorjoep	1:24714b45cd1b	99	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	100	{
xorjoep	1:24714b45cd1b	101	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	102	pIn1 = (pSrcA);
xorjoep	1:24714b45cd1b	103
xorjoep	1:24714b45cd1b	104	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	105	pIn2 = (pSrcB);
xorjoep	1:24714b45cd1b	106
xorjoep	1:24714b45cd1b	107	/* Number of output samples is calculated */
xorjoep	1:24714b45cd1b	108	outBlockSize = (2U * srcALen) - 1U;
xorjoep	1:24714b45cd1b	109
xorjoep	1:24714b45cd1b	110	/* When srcALen > srcBLen, zero padding is done to srcB
xorjoep	1:24714b45cd1b	111	* to make their lengths equal.
xorjoep	1:24714b45cd1b	112	* Instead, (outBlockSize - (srcALen + srcBLen - 1))
xorjoep	1:24714b45cd1b	113	* number of output samples are made zero */
xorjoep	1:24714b45cd1b	114	j = outBlockSize - (srcALen + (srcBLen - 1U));
xorjoep	1:24714b45cd1b	115
xorjoep	1:24714b45cd1b	116	/* Updating the pointer position to non zero value */
xorjoep	1:24714b45cd1b	117	pOut += j;
xorjoep	1:24714b45cd1b	118
xorjoep	1:24714b45cd1b	119	}
xorjoep	1:24714b45cd1b	120	else
xorjoep	1:24714b45cd1b	121	{
xorjoep	1:24714b45cd1b	122	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	123	pIn1 = (pSrcB);
xorjoep	1:24714b45cd1b	124
xorjoep	1:24714b45cd1b	125	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	126	pIn2 = (pSrcA);
xorjoep	1:24714b45cd1b	127
xorjoep	1:24714b45cd1b	128	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	129	j = srcBLen;
xorjoep	1:24714b45cd1b	130	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	131	srcALen = j;
xorjoep	1:24714b45cd1b	132
xorjoep	1:24714b45cd1b	133	/* CORR(x, y) = Reverse order(CORR(y, x)) */
xorjoep	1:24714b45cd1b	134	/* Hence set the destination pointer to point to the last output sample */
xorjoep	1:24714b45cd1b	135	pOut = pDst + ((srcALen + srcBLen) - 2U);
xorjoep	1:24714b45cd1b	136
xorjoep	1:24714b45cd1b	137	/* Destination address modifier is set to -1 */
xorjoep	1:24714b45cd1b	138	inc = -1;
xorjoep	1:24714b45cd1b	139
xorjoep	1:24714b45cd1b	140	}
xorjoep	1:24714b45cd1b	141
xorjoep	1:24714b45cd1b	142	/* The function is internally
xorjoep	1:24714b45cd1b	143	* divided into three parts according to the number of multiplications that has to be
xorjoep	1:24714b45cd1b	144	* taken place between inputA samples and inputB samples. In the first part of the
xorjoep	1:24714b45cd1b	145	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	146	* In the second part of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	147	* In the third part of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	148	* for every iteration.*/
xorjoep	1:24714b45cd1b	149	/* The algorithm is implemented in three stages.
xorjoep	1:24714b45cd1b	150	* The loop counters of each stage is initiated here. */
xorjoep	1:24714b45cd1b	151	blockSize1 = srcBLen - 1U;
xorjoep	1:24714b45cd1b	152	blockSize2 = srcALen - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	153	blockSize3 = blockSize1;
xorjoep	1:24714b45cd1b	154
xorjoep	1:24714b45cd1b	155	/* --------------------------
xorjoep	1:24714b45cd1b	156	* Initializations of stage1
xorjoep	1:24714b45cd1b	157	* -------------------------*/
xorjoep	1:24714b45cd1b	158
xorjoep	1:24714b45cd1b	159	/* sum = x[0] * y[srcBlen - 1]
xorjoep	1:24714b45cd1b	160	* sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
xorjoep	1:24714b45cd1b	161	* ....
xorjoep	1:24714b45cd1b	162	* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
xorjoep	1:24714b45cd1b	163	*/
xorjoep	1:24714b45cd1b	164
xorjoep	1:24714b45cd1b	165	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	166	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	167	count = 1U;
xorjoep	1:24714b45cd1b	168
xorjoep	1:24714b45cd1b	169	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	170	px = pIn1;
xorjoep	1:24714b45cd1b	171
xorjoep	1:24714b45cd1b	172	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	173	pSrc1 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	174	py = pSrc1;
xorjoep	1:24714b45cd1b	175
xorjoep	1:24714b45cd1b	176	/* ------------------------
xorjoep	1:24714b45cd1b	177	* Stage1 process
xorjoep	1:24714b45cd1b	178	* ----------------------*/
xorjoep	1:24714b45cd1b	179
xorjoep	1:24714b45cd1b	180	/* The first loop starts here */
xorjoep	1:24714b45cd1b	181	while (blockSize1 > 0U)
xorjoep	1:24714b45cd1b	182	{
xorjoep	1:24714b45cd1b	183	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	184	sum = 0;
xorjoep	1:24714b45cd1b	185
xorjoep	1:24714b45cd1b	186	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	187	k = count >> 2;
xorjoep	1:24714b45cd1b	188
xorjoep	1:24714b45cd1b	189	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	190	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	191	while (k > 0U)
xorjoep	1:24714b45cd1b	192	{
xorjoep	1:24714b45cd1b	193	/* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	194	sum = __SMLAD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	195	/* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
xorjoep	1:24714b45cd1b	196	sum = __SMLAD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	197
xorjoep	1:24714b45cd1b	198	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	199	k--;
xorjoep	1:24714b45cd1b	200	}
xorjoep	1:24714b45cd1b	201
xorjoep	1:24714b45cd1b	202	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	203	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	204	k = count % 0x4U;
xorjoep	1:24714b45cd1b	205
xorjoep	1:24714b45cd1b	206	while (k > 0U)
xorjoep	1:24714b45cd1b	207	{
xorjoep	1:24714b45cd1b	208	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	209	/* x[0] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	210	sum = __SMLAD(px++, py++, sum);
xorjoep	1:24714b45cd1b	211
xorjoep	1:24714b45cd1b	212	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	213	k--;
xorjoep	1:24714b45cd1b	214	}
xorjoep	1:24714b45cd1b	215
xorjoep	1:24714b45cd1b	216	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	217	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	218	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	219	pOut += inc;
xorjoep	1:24714b45cd1b	220
xorjoep	1:24714b45cd1b	221	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	222	py = pSrc1 - count;
xorjoep	1:24714b45cd1b	223	px = pIn1;
xorjoep	1:24714b45cd1b	224
xorjoep	1:24714b45cd1b	225	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	226	count++;
xorjoep	1:24714b45cd1b	227
xorjoep	1:24714b45cd1b	228	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	229	blockSize1--;
xorjoep	1:24714b45cd1b	230	}
xorjoep	1:24714b45cd1b	231
xorjoep	1:24714b45cd1b	232	/* --------------------------
xorjoep	1:24714b45cd1b	233	* Initializations of stage2
xorjoep	1:24714b45cd1b	234	* ------------------------*/
xorjoep	1:24714b45cd1b	235
xorjoep	1:24714b45cd1b	236	/* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	237	* sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	238	* ....
xorjoep	1:24714b45cd1b	239	* sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	240	*/
xorjoep	1:24714b45cd1b	241
xorjoep	1:24714b45cd1b	242	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	243	px = pIn1;
xorjoep	1:24714b45cd1b	244
xorjoep	1:24714b45cd1b	245	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	246	py = pIn2;
xorjoep	1:24714b45cd1b	247
xorjoep	1:24714b45cd1b	248	/* count is index by which the pointer pIn1 to be incremented */
xorjoep	1:24714b45cd1b	249	count = 0U;
xorjoep	1:24714b45cd1b	250
xorjoep	1:24714b45cd1b	251	/* -------------------
xorjoep	1:24714b45cd1b	252	* Stage2 process
xorjoep	1:24714b45cd1b	253	* ------------------*/
xorjoep	1:24714b45cd1b	254
xorjoep	1:24714b45cd1b	255	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	256	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	257	* srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
xorjoep	1:24714b45cd1b	258	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	259	{
xorjoep	1:24714b45cd1b	260	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	261	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	262
xorjoep	1:24714b45cd1b	263	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	264	{
xorjoep	1:24714b45cd1b	265	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	266	acc0 = 0;
xorjoep	1:24714b45cd1b	267	acc1 = 0;
xorjoep	1:24714b45cd1b	268	acc2 = 0;
xorjoep	1:24714b45cd1b	269	acc3 = 0;
xorjoep	1:24714b45cd1b	270
xorjoep	1:24714b45cd1b	271	/* read x[0], x[1] samples */
xorjoep	1:24714b45cd1b	272	x0 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	273	/* read x[1], x[2] samples */
xorjoep	1:24714b45cd1b	274	x1 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	275	px += 2U;
xorjoep	1:24714b45cd1b	276
xorjoep	1:24714b45cd1b	277	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	278	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	279
xorjoep	1:24714b45cd1b	280	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	281	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	282	do
xorjoep	1:24714b45cd1b	283	{
xorjoep	1:24714b45cd1b	284	/* Read the first two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	285	* y[0] and y[1] */
xorjoep	1:24714b45cd1b	286	c0 = *__SIMD32(py)++;
xorjoep	1:24714b45cd1b	287
xorjoep	1:24714b45cd1b	288	/* acc0 += x[0] * y[0] + x[1] * y[1] */
xorjoep	1:24714b45cd1b	289	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	290
xorjoep	1:24714b45cd1b	291	/* acc1 += x[1] * y[0] + x[2] * y[1] */
xorjoep	1:24714b45cd1b	292	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	293
xorjoep	1:24714b45cd1b	294	/* Read x[2], x[3] */
xorjoep	1:24714b45cd1b	295	x2 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	296
xorjoep	1:24714b45cd1b	297	/* Read x[3], x[4] */
xorjoep	1:24714b45cd1b	298	x3 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	299
xorjoep	1:24714b45cd1b	300	/* acc2 += x[2] * y[0] + x[3] * y[1] */
xorjoep	1:24714b45cd1b	301	acc2 = __SMLAD(x2, c0, acc2);
xorjoep	1:24714b45cd1b	302
xorjoep	1:24714b45cd1b	303	/* acc3 += x[3] * y[0] + x[4] * y[1] */
xorjoep	1:24714b45cd1b	304	acc3 = __SMLAD(x3, c0, acc3);
xorjoep	1:24714b45cd1b	305
xorjoep	1:24714b45cd1b	306	/* Read y[2] and y[3] */
xorjoep	1:24714b45cd1b	307	c0 = *__SIMD32(py)++;
xorjoep	1:24714b45cd1b	308
xorjoep	1:24714b45cd1b	309	/* acc0 += x[2] * y[2] + x[3] * y[3] */
xorjoep	1:24714b45cd1b	310	acc0 = __SMLAD(x2, c0, acc0);
xorjoep	1:24714b45cd1b	311
xorjoep	1:24714b45cd1b	312	/* acc1 += x[3] * y[2] + x[4] * y[3] */
xorjoep	1:24714b45cd1b	313	acc1 = __SMLAD(x3, c0, acc1);
xorjoep	1:24714b45cd1b	314
xorjoep	1:24714b45cd1b	315	/* Read x[4], x[5] */
xorjoep	1:24714b45cd1b	316	x0 = _SIMD32_OFFSET(px + 2);
xorjoep	1:24714b45cd1b	317
xorjoep	1:24714b45cd1b	318	/* Read x[5], x[6] */
xorjoep	1:24714b45cd1b	319	x1 = _SIMD32_OFFSET(px + 3);
xorjoep	1:24714b45cd1b	320	px += 4U;
xorjoep	1:24714b45cd1b	321
xorjoep	1:24714b45cd1b	322	/* acc2 += x[4] * y[2] + x[5] * y[3] */
xorjoep	1:24714b45cd1b	323	acc2 = __SMLAD(x0, c0, acc2);
xorjoep	1:24714b45cd1b	324
xorjoep	1:24714b45cd1b	325	/* acc3 += x[5] * y[2] + x[6] * y[3] */
xorjoep	1:24714b45cd1b	326	acc3 = __SMLAD(x1, c0, acc3);
xorjoep	1:24714b45cd1b	327
xorjoep	1:24714b45cd1b	328	} while (--k);
xorjoep	1:24714b45cd1b	329
xorjoep	1:24714b45cd1b	330	/* For the next MAC operations, SIMD is not used
xorjoep	1:24714b45cd1b	331	* So, the 16 bit pointer if inputB, py is updated */
xorjoep	1:24714b45cd1b	332
xorjoep	1:24714b45cd1b	333	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	334	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	335	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	336
xorjoep	1:24714b45cd1b	337	if (k == 1U)
xorjoep	1:24714b45cd1b	338	{
xorjoep	1:24714b45cd1b	339	/* Read y[4] */
xorjoep	1:24714b45cd1b	340	c0 = *py;
xorjoep	1:24714b45cd1b	341	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	342
xorjoep	1:24714b45cd1b	343	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	344
xorjoep	1:24714b45cd1b	345	#else
xorjoep	1:24714b45cd1b	346
xorjoep	1:24714b45cd1b	347	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	348
xorjoep	1:24714b45cd1b	349	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	350
xorjoep	1:24714b45cd1b	351	/* Read x[7] */
xorjoep	1:24714b45cd1b	352	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	353	px++;
xorjoep	1:24714b45cd1b	354
xorjoep	1:24714b45cd1b	355	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	356	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	357	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	358	acc2 = __SMLADX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	359	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	360	}
xorjoep	1:24714b45cd1b	361
xorjoep	1:24714b45cd1b	362	if (k == 2U)
xorjoep	1:24714b45cd1b	363	{
xorjoep	1:24714b45cd1b	364	/* Read y[4], y[5] */
xorjoep	1:24714b45cd1b	365	c0 = *__SIMD32(py);
xorjoep	1:24714b45cd1b	366
xorjoep	1:24714b45cd1b	367	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	368	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	369
xorjoep	1:24714b45cd1b	370	/* Read x[9] */
xorjoep	1:24714b45cd1b	371	x2 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	372	px += 2U;
xorjoep	1:24714b45cd1b	373
xorjoep	1:24714b45cd1b	374	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	375	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	376	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	377	acc2 = __SMLAD(x3, c0, acc2);
xorjoep	1:24714b45cd1b	378	acc3 = __SMLAD(x2, c0, acc3);
xorjoep	1:24714b45cd1b	379	}
xorjoep	1:24714b45cd1b	380
xorjoep	1:24714b45cd1b	381	if (k == 3U)
xorjoep	1:24714b45cd1b	382	{
xorjoep	1:24714b45cd1b	383	/* Read y[4], y[5] */
xorjoep	1:24714b45cd1b	384	c0 = *__SIMD32(py)++;
xorjoep	1:24714b45cd1b	385
xorjoep	1:24714b45cd1b	386	/* Read x[7], x[8] */
xorjoep	1:24714b45cd1b	387	x3 = *__SIMD32(px);
xorjoep	1:24714b45cd1b	388
xorjoep	1:24714b45cd1b	389	/* Read x[9] */
xorjoep	1:24714b45cd1b	390	x2 = _SIMD32_OFFSET(px + 1);
xorjoep	1:24714b45cd1b	391
xorjoep	1:24714b45cd1b	392	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	393	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	394	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	395	acc2 = __SMLAD(x3, c0, acc2);
xorjoep	1:24714b45cd1b	396	acc3 = __SMLAD(x2, c0, acc3);
xorjoep	1:24714b45cd1b	397
xorjoep	1:24714b45cd1b	398	c0 = (*py);
xorjoep	1:24714b45cd1b	399	/* Read y[6] */
xorjoep	1:24714b45cd1b	400	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	401
xorjoep	1:24714b45cd1b	402	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	403	#else
xorjoep	1:24714b45cd1b	404
xorjoep	1:24714b45cd1b	405	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	406	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	407
xorjoep	1:24714b45cd1b	408	/* Read x[10] */
xorjoep	1:24714b45cd1b	409	x3 = _SIMD32_OFFSET(px + 2);
xorjoep	1:24714b45cd1b	410	px += 3U;
xorjoep	1:24714b45cd1b	411
xorjoep	1:24714b45cd1b	412	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	413	acc0 = __SMLADX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	414	acc1 = __SMLAD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	415	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	416	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	417	}
xorjoep	1:24714b45cd1b	418
xorjoep	1:24714b45cd1b	419	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	420	*pOut = (q15_t) (acc0 >> 15);
xorjoep	1:24714b45cd1b	421	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	422	pOut += inc;
xorjoep	1:24714b45cd1b	423
xorjoep	1:24714b45cd1b	424	*pOut = (q15_t) (acc1 >> 15);
xorjoep	1:24714b45cd1b	425	pOut += inc;
xorjoep	1:24714b45cd1b	426
xorjoep	1:24714b45cd1b	427	*pOut = (q15_t) (acc2 >> 15);
xorjoep	1:24714b45cd1b	428	pOut += inc;
xorjoep	1:24714b45cd1b	429
xorjoep	1:24714b45cd1b	430	*pOut = (q15_t) (acc3 >> 15);
xorjoep	1:24714b45cd1b	431	pOut += inc;
xorjoep	1:24714b45cd1b	432
xorjoep	1:24714b45cd1b	433	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	434	count += 4U;
xorjoep	1:24714b45cd1b	435
xorjoep	1:24714b45cd1b	436	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	437	px = pIn1 + count;
xorjoep	1:24714b45cd1b	438	py = pIn2;
xorjoep	1:24714b45cd1b	439
xorjoep	1:24714b45cd1b	440
xorjoep	1:24714b45cd1b	441	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	442	blkCnt--;
xorjoep	1:24714b45cd1b	443	}
xorjoep	1:24714b45cd1b	444
xorjoep	1:24714b45cd1b	445	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	446	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	447	blkCnt = blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	448
xorjoep	1:24714b45cd1b	449	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	450	{
xorjoep	1:24714b45cd1b	451	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	452	sum = 0;
xorjoep	1:24714b45cd1b	453
xorjoep	1:24714b45cd1b	454	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	455	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	456
xorjoep	1:24714b45cd1b	457	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	458	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	459	while (k > 0U)
xorjoep	1:24714b45cd1b	460	{
xorjoep	1:24714b45cd1b	461	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	462	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	463	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	464	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	465	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	466
xorjoep	1:24714b45cd1b	467	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	468	k--;
xorjoep	1:24714b45cd1b	469	}
xorjoep	1:24714b45cd1b	470
xorjoep	1:24714b45cd1b	471	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	472	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	473	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	474
xorjoep	1:24714b45cd1b	475	while (k > 0U)
xorjoep	1:24714b45cd1b	476	{
xorjoep	1:24714b45cd1b	477	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	478	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	479
xorjoep	1:24714b45cd1b	480	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	481	k--;
xorjoep	1:24714b45cd1b	482	}
xorjoep	1:24714b45cd1b	483
xorjoep	1:24714b45cd1b	484	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	485	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	486	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	487	pOut += inc;
xorjoep	1:24714b45cd1b	488
xorjoep	1:24714b45cd1b	489	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	490	count++;
xorjoep	1:24714b45cd1b	491
xorjoep	1:24714b45cd1b	492	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	493	px = pIn1 + count;
xorjoep	1:24714b45cd1b	494	py = pIn2;
xorjoep	1:24714b45cd1b	495
xorjoep	1:24714b45cd1b	496	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	497	blkCnt--;
xorjoep	1:24714b45cd1b	498	}
xorjoep	1:24714b45cd1b	499	}
xorjoep	1:24714b45cd1b	500	else
xorjoep	1:24714b45cd1b	501	{
xorjoep	1:24714b45cd1b	502	/* If the srcBLen is not a multiple of 4,
xorjoep	1:24714b45cd1b	503	* the blockSize2 loop cannot be unrolled by 4 */
xorjoep	1:24714b45cd1b	504	blkCnt = blockSize2;
xorjoep	1:24714b45cd1b	505
xorjoep	1:24714b45cd1b	506	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	507	{
xorjoep	1:24714b45cd1b	508	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	509	sum = 0;
xorjoep	1:24714b45cd1b	510
xorjoep	1:24714b45cd1b	511	/* Loop over srcBLen */
xorjoep	1:24714b45cd1b	512	k = srcBLen;
xorjoep	1:24714b45cd1b	513
xorjoep	1:24714b45cd1b	514	while (k > 0U)
xorjoep	1:24714b45cd1b	515	{
xorjoep	1:24714b45cd1b	516	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	517	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	518
xorjoep	1:24714b45cd1b	519	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	520	k--;
xorjoep	1:24714b45cd1b	521	}
xorjoep	1:24714b45cd1b	522
xorjoep	1:24714b45cd1b	523	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	524	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	525	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	526	pOut += inc;
xorjoep	1:24714b45cd1b	527
xorjoep	1:24714b45cd1b	528	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	529	count++;
xorjoep	1:24714b45cd1b	530
xorjoep	1:24714b45cd1b	531	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	532	px = pIn1 + count;
xorjoep	1:24714b45cd1b	533	py = pIn2;
xorjoep	1:24714b45cd1b	534
xorjoep	1:24714b45cd1b	535	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	536	blkCnt--;
xorjoep	1:24714b45cd1b	537	}
xorjoep	1:24714b45cd1b	538	}
xorjoep	1:24714b45cd1b	539
xorjoep	1:24714b45cd1b	540	/* --------------------------
xorjoep	1:24714b45cd1b	541	* Initializations of stage3
xorjoep	1:24714b45cd1b	542	* -------------------------*/
xorjoep	1:24714b45cd1b	543
xorjoep	1:24714b45cd1b	544	/* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	545	* sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	546	* ....
xorjoep	1:24714b45cd1b	547	* sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	548	* sum += x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	549	*/
xorjoep	1:24714b45cd1b	550
xorjoep	1:24714b45cd1b	551	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	552	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	553	count = srcBLen - 1U;
xorjoep	1:24714b45cd1b	554
xorjoep	1:24714b45cd1b	555	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	556	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	557	px = pSrc1;
xorjoep	1:24714b45cd1b	558
xorjoep	1:24714b45cd1b	559	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	560	py = pIn2;
xorjoep	1:24714b45cd1b	561
xorjoep	1:24714b45cd1b	562	/* -------------------
xorjoep	1:24714b45cd1b	563	* Stage3 process
xorjoep	1:24714b45cd1b	564	* ------------------*/
xorjoep	1:24714b45cd1b	565
xorjoep	1:24714b45cd1b	566	while (blockSize3 > 0U)
xorjoep	1:24714b45cd1b	567	{
xorjoep	1:24714b45cd1b	568	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	569	sum = 0;
xorjoep	1:24714b45cd1b	570
xorjoep	1:24714b45cd1b	571	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	572	k = count >> 2U;
xorjoep	1:24714b45cd1b	573
xorjoep	1:24714b45cd1b	574	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	575	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	576	while (k > 0U)
xorjoep	1:24714b45cd1b	577	{
xorjoep	1:24714b45cd1b	578	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	579	/* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
xorjoep	1:24714b45cd1b	580	sum = __SMLAD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	581	/* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
xorjoep	1:24714b45cd1b	582	sum = __SMLAD(__SIMD32(px)++, __SIMD32(py)++, sum);
xorjoep	1:24714b45cd1b	583
xorjoep	1:24714b45cd1b	584	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	585	k--;
xorjoep	1:24714b45cd1b	586	}
xorjoep	1:24714b45cd1b	587
xorjoep	1:24714b45cd1b	588	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	589	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	590	k = count % 0x4U;
xorjoep	1:24714b45cd1b	591
xorjoep	1:24714b45cd1b	592	while (k > 0U)
xorjoep	1:24714b45cd1b	593	{
xorjoep	1:24714b45cd1b	594	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	595	sum = __SMLAD(px++, py++, sum);
xorjoep	1:24714b45cd1b	596
xorjoep	1:24714b45cd1b	597	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	598	k--;
xorjoep	1:24714b45cd1b	599	}
xorjoep	1:24714b45cd1b	600
xorjoep	1:24714b45cd1b	601	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	602	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	603	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	604	pOut += inc;
xorjoep	1:24714b45cd1b	605
xorjoep	1:24714b45cd1b	606	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	607	px = ++pSrc1;
xorjoep	1:24714b45cd1b	608	py = pIn2;
xorjoep	1:24714b45cd1b	609
xorjoep	1:24714b45cd1b	610	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	611	count--;
xorjoep	1:24714b45cd1b	612
xorjoep	1:24714b45cd1b	613	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	614	blockSize3--;
xorjoep	1:24714b45cd1b	615	}
xorjoep	1:24714b45cd1b	616
xorjoep	1:24714b45cd1b	617	#else
xorjoep	1:24714b45cd1b	618
xorjoep	1:24714b45cd1b	619	q15_t *pIn1; /* inputA pointer */
xorjoep	1:24714b45cd1b	620	q15_t *pIn2; /* inputB pointer */
xorjoep	1:24714b45cd1b	621	q15_t *pOut = pDst; /* output pointer */
xorjoep	1:24714b45cd1b	622	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
xorjoep	1:24714b45cd1b	623	q15_t *px; /* Intermediate inputA pointer */
xorjoep	1:24714b45cd1b	624	q15_t *py; /* Intermediate inputB pointer */
xorjoep	1:24714b45cd1b	625	q15_t *pSrc1; /* Intermediate pointers */
xorjoep	1:24714b45cd1b	626	q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
xorjoep	1:24714b45cd1b	627	uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
xorjoep	1:24714b45cd1b	628	int32_t inc = 1; /* Destination address modifier */
xorjoep	1:24714b45cd1b	629	q15_t a, b;
xorjoep	1:24714b45cd1b	630
xorjoep	1:24714b45cd1b	631
xorjoep	1:24714b45cd1b	632	/* The algorithm implementation is based on the lengths of the inputs. */
xorjoep	1:24714b45cd1b	633	/* srcB is always made to slide across srcA. */
xorjoep	1:24714b45cd1b	634	/* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	635	/* But CORR(x, y) is reverse of CORR(y, x) */
xorjoep	1:24714b45cd1b	636	/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
xorjoep	1:24714b45cd1b	637	/* and the destination pointer modifier, inc is set to -1 */
xorjoep	1:24714b45cd1b	638	/* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
xorjoep	1:24714b45cd1b	639	/* But to improve the performance,
xorjoep	1:24714b45cd1b	640	* we include zeroes in the output instead of zero padding either of the inputs */
xorjoep	1:24714b45cd1b	641	/* If srcALen > srcBLen,
xorjoep	1:24714b45cd1b	642	* (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
xorjoep	1:24714b45cd1b	643	/* If srcALen < srcBLen,
xorjoep	1:24714b45cd1b	644	* (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
xorjoep	1:24714b45cd1b	645	if (srcALen >= srcBLen)
xorjoep	1:24714b45cd1b	646	{
xorjoep	1:24714b45cd1b	647	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	648	pIn1 = (pSrcA);
xorjoep	1:24714b45cd1b	649
xorjoep	1:24714b45cd1b	650	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	651	pIn2 = (pSrcB);
xorjoep	1:24714b45cd1b	652
xorjoep	1:24714b45cd1b	653	/* Number of output samples is calculated */
xorjoep	1:24714b45cd1b	654	outBlockSize = (2U * srcALen) - 1U;
xorjoep	1:24714b45cd1b	655
xorjoep	1:24714b45cd1b	656	/* When srcALen > srcBLen, zero padding is done to srcB
xorjoep	1:24714b45cd1b	657	* to make their lengths equal.
xorjoep	1:24714b45cd1b	658	* Instead, (outBlockSize - (srcALen + srcBLen - 1))
xorjoep	1:24714b45cd1b	659	* number of output samples are made zero */
xorjoep	1:24714b45cd1b	660	j = outBlockSize - (srcALen + (srcBLen - 1U));
xorjoep	1:24714b45cd1b	661
xorjoep	1:24714b45cd1b	662	/* Updating the pointer position to non zero value */
xorjoep	1:24714b45cd1b	663	pOut += j;
xorjoep	1:24714b45cd1b	664
xorjoep	1:24714b45cd1b	665	}
xorjoep	1:24714b45cd1b	666	else
xorjoep	1:24714b45cd1b	667	{
xorjoep	1:24714b45cd1b	668	/* Initialization of inputA pointer */
xorjoep	1:24714b45cd1b	669	pIn1 = (pSrcB);
xorjoep	1:24714b45cd1b	670
xorjoep	1:24714b45cd1b	671	/* Initialization of inputB pointer */
xorjoep	1:24714b45cd1b	672	pIn2 = (pSrcA);
xorjoep	1:24714b45cd1b	673
xorjoep	1:24714b45cd1b	674	/* srcBLen is always considered as shorter or equal to srcALen */
xorjoep	1:24714b45cd1b	675	j = srcBLen;
xorjoep	1:24714b45cd1b	676	srcBLen = srcALen;
xorjoep	1:24714b45cd1b	677	srcALen = j;
xorjoep	1:24714b45cd1b	678
xorjoep	1:24714b45cd1b	679	/* CORR(x, y) = Reverse order(CORR(y, x)) */
xorjoep	1:24714b45cd1b	680	/* Hence set the destination pointer to point to the last output sample */
xorjoep	1:24714b45cd1b	681	pOut = pDst + ((srcALen + srcBLen) - 2U);
xorjoep	1:24714b45cd1b	682
xorjoep	1:24714b45cd1b	683	/* Destination address modifier is set to -1 */
xorjoep	1:24714b45cd1b	684	inc = -1;
xorjoep	1:24714b45cd1b	685
xorjoep	1:24714b45cd1b	686	}
xorjoep	1:24714b45cd1b	687
xorjoep	1:24714b45cd1b	688	/* The function is internally
xorjoep	1:24714b45cd1b	689	* divided into three parts according to the number of multiplications that
xorjoep	1:24714b45cd1b	690	* take place between inputA samples and inputB samples. In the first part of the
xorjoep	1:24714b45cd1b	691	* algorithm, the multiplications increase by one for every iteration.
xorjoep	1:24714b45cd1b	692	* In the second part of the algorithm, srcBLen number of multiplications are done.
xorjoep	1:24714b45cd1b	693	* In the third part of the algorithm, the multiplications decrease by one
xorjoep	1:24714b45cd1b	694	* for every iteration. */
xorjoep	1:24714b45cd1b	695	/* The algorithm is implemented in three stages.
xorjoep	1:24714b45cd1b	696	* The loop counters of each stage are initialized here. */
xorjoep	1:24714b45cd1b	697	blockSize1 = srcBLen - 1U;
xorjoep	1:24714b45cd1b	698	blockSize2 = srcALen - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	699	blockSize3 = blockSize1;
xorjoep	1:24714b45cd1b	700
xorjoep	1:24714b45cd1b	701	/* --------------------------
xorjoep	1:24714b45cd1b	702	* Initializations of stage1
xorjoep	1:24714b45cd1b	703	* -------------------------*/
xorjoep	1:24714b45cd1b	704
xorjoep	1:24714b45cd1b	705	/* sum = x[0] * y[srcBlen - 1]
xorjoep	1:24714b45cd1b	706	* sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
xorjoep	1:24714b45cd1b	707	* ....
xorjoep	1:24714b45cd1b	708	* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
xorjoep	1:24714b45cd1b	709	*/
xorjoep	1:24714b45cd1b	710
xorjoep	1:24714b45cd1b	711	/* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep	1:24714b45cd1b	712	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	713	count = 1U;
xorjoep	1:24714b45cd1b	714
xorjoep	1:24714b45cd1b	715	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	716	px = pIn1;
xorjoep	1:24714b45cd1b	717
xorjoep	1:24714b45cd1b	718	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	719	pSrc1 = pIn2 + (srcBLen - 1U);
xorjoep	1:24714b45cd1b	720	py = pSrc1;
xorjoep	1:24714b45cd1b	721
xorjoep	1:24714b45cd1b	722	/* ------------------------
xorjoep	1:24714b45cd1b	723	* Stage1 process
xorjoep	1:24714b45cd1b	724	* ----------------------*/
xorjoep	1:24714b45cd1b	725
xorjoep	1:24714b45cd1b	726	/* The first loop starts here */
xorjoep	1:24714b45cd1b	727	while (blockSize1 > 0U)
xorjoep	1:24714b45cd1b	728	{
xorjoep	1:24714b45cd1b	729	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	730	sum = 0;
xorjoep	1:24714b45cd1b	731
xorjoep	1:24714b45cd1b	732	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	733	k = count >> 2;
xorjoep	1:24714b45cd1b	734
xorjoep	1:24714b45cd1b	735	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	736	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	737	while (k > 0U)
xorjoep	1:24714b45cd1b	738	{
xorjoep	1:24714b45cd1b	739	/* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
xorjoep	1:24714b45cd1b	740	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	741	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	742	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	743	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	744
xorjoep	1:24714b45cd1b	745	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	746	k--;
xorjoep	1:24714b45cd1b	747	}
xorjoep	1:24714b45cd1b	748
xorjoep	1:24714b45cd1b	749	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	750	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	751	k = count % 0x4U;
xorjoep	1:24714b45cd1b	752
xorjoep	1:24714b45cd1b	753	while (k > 0U)
xorjoep	1:24714b45cd1b	754	{
xorjoep	1:24714b45cd1b	755	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	756	/* x[0] * y[srcBLen - 1] */
xorjoep	1:24714b45cd1b	757	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	758
xorjoep	1:24714b45cd1b	759	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	760	k--;
xorjoep	1:24714b45cd1b	761	}
xorjoep	1:24714b45cd1b	762
xorjoep	1:24714b45cd1b	763	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	764	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	765	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	766	pOut += inc;
xorjoep	1:24714b45cd1b	767
xorjoep	1:24714b45cd1b	768	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	769	py = pSrc1 - count;
xorjoep	1:24714b45cd1b	770	px = pIn1;
xorjoep	1:24714b45cd1b	771
xorjoep	1:24714b45cd1b	772	/* Increment the MAC count */
xorjoep	1:24714b45cd1b	773	count++;
xorjoep	1:24714b45cd1b	774
xorjoep	1:24714b45cd1b	775	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	776	blockSize1--;
xorjoep	1:24714b45cd1b	777	}
xorjoep	1:24714b45cd1b	778
xorjoep	1:24714b45cd1b	779	/* --------------------------
xorjoep	1:24714b45cd1b	780	* Initializations of stage2
xorjoep	1:24714b45cd1b	781	* ------------------------*/
xorjoep	1:24714b45cd1b	782
xorjoep	1:24714b45cd1b	783	/* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	784	* sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	785	* ....
xorjoep	1:24714b45cd1b	786	* sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	787	*/
xorjoep	1:24714b45cd1b	788
xorjoep	1:24714b45cd1b	789	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	790	px = pIn1;
xorjoep	1:24714b45cd1b	791
xorjoep	1:24714b45cd1b	792	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	793	py = pIn2;
xorjoep	1:24714b45cd1b	794
xorjoep	1:24714b45cd1b	795	/* count is the index by which the pointer pIn1 is to be incremented */
xorjoep	1:24714b45cd1b	796	count = 0U;
xorjoep	1:24714b45cd1b	797
xorjoep	1:24714b45cd1b	798	/* -------------------
xorjoep	1:24714b45cd1b	799	* Stage2 process
xorjoep	1:24714b45cd1b	800	* ------------------*/
xorjoep	1:24714b45cd1b	801
xorjoep	1:24714b45cd1b	802	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep	1:24714b45cd1b	803	* So, to loop unroll over blockSize2,
xorjoep	1:24714b45cd1b	804	* srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
xorjoep	1:24714b45cd1b	805	if (srcBLen >= 4U)
xorjoep	1:24714b45cd1b	806	{
xorjoep	1:24714b45cd1b	807	/* Loop unroll over blockSize2, by 4 */
xorjoep	1:24714b45cd1b	808	blkCnt = blockSize2 >> 2U;
xorjoep	1:24714b45cd1b	809
xorjoep	1:24714b45cd1b	810	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	811	{
xorjoep	1:24714b45cd1b	812	/* Set all accumulators to zero */
xorjoep	1:24714b45cd1b	813	acc0 = 0;
xorjoep	1:24714b45cd1b	814	acc1 = 0;
xorjoep	1:24714b45cd1b	815	acc2 = 0;
xorjoep	1:24714b45cd1b	816	acc3 = 0;
xorjoep	1:24714b45cd1b	817
xorjoep	1:24714b45cd1b	818	/* read x[0], x[1], x[2] samples */
xorjoep	1:24714b45cd1b	819	a = *px;
xorjoep	1:24714b45cd1b	820	b = *(px + 1);
xorjoep	1:24714b45cd1b	821
xorjoep	1:24714b45cd1b	822	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	823
xorjoep	1:24714b45cd1b	824	x0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	825	a = *(px + 2);
xorjoep	1:24714b45cd1b	826	x1 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	827
xorjoep	1:24714b45cd1b	828	#else
xorjoep	1:24714b45cd1b	829
xorjoep	1:24714b45cd1b	830	x0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	831	a = *(px + 2);
xorjoep	1:24714b45cd1b	832	x1 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	833
xorjoep	1:24714b45cd1b	834	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	835
xorjoep	1:24714b45cd1b	836	px += 2U;
xorjoep	1:24714b45cd1b	837
xorjoep	1:24714b45cd1b	838	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	839	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	840
xorjoep	1:24714b45cd1b	841	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	842	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	843	do
xorjoep	1:24714b45cd1b	844	{
xorjoep	1:24714b45cd1b	845	/* Read the first two inputB samples using SIMD:
xorjoep	1:24714b45cd1b	846	* y[0] and y[1] */
xorjoep	1:24714b45cd1b	847	a = *py;
xorjoep	1:24714b45cd1b	848	b = *(py + 1);
xorjoep	1:24714b45cd1b	849
xorjoep	1:24714b45cd1b	850	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	851
xorjoep	1:24714b45cd1b	852	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	853
xorjoep	1:24714b45cd1b	854	#else
xorjoep	1:24714b45cd1b	855
xorjoep	1:24714b45cd1b	856	c0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	857
xorjoep	1:24714b45cd1b	858	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	859
xorjoep	1:24714b45cd1b	860	/* acc0 += x[0] * y[0] + x[1] * y[1] */
xorjoep	1:24714b45cd1b	861	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	862
xorjoep	1:24714b45cd1b	863	/* acc1 += x[1] * y[0] + x[2] * y[1] */
xorjoep	1:24714b45cd1b	864	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	865
xorjoep	1:24714b45cd1b	866	/* Read x[2], x[3], x[4] */
xorjoep	1:24714b45cd1b	867	a = *px;
xorjoep	1:24714b45cd1b	868	b = *(px + 1);
xorjoep	1:24714b45cd1b	869
xorjoep	1:24714b45cd1b	870	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	871
xorjoep	1:24714b45cd1b	872	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	873	a = *(px + 2);
xorjoep	1:24714b45cd1b	874	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	875
xorjoep	1:24714b45cd1b	876	#else
xorjoep	1:24714b45cd1b	877
xorjoep	1:24714b45cd1b	878	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	879	a = *(px + 2);
xorjoep	1:24714b45cd1b	880	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	881
xorjoep	1:24714b45cd1b	882	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	883
xorjoep	1:24714b45cd1b	884	/* acc2 += x[2] * y[0] + x[3] * y[1] */
xorjoep	1:24714b45cd1b	885	acc2 = __SMLAD(x2, c0, acc2);
xorjoep	1:24714b45cd1b	886
xorjoep	1:24714b45cd1b	887	/* acc3 += x[3] * y[0] + x[4] * y[1] */
xorjoep	1:24714b45cd1b	888	acc3 = __SMLAD(x3, c0, acc3);
xorjoep	1:24714b45cd1b	889
xorjoep	1:24714b45cd1b	890	/* Read y[2] and y[3] */
xorjoep	1:24714b45cd1b	891	a = *(py + 2);
xorjoep	1:24714b45cd1b	892	b = *(py + 3);
xorjoep	1:24714b45cd1b	893
xorjoep	1:24714b45cd1b	894	py += 4U;
xorjoep	1:24714b45cd1b	895
xorjoep	1:24714b45cd1b	896	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	897
xorjoep	1:24714b45cd1b	898	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	899
xorjoep	1:24714b45cd1b	900	#else
xorjoep	1:24714b45cd1b	901
xorjoep	1:24714b45cd1b	902	c0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	903
xorjoep	1:24714b45cd1b	904	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	905
xorjoep	1:24714b45cd1b	906	/* acc0 += x[2] * y[2] + x[3] * y[3] */
xorjoep	1:24714b45cd1b	907	acc0 = __SMLAD(x2, c0, acc0);
xorjoep	1:24714b45cd1b	908
xorjoep	1:24714b45cd1b	909	/* acc1 += x[3] * y[2] + x[4] * y[3] */
xorjoep	1:24714b45cd1b	910	acc1 = __SMLAD(x3, c0, acc1);
xorjoep	1:24714b45cd1b	911
xorjoep	1:24714b45cd1b	912	/* Read x[4], x[5], x[6] */
xorjoep	1:24714b45cd1b	913	a = *(px + 2);
xorjoep	1:24714b45cd1b	914	b = *(px + 3);
xorjoep	1:24714b45cd1b	915
xorjoep	1:24714b45cd1b	916	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	917
xorjoep	1:24714b45cd1b	918	x0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	919	a = *(px + 4);
xorjoep	1:24714b45cd1b	920	x1 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	921
xorjoep	1:24714b45cd1b	922	#else
xorjoep	1:24714b45cd1b	923
xorjoep	1:24714b45cd1b	924	x0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	925	a = *(px + 4);
xorjoep	1:24714b45cd1b	926	x1 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	927
xorjoep	1:24714b45cd1b	928	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	929
xorjoep	1:24714b45cd1b	930	px += 4U;
xorjoep	1:24714b45cd1b	931
xorjoep	1:24714b45cd1b	932	/* acc2 += x[4] * y[2] + x[5] * y[3] */
xorjoep	1:24714b45cd1b	933	acc2 = __SMLAD(x0, c0, acc2);
xorjoep	1:24714b45cd1b	934
xorjoep	1:24714b45cd1b	935	/* acc3 += x[5] * y[2] + x[6] * y[3] */
xorjoep	1:24714b45cd1b	936	acc3 = __SMLAD(x1, c0, acc3);
xorjoep	1:24714b45cd1b	937
xorjoep	1:24714b45cd1b	938	} while (--k);
xorjoep	1:24714b45cd1b	939
xorjoep	1:24714b45cd1b	940	/* For the next MAC operations, SIMD is not used
xorjoep	1:24714b45cd1b	941	* So, the 16 bit pointer of inputB, py is updated */
xorjoep	1:24714b45cd1b	942
xorjoep	1:24714b45cd1b	943	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	944	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	945	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	946
xorjoep	1:24714b45cd1b	947	if (k == 1U)
xorjoep	1:24714b45cd1b	948	{
xorjoep	1:24714b45cd1b	949	/* Read y[4] */
xorjoep	1:24714b45cd1b	950	c0 = *py;
xorjoep	1:24714b45cd1b	951	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	952
xorjoep	1:24714b45cd1b	953	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	954
xorjoep	1:24714b45cd1b	955	#else
xorjoep	1:24714b45cd1b	956
xorjoep	1:24714b45cd1b	957	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	958
xorjoep	1:24714b45cd1b	959	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	960
xorjoep	1:24714b45cd1b	961	/* Read x[7] */
xorjoep	1:24714b45cd1b	962	a = *px;
xorjoep	1:24714b45cd1b	963	b = *(px + 1);
xorjoep	1:24714b45cd1b	964
xorjoep	1:24714b45cd1b	965	px++;;
xorjoep	1:24714b45cd1b	966
xorjoep	1:24714b45cd1b	967	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	968
xorjoep	1:24714b45cd1b	969	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	970
xorjoep	1:24714b45cd1b	971	#else
xorjoep	1:24714b45cd1b	972
xorjoep	1:24714b45cd1b	973	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	974
xorjoep	1:24714b45cd1b	975	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	976
xorjoep	1:24714b45cd1b	977	px++;
xorjoep	1:24714b45cd1b	978
xorjoep	1:24714b45cd1b	979	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	980	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	981	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	982	acc2 = __SMLADX(x1, c0, acc2);
xorjoep	1:24714b45cd1b	983	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	984	}
xorjoep	1:24714b45cd1b	985
xorjoep	1:24714b45cd1b	986	if (k == 2U)
xorjoep	1:24714b45cd1b	987	{
xorjoep	1:24714b45cd1b	988	/* Read y[4], y[5] */
xorjoep	1:24714b45cd1b	989	a = *py;
xorjoep	1:24714b45cd1b	990	b = *(py + 1);
xorjoep	1:24714b45cd1b	991
xorjoep	1:24714b45cd1b	992	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	993
xorjoep	1:24714b45cd1b	994	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	995
xorjoep	1:24714b45cd1b	996	#else
xorjoep	1:24714b45cd1b	997
xorjoep	1:24714b45cd1b	998	c0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	999
xorjoep	1:24714b45cd1b	1000	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1001
xorjoep	1:24714b45cd1b	1002	/* Read x[7], x[8], x[9] */
xorjoep	1:24714b45cd1b	1003	a = *px;
xorjoep	1:24714b45cd1b	1004	b = *(px + 1);
xorjoep	1:24714b45cd1b	1005
xorjoep	1:24714b45cd1b	1006	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1007
xorjoep	1:24714b45cd1b	1008	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1009	a = *(px + 2);
xorjoep	1:24714b45cd1b	1010	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1011
xorjoep	1:24714b45cd1b	1012	#else
xorjoep	1:24714b45cd1b	1013
xorjoep	1:24714b45cd1b	1014	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1015	a = *(px + 2);
xorjoep	1:24714b45cd1b	1016	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1017
xorjoep	1:24714b45cd1b	1018	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1019
xorjoep	1:24714b45cd1b	1020	px += 2U;
xorjoep	1:24714b45cd1b	1021
xorjoep	1:24714b45cd1b	1022	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1023	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1024	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1025	acc2 = __SMLAD(x3, c0, acc2);
xorjoep	1:24714b45cd1b	1026	acc3 = __SMLAD(x2, c0, acc3);
xorjoep	1:24714b45cd1b	1027	}
xorjoep	1:24714b45cd1b	1028
xorjoep	1:24714b45cd1b	1029	if (k == 3U)
xorjoep	1:24714b45cd1b	1030	{
xorjoep	1:24714b45cd1b	1031	/* Read y[4], y[5] */
xorjoep	1:24714b45cd1b	1032	a = *py;
xorjoep	1:24714b45cd1b	1033	b = *(py + 1);
xorjoep	1:24714b45cd1b	1034
xorjoep	1:24714b45cd1b	1035	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1036
xorjoep	1:24714b45cd1b	1037	c0 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1038
xorjoep	1:24714b45cd1b	1039	#else
xorjoep	1:24714b45cd1b	1040
xorjoep	1:24714b45cd1b	1041	c0 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1042
xorjoep	1:24714b45cd1b	1043	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1044
xorjoep	1:24714b45cd1b	1045	py += 2U;
xorjoep	1:24714b45cd1b	1046
xorjoep	1:24714b45cd1b	1047	/* Read x[7], x[8], x[9] */
xorjoep	1:24714b45cd1b	1048	a = *px;
xorjoep	1:24714b45cd1b	1049	b = *(px + 1);
xorjoep	1:24714b45cd1b	1050
xorjoep	1:24714b45cd1b	1051	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1052
xorjoep	1:24714b45cd1b	1053	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1054	a = *(px + 2);
xorjoep	1:24714b45cd1b	1055	x2 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1056
xorjoep	1:24714b45cd1b	1057	#else
xorjoep	1:24714b45cd1b	1058
xorjoep	1:24714b45cd1b	1059	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1060	a = *(px + 2);
xorjoep	1:24714b45cd1b	1061	x2 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1062
xorjoep	1:24714b45cd1b	1063	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1064
xorjoep	1:24714b45cd1b	1065	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1066	acc0 = __SMLAD(x0, c0, acc0);
xorjoep	1:24714b45cd1b	1067	acc1 = __SMLAD(x1, c0, acc1);
xorjoep	1:24714b45cd1b	1068	acc2 = __SMLAD(x3, c0, acc2);
xorjoep	1:24714b45cd1b	1069	acc3 = __SMLAD(x2, c0, acc3);
xorjoep	1:24714b45cd1b	1070
xorjoep	1:24714b45cd1b	1071	c0 = (*py);
xorjoep	1:24714b45cd1b	1072	/* Read y[6] */
xorjoep	1:24714b45cd1b	1073	#ifdef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1074
xorjoep	1:24714b45cd1b	1075	c0 = c0 << 16U;
xorjoep	1:24714b45cd1b	1076	#else
xorjoep	1:24714b45cd1b	1077
xorjoep	1:24714b45cd1b	1078	c0 = c0 & 0x0000FFFF;
xorjoep	1:24714b45cd1b	1079	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1080
xorjoep	1:24714b45cd1b	1081	/* Read x[10] */
xorjoep	1:24714b45cd1b	1082	b = *(px + 3);
xorjoep	1:24714b45cd1b	1083
xorjoep	1:24714b45cd1b	1084	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	1085
xorjoep	1:24714b45cd1b	1086	x3 = __PKHBT(a, b, 16);
xorjoep	1:24714b45cd1b	1087
xorjoep	1:24714b45cd1b	1088	#else
xorjoep	1:24714b45cd1b	1089
xorjoep	1:24714b45cd1b	1090	x3 = __PKHBT(b, a, 16);
xorjoep	1:24714b45cd1b	1091
xorjoep	1:24714b45cd1b	1092	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	1093
xorjoep	1:24714b45cd1b	1094	px += 3U;
xorjoep	1:24714b45cd1b	1095
xorjoep	1:24714b45cd1b	1096	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1097	acc0 = __SMLADX(x1, c0, acc0);
xorjoep	1:24714b45cd1b	1098	acc1 = __SMLAD(x2, c0, acc1);
xorjoep	1:24714b45cd1b	1099	acc2 = __SMLADX(x2, c0, acc2);
xorjoep	1:24714b45cd1b	1100	acc3 = __SMLADX(x3, c0, acc3);
xorjoep	1:24714b45cd1b	1101	}
xorjoep	1:24714b45cd1b	1102
xorjoep	1:24714b45cd1b	1103	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1104	*pOut = (q15_t) (acc0 >> 15);
xorjoep	1:24714b45cd1b	1105	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	1106	pOut += inc;
xorjoep	1:24714b45cd1b	1107
xorjoep	1:24714b45cd1b	1108	*pOut = (q15_t) (acc1 >> 15);
xorjoep	1:24714b45cd1b	1109	pOut += inc;
xorjoep	1:24714b45cd1b	1110
xorjoep	1:24714b45cd1b	1111	*pOut = (q15_t) (acc2 >> 15);
xorjoep	1:24714b45cd1b	1112	pOut += inc;
xorjoep	1:24714b45cd1b	1113
xorjoep	1:24714b45cd1b	1114	*pOut = (q15_t) (acc3 >> 15);
xorjoep	1:24714b45cd1b	1115	pOut += inc;
xorjoep	1:24714b45cd1b	1116
xorjoep	1:24714b45cd1b	1117	/* Increment the pointer pIn1 index, count by 4 (four output samples were computed) */
xorjoep	1:24714b45cd1b	1118	count += 4U;
xorjoep	1:24714b45cd1b	1119
xorjoep	1:24714b45cd1b	1120	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1121	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1122	py = pIn2;
xorjoep	1:24714b45cd1b	1123
xorjoep	1:24714b45cd1b	1124
xorjoep	1:24714b45cd1b	1125	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1126	blkCnt--;
xorjoep	1:24714b45cd1b	1127	}
xorjoep	1:24714b45cd1b	1128
xorjoep	1:24714b45cd1b	1129	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	1130	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1131	blkCnt = blockSize2 % 0x4U;
xorjoep	1:24714b45cd1b	1132
xorjoep	1:24714b45cd1b	1133	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	1134	{
xorjoep	1:24714b45cd1b	1135	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1136	sum = 0;
xorjoep	1:24714b45cd1b	1137
xorjoep	1:24714b45cd1b	1138	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	1139	k = srcBLen >> 2U;
xorjoep	1:24714b45cd1b	1140
xorjoep	1:24714b45cd1b	1141	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	1142	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	1143	while (k > 0U)
xorjoep	1:24714b45cd1b	1144	{
xorjoep	1:24714b45cd1b	1145	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1146	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1147	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1148	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1149	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1150
xorjoep	1:24714b45cd1b	1151	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1152	k--;
xorjoep	1:24714b45cd1b	1153	}
xorjoep	1:24714b45cd1b	1154
xorjoep	1:24714b45cd1b	1155	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1156	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1157	k = srcBLen % 0x4U;
xorjoep	1:24714b45cd1b	1158
xorjoep	1:24714b45cd1b	1159	while (k > 0U)
xorjoep	1:24714b45cd1b	1160	{
xorjoep	1:24714b45cd1b	1161	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1162	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1163
xorjoep	1:24714b45cd1b	1164	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1165	k--;
xorjoep	1:24714b45cd1b	1166	}
xorjoep	1:24714b45cd1b	1167
xorjoep	1:24714b45cd1b	1168	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1169	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1170	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	1171	pOut += inc;
xorjoep	1:24714b45cd1b	1172
xorjoep	1:24714b45cd1b	1173	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	1174	count++;
xorjoep	1:24714b45cd1b	1175
xorjoep	1:24714b45cd1b	1176	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1177	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1178	py = pIn2;
xorjoep	1:24714b45cd1b	1179
xorjoep	1:24714b45cd1b	1180	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1181	blkCnt--;
xorjoep	1:24714b45cd1b	1182	}
xorjoep	1:24714b45cd1b	1183	}
xorjoep	1:24714b45cd1b	1184	else
xorjoep	1:24714b45cd1b	1185	{
xorjoep	1:24714b45cd1b	1186	/* If the srcBLen is not a multiple of 4,
xorjoep	1:24714b45cd1b	1187	* the blockSize2 loop cannot be unrolled by 4 */
xorjoep	1:24714b45cd1b	1188	blkCnt = blockSize2;
xorjoep	1:24714b45cd1b	1189
xorjoep	1:24714b45cd1b	1190	while (blkCnt > 0U)
xorjoep	1:24714b45cd1b	1191	{
xorjoep	1:24714b45cd1b	1192	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1193	sum = 0;
xorjoep	1:24714b45cd1b	1194
xorjoep	1:24714b45cd1b	1195	/* Loop over srcBLen */
xorjoep	1:24714b45cd1b	1196	k = srcBLen;
xorjoep	1:24714b45cd1b	1197
xorjoep	1:24714b45cd1b	1198	while (k > 0U)
xorjoep	1:24714b45cd1b	1199	{
xorjoep	1:24714b45cd1b	1200	/* Perform the multiply-accumulate */
xorjoep	1:24714b45cd1b	1201	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1202
xorjoep	1:24714b45cd1b	1203	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1204	k--;
xorjoep	1:24714b45cd1b	1205	}
xorjoep	1:24714b45cd1b	1206
xorjoep	1:24714b45cd1b	1207	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1208	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1209	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	1210	pOut += inc;
xorjoep	1:24714b45cd1b	1211
xorjoep	1:24714b45cd1b	1212	/* Increment the pointer pIn1 index, count by 1 */
xorjoep	1:24714b45cd1b	1213	count++;
xorjoep	1:24714b45cd1b	1214
xorjoep	1:24714b45cd1b	1215	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1216	px = pIn1 + count;
xorjoep	1:24714b45cd1b	1217	py = pIn2;
xorjoep	1:24714b45cd1b	1218
xorjoep	1:24714b45cd1b	1219	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1220	blkCnt--;
xorjoep	1:24714b45cd1b	1221	}
xorjoep	1:24714b45cd1b	1222	}
xorjoep	1:24714b45cd1b	1223
xorjoep	1:24714b45cd1b	1224	/* --------------------------
xorjoep	1:24714b45cd1b	1225	* Initializations of stage3
xorjoep	1:24714b45cd1b	1226	* -------------------------*/
xorjoep	1:24714b45cd1b	1227
xorjoep	1:24714b45cd1b	1228	/* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	1229	* sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
xorjoep	1:24714b45cd1b	1230	* ....
xorjoep	1:24714b45cd1b	1231	* sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
xorjoep	1:24714b45cd1b	1232	* sum += x[srcALen-1] * y[0]
xorjoep	1:24714b45cd1b	1233	*/
xorjoep	1:24714b45cd1b	1234
xorjoep	1:24714b45cd1b	1235	/* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep	1:24714b45cd1b	1236	The count variable holds the number of MAC operations performed */
xorjoep	1:24714b45cd1b	1237	count = srcBLen - 1U;
xorjoep	1:24714b45cd1b	1238
xorjoep	1:24714b45cd1b	1239	/* Working pointer of inputA */
xorjoep	1:24714b45cd1b	1240	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep	1:24714b45cd1b	1241	px = pSrc1;
xorjoep	1:24714b45cd1b	1242
xorjoep	1:24714b45cd1b	1243	/* Working pointer of inputB */
xorjoep	1:24714b45cd1b	1244	py = pIn2;
xorjoep	1:24714b45cd1b	1245
xorjoep	1:24714b45cd1b	1246	/* -------------------
xorjoep	1:24714b45cd1b	1247	* Stage3 process
xorjoep	1:24714b45cd1b	1248	* ------------------*/
xorjoep	1:24714b45cd1b	1249
xorjoep	1:24714b45cd1b	1250	while (blockSize3 > 0U)
xorjoep	1:24714b45cd1b	1251	{
xorjoep	1:24714b45cd1b	1252	/* Accumulator is made zero for every iteration */
xorjoep	1:24714b45cd1b	1253	sum = 0;
xorjoep	1:24714b45cd1b	1254
xorjoep	1:24714b45cd1b	1255	/* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep	1:24714b45cd1b	1256	k = count >> 2U;
xorjoep	1:24714b45cd1b	1257
xorjoep	1:24714b45cd1b	1258	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep	1:24714b45cd1b	1259	** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	1260	while (k > 0U)
xorjoep	1:24714b45cd1b	1261	{
xorjoep	1:24714b45cd1b	1262	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1263	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1264	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1265	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1266	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1267
xorjoep	1:24714b45cd1b	1268	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1269	k--;
xorjoep	1:24714b45cd1b	1270	}
xorjoep	1:24714b45cd1b	1271
xorjoep	1:24714b45cd1b	1272	/* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep	1:24714b45cd1b	1273	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	1274	k = count % 0x4U;
xorjoep	1:24714b45cd1b	1275
xorjoep	1:24714b45cd1b	1276	while (k > 0U)
xorjoep	1:24714b45cd1b	1277	{
xorjoep	1:24714b45cd1b	1278	/* Perform the multiply-accumulates */
xorjoep	1:24714b45cd1b	1279	sum += ((q31_t) * px++ * *py++);
xorjoep	1:24714b45cd1b	1280
xorjoep	1:24714b45cd1b	1281	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1282	k--;
xorjoep	1:24714b45cd1b	1283	}
xorjoep	1:24714b45cd1b	1284
xorjoep	1:24714b45cd1b	1285	/* Store the result in the accumulator in the destination buffer. */
xorjoep	1:24714b45cd1b	1286	*pOut = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	1287	/* Destination pointer is updated according to the address modifier, inc */
xorjoep	1:24714b45cd1b	1288	pOut += inc;
xorjoep	1:24714b45cd1b	1289
xorjoep	1:24714b45cd1b	1290	/* Update the inputA and inputB pointers for next MAC calculation */
xorjoep	1:24714b45cd1b	1291	px = ++pSrc1;
xorjoep	1:24714b45cd1b	1292	py = pIn2;
xorjoep	1:24714b45cd1b	1293
xorjoep	1:24714b45cd1b	1294	/* Decrement the MAC count */
xorjoep	1:24714b45cd1b	1295	count--;
xorjoep	1:24714b45cd1b	1296
xorjoep	1:24714b45cd1b	1297	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	1298	blockSize3--;
xorjoep	1:24714b45cd1b	1299	}
xorjoep	1:24714b45cd1b	1300
xorjoep	1:24714b45cd1b	1301	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
xorjoep	1:24714b45cd1b	1302
xorjoep	1:24714b45cd1b	1303	}
xorjoep	1:24714b45cd1b	1304
xorjoep	1:24714b45cd1b	1305	/**
xorjoep	1:24714b45cd1b	1306	* @} end of Corr group
xorjoep	1:24714b45cd1b	1307	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/FilteringFunctions/arm_correlate_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning