CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q7.c 00009 * 00010 * Description: Partial convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * \par 00064 * Refer the function <code>arm_conv_partial_opt_q7()</code> for a faster implementation of this function. 00065 * 00066 */ 00067 00068 arm_status arm_conv_partial_q7( 00069 q7_t * pSrcA, 00070 uint32_t srcALen, 00071 q7_t * pSrcB, 00072 uint32_t srcBLen, 00073 q7_t * pDst, 00074 uint32_t firstIndex, 00075 uint32_t numPoints) 00076 { 00077 00078 00079 #ifndef ARM_MATH_CM0_FAMILY 00080 00081 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00082 00083 q7_t *pIn1; /* inputA pointer */ 00084 q7_t *pIn2; /* inputB pointer */ 00085 q7_t *pOut = pDst; /* output pointer */ 00086 q7_t *px; /* Intermediate inputA pointer */ 00087 q7_t *py; /* Intermediate inputB pointer */ 00088 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00089 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00090 q31_t input1, input2; 00091 q15_t in1, in2; 00092 q7_t x0, x1, x2, x3, c0, c1; 00093 uint32_t j, k, count, check, blkCnt; 00094 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00095 arm_status status; 00096 00097 00098 /* Check for range of output samples to be calculated */ 00099 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00100 { 00101 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00102 status = ARM_MATH_ARGUMENT_ERROR; 00103 } 00104 else 00105 { 00106 00107 /* The algorithm implementation is based on the lengths of the inputs. */ 00108 /* srcB is always made to slide across srcA. */ 00109 /* So srcBLen is always considered as shorter or equal to srcALen */ 00110 if(srcALen >= srcBLen) 00111 { 00112 /* Initialization of inputA pointer */ 00113 pIn1 = pSrcA; 00114 00115 /* Initialization of inputB pointer */ 00116 pIn2 = pSrcB; 00117 } 00118 else 00119 { 00120 /* Initialization of inputA pointer */ 00121 pIn1 = pSrcB; 00122 00123 /* Initialization of inputB pointer */ 00124 pIn2 = pSrcA; 00125 00126 /* srcBLen is always considered as shorter or equal to srcALen */ 00127 j = srcBLen; 00128 srcBLen = srcALen; 00129 srcALen = j; 00130 } 00131 00132 /* Conditions to check which loopCounter holds 00133 * the first and last indices of the output samples to be calculated. */ 00134 check = firstIndex + numPoints; 00135 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00136 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00137 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00138 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00139 (int32_t) numPoints) : 0; 00140 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00141 (int32_t) firstIndex); 00142 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00143 00144 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00145 /* The function is internally 00146 * divided into three stages according to the number of multiplications that has to be 00147 * taken place between inputA samples and inputB samples. In the first stage of the 00148 * algorithm, the multiplications increase by one for every iteration. 00149 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00150 * In the third stage of the algorithm, the multiplications decrease by one 00151 * for every iteration. */ 00152 00153 /* Set the output pointer to point to the firstIndex 00154 * of the output sample to be calculated. */ 00155 pOut = pDst + firstIndex; 00156 00157 /* -------------------------- 00158 * Initializations of stage1 00159 * -------------------------*/ 00160 00161 /* sum = x[0] * y[0] 00162 * sum = x[0] * y[1] + x[1] * y[0] 00163 * .... 00164 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00165 */ 00166 00167 /* In this stage the MAC operations are increased by 1 for every iteration. 00168 The count variable holds the number of MAC operations performed. 00169 Since the partial convolution starts from from firstIndex 00170 Number of Macs to be performed is firstIndex + 1 */ 00171 count = 1u + firstIndex; 00172 00173 /* Working pointer of inputA */ 00174 px = pIn1; 00175 00176 /* Working pointer of inputB */ 00177 pSrc2 = pIn2 + firstIndex; 00178 py = pSrc2; 00179 00180 /* ------------------------ 00181 * Stage1 process 00182 * ----------------------*/ 00183 00184 /* The first stage starts here */ 00185 while(blockSize1 > 0) 00186 { 00187 /* Accumulator is made zero for every iteration */ 00188 sum = 0; 00189 00190 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00191 k = count >> 2u; 00192 00193 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00194 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00195 while(k > 0u) 00196 { 00197 /* x[0] , x[1] */ 00198 in1 = (q15_t) * px++; 00199 in2 = (q15_t) * px++; 00200 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00201 00202 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00203 in1 = (q15_t) * py--; 00204 in2 = (q15_t) * py--; 00205 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00206 00207 /* x[0] * y[srcBLen - 1] */ 00208 /* x[1] * y[srcBLen - 2] */ 00209 sum = __SMLAD(input1, input2, sum); 00210 00211 /* x[2] , x[3] */ 00212 in1 = (q15_t) * px++; 00213 in2 = (q15_t) * px++; 00214 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00215 00216 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00217 in1 = (q15_t) * py--; 00218 in2 = (q15_t) * py--; 00219 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00220 00221 /* x[2] * y[srcBLen - 3] */ 00222 /* x[3] * y[srcBLen - 4] */ 00223 sum = __SMLAD(input1, input2, sum); 00224 00225 /* Decrement the loop counter */ 00226 k--; 00227 } 00228 00229 /* If the count is not a multiple of 4, compute any remaining MACs here. 00230 ** No loop unrolling is used. */ 00231 k = count % 0x4u; 00232 00233 while(k > 0u) 00234 { 00235 /* Perform the multiply-accumulates */ 00236 sum += ((q31_t) * px++ * *py--); 00237 00238 /* Decrement the loop counter */ 00239 k--; 00240 } 00241 00242 /* Store the result in the accumulator in the destination buffer. */ 00243 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00244 00245 /* Update the inputA and inputB pointers for next MAC calculation */ 00246 py = ++pSrc2; 00247 px = pIn1; 00248 00249 /* Increment the MAC count */ 00250 count++; 00251 00252 /* Decrement the loop counter */ 00253 blockSize1--; 00254 } 00255 00256 /* -------------------------- 00257 * Initializations of stage2 00258 * ------------------------*/ 00259 00260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00262 * .... 00263 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00264 */ 00265 00266 /* Working pointer of inputA */ 00267 px = pIn1; 00268 00269 /* Working pointer of inputB */ 00270 pSrc2 = pIn2 + (srcBLen - 1u); 00271 py = pSrc2; 00272 00273 /* count is index by which the pointer pIn1 to be incremented */ 00274 count = 0u; 00275 00276 /* ------------------- 00277 * Stage2 process 00278 * ------------------*/ 00279 00280 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00281 * So, to loop unroll over blockSize2, 00282 * srcBLen should be greater than or equal to 4 */ 00283 if(srcBLen >= 4u) 00284 { 00285 /* Loop unroll over blockSize2, by 4 */ 00286 blkCnt = ((uint32_t) blockSize2 >> 2u); 00287 00288 while(blkCnt > 0u) 00289 { 00290 /* Set all accumulators to zero */ 00291 acc0 = 0; 00292 acc1 = 0; 00293 acc2 = 0; 00294 acc3 = 0; 00295 00296 /* read x[0], x[1], x[2] samples */ 00297 x0 = *(px++); 00298 x1 = *(px++); 00299 x2 = *(px++); 00300 00301 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00302 k = srcBLen >> 2u; 00303 00304 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00305 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00306 do 00307 { 00308 /* Read y[srcBLen - 1] sample */ 00309 c0 = *(py--); 00310 /* Read y[srcBLen - 2] sample */ 00311 c1 = *(py--); 00312 00313 /* Read x[3] sample */ 00314 x3 = *(px++); 00315 00316 /* x[0] and x[1] are packed */ 00317 in1 = (q15_t) x0; 00318 in2 = (q15_t) x1; 00319 00320 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00321 00322 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00323 in1 = (q15_t) c0; 00324 in2 = (q15_t) c1; 00325 00326 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00327 00328 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00329 acc0 = __SMLAD(input1, input2, acc0); 00330 00331 /* x[1] and x[2] are packed */ 00332 in1 = (q15_t) x1; 00333 in2 = (q15_t) x2; 00334 00335 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00336 00337 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00338 acc1 = __SMLAD(input1, input2, acc1); 00339 00340 /* x[2] and x[3] are packed */ 00341 in1 = (q15_t) x2; 00342 in2 = (q15_t) x3; 00343 00344 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00345 00346 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00347 acc2 = __SMLAD(input1, input2, acc2); 00348 00349 /* Read x[4] sample */ 00350 x0 = *(px++); 00351 00352 /* x[3] and x[4] are packed */ 00353 in1 = (q15_t) x3; 00354 in2 = (q15_t) x0; 00355 00356 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00357 00358 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00359 acc3 = __SMLAD(input1, input2, acc3); 00360 00361 /* Read y[srcBLen - 3] sample */ 00362 c0 = *(py--); 00363 /* Read y[srcBLen - 4] sample */ 00364 c1 = *(py--); 00365 00366 /* Read x[5] sample */ 00367 x1 = *(px++); 00368 00369 /* x[2] and x[3] are packed */ 00370 in1 = (q15_t) x2; 00371 in2 = (q15_t) x3; 00372 00373 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00374 00375 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00376 in1 = (q15_t) c0; 00377 in2 = (q15_t) c1; 00378 00379 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00380 00381 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00382 acc0 = __SMLAD(input1, input2, acc0); 00383 00384 /* x[3] and x[4] are packed */ 00385 in1 = (q15_t) x3; 00386 in2 = (q15_t) x0; 00387 00388 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00389 00390 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00391 acc1 = __SMLAD(input1, input2, acc1); 00392 00393 /* x[4] and x[5] are packed */ 00394 in1 = (q15_t) x0; 00395 in2 = (q15_t) x1; 00396 00397 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00398 00399 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00400 acc2 = __SMLAD(input1, input2, acc2); 00401 00402 /* Read x[6] sample */ 00403 x2 = *(px++); 00404 00405 /* x[5] and x[6] are packed */ 00406 in1 = (q15_t) x1; 00407 in2 = (q15_t) x2; 00408 00409 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00410 00411 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00412 acc3 = __SMLAD(input1, input2, acc3); 00413 00414 } while(--k); 00415 00416 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00417 ** No loop unrolling is used. */ 00418 k = srcBLen % 0x4u; 00419 00420 while(k > 0u) 00421 { 00422 /* Read y[srcBLen - 5] sample */ 00423 c0 = *(py--); 00424 00425 /* Read x[7] sample */ 00426 x3 = *(px++); 00427 00428 /* Perform the multiply-accumulates */ 00429 /* acc0 += x[4] * y[srcBLen - 5] */ 00430 acc0 += ((q31_t) x0 * c0); 00431 /* acc1 += x[5] * y[srcBLen - 5] */ 00432 acc1 += ((q31_t) x1 * c0); 00433 /* acc2 += x[6] * y[srcBLen - 5] */ 00434 acc2 += ((q31_t) x2 * c0); 00435 /* acc3 += x[7] * y[srcBLen - 5] */ 00436 acc3 += ((q31_t) x3 * c0); 00437 00438 /* Reuse the present samples for the next MAC */ 00439 x0 = x1; 00440 x1 = x2; 00441 x2 = x3; 00442 00443 /* Decrement the loop counter */ 00444 k--; 00445 } 00446 00447 /* Store the result in the accumulator in the destination buffer. */ 00448 *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); 00449 *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); 00450 *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); 00451 *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); 00452 00453 /* Increment the pointer pIn1 index, count by 4 */ 00454 count += 4u; 00455 00456 /* Update the inputA and inputB pointers for next MAC calculation */ 00457 px = pIn1 + count; 00458 py = pSrc2; 00459 00460 00461 /* Decrement the loop counter */ 00462 blkCnt--; 00463 } 00464 00465 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00466 ** No loop unrolling is used. */ 00467 blkCnt = (uint32_t) blockSize2 % 0x4u; 00468 00469 while(blkCnt > 0u) 00470 { 00471 /* Accumulator is made zero for every iteration */ 00472 sum = 0; 00473 00474 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00475 k = srcBLen >> 2u; 00476 00477 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00478 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00479 while(k > 0u) 00480 { 00481 00482 /* Reading two inputs of SrcA buffer and packing */ 00483 in1 = (q15_t) * px++; 00484 in2 = (q15_t) * px++; 00485 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00486 00487 /* Reading two inputs of SrcB buffer and packing */ 00488 in1 = (q15_t) * py--; 00489 in2 = (q15_t) * py--; 00490 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00491 00492 /* Perform the multiply-accumulates */ 00493 sum = __SMLAD(input1, input2, sum); 00494 00495 /* Reading two inputs of SrcA buffer and packing */ 00496 in1 = (q15_t) * px++; 00497 in2 = (q15_t) * px++; 00498 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00499 00500 /* Reading two inputs of SrcB buffer and packing */ 00501 in1 = (q15_t) * py--; 00502 in2 = (q15_t) * py--; 00503 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00504 00505 /* Perform the multiply-accumulates */ 00506 sum = __SMLAD(input1, input2, sum); 00507 00508 /* Decrement the loop counter */ 00509 k--; 00510 } 00511 00512 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00513 ** No loop unrolling is used. */ 00514 k = srcBLen % 0x4u; 00515 00516 while(k > 0u) 00517 { 00518 /* Perform the multiply-accumulates */ 00519 sum += ((q31_t) * px++ * *py--); 00520 00521 /* Decrement the loop counter */ 00522 k--; 00523 } 00524 00525 /* Store the result in the accumulator in the destination buffer. */ 00526 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00527 00528 /* Increment the pointer pIn1 index, count by 1 */ 00529 count++; 00530 00531 /* Update the inputA and inputB pointers for next MAC calculation */ 00532 px = pIn1 + count; 00533 py = pSrc2; 00534 00535 /* Decrement the loop counter */ 00536 blkCnt--; 00537 } 00538 } 00539 else 00540 { 00541 /* If the srcBLen is not a multiple of 4, 00542 * the blockSize2 loop cannot be unrolled by 4 */ 00543 blkCnt = (uint32_t) blockSize2; 00544 00545 while(blkCnt > 0u) 00546 { 00547 /* Accumulator is made zero for every iteration */ 00548 sum = 0; 00549 00550 /* srcBLen number of MACS should be performed */ 00551 k = srcBLen; 00552 00553 while(k > 0u) 00554 { 00555 /* Perform the multiply-accumulate */ 00556 sum += ((q31_t) * px++ * *py--); 00557 00558 /* Decrement the loop counter */ 00559 k--; 00560 } 00561 00562 /* Store the result in the accumulator in the destination buffer. */ 00563 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00564 00565 /* Increment the MAC count */ 00566 count++; 00567 00568 /* Update the inputA and inputB pointers for next MAC calculation */ 00569 px = pIn1 + count; 00570 py = pSrc2; 00571 00572 /* Decrement the loop counter */ 00573 blkCnt--; 00574 } 00575 } 00576 00577 00578 /* -------------------------- 00579 * Initializations of stage3 00580 * -------------------------*/ 00581 00582 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00583 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00584 * .... 00585 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00586 * sum += x[srcALen-1] * y[srcBLen-1] 00587 */ 00588 00589 /* In this stage the MAC operations are decreased by 1 for every iteration. 00590 The count variable holds the number of MAC operations performed */ 00591 count = srcBLen - 1u; 00592 00593 /* Working pointer of inputA */ 00594 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00595 px = pSrc1; 00596 00597 /* Working pointer of inputB */ 00598 pSrc2 = pIn2 + (srcBLen - 1u); 00599 py = pSrc2; 00600 00601 /* ------------------- 00602 * Stage3 process 00603 * ------------------*/ 00604 00605 while(blockSize3 > 0) 00606 { 00607 /* Accumulator is made zero for every iteration */ 00608 sum = 0; 00609 00610 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00611 k = count >> 2u; 00612 00613 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00614 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00615 while(k > 0u) 00616 { 00617 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00618 in1 = (q15_t) * px++; 00619 in2 = (q15_t) * px++; 00620 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00621 00622 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00623 in1 = (q15_t) * py--; 00624 in2 = (q15_t) * py--; 00625 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00626 00627 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00628 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00629 sum = __SMLAD(input1, input2, sum); 00630 00631 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00632 in1 = (q15_t) * px++; 00633 in2 = (q15_t) * px++; 00634 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00635 00636 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00637 in1 = (q15_t) * py--; 00638 in2 = (q15_t) * py--; 00639 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00640 00641 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00642 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00643 sum = __SMLAD(input1, input2, sum); 00644 00645 /* Decrement the loop counter */ 00646 k--; 00647 } 00648 00649 /* If the count is not a multiple of 4, compute any remaining MACs here. 00650 ** No loop unrolling is used. */ 00651 k = count % 0x4u; 00652 00653 while(k > 0u) 00654 { 00655 /* Perform the multiply-accumulates */ 00656 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00657 sum += ((q31_t) * px++ * *py--); 00658 00659 /* Decrement the loop counter */ 00660 k--; 00661 } 00662 00663 /* Store the result in the accumulator in the destination buffer. */ 00664 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00665 00666 /* Update the inputA and inputB pointers for next MAC calculation */ 00667 px = ++pSrc1; 00668 py = pSrc2; 00669 00670 /* Decrement the MAC count */ 00671 count--; 00672 00673 /* Decrement the loop counter */ 00674 blockSize3--; 00675 00676 } 00677 00678 /* set status as ARM_MATH_SUCCESS */ 00679 status = ARM_MATH_SUCCESS; 00680 } 00681 00682 /* Return to application */ 00683 return (status); 00684 00685 #else 00686 00687 /* Run the below code for Cortex-M0 */ 00688 00689 q7_t *pIn1 = pSrcA; /* inputA pointer */ 00690 q7_t *pIn2 = pSrcB; /* inputB pointer */ 00691 q31_t sum; /* Accumulator */ 00692 uint32_t i, j; /* loop counters */ 00693 arm_status status; /* status of Partial convolution */ 00694 00695 /* Check for range of output samples to be calculated */ 00696 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00697 { 00698 /* Set status as ARM_ARGUMENT_ERROR */ 00699 status = ARM_MATH_ARGUMENT_ERROR; 00700 } 00701 else 00702 { 00703 /* Loop to calculate convolution for output length number of values */ 00704 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00705 { 00706 /* Initialize sum with zero to carry on MAC operations */ 00707 sum = 0; 00708 00709 /* Loop to perform MAC operations according to convolution equation */ 00710 for (j = 0; j <= i; j++) 00711 { 00712 /* Check the array limitations */ 00713 if(((i - j) < srcBLen) && (j < srcALen)) 00714 { 00715 /* z[i] += x[i-j] * y[j] */ 00716 sum += ((q15_t) pIn1[j] * (pIn2[i - j])); 00717 } 00718 } 00719 00720 /* Store the output in the destination buffer */ 00721 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); 00722 } 00723 /* set status as ARM_SUCCESS as there are no argument errors */ 00724 status = ARM_MATH_SUCCESS; 00725 } 00726 return (status); 00727 00728 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00729 00730 } 00731 00732 /** 00733 * @} end of PartialConv group 00734 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2