CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_opt_q7.c 00009 * 00010 * Description: Convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00060 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00061 * @return none. 00062 * 00063 * \par Restrictions 00064 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00065 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00066 * 00067 * @details 00068 * <b>Scaling and Overflow Behavior:</b> 00069 * 00070 * \par 00071 * The function is implemented using a 32-bit internal accumulator. 00072 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00073 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00074 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00075 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 00076 * 00077 */ 00078 00079 void arm_conv_opt_q7( 00080 q7_t * pSrcA, 00081 uint32_t srcALen, 00082 q7_t * pSrcB, 00083 uint32_t srcBLen, 00084 q7_t * pDst, 00085 q15_t * pScratch1, 00086 q15_t * pScratch2) 00087 { 00088 00089 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00090 q15_t x4; /* Temporary input variable */ 00091 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00092 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00093 q7_t *px; /* Temporary input1 pointer */ 00094 q15_t *py; /* Temporary input2 pointer */ 00095 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00096 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00097 q7_t *pOut = pDst; /* output pointer */ 00098 q7_t out0, out1, out2, out3; /* temporary variables */ 00099 00100 /* The algorithm implementation is based on the lengths of the inputs. */ 00101 /* srcB is always made to slide across srcA. */ 00102 /* So srcBLen is always considered as shorter or equal to srcALen */ 00103 if(srcALen >= srcBLen) 00104 { 00105 /* Initialization of inputA pointer */ 00106 pIn1 = pSrcA; 00107 00108 /* Initialization of inputB pointer */ 00109 pIn2 = pSrcB; 00110 } 00111 else 00112 { 00113 /* Initialization of inputA pointer */ 00114 pIn1 = pSrcB; 00115 00116 /* Initialization of inputB pointer */ 00117 pIn2 = pSrcA; 00118 00119 /* srcBLen is always considered as shorter or equal to srcALen */ 00120 j = srcBLen; 00121 srcBLen = srcALen; 00122 srcALen = j; 00123 } 00124 00125 /* pointer to take end of scratch2 buffer */ 00126 pScr2 = pScratch2; 00127 00128 /* points to smaller length sequence */ 00129 px = pIn2 + srcBLen - 1; 00130 00131 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00132 k = srcBLen >> 2u; 00133 00134 /* First part of the processing with loop unrolling copies 4 data points at a time. 00135 ** a second loop below copies for the remaining 1 to 3 samples. */ 00136 while(k > 0u) 00137 { 00138 /* copy second buffer in reversal manner */ 00139 x4 = (q15_t) * px--; 00140 *pScr2++ = x4; 00141 x4 = (q15_t) * px--; 00142 *pScr2++ = x4; 00143 x4 = (q15_t) * px--; 00144 *pScr2++ = x4; 00145 x4 = (q15_t) * px--; 00146 *pScr2++ = x4; 00147 00148 /* Decrement the loop counter */ 00149 k--; 00150 } 00151 00152 /* If the count is not a multiple of 4, copy remaining samples here. 00153 ** No loop unrolling is used. */ 00154 k = srcBLen % 0x4u; 00155 00156 while(k > 0u) 00157 { 00158 /* copy second buffer in reversal manner for remaining samples */ 00159 x4 = (q15_t) * px--; 00160 *pScr2++ = x4; 00161 00162 /* Decrement the loop counter */ 00163 k--; 00164 } 00165 00166 /* Initialze temporary scratch pointer */ 00167 pScr1 = pScratch1; 00168 00169 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00170 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00171 00172 /* Update temporary scratch pointer */ 00173 pScr1 += (srcBLen - 1u); 00174 00175 /* Copy (srcALen) samples in scratch buffer */ 00176 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00177 k = srcALen >> 2u; 00178 00179 /* First part of the processing with loop unrolling copies 4 data points at a time. 00180 ** a second loop below copies for the remaining 1 to 3 samples. */ 00181 while(k > 0u) 00182 { 00183 /* copy second buffer in reversal manner */ 00184 x4 = (q15_t) * pIn1++; 00185 *pScr1++ = x4; 00186 x4 = (q15_t) * pIn1++; 00187 *pScr1++ = x4; 00188 x4 = (q15_t) * pIn1++; 00189 *pScr1++ = x4; 00190 x4 = (q15_t) * pIn1++; 00191 *pScr1++ = x4; 00192 00193 /* Decrement the loop counter */ 00194 k--; 00195 } 00196 00197 /* If the count is not a multiple of 4, copy remaining samples here. 00198 ** No loop unrolling is used. */ 00199 k = srcALen % 0x4u; 00200 00201 while(k > 0u) 00202 { 00203 /* copy second buffer in reversal manner for remaining samples */ 00204 x4 = (q15_t) * pIn1++; 00205 *pScr1++ = x4; 00206 00207 /* Decrement the loop counter */ 00208 k--; 00209 } 00210 00211 #ifndef UNALIGNED_SUPPORT_DISABLE 00212 00213 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00214 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00215 00216 /* Update pointer */ 00217 pScr1 += (srcBLen - 1u); 00218 00219 #else 00220 00221 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00222 k = (srcBLen - 1u) >> 2u; 00223 00224 /* First part of the processing with loop unrolling copies 4 data points at a time. 00225 ** a second loop below copies for the remaining 1 to 3 samples. */ 00226 while(k > 0u) 00227 { 00228 /* copy second buffer in reversal manner */ 00229 *pScr1++ = 0; 00230 *pScr1++ = 0; 00231 *pScr1++ = 0; 00232 *pScr1++ = 0; 00233 00234 /* Decrement the loop counter */ 00235 k--; 00236 } 00237 00238 /* If the count is not a multiple of 4, copy remaining samples here. 00239 ** No loop unrolling is used. */ 00240 k = (srcBLen - 1u) % 0x4u; 00241 00242 while(k > 0u) 00243 { 00244 /* copy second buffer in reversal manner for remaining samples */ 00245 *pScr1++ = 0; 00246 00247 /* Decrement the loop counter */ 00248 k--; 00249 } 00250 00251 #endif 00252 00253 /* Temporary pointer for scratch2 */ 00254 py = pScratch2; 00255 00256 /* Initialization of pIn2 pointer */ 00257 pIn2 = (q7_t *) py; 00258 00259 pScr2 = py; 00260 00261 /* Actual convolution process starts here */ 00262 blkCnt = (srcALen + srcBLen - 1u) >> 2; 00263 00264 while(blkCnt > 0) 00265 { 00266 /* Initialze temporary scratch pointer as scratch1 */ 00267 pScr1 = pScratch1; 00268 00269 /* Clear Accumlators */ 00270 acc0 = 0; 00271 acc1 = 0; 00272 acc2 = 0; 00273 acc3 = 0; 00274 00275 /* Read two samples from scratch1 buffer */ 00276 x1 = *__SIMD32(pScr1)++; 00277 00278 /* Read next two samples from scratch1 buffer */ 00279 x2 = *__SIMD32(pScr1)++; 00280 00281 tapCnt = (srcBLen) >> 2u; 00282 00283 while(tapCnt > 0u) 00284 { 00285 00286 /* Read four samples from smaller buffer */ 00287 y1 = _SIMD32_OFFSET(pScr2); 00288 00289 /* multiply and accumlate */ 00290 acc0 = __SMLAD(x1, y1, acc0); 00291 acc2 = __SMLAD(x2, y1, acc2); 00292 00293 /* pack input data */ 00294 #ifndef ARM_MATH_BIG_ENDIAN 00295 x3 = __PKHBT(x2, x1, 0); 00296 #else 00297 x3 = __PKHBT(x1, x2, 0); 00298 #endif 00299 00300 /* multiply and accumlate */ 00301 acc1 = __SMLADX(x3, y1, acc1); 00302 00303 /* Read next two samples from scratch1 buffer */ 00304 x1 = *__SIMD32(pScr1)++; 00305 00306 /* pack input data */ 00307 #ifndef ARM_MATH_BIG_ENDIAN 00308 x3 = __PKHBT(x1, x2, 0); 00309 #else 00310 x3 = __PKHBT(x2, x1, 0); 00311 #endif 00312 00313 acc3 = __SMLADX(x3, y1, acc3); 00314 00315 /* Read four samples from smaller buffer */ 00316 y1 = _SIMD32_OFFSET(pScr2 + 2u); 00317 00318 acc0 = __SMLAD(x2, y1, acc0); 00319 00320 acc2 = __SMLAD(x1, y1, acc2); 00321 00322 acc1 = __SMLADX(x3, y1, acc1); 00323 00324 x2 = *__SIMD32(pScr1)++; 00325 00326 #ifndef ARM_MATH_BIG_ENDIAN 00327 x3 = __PKHBT(x2, x1, 0); 00328 #else 00329 x3 = __PKHBT(x1, x2, 0); 00330 #endif 00331 00332 acc3 = __SMLADX(x3, y1, acc3); 00333 00334 pScr2 += 4u; 00335 00336 00337 /* Decrement the loop counter */ 00338 tapCnt--; 00339 } 00340 00341 00342 00343 /* Update scratch pointer for remaining samples of smaller length sequence */ 00344 pScr1 -= 4u; 00345 00346 00347 /* apply same above for remaining samples of smaller length sequence */ 00348 tapCnt = (srcBLen) & 3u; 00349 00350 while(tapCnt > 0u) 00351 { 00352 00353 /* accumlate the results */ 00354 acc0 += (*pScr1++ * *pScr2); 00355 acc1 += (*pScr1++ * *pScr2); 00356 acc2 += (*pScr1++ * *pScr2); 00357 acc3 += (*pScr1++ * *pScr2++); 00358 00359 pScr1 -= 3u; 00360 00361 /* Decrement the loop counter */ 00362 tapCnt--; 00363 } 00364 00365 blkCnt--; 00366 00367 /* Store the result in the accumulator in the destination buffer. */ 00368 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00369 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00370 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00371 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00372 00373 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00374 00375 /* Initialization of inputB pointer */ 00376 pScr2 = py; 00377 00378 pScratch1 += 4u; 00379 00380 } 00381 00382 00383 blkCnt = (srcALen + srcBLen - 1u) & 0x3; 00384 00385 /* Calculate convolution for remaining samples of Bigger length sequence */ 00386 while(blkCnt > 0) 00387 { 00388 /* Initialze temporary scratch pointer as scratch1 */ 00389 pScr1 = pScratch1; 00390 00391 /* Clear Accumlators */ 00392 acc0 = 0; 00393 00394 tapCnt = (srcBLen) >> 1u; 00395 00396 while(tapCnt > 0u) 00397 { 00398 acc0 += (*pScr1++ * *pScr2++); 00399 acc0 += (*pScr1++ * *pScr2++); 00400 00401 /* Decrement the loop counter */ 00402 tapCnt--; 00403 } 00404 00405 tapCnt = (srcBLen) & 1u; 00406 00407 /* apply same above for remaining samples of smaller length sequence */ 00408 while(tapCnt > 0u) 00409 { 00410 00411 /* accumlate the results */ 00412 acc0 += (*pScr1++ * *pScr2++); 00413 00414 /* Decrement the loop counter */ 00415 tapCnt--; 00416 } 00417 00418 blkCnt--; 00419 00420 /* Store the result in the accumulator in the destination buffer. */ 00421 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00422 00423 /* Initialization of inputB pointer */ 00424 pScr2 = py; 00425 00426 pScratch1 += 1u; 00427 00428 } 00429 00430 } 00431 00432 00433 /** 00434 * @} end of Conv group 00435 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2