Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-os by
arm_conv_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_opt_q7.c 00009 * 00010 * Description: Convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00060 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00061 * @return none. 00062 * 00063 * \par Restrictions 00064 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00065 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00066 * 00067 * @details 00068 * <b>Scaling and Overflow Behavior:</b> 00069 * 00070 * \par 00071 * The function is implemented using a 32-bit internal accumulator. 00072 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00073 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00074 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00075 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 00076 * 00077 */ 00078 00079 void arm_conv_opt_q7( 00080 q7_t * pSrcA, 00081 uint32_t srcALen, 00082 q7_t * pSrcB, 00083 uint32_t srcBLen, 00084 q7_t * pDst, 00085 q15_t * pScratch1, 00086 q15_t * pScratch2) 00087 { 00088 00089 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00090 q15_t x4; /* Temporary input variable */ 00091 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00092 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00093 q7_t *px; /* Temporary input1 pointer */ 00094 q15_t *py; /* Temporary input2 pointer */ 00095 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00096 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00097 q7_t *pOut = pDst; /* output pointer */ 00098 q7_t out0, out1, out2, out3; /* temporary variables */ 00099 00100 /* The algorithm implementation is based on the lengths of the inputs. */ 00101 /* srcB is always made to slide across srcA. */ 00102 /* So srcBLen is always considered as shorter or equal to srcALen */ 00103 if(srcALen >= srcBLen) 00104 { 00105 /* Initialization of inputA pointer */ 00106 pIn1 = pSrcA; 00107 00108 /* Initialization of inputB pointer */ 00109 pIn2 = pSrcB; 00110 } 00111 else 00112 { 00113 /* Initialization of inputA pointer */ 00114 pIn1 = pSrcB; 00115 00116 /* Initialization of inputB pointer */ 00117 pIn2 = pSrcA; 00118 00119 /* srcBLen is always considered as shorter or equal to srcALen */ 00120 j = srcBLen; 00121 srcBLen = srcALen; 00122 srcALen = j; 00123 } 00124 00125 /* pointer to take end of scratch2 buffer */ 00126 pScr2 = pScratch2; 00127 00128 /* points to smaller length sequence */ 00129 px = pIn2 + srcBLen - 1; 00130 00131 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00132 k = srcBLen >> 2u; 00133 00134 /* First part of the processing with loop unrolling copies 4 data points at a time. 00135 ** a second loop below copies for the remaining 1 to 3 samples. */ 00136 while(k > 0u) 00137 { 00138 /* copy second buffer in reversal manner */ 00139 x4 = (q15_t) * px--; 00140 *pScr2++ = x4; 00141 x4 = (q15_t) * px--; 00142 *pScr2++ = x4; 00143 x4 = (q15_t) * px--; 00144 *pScr2++ = x4; 00145 x4 = (q15_t) * px--; 00146 *pScr2++ = x4; 00147 00148 /* Decrement the loop counter */ 00149 k--; 00150 } 00151 00152 /* If the count is not a multiple of 4, copy remaining samples here. 00153 ** No loop unrolling is used. */ 00154 k = srcBLen % 0x4u; 00155 00156 while(k > 0u) 00157 { 00158 /* copy second buffer in reversal manner for remaining samples */ 00159 x4 = (q15_t) * px--; 00160 *pScr2++ = x4; 00161 00162 /* Decrement the loop counter */ 00163 k--; 00164 } 00165 00166 /* Initialze temporary scratch pointer */ 00167 pScr1 = pScratch1; 00168 00169 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00170 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00171 00172 /* Update temporary scratch pointer */ 00173 pScr1 += (srcBLen - 1u); 00174 00175 /* Copy (srcALen) samples in scratch buffer */ 00176 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00177 k = srcALen >> 2u; 00178 00179 /* First part of the processing with loop unrolling copies 4 data points at a time. 00180 ** a second loop below copies for the remaining 1 to 3 samples. */ 00181 while(k > 0u) 00182 { 00183 /* copy second buffer in reversal manner */ 00184 x4 = (q15_t) * pIn1++; 00185 *pScr1++ = x4; 00186 x4 = (q15_t) * pIn1++; 00187 *pScr1++ = x4; 00188 x4 = (q15_t) * pIn1++; 00189 *pScr1++ = x4; 00190 x4 = (q15_t) * pIn1++; 00191 *pScr1++ = x4; 00192 00193 /* Decrement the loop counter */ 00194 k--; 00195 } 00196 00197 /* If the count is not a multiple of 4, copy remaining samples here. 00198 ** No loop unrolling is used. */ 00199 k = srcALen % 0x4u; 00200 00201 while(k > 0u) 00202 { 00203 /* copy second buffer in reversal manner for remaining samples */ 00204 x4 = (q15_t) * pIn1++; 00205 *pScr1++ = x4; 00206 00207 /* Decrement the loop counter */ 00208 k--; 00209 } 00210 00211 #ifndef UNALIGNED_SUPPORT_DISABLE 00212 00213 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00214 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00215 00216 /* Update pointer */ 00217 pScr1 += (srcBLen - 1u); 00218 00219 #else 00220 00221 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00222 k = (srcBLen - 1u) >> 2u; 00223 00224 /* First part of the processing with loop unrolling copies 4 data points at a time. 00225 ** a second loop below copies for the remaining 1 to 3 samples. */ 00226 while(k > 0u) 00227 { 00228 /* copy second buffer in reversal manner */ 00229 *pScr1++ = 0; 00230 *pScr1++ = 0; 00231 *pScr1++ = 0; 00232 *pScr1++ = 0; 00233 00234 /* Decrement the loop counter */ 00235 k--; 00236 } 00237 00238 /* If the count is not a multiple of 4, copy remaining samples here. 00239 ** No loop unrolling is used. */ 00240 k = (srcBLen - 1u) % 0x4u; 00241 00242 while(k > 0u) 00243 { 00244 /* copy second buffer in reversal manner for remaining samples */ 00245 *pScr1++ = 0; 00246 00247 /* Decrement the loop counter */ 00248 k--; 00249 } 00250 00251 #endif 00252 00253 /* Temporary pointer for scratch2 */ 00254 py = pScratch2; 00255 00256 /* Initialization of pIn2 pointer */ 00257 pIn2 = (q7_t *) py; 00258 00259 pScr2 = py; 00260 00261 /* Actual convolution process starts here */ 00262 blkCnt = (srcALen + srcBLen - 1u) >> 2; 00263 00264 while(blkCnt > 0) 00265 { 00266 /* Initialze temporary scratch pointer as scratch1 */ 00267 pScr1 = pScratch1; 00268 00269 /* Clear Accumlators */ 00270 acc0 = 0; 00271 acc1 = 0; 00272 acc2 = 0; 00273 acc3 = 0; 00274 00275 /* Read two samples from scratch1 buffer */ 00276 x1 = *__SIMD32(pScr1)++; 00277 00278 /* Read next two samples from scratch1 buffer */ 00279 x2 = *__SIMD32(pScr1)++; 00280 00281 tapCnt = (srcBLen) >> 2u; 00282 00283 while(tapCnt > 0u) 00284 { 00285 00286 /* Read four samples from smaller buffer */ 00287 y1 = _SIMD32_OFFSET(pScr2); 00288 00289 /* multiply and accumlate */ 00290 acc0 = __SMLAD(x1, y1, acc0); 00291 acc2 = __SMLAD(x2, y1, acc2); 00292 00293 /* pack input data */ 00294 #ifndef ARM_MATH_BIG_ENDIAN 00295 x3 = __PKHBT(x2, x1, 0); 00296 #else 00297 x3 = __PKHBT(x1, x2, 0); 00298 #endif 00299 00300 /* multiply and accumlate */ 00301 acc1 = __SMLADX(x3, y1, acc1); 00302 00303 /* Read next two samples from scratch1 buffer */ 00304 x1 = *__SIMD32(pScr1)++; 00305 00306 /* pack input data */ 00307 #ifndef ARM_MATH_BIG_ENDIAN 00308 x3 = __PKHBT(x1, x2, 0); 00309 #else 00310 x3 = __PKHBT(x2, x1, 0); 00311 #endif 00312 00313 acc3 = __SMLADX(x3, y1, acc3); 00314 00315 /* Read four samples from smaller buffer */ 00316 y1 = _SIMD32_OFFSET(pScr2 + 2u); 00317 00318 acc0 = __SMLAD(x2, y1, acc0); 00319 00320 acc2 = __SMLAD(x1, y1, acc2); 00321 00322 acc1 = __SMLADX(x3, y1, acc1); 00323 00324 x2 = *__SIMD32(pScr1)++; 00325 00326 #ifndef ARM_MATH_BIG_ENDIAN 00327 x3 = __PKHBT(x2, x1, 0); 00328 #else 00329 x3 = __PKHBT(x1, x2, 0); 00330 #endif 00331 00332 acc3 = __SMLADX(x3, y1, acc3); 00333 00334 pScr2 += 4u; 00335 00336 00337 /* Decrement the loop counter */ 00338 tapCnt--; 00339 } 00340 00341 00342 00343 /* Update scratch pointer for remaining samples of smaller length sequence */ 00344 pScr1 -= 4u; 00345 00346 00347 /* apply same above for remaining samples of smaller length sequence */ 00348 tapCnt = (srcBLen) & 3u; 00349 00350 while(tapCnt > 0u) 00351 { 00352 00353 /* accumlate the results */ 00354 acc0 += (*pScr1++ * *pScr2); 00355 acc1 += (*pScr1++ * *pScr2); 00356 acc2 += (*pScr1++ * *pScr2); 00357 acc3 += (*pScr1++ * *pScr2++); 00358 00359 pScr1 -= 3u; 00360 00361 /* Decrement the loop counter */ 00362 tapCnt--; 00363 } 00364 00365 blkCnt--; 00366 00367 /* Store the result in the accumulator in the destination buffer. */ 00368 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00369 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00370 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00371 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00372 00373 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00374 00375 /* Initialization of inputB pointer */ 00376 pScr2 = py; 00377 00378 pScratch1 += 4u; 00379 00380 } 00381 00382 00383 blkCnt = (srcALen + srcBLen - 1u) & 0x3; 00384 00385 /* Calculate convolution for remaining samples of Bigger length sequence */ 00386 while(blkCnt > 0) 00387 { 00388 /* Initialze temporary scratch pointer as scratch1 */ 00389 pScr1 = pScratch1; 00390 00391 /* Clear Accumlators */ 00392 acc0 = 0; 00393 00394 tapCnt = (srcBLen) >> 1u; 00395 00396 while(tapCnt > 0u) 00397 { 00398 acc0 += (*pScr1++ * *pScr2++); 00399 acc0 += (*pScr1++ * *pScr2++); 00400 00401 /* Decrement the loop counter */ 00402 tapCnt--; 00403 } 00404 00405 tapCnt = (srcBLen) & 1u; 00406 00407 /* apply same above for remaining samples of smaller length sequence */ 00408 while(tapCnt > 0u) 00409 { 00410 00411 /* accumlate the results */ 00412 acc0 += (*pScr1++ * *pScr2++); 00413 00414 /* Decrement the loop counter */ 00415 tapCnt--; 00416 } 00417 00418 blkCnt--; 00419 00420 /* Store the result in the accumulator in the destination buffer. */ 00421 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00422 00423 /* Initialization of inputB pointer */ 00424 pScr2 = py; 00425 00426 pScratch1 += 1u; 00427 00428 } 00429 00430 } 00431 00432 00433 /** 00434 * @} end of Conv group 00435 */
Generated on Tue Jul 12 2022 13:15:22 by
