Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_conv_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_opt_q7.c 00004 * Description: Convolution of Q7 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Conv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Convolution of Q7 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00047 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00048 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00049 * @return none. 
00050 * 00051 * \par Restrictions 00052 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00053 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00054 * 00055 * @details 00056 * <b>Scaling and Overflow Behavior:</b> 00057 * 00058 * \par 00059 * The function is implemented using a 32-bit internal accumulator. 00060 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00061 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00062 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00063 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 00064 * 00065 */ 00066 00067 void arm_conv_opt_q7( 00068 q7_t * pSrcA, 00069 uint32_t srcALen, 00070 q7_t * pSrcB, 00071 uint32_t srcBLen, 00072 q7_t * pDst, 00073 q15_t * pScratch1, 00074 q15_t * pScratch2) 00075 { 00076 00077 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00078 q15_t x4; /* Temporary input variable */ 00079 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00080 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00081 q7_t *px; /* Temporary input1 pointer */ 00082 q15_t *py; /* Temporary input2 pointer */ 00083 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00084 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00085 q7_t *pOut = pDst; /* output pointer */ 00086 q7_t out0, out1, out2, out3; /* temporary variables */ 00087 00088 /* The algorithm implementation is based on the lengths of the inputs. */ 00089 /* srcB is always made to slide across srcA. 
*/ 00090 /* So srcBLen is always considered as shorter or equal to srcALen */ 00091 if (srcALen >= srcBLen) 00092 { 00093 /* Initialization of inputA pointer */ 00094 pIn1 = pSrcA; 00095 00096 /* Initialization of inputB pointer */ 00097 pIn2 = pSrcB; 00098 } 00099 else 00100 { 00101 /* Initialization of inputA pointer */ 00102 pIn1 = pSrcB; 00103 00104 /* Initialization of inputB pointer */ 00105 pIn2 = pSrcA; 00106 00107 /* srcBLen is always considered as shorter or equal to srcALen */ 00108 j = srcBLen; 00109 srcBLen = srcALen; 00110 srcALen = j; 00111 } 00112 00113 /* pointer to take end of scratch2 buffer */ 00114 pScr2 = pScratch2; 00115 00116 /* points to smaller length sequence */ 00117 px = pIn2 + srcBLen - 1; 00118 00119 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00120 k = srcBLen >> 2U; 00121 00122 /* First part of the processing with loop unrolling copies 4 data points at a time. 00123 ** a second loop below copies for the remaining 1 to 3 samples. */ 00124 while (k > 0U) 00125 { 00126 /* copy second buffer in reversal manner */ 00127 x4 = (q15_t) * px--; 00128 *pScr2++ = x4; 00129 x4 = (q15_t) * px--; 00130 *pScr2++ = x4; 00131 x4 = (q15_t) * px--; 00132 *pScr2++ = x4; 00133 x4 = (q15_t) * px--; 00134 *pScr2++ = x4; 00135 00136 /* Decrement the loop counter */ 00137 k--; 00138 } 00139 00140 /* If the count is not a multiple of 4, copy remaining samples here. 00141 ** No loop unrolling is used. 
*/ 00142 k = srcBLen % 0x4U; 00143 00144 while (k > 0U) 00145 { 00146 /* copy second buffer in reversal manner for remaining samples */ 00147 x4 = (q15_t) * px--; 00148 *pScr2++ = x4; 00149 00150 /* Decrement the loop counter */ 00151 k--; 00152 } 00153 00154 /* Initialze temporary scratch pointer */ 00155 pScr1 = pScratch1; 00156 00157 /* Fill (srcBLen - 1U) zeros in scratch buffer */ 00158 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00159 00160 /* Update temporary scratch pointer */ 00161 pScr1 += (srcBLen - 1U); 00162 00163 /* Copy (srcALen) samples in scratch buffer */ 00164 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00165 k = srcALen >> 2U; 00166 00167 /* First part of the processing with loop unrolling copies 4 data points at a time. 00168 ** a second loop below copies for the remaining 1 to 3 samples. */ 00169 while (k > 0U) 00170 { 00171 /* copy second buffer in reversal manner */ 00172 x4 = (q15_t) * pIn1++; 00173 *pScr1++ = x4; 00174 x4 = (q15_t) * pIn1++; 00175 *pScr1++ = x4; 00176 x4 = (q15_t) * pIn1++; 00177 *pScr1++ = x4; 00178 x4 = (q15_t) * pIn1++; 00179 *pScr1++ = x4; 00180 00181 /* Decrement the loop counter */ 00182 k--; 00183 } 00184 00185 /* If the count is not a multiple of 4, copy remaining samples here. 00186 ** No loop unrolling is used. */ 00187 k = srcALen % 0x4U; 00188 00189 while (k > 0U) 00190 { 00191 /* copy second buffer in reversal manner for remaining samples */ 00192 x4 = (q15_t) * pIn1++; 00193 *pScr1++ = x4; 00194 00195 /* Decrement the loop counter */ 00196 k--; 00197 } 00198 00199 #ifndef UNALIGNED_SUPPORT_DISABLE 00200 00201 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ 00202 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00203 00204 /* Update pointer */ 00205 pScr1 += (srcBLen - 1U); 00206 00207 #else 00208 00209 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00210 k = (srcBLen - 1U) >> 2U; 00211 00212 /* First part of the processing with loop unrolling copies 4 data points at a time. 
00213 ** a second loop below copies for the remaining 1 to 3 samples. */ 00214 while (k > 0U) 00215 { 00216 /* copy second buffer in reversal manner */ 00217 *pScr1++ = 0; 00218 *pScr1++ = 0; 00219 *pScr1++ = 0; 00220 *pScr1++ = 0; 00221 00222 /* Decrement the loop counter */ 00223 k--; 00224 } 00225 00226 /* If the count is not a multiple of 4, copy remaining samples here. 00227 ** No loop unrolling is used. */ 00228 k = (srcBLen - 1U) % 0x4U; 00229 00230 while (k > 0U) 00231 { 00232 /* copy second buffer in reversal manner for remaining samples */ 00233 *pScr1++ = 0; 00234 00235 /* Decrement the loop counter */ 00236 k--; 00237 } 00238 00239 #endif 00240 00241 /* Temporary pointer for scratch2 */ 00242 py = pScratch2; 00243 00244 /* Initialization of pIn2 pointer */ 00245 pIn2 = (q7_t *) py; 00246 00247 pScr2 = py; 00248 00249 /* Actual convolution process starts here */ 00250 blkCnt = (srcALen + srcBLen - 1U) >> 2; 00251 00252 while (blkCnt > 0) 00253 { 00254 /* Initialze temporary scratch pointer as scratch1 */ 00255 pScr1 = pScratch1; 00256 00257 /* Clear Accumlators */ 00258 acc0 = 0; 00259 acc1 = 0; 00260 acc2 = 0; 00261 acc3 = 0; 00262 00263 /* Read two samples from scratch1 buffer */ 00264 x1 = *__SIMD32(pScr1)++; 00265 00266 /* Read next two samples from scratch1 buffer */ 00267 x2 = *__SIMD32(pScr1)++; 00268 00269 tapCnt = (srcBLen) >> 2U; 00270 00271 while (tapCnt > 0U) 00272 { 00273 00274 /* Read four samples from smaller buffer */ 00275 y1 = _SIMD32_OFFSET(pScr2); 00276 00277 /* multiply and accumlate */ 00278 acc0 = __SMLAD(x1, y1, acc0); 00279 acc2 = __SMLAD(x2, y1, acc2); 00280 00281 /* pack input data */ 00282 #ifndef ARM_MATH_BIG_ENDIAN 00283 x3 = __PKHBT(x2, x1, 0); 00284 #else 00285 x3 = __PKHBT(x1, x2, 0); 00286 #endif 00287 00288 /* multiply and accumlate */ 00289 acc1 = __SMLADX(x3, y1, acc1); 00290 00291 /* Read next two samples from scratch1 buffer */ 00292 x1 = *__SIMD32(pScr1)++; 00293 00294 /* pack input data */ 00295 #ifndef 
ARM_MATH_BIG_ENDIAN 00296 x3 = __PKHBT(x1, x2, 0); 00297 #else 00298 x3 = __PKHBT(x2, x1, 0); 00299 #endif 00300 00301 acc3 = __SMLADX(x3, y1, acc3); 00302 00303 /* Read four samples from smaller buffer */ 00304 y1 = _SIMD32_OFFSET(pScr2 + 2U); 00305 00306 acc0 = __SMLAD(x2, y1, acc0); 00307 00308 acc2 = __SMLAD(x1, y1, acc2); 00309 00310 acc1 = __SMLADX(x3, y1, acc1); 00311 00312 x2 = *__SIMD32(pScr1)++; 00313 00314 #ifndef ARM_MATH_BIG_ENDIAN 00315 x3 = __PKHBT(x2, x1, 0); 00316 #else 00317 x3 = __PKHBT(x1, x2, 0); 00318 #endif 00319 00320 acc3 = __SMLADX(x3, y1, acc3); 00321 00322 pScr2 += 4U; 00323 00324 00325 /* Decrement the loop counter */ 00326 tapCnt--; 00327 } 00328 00329 00330 00331 /* Update scratch pointer for remaining samples of smaller length sequence */ 00332 pScr1 -= 4U; 00333 00334 00335 /* apply same above for remaining samples of smaller length sequence */ 00336 tapCnt = (srcBLen) & 3U; 00337 00338 while (tapCnt > 0U) 00339 { 00340 00341 /* accumlate the results */ 00342 acc0 += (*pScr1++ * *pScr2); 00343 acc1 += (*pScr1++ * *pScr2); 00344 acc2 += (*pScr1++ * *pScr2); 00345 acc3 += (*pScr1++ * *pScr2++); 00346 00347 pScr1 -= 3U; 00348 00349 /* Decrement the loop counter */ 00350 tapCnt--; 00351 } 00352 00353 blkCnt--; 00354 00355 /* Store the result in the accumulator in the destination buffer. 
*/ 00356 out0 = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00357 out1 = (q7_t) (__SSAT(acc1 >> 7U, 8)); 00358 out2 = (q7_t) (__SSAT(acc2 >> 7U, 8)); 00359 out3 = (q7_t) (__SSAT(acc3 >> 7U, 8)); 00360 00361 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00362 00363 /* Initialization of inputB pointer */ 00364 pScr2 = py; 00365 00366 pScratch1 += 4U; 00367 00368 } 00369 00370 00371 blkCnt = (srcALen + srcBLen - 1U) & 0x3; 00372 00373 /* Calculate convolution for remaining samples of Bigger length sequence */ 00374 while (blkCnt > 0) 00375 { 00376 /* Initialze temporary scratch pointer as scratch1 */ 00377 pScr1 = pScratch1; 00378 00379 /* Clear Accumlators */ 00380 acc0 = 0; 00381 00382 tapCnt = (srcBLen) >> 1U; 00383 00384 while (tapCnt > 0U) 00385 { 00386 acc0 += (*pScr1++ * *pScr2++); 00387 acc0 += (*pScr1++ * *pScr2++); 00388 00389 /* Decrement the loop counter */ 00390 tapCnt--; 00391 } 00392 00393 tapCnt = (srcBLen) & 1U; 00394 00395 /* apply same above for remaining samples of smaller length sequence */ 00396 while (tapCnt > 0U) 00397 { 00398 00399 /* accumlate the results */ 00400 acc0 += (*pScr1++ * *pScr2++); 00401 00402 /* Decrement the loop counter */ 00403 tapCnt--; 00404 } 00405 00406 blkCnt--; 00407 00408 /* Store the result in the accumulator in the destination buffer. */ 00409 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00410 00411 /* Initialization of inputB pointer */ 00412 pScr2 = py; 00413 00414 pScratch1 += 1U; 00415 00416 } 00417 00418 } 00419 00420 00421 /** 00422 * @} end of Conv group 00423 */ 00424
Generated on Tue Jul 12 2022 16:46:23 by
