Added CMSIS5 DSP and NN folder. Needs some work.

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_convolve_HWC_q7_fast_nonsquare.c Source File

arm_convolve_HWC_q7_fast_nonsquare.c

00001 /*
00002  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
00003  *
00004  * SPDX-License-Identifier: Apache-2.0
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the License); you may
00007  * not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  * www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00014  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 /* ----------------------------------------------------------------------
00020  * Project:      CMSIS NN Library
00021  * Title:        arm_convolve_HWC_q7_fast_nonsquare.c
00022  * Description:  Fast Q7 version of convolution (non-square shape)
00023  *
00024  * $Date:        17. January 2018
00025  * $Revision:    V.1.0.0
00026  *
00027  * Target Processor:  Cortex-M cores
00028  *
00029  * -------------------------------------------------------------------- */
00030 
00031 #include "arm_math.h"
00032 #include "arm_nnfunctions.h"
00033 
00034 /**
00035  *  @ingroup groupNN
00036  */
00037 
00038 /**
00039  * @addtogroup NNConv
00040  * @{
00041  */
00042 
00043 /**
00044  * @brief Fast Q7 convolution function (non-sqaure shape)
00045  * @param[in]       Im_in        pointer to input tensor
00046  * @param[in]       dim_im_in_x  input tensor dimention x
00047  * @param[in]       dim_im_in_y  input tensor dimention y
00048  * @param[in]       ch_im_in     number of input tensor channels
00049  * @param[in]       wt           pointer to kernel weights
00050  * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
00051  * @param[in]       dim_kernel_x filter kernel size x
00052  * @param[in]       dim_kernel_y filter kernel size y
00053  * @param[in]       padding_x    padding size x
00054  * @param[in]       padding_y    padding size y
00055  * @param[in]       stride_x     convolution stride x
00056  * @param[in]       stride_y     convolution stride y
00057  * @param[in]       bias         pointer to bias
00058  * @param[in]       bias_shift   amount of left-shift for bias
00059  * @param[in]       out_shift    amount of right-shift for output
00060  * @param[in,out]   Im_out       pointer to output tensor
00061  * @param[in]       dim_im_out_x output tensor dimension x
00062  * @param[in]       dim_im_out_y output tensor dimension y
00063  * @param[in,out]   bufferA      pointer to buffer space for input 
00064  * @param[in,out]   bufferB      pointer to buffer space for output
00065  * @return     The function returns either
00066  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00067  *
00068  * This function is the version with full list of optimization tricks, but with
00069  * some contraints:
00070  *   ch_im_in is multiple of 4
00071  *   ch_im_out is multiple of 2
00072  */
00073 
00074 arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
00075                                               const uint16_t dim_im_in_x,
00076                                               const uint16_t dim_im_in_y,
00077                                               const uint16_t ch_im_in,
00078                                               const q7_t * wt,
00079                                               const uint16_t ch_im_out,
00080                                               const uint16_t dim_kernel_x,
00081                                               const uint16_t dim_kernel_y,
00082                                               const uint16_t padding_x,
00083                                               const uint16_t padding_y,
00084                                               const uint16_t stride_x,
00085                                               const uint16_t stride_y,
00086                                               const q7_t * bias,
00087                                               const uint16_t bias_shift,
00088                                               const uint16_t out_shift,
00089                                               q7_t * Im_out,
00090                                               const uint16_t dim_im_out_x,
00091                                               const uint16_t dim_im_out_y, 
00092                                               q15_t * bufferA, 
00093                                               q7_t * bufferB)
00094 {
00095 
00096 #if defined (ARM_MATH_DSP)
00097     /* Run the following code for Cortex-M4 and Cortex-M7 */
00098 
00099     int16_t   i_out_y, i_out_x, i_ker_y, i_ker_x;
00100 
00101     /* -----------------------
00102      *  Here we use bufferA as q15_t internally as computation are done with q15_t level
00103      *  im2col are done to output in q15_t format from q7_t input
00104      */
00105 
00106     q15_t    *pBuffer = bufferA;
00107     q7_t     *pOut = Im_out;
00108 
00109     if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
00110     {
00111         /* check if the input dimension meets the constraints */
00112         return ARM_MATH_SIZE_MISMATCH;
00113     }
00114 
00115     /*
00116      *  Here we split the entire matrix into three regions depending on the padding situation
00117      *    Top: i_out_y from 0 to padding - 1
00118      * Middle: i_out_y from padding to dim_im_out-padding-1
00119      * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
00120      */
00121 
00122     /* top part */
00123     for (i_out_y = 0; i_out_y < padding_y; i_out_y++)
00124     {
00125         for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
00126         {
00127             /* This part implements the im2col function */
00128             for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
00129                  i_ker_y++)
00130             {
00131                 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
00132                      i_ker_x++)
00133                 {
00134                     if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
00135                     {
00136                         /* arm_fill_q15(0, pBuffer, ch_im_in); */
00137                         memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
00138                     } else
00139                     {
00140                         arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
00141                                                          pBuffer, ch_im_in);
00142                     }
00143                     pBuffer += ch_im_in;
00144                 }
00145             }
00146 
00147             if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
00148             {
00149                 pOut =
00150                     arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
00151                                                   bias_shift, out_shift, bias, pOut);
00152                 /* counter reset */
00153                 pBuffer = bufferA;
00154             }
00155         }
00156     }
00157 
00158     /* middle part, here we also divide the x into left, mid and right */
00159     for (; i_out_y < dim_im_out_y - padding_y; i_out_y++)
00160     {
00161 
00162         /* left part */
00163         for (i_out_x = 0; i_out_x < padding_x; i_out_x++)
00164         {
00165             /* This part implements the im2col function */
00166             for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
00167                  i_ker_y++)
00168             {
00169                 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
00170                      i_ker_x++)
00171                 {
00172                     if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
00173                     {
00174                         /* arm_fill_q15(0, pBuffer, ch_im_in); */
00175                         memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
00176                     } else
00177                     {
00178                         arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
00179                                                          pBuffer, ch_im_in);
00180                     }
00181                     pBuffer += ch_im_in;
00182                 }
00183             }
00184 
00185             if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
00186             {
00187                 pOut =
00188                     arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
00189                                                   bias_shift, out_shift, bias, pOut);
00190                 /* counter reset */
00191                 pBuffer = bufferA;
00192             }
00193         }
00194 
00195         /* mid part */
00196         for (; i_out_x < dim_im_out_x - padding_x; i_out_x++)
00197         {
00198             /* This part implements the im2col function */
00199             for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
00200                  i_ker_y++)
00201             {
00202                 arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in +
00203                                                  (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in,
00204                                                  pBuffer, ch_im_in * dim_kernel_x);
00205                 pBuffer += ch_im_in * dim_kernel_x;
00206             }
00207 
00208             if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
00209             {
00210                 pOut =
00211                     arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
00212                                                   bias_shift, out_shift, bias, pOut);
00213                 /* counter reset */
00214                 pBuffer = bufferA;
00215             }
00216         }
00217 
00218         /* right part */
00219         for (; i_out_x < dim_im_out_x; i_out_x++)
00220         {
00221             /* This part implements the im2col function */
00222             for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
00223                  i_ker_y++)
00224             {
00225                 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
00226                      i_ker_x++)
00227                 {
00228                     if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
00229                     {
00230                         /* arm_fill_q15(0, pBuffer, ch_im_in); */
00231                         memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
00232                     } else
00233                     {
00234                         arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
00235                                                          pBuffer, ch_im_in);
00236                     }
00237                     pBuffer += ch_im_in;
00238                 }
00239             }
00240 
00241             if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
00242             {
00243                 pOut =
00244                     arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
00245                                                   bias_shift, out_shift, bias, pOut);
00246                 /* counter reset */
00247                 pBuffer = bufferA;
00248             }
00249         }
00250     }
00251 
00252     for (; i_out_y < dim_im_out_y; i_out_y++)
00253     {
00254         for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
00255         {
00256             /* This part implements the im2col function */
00257             for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
00258                  i_ker_y++)
00259             {
00260                 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
00261                      i_ker_x++)
00262                 {
00263                     if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
00264                     {
00265                         /* arm_fill_q15(0, pBuffer, ch_im_in); */
00266                         memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
00267                     } else
00268                     {
00269                         arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
00270                                                          pBuffer, ch_im_in);
00271                     }
00272                     pBuffer += ch_im_in;
00273                 }
00274             }
00275 
00276             if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
00277             {
00278                 pOut =
00279                     arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
00280                                                   bias_shift, out_shift, bias, pOut);
00281                 /* counter reset */
00282                 pBuffer = bufferA;
00283             }
00284         }
00285     }
00286 
00287     /* check if there is left-over for compute */
00288     if (pBuffer != bufferA)
00289     {
00290         const q7_t *pA = wt;
00291         int       i;
00292         for (i = 0; i < ch_im_out; i++)
00293         {
00294             q31_t     sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
00295             q15_t    *pB = bufferA;
00296             /* basically each time it process 4 entries */
00297             uint16_t  colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
00298 
00299             while (colCnt)
00300             {
00301 
00302                 q31_t     inA1, inA2;
00303                 q31_t     inB1, inB2;
00304 
00305                 pA = (const q7_t *)read_and_pad_reordered((void *)pA, &inA1, &inA2);
00306 
00307                 inB1 = *__SIMD32(pB)++;
00308                 sum = __SMLAD(inA1, inB1, sum);
00309                 inB2 = *__SIMD32(pB)++;
00310                 sum = __SMLAD(inA2, inB2, sum);
00311 
00312                 colCnt--;
00313             }
00314             colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
00315             while (colCnt)
00316             {
00317                 q7_t      inA1 = *pA++;
00318                 q15_t     inB1 = *pB++;
00319                 sum += inA1 * inB1;
00320                 colCnt--;
00321             }
00322             *pOut = (q7_t) __SSAT((sum >> out_shift), 8);
00323             pOut++;
00324 
00325         }
00326 
00327     }
00328 
00329 #else
00330     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
00331     int       i, j, k, l, m, n;
00332     int       conv_out;
00333     int       in_row, in_col;
00334 
00335     if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
00336     {
00337         /* check if the input dimension meets the constraints */
00338         return ARM_MATH_SIZE_MISMATCH;
00339     }
00340 
00341     for (i = 0; i < ch_im_out; i++)
00342     {
00343         for (j = 0; j < dim_im_out_y; j++)
00344         {
00345             for (k = 0; k < dim_im_out_x; k++)
00346             {
00347                 conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
00348                 for (m = 0; m < dim_kernel_y; m++)
00349                 {
00350                     for (n = 0; n < dim_kernel_x; n++)
00351                     {
00352                         /* if-for implementation */
00353                         in_row = stride_y * j + m - padding_y;
00354                         in_col = stride_x * k + n - padding_x;
00355                         if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
00356                         {
00357                             for (l = 0; l < ch_im_in; l++)
00358                             {
00359                                 conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
00360                                     wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l];      
00361                             }
00362                         }
00363                     }
00364                 }
00365                 Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
00366             }
00367         }
00368     }
00369 
00370 
00371 #endif                          /* ARM_MATH_DSP */
00372 
00373     /* Return to application */
00374     return ARM_MATH_SUCCESS;
00375 }
00376 
00377 /**
00378  * @} end of NNConv group
00379  */
00380