Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_pool_q7_HWC.c Source File

arm_pool_q7_HWC.c

00001 /*
00002  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
00003  *
00004  * SPDX-License-Identifier: Apache-2.0
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the License); you may
00007  * not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  * www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00014  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 /* ----------------------------------------------------------------------
00020  * Project:      CMSIS NN Library
00021  * Title:        arm_pool_q7_HWC.c
00022  * Description:  Pooling function implementations
00023  *
00024  * $Date:        17. January 2018
00025  * $Revision:    V.1.0.0
00026  *
00027  * Target Processor:  Cortex-M cores
00028  *
00029  * -------------------------------------------------------------------- */
00030 
00031 #include "arm_math.h"
00032 #include "arm_nnfunctions.h"
00033 
00034 #if defined (ARM_MATH_DSP)
00035 
00036 /**
00037  * @brief A few utility functions used by pooling functions
00038  *
00039  * 
00040  */
00041 
00042 static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
00043 {
00044     int       i;
00045 
00046     for (i = 0; i < length; i++)
00047     {
00048         target[i] = (q7_t) (buffer[i] / scale);
00049     }
00050 }
00051 
00052 static void compare_and_replace_if_larger_q7(q7_t * base,   // base data
00053                                              q7_t * target, // compare target
00054                                              const uint16_t length  // data length
00055     )
00056 {
00057     q7_t     *pIn = base;
00058     q7_t     *pCom = target;
00059     union arm_nnword in;
00060     union arm_nnword com;
00061     uint16_t  cnt = length >> 2;
00062 
00063     while (cnt > 0u)
00064     {
00065         in.word = *__SIMD32(pIn);
00066         com.word = *__SIMD32(pCom)++;
00067 
00068         // if version
00069         if (com.bytes[0] > in.bytes[0])
00070             in.bytes[0] = com.bytes[0];
00071         if (com.bytes[1] > in.bytes[1])
00072             in.bytes[1] = com.bytes[1];
00073         if (com.bytes[2] > in.bytes[2])
00074             in.bytes[2] = com.bytes[2];
00075         if (com.bytes[3] > in.bytes[3])
00076             in.bytes[3] = com.bytes[3];
00077 
00078         *__SIMD32(pIn)++ = in.word;
00079 
00080         cnt--;
00081     }
00082 }
00083 
00084 static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
00085 {
00086     q15_t    *pCnt = base;
00087     q7_t     *pV = target;
00088     q31_t     v1, v2, vo1, vo2;
00089     uint16_t  cnt = length >> 2;
00090     q31_t     in;
00091 
00092     while (cnt > 0u)
00093     {
00094         q31_t     value = *__SIMD32(pV)++;
00095         v1 = __SXTB16(__ROR(value, 8));
00096         v2 = __SXTB16(value);
00097 #ifndef ARM_MATH_BIG_ENDIAN
00098 
00099         vo2 = __PKHTB(v1, v2, 16);
00100         vo1 = __PKHBT(v2, v1, 16);
00101 
00102 #else
00103 
00104         vo1 = __PKHTB(v1, v2, 16);
00105         vo2 = __PKHBT(v2, v1, 16);
00106 
00107 #endif
00108 
00109         in = *__SIMD32(pCnt);
00110         *__SIMD32(pCnt)++ = __QADD16(vo1, in);
00111 
00112         in = *__SIMD32(pCnt);
00113         *__SIMD32(pCnt)++ = __QADD16(vo2, in);
00114 
00115         cnt--;
00116     }
00117     cnt = length & 0x3;
00118     while (cnt > 0u)
00119     {
00120         *pCnt++ += *pV++;
00121         cnt--;
00122     }
00123 }
00124 
00125 #endif                          // ARM_MATH_DSP
00126 
00127 /**
00128  *  @ingroup groupNN
00129  */
00130 
00131 /**
00132  * @addtogroup Pooling
00133  * @{
00134  */
00135 
00136   /**
00137    * @brief Q7 max pooling function
00138    * @param[in, out]  Im_in       pointer to input tensor
00139    * @param[in]       dim_im_in   input tensor dimention
00140    * @param[in]       ch_im_in    number of input tensor channels
00141    * @param[in]       dim_kernel  filter kernel size
00142    * @param[in]       padding     padding sizes
00143    * @param[in]       stride      convolution stride
00144    * @param[in]       dim_im_out  output tensor dimension
00145    * @param[in,out]   bufferA     pointer to buffer space for input
00146    * @param[in,out]   Im_out      pointer to output tensor
00147    * @return none.
00148    *
00149    * @details
00150    *
00151    * <b>Buffer size:</b>
00152    *
00153    * bufferA size:  0
00154    *
00155    * The pooling function is implemented as split x-pooling then
00156    * y-pooling.
00157    *
00158    * This pooling function is input-destructive. Input data is undefined
00159    * after calling this function.
00160    *
00161    */
00162 
00163 void
00164 arm_maxpool_q7_HWC(q7_t * Im_in,
00165                    const uint16_t dim_im_in,
00166                    const uint16_t ch_im_in,
00167                    const uint16_t dim_kernel,
00168                    const uint16_t padding,
00169                    const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
00170 {
00171 
00172 #if defined (ARM_MATH_DSP)
00173     /* Run the following code for Cortex-M4 and Cortex-M7 */
00174 
00175     int16_t   i_x, i_y;
00176 
00177     /* first does the pooling along x axis */
00178     for (i_y = 0; i_y < dim_im_in; i_y++)
00179     {
00180 
00181         for (i_x = 0; i_x < dim_im_out; i_x++)
00182         {
00183             /* for each output pixel */
00184             q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
00185             q7_t     *win_start;
00186             q7_t     *win_stop;
00187             if (i_x * stride - padding < 0)
00188             {
00189                 win_start = target;
00190             } else
00191             {
00192                 win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
00193             }
00194 
00195             if (i_x * stride - padding + dim_kernel >= dim_im_in)
00196             {
00197                 win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
00198             } else
00199             {
00200                 win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
00201             }
00202 
00203             /* first step is to copy over initial data */
00204             /* arm_copy_q7(win_start, target, ch_im_in); */
00205             memmove(target, win_start, ch_im_in);
00206 
00207             /* start the max operation from the second part */
00208             win_start += ch_im_in;
00209             for (; win_start < win_stop; win_start += ch_im_in)
00210             {
00211                 compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
00212             }
00213         }
00214     }
00215 
00216     /* then does the pooling along y axis */
00217     for (i_y = 0; i_y < dim_im_out; i_y++)
00218     {
00219 
00220         /* for each output row */
00221         q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in;
00222         q7_t     *row_start;
00223         q7_t     *row_end;
00224         /* setting the starting row */
00225         if (i_y * stride - padding < 0)
00226         {
00227             row_start = Im_in;
00228         } else
00229         {
00230             row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
00231         }
00232         /* setting the stopping row */
00233         if (i_y * stride - padding + dim_kernel >= dim_im_in)
00234         {
00235             row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
00236         } else
00237         {
00238             row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
00239         }
00240 
00241         /* copy over the first row */
00242         /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
00243         memmove(target, row_start, dim_im_out * ch_im_in);
00244 
00245         /* move over to next row */
00246         row_start += ch_im_in * dim_im_in;
00247 
00248         for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
00249         {
00250             compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
00251         }
00252     }
00253 
00254 #else
00255     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
00256 
00257     int16_t   i_ch_in, i_x, i_y;
00258     int16_t   k_x, k_y;
00259 
00260     for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
00261     {
00262         for (i_y = 0; i_y < dim_im_out; i_y++)
00263         {
00264             for (i_x = 0; i_x < dim_im_out; i_x++)
00265             {
00266                 int       max = -129;
00267                 for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
00268                 {
00269                     for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
00270                     {
00271                         if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
00272                         {
00273                             if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
00274                             {
00275                                 max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
00276                             }
00277                         }
00278                     }
00279                 }
00280                 Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
00281             }
00282         }
00283     }
00284 
00285 #endif                          /* ARM_MATH_DSP */
00286 
00287 }
00288 
00289   /**
00290    * @brief Q7 average pooling function
00291    * @param[in,out]   Im_in       pointer to input tensor
00292    * @param[in]       dim_im_in   input tensor dimention
00293    * @param[in]       ch_im_in    number of input tensor channels
00294    * @param[in]       dim_kernel  filter kernel size
00295    * @param[in]       padding     padding sizes
00296    * @param[in]       stride      convolution stride
00297    * @param[in]       dim_im_out  output tensor dimension
00298    * @param[in,out]   bufferA     pointer to buffer space for input
00299    * @param[in,out]   Im_out      pointer to output tensor
00300    * @return none.
00301    *
00302    * @details
00303    *
00304    * <b>Buffer size:</b>
00305    *
00306    * bufferA size:  2*dim_im_out*ch_im_in
00307    *
00308    * The pooling function is implemented as split x-pooling then
00309    * y-pooling.
00310    *
00311    * This pooling function is input-destructive. Input data is undefined
00312    * after calling this function.
00313    *
00314    */
00315 
00316 void
00317 arm_avepool_q7_HWC(q7_t * Im_in,
00318                    const uint16_t dim_im_in,
00319                    const uint16_t ch_im_in,
00320                    const uint16_t dim_kernel,
00321                    const uint16_t padding,
00322                    const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
00323 {
00324 
00325 #if defined (ARM_MATH_DSP)
00326     /* Run the following code for Cortex-M4 and Cortex-M7 */
00327 
00328     q15_t    *buffer = (q15_t *) bufferA;
00329     int16_t   i_x, i_y;
00330     int16_t   count = 0;
00331 
00332     /* first does the pooling along x axis */
00333     for (i_y = 0; i_y < dim_im_in; i_y++)
00334     {
00335 
00336         for (i_x = 0; i_x < dim_im_out; i_x++)
00337         {
00338             /* for each output pixel */
00339             q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
00340             q7_t     *win_start;
00341             q7_t     *win_stop;
00342             if (i_x * stride - padding < 0)
00343             {
00344                 win_start = target;
00345             } else
00346             {
00347                 win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
00348             }
00349 
00350             if (i_x * stride - padding + dim_kernel >= dim_im_in)
00351             {
00352                 win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
00353             } else
00354             {
00355                 win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
00356             }
00357 
00358             /* first step is to copy over initial data */
00359             arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
00360             count = 1;
00361 
00362             /* start the max operation from the second part */
00363             win_start += ch_im_in;
00364             for (; win_start < win_stop; win_start += ch_im_in)
00365             {
00366                 accumulate_q7_to_q15(buffer, win_start, ch_im_in);
00367                 count++;
00368             }
00369             buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
00370         }
00371     }
00372 
00373     /* then does the pooling along y axis */
00374     for (i_y = 0; i_y < dim_im_out; i_y++)
00375     {
00376         /* for each output row */
00377         q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in;
00378         q7_t     *row_start;
00379         q7_t     *row_end;
00380         /* setting the starting row */
00381         if (i_y * stride - padding < 0)
00382         {
00383             row_start = Im_in;
00384         } else
00385         {
00386             row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
00387         }
00388         /* setting the stopping row */
00389         if (i_y * stride - padding + dim_kernel >= dim_im_in)
00390         {
00391             row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
00392         } else
00393         {
00394             row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
00395         }
00396 
00397         /* copy over the first row */
00398         arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
00399         count = 1;
00400 
00401         /* move over to next row */
00402         row_start += ch_im_in * dim_im_in;
00403 
00404         for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
00405         {
00406             accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
00407             count++;
00408         }
00409         buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
00410     }
00411 
00412 #else
00413     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
00414 
00415     int16_t   i_ch_in, i_x, i_y;
00416     int16_t   k_x, k_y;
00417 
00418     for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
00419     {
00420         for (i_y = 0; i_y < dim_im_out; i_y++)
00421         {
00422             for (i_x = 0; i_x < dim_im_out; i_x++)
00423             {
00424                 int       sum = 0;
00425                 int       count = 0;
00426                 for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
00427                 {
00428                     for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
00429                     {
00430                         if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
00431                         {
00432                             sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
00433                             count++;
00434                         }
00435                     }
00436                 }
00437                 Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
00438             }
00439         }
00440     }
00441 
00442 #endif                          /* ARM_MATH_DSP */
00443 
00444 }
00445 
00446 /**
00447  * @} end of Pooling group
00448  */
00449