CMSIS_DSP_5 - The CMSIS DSP 5 library

Users » xorjoep » Code » CMSIS_DSP_5

The CMSIS DSP 5 library

Dependents: Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

functions/MatrixFunctions/arm_mat_mult_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Committer:: xorjoep
Date:: Thu Jun 21 11:56:27 2018 +0000
Revision:: 3:4098b9d3d571
Parent:: 1:24714b45cd1b

headers is a folder not a library

Who changed what in which revision?

User	Revision	Line number	New contents of line
xorjoep	1:24714b45cd1b	1	/* ----------------------------------------------------------------------
xorjoep	1:24714b45cd1b	2	* Project: CMSIS DSP Library
xorjoep	1:24714b45cd1b	3	* Title: arm_mat_mult_fast_q15.c
xorjoep	1:24714b45cd1b	4	* Description: Q15 matrix multiplication (fast variant)
xorjoep	1:24714b45cd1b	5	*
xorjoep	1:24714b45cd1b	6	* $Date: 27. January 2017
xorjoep	1:24714b45cd1b	7	* $Revision: V.1.5.1
xorjoep	1:24714b45cd1b	8	*
xorjoep	1:24714b45cd1b	9	* Target Processor: Cortex-M cores
xorjoep	1:24714b45cd1b	10	* -------------------------------------------------------------------- */
xorjoep	1:24714b45cd1b	11	/*
xorjoep	1:24714b45cd1b	12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep	1:24714b45cd1b	13	*
xorjoep	1:24714b45cd1b	14	* SPDX-License-Identifier: Apache-2.0
xorjoep	1:24714b45cd1b	15	*
xorjoep	1:24714b45cd1b	16	* Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep	1:24714b45cd1b	17	* not use this file except in compliance with the License.
xorjoep	1:24714b45cd1b	18	* You may obtain a copy of the License at
xorjoep	1:24714b45cd1b	19	*
xorjoep	1:24714b45cd1b	20	* www.apache.org/licenses/LICENSE-2.0
xorjoep	1:24714b45cd1b	21	*
xorjoep	1:24714b45cd1b	22	* Unless required by applicable law or agreed to in writing, software
xorjoep	1:24714b45cd1b	23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep	1:24714b45cd1b	24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep	1:24714b45cd1b	25	* See the License for the specific language governing permissions and
xorjoep	1:24714b45cd1b	26	* limitations under the License.
xorjoep	1:24714b45cd1b	27	*/
xorjoep	1:24714b45cd1b	28
xorjoep	1:24714b45cd1b	29	#include "arm_math.h"
xorjoep	1:24714b45cd1b	30
xorjoep	1:24714b45cd1b	31	/**
xorjoep	1:24714b45cd1b	32	* @ingroup groupMatrix
xorjoep	1:24714b45cd1b	33	*/
xorjoep	1:24714b45cd1b	34
xorjoep	1:24714b45cd1b	35	/**
xorjoep	1:24714b45cd1b	36	* @addtogroup MatrixMult
xorjoep	1:24714b45cd1b	37	* @{
xorjoep	1:24714b45cd1b	38	*/
xorjoep	1:24714b45cd1b	39
xorjoep	1:24714b45cd1b	40
xorjoep	1:24714b45cd1b	41	/**
xorjoep	1:24714b45cd1b	42	* @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
xorjoep	1:24714b45cd1b	43	* @param[in] *pSrcA points to the first input matrix structure
xorjoep	1:24714b45cd1b	44	* @param[in] *pSrcB points to the second input matrix structure
xorjoep	1:24714b45cd1b	45	* @param[out] *pDst points to output matrix structure
xorjoep	1:24714b45cd1b	46	* @param[in] *pState points to the array for storing intermediate results
xorjoep	1:24714b45cd1b	47	* @return The function returns either
xorjoep	1:24714b45cd1b	48	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
xorjoep	1:24714b45cd1b	49	*
xorjoep	1:24714b45cd1b	50	* @details
xorjoep	1:24714b45cd1b	51	* <b>Scaling and Overflow Behavior:</b>
xorjoep	1:24714b45cd1b	52	*
xorjoep	1:24714b45cd1b	53	* \par
xorjoep	1:24714b45cd1b	54	* The difference between the function arm_mat_mult_q15() and this fast variant is that
xorjoep	1:24714b45cd1b	55	* the fast variant uses a 32-bit rather than a 64-bit accumulator.
xorjoep	1:24714b45cd1b	56	* The result of each 1.15 x 1.15 multiplication is truncated to
xorjoep	1:24714b45cd1b	57	* 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
xorjoep	1:24714b45cd1b	58	* format. Finally, the accumulator is saturated and converted to a 1.15 result.
xorjoep	1:24714b45cd1b	59	*
xorjoep	1:24714b45cd1b	60	* \par
xorjoep	1:24714b45cd1b	61	* The fast version has the same overflow behavior as the standard version but provides
xorjoep	1:24714b45cd1b	62	* less precision since it discards the low 16 bits of each multiplication result.
xorjoep	1:24714b45cd1b	63	* In order to avoid overflows completely the input signals must be scaled down.
xorjoep	1:24714b45cd1b	64	* Scale down one of the input matrices by log2(numColsA) bits to
xorjoep	1:24714b45cd1b	65	* avoid overflows, as a total of numColsA additions are computed internally for each
xorjoep	1:24714b45cd1b	66	* output element.
xorjoep	1:24714b45cd1b	67	*
xorjoep	1:24714b45cd1b	68	* \par
xorjoep	1:24714b45cd1b	69	* See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
xorjoep	1:24714b45cd1b	70	* which uses 64-bit accumulation to provide higher precision.
xorjoep	1:24714b45cd1b	71	*/
xorjoep	1:24714b45cd1b	72
xorjoep	1:24714b45cd1b	73	arm_status arm_mat_mult_fast_q15(
xorjoep	1:24714b45cd1b	74	const arm_matrix_instance_q15 * pSrcA,
xorjoep	1:24714b45cd1b	75	const arm_matrix_instance_q15 * pSrcB,
xorjoep	1:24714b45cd1b	76	arm_matrix_instance_q15 * pDst,
xorjoep	1:24714b45cd1b	77	q15_t * pState)
xorjoep	1:24714b45cd1b	78	{
xorjoep	1:24714b45cd1b	79	q31_t sum; /* accumulator */
xorjoep	1:24714b45cd1b	80	q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
xorjoep	1:24714b45cd1b	81	q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
xorjoep	1:24714b45cd1b	82	q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
xorjoep	1:24714b45cd1b	83	q15_t *px; /* Temporary output data matrix pointer */
xorjoep	1:24714b45cd1b	84	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
xorjoep	1:24714b45cd1b	85	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
xorjoep	1:24714b45cd1b	86	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
xorjoep	1:24714b45cd1b	87	uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */
xorjoep	1:24714b45cd1b	88	uint32_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
xorjoep	1:24714b45cd1b	89	arm_status status; /* status of matrix multiplication */
xorjoep	1:24714b45cd1b	90
xorjoep	1:24714b45cd1b	91	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	92
xorjoep	1:24714b45cd1b	93	q31_t in; /* Temporary variable to hold the input value */
xorjoep	1:24714b45cd1b	94	q31_t inA1, inA2, inB1, inB2;
xorjoep	1:24714b45cd1b	95	q31_t sum2, sum3, sum4;
xorjoep	1:24714b45cd1b	96	q15_t *pInA2, *pInB2, *px2;
xorjoep	1:24714b45cd1b	97	uint32_t j = 0;
xorjoep	1:24714b45cd1b	98
xorjoep	1:24714b45cd1b	99	#else
xorjoep	1:24714b45cd1b	100
xorjoep	1:24714b45cd1b	101	q15_t in; /* Temporary variable to hold the input value */
xorjoep	1:24714b45cd1b	102	q15_t inA1, inA2, inB1, inB2;
xorjoep	1:24714b45cd1b	103
xorjoep	1:24714b45cd1b	104	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
xorjoep	1:24714b45cd1b	105
xorjoep	1:24714b45cd1b	106	#ifdef ARM_MATH_MATRIX_CHECK
xorjoep	1:24714b45cd1b	107	/* Check for matrix mismatch condition */
xorjoep	1:24714b45cd1b	108	if ((pSrcA->numCols != pSrcB->numRows) ||
xorjoep	1:24714b45cd1b	109	(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
xorjoep	1:24714b45cd1b	110	{
xorjoep	1:24714b45cd1b	111	/* Set status as ARM_MATH_SIZE_MISMATCH */
xorjoep	1:24714b45cd1b	112	status = ARM_MATH_SIZE_MISMATCH;
xorjoep	1:24714b45cd1b	113	}
xorjoep	1:24714b45cd1b	114	else
xorjoep	1:24714b45cd1b	115	#endif
xorjoep	1:24714b45cd1b	116	{
xorjoep	1:24714b45cd1b	117	/* Matrix transpose */
xorjoep	1:24714b45cd1b	118	do
xorjoep	1:24714b45cd1b	119	{
xorjoep	1:24714b45cd1b	120	/* Apply loop unrolling and exchange the columns with row elements */
xorjoep	1:24714b45cd1b	121	col = numColsB >> 2;
xorjoep	1:24714b45cd1b	122
xorjoep	1:24714b45cd1b	123	/* The pointer px is set to starting address of the column being processed */
xorjoep	1:24714b45cd1b	124	px = pSrcBT + i;
xorjoep	1:24714b45cd1b	125
xorjoep	1:24714b45cd1b	126	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
xorjoep	1:24714b45cd1b	127	** a second loop below computes the remaining 1 to 3 samples. */
xorjoep	1:24714b45cd1b	128	while (col > 0U)
xorjoep	1:24714b45cd1b	129	{
xorjoep	1:24714b45cd1b	130	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	131	/* Read two elements from the row */
xorjoep	1:24714b45cd1b	132	in = *__SIMD32(pInB)++;
xorjoep	1:24714b45cd1b	133
xorjoep	1:24714b45cd1b	134	/* Unpack and store one element in the destination */
xorjoep	1:24714b45cd1b	135	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	136
xorjoep	1:24714b45cd1b	137	*px = (q15_t) in;
xorjoep	1:24714b45cd1b	138
xorjoep	1:24714b45cd1b	139	#else
xorjoep	1:24714b45cd1b	140
xorjoep	1:24714b45cd1b	141	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
xorjoep	1:24714b45cd1b	142
xorjoep	1:24714b45cd1b	143	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	144
xorjoep	1:24714b45cd1b	145	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	146	px += numRowsB;
xorjoep	1:24714b45cd1b	147
xorjoep	1:24714b45cd1b	148	/* Unpack and store the second element in the destination */
xorjoep	1:24714b45cd1b	149	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	150
xorjoep	1:24714b45cd1b	151	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
xorjoep	1:24714b45cd1b	152
xorjoep	1:24714b45cd1b	153	#else
xorjoep	1:24714b45cd1b	154
xorjoep	1:24714b45cd1b	155	*px = (q15_t) in;
xorjoep	1:24714b45cd1b	156
xorjoep	1:24714b45cd1b	157	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	158
xorjoep	1:24714b45cd1b	159	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	160	px += numRowsB;
xorjoep	1:24714b45cd1b	161
xorjoep	1:24714b45cd1b	162	/* Read two elements from the row */
xorjoep	1:24714b45cd1b	163	in = *__SIMD32(pInB)++;
xorjoep	1:24714b45cd1b	164
xorjoep	1:24714b45cd1b	165	/* Unpack and store one element in the destination */
xorjoep	1:24714b45cd1b	166	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	167
xorjoep	1:24714b45cd1b	168	*px = (q15_t) in;
xorjoep	1:24714b45cd1b	169
xorjoep	1:24714b45cd1b	170	#else
xorjoep	1:24714b45cd1b	171
xorjoep	1:24714b45cd1b	172	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
xorjoep	1:24714b45cd1b	173
xorjoep	1:24714b45cd1b	174	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	175
xorjoep	1:24714b45cd1b	176	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	177	px += numRowsB;
xorjoep	1:24714b45cd1b	178
xorjoep	1:24714b45cd1b	179	/* Unpack and store the second element in the destination */
xorjoep	1:24714b45cd1b	180
xorjoep	1:24714b45cd1b	181	#ifndef ARM_MATH_BIG_ENDIAN
xorjoep	1:24714b45cd1b	182
xorjoep	1:24714b45cd1b	183	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
xorjoep	1:24714b45cd1b	184
xorjoep	1:24714b45cd1b	185	#else
xorjoep	1:24714b45cd1b	186
xorjoep	1:24714b45cd1b	187	*px = (q15_t) in;
xorjoep	1:24714b45cd1b	188
xorjoep	1:24714b45cd1b	189	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep	1:24714b45cd1b	190
xorjoep	1:24714b45cd1b	191	#else
xorjoep	1:24714b45cd1b	192
xorjoep	1:24714b45cd1b	193	/* Read one element from the row */
xorjoep	1:24714b45cd1b	194	in = *pInB++;
xorjoep	1:24714b45cd1b	195
xorjoep	1:24714b45cd1b	196	/* Store one element in the destination */
xorjoep	1:24714b45cd1b	197	*px = in;
xorjoep	1:24714b45cd1b	198
xorjoep	1:24714b45cd1b	199	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	200	px += numRowsB;
xorjoep	1:24714b45cd1b	201
xorjoep	1:24714b45cd1b	202	/* Read one element from the row */
xorjoep	1:24714b45cd1b	203	in = *pInB++;
xorjoep	1:24714b45cd1b	204
xorjoep	1:24714b45cd1b	205	/* Store one element in the destination */
xorjoep	1:24714b45cd1b	206	*px = in;
xorjoep	1:24714b45cd1b	207
xorjoep	1:24714b45cd1b	208	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	209	px += numRowsB;
xorjoep	1:24714b45cd1b	210
xorjoep	1:24714b45cd1b	211	/* Read one element from the row */
xorjoep	1:24714b45cd1b	212	in = *pInB++;
xorjoep	1:24714b45cd1b	213
xorjoep	1:24714b45cd1b	214	/* Store one element in the destination */
xorjoep	1:24714b45cd1b	215	*px = in;
xorjoep	1:24714b45cd1b	216
xorjoep	1:24714b45cd1b	217	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	218	px += numRowsB;
xorjoep	1:24714b45cd1b	219
xorjoep	1:24714b45cd1b	220	/* Read one element from the row */
xorjoep	1:24714b45cd1b	221	in = *pInB++;
xorjoep	1:24714b45cd1b	222
xorjoep	1:24714b45cd1b	223	/* Store one element in the destination */
xorjoep	1:24714b45cd1b	224	*px = in;
xorjoep	1:24714b45cd1b	225
xorjoep	1:24714b45cd1b	226	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
xorjoep	1:24714b45cd1b	227
xorjoep	1:24714b45cd1b	228	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	229	px += numRowsB;
xorjoep	1:24714b45cd1b	230
xorjoep	1:24714b45cd1b	231	/* Decrement the column loop counter */
xorjoep	1:24714b45cd1b	232	col--;
xorjoep	1:24714b45cd1b	233	}
xorjoep	1:24714b45cd1b	234
xorjoep	1:24714b45cd1b	235	/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
xorjoep	1:24714b45cd1b	236	** No loop unrolling is used. */
xorjoep	1:24714b45cd1b	237	col = numColsB % 0x4U;
xorjoep	1:24714b45cd1b	238
xorjoep	1:24714b45cd1b	239	while (col > 0U)
xorjoep	1:24714b45cd1b	240	{
xorjoep	1:24714b45cd1b	241	/* Read and store the input element in the destination */
xorjoep	1:24714b45cd1b	242	*px = *pInB++;
xorjoep	1:24714b45cd1b	243
xorjoep	1:24714b45cd1b	244	/* Update the pointer px to point to the next row of the transposed matrix */
xorjoep	1:24714b45cd1b	245	px += numRowsB;
xorjoep	1:24714b45cd1b	246
xorjoep	1:24714b45cd1b	247	/* Decrement the column loop counter */
xorjoep	1:24714b45cd1b	248	col--;
xorjoep	1:24714b45cd1b	249	}
xorjoep	1:24714b45cd1b	250
xorjoep	1:24714b45cd1b	251	i++;
xorjoep	1:24714b45cd1b	252
xorjoep	1:24714b45cd1b	253	/* Decrement the row loop counter */
xorjoep	1:24714b45cd1b	254	row--;
xorjoep	1:24714b45cd1b	255
xorjoep	1:24714b45cd1b	256	} while (row > 0U);
xorjoep	1:24714b45cd1b	257
xorjoep	1:24714b45cd1b	258	/* Reset the variables for the usage in the following multiplication process */
xorjoep	1:24714b45cd1b	259	row = numRowsA;
xorjoep	1:24714b45cd1b	260	i = 0U;
xorjoep	1:24714b45cd1b	261	px = pDst->pData;
xorjoep	1:24714b45cd1b	262
xorjoep	1:24714b45cd1b	263	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	264	/* Process two rows from matrix A at a time and output two rows at a time */
xorjoep	1:24714b45cd1b	265	row = row >> 1;
xorjoep	1:24714b45cd1b	266	px2 = px + numColsB;
xorjoep	1:24714b45cd1b	267	#endif
xorjoep	1:24714b45cd1b	268
xorjoep	1:24714b45cd1b	269	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
xorjoep	1:24714b45cd1b	270	/* row loop */
xorjoep	1:24714b45cd1b	271	while (row > 0U)
xorjoep	1:24714b45cd1b	272	{
xorjoep	1:24714b45cd1b	273	/* For every row wise process, the column loop counter is to be initiated */
xorjoep	1:24714b45cd1b	274	col = numColsB;
xorjoep	1:24714b45cd1b	275
xorjoep	1:24714b45cd1b	276	/* For every row wise process, the pIn2 pointer is set
xorjoep	1:24714b45cd1b	277	** to the starting address of the transposed pSrcB data */
xorjoep	1:24714b45cd1b	278	pInB = pSrcBT;
xorjoep	1:24714b45cd1b	279
xorjoep	1:24714b45cd1b	280	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	281	/* Process two (transposed) columns from matrix B at a time */
xorjoep	1:24714b45cd1b	282	col = col >> 1;
xorjoep	1:24714b45cd1b	283	j = 0;
xorjoep	1:24714b45cd1b	284	#endif
xorjoep	1:24714b45cd1b	285
xorjoep	1:24714b45cd1b	286	/* column loop */
xorjoep	1:24714b45cd1b	287	while (col > 0U)
xorjoep	1:24714b45cd1b	288	{
xorjoep	1:24714b45cd1b	289	/* Set the variable sum, that acts as accumulator, to zero */
xorjoep	1:24714b45cd1b	290	sum = 0;
xorjoep	1:24714b45cd1b	291
xorjoep	1:24714b45cd1b	292	/* Initiate the pointer pInA to point to the starting address of the column being processed */
xorjoep	1:24714b45cd1b	293	pInA = pSrcA->pData + i;
xorjoep	1:24714b45cd1b	294
xorjoep	1:24714b45cd1b	295	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	296	sum2 = 0;
xorjoep	1:24714b45cd1b	297	sum3 = 0;
xorjoep	1:24714b45cd1b	298	sum4 = 0;
xorjoep	1:24714b45cd1b	299	pInB = pSrcBT + j;
xorjoep	1:24714b45cd1b	300	pInA2 = pInA + numColsA;
xorjoep	1:24714b45cd1b	301	pInB2 = pInB + numRowsB;
xorjoep	1:24714b45cd1b	302
xorjoep	1:24714b45cd1b	303	/* Read in two elements at once - allows dual MAC instruction */
xorjoep	1:24714b45cd1b	304	colCnt = numColsA >> 1;
xorjoep	1:24714b45cd1b	305	#else
xorjoep	1:24714b45cd1b	306	colCnt = numColsA >> 2;
xorjoep	1:24714b45cd1b	307	#endif
xorjoep	1:24714b45cd1b	308
xorjoep	1:24714b45cd1b	309	/* matrix multiplication */
xorjoep	1:24714b45cd1b	310	while (colCnt > 0U)
xorjoep	1:24714b45cd1b	311	{
xorjoep	1:24714b45cd1b	312	/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
xorjoep	1:24714b45cd1b	313	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	314
xorjoep	1:24714b45cd1b	315	inA1 = *__SIMD32(pInA)++;
xorjoep	1:24714b45cd1b	316	inB1 = *__SIMD32(pInB)++;
xorjoep	1:24714b45cd1b	317	inA2 = *__SIMD32(pInA2)++;
xorjoep	1:24714b45cd1b	318	inB2 = *__SIMD32(pInB2)++;
xorjoep	1:24714b45cd1b	319
xorjoep	1:24714b45cd1b	320	sum = __SMLAD(inA1, inB1, sum);
xorjoep	1:24714b45cd1b	321	sum2 = __SMLAD(inA1, inB2, sum2);
xorjoep	1:24714b45cd1b	322	sum3 = __SMLAD(inA2, inB1, sum3);
xorjoep	1:24714b45cd1b	323	sum4 = __SMLAD(inA2, inB2, sum4);
xorjoep	1:24714b45cd1b	324
xorjoep	1:24714b45cd1b	325	#else
xorjoep	1:24714b45cd1b	326
xorjoep	1:24714b45cd1b	327	inA1 = *pInA;
xorjoep	1:24714b45cd1b	328	inB1 = *pInB;
xorjoep	1:24714b45cd1b	329	sum += inA1 * inB1;
xorjoep	1:24714b45cd1b	330
xorjoep	1:24714b45cd1b	331	inA2 = pInA[1];
xorjoep	1:24714b45cd1b	332	inB2 = pInB[1];
xorjoep	1:24714b45cd1b	333	sum += inA2 * inB2;
xorjoep	1:24714b45cd1b	334
xorjoep	1:24714b45cd1b	335	inA1 = pInA[2];
xorjoep	1:24714b45cd1b	336	inB1 = pInB[2];
xorjoep	1:24714b45cd1b	337	sum += inA1 * inB1;
xorjoep	1:24714b45cd1b	338
xorjoep	1:24714b45cd1b	339	inA2 = pInA[3];
xorjoep	1:24714b45cd1b	340	inB2 = pInB[3];
xorjoep	1:24714b45cd1b	341	sum += inA2 * inB2;
xorjoep	1:24714b45cd1b	342
xorjoep	1:24714b45cd1b	343	pInA += 4;
xorjoep	1:24714b45cd1b	344	pInB += 4;
xorjoep	1:24714b45cd1b	345
xorjoep	1:24714b45cd1b	346	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
xorjoep	1:24714b45cd1b	347
xorjoep	1:24714b45cd1b	348	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	349	colCnt--;
xorjoep	1:24714b45cd1b	350	}
xorjoep	1:24714b45cd1b	351
xorjoep	1:24714b45cd1b	352	/* process odd column samples */
xorjoep	1:24714b45cd1b	353	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	354	if (numColsA & 1U) {
xorjoep	1:24714b45cd1b	355	inA1 = *pInA++;
xorjoep	1:24714b45cd1b	356	inB1 = *pInB++;
xorjoep	1:24714b45cd1b	357	inA2 = *pInA2++;
xorjoep	1:24714b45cd1b	358	inB2 = *pInB2++;
xorjoep	1:24714b45cd1b	359	sum += inA1 * inB1;
xorjoep	1:24714b45cd1b	360	sum2 += inA1 * inB2;
xorjoep	1:24714b45cd1b	361	sum3 += inA2 * inB1;
xorjoep	1:24714b45cd1b	362	sum4 += inA2 * inB2;
xorjoep	1:24714b45cd1b	363	}
xorjoep	1:24714b45cd1b	364	#else
xorjoep	1:24714b45cd1b	365	colCnt = numColsA % 0x4U;
xorjoep	1:24714b45cd1b	366
xorjoep	1:24714b45cd1b	367	while (colCnt > 0U)
xorjoep	1:24714b45cd1b	368	{
xorjoep	1:24714b45cd1b	369	/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
xorjoep	1:24714b45cd1b	370	sum += (q31_t) (*pInA++) * (*pInB++);
xorjoep	1:24714b45cd1b	371
xorjoep	1:24714b45cd1b	372	colCnt--;
xorjoep	1:24714b45cd1b	373	}
xorjoep	1:24714b45cd1b	374	#endif
xorjoep	1:24714b45cd1b	375
xorjoep	1:24714b45cd1b	376	/* Saturate and store the result in the destination buffer */
xorjoep	1:24714b45cd1b	377	*px++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	378
xorjoep	1:24714b45cd1b	379	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	380	*px++ = (q15_t) (sum2 >> 15);
xorjoep	1:24714b45cd1b	381	*px2++ = (q15_t) (sum3 >> 15);
xorjoep	1:24714b45cd1b	382	*px2++ = (q15_t) (sum4 >> 15);
xorjoep	1:24714b45cd1b	383	j += numRowsB * 2;
xorjoep	1:24714b45cd1b	384	#endif
xorjoep	1:24714b45cd1b	385
xorjoep	1:24714b45cd1b	386	/* Decrement the column loop counter */
xorjoep	1:24714b45cd1b	387	col--;
xorjoep	1:24714b45cd1b	388
xorjoep	1:24714b45cd1b	389	}
xorjoep	1:24714b45cd1b	390
xorjoep	1:24714b45cd1b	391	i = i + numColsA;
xorjoep	1:24714b45cd1b	392
xorjoep	1:24714b45cd1b	393	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	394	i = i + numColsA;
xorjoep	1:24714b45cd1b	395	px = px2 + (numColsB & 1U);
xorjoep	1:24714b45cd1b	396	px2 = px + numColsB;
xorjoep	1:24714b45cd1b	397	#endif
xorjoep	1:24714b45cd1b	398
xorjoep	1:24714b45cd1b	399	/* Decrement the row loop counter */
xorjoep	1:24714b45cd1b	400	row--;
xorjoep	1:24714b45cd1b	401
xorjoep	1:24714b45cd1b	402	}
xorjoep	1:24714b45cd1b	403
xorjoep	1:24714b45cd1b	404	/* Compute any remaining odd row/column below */
xorjoep	1:24714b45cd1b	405
xorjoep	1:24714b45cd1b	406	#ifndef UNALIGNED_SUPPORT_DISABLE
xorjoep	1:24714b45cd1b	407
xorjoep	1:24714b45cd1b	408	/* Compute remaining output column */
xorjoep	1:24714b45cd1b	409	if (numColsB & 1U) {
xorjoep	1:24714b45cd1b	410
xorjoep	1:24714b45cd1b	411	/* Avoid redundant computation of last element */
xorjoep	1:24714b45cd1b	412	row = numRowsA & (~0x1);
xorjoep	1:24714b45cd1b	413
xorjoep	1:24714b45cd1b	414	/* Point to remaining unfilled column in output matrix */
xorjoep	1:24714b45cd1b	415	px = pDst->pData+numColsB-1;
xorjoep	1:24714b45cd1b	416	pInA = pSrcA->pData;
xorjoep	1:24714b45cd1b	417
xorjoep	1:24714b45cd1b	418	/* row loop */
xorjoep	1:24714b45cd1b	419	while (row > 0)
xorjoep	1:24714b45cd1b	420	{
xorjoep	1:24714b45cd1b	421
xorjoep	1:24714b45cd1b	422	/* point to last column in matrix B */
xorjoep	1:24714b45cd1b	423	pInB = pSrcBT + numRowsB*(numColsB-1);
xorjoep	1:24714b45cd1b	424
xorjoep	1:24714b45cd1b	425	/* Set the variable sum, that acts as accumulator, to zero */
xorjoep	1:24714b45cd1b	426	sum = 0;
xorjoep	1:24714b45cd1b	427
xorjoep	1:24714b45cd1b	428	/* Compute 4 columns at once */
xorjoep	1:24714b45cd1b	429	colCnt = numColsA >> 2;
xorjoep	1:24714b45cd1b	430
xorjoep	1:24714b45cd1b	431	/* matrix multiplication */
xorjoep	1:24714b45cd1b	432	while (colCnt > 0U)
xorjoep	1:24714b45cd1b	433	{
xorjoep	1:24714b45cd1b	434	inA1 = *__SIMD32(pInA)++;
xorjoep	1:24714b45cd1b	435	inA2 = *__SIMD32(pInA)++;
xorjoep	1:24714b45cd1b	436	inB1 = *__SIMD32(pInB)++;
xorjoep	1:24714b45cd1b	437	inB2 = *__SIMD32(pInB)++;
xorjoep	1:24714b45cd1b	438
xorjoep	1:24714b45cd1b	439	sum = __SMLAD(inA1, inB1, sum);
xorjoep	1:24714b45cd1b	440	sum = __SMLAD(inA2, inB2, sum);
xorjoep	1:24714b45cd1b	441
xorjoep	1:24714b45cd1b	442	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	443	colCnt--;
xorjoep	1:24714b45cd1b	444	}
xorjoep	1:24714b45cd1b	445
xorjoep	1:24714b45cd1b	446	colCnt = numColsA & 3U;
xorjoep	1:24714b45cd1b	447	while (colCnt > 0U) {
xorjoep	1:24714b45cd1b	448	sum += (q31_t) (*pInA++) * (*pInB++);
xorjoep	1:24714b45cd1b	449	colCnt--;
xorjoep	1:24714b45cd1b	450	}
xorjoep	1:24714b45cd1b	451
xorjoep	1:24714b45cd1b	452	/* Store the result in the destination buffer */
xorjoep	1:24714b45cd1b	453	*px = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	454	px += numColsB;
xorjoep	1:24714b45cd1b	455
xorjoep	1:24714b45cd1b	456	/* Decrement the row loop counter */
xorjoep	1:24714b45cd1b	457	row--;
xorjoep	1:24714b45cd1b	458	}
xorjoep	1:24714b45cd1b	459	}
xorjoep	1:24714b45cd1b	460
xorjoep	1:24714b45cd1b	461	/* Compute remaining output row */
xorjoep	1:24714b45cd1b	462	if (numRowsA & 1U) {
xorjoep	1:24714b45cd1b	463
xorjoep	1:24714b45cd1b	464	/* point to last row in output matrix */
xorjoep	1:24714b45cd1b	465	px = pDst->pData+(numColsB)*(numRowsA-1);
xorjoep	1:24714b45cd1b	466
xorjoep	1:24714b45cd1b	467	pInB = pSrcBT;
xorjoep	1:24714b45cd1b	468	col = numColsB;
xorjoep	1:24714b45cd1b	469	i = 0U;
xorjoep	1:24714b45cd1b	470
xorjoep	1:24714b45cd1b	471	/* col loop */
xorjoep	1:24714b45cd1b	472	while (col > 0)
xorjoep	1:24714b45cd1b	473	{
xorjoep	1:24714b45cd1b	474
xorjoep	1:24714b45cd1b	475	/* point to last row in matrix A */
xorjoep	1:24714b45cd1b	476	pInA = pSrcA->pData + (numRowsA-1)*numColsA;
xorjoep	1:24714b45cd1b	477
xorjoep	1:24714b45cd1b	478	/* Set the variable sum, that acts as accumulator, to zero */
xorjoep	1:24714b45cd1b	479	sum = 0;
xorjoep	1:24714b45cd1b	480
xorjoep	1:24714b45cd1b	481	/* Compute 4 columns at once */
xorjoep	1:24714b45cd1b	482	colCnt = numColsA >> 2;
xorjoep	1:24714b45cd1b	483
xorjoep	1:24714b45cd1b	484	/* matrix multiplication */
xorjoep	1:24714b45cd1b	485	while (colCnt > 0U)
xorjoep	1:24714b45cd1b	486	{
xorjoep	1:24714b45cd1b	487	inA1 = *__SIMD32(pInA)++;
xorjoep	1:24714b45cd1b	488	inA2 = *__SIMD32(pInA)++;
xorjoep	1:24714b45cd1b	489	inB1 = *__SIMD32(pInB)++;
xorjoep	1:24714b45cd1b	490	inB2 = *__SIMD32(pInB)++;
xorjoep	1:24714b45cd1b	491
xorjoep	1:24714b45cd1b	492	sum = __SMLAD(inA1, inB1, sum);
xorjoep	1:24714b45cd1b	493	sum = __SMLAD(inA2, inB2, sum);
xorjoep	1:24714b45cd1b	494
xorjoep	1:24714b45cd1b	495	/* Decrement the loop counter */
xorjoep	1:24714b45cd1b	496	colCnt--;
xorjoep	1:24714b45cd1b	497	}
xorjoep	1:24714b45cd1b	498
xorjoep	1:24714b45cd1b	499	colCnt = numColsA & 3U;
xorjoep	1:24714b45cd1b	500	while (colCnt > 0U) {
xorjoep	1:24714b45cd1b	501	sum += (q31_t) (*pInA++) * (*pInB++);
xorjoep	1:24714b45cd1b	502	colCnt--;
xorjoep	1:24714b45cd1b	503	}
xorjoep	1:24714b45cd1b	504
xorjoep	1:24714b45cd1b	505	/* Store the result in the destination buffer */
xorjoep	1:24714b45cd1b	506	*px++ = (q15_t) (sum >> 15);
xorjoep	1:24714b45cd1b	507
xorjoep	1:24714b45cd1b	508	/* Decrement the col loop counter */
xorjoep	1:24714b45cd1b	509	col--;
xorjoep	1:24714b45cd1b	510	}
xorjoep	1:24714b45cd1b	511	}
xorjoep	1:24714b45cd1b	512
xorjoep	1:24714b45cd1b	513	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
xorjoep	1:24714b45cd1b	514
xorjoep	1:24714b45cd1b	515	/* set status as ARM_MATH_SUCCESS */
xorjoep	1:24714b45cd1b	516	status = ARM_MATH_SUCCESS;
xorjoep	1:24714b45cd1b	517	}
xorjoep	1:24714b45cd1b	518
xorjoep	1:24714b45cd1b	519	/* Return to application */
xorjoep	1:24714b45cd1b	520	return (status);
xorjoep	1:24714b45cd1b	521	}
xorjoep	1:24714b45cd1b	522
xorjoep	1:24714b45cd1b	523	/**
xorjoep	1:24714b45cd1b	524	* @} end of MatrixMult group
xorjoep	1:24714b45cd1b	525	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	20 Jun 2018
Imports:	227
Forks:	0
Commits:	4
Dependents:	10
Dependencies:	0
Followers:	6

functions/MatrixFunctions/arm_mat_mult_fast_q15.c@3:4098b9d3d571, 2018-06-21 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning