mbed-dsp - CMSIS DSP library

mbed official » Code » mbed-dsp

CMSIS DSP library

Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

cmsis_dsp/MatrixFunctions/arm_mat_mult_q31.c@2:da51fb522205, 2013-05-30 (annotated)

Committer:: emilmont
Date:: Thu May 30 17:10:11 2013 +0100
Revision:: 2:da51fb522205
Parent:: 1:fdd22bb7aa52
Child:: 3:7a284390b0ce

Keep "cmsis-dsp" module in synch with its source

Who changed what in which revision?

User	Revision	Line number	New contents of line
emilmont	1:fdd22bb7aa52	1	/* ----------------------------------------------------------------------
emilmont	1:fdd22bb7aa52	2	* Copyright (C) 2010 ARM Limited. All rights reserved.
emilmont	1:fdd22bb7aa52	3	*
emilmont	1:fdd22bb7aa52	4	* $Date: 15. February 2012
emilmont	2:da51fb522205	5	* $Revision: V1.1.0
emilmont	1:fdd22bb7aa52	6	*
emilmont	2:da51fb522205	7	* Project: CMSIS DSP Library
emilmont	2:da51fb522205	8	* Title: arm_mat_mult_q31.c
emilmont	1:fdd22bb7aa52	9	*
emilmont	2:da51fb522205	10	* Description: Q31 matrix multiplication.
emilmont	1:fdd22bb7aa52	11	*
emilmont	1:fdd22bb7aa52	12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
emilmont	1:fdd22bb7aa52	13	*
emilmont	1:fdd22bb7aa52	14	* Version 1.1.0 2012/02/15
emilmont	1:fdd22bb7aa52	15	* Updated with more optimizations, bug fixes and minor API changes.
emilmont	1:fdd22bb7aa52	16	*
emilmont	1:fdd22bb7aa52	17	* Version 1.0.10 2011/7/15
emilmont	1:fdd22bb7aa52	18	* Big Endian support added and Merged M0 and M3/M4 Source code.
emilmont	1:fdd22bb7aa52	19	*
emilmont	1:fdd22bb7aa52	20	* Version 1.0.3 2010/11/29
emilmont	1:fdd22bb7aa52	21	* Re-organized the CMSIS folders and updated documentation.
emilmont	1:fdd22bb7aa52	22	*
emilmont	1:fdd22bb7aa52	23	* Version 1.0.2 2010/11/11
emilmont	1:fdd22bb7aa52	24	* Documentation updated.
emilmont	1:fdd22bb7aa52	25	*
emilmont	1:fdd22bb7aa52	26	* Version 1.0.1 2010/10/05
emilmont	1:fdd22bb7aa52	27	* Production release and review comments incorporated.
emilmont	1:fdd22bb7aa52	28	*
emilmont	1:fdd22bb7aa52	29	* Version 1.0.0 2010/09/20
emilmont	1:fdd22bb7aa52	30	* Production release and review comments incorporated.
emilmont	1:fdd22bb7aa52	31	*
emilmont	1:fdd22bb7aa52	32	* Version 0.0.5 2010/04/26
emilmont	1:fdd22bb7aa52	33	* incorporated review comments and updated with latest CMSIS layer
emilmont	1:fdd22bb7aa52	34	*
emilmont	1:fdd22bb7aa52	35	* Version 0.0.3 2010/03/10
emilmont	1:fdd22bb7aa52	36	* Initial version
emilmont	1:fdd22bb7aa52	37	* -------------------------------------------------------------------- */
emilmont	1:fdd22bb7aa52	38
emilmont	1:fdd22bb7aa52	39	#include "arm_math.h"
emilmont	1:fdd22bb7aa52	40
emilmont	1:fdd22bb7aa52	41	/**
emilmont	1:fdd22bb7aa52	42	* @ingroup groupMatrix
emilmont	1:fdd22bb7aa52	43	*/
emilmont	1:fdd22bb7aa52	44
emilmont	1:fdd22bb7aa52	45	/**
emilmont	1:fdd22bb7aa52	46	* @addtogroup MatrixMult
emilmont	1:fdd22bb7aa52	47	* @{
emilmont	1:fdd22bb7aa52	48	*/
emilmont	1:fdd22bb7aa52	49
emilmont	1:fdd22bb7aa52	50	/**
emilmont	1:fdd22bb7aa52	51	* @brief Q31 matrix multiplication
emilmont	1:fdd22bb7aa52	52	* @param[in] *pSrcA points to the first input matrix structure
emilmont	1:fdd22bb7aa52	53	* @param[in] *pSrcB points to the second input matrix structure
emilmont	1:fdd22bb7aa52	54	* @param[out] *pDst points to output matrix structure
emilmont	2:da51fb522205	55	* @return The function returns either
emilmont	1:fdd22bb7aa52	56	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
emilmont	1:fdd22bb7aa52	57	*
emilmont	1:fdd22bb7aa52	58	* @details
emilmont	1:fdd22bb7aa52	59	* <b>Scaling and Overflow Behavior:</b>
emilmont	1:fdd22bb7aa52	60	*
emilmont	1:fdd22bb7aa52	61	* \par
emilmont	1:fdd22bb7aa52	62	* The function is implemented using an internal 64-bit accumulator.
emilmont	1:fdd22bb7aa52	63	* The accumulator has a 2.62 format and maintains full precision of the intermediate
emilmont	1:fdd22bb7aa52	64	* multiplication results but provides only a single guard bit. There is no saturation
emilmont	1:fdd22bb7aa52	65	* on intermediate additions. Thus, if the accumulator overflows it wraps around and
emilmont	1:fdd22bb7aa52	66	* distorts the result. The input signals should be scaled down to avoid intermediate
emilmont	1:fdd22bb7aa52	67	* overflows. The input should thus be scaled down by log2(numColsA) bits by the caller
emilmont	1:fdd22bb7aa52	68	* to avoid overflows, as a total of numColsA additions are performed internally.
emilmont	1:fdd22bb7aa52	69	* The 2.62 accumulator is right shifted by 31 bits and cast to 1.31 format to yield the final result; no saturation is applied at this step.
emilmont	1:fdd22bb7aa52	70	*
emilmont	1:fdd22bb7aa52	71	* \par
emilmont	1:fdd22bb7aa52	72	* See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
emilmont	1:fdd22bb7aa52	73	*
emilmont	1:fdd22bb7aa52	74	*/
emilmont	1:fdd22bb7aa52	75
emilmont	1:fdd22bb7aa52	76	arm_status arm_mat_mult_q31(
emilmont	1:fdd22bb7aa52	77	const arm_matrix_instance_q31 * pSrcA,
emilmont	1:fdd22bb7aa52	78	const arm_matrix_instance_q31 * pSrcB,
emilmont	1:fdd22bb7aa52	79	arm_matrix_instance_q31 * pDst)
emilmont	1:fdd22bb7aa52	80	{
emilmont	1:fdd22bb7aa52	81	q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
emilmont	1:fdd22bb7aa52	82	q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
emilmont	1:fdd22bb7aa52	83	q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
emilmont	1:fdd22bb7aa52	84	q31_t *pOut = pDst->pData; /* output data matrix pointer */
emilmont	1:fdd22bb7aa52	85	q31_t *px; /* Temporary output data matrix pointer */
emilmont	1:fdd22bb7aa52	86	q63_t sum; /* Accumulator */
emilmont	1:fdd22bb7aa52	87	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
emilmont	1:fdd22bb7aa52	88	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
emilmont	1:fdd22bb7aa52	89	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
emilmont	1:fdd22bb7aa52	90
emilmont	1:fdd22bb7aa52	91	#ifndef ARM_MATH_CM0
emilmont	1:fdd22bb7aa52	92
emilmont	1:fdd22bb7aa52	93	/* Run the below code for Cortex-M4 and Cortex-M3 */
emilmont	1:fdd22bb7aa52	94
emilmont	1:fdd22bb7aa52	95	uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
emilmont	1:fdd22bb7aa52	96	arm_status status; /* status of matrix multiplication */
emilmont	1:fdd22bb7aa52	97	q31_t a0, a1, a2, a3, b0, b1, b2, b3;
emilmont	1:fdd22bb7aa52	98
emilmont	1:fdd22bb7aa52	99	#ifdef ARM_MATH_MATRIX_CHECK
emilmont	1:fdd22bb7aa52	100
emilmont	1:fdd22bb7aa52	101
emilmont	1:fdd22bb7aa52	102	/* Check for matrix mismatch condition */
emilmont	1:fdd22bb7aa52	103	if((pSrcA->numCols != pSrcB->numRows) ||
emilmont	1:fdd22bb7aa52	104	(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
emilmont	1:fdd22bb7aa52	105	{
emilmont	1:fdd22bb7aa52	106	/* Set status as ARM_MATH_SIZE_MISMATCH */
emilmont	1:fdd22bb7aa52	107	status = ARM_MATH_SIZE_MISMATCH;
emilmont	1:fdd22bb7aa52	108	}
emilmont	1:fdd22bb7aa52	109	else
emilmont	1:fdd22bb7aa52	110	#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
emilmont	1:fdd22bb7aa52	111
emilmont	1:fdd22bb7aa52	112	{
emilmont	1:fdd22bb7aa52	113	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
emilmont	1:fdd22bb7aa52	114	/* row loop */
emilmont	1:fdd22bb7aa52	115	do
emilmont	1:fdd22bb7aa52	116	{
emilmont	1:fdd22bb7aa52	117	/* Output pointer is set to starting address of the row being processed */
emilmont	1:fdd22bb7aa52	118	px = pOut + i;
emilmont	1:fdd22bb7aa52	119
emilmont	1:fdd22bb7aa52	120	/* For every row wise process, the column loop counter is to be initiated */
emilmont	1:fdd22bb7aa52	121	col = numColsB;
emilmont	1:fdd22bb7aa52	122
emilmont	1:fdd22bb7aa52	123	/* For every row wise process, the pIn2 pointer is set
emilmont	1:fdd22bb7aa52	124	** to the starting address of the pSrcB data */
emilmont	1:fdd22bb7aa52	125	pIn2 = pSrcB->pData;
emilmont	1:fdd22bb7aa52	126
emilmont	1:fdd22bb7aa52	127	j = 0u;
emilmont	1:fdd22bb7aa52	128
emilmont	1:fdd22bb7aa52	129	/* column loop */
emilmont	1:fdd22bb7aa52	130	do
emilmont	1:fdd22bb7aa52	131	{
emilmont	1:fdd22bb7aa52	132	/* Set the variable sum, that acts as accumulator, to zero */
emilmont	1:fdd22bb7aa52	133	sum = 0;
emilmont	1:fdd22bb7aa52	134
emilmont	1:fdd22bb7aa52	135	/* Initiate the pointer pIn1 to point to the starting address of pInA */
emilmont	1:fdd22bb7aa52	136	pIn1 = pInA;
emilmont	1:fdd22bb7aa52	137
emilmont	1:fdd22bb7aa52	138	/* Apply loop unrolling and compute 4 MACs simultaneously. */
emilmont	1:fdd22bb7aa52	139	colCnt = numColsA >> 2;
emilmont	1:fdd22bb7aa52	140
emilmont	1:fdd22bb7aa52	141
emilmont	1:fdd22bb7aa52	142	/* matrix multiplication */
emilmont	1:fdd22bb7aa52	143	while(colCnt > 0u)
emilmont	1:fdd22bb7aa52	144	{
emilmont	1:fdd22bb7aa52	145	/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + .... + a(m,p)*b(p,n) */
emilmont	1:fdd22bb7aa52	146	/* Perform the multiply-accumulates */
emilmont	1:fdd22bb7aa52	147	b0 = *pIn2;
emilmont	1:fdd22bb7aa52	148	pIn2 += numColsB;
emilmont	1:fdd22bb7aa52	149
emilmont	1:fdd22bb7aa52	150	a0 = *pIn1++;
emilmont	1:fdd22bb7aa52	151	a1 = *pIn1++;
emilmont	1:fdd22bb7aa52	152
emilmont	1:fdd22bb7aa52	153	b1 = *pIn2;
emilmont	1:fdd22bb7aa52	154	pIn2 += numColsB;
emilmont	1:fdd22bb7aa52	155	b2 = *pIn2;
emilmont	1:fdd22bb7aa52	156	pIn2 += numColsB;
emilmont	1:fdd22bb7aa52	157
emilmont	1:fdd22bb7aa52	158	sum += (q63_t) a0 *b0;
emilmont	1:fdd22bb7aa52	159	sum += (q63_t) a1 *b1;
emilmont	1:fdd22bb7aa52	160
emilmont	1:fdd22bb7aa52	161	a2 = *pIn1++;
emilmont	1:fdd22bb7aa52	162	a3 = *pIn1++;
emilmont	1:fdd22bb7aa52	163
emilmont	1:fdd22bb7aa52	164	b3 = *pIn2;
emilmont	1:fdd22bb7aa52	165	pIn2 += numColsB;
emilmont	1:fdd22bb7aa52	166
emilmont	1:fdd22bb7aa52	167	sum += (q63_t) a2 *b2;
emilmont	1:fdd22bb7aa52	168	sum += (q63_t) a3 *b3;
emilmont	1:fdd22bb7aa52	169
emilmont	1:fdd22bb7aa52	170	/* Decrement the loop counter */
emilmont	1:fdd22bb7aa52	171	colCnt--;
emilmont	1:fdd22bb7aa52	172	}
emilmont	1:fdd22bb7aa52	173
emilmont	1:fdd22bb7aa52	174	/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
emilmont	1:fdd22bb7aa52	175	** No loop unrolling is used. */
emilmont	1:fdd22bb7aa52	176	colCnt = numColsA % 0x4u;
emilmont	1:fdd22bb7aa52	177
emilmont	1:fdd22bb7aa52	178	while(colCnt > 0u)
emilmont	1:fdd22bb7aa52	179	{
emilmont	1:fdd22bb7aa52	180	/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + .... + a(m,p)*b(p,n) */
emilmont	1:fdd22bb7aa52	181	/* Perform the multiply-accumulates */
emilmont	1:fdd22bb7aa52	182	sum += (q63_t) * pIn1++ * *pIn2;
emilmont	1:fdd22bb7aa52	183	pIn2 += numColsB;
emilmont	1:fdd22bb7aa52	184
emilmont	1:fdd22bb7aa52	185	/* Decrement the loop counter */
emilmont	1:fdd22bb7aa52	186	colCnt--;
emilmont	1:fdd22bb7aa52	187	}
emilmont	1:fdd22bb7aa52	188
emilmont	1:fdd22bb7aa52	189	/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
emilmont	1:fdd22bb7aa52	190	*px++ = (q31_t) (sum >> 31);
emilmont	1:fdd22bb7aa52	191
emilmont	1:fdd22bb7aa52	192	/* Update the pointer pIn2 to point to the starting address of the next column */
emilmont	1:fdd22bb7aa52	193	j++;
emilmont	1:fdd22bb7aa52	194	pIn2 = (pSrcB->pData) + j;
emilmont	1:fdd22bb7aa52	195
emilmont	1:fdd22bb7aa52	196	/* Decrement the column loop counter */
emilmont	1:fdd22bb7aa52	197	col--;
emilmont	1:fdd22bb7aa52	198
emilmont	1:fdd22bb7aa52	199	} while(col > 0u);
emilmont	1:fdd22bb7aa52	200
emilmont	1:fdd22bb7aa52	201	#else
emilmont	1:fdd22bb7aa52	202
emilmont	1:fdd22bb7aa52	203	/* Run the below code for Cortex-M0 */
emilmont	1:fdd22bb7aa52	204
emilmont	1:fdd22bb7aa52	205	q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
emilmont	1:fdd22bb7aa52	206	uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */
emilmont	1:fdd22bb7aa52	207	arm_status status; /* status of matrix multiplication */
emilmont	1:fdd22bb7aa52	208
emilmont	1:fdd22bb7aa52	209
emilmont	1:fdd22bb7aa52	210	#ifdef ARM_MATH_MATRIX_CHECK
emilmont	1:fdd22bb7aa52	211
emilmont	1:fdd22bb7aa52	212	/* Check for matrix mismatch condition */
emilmont	1:fdd22bb7aa52	213	if((pSrcA->numCols != pSrcB->numRows) ||
emilmont	1:fdd22bb7aa52	214	(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
emilmont	1:fdd22bb7aa52	215	{
emilmont	1:fdd22bb7aa52	216	/* Set status as ARM_MATH_SIZE_MISMATCH */
emilmont	1:fdd22bb7aa52	217	status = ARM_MATH_SIZE_MISMATCH;
emilmont	1:fdd22bb7aa52	218	}
emilmont	1:fdd22bb7aa52	219	else
emilmont	1:fdd22bb7aa52	220	#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
emilmont	1:fdd22bb7aa52	221
emilmont	1:fdd22bb7aa52	222	{
emilmont	1:fdd22bb7aa52	223	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
emilmont	1:fdd22bb7aa52	224	/* row loop */
emilmont	1:fdd22bb7aa52	225	do
emilmont	1:fdd22bb7aa52	226	{
emilmont	1:fdd22bb7aa52	227	/* Output pointer is set to starting address of the row being processed */
emilmont	1:fdd22bb7aa52	228	px = pOut + i;
emilmont	1:fdd22bb7aa52	229
emilmont	1:fdd22bb7aa52	230	/* For every row wise process, the column loop counter is to be initiated */
emilmont	1:fdd22bb7aa52	231	col = numColsB;
emilmont	1:fdd22bb7aa52	232
emilmont	1:fdd22bb7aa52	233	/* For every row wise process, the pIn2 pointer is set
emilmont	1:fdd22bb7aa52	234	** to the starting address of the pSrcB data */
emilmont	1:fdd22bb7aa52	235	pIn2 = pSrcB->pData;
emilmont	1:fdd22bb7aa52	236
emilmont	1:fdd22bb7aa52	237	/* column loop */
emilmont	1:fdd22bb7aa52	238	do
emilmont	1:fdd22bb7aa52	239	{
emilmont	1:fdd22bb7aa52	240	/* Set the variable sum, that acts as accumulator, to zero */
emilmont	1:fdd22bb7aa52	241	sum = 0;
emilmont	1:fdd22bb7aa52	242
emilmont	1:fdd22bb7aa52	243	/* Initiate the pointer pIn1 to point to the starting address of pInA */
emilmont	1:fdd22bb7aa52	244	pIn1 = pInA;
emilmont	1:fdd22bb7aa52	245
emilmont	1:fdd22bb7aa52	246	/* Matrix A columns number of MAC operations are to be performed */
emilmont	1:fdd22bb7aa52	247	colCnt = numColsA;
emilmont	1:fdd22bb7aa52	248
emilmont	1:fdd22bb7aa52	249	/* matrix multiplication */
emilmont	1:fdd22bb7aa52	250	while(colCnt > 0u)
emilmont	1:fdd22bb7aa52	251	{
emilmont	1:fdd22bb7aa52	252	/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + .... + a(m,p)*b(p,n) */
emilmont	1:fdd22bb7aa52	253	/* Perform the multiply-accumulates */
emilmont	1:fdd22bb7aa52	254	sum += (q63_t) * pIn1++ * *pIn2;
emilmont	1:fdd22bb7aa52	255	pIn2 += numColsB;
emilmont	1:fdd22bb7aa52	256
emilmont	1:fdd22bb7aa52	257	/* Decrement the loop counter */
emilmont	1:fdd22bb7aa52	258	colCnt--;
emilmont	1:fdd22bb7aa52	259	}
emilmont	1:fdd22bb7aa52	260
emilmont	1:fdd22bb7aa52	261	/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
emilmont	1:fdd22bb7aa52	262	*px++ = (q31_t) (sum >> 31);
emilmont	1:fdd22bb7aa52	263
emilmont	1:fdd22bb7aa52	264	/* Decrement the column loop counter */
emilmont	1:fdd22bb7aa52	265	col--;
emilmont	1:fdd22bb7aa52	266
emilmont	1:fdd22bb7aa52	267	/* Update the pointer pIn2 to point to the starting address of the next column */
emilmont	1:fdd22bb7aa52	268	pIn2 = pInB + (numColsB - col);
emilmont	1:fdd22bb7aa52	269
emilmont	1:fdd22bb7aa52	270	} while(col > 0u);
emilmont	1:fdd22bb7aa52	271
emilmont	1:fdd22bb7aa52	272	#endif
emilmont	1:fdd22bb7aa52	273
emilmont	1:fdd22bb7aa52	274	/* Update the pointer pInA to point to the starting address of the next row */
emilmont	1:fdd22bb7aa52	275	i = i + numColsB;
emilmont	1:fdd22bb7aa52	276	pInA = pInA + numColsA;
emilmont	1:fdd22bb7aa52	277
emilmont	1:fdd22bb7aa52	278	/* Decrement the row loop counter */
emilmont	1:fdd22bb7aa52	279	row--;
emilmont	1:fdd22bb7aa52	280
emilmont	1:fdd22bb7aa52	281	} while(row > 0u);
emilmont	1:fdd22bb7aa52	282
emilmont	1:fdd22bb7aa52	283	/* set status as ARM_MATH_SUCCESS */
emilmont	1:fdd22bb7aa52	284	status = ARM_MATH_SUCCESS;
emilmont	1:fdd22bb7aa52	285	}
emilmont	1:fdd22bb7aa52	286	/* Return to application */
emilmont	1:fdd22bb7aa52	287	return (status);
emilmont	1:fdd22bb7aa52	288	}
emilmont	1:fdd22bb7aa52	289
emilmont	1:fdd22bb7aa52	290	/**
emilmont	1:fdd22bb7aa52	291	* @} end of MatrixMult group
emilmont	1:fdd22bb7aa52	292	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	11 Feb 2014
Imports:	270
Forks:	0
Commits:	4
Dependents:	55
Dependencies:	0
Followers:	25

The code in this repository is MIT licensed.

cmsis_dsp/MatrixFunctions/arm_mat_mult_q31.c@2:da51fb522205, 2013-05-30 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning