Camera_driver: refactored version of camera driver

This commit is contained in:
Petr Malanik
2023-01-23 19:36:57 +01:00
parent 34b9eaafc2
commit d99e4b27e6
1292 changed files with 1100905 additions and 0 deletions

View File

@ -0,0 +1,16 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPMatrix)
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPMatrix STATIC ${SRC})
configdsp(CMSISDSPMatrix ..)
### Includes
target_include_directories(CMSISDSPMatrix PUBLIC "${DSP}/../../Include")

View File

@ -0,0 +1,53 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f32.c"
#include "arm_mat_add_q15.c"
#include "arm_mat_add_q31.c"
#include "arm_mat_cmplx_mult_f32.c"
#include "arm_mat_cmplx_mult_q15.c"
#include "arm_mat_cmplx_mult_q31.c"
#include "arm_mat_init_f32.c"
#include "arm_mat_init_q15.c"
#include "arm_mat_init_q31.c"
#include "arm_mat_inverse_f32.c"
#include "arm_mat_inverse_f64.c"
#include "arm_mat_mult_f32.c"
#include "arm_mat_mult_fast_q15.c"
#include "arm_mat_mult_fast_q31.c"
#include "arm_mat_mult_q15.c"
#include "arm_mat_mult_q31.c"
#include "arm_mat_scale_f32.c"
#include "arm_mat_scale_q15.c"
#include "arm_mat_scale_q31.c"
#include "arm_mat_sub_f32.c"
#include "arm_mat_sub_q15.c"
#include "arm_mat_sub_q31.c"
#include "arm_mat_trans_f32.c"
#include "arm_mat_trans_q15.c"
#include "arm_mat_trans_q31.c"

View File

@ -0,0 +1,232 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_f32.c
* Description: Floating-point matrix addition
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixAdd Matrix Addition
Adds two matrices.
\image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
/*
Neon version is assuming the matrix is small enough.
So no blocking is used for taking into account cache effects.
For big matrix, there exist better libraries for Neon.
*/
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
*pOut++ = (*pIn1++) + (*pIn2++);
/* Decrement the loop counter */
blkCnt--;
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
@} end of MatrixAdd group
*/

View File

@ -0,0 +1,149 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_q15.c
* Description: Q15 matrix addition
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Q15 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixAdd group
*/

View File

@ -0,0 +1,139 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_q31.c
* Description: Q31 matrix addition
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Q31 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixAdd group
*/

View File

@ -0,0 +1,631 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_mult_f32.c
* Description: Floating-point matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup CmplxMatrixMult Complex Matrix Multiplication
Complex Matrix multiplication is only defined if the number of columns of the
first matrix equals the number of rows of the second matrix.
Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
in an <code>M x P</code> matrix.
@par
When matrix size checking is enabled, the functions check:
- that the inner dimensions of <code>pSrcA</code> and <code>pSrcB</code> are equal;
- that the size of the output matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Floating-point Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
float32_t sumReal1, sumImag1; /* accumulator */
float32_t a0, b0, c0, d0;
float32_t a1, a1B,b1, b1B, c1, d1;
float32_t sumReal2, sumImag2; /* accumulator */
float32x4x2_t a0V, a1V;
float32x4_t accR0,accI0, accR1,accI1,tempR, tempI;
float32x2_t accum = vdup_n_f32(0);
float32_t *pIn1B = pSrcA->pData;
uint16_t col, i = 0U, j, rowCnt, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
float32_t sumReal1B, sumImag1B;
float32_t sumReal2B, sumImag2B;
float32_t *pxB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
rowCnt = row >> 1;
/* Row loop */
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
pxB = px + 2 * numColsB;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0f;
sumImag1 = 0.0f;
sumReal1B = 0.0f;
sumImag1B = 0.0f;
sumReal2 = 0.0f;
sumImag2 = 0.0f;
sumReal2B = 0.0f;
sumImag2B = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
pIn1B = pIn1 + 2*numColsA;
accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0);
accR1 = vdupq_n_f32(0.0);
accI1 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
a1V = vld2q_f32(pIn1B); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 8;
pIn1B += 8;
tempR[0] = *pIn2;
tempI[0] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[1] = *pIn2;
tempI[1] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[2] = *pIn2;
tempI[2] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[3] = *pIn2;
tempI[3] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
accR1 = vmlaq_f32(accR1,a1V.val[0],tempR);
accR1 = vmlsq_f32(accR1,a1V.val[1],tempI);
accI1 = vmlaq_f32(accI1,a1V.val[1],tempR);
accI1 = vmlaq_f32(accI1,a1V.val[0],tempI);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
sumReal1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
sumImag1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accR1), vget_high_f32(accR1));
sumReal1B += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accI1), vget_high_f32(accI1));
sumImag1B += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a1 = *pIn1;
a1B = *pIn1B;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
b1B = *(pIn1B + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
sumReal1B += a1B * c1;
sumImag1B += b1B * c1;
pIn1 += 2U;
pIn1B += 2U;
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
sumReal2B -= b1B * d1;
sumImag2B += a1B * d1;
/* Decrement the loop counter */
colCnt--;
}
sumReal1 += sumReal2;
sumImag1 += sumImag2;
sumReal1B += sumReal2B;
sumImag1B += sumImag2B;
/* Store the result in the destination buffer */
*px++ = sumReal1;
*px++ = sumImag1;
*pxB++ = sumReal1B;
*pxB++ = sumImag1B;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement the column loop counter */
col--;
}
/* Update the pointer pInA to point to the starting address of the next 2 row */
i = i + 2*numColsB;
pInA = pInA + 4 * numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
rowCnt = row & 1;
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0f;
sumImag1 = 0.0f;
sumReal2 = 0.0f;
sumImag2 = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 8;
tempR[0] = *pIn2;
tempI[0] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[1] = *pIn2;
tempI[1] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[2] = *pIn2;
tempI[2] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
tempR[3] = *pIn2;
tempI[3] = *(pIn2 + 1U);
pIn2 += 2 * numColsB;
accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
sumReal1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
sumImag1 += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
pIn1 += 2U;
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
/* Decrement the loop counter */
colCnt--;
}
sumReal1 += sumReal2;
sumImag1 += sumImag2;
/* Store the result in the destination buffer */
*px++ = sumReal1;
*px++ = sumImag1;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement the column loop counter */
col--;
}
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
float32_t sumReal, sumImag; /* Accumulator */
float32_t a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
float32_t a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal = 0.0f;
sumImag = 0.0f;
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0 = *pIn1;
/* Reading real part of complex matrix B */
c0 = *pIn2;
/* Reading imaginary part of complex matrix A */
b0 = *(pIn1 + 1U);
/* Reading imaginary part of complex matrix B */
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
a0 = *(pIn1 );
c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop count */
colCnt--;
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
#else
/* Initialize blkCnt with number of samples */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sumReal;
*px++ = sumImag;
/* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement column loop counter */
col--;
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
@} end of MatrixMult group
*/

View File

@ -0,0 +1,340 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mat_mult_q15.c
* Description: Q15 complex matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Q15 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@param[in] pScratch points to an array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Conditions for optimum performance
Input, output and state buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*/
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
q63_t sumReal, sumImag; /* accumulator */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t prod1, prod2;
q31_t pSourceA, pSourceB;
#else
q15_t a, b, c, d;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
#else
/* Initialize blkCnt with number of samples */
col = numColsB;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement column loop counter */
col--;
}
i = i + 2U;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set variable sum, that acts as accumulator, to zero */
sumReal = 0;
sumImag = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i * 2;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 1U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA buffer */
a = *pInA;
b = *(pInA + 1U);
/* read real and imag values from pSrcB buffer */
c = *pInB;
d = *(pInB + 1U);
/* Multiply and Accumlates */
sumReal += (q31_t) a *c;
sumImag += (q31_t) a *d;
sumReal -= (q31_t) b *d;
sumImag += (q31_t) b *c;
/* read next real and imag values from pSrcA buffer */
a = *(pInA + 2U);
b = *(pInA + 3U);
/* read next real and imag values from pSrcB buffer */
c = *(pInB + 2U);
d = *(pInB + 3U);
/* update pointer */
pInA += 4U;
/* Multiply and Accumlates */
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
/* update pointer */
pInB += 4U;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
if ((numColsA & 0x1U) > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
a = *pInA++;
b = *pInA++;
c = *pInB++;
d = *pInB++;
/* Multiply and Accumlates */
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
#endif /* #if defined (ARM_MATH_DSP) */
}
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@ -0,0 +1,283 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_mult_q31.c
* Description: Floating-point matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Q31 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*/
arm_status arm_mat_cmplx_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
q63_t sumReal, sumImag; /* Accumulator */
q31_t a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal = 0.0;
sumImag = 0.0;
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0 = *pIn1;
/* Reading real part of complex matrix B */
c0 = *pIn2;
/* Reading imaginary part of complex matrix A */
b0 = *(pIn1 + 1U);
/* Reading imaginary part of complex matrix B */
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += (q63_t) a0 * c0;
sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= (q63_t) b0 * d0;
sumImag += (q63_t) a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
a0 = *(pIn1 );
c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += (q63_t) a0 * c0;
sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= (q63_t) b0 * d0;
sumImag += (q63_t) a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
/* Decrement loop count */
colCnt--;
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
#else
/* Initialize blkCnt with number of samples */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = (q31_t) clip_q63_to_q31(sumReal >> 31);
*px++ = (q31_t) clip_q63_to_q31(sumImag >> 31);
/* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement column loop counter */
col--;
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@ -0,0 +1,76 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_f32.c
* Description: Floating-point matrix initialization
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixInit Matrix Initialization
Initializes the underlying matrix data structure.
The functions set the <code>numRows</code>,
<code>numCols</code>, and <code>pData</code> fields
of the matrix data structure.
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f32(
arm_matrix_instance_f32 * S,
uint16_t nRows,
uint16_t nColumns,
float32_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View File

@ -0,0 +1,67 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_q15.c
* Description: Q15 matrix initialization
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Q15 matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q15(
arm_matrix_instance_q15 * S,
uint16_t nRows,
uint16_t nColumns,
q15_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View File

@ -0,0 +1,72 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_q31.c
* Description: Q31 matrix initialization
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixInit Matrix Initialization
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Q31 matrix initialization.
@param[in,out] S points to an instance of the Q31 matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q31(
arm_matrix_instance_q31 * S,
uint16_t nRows,
uint16_t nColumns,
q31_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View File

@ -0,0 +1,673 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_inverse_f64.c
* Description: Floating-point matrix inverse
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
@brief Floating-point (64 bit) matrix inverse.
@param[in] pSrc points to input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
arm_status arm_mat_inverse_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float64_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float64_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float64_t maxC; /* maximum value in the column */
float64_t Xchg, in = 0.0, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Grab the most significant value from column l */
maxC = 0;
for (i = l; i < numRows; i++)
{
maxC = *pInT1 > 0 ? (*pInT1 > maxC ? *pInT1 : maxC) : (-*pInT1 > maxC ? -*pInT1 : maxC);
pInT1 += numCols;
}
/* Update the status if the matrix is singular */
if (maxC == 0.0)
{
return ARM_MATH_SINGULAR;
}
/* Restore pInT1 */
pInT1 = pIn;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is the most significant of the column */
if ( (in > 0.0 ? in : -in) != maxC)
{
/* Loop over the number rows present below */
i = numRows - (l + 1U);
while (i > 0U)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * l);
pOutT2 = pOutT1 + (numCols * k);
/* Look for the most significant element to
* replace in the rows below */
if ((*pInT2 > 0.0 ? *pInT2: -*pInT2) == maxC)
{
/* Loop over number of columns
* to the right of the pilot element */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
/* Decrement loop counter */
i--;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row */
in = *pPivotRowIn;
/* Loop over number of columns
* to the right of the pilot element */
j = (numCols - l);
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/* to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
j = (numCols - l);
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT1;
*pInT1++ = in1 - (in * *pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
j = numCols;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement loop counter */
j--;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement loop counter */
k--;
/* Increment pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#else
float64_t Xchg, in = 0.0; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * l);
pOutT2 = pOutT1 + (numCols * k);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
for (j = 0U; j < (numCols - l); j++)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
}
for (j = 0U; j < numCols; j++)
{
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pOutT1 = pPivotRowDst;
/* Pivot element of the row */
in = *(pIn + (l * numCols));
/* Loop over number of columns
* to the right of the pilot element */
for (j = 0U; j < (numCols - l); j++)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
*pInT1 = *pInT1 / in;
pInT1++;
}
for (j = 0U; j < numCols; j++)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
*pOutT1 = *pOutT1 / in;
pOutT1++;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pOutT1 = pOut;
for (i = 0U; i < numRows; i++)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pOutT1 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
for (j = 0U; j < (numCols - l); j++)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
*pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
pOutT1++;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
/**
@} end of MatrixInv group
*/

View File

@ -0,0 +1,534 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_f32.c
* Description: Floating-point matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMatrix
*/
/**
* @defgroup MatrixMult Matrix Multiplication
*
* Multiplies two matrices.
*
* \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
* Matrix multiplication is only defined if the number of columns of the
* first matrix equals the number of rows of the second matrix.
* Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
* in an <code>M x P</code> matrix.
* When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
* <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
* matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
*/
/**
* @addtogroup MatrixMult
* @{
*/
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
#if defined(ARM_MATH_NEON)
#define GROUPOFROWS 8
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
float32_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
float32_t in1, in2, in3, in4;
uint16_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
float32x4_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
float32x4_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
float32x2_t accum = vdup_n_f32(0);
float32_t *pIn1B = pSrcA->pData;
float32_t *pIn1C = pSrcA->pData;
float32_t *pIn1D = pSrcA->pData;
float32_t *pIn1E = pSrcA->pData;
float32_t *pIn1F = pSrcA->pData;
float32_t *pIn1G = pSrcA->pData;
float32_t *pIn1H = pSrcA->pData;
float32_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH; /* Temporary output data matrix pointer */
float32_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* Row loop */
rowCnt = row >> 3;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + GROUPOFROWS*i;
pxB = px + numColsB;
pxC = px + 2*numColsB;
pxD = px + 3*numColsB;
pxE = px + 4*numColsB;
pxF = px + 5*numColsB;
pxG = px + 6*numColsB;
pxH = px + 7*numColsB;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum0 = 0.0f;
sum1 = 0.0f;
sum2 = 0.0f;
sum3 = 0.0f;
sum4 = 0.0f;
sum5 = 0.0f;
sum6 = 0.0f;
sum7 = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
pIn1B = pIn1 + numColsA;
pIn1C = pIn1 + 2*numColsA;
pIn1D = pIn1 + 3*numColsA;
pIn1E = pIn1 + 4*numColsA;
pIn1F = pIn1 + 5*numColsA;
pIn1G = pIn1 + 6*numColsA;
pIn1H = pIn1 + 7*numColsA;
acc0 = vdupq_n_f32(0.0);
acc1 = vdupq_n_f32(0.0);
acc2 = vdupq_n_f32(0.0);
acc3 = vdupq_n_f32(0.0);
acc4 = vdupq_n_f32(0.0);
acc5 = vdupq_n_f32(0.0);
acc6 = vdupq_n_f32(0.0);
acc7 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1);
a1V = vld1q_f32(pIn1B);
a2V = vld1q_f32(pIn1C);
a3V = vld1q_f32(pIn1D);
a4V = vld1q_f32(pIn1E);
a5V = vld1q_f32(pIn1F);
a6V = vld1q_f32(pIn1G);
a7V = vld1q_f32(pIn1H);
pIn1 += 4;
pIn1B += 4;
pIn1C += 4;
pIn1D += 4;
pIn1E += 4;
pIn1F += 4;
pIn1G += 4;
pIn1H += 4;
temp[0] = *pIn2;
pIn2 += numColsB;
temp[1] = *pIn2;
pIn2 += numColsB;
temp[2] = *pIn2;
pIn2 += numColsB;
temp[3] = *pIn2;
pIn2 += numColsB;
acc0 = vmlaq_f32(acc0,a0V,temp);
acc1 = vmlaq_f32(acc1,a1V,temp);
acc2 = vmlaq_f32(acc2,a2V,temp);
acc3 = vmlaq_f32(acc3,a3V,temp);
acc4 = vmlaq_f32(acc4,a4V,temp);
acc5 = vmlaq_f32(acc5,a5V,temp);
acc6 = vmlaq_f32(acc6,a6V,temp);
acc7 = vmlaq_f32(acc7,a7V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum0 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
sum1 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
sum2 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
sum3 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
sum4 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
sum5 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
sum6 += accum[0] + accum[1];
accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
sum7 += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum0 += *pIn1++ * (*pIn2);
sum1 += *pIn1B++ * (*pIn2);
sum2 += *pIn1C++ * (*pIn2);
sum3 += *pIn1D++ * (*pIn2);
sum4 += *pIn1E++ * (*pIn2);
sum5 += *pIn1F++ * (*pIn2);
sum6 += *pIn1G++ * (*pIn2);
sum7 += *pIn1H++ * (*pIn2);
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = sum0;
*pxB++ = sum1;
*pxC++ = sum2;
*pxD++ = sum3;
*pxE++ = sum4;
*pxF++ = sum5;
*pxG++ = sum6;
*pxH++ = sum7;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + GROUPOFROWS*numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/*
i was the index of a group of rows computed by previous loop.
Now i is the index of a row since below code is computing row per row
and no more group of row per group of rows.
*/
i = GROUPOFROWS*i;
rowCnt = row & 7;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
acc0 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 4;
temp[0] = *pIn2;
pIn2 += numColsB;
temp[1] = *pIn2;
pIn2 += numColsB;
temp[2] = *pIn2;
pIn2 += numColsB;
temp[3] = *pIn2;
pIn2 += numColsB;
acc0 = vmlaq_f32(acc0,a0V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum += accum[0] + accum[1];
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum += *pIn1++ * (*pIn2);
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = sum;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
float32_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sum;
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixMult group
*/

View File

@ -0,0 +1,483 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_fast_q15.c
* Description: Q15 matrix multiplication (fast variant)
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q15 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q15() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.15 x 1.15 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.15 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 16 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q15() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q31_t sum; /* Accumulator */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
#else
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inB1, inA2, inB2;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#if defined (ARM_MATH_DSP)
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
in = read_q15x2_ia ((q15_t **) &pInB);
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#else /* #if defined (ARM_MATH_DSP) */
/* Read one element from row */
in = *pInB++;
/* Store one element in destination */
*px = in;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pInB++;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
#if defined (ARM_MATH_DSP)
/* Process two rows from matrix A at a time and output two rows at a time */
row = row >> 1U;
px2 = px + numColsB;
#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
#if defined (ARM_MATH_DSP)
/* Process two (transposed) columns from matrix B at a time */
col = col >> 1U;
j = 0;
#endif
/* column loop */
while (col > 0U)
{
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
#if defined (ARM_MATH_DSP)
sum2 = 0;
sum3 = 0;
sum4 = 0;
pInB = pSrcBT + j;
pInA2 = pInA + numColsA;
pInB2 = pInB + numRowsB;
/* Read in two elements at once - alows dual MAC instruction */
colCnt = numColsA >> 1U;
#else
colCnt = numColsA >> 2U;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inA2 = read_q15x2_ia ((q15_t **) &pInA2);
inB2 = read_q15x2_ia ((q15_t **) &pInB2);
/* Multiply and Accumlates */
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
#else
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = *pInA++;
inB1 = *pInB++;
/* Multiply and Accumlates */
sum += inA1 * inB1;
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
inA1 = *pInA++;
inB1 = *pInB++;
sum += inA1 * inB1;
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
#if defined (ARM_MATH_DSP)
if (numColsA & 1U) {
inA1 = *pInA++;
inB1 = *pInB++;
inA2 = *pInA2++;
inB2 = *pInB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
}
#else
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += (q31_t) *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
#if defined (ARM_MATH_DSP)
*px++ = (q15_t) (sum2 >> 15);
*px2++ = (q15_t) (sum3 >> 15);
*px2++ = (q15_t) (sum4 >> 15);
j += numRowsB * 2;
#endif
/* Decrement column loop counter */
col--;
}
i = i + numColsA;
#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcBT + numRowsB * (numColsB-1);
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store result in destination buffer */
*px = (q15_t) (sum >> 15);
px += numColsB;
/* Decrement row loop counter */
row--;
}
}
/* Compute remaining output row */
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData + (numColsB) * (numRowsA-1);
pInB = pSrcBT;
col = numColsB;
i = 0U;
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA % 4U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Decrement column loop counter */
col--;
}
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@ -0,0 +1,374 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_fast_q31.c
* Description: Q31 matrix multiplication (fast variant)
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q31() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.31 x 1.31 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.31 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 32 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q31() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2;
q31_t sum1, sum2, sum3, sum4; /* Accumulator */
q31_t inA1, inA2, inB1, inB2;
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
px = pDst->pData;
row = row >> 1U;
px2 = px + numColsB;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pInB = pSrcB->pData;
j = 0U;
col = col >> 1U;
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sum1 = 0;
sum2 = 0;
sum3 = 0;
sum4 = 0;
/* Initiate data pointers */
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInA2 = pInA + numColsA;
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
inA1 = *pInA++;
inB1 = pInB[0];
inA2 = *pInA2++;
inB2 = pInB[1];
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum2 = __SMMLA(inA1, inB2, sum2);
sum3 = __SMMLA(inA2, inB1, sum3);
sum4 = __SMMLA(inA2, inB2, sum4);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum2 = (q31_t) ((((q63_t) sum2 << 32) + ((q63_t) inA1 * inB2)) >> 32);
sum3 = (q31_t) ((((q63_t) sum3 << 32) + ((q63_t) inA2 * inB1)) >> 32);
sum4 = (q31_t) ((((q63_t) sum4 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement loop counter */
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px++ = sum1 << 1;
*px++ = sum2 << 1;
*px2++ = sum3 << 1;
*px2++ = sum4 << 1;
j += 2;
/* Decrement column loop counter */
col--;
}
i = i + (numColsA << 1U);
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~1U);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcB->pData + numColsB-1;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px = sum1 << 1;
px += numColsB;
/* Decrement row loop counter */
row--;
}
}
/* Compute remaining output row */
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData + (numColsB) * (numRowsA-1);
col = numColsB;
i = 0U;
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
pInB = pSrcB->pData + i;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum1 << 1;
i++;
/* Decrement col loop counter */
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@ -0,0 +1,357 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q15.c
* Description: Q15 matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q15 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results (Unused)
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow.
The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
and then saturated to 1.15 format.
@par
Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
*/
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q63_t sum; /* Accumulator */
#if defined (ARM_MATH_DSP) /* != CM0 */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pInB++;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
sum = __SMLALD(inA1, inB1, sum);
sum = __SMLALD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
/* process remaining column samples */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
/* Saturate and store result in destination buffer */
*px = (q15_t) (__SSAT((sum >> 15), 16));
px++;
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
#else /* #if defined (ARM_MATH_DSP) */
q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pIn1 to point to starting address of pSrcA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform multiply-accumulates */
sum += (q31_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) __SSAT((sum >> 15), 16);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pSrcA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@ -0,0 +1,196 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
*/
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View File

@ -0,0 +1,221 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_f32.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixScale Matrix Scale
Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
matrix by the scalar. For example:
\image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
The function checks to make sure that the input and output matrices are of the same size.
In the fixed-point Q15 and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
float32_t in1, in2, in3, in4; /* temporary variables */
float32_t out1, out2, out3, out4; /* temporary variables */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scaling and results are stored in the destination buffer. */
vec1 = vld1q_f32(pIn);
res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pOut, res);
/* update pointers to process next sampels */
pIn += 4U;
pOut += 4U;
/* Decrement the numSamples loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* The results are stored in the destination buffer. */
*pOut++ = (*pIn++) * scale;
/* Decrement the loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* Input data matrix pointer */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counters */
arm_status status; /* Status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
@} end of MatrixScale group
*/

View File

@ -0,0 +1,170 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_q15.c
* Description: Multiplies a Q15 matrix by a scalar
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Q15 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* Input data matrix pointer */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = 15 - shift; /* Total shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t out1, out2, out3, out4; /* Temporary output variables */
q15_t in1, in2, in3, in4; /* Temporary input variables */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from source */
inA1 = read_q15x2_ia ((q15_t **) &pIn);
inA2 = read_q15x2_ia ((q15_t **) &pIn);
/* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
/* apply shifting */
out1 = out1 >> kShift;
out2 = out2 >> kShift;
out3 = out3 >> kShift;
out4 = out4 >> kShift;
/* saturate the output */
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
/* store result to destination */
write_q15x2_ia (&pOut, __PKHBT(in2, in1, 16));
write_q15x2_ia (&pOut, __PKHBT(in4, in3, 16));
#else
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixScale group
*/

View File

@ -0,0 +1,164 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_q31.c
* Description: Multiplies a Q31 matrix by a scalar
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Q31 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
*/
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* Input data matrix pointer */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = shift + 1; /* Shift to apply after scaling */
q31_t in, out; /* Temporary variabels */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
in = *pIn++; /* read four inputs from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in << kShift; /* apply shifting */
if (in != (out >> kShift)) /* saturate the results. */
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out; /* Store result destination */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixScale group
*/

View File

@ -0,0 +1,226 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f32.c
* Description: Floating-point matrix subtraction
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixSub Matrix Subtraction
Subtract two matrices.
\image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
/* Read values from source A */
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vsubq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* Update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
*pOut++ = (*pIn1++) - (*pIn2++);
/* Decrement the loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
@} end of MatrixSub group
*/

View File

@ -0,0 +1,144 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_q15.c
* Description: Q15 Matrix subtraction
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Q15 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, Saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixSub group
*/

View File

@ -0,0 +1,139 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_q31.c
* Description: Q31 matrix subtraction
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Q31 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and then store the results in the destination buffer. */
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and store result in destination buffer. */
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixSub group
*/

View File

@ -0,0 +1,284 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_f32.c
* Description: Floating-point matrix transpose
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixTrans Matrix Transpose
Tranposes a matrix.
Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
\image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_NEON)
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* Row loop */
rowCnt = row >> 2;
while (rowCnt > 0U)
{
float32x4_t row0V,row1V,row2V,row3V;
float32x4x2_t ra0,ra1,rb0,rb1;
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* Column loop */
{
row0V = vld1q_f32(pIn);
row1V = vld1q_f32(pIn + 1 * nColumns);
row2V = vld1q_f32(pIn + 2 * nColumns);
row3V = vld1q_f32(pIn + 3 * nColumns);
pIn += 4;
ra0 = vzipq_f32(row0V,row2V);
ra1 = vzipq_f32(row1V,row3V);
rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
vst1q_f32(px,rb0.val[0]);
px += nRows;
vst1q_f32(px,rb0.val[1]);
px += nRows;
vst1q_f32(px,rb1.val[0]);
px += nRows;
vst1q_f32(px,rb1.val[1]);
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
/* Perform matrix transpose for last 3 samples here. */
blkCnt = nColumns % 0x4U;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px++ = *pIn;
*px++ = *(pIn + 1 * nColumns);
*px++ = *(pIn + 2 * nColumns);
*px++ = *(pIn + 3 * nColumns);
px += (nRows - 4);
pIn++;
/* Decrement the column loop counter */
blkCnt--;
}
i += 4;
pIn += 3 * nColumns;
/* Decrement the row loop counter */
rowCnt--;
} /* Row loop end */
rowCnt = row & 3;
while (rowCnt > 0U)
{
blkCnt = nColumns ;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
i++;
rowCnt -- ;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
/**
* @} end of MatrixTrans group
*/

View File

@ -0,0 +1,182 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_q15.c
* Description: Q15 matrix transpose
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Q15 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in; /* variable to hold temporary output */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer pOut is set to starting address of column being processed */
pOut = pDst->pData + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pIn);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pIn);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*pOut = *pIn++;
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixTrans group
*/

View File

@ -0,0 +1,146 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_q31.c
* Description: Q31 matrix transpose
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Q31 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixTrans group
*/