_matrix3x3 multiply unrolling

This commit is contained in:
2023-09-09 15:52:09 +03:00
parent 0c8064fb0c
commit dea5f7aa4e

View File

@@ -5,6 +5,9 @@
#include "Quaternion.h"
#include "../Types/String.h"
#include <intrin.h>
#include <emmintrin.h>
const Matrix3x3 Matrix3x3::Zero(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
const Matrix3x3 Matrix3x3::Identity(
1.0f, 0.0f, 0.0f,
@@ -139,6 +142,67 @@ void Matrix3x3::Multiply(const Matrix3x3& left, float right, Matrix3x3& result)
void Matrix3x3::Multiply(const Matrix3x3& left, const Matrix3x3& right, Matrix3x3& result)
{
#if true
{
    // Row-major 3x3 product: product[r][c] = sum over k of left[r][k] * right[k][c],
    // with both operands and the result addressed through their flat Raw arrays.
    //
    // The product is built in a local buffer and copied out at the end, so the
    // function stays correct when `result` is the same object as `left` or
    // `right` (e.g. Multiply(a, b, a)). Writing each element straight into
    // result.Raw would overwrite inputs that later elements still need.
    float product[9];
    for (int row = 0; row < 3; ++row) {
        const int rowStart = row * 3;
        for (int col = 0; col < 3; ++col) {
            // The inner dimension is exactly 3, so the dot product is written
            // out in full instead of using a 4-wide unrolled loop that can
            // never execute for this size.
            product[rowStart + col] =
                left.Raw[rowStart + 0] * right.Raw[0 * 3 + col] +
                left.Raw[rowStart + 1] * right.Raw[1 * 3 + col] +
                left.Raw[rowStart + 2] * right.Raw[2 * 3 + col];
        }
    }
    // Publish the finished product only after every input has been read.
    for (int i = 0; i < 9; ++i) {
        result.Raw[i] = product[i];
    }
}
#else
/*
__m256i vec_multi_res = _mm256_setzero_si256();
__m256i vec_mat1 = _mm256_setzero_si256();
__m256i vec_mat2 = _mm256_setzero_si256();
int i, j, k;
for (i = 0; i < 3; i++)
{
for (j = 0; j < 3; ++j)
{
//Stores one element in mat1 and use it in all computations needed before proceeding
//Stores as vector to increase computations per cycle
vec_mat1 = _mm256_set1_epi32(left.Values[j][i]);
for (k = 0; k < 3; k += 8)
{
vec_mat2 = _mm256_loadu_si256((__m256i*) & right.Values[k][j]); //Stores row of second matrix (eight in each iteration)
vec_multi_res = _mm256_loadu_si256((__m256i*) & result.Values[k][i]); //Loads the result matrix row as a vector
vec_multi_res = _mm256_add_epi32(vec_multi_res, _mm256_mullo_epi32(vec_mat1, vec_mat2));//Multiplies the vectors and adds to the result vector
_mm256_storeu_si256((__m256i*) & result.Values[k][i], vec_multi_res); //Stores the result vector into the result array
}
}
}
*/
result = Matrix3x3(
left.M11 * right.M11 + left.M12 * right.M21 + left.M13 * right.M31,
left.M11 * right.M12 + left.M12 * right.M22 + left.M13 * right.M32,
@@ -150,6 +214,19 @@ void Matrix3x3::Multiply(const Matrix3x3& left, const Matrix3x3& right, Matrix3x
left.M31 * right.M12 + left.M32 * right.M22 + left.M33 * right.M32,
left.M31 * right.M13 + left.M32 * right.M23 + left.M33 * right.M33
);
/*Matrix3x3 result2 = Matrix3x3(
left.M11 * right.M11 + left.M12 * right.M21 + left.M13 * right.M31,
left.M11 * right.M12 + left.M12 * right.M22 + left.M13 * right.M32,
left.M11 * right.M13 + left.M12 * right.M23 + left.M13 * right.M33,
left.M21 * right.M11 + left.M22 * right.M21 + left.M23 * right.M31,
left.M21 * right.M12 + left.M22 * right.M22 + left.M23 * right.M32,
left.M21 * right.M13 + left.M22 * right.M23 + left.M23 * right.M33,
left.M31 * right.M11 + left.M32 * right.M21 + left.M33 * right.M31,
left.M31 * right.M12 + left.M32 * right.M22 + left.M33 * right.M32,
left.M31 * right.M13 + left.M32 * right.M23 + left.M33 * right.M33
);
ASSERT(result2 == result);*/
#endif
}
void Matrix3x3::Divide(const Matrix3x3& left, float right, Matrix3x3& result)