_matrix3x3 multiply unrolling

2023-09-09 15:52:09 +03:00
parent 0c8064fb0c
commit dea5f7aa4e
1 changed files with 77 additions and 0 deletions
--- a/Source/Engine/Core/Math/Matrix3x3.cpp
+++ b/Source/Engine/Core/Math/Matrix3x3.cpp
@@ -5,6 +5,9 @@
 #include "Quaternion.h"
 #include "../Types/String.h"

+#include <intrin.h>
+#include <emmintrin.h>
+
 const Matrix3x3 Matrix3x3::Zero(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
 const Matrix3x3 Matrix3x3::Identity(
    1.0f, 0.0f, 0.0f,
@@ -139,6 +142,67 @@ void Matrix3x3::Multiply(const Matrix3x3& left, float right, Matrix3x3& result)

 void Matrix3x3::Multiply(const Matrix3x3& left, const Matrix3x3& right, Matrix3x3& result)
 {
+    
+#if true
+    {
+        // Naive row-by-column product with some tricks intended to make the compiler (MSVC) behave.
+        // Author's note: manual unrolling is meant to help, as the compiler can't auto-vectorize non-contiguous memory access.
+        // NOTE(review): stores go through a __restrict pointer while left/right are still being read - presumably result must not alias either input; confirm callers.
+        float* __restrict const matData = result.Raw;
+
+        //Mat matC{ matB.width, matA.height, matB.rowSpan, matData };
+
+        for (int rowC = 0; rowC < 3; ++rowC) {
+            for (int colC = 0; colC < 3; ++colC) {
+                // Independent, local accumulator for the output element at (rowC, colC).
+                float accumulate = 0;
+                int pos = 0;
+                // NOTE(review): pos starts at 0 and 3 - 4 is negative, so this 4-wide unrolled loop never runs; the tail loop below does all the work.
+                for (; pos < 3 - 4; pos += 4) {
+                    accumulate += left.Raw[rowC * 3 + pos] *
+                        right.Raw[pos * 3 + colC] +
+                        left.Raw[rowC * 3 + pos + 1] *
+                        right.Raw[(pos + 1) * 3 + colC] +
+                        left.Raw[rowC * 3 + pos + 2] *
+                        right.Raw[(pos + 2) * 3 + colC] +
+                        left.Raw[rowC * 3 + pos + 3] *
+                        right.Raw[(pos + 3) * 3 + colC];
+                }
+                for (; pos < 3; ++pos) {
+                    accumulate += left.Raw[rowC * 3 + pos] *
+                        right.Raw[pos * 3 + colC];
+                }
+                matData[rowC * 3 + colC] = accumulate;
+            }
+        }
+    }
+#else
+    
+    /*
+    __m256i vec_multi_res = _mm256_setzero_si256();
+    __m256i vec_mat1 = _mm256_setzero_si256();
+    __m256i vec_mat2 = _mm256_setzero_si256();
+
+    int i, j, k;
+    for (i = 0; i < 3; i++)
+    {
+        for (j = 0; j < 3; ++j)
+        {
+            //Stores one element of mat1 and uses it in all computations needed before proceeding
+            //Stored as a vector to increase computations per cycle
+            vec_mat1 = _mm256_set1_epi32(left.Values[j][i]);
+
+            for (k = 0; k < 3; k += 8)
+            {
+                vec_mat2 = _mm256_loadu_si256((__m256i*) & right.Values[k][j]); //Stores row of second matrix (eight in each iteration)
+                vec_multi_res = _mm256_loadu_si256((__m256i*) & result.Values[k][i]); //Loads the result matrix row as a vector
+                vec_multi_res = _mm256_add_epi32(vec_multi_res, _mm256_mullo_epi32(vec_mat1, vec_mat2));//Multiplies the vectors and adds to the result vector
+
+                _mm256_storeu_si256((__m256i*) & result.Values[k][i], vec_multi_res); //Stores the result vector into the result array
+            }
+        }
+    }
+    */
    result = Matrix3x3(
        left.M11 * right.M11 + left.M12 * right.M21 + left.M13 * right.M31,
        left.M11 * right.M12 + left.M12 * right.M22 + left.M13 * right.M32,
@@ -150,6 +214,19 @@ void Matrix3x3::Multiply(const Matrix3x3& left, const Matrix3x3& right, Matrix3x
        left.M31 * right.M12 + left.M32 * right.M22 + left.M33 * right.M32,
        left.M31 * right.M13 + left.M32 * right.M23 + left.M33 * right.M33
    );
+    /*Matrix3x3 result2 = Matrix3x3(
+        left.M11 * right.M11 + left.M12 * right.M21 + left.M13 * right.M31,
+        left.M11 * right.M12 + left.M12 * right.M22 + left.M13 * right.M32,
+        left.M11 * right.M13 + left.M12 * right.M23 + left.M13 * right.M33,
+        left.M21 * right.M11 + left.M22 * right.M21 + left.M23 * right.M31,
+        left.M21 * right.M12 + left.M22 * right.M22 + left.M23 * right.M32,
+        left.M21 * right.M13 + left.M22 * right.M23 + left.M23 * right.M33,
+        left.M31 * right.M11 + left.M32 * right.M21 + left.M33 * right.M31,
+        left.M31 * right.M12 + left.M32 * right.M22 + left.M33 * right.M32,
+        left.M31 * right.M13 + left.M32 * right.M23 + left.M33 * right.M33
+    );
+    ASSERT(result2 == result);*/
+#endif
 }

 void Matrix3x3::Divide(const Matrix3x3& left, float right, Matrix3x3& result)