Context Navigation

← Previous Changeset
Next Changeset →

Changeset 138866 in webkit

Timestamp:

Jan 4, 2013 4:34:04 PM (11 years ago)

Author:

benjamin@webkit.org

Message:

Optimize TransformationMatrix::multiply() for x86_64
https://bugs.webkit.org/show_bug.cgi?id=105719

Reviewed by Sam Weinig.

On x86_64, we have access to 16 XMM registers. Together they can hold 32 double values.
We can use that in two ways to optimize matrix multiplications:
- Keep the source matrix completely in registers and write the result directly into
  the source matrix's memory. This avoids the memcpy at the end of the multiplication
  and various memory operations.

- Use SIMD with SSE2 to perform 2 operations at a time.

The parameters from the second matrix are loaded one by one into XMM registers.
Loading them with SSE and then shuffling the values performs worse than loading
them one by one.

This is only enabled on 64-bit, as 32-bit x86 only has access to 8 XMM registers and
the function would need to be written differently there.

On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.

platform/graphics/transforms/TransformationMatrix.cpp:

(WebCore::TransformationMatrix::multiply):

platform/graphics/transforms/TransformationMatrix.h:

(TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.

Location:

trunk/Source/WebCore

Files:

: 3 edited

ChangeLog (modified) (1 diff)
platform/graphics/transforms/TransformationMatrix.cpp (modified) (3 diffs)
platform/graphics/transforms/TransformationMatrix.h (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/Source/WebCore/ChangeLog

-                      r138864
+                      r138866
+-01-04  Benjamin Poulain  <benjamin@webkit.org>
+        Optimize TransformationMatrix::multiply() for x86_64
+        https://bugs.webkit.org/show_bug.cgi?id=105719
+        Reviewed by Sam Weinig.
+        On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
+        We can use that in two ways to optimize matrix multiplications:
+        -Keep the source matrix completely in registers. Write the result directly in
+         the source matrix's memory. This avoids the memcpy at the end of the multiplication
+         and various memory operations.
+        -Use SIMD with SSE to perform 2 operations at a time.
+        The parameter from the second matrix are loaded one by one in XMM registers.
+        Loading them with SSE then shuffling the values perform worse than loading
+        one by one.
+        This is only enabled on 64bits as x86 only has access to 8 XMM registers and
+        the function should be written differently.
+        On a i5, TransformationMatrix::multiply() perform about 3 times faster with the change.
+        * platform/graphics/transforms/TransformationMatrix.cpp:
+        (WebCore::TransformationMatrix::multiply):
+        * platform/graphics/transforms/TransformationMatrix.h:
+        (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
 -01-04  Joshua Bell  <jsbell@chromium.org>

trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp

-                      r138711
+                      r138866
 #include <wtf/Assertions.h>
 #include <wtf/MathExtras.h>
+#if CPU(X86_64)
+#include <emmintrin.h>
+#endif
 using namespace std;
 …
+}
+//
+// *this = mat * *this
+//
+// this = mat * this.
 TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
+{
 …
 #undef MATRIX_MULTIPLY_ONE_LINE
+#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
+    // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
+    __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
+    __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
+    __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
+    __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
+    // First row.
+    __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
+    __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
+    __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
+    __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
+    // output00 and output01.
+    __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+    __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
+    __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
+    __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
+    __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[0][0], accumulator);
+    // output02 and output03.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[0][2], accumulator);
+    // Second row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
+    // output10 and output11.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[1][0], accumulator);
+    // output12 and output13.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[1][2], accumulator);
+    // Third row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
+    // output20 and output21.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[2][0], accumulator);
+    // output22 and output23.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[2][2], accumulator);
+    // Fourth row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
+    // output30 and output31.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[3][0], accumulator);
+    // output32 and output33.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[3][2], accumulator);
 #else
     Matrix4 tmp;

trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h

-                      r138711
+                      r138866
 class FloatQuad;
+#if CPU(X86_64) && !PLATFORM(WINDOWS)
+#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
+#endif
 class TransformationMatrix {
     WTF_MAKE_FAST_ALLOCATED;
 public:
 #if CPU(APPLE_ARMV7S)
+#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
     typedef double Matrix4[4][4] __attribute__((aligned (16)));
 #else
 …
     void setF(double f) { m_matrix[3][1] = f; }
     // this = this * mat
+    // this = mat * this.
     TransformationMatrix& multiply(const TransformationMatrix&);

Note: See TracChangeset for help on using the changeset viewer.