Changeset 138866 in webkit


Ignore:
Timestamp:
Jan 4, 2013 4:34:04 PM (11 years ago)
Author:
benjamin@webkit.org
Message:

Optimize TransformationMatrix::multiply() for x86_64
https://bugs.webkit.org/show_bug.cgi?id=105719

Reviewed by Sam Weinig.

On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
We can use that in two ways to optimize matrix multiplications:
-Keep the source matrix completely in registers. Write the result directly in
the source matrix's memory. This avoids the memcpy at the end of the multiplication
and various memory operations.

-Use SIMD with SSE to perform 2 operations at a time.

The parameters from the second matrix are loaded one by one into XMM registers.
Loading them with SSE and then shuffling the values performs worse than loading
them one by one.

This is only enabled on 64-bit, as 32-bit x86 only has access to 8 XMM registers and
the function would need to be written differently.

On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.

  • platform/graphics/transforms/TransformationMatrix.cpp:

(WebCore::TransformationMatrix::multiply):

  • platform/graphics/transforms/TransformationMatrix.h:

(TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.

Location:
trunk/Source/WebCore
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/WebCore/ChangeLog

    r138864 r138866  
     12013-01-04  Benjamin Poulain  <benjamin@webkit.org>
     2
     3        Optimize TransformationMatrix::multiply() for x86_64
     4        https://bugs.webkit.org/show_bug.cgi?id=105719
     5
     6        Reviewed by Sam Weinig.
     7
     8        On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
     9        We can use that in two ways to optimize matrix multiplications:
     10        -Keep the source matrix completely in registers. Write the result directly in
     11         the source matrix's memory. This avoids the memcpy at the end of the multiplication
     12         and various memory operations.
     13        -Use SIMD with SSE to perform 2 operations at a time.
     14
     15        The parameters from the second matrix are loaded one by one into XMM registers.
     16        Loading them with SSE and then shuffling the values performs worse than loading
     17        them one by one.
     18
     19        This is only enabled on 64-bit, as 32-bit x86 only has access to 8 XMM registers and
     20        the function would need to be written differently.
     21
     22        On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.
     23
     24        * platform/graphics/transforms/TransformationMatrix.cpp:
     25        (WebCore::TransformationMatrix::multiply):
     26        * platform/graphics/transforms/TransformationMatrix.h:
     27        (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
     28
    1292013-01-04  Joshua Bell  <jsbell@chromium.org>
    230
  • trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp

    r138711 r138866  
    3636#include <wtf/Assertions.h>
    3737#include <wtf/MathExtras.h>
     38
     39#if CPU(X86_64)
     40#include <emmintrin.h>
     41#endif
    3842
    3943using namespace std;
     
    969973}
    970974
    971 //
    972 // *this = mat * *this
    973 //
     975// this = mat * this.
    974976TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
    975977{
     
    11161118#undef MATRIX_MULTIPLY_ONE_LINE
    11171119
     1120#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
     1121    // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
     1122    __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
     1123    __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
     1124    __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
     1125    __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
     1126
     1127    // First row.
     1128    __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
     1129    __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
     1130    __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
     1131    __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
     1132
     1133    // output00 and output01.
     1134    __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
     1135    __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
     1136    __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
     1137    __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
     1138
     1139    __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
     1140    __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
     1141    __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
     1142    __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
     1143
     1144    accumulator = _mm_add_pd(accumulator, temp1);
     1145    accumulator = _mm_add_pd(accumulator, temp2);
     1146    accumulator = _mm_add_pd(accumulator, temp3);
     1147    _mm_store_pd(&m_matrix[0][0], accumulator);
     1148
     1149    // output02 and output03.
     1150    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
     1151    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
     1152    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
     1153    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
     1154
     1155    accumulator = _mm_add_pd(accumulator, temp1);
     1156    accumulator = _mm_add_pd(accumulator, temp2);
     1157    accumulator = _mm_add_pd(accumulator, temp3);
     1158    _mm_store_pd(&m_matrix[0][2], accumulator);
     1159
     1160    // Second row.
     1161    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
     1162    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
     1163    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
     1164    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
     1165
     1166    // output10 and output11.
     1167    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
     1168    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
     1169    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
     1170    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
     1171
     1172    accumulator = _mm_add_pd(accumulator, temp1);
     1173    accumulator = _mm_add_pd(accumulator, temp2);
     1174    accumulator = _mm_add_pd(accumulator, temp3);
     1175    _mm_store_pd(&m_matrix[1][0], accumulator);
     1176
     1177    // output12 and output13.
     1178    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
     1179    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
     1180    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
     1181    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
     1182
     1183    accumulator = _mm_add_pd(accumulator, temp1);
     1184    accumulator = _mm_add_pd(accumulator, temp2);
     1185    accumulator = _mm_add_pd(accumulator, temp3);
     1186    _mm_store_pd(&m_matrix[1][2], accumulator);
     1187
     1188    // Third row.
     1189    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
     1190    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
     1191    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
     1192    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
     1193
     1194    // output20 and output21.
     1195    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
     1196    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
     1197    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
     1198    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
     1199
     1200    accumulator = _mm_add_pd(accumulator, temp1);
     1201    accumulator = _mm_add_pd(accumulator, temp2);
     1202    accumulator = _mm_add_pd(accumulator, temp3);
     1203    _mm_store_pd(&m_matrix[2][0], accumulator);
     1204
     1205    // output22 and output23.
     1206    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
     1207    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
     1208    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
     1209    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
     1210
     1211    accumulator = _mm_add_pd(accumulator, temp1);
     1212    accumulator = _mm_add_pd(accumulator, temp2);
     1213    accumulator = _mm_add_pd(accumulator, temp3);
     1214    _mm_store_pd(&m_matrix[2][2], accumulator);
     1215
     1216    // Fourth row.
     1217    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
     1218    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
     1219    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
     1220    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
     1221
     1222    // output30 and output31.
     1223    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
     1224    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
     1225    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
     1226    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
     1227
     1228    accumulator = _mm_add_pd(accumulator, temp1);
     1229    accumulator = _mm_add_pd(accumulator, temp2);
     1230    accumulator = _mm_add_pd(accumulator, temp3);
     1231    _mm_store_pd(&m_matrix[3][0], accumulator);
     1232
     1233    // output32 and output33.
     1234    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
     1235    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
     1236    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
     1237    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
     1238
     1239    accumulator = _mm_add_pd(accumulator, temp1);
     1240    accumulator = _mm_add_pd(accumulator, temp2);
     1241    accumulator = _mm_add_pd(accumulator, temp3);
     1242    _mm_store_pd(&m_matrix[3][2], accumulator);
    11181243#else
    11191244    Matrix4 tmp;
  • trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h

    r138711 r138866  
    7070class FloatQuad;
    7171
     72#if CPU(X86_64) && !PLATFORM(WINDOWS)
     73#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
     74#endif
     75
    7276class TransformationMatrix {
    7377    WTF_MAKE_FAST_ALLOCATED;
    7478public:
    75 #if CPU(APPLE_ARMV7S)
     79#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
    7680    typedef double Matrix4[4][4] __attribute__((aligned (16)));
    7781#else
     
    227231    void setF(double f) { m_matrix[3][1] = f; }
    228232
    229     // this = this * mat
     233    // this = mat * this.
    230234    TransformationMatrix& multiply(const TransformationMatrix&);
    231235
Note: See TracChangeset for help on using the changeset viewer.