Changeset 138866 in webkit
- Timestamp:
- Jan 4, 2013 4:34:04 PM (11 years ago)
- Location:
- trunk/Source/WebCore
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebCore/ChangeLog
r138864 r138866 1 2013-01-04 Benjamin Poulain <benjamin@webkit.org> 2 3 Optimize TransformationMatrix::multiply() for x86_64 4 https://bugs.webkit.org/show_bug.cgi?id=105719 5 6 Reviewed by Sam Weinig. 7 8 On x86_64, we have access to 16 XMM registers. These can hold 32 double values. 9 We can use that in two ways to optimize matrix multiplications: 10 -Keep the source matrix completely in registers. Write the result directly in 11 the source matrix's memory. This avoids the memcpy at the end of the multiplication 12 and various memory operations. 13 -Use SIMD with SSE to perform 2 operations at a time. 14 15 The parameters from the second matrix are loaded one by one into XMM registers. 16 Loading them with SSE then shuffling the values performs worse than loading 17 one by one. 18 19 This is only enabled on 64-bit as x86 only has access to 8 XMM registers and 20 the function should be written differently. 21 22 On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change. 23 24 * platform/graphics/transforms/TransformationMatrix.cpp: 25 (WebCore::TransformationMatrix::multiply): 26 * platform/graphics/transforms/TransformationMatrix.h: 27 (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file. 28 1 29 2013-01-04 Joshua Bell <jsbell@chromium.org> 2 30 -
trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
r138711 r138866 36 36 #include <wtf/Assertions.h> 37 37 #include <wtf/MathExtras.h> 38 39 #if CPU(X86_64) 40 #include <emmintrin.h> 41 #endif 38 42 39 43 using namespace std; … … 969 973 } 970 974 971 // 972 // *this = mat * *this 973 // 975 // this = mat * this. 974 976 TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat) 975 977 { … … 1116 1118 #undef MATRIX_MULTIPLY_ONE_LINE 1117 1119 1120 #elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2) 1121 // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers. 1122 __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0])); 1123 __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0])); 1124 __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0])); 1125 __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0])); 1126 1127 // First row. 1128 __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]); 1129 __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]); 1130 __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]); 1131 __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]); 1132 1133 // output00 and output01. 1134 __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); 1135 __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); 1136 __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); 1137 __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); 1138 1139 __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2])); 1140 __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2])); 1141 __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2])); 1142 __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2])); 1143 1144 accumulator = _mm_add_pd(accumulator, temp1); 1145 accumulator = _mm_add_pd(accumulator, temp2); 1146 accumulator = _mm_add_pd(accumulator, temp3); 1147 _mm_store_pd(&m_matrix[0][0], accumulator); 1148 1149 // output02 and output03. 
1150 accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); 1151 temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); 1152 temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); 1153 temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); 1154 1155 accumulator = _mm_add_pd(accumulator, temp1); 1156 accumulator = _mm_add_pd(accumulator, temp2); 1157 accumulator = _mm_add_pd(accumulator, temp3); 1158 _mm_store_pd(&m_matrix[0][2], accumulator); 1159 1160 // Second row. 1161 otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]); 1162 otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]); 1163 otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]); 1164 otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]); 1165 1166 // output10 and output11. 1167 accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); 1168 temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); 1169 temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); 1170 temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); 1171 1172 accumulator = _mm_add_pd(accumulator, temp1); 1173 accumulator = _mm_add_pd(accumulator, temp2); 1174 accumulator = _mm_add_pd(accumulator, temp3); 1175 _mm_store_pd(&m_matrix[1][0], accumulator); 1176 1177 // output12 and output13. 1178 accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); 1179 temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); 1180 temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); 1181 temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); 1182 1183 accumulator = _mm_add_pd(accumulator, temp1); 1184 accumulator = _mm_add_pd(accumulator, temp2); 1185 accumulator = _mm_add_pd(accumulator, temp3); 1186 _mm_store_pd(&m_matrix[1][2], accumulator); 1187 1188 // Third row. 
1189 otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]); 1190 otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]); 1191 otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]); 1192 otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]); 1193 1194 // output20 and output21. 1195 accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); 1196 temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); 1197 temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); 1198 temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); 1199 1200 accumulator = _mm_add_pd(accumulator, temp1); 1201 accumulator = _mm_add_pd(accumulator, temp2); 1202 accumulator = _mm_add_pd(accumulator, temp3); 1203 _mm_store_pd(&m_matrix[2][0], accumulator); 1204 1205 // output22 and output23. 1206 accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); 1207 temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); 1208 temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); 1209 temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); 1210 1211 accumulator = _mm_add_pd(accumulator, temp1); 1212 accumulator = _mm_add_pd(accumulator, temp2); 1213 accumulator = _mm_add_pd(accumulator, temp3); 1214 _mm_store_pd(&m_matrix[2][2], accumulator); 1215 1216 // Fourth row. 1217 otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]); 1218 otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]); 1219 otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]); 1220 otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]); 1221 1222 // output30 and output31. 
1223 accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); 1224 temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); 1225 temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); 1226 temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); 1227 1228 accumulator = _mm_add_pd(accumulator, temp1); 1229 accumulator = _mm_add_pd(accumulator, temp2); 1230 accumulator = _mm_add_pd(accumulator, temp3); 1231 _mm_store_pd(&m_matrix[3][0], accumulator); 1232 1233 // output32 and output33. 1234 accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); 1235 temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); 1236 temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); 1237 temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); 1238 1239 accumulator = _mm_add_pd(accumulator, temp1); 1240 accumulator = _mm_add_pd(accumulator, temp2); 1241 accumulator = _mm_add_pd(accumulator, temp3); 1242 _mm_store_pd(&m_matrix[3][2], accumulator); 1118 1243 #else 1119 1244 Matrix4 tmp; -
trunk/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
r138711 r138866 70 70 class FloatQuad; 71 71 72 #if CPU(X86_64) && !PLATFORM(WINDOWS) 73 #define TRANSFORMATION_MATRIX_USE_X86_64_SSE2 74 #endif 75 72 76 class TransformationMatrix { 73 77 WTF_MAKE_FAST_ALLOCATED; 74 78 public: 75 #if CPU(APPLE_ARMV7S) 79 #if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2) 76 80 typedef double Matrix4[4][4] __attribute__((aligned (16))); 77 81 #else … … 227 231 void setF(double f) { m_matrix[3][1] = f; } 228 232 229 // this = this * mat233 // this = mat * this. 230 234 TransformationMatrix& multiply(const TransformationMatrix&); 231 235
Note: See TracChangeset
for help on using the changeset viewer.