Changeset 102312 in webkit
- Timestamp:
- Dec 7, 2011 9:23:42 PM (12 years ago)
- Location:
- trunk/Source/WebCore
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebCore/ChangeLog
r102311 r102312 1 2011-12-07 Xingnan Wang <xingnan.wang@intel.com> 2 3 Implement the SSE optimization in SincResampler::process() 4 https://bugs.webkit.org/show_bug.cgi?id=73789 5 6 Reviewed by Benjamin Poulain. 7 8 Here is about 70% performance improvement on the hot spot of sample convolving. 9 10 * platform/audio/SincResampler.cpp: 11 1 12 2011-12-07 Luke Macpherson <macpherson@chromium.org> 2 13 -
trunk/Source/WebCore/platform/audio/SincResampler.cpp
r98792 r102312 36 36 #include <wtf/MathExtras.h> 37 37 38 #ifdef __SSE2__ 39 #include <emmintrin.h> 40 #endif 41 38 42 using namespace std; 39 43 … … 247 251 int n = m_kernelSize; 248 252 249 // FIXME: add SIMD optimizations for the following. The scalar code-path can probably also be optimized better.250 251 253 #define CONVOLVE_ONE_SAMPLE \ 252 254 input = *inputP++; \ … … 258 260 { 259 261 float input; 262 263 #ifdef __SSE2__ 264 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 265 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) { 266 CONVOLVE_ONE_SAMPLE 267 n--; 268 } 269 270 // Now the inputP is aligned and start to apply SSE. 271 float* endP = inputP + n - n % 4; 272 __m128 mInput; 273 __m128 mK1; 274 __m128 mK2; 275 __m128 mul1; 276 __m128 mul2; 277 278 __m128 sums1 = _mm_setzero_ps(); 279 __m128 sums2 = _mm_setzero_ps(); 280 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F); 281 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F); 282 283 #define LOAD_DATA(l1, l2) \ 284 mInput = _mm_load_ps(inputP); \ 285 mK1 = _mm_##l1##_ps(k1); \ 286 mK2 = _mm_##l2##_ps(k2); 287 288 #define CONVOLVE_4_SAMPLES \ 289 mul1 = _mm_mul_ps(mInput, mK1); \ 290 mul2 = _mm_mul_ps(mInput, mK2); \ 291 sums1 = _mm_add_ps(sums1, mul1); \ 292 sums2 = _mm_add_ps(sums2, mul2); \ 293 inputP += 4; \ 294 k1 += 4; \ 295 k2 += 4; 296 297 if (k1Aligned && k2Aligned) { // both aligned 298 while (inputP < endP) { 299 LOAD_DATA(load, load) 300 CONVOLVE_4_SAMPLES 301 } 302 } else if (!k1Aligned && k2Aligned) { // only k2 aligned 303 while (inputP < endP) { 304 LOAD_DATA(loadu, load) 305 CONVOLVE_4_SAMPLES 306 } 307 } else if (k1Aligned && !k2Aligned) { // only k1 aligned 308 while (inputP < endP) { 309 LOAD_DATA(load, loadu) 310 CONVOLVE_4_SAMPLES 311 } 312 } else { // both non-aligned 313 while (inputP < endP) { 314 LOAD_DATA(loadu, loadu) 315 CONVOLVE_4_SAMPLES 316 } 317 } 318 319 // Summarize the 
SSE results to sum1 and sum2. 320 float* groupSumP = reinterpret_cast<float*>(&sums1); 321 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 322 groupSumP = reinterpret_cast<float*>(&sums2); 323 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 324 325 n %= 4; 326 while (n) { 327 CONVOLVE_ONE_SAMPLE 328 n--; 329 } 330 #else 331 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better. 260 332 261 333 // Optimize size 32 and size 64 kernels by unrolling the while loop. … … 366 438 } 367 439 } 440 #endif 368 441 } 369 442
Note: See TracChangeset for help on using the changeset viewer.