Changeset 134867 in webkit
- Timestamp:
- Nov 15, 2012 5:19:50 PM (11 years ago)
- Location:
- trunk/Source/WebCore
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebCore/ChangeLog
r134865 r134867 1 2012-11-15 Xingnan Wang <xingnan.wang@intel.com> 2 3 Optimize the multiply-add in Biquad.cpp::process 4 https://bugs.webkit.org/show_bug.cgi?id=75528 5 6 Reviewed by Brent Fulgham. 7 8 Pipeline the multiply-add with SSE2 intrinsics. 9 Get ~45% performance improvement for the function. 10 11 * platform/audio/Biquad.cpp: 12 (WebCore::Biquad::process): 13 1 14 2012-11-15 Alec Flett <alecflett@chromium.org> 2 15 -
trunk/Source/WebCore/platform/audio/Biquad.cpp
r110782 r134867 42 42 #endif 43 43 44 #ifdef __SSE2__ 45 #include <emmintrin.h> 46 #endif 47 44 48 namespace WebCore { 45 49 … … 97 101 double a2 = m_a2; 98 102 103 // Optimize the hot multiply-add by pipelining with SSE2 intrinsics. 104 #ifdef __SSE2__ 105 __m128d mm0 = _mm_set_pd(x1, x2); // mm0 = (x1, x2) 106 __m128d mm1 = _mm_set_pd(y2, static_cast<double>(*sourceP)); // mm1 = (y2, x) 107 __m128d mm2 = _mm_set_pd(b1, b2); // mm2 = (b1, b2) 108 __m128d mm3 = _mm_set_pd(-a2, b0); // mm3 = (-a2, b0) 109 __m128d mm4 = _mm_set_sd(y1); // mm4 = y1; only the low part of mm4 is used. 110 __m128d mm5; 111 __m128d mm6; 112 __m128 mm7; // Only the low part of mm7 is used. 113 __m128d mma1 = _mm_set_sd(-a1); // mma1 = -a1; only the low part of mma1 is used. 114 115 while (n) { 116 sourceP++; 117 mm6 = mm1; // mm6 = (y2, x) 118 mm1 = _mm_shuffle_pd(mm1, mm4, 0); // mm1 = (y1, x) 119 mm5 = _mm_mul_pd(mm2, mm0); // mm5 = (x1 * b1, x2 * b2) 120 mm6 = _mm_mul_pd(mm3, mm6); // mm6 = (-y2 * a2, x * b0) 121 mm0 = _mm_shuffle_pd(mm0, mm1, 1); // mm0 = (x, x1) 122 mm4 = _mm_mul_sd(mm4, mma1); // mm4 = -y1 * a1 123 mm5 = _mm_add_pd(mm5, mm6); // mm5 = (x1 * b1 - y2 * a2, x2 * b2 + x * b0) 124 n--; 125 mm6 = mm5; // mm6 = (x1 * b1 - y2 * a2, x2 * b2 + x * b0) 126 mm7 = _mm_load_ss(sourceP); // mm7 = *sourceP, load next x value. 127 mm1 = _mm_cvtss_sd(mm1, mm7); // mm1 = (y1, x) 128 mm5 = _mm_add_sd(mm4, mm5); // mm5 = x2 * b2 + x * b0 - y1 * a1; only the low part of mm5 matters here. 129 mm6 = _mm_shuffle_pd(mm6, mm6, 1); // mm6 = (x2 * b2 + x * b0, x1 * b1 - y2 * a2) 130 mm5 = _mm_add_sd(mm5, mm6); // mm5 = x * b0 + x1 * b1 + x2 * b2 - y1 * a1 - y2 * a2 = y; only the low part of mm5 matters. 
131 mm7 = _mm_cvtsd_ss(mm7, mm5); // mm7 = static_cast<float>(y) 132 _mm_store_ss(destP, mm7); // Store y to destP 133 mm4 = mm5; // mm4 = y 134 destP++; 135 } 136 _mm_storeh_pd(&x1, mm0); 137 _mm_storel_pd(&x2, mm0); 138 _mm_storel_pd(&y1, mm4); 139 _mm_storeh_pd(&y2, mm1); 140 #else 99 141 while (n--) { 100 // FIXME: this can be optimized by pipelining the multiply adds...101 142 float x = *sourceP++; 102 143 float y = b0*x + b1*x1 + b2*x2 - a1*y1 - a2*y2; … … 110 151 y1 = y; 111 152 } 153 #endif // __SSE2__ 112 154 113 155 // Local variables back to member. Flush denormals here so we
Note: See TracChangeset
for help on using the changeset viewer.