Changeset 134867 in webkit


Ignore:
Timestamp:
Nov 15, 2012 5:19:50 PM (11 years ago)
Author:
commit-queue@webkit.org
Message:

Optimize the multiply-add in Biquad.cpp::process
https://bugs.webkit.org/show_bug.cgi?id=75528

Patch by Xingnan Wang <xingnan.wang@intel.com> on 2012-11-15
Reviewed by Brent Fulgham.

Pipeline the multiply-add with SSE2 intrinsics,
yielding a ~45% performance improvement for this function.

  • platform/audio/Biquad.cpp:

(WebCore::Biquad::process):

Location:
trunk/Source/WebCore
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/WebCore/ChangeLog

    r134865 r134867  
     12012-11-15  Xingnan Wang  <xingnan.wang@intel.com>
     2
     3        Optimize the multiply-add in Biquad.cpp::process
     4        https://bugs.webkit.org/show_bug.cgi?id=75528
     5
     6        Reviewed by Brent Fulgham.
     7
     8        Pipeline the multiply-add with SSE2 intrinsics.
     9        Get ~45% performance improvement for the function.
     10
     11        * platform/audio/Biquad.cpp:
     12        (WebCore::Biquad::process):
     13
    1142012-11-15  Alec Flett  <alecflett@chromium.org>
    215
  • trunk/Source/WebCore/platform/audio/Biquad.cpp

    r110782 r134867  
    4242#endif
    4343
     44#ifdef __SSE2__
     45#include <emmintrin.h>
     46#endif
     47
    4448namespace WebCore {
    4549
     
    97101    double a2 = m_a2;
    98102
     103    // Optimize the hot multiply-add by pipelining with SSE2 intrinsics.
     104#ifdef __SSE2__
     105    __m128d mm0 = _mm_set_pd(x1, x2); // mm0 = (x1, x2)
     106    __m128d mm1 = _mm_set_pd(y2, static_cast<double>(*sourceP)); // mm1 = (y2, x)
     107    __m128d mm2 = _mm_set_pd(b1, b2); // mm2 = (b1, b2)
     108    __m128d mm3 = _mm_set_pd(-a2, b0); // mm3 = (-a2, b0)
     109    __m128d mm4 = _mm_set_sd(y1); // mm4 = y1, only use low part of mm4.
     110    __m128d mm5;
     111    __m128d mm6;
     112    __m128 mm7; // Only use low part of mm7.
     113    __m128d mma1 = _mm_set_sd(-a1); // mma1 = -a1, only use low part of mma1.
     114
     115    while (n) {
     116        sourceP++;
     117        mm6 = mm1; // mm6 = (y2, x)
     118        mm1 = _mm_shuffle_pd(mm1, mm4, 0); // mm1 = (y1, x)
     119        mm5 = _mm_mul_pd(mm2, mm0); // mm5 = (x1 * b1, x2 * b2)
     120        mm6 = _mm_mul_pd(mm3, mm6); // mm6 = (-y2 * a2, x * b0)
     121        mm0 = _mm_shuffle_pd(mm0, mm1, 1); // mm0 = (x, x1)
     122        mm4 = _mm_mul_sd(mm4, mma1); // mm4 = -y1 * a1
     123        mm5 = _mm_add_pd(mm5, mm6); // mm5 = (x1 * b1 - y2 * a2, x2 * b2 + x * b0)
     124        n--;
     125        mm6 = mm5; // mm6 = (x1 * b1 - y2 * a2, x2 * b2 + x * b0)
     126        mm7 = _mm_load_ss(sourceP); // mm7 = *sourceP, load next x value.
     127        mm1 = _mm_cvtss_sd(mm1, mm7); // mm1 = (y1, x)
     128        mm5 = _mm_add_sd(mm4, mm5); // mm5 = x2 * b2 + x * b0 - y1 * a1, only care low part of mm5 here.
     129        mm6 = _mm_shuffle_pd(mm6, mm6, 1); // mm6 = (x2 * b2 + x * b0, x1 * b1 - y2 * a2)
     130        mm5 = _mm_add_sd(mm5, mm6); // mm5 = x * b0 + x1 * b1 + x2 * b2 - y1 * a1 - y2 * a2 = y, only care low part of mm5.
     131        mm7 = _mm_cvtsd_ss(mm7, mm5); // mm7 = static_cast<float>(y)
     132        _mm_store_ss(destP, mm7); // Store y to destP
     133        mm4 = mm5; // mm4 = y
     134        destP++;
     135    }
     136    _mm_storeh_pd(&x1, mm0);
     137    _mm_storel_pd(&x2, mm0);
     138    _mm_storel_pd(&y1, mm4);
     139    _mm_storeh_pd(&y2, mm1);
     140#else
    99141    while (n--) {
    100         // FIXME: this can be optimized by pipelining the multiply adds...
    101142        float x = *sourceP++;
    102143        float y = b0*x + b1*x1 + b2*x2 - a1*y1 - a2*y2;
     
    110151        y1 = y;
    111152    }
     153#endif // __SSE2__
    112154
    113155    // Local variables back to member. Flush denormals here so we
Note: See TracChangeset for help on using the changeset viewer.