Changeset 104143 in webkit


Ignore:
Timestamp:
Jan 5, 2012 4:54:04 AM (12 years ago)
Author:
commit-queue@webkit.org
Message:

Add an SSE2-optimized function zvmul in VectorMath
https://bugs.webkit.org/show_bug.cgi?id=74842

Patch by Xingnan Wang <xingnan.wang@intel.com> on 2012-01-05
Reviewed by Kenneth Russell.

Use zvmul in FFTFrameFFMPEG.cpp::multiply() and FFTFrameMac.cpp::multiply().

  • platform/audio/VectorMath.cpp:

(WebCore::VectorMath::zvmul):

  • platform/audio/VectorMath.h:
  • platform/audio/ffmpeg/FFTFrameFFMPEG.cpp:

(WebCore::FFTFrame::multiply):

  • platform/audio/mac/FFTFrameMac.cpp:

(WebCore::FFTFrame::multiply):

Location:
trunk/Source/WebCore
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/WebCore/ChangeLog

    r104142 r104143  
     12012-01-05  Xingnan Wang  <xingnan.wang@intel.com>
     2
     3        Add an SSE2-optimized function zvmul in VectorMath
     4        https://bugs.webkit.org/show_bug.cgi?id=74842
     5
     6        Reviewed by Kenneth Russell.
     7
     8        Use zvmul in FFTFrameFFMPEG.cpp::multiply() and FFTFrameMac.cpp::multiply().
     9
     10        * platform/audio/VectorMath.cpp:
     11        (WebCore::VectorMath::zvmul):
     12        * platform/audio/VectorMath.h:
     13        * platform/audio/ffmpeg/FFTFrameFFMPEG.cpp:
     14        (WebCore::FFTFrame::multiply):
     15        * platform/audio/mac/FFTFrameMac.cpp:
     16        (WebCore::FFTFrame::multiply):
     17
    1182012-01-05  Alpha Lam  <hclam@chromium.org>
    219
  • trunk/Source/WebCore/platform/audio/VectorMath.cpp

    r102702 r104143  
    7373}
    7474
     75void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
     76{
     77    DSPSplitComplex sc1;
     78    DSPSplitComplex sc2;
     79    DSPSplitComplex dest;
     80    sc1.realp = real1P;
     81    sc1.imagp = imag1P;
     82    sc2.realp = real2P;
     83    sc2.imagp = imag2P;
     84    dest.realp = realDestP;
     85    dest.imagp = imagDestP;
     86#if defined(__ppc__) || defined(__i386__)
     87    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
     88#else
     89    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
     90#endif
     91}
     92
    7593#else
    7694
     
    299317}
    300318
     319void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
     320{
     321    unsigned i = 0;
     322#ifdef __SSE2__
     323    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
     324    // Otherwise, fall through to the scalar code below.
     325    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
     326        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
     327        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
     328        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
     329        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
     330        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {
     331       
     332        unsigned endSize = framesToProcess - framesToProcess % 4;
     333        while (i < endSize) {
     334            __m128 real1 = _mm_load_ps(real1P + i);
     335            __m128 real2 = _mm_load_ps(real2P + i);
     336            __m128 imag1 = _mm_load_ps(imag1P + i);
     337            __m128 imag2 = _mm_load_ps(imag2P + i);
     338            __m128 real = _mm_mul_ps(real1, real2);
     339            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
     340            __m128 imag = _mm_mul_ps(real1, imag2);
     341            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
     342            _mm_store_ps(realDestP + i, real);
     343            _mm_store_ps(imagDestP + i, imag);
     344            i += 4;
     345        }
     346    }
     347#endif
     348    for (; i < framesToProcess; ++i) {
     349        realDestP[i] = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
     350        imagDestP[i] = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];
     351    }
     352}
     353
    301354#endif // OS(DARWIN)
    302355
  • trunk/Source/WebCore/platform/audio/VectorMath.h

    r102702 r104143  
    3838void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess);
    3939
     40// Multiplies two complex vectors.
     41void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess);
     42
    4043} // namespace VectorMath
    4144
  • trunk/Source/WebCore/platform/audio/ffmpeg/FFTFrameFFMPEG.cpp

    r95901 r104143  
    116116    const float* imagP2 = frame2.imagData();
    117117
     118    unsigned halfSize = fftSize() / 2;
     119    float real0 = realP1[0];
     120    float imag0 = imagP1[0];
     121
     122    VectorMath::zvmul(realP1, imagP1, realP2, imagP2, realP1, imagP1, halfSize);
     123
     124    // Multiply the packed DC/nyquist component
     125    realP1[0] = real0 * realP2[0];
     126    imagP1[0] = imag0 * imagP2[0];
     127
    118128    // Scale accounts the peculiar scaling of vecLib on the Mac.
    119129    // This ensures the right scaling all the way back to inverse FFT.
     
    122132    float scale = 0.5f;
    123133
    124     // Multiply the packed DC/nyquist component
    125     realP1[0] *= scale * realP2[0];
    126     imagP1[0] *= scale * imagP2[0];
    127 
    128     // Complex multiplication. If this loop turns out to be hot then
    129     // we should use SSE or other intrinsics to accelerate it.
    130     unsigned halfSize = fftSize() / 2;
    131 
    132     for (unsigned i = 1; i < halfSize; ++i) {
    133         float realResult = realP1[i] * realP2[i] - imagP1[i] * imagP2[i];
    134         float imagResult = realP1[i] * imagP2[i] + imagP1[i] * realP2[i];
    135 
    136         realP1[i] = scale * realResult;
    137         imagP1[i] = scale * imagResult;
    138     }
     134    VectorMath::vsmul(realP1, 1, &scale, realP1, 1, halfSize);
     135    VectorMath::vsmul(imagP1, 1, &scale, imagP1, 1, halfSize);
    139136}
    140137
  • trunk/Source/WebCore/platform/audio/mac/FFTFrameMac.cpp

    r95901 r104143  
    3636
    3737#include "FFTFrame.h"
     38
     39#include "VectorMath.h"
    3840
    3941namespace WebCore {
     
    107109    const float* imagP2 = frame2.imagData();
    108110
     111    unsigned halfSize = m_FFTSize / 2;
     112    float real0 = realP1[0];
     113    float imag0 = imagP1[0];
     114
     115    // Complex multiply
     116    VectorMath::zvmul(realP1, imagP1, realP2, imagP2, realP1, imagP1, halfSize);
     117
     118    // Multiply the packed DC/nyquist component
     119    realP1[0] = real0 * realP2[0];
     120    imagP1[0] = imag0 * imagP2[0];
     121
    109122    // Scale accounts for vecLib's peculiar scaling
    110123    // This ensures the right scaling all the way back to inverse FFT
    111124    float scale = 0.5f;
    112125
    113     // Multiply packed DC/nyquist component
    114     realP1[0] *= scale * realP2[0];
    115     imagP1[0] *= scale * imagP2[0];
    116 
    117     // Multiply the rest, skipping packed DC/Nyquist components
    118     DSPSplitComplex sc1 = frame1.dspSplitComplex();
    119     sc1.realp++;
    120     sc1.imagp++;
    121 
    122     DSPSplitComplex sc2 = frame2.dspSplitComplex();
    123     sc2.realp++;
    124     sc2.imagp++;
    125 
    126     unsigned halfSize = m_FFTSize / 2;
    127 
    128     // Complex multiply
    129     vDSP_zvmul(&sc1, 1, &sc2, 1, &sc1, 1, halfSize - 1, 1 /* normal multiplication */);
    130 
    131     // We've previously scaled the packed part, now scale the rest.....
    132     vDSP_vsmul(sc1.realp, 1, &scale, sc1.realp, 1, halfSize - 1);
    133     vDSP_vsmul(sc1.imagp, 1, &scale, sc1.imagp, 1, halfSize - 1);
     126    VectorMath::vsmul(realP1, 1, &scale, realP1, 1, halfSize);
     127    VectorMath::vsmul(imagP1, 1, &scale, imagP1, 1, halfSize);
    134128}
    135129
Note: See TracChangeset for help on using the changeset viewer.