Context Navigation

← Previous Changeset
Next Changeset →

Changeset 104143 in webkit

Timestamp:

Jan 5, 2012 4:54:04 AM (12 years ago)

Author:

commit-queue@webkit.org

Message:

Add an SSE2-optimized function zvmul in VectorMath
https://bugs.webkit.org/show_bug.cgi?id=74842

Patch by Xingnan Wang <xingnan.wang@intel.com> on 2012-01-05
Reviewed by Kenneth Russell.

Use zvmul in FFTFrameFFMPEG.cpp::multiply() and FFTFrameMac.cpp::multiply().

platform/audio/VectorMath.cpp:

(WebCore::VectorMath::zvmul):

platform/audio/VectorMath.h:
platform/audio/ffmpeg/FFTFrameFFMPEG.cpp:

(WebCore::FFTFrame::multiply):

platform/audio/mac/FFTFrameMac.cpp:

(WebCore::FFTFrame::multiply):

Location:

trunk/Source/WebCore

Files:

: 5 edited

ChangeLog (modified) (1 diff)
platform/audio/VectorMath.cpp (modified) (2 diffs)
platform/audio/VectorMath.h (modified) (1 diff)
platform/audio/ffmpeg/FFTFrameFFMPEG.cpp (modified) (2 diffs)
platform/audio/mac/FFTFrameMac.cpp (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/Source/WebCore/ChangeLog

-                      r104142
+                      r104143
+2012-01-05  Xingnan Wang  <xingnan.wang@intel.com>
+        Add an SSE2-optimized function zvmul in VectorMath
+        https://bugs.webkit.org/show_bug.cgi?id=74842
+        Reviewed by Kenneth Russell.
+        Use zvmul in FFTFrameFFMPEG.cpp::multiply() and FFTFrameMac.cpp::multiply().
+        * platform/audio/VectorMath.cpp:
+        (WebCore::VectorMath::zvmul):
+        * platform/audio/VectorMath.h:
+        * platform/audio/ffmpeg/FFTFrameFFMPEG.cpp:
+        (WebCore::FFTFrame::multiply):
+        * platform/audio/mac/FFTFrameMac.cpp:
+        (WebCore::FFTFrame::multiply):
 2012-01-05  Alpha Lam  <hclam@chromium.org>

trunk/Source/WebCore/platform/audio/VectorMath.cpp

-                      r102702
+                      r104143
+}
+// Multiplies two complex vectors held in split form (separate real and
+// imaginary arrays) by handing them to the platform's vector library.
+//   real1P / imag1P       - real and imaginary parts of the first operand.
+//   real2P / imag2P       - real and imaginary parts of the second operand.
+//   realDestP / imagDestP - receive the real and imaginary parts of the product.
+//   framesToProcess       - number of complex elements to multiply.
+void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
+{
+    DSPSplitComplex sc1;
+    DSPSplitComplex sc2;
+    DSPSplitComplex dest;
+    // DSPSplitComplex holds non-const float pointers, so the const inputs
+    // need an explicit cast; they are only passed as the source operands
+    // of the multiply below.
+    sc1.realp = const_cast<float*>(real1P);
+    sc1.imagp = const_cast<float*>(imag1P);
+    sc2.realp = const_cast<float*>(real2P);
+    sc2.imagp = const_cast<float*>(imag2P);
+    dest.realp = realDestP;
+    dest.imagp = imagDestP;
+    // All strides are 1 (contiguous data); the trailing 1 selects normal
+    // (non-conjugate) multiplication.
+#if defined(__ppc__) || defined(__i386__)
+    // NOTE(review): presumably the older global-namespace vecLib entry point
+    // used on these architectures - confirm against the SDK headers.
+    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
+#else
+    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
+#endif
+}
 #else
 …
+}
+// Multiplies two complex vectors held in split form (separate real and
+// imaginary arrays), element by element:
+//   dest[i] = (real1P[i] + j*imag1P[i]) * (real2P[i] + j*imag2P[i])
+//   real1P / imag1P       - real and imaginary parts of the first operand.
+//   real2P / imag2P       - real and imaginary parts of the second operand.
+//   realDestP / imagDestP - receive the real and imaginary parts of the product.
+//   framesToProcess       - number of complex elements to multiply.
+// The destination arrays may be the same buffers as a source operand
+// (in-place multiplication).
+void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
+{
+    // Index with size_t so a frame count above UINT_MAX is neither truncated
+    // nor causes the index to wrap around.
+    size_t i = 0;
+#ifdef __SSE2__
+    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
+    // Otherwise, fall through to the scalar code below.
+    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
+        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
+        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
+        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
+        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
+        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {
+        // Largest multiple of 4 not exceeding framesToProcess; the remainder
+        // is handled by the scalar loop.
+        const size_t endSize = framesToProcess - framesToProcess % 4;
+        while (i < endSize) {
+            // All four inputs are loaded before anything is stored, so this
+            // path is safe for in-place use.
+            __m128 real1 = _mm_load_ps(real1P + i);
+            __m128 real2 = _mm_load_ps(real2P + i);
+            __m128 imag1 = _mm_load_ps(imag1P + i);
+            __m128 imag2 = _mm_load_ps(imag2P + i);
+            __m128 real = _mm_mul_ps(real1, real2);
+            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
+            __m128 imag = _mm_mul_ps(real1, imag2);
+            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
+            _mm_store_ps(realDestP + i, real);
+            _mm_store_ps(imagDestP + i, imag);
+            i += 4;
+        }
+    }
+#endif
+    for (; i < framesToProcess; ++i) {
+        // Compute both parts before storing either one: when the destination
+        // aliases a source, writing realDestP[i] first would corrupt the
+        // input still needed for the imaginary part.
+        const float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
+        const float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];
+        realDestP[i] = realResult;
+        imagDestP[i] = imagResult;
+    }
+}
 #endif // OS(DARWIN)

trunk/Source/WebCore/platform/audio/VectorMath.h

-                      r102702
+                      r104143
 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess);
+// Multiplies two complex vectors stored in split form (separate real and imaginary arrays) element by element, writing framesToProcess complex products to realDestP/imagDestP.
+void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess);
 } // namespace VectorMath

trunk/Source/WebCore/platform/audio/ffmpeg/FFTFrameFFMPEG.cpp

-                      r95901
+                      r104143
     const float* imagP2 = frame2.imagData();
+    unsigned halfSize = fftSize() / 2;
+    float real0 = realP1[0];
+    float imag0 = imagP1[0];
+    VectorMath::zvmul(realP1, imagP1, realP2, imagP2, realP1, imagP1, halfSize);
+    // Multiply the packed DC/nyquist component
+    realP1[0] = real0 * realP2[0];
+    imagP1[0] = imag0 * imagP2[0];
     // Scale accounts the peculiar scaling of vecLib on the Mac.
     // This ensures the right scaling all the way back to inverse FFT.
 …
     float scale = 0.5f;
+    // Multiply the packed DC/nyquist component
+    realP1[0] *= scale * realP2[0];
+    imagP1[0] *= scale * imagP2[0];
+    // Complex multiplication. If this loop turns out to be hot then
+    // we should use SSE or other intrinsics to accelerate it.
+    unsigned halfSize = fftSize() / 2;
+    for (unsigned i = 1; i < halfSize; ++i) {
+        float realResult = realP1[i] * realP2[i] - imagP1[i] * imagP2[i];
+        float imagResult = realP1[i] * imagP2[i] + imagP1[i] * realP2[i];
+        realP1[i] = scale * realResult;
+        imagP1[i] = scale * imagResult;
+    }
+    VectorMath::vsmul(realP1, 1, &scale, realP1, 1, halfSize);
+    VectorMath::vsmul(imagP1, 1, &scale, imagP1, 1, halfSize);
+}

trunk/Source/WebCore/platform/audio/mac/FFTFrameMac.cpp

-                      r95901
+                      r104143
 #include "FFTFrame.h"
+#include "VectorMath.h"
 namespace WebCore {
 …
     const float* imagP2 = frame2.imagData();
+    unsigned halfSize = m_FFTSize / 2;
+    float real0 = realP1[0];
+    float imag0 = imagP1[0];
+    // Complex multiply
+    VectorMath::zvmul(realP1, imagP1, realP2, imagP2, realP1, imagP1, halfSize);
+    // Multiply the packed DC/nyquist component
+    realP1[0] = real0 * realP2[0];
+    imagP1[0] = imag0 * imagP2[0];
     // Scale accounts for vecLib's peculiar scaling
     // This ensures the right scaling all the way back to inverse FFT
     float scale = 0.5f;
+    // Multiply packed DC/nyquist component
+    realP1[0] *= scale * realP2[0];
+    imagP1[0] *= scale * imagP2[0];
+    // Multiply the rest, skipping packed DC/Nyquist components
+    DSPSplitComplex sc1 = frame1.dspSplitComplex();
+    sc1.realp++;
+    sc1.imagp++;
+    DSPSplitComplex sc2 = frame2.dspSplitComplex();
+    sc2.realp++;
+    sc2.imagp++;
+    unsigned halfSize = m_FFTSize / 2;
+    // Complex multiply
+    vDSP_zvmul(&sc1, 1, &sc2, 1, &sc1, 1, halfSize - 1, 1 /* normal multiplication */);
+    // We've previously scaled the packed part, now scale the rest.....
+    vDSP_vsmul(sc1.realp, 1, &scale, sc1.realp, 1, halfSize - 1);
+    vDSP_vsmul(sc1.imagp, 1, &scale, sc1.imagp, 1, halfSize - 1);
+    VectorMath::vsmul(realP1, 1, &scale, realP1, 1, halfSize);
+    VectorMath::vsmul(imagP1, 1, &scale, imagP1, 1, halfSize);
+}

Note: See TracChangeset for help on using the changeset viewer.