Context Navigation

← Previous Changeset
Next Changeset →

Changeset 101894 in webkit

Timestamp:

Dec 2, 2011 6:43:43 PM (12 years ago)

Author:

commit-queue@webkit.org

Message:

-Implement the SSE optimization for vsmul and vadd.
https://bugs.webkit.org/show_bug.cgi?id=73182

Patch by James Wei <james.wei@intel.com> & Xingnan Wang <xingnan.wang@intel.com> on 2011-12-02
Reviewed by Kenneth Russell.

platform/audio/VectorMath.cpp:

(WebCore:VectorMath):

Location:

trunk/Source/WebCore

Files:

: 2 edited

ChangeLog (modified) (1 diff)
platform/audio/VectorMath.cpp (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/Source/WebCore/ChangeLog

-                      r101890
+                      r101894
+2011-12-02  James Wei <james.wei@intel.com> & Xingnan Wang <xingnan.wang@intel.com>
+        -Implement the SSE optimization for vsmul and vadd.
+        https://bugs.webkit.org/show_bug.cgi?id=73182
+        Reviewed by Kenneth Russell.
+        * platform/audio/VectorMath.cpp:
+        (WebCore:VectorMath):
 2011-12-02  David Grogan  <dgrogan@chromium.org>

trunk/Source/WebCore/platform/audio/VectorMath.cpp

-                      r95901
+                      r101894
 #endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
 namespace WebCore {
 …
 // On the Mac we use the highly optimized versions in Accelerate.framework
 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as
 // our namespaced function names, so we must handle this case differently.  Other architectures (64bit, ARM, etc.) do not include this header file.
+// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
 …
 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
+{
+    // FIXME: optimize for SSE
+#ifdef __SSE2__
+    if ((sourceStride == 1) && (destStride == 1)) {
+        int n = framesToProcess;
+        float k = *scale;
+        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
+        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
+            *destP = k * *sourceP;
+            sourceP++;
+            destP++;
+            n--;
+        }
+        // Now the sourceP address is aligned, so start applying SSE.
+        int group = n / 4;
+        __m128 mScale = _mm_set_ps1(k);
+        __m128* pSource;
+        __m128* pDest;
+        __m128 dest;
+        if (reinterpret_cast<size_t>(destP) & 0x0F) {
+            while (group--) {
+                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
+                dest = _mm_mul_ps(*pSource, mScale);
+                _mm_storeu_ps(destP, dest);
+                sourceP += 4;
+                destP += 4;
+            }
+        } else {
+            while (group--) {
+                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_mul_ps(*pSource, mScale);
+                sourceP += 4;
+                destP += 4;
+            }
+        }
+        // Non-SSE handling for the remaining frames (fewer than 4).
+        n %= 4;
+        while (n) {
+            *destP = k * *sourceP;
+            sourceP++;
+            destP++;
+            n--;
+        }
+    } else { // If strides are not 1, fall back to the normal algorithm.
+#endif
     int n = framesToProcess;
     float k = *scale;
 …
         destP += destStride;
+    }
+#ifdef __SSE2__
+    }
+#endif
+}
 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
+{
+    // FIXME: optimize for SSE
+#ifdef __SSE2__
+    if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
+        int n = framesToProcess;
+        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
+        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
+            *destP = *source1P + *source2P;
+            source1P++;
+            source2P++;
+            destP++;
+            n--;
+        }
+        // Now the source1P address is aligned, so start applying SSE.
+        int group = n / 4;
+        __m128* pSource1;
+        __m128* pSource2;
+        __m128* pDest;
+        __m128 source2;
+        __m128 dest;
+        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
+        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);
+        if (source2Aligned && destAligned) { // all aligned
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_add_ps(*pSource1, *pSource2);
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
+                dest = _mm_add_ps(*pSource1, *pSource2);
+                _mm_storeu_ps(destP, dest);
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                source2 = _mm_loadu_ps(source2P);
+                pDest = reinterpret_cast<__m128*>(destP);
+                *pDest = _mm_add_ps(*pSource1, source2);
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        } else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned
+            while (group--) {
+                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
+                source2 = _mm_loadu_ps(source2P);
+                dest = _mm_add_ps(*pSource1, source2);
+                _mm_storeu_ps(destP, dest);
+                source1P += 4;
+                source2P += 4;
+                destP += 4;
+            }
+        }
+        // Non-SSE handling for the remaining frames (fewer than 4).
+        n %= 4;
+        while (n) {
+            *destP = *source1P + *source2P;
+            source1P++;
+            source2P++;
+            destP++;
+            n--;
+        }
+    } else { // If strides are not 1, fall back to the normal algorithm.
+#endif
     int n = framesToProcess;
     while (n--) {
 …
         destP += destStride;
+    }
+#ifdef __SSE2__
+    }
+#endif
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 101894 in webkit

Legend:

trunk/Source/WebCore/ChangeLog

trunk/Source/WebCore/platform/audio/VectorMath.cpp

Download in other formats: