Changeset 102312 in webkit
- Timestamp:
- Dec 7, 2011 9:23:42 PM (12 years ago)
- Location:
- trunk/Source/WebCore
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebCore/ChangeLog
r102311 r102312 1 2011-12-07 Xingnan Wang <xingnan.wang@intel.com> 2 3 Implement the SSE optimization in SincResampler::process() 4 https://bugs.webkit.org/show_bug.cgi?id=73789 5 6 Reviewed by Benjamin Poulain. 7 8 Here is about 70% performance improvement on the hot spot of sample convolving. 9 10 * platform/audio/SincResampler.cpp: 11 1 12 2011-12-07 Luke Macpherson <macpherson@chromium.org> 2 13 -
trunk/Source/WebCore/platform/audio/SincResampler.cpp
r98792 r102312 36 36 #include <wtf/MathExtras.h> 37 37 38 #ifdef __SSE2__ 39 #include <emmintrin.h> 40 #endif 41 38 42 using namespace std; 39 43 … … 247 251 int n = m_kernelSize; 248 252 249 // FIXME: add SIMD optimizations for the following. The scalar code-path can probably also be optimized better.250 251 253 #define CONVOLVE_ONE_SAMPLE \ 252 254 input = *inputP++; \ … … 258 260 { 259 261 float input; 262 263 #ifdef __SSE2__ 264 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 265 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) { 266 CONVOLVE_ONE_SAMPLE 267 n--; 268 } 269 270 // Now the inputP is aligned and start to apply SSE. 271 float* endP = inputP + n - n % 4; 272 __m128 mInput; 273 __m128 mK1; 274 __m128 mK2; 275 __m128 mul1; 276 __m128 mul2; 277 278 __m128 sums1 = _mm_setzero_ps(); 279 __m128 sums2 = _mm_setzero_ps(); 280 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F); 281 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F); 282 283 #define LOAD_DATA(l1, l2) \ 284 mInput = _mm_load_ps(inputP); \ 285 mK1 = _mm_##l1##_ps(k1); \ 286 mK2 = _mm_##l2##_ps(k2); 287 288 #define CONVOLVE_4_SAMPLES \ 289 mul1 = _mm_mul_ps(mInput, mK1); \ 290 mul2 = _mm_mul_ps(mInput, mK2); \ 291 sums1 = _mm_add_ps(sums1, mul1); \ 292 sums2 = _mm_add_ps(sums2, mul2); \ 293 inputP += 4; \ 294 k1 += 4; \ 295 k2 += 4; 296 297 if (k1Aligned && k2Aligned) { // both aligned 298 while (inputP < endP) { 299 LOAD_DATA(load, load) 300 CONVOLVE_4_SAMPLES 301 } 302 } else if (!k1Aligned && k2Aligned) { // only k2 aligned 303 while (inputP < endP) { 304 LOAD_DATA(loadu, load) 305 CONVOLVE_4_SAMPLES 306 } 307 } else if (k1Aligned && !k2Aligned) { // only k1 aligned 308 while (inputP < endP) { 309 LOAD_DATA(load, loadu) 310 CONVOLVE_4_SAMPLES 311 } 312 } else { // both non-aligned 313 while (inputP < endP) { 314 LOAD_DATA(loadu, loadu) 315 CONVOLVE_4_SAMPLES 316 } 317 } 318 319 // Summarize the 
SSE results to sum1 and sum2. 320 float* groupSumP = reinterpret_cast<float*>(&sums1); 321 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 322 groupSumP = reinterpret_cast<float*>(&sums2); 323 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 324 325 n %= 4; 326 while (n) { 327 CONVOLVE_ONE_SAMPLE 328 n--; 329 } 330 #else 331 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better. 260 332 261 333 // Optimize size 32 and size 64 kernels by unrolling the while loop. … … 366 438 } 367 439 } 440 #endif 368 441 } 369 442
Note: See TracChangeset for help on using the changeset viewer.