Changeset 101894 in webkit


Ignore:
Timestamp:
Dec 2, 2011 6:43:43 PM (12 years ago)
Author:
commit-queue@webkit.org
Message:

-Implement the SSE optimization for vsmul and vadd.
https://bugs.webkit.org/show_bug.cgi?id=73182

Patch by James Wei <james.wei@intel.com> & Xingnan Wang <xingnan.wang@intel.com> on 2011-12-02
Reviewed by Kenneth Russell.

  • platform/audio/VectorMath.cpp:

(WebCore:VectorMath):

Location:
trunk/Source/WebCore
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/WebCore/ChangeLog

    r101890 r101894  
     12011-12-02  James Wei <james.wei@intel.com> & Xingnan Wang <xingnan.wang@intel.com>
     2
     3        -Implement the SSE optimization for vsmul and vadd.
     4        https://bugs.webkit.org/show_bug.cgi?id=73182
     5
     6        Reviewed by Kenneth Russell.
     7
     8        * platform/audio/VectorMath.cpp:
     9        (WebCore:VectorMath):
     10
    1112011-12-02  David Grogan  <dgrogan@chromium.org>
    212
  • trunk/Source/WebCore/platform/audio/VectorMath.cpp

    r95901 r101894  
    3333#endif
    3434
     35#ifdef __SSE2__
     36#include <emmintrin.h>
     37#endif
     38
    3539namespace WebCore {
    3640
     
    4044// On the Mac we use the highly optimized versions in Accelerate.framework
    4145// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as
    42 // our namespaced function names, so we must handle this case differently.  Other architectures (64bit, ARM, etc.) do not include this header file.
     46// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
    4347
    4448void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
     
    6468void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
    6569{
    66     // FIXME: optimize for SSE
     70#ifdef __SSE2__
     71    if ((sourceStride == 1) && (destStride == 1)) {
     72       
     73        int n = framesToProcess;
     74        float k = *scale;
     75
     76        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
     77        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
     78            *destP = k * *sourceP;
     79            sourceP++;
     80            destP++;
     81            n--;
     82        }
     83
     84        // Now the sourceP address is aligned, so start applying SSE.
     85        int group = n / 4;
     86        __m128 mScale = _mm_set_ps1(k);
     87        __m128* pSource;
     88        __m128* pDest;
     89        __m128 dest;
     90
     91
     92        if (reinterpret_cast<size_t>(destP) & 0x0F) {
     93            while (group--) {
     94                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
     95                dest = _mm_mul_ps(*pSource, mScale);
     96                _mm_storeu_ps(destP, dest);
     97
     98                sourceP += 4;
     99                destP += 4;
     100            }
     101        } else {
     102            while (group--) {
     103                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
     104                pDest = reinterpret_cast<__m128*>(destP);
     105                *pDest = _mm_mul_ps(*pSource, mScale);
     106
     107                sourceP += 4;
     108                destP += 4;
     109            }
     110        }
     111
     112        // Non-SSE handling for the remaining frames (fewer than 4).
     113        n %= 4;
     114        while (n) {
     115            *destP = k * *sourceP;
     116            sourceP++;
     117            destP++;
     118            n--;
     119        }
     120    } else { // If strides are not 1, rollback to normal algorithm.
     121#endif
    67122    int n = framesToProcess;
    68123    float k = *scale;
     
    72127        destP += destStride;
    73128    }
     129#ifdef __SSE2__
     130    }
     131#endif
    74132}
    75133
    76134void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
    77135{
    78     // FIXME: optimize for SSE
     136#ifdef __SSE2__
     137    if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
     138
     139        int n = framesToProcess;
     140
     141        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
     142        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
     143            *destP = *source1P + *source2P;
     144            source1P++;
     145            source2P++;
     146            destP++;
     147            n--;
     148        }
     149
     150        // Now the source1P address is aligned, so start applying SSE.
     151        int group = n / 4;
     152        __m128* pSource1;
     153        __m128* pSource2;
     154        __m128* pDest;
     155        __m128 source2;
     156        __m128 dest;
     157
     158        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
     159        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);
     160
     161        if (source2Aligned && destAligned) { // all aligned
     162            while (group--) {
     163                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
     164                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
     165                pDest = reinterpret_cast<__m128*>(destP);
     166                *pDest = _mm_add_ps(*pSource1, *pSource2);
     167
     168                source1P += 4;
     169                source2P += 4;
     170                destP += 4;
     171            }
     172
     173        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned
     174            while (group--) {
     175                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
     176                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
     177                dest = _mm_add_ps(*pSource1, *pSource2);
     178                _mm_storeu_ps(destP, dest);
     179
     180                source1P += 4;
     181                source2P += 4;
     182                destP += 4;
     183            }
     184
     185        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned
     186            while (group--) {
     187                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
     188                source2 = _mm_loadu_ps(source2P);
     189                pDest = reinterpret_cast<__m128*>(destP);
     190                *pDest = _mm_add_ps(*pSource1, source2);
     191
     192                source1P += 4;
     193                source2P += 4;
     194                destP += 4;
     195            }
     196        } else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned
     197            while (group--) {
     198                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
     199                source2 = _mm_loadu_ps(source2P);
     200                dest = _mm_add_ps(*pSource1, source2);
     201                _mm_storeu_ps(destP, dest);
     202
     203                source1P += 4;
     204                source2P += 4;
     205                destP += 4;
     206            }
     207        }
     208
     209        // Non-SSE handling for the remaining frames (fewer than 4).
     210        n %= 4;
     211        while (n) {
     212            *destP = *source1P + *source2P;
     213            source1P++;
     214            source2P++;
     215            destP++;
     216            n--;
     217        }
     218    } else { // if strides are not 1, rollback to normal algorithm
     219#endif
    79220    int n = framesToProcess;
    80221    while (n--) {
     
    84225        destP += destStride;
    85226    }
     227#ifdef __SSE2__
     228    }
     229#endif
    86230}
    87231
Note: See TracChangeset for help on using the changeset viewer.