Changeset 101894 in webkit
- Timestamp:
- Dec 2, 2011 6:43:43 PM (12 years ago)
- Location:
- trunk/Source/WebCore
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebCore/ChangeLog
r101890 r101894 1 2011-12-02 James Wei <james.wei@intel.com> & Xingnan Wang <xingnan.wang@intel.com> 2 3 -Implement the SSE optimization for vsmul and vadd. 4 https://bugs.webkit.org/show_bug.cgi?id=73182 5 6 Reviewed by Kenneth Russell. 7 8 * platform/audio/VectorMath.cpp: 9 (WebCore::VectorMath): 10 1 11 2011-12-02 David Grogan <dgrogan@chromium.org> 2 12 -
trunk/Source/WebCore/platform/audio/VectorMath.cpp
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace WebCore {

// On the Mac the highly optimized vDSP routines from Accelerate.framework are
// used instead; these generic implementations cover all other platforms.
// When SSE2 is available and all strides are 1, four frames are processed per
// iteration. NOTE(review): loads/stores use the _mm_load/_mm_store intrinsics
// rather than dereferencing reinterpret_cast'ed __m128 pointers — the latter
// violates strict aliasing and is undefined behavior.

// Scalar multiply: destP[i * destStride] = *scale * sourceP[i * sourceStride]
// for framesToProcess frames.
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        size_t n = framesToProcess; // size_t: avoid truncating large frame counts
        float k = *scale;

        // If sourceP is not 16-byte aligned, process the first several frames
        // (at most three) separately until it is, so the main loop can use
        // aligned loads.
        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Main SSE loop over groups of four frames; sourceP is now aligned.
        size_t group = n / 4;
        __m128 mScale = _mm_set_ps1(k);

        if (reinterpret_cast<size_t>(destP) & 0x0F) {
            // Destination unaligned: aligned load, unaligned store.
            while (group--) {
                __m128 source = _mm_load_ps(sourceP);
                _mm_storeu_ps(destP, _mm_mul_ps(source, mScale));
                sourceP += 4;
                destP += 4;
            }
        } else {
            // Both pointers aligned: aligned load and store.
            while (group--) {
                __m128 source = _mm_load_ps(sourceP);
                _mm_store_ps(destP, _mm_mul_ps(source, mScale));
                sourceP += 4;
                destP += 4;
            }
        }

        // Scalar handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // Non-unit strides: fall back to the scalar algorithm.
#endif
    size_t n = framesToProcess;
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

// Vector add: destP[i * destStride] =
//     source1P[i * sourceStride1] + source2P[i * sourceStride2]
// for framesToProcess frames.
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        size_t n = framesToProcess; // size_t: avoid truncating large frame counts

        // If source1P is not 16-byte aligned, process the first several frames
        // (at most three) separately until it is.
        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Main SSE loop; source1P is now aligned. source2P and destP may each
        // independently be unaligned, so dispatch once to one of four loops
        // (branching per-iteration would cost on the hot path).
        size_t group = n / 4;
        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                __m128 sum = _mm_add_ps(_mm_load_ps(source1P), _mm_load_ps(source2P));
                _mm_store_ps(destP, sum);
                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned) { // source2 aligned, dest unaligned.
            while (group--) {
                __m128 sum = _mm_add_ps(_mm_load_ps(source1P), _mm_load_ps(source2P));
                _mm_storeu_ps(destP, sum);
                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (destAligned) { // source2 unaligned, dest aligned.
            while (group--) {
                __m128 sum = _mm_add_ps(_mm_load_ps(source1P), _mm_loadu_ps(source2P));
                _mm_store_ps(destP, sum);
                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else { // Both source2 and dest unaligned.
            while (group--) {
                __m128 sum = _mm_add_ps(_mm_load_ps(source1P), _mm_loadu_ps(source2P));
                _mm_storeu_ps(destP, sum);
                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Scalar handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // Non-unit strides: fall back to the scalar algorithm.
#endif
    size_t n = framesToProcess;
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

} // namespace WebCore
Note: See TracChangeset
for help on using the changeset viewer.