Changeset 261827 in webkit
- Timestamp:
- May 18, 2020, 1:17:34 PM (5 years ago)
- Location:
- trunk
- Files:
-
- 1 added
- 9 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WTF/ChangeLog
r261818 r261827 1 2020-05-18 Mark Lam <mark.lam@apple.com> 2 3 Implement a faster findBitInWord() using the hardware ctz instruction. 4 https://bugs.webkit.org/show_bug.cgi?id=212032 5 <rdar://problem/63348086> 6 7 Reviewed by Saam Barati. 8 9 Based on local microbenchmarking, both ARM64 and X86_64 shows a speed up of 10 28% - 95% faster than the loop. 11 12 The largest perf regressions for ctz vs the loop, are when finding 1 in a mostly 13 set word or finding 0 in a mostly cleared word. For example, on X86_64: 14 Find 1 in 0xffffffffffffffff: using loop 8.67 ms, using ctz 6.07 ms, delta 30.0% 15 Find 0 in 0x8000000000000000: using loop 9.01 ms, using ctz 6.54 ms, delta 28.1% 16 17 The largest perf progressions for ctz vs the loop, are the opposite: finding 1 in 18 a mostly cleared word, or finding a 0 in a mostly set word. For example, on X86_64: 19 Find 1 in 0x8000000000000000: using loop 91.4 ms, using ctz 6.48 ms, delta 92.9% 20 Find 0 in 0xffffffffffffffff: using loop 91.7 ms, using ctz 6.95 ms, delta 92.4% 21 22 TL;DR: the microbenchmark methodology used: 23 24 findBitInWord() takes: 25 a. word to scan 26 b. startIndex 27 c. endIndex 28 d. bool value to scan for, either 1 or 0. 29 30 1. Randomly select 1000 startIndex and endIndex pairs. 31 The endIndex is guaranteed to be >= startIndex. 32 33 2. Using a base word of 0xffffffffffffffff (or uint64_t) and 0xffffffff (for uint32_t), 34 shift left by 0 - N (where N is the number of bits in the word) to generate 35 N + 1 words for the test. This produces words with contiguous lengths of 1s 36 and 0s. We're choosing these words to deliberately measure the effects o 37 run lengths of 0s or 1s on the algorithm in use. 38 39 3. For each test word, call findBitInWord() with the start and end indexes chosen 40 in 1 for a 100 iterations. 41 42 4. Run (3) once to search for a true value, and once to search for a false value. 43 44 5. Print the results for each test word and value pair. 45 46 * wtf/BitVector.cpp: 47 * wtf/Bitmap.h: 48 * wtf/StdLibExtras.h: 49 (WTF::findBitInWord): 50 1 51 2020-05-18 Darin Adler <darin@apple.com> 2 52 -
trunk/Source/WTF/wtf/BitVector.cpp
r261661 r261827 30 30 #include <string.h> 31 31 #include <wtf/Assertions.h> 32 #include <wtf/ StdLibExtras.h>32 #include <wtf/MathExtras.h> 33 33 34 34 namespace WTF { -
trunk/Source/WTF/wtf/Bitmap.h
r261179 r261827 23 23 #include <wtf/Atomics.h> 24 24 #include <wtf/HashFunctions.h> 25 #include <wtf/MathExtras.h> 25 26 #include <wtf/StdIntExtras.h> 26 #include <wtf/StdLibExtras.h>27 27 #include <string.h> 28 28 #include <type_traits> -
trunk/Source/WTF/wtf/StdLibExtras.h
r259582 r261827 318 318 319 319 template<typename T> 320 bool findBitInWord(T word, size_t& index, size_t endIndex, bool value) 320 inline unsigned ctz(T value); // Clients will also need to #include MathExtras.h 321 322 template<typename T> 323 bool findBitInWord(T word, size_t& startOrResultIndex, size_t endIndex, bool value) 321 324 { 322 325 static_assert(std::is_unsigned<T>::value, "Type used in findBitInWord must be unsigned"); 323 326 327 constexpr size_t bitsInWord = sizeof(word) * 8; 328 ASSERT_UNUSED(bitsInWord, startOrResultIndex <= bitsInWord && endIndex <= bitsInWord); 329 330 size_t index = startOrResultIndex; 324 331 word >>= index; 325 332 333 #if COMPILER(GCC_COMPATIBLE) && (CPU(X86_64) || CPU(ARM64)) 334 // We should only use ctz() when we know that ctz() is implementated using 335 // a fast hardware instruction. Otherwise, this will actually result in 336 // worse performance. 337 338 word ^= (static_cast<T>(value) - 1); 339 index += ctz(word); 340 if (index < endIndex) { 341 startOrResultIndex = index; 342 return true; 343 } 344 #else 326 345 while (index < endIndex) { 327 if ((word & 1) == static_cast<T>(value)) 346 if ((word & 1) == static_cast<T>(value)) { 347 startOrResultIndex = index; 328 348 return true; 349 } 329 350 index++; 330 351 word >>= 1; 331 352 } 332 333 index = endIndex; 353 #endif 354 355 startOrResultIndex = endIndex; 334 356 return false; 335 357 } -
trunk/Source/bmalloc/ChangeLog
r261667 r261827 1 2020-05-18 Mark Lam <mark.lam@apple.com> 2 3 Implement a faster findBitInWord() using the hardware ctz instruction. 4 https://bugs.webkit.org/show_bug.cgi?id=212032 5 <rdar://problem/63348086> 6 7 Reviewed by Saam Barati. 8 9 Apply same changes to bmalloc's copy of findBitInWord(). 10 11 * bmalloc/Algorithm.h: 12 (bmalloc::ctz): 13 (bmalloc::findBitInWord): 14 1 15 2020-05-13 Yusuke Suzuki <ysuzuki@apple.com> 2 16 -
trunk/Source/bmalloc/bmalloc/Algorithm.h
r261667 r261827 1 1 /* 2 * Copyright (C) 2014 Apple Inc. All rights reserved.2 * Copyright (C) 2014-2020 Apple Inc. All rights reserved. 3 3 * 4 4 * Redistribution and use in source and binary forms, with or without … … 177 177 #define BOFFSETOF(class, field) (reinterpret_cast<ptrdiff_t>(&(reinterpret_cast<class*>(0x4000)->field)) - 0x4000) 178 178 179 template<typename T>180 bool findBitInWord(T word, size_t& index, size_t endIndex, bool value)181 {182 static_assert(std::is_unsigned<T>::value, "Type used in findBitInWord must be unsigned");183 184 word >>= index;185 186 while (index < endIndex) {187 if ((word & 1) == static_cast<T>(value))188 return true;189 index++;190 word >>= 1;191 }192 193 index = endIndex;194 return false;195 }196 197 179 template <typename T> 198 180 constexpr unsigned ctzConstexpr(T value) … … 212 194 } 213 195 return zeroCount; 196 } 197 198 template<typename T> 199 inline unsigned ctz(T value) 200 { 201 constexpr unsigned bitSize = sizeof(T) * CHAR_BIT; 202 203 using UT = typename std::make_unsigned<T>::type; 204 UT uValue = value; 205 206 #if BCOMPILER(GCC_COMPATIBLE) 207 if (uValue) 208 return __builtin_ctzll(uValue); 209 return bitSize; 210 #elif BCOMPILER(MSVC) && !BCPU(X86) 211 unsigned long ret = 0; 212 if (_BitScanForward64(&ret, uValue)) 213 return ret; 214 return bitSize; 215 #else 216 UNUSED_PARAM(bitSize); 217 UNUSED_PARAM(uValue); 218 return ctzConstexpr(value); 219 #endif 220 } 221 222 template<typename T> 223 bool findBitInWord(T word, size_t& startOrResultIndex, size_t endIndex, bool value) 224 { 225 static_assert(std::is_unsigned<T>::value, "Type used in findBitInWord must be unsigned"); 226 constexpr size_t bitsInWord = sizeof(word) * 8; 227 BASSERT(startOrResultIndex <= bitsInWord && endIndex <= bitsInWord); 228 BUNUSED(bitsInWord); 229 230 size_t index = startOrResultIndex; 231 word >>= index; 232 233 #if BCOMPILER(GCC_COMPATIBLE) && (BCPU(X86_64) || BCPU(ARM64)) 234 // We should only use ctz() when we know that ctz() is implementated using 235 // a fast hardware instruction. Otherwise, this will actually result in 236 // worse performance. 237 238 word ^= (static_cast<T>(value) - 1); 239 index += ctz(word); 240 if (index < endIndex) { 241 startOrResultIndex = index; 242 return true; 243 } 244 #else 245 while (index < endIndex) { 246 if ((word & 1) == static_cast<T>(value)) { 247 startOrResultIndex = index; 248 return true; 249 } 250 index++; 251 word >>= 1; 252 } 253 #endif 254 255 startOrResultIndex = endIndex; 256 return false; 214 257 } 215 258 -
trunk/Tools/ChangeLog
r261825 r261827 1 2020-05-18 Mark Lam <mark.lam@apple.com> 2 3 Implement a faster findBitInWord() using the hardware ctz instruction. 4 https://bugs.webkit.org/show_bug.cgi?id=212032 5 <rdar://problem/63348086> 6 7 Reviewed by Saam Barati. 8 9 Add tests to make sure that the ctz implementation matches the loop implementation 10 in behavior. 11 12 * TestWebKitAPI/CMakeLists.txt: 13 * TestWebKitAPI/TestWebKitAPI.xcodeproj/project.pbxproj: 14 * TestWebKitAPI/Tests/WTF/StdLibExtras.cpp: Added. 15 (TestWebKitAPI::testFindBitInWord): 16 (TestWebKitAPI::TEST): 17 1 18 2020-05-18 Wenson Hsieh <wenson_hsieh@apple.com> 2 19 -
trunk/Tools/TestWebKitAPI/CMakeLists.txt
r261790 r261827 84 84 Tests/WTF/ScopedLambda.cpp 85 85 Tests/WTF/SetForScope.cpp 86 Tests/WTF/StdLibExtras.cpp 86 87 Tests/WTF/StringBuilder.cpp 87 88 Tests/WTF/StringConcatenate.cpp -
trunk/Tools/TestWebKitAPI/TestWebKitAPI.xcodeproj/project.pbxproj
r261790 r261827 1144 1144 F6F49C6B15545CA70007F39D /* DOMWindowExtensionNoCache_Bundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F6F49C6615545C8D0007F39D /* DOMWindowExtensionNoCache_Bundle.cpp */; }; 1145 1145 F6FDDDD614241C6F004F1729 /* push-state.html in Copy Resources */ = {isa = PBXBuildFile; fileRef = F6FDDDD514241C48004F1729 /* push-state.html */; }; 1146 FE2BCDC72470FDA300DEC33B /* StdLibExtras.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FE2BCDC62470FC7000DEC33B /* StdLibExtras.cpp */; }; 1146 1147 FE2D9474245EB2F400E48135 /* Bitmap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FE2D9473245EB2DF00E48135 /* Bitmap.cpp */; }; 1147 1148 /* End PBXBuildFile section */ … … 2814 2815 F6FDDDD214241AD4004F1729 /* PrivateBrowsingPushStateNoHistoryCallback.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PrivateBrowsingPushStateNoHistoryCallback.cpp; sourceTree = "<group>"; }; 2815 2816 F6FDDDD514241C48004F1729 /* push-state.html */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.html; path = "push-state.html"; sourceTree = "<group>"; }; 2817 FE2BCDC62470FC7000DEC33B /* StdLibExtras.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StdLibExtras.cpp; sourceTree = "<group>"; }; 2816 2818 FE2D9473245EB2DF00E48135 /* Bitmap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Bitmap.cpp; sourceTree = "<group>"; }; 2817 2819 FEB6F74E1B2BA44E009E4922 /* NakedPtr.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NakedPtr.cpp; sourceTree = "<group>"; }; … … 4027 4029 CD5393C91757BAC400C07123 /* SHA1.cpp */, 4028 4030 E3953F951F2CF32100A76A2E /* Signals.cpp */, 4031 FE2BCDC62470FC7000DEC33B /* StdLibExtras.cpp */, 4029 4032 81B50192140F232300D9EB58 /* StringBuilder.cpp */, 4030 4033 7CD4C26C1E2C0E6E00929470 /* StringConcatenate.cpp */, … … 4695 4698 7C8BFF7123C0107A00C009B3 /* HexNumber.cpp in Sources */, 4696 4699 7C83DEE01D0A590C00FEBCF3 /* IntegerToStringConversion.cpp in Sources */, 4700 FE2BCDC72470FDA300DEC33B /* StdLibExtras.cpp in Sources */, 4697 4701 53FCDE6B229EFFB900598ECF /* IsoHeap.cpp in Sources */, 4698 4702 7CEB62AB223609DE0069CBB0 /* IteratorRange.cpp in Sources */,
Note:
See TracChangeset
for help on using the changeset viewer.