Changeset 222893 in webkit


Timestamp: Oct 4, 2017 8:05:42 PM
Author: fpizlo@apple.com
Message:

bmalloc mutex should be adaptive
https://bugs.webkit.org/show_bug.cgi?id=177839

Reviewed by Michael Saboff.

Source/bmalloc:

This pulls the WordLock algorithm into bmalloc, mostly by copy-pasting the code. We need to
copy-paste because sometimes we build WTF without bmalloc, so WTF cannot rely on bmalloc for
anything other than malloc.

  • bmalloc/Algorithm.h:

(bmalloc::compareExchangeWeak):
(bmalloc::compareExchangeStrong):

  • bmalloc/PerThread.h:
  • bmalloc/StaticMutex.cpp:

(bmalloc::StaticMutex::lockSlow):
(bmalloc::StaticMutex::unlockSlow):
(bmalloc::StaticMutex::lockSlowCase): Deleted.

  • bmalloc/StaticMutex.h:

(bmalloc::StaticMutex::try_lock):
(bmalloc::StaticMutex::isLocked const):
(bmalloc::StaticMutex::init):
(bmalloc::StaticMutex::tryLock):
(bmalloc::StaticMutex::lock):
(bmalloc::StaticMutex::unlock):
(bmalloc::sleep): Deleted.
(bmalloc::waitUntilFalse): Deleted.

Source/WTF:

Add some comments that I thought of while copy-pasting this code.

  • wtf/LockAlgorithmInlines.h:
  • wtf/WordLock.cpp:
Location: trunk/Source
Files: 8 edited

Legend: unchanged lines are prefixed with a space, added lines with +, removed lines with -.
  • trunk/Source/WTF/ChangeLog

    r222878 → r222893
    +2017-10-04  Filip Pizlo  <fpizlo@apple.com>
    +
    +        bmalloc mutex should be adaptive
    +        https://bugs.webkit.org/show_bug.cgi?id=177839
    +
    +        Reviewed by Michael Saboff.
    +
    +        Add some comments that I thought of while copy-pasting this code.
    +
    +        * wtf/LockAlgorithmInlines.h:
    +        * wtf/WordLock.cpp:
    +
     2017-10-04  JF Bastien  <jfbastien@apple.com>

  • trunk/Source/WTF/wtf/LockAlgorithmInlines.h

    r220186 → r222893
     // the lock algorithm slow path without recompiling the world. Right now this should be included in two
     // places (Lock.cpp and JSCell.cpp).
    +
    +// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
    +// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
    +// reward.

     namespace WTF {
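
To make that NOTE concrete, here is a small standalone sketch (illustrative names, not WebKit code) contrasting the two regimes it describes: a once-per-operation fast path, where spelling out acquire ordering is worth it, and a looping slow path, where the defaulted seq_cst ordering is negligible next to the cost of yielding or parking.

    #include <atomic>
    #include <cstdint>

    static constexpr uintptr_t isLockedBit = 1;
    static std::atomic<uintptr_t> word;

    // Fast path: runs once per lock(), so a weaker ordering is worth specifying.
    bool fastPathTryLock()
    {
        uintptr_t expected = 0;
        return word.compare_exchange_weak(
            expected, isLockedBit, std::memory_order_acquire, std::memory_order_relaxed);
    }

    // Slow path: loops and sleeps, so the defaulted seq_cst ordering costs
    // nothing measurable and removes a whole class of fencing bugs.
    void slowPathLock()
    {
        for (;;) {
            uintptr_t current = word.load(); // seq_cst by default
            if (!(current & isLockedBit)
                && word.compare_exchange_weak(current, current | isLockedBit)) // seq_cst by default
                return;
            // Queueing and parking elided.
        }
    }
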
  • trunk/Source/WTF/wtf/WordLock.cpp

    r219763 → r222893
     } // anonymous namespace

    +// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
    +// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
    +// reward.
    +
     NEVER_INLINE void WordLockBase::lockSlow()
     {
  • trunk/Source/bmalloc/ChangeLog

    r222731 → r222893
    +2017-10-04  Filip Pizlo  <fpizlo@apple.com>
    +
    +        bmalloc mutex should be adaptive
    +        https://bugs.webkit.org/show_bug.cgi?id=177839
    +
    +        Reviewed by Michael Saboff.
    +
    +        This pulls the WordLock algorithm into bmalloc, mostly by copy-pasting the code. We need to
    +        copy-paste because sometimes we build WTF without bmalloc, so WTF cannot rely on bmalloc for
    +        anything other than malloc.
    +
    +        * bmalloc/Algorithm.h:
    +        (bmalloc::compareExchangeWeak):
    +        (bmalloc::compareExchangeStrong):
    +        * bmalloc/PerThread.h:
    +        * bmalloc/StaticMutex.cpp:
    +        (bmalloc::StaticMutex::lockSlow):
    +        (bmalloc::StaticMutex::unlockSlow):
    +        (bmalloc::StaticMutex::lockSlowCase): Deleted.
    +        * bmalloc/StaticMutex.h:
    +        (bmalloc::StaticMutex::try_lock):
    +        (bmalloc::StaticMutex::isLocked const):
    +        (bmalloc::StaticMutex::init):
    +        (bmalloc::StaticMutex::tryLock):
    +        (bmalloc::StaticMutex::lock):
    +        (bmalloc::StaticMutex::unlock):
    +        (bmalloc::sleep): Deleted.
    +        (bmalloc::waitUntilFalse): Deleted.
    +
     2017-10-02  Yusuke Suzuki  <utatane.tea@gmail.com>

  • trunk/Source/bmalloc/bmalloc/Algorithm.h

    r220097 → r222893
     /*
    - * Copyright (C) 2014 Apple Inc. All rights reserved.
    + * Copyright (C) 2014-2017 Apple Inc. All rights reserved.
      *
      * Redistribution and use in source and binary forms, with or without
    ...
     #include "BAssert.h"
     #include <algorithm>
    +#include <atomic>
     #include <cstdint>
     #include <cstddef>
    ...
     }

    +// We need a CAS API that isn't the badly designed one from C++.
    +template<typename T>
    +bool compareExchangeWeak(std::atomic<T>& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst)
    +{
    +    // They could have made a sensible CAS API, but they didn't. Sneaky fact: for no good reason, the
    +    // C++ API will mutate expected. I think that apologists will say something about how it saves you
    +    // reloading the value around the back of your CAS loop, but that's nonsense. The delay along the
    +    // back edge of a CAS loop has almost no impact on CAS loop performance. More often than not, we
    +    // want to *add* delays to the back edge, not remove them.
    +    return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed);
    +}
    +
    +template<typename T>
    +bool compareExchangeStrong(std::atomic<T>& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst)
    +{
    +    return word.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed);
    +}
    +
     } // namespace bmalloc

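
As an aside, the complaint in that comment is easy to demonstrate (this example is mine, not part of the change): std::atomic's compare_exchange_weak overwrites its expected argument on failure, while the wrapper above takes expected by value, so a CAS loop simply reloads at the top of each iteration.

    #include <atomic>
    #include <cstdio>

    // The wrapper, as added above: 'expected' is a by-value copy, so the
    // caller's variable is never silently overwritten.
    template<typename T>
    bool compareExchangeWeak(std::atomic<T>& word, T expected, T desired,
        std::memory_order order = std::memory_order_seq_cst)
    {
        return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed);
    }

    int main()
    {
        std::atomic<unsigned> word { 0u };

        // Raw C++ API: on failure, 'expected' is rewritten with the current value.
        unsigned expected = 42u;
        word.compare_exchange_weak(expected, 1u);
        std::printf("expected is now %u\n", expected); // prints 0, not 42

        // With the wrapper, the loop reloads explicitly; nothing mutates behind
        // the caller's back.
        for (;;) {
            unsigned old = word.load();
            if (compareExchangeWeak(word, old, old + 1u))
                break;
        }
        std::printf("word is now %u\n", word.load()); // prints 1
    }
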
  • trunk/Source/bmalloc/bmalloc/PerThread.h

    r220352 → r222893
     };

    -#if HAVE_PTHREAD_MACHDEP_H
    -
     class Cache;
     template<typename T> struct PerThreadStorage;
    +
    +#if HAVE_PTHREAD_MACHDEP_H

     // For now, we only support PerThread<PerHeapKind<Cache>>. We can expand to other types by
    ...
     };

    -#else
    +#endif

     template<typename T> struct PerThreadStorage {
    ...
     template<typename T> pthread_key_t PerThreadStorage<T>::s_key;
     template<typename T> std::once_flag PerThreadStorage<T>::s_onceFlag;
    -
    -#endif

     template<typename T>
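
For context, the generic fallback in this file keys per-thread data off a lazily created pthread_key_t, as the s_key and s_onceFlag context lines above suggest. A minimal sketch of that pattern (simplified and non-templated; the names are illustrative, not the real API):

    #include <mutex>
    #include <pthread.h>

    struct Storage {
        static pthread_key_t s_key;
        static std::once_flag s_onceFlag;

        static void destruct(void* p) { delete static_cast<int*>(p); }

        static int* get()
        {
            // Create the key exactly once, registering a destructor that runs
            // at thread exit.
            std::call_once(s_onceFlag, [] {
                pthread_key_create(&s_key, destruct);
            });
            int* value = static_cast<int*>(pthread_getspecific(s_key));
            if (!value) {
                value = new int(0);
                pthread_setspecific(s_key, value);
            }
            return value;
        }
    };

    pthread_key_t Storage::s_key;
    std::once_flag Storage::s_onceFlag;
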
  • trunk/Source/bmalloc/bmalloc/StaticMutex.cpp

    r219763 → r222893
     /*
    - * Copyright (C) 2014 Apple Inc. All rights reserved.
    + * Copyright (C) 2014-2017 Apple Inc. All rights reserved.
      *
      * Redistribution and use in source and binary forms, with or without
    ...
      */

    +#include "StaticMutex.h"
    +
    +#include "PerThread.h"
     #include "ScopeExit.h"
    -#include "StaticMutex.h"
    +#include <condition_variable>
    +#include <mutex>
     #include <thread>

     namespace bmalloc {

    -void StaticMutex::lockSlowCase()
    +// FIXME: This duplicates code from WTF::WordLock.
    +// https://bugs.webkit.org/show_bug.cgi?id=177719
    +
    +namespace {
    +
    +// This data structure serves three purposes:
    +//
    +// 1) A parking mechanism for threads that go to sleep. That involves just a system mutex and
    +//    condition variable.
    +//
    +// 2) A queue node for when a thread is on some lock's queue.
    +//
    +// 3) The queue head. This is kind of funky. When a thread is the head of a queue, it also serves as
    +//    the basic queue bookkeeping data structure. When a thread is dequeued, the next thread in the
    +//    queue takes on the queue head duties.
    +struct ThreadData {
    +    // The parking mechanism.
    +    bool shouldPark { false };
    +    std::mutex parkingLock;
    +    std::condition_variable parkingCondition;
    +
    +    // The queue node.
    +    ThreadData* nextInQueue { nullptr };
    +
    +    // The queue itself.
    +    ThreadData* queueTail { nullptr };
    +};
    +
    +ThreadData* myThreadData()
     {
    -    // The longest critical section in bmalloc is much shorter than the
    -    // time it takes to make a system call to yield to the OS scheduler.
    -    // So, we try again a lot before we yield.
    -    static const size_t aLot = 256;
    +    return PerThread<ThreadData>::get();
    +}
    +
    +} // anonymous namespace
    +
    +// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
    +// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
    +// reward.
    +
    +BNO_INLINE void StaticMutex::lockSlow()
    +{
    +    unsigned spinCount = 0;
    +
    +    // This magic number turns out to be optimal based on past JikesRVM experiments.
    +    const unsigned spinLimit = 40;

    -    if (!m_isSpinning.test_and_set()) {
    -        auto clear = makeScopeExit([&] { m_isSpinning.clear(); });
    -
    -        for (size_t i = 0; i < aLot; ++i) {
    -            if (try_lock())
    +    for (;;) {
    +        uintptr_t currentWordValue = m_word.load();
    +
    +        if (!(currentWordValue & isLockedBit)) {
    +            // It's not possible for someone to hold the queue lock while the lock itself is no longer
    +            // held, since we will only attempt to acquire the queue lock when the lock is held and
    +            // the queue lock prevents unlock.
    +            BASSERT(!(currentWordValue & isQueueLockedBit));
    +            if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isLockedBit)) {
    +                // Success! We acquired the lock.
                     return;
    -        }
    +            }
    +        }
    +
    +        // If there is no queue and we haven't spun too much, we can just try to spin around again.
    +        if (!(currentWordValue & ~queueHeadMask) && spinCount < spinLimit) {
    +            spinCount++;
    +            sched_yield();
    +            continue;
    +        }
    +
    +        // Need to put ourselves on the queue. Create the queue if one does not exist. This requires
    +        // owning the queue for a little bit. The lock that controls the queue is itself a spinlock.
    +        // But before we acquire the queue spinlock, we make sure that we have a ThreadData for this
    +        // thread.
    +        ThreadData* me = myThreadData();
    +        BASSERT(!me->shouldPark);
    +        BASSERT(!me->nextInQueue);
    +        BASSERT(!me->queueTail);
    +
    +        // Reload the current word value, since some time may have passed.
    +        currentWordValue = m_word.load();
    +
    +        // We proceed only if the queue lock is not held, the lock is held, and we succeed in
    +        // acquiring the queue lock.
    +        if ((currentWordValue & isQueueLockedBit)
    +            || !(currentWordValue & isLockedBit)
    +            || !compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit)) {
    +            sched_yield();
    +            continue;
    +        }
    +
    +        me->shouldPark = true;
    +
    +        // We own the queue. Nobody can enqueue or dequeue until we're done. Also, it's not possible
    +        // to release the lock while we hold the queue lock.
    +        ThreadData* queueHead = reinterpret_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
    +        if (queueHead) {
    +            // Put this thread at the end of the queue.
    +            queueHead->queueTail->nextInQueue = me;
    +            queueHead->queueTail = me;
    +
    +            // Release the queue lock.
    +            currentWordValue = m_word.load();
    +            BASSERT(currentWordValue & ~queueHeadMask);
    +            BASSERT(currentWordValue & isQueueLockedBit);
    +            BASSERT(currentWordValue & isLockedBit);
    +            m_word.store(currentWordValue & ~isQueueLockedBit);
    +        } else {
    +            // Make this thread be the queue-head.
    +            queueHead = me;
    +            me->queueTail = me;
    +
    +            // Release the queue lock and install ourselves as the head. No need for a CAS loop, since
    +            // we own the queue lock.
    +            currentWordValue = m_word.load();
    +            BASSERT(!(currentWordValue & ~queueHeadMask));
    +            BASSERT(currentWordValue & isQueueLockedBit);
    +            BASSERT(currentWordValue & isLockedBit);
    +            uintptr_t newWordValue = currentWordValue;
    +            newWordValue |= reinterpret_cast<uintptr_t>(queueHead);
    +            newWordValue &= ~isQueueLockedBit;
    +            m_word.store(newWordValue);
    +        }
    +
    +        // At this point everyone who acquires the queue lock will see me on the queue, and anyone who
    +        // acquires me's parkingLock will see that me wants to park. Note that shouldPark may have been
    +        // cleared as soon as the queue lock was released above, but it will happen while the
    +        // releasing thread holds me's parkingLock.
    +
    +        {
    +            std::unique_lock<std::mutex> locker(me->parkingLock);
    +            while (me->shouldPark)
    +                me->parkingCondition.wait(locker);
    +        }
    +
    +        BASSERT(!me->shouldPark);
    +        BASSERT(!me->nextInQueue);
    +        BASSERT(!me->queueTail);
    +
    +        // Now we can loop around and try to acquire the lock again.
         }
    -
    -    // Avoid spinning pathologically.
    -    while (!try_lock())
    -        sched_yield();
     }

    +BNO_INLINE void StaticMutex::unlockSlow()
    +{
    +    // The fast path can fail either because of spurious weak CAS failure, or because someone put a
    +    // thread on the queue, or the queue lock is held. If the queue lock is held, it can only be
    +    // because someone *will* enqueue a thread onto the queue.
    +
    +    // Acquire the queue lock, or release the lock. This loop handles both lock release in case the
    +    // fast path's weak CAS spuriously failed and it handles queue lock acquisition if there is
    +    // actually something interesting on the queue.
    +    for (;;) {
    +        uintptr_t currentWordValue = m_word.load();
    +
    +        BASSERT(currentWordValue & isLockedBit);
    +
    +        if (currentWordValue == isLockedBit) {
    +            if (compareExchangeWeak(m_word, isLockedBit, clear)) {
    +                // The fast path's weak CAS had spuriously failed, and now we succeeded. The lock is
    +                // unlocked and we're done!
    +                return;
    +            }
    +            // Loop around and try again.
    +            sched_yield();
    +            continue;
    +        }
    +
    +        if (currentWordValue & isQueueLockedBit) {
    +            sched_yield();
    +            continue;
    +        }
    +
    +        // If it wasn't just a spurious weak CAS failure and if the queue lock is not held, then there
    +        // must be an entry on the queue.
    +        BASSERT(currentWordValue & ~queueHeadMask);
    +
    +        if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit))
    +            break;
    +    }
    +
    +    uintptr_t currentWordValue = m_word.load();
    +
    +    // After we acquire the queue lock, the lock must still be held and the queue must be
    +    // non-empty. The queue must be non-empty since only the lockSlow() method could have held the
    +    // queue lock and if it did then it only releases it after putting something on the queue.
    +    BASSERT(currentWordValue & isLockedBit);
    +    BASSERT(currentWordValue & isQueueLockedBit);
    +    ThreadData* queueHead = reinterpret_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
    +    BASSERT(queueHead);
    +
    +    ThreadData* newQueueHead = queueHead->nextInQueue;
    +    // Either this was the only thread on the queue, in which case we delete the queue, or there
    +    // are still more threads on the queue, in which case we create a new queue head.
    +    if (newQueueHead)
    +        newQueueHead->queueTail = queueHead->queueTail;
    +
    +    // Change the queue head, possibly removing it if newQueueHead is null. No need for a CAS loop,
    +    // since we hold the queue lock and the lock itself so nothing about the lock can change right
    +    // now.
    +    currentWordValue = m_word.load();
    +    BASSERT(currentWordValue & isLockedBit);
    +    BASSERT(currentWordValue & isQueueLockedBit);
    +    BASSERT((currentWordValue & ~queueHeadMask) == reinterpret_cast<uintptr_t>(queueHead));
    +    uintptr_t newWordValue = currentWordValue;
    +    newWordValue &= ~isLockedBit; // Release the lock.
    +    newWordValue &= ~isQueueLockedBit; // Release the queue lock.
    +    newWordValue &= queueHeadMask; // Clear out the old queue head.
    +    newWordValue |= reinterpret_cast<uintptr_t>(newQueueHead); // Install new queue head.
    +    m_word.store(newWordValue);
    +
    +    // Now the lock is available for acquisition. But we just have to wake up the old queue head.
    +    // After that, we're done!
    +
    +    queueHead->nextInQueue = nullptr;
    +    queueHead->queueTail = nullptr;
    +
    +    // We do this carefully because this may run either before or during the parkingLock critical
    +    // section in lockSlow().
    +    {
    +        std::lock_guard<std::mutex> locker(queueHead->parkingLock);
    +        queueHead->shouldPark = false;
    +    }
    +    // Doesn't matter if we notify_all() or notify_one() here since the only thread that could be
    +    // waiting is queueHead.
    +    queueHead->parkingCondition.notify_one();
    +
    +    // The old queue head can now contend for the lock again. We're done!
    +}
    +
     } // namespace bmalloc
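
The shouldPark handshake is the heart of the slow path above: lockSlow() publishes itself on the queue, then sleeps on its condition variable, and unlockSlow() clears the flag while holding the same parkingLock, so the wakeup cannot be missed whether it arrives before or during the wait. A standalone distillation of just that handshake (my sketch, not the committed code):

    #include <condition_variable>
    #include <mutex>

    struct Parker {
        bool shouldPark { false };
        std::mutex parkingLock;
        std::condition_variable parkingCondition;
    };

    // Called by the waiting thread after it has made itself visible on the queue.
    void park(Parker& me)
    {
        std::unique_lock<std::mutex> locker(me.parkingLock);
        while (me.shouldPark)
            me.parkingCondition.wait(locker);
    }

    // Called by the unlocking thread. Clearing the flag under parkingLock means
    // the waiter either sees shouldPark == false before sleeping, or is already
    // waiting and receives the notify.
    void unpark(Parker& target)
    {
        {
            std::lock_guard<std::mutex> locker(target.parkingLock);
            target.shouldPark = false;
        }
        target.parkingCondition.notify_one();
    }
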
  • trunk/Source/bmalloc/bmalloc/StaticMutex.h

    r205210 → r222893
     /*
    - * Copyright (C) 2014 Apple Inc. All rights reserved.
    + * Copyright (C) 2014-2017 Apple Inc. All rights reserved.
      *
      * Redistribution and use in source and binary forms, with or without
    ...
     #define StaticMutex_h

    +#include "Algorithm.h"
     #include "BAssert.h"
    +#include "BExport.h"
     #include <atomic>
     #include <mutex>
    ...
     // Use StaticMutex in static storage, where global constructors and exit-time
     // destructors are prohibited, but all memory is zero-initialized automatically.
    +
    +// This uses the code from what WTF used to call WordLock. It's a fully adaptive mutex that uses
    +// sizeof(void*) storage. It has a fast path that is similar to a spinlock, and a slow path that is
    +// similar to std::mutex.
    +
    +// NOTE: In general, this is a great lock to use if you are very low in the stack. WTF continues to
    +// have a copy of this code.
    +
    +// FIXME: Either fold bmalloc into WTF or find a better way to share code between them.
    +// https://bugs.webkit.org/show_bug.cgi?id=177719

     namespace bmalloc {
    ...
     public:
         void lock();
    -    bool try_lock();
    +    bool tryLock();
    +    bool try_lock() { return tryLock(); }
         void unlock();
    +
    +    bool isHeld() const;
    +    bool isLocked() const { return isHeld(); }

     private:
    -    void lockSlowCase();
    +    BEXPORT void lockSlow();
    +    BEXPORT void unlockSlow();

    -    std::atomic_flag m_flag;
    -    std::atomic_flag m_isSpinning;
    +    static const uintptr_t clear = 0;
    +    static const uintptr_t isLockedBit = 1;
    +    static const uintptr_t isQueueLockedBit = 2;
    +    static const uintptr_t queueHeadMask = 3;
    +
    +    std::atomic<uintptr_t> m_word;
     };
    -
    -static inline void sleep(
    -    std::unique_lock<StaticMutex>& lock, std::chrono::milliseconds duration)
    -{
    -    if (duration == std::chrono::milliseconds(0))
    -        return;
    -
    -    lock.unlock();
    -    std::this_thread::sleep_for(duration);
    -    lock.lock();
    -}
    -
    -static inline void waitUntilFalse(
    -    std::unique_lock<StaticMutex>& lock, std::chrono::milliseconds sleepDuration,
    -    bool& condition)
    -{
    -    while (condition) {
    -        condition = false;
    -        sleep(lock, sleepDuration);
    -    }
    -}

     inline void StaticMutex::init()
     {
    -    m_flag.clear();
    -    m_isSpinning.clear();
    +    m_word.store(0, std::memory_order_relaxed);
     }

    -inline bool StaticMutex::try_lock()
    +inline bool StaticMutex::tryLock()
     {
    -    return !m_flag.test_and_set(std::memory_order_acquire);
    +    return compareExchangeWeak(m_word, clear, isLockedBit, std::memory_order_acquire);
     }

     inline void StaticMutex::lock()
     {
    -    if (!try_lock())
    -        lockSlowCase();
    +    if (compareExchangeWeak(m_word, clear, isLockedBit, std::memory_order_acquire))
    +        return;
    +
    +    lockSlow();
     }

     inline void StaticMutex::unlock()
     {
    -    m_flag.clear(std::memory_order_release);
    +    if (compareExchangeWeak(m_word, isLockedBit, clear, std::memory_order_release))
    +        return;
    +
    +    unlockSlow();
     }

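
The four constants above encode the entire lock state in one pointer-sized word: bit 0 is the lock itself, bit 1 is the spinlock guarding the queue, and the remaining bits hold the queue head ThreadData pointer, which works because that pointer is at least 4-byte aligned. A sketch of the encoding (illustrative helper names; BASSERT swapped for assert):

    #include <cassert>
    #include <cstdint>

    struct ThreadData;

    constexpr uintptr_t isLockedBit = 1;      // bit 0: the mutex itself
    constexpr uintptr_t isQueueLockedBit = 2; // bit 1: spinlock guarding the queue
    constexpr uintptr_t queueHeadMask = 3;    // bits above these hold the queue head

    uintptr_t encodeWord(ThreadData* queueHead, bool locked, bool queueLocked)
    {
        uintptr_t word = reinterpret_cast<uintptr_t>(queueHead);
        assert(!(word & queueHeadMask)); // relies on >= 4-byte alignment
        return word | (locked ? isLockedBit : 0) | (queueLocked ? isQueueLockedBit : 0);
    }

    ThreadData* queueHeadOf(uintptr_t word)
    {
        return reinterpret_cast<ThreadData*>(word & ~queueHeadMask);
    }

This layout is also why init() only needs to store zero: a zero-filled static StaticMutex is already in the unlocked, empty-queue state, which is exactly what the header's "static storage" comment relies on.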