• A futex-based semaphore

    From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Dec 14 19:00:09 2025
    From Newsgroup: comp.lang.c++

    // xsemaphore.h

    #pragma once
    #include <atomic>
    #include <stdexcept>
    #include <cassert>

    // Futex-style counting semaphore.
    //
    // A single 64-bit atomic packs three 21-bit fields:
    //   bits  0..20 : semaphore count
    //   bits 21..41 : pending notifications (wakeups reserved for waiters)
    //   bits 42..62 : number of blocked waiters
    // The Abort template parameter selects the overflow policy:
    // abort() when true, throw xsemaphore::overflow_error when false.
    template<bool Abort = true>
    struct xsemaphore
    {
        // Thrown (only when Abort == false) if a counter field would overflow.
        struct overflow_error : public std::runtime_error
        {
            using std::runtime_error::runtime_error;
        };
        xsemaphore( uint32_t count = 0 ) noexcept( Abort );
        xsemaphore( const xsemaphore & ) = delete;
        xsemaphore &operator=( const xsemaphore & ) = delete;
        void release( uint32_t update = 1 ) noexcept( Abort );
        void acquire() noexcept( Abort );
    private:
        // Overflow handler: abort() or throw, depending on Abort.
        static void abrtThr() noexcept( Abort );
        static constexpr unsigned
            NOTIFY_BASE = 21,   // bit offset of the notify field
            WAITER_BASE = 42;   // bit offset of the waiter field
        static constexpr uint64_t
            MASK21 = 0x1FFFFF,                    // 21-bit field mask
            NOTIFY_MASK = MASK21 << NOTIFY_BASE,  // notify field in place
            COUNTER_VALUE = 1,                    // +1 on the count field
            NOTIFY_VALUE = 1ull << NOTIFY_BASE,   // +1 on the notify field
            WAITER_VALUE = 1ull << WAITER_BASE;   // +1 on the waiter field
        std::atomic_uint64_t m_counters;          // packed count/notify/waiter word
    };

    // xsemaphore.cpp

    #include <algorithm>
    #include "xsemaphore.h"

    using namespace std;

    #if defined(_MSC_VER)
    #pragma warning(push)
    #pragma warning(disable: 4297)
    #endif

    // Construct with an initial count; counts above 2^21-1 trigger the
    // overflow policy (abort or throw, per the Abort parameter).
    template<bool Abort>
    xsemaphore<Abort>::xsemaphore( uint32_t count ) noexcept( Abort ) :
        m_counters( [&]
            {
                // Validate before the atomic is initialized.
                if( count & ~MASK21 )
                    abrtThr();
                return count;
            }() )
    {
    }

    template xsemaphore<true>::xsemaphore( uint32_t ) noexcept;
    template xsemaphore<false>::xsemaphore( uint32_t );

    // Add 'update' to the semaphore count and hand wakeups to blocked
    // waiters.  Wakeups travel through the notify field, so a woken waiter
    // consumes a reserved notification instead of racing other threads for
    // the plain count.
    template<bool Abort>
    void xsemaphore<Abort>::release( uint32_t update ) noexcept( Abort )
    {
        if( !update )
            return;
        if( update & ~MASK21 ) [[unlikely]]
            abrtThr();                      // update itself must fit in 21 bits
        uint32_t wakeUp;
        for( uint64_t ref = m_counters.load( memory_order_relaxed ); ; )
        {
            assert((int64_t)ref >= 0);
            uint32_t count = ref & MASK21;
            count += update;
            if( count & ~MASK21 ) [[unlikely]]
                abrtThr();                  // count field would overflow
            uint32_t waiting = (ref >> WAITER_BASE) & MASK21;
            if( !waiting ) [[likely]]
            {
                // Nobody is blocked: just bump the count.
                if( m_counters.compare_exchange_weak( ref, ref + update, memory_order_release, memory_order_relaxed ) ) [[likely]]
                    return;
                continue;
            }
            // Convert min(update, waiting) waiters into pending notifications.
            wakeUp = min( update, waiting );
            uint32_t notify = ((ref >> NOTIFY_BASE) & MASK21) + wakeUp;
            assert(!(notify & ~MASK21));
            waiting -= wakeUp;
            assert(!(waiting & ~MASK21));
            uint64_t niu = (uint64_t)waiting << WAITER_BASE | (uint64_t)notify << NOTIFY_BASE | count;
            if( m_counters.compare_exchange_weak( ref, niu, memory_order_release, memory_order_relaxed ) ) [[likely]]
                break;
        }
        // Wake the futex: one notify_all when every released unit went to a
        // waiter, otherwise exactly wakeUp single notifications.
        if( wakeUp == update ) [[likely]]
            m_counters.notify_all();
        else
            do
                m_counters.notify_one();
            while( --wakeUp );
    }

    template void xsemaphore<true>::release( uint32_t ) noexcept;
    template void xsemaphore<false>::release( uint32_t );

    // Decrement the semaphore, blocking while no unreserved count is
    // available.  A blocked thread first registers in the waiter field,
    // then sleeps on the futex until release() has reserved a notification,
    // and finally consumes one notification together with one count.
    template<bool Abort>
    void xsemaphore<Abort>::acquire() noexcept( Abort )
    {
        uint64_t ref = m_counters.load( memory_order_relaxed ), niu;
        for( ; ; )
        {
            assert((int64_t)ref >= 0);
            uint32_t
                count = ref & MASK21,
                notified = (ref >> NOTIFY_BASE) & MASK21;
            if( count > notified ) [[likely]]
            {
                // Unreserved counts available: claim one directly.
                if( m_counters.compare_exchange_weak( ref, ref - COUNTER_VALUE, memory_order_acquire, memory_order_relaxed ) ) [[likely]]
                    return;
                continue;
            }
            if( ((ref >> WAITER_BASE) & MASK21) == MASK21 ) [[unlikely]]
                abrtThr();              // waiter field would overflow
            niu = ref + WAITER_VALUE;   // register as a waiter
            if( m_counters.compare_exchange_weak( ref, niu, memory_order_relaxed, memory_order_relaxed ) ) [[likely]]
                break;
        }
        // Sleep until at least one notification is pending.
        do [[unlikely]]
        {
            m_counters.wait( ref, memory_order_relaxed );
            ref = m_counters.load( memory_order_relaxed );
            assert((int64_t)ref >= 0);
        } while( !(ref & NOTIFY_MASK) );
        // Consume one notification and one count.
        do [[unlikely]]
        {
            assert((ref & MASK21) && (int64_t)ref >= 0);
            niu = ref - (NOTIFY_VALUE + COUNTER_VALUE);
        } while( !m_counters.compare_exchange_weak( ref, niu, memory_order_acquire, memory_order_relaxed ) );
    }

    template void xsemaphore<true>::acquire() noexcept;
    template void xsemaphore<false>::acquire();

    // Overflow policy: abort() when Abort, otherwise report the condition
    // to the caller via xsemaphore::overflow_error.
    // Fix: error message read "xsemphore" — corrected to "xsemaphore".
    template<bool Abort>
    void xsemaphore<Abort>::abrtThr() noexcept( Abort )
    {
        if constexpr( Abort )
            abort();
        else
            throw overflow_error( "xsemaphore - semaphore counter too high" );
    }

    #if defined(_MSC_VER)
    #pragma warning(pop)
    #endif

    With usual futex-based semaphores there may be stolen wakeups by threads decrementing the semaphore counter, thereby leaving the wakeups of a signalling thread useless. This has to be compensated with looping. This doesn't happen with my solution.

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Sun Dec 14 13:15:14 2025
    From Newsgroup: comp.lang.c++

    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Dec 15 06:12:42 2025
    From Newsgroup: comp.lang.c++

    Am 14.12.2025 um 22:15 schrieb Chris M. Thomasson:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    There's no architectural description for this.

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Mon Dec 15 12:50:12 2025
    From Newsgroup: comp.lang.c++

    On 12/14/2025 9:12 PM, Bonita Montero wrote:
    Am 14.12.2025 um 22:15 schrieb Chris M. Thomasson:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    There's no architectural description for this.


    What do you mean?
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Dec 16 13:44:37 2025
    From Newsgroup: comp.lang.c++

    Am 15.12.2025 um 21:50 schrieb Chris M. Thomasson:
    On 12/14/2025 9:12 PM, Bonita Montero wrote:
    Am 14.12.2025 um 22:15 schrieb Chris M. Thomasson:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h
    [...]
    What about the simple semaphore?
    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html
    There's no architectural description for this.
    What do you mean?

    I'm interested in the implementation, not how to use it.


    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From David Brown@david.brown@hesbynett.no to comp.lang.c++ on Tue Dec 16 20:54:08 2025
    From Newsgroup: comp.lang.c++

    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system, but it
    was a mutex rather than a semaphore paired with an atomic counter. The
    result was massively faster than the original mutex.

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Tue Dec 16 12:07:52 2025
    From Newsgroup: comp.lang.c++

    On 12/16/2025 4:44 AM, Bonita Montero wrote:
    Am 15.12.2025 um 21:50 schrieb Chris M. Thomasson:
    On 12/14/2025 9:12 PM, Bonita Montero wrote:
    Am 14.12.2025 um 22:15 schrieb Chris M. Thomasson:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h
    [...]
    What about the simple semaphore?
    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html
    There's no architectural description for this.
    What do you mean?

    I'm interested in the implementation, not how to use it.



    Well, the implementation of it only uses fetch-and-add, (XADD) over in
    x86/x64 world. Also, its loopless.
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Tue Dec 16 12:12:02 2025
    From Newsgroup: comp.lang.c++

    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system, but it
    was a mutex rather than a semaphore paired with an atomic counter.-a The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it turns
    it into a bakery type algo...
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From David Brown@david.brown@hesbynett.no to comp.lang.c++ on Wed Dec 17 10:31:18 2025
    From Newsgroup: comp.lang.c++

    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system, but
    it was a mutex rather than a semaphore paired with an atomic counter.
    The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it turns
    it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Terminology varies, and I don't know of any kind of "official"
    definitions. So I am just going to describe the terms as I am used to
    them from the RTOS's and systems I am familiar with.

    A semaphore is a signalling mechanism, and can be used to synchronise
    between threads and control waking and sleeping. Critical to semaphore behaviour is that they are counters that can be incremented or
    decremented by different threads - producer threads increment the
    counters, and consumer threads decrement them. A consumer that tries to decrement a semaphore that is currently at zero, blocks. When a
    producer increments a semaphore from zero, one or more sleeping threads
    may be woken. A binary semaphore is just a semaphore that should not be incremented past 1.

    A "benephore", as I understand it, is an atomic counter paired with a
    binary semaphore. Instead of using potentially costly OS-level
    increment and decrement operation for all changes to the semaphore
    count, it uses cheap atomic operations except in the cases of changes
    between 0 and 1 - those use the OS semaphore mechanisms so that threads
    can be blocked or woken as appropriate.

    A mutex is a resource control mechanism for handling exclusive access to
    a protected resource (data structure, hardware, whatever). When a
    thread locks the mutex, any other thread attempting to lock the same
    mutex will block until the first thread releases the mutex. Locked
    mutex's have an owner, can only be released by that owner. (Semaphores
    have no owner and can be incremented and decremented by any thread.) In
    an RTOS, you also usually have some kind of priority inheritance system
    - if a high priority thread tries to take a mutex that is currently
    locked by a low priority thread, that low priority thread gets
    temporarily boosted to a high priority level so that the real high
    priority task does not wait too long.

    So mutex's and semaphores are very different types of object, used for
    very different purposes - even though their implementations may share a
    lot of common code. (In FreeRTOS, for example, they are both
    specialisations of the same structure - a message queue.)

    The "frutex" (fast recursive mutex) mechanism I used was build from an
    atomic state variable, a counter for recursive use in the same thread,
    an owner variable, and a mutex. Here the mutex is normally in the
    locked state, and the atomic state variable was normally 0 (unlocked).
    When a thread wants to lock the frutex, it first checks if it is the
    current owner - if so, it can increment the recursion counter (without
    even needing atomic accesses) and return "locked". Otherwise, inside a critical section (much easier in a single processor embedded system) it
    checks if the frutex state is free - if so, set it to locked, set the
    owner to the current thread, set the recursive counter to 1, and exit
    the critical section and the "lock" function. If the state variable
    shows that the lock is not free, then increment the state (which then
    tracks the number of waiting threads), transfer the ownership of the
    OS-level mutex to the current owner of the frutex, and exit the critical section (but not the lock function - we still don't have the lock). Now
    we do a normal blocking OS lock of the mutex, and when we get it we set
    the frutex owner to ourself and the count to 1. Unlocking is mostly
    simple - if we own the frutex recursively, decrement the counter. If no
    one else is waiting for the lock (the state is 1), clear the owner and
    set the state to 0. But if there is one or more thread waiting (state
    is greater than 1), decrement the state and release the OS mutex. That
    will immediately wake a thread that is waiting for it.

    All this means that the common case - taking and releasing the lock with
    there is no contention - is done with a few reads and writes of
    variables within a critical section. (Critical sections are cheap on
    small single-core embedded systems.) Actual OS mutex handling, along
    with thread priority changes and potential scheduler calls, is only used
    when required to handle contention.







    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Wed Dec 17 12:34:27 2025
    From Newsgroup: comp.lang.c++

    Am 17.12.2025 um 10:31 schrieb David Brown:
    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system,
    but it was a mutex rather than a semaphore paired with an atomic
    counter.-a The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it
    turns it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Terminology varies, and I don't know of any kind of "official" definitions.-a So I am just going to describe the terms as I am used to
    them from the RTOS's and systems I am familiar with.

    A semaphore is a signalling mechanism, and can be used to synchronise between threads and control waking and sleeping. Critical to semaphore behaviour is that they are counters that can be incremented or
    decremented by different threads - producer threads increment the
    counters, and consumer threads decrement them.-a A consumer that tries
    to decrement a semaphore that is currently at zero, blocks.-a When a producer increments a semaphore from zero, one or more sleeping
    threads may be woken.-a A binary semaphore is just a semaphore that
    should not be incremented past 1.

    A "benephore", as I understand it, is an atomic counter paired with a
    binary semaphore.-a Instead of using potentially costly OS-level
    increment and decrement operation for all changes to the semaphore
    count, it uses cheap atomic operations except in the cases of changes between 0 and 1 - those use the OS semaphore mechanisms so that
    threads can be blocked or woken as appropriate.

    A mutex is a resource control mechanism for handling exclusive access
    to a protected resource (data structure, hardware, whatever).-a When a thread locks the mutex, any other thread attempting to lock the same
    mutex will block until the first thread releases the mutex.-a Locked
    mutex's have an owner, can only be released by that owner.-a
    (Semaphores have no owner and can be incremented and decremented by
    any thread.)-a In an RTOS, you also usually have some kind of priority inheritance system - if a high priority thread tries to take a mutex
    that is currently locked by a low priority thread, that low priority
    thread gets temporarily boosted to a high priority level so that the
    real high priority task does not wait too long.

    So mutex's and semaphores are very different types of object, used for
    very different purposes - even though their implementations may share
    a lot of common code.-a (In FreeRTOS, for example, they are both specialisations of the same structure - a message queue.)

    The "frutex" (fast recursive mutex) mechanism I used was build from an atomic state variable, a counter for recursive use in the same thread,
    an owner variable, and a mutex.-a Here the mutex is normally in the
    locked state, and the atomic state variable was normally 0 (unlocked).
    When a thread wants to lock the frutex, it first checks if it is the
    current owner - if so, it can increment the recursion counter (without
    even needing atomic accesses) and return "locked".-a Otherwise, inside
    a critical section (much easier in a single processor embedded system)
    it checks if the frutex state is free - if so, set it to locked, set
    the owner to the current thread, set the recursive counter to 1, and
    exit the critical section and the "lock" function.-a If the state
    variable shows that the lock is not free, then increment the state
    (which then tracks the number of waiting threads), transfer the
    ownership of the OS-level mutex to the current owner of the frutex,
    and exit the critical section (but not the lock function - we still
    don't have the lock).-a Now we do a normal blocking OS lock of the
    mutex, and when we get it we set the frutex owner to ourself and the
    count to 1.-a Unlocking is mostly simple - if we own the frutex
    recursively, decrement the counter.-a If no one else is waiting for the
    lock (the state is 1), clear the owner and set the state to 0.-a But if there is one or more thread waiting (state is greater than 1),
    decrement the state and release the OS mutex.-a That will immediately
    wake a thread that is waiting for it.

    All this means that the common case - taking and releasing the lock
    with there is no contention - is done with a few reads and writes of variables within a critical section.-a (Critical sections are cheap on
    small single-core embedded systems.)-a Actual OS mutex handling, along
    with thread priority changes and potential scheduler calls, is only
    used when required to handle contention.

    You're carrying coals to Newcastle.
    Chris knows that.

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Wed Dec 17 15:43:37 2025
    From Newsgroup: comp.lang.c++

    That's my final version of my semaphore class. I had to deal
    with stolen wakeups following spurious wakeups (possible
    with futexes).

    #pragma once
    #include <atomic>
    #include <stdexcept>
    #include <cassert>

    // Final version of the futex-based semaphore.  One 64-bit atomic packs
    // three 21-bit fields: count (bits 0..20), pending notifications
    // (bits 21..41), and blocked waiters (bits 42..62).
    struct xsemaphore
    {
        xsemaphore( uint32_t count = 0 );
        xsemaphore( const xsemaphore & ) = delete;
        xsemaphore &operator=( const xsemaphore & ) = delete;
        // Returns the part of 'update' that did not fit into the count field.
        unsigned release( uint32_t update = 1 ) noexcept;
        // Returns false only when the waiter field is already saturated.
        bool acquire() noexcept;
    private:
        // Debug-only invariant check of a packed counter word.
        bool check( uint64_t counters );
        static constexpr unsigned
            NOTIFY_BASE = 21,   // bit offset of the notify field
            WAITER_BASE = 42;   // bit offset of the waiter field
        static constexpr uint64_t
            MASK21 = 0x1FFFFF,                    // 21-bit field mask
            NOTIFY_MASK = MASK21 << NOTIFY_BASE,  // notify field in place
            WAITER_MASK = MASK21 << WAITER_BASE,  // waiter field in place
            COUNTER_VALUE = 1,                    // +1 on the count field
            NOTIFY_VALUE = 1ull << NOTIFY_BASE,   // +1 on the notify field
            WAITER_VALUE = 1ull << WAITER_BASE;   // +1 on the waiter field
        std::atomic_uint64_t m_counters;          // packed count/notify/waiter word
    };

    // Trivial futex semaphore used for comparison: a bare 32-bit count.
    struct s_semaphore
    {
        s_semaphore( uint32_t count = 0 );
        // Fix: the posted code deleted "s_semaphore( const xsemaphore & )" —
        // an apparent copy-paste slip; the deleted copy constructor must
        // take s_semaphore.
        s_semaphore( const s_semaphore & ) = delete;
        s_semaphore &operator=( const s_semaphore & ) = delete;
        void release( uint32_t update, bool all ) noexcept;
        void acquire() noexcept;
    private:
        std::atomic_uint32_t m_count;   // plain semaphore count / futex word
    };

    inline s_semaphore::s_semaphore( uint32_t count ) :
        m_count( count )
    {
    }

    #include <algorithm>
    #include "xsemaphore.h"

    using namespace std;

    #if defined(_MSC_VER)
    #pragma warning(push)
    #pragma warning(disable: 4297)
    #endif

    // Construct with an initial count; counts above 2^21-1 are rejected
    // with std::overflow_error (via "using namespace std").
    // Fix: error message read "xsemphore" — corrected to "xsemaphore".
    xsemaphore::xsemaphore( uint32_t count ) :
        m_counters( [&]
            {
                // Validate before the atomic is initialized.
                if( count > MASK21 )
                    throw overflow_error( "xsemaphore - semaphore counter too high" );
                return count;
            }() )
    {
    }

    unsigned xsemaphore::release( uint32_t update ) noexcept
    {
    -a -a if( !update )
    -a -a -a -a return 0;
    -a -a uint32_t wakeUp, beyond;
    -a -a for( uint64_t ref = m_counters.load( memory_order_relaxed ); ; ) [[unlikely]]
    -a -a {
    -a -a -a -a assert(check( ref ));
    -a -a -a -a uint32_t count = ref & MASK21;
    -a -a -a -a if( count == MASK21 ) [[unlikely]]
    -a -a -a -a -a -a return update;
    -a -a -a -a uint32_t clippedUpdate = min( update, (uint32_t)(MASK21 - count) );
    -a -a -a -a beyond = update - clippedUpdate;
    -a -a -a -a update = clippedUpdate;
    -a -a -a -a count += update;
    -a -a -a -a uint32_t waiting = (ref >> WAITER_BASE) & MASK21;
    -a -a -a -a if( !waiting ) [[likely]]
    -a -a -a -a -a -a if( m_counters.compare_exchange_weak( ref, (ref & ~MASK21) | count, memory_order_release, memory_order_relaxed ) ) [[likely]]
    -a -a -a -a -a -a -a -a return beyond;
    -a -a -a -a -a -a else
    -a -a -a -a -a -a -a -a continue;
    -a -a -a -a wakeUp = min( update, waiting );
    -a -a -a -a uint32_t notify = ((ref >> NOTIFY_BASE) & MASK21) + wakeUp;
    -a -a -a -a assert(!(notify & ~MASK21));
    -a -a -a -a waiting -= wakeUp;
    -a -a -a -a assert(!(waiting & ~MASK21));
    -a -a -a -a uint64_t niu = (uint64_t)waiting << WAITER_BASE | (uint64_t)notify << NOTIFY_BASE | count;
    -a -a -a -a if( m_counters.compare_exchange_weak( ref, niu, memory_order_release, memory_order_relaxed ) ) [[likely]]
    -a -a -a -a -a -a break;
    -a -a }
    -a -a assert(wakeUp);
    -a -a if( wakeUp == update ) [[likely]]
    -a -a -a -a m_counters.notify_all();
    -a -a else
    -a -a -a -a do [[likely]]
    -a -a -a -a -a -a m_counters.notify_one();
    -a -a -a -a while( --wakeUp );
    -a -a return beyond;
    }

    bool xsemaphore::acquire() noexcept
    {
    -a -a uint64_t ref = m_counters.load( memory_order_relaxed ), niu;
    -a -a for( ; ; ) [[unlikely]]
    -a -a {
    -a -a -a -a assert(check( ref ));
    -a -a -a -a uint32_t
    -a -a -a -a -a -a count = ref & MASK21,
    -a -a -a -a -a -a notified = (ref >> NOTIFY_BASE) & MASK21;
    -a -a -a -a if( count > notified ) [[likely]]
    -a -a -a -a -a -a if( m_counters.compare_exchange_weak( ref, ref - COUNTER_VALUE, memory_order_acquire, memory_order_relaxed ) )
    -a -a -a -a -a -a -a -a return true;
    -a -a -a -a -a -a else
    -a -a -a -a -a -a -a -a continue;
    -a -a -a -a if( (ref & WAITER_MASK) == WAITER_MASK ) [[unlikely]]
    -a -a -a -a -a -a return false;
    -a -a -a -a uint64_t niu = ref + WAITER_VALUE;
    -a -a -a -a if( m_counters.compare_exchange_weak( ref, niu, memory_order_relaxed, memory_order_relaxed ) ) [[likely]]
    -a -a -a -a -a -a break;
    -a -a }
    -a -a for( ; ; ) [[unlikely]]
    -a -a {
    -a -a -a -a do [[unlikely]]
    -a -a -a -a {
    -a -a -a -a -a -a m_counters.wait( ref, memory_order_relaxed );
    -a -a -a -a -a -a ref = m_counters.load( memory_order_relaxed );
    -a -a -a -a -a -a assert(check( ref ));
    -a -a -a -a } while( !(ref & NOTIFY_MASK) );
    -a -a -a -a do [[unlikely]]
    -a -a -a -a {
    -a -a -a -a -a -a niu = ref - (NOTIFY_VALUE + COUNTER_VALUE);
    -a -a -a -a -a -a if( m_counters.compare_exchange_weak( ref, niu, memory_order_acquire, memory_order_relaxed ) ) [[likely]]
    -a -a -a -a -a -a -a -a return true;
    -a -a -a -a -a -a assert(check( ref ));
    -a -a -a -a } while( ref & NOTIFY_MASK );
    -a -a }
    }

    inline bool xsemaphore::check( uint64_t ref )
    {
    #if !defined(NDEBUG)
    -a -a uint32_t
    -a -a -a -a count = ref & MASK21,
    -a -a -a -a notified = (ref >> NOTIFY_BASE) & MASK21;
    -a -a return (int64_t)ref >= 0 && count >= notified;
    #else
    -a -a return true;
    #endif
    }

    void s_semaphore::release( uint32_t update, bool all ) noexcept
    {
    -a -a if( !update )
    -a -a -a -a return;
    -a -a m_count.fetch_add( update, memory_order_release );
    -a -a if( !all )
    -a -a -a -a do
    -a -a -a -a -a -a m_count.notify_one();
    -a -a -a -a while( --update );
    -a -a else
    -a -a -a -a m_count.notify_all();
    }

    void s_semaphore::acquire() noexcept
    {
    -a -a uint32_t ref;
    -a -a do
    -a -a -a -a while( !(ref = m_count.load( memory_order_relaxed )) )
    -a -a -a -a -a -a m_count.wait( ref, memory_order_relaxed );
    -a -a while( !m_count.compare_exchange_strong( ref, ref - 1, memory_order_acquire ) );
    }

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Wed Dec 17 12:48:16 2025
    From Newsgroup: comp.lang.c++

    On 12/17/2025 1:31 AM, David Brown wrote:
    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system, but
    it was a mutex rather than a semaphore paired with an atomic counter.
    The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it
    turns it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Indeed.

    [...]

    Distributed mutex? Yes, I know about them. There were some interesting
    algos back in the day. It's going to be hard to try to find my old
    threads on comp.arch, sigh.

    One point, recursive locks are, well, I personally hate them... I had to
    debug a case where somebody was using recursive locks and I had to artificially step and pause threads to a point where I found the
    deadlock. Tedious, uggg. A single thread was deadlocked several
    recursions down. What a nightmare. Anyway, I digress.

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Wed Dec 17 13:41:52 2025
    From Newsgroup: comp.lang.c++

    On 12/16/2025 12:07 PM, Chris M. Thomasson wrote:
    On 12/16/2025 4:44 AM, Bonita Montero wrote:
    Am 15.12.2025 um 21:50 schrieb Chris M. Thomasson:
    On 12/14/2025 9:12 PM, Bonita Montero wrote:
    Am 14.12.2025 um 22:15 schrieb Chris M. Thomasson:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h
    [...]
    What about the simple semaphore?
    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html
    There's no architectural description for this.
    What do you mean?

    I'm interested in the implementation, not how to use it.



    Well, the implementation of it only uses fetch-and-add, (XADD) over in x86/x64 world. Also, its loopless.

    I need to clarify. Okay, on the x86 a LOCK XADD will make for a loopless
    impl. If were on another system and that XADD is some sort of LL/SC
    loop, well, that causes damage to my loopless claim... ;^o
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Wed Dec 17 14:24:19 2025
    From Newsgroup: comp.lang.c++

    On 12/17/2025 12:48 PM, Chris M. Thomasson wrote:
    On 12/17/2025 1:31 AM, David Brown wrote:
    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system,
    but it was a mutex rather than a semaphore paired with an atomic
    counter. The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it
    turns it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Indeed.

    [...]

    Distributed mutex? Yes, I know about them. There were some interesting
    algos back in the day. It's going to be hard to try to find my old
    threads on comp.arch, sigh.

    One point, recursive locks are, well, I personally hate them... I had to debug a case where somebody was using recursive locks and I had to artificially step and pause threads to a point where I found the
    deadlock. Tedious, uggg. A single thread was deadlocked several
    recursions down. What a nightmare. Anyway, I digress.


    Damn it! I still can't find my old posts on comp.arch. But it was a
    queued lock, iirc, much like this one:

    https://lwn.net/Articles/590243/
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Wed Dec 17 14:31:59 2025
    From Newsgroup: comp.lang.c++

    On 12/17/2025 2:24 PM, Chris M. Thomasson wrote:
    [...]

    I found an old post of mine, but I still cannot find that old white
    paper about the MLOCK! Damn it!!!

    https://groups.google.com/g/comp.arch/c/aLVoxdQdRac/m/2FTiVLZm5JIJ

    BTW Here is M lock:
    __________________________________________________
    // "M lock" — a queued spinlock posted for discussion.  Each thread
    // brings a per-thread 'thread' record; lock() enqueues the thread's
    // flag node by exchanging it into m_flag and spins on the previous
    // tail's flag.  NOTE(review): mb_relaxed/mb_consume/mb_acquire/
    // mb_release, linear_backoff and RL_ASSERT are not defined in this
    // listing — presumably Relacy-style aliases for std::memory_order_*
    // plus a backoff helper; verify against the poster's test harness.
    struct m_spinlock
    {
    // One queue node.  m_next links nodes in a thread's local free list;
    // m_state is set to 1 while its owner holds or waits for the lock and
    // cleared to 0 to hand the lock to the spinning successor.
    struct flag
    {
    flag* m_next;
    std::atomic<unsigned> m_state;
    flag() : m_next(NULL), m_state(0) {}
    };


    // Per-thread bookkeeping: a free list of recycled nodes (m_flags), the
    // node currently used for locking (m_cur), and the predecessor's node
    // obtained from the exchange in lock() (m_prev).
    struct thread
    {
    flag* m_flags;
    flag* m_cur;
    flag* m_prev;

    thread()
    : m_flags(new flag()), m_cur(new flag()), m_prev(NULL) {}

    ~thread()
    {
    // Free the whole local free list.
    while (m_flags)
    {
    flag* next = m_flags->m_next;
    delete m_flags;
    m_flags = next;
    }

    delete m_cur;

    // A held predecessor node here would mean an unlock() was missed.
    RL_ASSERT(! m_prev);
    }
    };


    // Tail of the waiter queue; NULL means the lock is free.
    std::atomic<flag*> m_flag;


    m_spinlock() : m_flag(NULL) {}


    // Acquire: publish our node as the new tail, then spin (with backoff)
    // until the previous tail owner clears its node's state.
    void lock(thread& t)
    {
    t.m_cur->m_state.store(1, mb_relaxed);
    std::atomic_thread_fence(mb_release);

    t.m_prev = m_flag.exchange(t.m_cur, mb_consume);

    if (t.m_prev)
    {
    // Someone was ahead of us: wait on their node.
    linear_backoff backoff;
    while (t.m_prev->m_state.load(mb_relaxed))
    backoff.yield();
    }

    std::atomic_thread_fence(mb_acquire);
    }


    // Release: if we are still the tail, swing m_flag back to NULL (lock
    // free again); otherwise clear our node's state so the successor stops
    // spinning.  Nodes are recycled through the per-thread free list.
    void unlock(thread& t)
    {
    std::atomic_thread_fence(mb_release);

    flag* cur = t.m_cur;
    if (! m_flag.compare_exchange_strong(
    cur,
    NULL,
    mb_relaxed))
    {
    // A successor enqueued after us: hand over the lock, then pick a
    // fresh node (inherited predecessor node or free list) for our
    // next lock() call, since the successor still reads the old one.
    t.m_cur->m_state.store(0, mb_relaxed);

    if (! (t.m_cur = t.m_prev))
    {
    if (! (t.m_cur = t.m_flags))
    {
    t.m_cur = t.m_flags = new flag();
    }

    t.m_flags = t.m_cur->m_next;
    }

    else
    {
    t.m_prev = NULL;
    }
    }

    else if (t.m_prev)
    {
    // Queue went empty: return the inherited predecessor node to the
    // free list.
    t.m_prev->m_next = t.m_flags;
    t.m_flags = t.m_prev;
    t.m_prev = NULL;
    }
    }
    };
    __________________________________________________

    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From David Brown@david.brown@hesbynett.no to comp.lang.c++ on Thu Dec 18 09:13:43 2025
    From Newsgroup: comp.lang.c++

    On 17/12/2025 21:48, Chris M. Thomasson wrote:
    On 12/17/2025 1:31 AM, David Brown wrote:
    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system,
    but it was a mutex rather than a semaphore paired with an atomic
    counter. The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it
    turns it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Indeed.

    [...]

    Distributed mutex? Yes, I know about them. There were some interesting
    algos back in the day. It's going to be hard to try to find my old
    threads on comp.arch, sigh.

    One point, recursive locks are, well, I personally hate them... I had to debug a case where somebody was using recursive locks and I had to artificially step and pause threads to a point where I found the
    deadlock. Tedious, uggg. A single thread was deadlocked several
    recursions down. What a nightmare. Anyway, I digress.


    Certainly recursive locks can make some things harder to understand -
    but they can also make it easier to make well-structured code if you
    need locking at different levels.

    In my case, the locks were for protecting access to a bus that used
    indirect access. That is, instead of a nice, simple "read data from
    address X" instruction, you would first put address X into one register,
    put the "read" command into another register, and then read the data
    register. And clearly it would be a bad idea if another thread mixed in
    its own accesses to the bus at the same time. Thus the "read_data_reg" function would take this lock, handle the indirect access, then release
    the lock.

    But sometimes I needed to read (or write) multiple registers in an
    atomic unit. So the code for that would take the lock, do the
    "read_data_reg" or "write_data_reg" operations, then release the lock.
    If the lock were not recursive, I'd have to duplicate code with "read_data_reg_already_locked" functions, and so on.

    There are, of course, plenty of alternative setups - templates and
    overloaded functions could avoid duplication of source code when making "already locked" and "needs locks" variants of the functions. The
    functions could use a std::optional<> wrapping a lock guard, which could
    be created when needed while still being sure it got released at the
    right point. Ultimately, however, you have to track whether you have
    got the lock already or whether you need it, and a recursive mutex does
    that for you. Any alternative is likely to be at least as difficult to
    follow in the source code.

    And since this is all for a real-time embedded system, speed is
    important - and in particular, consistency and maximum time is
    important. So you don't use new and delete (or any dynamic memory), you
    don't use varying backoff times, you don't use spinlocks (on a single
    process system, a spinlock just kills the system), and you reduce the overheads that you can.


    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Thu Dec 18 16:45:10 2025
    From Newsgroup: comp.lang.c++

    On 12/18/2025 12:13 AM, David Brown wrote:
    On 17/12/2025 21:48, Chris M. Thomasson wrote:
    On 12/17/2025 1:31 AM, David Brown wrote:
    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system,
    but it was a mutex rather than a semaphore paired with an atomic
    counter. The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it
    turns it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Indeed.

    [...]

    Distributed mutex? Yes, I know about them. There were some interesting
    algos back in the day. It's going to be hard to try to find my old
    threads on comp.arch, sigh.

    One point, recursive locks are, well, I personally hate them... I had
    to debug a case where somebody was using recursive locks and I had to
    artificially step and pause threads to a point where I found the
    deadlock. Tedious, uggg. A single thread was deadlocked several
    recursions down. What a nightmare. Anyway, I digress.


    Certainly recursive locks can make some things harder to understand -
    but they can also make it easier to make well-structured code if you
    need locking at different levels.

    Well, yeah. I am a bit biased because of all the recursive nightmare
    code from others that I had to debug in the past. One bad one was a
    server loop that locked its per-socket state then did a callback into
    user code that would call into per-socket functions that would take the
    damn lock again. A deadlock would occur around say, 7 recursions deep. Sigh!


    In my case, the locks were for protecting access to a bus that used
    indirect access. That is, instead of a nice, simple "read data from
    address X" instruction, you would first put address X into one register,
    put the "read" command into another register, and then read the data
    register. And clearly it would be a bad idea if another thread mixed in
    its own accesses to the bus at the same time. Thus the "read_data_reg"
    function would take this lock, handle the indirect access, then release
    the lock.

    But sometimes I needed to read (or write) multiple registers in an
    atomic unit. So the code for that would take the lock, do the
    "read_data_reg" or "write_data_reg" operations, then release the lock.
    If the lock were not recursive, I'd have to duplicate code with
    "read_data_reg_already_locked" functions, and so on.

    There are, of course, plenty of alternative setups - templates and
    overloaded functions could avoid duplication of source code when making
    "already locked" and "needs locks" variants of the functions. The
    functions could use a std::optional<> wrapping a lock guard, which could
    be created when needed while still being sure it got released at the
    right point. Ultimately, however, you have to track whether you have
    got the lock already or whether you need it, and a recursive mutex does
    that for you. Any alternative is likely to be at least as difficult to
    follow in the source code.

    They are there for a reason, its just that I personally do not like
    them. Shit happens...


    And since this is all for a real-time embedded system, speed is
    important - and in particular, consistency and maximum time is
    important. So you don't use new and delete (or any dynamic memory), you
    don't use varying backoff times, you don't use spinlocks (on a single
    process system, a spinlock just kills the system), and you reduce the
    overheads that you can.

    The distributed lock I was trying to find the paper for was basically something used in the kernel as a queued lock. So, not really meant for userspace anyway.

    lol. I remember one I used to benchmark against where each reader thread
    had its own mutex. A writer thread would take all of them. Writes were
    not very frequent. So, a write would be a slow path, so to speak.
    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From David Brown@david.brown@hesbynett.no to comp.lang.c++ on Fri Dec 19 08:57:47 2025
    From Newsgroup: comp.lang.c++

    On 19/12/2025 01:45, Chris M. Thomasson wrote:
    On 12/18/2025 12:13 AM, David Brown wrote:
    On 17/12/2025 21:48, Chris M. Thomasson wrote:
    On 12/17/2025 1:31 AM, David Brown wrote:
    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html

    I implemented something similar in an RTOS on an embedded system, >>>>>> but it was a mutex rather than a semaphore paired with an atomic
    counter. The result was massively faster than the original mutex.


    Can you clarify a bit? A benaphore can be used as a mutex. But, it
    turns it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Indeed.

    [...]

    Distributed mutex? Yes, I know about them. There were some
    interesting algos back in the day. It's going to be hard to try to
    find my old threads on comp.arch, sigh.

    One point, recursive locks are, well, I personally hate them... I had
    to debug a case where somebody was using recursive locks and I had to
    artificially step and pause threads to a point where I found the
    deadlock. Tedious, uggg. A single thread was deadlocked several
    recursions down. What a nightmare. Anyway, I digress.


    Certainly recursive locks can make some things harder to understand -
    but they can also make it easier to make well-structured code if you
    need locking at different levels.

    Well, yeah. I am a bit biased because of all the recursive nightmare
    code from others that I had to debug in the past. One bad one was a
    server loop that locked its per-socket state then did a callback into
    user code that would call into per-socket functions that would take the
    damn lock again. A deadlock would occur around say, 7 recursions deep.
    Sigh!


    Locks should be held around as small an area as possible (both in terms
    of time, and code) - and you certainly should not be calling unknown
    external code while holding a lock. The people behind that particular
    bit of software architecture would probably have made an even more incomprehensible and unreliable solution if they did not have recursive
    locks - such as using a home-made recursive mutex using a non-recursive
    lock and a counter without appropriate care for ordering and atomicity.
    When you are feeling frustrated and annoyed dealing with bad code,
    remember it could always have been worse :-)


    --- Synchronet 3.21a-Linux NewsLink 1.2
  • From Chris M. Thomasson@chris.m.thomasson.1@gmail.com to comp.lang.c++ on Tue Dec 23 13:00:47 2025
    From Newsgroup: comp.lang.c++

    On 12/18/2025 11:57 PM, David Brown wrote:
    On 19/12/2025 01:45, Chris M. Thomasson wrote:
    On 12/18/2025 12:13 AM, David Brown wrote:
    On 17/12/2025 21:48, Chris M. Thomasson wrote:
    On 12/17/2025 1:31 AM, David Brown wrote:
    On 16/12/2025 21:12, Chris M. Thomasson wrote:
    On 12/16/2025 11:54 AM, David Brown wrote:
    On 14/12/2025 22:15, Chris M. Thomasson wrote:
    On 12/14/2025 10:00 AM, Bonita Montero wrote:
    // xsemaphore.h

    [...]

    What about the simple semaphore?

    https://www.haiku-os.org/legacy-docs/benewsletter/Issue1-26.html >>>>>>>
    I implemented something similar in an RTOS on an embedded system, >>>>>>> but it was a mutex rather than a semaphore paired with an atomic >>>>>>> counter. The result was massively faster than the original mutex. >>>>>>>

    Can you clarify a bit? A benaphore can be used as a mutex. But, it >>>>>> turns it into a bakery type algo...

    Mutexes and semaphores are different mechanisms for different uses.

    Indeed.

    [...]

    Distributed mutex? Yes, I know about them. There were some
    interesting algos back in the day. It's going to be hard to try to
    find my old threads on comp.arch, sigh.

    One point, recursive locks are, well, I personally hate them... I
    had to debug a case where somebody was using recursive locks and I
    had to artificially step and pause threads to a point where I found
    the deadlock. Tedious, uggg. A single thread was deadlocked several
    recursions down. What a nightmare. Anyway, I digress.


    Certainly recursive locks can make some things harder to understand -
    but they can also make it easier to make well-structured code if you
    need locking at different levels.

    Well, yeah. I am a bit biased because of all the recursive nightmare
    code from others that I had to debug in the past. One bad one was a
    server loop that locked its per-socket state then did a callback into
    user code that would call into per-socket functions that would take
    the damn lock again. A deadlock would occur around say, 7 recursions
    deep. Sigh!


    Locks should be held around as small an area as possible (both in terms
    of time, and code) - and you certainly should not be calling unknown external code while holding a lock.

    Oh yeah. For sure. Their per-socket mutex, uggg, was used to "protect"
    their per-connection data... Called into unknown user code. Damn it! I digress. Sorry. ;^o

    When I saw a mutex in their per-socket data structure I said oh this is
    going to be fun to debug... Oh shit.


    The people behind that particular
    bit of software architecture would probably have made an even more incomprehensible and unreliable solution if they did not have recursive locks - such as using a home-made recursive mutex using a non-recursive
    lock and a counter without appropriate care for ordering and atomicity.

    Well, that's hard to disagree with. Sometimes they "think, just knew"
    they needed recursive locks when, actually, they did not. Thankfully!
    Praise to god. lol. They did not homebrew it, but used PTHREAD_MUTEX_RECURSIVE.


    When you are feeling frustrated and annoyed dealing with bad code,
    remember it could always have been worse :-)

    God damn, AMEN! Indeed brother! :^)

    Wow. At least I was good at solving tedious deadlocks and race
    conditions back then. But, a recursive lock deadlocking 7 recursions
    deep into user code that I, sigh, also had to look at. Well, shit man. ;^o
    --- Synchronet 3.21a-Linux NewsLink 1.2