Your browser doesn't support the features required by impress.js, so you are presented with a simplified version of this presentation.

For the best experience please use the latest Chrome, Safari or Firefox browser.

Linux threads

User space locking

Introduction

What we will cover

Process

Thread

Synchronization primitive

Linux threads

[glibc/nptl/sysdeps/pthread/createthread.c]
/* Create the kernel-level thread for the thread descriptor PD (excerpt;
   the "[...]" markers stand for code left out of the slide).

   The flag set passed to do_clone is what makes the new task a thread
   rather than a process.  Per clone(2), CLONE_VM shares the address
   space, CLONE_FS the filesystem information and CLONE_FILES the file
   descriptor table with the creator.

   Returns whatever do_clone returns.  */
static int
create_thread (struct pthread *pd, const struct pthread_attr *attr,
               STACK_VARIABLES_PARMS)
{

  int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGNAL
                     | CLONE_SETTLS | CLONE_PARENT_SETTID
                     | CLONE_CHILD_CLEARTID | CLONE_SYSVSEM
                     | 0);
  // [...]

  /* Actually create the thread.  */
  /* NOTE(review): `stopped' is not declared in the visible excerpt;
     presumably it is set up in the elided code above -- confirm.  */
  int res = do_clone (pd, attr, clone_flags, start_thread,
                      STACK_VARIABLES_ARGS, stopped);
  // [...]
  return res;
}

Why do we need thread-specific synchronization primitives?

Let's compare SystemV vs POSIX semaphore

Same code run on two implementations

#include "sem_posix.hpp" // or sem_systemv.hpp

// Benchmark driver: repeatedly drains and refills a semaphore that starts
// with 10000 permits.
//
// argv[1] gives the number of drain/refill rounds. Returns 1 without
// touching the semaphore when that argument is missing, 0 otherwise.
int main(int argc, char *argv[]) {
    // Without this check a run with no arguments would hand argv[1], a null
    // pointer, to atoi().
    if (argc < 2) {
        return 1;
    }

    // Parse the round count once instead of on every loop test.
    const int rounds = atoi(argv[1]);

    my_sem m1(10000);

    for (int j=0; j<rounds; j++) {
        // Take every permit...
        for (int i=0; i<10000; i++) {
            m1.wait();
        }

        // ...then give them all back, so the next round starts full again.
        for (int i=0; i<10000; i++) {
            m1.post();
        }
    }

    return 0;
}

Let's run the programs several times

# Run ./sem11 once for every round count from 10 to 1000 in steps of 25 and
# collect one line per run in sem11.dat.
for i in {10..1000..25}; do
  # Start the line with the round count (no newline, so the timing follows it).
  echo -n "$i " >> sem11.dat;
  # GNU time reports on stderr; "%U %S" is user and system CPU seconds.
  /usr/bin/time -f "%U %S" ./sem11 $i  2>>sem11.dat;
done

Results for both semaphore types

Where is the difference?

Let's consider a shorter example

// Minimal program for the strace comparison: use up all four permits of the
// semaphore, then issue one more wait with an argument of 1.
int main(int argc, char *argv[]) {
    const int permits = 4;
    my_sem m1(permits);

    // Each of these waits finds a permit available.
    int taken = 0;
    while (taken < permits) {
        m1.wait();
        ++taken;
    }

    // No permit is left at this point.
    // NOTE(review): the argument looks like a timeout in seconds (the strace
    // output shows a {1, 0} timespec) -- confirm against my_sem.
    m1.wait(1);
}

Strace for System V

[maciek@pc futex]$ strace ./sem1
execve("./sem1", ["./sem1"], [/* 51 vars */]) = 0
brk(0)                                  = 0x85c000
[...]
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
stat("/tmp/file", {st_mode=S_IFREG|0664, st_size=0, ...}) = 0
semget(0x610101ec, 1, IPC_CREAT|0666)   = 393218
semctl(393218, 0, SETVAL, 0x4)          = 0
semtimedop(393218, {{0, -1, 0}}, 1, NULL) = 0
semtimedop(393218, {{0, -1, 0}}, 1, NULL) = 0
semtimedop(393218, {{0, -1, 0}}, 1, NULL) = 0
semtimedop(393218, {{0, -1, 0}}, 1, NULL) = 0
semtimedop(393218, {{0, -1, 0}}, 1, {1, 0}) = -1 EAGAIN
semctl(393218, 0, IPC_RMID, 0xffffffffffffff60) = 0
exit_group(0)                           = ?
+++ exited with 0 +++

Strace for POSIX

[maciek@pc futex]$ strace ./sem2
execve("./sem2", ["./sem2"], [/* 51 vars */]) = 0
brk(0)                                  = 0x10b8000
[...]
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
futex(0x7fffcbf93650, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, 0,
   {1388604825, 683126976}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
exit_group(0)                           = ?
+++ exited with 0 +++

Conclusions

Kernel vs user mode

User space locking

Kernel space vs user space primitive

Do we have pure user space locks in POSIX threads library?

POSIX spin lock

Lock function

[nptl/sysdeps/x86_64/pthread_spin_lock.S]
/* Acquire the spin lock whose word is at 0(%rdi); returns 0 in %eax.

   Label 1 atomically decrements the word; a result of zero means the lock
   was taken.  Otherwise label 2 spins, reading the word without the LOCK
   prefix, until it is greater than zero, and then retries the decrement.
   "rep; nop" is the encoding of the PAUSE instruction.  */
pthread_spin_lock:
1:      LOCK
        decl    0(%rdi)
        jne     2f
        xor     %eax, %eax
        ret

        .align  16
2:      rep
        nop
        cmpl    $0, 0(%rdi)
        jg      1b
        jmp     2b
* Note the LOCK prefix

Lock function

[nptl/sysdeps/x86_64/pthread_spin_lock.S]
pthread_spin_lock:
1:      LOCK
        decl    0(%rdi)          ; lock--;
        jne     2f               ; if (lock != 0) jump 2;
        xor     %eax, %eax       ; else return 0;
        ret                      ;

        .align  16
2:      rep                      ; do {
        nop                      ;    nop;
        cmpl    $0, 0(%rdi)      ;    if (lock > 0)
        jg      1b               ;       jump 1;
        jmp     2b               ; } while (1);
* Note the LOCK prefix

Unlock function

[nptl/sysdeps/x86_64/pthread_spin_lock.S]
/* Release the spin lock whose word is at (%rdi) by storing 1 into it with
   a plain move (no LOCK prefix); returns 0 in %eax.  */
pthread_spin_unlock:
        movl    $1, (%rdi)
        xorl    %eax, %eax
        retq
* Note the missing LOCK prefix

Spin lock summary

Futex primitive

FUTEX_WAIT operation

FUTEX_WAKE operation

Other futex operations

Futex usage example

Example 1 - for simplicity no assembly and no atomics

#include <limits.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <thread>
#include <vector>

// Issue a raw futex(2) call with FUTEX_WAIT_PRIVATE on `uaddr`, passing
// `val1` as the value argument and no timeout.
// Returns the syscall() result unchanged.
long futex_wait(int* uaddr, int val1) {
  const long status = syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val1,
                              nullptr, nullptr, 0);
  return status;
}

// Issue a raw futex(2) call with FUTEX_WAKE_PRIVATE on `uaddr`, passing
// `val1` as the value argument.
// Returns the syscall() result unchanged.
long futex_wake(int* uaddr, int val1) {
  const long status = syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, val1,
                              nullptr, nullptr, 0);
  return status;
}

// Return std::rand() reduced modulo `max`; for a positive `max` the result
// lies in [0, max).
// A `max` of 0 yields 0 instead of dividing by zero.
long rand_ms(int max) {
  if (max == 0) {
    return 0;
  }
  return std::rand() % max;
}

// A counter that threads can wait on through a futex.
//
// signal() stores a new value and wakes every waiter; wait() returns once
// the counter has reached the requested value.
class step {
public:
    step() : val(0) {}

    // Store `new_val` and wake all threads sleeping on the counter.
    void signal(int new_val) {
        val = new_val;
        futex_wake(&val, INT_MAX);
    }

    // Block until the counter is at least `till`.
    void wait(int till) {
        // `seen` is one snapshot of the counter, used both for the test and
        // as the value handed to futex_wait, so the two always agree.
        for (int seen = val; seen < till; seen = val) {
            futex_wait(&val, seen);
        }
    }

private:
    int val;  // the futex word
};
* Do we need to use atomics here? Why is the tmp variable needed in wait()?
// A counter that threads can wait on through a futex.
//
// signal() stores a new value and wakes every waiter; wait() returns once
// the counter has reached the requested value.
class step {
public:
    step() : val(0) {
    }

    // Store `new_val` and wake all threads sleeping on the counter.
    void signal(int new_val) {
        val = new_val;
        futex_wake(&val, INT_MAX);
    }

    // Block until the counter is at least `till`.
    void wait(int till) {
        while (1) {
            // Read the counter exactly once per iteration. If the test and
            // the futex_wait argument used separate reads, a signal() landing
            // between them could make us pass the already-updated value and
            // go to sleep after the wake-up has been delivered.
            const int snapshot = val;
            if (snapshot >= till) break;
            futex_wait(&val, snapshot);
        }
    }

private:
    int val;  // the futex word
};
 
// Demo: start ten threads that pass a baton through a shared `step`.
// Thread i sleeps a random time, waits until the step reaches i, sleeps
// again and then signals i+1, so the SIG lines are printed in index order.
int main(int argc, char* argv[]) {
    using std::chrono::milliseconds;
    const int kWorkers = 10;

    std::srand(std::time(0));

    // Declared before the threads so it outlives every thread that refers
    // to it.
    step st;

    std::vector<std::thread> threads;
    threads.reserve(kWorkers);

    for (int i=0; i<kWorkers; i++) {
        // Construct the thread in place rather than moving a temporary in.
        threads.emplace_back([&st, i] {
            std::this_thread::sleep_for(milliseconds(rand_ms(2000)));
            std::printf("[%d] WAIT\n", i);
            st.wait(i);

            std::this_thread::sleep_for(milliseconds(rand_ms(500)));
            std::printf("[%d] SIG\n", i);
            st.signal(i+1);
        });
    }

    for (auto& th : threads) {
        th.join();
    }
}

Example console output

[maciek@pc futex_step]$ ./test
[8] WAIT
[4] WAIT
[0] WAIT
[0] SIG
[7] WAIT
[1] WAIT
[1] SIG
[6] WAIT
[9] WAIT
[3] WAIT
[2] WAIT
[2] SIG
[5] WAIT
[3] SIG
[4] SIG
[5] SIG
[6] SIG
[7] SIG
[8] SIG
[9] SIG

Futex mutex example

Example 2 - some atomics, some assembly, mostly C

Atomic operations used in example

Atomic decrement

[glibc/sysdeps/x86_64/bits/atomic.h]
/* Decrement *MEM in place, selecting the instruction by operand size
   (excerpt; only the 4-byte case is shown).  LOCK is a string literal
   pasted in front of the "decl" mnemonic.  */
#define __arch_decrement_body(lock, mem) \
  do {                                                                        \
    if (sizeof (*mem) == 1)                                                   \
       // [...]
    else if (sizeof (*mem) == 2)                                              \
       // [...]
    else if (sizeof (*mem) == 4)                                              \
      __asm __volatile (lock "decl %0"                                        \
                        : "=m" (*mem)                                         \
                        : "m" (*mem),                                         \
                          "i" (offsetof (tcbhead_t, multiple_threads)));      \
    else                                                                      \
       // [...]
  } while (0)

/* The decrement with LOCK_PREFIX supplied as the prefix.  */
#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, mem)

Atomic bit test and set

[glibc/sysdeps/x86_64/bits/atomic.h]
/* Set bit BIT of *MEM and evaluate to the carry flag left by the
   operation (excerpt; only the 4-byte case is shown).  For BTS the carry
   flag holds the previous value of the bit, so a result of 0 means the
   bit was clear before.  */
#define atomic_bit_test_set(mem, bit) \
  ({ unsigned char __result;                                                  \
     if (sizeof (*mem) == 1)                                                  \
        // [...]
     else if (sizeof (*mem) == 2)                                             \
        // [...]
     else if (sizeof (*mem) == 4)                                             \
       __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0"                   \
                         : "=q" (__result), "=m" (*mem)                       \
                         : "m" (*mem), "ir" (bit));                           \
     else                                                                     \
        // [...]
     __result; })

Atomic add and test if zero

[glibc/sysdeps/x86_64/bits/atomic.h]
/* Add VALUE to *MEM and evaluate to the zero flag left by the addition,
   i.e. nonzero exactly when the sum is zero (excerpt; only the 4-byte
   case is shown).  */
#define atomic_add_zero(mem, value) \
  ({ unsigned char __result;                                                  \
     if (sizeof (*mem) == 1)                                                  \
        // [...]
     else if (sizeof (*mem) == 2)                                             \
        // [...]
     else if (sizeof (*mem) == 4)                                             \
       __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1"                   \
                         : "=m" (*mem), "=qm" (__result)                      \
                         : "ir" (value), "m" (*mem));                         \
     else                                                                     \
        // [...]
     __result; })

Futex mutex

Mutex lock

[glibc/nptl/lowlevellock.h]
/* Acquire the futex-based mutex at MUTEX, blocking until it is available.

   Bit 31 of the word is the "locked" flag.  A thread that fails the fast
   path increments the word for as long as it is contending and decrements
   it again once it has taken the lock.  */
static inline void __generic_mutex_lock (int *mutex) {
  /* Must be signed: the test below checks the sign bit (bit 31).  With an
     unsigned type `v >= 0' is always true, so the loop would never reach
     the futex wait and would busy-spin instead.  */
  int v;

  /* Bit 31 was clear, we got the mutex.  (this is the fastpath).  */
  if (atomic_bit_test_set (mutex, 31) == 0) return;

  /* Register as a contender before entering the slow path.  */
  atomic_increment (mutex);
  while (1) {
      if (atomic_bit_test_set (mutex, 31) == 0) {
          /* Got it: we are no longer contending.  */
          atomic_decrement (mutex);
          return;
      }

      /* We have to wait now. First make sure the futex value we are
         monitoring is truly negative (i.e. locked). */
      v = *mutex;
      if (v >= 0) continue;

      /* Sleep only while the word still holds the value we just saw.  */
      lll_futex_wait (mutex, v, LLL_SHARED);
    }
}

Mutex unlock

[glibc/nptl/lowlevellock.h]
/* Release the futex-based mutex at MUTEX.  */
static inline void
__generic_mutex_unlock (int *mutex)
{
  /* Adding 0x80000000 leaves the word at 0 exactly when nobody else is
     interested in the mutex; in that case there is nothing more to do
     (fast path).  Any other result means threads are waiting, so wake
     one of them.  */
  if (!atomic_add_zero (mutex, 0x80000000))
    lll_futex_wake (mutex, 1, LLL_SHARED);
}

Example execution 1

THREAD1 Futex value (hex) Syscall
1 0x00000000
2 lock 0x80000000
5 unlock 0x00000000

Example execution 2

THREAD1 THREAD2 THREAD3 Futex value (hex) Syscall
1 0x00000000
2 lock 0x80000000
3 lock (1) 0x80000001 futex_wait
4 lock (1) 0x80000002 futex_wait
5 unlock 0x00000002 futex_wake
6 lock (2) 0x80000001
7 unlock 0x00000001 futex_wake
8 lock (2) 0x80000000
9 unlock 0x00000000

Pthread mutex

Example 3 - lots of atomics, lots of assembly

x86_64 recap

Atomic operations used by mutex

Low level lock - try lock

[glibc/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h]
/* Assembly text for the try-lock compare-and-exchange.

   Outside libc, or when UP is defined, it is a single cmpxchgl prefixed
   with LOCK_INSTR.  Otherwise the code tests __libc_multiple_threads at
   run time and uses the lock-prefixed cmpxchgl only when it is nonzero.

   Every continuation backslash must be the last character on its line:
   a backslash followed by a space is not a line continuation.  */
#if !IS_IN (libc) || defined UP
#define __lll_trylock_asm LOCK_INSTR "cmpxchgl %2, %1"
#else
#define __lll_trylock_asm "cmpl $0, __libc_multiple_threads(%%rip)\n\t" \
                          "je 0f\n\t"                                   \
                          "lock; cmpxchgl %2, %1\n\t"                   \
                          "jmp 1f\n\t"                                  \
                          "0:\tcmpxchgl %2, %1\n\t"                     \
                          "1:"
#endif

/* Try to take FUTEX without blocking.

   The "0" constraint preloads %eax with LLL_LOCK_INITIALIZER and operand
   %2 holds LLL_LOCK_INITIALIZER_LOCKED, so the cmpxchg stores the locked
   value only if FUTEX currently equals LLL_LOCK_INITIALIZER.  Evaluates
   to the final %eax: still LLL_LOCK_INITIALIZER when the exchange
   happened, otherwise the value found in FUTEX.  */
#define lll_trylock(futex) \
 ({ int ret;                                                            \
    __asm __volatile (__lll_trylock_asm                                 \
                      : "=a" (ret), "=m" (futex)                        \
                      : "r" (LLL_LOCK_INITIALIZER_LOCKED), "m" (futex), \
                        "0" (LLL_LOCK_INITIALIZER)                      \
                      : "memory");                                      \
   ret; })
store LLL_LOCK_INITIALIZER in %eax
cmpxchg LLL_LOCK_INITIALIZER_LOCKED, futex

Low level lock - lock

[glibc/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h]
/* Opening instructions of the lll_lock fast path: a cmpxchgl of operand
   %4 into operand %2 that jumps to label 24 (the end of the enclosing asm
   statement) when the exchange succeeds.

   Outside libc, or when UP is defined, the cmpxchgl carries LOCK_INSTR.
   Otherwise __libc_multiple_threads is tested at run time: the
   lock-prefixed form is used when it is nonzero, the plain form when it
   is zero.  A failed exchange continues at label 1, which the enclosing
   asm statement supplies.  */
#if !IS_IN (libc) || defined UP
#define __lll_lock_asm_start LOCK_INSTR "cmpxchgl %4, %2\n\t"              \
                              "jz 24f\n\t"
#else
#define __lll_lock_asm_start "cmpl $0, __libc_multiple_threads(%%rip)\n\t" \
                              "je 0f\n\t"                                  \
                              "lock; cmpxchgl %4, %2\n\t"                  \
                              "jnz 1f\n\t"                                 \
                              "jmp 24f\n"                                  \
                              "0:\tcmpxchgl %4, %2\n\t"                    \
                              "jz 24f\n\t"
#endif
cmpxchg LLL_LOCK_INITIALIZER_LOCKED, futex
On 0 (lock was unlocked) exit (fast path)
On 1 or 2 (locked, waiters) continue
/* Lock FUTEX (excerpt; only the branch for a compile-time LLL_PRIVATE
   is shown).

   The matching constraints preload %esi with 1 ("0" (1)) and %eax with 0
   ("3" (0)), so __lll_lock_asm_start tries to change FUTEX from 0 to 1.
   If that fails, the address of FUTEX is loaded into %rdi and
   __lll_lock_wait_private is called; %eax then still holds the value the
   cmpxchg found in FUTEX.
   NOTE(review): the 128-byte stack adjustment around the call presumably
   steps over the x86-64 red zone -- confirm.  */
#define lll_lock(futex, private) \
  (void)                                                                   \
    ({ int ignore1, ignore2, ignore3;                                      \
       if (__builtin_constant_p (private) && (private) == LLL_PRIVATE)     \
         __asm __volatile (__lll_lock_asm_start                            \
                           "1:\tlea %2, %%" RDI_LP "\n"                    \
                           "2:\tsub $128, %%" RSP_LP "\n"                  \
                           ".cfi_adjust_cfa_offset 128\n"                  \
                           "3:\tcallq __lll_lock_wait_private\n"           \
                           "4:\tadd $128, %%" RSP_LP "\n"                  \
                           ".cfi_adjust_cfa_offset -128\n"                 \
                           "24:"                                           \
                           : "=S" (ignore1), "=&D" (ignore2), "=m" (futex),\
                             "=a" (ignore3)                                \
                           : "0" (1), "m" (futex), "3" (0)                 \
                           : "cx", "r11", "cc", "memory");                 \
       else                                                                \
         // [...]
    })
syscall_futex( &futex, op, val, *timeout )
Argument registers: %rdi = &futex, %rsi = op, %rdx = val, %r10 = *timeout
[nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S]
/* Slow path of lll_lock.  On entry %rdi holds the futex address and %eax
   the value the failed cmpxchg found in the futex word.

   The routine keeps exchanging 2 into the word until the value it gets
   back is 0, which means the lock was free and is now ours.  Before each
   retry it issues the futex syscall with %edx == 2 as the value argument
   and a null timeout in %r10.  The first syscall is skipped unless the
   word was already 2 on entry.
   NOTE(review): LOAD_PRIVATE_FUTEX_WAIT presumably loads the private
   FUTEX_WAIT operation code into %esi -- confirm.  */
__lll_lock_wait_private:
    pushq   %r10
    pushq   %rdx
    xorq    %r10, %r10  /* No timeout.  */
    movl    $2, %edx
    LOAD_PRIVATE_FUTEX_WAIT (%esi)
    cmpl    %edx, %eax  /* NB:   %edx == 2 */
    jne 2f

1:  LIBC_PROBE (lll_lock_wait_private, 1, %rdi)
    movl    $SYS_futex, %eax
    syscall

2:  movl    %edx, %eax
    xchgl   %eax, (%rdi)    /* NB:   lock is implied */

    testl   %eax, %eax
    jnz 1b

    popq    %rdx
    popq    %r10
    retq
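syscall_futex( &futex, op, val, NULL )
Argument registers: %rdi = &futex, %rsi = op, %rdx = val, %r10 = NULL
syscall_futex( &futex, op, LLL_LOCK_INITIALIZER_WAITERS (2), NULL )
Argument registers: %rdi = &futex, %rsi = op, %rdx = 2, %r10 = NULL
syscall_futex( &futex, FUTEX_WAIT, LLL_LOCK_INITIALIZER_WAITERS (2), NULL )
Argument registers: %rdi = &futex, %rsi = FUTEX_WAIT, %rdx = 2, %r10 = NULL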
if (futex == LLL_LOCK_INITIALIZER_WAITERS) futex_wait
tmp = futex; futex = LLL_LOCK_INITIALIZER_WAITERS;
if (tmp == LLL_LOCK_INITIALIZER) return; else jump 1;

Low level lock - unlock

[glibc/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h]
/* Opening instructions of the lll_unlock fast path: decrement operand %0
   and jump to label 24 (the end of the enclosing asm statement) when the
   result is zero.

   Outside libc, or when UP is defined, the decl carries LOCK_INSTR.
   Otherwise __libc_multiple_threads is tested at run time: the
   lock-prefixed form is used when it is nonzero, the plain form when it
   is zero.  A nonzero result continues at label 1, which the enclosing
   asm statement supplies.  */
#if !IS_IN (libc) || defined UP
#define __lll_unlock_asm_start LOCK_INSTR "decl %0\n\t"                   \
                                "je 24f\n\t"
#else
#define __lll_unlock_asm_start \
                            "cmpl $0, __libc_multiple_threads(%%rip)\n\t" \
                            "je 0f\n\t"                                   \
                            "lock; decl %0\n\t"                           \
                            "jne 1f\n\t"                                  \
                            "jmp 24f\n\t"                                 \
                            "0:\tdecl %0\n\t"                             \
                            "je 24f\n\t"
#endif
futex--;
if (futex == 0) exit; else continue;
/* Unlock FUTEX (excerpt; only the branch for a compile-time LLL_PRIVATE
   is shown).

   __lll_unlock_asm_start decrements FUTEX and finishes when the result is
   zero.  Otherwise the address of FUTEX is loaded into %rdi and
   __lll_unlock_wake_private is called.

   Every continuation backslash must be the last character on its line:
   a backslash followed by a space is not a line continuation.  */
#define lll_unlock(futex, private) \
  (void)                                                                  \
    ({ int ignore;                                                        \
       if (__builtin_constant_p (private) && (private) == LLL_PRIVATE)    \
         __asm __volatile (__lll_unlock_asm_start                         \
                           "1:\tlea %0, %%" RDI_LP "\n"                   \
                           "2:\tsub $128, %%" RSP_LP "\n"                 \
                           ".cfi_adjust_cfa_offset 128\n"                 \
                           "3:\tcallq __lll_unlock_wake_private\n"        \
                           "4:\tadd $128, %%" RSP_LP "\n"                 \
                           ".cfi_adjust_cfa_offset -128\n"                \
                           "24:"                                          \
                           : "=m" (futex), "=&D" (ignore)                 \
                           : "m" (futex)                                  \
                           : "ax", "cx", "r11", "cc", "memory");          \
       else                                                               \
         // [...]
    })
syscall_futex( &futex, op, val )
Argument registers: %rdi = &futex, %rsi = op, %rdx = val
[nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S]
/* Slow path of lll_unlock.  On entry %rdi holds the futex address.

   Stores 0 into the futex word and then issues the futex syscall with 1
   in %edx, waking at most one waiter.  %rsi and %rdx are saved and
   restored around the call because they carry syscall arguments.
   NOTE(review): LOAD_PRIVATE_FUTEX_WAKE presumably loads the private
   FUTEX_WAKE operation code into %esi -- confirm.  */
__lll_unlock_wake_private:
    pushq   %rsi
    pushq   %rdx

    movl    $0, (%rdi)
    LOAD_PRIVATE_FUTEX_WAKE (%esi)
    movl    $1, %edx    /* Wake one thread.  */
    movl    $SYS_futex, %eax
    syscall

    popq    %rdx
    popq    %rsi
    retq
futex = 0
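syscall_futex( &futex, FUTEX_WAKE, val )
Argument registers: %rdi = &futex, %rsi = FUTEX_WAKE, %rdx = val
syscall_futex( &futex, FUTEX_WAKE, 1 )
Argument registers: %rdi = &futex, %rsi = FUTEX_WAKE, %rdx = 1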

      

pthread_mutex_t struct

[nptl/sysdeps/unix/sysv/linux/x86/bits/pthreadtypes.h]
/* The mutex object.  The union overlays the real fields (__data) with a
   character array of __SIZEOF_PTHREAD_MUTEX_T bytes and a long int.  */
typedef union
{
  struct __pthread_mutex_s
  {
    /* The word handed to lll_lock / lll_trylock / lll_unlock.  */
    int __lock;
    /* Nesting depth, maintained for recursive mutexes.  */
    unsigned int __count;
    /* Thread id of the current holder; reset to 0 on unlock.  */
    int __owner;
    /* Incremented on lock and, when requested, decremented on unlock.  */
    unsigned int __nusers;
    /* NOTE(review): presumably the value PTHREAD_MUTEX_TYPE reads to pick
       the mutex kind -- confirm.  */
    int __kind;
    /* Running estimate used to bound the spin count of adaptive mutexes.  */
    int __spins;
    /* NOTE(review): not used by the code shown here -- confirm purpose.  */
    __pthread_list_t __list;
  } __data;
  char __size[__SIZEOF_PTHREAD_MUTEX_T];
  long int __align;
} pthread_mutex_t;

The lock function

Some macros used by lock function

[glibc/nptl/pthread_mutex_lock.c]
/* Default bindings from the mutex object to the low-level lock
   primitives; each one operates on the mutex's __data.__lock word.  They
   are defined only when LLL_MUTEX_LOCK has not been defined already, so
   an includer can supply its own versions.  */
#ifndef LLL_MUTEX_LOCK
# define LLL_MUTEX_LOCK(mutex) \
  lll_lock ((mutex)->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex))
# define LLL_MUTEX_TRYLOCK(mutex) \
  lll_trylock ((mutex)->__data.__lock)
# define LLL_ROBUST_MUTEX_LOCK(mutex, id) \
  lll_robust_lock ((mutex)->__data.__lock, id, \
                   PTHREAD_ROBUST_MUTEX_PSHARED (mutex))
#endif
/* Lock MUTEX.

   A mutex whose type has bits set outside PTHREAD_MUTEX_KIND_MASK_NP is
   handed to __pthread_mutex_lock_full.  The remaining kinds are handled
   here: timed (normal), recursive, adaptive and error-checking.

   Returns 0 on success, EAGAIN when the recursion counter of a recursive
   mutex would overflow, and EDEADLK when the caller already owns an
   error-checking mutex.  */
int
__pthread_mutex_lock (mutex)
     pthread_mutex_t *mutex;
{
  assert (sizeof (mutex->__size) >= sizeof (mutex->__data));

  unsigned int type = PTHREAD_MUTEX_TYPE (mutex);

  LIBC_PROBE (mutex_entry, 1, mutex);

  if (__builtin_expect (type & ~PTHREAD_MUTEX_KIND_MASK_NP, 0))
    return __pthread_mutex_lock_full (mutex);

  /* Thread id of the caller; compared with and stored into __owner.  */
  pid_t id = THREAD_GETMEM (THREAD_SELF, tid);

  if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP)
      == PTHREAD_MUTEX_TIMED_NP)
    {
    simple:
      /* Normal mutex.  */
      LLL_MUTEX_LOCK (mutex);
      assert (mutex->__data.__owner == 0);
    }

  else if (__builtin_expect (type == PTHREAD_MUTEX_RECURSIVE_NP, 1))
    {
      /* Recursive mutex.  */

      /* Check whether we already hold the mutex.  */
      if (mutex->__data.__owner == id)
        {
          /* Just bump the counter.  */
          if (__builtin_expect (mutex->__data.__count + 1 == 0, 0))
            /* Overflow of the counter.  */
            return EAGAIN;
          ++mutex->__data.__count;

          return 0;
        }

      /* We have to get the mutex.  */
      LLL_MUTEX_LOCK (mutex);

      assert (mutex->__data.__owner == 0);
      mutex->__data.__count = 1;
    }

  else if (__builtin_expect (type == PTHREAD_MUTEX_ADAPTIVE_NP, 1))
    {
      /* Without SMP the adaptive kind behaves like a normal mutex.  */
      if (! __is_smp) goto simple;

      if (LLL_MUTEX_TRYLOCK (mutex) != 0)
        {
          /* Retry the try-lock up to max_cnt times before falling back
             to the blocking lock.  */
          int cnt = 0;
          int max_cnt = MIN(MAX_ADAPTIVE_COUNT, mutex->__data.__spins*2+10);
          do
            {
              if (cnt++ >= max_cnt)
                {
                  LLL_MUTEX_LOCK (mutex);
                  break;
                }

              BUSY_WAIT_NOP;
            }
          while (LLL_MUTEX_TRYLOCK (mutex) != 0);

          /* Move the stored estimate one eighth of the way towards the
             number of attempts this acquisition needed.  */
          mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
        }
      assert (mutex->__data.__owner == 0);
    }

  else
    {
      assert (type == PTHREAD_MUTEX_ERRORCHECK_NP);
      /* Check whether we already hold the mutex.  */
      if (__builtin_expect (mutex->__data.__owner == id, 0))
        return EDEADLK;
      goto simple;
    }

  /* Record the ownership.  */
  mutex->__data.__owner = id;
#ifndef NO_INCR
  ++mutex->__data.__nusers;
#endif

  LIBC_PROBE (mutex_acquired, 1, mutex);

  return 0;
}

The unlock function

[glibc/nptl/pthread_mutex_unlock.c]
/* Unlock MUTEX; when DECR is nonzero also decrement its __nusers count.

   A mutex whose type has bits set outside PTHREAD_MUTEX_KIND_MASK_NP is
   handed to __pthread_mutex_unlock_full.  The remaining kinds all end in
   the shared `normal' path, which clears __owner and releases the
   low-level lock.

   Returns 0 on success.  Returns EPERM when the caller does not own a
   recursive mutex, or when an error-checking mutex is not owned by the
   caller or is not locked.  Returns 0 without releasing a recursive
   mutex while its nesting count is still nonzero.  */
int
internal_function attribute_hidden
__pthread_mutex_unlock_usercnt (mutex, decr)
     pthread_mutex_t *mutex;
     int decr;
{
  int type = PTHREAD_MUTEX_TYPE (mutex);
  if (__builtin_expect (type & ~PTHREAD_MUTEX_KIND_MASK_NP, 0))
    return __pthread_mutex_unlock_full (mutex, decr);

  if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP)
      == PTHREAD_MUTEX_TIMED_NP)
    {
      /* Always reset the owner field.  */
    normal:
      mutex->__data.__owner = 0;
      if (decr)
    /* One less user.  */
    --mutex->__data.__nusers;

      /* Unlock.  */
      lll_unlock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex));

      LIBC_PROBE (mutex_release, 1, mutex);

      return 0;
    }
  else if (__builtin_expect (type == PTHREAD_MUTEX_RECURSIVE_NP, 1))
    {
      /* Recursive mutex.  */
      if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
    return EPERM;

      if (--mutex->__data.__count != 0)
    /* We still hold the mutex.  */
    return 0;
      goto normal;
    }
  else if (__builtin_expect (type == PTHREAD_MUTEX_ADAPTIVE_NP, 1))
    goto normal;
  else
    {
      /* Error checking mutex.  */
      assert (type == PTHREAD_MUTEX_ERRORCHECK_NP);
      if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid)
      || ! lll_islocked (mutex->__data.__lock))
    return EPERM;
      goto normal;
    }
}

Conclusions

Summary

References

Thank you

Use the spacebar or arrow keys to navigate