Your browser doesn't support the features required by impress.js, so you are presented with a simplified version of this presentation.

For the best experience please use the latest Chrome, Safari or Firefox browser.

Low level

concurrency

atomicity

atomic operations

ordering

barriers / fences

visibility

synchronizes-with happens-before

Single core

  • single processor/core
  • shared memory
  • multi-thread OS

I. Atomicity

Uniprocessor system

Exchange and add

shared_ptr::~shared_ptr()
{
  if (count-- == 1)
  {
    delete ptr;
  }
}
  xadd val,(ptr)

Compare and exchange

void stack::push(const T& data) {
        node* new_node = new node(data);
 
        new_node->next = head;
        <context switch>
        head = new_node; 
}
bool __atomic_compare_exchange (type *ptr, type *pexp, type vnew);

void stack::push(const T& data) {
        node* new_node = new node(data);
 
        new_node->next = head;

        while(!__atomic_compare_exchange(
                &head,                     // ptr to value
                &new_node->next,           // ptr to expected value
                new_node)                  // new value
              ) ;
}

Compare and exchange - x86

  mov          (pexp), %eax    ; store expected in %eax
  cmpxchg      vnew, (ptr)     ; try store new in ptr
  sete         %cl             ;
  test         %cl,%cl         ; check cmpxchg status
  jne          _success        ; exit on success
  mov          %eax, (pexp)    ; move updated current value to expected
_success:

II. Ordering

Compile time

bool isReady = false;
Job* job = nullptr;

void prepareJob() {
  job = new Job();
  isReady = true;
}

void waitForJob() {
  while (!isReady) std::this_thread::yield();

  job->process();
}

compiled with -O0

0000000000400676 <_Z10prepareJobv>:
  sub    $0x8,%rsp
  mov    $0x1,%edi
  callq  400570 <_Znwm@plt>
  movb   $0x1,0x2009d5(%rip)        # 601060 <isReady>
  mov    %rax,0x2009c6(%rip)        # 601058 <job>
  add    $0x8,%rsp
  retq

compiled with -O2 (-fschedule-insns2)

0000000000400676 <_Z10prepareJobv>:
  sub    $0x8,%rsp
  mov    $0x1,%edi
  callq  400570 <_Znwm@plt>
  mov    %rax,0x2009cd(%rip)        # 601058 <job>
  movb   $0x1,0x2009ce(%rip)        # 601060 <isReady>
  add    $0x8,%rsp
  retq

COMPILE
time reordering

  • Context switch will result in unexpected behaviour
  • // ...
    
    void prepareJob() {
      job = new Job();
      asm volatile("" ::: "memory"); 
      isReady = true;
    }
    
    // ...
    

    N cores

    + cache

    • multiple cores
    • cache coherent SMP

    MESI

    snooping protocol

    • Slow stores
    • Slow invalidates

    N cores

    + store buffers

    • keeps stores
    • loads read from it
    • FIFO or non-FIFO
    • makes stores faster

    III. Atomicity

    Cache coherent SMP system

    Exchange and add

    shared_ptr::~shared_ptr()
    {
      if (count-- == 1)
      {
        delete ptr;
      }
    }
    
    type __atomic_fetch_add (type *ptr, type val, int memorder)
    
    shared_ptr::~shared_ptr() {
      if (__atomic_fetch_add(&count, -1, __ATOMIC_SEQ_CST) == 1) {
        delete ptr;
      }
    }
    
      lock xadd val,(ptr)
    
      dmb     sy
    _loop:
      ldrex   r0, [ptr]
      add     r0, val
      strex   r1, r0, [ptr]
      cmp     r1, #0
      bne.n   _loop
      dmb     sy
    
          
       sync
    _loop:
       lwarx   r10,0,ptr
       add     r10,r10,val
       stwcx.  r10,0,ptr
       mcrf    cr7,cr0
       bne     cr7,_loop
       isync
    
          

    Compare and exchange

    void stack::push(const T& data) {
            node* new_node = new node(data);
     
            new_node->next = head;
    
            head = new_node; 
    }
    
    bool __atomic_compare_exchange_n (type *ptr, type *pexp, type vnew,
                                      bool weak, int success_memorder,
                                      int failure_memorder)
    
    void stack::push(const T& data) {
            node* new_node = new node(data);
     
            new_node->next = head;
    
            while(!__atomic_compare_exchange_n(
                    &head,                     // ptr to value
                    &new_node->next,           // ptr to expected value
                    new_node,                  // new value
                    false,                     // weak vs strong
                    __ATOMIC_SEQ_CST,          // success memory order
                    __ATOMIC_SEQ_CST)          // failure memory order
                  ) ;
    }
    

    Compare and exchange - x86

      mov          (pexp), %eax    ; store expected in %eax
      lock cmpxchg vnew, (ptr)     ; try store new in ptr
      sete         %cl             ;
      test         %cl,%cl         ; check cmpxchg status
      jne          _success        ; exit on success
      mov          %eax, (pexp)    ; move updated current value to expected
    _success:
    

    Compare and exchange - ARM

      ldr     vexp, [pexp, #0]   ; load the expected value
      dmb     sy                 ; memory barrier
    _loop:
      ldrex   rval, [ptr]        ; load ptr to rval and make reservation
      cmp     rval, vexp         ; check if loaded value equals expected
      bne.n   _exit              ; exit otherwise
      strex   r0, vnew, [ptr]    ; try to store vnew to ptr
      cmp.w   r0, #0             ; check status
      bne.n   _loop              ; loop if lost reservation
    _exit:
      dmb     sy                 ; memory barrier
      ite     ne                 ;
      movne   r1, #0             ; store the result
      moveq   r1, #1             ;
      cmp     r1, #0             ; check status
      bne.n   _success           ; exit on success
      str     rval, [pexp, #0]   ; move updated current value to expected
    _success:
    

    strong

    weak

    Compare and exchange - PowerPC

      sync                     ; memory barrier
    _loop:
      lwarx   rval,0,ptr       ; load ptr to rval and make reservation
      cmpw    cr7,rval,vexp    ; check if loaded value equals expected
      bne     cr7, _exit       ; exit otherwise
      stwcx.  vnew,0,ptr       ; try to store vnew to ptr
      mcrf    cr7,cr0          ; get the result from conditional register
      bne     cr7, _loop       ; loop if lost reservation
    _exit:
      isync                    ; memory barrier
      mfcr    r8               ; get the result from conditional register
      rlwinm  r8,r8,31,31,31   ;
      cmpwi   cr7,r8,0         ; check status
      bne     cr7, _success    ; exit on success
      stw     rval,0(pexp)     ; move updated current value to expected
    _success:
    

    IV. Ordering

    Cache coherent SMP system

    bool isReady = false;
    Job* job = nullptr;
    
    void prepareJob() {
      job = new Job();
      isReady = true;
    }
    
    void waitForJob() {
      while (!isReady) std::this_thread::yield();
    
      job->process();
    }
    

    STORES
    reorder after
    STORES

    bool isReady = false;
    Job* job = nullptr;
    
    void prepareJob() {
      job = new Job();
      smp_wmb();
      isReady = true;
    }
    
    void waitForJob() {
      while (!isReady) std::this_thread::yield();
    
      job->process();
    }
    

    int done1 = 0, done2 = 0,  firstOne = -1;
    
    int main(int argc, char* argv[]) {
        std::thread t1([&] {
            done1 = 1;
            if (done2) firstOne = 2;
        });
    
        std::thread t2([&] {
            done2 = 1;
            if (done1) firstOne = 1;
        });
    
        t1.join();
        t2.join();
        assert(firstOne == 1 || firstOne == 2);
    }
    

    STORES
    reorder after
    LOADS

    N cores

    + invalidate queue

    • keeps invalidate messages on busy cache
    • makes invalidate faster
    • makes stores faster
    • makes scaling to N cores more effective
    bool isReady = false;
    Job* job = nullptr;
    
    void prepareJob() {
      job = new Job();
      smp_wmb();
      isReady = true;
    }
    
    void waitForJob() {
      while (!isReady) std::this_thread::yield();
      job->process();
    }
    

    LOADS
    reorder after
    LOADS

    bool isReady = false;
    Job* job = nullptr;
    
    void prepareJob() {
      job = new Job();
      smp_wmb();
      isReady = true;
    }
    
    void waitForJob() {
      while (!isReady) std::this_thread::yield();
      smp_rmb();
      job->process();
    }
    

    N cores

    + load buffer

    • keeps unfinished loads stalled due to a cache miss
    • makes loads faster
    
    Job* job = firstJob;
    bool jobConsumed;
    
    void consumer() {
      Job *oldJob = job;
      jobConsumed = 1;
    
      oldJob->process();
    }
    
    void producer() {
      if (jobConsumed) {
        job = nextJob;
      }
    }
    
    

    LOADS
    reorder after
    STORES

    Source of reordering

    LOADS
    reordered after
    LOADS
    N Y Y
    LOADS
    reordered after
    STORES
    N Y Y
    STORES
    reordered after
    STORES
    N Y Y
    STORES
    reordered after
    LOADS
    Y Y Y
    Memory model STRONG WEAK WEAK

    Barrier types


    Kernel barriers

    Barrier instructions

    Intel

    For write-back memory only stores need to be ordered.

    ARM

    POWERPC

    Memory model aware atomic

    SEQ_CST

    RELAXED

    Memory model

    Visibility

    Barrier pairing

    bool isReady = false;
    Job* job = nullptr;
    
    void prepareJob() {
      job = new Job();
      smp_wmb();
      isReady = true;
    }
    
    void waitForJob() {
      while (!isReady) std::this_thread::yield();
      smp_rmb();
      job->process();
    }
    

    synchronizes-with / happens-before

    std::atomic<int> isReady(0);
    Job* job = nullptr;
    
    void prepareJob() {
                          | Job* tmp = (Job*) malloc(sizeof(Job));
                          | tmp->field1 = 1;
      job = new Job();  <-| tmp->field2 = 2;
                          | job = tmp;
    
      isReady.store(1, std::memory_order_release);
    
      // do some other stuff
    }
    
    void waitForJob() {
      while (!  isReady.load(std::memory_order_acquire)) {};
      job->process();
    }
    

    #1

    #2

    Why do I need this?

    Thank you

    Use the spacebar or arrow keys to navigate