Using Black Magic to Create a Fast Ring Buffer

Yesterday I took a look at the Wikipedia page on circular buffer and was intrigued by a supposed optimization technique that I was not familiar with before: 





. (, .) ; , , . , - - .





, «» (wrap around). :





void put(queue_t *q, uint8_t *data, size_t size) {
    for(size_t i = 0; i < size; i++){
        q->buffer[(q->tail + i) % q->buffer_size] = data[i];
    }
    q->tail = (q->tail + size) % q->buffer_size;
}
      
      



, , ( ) , , . , , , , - , . , , , .





. , .





() - , . «» , , , .





( ): ( head



- ) ( tail



- ). , tail



, head



. :





typedef struct {
    uint8_t *buffer;
    size_t   buffer_size;
    size_t   head;
    size_t   tail;
    size_t   bytes_avail;
} queue_t;
      
      



, (get



) (put



), :





bool put(queue_t *q, uint8_t *data, size_t size) {
    if(q->buffer_size - q->bytes_avail < size){
        return false;
    }
    for(size_t i = 0; i < size; i++){
        q->buffer[(q->tail + i) % q->buffer_size] = data[i];
    }
    q->tail = (q->tail + size) % q->buffer_size;
    q->bytes_avail += size;
    return true;
}

bool get(queue_t *q, uint8_t *data, size_t size) {
    if(q->bytes_avail < size){
        return false;
    }
    for(size_t i = 0; i < size; i++){
        data[i] = q->buffer[(q->head + i) % q->buffer_size];
    }
    q->head = (q->head + size) % q->buffer_size;
    q->bytes_avail -= size;
    return true;
}
      
      



, put



, , , . ( ).





, , . memcpy



, , , .





static inline off_t min(off_t a, off_t b) {
    return a < b ? a : b;
}
bool put(queue_t *q, uint8_t *data, size_t size) {
    if(q->buffer_size - q->bytes_avail < size){
        return false;
    }
const size_t part1 = min(size, q-&gt;buffer_size - q-&gt;tail);
const size_t part2 = size - part1;

memcpy(q-&gt;buffer + q-&gt;tail, data,         part1);
memcpy(q-&gt;buffer,           data + part1, part2);

q-&gt;tail = (q-&gt;tail + size) % q-&gt;buffer_size;
q-&gt;bytes_avail += size;
return true;

}
bool get(queue_t *q, uint8_t *data, size_t size) {
    if(q->bytes_avail < size){
        return false;
    }
const size_t part1 = min(size, q-&gt;buffer_size - q-&gt;head);
const size_t part2 = size - part1;

memcpy(data,         q-&gt;buffer + q-&gt;head, part1);
memcpy(data + part1, q-&gt;buffer,           part2);

q-&gt;head = (q-&gt;head + size) % q-&gt;buffer_size;
q-&gt;bytes_avail -= size;
return true;

}
      
      



:





int main() {
    queue_t queue;
queue.buffer      = malloc(128);
queue.buffer_size = 128;
queue.head        = 0;
queue.tail        = 0;
queue.bytes_avail = 0;

put(&amp;q, "hello ", 6);
put(&amp;q, "world\n", 7);

char s[13];
get(&amp;q, (uint8_t *) s, 13);

printf(s); // prints "hello world"

}
      
      



. ?





. , . , , , . , , , , .





, . - () , , 4 . () , , , .





:













0





null





1





25





2





12





3





null





4





56













B , B :













0





11





1





null





2





92





3





21





4





null













, , . , . , .





, , , .





, glibc ,

, Linux.













int getpagesize()







. ( , - )





int ftruncate(int fd, off_t length)







length, fd



.





void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)







, fd



, . (flags



).





, glibc



( ) :





System Call









Description









int memfd_create(const char *name, unsigned int flags)







  (anonymous file), .





! glibc



, , .





int memfd_create(const char *name, unsigned int flags) {
    return syscall(__NR_memfd_create, name, flags);
}
      
      



, , . . , ( ).





, . , , , . API .





typedef struct {
    uint8_t *buffer;
    size_t   buffer_size;
    int      fd;
    size_t   head;
    size_t   tail;
} queue_t;
bool init(queue *q, size_t size){
//   ,     
if(size % getpagesize() != 0){
    return 0;
}

//      
q-&gt;fd = memfd_create("queue_buffer", 0);
ftruncate(q-&gt;fd, size);

//   mmap  ,        
q-&gt;buffer = mmap(NULL, 2 * size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

//     
mmap(q-&gt;buffer, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, q-&gt;fd, 0);

//        
mmap(q-&gt;buffer + size, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, q-&gt;fd, 0);

//    
q-&gt;head = 0;
q-&gt;tail = 0;

}

      
      



, ? , memfd_create



, , . . , mmap. .





mmap size



. (, size ):





















q->buffer







q->fd



, 0





q->buffer + getpagesize()







q->fd



, getpagesize()















q->buffer + size







q->fd



, 0





q->buffer + size + getpagesize()







q->fd



, getpagesize()















, , . , :





bool put(queue_t *q, uint8_t *data, size_t size) {
    if(q->buffer_size - (q->tail - q->head) < size){
        return false;
    }
    memcpy(&q->buffer[q->tail], data, size);
    q->tail += size;
    return true;
}
bool get(queue_t *q, uint8_t *data, size_t size) {
    if(q->tail - q->head < size){
        return false;
    }
    memcpy(data, &q->buffer[q->head], size);
    q->head += size;
    if(q->head > q->buffer_size) {
       q->head -= q->buffer_size;
       q->tail -= q->buffer_size;
    }
    return true;
}

      
      



tail



- «» . - . , , , .





, memcpy . , head



. , ; , . API .





?

, , , . -, :





#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>
#define BUFFER_SIZE (1024)
#define MESSAGE_SIZE (32)
#define NUMBER_RUNS (1000000)
static inline double microtime() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return 1e6 * tv.tv_sec + tv.tv_usec;
}
static inline off_t min(off_t a, off_t b) {
    return a < b ? a : b;
}
int main() {
    uint8_t message[MESSAGE_SIZE];
    uint8_t buffer[2 * BUFFER_SIZE];
    size_t offset;
double slow_start = microtime();
offset = 0;
for(int i = 0; i &lt; NUMBER_RUNS; i++){
    const size_t part1 = min(MESSAGE_SIZE, BUFFER_SIZE - offset);
    const size_t part2 = MESSAGE_SIZE - part1;
    memcpy(buffer + offset, message, part1);
    memcpy(buffer, message + part1, part2);
    offset = (offset + MESSAGE_SIZE) % BUFFER_SIZE;
}
double slow_stop = microtime();

double fast_start = microtime();
offset = 0;
for(int i = 0; i &lt; NUMBER_RUNS; i++){
    memcpy(&amp;buffer[offset], message, MESSAGE_SIZE);
    offset = (offset + MESSAGE_SIZE) % BUFFER_SIZE;
}
double fast_stop = microtime();

printf("slow: %f microseconds per write\n", (slow_stop - slow_start) / NUMBER_RUNS);
printf("fast: %f microseconds per write\n", (fast_stop - fast_start) / NUMBER_RUNS);

return 0;

}

      
      



i5-6400 , :





slow: 0.012196 microseconds per write
fast: 0.004024 microseconds per write
      
      



, memcpy , memcpy



. , 0,104943 . - ( , TLB), , .





?





, , . , .





, GitHub. , .






- « » - « C UNIX». C UNIX- BSD. . « » UNIX . !





- « »





- « C UNIX»








All Articles