cregit-Linux how code gets into the kernel

Release 4.15 kernel/trace/ring_buffer.c

Directory: kernel/trace
/*
 * Generic ring buffer
 *
 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
 */
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/sched/clock.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>	/* for self test */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/cpu.h>

#include <asm/local.h>

static void update_pages_handler(struct work_struct *work);

/*
 * The ring buffer header is special. We must keep its description
 * up to date manually.
 */

/*
 * ring_buffer_print_entry_header - describe the compressed event header
 * @s: trace_seq that receives the text
 *
 * Writes the bit layout of an event header followed by the numeric
 * values of the special type codes.
 *
 * Returns 0 if @s has overflowed once everything was written,
 * 1 otherwise.
 */
int ring_buffer_print_entry_header(struct trace_seq *s)
{
	/* Fixed bit-field layout of the header. */
	trace_seq_puts(s, "# compressed entry header\n");
	trace_seq_puts(s, "\ttype_len : 5 bits\n");
	trace_seq_puts(s, "\ttime_delta : 27 bits\n");
	trace_seq_puts(s, "\tarray : 32 bits\n");
	trace_seq_putc(s, '\n');

	/* Values of type_len that have a special meaning. */
	trace_seq_printf(s, "\tpadding : type == %d\n",
			 RINGBUF_TYPE_PADDING);
	trace_seq_printf(s, "\ttime_extend : type == %d\n",
			 RINGBUF_TYPE_TIME_EXTEND);
	trace_seq_printf(s, "\tdata max type_len == %d\n",
			 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);

	if (trace_seq_has_overflowed(s))
		return 0;

	return 1;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt6987.34%250.00%
Jovi Zhangwei67.59%125.00%
Lai Jiangshan45.06%125.00%
Total79100.00%4100.00%

/*
 * The ring buffer is made up of a list of pages. A separate list of pages is
 * allocated for each CPU. A writer may only write to a buffer that is
 * associated with the CPU it is currently executing on.  A reader may read
 * from any per cpu buffer.
 *
 * The reader is special. For each per cpu buffer, the reader has its own
 * reader page. When a reader has read the entire reader page, this reader
 * page is swapped with another page in the ring buffer.
 *
 * Now, as long as the writer is off the reader page, the reader can do
 * whatever it wants with that page. The writer will never write to that page
 * again (as long as it is out of the ring buffer).
 *
 * Here's some silly ASCII art.
 *
 *   +------+
 *   |reader|          RING BUFFER
 *   |page  |
 *   +------+        +---+   +---+   +---+
 *                   |   |-->|   |-->|   |
 *                   +---+   +---+   +---+
 *                     ^               |
 *                     |               |
 *                     +---------------+
 *
 *
 *   +------+
 *   |reader|          RING BUFFER
 *   |page  |------------------v
 *   +------+        +---+   +---+   +---+
 *                   |   |-->|   |-->|   |
 *                   +---+   +---+   +---+
 *                     ^               |
 *                     |               |
 *                     +---------------+
 *
 *
 *   +------+
 *   |reader|          RING BUFFER
 *   |page  |------------------v
 *   +------+        +---+   +---+   +---+
 *      ^            |   |-->|   |-->|   |
 *      |            +---+   +---+   +---+
 *      |                              |
 *      |                              |
 *      +------------------------------+
 *
 *
 *   +------+
 *   |buffer|          RING BUFFER
 *   |page  |------------------v
 *   +------+        +---+   +---+   +---+
 *      ^            |   |   |   |-->|   |
 *      |   New      +---+   +---+   +---+
 *      |  Reader------^               |
 *      |   page                       |
 *      +------------------------------+
 *
 *
 * After we make this swap, the reader can hand this page off to the splice
 * code and be done with it. It can even allocate a new page if it needs to
 * and swap that into the ring buffer.
 *
 * We will be using cmpxchg soon to make all this lockless.
 *
 */

/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF		(1 << 20)

/* Bytes of struct buffer_data_page that precede its data[] member. */
#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)

/* Bytes of struct ring_buffer_event that precede its array[] member. */
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT		4U
/* Largest length that can be expressed as type_len * RB_ALIGNMENT. */
#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE	8U	/* two 32bit words */

#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
# define RB_FORCE_8BYTE_ALIGNMENT	0
# define RB_ARCH_ALIGNMENT		RB_ALIGNMENT
#else
# define RB_FORCE_8BYTE_ALIGNMENT	1
# define RB_ARCH_ALIGNMENT		8U
#endif

#define RB_ALIGN_DATA		__aligned(RB_ARCH_ALIGNMENT)

/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX

/* Fixed sizes, in bytes, of the two time-keeping event types. */
enum {
	RB_LEN_TIME_EXTEND = 8,
	RB_LEN_TIME_STAMP = 16,
};

/* Step over a time extend event to the event that follows it. */
#define skip_time_extend(event) \
	((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
/*
 * rb_null_event - test for a null event
 *
 * A null event is a padding event whose time_delta is zero.
 * Returns 1 for a null event, 0 otherwise.
 */
static inline int rb_null_event(struct ring_buffer_event *event)
{
	if (event->type_len != RINGBUF_TYPE_PADDING)
		return 0;

	return !event->time_delta;
}

Contributors

PersonTokensPropCommitsCommitProp
Tom Zanussi2291.67%133.33%
Steven Rostedt14.17%133.33%
Lai Jiangshan14.17%133.33%
Total24100.00%3100.00%


/*
 * rb_event_set_padding - turn @event into a null (padding) event
 *
 * The result satisfies rb_null_event(): the type is padding and the
 * time delta is zero.
 */
static void rb_event_set_padding(struct ring_buffer_event *event)
{
	event->type_len = RINGBUF_TYPE_PADDING;
	/* a zero time_delta is what marks the padding as "null" */
	event->time_delta = 0;
}

Contributors

PersonTokensPropCommitsCommitProp
Tom Zanussi2291.67%133.33%
Lai Jiangshan14.17%133.33%
Steven Rostedt14.17%133.33%
Total24100.00%3100.00%


/*
 * rb_event_data_length - length of a data event, header included
 *
 * A non-zero type_len encodes the payload size in units of
 * RB_ALIGNMENT; when type_len is zero the size is stored in array[0].
 */
static unsigned rb_event_data_length(struct ring_buffer_event *event)
{
	/* Size did not fit in type_len: it was stored in array[0]. */
	if (!event->type_len)
		return event->array[0] + RB_EVNT_HDR_SIZE;

	return event->type_len * RB_ALIGNMENT + RB_EVNT_HDR_SIZE;
}

Contributors

PersonTokensPropCommitsCommitProp
Tom Zanussi2967.44%133.33%
Steven Rostedt1227.91%133.33%
Lai Jiangshan24.65%133.33%
Total43100.00%3100.00%

/*
 * Return the length of the given event. For a time extend event
 * this is the length of the time extend itself.
 *
 * A null padding event has no defined length; -1 (converted to
 * unsigned) is returned for it.
 */
static inline unsigned
rb_event_length(struct ring_buffer_event *event)
{
	unsigned len = 0;

	switch (event->type_len) {
	case RINGBUF_TYPE_PADDING:
		if (rb_null_event(event))
			len = -1;	/* undefined */
		else
			len = event->array[0] + RB_EVNT_HDR_SIZE;
		break;

	case RINGBUF_TYPE_TIME_EXTEND:
		len = RB_LEN_TIME_EXTEND;
		break;

	case RINGBUF_TYPE_TIME_STAMP:
		len = RB_LEN_TIME_STAMP;
		break;

	case RINGBUF_TYPE_DATA:
		len = rb_event_data_length(event);
		break;

	default:
		BUG();
	}

	/* after BUG() this is not expected to be reached; len is still 0 */
	return len;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt4256.76%250.00%
Tom Zanussi2432.43%125.00%
Lai Jiangshan810.81%125.00%
Total74100.00%4100.00%

/*
 * Return the combined length of a time extend and the event that
 * follows it, or just the event length for all other events.
 */
static inline unsigned
rb_event_ts_length(struct ring_buffer_event *event)
{
	if (event->type_len != RINGBUF_TYPE_TIME_EXTEND)
		return rb_event_length(event);

	/* time extends include the data event after it */
	return RB_LEN_TIME_EXTEND + rb_event_length(skip_time_extend(event));
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt47100.00%1100.00%
Total47100.00%1100.00%

/**
 * ring_buffer_event_length - return the length of the event
 * @event: the event to get the length of
 *
 * For a data event this is the size of the data load only. For any
 * other event type it is the size of the event itself, except for a
 * TIME EXTEND, which is skipped so that the data load of the event
 * following it is reported.
 */
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
	unsigned size;

	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
		event = skip_time_extend(event);

	size = rb_event_length(event);

	/* Not a data event: report the whole event. */
	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
		return size;

	size -= RB_EVNT_HDR_SIZE;

	/*
	 * Beyond this size the length lives in array[0], which is then
	 * not part of the data load.
	 */
	if (size > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
		size -= sizeof(event->array[0]);

	return size;
}

Contributors

PersonTokensPropCommitsCommitProp
Robert Richter4454.32%125.00%
Steven Rostedt3441.98%250.00%
Lai Jiangshan33.70%125.00%
Total81100.00%4100.00%

EXPORT_SYMBOL_GPL(ring_buffer_event_length);

/*
 * rb_event_data - locate the data load of a data event
 *
 * A leading time extend is skipped first. Anything that is then not
 * a data event triggers BUG_ON().
 *
 * inline for ring buffer fast paths
 */
static __always_inline void *
rb_event_data(struct ring_buffer_event *event)
{
	int first;

	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
		event = skip_time_extend(event);

	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);

	/*
	 * With the length in type_len the data starts at array[0];
	 * otherwise array[0] holds the length and data starts at array[1].
	 */
	first = event->type_len ? 0 : 1;

	return (void *)&event->array[first];
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt6794.37%375.00%
Lai Jiangshan45.63%125.00%
Total71100.00%4100.00%

/**
 * ring_buffer_event_data - return the data of the event
 * @event: the event to get the data from
 *
 * Exported wrapper around rb_event_data().
 */
void *ring_buffer_event_data(struct ring_buffer_event *event)
{
	void *payload = rb_event_data(event);

	return payload;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt17100.00%1100.00%
Total17100.00%1100.00%

EXPORT_SYMBOL_GPL(ring_buffer_event_data);

/* Iterate @cpu over every CPU set in @buffer's cpumask. */
#define for_each_buffer_cpu(buffer, cpu)		\
	for_each_cpu(cpu, buffer->cpumask)

/* A time delta must fit in TS_SHIFT bits; TS_DELTA_TEST selects the bits above. */
#define TS_SHIFT	27
#define TS_MASK		((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST	(~TS_MASK)

/* Flag when events were overwritten */
#define RB_MISSED_EVENTS	(1 << 31)
/* Missed count stored at end */
#define RB_MISSED_STORED	(1 << 30)

#define RB_MISSED_FLAGS		(RB_MISSED_EVENTS|RB_MISSED_STORED)

/* Layout of one data page: a small header followed by the event data. */
struct buffer_data_page {
	u64		 time_stamp;	/* page time stamp */
	local_t		 commit;	/* write committed index */
	unsigned char	 data[] RB_ALIGN_DATA;	/* data of buffer page */
};

/*
 * Note, the buffer_page list must be first. The buffer pages
 * are allocated in cache lines, which means that each buffer
 * page will be at the beginning of a cache line, and thus
 * the least significant bits will be zero. We use this to
 * add flags in the list struct pointers, to make the ring buffer
 * lockless.
 */
struct buffer_page {
	struct list_head list;		/* list of buffer pages */
	local_t		 write;		/* index for next write */
	unsigned	 read;		/* index for next read */
	local_t		 entries;	/* entries on this page */
	unsigned long	 real_end;	/* real end of data */
	struct buffer_data_page *page;	/* Actual data page */
};

/*
 * The buffer page counters, write and entries, must be reset
 * atomically when crossing page boundaries. To synchronize this
 * update, two counters are inserted into the number. One is
 * the actual counter for the write position or count on the page.
 *
 * The other is a counter of updaters. Before an update happens
 * the update partition of the counter is incremented. This will
 * allow the updater to update the counter atomically.
 *
 * The counter is 20 bits, and the state data is 12.
 */
#define RB_WRITE_MASK		0xfffff
#define RB_WRITE_INTCNT		(1 << 20)
static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); }

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt21100.00%2100.00%
Total21100.00%2100.00%

/**
 * ring_buffer_page_len - the size of data on the page.
 * @page: The page to read
 *
 * Returns the amount of data on the page, including buffer page header.
 */
size_t ring_buffer_page_len(void *page)
{
	struct buffer_data_page *bpage = page;
	long committed;

	/* The commit word also carries the RB_MISSED_* flags; drop them. */
	committed = local_read(&bpage->commit) & ~RB_MISSED_FLAGS;

	return committed + BUF_PAGE_HDR_SIZE;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt32100.00%3100.00%
Total32100.00%3100.00%

/*
 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
 * this issue out.
 */

/*
 * free_buffer_page - release a buffer page
 *
 * Frees the data page first, then the buffer_page descriptor itself.
 */
static void free_buffer_page(struct buffer_page *bpage)
{
	unsigned long data_addr = (unsigned long)bpage->page;

	free_page(data_addr);
	kfree(bpage);
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt27100.00%3100.00%
Total27100.00%3100.00%

/*
 * We need to fit the time_stamp delta into 27 bits.
 *
 * Returns 1 when @delta has any bit set above those 27 bits
 * (i.e. it does not fit), 0 when it fits.
 */
static inline int test_time_stamp(u64 delta)
{
	return (delta & TS_DELTA_TEST) ? 1 : 0;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt22100.00%1100.00%
Total22100.00%1100.00%

/* Bytes of a page left for event data once the page header is subtracted. */
#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)

/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; trace_seq_printf(s, "\tfield: u64 timestamp;\t" "offset:0;\tsize:%u;\tsigned:%u;\n", (unsigned int)sizeof(field.time_stamp), (unsigned int)is_signed_type(u64)); trace_seq_printf(s, "\tfield: local_t commit;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: int overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), (unsigned int)BUF_PAGE_SIZE, (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); }

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt13181.37%375.00%
Tom Zanussi3018.63%125.00%
Total161100.00%4100.00%

/*
 * An irq_work bundled with the wait queues it wakes and the flags
 * that record whether anybody is waiting on them.
 */
struct rb_irq_work {
	struct irq_work			work;
	wait_queue_head_t		waiters;
	wait_queue_head_t		full_waiters;
	bool				waiters_pending;
	bool				full_waiters_pending;
	bool				wakeup_full;
};

/*
 * Structure to hold event state and handle nested events.
 */
struct rb_event_info {
	u64			ts;
	u64			delta;
	unsigned long		length;
	struct buffer_page	*tail_page;
	int			add_timestamp;
};

/*
 * Used for which event context the event is in.
 *  NMI     = 0
 *  IRQ     = 1
 *  SOFTIRQ = 2
 *  NORMAL  = 3
 *
 * See trace_recursive_lock() comment below for more details.
 */
enum {
	RB_CTX_NMI,
	RB_CTX_IRQ,
	RB_CTX_SOFTIRQ,
	RB_CTX_NORMAL,
	RB_CTX_MAX
};

/*
 * head_page == tail_page && head == tail then buffer is empty.
 */
struct ring_buffer_per_cpu {
	int				cpu;
	atomic_t			record_disabled;
	struct ring_buffer		*buffer;
	raw_spinlock_t			reader_lock;	/* serialize readers */
	arch_spinlock_t			lock;
	struct lock_class_key		lock_key;
	struct buffer_data_page		*free_page;
	unsigned long			nr_pages;
	unsigned int			current_context;
	struct list_head		*pages;
	struct buffer_page		*head_page;	/* read from head */
	struct buffer_page		*tail_page;	/* write to tail */
	struct buffer_page		*commit_page;	/* committed pages */
	struct buffer_page		*reader_page;
	unsigned long			lost_events;
	unsigned long			last_overrun;
	local_t				entries_bytes;
	local_t				entries;
	local_t				overrun;
	local_t				commit_overrun;
	local_t				dropped_events;
	local_t				committing;
	local_t				commits;
	unsigned long			read;
	unsigned long			read_bytes;
	u64				write_stamp;
	u64				read_stamp;
	/* ring buffer pages to update, > 0 to add, < 0 to remove */
	long				nr_pages_to_update;
	struct list_head		new_pages; /* new pages to add */
	struct work_struct		update_pages_work;
	struct completion		update_done;

	struct rb_irq_work		irq_work;
};

/*
 * The ring buffer as a whole: the per cpu buffers (indexed by cpu
 * number), the cpumask naming the valid ones and the clock callback
 * used for time stamps.
 */
struct ring_buffer {
	unsigned			flags;
	int				cpus;
	atomic_t			record_disabled;
	atomic_t			resize_disabled;
	cpumask_var_t			cpumask;

	struct lock_class_key		*reader_lock_key;

	struct mutex			mutex;

	struct ring_buffer_per_cpu	**buffers;

	struct hlist_node		node;
	u64				(*clock)(void);

	struct rb_irq_work		irq_work;
};

/* NOTE(review): presumably the read cursor of an iterator over one per cpu buffer -- confirm against the iterator code. */
struct ring_buffer_iter {
	struct ring_buffer_per_cpu	*cpu_buffer;
	unsigned long			head;
	struct buffer_page		*head_page;
	struct buffer_page		*cache_reader_page;
	unsigned long			cache_read;
	u64				read_stamp;
};

/*
 * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
 *
 * Schedules a delayed work to wake up any task that is blocked on the
 * ring buffer waiters queue.
 */
static void rb_wake_up_waiters(struct irq_work *work)
{
	/* @work is embedded in a struct rb_irq_work; recover the container. */
	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);

	/* Ordinary waiters are always woken. */
	wake_up_all(&rbwork->waiters);

	/* Full-page waiters are woken only when a wakeup was requested. */
	if (!rbwork->wakeup_full)
		return;

	rbwork->wakeup_full = false;
	wake_up_all(&rbwork->full_waiters);
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt56100.00%3100.00%
Total56100.00%3100.00%

/**
 * ring_buffer_wait - wait for input to the ring buffer
 * @buffer: buffer to wait on
 * @cpu: the cpu buffer to wait on
 * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
 *
 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
 * as data is added to any of the @buffer's cpu buffers. Otherwise
 * it will wait for data to be added to a specific cpu buffer.
 *
 * Returns 0 when the wait ended because the buffer was found non-empty
 * (and, for @full, the reader page is no longer the commit page),
 * -ENODEV if @cpu is not in @buffer's cpumask, or -EINTR if a signal
 * was pending for the current task.
 */
int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
{
	/* Only assigned, and only used, when @cpu names a specific CPU. */
	struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
	DEFINE_WAIT(wait);
	struct rb_irq_work *work;
	int ret = 0;

	/*
	 * Depending on what the caller is waiting for, either any
	 * data in any cpu buffer, or a specific buffer, put the
	 * caller on the appropriate wait queue.
	 */
	if (cpu == RING_BUFFER_ALL_CPUS) {
		work = &buffer->irq_work;
		/* Full only makes sense on per cpu reads */
		full = false;
	} else {
		if (!cpumask_test_cpu(cpu, buffer->cpumask))
			return -ENODEV;
		cpu_buffer = buffer->buffers[cpu];
		work = &cpu_buffer->irq_work;
	}


	while (true) {
		if (full)
			prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
		else
			prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);

		/*
		 * The events can happen in critical sections where
		 * checking a work queue can cause deadlocks.
		 * After adding a task to the queue, this flag is set
		 * only to notify events to try to wake up the queue
		 * using irq_work.
		 *
		 * We don't clear it even if the buffer is no longer
		 * empty. The flag only causes the next event to run
		 * irq_work to do the work queue wake up. The worst
		 * that can happen if we race with !trace_empty() is that
		 * an event will cause an irq_work to try to wake up
		 * an empty queue.
		 *
		 * There's no reason to protect this flag either, as
		 * the work queue and irq_work logic will do the necessary
		 * synchronization for the wake ups. The only thing
		 * that is necessary is that the wake up happens after
		 * a task has been queued. It's OK for spurious wake ups.
		 */
		if (full)
			work->full_waiters_pending = true;
		else
			work->waiters_pending = true;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
			break;

		if (cpu != RING_BUFFER_ALL_CPUS &&
		    !ring_buffer_empty_cpu(buffer, cpu)) {
			unsigned long flags;
			bool pagebusy;

			/* Any data at all satisfies a non-full waiter. */
			if (!full)
				break;

			/*
			 * A full waiter keeps waiting while the reader page
			 * is still the commit page; the comparison is made
			 * under the reader lock.
			 */
			raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
			pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
			raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);

			if (!pagebusy)
				break;
		}

		schedule();
	}

	/* Leave the same queue that prepare_to_wait() put us on. */
	if (full)
		finish_wait(&work->full_waiters, &wait);
	else
		finish_wait(&work->waiters, &wait);

	return ret;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt18567.03%583.33%
Rabin Vincent9132.97%116.67%
Total276100.00%6100.00%

/**
 * ring_buffer_poll_wait - poll on buffer input
 * @buffer: buffer to wait on
 * @cpu: the cpu buffer to wait on
 * @filp: the file descriptor
 * @poll_table: The poll descriptor
 *
 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
 * as data is added to any of the @buffer's cpu buffers. Otherwise
 * it will wait for data to be added to a specific cpu buffer.
 *
 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
 * POLLERR if @cpu is not in @buffer's cpumask, zero otherwise.
 */
int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
			  struct file *filp, poll_table *poll_table)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	struct rb_irq_work *work;

	if (cpu == RING_BUFFER_ALL_CPUS)
		work = &buffer->irq_work;
	else {
		/*
		 * The return value is a poll event mask, so a bad cpu is
		 * reported as POLLERR. A negative errno here would be read
		 * by the caller as a mask with arbitrary event bits set.
		 */
		if (!cpumask_test_cpu(cpu, buffer->cpumask))
			return POLLERR;

		cpu_buffer = buffer->buffers[cpu];
		work = &cpu_buffer->irq_work;
	}

	poll_wait(filp, &work->waiters, poll_table);
	work->waiters_pending = true;
	/*
	 * There's a tight race between setting the waiters_pending and
	 * checking if the ring buffer is empty.  Once the waiters_pending bit
	 * is set, the next event will wake the task up, but we can get stuck
	 * if there's only a single event in.
	 *
	 * FIXME: Ideally, we need a memory barrier on the writer side as well,
	 * but adding a memory barrier to all events will cause too much of a
	 * performance hit in the fast path.  We only need a memory barrier when
	 * the buffer goes from empty to having content.  But as this race is
	 * extremely small, and it's not a problem if another event comes in, we
	 * will fix it later.
	 */
	smp_mb();

	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
		return POLLIN | POLLRDNORM;
	return 0;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt12892.75%375.00%
Josef Bacik107.25%125.00%
Total138100.00%4100.00%

/*
 * RB_WARN_ON - warn and disable recording when @cond is true
 *
 * Evaluates to the (unlikely) truth value of @cond. When it is true,
 * record_disabled of the owning ring_buffer is incremented before
 * WARN_ON(1) fires.
 *
 * buffer may be either ring_buffer or ring_buffer_per_cpu
 */
#define RB_WARN_ON(b, cond)						\
	({								\
		int _____ret = unlikely(cond);				\
		if (_____ret) {						\
			if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
				struct ring_buffer_per_cpu *__b =	\
					(void *)b;			\
				atomic_inc(&__b->buffer->record_disabled); \
			} else						\
				atomic_inc(&b->record_disabled);	\
			WARN_ON(1);					\
		}							\
		_____ret;						\
	})

/* Up this if you want to test the TIME_EXTENDS and normalization */
#define DEBUG_SHIFT 0
/*
 * rb_time_stamp - read the buffer's clock
 *
 * The raw clock value is shifted left by DEBUG_SHIFT, which exists to
 * debug/test normalization and time extends.
 */
static inline u64 rb_time_stamp(struct ring_buffer *buffer)
{
	u64 now = buffer->clock();

	return now << DEBUG_SHIFT;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt21100.00%1100.00%
Total21100.00%1100.00%


/*
 * ring_buffer_time_stamp - read the current time stamp of @buffer
 * @buffer: the ring buffer whose clock is read
 * @cpu: not used by this implementation
 *
 * The clock is sampled with preemption disabled.
 */
u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
{
	u64 stamp;

	preempt_disable_notrace();
	stamp = rb_time_stamp(buffer);
	preempt_enable_no_resched_notrace();

	return stamp;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt32100.00%2100.00%
Total32100.00%2100.00%

EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);

/*
 * ring_buffer_normalize_time_stamp - undo the debug shift on a time stamp
 * @buffer: not used by this implementation
 * @cpu: not used by this implementation
 * @ts: time stamp to normalize in place
 */
void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
				      int cpu, u64 *ts)
{
	/* Just stupid testing the normalize function and deltas */
	*ts = *ts >> DEBUG_SHIFT;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt23100.00%1100.00%
Total23100.00%1100.00%

EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);

/*
 * Making the ring buffer lockless makes things tricky.
 * Although writes only happen on the CPU that they are on,
 * and they only need to worry about interrupts. Reads can
 * happen on any CPU.
 *
 * The reader page is always off the ring buffer, but when the
 * reader finishes with a page, it needs to swap its page with
 * a new one from the buffer. The reader needs to take from
 * the head (writes go to the tail). But if a writer is in overwrite
 * mode and wraps, it must push the head page forward.
 *
 * Here lies the problem.
 *
 * The reader must be careful to replace only the head page, and
 * not another one. As described at the top of the file in the
 * ASCII art, the reader sets its old page to point to the next
 * page after head. It then sets the page after head to point to
 * the old reader page. But if the writer moves the head page
 * during this operation, the reader could end up with the tail.
 *
 * We use cmpxchg to help prevent this race. We also do something
 * special with the page before head. We set the LSB to 1.
 *
 * When the writer must push the page forward, it will clear the
 * bit that points to the head page, move the head, and then set
 * the bit that points to the new head page.
 *
 * We also don't want an interrupt coming in and moving the head
 * page on another writer. Thus we use the second LSB to catch
 * that too. Thus:
 *
 * head->list->prev->next        bit 1          bit 0
 *                              -------        -------
 * Normal page                     0              0
 * Points to head page             0              1
 * New head page                   1              0
 *
 * Note we can not trust the prev pointer of the head page, because:
 *
 * +----+       +-----+        +-----+
 * |    |------>|  T  |---X--->|  N  |
 * |    |<------|     |        |     |
 * +----+       +-----+        +-----+
 *   ^                           ^ |
 *   |          +-----+          | |
 *   +----------|  R  |----------+ |
 *              |     |<-----------+
 *              +-----+
 *
 * Key:  ---X-->  HEAD flag set in pointer
 *         T      Tail page
 *         R      Reader page
 *         N      Next page
 *
 * (see __rb_reserve_next() to see where this happens)
 *
 *  What the above shows is that the reader just swapped out
 *  the reader page with a page in the buffer, but before it
 *  could make the new header point back to the new page added
 *  it was preempted by a writer. The writer moved forward onto
 *  the new page added by the reader and is about to move forward
 *  again.
 *
 *  You can see, it is legitimate for the previous pointer of
 *  the head (or any page) not to point back to itself. But only
 *  temporarily.
 */

/* Flag values kept in the low bits of a list pointer (see the table above). */
#define RB_PAGE_NORMAL		0UL
#define RB_PAGE_HEAD		1UL
#define RB_PAGE_UPDATE		2UL


#define RB_FLAG_MASK		3UL

/* PAGE_MOVED is not part of the mask */
#define RB_PAGE_MOVED		4UL

/*
 * rb_list_head - remove any bit
 */
static struct list_head *rb_list_head(struct list_head *list)
{
	/* Strip the RB_FLAG_MASK bits to get the plain pointer back. */
	unsigned long bits = (unsigned long)list & ~RB_FLAG_MASK;

	return (struct list_head *)bits;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt36100.00%2100.00%
Total36100.00%2100.00%

/*
 * rb_is_head_page - test if the given page is the head page
 *
 * Because the reader may move the head_page pointer, we can
 * not trust what the head page is (it may be pointing to
 * the reader page). But if the next page is a header page,
 * its flags will be non zero.
 *
 * @list is the list node whose ->next is examined. Returns
 * RB_PAGE_MOVED when that pointer does not lead to @page, otherwise
 * the flag bits stored in it.
 */
static inline int
rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
		struct buffer_page *page, struct list_head *list)
{
	unsigned long next = (unsigned long)list->next;
	unsigned long target = next & ~RB_FLAG_MASK;

	if (target == (unsigned long)&page->list)
		return next & RB_FLAG_MASK;

	return RB_PAGE_MOVED;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt6198.39%375.00%
Jesper Juhl11.61%125.00%
Total62100.00%4100.00%

/*
 * rb_is_reader_page
 *
 * The unique thing about the reader page, is that, if the
 * writer is ever on it, the previous pointer never points
 * back to the reader page.
 *
 * Returns true when @page's previous node does not link forward
 * to @page.
 */
static bool rb_is_reader_page(struct buffer_page *page)
{
	struct list_head *before = page->list.prev;
	struct list_head *forward = rb_list_head(before->next);

	return forward != &page->list;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt3497.14%266.67%
Yaowei Bai12.86%133.33%
Total35100.00%3100.00%

/*
 * rb_set_list_to_head - set a list_head to be pointing to head.
 *
 * Sets the HEAD flag in @list's next pointer and then clears the
 * UPDATE flag, as two separate stores.
 */
static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
				struct list_head *list)
{
	unsigned long *next_word = (unsigned long *)&list->next;

	*next_word |= RB_PAGE_HEAD;
	*next_word &= ~RB_PAGE_UPDATE;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt44100.00%1100.00%
Total44100.00%1100.00%

/*
 * rb_head_page_activate - sets up head page
 *
 * Does nothing when the cpu buffer has no head page.
 */
static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
{
	struct buffer_page *head = cpu_buffer->head_page;

	/* The HEAD flag lives in the pointer of the page before head. */
	if (head)
		rb_set_list_to_head(cpu_buffer, head->list.prev);
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt40100.00%1100.00%
Total40100.00%1100.00%


/*
 * rb_list_head_clear - clear all flag bits in @list's next pointer
 */
static void rb_list_head_clear(struct list_head *list)
{
	unsigned long *next_word = (unsigned long *)&list->next;

	*next_word = *next_word & ~RB_FLAG_MASK;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt32100.00%1100.00%
Total32100.00%1100.00%

/*
 * rb_head_page_deactivate - clears head page ptr (for free list)
 */
static void
rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
{
	struct list_head *pos;

	/* Clear the list anchor first, then every node reachable from it. */
	rb_list_head_clear(cpu_buffer->pages);

	list_for_each(pos, cpu_buffer->pages)
		rb_list_head_clear(pos);
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt35100.00%1100.00%
Total35100.00%1100.00%


/*
 * rb_head_page_set - atomically change the flag on the pointer to @head
 *
 * Uses cmpxchg on @prev's next pointer, expecting it to reference
 * @head with @old_flag and replacing that with @new_flag.
 *
 * Returns RB_PAGE_MOVED if the pointer no longer referenced @head,
 * otherwise the flag bits that were found in the pointer.
 */
static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
			    struct buffer_page *head,
			    struct buffer_page *prev,
			    int old_flag, int new_flag)
{
	unsigned long target = ((unsigned long)&head->list) & ~RB_FLAG_MASK;
	unsigned long *slot = (unsigned long *)&prev->list.next;
	unsigned long seen;

	seen = cmpxchg(slot, target | old_flag, target | new_flag);

	/* check if the reader took the page */
	if ((seen & ~RB_FLAG_MASK) == target)
		return seen & RB_FLAG_MASK;

	return RB_PAGE_MOVED;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt104100.00%2100.00%
Total104100.00%2100.00%


/*
 * rb_head_page_set_update - move the pointer to @head from @old_flag
 * to RB_PAGE_UPDATE. See rb_head_page_set() for the return value.
 */
static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
				   struct buffer_page *head,
				   struct buffer_page *prev,
				   int old_flag)
{
	int found = rb_head_page_set(cpu_buffer, head, prev,
				     old_flag, RB_PAGE_UPDATE);

	return found;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt38100.00%1100.00%
Total38100.00%1100.00%


/*
 * rb_head_page_set_head - move the pointer to @head from @old_flag
 * to RB_PAGE_HEAD. See rb_head_page_set() for the return value.
 */
static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
				 struct buffer_page *head,
				 struct buffer_page *prev,
				 int old_flag)
{
	int found = rb_head_page_set(cpu_buffer, head, prev,
				     old_flag, RB_PAGE_HEAD);

	return found;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt38100.00%1100.00%
Total38100.00%1100.00%


/*
 * rb_head_page_set_normal - move the pointer to @head from @old_flag
 * to RB_PAGE_NORMAL. See rb_head_page_set() for the return value.
 */
static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
				   struct buffer_page *head,
				   struct buffer_page *prev,
				   int old_flag)
{
	int found = rb_head_page_set(cpu_buffer, head, prev,
				     old_flag, RB_PAGE_NORMAL);

	return found;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt38100.00%1100.00%
Total38100.00%1100.00%


/*
 * rb_inc_page - advance *@bpage to the next page in the list
 *
 * The flag bits are stripped from the next pointer before it is used.
 */
static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
			       struct buffer_page **bpage)
{
	struct buffer_page *cur = *bpage;
	struct list_head *following = rb_list_head(cur->list.next);

	*bpage = list_entry(following, struct buffer_page, list);
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt48100.00%1100.00%
Total48100.00%1100.00%


static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; struct buffer_page *page; struct list_head *list; int i; if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) return NULL; /* sanity check */ list = cpu_buffer->pages; if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) return NULL; page = head = cpu_buffer->head_page; /* * It is possible that the writer moves the header behind * where we started, and we miss in one loop. * A second loop should grab the header, but we'll do * three loops just because I'm paranoid. */ for (i = 0; i < 3; i++) { do { if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { cpu_buffer->head_page = page; return page; } rb_inc_page(cpu_buffer, &page); } while (page != head); } RB_WARN_ON(cpu_buffer, 1); return NULL; }

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt152100.00%1100.00%
Total152100.00%1100.00%


/*
 * rb_head_page_replace - swap @new in for the head page @old
 *
 * The pointer leading to @old (old->list.prev->next) is expected to
 * carry the HEAD flag; cmpxchg replaces it with a plain pointer to
 * @new.
 *
 * Returns non-zero if the exchange took place, 0 otherwise.
 */
static int rb_head_page_replace(struct buffer_page *old,
				struct buffer_page *new)
{
	unsigned long *slot = (unsigned long *)&old->list.prev->next;
	unsigned long expected;
	unsigned long seen;

	/* Current target of the pointer, required to have the HEAD flag. */
	expected = (*slot & ~RB_FLAG_MASK) | RB_PAGE_HEAD;

	seen = cmpxchg(slot, expected, (unsigned long)&new->list);

	return seen == expected;
}

Contributors

PersonTokensPropCommitsCommitProp
Steven Rostedt78100.00%2100.00%
Total78100.00%2100.00%

/* * rb_tail_page_update - move the tail page forward */
static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page