ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler

The current method to prevent the ring buffer from entering into a recursize
loop is to use a bitmask and set the bit that maps to the current context
(normal, softirq, irq or NMI), and if that bit was already set, it is
considered a recursive loop.

New code is being added that may require the ring buffer to be entered a
second time in the current context. The recursive locking prevents that from
happening. Instead of mapping a bitmask to the current context, just allow 4
levels of nesting in the ring buffer. This matches the 4 context levels that
it can already nest. It is highly unlikely to have more than two levels,
thus it should be fine when we add the second entry into the ring buffer. If
that proves to be a problem, we can always up the number to 8.

An added benefit is that reading preempt_count() to get the current level
adds a very slight but noticeable overhead. This removes that need.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
(cherry picked from commit 1a149d7d3f45d311da1f63473736c05f30ae8a75)
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: azrim <mirzaspc@gmail.com>
This commit is contained in:
Steven Rostedt (VMware) 2017-09-22 16:59:02 -04:00 committed by azrim
parent 24f8a1ef03
commit 27f98b799c
No known key found for this signature in database
GPG Key ID: 497F8FB059B45D1C

View File

@ -41,6 +41,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
RINGBUF_TYPE_PADDING);
trace_seq_printf(s, "\ttime_extend : type == %d\n",
RINGBUF_TYPE_TIME_EXTEND);
trace_seq_printf(s, "\ttime_stamp : type == %d\n",
RINGBUF_TYPE_TIME_STAMP);
trace_seq_printf(s, "\tdata max type_len == %d\n",
RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
@ -140,12 +142,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
enum {
RB_LEN_TIME_EXTEND = 8,
RB_LEN_TIME_STAMP = 16,
RB_LEN_TIME_STAMP = 8,
};
#define skip_time_extend(event) \
((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
#define extended_time(event) \
(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@ -209,7 +214,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
{
unsigned len = 0;
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
if (extended_time(event)) {
/* time extends include the data event after it */
len = RB_LEN_TIME_EXTEND;
event = skip_time_extend(event);
@ -231,7 +236,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
unsigned length;
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
if (extended_time(event))
event = skip_time_extend(event);
length = rb_event_length(event);
@ -248,7 +253,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static __always_inline void *
rb_event_data(struct ring_buffer_event *event)
{
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
if (extended_time(event))
event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
@ -275,6 +280,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
/**
* ring_buffer_event_time_stamp - return the event's extended timestamp
* @event: the event to get the timestamp of
*
* Returns the extended timestamp associated with a data event.
* An extended time_stamp is a 64-bit timestamp represented
* internally in a special way that makes the best use of space
* contained within a ring buffer event. This function decodes
* it and maps it to a straight u64 value.
*/
u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
ts = event->array[0];
ts <<= TS_SHIFT;
ts += event->time_delta;
return ts;
}
/* Flag when events were overwritten */
#define RB_MISSED_EVENTS (1 << 31)
/* Missed count stored at end */
@ -453,6 +479,7 @@ struct ring_buffer_per_cpu {
struct buffer_page *reader_page;
unsigned long lost_events;
unsigned long last_overrun;
unsigned long nest;
local_t entries_bytes;
local_t entries;
local_t overrun;
@ -490,6 +517,7 @@ struct ring_buffer {
u64 (*clock)(void);
struct rb_irq_work irq_work;
bool time_stamp_abs;
};
struct ring_buffer_iter {
@ -1389,6 +1417,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
buffer->clock = clock;
}
void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
{
buffer->time_stamp_abs = abs;
}
bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
{
return buffer->time_stamp_abs;
}
static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
@ -2221,12 +2259,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path, do not inline */
static noinline struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
else
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page? */
if (rb_event_index(event)) {
/* Not the first event on the page, or not delta? */
if (abs || rb_event_index(event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@ -2269,7 +2310,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
* add it to the start of the resevered space.
*/
if (unlikely(info->add_timestamp)) {
event = rb_add_time_stamp(event, delta);
bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
event = rb_add_time_stamp(event, info->delta, abs);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
@ -2457,7 +2500,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
static inline void rb_event_discard(struct ring_buffer_event *event)
{
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
if (extended_time(event))
event = skip_time_extend(event);
/* array[0] holds the actual length for the discarded event */
@ -2501,10 +2544,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
delta = event->array[0];
delta <<= TS_SHIFT;
delta += event->time_delta;
delta = ring_buffer_event_time_stamp(event);
cpu_buffer->write_stamp += delta;
} else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
delta = ring_buffer_event_time_stamp(event);
cpu_buffer->write_stamp = delta;
} else
cpu_buffer->write_stamp += event->time_delta;
}
@ -2611,19 +2655,16 @@ static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
unsigned long pc = preempt_count();
int bit;
if (in_interrupt()) {
if (in_nmi())
bit = RB_CTX_NMI;
else if (in_irq())
bit = RB_CTX_IRQ;
else
bit = RB_CTX_SOFTIRQ;
} else
if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
bit = RB_CTX_NORMAL;
else
bit = pc & NMI_MASK ? RB_CTX_NMI :
pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
if (unlikely(val & (1 << bit))) {
if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
/*
* It is possible that this was called by transitioning
* between interrupt context, and preempt_count() has not
@ -2634,7 +2675,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
return 1;
}
val |= (1 << bit);
val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
return 0;
@ -2643,7 +2684,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
cpu_buffer->current_context &= cpu_buffer->current_context - 1;
cpu_buffer->current_context &=
cpu_buffer->current_context - (1 << cpu_buffer->nest);
}
/* The recursive locking above uses 4 bits */
#define NESTED_BITS 4
/**
* ring_buffer_nest_start - Allow to trace while nested
* @buffer: The ring buffer to modify
*
* The ring buffer has a safty mechanism to prevent recursion.
* But there may be a case where a trace needs to be done while
* tracing something else. In this case, calling this function
* will allow this function to nest within a currently active
* ring_buffer_lock_reserve().
*
* Call this function before calling another ring_buffer_lock_reserve() and
* call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
*/
void ring_buffer_nest_start(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
/* Enabled by ring_buffer_nest_end() */
preempt_disable_notrace();
cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
/* This is the shift value for the above recusive locking */
cpu_buffer->nest += NESTED_BITS;
}
/**
* ring_buffer_nest_end - Allow to trace while nested
* @buffer: The ring buffer to modify
*
* Must be called after ring_buffer_nest_start() and after the
* ring_buffer_unlock_commit().
*/
void ring_buffer_nest_end(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
/* disabled by ring_buffer_nest_start() */
cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
/* This is the shift value for the above recusive locking */
cpu_buffer->nest -= NESTED_BITS;
preempt_enable_notrace();
}
/**
@ -2719,7 +2810,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
if (!tail)
if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
info->delta = 0;
/* See if we shot pass the end of this buffer page */
@ -2796,8 +2887,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
/* make sure this diff is calculated here */
barrier();
/* Did the write stamp get updated already? */
if (likely(info.ts >= cpu_buffer->write_stamp)) {
if (ring_buffer_time_stamp_abs(buffer)) {
info.delta = info.ts;
rb_handle_timestamp(cpu_buffer, &info);
} else /* Did the write stamp get updated already? */
if (likely(info.ts >= cpu_buffer->write_stamp)) {
info.delta = diff;
if (unlikely(test_time_stamp(info.delta)))
rb_handle_timestamp(cpu_buffer, &info);
@ -3515,14 +3609,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return;
case RINGBUF_TYPE_TIME_EXTEND:
delta = event->array[0];
delta <<= TS_SHIFT;
delta += event->time_delta;
delta = ring_buffer_event_time_stamp(event);
cpu_buffer->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
/* FIXME: not implemented */
delta = ring_buffer_event_time_stamp(event);
cpu_buffer->read_stamp = delta;
return;
case RINGBUF_TYPE_DATA:
@ -3546,14 +3639,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
case RINGBUF_TYPE_TIME_EXTEND:
delta = event->array[0];
delta <<= TS_SHIFT;
delta += event->time_delta;
delta = ring_buffer_event_time_stamp(event);
iter->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
/* FIXME: not implemented */
delta = ring_buffer_event_time_stamp(event);
iter->read_stamp = delta;
return;
case RINGBUF_TYPE_DATA:
@ -3777,6 +3869,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
struct buffer_page *reader;
int nr_loops = 0;
if (ts)
*ts = 0;
again:
/*
* We repeat when a time extend is encountered.
@ -3813,12 +3907,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
goto again;
case RINGBUF_TYPE_TIME_STAMP:
/* FIXME: not implemented */
if (ts) {
*ts = ring_buffer_event_time_stamp(event);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
/* Internal data, OK to advance */
rb_advance_reader(cpu_buffer);
goto again;
case RINGBUF_TYPE_DATA:
if (ts) {
if (ts && !(*ts)) {
*ts = cpu_buffer->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
@ -3843,6 +3942,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *event;
int nr_loops = 0;
if (ts)
*ts = 0;
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
@ -3895,12 +3997,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
goto again;
case RINGBUF_TYPE_TIME_STAMP:
/* FIXME: not implemented */
if (ts) {
*ts = ring_buffer_event_time_stamp(event);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
/* Internal data, OK to advance */
rb_advance_iter(iter);
goto again;
case RINGBUF_TYPE_DATA:
if (ts) {
if (ts && !(*ts)) {
*ts = iter->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(buffer,
cpu_buffer->cpu, ts);