Web lists-archives.com

[PATCH v1 4/6] perf: Allow using AUX data in perf samples




AUX data can be used to annotate perf events such as performance counters
or tracepoints/breakpoints by including it in sample records when the
PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in
debugging and profiling by providing, for example, a history of the
instruction flow leading up to the event's overflow.

To do this, the AUX event's file descriptor is passed to the perf syscall
as group_fd, with the PERF_FLAG_FD_SAMPLE flag set and the PERF_SAMPLE_AUX
bit set in the sample type. Also, a new attribute field is added to allow
the user to specify the desired size of the AUX sample:
attr.aux_sample_size. The size actually used is capped so that the whole
sample record still fits into the 16-bit header size field.

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
 include/linux/perf_event.h      |  10 ++
 include/uapi/linux/perf_event.h |   8 +-
 kernel/events/core.c            | 158 +++++++++++++++++++++++++++++++-
 3 files changed, 174 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7546822a1d74..9f9e341d45cf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -102,6 +102,12 @@ struct perf_branch_stack {
 	struct perf_branch_entry	entries[0];
 };
 
+struct perf_aux_record {
+	u64		size;	/* bytes the AUX payload takes in the sample */
+	unsigned long	from;	/* AUX buffer offset the copy starts at */
+	unsigned long	to;	/* AUX buffer offset the copy ends at */
+};
+
 struct task_struct;
 
 /*
@@ -674,6 +680,8 @@ struct perf_event {
 	struct bpf_prog			*prog;
 #endif
 
+	struct perf_event		*sample_event;
+
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call		*tp_event;
 	struct event_filter		*filter;
@@ -882,6 +890,7 @@ struct perf_sample_data {
 	 */
 	u64				addr;
 	struct perf_raw_record		*raw;
+	struct perf_aux_record		aux;
 	struct perf_branch_stack	*br_stack;
 	u64				period;
 	u64				weight;
@@ -933,6 +942,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	/* remaining struct members initialized in perf_prepare_sample() */
 	data->addr = addr;
 	data->raw  = NULL;
+	data->aux.from = data->aux.to = data->aux.size = 0;
 	data->br_stack = NULL;
 	data->period = period;
 	data->weight = 0;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c77c9a2ebbbb..19a22b161e39 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_AUX				= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 };
 
 /*
@@ -298,6 +299,7 @@ enum perf_event_read_format {
 					/* add: sample_stack_user */
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6	120	/* add: aux_sample_size */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -416,6 +418,7 @@ struct perf_event_attr {
 	__u32	aux_watermark;
 	__u16	sample_max_stack;
 	__u16	__reserved_2;	/* align to __u64 */
+	__u64	aux_sample_size;
 };
 
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
@@ -820,6 +823,8 @@ enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			size;
+	 *	  char			data[size]; } && PERF_SAMPLE_AUX
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
@@ -952,6 +957,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
 #define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
 #define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
+#define PERF_FLAG_FD_SAMPLE		(1UL << 4) /* use fd event to sample AUX data */
 
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 union perf_mem_data_src {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e1fce335a42a..70918ed33143 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -346,7 +346,8 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP |\
-		       PERF_FLAG_FD_CLOEXEC)
+		       PERF_FLAG_FD_CLOEXEC |\
+		       PERF_FLAG_FD_SAMPLE)
 
 /*
  * branch priv levels that need permission checks
@@ -3937,6 +3938,8 @@ static void unaccount_freq_event(void)
 		atomic_dec(&nr_freq_events);
 }
 
+static void put_event(struct perf_event *event);
+
 static void unaccount_event(struct perf_event *event)
 {
 	bool dec = false;
@@ -3970,6 +3973,9 @@ static void unaccount_event(struct perf_event *event)
 			schedule_delayed_work(&perf_sched_work, HZ);
 	}
 
+	if (event->sample_event)
+		put_event(event->sample_event);
+
 	unaccount_event_cpu(event, event->cpu);
 
 	unaccount_pmu_sb_event(event);
@@ -5608,6 +5614,133 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 	}
 }
 
+/*
+ * See if we can take an AUX sample of up to @size bytes from the event
+ * that @event was attached to with PERF_FLAG_FD_SAMPLE.
+ *
+ * Returns the sample size, which is also stored in @data->aux.size along
+ * with the from/to offsets of the data in the AUX buffer. It is either
+ * zero or a multiple of sizeof(u64).
+ *
+ * On a non-zero return the sampler has been stopped and its PMU disabled;
+ * perf_aux_sample_output() undoes both. On a zero return nothing is left
+ * stopped or disabled, because perf_output_sample() does not call
+ * perf_aux_sample_output() for empty samples.
+ */
+static unsigned long perf_aux_sample_size(struct perf_event *event,
+					  struct perf_sample_data *data,
+					  size_t size)
+{
+	struct perf_event *sampler = event->sample_event;
+	struct ring_buffer *rb;
+	int *disable_count;
+
+	data->aux.size = 0;
+
+	/* No sampler attached, or no room for any AUX data in this sample */
+	if (!sampler || !size)
+		goto out;
+
+	if (READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)
+		goto out;
+
+	if (READ_ONCE(sampler->oncpu) != smp_processor_id())
+		goto out;
+
+	/*
+	 * Non-zero disable count here means that we, being the NMI
+	 * context, are racing with pmu::add, pmu::del or address filter
+	 * adjustment, which we want to avoid.
+	 */
+	disable_count = this_cpu_ptr(sampler->pmu->pmu_disable_count);
+	if (*disable_count)
+		goto out;
+
+	/* Re-enabled in perf_aux_sample_output() */
+	perf_pmu_disable(sampler->pmu);
+
+	rb = ring_buffer_get(sampler);
+	if (!rb)
+		goto out_enable;
+
+	/* Nothing to copy: back out before the sampler gets stopped */
+	size = min(size, perf_aux_size(rb));
+	if (!size)
+		goto out_put;
+
+	/* Restarted in perf_aux_sample_output() */
+	sampler->pmu->stop(sampler, PERF_EF_UPDATE);
+	data->aux.to = rb->aux_head;
+
+	/* The sample may wrap around the start of the AUX buffer */
+	if (data->aux.to < size)
+		data->aux.from = rb->aux_nr_pages * PAGE_SIZE + data->aux.to -
+			size;
+	else
+		data->aux.from = data->aux.to - size;
+	data->aux.size = ALIGN(size, sizeof(u64));
+	ring_buffer_put(rb);
+
+	return data->aux.size;
+
+out_put:
+	ring_buffer_put(rb);
+out_enable:
+	perf_pmu_enable(sampler->pmu);
+out:
+	return data->aux.size;
+}
+
+/*
+ * Write the AUX data selected by perf_aux_sample_size() to @handle, pad it
+ * to @data->aux.size, then restart the sampler and re-enable its PMU.
+ */
+static void perf_aux_sample_output(struct perf_event *event,
+				   struct perf_output_handle *handle,
+				   struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->sample_event;
+	struct ring_buffer *rb;
+	unsigned long pad;
+	int ret;
+
+	/* No sampler means no PMU to re-enable: don't dereference it below */
+	if (WARN_ON_ONCE(!sampler))
+		return;
+
+	if (WARN_ON_ONCE(!data->aux.size))
+		goto out_enable;
+
+	rb = ring_buffer_get(sampler);
+	if (WARN_ON_ONCE(!rb))
+		goto out_enable;
+
+	ret = rb_output_aux(rb, data->aux.from, data->aux.to,
+			    (aux_copyfn)perf_output_copy, handle);
+	if (ret < 0) {
+		pr_warn_ratelimited("failed to copy trace data\n");
+		goto out;
+	}
+
+	/*
+	 * NOTE(review): this assumes rb_output_aux() returns the number of
+	 * bytes written and that at most sizeof(u64) are missing; a bigger
+	 * pad would make perf_output_copy() read past @p. TODO confirm.
+	 */
+	pad = data->aux.size - ret;
+	if (pad) {
+		u64 p = 0;
+
+		perf_output_copy(handle, &p, pad);
+	}
+out:
+	ring_buffer_put(rb);
+	sampler->pmu->start(sampler, 0);
+
+out_enable:
+	perf_pmu_enable(sampler->pmu);
+}
+
 static void __perf_event_header__init_id(struct perf_event_header *header,
 					 struct perf_sample_data *data,
 					 struct perf_event *event)
@@ -5926,6 +6026,13 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_AUX) {
+		perf_output_put(handle, data->aux.size);
+
+		if (data->aux.size)
+			perf_aux_sample_output(event, handle, data);
+	}
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6112,6 +6219,32 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (sample_type & PERF_SAMPLE_AUX) {
+		u64 size;
+
+		header->size += sizeof(u64); /* size */
+
+		/*
+		 * Given the 16bit nature of header::size, an AUX sample can
+		 * easily overflow it, what with all the preceding sample bits.
+		 * Make sure this doesn't happen by using up to U16_MAX bytes
+		 * per sample in total (rounded down to 8 byte boundary).
+		 */
+		size = min_t(size_t, U16_MAX - header->size,
+			     event->attr.aux_sample_size);
+		size = rounddown(size, 8);
+		size = perf_aux_sample_size(event, data, size);
+
+		WARN_ON_ONCE(size + header->size > U16_MAX);
+		header->size += size;
+	}
+	/*
+	 * If you're adding more sample types here, you likely need to do
+	 * something about the overflowing header::size, like repurpose the
+	 * lowest 3 bits of size, which should be always zero at the moment.
+	 */
+	WARN_ON_ONCE(header->size & 7);
 }
 
 static void __always_inline
@@ -9841,6 +9974,17 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader,
 	return gctx;
 }
 
+static bool
+can_sample_for(struct perf_event *sample_event, struct perf_event *event)
+{
+	/* Sampler must pass has_aux() and have the same ->cpu as @event */
+	if (!has_aux(sample_event) || sample_event->cpu != event->cpu)
+		return false;
+
+	/* Take a reference for @event; fails once the refcount has hit zero */
+	return atomic_long_inc_not_zero(&sample_event->refcount);
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
@@ -9854,6 +9998,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_event *group_leader = NULL, *output_event = NULL;
+	struct perf_event *sample_event = NULL;
 	struct perf_event *event, *sibling;
 	struct perf_event_attr attr;
 	struct perf_event_context *ctx, *uninitialized_var(gctx);
@@ -9924,6 +10069,8 @@ SYSCALL_DEFINE5(perf_event_open,
 		group_leader = group.file->private_data;
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
+		if (flags & PERF_FLAG_FD_SAMPLE)
+			sample_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
 			group_leader = NULL;
 	}
@@ -10146,6 +10293,15 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
+	if (sample_event) {
+		/* Grabs sample_event's reference on success */
+		if (!can_sample_for(sample_event, event)) {
+			err = -EINVAL;
+			goto err_locked;
+		}
+
+		event->sample_event = sample_event;
+	}
 
 	/*
 	 * Must be under the same ctx::mutex as perf_install_in_context(),
-- 
2.17.1