Web lists-archives.com

[PATCH V2 3/4] perf cgroup: Add new RB tree keys for cgroup




From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

The current RB tree for pinned/flexible groups doesn't take cgroups into
account. All events on a given CPU are fed to
pinned/flexible_sched_in(), which relies on perf_cgroup_match() to
filter the events for a specific cgroup. This method has high overhead,
especially with frequent context switches when several events and
cgroups are involved.

Add new RB tree keys, cgrp_id and cgrp_group_index, for cgroups.
The unique cgrp_id (the same as the css subsys-unique ID) is used to
identify a cgroup. Events in the same cgroup have the same cgrp_id.
The cgrp_id is always zero for the non-cgroup case. There is no
functional change for the non-cgroup case.
The cgrp_group_index is used for multiplexing. The rotated events of a
cgroup have the same cgrp_group_index, which equals the (group_index
- 1) of the first rotated event.
The non-cgroup events, e.g. system-wide events, are treated as special
cgroups. Their cgrp_group_index is also updated during multiplexing.

Add percpu pinned/flexible_event in perf_cgroup to track the leftmost
event for a cgroup, which will be used later for fast access to the
events of a given cgroup.
Add percpu rotated_event to track the rotated events of a cgroup.

Add perf_event_groups_first_cgroup() to find the leftmost event for a
given cgroup ID and cgrp_group_index on a given CPU.

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
---
 include/linux/perf_event.h |   5 ++
 kernel/events/core.c       | 217 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 210 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3f12937..800bf62 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -703,6 +703,8 @@ struct perf_event {
 
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
+	u64				cgrp_id; /* perf cgroup ID */
+	u64				cgrp_group_index;
 #endif
 
 	struct list_head		sb_list;
@@ -837,6 +839,9 @@ struct perf_cgroup {
 	struct cgroup_subsys_state	css;
 	struct perf_cgroup_info	__percpu *info;
 	int				cgrp_event_type;
+	struct perf_event * __percpu	*pinned_event;
+	struct perf_event * __percpu	*flexible_event;
+	struct perf_event * __percpu	*rotated_event;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a3885e68..6891c74 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -717,6 +717,7 @@ static inline void perf_detach_cgroup(struct perf_event *event)
 {
 	css_put(&event->cgrp->css);
 	event->cgrp = NULL;
+	event->cgrp_id = 0;
 }
 
 static inline int is_cgroup_event(struct perf_event *event)
@@ -961,6 +962,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 
 	cgrp = container_of(css, struct perf_cgroup, css);
 	event->cgrp = cgrp;
+	event->cgrp_id = css->id;
 
 	if (event->attr.pinned)
 		cgrp->cgrp_event_type |= EVENT_CGROUP_PINNED_ONLY;
@@ -1561,6 +1563,9 @@ static void init_event_group(struct perf_event *event)
 {
 	RB_CLEAR_NODE(&event->group_node);
 	event->group_index = 0;
+#ifdef CONFIG_CGROUP_PERF
+	event->cgrp_group_index = 0;
+#endif
 }
 
 /*
@@ -1588,8 +1593,8 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
 /*
  * Compare function for event groups;
  *
- * Implements complex key that first sorts by CPU and then by virtual index
- * which provides ordering when rotating groups for the same CPU.
+ * Implements complex key that sorts by CPU, cgroup index, cgroup ID, and
+ * virtual index which provides ordering when rotating groups for the same CPU.
  */
 static bool
 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
@@ -1599,6 +1604,18 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 	if (left->cpu > right->cpu)
 		return false;
 
+#ifdef CONFIG_CGROUP_PERF
+	if (left->cgrp_group_index < right->cgrp_group_index)
+		return true;
+	if (left->cgrp_group_index > right->cgrp_group_index)
+		return false;
+
+	if (left->cgrp_id < right->cgrp_id)
+		return true;
+	if (left->cgrp_id > right->cgrp_id)
+		return false;
+#endif
+
 	if (left->group_index < right->group_index)
 		return true;
 	if (left->group_index > right->group_index)
@@ -1608,13 +1625,14 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 }
 
 /*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
+ * Insert @event into @groups' tree; Using
+ * {@event->cpu, @event->cgrp_group_index, @event->cgrp_id, ++@groups->index}
+ * for key (see perf_event_groups_less). This places it last inside the CPU
  * subtree.
  */
 static void
-perf_event_groups_insert(struct perf_event_groups *groups,
-			 struct perf_event *event)
+__perf_event_groups_insert(struct perf_event_groups *groups,
+			   struct perf_event *event)
 {
 	struct perf_event *node_event;
 	struct rb_node *parent;
@@ -1639,6 +1657,10 @@ perf_event_groups_insert(struct perf_event_groups *groups,
 	rb_insert_color(&event->group_node, &groups->tree);
 }
 
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+			 struct perf_event *event);
+
 /*
  * Helper function to insert event into the pinned or flexible groups.
  */
@@ -1655,8 +1677,8 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
  * Delete a group from a tree.
  */
 static void
-perf_event_groups_delete(struct perf_event_groups *groups,
-			 struct perf_event *event)
+__perf_event_groups_delete(struct perf_event_groups *groups,
+			   struct perf_event *event)
 {
 	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
 		     RB_EMPTY_ROOT(&groups->tree));
@@ -1665,6 +1687,10 @@ perf_event_groups_delete(struct perf_event_groups *groups,
 	init_event_group(event);
 }
 
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+			 struct perf_event *event);
+
 /*
  * Helper function to delete event from its groups.
  */
@@ -1717,6 +1743,129 @@ perf_event_groups_next(struct perf_event *event)
 	return NULL;
 }
 
+#ifdef CONFIG_CGROUP_PERF
+
+static struct perf_event *
+perf_event_groups_first_cgroup(struct perf_event_groups *groups,
+			       int cpu, u64 cgrp_group_index, u64 cgrp_id)
+{
+	struct perf_event *node_event = NULL, *match = NULL;
+	struct rb_node *node = groups->tree.rb_node;
+
+	while (node) {
+		node_event = container_of(node, struct perf_event, group_node);
+
+		if (cpu < node_event->cpu) {
+			node = node->rb_left;
+		} else if (cpu > node_event->cpu) {
+			node = node->rb_right;
+		} else {
+			if (cgrp_group_index < node_event->cgrp_group_index)
+				node = node->rb_left;
+			else if (cgrp_group_index > node_event->cgrp_group_index)
+				node = node->rb_right;
+			else {
+
+				if (cgrp_id < node_event->cgrp_id)
+					node = node->rb_left;
+				else if (cgrp_id > node_event->cgrp_id)
+					node = node->rb_right;
+				else {
+					match = node_event;
+					node = node->rb_left;
+				}
+			}
+		}
+	}
+	return match;
+}
+
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+			 struct perf_event *event)
+{
+	struct perf_event **cgrp_event, **rotated_event;
+
+	__perf_event_groups_insert(groups, event);
+
+	if (is_cgroup_event(event)) {
+		if (event->attr.pinned)
+			cgrp_event = per_cpu_ptr(event->cgrp->pinned_event, event->cpu);
+		else {
+			cgrp_event = per_cpu_ptr(event->cgrp->flexible_event, event->cpu);
+			rotated_event = per_cpu_ptr(event->cgrp->rotated_event, event->cpu);
+
+			/* Add the first rotated event into *rotated_event */
+			if (*cgrp_event && !*rotated_event &&
+			    (event->cgrp_group_index > (*cgrp_event)->cgrp_group_index))
+				*rotated_event = event;
+
+			/*
+			 * *cgrp_event always point to the unrotated events.
+			 * All events have been rotated.
+			 * Update *cgrp_event and *rotated_event for next round.
+			 */
+			if (!*cgrp_event && *rotated_event) {
+				*cgrp_event = *rotated_event;
+				*rotated_event = NULL;
+			}
+		}
+		/*
+		 * Cgroup events for the same cgroup on the same CPU will
+		 * always be inserted at the right because of bigger
+		 * @groups->index.
+		 */
+		if (!*cgrp_event)
+			*cgrp_event = event;
+	}
+}
+
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+			 struct perf_event *event)
+{
+	struct perf_event **cgrp_event, **rotated_event;
+
+	__perf_event_groups_delete(groups, event);
+
+	if (is_cgroup_event(event)) {
+		if (event->attr.pinned)
+			cgrp_event = per_cpu_ptr(event->cgrp->pinned_event, event->cpu);
+		else {
+			cgrp_event = per_cpu_ptr(event->cgrp->flexible_event, event->cpu);
+			rotated_event = per_cpu_ptr(event->cgrp->rotated_event, event->cpu);
+			if (*rotated_event == event) {
+				*rotated_event = perf_event_groups_first_cgroup(groups, event->cpu,
+										event->cgrp_group_index,
+										event->cgrp_id);
+			}
+		}
+		if (*cgrp_event == event) {
+			*cgrp_event = perf_event_groups_first_cgroup(groups, event->cpu,
+								     event->cgrp_group_index,
+								     event->cgrp_id);
+		}
+	}
+}
+
+#else /* !CONFIG_CGROUP_PERF */
+
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+			 struct perf_event *event)
+{
+	__perf_event_groups_insert(groups, event);
+}
+
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+			 struct perf_event *event)
+{
+	__perf_event_groups_delete(groups, event);
+}
+
+#endif
+
 /*
  * Iterate through the whole groups tree.
  */
@@ -3757,6 +3906,10 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
  */
 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
 {
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct perf_event **rotated_event;
+#endif
 	/*
 	 * Rotate the first entry last of non-pinned groups. Rotation might be
 	 * disabled by the inheritance code.
@@ -3765,6 +3918,22 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
 		return;
 
 	perf_event_groups_delete(&ctx->flexible_groups, event);
+
+#ifdef CONFIG_CGROUP_PERF
+
+	/* Rotate cgroups */
+	if (&cpuctx->ctx == ctx) {
+		if (event->cgrp) {
+			rotated_event = per_cpu_ptr(event->cgrp->rotated_event, event->cpu);
+			if (!*rotated_event)
+				event->cgrp_group_index = ctx->flexible_groups.index;
+			else
+				event->cgrp_group_index = (*rotated_event)->cgrp_group_index;
+		} else
+			event->cgrp_group_index = ctx->flexible_groups.index;
+	}
+#endif
+
 	perf_event_groups_insert(&ctx->flexible_groups, event);
 }
 
@@ -12196,18 +12365,42 @@ perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return ERR_PTR(-ENOMEM);
 
 	jc->info = alloc_percpu(struct perf_cgroup_info);
-	if (!jc->info) {
-		kfree(jc);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!jc->info)
+		goto free_jc;
+
+	jc->pinned_event = alloc_percpu(struct perf_event *);
+	if (!jc->pinned_event)
+		goto free_jc_info;
+
+	jc->flexible_event = alloc_percpu(struct perf_event *);
+	if (!jc->flexible_event)
+		goto free_jc_pinned;
+
+	jc->rotated_event = alloc_percpu(struct perf_event *);
+	if (!jc->rotated_event)
+		goto free_jc_flexible;
 
 	return &jc->css;
+
+free_jc_flexible:
+	free_percpu(jc->flexible_event);
+free_jc_pinned:
+	free_percpu(jc->pinned_event);
+free_jc_info:
+	free_percpu(jc->info);
+free_jc:
+	kfree(jc);
+
+	return ERR_PTR(-ENOMEM);
 }
 
 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
 
+	free_percpu(jc->pinned_event);
+	free_percpu(jc->flexible_event);
+	free_percpu(jc->rotated_event);
 	free_percpu(jc->info);
 	kfree(jc);
 }
-- 
2.7.4