Web lists-archives.com

[PATCH v5] bcc: Try use new API to create [k,u]probe with perf_event_open




New kernel API allows creating [k,u]probe with perf_event_open.
This patch tries to use the new API. If the new API doesn't work,
we fall back to old API.

bpf_detach_probe() looks up the event being removed. If the event
is not found, we skip the clean up procedure.

Signed-off-by: Song Liu <songliubraving@xxxxxx>
---
 src/cc/libbpf.c | 264 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 196 insertions(+), 68 deletions(-)

diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index ef6daf3..1ac685f 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -526,38 +526,113 @@ int bpf_attach_socket(int sock, int prog) {
   return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
 }
 
-static int bpf_attach_tracing_event(int progfd, const char *event_path,
-    struct perf_reader *reader, int pid, int cpu, int group_fd) {
-  int efd, pfd;
-  ssize_t bytes;
-  char buf[256];
-  struct perf_event_attr attr = {};
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+  int fd;
+  int ret;
+  char buf[64];
 
-  snprintf(buf, sizeof(buf), "%s/id", event_path);
-  efd = open(buf, O_RDONLY, 0);
-  if (efd < 0) {
-    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
+  snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+
+  fd = open(buf, O_RDONLY);
+  if (fd < 0)
     return -1;
-  }
+  ret = read(fd, buf, sizeof(buf));
+  close(fd);
+  if (ret < 0 || ret >= sizeof(buf))
+    return -1;
+  ret = (int)strtol(buf, NULL, 10);
+  return errno ? -1 : ret;
+}
 
-  bytes = read(efd, buf, sizeof(buf));
-  if (bytes <= 0 || bytes >= sizeof(buf)) {
-    fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
-    close(efd);
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+  int fd;
+  int ret;
+  char buf[64];
+
+  snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+  fd = open(buf, O_RDONLY);
+  if (fd < 0)
+    return -1;
+  ret = read(fd, buf, sizeof(buf));
+  close(fd);
+  if (ret < 0 || ret >= sizeof(buf))
+    return -1;
+  if (strlen(buf) < strlen("config:"))
+    return -1;
+  ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+  return errno ? -1 : ret;
+}
+
+/*
+ * new kernel API allows creating [k,u]probe with perf_event_open, which
+ * makes it easier to clean up the [k,u]probe. This function tries to
+ * create pfd with the new API.
+ */
+static int bpf_try_perf_event_open_with_probe(const char *name, uint64_t offs,
+    int pid, int cpu, int group_fd, char *event_type, int is_return)
+{
+  struct perf_event_attr attr = {};
+  int type = bpf_find_probe_type(event_type);
+  int is_return_bit = bpf_get_retprobe_bit(event_type);
+
+  if (type < 0 || is_return_bit < 0)
     return -1;
-  }
-  close(efd);
-  buf[bytes] = '\0';
-  attr.config = strtol(buf, NULL, 0);
-  attr.type = PERF_TYPE_TRACEPOINT;
   attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
   attr.sample_period = 1;
   attr.wakeup_events = 1;
-  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC);
+  if (is_return)
+    attr.config |= 1 << is_return_bit;
+  attr.probe_offset = offs;  /* for kprobe, if name is NULL, this the addr */
+  attr.size = sizeof(attr);
+  attr.type = type;
+  attr.kprobe_func = ptr_to_u64((void *)name);  /* also work for uprobe_path */
+  return syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd,
+                 PERF_FLAG_FD_CLOEXEC);
+}
+
+static int bpf_attach_tracing_event(int progfd, const char *event_path,
+    struct perf_reader *reader, int pid, int cpu, int group_fd, int pfd) {
+  int efd;
+  ssize_t bytes;
+  char buf[256];
+  struct perf_event_attr attr = {};
+
+  /*
+   * Only look up id and call perf_event_open when
+   * bpf_try_perf_event_open_with_probe() didn't returns valid pfd.
+   */
   if (pfd < 0) {
-    fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
-    return -1;
+    snprintf(buf, sizeof(buf), "%s/id", event_path);
+    efd = open(buf, O_RDONLY, 0);
+    if (efd < 0) {
+      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
+      return -1;
+    }
+
+    bytes = read(efd, buf, sizeof(buf));
+    if (bytes <= 0 || bytes >= sizeof(buf)) {
+      fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
+      close(efd);
+      return -1;
+    }
+    close(efd);
+    buf[bytes] = '\0';
+    attr.config = strtol(buf, NULL, 0);
+    attr.type = PERF_TYPE_TRACEPOINT;
+    attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
+    attr.sample_period = 1;
+    attr.wakeup_events = 1;
+    pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC);
+    if (pfd < 0) {
+      fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
+      return -1;
+    }
   }
+
   perf_reader_set_fd(reader, pfd);
 
   if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
@@ -585,31 +660,39 @@ void * bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, con
   char event_alias[128];
   struct perf_reader *reader = NULL;
   static char *event_type = "kprobe";
+  int pfd;
 
   reader = perf_reader_new(cb, NULL, NULL, cb_cookie, probe_perf_reader_page_cnt);
   if (!reader)
     goto error;
 
-  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
-  kfd = open(buf, O_WRONLY | O_APPEND, 0);
-  if (kfd < 0) {
-    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
-    goto error;
-  }
+  /* try use new API to create kprobe */
+  pfd = bpf_try_perf_event_open_with_probe(fn_name, 0, pid, cpu, group_fd,
+                                           event_type,
+                                           attach_type != BPF_PROBE_ENTRY);
 
-  snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
-  snprintf(buf, sizeof(buf), "%c:%ss/%s %s", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
-			event_type, event_alias, fn_name);
-  if (write(kfd, buf, strlen(buf)) < 0) {
-    if (errno == EINVAL)
-      fprintf(stderr, "check dmesg output for possible cause\n");
+  if (pfd < 0) {
+    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
+    kfd = open(buf, O_WRONLY | O_APPEND, 0);
+    if (kfd < 0) {
+      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
+      goto error;
+    }
+
+    snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
+    snprintf(buf, sizeof(buf), "%c:%ss/%s %s", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
+             event_type, event_alias, fn_name);
+    if (write(kfd, buf, strlen(buf)) < 0) {
+      if (errno == EINVAL)
+        fprintf(stderr, "check dmesg output for possible cause\n");
+      close(kfd);
+      goto error;
+    }
     close(kfd);
-    goto error;
+    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
   }
-  close(kfd);
 
-  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
-  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd) < 0)
+  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd, pfd) < 0)
     goto error;
 
   return reader;
@@ -691,42 +774,50 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con
   struct perf_reader *reader = NULL;
   static char *event_type = "uprobe";
   int res, kfd = -1, ns_fd = -1;
+  int pfd = -1;
 
   reader = perf_reader_new(cb, NULL, NULL, cb_cookie, probe_perf_reader_page_cnt);
   if (!reader)
     goto error;
 
-  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
-  kfd = open(buf, O_WRONLY | O_APPEND, 0);
-  if (kfd < 0) {
-    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
-    goto error;
-  }
+  /* try use new API to create uprobe */
+  pfd = bpf_try_perf_event_open_with_probe(binary_path, offset, pid, cpu,
+            group_fd, event_type, attach_type != BPF_PROBE_ENTRY);
 
-  res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
-  if (res < 0 || res >= sizeof(event_alias)) {
-    fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
-    goto error;
-  }
-  res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
-			event_type, event_alias, binary_path, offset);
-  if (res < 0 || res >= sizeof(buf)) {
-    fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
-    goto error;
-  }
+  if (pfd < 0) {
+    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
+    kfd = open(buf, O_WRONLY | O_APPEND, 0);
+    if (kfd < 0) {
+      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
+      goto error;
+    }
 
-  ns_fd = enter_mount_ns(pid);
-  if (write(kfd, buf, strlen(buf)) < 0) {
-    if (errno == EINVAL)
-      fprintf(stderr, "check dmesg output for possible cause\n");
-    goto error;
+    res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
+    if (res < 0 || res >= sizeof(event_alias)) {
+      fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
+      goto error;
+    }
+    res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
+                   event_type, event_alias, binary_path, offset);
+    if (res < 0 || res >= sizeof(buf)) {
+      fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
+      goto error;
+    }
+
+    ns_fd = enter_mount_ns(pid);
+    if (write(kfd, buf, strlen(buf)) < 0) {
+      if (errno == EINVAL)
+        fprintf(stderr, "check dmesg output for possible cause\n");
+      goto error;
+    }
+    close(kfd);
+    exit_mount_ns(ns_fd);
+    ns_fd = -1;
+
+    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
   }
-  close(kfd);
-  exit_mount_ns(ns_fd);
-  ns_fd = -1;
 
-  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
-  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd) < 0)
+  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd, pfd) < 0)
     goto error;
 
   return reader;
@@ -741,8 +832,43 @@ error:
 
 static int bpf_detach_probe(const char *ev_name, const char *event_type)
 {
-  int kfd, res;
+  int kfd = -1, res;
   char buf[PATH_MAX];
+  int found_event = 0;
+  size_t bufsize = 0;
+  char *cptr = NULL;
+  FILE *fp;
+
+  /*
+   * For [k,u]probe created with perf_event_open (on newer kernel), it is
+   * not necessary to clean it up in [k,u]probe_events. We first look up
+   * the %s_bcc_%d line in [k,u]probe_events. If the event is not found,
+   * it is safe to skip the cleaning up process (write -:... to the file).
+   */
+  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
+  fp = fopen(buf, "r");
+  if (!fp) {
+    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
+    goto error;
+  }
+
+  res = snprintf(buf, sizeof(buf), "%ss/%s_bcc_%d", event_type, ev_name, getpid());
+  if (res < 0 || res >= sizeof(buf)) {
+    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
+    goto error;
+  }
+
+  while (getline(&cptr, &bufsize, fp) != -1)
+    if (strstr(cptr, buf) != NULL) {
+      found_event = 1;
+      break;
+    }
+  fclose(fp);
+  fp = NULL;
+
+  if (!found_event)
+    return 0;
+
   snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
   kfd = open(buf, O_WRONLY | O_APPEND, 0);
   if (kfd < 0) {
@@ -766,6 +892,8 @@ static int bpf_detach_probe(const char *ev_name, const char *event_type)
 error:
   if (kfd >= 0)
     close(kfd);
+  if (fp)
+    fclose(fp);
   return -1;
 }
 
@@ -792,7 +920,7 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category,
 
   snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/%s",
            tp_category, tp_name);
-  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd) < 0)
+  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd, -1) < 0)
     goto error;
 
   return reader;
-- 
2.9.5