samples/bpf: xdp_monitor tool based on tracepoints

This tool xdp_monitor demonstrates how to use the different xdp_redirect tracepoints xdp_redirect{,_map}{,_err} from a BPF program. The default mode is to only monitor the error counters, to avoid affecting the per-packet performance. Tracepoints come with a base overhead of 25 nanosec for an attached bpf_prog, and 48 nanosec for using a full perf record (with non-matching filter). Thus, enabling the --stats mode by default could affect the maximum performance. This version of the tool is very simple and counts all types of errors as one. It will be natural to extend this later with the different types of errors that can occur, which should help users quickly identify common mistakes. Because the TP_STRUCT was kept in sync, all the tracepoints load the same BPF code. It would also be natural to extend the map version to demonstrate how the map information could be used. Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-29 16:38:11 +02:00 · 2017-08-29 16:38:11 +02:00 · 3ffab54602
commit 3ffab54602
parent 306da4e685
3 changed files with 387 additions and 0 deletions
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@ -39,6 +39,7 @@ hostprogs-y += per_socket_stats_example
 hostprogs-y += load_sock_ops
 hostprogs-y += xdp_redirect
 hostprogs-y += xdp_redirect_map
 hostprogs-y += xdp_monitor
 hostprogs-y += syscall_tp
 # Libbpf dependencies
@ -83,6 +84,7 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
 per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o
 xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
 xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 # Tell kbuild to always build the programs
@ -127,6 +129,7 @@ always += tcp_iw_kern.o
 always += tcp_clamp_kern.o
 always += xdp_redirect_kern.o
 always += xdp_redirect_map_kern.o
 always += xdp_monitor_kern.o
 always += syscall_tp_kern.o
 HOSTCFLAGS += -I$(objtree)/usr/include
@ -166,6 +169,7 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
 HOSTLOADLIBES_test_map_in_map += -lelf
 HOSTLOADLIBES_xdp_redirect += -lelf
 HOSTLOADLIBES_xdp_redirect_map += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@ -0,0 +1,88 @@
 /* XDP monitor tool, based on tracepoints
 *
 *  Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
 */
 #include <uapi/linux/bpf.h>
 #include "bpf_helpers.h"
 /* Per-CPU array of u64 counters for XDP redirect outcomes.
  * Keys are the XDP_REDIRECT_SUCCESS (0) and XDP_REDIRECT_ERROR (1)
  * values of the enum in this file, hence max_entries = 2.
  */
 struct bpf_map_def SEC("maps") redirect_err_cnt = {
 	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
 	.key_size = sizeof(u32),
 	.value_size = sizeof(u64),
 	.max_entries = 2,
 	/* TODO: have entries for all possible errno's */
 };
 /* Context layout handed to the tracepoint programs in this file.
  *
  * Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
  * Code in:                kernel/include/trace/events/xdp.h
  *
  * The per-field comments record the offset/size/signedness taken from
  * that format description; the struct must keep matching it. The same
  * struct is used for all four xdp_redirect* programs below, of which
  * only the 'err' field is read.
  */
 struct xdp_redirect_ctx {
 	unsigned short common_type;	//	offset:0;  size:2; signed:0;
 	unsigned char common_flags;	//	offset:2;  size:1; signed:0;
 	unsigned char common_preempt_count;//	offset:3;  size:1; signed:0;
 	int common_pid;			//	offset:4;  size:4; signed:1;
 	int prog_id;			//	offset:8;  size:4; signed:1;
 	u32 act;			//	offset:12; size:4; signed:0;
 	int ifindex;			//	offset:16; size:4; signed:1;
 	int err;			//	offset:20; size:4; signed:1;
 	int to_ifindex;			//	offset:24; size:4; signed:1;
 	u32 map_id;			//	offset:28; size:4; signed:0;
 	int map_index;			//	offset:32; size:4; signed:1;
 };					//	offset:36; (total size)
 /* Keys into the redirect_err_cnt map: one counter per redirect outcome. */
 enum {
 	XDP_REDIRECT_SUCCESS = 0,	/* ctx->err == 0 */
 	XDP_REDIRECT_ERROR = 1		/* any non-zero ctx->err */
 };
 /* Account one xdp_redirect* tracepoint event in redirect_err_cnt.
  *
  * A zero ctx->err is counted under XDP_REDIRECT_SUCCESS, every other
  * value under XDP_REDIRECT_ERROR. A failed map lookup is silently
  * ignored.
  *
  * Always returns 0, which indicates the event was filtered (no further
  * processing). Returning 1 would allow e.g. a perf-record tracepoint
  * to see and record these events, but it doesn't work well in
  * practice, as stopping perf-record also unloads this bpf_prog. Plus,
  * there is additional overhead in doing so.
  */
 static __always_inline
 int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
 {
 	u32 slot = ctx->err ? XDP_REDIRECT_ERROR : XDP_REDIRECT_SUCCESS;
 	u64 *counter = bpf_map_lookup_elem(&redirect_err_cnt, &slot);
 	if (counter)
 		*counter += 1;
 	return 0;
 }
 /* Program placed in the section for the xdp:xdp_redirect_err tracepoint;
  * it only forwards the event to xdp_redirect_collect_stat().
  * NOTE: xdp_monitor_user.c addresses the programs by index, which
  * depends on the order they are defined in this file; this is the
  * first one.
  */
 SEC("tracepoint/xdp/xdp_redirect_err")
 int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
 {
 	return xdp_redirect_collect_stat(ctx);
 }
 /* Program placed in the section for the xdp:xdp_redirect_map_err
  * tracepoint; it only forwards the event to
  * xdp_redirect_collect_stat().
  */
 SEC("tracepoint/xdp/xdp_redirect_map_err")
 int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
 {
 	return xdp_redirect_collect_stat(ctx);
 }
 /* Program for the xdp:xdp_redirect tracepoint (the non-error events).
  * Likely unloaded when prog starts: in its default errors-only mode,
  * xdp_monitor_user.c closes event_fd[2]/prog_fd[2], relying on this
  * being the third program defined in this file. Keep the definition
  * order in sync with that code.
  */
 SEC("tracepoint/xdp/xdp_redirect")
 int trace_xdp_redirect(struct xdp_redirect_ctx *ctx)
 {
 	return xdp_redirect_collect_stat(ctx);
 }
 /* Program for the xdp:xdp_redirect_map tracepoint (the non-error
  * events). Likely unloaded when prog starts: in its default
  * errors-only mode, xdp_monitor_user.c closes event_fd[3]/prog_fd[3],
  * relying on this being the fourth program defined in this file. Keep
  * the definition order in sync with that code.
  */
 SEC("tracepoint/xdp/xdp_redirect_map")
 int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
 {
 	return xdp_redirect_collect_stat(ctx);
 }
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@ -0,0 +1,295 @@
 /* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
 /* One-line tool description; printed by usage() and, when verbose, as
  * the banner in stats_poll().
  */
 static const char *__doc__=
 "XDP monitor tool, based on tracepoints\n"
 ;
 /* Notice printed by stats_print_headers() when only errors are tracked. */
 static const char *__doc_err_only__=
 " NOTICE: Only tracking XDP redirect errors\n"
 "         Enable TX success stats via '--stats'\n"
 "         (which comes with a per packet processing overhead)\n"
 ;
 #include <errno.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
 #include <ctype.h>
 #include <unistd.h>
 #include <locale.h>
 #include <getopt.h>
 #include <net/if.h>
 #include <time.h>
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "bpf_util.h"
 /* Non-zero makes stats_poll() print the tool banner and stats map name. */
 static int verbose = 1;
 /* Set by --debug; makes main() call print_bpf_prog_info() after loading. */
 static bool debug = false;
 /* Command line options for getopt_long(). No entry uses a flag pointer,
  * so getopt_long() returns the 'val' character, which main() switches
  * on. The all-zero entry terminates the table (usage() stops at the
  * NULL name).
  */
 static const struct option long_options[] = {
 	{"help",	no_argument,		NULL, 'h' },
 	{"debug",	no_argument,		NULL, 'D' },
 	{"stats",	no_argument,		NULL, 'S' },
 	{"sec", 	required_argument,	NULL, 's' },
 	{0, 0, NULL,  0 }
 };
 /* Print the tool description followed by every entry of long_options.
  *
  * argv: the program's argument vector; only argv[0] is used, as the
  *       program name in the usage line.
  *
  * For each option the long name is shown together with either the
  * current value of its flag variable (when a flag pointer is set) or
  * the short-option character stored in 'val'.
  */
 static void usage(char *argv[])
 {
 	const struct option *opt;
 	printf("\nDOCUMENTATION:\n%s\n", __doc__);
 	printf("\n");
 	printf(" Usage: %s (options-see-below)\n", argv[0]);
 	printf(" Listing options:\n");
 	/* The table is terminated by an entry whose name is NULL */
 	for (opt = long_options; opt->name != NULL; opt++) {
 		printf(" --%-15s", opt->name);
 		if (opt->flag == NULL)
 			printf("(internal short-option: -%c)", opt->val);
 		else
 			printf(" flag (internal value:%d)", *opt->flag);
 		printf("\n");
 	}
 	printf("\n");
 }
 #define NANOSEC_PER_SEC 1000000000 /* 10^9 */
 /* Return the current CLOCK_MONOTONIC time in nanoseconds.
  *
  * If the clock cannot be read, the reason (from errno) is reported on
  * stderr and the process exits with EXIT_FAILURE; the function never
  * returns an error to the caller.
  */
 __u64 gettime(void)
 {
 	struct timespec t;
 	if (clock_gettime(CLOCK_MONOTONIC, &t) < 0) {
 		/* Name the call that actually failed and say why */
 		fprintf(stderr, "Error with clock_gettime! (%s)\n",
 			strerror(errno));
 		exit(EXIT_FAILURE);
 	}
 	/* Cast first so the multiplication is done in 64 bits */
 	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
 }
 /* Redirect result categories. The values are used both as indexes into
  * redir_names / stats_record.xdp_redir and as keys when reading the
  * stats map, so they must stay dense and below REDIR_RES_MAX.
  */
 enum {
 	REDIR_SUCCESS = 0,
 	REDIR_ERROR = 1,
 };
 #define REDIR_RES_MAX 2
 /* Human readable label for each redirect result category */
 static const char *redir_names[REDIR_RES_MAX] = {
 	[REDIR_SUCCESS]	= "Success",
 	[REDIR_ERROR]	= "Error",
 };
 /* Map a redirect result category to its label.
  *
  * err: one of the REDIR_* values.
  *
  * Returns the matching entry of redir_names, or NULL when err is
  * outside [0, REDIR_RES_MAX). Negative values are rejected as well, so
  * the table is never indexed out of bounds.
  */
 static const char *err2str(int err)
 {
 	if (err < 0 || err >= REDIR_RES_MAX)
 		return NULL;
 	return redir_names[err];
 }
 /* One sample of a single counter, as filled in by stats_collect() */
 struct record {
 	__u64 counter;		/* map value summed over all possible CPUs */
 	__u64 timestamp;	/* gettime() value (nanoseconds) for this sample */
 };
 /* Snapshot of all redirect counters, indexed by REDIR_SUCCESS/REDIR_ERROR */
 struct stats_record {
 	struct record xdp_redir[REDIR_RES_MAX];
 };
 /* Print the column headers of the stats table.
  *
  * err_only: when true, the "only tracking errors" notice is printed
  *           before the headers.
  */
 static void stats_print_headers(bool err_only)
 {
 	const char *col_result = "XDP_REDIRECT";
 	const char *col_pps    = "pps ";
 	const char *col_human  = "pps-human-readable";
 	const char *col_period = "measure-period";
 	if (err_only)
 		printf("\n%s\n", __doc_err_only__);
 	printf("%-14s %-10s %-18s %-9s\n",
 	       col_result, col_pps, col_human, col_period);
 }
 /* Print one table row per redirect result category.
  *
  * rec:      the newest snapshot.
  * prev:     the snapshot before it; entries whose timestamp is zero
  *           are treated as "no previous sample" and yield a rate of 0.
  * err_only: when true, only the REDIR_ERROR row is printed.
  *
  * Each row shows the category name, the packets-per-second rate (plain
  * and with the %' grouping flag) and the measurement period in seconds.
  */
 static void stats_print(struct stats_record *rec,
 			struct stats_record *prev,
 			bool err_only)
 {
 	int first = err_only ? REDIR_ERROR : 0;
 	int idx;
 	for (idx = first; idx < REDIR_RES_MAX; idx++) {
 		struct record *cur = &rec->xdp_redir[idx];
 		struct record *old = &prev->xdp_redir[idx];
 		double secs = 0;
 		double rate = 0;
 		/* A zero timestamp means no earlier sample exists yet */
 		if (old->timestamp) {
 			__u64 delta_pkts = cur->counter - old->counter;
 			__u64 delta_ns   = cur->timestamp - old->timestamp;
 			if (delta_ns > 0) {
 				secs = ((double) delta_ns / NANOSEC_PER_SEC);
 				rate = delta_pkts / secs;
 			}
 		}
 		printf("%-14s %-10.0f %'-18.0f %f\n",
 		       err2str(idx), rate, rate, secs);
 	}
 }
 /* Read one entry of a per-CPU map and return the sum over all CPUs.
  *
  * fd:  file descriptor of the map.
  * key: 32-bit key to look up.
  *
  * For percpu maps, userspace gets a value per possible CPU, so the
  * lookup buffer is sized by bpf_num_possible_cpus(). If the lookup
  * fails, an error is printed on stderr and 0 is returned.
  */
 static __u64 get_key32_value64_percpu(int fd, __u32 key)
 {
 	unsigned int cpu_count = bpf_num_possible_cpus();
 	__u64 per_cpu[cpu_count];
 	__u64 total = 0;
 	unsigned int cpu;
 	if (bpf_map_lookup_elem(fd, &key, per_cpu) != 0) {
 		fprintf(stderr,
 			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
 		return 0;
 	}
 	for (cpu = 0; cpu < cpu_count; cpu++)
 		total += per_cpu[cpu];
 	return total;
 }
 static bool stats_collect(int fd, struct stats_record *rec)
 {
 	int i;
 	/* TODO: Detect if someone unloaded the perf event_fd's, as
 	 * this can happen by someone running perf-record -e
 	 */
 	for (i = 0; i < REDIR_RES_MAX; i++) {
 		rec->xdp_redir[i].timestamp = gettime();
 		rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
 	}
 	return true;
 }
 /* Sample and print the redirect counters forever.
  *
  * interval: seconds to sleep between two samples.
  * err_only: passed on to the header and row printers to restrict the
  *           output to the error counter.
  *
  * The stats map is taken to be map_data[0]. This function does not
  * return.
  */
 static void stats_poll(int interval, bool err_only)
 {
 	struct stats_record rec, prev;
 	int map_fd;
 	/* Start from an all-zero snapshot, so the first pass has no rate */
 	memset(&rec, 0, sizeof(rec));
 	/* Trick to pretty printf with thousands separators use %' */
 	setlocale(LC_NUMERIC, "en_US");
 	/* Header */
 	/* TODO Need more advanced stats on error types */
 	if (verbose) {
 		printf("\n%s", __doc__);
 		printf(" - Stats map: %s\n", map_data[0].name);
 	}
 	map_fd = map_data[0].fd;
 	stats_print_headers(err_only);
 	fflush(stdout);
 	for (;;) {
 		/* Keep the last snapshot to compute deltas against */
 		prev = rec;
 		stats_collect(map_fd, &rec);
 		stats_print(&rec, &prev, err_only);
 		fflush(stdout);
 		sleep(interval);
 	}
 }
 /* Debug helper: list the loaded programs, maps and event descriptors.
  *
  * Prints every prog_fd[] entry, every map_data[] entry (fd and name),
  * and those event_fd[] entries that are not -1.
  */
 void print_bpf_prog_info(void)
 {
 	int idx;
 	/* Programs */
 	printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt);
 	for (idx = 0; idx < prog_cnt; idx++)
 		printf(" - prog_fd[%d] = fd(%d)\n", idx, prog_fd[idx]);
 	/* Maps */
 	printf("Loaded BPF prog have %d map(s)\n", map_data_count);
 	for (idx = 0; idx < map_data_count; idx++)
 		printf(" - map_data[%d] = fd(%d) name:%s\n",
 		       idx, map_data[idx].fd, map_data[idx].name);
 	/* Events: entries equal to -1 are skipped */
 	printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt);
 	for (idx = 0; idx < prog_cnt; idx++) {
 		if (event_fd[idx] == -1)
 			continue;
 		printf(" - event_fd[%d] = fd(%d)\n", idx, event_fd[idx]);
 	}
 }
 /* Parse a measurement interval given in whole seconds.
  *
  * arg:      the option argument to parse.
  * interval: receives the parsed value on success.
  *
  * Only a complete base-10 number in the range 1..INT_MAX is accepted.
  * Returns true on success; on failure returns false and leaves
  * *interval untouched.
  */
 static bool parse_interval(const char *arg, int *interval)
 {
 	char *end = NULL;
 	long val;
 	errno = 0;
 	val = strtol(arg, &end, 10);
 	/* Reject overflow, empty input and trailing garbage */
 	if (errno != 0 || end == arg || *end != '\0')
 		return false;
 	if (val < 1 || val > INT_MAX)
 		return false;
 	*interval = (int) val;
 	return true;
 }
 /* Entry point: load "<argv[0]>_kern.o", then poll and print stats.
  *
  * Options (long form, with the matching short form):
  *   --help  / -h      print usage and exit with EXIT_FAILURE
  *   --debug / -D      print info on the loaded programs, maps and events
  *   --stats / -S      also count successful redirects (default is
  *                     errors only)
  *   --sec N / -s N    seconds between samples (positive integer,
  *                     default 2)
  *
  * Returns non-zero on option or load errors; otherwise it does not
  * return, because stats_poll() loops forever.
  */
 int main(int argc, char **argv)
 {
 	int longindex = 0, opt;
 	int ret = EXIT_SUCCESS;
 	char bpf_obj_file[256];
 	int len;
 	/* Default settings: */
 	bool errors_only = true;
 	int interval = 2;
 	len = snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
 	if (len < 0 || (size_t) len >= sizeof(bpf_obj_file)) {
 		/* A truncated name would load the wrong (or no) object file */
 		fprintf(stderr, "ERROR - BPF object file name too long\n");
 		return EXIT_FAILURE;
 	}
 	/* Parse commands line args. The short-option string lists every
 	 * 'val' of long_options, so the short forms shown by usage() work.
 	 */
 	while ((opt = getopt_long(argc, argv, "hDSs:",
 				  long_options, &longindex)) != -1) {
 		switch (opt) {
 		case 'D':
 			debug = true;
 			break;
 		case 'S':
 			errors_only = false;
 			break;
 		case 's':
 			if (!parse_interval(optarg, &interval)) {
 				fprintf(stderr,
 					"ERROR - invalid --sec value: %s\n",
 					optarg);
 				usage(argv);
 				return EXIT_FAILURE;
 			}
 			break;
 		case 'h':
 		default:
 			usage(argv);
 			return EXIT_FAILURE;
 		}
 	}
 	if (load_bpf_file(bpf_obj_file)) {
 		printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
 		return 1;
 	}
 	if (!prog_fd[0]) {
 		printf("ERROR - load_bpf_file: %s\n", strerror(errno));
 		return 1;
 	}
 	if (debug) {
 		print_bpf_prog_info();
 	}
 	/* Unload/stop tracepoint event by closing fd's */
 	if (errors_only) {
 		/* The prog_fd[i] and event_fd[i] depend on the
 		 * order the functions was defined in _kern.c
 		 */
 		close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */
 		close(prog_fd[2]);  /* func: trace_xdp_redirect */
 		close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */
 		close(prog_fd[3]);  /* func: trace_xdp_redirect_map */
 	}
 	stats_poll(interval, errors_only);
 	return ret;
 }