perf sched timehist: Introduce timehist command

'perf sched timehist' provides an analysis of scheduling events.

Example usage:
    perf sched record -- sleep 1
    perf sched timehist

By default it shows the individual schedule events, including the wait
time (time between sched-out and next sched-in events for the task), the
task scheduling delay (time between wakeup and actually running) and run
time for the task:

            time    cpu  task name             wait time  sch delay   run time
                         [tid/pid]                (msec)     (msec)     (msec)
  -------------- ------  --------------------  ---------  ---------  ---------
    79371.874569 [0011]  gcc[31949]                0.014      0.000      1.148
    79371.874591 [0010]  gcc[31951]                0.000      0.000      0.024
    79371.874603 [0010]  migration/10[59]          3.350      0.004      0.011
    79371.874604 [0011]  <idle>                    1.148      0.000      0.035
    79371.874723 [0005]  <idle>                    0.016      0.000      1.383
    79371.874746 [0005]  gcc[31949]                0.153      0.078      0.022
...

Times are in msec.usec.

Committer note:

Add above explanation as the 'perf sched timehist' entry for 'man
perf-sched'.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/20161116060634.28477-4-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
David Ahern 2016-11-16 15:06:29 +09:00 committed by Arnaldo Carvalho de Melo
parent 69b7e48070
commit 49394a2a24
2 changed files with 637 additions and 7 deletions

View file

@ -8,11 +8,11 @@ perf-sched - Tool to trace/measure scheduler properties (latencies)
SYNOPSIS
--------
[verse]
'perf sched' {record|latency|map|replay|script}
'perf sched' {record|latency|map|replay|script|timehist}
DESCRIPTION
-----------
There are five variants of perf sched:
There are several variants of 'perf sched':
'perf sched record <command>' to record the scheduling events
of an arbitrary workload.
@ -36,6 +36,30 @@ There are five variants of perf sched:
are running on a CPU. A '*' denotes the CPU that had the event, and
a dot signals an idle CPU.
'perf sched timehist' provides an analysis of scheduling events.
Example usage:
perf sched record -- sleep 1
perf sched timehist
By default it shows the individual schedule events, including the wait
time (time between sched-out and next sched-in events for the task), the
task scheduling delay (time between wakeup and actually running) and run
time for the task:
time cpu task name wait time sch delay run time
[tid/pid] (msec) (msec) (msec)
-------------- ------ -------------------- --------- --------- ---------
79371.874569 [0011] gcc[31949] 0.014 0.000 1.148
79371.874591 [0010] gcc[31951] 0.000 0.000 0.024
79371.874603 [0010] migration/10[59] 3.350 0.004 0.011
79371.874604 [0011] <idle> 1.148 0.000 0.035
79371.874723 [0005] <idle> 0.016 0.000 1.383
79371.874746 [0005] gcc[31949] 0.153 0.078 0.022
...
Times are in msec.usec.
OPTIONS
-------
-i::
@ -66,6 +90,28 @@ OPTIONS for 'perf sched map'
--color-pids::
Highlight the given pids.
OPTIONS for 'perf sched timehist'
---------------------------------
-k::
--vmlinux=<file>::
vmlinux pathname
--kallsyms=<file>::
kallsyms pathname
-s::
--summary::
Show only a summary of scheduling by thread with min, max, and average
run times (in sec) and relative stddev.
-S::
--with-summary::
Show all scheduling events followed by a summary by thread with min,
max, and average run times (in sec) and relative stddev.
--symfs=<directory>::
Look for files with symbols relative to this directory.
SEE ALSO
--------
linkperf:perf-record[1]

View file

@ -13,12 +13,14 @@
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"
#include <subcmd/parse-options.h>
#include "util/trace-event.h"
#include "util/debug.h"
#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>
@ -194,6 +196,29 @@ struct perf_sched {
struct perf_sched_map map;
};
/* per thread run time data */
struct thread_runtime {
u64 last_time; /* time of previous sched in/out event */
u64 dt_run; /* run time */
u64 dt_wait; /* time between CPU access (off cpu) */
u64 dt_delay; /* time between wakeup and sched-in */
u64 ready_to_run; /* time of wakeup */
struct stats run_stats;
u64 total_run_time;
};
/* per event run time data */
struct evsel_runtime {
u64 *last_time; /* time this event was last seen per cpu */
u32 ncpu; /* highest cpu slot allocated */
};
/* track idle times per cpu */
static struct thread **idle_threads;
static int idle_max_cpu;
static char idle_comm[] = "<idle>";
static u64 get_nsecs(void)
{
struct timespec ts;
@ -1654,6 +1679,546 @@ static int perf_sched__read_events(struct perf_sched *sched)
return rc;
}
/*
* scheduling times are printed as msec.usec
*/
static inline void print_sched_time(unsigned long long nsecs, int width)
{
unsigned long msecs;
unsigned long usecs;
msecs = nsecs / NSEC_PER_MSEC;
nsecs -= msecs * NSEC_PER_MSEC;
usecs = nsecs / NSEC_PER_USEC;
printf("%*lu.%03lu ", width, msecs, usecs);
}
/*
* returns runtime data for event, allocating memory for it the
* first time it is used.
*/
static struct evsel_runtime *perf_evsel__get_runtime(struct perf_evsel *evsel)
{
struct evsel_runtime *r = evsel->priv;
if (r == NULL) {
r = zalloc(sizeof(struct evsel_runtime));
evsel->priv = r;
}
return r;
}
/*
* save last time event was seen per cpu
*/
static void perf_evsel__save_time(struct perf_evsel *evsel,
u64 timestamp, u32 cpu)
{
struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
if (r == NULL)
return;
if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
int i, n = __roundup_pow_of_two(cpu+1);
void *p = r->last_time;
p = realloc(r->last_time, n * sizeof(u64));
if (!p)
return;
r->last_time = p;
for (i = r->ncpu; i < n; ++i)
r->last_time[i] = (u64) 0;
r->ncpu = n;
}
r->last_time[cpu] = timestamp;
}
/* returns last time this event was seen on the given cpu */
static u64 perf_evsel__get_time(struct perf_evsel *evsel, u32 cpu)
{
struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
return 0;
return r->last_time[cpu];
}
static int comm_width = 20;
static char *timehist_get_commstr(struct thread *thread)
{
static char str[32];
const char *comm = thread__comm_str(thread);
pid_t tid = thread->tid;
pid_t pid = thread->pid_;
int n;
if (pid == 0)
n = scnprintf(str, sizeof(str), "%s", comm);
else if (tid != pid)
n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
else
n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
if (n > comm_width)
comm_width = n;
return str;
}
static void timehist_header(void)
{
printf("%15s %6s ", "time", "cpu");
printf(" %-20s %9s %9s %9s",
"task name", "wait time", "sch delay", "run time");
printf("\n");
/*
* units row
*/
printf("%15s %-6s ", "", "");
printf(" %-20s %9s %9s %9s\n", "[tid/pid]", "(msec)", "(msec)", "(msec)");
/*
* separator
*/
printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
printf(" %.20s %.9s %.9s %.9s",
graph_dotted_line, graph_dotted_line, graph_dotted_line,
graph_dotted_line);
printf("\n");
}
static void timehist_print_sample(struct perf_sample *sample,
struct thread *thread)
{
struct thread_runtime *tr = thread__priv(thread);
char tstr[64];
timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
printf("%15s [%04d] ", tstr, sample->cpu);
printf(" %-*s ", comm_width, timehist_get_commstr(thread));
print_sched_time(tr->dt_wait, 6);
print_sched_time(tr->dt_delay, 6);
print_sched_time(tr->dt_run, 6);
printf("\n");
}
/*
* Explanation of delta-time stats:
*
* t = time of current schedule out event
* tprev = time of previous sched out event
* also time of schedule-in event for current task
* last_time = time of last sched change event for current task
* (i.e, time process was last scheduled out)
* ready_to_run = time of wakeup for current task
*
* -----|------------|------------|------------|------
* last ready tprev t
* time to run
*
* |-------- dt_wait --------|
* |- dt_delay -|-- dt_run --|
*
* dt_run = run time of current task
* dt_wait = time between last schedule out event for task and tprev
* represents time spent off the cpu
* dt_delay = time between wakeup and schedule-in of task
*/
static void timehist_update_runtime_stats(struct thread_runtime *r,
u64 t, u64 tprev)
{
r->dt_delay = 0;
r->dt_wait = 0;
r->dt_run = 0;
if (tprev) {
r->dt_run = t - tprev;
if (r->ready_to_run) {
if (r->ready_to_run > tprev)
pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
else
r->dt_delay = tprev - r->ready_to_run;
}
if (r->last_time > tprev)
pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
else if (r->last_time)
r->dt_wait = tprev - r->last_time;
}
update_stats(&r->run_stats, r->dt_run);
r->total_run_time += r->dt_run;
}
static bool is_idle_sample(struct perf_sample *sample,
struct perf_evsel *evsel)
{
/* pid 0 == swapper == idle task */
if (sample->pid == 0)
return true;
if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0) {
if (perf_evsel__intval(evsel, sample, "prev_pid") == 0)
return true;
}
return false;
}
/*
* Track idle stats per cpu by maintaining a local thread
* struct for the idle task on each cpu.
*/
static int init_idle_threads(int ncpu)
{
int i;
idle_threads = zalloc(ncpu * sizeof(struct thread *));
if (!idle_threads)
return -ENOMEM;
idle_max_cpu = ncpu - 1;
/* allocate the actual thread struct if needed */
for (i = 0; i < ncpu; ++i) {
idle_threads[i] = thread__new(0, 0);
if (idle_threads[i] == NULL)
return -ENOMEM;
thread__set_comm(idle_threads[i], idle_comm, 0);
}
return 0;
}
static void free_idle_threads(void)
{
int i;
if (idle_threads == NULL)
return;
for (i = 0; i <= idle_max_cpu; ++i) {
if ((idle_threads[i]))
thread__delete(idle_threads[i]);
}
free(idle_threads);
}
static struct thread *get_idle_thread(int cpu)
{
/*
* expand/allocate array of pointers to local thread
* structs if needed
*/
if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
int i, j = __roundup_pow_of_two(cpu+1);
void *p;
p = realloc(idle_threads, j * sizeof(struct thread *));
if (!p)
return NULL;
idle_threads = (struct thread **) p;
i = idle_max_cpu ? idle_max_cpu + 1 : 0;
for (; i < j; ++i)
idle_threads[i] = NULL;
idle_max_cpu = j;
}
/* allocate a new thread struct if needed */
if (idle_threads[cpu] == NULL) {
idle_threads[cpu] = thread__new(0, 0);
if (idle_threads[cpu]) {
idle_threads[cpu]->tid = 0;
thread__set_comm(idle_threads[cpu], idle_comm, 0);
}
}
return idle_threads[cpu];
}
/*
* handle runtime stats saved per thread
*/
static struct thread_runtime *thread__init_runtime(struct thread *thread)
{
struct thread_runtime *r;
r = zalloc(sizeof(struct thread_runtime));
if (!r)
return NULL;
init_stats(&r->run_stats);
thread__set_priv(thread, r);
return r;
}
static struct thread_runtime *thread__get_runtime(struct thread *thread)
{
struct thread_runtime *tr;
tr = thread__priv(thread);
if (tr == NULL) {
tr = thread__init_runtime(thread);
if (tr == NULL)
pr_debug("Failed to malloc memory for runtime data.\n");
}
return tr;
}
static struct thread *timehist_get_thread(struct perf_sample *sample,
struct machine *machine,
struct perf_evsel *evsel)
{
struct thread *thread;
if (is_idle_sample(sample, evsel)) {
thread = get_idle_thread(sample->cpu);
if (thread == NULL)
pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
} else {
thread = machine__findnew_thread(machine, sample->pid, sample->tid);
if (thread == NULL) {
pr_debug("Failed to get thread for tid %d. skipping sample.\n",
sample->tid);
}
}
return thread;
}
static bool timehist_skip_sample(struct thread *thread)
{
bool rc = false;
if (thread__is_filtered(thread))
rc = true;
return rc;
}
static int timehist_sched_wakeup_event(struct perf_tool *tool __maybe_unused,
union perf_event *event __maybe_unused,
struct perf_evsel *evsel,
struct perf_sample *sample,
struct machine *machine)
{
struct thread *thread;
struct thread_runtime *tr = NULL;
/* want pid of awakened task not pid in sample */
const u32 pid = perf_evsel__intval(evsel, sample, "pid");
thread = machine__findnew_thread(machine, 0, pid);
if (thread == NULL)
return -1;
tr = thread__get_runtime(thread);
if (tr == NULL)
return -1;
if (tr->ready_to_run == 0)
tr->ready_to_run = sample->time;
return 0;
}
static int timehist_sched_change_event(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_evsel *evsel,
struct perf_sample *sample,
struct machine *machine)
{
struct addr_location al;
struct thread *thread;
struct thread_runtime *tr = NULL;
u64 tprev;
int rc = 0;
if (machine__resolve(machine, &al, sample) < 0) {
pr_err("problem processing %d event. skipping it\n",
event->header.type);
rc = -1;
goto out;
}
thread = timehist_get_thread(sample, machine, evsel);
if (thread == NULL) {
rc = -1;
goto out;
}
if (timehist_skip_sample(thread))
goto out;
tr = thread__get_runtime(thread);
if (tr == NULL) {
rc = -1;
goto out;
}
tprev = perf_evsel__get_time(evsel, sample->cpu);
timehist_update_runtime_stats(tr, sample->time, tprev);
timehist_print_sample(sample, thread);
out:
if (tr) {
/* time of this sched_switch event becomes last time task seen */
tr->last_time = sample->time;
/* sched out event for task so reset ready to run time */
tr->ready_to_run = 0;
}
perf_evsel__save_time(evsel, sample->time, sample->cpu);
return rc;
}
static int timehist_sched_switch_event(struct perf_tool *tool,
union perf_event *event,
struct perf_evsel *evsel,
struct perf_sample *sample,
struct machine *machine __maybe_unused)
{
return timehist_sched_change_event(tool, event, evsel, sample, machine);
}
static int process_lost(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
struct machine *machine __maybe_unused)
{
char tstr[64];
timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
printf("%15s ", tstr);
printf("lost %" PRIu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
return 0;
}
typedef int (*sched_handler)(struct perf_tool *tool,
union perf_event *event,
struct perf_evsel *evsel,
struct perf_sample *sample,
struct machine *machine);
static int perf_timehist__process_sample(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
struct perf_evsel *evsel,
struct machine *machine)
{
struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
int err = 0;
int this_cpu = sample->cpu;
if (this_cpu > sched->max_cpu)
sched->max_cpu = this_cpu;
if (evsel->handler != NULL) {
sched_handler f = evsel->handler;
err = f(tool, event, evsel, sample, machine);
}
return err;
}
static int perf_sched__timehist(struct perf_sched *sched)
{
const struct perf_evsel_str_handler handlers[] = {
{ "sched:sched_switch", timehist_sched_switch_event, },
{ "sched:sched_wakeup", timehist_sched_wakeup_event, },
{ "sched:sched_wakeup_new", timehist_sched_wakeup_event, },
};
struct perf_data_file file = {
.path = input_name,
.mode = PERF_DATA_MODE_READ,
};
struct perf_session *session;
int err = -1;
/*
* event handlers for timehist option
*/
sched->tool.sample = perf_timehist__process_sample;
sched->tool.mmap = perf_event__process_mmap;
sched->tool.comm = perf_event__process_comm;
sched->tool.exit = perf_event__process_exit;
sched->tool.fork = perf_event__process_fork;
sched->tool.lost = process_lost;
sched->tool.attr = perf_event__process_attr;
sched->tool.tracing_data = perf_event__process_tracing_data;
sched->tool.build_id = perf_event__process_build_id;
sched->tool.ordered_events = true;
sched->tool.ordering_requires_timestamps = true;
session = perf_session__new(&file, false, &sched->tool);
if (session == NULL)
return -ENOMEM;
symbol__init(&session->header.env);
setup_pager();
/* setup per-evsel handlers */
if (perf_session__set_tracepoints_handlers(session, handlers))
goto out;
if (!perf_session__has_traces(session, "record -R"))
goto out;
/* pre-allocate struct for per-CPU idle stats */
sched->max_cpu = session->header.env.nr_cpus_online;
if (sched->max_cpu == 0)
sched->max_cpu = 4;
if (init_idle_threads(sched->max_cpu))
goto out;
timehist_header();
err = perf_session__process_events(session);
if (err) {
pr_err("Failed to process events, error %d", err);
goto out;
}
out:
free_idle_threads();
perf_session__delete(session);
return err;
}
static void print_bad_events(struct perf_sched *sched)
{
if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
@ -1970,8 +2535,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
const struct option latency_options[] = {
OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
"sort by key(s): runtime, switch, avg, max"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_INTEGER('C', "CPU", &sched.profile_cpu,
"CPU to profile on"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
@ -1983,8 +2546,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
const struct option replay_options[] = {
OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
"repeat the workload replay N times (-1: infinite)"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
@ -2001,6 +2562,16 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
"display given CPUs in map"),
OPT_PARENT(sched_options)
};
const struct option timehist_options[] = {
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
"file", "vmlinux pathname"),
OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
"file", "kallsyms pathname"),
OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
"Look for files with symbols relative to this directory"),
OPT_PARENT(sched_options)
};
const char * const latency_usage[] = {
"perf sched latency [<options>]",
NULL
@ -2013,8 +2584,13 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
"perf sched map [<options>]",
NULL
};
const char * const timehist_usage[] = {
"perf sched timehist [<options>]",
NULL
};
const char *const sched_subcommands[] = { "record", "latency", "map",
"replay", "script", NULL };
"replay", "script",
"timehist", NULL };
const char *sched_usage[] = {
NULL,
NULL
@ -2077,6 +2653,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
usage_with_options(replay_usage, replay_options);
}
return perf_sched__replay(&sched);
} else if (!strcmp(argv[0], "timehist")) {
if (argc) {
argc = parse_options(argc, argv, timehist_options,
timehist_usage, 0);
if (argc)
usage_with_options(timehist_usage, timehist_options);
}
return perf_sched__timehist(&sched);
} else {
usage_with_options(sched_usage, sched_options);
}