Release 4.10 tools/perf/builtin-sched.c
#include "builtin.h"
#include "perf.h"
#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/callchain.h"
#include "util/time-utils.h"
#include <subcmd/parse-options.h>
#include "util/trace-event.h"
#include "util/debug.h"
#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
#include <api/fs/fs.h>
#include <linux/time64.h>
#define PR_SET_NAME 15
/* Set process name */
#define MAX_CPUS 4096
#define COMM_LEN 20
#define SYM_LEN 129
#define MAX_PID 1024000
struct sched_atom;
struct task_desc {
unsigned long nr;
unsigned long pid;
char comm[COMM_LEN];
unsigned long nr_events;
unsigned long curr_event;
struct sched_atom **atoms;
pthread_t thread;
sem_t sleep_sem;
sem_t ready_for_work;
sem_t work_done_sem;
u64 cpu_usage;
};
enum sched_event_type {
SCHED_EVENT_RUN,
SCHED_EVENT_SLEEP,
SCHED_EVENT_WAKEUP,
SCHED_EVENT_MIGRATION,
};
struct sched_atom {
enum sched_event_type type;
int specific_wait;
u64 timestamp;
u64 duration;
unsigned long nr;
sem_t *wait_sem;
struct task_desc *wakee;
};
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
enum thread_state {
THREAD_SLEEPING = 0,
THREAD_WAIT_CPU,
THREAD_SCHED_IN,
THREAD_IGNORE
};
struct work_atom {
struct list_head list;
enum thread_state state;
u64 sched_out_time;
u64 wake_up_time;
u64 sched_in_time;
u64 runtime;
};
struct work_atoms {
struct list_head work_list;
struct thread *thread;
struct rb_node node;
u64 max_lat;
u64 max_lat_at;
u64 total_lat;
u64 nb_atoms;
u64 total_runtime;
int num_merged;
};
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
struct perf_sched;
struct trace_sched_handler {
int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
struct perf_sample *sample, struct machine *machine);
int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
struct perf_sample *sample, struct machine *machine);
int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
struct perf_sample *sample, struct machine *machine);
/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
int (*fork_event)(struct perf_sched *sched, union perf_event *event,
struct machine *machine);
int (*migrate_task_event)(struct perf_sched *sched,
struct perf_evsel *evsel,
struct perf_sample *sample,
struct machine *machine);
};
#define COLOR_PIDS PERF_COLOR_BLUE
#define COLOR_CPUS PERF_COLOR_BG_RED
struct perf_sched_map {
DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
int *comp_cpus;
bool comp;
struct thread_map *color_pids;
const char *color_pids_str;
struct cpu_map *color_cpus;
const char *color_cpus_str;
struct cpu_map *cpus;
const char *cpus_str;
};
struct perf_sched {
struct perf_tool tool;
const char *sort_order;
unsigned long nr_tasks;
struct task_desc **pid_to_task;
struct task_desc **tasks;
const struct trace_sched_handler *tp_handler;
pthread_mutex_t start_work_mutex;
pthread_mutex_t work_done_wait_mutex;
int profile_cpu;
/*
* Track the current task - that way we can know whether there's any
* weird events, such as a task being switched away that is not current.
*/
int max_cpu;
u32 curr_pid[MAX_CPUS];
struct thread *curr_thread[MAX_CPUS];
char next_shortname1;
char next_shortname2;
unsigned int replay_repeat;
unsigned long nr_run_events;
unsigned long nr_sleep_events;
unsigned long nr_wakeup_events;
unsigned long nr_sleep_corrections;
unsigned long nr_run_events_optimized;
unsigned long targetless_wakeups;
unsigned long multitarget_wakeups;
unsigned long nr_runs;
unsigned long nr_timestamps;
unsigned long nr_unordered_timestamps;
unsigned long nr_context_switch_bugs;
unsigned long nr_events;
unsigned long nr_lost_chunks;
unsigned long nr_lost_events;
u64 run_measurement_overhead;
u64 sleep_measurement_overhead;
u64 start_time;
u64 cpu_usage;
u64 runavg_cpu_usage;
u64 parent_cpu_usage;
u64 runavg_parent_cpu_usage;
u64 sum_runtime;
u64 sum_fluct;
u64 run_avg;
u64 all_runtime;
u64 all_count;
u64 cpu_last_switched[MAX_CPUS];
struct rb_root atom_root, sorted_atom_root, merged_atom_root;
struct list_head sort_list, cmp_pid;
bool force;
bool skip_merge;
struct perf_sched_map map;
/* options for timehist command */
bool summary;
bool summary_only;
bool idle_hist;
bool show_callchain;
unsigned int max_stack;
bool show_cpu_visual;
bool show_wakeups;
bool show_migrations;
u64 skipped_samples;
const char *time_str;
struct perf_time_interval ptime;
struct perf_time_interval hist_time;
};
/* per thread run time data */
struct thread_runtime {
u64 last_time; /* time of previous sched in/out event */
u64 dt_run; /* run time */
u64 dt_wait; /* time between CPU access (off cpu) */
u64 dt_delay; /* time between wakeup and sched-in */
u64 ready_to_run; /* time of wakeup */
struct stats run_stats;
u64 total_run_time;
u64 migrations;
};
/* per event run time data */
struct evsel_runtime {
u64 *last_time; /* time this event was last seen per cpu */
u32 ncpu; /* highest cpu slot allocated */
};
/* per cpu idle time data */
struct idle_thread_runtime {
struct thread_runtime tr;
struct thread *last_thread;
struct rb_root sorted_root;
struct callchain_root callchain;
struct callchain_cursor cursor;
};
/* track idle times per cpu */
static struct thread **idle_threads;
static int idle_max_cpu;
static char idle_comm[] = "<idle>";
static u64 get_nsecs(void)
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 24 | 77.42% | 2 | 66.67% |
ingo molnar | ingo molnar | 7 | 22.58% | 1 | 33.33% |
| Total | 31 | 100.00% | 3 | 100.00% |
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
u64 T0 = get_nsecs(), T1;
do {
T1 = get_nsecs();
} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 37 | 86.05% | 1 | 50.00% |
ingo molnar | ingo molnar | 6 | 13.95% | 1 | 50.00% |
| Total | 43 | 100.00% | 2 | 100.00% |
static void sleep_nsecs(u64 nsecs)
{
struct timespec ts;
ts.tv_nsec = nsecs % 999999999;
ts.tv_sec = nsecs / 999999999;
nanosleep(&ts, NULL);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 37 | 100.00% | 1 | 100.00% |
| Total | 37 | 100.00% | 1 | 100.00% |
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
int i;
for (i = 0; i < 10; i++) {
T0 = get_nsecs();
burn_nsecs(sched, 0);
T1 = get_nsecs();
delta = T1-T0;
min_delta = min(min_delta, delta);
}
sched->run_measurement_overhead = min_delta;
printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 62 | 71.26% | 2 | 66.67% |
ingo molnar | ingo molnar | 25 | 28.74% | 1 | 33.33% |
| Total | 87 | 100.00% | 3 | 100.00% |
static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
int i;
for (i = 0; i < 10; i++) {
T0 = get_nsecs();
sleep_nsecs(10000);
T1 = get_nsecs();
delta = T1-T0;
min_delta = min(min_delta, delta);
}
min_delta -= 10000;
sched->sleep_measurement_overhead = min_delta;
printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 54 | 60.67% | 2 | 50.00% |
ingo molnar | ingo molnar | 35 | 39.33% | 2 | 50.00% |
| Total | 89 | 100.00% | 4 | 100.00% |
static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
struct sched_atom *event = zalloc(sizeof(*event));
unsigned long idx = task->nr_events;
size_t size;
event->timestamp = timestamp;
event->nr = idx;
task->nr_events++;
size = sizeof(struct sched_atom *) * task->nr_events;
task->atoms = realloc(task->atoms, size);
BUG_ON(!task->atoms);
task->atoms[idx] = event;
return event;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 71 | 68.27% | 1 | 25.00% |
ingo molnar | ingo molnar | 33 | 31.73% | 3 | 75.00% |
| Total | 104 | 100.00% | 4 | 100.00% |
static struct sched_atom *last_event(struct task_desc *task)
{
if (!task->nr_events)
return NULL;
return task->atoms[task->nr_events - 1];
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 28 | 80.00% | 1 | 50.00% |
ingo molnar | ingo molnar | 7 | 20.00% | 1 | 50.00% |
| Total | 35 | 100.00% | 2 | 100.00% |
static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
u64 timestamp, u64 duration)
{
struct sched_atom *event, *curr_event = last_event(task);
/*
* optimize an existing RUN event by merging this one
* to it:
*/
if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
sched->nr_run_events_optimized++;
curr_event->duration += duration;
return;
}
event = get_new_event(task, timestamp);
event->type = SCHED_EVENT_RUN;
event->duration = duration;
sched->nr_run_events++;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 69 | 80.23% | 1 | 33.33% |
ingo molnar | ingo molnar | 15 | 17.44% | 1 | 33.33% |
xiao guangrong | xiao guangrong | 2 | 2.33% | 1 | 33.33% |
| Total | 86 | 100.00% | 3 | 100.00% |
static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
u64 timestamp, struct task_desc *wakee)
{
struct sched_atom *event, *wakee_event;
event = get_new_event(task, timestamp);
event->type = SCHED_EVENT_WAKEUP;
event->wakee = wakee;
wakee_event = last_event(wakee);
if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
sched->targetless_wakeups++;
return;
}
if (wakee_event->wait_sem) {
sched->multitarget_wakeups++;
return;
}
wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
sem_init(wakee_event->wait_sem, 0, 0);
wakee_event->specific_wait = 1;
event->wait_sem = wakee_event->wait_sem;
sched->nr_wakeup_events++;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 120 | 86.96% | 1 | 33.33% |
xiao guangrong | xiao guangrong | 11 | 7.97% | 1 | 33.33% |
ingo molnar | ingo molnar | 7 | 5.07% | 1 | 33.33% |
| Total | 138 | 100.00% | 3 | 100.00% |
static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
u64 timestamp, u64 task_state __maybe_unused)
{
struct sched_atom *event = get_new_event(task, timestamp);
event->type = SCHED_EVENT_SLEEP;
sched->nr_sleep_events++;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 37 | 80.43% | 1 | 33.33% |
ingo molnar | ingo molnar | 9 | 19.57% | 2 | 66.67% |
| Total | 46 | 100.00% | 3 | 100.00% |
static struct task_desc *register_pid(struct perf_sched *sched,
unsigned long pid, const char *comm)
{
struct task_desc *task;
static int pid_max;
if (sched->pid_to_task == NULL) {
if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
pid_max = MAX_PID;
BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
}
if (pid >= (unsigned long)pid_max) {
BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
sizeof(struct task_desc *))) == NULL);
while (pid >= (unsigned long)pid_max)
sched->pid_to_task[pid_max++] = NULL;
}
task = sched->pid_to_task[pid];
if (task)
return task;
task = zalloc(sizeof(*task));
task->pid = pid;
task->nr = sched->nr_tasks;
strcpy(task->comm, comm);
/*
* every task starts in sleeping state - this gets ignored
* if there's no wakeup pointing to this sleep state:
*/
add_sched_event_sleep(sched, task, 0, 0);
sched->pid_to_task[pid] = task;
sched->nr_tasks++;
sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
BUG_ON(!sched->tasks);
sched->tasks[task->nr] = task;
if (verbose)
printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
return task;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 136 | 48.57% | 2 | 28.57% |
yunlong song | yunlong song | 112 | 40.00% | 3 | 42.86% |
ingo molnar | ingo molnar | 28 | 10.00% | 1 | 14.29% |
xiao guangrong | xiao guangrong | 4 | 1.43% | 1 | 14.29% |
| Total | 280 | 100.00% | 7 | 100.00% |
static void print_task_traces(struct perf_sched *sched)
{
struct task_desc *task;
unsigned long i;
for (i = 0; i < sched->nr_tasks; i++) {
task = sched->tasks[i];
printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
task->nr, task->comm, task->pid, task->nr_events);
}
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 41 | 61.19% | 1 | 33.33% |
ingo molnar | ingo molnar | 25 | 37.31% | 1 | 33.33% |
xiao guangrong | xiao guangrong | 1 | 1.49% | 1 | 33.33% |
| Total | 67 | 100.00% | 3 | 100.00% |
static void add_cross_task_wakeups(struct perf_sched *sched)
{
struct task_desc *task1, *task2;
unsigned long i, j;
for (i = 0; i < sched->nr_tasks; i++) {
task1 = sched->tasks[i];
j = i + 1;
if (j == sched->nr_tasks)
j = 0;
task2 = sched->tasks[j];
add_sched_event_wakeup(sched, task1, 0, task2);
}
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
ingo molnar | ingo molnar | 47 | 52.81% | 1 | 50.00% |
arnaldo carvalho de melo | arnaldo carvalho de melo | 42 | 47.19% | 1 | 50.00% |
| Total | 89 | 100.00% | 2 | 100.00% |
static void perf_sched__process_event(struct perf_sched *sched,
struct sched_atom *atom)
{
int ret = 0;
switch (atom->type) {
case SCHED_EVENT_RUN:
burn_nsecs(sched, atom->duration);
break;
case SCHED_EVENT_SLEEP:
if (atom->wait_sem)
ret = sem_wait(atom->wait_sem);
BUG_ON(ret);
break;
case SCHED_EVENT_WAKEUP:
if (atom->wait_sem)
ret = sem_post(atom->wait_sem);
BUG_ON(ret);
break;
case SCHED_EVENT_MIGRATION:
break;
default:
BUG_ON(1);
}
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 64 | 64.00% | 1 | 50.00% |
ingo molnar | ingo molnar | 36 | 36.00% | 1 | 50.00% |
| Total | 100 | 100.00% | 2 | 100.00% |
static u64 get_cpu_usage_nsec_parent(void)
{
struct rusage ru;
u64 sum;
int err;
err = getrusage(RUSAGE_SELF, &ru);
BUG_ON(err);
sum = ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
return sum;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 62 | 86.11% | 2 | 66.67% |
ingo molnar | ingo molnar | 10 | 13.89% | 1 | 33.33% |
| Total | 72 | 100.00% | 3 | 100.00% |
static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
{
struct perf_event_attr attr;
char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
int fd;
struct rlimit limit;
bool need_privilege = false;
memset(&attr, 0, sizeof(attr));
attr.type = PERF_TYPE_SOFTWARE;
attr.config = PERF_COUNT_SW_TASK_CLOCK;
force_again:
fd = sys_perf_event_open(&attr, 0, -1, -1,
perf_event_open_cloexec_flag());
if (fd < 0) {
if (errno == EMFILE) {
if (sched->force) {
BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
limit.rlim_cur += sched->nr_tasks - cur_task;
if (limit.rlim_cur > limit.rlim_max) {
limit.rlim_max = limit.rlim_cur;
need_privilege = true;
}
if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
if (need_privilege && errno == EPERM)
strcpy(info, "Need privilege\n");
} else
goto force_again;
} else
strcpy(info, "Have a try with -f option\n");
}
pr_err("Error: sys_perf_event_open() syscall returned "
"with %d (%s)\n%s", fd,
str_error_r(errno, sbuf, sizeof(sbuf)), info);
exit(EXIT_FAILURE);
}
return fd;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
yunlong song | yunlong song | 140 | 60.34% | 2 | 25.00% |
arnaldo carvalho de melo | arnaldo carvalho de melo | 61 | 26.29% | 2 | 25.00% |
ingo molnar | ingo molnar | 14 | 6.03% | 1 | 12.50% |
masami hiramatsu | masami hiramatsu | 13 | 5.60% | 1 | 12.50% |
namhyung kim | namhyung kim | 2 | 0.86% | 1 | 12.50% |
yann droneaud | yann droneaud | 2 | 0.86% | 1 | 12.50% |
| Total | 232 | 100.00% | 8 | 100.00% |
static u64 get_cpu_usage_nsec_self(int fd)
{
u64 runtime;
int ret;
ret = read(fd, &runtime, sizeof(runtime));
BUG_ON(ret != sizeof(runtime));
return runtime;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 42 | 97.67% | 1 | 50.00% |
ingo molnar | ingo molnar | 1 | 2.33% | 1 | 50.00% |
| Total | 43 | 100.00% | 2 | 100.00% |
struct sched_thread_parms {
struct task_desc *task;
struct perf_sched *sched;
int fd;
};
static void *thread_func(void *ctx)
{
struct sched_thread_parms *parms = ctx;
struct task_desc *this_task = parms->task;
struct perf_sched *sched = parms->sched;
u64 cpu_usage_0, cpu_usage_1;
unsigned long i, ret;
char comm2[22];
int fd = parms->fd;
zfree(&parms);
sprintf(comm2, ":%s", this_task->comm);
prctl(PR_SET_NAME, comm2);
if (fd < 0)
return NULL;
again:
ret = sem_post(&this_task->ready_for_work);
BUG_ON(ret);
ret = pthread_mutex_lock(&sched->start_work_mutex);
BUG_ON(ret);
ret = pthread_mutex_unlock(&sched->start_work_mutex);
BUG_ON(ret);
cpu_usage_0 = get_cpu_usage_nsec_self(fd);
for (i = 0; i < this_task->nr_events; i++) {
this_task->curr_event = i;
perf_sched__process_event(sched, this_task->atoms[i]);
}
cpu_usage_1 = get_cpu_usage_nsec_self(fd);
this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
ret = sem_post(&this_task->work_done_sem);
BUG_ON(ret);
ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
BUG_ON(ret);
ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
BUG_ON(ret);
goto again;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
arnaldo carvalho de melo | arnaldo carvalho de melo | 177 | 72.24% | 2 | 50.00% |
ingo molnar | ingo molnar | 64 | 26.12% | 1 | 25.00% |
yunlong song | yunlong song | 4 | 1.63% | 1 | 25.00% |
| Total | 245 | 100.00% | 4 | 100.00% |
static void create_tasks(struct perf_sched *sched)
{
struct task_desc *task;
pthread_attr_t attr;
unsigned long i;
int err;
err = pthread_attr_init(&attr);
BUG_ON(err);
err = pthread_attr_setstacksize(&attr,
(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
BUG_ON(err);
err = pthread_mutex_lock(&sched->start_work_mutex);
BUG_ON(err);
err = pthread_mutex_lock(&sched->work_done_wait_mutex);
BUG_ON(err);
for (i = 0; i < sched->nr_tasks; i++) {
struct sched_thread_parms *parms = malloc(sizeof(*parms));