cregit-Linux how code gets into the kernel

Release 4.10 tools/perf/builtin-sched.c

Directory: tools/perf
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/callchain.h"
#include "util/time-utils.h"

#include <subcmd/parse-options.h>
#include "util/trace-event.h"

#include "util/debug.h"

#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>
#include <api/fs/fs.h>
#include <linux/time64.h>


#define PR_SET_NAME		15               
/* Set process name */

#define MAX_CPUS		4096

#define COMM_LEN		20

#define SYM_LEN			129

#define MAX_PID			1024000

struct sched_atom;


/*
 * One task of the workload, registered by PID, together with the list of
 * scheduling atoms that its worker thread processes in order.
 */
struct task_desc {
	unsigned long		nr;		/* index of this task in perf_sched::tasks */
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;	/* number of entries in atoms[] */
	unsigned long		curr_event;	/* index of the atom currently processed */
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;	/* posted by the thread before each pass */
	sem_t			work_done_sem;	/* posted by the thread after each pass */

	u64			cpu_usage;	/* task-clock delta measured over the last pass */
};


/* Kind of a sched_atom; selects the action taken when the atom is processed. */
enum sched_event_type {
	SCHED_EVENT_RUN,	/* burn CPU for the atom's duration */
	SCHED_EVENT_SLEEP,	/* wait on the atom's semaphore, if it has one */
	SCHED_EVENT_WAKEUP,	/* post the atom's semaphore, if it has one */
	SCHED_EVENT_MIGRATION,	/* no action is taken when processed */
};


/* One step (run, sleep, wakeup or migration) in a task's event list. */
struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;	/* set to 1 once a wakeup is paired with this sleep */
	u64			timestamp;
	u64			duration;	/* amount of time burned by a RUN atom */
	unsigned long		nr;		/* index of this atom in task_desc::atoms */
	sem_t			*wait_sem;	/* shared between a sleep atom and its wakeup atom */
	struct task_desc	*wakee;		/* task targeted by a WAKEUP atom */
};


/* NOTE(review): looks like one display character per task state - confirm against its users, which are not visible in this part of the file */
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"


/* Values stored in work_atom::state. */
enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};


/* A single entry of a work_atoms list: one state plus its four time fields. */
struct work_atom {
	struct list_head	list;		/* list linkage */
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};


/* Per-thread container: a list head, an rb-tree node and aggregate counters. */
struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
	int			num_merged;
};


/* Callback type: takes two work_atoms entries and returns an int */
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

/* Forward declaration; the full definition comes later in this file */
struct perf_sched;


/*
 * Table of event callbacks. Every callback receives the perf_sched state and
 * the machine; all but fork_event also receive the evsel and the sample.
 * Each callback returns an int.
 */
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched,
			    struct perf_evsel *evsel,
			    struct perf_sample *sample,
			    struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched,
			     struct perf_evsel *evsel,
			     struct perf_sample *sample,
			     struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched,
			    struct perf_evsel *evsel,
			    struct perf_sample *sample,
			    struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched,
			  union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};


/* NOTE(review): presumably the colors applied to the entries selected by perf_sched_map::color_pids and ::color_cpus - confirm against the users */
#define COLOR_PIDS PERF_COLOR_BLUE

#define COLOR_CPUS PERF_COLOR_BG_RED


/* State embedded in struct perf_sched as its 'map' member. */
struct perf_sched_map {
	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);	/* one bit per possible CPU */
	int			*comp_cpus;
	bool			 comp;
	struct thread_map	*color_pids;
	const char		*color_pids_str;
	struct cpu_map		*color_cpus;
	const char		*color_cpus_str;
	struct cpu_map		*cpus;
	const char		*cpus_str;
};


/*
 * Global state of the sched command: the registered tasks, the locks that
 * gate the worker threads, event counters, calibration results and the
 * option flags.
 */
struct perf_sched {
	struct perf_tool	tool;
	const char		*sort_order;
	unsigned long		nr_tasks;	/* number of entries in tasks[] */
	struct task_desc	**pid_to_task;	/* indexed by PID */
	struct task_desc	**tasks;	/* indexed by task_desc::nr */
	const struct trace_sched_handler *tp_handler;
	/* each worker locks and immediately unlocks these as start/finish gates */
	pthread_mutex_t		start_work_mutex;
	pthread_mutex_t		work_done_wait_mutex;
	int			profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int			max_cpu;
	u32			curr_pid[MAX_CPUS];
	struct thread		*curr_thread[MAX_CPUS];
	char			next_shortname1;
	char			next_shortname2;
	unsigned int		replay_repeat;
	unsigned long		nr_run_events;		/* RUN atoms created */
	unsigned long		nr_sleep_events;	/* SLEEP atoms created */
	unsigned long		nr_wakeup_events;	/* wakeups paired with a sleep */
	unsigned long		nr_sleep_corrections;
	unsigned long		nr_run_events_optimized; /* RUN atoms merged into the previous one */
	unsigned long		targetless_wakeups;	/* wakee's last atom was not a sleep */
	unsigned long		multitarget_wakeups;	/* wakee's sleep already had a waker */
	unsigned long		nr_runs;
	unsigned long		nr_timestamps;
	unsigned long		nr_unordered_timestamps;
	unsigned long		nr_context_switch_bugs;
	unsigned long		nr_events;
	unsigned long		nr_lost_chunks;
	unsigned long		nr_lost_events;
	u64			run_measurement_overhead;	/* nsecs, set by calibration */
	u64			sleep_measurement_overhead;	/* nsecs, set by calibration */
	u64			start_time;
	u64			cpu_usage;
	u64			runavg_cpu_usage;
	u64			parent_cpu_usage;
	u64			runavg_parent_cpu_usage;
	u64			sum_runtime;
	u64			sum_fluct;
	u64			run_avg;
	u64			all_runtime;
	u64			all_count;
	u64			cpu_last_switched[MAX_CPUS];

	struct rb_root		atom_root;
	struct rb_root		sorted_atom_root;
	struct rb_root		merged_atom_root;

	struct list_head	sort_list;
	struct list_head	cmp_pid;
	bool			force;		/* allow raising RLIMIT_NOFILE on EMFILE */
	bool			skip_merge;
	struct perf_sched_map	map;

	/* options for timehist command */
	bool			summary;
	bool			summary_only;
	bool			idle_hist;
	bool			show_callchain;
	unsigned int		max_stack;
	bool			show_cpu_visual;
	bool			show_wakeups;
	bool			show_migrations;
	u64			skipped_samples;
	const char		*time_str;
	struct perf_time_interval ptime;
	struct perf_time_interval hist_time;
};

/* Run time data kept for each thread. */
struct thread_runtime {
	u64		last_time;	/* time of previous sched in/out event */
	u64		dt_run;		/* run time */
	u64		dt_wait;	/* time between CPU access (off cpu) */
	u64		dt_delay;	/* time between wakeup and sched-in */
	u64		ready_to_run;	/* time of wakeup */

	struct stats	run_stats;
	u64		total_run_time;

	u64		migrations;
};

/* Run time data kept for each event. */
struct evsel_runtime {
	u64	*last_time;	/* time this event was last seen per cpu */
	u32	 ncpu;		/* highest cpu slot allocated */
};

/* Idle time data kept for each cpu; starts with the common thread_runtime. */
struct idle_thread_runtime {
	struct thread_runtime	tr;
	struct thread		*last_thread;
	struct rb_root		sorted_root;
	struct callchain_root	callchain;
	struct callchain_cursor	cursor;
};

/* track idle times per cpu */

/* NOTE(review): presumably one thread pointer per CPU, with idle_max_cpu bounding the array - confirm against the allocation code */
static struct thread **idle_threads;

static int idle_max_cpu;

/* NOTE(review): presumably the comm shown for the idle threads - confirm against the users */
static char idle_comm[] = "<idle>";


/* Read CLOCK_MONOTONIC and return it as a single nanosecond count. */
static u64 get_nsecs(void)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);

	return now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 24 | 77.42% | 2 | 66.67%
ingo molnar | 7 | 22.58% | 1 | 33.33%
Total | 31 | 100.00% | 3 | 100.00%


/*
 * Busy-wait for roughly @nsecs nanoseconds.
 *
 * Spins on get_nsecs() and stops as soon as the current time plus
 * sched->run_measurement_overhead reaches the start time plus @nsecs.
 * The clock is always read at least once.
 */
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 start = get_nsecs();
	u64 now;

	for (;;) {
		now = get_nsecs();
		if (now + sched->run_measurement_overhead >= start + nsecs)
			break;
	}
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 37 | 86.05% | 1 | 50.00%
ingo molnar | 6 | 13.95% | 1 | 50.00%
Total | 43 | 100.00% | 2 | 100.00%


/*
 * Sleep for @nsecs nanoseconds using nanosleep().
 *
 * The interval is split into whole seconds and the leftover nanoseconds.
 * The divisor has to be exactly one second in nanoseconds (1000000000);
 * dividing by 999999999 mis-splits the value, e.g. 999999999 ns would
 * become 1 s + 0 ns.
 *
 * The return value of nanosleep() is ignored, so an interrupted sleep is
 * not resumed.
 */
static void sleep_nsecs(u64 nsecs)
{
	/* nanoseconds per second (the value of NSEC_PER_SEC) */
	const unsigned long long nsecs_per_sec = 1000000000ULL;
	struct timespec ts;

	ts.tv_nsec = nsecs % nsecs_per_sec;
	ts.tv_sec = nsecs / nsecs_per_sec;

	nanosleep(&ts, NULL);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 37 | 100.00% | 1 | 100.00%
Total | 37 | 100.00% | 1 | 100.00%


/*
 * Measure how long a zero-length burn_nsecs() call takes.
 *
 * The call is timed ten times; the smallest elapsed time (capped at
 * NSEC_PER_SEC) is stored in sched->run_measurement_overhead and printed.
 */
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 before, after, elapsed;
	u64 best = NSEC_PER_SEC;
	int round;

	for (round = 0; round < 10; round++) {
		before = get_nsecs();
		burn_nsecs(sched, 0);
		after = get_nsecs();

		elapsed = after - before;
		if (elapsed < best)
			best = elapsed;
	}
	sched->run_measurement_overhead = best;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", best);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 62 | 71.26% | 2 | 66.67%
ingo molnar | 25 | 28.74% | 1 | 33.33%
Total | 87 | 100.00% | 3 | 100.00%


/*
 * Measure the overhead of a 10000 ns sleep_nsecs() call.
 *
 * The call is timed ten times; the smallest elapsed time (capped at
 * NSEC_PER_SEC) minus the requested 10000 ns is stored in
 * sched->sleep_measurement_overhead and printed.
 */
static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 before, after, elapsed;
	u64 best = NSEC_PER_SEC;
	int round;

	for (round = 0; round < 10; round++) {
		before = get_nsecs();
		sleep_nsecs(10000);
		after = get_nsecs();

		elapsed = after - before;
		if (elapsed < best)
			best = elapsed;
	}
	/* remove the time that was actually asked for */
	best -= 10000;
	sched->sleep_measurement_overhead = best;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", best);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 54 | 60.67% | 2 | 50.00%
ingo molnar | 35 | 39.33% | 2 | 50.00%
Total | 89 | 100.00% | 4 | 100.00%


/*
 * Allocate a zeroed atom, stamp it with @timestamp and append it to
 * @task's atom array, growing the array by one slot.
 *
 * Returns the new atom; its 'nr' field holds its index in task->atoms.
 * Allocation failures are treated as fatal via BUG_ON().
 */
static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	/* the atom is written to right below, so a failed zalloc() must not pass */
	BUG_ON(!event);

	event->timestamp	= timestamp;
	event->nr		= idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 71 | 68.27% | 1 | 25.00%
ingo molnar | 33 | 31.73% | 3 | 75.00%
Total | 104 | 100.00% | 4 | 100.00%


/* Return the most recently added atom of @task, or NULL if it has none. */
static struct sched_atom *last_event(struct task_desc *task)
{
	unsigned long count = task->nr_events;

	return count ? task->atoms[count - 1] : NULL;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 28 | 80.00% | 1 | 50.00%
ingo molnar | 7 | 20.00% | 1 | 50.00%
Total | 35 | 100.00% | 2 | 100.00%


/*
 * Record that @task ran for @duration starting at @timestamp.
 *
 * When the task's last atom is already a RUN atom, the duration is folded
 * into it (counted in nr_run_events_optimized); otherwise a new RUN atom
 * is appended (counted in nr_run_events).
 */
static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *prev = last_event(task);
	struct sched_atom *atom;

	if (!prev || prev->type != SCHED_EVENT_RUN) {
		atom = get_new_event(task, timestamp);

		atom->type = SCHED_EVENT_RUN;
		atom->duration = duration;

		sched->nr_run_events++;
		return;
	}

	/* extend the existing RUN atom instead of adding another one */
	sched->nr_run_events_optimized++;
	prev->duration += duration;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 69 | 80.23% | 1 | 33.33%
ingo molnar | 15 | 17.44% | 1 | 33.33%
xiao guangrong | 2 | 2.33% | 1 | 33.33%
Total | 86 | 100.00% | 3 | 100.00%


/*
 * Append a WAKEUP atom to @task that targets @wakee.
 *
 * The wakeup is paired with the wakee's last atom only if that atom is a
 * SLEEP that has no semaphore yet: a semaphore is then allocated and shared
 * by both atoms, and nr_wakeup_events is incremented.  Otherwise the wakeup
 * is counted as targetless (last atom is not a sleep) or multitarget (the
 * sleep already has a waker) and left without a semaphore.
 */
static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	/* sem_init() must not be handed a NULL semaphore */
	BUG_ON(!wakee_event->wait_sem);
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 120 | 86.96% | 1 | 33.33%
xiao guangrong | 11 | 7.97% | 1 | 33.33%
ingo molnar | 7 | 5.07% | 1 | 33.33%
Total | 138 | 100.00% | 3 | 100.00%


/*
 * Append a SLEEP atom stamped with @timestamp to @task and count it in
 * nr_sleep_events.  @task_state is accepted but not used.
 */
static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *atom;

	atom = get_new_event(task, timestamp);
	atom->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 37 | 80.43% | 1 | 33.33%
ingo molnar | 9 | 19.57% | 2 | 66.67%
Total | 46 | 100.00% | 3 | 100.00%


static struct task_desc *register_pid(struct perf_sched *sched, unsigned long pid, const char *comm) { struct task_desc *task; static int pid_max; if (sched->pid_to_task == NULL) { if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) pid_max = MAX_PID; BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); } if (pid >= (unsigned long)pid_max) { BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) * sizeof(struct task_desc *))) == NULL); while (pid >= (unsigned long)pid_max) sched->pid_to_task[pid_max++] = NULL; } task = sched->pid_to_task[pid]; if (task) return task; task = zalloc(sizeof(*task)); task->pid = pid; task->nr = sched->nr_tasks; strcpy(task->comm, comm); /* * every task starts in sleeping state - this gets ignored * if there's no wakeup pointing to this sleep state: */ add_sched_event_sleep(sched, task, 0, 0); sched->pid_to_task[pid] = task; sched->nr_tasks++; sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *)); BUG_ON(!sched->tasks); sched->tasks[task->nr] = task; if (verbose) printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm); return task; }

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 136 | 48.57% | 2 | 28.57%
yunlong song | 112 | 40.00% | 3 | 42.86%
ingo molnar | 28 | 10.00% | 1 | 14.29%
xiao guangrong | 4 | 1.43% | 1 | 14.29%
Total | 280 | 100.00% | 7 | 100.00%


/* Print one summary line (index, comm, pid, atom count) per registered task. */
static void print_task_traces(struct perf_sched *sched)
{
	unsigned long idx;

	for (idx = 0; idx < sched->nr_tasks; idx++) {
		struct task_desc *t = sched->tasks[idx];

		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
		       t->nr, t->comm, t->pid, t->nr_events);
	}
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 41 | 61.19% | 1 | 33.33%
ingo molnar | 25 | 37.31% | 1 | 33.33%
xiao guangrong | 1 | 1.49% | 1 | 33.33%
Total | 67 | 100.00% | 3 | 100.00%


/*
 * Chain the tasks into a ring of wakeups: every task gets a WAKEUP atom
 * (timestamp 0) aimed at the next task in sched->tasks, and the last task
 * wakes the first one.
 */
static void add_cross_task_wakeups(struct perf_sched *sched)
{
	unsigned long cur, next;

	for (cur = 0; cur < sched->nr_tasks; cur++) {
		struct task_desc *waker = sched->tasks[cur];
		struct task_desc *wakee;

		/* wrap around after the last task */
		next = (cur + 1 == sched->nr_tasks) ? 0 : cur + 1;
		wakee = sched->tasks[next];

		add_sched_event_wakeup(sched, waker, 0, wakee);
	}
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
ingo molnar | 47 | 52.81% | 1 | 50.00%
arnaldo carvalho de melo | 42 | 47.19% | 1 | 50.00%
Total | 89 | 100.00% | 2 | 100.00%


static void perf_sched__process_event(struct perf_sched *sched, struct sched_atom *atom) { int ret = 0; switch (atom->type) { case SCHED_EVENT_RUN: burn_nsecs(sched, atom->duration); break; case SCHED_EVENT_SLEEP: if (atom->wait_sem) ret = sem_wait(atom->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_WAKEUP: if (atom->wait_sem) ret = sem_post(atom->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_MIGRATION: break; default: BUG_ON(1); } }

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 64 | 64.00% | 1 | 50.00%
ingo molnar | 36 | 36.00% | 1 | 50.00%
Total | 100 | 100.00% | 2 | 100.00%


/*
 * Return the user plus system CPU time reported by getrusage(RUSAGE_SELF),
 * converted to nanoseconds.  A getrusage() failure trips BUG_ON().
 */
static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage usage;
	u64 user_ns, sys_ns;
	int rc = getrusage(RUSAGE_SELF, &usage);

	BUG_ON(rc);

	user_ns = usage.ru_utime.tv_sec * NSEC_PER_SEC + usage.ru_utime.tv_usec * NSEC_PER_USEC;
	sys_ns = usage.ru_stime.tv_sec * NSEC_PER_SEC + usage.ru_stime.tv_usec * NSEC_PER_USEC;

	return user_ns + sys_ns;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 62 | 86.11% | 2 | 66.67%
ingo molnar | 10 | 13.89% | 1 | 33.33%
Total | 72 | 100.00% | 3 | 100.00%


static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) { struct perf_event_attr attr; char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; int fd; struct rlimit limit; bool need_privilege = false; memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_TASK_CLOCK; force_again: fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); if (fd < 0) { if (errno == EMFILE) { if (sched->force) { BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); limit.rlim_cur += sched->nr_tasks - cur_task; if (limit.rlim_cur > limit.rlim_max) { limit.rlim_max = limit.rlim_cur; need_privilege = true; } if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { if (need_privilege && errno == EPERM) strcpy(info, "Need privilege\n"); } else goto force_again; } else strcpy(info, "Have a try with -f option\n"); } pr_err("Error: sys_perf_event_open() syscall returned " "with %d (%s)\n%s", fd, str_error_r(errno, sbuf, sizeof(sbuf)), info); exit(EXIT_FAILURE); } return fd; }

Contributors

Person | Tokens | Prop | Commits | CommitProp
yunlong song | 140 | 60.34% | 2 | 25.00%
arnaldo carvalho de melo | 61 | 26.29% | 2 | 25.00%
ingo molnar | 14 | 6.03% | 1 | 12.50%
masami hiramatsu | 13 | 5.60% | 1 | 12.50%
namhyung kim | 2 | 0.86% | 1 | 12.50%
yann droneaud | 2 | 0.86% | 1 | 12.50%
Total | 232 | 100.00% | 8 | 100.00%


/*
 * Read one u64 counter value from @fd and return it.  A short or failed
 * read trips BUG_ON().
 */
static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int nread = read(fd, &runtime, sizeof(runtime));

	BUG_ON(nread != sizeof(runtime));

	return runtime;
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 42 | 97.67% | 1 | 50.00%
ingo molnar | 1 | 2.33% | 1 | 50.00%
Total | 43 | 100.00% | 2 | 100.00%

/* Argument bundle handed to thread_func(), which frees it after copying it. */
struct sched_thread_parms {
	struct task_desc	*task;	/* task whose atoms the thread processes */
	struct perf_sched	*sched;
	int			fd;	/* counter descriptor; the thread exits if negative */
};
static void *thread_func(void *ctx) { struct sched_thread_parms *parms = ctx; struct task_desc *this_task = parms->task; struct perf_sched *sched = parms->sched; u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; int fd = parms->fd; zfree(&parms); sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); if (fd < 0) return NULL; again: ret = sem_post(&this_task->ready_for_work); BUG_ON(ret); ret = pthread_mutex_lock(&sched->start_work_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&sched->start_work_mutex); BUG_ON(ret); cpu_usage_0 = get_cpu_usage_nsec_self(fd); for (i = 0; i < this_task->nr_events; i++) { this_task->curr_event = i; perf_sched__process_event(sched, this_task->atoms[i]); } cpu_usage_1 = get_cpu_usage_nsec_self(fd); this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; ret = sem_post(&this_task->work_done_sem); BUG_ON(ret); ret = pthread_mutex_lock(&sched->work_done_wait_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&sched->work_done_wait_mutex); BUG_ON(ret); goto again; }

Contributors

Person | Tokens | Prop | Commits | CommitProp
arnaldo carvalho de melo | 177 | 72.24% | 2 | 50.00%
ingo molnar | 64 | 26.12% | 1 | 25.00%
yunlong song | 4 | 1.63% | 1 | 25.00%
Total | 245 | 100.00% | 4 | 100.00%


static void create_tasks(struct perf_sched *sched) { struct task_desc *task; pthread_attr_t attr; unsigned long i; int err; err = pthread_attr_init(&attr); BUG_ON(err); err = pthread_attr_setstacksize(&attr, (size_t) max(16 * 1024, PTHREAD_STACK_MIN)); BUG_ON(err); err = pthread_mutex_lock(&sched->start_work_mutex); BUG_ON(err); err = pthread_mutex_lock(&sched->work_done_wait_mutex); BUG_ON(err); for (i = 0; i < sched->nr_tasks; i++) { struct sched_thread_parms *parms = malloc(sizeof(*parms));