cregit-Linux how code gets into the kernel

Release 4.14 tools/perf/builtin-sched.c

Directory: tools/perf
// SPDX-License-Identifier: GPL-2.0
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/callchain.h"
#include "util/time-utils.h"

#include <subcmd/parse-options.h>
#include "util/trace-event.h"

#include "util/debug.h"

#include <linux/kernel.h>
#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <inttypes.h>

#include <errno.h>
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
#include <api/fs/fs.h>
#include <linux/time64.h>

#include "sane_ctype.h"


/* prctl() option used by the replay threads to set their own name. */
#define PR_SET_NAME		15               
/* Set process name */

/* Capacity of the per-CPU arrays and the CPU bitmap declared in this file. */
#define MAX_CPUS		4096

/* Size in bytes of task_desc::comm, terminating NUL included. */
#define COMM_LEN		20

/* NOTE(review): not referenced in the visible part of the file - presumably a symbol-name buffer size; confirm. */
#define SYM_LEN			129

/* Fallback size of the pid table when kernel/pid_max cannot be read. */
#define MAX_PID			1024000

/* Forward declaration: struct task_desc stores pointers to sched_atom. */
struct sched_atom;


/*
 * One task being replayed: its identity, the list of recorded events
 * (atoms) and the thread/semaphore state used while replaying them.
 */
struct task_desc {
	unsigned long		nr;		/* index into perf_sched::tasks */
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;	/* number of entries in atoms[] */
	unsigned long		curr_event;	/* index of the atom being processed */
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;	/* posted by the thread before each pass */
	sem_t			work_done_sem;	/* posted by the thread after each pass */

	u64			cpu_usage;	/* counter delta measured over the last pass */
};


/* Kind of a replayed scheduling event (see struct sched_atom). */
enum sched_event_type {
	SCHED_EVENT_RUN,	/* burn CPU for the atom's duration */
	SCHED_EVENT_SLEEP,	/* wait on the atom's semaphore, if any */
	SCHED_EVENT_WAKEUP,	/* post the atom's semaphore, if any */
	SCHED_EVENT_MIGRATION,	/* no action when replayed */
};


/* A single recorded event belonging to one task_desc. */
struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;	/* set to 1 once a wakeup targets this sleep */
	u64			timestamp;
	u64			duration;	/* used by SCHED_EVENT_RUN atoms */
	unsigned long		nr;		/* index of this atom in task_desc::atoms */
	sem_t			*wait_sem;	/* shared between a sleep and its wakeup */
	struct task_desc	*wakee;		/* target task of a wakeup atom */
};


/*
 * NOTE(review): presumably one display character per task state bit
 * below - not used in the visible part of the file; confirm at the user.
 */
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

/* task state bitmask, copied from include/linux/sched.h */

#define TASK_RUNNING		0

#define TASK_INTERRUPTIBLE	1

#define TASK_UNINTERRUPTIBLE	2

#define __TASK_STOPPED		4

#define __TASK_TRACED		8
/* in tsk->exit_state */

#define EXIT_DEAD		16

#define EXIT_ZOMBIE		32

/* Combination of both exit-state bits. */
#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
/* in tsk->state again */

#define TASK_DEAD		64

#define TASK_WAKEKILL		128

#define TASK_WAKING		256

#define TASK_PARKED		512


/* State recorded in a work_atom (see struct work_atom::state). */
enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};


/* One entry on a work_atoms::work_list: a state plus its timestamps. */
struct work_atom {
	struct list_head	list;		/* list linkage */
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};


/* Per-thread collection of work_atom entries with aggregate statistics. */
struct work_atoms {
	struct list_head	work_list;	/* list head for work_atom entries */
	struct thread		*thread;
	struct rb_node		node;		/* rb-tree linkage */
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
	int			num_merged;
};


/* Comparison callback taking two work_atoms and returning an int. */
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

/* Forward declaration: the handler callbacks below take a perf_sched pointer. */
struct perf_sched;


/*
 * Table of per-event callbacks.  Each callback receives the perf_sched
 * state and the machine; the sample-based ones also get the evsel and
 * the sample.  All of them return an int.
 */
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched,
			    struct perf_evsel *evsel,
			    struct perf_sample *sample,
			    struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched,
			     struct perf_evsel *evsel,
			     struct perf_sample *sample,
			     struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched,
			    struct perf_evsel *evsel,
			    struct perf_sample *sample,
			    struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched,
			  union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};


/*
 * NOTE(review): presumably the highlight colours for the pids/cpus
 * selected through color_pids/color_cpus below - confirm at the users.
 */
#define COLOR_PIDS PERF_COLOR_BLUE

#define COLOR_CPUS PERF_COLOR_BG_RED

/* Settings and lookup state kept in perf_sched::map. */
struct perf_sched_map {
	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
	int			*comp_cpus;
	bool			 comp;
	struct thread_map	*color_pids;
	const char		*color_pids_str;
	struct cpu_map		*color_cpus;
	const char		*color_cpus_str;
	struct cpu_map		*cpus;
	const char		*cpus_str;
};


/*
 * State of one perf sched run: the replay task tables, per-CPU tracking
 * arrays, event counters, measurement results and timehist options.
 */
struct perf_sched {
	struct perf_tool	tool;
	const char		*sort_order;
	unsigned long		nr_tasks;	/* number of entries in tasks[] */
	struct task_desc	**pid_to_task;	/* indexed by pid */
	struct task_desc	**tasks;	/* indexed by task_desc::nr */
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t		start_work_mutex;
	pthread_mutex_t		work_done_wait_mutex;
	int			profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int			max_cpu;
	u32			curr_pid[MAX_CPUS];
	struct thread		*curr_thread[MAX_CPUS];
	char			next_shortname1;
	char			next_shortname2;
	unsigned int		replay_repeat;
	unsigned long		nr_run_events;
	unsigned long		nr_sleep_events;
	unsigned long		nr_wakeup_events;
	unsigned long		nr_sleep_corrections;
	unsigned long		nr_run_events_optimized;
	unsigned long		targetless_wakeups;
	unsigned long		multitarget_wakeups;
	unsigned long		nr_runs;
	unsigned long		nr_timestamps;
	unsigned long		nr_unordered_timestamps;
	unsigned long		nr_context_switch_bugs;
	unsigned long		nr_events;
	unsigned long		nr_lost_chunks;
	unsigned long		nr_lost_events;
	u64			run_measurement_overhead;
	u64			sleep_measurement_overhead;
	u64			start_time;
	u64			cpu_usage;
	u64			runavg_cpu_usage;
	u64			parent_cpu_usage;
	u64			runavg_parent_cpu_usage;
	u64			sum_runtime;
	u64			sum_fluct;
	u64			run_avg;
	u64			all_runtime;
	u64			all_count;
	u64			cpu_last_switched[MAX_CPUS];

	struct rb_root		atom_root, sorted_atom_root, merged_atom_root;

	struct list_head	sort_list, cmp_pid;
	bool			force;
	bool			skip_merge;
	struct perf_sched_map	map;

	/* options for timehist command */
	bool			summary;
	bool			summary_only;
	bool			idle_hist;
	bool			show_callchain;
	unsigned int		max_stack;
	bool			show_cpu_visual;
	bool			show_wakeups;
	bool			show_next;
	bool			show_migrations;
	bool			show_state;
	u64			skipped_samples;
	const char		*time_str;
	struct perf_time_interval ptime;
	struct perf_time_interval hist_time;
};

/* per thread run time data */
struct thread_runtime {
	u64		last_time;	/* time of previous sched in/out event */
	u64		dt_run;		/* run time */
	u64		dt_sleep;	/* time between CPU access by sleep (off cpu) */
	u64		dt_iowait;	/* time between CPU access by iowait (off cpu) */
	u64		dt_preempt;	/* time between CPU access by preempt (off cpu) */
	u64		dt_delay;	/* time between wakeup and sched-in */
	u64		ready_to_run;	/* time of wakeup */

	struct stats	run_stats;
	u64		total_run_time;
	u64		total_sleep_time;
	u64		total_iowait_time;
	u64		total_preempt_time;
	u64		total_delay_time;

	int		last_state;
	u64		migrations;
};

/* per event run time data */
struct evsel_runtime {
	u64	*last_time;	/* time this event was last seen per cpu */
	u32	ncpu;		/* highest cpu slot allocated */
};

/* per cpu idle time data */
struct idle_thread_runtime {
	struct thread_runtime	tr;		/* common per-thread run time data */
	struct thread		*last_thread;
	struct rb_root		sorted_root;
	struct callchain_root	callchain;
	struct callchain_cursor	cursor;
};

/* track idle times per cpu */

/* Array of thread pointers; NOTE(review): presumably indexed by cpu - confirm. */
static struct thread **idle_threads;

/* NOTE(review): presumably the size/upper bound of idle_threads - confirm at the users. */
static int idle_max_cpu;

/* comm string for the idle thread. */
static char idle_comm[] = "<idle>";


/* Return the current CLOCK_MONOTONIC reading converted to nanoseconds. */
static u64 get_nsecs(void)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);

	return now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 26 | 83.87% | 2 | 50.00% |
| Ingo Molnar | 5 | 16.13% | 2 | 50.00% |
| Total | 31 | 100.00% | 4 | 100.00% |


/*
 * Busy-wait until roughly @nsecs nanoseconds have elapsed, compensating
 * for the previously calibrated run measurement overhead.  The clock is
 * always sampled at least once.
 */
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	const u64 start = get_nsecs();
	u64 now;

	for (;;) {
		now = get_nsecs();
		if (!(now + sched->run_measurement_overhead < start + nsecs))
			break;
	}
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 28 | 65.12% | 1 | 33.33% |
| Ingo Molnar | 15 | 34.88% | 2 | 66.67% |
| Total | 43 | 100.00% | 3 | 100.00% |


/*
 * Sleep for @nsecs nanoseconds using nanosleep().
 *
 * The duration is split into whole seconds and the sub-second remainder
 * with NSEC_PER_SEC so that tv_sec/tv_nsec represent exactly @nsecs
 * (dividing by 999999999 drifts by one nanosecond per second requested).
 * The nanosleep() result is not checked, so an interrupted sleep is not
 * resumed.
 */
static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % NSEC_PER_SEC;
	ts.tv_sec = nsecs / NSEC_PER_SEC;

	nanosleep(&ts, NULL);
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 31 | 83.78% | 1 | 50.00% |
| Ingo Molnar | 6 | 16.22% | 1 | 50.00% |
| Total | 37 | 100.00% | 2 | 100.00% |


/*
 * Measure how long a zero-length burn_nsecs() call takes.  The smallest
 * of ten samples (capped at one second) is stored in
 * sched->run_measurement_overhead and printed.
 */
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 best = NSEC_PER_SEC;
	int round = 0;

	while (round < 10) {
		u64 before, after, elapsed;

		before = get_nsecs();
		burn_nsecs(sched, 0);
		after = get_nsecs();

		elapsed = after - before;
		if (elapsed < best)
			best = elapsed;
		round++;
	}
	sched->run_measurement_overhead = best;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", best);
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 67 | 77.01% | 2 | 50.00% |
| Ingo Molnar | 20 | 22.99% | 2 | 50.00% |
| Total | 87 | 100.00% | 4 | 100.00% |


/*
 * Measure how much longer than requested a 10000 ns sleep_nsecs() call
 * takes.  The smallest of ten samples (capped at one second), minus the
 * requested 10000 ns, is stored in sched->sleep_measurement_overhead
 * and printed.
 */
static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 best = NSEC_PER_SEC;
	int round = 0;

	while (round < 10) {
		u64 before, after, elapsed;

		before = get_nsecs();
		sleep_nsecs(10000);
		after = get_nsecs();

		elapsed = after - before;
		if (elapsed < best)
			best = elapsed;
		round++;
	}
	best -= 10000;
	sched->sleep_measurement_overhead = best;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", best);
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 62 | 69.66% | 2 | 50.00% |
| Ingo Molnar | 23 | 25.84% | 1 | 25.00% |
| Xiao Guangrong | 4 | 4.49% | 1 | 25.00% |
| Total | 89 | 100.00% | 4 | 100.00% |


/*
 * Allocate a zeroed sched_atom stamped with @timestamp and append it to
 * @task's atoms array.
 *
 * Returns the new atom; its ->nr is its index in task->atoms.  Both
 * allocations are checked with BUG_ON(), matching how the rest of this
 * file treats out-of-memory, and task->nr_events is only advanced once
 * the array has actually been grown.
 */
static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	struct sched_atom **atoms;

	BUG_ON(!event);

	event->timestamp = timestamp;
	event->nr = idx;

	/* grow through a temporary so task->atoms is never overwritten with NULL */
	atoms = realloc(task->atoms, sizeof(*atoms) * (idx + 1));
	BUG_ON(!atoms);

	task->atoms = atoms;
	task->nr_events = idx + 1;
	task->atoms[idx] = event;

	return event;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 65 | 62.50% | 1 | 25.00% |
| Ingo Molnar | 32 | 30.77% | 2 | 50.00% |
| Xiao Guangrong | 7 | 6.73% | 1 | 25.00% |
| Total | 104 | 100.00% | 4 | 100.00% |


/* Return the most recently added atom of @task, or NULL if it has none. */
static struct sched_atom *last_event(struct task_desc *task)
{
	const unsigned long count = task->nr_events;

	return count ? task->atoms[count - 1] : NULL;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 34 | 97.14% | 1 | 50.00% |
| Ingo Molnar | 1 | 2.86% | 1 | 50.00% |
| Total | 35 | 100.00% | 2 | 100.00% |


/*
 * Record that @task ran for @duration starting at @timestamp.
 *
 * When the task's latest atom is already a RUN atom the duration is
 * folded into it (counted in nr_run_events_optimized); otherwise a new
 * RUN atom is appended (counted in nr_run_events).
 */
static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *prev = last_event(task);
	struct sched_atom *atom;

	if (prev == NULL || prev->type != SCHED_EVENT_RUN) {
		atom = get_new_event(task, timestamp);
		atom->type = SCHED_EVENT_RUN;
		atom->duration = duration;

		sched->nr_run_events++;
		return;
	}

	/* extend the existing RUN atom instead of adding another one */
	sched->nr_run_events_optimized++;
	prev->duration += duration;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 74 | 86.05% | 1 | 50.00% |
| Ingo Molnar | 12 | 13.95% | 1 | 50.00% |
| Total | 86 | 100.00% | 2 | 100.00% |


/*
 * Append a WAKEUP atom to @task that targets @wakee.
 *
 * If @wakee's latest atom is a SLEEP atom without a semaphore yet, a new
 * semaphore is created and shared between that sleep and this wakeup so
 * the replay can block/unblock on it.  Otherwise the wakeup is kept
 * without a semaphore and counted as targetless (wakee is not sleeping)
 * or multitarget (the sleep already has a waker).
 *
 * Allocation and sem_init() failures trip BUG_ON() instead of handing an
 * invalid semaphore to the replay threads.
 */
static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;
	int ret;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	BUG_ON(!wakee_event->wait_sem);
	ret = sem_init(wakee_event->wait_sem, 0, 0);
	BUG_ON(ret);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 93 | 67.39% | 1 | 33.33% |
| Ingo Molnar | 44 | 31.88% | 1 | 33.33% |
| Jiri Pirko | 1 | 0.72% | 1 | 33.33% |
| Total | 138 | 100.00% | 3 | 100.00% |


/*
 * Append a SLEEP atom stamped with @timestamp to @task and count it in
 * sched->nr_sleep_events.  @task_state is accepted but not used.
 */
static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *atom;

	atom = get_new_event(task, timestamp);
	atom->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 34 | 73.91% | 1 | 50.00% |
| Ingo Molnar | 12 | 26.09% | 1 | 50.00% |
| Total | 46 | 100.00% | 2 | 100.00% |


static struct task_desc *register_pid(struct perf_sched *sched, unsigned long pid, const char *comm) { struct task_desc *task; static int pid_max; if (sched->pid_to_task == NULL) { if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) pid_max = MAX_PID; BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); } if (pid >= (unsigned long)pid_max) { BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) * sizeof(struct task_desc *))) == NULL); while (pid >= (unsigned long)pid_max) sched->pid_to_task[pid_max++] = NULL; } task = sched->pid_to_task[pid]; if (task) return task; task = zalloc(sizeof(*task)); task->pid = pid; task->nr = sched->nr_tasks; strcpy(task->comm, comm); /* * every task starts in sleeping state - this gets ignored * if there's no wakeup pointing to this sleep state: */ add_sched_event_sleep(sched, task, 0, 0); sched->pid_to_task[pid] = task; sched->nr_tasks++; sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *)); BUG_ON(!sched->tasks); sched->tasks[task->nr] = task; if (verbose > 0) printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm); return task; }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 119 | 42.20% | 1 | 16.67% |
| Yunlong Song | 112 | 39.72% | 3 | 50.00% |
| Ingo Molnar | 49 | 17.38% | 1 | 16.67% |
| Namhyung Kim | 2 | 0.71% | 1 | 16.67% |
| Total | 282 | 100.00% | 6 | 100.00% |


/* Print one summary line (index, comm, pid, event count) per registered task. */
static void print_task_traces(struct perf_sched *sched)
{
	unsigned long idx = 0;

	while (idx < sched->nr_tasks) {
		struct task_desc *t = sched->tasks[idx];

		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			t->nr, t->comm, t->pid, t->nr_events);
		idx++;
	}
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 36 | 53.73% | 1 | 33.33% |
| Ingo Molnar | 31 | 46.27% | 2 | 66.67% |
| Total | 67 | 100.00% | 3 | 100.00% |


/*
 * Chain the registered tasks into a ring of wakeups: every task gets a
 * WAKEUP atom (timestamp 0) targeting the next task in sched->tasks,
 * and the last task targets the first one.
 */
static void add_cross_task_wakeups(struct perf_sched *sched)
{
	unsigned long cur;

	for (cur = 0; cur < sched->nr_tasks; cur++) {
		unsigned long next = cur + 1;
		struct task_desc *waker, *wakee;

		/* wrap around after the last task */
		if (next == sched->nr_tasks)
			next = 0;

		waker = sched->tasks[cur];
		wakee = sched->tasks[next];

		add_sched_event_wakeup(sched, waker, 0, wakee);
	}
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 63 | 70.79% | 1 | 50.00% |
| Ingo Molnar | 26 | 29.21% | 1 | 50.00% |
| Total | 89 | 100.00% | 2 | 100.00% |


static void perf_sched__process_event(struct perf_sched *sched, struct sched_atom *atom) { int ret = 0; switch (atom->type) { case SCHED_EVENT_RUN: burn_nsecs(sched, atom->duration); break; case SCHED_EVENT_SLEEP: if (atom->wait_sem) ret = sem_wait(atom->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_WAKEUP: if (atom->wait_sem) ret = sem_post(atom->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_MIGRATION: break; default: BUG_ON(1); } }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 82 | 82.00% | 1 | 50.00% |
| Ingo Molnar | 18 | 18.00% | 1 | 50.00% |
| Total | 100 | 100.00% | 2 | 100.00% |


/*
 * Return the user plus system CPU time reported by
 * getrusage(RUSAGE_SELF), in nanoseconds.  A getrusage() failure trips
 * BUG_ON().
 */
static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage usage;
	u64 user_ns, sys_ns;
	int rc = getrusage(RUSAGE_SELF, &usage);

	BUG_ON(rc);

	user_ns = usage.ru_utime.tv_sec * NSEC_PER_SEC + usage.ru_utime.tv_usec * NSEC_PER_USEC;
	sys_ns = usage.ru_stime.tv_sec * NSEC_PER_SEC + usage.ru_stime.tv_usec * NSEC_PER_USEC;

	return user_ns + sys_ns;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 56 | 77.78% | 2 | 50.00% |
| Ingo Molnar | 16 | 22.22% | 2 | 50.00% |
| Total | 72 | 100.00% | 4 | 100.00% |


static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) { struct perf_event_attr attr; char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; int fd; struct rlimit limit; bool need_privilege = false; memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_TASK_CLOCK; force_again: fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); if (fd < 0) { if (errno == EMFILE) { if (sched->force) { BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); limit.rlim_cur += sched->nr_tasks - cur_task; if (limit.rlim_cur > limit.rlim_max) { limit.rlim_max = limit.rlim_cur; need_privilege = true; } if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { if (need_privilege && errno == EPERM) strcpy(info, "Need privilege\n"); } else goto force_again; } else strcpy(info, "Have a try with -f option\n"); } pr_err("Error: sys_perf_event_open() syscall returned " "with %d (%s)\n%s", fd, str_error_r(errno, sbuf, sizeof(sbuf)), info); exit(EXIT_FAILURE); } return fd; }

Contributors

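| Person | Tokens | Prop | Commits | CommitProp |
| Yunlong Song | 140 | 60.34% | 2 | 25.00% |
| Arnaldo Carvalho de Melo | 64 | 27.59% | 2 | 25.00% |
| Masami Hiramatsu | 13 | 5.60% | 1 | 12.50% |
| Ingo Molnar | 11 | 4.74% | 1 | 12.50% |
| Namhyung Kim | 2 | 0.86% | 1 | 12.50% |
| Yann Droneaud | 2 | 0.86% | 1 | 12.50% |
| Total | 232 | 100.00% | 8 | 100.00% |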


/*
 * Read one u64 value from the counter file descriptor @fd and return
 * it.  A short or failed read trips BUG_ON().
 */
static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int nread = read(fd, &runtime, sizeof(runtime));

	BUG_ON(nread != sizeof(runtime));

	return runtime;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 41 | 95.35% | 1 | 50.00% |
| Frédéric Weisbecker | 2 | 4.65% | 1 | 50.00% |
| Total | 43 | 100.00% | 2 | 100.00% |

/* Argument block handed to thread_func() through its void * parameter. */
struct sched_thread_parms {
	struct task_desc	*task;	/* task the thread replays */
	struct perf_sched	*sched;
	int			fd;	/* counter fd read around each pass */
};
static void *thread_func(void *ctx) { struct sched_thread_parms *parms = ctx; struct task_desc *this_task = parms->task; struct perf_sched *sched = parms->sched; u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; int fd = parms->fd; zfree(&parms); sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); if (fd < 0) return NULL; again: ret = sem_post(&this_task->ready_for_work); BUG_ON(ret); ret = pthread_mutex_lock(&sched->start_work_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&sched->start_work_mutex); BUG_ON(ret); cpu_usage_0 = get_cpu_usage_nsec_self(fd); for (i = 0; i < this_task->nr_events; i++) { this_task->curr_event = i; perf_sched__process_event(sched, this_task->atoms[i]); } cpu_usage_1 = get_cpu_usage_nsec_self(fd); this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; ret = sem_post(&this_task->work_done_sem); BUG_ON(ret); ret = pthread_mutex_lock(&sched->work_done_wait_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&sched->work_done_wait_mutex); BUG_ON(ret); goto again; }

Contributors

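| Person | Tokens | Prop | Commits | CommitProp |
| Arnaldo Carvalho de Melo | 226 | 92.24% | 2 | 33.33% |
| Frédéric Weisbecker | 6 | 2.45% | 1 | 16.67% |
| Ingo Molnar | 5 | 2.04% | 1 | 16.67% |
| Mike Galbraith | 4 | 1.63% | 1 | 16.67% |
| Yunlong Song | 4 | 1.63% | 1 | 16.67% |
| Total | 245 | 100.00% | 6 | 100.00% |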


static void create_tasks(struct perf_sched *sched) { struct task_desc *task; pthread_attr_t attr; unsigned long i; int err; err = pthread_attr_init(&attr); BUG_ON(err); err = pthread_attr_setstacksize(&attr, (size_t) max(16 * 1024, PTHREAD_STACK_MIN)); BUG_ON(err); err = pthread_mutex_lock(&sched->start_work_mutex); BUG_ON(err); err = pthread_mutex_lock(&sched->work_done_wait_mutex); BUG_ON(err);