Contributors: 13
Author Tokens Token Proportion Commits Commit Proportion
Steven Rostedt 2025 84.30% 10 35.71%
Joel A Fernandes 156 6.49% 1 3.57%
Mathieu Desnoyers 144 6.00% 3 10.71%
Frédéric Weisbecker 36 1.50% 2 7.14%
Ingo Molnar 18 0.75% 4 14.29%
Tingwei Zhang 7 0.29% 1 3.57%
Andrew Morton 4 0.17% 1 3.57%
Delyan Kratunov 4 0.17% 1 3.57%
Peter Zijlstra 3 0.12% 1 3.57%
Lai Jiangshan 2 0.08% 1 3.57%
Wenji Huang 1 0.04% 1 3.57%
Zhao Lei 1 0.04% 1 3.57%
Greg Kroah-Hartman 1 0.04% 1 3.57%
Total 2402 28


// SPDX-License-Identifier: GPL-2.0
/*
 * trace context switch
 *
 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
 *
 */
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/kmemleak.h>
#include <linux/ftrace.h>
#include <trace/events/sched.h>

#include "trace.h"

/*
 * Selectors passed as 'ops' to tracing_{start,stop}_sched_switch(). The
 * probes below also add them together to build the flags they hand to
 * tracing_record_taskinfo_sched_switch().
 */
#define RECORD_CMDLINE	1
#define RECORD_TGID	2

/*
 * Outstanding start requests for each kind of recording. They are modified
 * with sched_register_mutex held; the probes read them without it.
 */
static int		sched_cmdline_ref;
static int		sched_tgid_ref;
/* Serializes the reference counts and probe (un)registration. */
static DEFINE_MUTEX(sched_register_mutex);

/*
 * Probe attached to the sched_switch tracepoint. Records task info for
 * both @prev and @next, limited to the kinds of recording that currently
 * have a reference. The remaining arguments are unused.
 */
static void
probe_sched_switch(void *ignore, bool preempt,
		   struct task_struct *prev, struct task_struct *next,
		   unsigned int prev_state)
{
	int flags = 0;

	if (sched_tgid_ref)
		flags += RECORD_TGID;
	if (sched_cmdline_ref)
		flags += RECORD_CMDLINE;

	/* Nothing is referenced: nothing to record. */
	if (flags)
		tracing_record_taskinfo_sched_switch(prev, next, flags);
}

/*
 * Probe attached to the sched_wakeup and sched_wakeup_new tracepoints.
 * Records task info for the waker (current) and for @wakee, limited to the
 * kinds of recording that currently have a reference.
 */
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
	int flags = (sched_tgid_ref ? RECORD_TGID : 0) +
		    (sched_cmdline_ref ? RECORD_CMDLINE : 0);

	/* Nothing is referenced: nothing to record. */
	if (flags)
		tracing_record_taskinfo_sched_switch(current, wakee, flags);
}

/*
 * Attach the probes to the sched_wakeup, sched_wakeup_new and sched_switch
 * tracepoints, in that order.
 *
 * Returns 0 on success. On failure the probes attached so far are detached
 * again and the error from the failing registration is returned.
 */
static int tracing_sched_register(void)
{
	int err;

	err = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
	if (err) {
		pr_info("wakeup trace: Couldn't activate tracepoint probe to kernel_sched_wakeup\n");
		goto out;
	}

	err = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
	if (err) {
		pr_info("wakeup trace: Couldn't activate tracepoint probe to kernel_sched_wakeup_new\n");
		goto undo_wakeup;
	}

	err = register_trace_sched_switch(probe_sched_switch, NULL);
	if (err) {
		pr_info("sched trace: Couldn't activate tracepoint probe to kernel_sched_switch\n");
		goto undo_wakeup_new;
	}

	return 0;

	/* Unwind in reverse order of registration. */
undo_wakeup_new:
	unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
undo_wakeup:
	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
out:
	return err;
}

/*
 * Detach the three probes attached by tracing_sched_register(), in the
 * reverse order of their registration.
 */
static void tracing_sched_unregister(void)
{
	unregister_trace_sched_switch(probe_sched_switch, NULL);
	unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}

static void tracing_start_sched_switch(int ops)
{
	bool sched_register;

	mutex_lock(&sched_register_mutex);
	sched_register = (!sched_cmdline_ref && !sched_tgid_ref);

	switch (ops) {
	case RECORD_CMDLINE:
		sched_cmdline_ref++;
		break;

	case RECORD_TGID:
		sched_tgid_ref++;
		break;
	}

	if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
		tracing_sched_register();
	mutex_unlock(&sched_register_mutex);
}

/*
 * Drop a reference on one kind of recording (@ops is RECORD_CMDLINE or
 * RECORD_TGID; any other value drops nothing). The sched probes are
 * detached whenever both reference counts are zero afterwards.
 */
static void tracing_stop_sched_switch(int ops)
{
	mutex_lock(&sched_register_mutex);

	if (ops == RECORD_CMDLINE)
		sched_cmdline_ref--;
	else if (ops == RECORD_TGID)
		sched_tgid_ref--;

	if (!(sched_cmdline_ref || sched_tgid_ref))
		tracing_sched_unregister();

	mutex_unlock(&sched_register_mutex);
}

/* Take a reference on comm (cmdline) recording. */
void tracing_start_cmdline_record(void)
{
	tracing_start_sched_switch(RECORD_CMDLINE);
}

/* Drop a reference taken by tracing_start_cmdline_record(). */
void tracing_stop_cmdline_record(void)
{
	tracing_stop_sched_switch(RECORD_CMDLINE);
}

/* Take a reference on tgid recording. */
void tracing_start_tgid_record(void)
{
	tracing_start_sched_switch(RECORD_TGID);
}

/* Drop a reference taken by tracing_start_tgid_record(). */
void tracing_stop_tgid_record(void)
{
	tracing_stop_sched_switch(RECORD_TGID);
}

/*
 * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
 * is the tgid last observed corresponding to pid=i.
 *
 * It is NULL until trace_alloc_tgid_map() allocates it and publishes it
 * with a release store.
 */
static int *tgid_map;

/* The maximum valid index into tgid_map. */
static size_t tgid_map_max;

/* Number of cmdline slots requested when the buffer is first created. */
#define SAVED_CMDLINES_DEFAULT 128
/* Marks an unused entry in either of the pid <-> cmdline slot maps. */
#define NO_CMDLINE_MAP UINT_MAX
/*
 * Preemption must be disabled before acquiring trace_cmdline_lock.
 * The various trace_arrays' max_lock must be acquired in a context
 * where interrupts are disabled.
 */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
	/* Indexed by pid & (PID_MAX_DEFAULT - 1); holds a slot index. */
	unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
	/* Indexed by slot; holds the pid whose comm is stored in that slot. */
	unsigned *map_cmdline_to_pid;
	/* Number of slots in saved_cmdlines and in map_cmdline_to_pid. */
	unsigned cmdline_num;
	/* Slot most recently handed out to a pid that had none. */
	int cmdline_idx;
	/* One comm string of TASK_COMM_LEN bytes per slot. */
	char saved_cmdlines[];
};
/* The live buffer; swapped under trace_cmdline_lock when resized. */
static struct saved_cmdlines_buffer *savedcmd;

/* Holds the size of a cmdline and pid element */
#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)			\
	(TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))

/* Return the start of the comm string stored in slot @idx of savedcmd. */
static inline char *get_saved_cmdlines(int idx)
{
	return savedcmd->saved_cmdlines + idx * TASK_COMM_LEN;
}

/*
 * Store @cmdline in slot @idx of savedcmd.
 *
 * At most TASK_COMM_LEN bytes are written. strncpy() leaves the slot
 * without a terminating NUL if @cmdline is that long or longer; the reader
 * in __trace_find_cmdline() copies out with a bounded strscpy().
 */
static inline void set_cmdline(int idx, const char *cmdline)
{
	char *slot = get_saved_cmdlines(idx);

	strncpy(slot, cmdline, TASK_COMM_LEN);
}

/*
 * Release a buffer obtained from allocate_cmdlines_buffer().
 *
 * The page order is not stored in the buffer, so it is recomputed here.
 * The size is rebuilt the same way the allocator sized it: the header
 * plus, for every slot, one comm string and one map_cmdline_to_pid entry.
 * Because cmdline_num was rounded up to fill the allocation, this yields
 * the order that was actually allocated.
 *
 * @s: buffer to free; must not be NULL (it is dereferenced).
 */
static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
	size_t slot_size = TASK_COMM_LEN + sizeof(s->map_cmdline_to_pid[0]);
	int order = get_order(sizeof(*s) + s->cmdline_num * slot_size);

	kmemleak_free(s);
	free_pages((unsigned long)s, order);
}

/*
 * Allocate and initialise a cmdline buffer with room for at least @val
 * slots.
 *
 * The header, the comm strings and the map_cmdline_to_pid array all live
 * in one page allocation. The slot count is raised to whatever fits in the
 * pages actually allocated. Both maps start out filled with NO_CMDLINE_MAP.
 *
 * Returns the new buffer, or NULL if the pages could not be allocated.
 */
static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
{
	struct saved_cmdlines_buffer *buf;
	struct page *pg;
	int wanted, alloc_size;
	int order;

	/* Header plus @val comm strings plus @val pid entries. */
	wanted = sizeof(*buf) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(buf);
	order = get_order(wanted);

	pg = alloc_pages(GFP_KERNEL, order);
	if (!pg)
		return NULL;

	/* Bytes really obtained, which may exceed what was asked for. */
	alloc_size = 1 << (order + PAGE_SHIFT);

	buf = page_address(pg);
	kmemleak_alloc(buf, alloc_size, 1, GFP_KERNEL);
	memset(buf, 0, sizeof(*buf));

	/* Use up the whole allocation rather than only the requested part. */
	val = (alloc_size - sizeof(*buf)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(buf);
	buf->cmdline_num = val;
	buf->cmdline_idx = 0;

	/* The pid array sits directly behind the last comm string. */
	buf->map_cmdline_to_pid =
		(unsigned *)&buf->saved_cmdlines[val * TASK_COMM_LEN];

	/* memset() stores the low byte, so every entry becomes all ones. */
	memset(&buf->map_pid_to_cmdline, NO_CMDLINE_MAP,
	       sizeof(buf->map_pid_to_cmdline));
	memset(buf->map_cmdline_to_pid, NO_CMDLINE_MAP,
	       val * sizeof(*buf->map_cmdline_to_pid));

	return buf;
}

/*
 * Create the initial cmdline buffer with SAVED_CMDLINES_DEFAULT slots.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
int trace_create_savedcmd(void)
{
	savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
	if (!savedcmd)
		return -ENOMEM;

	return 0;
}

/*
 * Remember the comm of @tsk so it can later be looked up by pid.
 *
 * Returns 1 if the comm was saved (the idle task counts as saved without
 * doing anything) and 0 if trace_cmdline_lock was contended.
 */
int trace_save_cmdline(struct task_struct *tsk)
{
	unsigned hashed_pid, slot;

	/* treat recording of idle task as a success */
	if (!tsk->pid)
		return 1;

	hashed_pid = tsk->pid & (PID_MAX_DEFAULT - 1);

	/*
	 * Missing the lock is not a disaster: we neither want to spin nor
	 * to disable interrupts, so on contention just give up and let a
	 * later call try again.
	 *
	 * This runs from the scheduler and from wake ups, so interrupts are
	 * expected to be off and the run queue lock to be held.
	 */
	lockdep_assert_preemption_disabled();
	if (!arch_spin_trylock(&trace_cmdline_lock))
		return 0;

	slot = savedcmd->map_pid_to_cmdline[hashed_pid];
	if (slot == NO_CMDLINE_MAP) {
		/* No slot yet: take the next one, wrapping around. */
		slot = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;

		savedcmd->cmdline_idx = slot;
		savedcmd->map_pid_to_cmdline[hashed_pid] = slot;
	}

	/* Record who owns the slot now, along with its current comm. */
	set_cmdline(slot, tsk->comm);
	savedcmd->map_cmdline_to_pid[slot] = tsk->pid;

	arch_spin_unlock(&trace_cmdline_lock);

	return 1;
}

/*
 * Copy the saved comm for @pid into @comm.
 *
 * pid 0 yields "<idle>", a negative pid warns once and yields "<XXX>", and
 * a pid with no matching saved entry yields "<...>". Reads savedcmd without
 * taking trace_cmdline_lock itself; the callers in this file hold it.
 */
static void __trace_find_cmdline(int pid, char comm[])
{
	unsigned slot;
	int owner;
	bool found = false;

	if (!pid) {
		strcpy(comm, "<idle>");
		return;
	}

	if (WARN_ON_ONCE(pid < 0)) {
		strcpy(comm, "<XXX>");
		return;
	}

	slot = savedcmd->map_pid_to_cmdline[pid & (PID_MAX_DEFAULT - 1)];
	if (slot != NO_CMDLINE_MAP) {
		/* The slot is only valid if it still belongs to this pid. */
		owner = savedcmd->map_cmdline_to_pid[slot];
		found = (owner == pid);
	}

	if (found)
		strscpy(comm, get_saved_cmdlines(slot), TASK_COMM_LEN);
	else
		strcpy(comm, "<...>");
}

/*
 * Locked wrapper around __trace_find_cmdline(): copy the saved comm for
 * @pid into @comm with preemption disabled and trace_cmdline_lock held.
 */
void trace_find_cmdline(int pid, char comm[])
{
	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);

	__trace_find_cmdline(pid, comm);

	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
}

/*
 * Return a pointer to the tgid_map entry for @pid, or NULL if the map has
 * not been allocated or @pid lies beyond tgid_map_max.
 */
static int *trace_find_tgid_ptr(int pid)
{
	/*
	 * Pairs with the smp_store_release() in trace_alloc_tgid_map() to
	 * ensure that if we observe a non-NULL tgid_map then we also observe
	 * the correct tgid_map_max.
	 */
	int *map = smp_load_acquire(&tgid_map);

	if (unlikely(!map))
		return NULL;

	/* tgid_map_max is unsigned, so a negative pid fails this test too. */
	if (unlikely(pid > tgid_map_max))
		return NULL;

	return map + pid;
}

/* Return the tgid recorded for @pid, or 0 if there is no entry for it. */
int trace_find_tgid(int pid)
{
	int *entry = trace_find_tgid_ptr(pid);

	if (!entry)
		return 0;

	return *entry;
}

/*
 * Record the tgid of @tsk in tgid_map.
 *
 * Returns 1 if it was stored (the idle task counts as stored without doing
 * anything) and 0 if there is no map entry for the task's pid.
 */
static int trace_save_tgid(struct task_struct *tsk)
{
	int *entry;

	/* treat recording of idle task as a success */
	if (!tsk->pid)
		return 1;

	entry = trace_find_tgid_ptr(tsk->pid);
	if (entry) {
		*entry = tsk->tgid;
		return 1;
	}

	return 0;
}

/*
 * Decide whether recording can be skipped: true when @flags asks for
 * neither comm nor tgid, or when this CPU's trace_taskinfo_save is unset.
 */
static bool tracing_record_taskinfo_skip(int flags)
{
	if (unlikely((flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)) == 0))
		return true;

	return !__this_cpu_read(trace_taskinfo_save);
}

/**
 * tracing_record_taskinfo - record the task info of a task
 *
 * @task:  task to record
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 *
 * This CPU's trace_taskinfo_save is cleared only once everything that was
 * requested has been saved.
 */
void tracing_record_taskinfo(struct task_struct *task, int flags)
{
	bool saved_all = true;

	if (tracing_record_taskinfo_skip(flags))
		return;

	/*
	 * Every requested item is attempted, even after an earlier one has
	 * failed, so that as much as possible gets recorded.
	 */
	if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task))
		saved_all = false;
	if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task))
		saved_all = false;

	/* On any failure leave the flag set so a retry happens soon. */
	if (saved_all)
		__this_cpu_write(trace_taskinfo_save, false);
}

/**
 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
 *
 * @prev: previous task during sched_switch
 * @next: next task during sched_switch
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 *
 * This CPU's trace_taskinfo_save is cleared only once everything that was
 * requested has been saved for both tasks.
 */
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
					  struct task_struct *next, int flags)
{
	bool saved_all = true;

	if (tracing_record_taskinfo_skip(flags))
		return;

	/*
	 * Every requested item is attempted for both tasks, even after an
	 * earlier one has failed, so that as much as possible gets recorded.
	 */
	if (flags & TRACE_RECORD_CMDLINE) {
		if (!trace_save_cmdline(prev))
			saved_all = false;
		if (!trace_save_cmdline(next))
			saved_all = false;
	}

	if (flags & TRACE_RECORD_TGID) {
		if (!trace_save_tgid(prev))
			saved_all = false;
		if (!trace_save_tgid(next))
			saved_all = false;
	}

	/* On any failure leave the flag set so a retry happens soon. */
	if (saved_all)
		__this_cpu_write(trace_taskinfo_save, false);
}

/* Helpers to record a specific task information */

/* Record only the comm of @task. */
void tracing_record_cmdline(struct task_struct *task)
{
	tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
}

/* Record only the tgid of @task. */
void tracing_record_tgid(struct task_struct *task)
{
	tracing_record_taskinfo(task, TRACE_RECORD_TGID);
}

/*
 * Allocate tgid_map if it does not exist yet, with one zeroed entry for
 * every pid from 0 up to and including pid_max.
 *
 * Returns 0 if the map exists on return, or -ENOMEM if allocating it
 * failed. Note that tgid_map_max is updated before the allocation is
 * attempted.
 */
int trace_alloc_tgid_map(void)
{
	int *new_map;

	/* Already allocated, nothing to do. */
	if (tgid_map)
		return 0;

	tgid_map_max = pid_max;
	new_map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL);
	if (!new_map)
		return -ENOMEM;

	/*
	 * Pairs with smp_load_acquire() in trace_find_tgid_ptr(): a reader
	 * that sees the map published here is guaranteed to also see the
	 * tgid_map_max value written above.
	 */
	smp_store_release(&tgid_map, new_map);

	return 0;
}

/*
 * seq_file next callback: advance *@pos by one and return the tgid_map
 * entry for that pid, or NULL once the position is past the map.
 */
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
	*pos += 1;

	return trace_find_tgid_ptr(*pos);
}

/*
 * seq_file start callback: return the tgid_map entry for the pid given by
 * *@pos, or NULL if there is none.
 */
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
	return trace_find_tgid_ptr(*pos);
}

/*
 * seq_file stop callback: intentionally empty, since saved_tgids_start()
 * and saved_tgids_next() take no lock that would need releasing.
 */
static void saved_tgids_stop(struct seq_file *m, void *v)
{
}

/*
 * seq_file show callback: print "<pid> <tgid>" for the tgid_map entry @v.
 * Entries whose tgid is 0 are skipped.
 */
static int saved_tgids_show(struct seq_file *m, void *v)
{
	int *entry = v;
	/* The pid is the entry's index within tgid_map. */
	int pid = entry - tgid_map;
	int tgid = *entry;

	if (!tgid)
		return SEQ_SKIP;

	seq_printf(m, "%d %d\n", pid, tgid);

	return 0;
}

/* seq_file iterator that walks tgid_map and prints pid/tgid pairs. */
static const struct seq_operations tracing_saved_tgids_seq_ops = {
	.start		= saved_tgids_start,
	.stop		= saved_tgids_stop,
	.next		= saved_tgids_next,
	.show		= saved_tgids_show,
};

/*
 * Open handler for the saved tgids file: run the tracing open check and,
 * if it passes, set up the seq_file iterator.
 *
 * Returns 0 on success or the error from whichever step failed.
 */
static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
{
	int ret = tracing_check_open_get_tr(NULL);

	return ret ? ret : seq_open(filp, &tracing_saved_tgids_seq_ops);
}


/* File operations for the seq_file listing of saved pid/tgid pairs. */
const struct file_operations tracing_saved_tgids_fops = {
	.open		= tracing_saved_tgids_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

/*
 * seq_file next callback: advance *@pos and return the next in-use entry
 * of map_cmdline_to_pid at or after the cursor @v, or NULL when the end of
 * the array is reached.
 */
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
	unsigned int *cur = v;
	unsigned int *end;

	/*
	 * Step past the entry we were handed, except on the very first
	 * call (position 0 with nothing in the seq_file buffer yet).
	 */
	if (*pos || m->count)
		cur++;

	(*pos)++;

	end = &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
	while (cur < end) {
		/* Unused slots hold NO_CMDLINE_MAP; return the first used one. */
		if (*cur != -1 && *cur != NO_CMDLINE_MAP)
			return cur;
		cur++;
	}

	return NULL;
}

/*
 * seq_file start callback: disable preemption, take trace_cmdline_lock and
 * walk forward from the first slot to the in-use entry at position *@pos.
 *
 * Returns that entry, or NULL if there are not that many in-use entries.
 * In both cases the lock is still held on return; it is dropped by
 * saved_cmdlines_stop().
 */
static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
{
	void *v;
	loff_t l;

	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);

	v = &savedcmd->map_cmdline_to_pid[0];
	for (l = 0; l <= *pos; ) {
		/* saved_cmdlines_next() advances l for us. */
		v = saved_cmdlines_next(m, v, &l);
		if (!v)
			break;
	}

	return v;
}

/*
 * seq_file stop callback: release trace_cmdline_lock and re-enable
 * preemption, undoing what saved_cmdlines_start() did.
 */
static void saved_cmdlines_stop(struct seq_file *m, void *v)
{
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
}

/*
 * seq_file show callback: print "<pid> <comm>" for the map_cmdline_to_pid
 * entry @v. Uses the unlocked lookup because saved_cmdlines_start()
 * already took trace_cmdline_lock.
 */
static int saved_cmdlines_show(struct seq_file *m, void *v)
{
	unsigned int *entry = v;
	char comm[TASK_COMM_LEN];

	__trace_find_cmdline(*entry, comm);
	seq_printf(m, "%d %s\n", *entry, comm);

	return 0;
}

/*
 * seq_file iterator that walks the in-use cmdline slots and prints
 * pid/comm pairs; trace_cmdline_lock is held from start to stop.
 */
static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
	.start		= saved_cmdlines_start,
	.next		= saved_cmdlines_next,
	.stop		= saved_cmdlines_stop,
	.show		= saved_cmdlines_show,
};

/*
 * Open handler for the saved cmdlines file: run the tracing open check
 * and, if it passes, set up the seq_file iterator.
 *
 * Returns 0 on success or the error from whichever step failed.
 */
static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
{
	int ret = tracing_check_open_get_tr(NULL);

	if (!ret)
		ret = seq_open(filp, &tracing_saved_cmdlines_seq_ops);

	return ret;
}

/* File operations for the seq_file listing of saved pid/comm pairs. */
const struct file_operations tracing_saved_cmdlines_fops = {
	.open		= tracing_saved_cmdlines_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

/*
 * Read handler reporting the current number of cmdline slots as a decimal
 * number followed by a newline.
 */
static ssize_t
tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
				 size_t cnt, loff_t *ppos)
{
	char text[64];
	int len;

	/* Sample cmdline_num under the lock so a resize cannot race us. */
	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);
	len = scnprintf(text, sizeof(text), "%u\n", savedcmd->cmdline_num);
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();

	return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
}

/*
 * Free the live cmdline buffer. savedcmd itself is left pointing at the
 * freed memory, and it must not be NULL when this is called because
 * free_saved_cmdlines_buffer() dereferences it.
 */
void trace_free_saved_cmdlines_buffer(void)
{
	free_saved_cmdlines_buffer(savedcmd);
}

static int tracing_resize_saved_cmdlines(unsigned int val)
{
	struct saved_cmdlines_buffer *s, *savedcmd_temp;

	s = allocate_cmdlines_buffer(val);
	if (!s)
		return -ENOMEM;

	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);
	savedcmd_temp = savedcmd;
	savedcmd = s;
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
	free_saved_cmdlines_buffer(savedcmd_temp);

	return 0;
}

/*
 * Write handler that resizes the cmdline buffer to the decimal slot count
 * supplied by user space.
 *
 * Returns @cnt on success, the parse error if the input is not a number,
 * -EINVAL if the value is out of range, or the error from the resize.
 */
static ssize_t
tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
				  size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret = kstrtoul_from_user(ubuf, cnt, 10, &val);

	if (ret)
		return ret;

	/* Accept between 1 and PID_MAX_DEFAULT entries, inclusive. */
	if (val == 0 || val > PID_MAX_DEFAULT)
		return -EINVAL;

	ret = tracing_resize_saved_cmdlines((unsigned int)val);
	if (ret < 0)
		return ret;

	*ppos += cnt;

	return cnt;
}

/*
 * File operations for the cmdline buffer size: reading reports the number
 * of slots, writing resizes the buffer.
 */
const struct file_operations tracing_saved_cmdlines_size_fops = {
	.open		= tracing_open_generic,
	.read		= tracing_saved_cmdlines_size_read,
	.write		= tracing_saved_cmdlines_size_write,
};