Contributors: 12
Author Tokens Token Proportion Commits Commit Proportion
Thomas Gleixner 388 55.04% 16 50.00%
Mathieu Desnoyers 173 24.54% 4 12.50%
Kent Overstreet 113 16.03% 1 3.12%
Linus Torvalds (pre-git) 8 1.13% 2 6.25%
Ingo Molnar 6 0.85% 2 6.25%
Al Viro 4 0.57% 1 3.12%
Kai Germaschewski 3 0.43% 1 3.12%
Will Deacon 3 0.43% 1 3.12%
Simon Schuster 2 0.28% 1 3.12%
Peter Zijlstra 2 0.28% 1 3.12%
Cruz Zhao 2 0.28% 1 3.12%
David Howells 1 0.14% 1 3.12%
Total 705 32


/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H

#ifdef CONFIG_RSEQ
#include <linux/sched.h>

#include <uapi/linux/rseq.h>

/* Out-of-line slow path; rseq_handle_slowpath() is the inline gate for it. */
void __rseq_handle_slowpath(struct pt_regs *regs);

/*
 * Inline gate in front of __rseq_handle_slowpath(), invoked from
 * resume_user_mode_work().
 *
 * @regs: register state handed through unmodified to the slow path
 *
 * With CONFIG_GENERIC_ENTRY only the 'slowpath' event of the current task
 * decides whether the out-of-line handler runs. Without it, the handler
 * runs when the current task has both 'sched_switch' and 'has_rseq' set.
 */
static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
	bool pending;

	/* The bitwise '&' below is on purpose: it saves one conditional branch */
	if (IS_ENABLED(CONFIG_GENERIC_ENTRY))
		pending = current->rseq.event.slowpath;
	else
		pending = current->rseq.event.sched_switch & current->rseq.event.has_rseq;

	if (pending)
		__rseq_handle_slowpath(regs);
}

/* Out-of-line worker behind rseq_signal_deliver(). */
void __rseq_signal_deliver(int sig, struct pt_regs *regs);

/*
 * Signal delivery hook: lets rseq fix up based on the register context
 * before the switch to the signal delivery context takes place.
 *
 * @ksig: signal being delivered; only ksig->sig is used
 * @regs: register context passed on to __rseq_signal_deliver()
 *
 * With CONFIG_GENERIC_IRQ_ENTRY the worker runs only when the current task
 * has both 'has_rseq' and 'user_irq' set. Without it, 'has_rseq' alone is
 * sufficient.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
	bool fixup;

	/* The bitwise '&' below is on purpose: it saves one conditional branch */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		fixup = current->rseq.event.has_rseq & current->rseq.event.user_irq;
	else
		fixup = current->rseq.event.has_rseq;

	if (fixup)
		__rseq_signal_deliver(ksig->sig, regs);
}

/*
 * Set the TIF_RSEQ thread flag on @t.
 *
 * @t: task on which the flag is raised; not necessarily the current task
 *
 * NOTE(review): presumably this flag is what makes the exit to user space
 * path of @t look at its pending rseq events - confirm against the
 * consumers of TIF_RSEQ.
 */
static inline void rseq_raise_notify_resume(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_RSEQ);
}

/*
 * Context switch hook which forces the rseq state of @t to be evaluated
 * on exit to user space.
 *
 * @t: task for which the scheduler switch event is recorded
 *
 * Nothing happens unless @t has 'has_rseq' set. With
 * CONFIG_GENERIC_IRQ_ENTRY the event is additionally only recorded when
 * the CPU or MM CID has changed ('ids_changed') or the entry was from
 * user space ('user_irq'). Without it, every invocation for a task which
 * has rseq records the event.
 */
static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
	struct rseq_event *ev = &t->rseq.event;
	bool raise;

	/*
	 * Plain bit logic instead of a boat load of conditionals to work
	 * out whether NOTIFY_RESUME has to be raised.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
	else
		raise = ev->has_rseq;

	if (!raise)
		return;

	ev->sched_switch = true;
	rseq_raise_notify_resume(t);
}

/*
 * Invoked from __set_task_cpu() when a task migrates or from
 * mm_cid_schedin() when the CID changes to enforce an IDs update.
 *
 * This does not raise TIF_NOTIFY_RESUME as that happens in
 * rseq_sched_switch_event().
 *
 * @t: task whose rseq event state gets the 'ids_changed' mark
 *
 * The store is unconditional; there is no check here whether @t has rseq
 * registered.
 */
static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
	t->rseq.event.ids_changed = true;
}

/*
 * Enforce a full update after RSEQ registration and when execve() failed.
 *
 * Does nothing when the current task has 'has_rseq' clear. Otherwise both
 * the 'ids_changed' and the 'sched_switch' event are marked pending and
 * the resume notification is raised for the current task.
 */
static inline void rseq_force_update(void)
{
	if (!current->rseq.event.has_rseq)
		return;

	current->rseq.event.ids_changed = true;
	current->rseq.event.sched_switch = true;
	rseq_raise_notify_resume(current);
}

/*
 * KVM/HYPERV call resume_user_mode_work() before entering guest mode. On
 * architectures which don't use the generic TIF bits, and therefore can't
 * provide a separate TIF_RSEQ flag, that clears TIF_NOTIFY_RESUME.
 *
 * Updating user space RSEQ at that point would be wasted work as it has
 * to be done again before returning to user space. That's why
 * __rseq_handle_slowpath() does nothing when invoked with NULL register
 * state.
 *
 * After returning from guest mode and before exiting to user space,
 * hypervisors must invoke this function to re-raise TIF_NOTIFY_RESUME if
 * necessary.
 */
static inline void rseq_virt_userspace_exit(void)
{
	/*
	 * Deferring RSEQ updates until the next exit is a generic
	 * optimization which relies on a dedicated TIF_RSEQ. Only
	 * configurations without it need the re-raise below.
	 */
	if (IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS))
		return;

	if (current->rseq.event.sched_switch)
		rseq_raise_notify_resume(current);
}

/*
 * Reset the rseq state of @t: zero the complete t->rseq and then set
 * ids.cpu_id to RSEQ_CPU_ID_UNINITIALIZED.
 *
 * @t: task whose rseq state is wiped
 *
 * NOTE(review): only ids.cpu_id is set after the memset(); any other member
 * of rseq.ids keeps the value zero - confirm that zero is a valid "unset"
 * value for those.
 */
static inline void rseq_reset(struct task_struct *t)
{
	memset(&t->rseq, 0, sizeof(t->rseq));
	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}

/*
 * Reset the rseq state of @t via rseq_reset().
 *
 * @t: task whose rseq state is wiped
 *
 * NOTE(review): going by the name this is the execve() hook, so that the
 * new program starts without an rseq registration - confirm against the
 * caller.
 */
static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}

/*
 * Set up the rseq state of a newly created task.
 *
 * @t:           the new task
 * @clone_flags: clone flags of the creating call; only CLONE_VM is checked
 *
 * A clone with CLONE_VM set gets rseq unregistered. Otherwise the child
 * inherits the state of the parent, i.e. a registered restartable
 * sequences area of the parent process stays registered in the child.
 *
 * On fork the IDs (CPU, MMCID) of the parent are kept. That avoids a fault
 * on the COW page on exit to user space when the child stays on the same
 * CPU as the parent. There is no guarantee for that, but in overcommit
 * scenarios it is more likely and it optimizes for the fork/exec case
 * without taking the fault.
 */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
	if (!(clone_flags & CLONE_VM)) {
		/* Plain copy of the rseq state of the parent (current) */
		t->rseq = current->rseq;
		return;
	}

	rseq_reset(t);
}

/*
 * Alignment value which is returned by getauxval(AT_RSEQ_ALIGN) and
 * expected by rseq registration.
 *
 * It is the active rseq area size, taken as the offset of the 'end'
 * member of struct rseq, rounded up to the next power of 2. That
 * guarantees that the rseq structure is always aligned on the nearest
 * power of two which is large enough to contain it, even as it grows.
 *
 * Return: the alignment in bytes
 */
static inline unsigned int rseq_alloc_align(void)
{
	const unsigned int active_size = offsetof(struct rseq, end);

	return 1U << get_count_order(active_size);
}

#else /* CONFIG_RSEQ */
/*
 * CONFIG_RSEQ=n: empty stubs which mirror the signatures of the inline
 * functions in the CONFIG_RSEQ=y branch. All of them are NOPs.
 *
 * NOTE(review): rseq_raise_notify_resume(), rseq_reset() and
 * rseq_alloc_align() have no stub here - presumably they are only used
 * from code which depends on CONFIG_RSEQ; verify before adding callers.
 */
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
#endif  /* !CONFIG_RSEQ */

/*
 * rseq_syscall() has an out-of-line implementation only with
 * CONFIG_DEBUG_RSEQ=y; otherwise it is an empty inline stub.
 *
 * NOTE(review): presumably a debug check which runs on the syscall path -
 * the implementation is not visible from this header, confirm there.
 */
#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else /* CONFIG_DEBUG_RSEQ */
static inline void rseq_syscall(struct pt_regs *regs) { }
#endif /* !CONFIG_DEBUG_RSEQ */

/*
 * Slice extension interface. Out-of-line implementations exist only with
 * CONFIG_RSEQ_SLICE_EXTENSION=y. Without it rseq_syscall_enter_work() is a
 * NOP and rseq_slice_extension_prctl() always fails with -ENOTSUPP.
 *
 * NOTE(review): ENOTSUPP is a kernel internal error code. If the return
 * value of rseq_slice_extension_prctl() is handed to user space
 * unmodified, -EOPNOTSUPP might be the better choice - confirm against
 * the prctl() caller and the documented ABI before changing it.
 */
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
void rseq_syscall_enter_work(long syscall);
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static inline void rseq_syscall_enter_work(long syscall) { }
static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	return -ENOTSUPP;
}
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

#endif /* _LINUX_RSEQ_H */