cregit-Linux how code gets into the kernel

Release 4.14 arch/x86/entry/common.c

Directory: arch/x86/entry
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 * GPL v2
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>


#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

#ifdef CONFIG_CONTEXT_TRACKING
/*
 * Entry hook for transitions from user mode; the caller still has IRQs off.
 * Warns through CT_WARN_ON() when context tracking does not report
 * CONTEXT_USER, then calls user_exit_irqoff().
 */
__visible inline void enter_from_user_mode(void)
{
	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 18 | 90.00% | 1 | 33.33%
Paolo Bonzini | 2 | 10.00% | 2 | 66.67%
Total | 20 | 100.00% | 3 | 100.00%

#else
/* Fallback used when CONFIG_CONTEXT_TRACKING is off: nothing to do. */
static inline void enter_from_user_mode(void)
{
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 8 | 100.00% | 1 | 100.00%
Total | 8 | 100.00% | 1 | 100.00%

#endif
/*
 * Hand the syscall number and its first four arguments to
 * audit_syscall_entry(), picking the argument registers that match @arch.
 *
 * @regs: register state of the syscall being entered
 * @arch: AUDIT_ARCH_* value.  On CONFIG_X86_64 builds AUDIT_ARCH_X86_64
 *        selects di/si/dx/r10; every other case uses bx/cx/dx/si.
 */
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
		return;
	}
#endif
	audit_syscall_entry(regs->orig_ax, regs->bx,
			    regs->cx, regs->dx, regs->si);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 76 | 100.00% | 1 | 100.00%
Total | 76 | 100.00% | 1 | 100.00%

/* * Returns the syscall nr to run (which should match regs->orig_ax) or -1 * to skip the syscall. */
static long syscall_trace_enter(struct pt_regs *regs) { u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; struct thread_info *ti = current_thread_info(); unsigned long ret = 0; bool emulated = false; u32 work; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; if (unlikely(work & _TIF_SYSCALL_EMU)) emulated = true; if ((emulated || (work & _TIF_SYSCALL_TRACE)) && tracehook_report_syscall_entry(regs)) return -1L; if (emulated) return -1L; #ifdef CONFIG_SECCOMP /* * Do seccomp after ptrace, to catch any tracer changes. */ if (work & _TIF_SECCOMP) { struct seccomp_data sd; sd.arch = arch; sd.nr = regs->orig_ax; sd.instruction_pointer = regs->ip; #ifdef CONFIG_X86_64 if (arch == AUDIT_ARCH_X86_64) { sd.args[0] = regs->di; sd.args[1] = regs->si; sd.args[2] = regs->dx; sd.args[3] = regs->r10; sd.args[4] = regs->r8; sd.args[5] = regs->r9; } else #endif { sd.args[0] = regs->bx; sd.args[1] = regs->cx; sd.args[2] = regs->dx; sd.args[3] = regs->si; sd.args[4] = regs->di; sd.args[5] = regs->bp; } ret = __secure_computing(&sd); if (ret == -1) return ret; } #endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); do_audit_syscall_entry(regs, arch); return ret ?: regs->orig_ax; }

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 303 | 85.84% | 5 | 71.43%
Kees Cook | 48 | 13.60% | 1 | 14.29%
Linus Torvalds | 2 | 0.57% | 1 | 14.29%
Total | 353 | 100.00% | 7 | 100.00%

#define EXIT_TO_USERMODE_LOOP_FLAGS				\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)

/*
 * Service the pending work named by @cached_flags, then keep re-reading
 * the thread flags and servicing them until none of
 * EXIT_TO_USERMODE_LOOP_FLAGS is set.
 *
 * @regs:         register state handed to the individual work handlers
 * @cached_flags: thread-flags snapshot taken by the caller
 *
 * Each pass enables IRQs, does the work, and disables IRQs again before
 * re-sampling the flags, so the function returns with IRQs disabled.
 */
static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
	/*
	 * In order to return to user mode, we need to have IRQs off with
	 * none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
	 * can be set at any time on preemptable kernels if we have IRQs on,
	 * so we need to loop.  Disabling preemption wouldn't help: doing the
	 * work to clear some of the flags can sleep.
	 */
	do {
		/* There is work to do; it runs with IRQs on. */
		local_irq_enable();

		if (cached_flags & _TIF_NEED_RESCHED)
			schedule();

		if (cached_flags & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		/* Deliver pending signals. */
		if (cached_flags & _TIF_SIGPENDING)
			do_signal(regs);

		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
		}

		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
			fire_user_return_notifiers();

		if (cached_flags & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		/* IRQs off again, then take a fresh look at the flags. */
		local_irq_disable();

		cached_flags = READ_ONCE(current_thread_info()->flags);
	} while (cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 105 | 88.24% | 2 | 50.00%
Josh Poimboeuf | 12 | 10.08% | 1 | 25.00%
Linus Torvalds | 2 | 1.68% | 1 | 25.00%
Total | 119 | 100.00% | 4 | 100.00%

/*
 * Last C-level step before returning to user mode.
 *
 * Called with IRQs disabled.  Runs exit_to_usermode_loop() when any
 * EXIT_TO_USERMODE_LOOP_FLAGS bit is set in the thread flags, clears the
 * compat status bits under CONFIG_COMPAT, and finishes with
 * user_enter_irqoff().
 *
 * @regs: register state passed on to exit_to_usermode_loop()
 */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;

	addr_limit_user_check();

	/* With lock proving enabled, complain and recover if IRQs are on. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
		local_irq_disable();

	lockdep_sys_exit();

	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
	 * returning to user mode.  We need to clear it *after* signal
	 * handling, because syscall restart has a fixup for compat
	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
	 * selftest.
	 *
	 * We also need to clear TS_I386_REGS_POKED: the 32-bit tracer
	 * special case only applies after poking regs and before the
	 * very next return to user mode.
	 */
	current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif

	user_enter_irqoff();
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 87 | 93.55% | 5 | 62.50%
Thomas Garnier | 3 | 3.23% | 1 | 12.50%
Linus Torvalds | 2 | 2.15% | 1 | 12.50%
Paolo Bonzini | 1 | 1.08% | 1 | 12.50%
Total | 93 | 100.00% | 8 | 100.00%

#define SYSCALL_EXIT_WORK_FLAGS				\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

/*
 * One-time syscall-exit work: audit, the sys_exit tracepoint and the
 * ptrace syscall-exit report.
 *
 * @regs:         register state of the finished syscall (regs->ax is
 *                passed to the tracepoint)
 * @cached_flags: thread-flags snapshot taken by the caller
 */
static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
	const u32 step_bits =
		cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU);
	bool step;

	audit_syscall_exit(regs);

	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, regs->ax);

	/*
	 * If TIF_SYSCALL_EMU is set, we only get here because of
	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
	 * We already reported this syscall instruction in
	 * syscall_trace_enter().
	 */
	step = unlikely(step_bits == _TIF_SINGLESTEP);

	if (step || (cached_flags & _TIF_SYSCALL_TRACE))
		tracehook_report_syscall_exit(regs, step);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 70 | 100.00% | 1 | 100.00%
Total | 70 | 100.00% | 1 | 100.00%

/* * Called with IRQs on and fully valid regs. Returns with IRQs off in a * state such that we can immediately switch to user mode. */
__visible inline void syscall_return_slowpath(struct pt_regs *regs) { struct thread_info *ti = current_thread_info(); u32 cached_flags = READ_ONCE(ti->flags); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) local_irq_enable(); /* * First do one-time work. If these work items are enabled, we * want to run them exactly once per syscall exit with IRQs on. */ if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) syscall_slow_exit_work(regs, cached_flags); local_irq_disable(); prepare_exit_to_usermode(regs); }

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 83 | 97.65% | 3 | 75.00%
Linus Torvalds | 2 | 2.35% | 1 | 25.00%
Total | 85 | 100.00% | 4 | 100.00%

#ifdef CONFIG_X86_64
__visible void do_syscall_64(struct pt_regs *regs) { struct thread_info *ti = current_thread_info(); unsigned long nr = regs->orig_ax; enter_from_user_mode(); local_irq_enable(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); /* * NB: Native and x32 syscalls are dispatched from the same * table. The only functional difference is the x32 bit in * regs->orig_ax, which changes the behavior of some syscalls. */ if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { regs->ax = sys_call_table[nr & __SYSCALL_MASK]( regs->di, regs->si, regs->dx, regs->r10, regs->r8, regs->r9); } syscall_return_slowpath(regs); }

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 106 | 98.15% | 2 | 66.67%
Linus Torvalds | 2 | 1.85% | 1 | 33.33%
Total | 108 | 100.00% | 3 | 100.00%

#endif #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does * all entry and exit work and returns with IRQs off. This function is * extremely hot in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */
/*
 * @regs: user register state; orig_ax selects the syscall and
 *        bx, cx, dx, si, di, bp carry its six arguments.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
	/* Mark the task as being inside a compat syscall. */
	current->thread.status |= TS_COMPAT;
#endif

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
		/*
		 * Subtlety here: if ptrace pokes something larger than
		 * 2^32-1 into orig_ax, this truncates it.  This may or
		 * may not be necessary, but it matches the old asm
		 * behavior.
		 *
		 * A -1 ("skip") result becomes UINT_MAX in nr, which
		 * presumably fails the range check below -- TODO confirm
		 * against IA32_NR_syscalls.
		 */
		nr = syscall_trace_enter(regs);
	}

	/*
	 * NOTE(review): nr is range-checked but not clamped against
	 * speculative execution; later kernels use array_index_nospec()
	 * here -- confirm whether this tree needs the same hardening.
	 */
	if (likely(nr < IA32_NR_syscalls)) {
		/*
		 * It's possible that a 32-bit syscall implementation
		 * takes a 64-bit parameter but nonetheless assumes that
		 * the high bits are zero.  Make sure we zero-extend all
		 * of the args.
		 */
		regs->ax = ia32_sys_call_table[nr](
			(unsigned int)regs->bx, (unsigned int)regs->cx,
			(unsigned int)regs->dx, (unsigned int)regs->si,
			(unsigned int)regs->di, (unsigned int)regs->bp);
	}

	/* Exit work; per its contract this returns with IRQs off. */
	syscall_return_slowpath(regs);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 139 | 98.58% | 5 | 83.33%
Linus Torvalds | 2 | 1.42% | 1 | 16.67%
Total | 141 | 100.00% | 6 | 100.00%

/*
 * Handles int $0x80: performs the user-mode entry bookkeeping, turns IRQs
 * on and runs the common 32-bit syscall path.
 *
 * @regs: user register state of the calling task
 */
__visible void do_int80_syscall_32(struct pt_regs *regs)
{
	enter_from_user_mode();
	local_irq_enable();

	do_syscall_32_irqs_on(regs);
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 22 | 100.00% | 3 | 100.00%
Total | 22 | 100.00% | 3 | 100.00%

/*
 * C entry point for the vDSO fast 32-bit syscall path (SYSENTER/SYSCALL32).
 *
 * Rewrites regs->ip to the vDSO int80 landing pad, loads the user's EBP
 * from the address in regs->sp, and then runs the common 32-bit syscall
 * path.  If the EBP load faults, regs->ax is set to -EFAULT and the
 * syscall is not run.
 *
 * @regs: user register state of the calling task
 *
 * Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL.
 */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */

	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
		vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	enter_from_user_mode();

	local_irq_enable();

	/* Fetch EBP from where the vDSO stashed it. */
	if (
#ifdef CONFIG_X86_64
		/*
		 * Micro-optimization: the pointer we're following is explicitly
		 * 32 bits, so it can't be out of range.
		 */
		__get_user(*(u32 *)&regs->bp,
			    (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
		get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
		) {

		/* User code screwed up. */
		local_irq_disable();
		regs->ax = -EFAULT;
		prepare_exit_to_usermode(regs);
		return 0;	/* Keep it simple: use IRET. */
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * NOTE(review): no ECX fixup is visible in this function; presumably
	 * the comment refers to the asm entry code -- confirm.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * NOTE(review): no ECX fixup is visible in this function; presumably
	 * the comment refers to the asm entry code -- confirm.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 202 | 100.00% | 7 | 100.00%
Total | 202 | 100.00% | 7 | 100.00%

#endif

Overall Contributors

Person | Tokens | Prop | Commits | CommitProp
Andrew Lutomirski | 1320 | 93.48% | 23 | 71.88%
Kees Cook | 48 | 3.40% | 1 | 3.12%
Josh Poimboeuf | 16 | 1.13% | 1 | 3.12%
Linus Torvalds | 13 | 0.92% | 2 | 6.25%
Thomas Garnier | 6 | 0.42% | 1 | 3.12%
Paolo Bonzini | 3 | 0.21% | 2 | 6.25%
Borislav Petkov | 3 | 0.21% | 1 | 3.12%
Ingo Molnar | 3 | 0.21% | 1 | 3.12%
Total | 1412 | 100.00% | 32 | 100.00%
Directory: arch/x86/entry
Information contained on this website is for historical information purposes only and does not indicate or represent copyright ownership.
Created with cregit.