Contributors: 5
Author Tokens Token Proportion Commits Commit Proportion
Benjamin Berg 731 97.21% 3 30.00%
Jeff Dike 14 1.86% 3 30.00%
Bodo Stroesser 5 0.66% 2 20.00%
Alex Dewar 1 0.13% 1 10.00%
Al Viro 1 0.13% 1 10.00%
Total 752 10


// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
 */

#include <sysdep/stub.h>

#include <linux/futex.h>
#include <sys/socket.h>
#include <errno.h>

/*
 * Known security issues
 *
 * Userspace can jump to this address to execute *any* syscall that is
 * permitted by the stub. As we will return afterwards, it can do
 * whatever it likes, including:
 * - Tricking the kernel into handing out the memory FD
 * - Using this memory FD to read/write all physical memory
 * - Running in parallel to the kernel processing a syscall
 *   (possibly creating data races?)
 * - Blocking e.g. SIGALRM to avoid time based scheduling
 *
 * To avoid this, the permitted location for each syscall needs to be
 * checked for in the SECCOMP filter (which is reasonably simple). Also,
 * more care will need to go into considering how the code might be
 * tricked by using a prepared stack (or even by modifying the stack from
 * another thread in case SMP support is added).
 *
 * As for the SIGALRM, the best countermeasure will be to check in the
 * kernel that the process is reporting back the SIGALRM in a timely
 * fashion.
 */
/*
 * Execute the requests queued in the stub data, in order.
 *
 * @fd_map: optional table used to translate the FD value stored in an
 *          mmap request into the FD passed to the syscall; when NULL the
 *          stored value is used as is.
 *
 * Processing stops at the first request that fails or has an unknown
 * type: the syscall result (or -95 for an unknown type) is stored in
 * ->err, ->syscall_data_len is set to the index of that request and -1
 * is returned. If every request succeeds, ->err and ->syscall_data_len
 * are both set to 0 and 0 is returned.
 */
static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
{
	struct stub_data *data = get_stub_data();
	unsigned long ret;
	int idx;

	for (idx = 0; idx < data->syscall_data_len; idx++) {
		struct stub_syscall *req = &data->syscall_data[idx];

		if (req->syscall == STUB_SYSCALL_MMAP) {
			int map_fd = fd_map ? fd_map[req->mem.fd] : req->mem.fd;

			ret = stub_syscall6(STUB_MMAP_NR,
					    req->mem.addr, req->mem.length,
					    req->mem.prot,
					    MAP_SHARED | MAP_FIXED,
					    map_fd, req->mem.offset);
			/* Success means the mapping is at the requested address. */
			if (ret == req->mem.addr)
				continue;
		} else if (req->syscall == STUB_SYSCALL_MUNMAP) {
			ret = stub_syscall2(__NR_munmap,
					    req->mem.addr, req->mem.length);
			/* munmap reports success with a zero result. */
			if (!ret)
				continue;
		} else {
			/* Unknown request type. */
			data->err = -95; /* EOPNOTSUPP */
			data->syscall_data_len = idx;
			return -1;
		}

		/* The syscall failed: report its result and where we stopped. */
		data->err = ret;
		data->syscall_data_len = idx;
		return -1;
	}

	data->err = 0;
	data->syscall_data_len = 0;

	return 0;
}

/*
 * Stub entry point: run the queued requests without any FD translation
 * (NULL fd_map) and then trap.
 */
void __section(".__syscall_stub")
stub_syscall_handler(void)
{
	/*
	 * The return value is not needed here; syscall_handler() records the
	 * outcome in the stub data.
	 */
	(void)syscall_handler(NULL);

	trap_myself();
}

/*
 * Three-argument signal handler living in the stub section.
 *
 * @sig:  signal number, stored in the stub data
 * @info: siginfo pointer; only its offset from d->sigstack is recorded
 * @p:    ucontext pointer; only the offset of its uc_mcontext from
 *        d->sigstack is recorded
 *
 * After recording the signal, the handler sets d->futex to FUTEX_IN_KERN,
 * issues a FUTEX_WAKE on it and then waits until the value changes
 * (presumably the UML kernel on the other side -- confirm against the
 * host-side code). Once woken, if requests are queued it receives a
 * message on FD 0 that may carry file descriptors, runs the requests with
 * those FDs as the translation table, and closes the received FDs again.
 * If running the requests failed, or d->restart_wait is set, SIGSYS is
 * reported and the wait is repeated. Otherwise arch specific state is
 * restored and the handler returns.
 */
void __section(".__syscall_stub")
stub_signal_interrupt(int sig, siginfo_t *info, void *p)
{
	struct stub_data *d = get_stub_data();
	char rcv_data;
	/* Control buffer sized for STUB_MAX_FDS FDs, aligned for a cmsghdr. */
	union {
		char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
		struct cmsghdr align;
	} ctrl = {};
	struct iovec iov = {
		.iov_base = &rcv_data,
		.iov_len = 1,
	};
	struct msghdr msghdr = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = &ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	ucontext_t *uc = p;
	struct cmsghdr *fd_msg;
	int *fd_map;
	int num_fds;
	long res;

	d->signal = sig;
	d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0];
	d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0];

restart_wait:
	d->futex = FUTEX_IN_KERN;
	do {
		res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
				    FUTEX_WAKE, 1);
	} while (res == -EINTR);

	/* Keep waiting for as long as the futex still reads FUTEX_IN_KERN. */
	do {
		res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
				    FUTEX_WAIT, FUTEX_IN_KERN, 0);
	} while (res == -EINTR || d->futex == FUTEX_IN_KERN);

	if (res < 0 && res != -EAGAIN)
		stub_syscall1(__NR_exit_group, 1);

	if (d->syscall_data_len) {
		/*
		 * recvmsg() overwrites msg_controllen with the length of the
		 * control data it returned. Reset it so that a pass after
		 * "goto restart_wait" offers the whole buffer again instead
		 * of whatever size the previous message happened to use.
		 */
		msghdr.msg_controllen = sizeof(ctrl);

		/* Read passed FDs (if any) */
		do {
			res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
		} while (res == -EINTR);

		/* We should never have a receive error (other than -EAGAIN) */
		if (res < 0 && res != -EAGAIN)
			stub_syscall1(__NR_exit_group, 1);

		/* Receive the FDs */
		num_fds = 0;
		fd_msg = msghdr.msg_control;
		/*
		 * CMSG_DATA() already evaluates to the start of the payload;
		 * taking its address only compiles when the libc happens to
		 * define it as an lvalue.
		 */
		fd_map = (void *)CMSG_DATA(fd_msg);
		if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
			num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);

		/* Try running queued syscalls. */
		res = syscall_handler(fd_map);

		/* The received FDs are only needed while the requests run. */
		while (num_fds)
			stub_syscall2(__NR_close, fd_map[--num_fds], 0);
	} else {
		res = 0;
	}

	if (res < 0 || d->restart_wait) {
		/* Report SIGSYS if we restart. */
		d->signal = SIGSYS;
		d->restart_wait = 0;

		goto restart_wait;
	}

	/* Restore arch dependent state that is not part of the mcontext */
	stub_seccomp_restore_state(&d->arch_data);

	/* Return so that the host modified mcontext is restored. */
}

/*
 * Signal restorer placed in the stub section: it does nothing but issue
 * the rt_sigreturn syscall.
 */
void __section(".__syscall_stub")
stub_signal_restorer(void)
{
	/* We must not have anything on the stack when doing rt_sigreturn */
	stub_syscall0(__NR_rt_sigreturn);
}