Contributors: 4
Author Tokens Token Proportion Commits Commit Proportion
Alexey Dobriyan 2408 80.00% 6 66.67%
Andrii Nakryiko 566 18.80% 1 11.11%
Brian Foster 31 1.03% 1 11.11%
Guo Zhengkui 5 0.17% 1 11.11%
Total 3010 9


/*
 * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
/*
 * Fork and exec tiny 1 page executable which precisely controls its VM.
 * Test /proc/$PID/maps
 * Test /proc/$PID/smaps
 * Test /proc/$PID/smaps_rollup
 * Test /proc/$PID/statm
 *
 * FIXME require CONFIG_TMPFS which can be disabled
 * FIXME test other values from "smaps"
 * FIXME support other archs
 */
#undef NDEBUG
#include <assert.h>
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <linux/kdev_t.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <linux/fs.h>

#include "../kselftest.h"

static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags)
{
	return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags);
}

static void make_private_tmp(void)
{
	if (unshare(CLONE_NEWNS) == -1) {
		if (errno == ENOSYS || errno == EPERM) {
			exit(4);
		}
		exit(1);
	}
	if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
		exit(1);
	}
	if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) {
		exit(1);
	}
}

static pid_t pid = -1;
static void ate(void)
{
	if (pid > 0) {
		kill(pid, SIGTERM);
	}
}

struct elf64_hdr {
	uint8_t e_ident[16];
	uint16_t e_type;
	uint16_t e_machine;
	uint32_t e_version;
	uint64_t e_entry;
	uint64_t e_phoff;
	uint64_t e_shoff;
	uint32_t e_flags;
	uint16_t e_ehsize;
	uint16_t e_phentsize;
	uint16_t e_phnum;
	uint16_t e_shentsize;
	uint16_t e_shnum;
	uint16_t e_shstrndx;
};

struct elf64_phdr {
	uint32_t p_type;
	uint32_t p_flags;
	uint64_t p_offset;
	uint64_t p_vaddr;
	uint64_t p_paddr;
	uint64_t p_filesz;
	uint64_t p_memsz;
	uint64_t p_align;
};

#ifdef __x86_64__
#define PAGE_SIZE 4096
#define VADDR (1UL << 32)
#define MAPS_OFFSET 73

#define syscall	0x0f, 0x05
#define mov_rdi(x)	\
	0x48, 0xbf,	\
	(x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff,	\
	((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff

#define mov_rsi(x)	\
	0x48, 0xbe,	\
	(x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff,	\
	((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff

#define mov_eax(x)	\
	0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff

static const uint8_t payload[] = {
	/* Casually unmap stack, vDSO and everything else. */
	/* munmap */
	mov_rdi(VADDR + 4096),
	mov_rsi((1ULL << 47) - 4096 - VADDR - 4096),
	mov_eax(11),
	syscall,

	/* Ping parent. */
	/* write(0, &c, 1); */
	0x31, 0xff,					/* xor edi, edi */
	0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00,	/* lea rsi, [rip] */
	0xba, 0x01, 0x00, 0x00, 0x00,			/* mov edx, 1 */
	mov_eax(1),
	syscall,

	/* 1: pause(); */
	mov_eax(34),
	syscall,

	0xeb, 0xf7,	/* jmp 1b */
};

static int make_exe(const uint8_t *payload, size_t len)
{
	struct elf64_hdr h;
	struct elf64_phdr ph;

	struct iovec iov[3] = {
		{&h, sizeof(struct elf64_hdr)},
		{&ph, sizeof(struct elf64_phdr)},
		{(void *)payload, len},
	};
	int fd, fd1;
	char buf[64];

	memset(&h, 0, sizeof(h));
	h.e_ident[0] = 0x7f;
	h.e_ident[1] = 'E';
	h.e_ident[2] = 'L';
	h.e_ident[3] = 'F';
	h.e_ident[4] = 2;
	h.e_ident[5] = 1;
	h.e_ident[6] = 1;
	h.e_ident[7] = 0;
	h.e_type = 2;
	h.e_machine = 0x3e;
	h.e_version = 1;
	h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr);
	h.e_phoff = sizeof(struct elf64_hdr);
	h.e_shoff = 0;
	h.e_flags = 0;
	h.e_ehsize = sizeof(struct elf64_hdr);
	h.e_phentsize = sizeof(struct elf64_phdr);
	h.e_phnum = 1;
	h.e_shentsize = 0;
	h.e_shnum = 0;
	h.e_shstrndx = 0;

	memset(&ph, 0, sizeof(ph));
	ph.p_type = 1;
	ph.p_flags = (1<<2)|1;
	ph.p_offset = 0;
	ph.p_vaddr = VADDR;
	ph.p_paddr = 0;
	ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
	ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
	ph.p_align = 4096;

	fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
	if (fd == -1) {
		exit(1);
	}

	if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) {
		exit(1);
	}

	/* Avoid ETXTBSY on exec. */
	snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd);
	fd1 = open(buf, O_RDONLY|O_CLOEXEC);
	close(fd);

	return fd1;
}
#endif

/*
 * 0: vsyscall VMA doesn't exist	vsyscall=none
 * 1: vsyscall VMA is --xp		vsyscall=xonly
 * 2: vsyscall VMA is r-xp		vsyscall=emulate
 */
static volatile int g_vsyscall;
static const char *str_vsyscall;

static const char str_vsyscall_0[] = "";
static const char str_vsyscall_1[] =
"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0                  [vsyscall]\n";
static const char str_vsyscall_2[] =
"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]\n";

#ifdef __x86_64__
static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___)
{
	_exit(g_vsyscall);
}

/*
 * vsyscall page can't be unmapped, probe it directly.
 */
static void vsyscall(void)
{
	pid_t pid;
	int wstatus;

	pid = fork();
	if (pid < 0) {
		fprintf(stderr, "fork, errno %d\n", errno);
		exit(1);
	}
	if (pid == 0) {
		struct rlimit rlim = {0, 0};
		(void)setrlimit(RLIMIT_CORE, &rlim);

		/* Hide "segfault at ffffffffff600000" messages. */
		struct sigaction act;
		memset(&act, 0, sizeof(struct sigaction));
		act.sa_flags = SA_SIGINFO;
		act.sa_sigaction = sigaction_SIGSEGV;
		(void)sigaction(SIGSEGV, &act, NULL);

		g_vsyscall = 0;
		/* gettimeofday(NULL, NULL); */
		uint64_t rax = 0xffffffffff600000;
		asm volatile (
			"call *%[rax]"
			: [rax] "+a" (rax)
			: "D" (NULL), "S" (NULL)
			: "rcx", "r11"
		);

		g_vsyscall = 1;
		*(volatile int *)0xffffffffff600000UL;

		g_vsyscall = 2;
		exit(g_vsyscall);
	}
	waitpid(pid, &wstatus, 0);
	if (WIFEXITED(wstatus)) {
		g_vsyscall = WEXITSTATUS(wstatus);
	} else {
		fprintf(stderr, "error: wstatus %08x\n", wstatus);
		exit(1);
	}
}

int main(void)
{
	int pipefd[2];
	int exec_fd;

	vsyscall();
	switch (g_vsyscall) {
	case 0:
		str_vsyscall = str_vsyscall_0;
		break;
	case 1:
		str_vsyscall = str_vsyscall_1;
		break;
	case 2:
		str_vsyscall = str_vsyscall_2;
		break;
	default:
		abort();
	}

	atexit(ate);

	make_private_tmp();

	/* Reserve fd 0 for 1-byte pipe ping from child. */
	close(0);
	if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) {
		return 1;
	}

	exec_fd = make_exe(payload, sizeof(payload));

	if (pipe(pipefd) == -1) {
		return 1;
	}
	if (dup2(pipefd[1], 0) != 0) {
		return 1;
	}

	pid = fork();
	if (pid == -1) {
		return 1;
	}
	if (pid == 0) {
		sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH);
		return 1;
	}

	char _;
	if (read(pipefd[0], &_, 1) != 1) {
		return 1;
	}

	struct stat st;
	if (fstat(exec_fd, &st) == -1) {
		return 1;
	}

	/* Generate "head -n1 /proc/$PID/maps" */
	char buf0[256];
	memset(buf0, ' ', sizeof(buf0));
	int len = snprintf(buf0, sizeof(buf0),
			"%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu",
			VADDR, VADDR + PAGE_SIZE,
			MAJOR(st.st_dev), MINOR(st.st_dev),
			(unsigned long long)st.st_ino);
	buf0[len] = ' ';
	snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET,
		 "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino);

	/* Test /proc/$PID/maps */
	{
		const size_t len = strlen(buf0) + strlen(str_vsyscall);
		char buf[256];
		ssize_t rv;
		int fd;

		snprintf(buf, sizeof(buf), "/proc/%u/maps", pid);
		fd = open(buf, O_RDONLY);
		if (fd == -1) {
			return 1;
		}
		rv = read(fd, buf, sizeof(buf));
		assert(rv == len);
		assert(memcmp(buf, buf0, strlen(buf0)) == 0);
		if (g_vsyscall > 0) {
			assert(memcmp(buf + strlen(buf0), str_vsyscall, strlen(str_vsyscall)) == 0);
		}
	}

	/* Test /proc/$PID/smaps */
	{
		char buf[4096];
		ssize_t rv;
		int fd;

		snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid);
		fd = open(buf, O_RDONLY);
		if (fd == -1) {
			return 1;
		}
		rv = read(fd, buf, sizeof(buf));
		assert(0 <= rv && rv <= sizeof(buf));

		assert(rv >= strlen(buf0));
		assert(memcmp(buf, buf0, strlen(buf0)) == 0);

#define RSS1 "Rss:                   4 kB\n"
#define RSS2 "Rss:                   0 kB\n"
#define PSS1 "Pss:                   4 kB\n"
#define PSS2 "Pss:                   0 kB\n"
		assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
		       memmem(buf, rv, RSS2, strlen(RSS2)));
		assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
		       memmem(buf, rv, PSS2, strlen(PSS2)));

		static const char *S[] = {
			"Size:                  4 kB\n",
			"KernelPageSize:        4 kB\n",
			"MMUPageSize:           4 kB\n",
			"Anonymous:             0 kB\n",
			"AnonHugePages:         0 kB\n",
			"Shared_Hugetlb:        0 kB\n",
			"Private_Hugetlb:       0 kB\n",
			"Locked:                0 kB\n",
		};
		int i;

		for (i = 0; i < ARRAY_SIZE(S); i++) {
			assert(memmem(buf, rv, S[i], strlen(S[i])));
		}

		if (g_vsyscall > 0) {
			assert(memmem(buf, rv, str_vsyscall, strlen(str_vsyscall)));
		}
	}

	/* Test /proc/$PID/smaps_rollup */
	{
		char bufr[256];
		memset(bufr, ' ', sizeof(bufr));
		len = snprintf(bufr, sizeof(bufr),
				"%08lx-%08lx ---p 00000000 00:00 0",
				VADDR, VADDR + PAGE_SIZE);
		bufr[len] = ' ';
		snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET,
			 "[rollup]\n");

		char buf[1024];
		ssize_t rv;
		int fd;

		snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid);
		fd = open(buf, O_RDONLY);
		if (fd == -1) {
			return 1;
		}
		rv = read(fd, buf, sizeof(buf));
		assert(0 <= rv && rv <= sizeof(buf));

		assert(rv >= strlen(bufr));
		assert(memcmp(buf, bufr, strlen(bufr)) == 0);

		assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
		       memmem(buf, rv, RSS2, strlen(RSS2)));
		assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
		       memmem(buf, rv, PSS2, strlen(PSS2)));

		static const char *S[] = {
			"Anonymous:             0 kB\n",
			"AnonHugePages:         0 kB\n",
			"Shared_Hugetlb:        0 kB\n",
			"Private_Hugetlb:       0 kB\n",
			"Locked:                0 kB\n",
		};
		int i;

		for (i = 0; i < ARRAY_SIZE(S); i++) {
			assert(memmem(buf, rv, S[i], strlen(S[i])));
		}
	}

	/* Test /proc/$PID/statm */
	{
		char buf[64];
		ssize_t rv;
		int fd;

		snprintf(buf, sizeof(buf), "/proc/%u/statm", pid);
		fd = open(buf, O_RDONLY);
		if (fd == -1) {
			return 1;
		}
		rv = read(fd, buf, sizeof(buf));
		assert(rv == 7 * 2);

		assert(buf[0] == '1');	/* ->total_vm */
		assert(buf[1] == ' ');
		assert(buf[2] == '0' || buf[2] == '1');	/* rss */
		assert(buf[3] == ' ');
		assert(buf[4] == '0' || buf[2] == '1');	/* file rss */
		assert(buf[5] == ' ');
		assert(buf[6] == '1');	/* ELF executable segments */
		assert(buf[7] == ' ');
		assert(buf[8] == '0');
		assert(buf[9] == ' ');
		assert(buf[10] == '0');	/* ->data_vm + ->stack_vm */
		assert(buf[11] == ' ');
		assert(buf[12] == '0');
		assert(buf[13] == '\n');
	}

	/* Test PROCMAP_QUERY ioctl() for /proc/$PID/maps */
	{
		char path_buf[256], exp_path_buf[256];
		struct procmap_query q;
		int fd, err;

		snprintf(path_buf, sizeof(path_buf), "/proc/%u/maps", pid);
		fd = open(path_buf, O_RDONLY);
		if (fd == -1)
			return 1;

		/* CASE 1: exact MATCH at VADDR */
		memset(&q, 0, sizeof(q));
		q.size = sizeof(q);
		q.query_addr = VADDR;
		q.query_flags = 0;
		q.vma_name_addr = (__u64)(unsigned long)path_buf;
		q.vma_name_size = sizeof(path_buf);

		err = ioctl(fd, PROCMAP_QUERY, &q);
		assert(err == 0);

		assert(q.query_addr == VADDR);
		assert(q.query_flags == 0);

		assert(q.vma_flags == (PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_EXECUTABLE));
		assert(q.vma_start == VADDR);
		assert(q.vma_end == VADDR + PAGE_SIZE);
		assert(q.vma_page_size == PAGE_SIZE);

		assert(q.vma_offset == 0);
		assert(q.inode == st.st_ino);
		assert(q.dev_major == MAJOR(st.st_dev));
		assert(q.dev_minor == MINOR(st.st_dev));

		snprintf(exp_path_buf, sizeof(exp_path_buf),
			"/tmp/#%llu (deleted)", (unsigned long long)st.st_ino);
		assert(q.vma_name_size == strlen(exp_path_buf) + 1);
		assert(strcmp(path_buf, exp_path_buf) == 0);

		/* CASE 2: NO MATCH at VADDR-1 */
		memset(&q, 0, sizeof(q));
		q.size = sizeof(q);
		q.query_addr = VADDR - 1;
		q.query_flags = 0; /* exact match */

		err = ioctl(fd, PROCMAP_QUERY, &q);
		err = err < 0 ? -errno : 0;
		assert(err == -ENOENT);

		/* CASE 3: MATCH COVERING_OR_NEXT_VMA at VADDR - 1 */
		memset(&q, 0, sizeof(q));
		q.size = sizeof(q);
		q.query_addr = VADDR - 1;
		q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;

		err = ioctl(fd, PROCMAP_QUERY, &q);
		assert(err == 0);

		assert(q.query_addr == VADDR - 1);
		assert(q.query_flags == PROCMAP_QUERY_COVERING_OR_NEXT_VMA);
		assert(q.vma_start == VADDR);
		assert(q.vma_end == VADDR + PAGE_SIZE);

		/* CASE 4: NO MATCH at VADDR + PAGE_SIZE */
		memset(&q, 0, sizeof(q));
		q.size = sizeof(q);
		q.query_addr = VADDR + PAGE_SIZE; /* point right after the VMA */
		q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;

		err = ioctl(fd, PROCMAP_QUERY, &q);
		err = err < 0 ? -errno : 0;
		assert(err == -ENOENT);

		/* CASE 5: NO MATCH WRITABLE at VADDR */
		memset(&q, 0, sizeof(q));
		q.size = sizeof(q);
		q.query_addr = VADDR;
		q.query_flags = PROCMAP_QUERY_VMA_WRITABLE;

		err = ioctl(fd, PROCMAP_QUERY, &q);
		err = err < 0 ? -errno : 0;
		assert(err == -ENOENT);
	}

	return 0;
}
#else
int main(void)
{
	return 4;
}
#endif