Contributors: 22
Author Tokens Token Proportion Commits Commit Proportion
Jeff Xu 156 34.67% 2 5.56%
Lorenzo Stoakes 93 20.67% 2 5.56%
Andrew Morton 56 12.44% 5 13.89%
Liam R. Howlett 33 7.33% 3 8.33%
Linus Torvalds (pre-git) 27 6.00% 7 19.44%
Heiko Carstens 11 2.44% 1 2.78%
Dave Hansen 11 2.44% 1 2.78%
Al Viro 9 2.00% 1 2.78%
Rasmus Villemoes 8 1.78% 1 2.78%
Wei Yang 7 1.56% 1 2.78%
Colin Cross 7 1.56% 1 2.78%
Michal Hocko 7 1.56% 1 2.78%
Andrey Konovalov 7 1.56% 1 2.78%
Jakub Matěna 4 0.89% 1 2.78%
Jan Beulich 3 0.67% 1 2.78%
Arnd Bergmann 3 0.67% 1 2.78%
Andrea Arcangeli 2 0.44% 1 2.78%
Richard Henderson 2 0.44% 1 2.78%
Linus Torvalds 1 0.22% 1 2.78%
Hugh Dickins 1 0.22% 1 2.78%
Michel Lespinasse 1 0.22% 1 2.78%
Greg Kroah-Hartman 1 0.22% 1 2.78%
Total 450 36


// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * mseal() disallows an input range which contain unmapped ranges (VMA holes).
 *
 * It disallows unmapped regions from start to end whether they exist at the
 * start, in the middle, or at the end of the range, or any combination thereof.
 *
 * This is because after sealing a range, there's nothing to stop memory mapping
 * of ranges in the remaining gaps later, meaning that the user might then
 * wrongly consider the entirety of the mseal()'d range to be sealed when it
 * in fact isn't.
 */

/*
 * Does the [start, end) range of @mm contain any unmapped memory?
 *
 * We ensure that:
 * - start is part of a valid VMA.
 * - end is part of a valid VMA.
 * - no gap (unallocated memory) exists between start and end.
 *
 * @mm:    address space whose VMAs are walked.
 * @start: first address of the range (inclusive).
 * @end:   end address of the range (exclusive).
 *
 * Returns true if any part of [start, end) is not covered by a VMA, false if
 * the range is fully mapped.
 */
static bool range_contains_unmapped(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long prev_end = start;
	/* Walk the mm we were handed rather than implicitly using current->mm. */
	VMA_ITERATOR(vmi, mm, start);

	for_each_vma_range(vmi, vma, end) {
		/*
		 * A VMA that begins beyond where the previous one ended (or
		 * beyond start, on the first iteration) leaves a hole.
		 */
		if (vma->vm_start > prev_end)
			return true;

		prev_end = vma->vm_end;
	}

	/*
	 * Catches both an unmapped tail and the case where the walk found no
	 * VMA at all (prev_end is still start, which is below end).
	 */
	return prev_end < end;
}

/*
 * Set VM_SEALED on every VMA covering [start, end) in @mm.
 *
 * The caller is expected to have already verified that the range contains no
 * unmapped holes (see the non-NULL assumption below).
 *
 * Returns 0 on success, or the negative error encoded in the pointer returned
 * by vma_modify_flags() if modifying a VMA fails. On failure, VMAs handled in
 * earlier iterations are not rolled back by this function.
 */
static int mseal_apply(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma, *prev;
	/* Start of the sub-range to seal within the VMA being visited. */
	unsigned long curr_start = start;
	VMA_ITERATOR(vmi, mm, start);

	/* We know there are no gaps so this will be non-NULL. */
	vma = vma_iter_load(&vmi);
	prev = vma_prev(&vmi);
	/*
	 * If start lies strictly inside the first VMA, that VMA itself is used
	 * as the initial 'prev' instead of the VMA preceding it.
	 */
	if (start > vma->vm_start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		/* Never seal past the requested end, even if the VMA extends further. */
		unsigned long curr_end = MIN(vma->vm_end, end);

		/* Already-sealed VMAs are left untouched; re-sealing is a no-op. */
		if (!(vma->vm_flags & VM_SEALED)) {
			/*
			 * Request the new flags for [curr_start, curr_end); the
			 * returned VMA replaces our cursor and is the one that
			 * gets flagged below.
			 */
			vma = vma_modify_flags(&vmi, prev, vma,
					curr_start, curr_end,
					vma->vm_flags | VM_SEALED);
			if (IS_ERR(vma))
				return PTR_ERR(vma);
			vm_flags_set(vma, VM_SEALED);
		}

		/* Advance: the next sub-range begins where this one ended. */
		prev = vma;
		curr_start = curr_end;
	}

	return 0;
}

/*
 * mseal(2) seals the VM's meta data from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 *  The address range by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 *  len: len will be page aligned implicitly.
 *
 *   Below VMA operations are blocked after sealing.
 *   1> Unmapping, moving to another location, and shrinking
 *	the size, via munmap() and mremap(), can leave an empty
 *	space, therefore can be replaced with a VMA with a new
 *	set of attributes.
 *   2> Moving or expanding a different vma into the current location,
 *	via mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion, via mremap(), does not appear to pose any
 *	specific risks to sealed VMAs. It is included anyway because
 *	the use case is unclear. In any case, users can rely on
 *	merging to expand a sealed VMA.
 *   5> mprotect and pkey_mprotect.
 *   6> Some destructive madvise() behavior (e.g. MADV_DONTNEED)
 *      for anonymous memory, when users don't have write permission to the
 *	memory. Those behaviors can alter region contents by discarding pages,
 *	effectively a memset(0) for anonymous memory.
 *
 *  flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   Address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) between start and end.
 *  -EPERM:
 *  - In 32 bit architecture, sealing is not supported.
 * Note:
 *  user can call mseal(2) multiple times, adding a seal on an
 *  already sealed memory is a no-action (no error).
 *
 *  unseal() is not supported.
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long end;
	size_t len;
	int ret;

	/* No flags are defined; any set bit is rejected. */
	if (flags)
		return -EINVAL;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	if (!len) {
		/*
		 * A zero aligned length is either a genuinely empty request
		 * (nothing to do) or a huge len_in that wrapped to zero when
		 * rounded up to a page boundary (invalid).
		 */
		return len_in ? -EINVAL : 0;
	}

	/* Reject ranges that wrap around the top of the address space. */
	end = start + len;
	if (end < start)
		return -EINVAL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (range_contains_unmapped(mm, start, end)) {
		/* First pass: the whole range must be mapped, with no holes. */
		ret = -ENOMEM;
	} else {
		/*
		 * Second pass: this should succeed unless vma_modify_flags()
		 * reports an error, e.g. a merge/split failure or the process
		 * reaching the maximum number of supported VMAs. Those cases
		 * should be rare.
		 */
		ret = mseal_apply(mm, start, end);
	}

	mmap_write_unlock(mm);
	return ret;
}

/*
 * mseal(2) system call entry point.
 *
 * Passes the caller-supplied start, len and flags to do_mseal() unchanged and
 * returns its result; all validation and locking happen in do_mseal().
 */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}