cregit-Linux how code gets into the kernel

Release 4.18 mm/memfd.c

Directory: mm
/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <uapi/linux/memfd.h>

/*
 * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */

#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE

#define LAST_SCAN               4       
/* about 150ms max */


static void memfd_tag_pins(struct address_space *mapping) { struct radix_tree_iter iter; void __rcu **slot; pgoff_t start; struct page *page; lru_add_drain(); start = 0; rcu_read_lock(); radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { slot = radix_tree_iter_retry(&iter); continue; } } else if (page_count(page) - page_mapcount(page) > 1) { xa_lock_irq(&mapping->i_pages); radix_tree_tag_set(&mapping->i_pages, iter.index, MEMFD_TAG_PINNED); xa_unlock_irq(&mapping->i_pages); } if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); } } rcu_read_unlock(); }

Contributors

PersonTokensPropCommitsCommitProp
Mike Kravetz158100.00%1100.00%
Total158100.00%1100.00%

/* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references * to those pages to avoid races. */
static int memfd_wait_for_pins(struct address_space *mapping) { struct radix_tree_iter iter; void __rcu **slot; pgoff_t start; struct page *page; int error, scan; memfd_tag_pins(mapping); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) break; if (!scan) lru_add_drain_all(); else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; start = 0; rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, MEMFD_TAG_PINNED) { page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { slot = radix_tree_iter_retry(&iter); continue; } page = NULL; } if (page && page_count(page) - page_mapcount(page) != 1) { if (scan < LAST_SCAN) goto continue_resched; /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found pages pinned. */ error = -EBUSY; } xa_lock_irq(&mapping->i_pages); radix_tree_tag_clear(&mapping->i_pages, iter.index, MEMFD_TAG_PINNED); xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); } } rcu_read_unlock(); } return error; }

Contributors

PersonTokensPropCommitsCommitProp
Mike Kravetz248100.00%1100.00%
Total248100.00%1100.00%


static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif return NULL; }

Contributors

PersonTokensPropCommitsCommitProp
Mike Kravetz59100.00%1100.00%
Total59100.00%1100.00%

#define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE)
static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; int error; /* * SEALING * Sealing allows multiple parties to share a tmpfs or hugetlbfs file * but restrict access to a specific subset of file operations. Seals * can only be added, but never removed. This way, mutually untrusted * parties can share common memory regions with a well-defined policy. * A malicious peer can thus never perform unwanted operations on a * shared object. * * Seals are only supported on special tmpfs or hugetlbfs files and * always affect the whole underlying inode. Once a seal is set, it * may prevent some kinds of access to the file. Currently, the * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file * only adds a given set of seals to the file, it never touches * existing seals. Furthermore, the "setting seals"-operation can be * sealed itself, which basically prevents any further seal from being * added. * * Semantics of sealing are only defined on volatile files. Only * anonymous tmpfs and hugetlbfs files support sealing. More * importantly, seals are never written to disk. Therefore, there's * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; inode_lock(inode); file_seals = memfd_file_seals_ptr(file); if (!file_seals) { error = -EINVAL; goto unlock; } if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; } } *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; }

Contributors

PersonTokensPropCommitsCommitProp
Mike Kravetz186100.00%1100.00%
Total186100.00%1100.00%


static int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); return seals ? *seals : -EINVAL; }

Contributors

PersonTokensPropCommitsCommitProp
Mike Kravetz30100.00%1100.00%
Total30100.00%1100.00%


long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { long error; switch (cmd) { case F_ADD_SEALS: /* disallow upper 32bit */ if (arg > UINT_MAX) return -EINVAL; error = memfd_add_seals(file, arg); break; case F_GET_SEALS: error = memfd_get_seals(file); break; default: error = -EINVAL; break; } return error; }

Contributors

PersonTokensPropCommitsCommitProp
Mike Kravetz72100.00%1100.00%
Total72100.00%1100.00%

#define MFD_NAME_PREFIX "memfd:" #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { unsigned int *file_seals; struct file *file; int fd, error; char *name; long len; if (!(flags & MFD_HUGETLB)) { if (flags & ~(unsigned int)MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. */ if (flags & ~(unsigned int)(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) return -EFAULT; if (len > MFD_NAME_MAX_LEN + 1) return -EINVAL; name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); if (!name) return -ENOMEM; strcpy(name, MFD_NAME_PREFIX); if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { error = -EFAULT; goto err_name; } /* terminating-zero may have changed after strnlen_user() returned */ if (name[len + MFD_NAME_PREFIX_LEN - 1]) { error = -EFAULT; goto err_name; } fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); if (fd < 0) { error = fd; goto err_name; } if (flags & MFD_HUGETLB) { struct user_struct *user = NULL; file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else file = shmem_file_setup(name, 0, VM_NORESERVE); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; } file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_RDWR | O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { file_seals = memfd_file_seals_ptr(file); *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); kfree(name); return fd; err_fd: put_unused_fd(fd); err_name: kfree(name); return error; }

Overall Contributors

PersonTokensPropCommitsCommitProp
Mike Kravetz1189100.00%1100.00%
Total1189100.00%1100.00%
Directory: mm
Information contained on this website is for historical information purposes only and does not indicate or represent copyright ownership.
Created with cregit.