cregit-Linux how code gets into the kernel

Release 4.18 fs/aio.c

Directory: fs
/*
 *      An async IO implementation for Linux
 *      Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *      Implements an efficient asynchronous io interface.
 *
 *      Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
 *
 *      See ../COPYING for licensing terms.
 */

#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>

#include <asm/kmap_types.h>
#include <linux/uaccess.h>

#include "internal.h"


/* NOTE(review): KIOCB_KEY is not referenced in the visible part of this file — confirm it is still needed. */
#define KIOCB_KEY		0


/*
 * Stored in aio_ring.magic by aio_setup_ring(); aio_mount() also passes it
 * to mount_pseudo() as the last argument.
 */
#define AIO_RING_MAGIC			0xa10a10a1

/* Stored in aio_ring.compat_features by aio_setup_ring(). */
#define AIO_RING_COMPAT_FEATURES	1

/* Stored in aio_ring.incompat_features by aio_setup_ring(). */
#define AIO_RING_INCOMPAT_FEATURES	0

/*
 * Header stored at the start of the first ring page, followed directly by
 * the io_event slots.  aio_setup_ring() initialises it through a temporary
 * mapping of ctx->ring_pages[0], and the same pages are mmap()ed read/write
 * into the owning process.
 */
struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;	/* Written to by userland or under ring_lock
				 * mutex by aio_read_events_ring(). */
	unsigned	tail;

	unsigned	magic;		/* AIO_RING_MAGIC */
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */

	/*
	 * C99 flexible array member instead of the GNU zero-length array;
	 * sizeof(struct aio_ring) is unchanged and this matches
	 * kioctx_table.table[] elsewhere in this file.
	 */
	struct io_event		io_events[];
}; /* eight unsigned header fields + ring size */


/*
 * Capacity of kioctx.internal_pages[].  aio_setup_ring() only allocates a
 * separate ring_pages array when a ring needs more pages than this.
 */
#define AIO_RING_PAGES	8


/*
 * Per-mm table of io contexts (mm->ioctx_table).  Slots are published with
 * rcu_assign_pointer() while mm->ioctx_lock is held; an empty slot is NULL.
 * See ioctx_add_table() for how the table is populated and grown.
 */
struct kioctx_table {
	struct rcu_head		rcu;	/* used by kfree_rcu() when a table is replaced */
	unsigned		nr;	/* number of entries in table[] */
	struct kioctx __rcu	*table[];
};


/*
 * Per-cpu part of a kioctx (reached through kioctx.cpu): the local share of
 * the request-slot counter that is batched against kioctx.reqs_available.
 */
struct kioctx_cpu {
	unsigned		reqs_available;
};


/*
 * Rendezvous object for "all in-flight requests are done":
 * free_ioctx_reqs() decrements @count and completes @comp when it hits zero.
 */
struct ctx_rq_wait {
	struct completion	comp;
	atomic_t		count;
};


/*
 * One AIO context: the completion ring, its backing pages and file, and the
 * accounting needed to submit, cancel and tear down requests.
 */
struct kioctx {
	/* free_ioctx_users() runs once this has dropped to zero */
	struct percpu_ref	users;
	/* tested by aio_ring_mremap(), which will not touch a dead context */
	atomic_t		dead;

	/* killed by free_ioctx_users(); free_ioctx_reqs() handles its release */
	struct percpu_ref	reqs;

	/* set to mmap_base by aio_setup_ring() and aio_ring_mremap() */
	unsigned long		user_id;

	/* per-cpu data, released with free_percpu() in free_ioctx() */
	struct __percpu kioctx_cpu *cpu;

	/*
	 * Batch size for the percpu reqs_available scheme: how many slots
	 * are moved to or from the global counter in one go.
	 */
	unsigned		req_batch;

	/*
	 * The value userspace handed to io_setup().  It is only charged
	 * against the global max_reqs quota and used for nothing else.
	 *
	 * The effective limit is nr_events - 1, which is larger (see
	 * aio_setup_ring()).
	 */
	unsigned		max_reqs;

	/* Ring buffer size, counted in struct io_event entries */
	unsigned		nr_events;

	/* address and length of the ring mapping made by aio_setup_ring() */
	unsigned long		mmap_base;
	unsigned long		mmap_size;

	/* internal_pages, or a kcalloc()ed array for rings that need more */
	struct page		**ring_pages;
	/* number of entries of ring_pages[] that have been populated */
	long			nr_pages;

	struct rcu_work		free_rwork;	/* see free_ioctx() */

	/*
	 * Signalled once every in-flight request has finished.
	 */
	struct ctx_rq_wait	*rq_wait;

	struct {
		/*
		 * Free slots left in the ring buffer, tracked so the ring
		 * cannot overflow: decremented (if positive) when a kiocb
		 * is allocated, incremented when the resulting io_event is
		 * pulled off the ring.
		 *
		 * Accesses are batched through a percpu counterpart.
		 */
		atomic_t	reqs_available;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t	ctx_lock;
		struct list_head active_reqs;	/* used for cancellation */
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex	ring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned	tail;
		unsigned	completed_events;
		spinlock_t	completion_lock;
	} ____cacheline_aligned_in_smp;

	/* inline storage used as ring_pages for rings of <= AIO_RING_PAGES pages */
	struct page		*internal_pages[AIO_RING_PAGES];
	/* file created by aio_private_file(); cleared by put_aio_ring_file() */
	struct file		*aio_ring_file;

	/* slot index in the mm's kioctx_table, assigned by ioctx_add_table() */
	unsigned		id;
};


/*
 * Per-request state for an fsync-type iocb.  It shares storage with the
 * read/write kiocb through the anonymous union in struct aio_kiocb.
 */
struct fsync_iocb {
	struct work_struct	work;
	struct file		*file;
	/* NOTE(review): presumably selects data-only sync — confirm at the user of this field */
	bool			datasync;
};


/*
 * Kernel-side representation of one submitted iocb.  kiocb_set_cancel_fn()
 * recovers it from the embedded @rw member with container_of().
 */
struct aio_kiocb {
	union {
		struct kiocb		rw;
		struct fsync_iocb	fsync;
	};

	struct kioctx		*ki_ctx;	/* owning context */
	/* installed by kiocb_set_cancel_fn(), called from free_ioctx_users() */
	kiocb_cancel_fn		*ki_cancel;

	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
	__u64			ki_user_data;	/* user's data for completion */

	/* linkage on ki_ctx->active_reqs; the aio core uses it for cancellation */
	struct list_head	ki_list;

	/*
	 * Underlying eventfd context that events are delivered to when the
	 * aio_resfd field of the userspace iocb is not zero.
	 */
	struct eventfd_ctx	*ki_eventfd;
};

/*------ sysctl variables----*/
/* Held by aio_nr_sub() around its update of aio_nr. */
static DEFINE_SPINLOCK(aio_nr_lock);

/* current system wide number of aio requests */
unsigned long aio_nr;		

/* system wide maximum number of aio requests (default 0x10000 = 65536) */
unsigned long aio_max_nr = 0x10000; 
/*----end sysctl variables---*/


/* Slab cache for struct aio_kiocb, created in aio_setup(). */
static struct kmem_cache	*kiocb_cachep;

/* Slab cache for struct kioctx, created in aio_setup(); free_ioctx() returns objects to it. */
static struct kmem_cache	*kioctx_cachep;


/* Internal mount of the "aio" pseudo filesystem, set up in aio_setup(). */
static struct vfsmount *aio_mnt;


/* Forward declarations: both are defined further down but used by aio_private_file(). */
static const struct file_operations aio_ring_fops;

static const struct address_space_operations aio_ctx_aops;


/*
 * aio_private_file
 *	Create the anonymous "[aio]" file that backs the completion ring of
 *	@ctx.  The inode lives on the internal aio mount, its mapping uses
 *	aio_ctx_aops with @ctx as private_data, and its size is set to
 *	@nr_pages pages.
 *
 *	Returns the new file (opened O_RDWR) or an ERR_PTR: the error from
 *	alloc_anon_inode()/alloc_file(), or -ENOMEM if no dentry could be
 *	allocated.
 */
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
	struct qstr name = QSTR_INIT("[aio]", 5);
	struct inode *inode;
	struct path path;
	struct file *ring_file;

	inode = alloc_anon_inode(aio_mnt->mnt_sb);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	inode->i_mapping->a_ops = &aio_ctx_aops;
	inode->i_mapping->private_data = ctx;
	inode->i_size = PAGE_SIZE * nr_pages;

	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &name);
	if (!path.dentry) {
		/* the inode is not attached to anything yet, drop it directly */
		iput(inode);
		return ERR_PTR(-ENOMEM);
	}
	path.mnt = mntget(aio_mnt);
	d_instantiate(path.dentry, inode);

	ring_file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
	if (!IS_ERR(ring_file))
		ring_file->f_flags = O_RDWR;
	else
		path_put(&path);	/* drops the dentry and mount references taken above */

	return ring_file;
}

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise17697.24%150.00%
Dan Carpenter52.76%150.00%
Total181100.00%2100.00%


/*
 * aio_mount
 *	->mount callback of the "aio" pseudo filesystem.  Builds the pseudo
 *	superblock via mount_pseudo() and, on success, marks it
 *	SB_I_NOEXEC.  Returns the root dentry or the ERR_PTR produced by
 *	mount_pseudo().
 */
static struct dentry *aio_mount(struct file_system_type *fs_type,
				int flags, const char *dev_name, void *data)
{
	static const struct dentry_operations ops = {
		.d_dname	= simple_dname,
	};
	struct dentry *root;

	root = mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
	if (IS_ERR(root))
		return root;

	root->d_sb->s_iflags |= SB_I_NOEXEC;
	return root;
}

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise5167.11%133.33%
Jann Horn2431.58%133.33%
Gu Zheng11.32%133.33%
Total76100.00%3100.00%

/* aio_setup
 *	Mounts the internal "aio" filesystem and creates the slab caches
 *	used by the aio routines.  Any failure panics, since this runs
 *	early in the boot sequence.  Always returns 0.
 */
static int __init aio_setup(void)
{
	static struct file_system_type aio_fs = {
		.name		= "aio",
		.mount		= aio_mount,
		.kill_sb	= kill_anon_super,
	};

	aio_mnt = kern_mount(&aio_fs);
	if (IS_ERR(aio_mnt))
		panic("Failed to create aio fs mount.");

	/* SLAB_PANIC: cache creation failure panics instead of returning NULL */
	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC);

	return 0;
}

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise6990.79%240.00%
Andrew Morton45.26%120.00%
Christoph Lameter22.63%120.00%
Christoph Hellwig11.32%120.00%
Total76100.00%5100.00%

/* Register aio_setup() as an initcall. */
__initcall(aio_setup);
/*
 * put_aio_ring_file
 *	Detach @ctx from its ring file and drop the file reference.  The
 *	file is truncated to zero, then, under the mapping's private_lock,
 *	both the mapping's back pointer and ctx->aio_ring_file are cleared.
 *	Does nothing if the context has no ring file.
 */
static void put_aio_ring_file(struct kioctx *ctx)
{
	struct file *ring_file = ctx->aio_ring_file;
	struct address_space *mapping;

	if (!ring_file)
		return;

	truncate_setsize(file_inode(ring_file), 0);

	/* Cut migratepages off from the kioctx from here on */
	mapping = ring_file->f_mapping;
	spin_lock(&mapping->private_lock);
	mapping->private_data = NULL;
	ctx->aio_ring_file = NULL;
	spin_unlock(&mapping->private_lock);

	fput(ring_file);
}

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise5669.14%240.00%
Rasmus Villemoes1214.81%120.00%
Gu Zheng911.11%120.00%
Al Viro44.94%120.00%
Total81100.00%5100.00%


static void aio_free_ring(struct kioctx *ctx) { int i; /* Disconnect the kiotx from the ring file. This prevents future * accesses to the kioctx from page migration. */ put_aio_ring_file(ctx); for (i = 0; i < ctx->nr_pages; i++) { struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, page_count(ctx->ring_pages[i])); page = ctx->ring_pages[i]; if (!page) continue; ctx->ring_pages[i] = NULL; put_page(page); } if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); ctx->ring_pages = NULL; } }

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise8368.60%457.14%
Gu Zheng2419.83%114.29%
Sasha Levin86.61%114.29%
Kent Overstreet64.96%114.29%
Total121100.00%7100.00%


static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); for (i = 0; i < table->nr; i++) { struct kioctx *ctx; ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; res = 0; } break; } } rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; }

Contributors

PersonTokensPropCommitsCommitProp
Pavel Emelyanov11072.37%120.00%
Al Viro2516.45%120.00%
Gu Zheng95.92%120.00%
Oleg Nesterov53.29%120.00%
Tejun Heo31.97%120.00%
Total152100.00%5100.00%

static const struct vm_operations_struct aio_ring_vm_ops = { .mremap = aio_ring_mremap, #if IS_ENABLED(CONFIG_MMU) .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, #endif };
/*
 * aio_ring_mmap
 *	->mmap handler of the ring file: flags the mapping VM_DONTEXPAND and
 *	installs aio_ring_vm_ops.  @file is not used.  Always returns 0.
 */
static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_DONTEXPAND;
	vma->vm_ops = &aio_ring_vm_ops;

	return 0;
}

Contributors

PersonTokensPropCommitsCommitProp
Oleg Nesterov32100.00%1100.00%
Total32100.00%1100.00%

static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, }; #if IS_ENABLED(CONFIG_MIGRATION)
static int aio_migratepage(struct address_space *mapping, struct page *new, struct page *old, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; pgoff_t idx; int rc; /* * We cannot support the _NO_COPY case here, because copy needs to * happen under the ctx->completion_lock. That does not work with the * migration workflow of MIGRATE_SYNC_NO_COPY. */ if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; rc = 0; /* mapping->private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->private_lock); ctx = mapping->private_data; if (!ctx) { rc = -EINVAL; goto out; } /* The ring_lock mutex. The prevents aio_read_events() from writing * to the ring's head, and prevents page migration from mucking in * a partially initialized kiotx. */ if (!mutex_trylock(&ctx->ring_lock)) { rc = -EAGAIN; goto out; } idx = old->index; if (idx < (pgoff_t)ctx->nr_pages) { /* Make sure the old page hasn't already been changed */ if (ctx->ring_pages[idx] != old) rc = -EAGAIN; } else rc = -EINVAL; if (rc != 0) goto out_unlock; /* Writeback must be complete */ BUG_ON(PageWriteback(old)); get_page(new); rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer * while the old page is copied to the new. This prevents new * events from being lost. */ spin_lock_irqsave(&ctx->completion_lock, flags); migrate_page_copy(new, old); BUG_ON(ctx->ring_pages[idx] != old); ctx->ring_pages[idx] = new; spin_unlock_irqrestore(&ctx->completion_lock, flags); /* The old page is no longer accessible. */ put_page(old); out_unlock: mutex_unlock(&ctx->ring_lock); out: spin_unlock(&mapping->private_lock); return rc; }

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise15855.83%360.00%
Gu Zheng11440.28%120.00%
Jérôme Glisse113.89%120.00%
Total283100.00%5100.00%

#endif static const struct address_space_operations aio_ctx_aops = { .set_page_dirty = __set_page_dirty_no_writeback, #if IS_ENABLED(CONFIG_MIGRATION) .migratepage = aio_migratepage, #endif };
static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { struct aio_ring *ring; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages; int i; struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; return -ENOMEM; } ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!ctx->ring_pages) { put_aio_ring_file(ctx); return -ENOMEM; } } for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_mapping, i, GFP_HIGHUSER | __GFP_ZERO); if (!page) break; pr_debug("pid(%d) page[%d]->count=%d\n", current->pid, i, page_count(page)); SetPageUptodate(page); unlock_page(page); ctx->ring_pages[i] = page; } ctx->nr_pages = i; if (unlikely(i != nr_pages)) { aio_free_ring(ctx); return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); if (down_write_killable(&mm->mmap_sem)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -EINTR; } ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 0, &unused, NULL); up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; 
ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); return 0; }

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise23646.00%421.05%
Gu Zheng11221.83%210.53%
Linus Torvalds10019.49%15.26%
Kent Overstreet244.68%315.79%
Michal Hocko203.90%15.26%
Zach Brown91.75%15.26%
Mauricio Faria de Oliveira40.78%15.26%
Michel Lespinasse30.58%210.53%
Mike Rapoport20.39%15.26%
Al Viro20.39%210.53%
Oliver Neukum10.19%15.26%
Total513100.00%19100.00%

/*
 * Ring geometry helpers.  Each macro needs its own line: when all three
 * share one line the first definition swallows the other two.
 */

/* io_event slots that fit in a page made up entirely of events */
#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
/* io_event slots that fit in the first page, after the aio_ring header */
#define AIO_EVENTS_FIRST_PAGE	\
	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
/* slots the header costs the first page compared with a full page */
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
/*
 * kiocb_set_cancel_fn
 *	Make the request that embeds @iocb cancellable: queue it on its
 *	context's active_reqs list and record @cancel as its cancel
 *	callback, both under ctx_lock.  Warns once and does nothing if the
 *	request is already on a list.
 */
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
	struct aio_kiocb *kreq = container_of(iocb, struct aio_kiocb, rw);
	struct kioctx *ctx = kreq->ki_ctx;
	unsigned long irq_flags;

	if (WARN_ON_ONCE(!list_empty(&kreq->ki_list)))
		return;

	spin_lock_irqsave(&ctx->ctx_lock, irq_flags);
	list_add_tail(&kreq->ki_list, &ctx->active_reqs);
	kreq->ki_cancel = cancel;
	spin_unlock_irqrestore(&ctx->ctx_lock, irq_flags);
}

Contributors

PersonTokensPropCommitsCommitProp
Kent Overstreet6466.67%125.00%
Christoph Hellwig3233.33%375.00%
Total96100.00%4100.00%

EXPORT_SYMBOL(kiocb_set_cancel_fn);

/*
 * free_ioctx
 *	Final destruction of a kioctx: releases the ring, the per-cpu data
 *	and both percpu refs, then returns the object to its slab cache.
 *
 *	It has to be RCU delayed to synchronize against the RCU protected
 *	lookup_ioctx(), and it needs process context to call
 *	aio_free_ring() - hence the rcu_work (queued by free_ioctx_reqs()).
 */
static void free_ioctx(struct work_struct *work)
{
	struct rcu_work *rwork = to_rcu_work(work);
	struct kioctx *ctx = container_of(rwork, struct kioctx, free_rwork);

	pr_debug("freeing %p\n", ctx);

	aio_free_ring(ctx);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
}

Contributors

PersonTokensPropCommitsCommitProp
Kent Overstreet3752.11%342.86%
Tejun Heo2028.17%228.57%
Benjamin LaHaise1318.31%114.29%
Jens Axboe11.41%114.29%
Total71100.00%7100.00%


static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); /* At this point we know that there are no any in-flight requests */ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); queue_rcu_work(system_wq, &ctx->free_rwork); }

Contributors

PersonTokensPropCommitsCommitProp
Kent Overstreet3344.59%120.00%
Jens Axboe1520.27%120.00%
Tejun Heo1418.92%240.00%
Anatol Pomozov1216.22%120.00%
Total74100.00%5100.00%

/*
 * free_ioctx_users
 *	Callback for the kioctx "users" percpu ref.  By the time it runs the
 *	kioctx has been removed from the "hash table" and ctx->users has
 *	dropped to 0, so no new kiocbs can be submitted and it is safe to
 *	cancel whatever is still queued.  Afterwards the "reqs" ref is
 *	killed and one reference on it is dropped.
 */
static void free_ioctx_users(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, users);
	struct list_head *pending = &ctx->active_reqs;

	spin_lock_irq(&ctx->ctx_lock);
	while (!list_empty(pending)) {
		struct aio_kiocb *victim;

		victim = list_first_entry(pending, struct aio_kiocb, ki_list);
		victim->ki_cancel(&victim->rw);
		list_del_init(&victim->ki_list);
	}
	spin_unlock_irq(&ctx->ctx_lock);

	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
}

Contributors

PersonTokensPropCommitsCommitProp
Kent Overstreet8477.06%342.86%
Benjamin LaHaise1311.93%114.29%
Christoph Hellwig87.34%228.57%
Al Viro43.67%114.29%
Total109100.00%7100.00%


static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) { unsigned i, new_nr; struct kioctx_table *table, *old; struct aio_ring *ring; spin_lock(&mm->ioctx_lock); table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) for (i = 0; i < table->nr; i++) if (!rcu_access_pointer(table->table[i])) { ctx->id = i; rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, * we are protected from page migration * changes ring_pages by ->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); ring->id = ctx->id; kunmap_atomic(ring); return 0; } new_nr = (table ? table->nr : 1) * 4; spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * new_nr, GFP_KERNEL); if (!table) return -ENOMEM; table->nr = new_nr; spin_lock(&mm->ioctx_lock); old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); } else if (table->nr > old->nr) { memcpy(table->table, old->table, old->nr * sizeof(struct kioctx *)); rcu_assign_pointer(mm->ioctx_table, table); kfree_rcu(old, rcu); } else { kfree(table); table = old; } } }

Contributors

PersonTokensPropCommitsCommitProp
Benjamin LaHaise28494.35%342.86%
Kent Overstreet82.66%228.57%
Tejun Heo72.33%114.29%
Oleg Nesterov20.66%114.29%
Total301100.00%7100.00%


/*
 * aio_nr_sub
 *	Subtract @nr from the system wide aio_nr count under aio_nr_lock.
 *	If the subtraction would wrap below zero, warn and set the count
 *	to 0 instead.
 */
static void aio_nr_sub(unsigned nr)
{
	spin_lock(&aio_nr_lock);
	/* unsigned arithmetic: a result larger than aio_nr means it wrapped */
	aio_nr = WARN_ON(aio_nr - nr > aio_nr) ? 0 : aio_nr - nr;
	spin_unlock(&aio_nr_lock);
}

Contributors

PersonTokensPropCommitsCommitProp
Kent Overstreet41100.00%1100.00%
Total41100.00%1100.00%

/* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */
static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; /* * Store the original nr_events -- what userspace passed to io_setup(), * for counting against the global limit -- before it changes. */ unsigned int max_reqs