cregit-Linux how code gets into the kernel

Release 4.7 fs/splice.c

Directory: fs
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
#include <linux/compat.h>
#include "internal.h"

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */

static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; struct address_space *mapping; lock_page(page); mapping = page_mapping(page); if (mapping) { WARN_ON(!PageUptodate(page)); /* * At least for ext2 with nobh option, we need to wait on * writeback completing on this page, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the * page, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. */ wait_on_page_writeback(page); if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ if (remove_mapping(mapping, page)) { buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } } /* * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ out_unlock: unlock_page(page); return 1; }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe11198.23%880.00%
nick pigginnick piggin10.88%110.00%
david howellsdavid howells10.88%110.00%
Total113100.00%10100.00%


static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe2996.67%375.00%
kirill a. shutemovkirill a. shutemov13.33%125.00%
Total30100.00%4100.00%

/* * Check whether the contents of buf is OK to access. Since the content * is a page cache page, IO may be in flight. */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; int err; if (!PageUptodate(page)) { lock_page(page); /* * Page got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ if (!page->mapping) { err = -ENODATA; goto error; } /* * Uh oh, read-error from disk. */ if (!PageUptodate(page)) { err = -EIO; goto error; } /* * Page is ok afterall, we are done. */ unlock_page(page); } return 0; error: unlock_page(page); return err; }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe9797.98%583.33%
ingo molnaringo molnar22.02%116.67%
Total99100.00%6100.00%

const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, .get = generic_pipe_buf_get, };
static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return 1; buf->flags |= PIPE_BUF_FLAG_LRU; return generic_pipe_buf_steal(pipe, buf); }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe44100.00%4100.00%
Total44100.00%4100.00%

static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, .get = generic_pipe_buf_get, };
static void wakeup_pipe_readers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->wait)) wake_up_interruptible(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); }

Contributors

PersonTokensPropCommitsCommitProp
namhyung kimnamhyung kim44100.00%1100.00%
Total44100.00%1100.00%

/** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill * @spd: data to fill * * Description: * @spd contains a map of pages and len/offset tuples, along with * the struct pipe_buf_operations associated with these pages. This * function will link that data to the pipe. * */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; if (!spd_pages) return 0; ret = 0; do_wakeup = 0; page_nr = 0; pipe_lock(pipe); for (;;) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } if (pipe->nrbufs < pipe->buffers) { int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; if (spd->flags & SPLICE_F_GIFT) buf->flags |= PIPE_BUF_FLAG_GIFT; pipe->nrbufs++; page_nr++; ret += buf->len; if (pipe->files) do_wakeup = 1; if (!--spd->nr_pages) break; if (pipe->nrbufs < pipe->buffers) continue; break; } if (spd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } if (do_wakeup) { smp_mb(); if (waitqueue_active(&pipe->wait)) wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } pipe->waiting_writers++; pipe_wait(pipe); pipe->waiting_writers--; } pipe_unlock(pipe); if (do_wakeup) wakeup_pipe_readers(pipe); while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); return ret; }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe32183.59%1062.50%
ingo molnaringo molnar328.33%16.25%
linus torvaldslinus torvalds194.95%16.25%
rabin vincentrabin vincent82.08%16.25%
miklos szeredimiklos szeredi20.52%16.25%
namhyung kimnamhyung kim10.26%16.25%
al viroal viro10.26%16.25%
Total384100.00%16100.00%

EXPORT_SYMBOL_GPL(splice_to_pipe);
void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { put_page(spd->pages[i]); }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe2395.83%150.00%
kirill a. shutemovkirill a. shutemov14.17%150.00%
Total24100.00%2100.00%

/* * Check if we need to grow the arrays holding pages and partial page * descriptions. */
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int buffers = ACCESS_ONCE(pipe->buffers); spd->nr_pages_max = buffers; if (buffers <= PIPE_DEF_BUFFERS) return 0; spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) return 0; kfree(spd->pages); kfree(spd->partial); return -ENOMEM; }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe9083.33%150.00%
eric dumazeteric dumazet1816.67%150.00%
Total108100.00%2100.00%


void splice_shrink_spd(struct splice_pipe_desc *spd) { if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); kfree(spd->partial); }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe3193.94%150.00%
eric dumazeteric dumazet26.06%150.00%
Total33100.00%2100.00%


static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; int error, page_nr; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, }; if (splice_grow_spd(pipe, &spd)) return -ENOMEM; index = *ppos >> PAGE_SHIFT; loff = *ppos & ~PAGE_MASK; req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = min(req_pages, spd.nr_pages_max); /* * Lookup the (hopefully) full range of pages we need. */ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* * If find_get_pages_contig() returned fewer pages than we needed, * readahead/allocate the rest and fill in the holes. */ if (spd.nr_pages < nr_pages) page_cache_sync_readahead(mapping, &in->f_ra, in, index, req_pages - spd.nr_pages); error = 0; while (spd.nr_pages < nr_pages) { /* * Page could be there, find_get_pages_contig() breaks on * the first hole. */ page = find_get_page(mapping, index); if (!page) { /* * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); if (!page) break; error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(error)) { put_page(page); if (error == -EEXIST) continue; break; } /* * add_to_page_cache() locks the page, unlock it * to avoid convoluting the logic below even more. */ unlock_page(page); } spd.pages[spd.nr_pages++] = page; index++; } /* * Now loop over the map and see if we need to start IO on any * pages, fill in the partial map, etc. */ index = *ppos >> PAGE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; for (page_nr = 0; page_nr < nr_pages; page_nr++) { unsigned int this_len; if (!len) break; /* * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_SIZE - loff); page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, page, index, req_pages - page_nr); /* * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { lock_page(page); /* * Page was truncated, or invalidated by the * filesystem. Redo the find/create, but this time the * page is kept locked, so there's no chance of another * race with truncate/invalidate. */ if (!page->mapping) { unlock_page(page); retry_lookup: page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); if (!page) { error = -ENOMEM; break; } put_page(spd.pages[page_nr]); spd.pages[page_nr] = page; } /* * page was already under io and is now done, great */ if (PageUptodate(page)) { unlock_page(page); goto fill_it; } /* * need to read in the page */ error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { /* * Re-lookup the page */ if (error == AOP_TRUNCATED_PAGE) goto retry_lookup; break; } } fill_it: /* * i_size must be checked after PageUptodate. */ isize = i_size_read(mapping->host); end_index = (isize - 1) >> PAGE_SHIFT; if (unlikely(!isize || index > end_index)) break; /* * if this is the last page, see if we need to shrink * the length and stop */ if (end_index == index) { unsigned int plen; /* * max good bytes in this page */ plen = ((isize - 1) & ~PAGE_MASK) + 1; if (plen <= loff) break; /* * force quit after adding this page */ this_len = min(this_len, plen - loff); len = this_len; } spd.partial[page_nr].offset = loff; spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; index++; } /* * Release any pages at the end, if we quit early. 'page_nr' is how far * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) put_page(spd.pages[page_nr++]); in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); splice_shrink_spd(&spd); return error; }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe55075.03%1450.00%
wu fengguangwu fengguang8511.60%414.29%
miklos szeredimiklos szeredi415.59%13.57%
hugh dickinshugh dickins182.46%13.57%
kirill a. shutemovkirill a. shutemov121.64%13.57%
eric dumazeteric dumazet81.09%13.57%
abhijith dasabhijith das60.82%13.57%
michal hockomichal hocko60.82%27.14%
linus torvaldslinus torvalds40.55%13.57%
rusty russellrusty russell20.27%13.57%
ingo molnaringo molnar10.14%13.57%
Total733100.00%28100.00%

/** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from * @ppos: position in @in * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given file and fill them into a pipe. Can be * used as long as the address_space operations for the source implements * a readpage() hook. * */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { loff_t isize, left; int ret; if (IS_DAX(in->f_mapping->host)) return default_file_splice_read(in, ppos, pipe, len, flags); isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) return 0; left = isize - *ppos; if (unlikely(left < len)) len = left; ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret > 0) { *ppos += ret; file_accessed(in); } return ret; }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe10474.82%555.56%
boaz harroshboaz harrosh2517.99%111.11%
miklos szeredimiklos szeredi75.04%111.11%
linus torvaldslinus torvalds21.44%111.11%
ingo molnaringo molnar10.72%111.11%
Total139100.00%9100.00%

EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, };
static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return 1; }

Contributors

PersonTokensPropCommitsCommitProp
miklos szeredimiklos szeredi19100.00%1100.00%
Total19100.00%1100.00%

/* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops);
static ssize_t kernel_readv(struct file *file, const struct iovec *vec, unsigned long vlen, loff_t offset) { mm_segment_t old_fs; loff_t pos = offset; ssize_t res; old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); set_fs(old_fs); return res; }

Contributors

PersonTokensPropCommitsCommitProp
miklos szeredimiklos szeredi7697.44%150.00%
christoph hellwigchristoph hellwig22.56%150.00%
Total78100.00%2100.00%


ssize_t kernel_write(struct file *file, const char *buf, size_t count, loff_t pos) { mm_segment_t old_fs; ssize_t res; old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_write(file, (__force const char __user *)buf, count, &pos); set_fs(old_fs); return res; }

Contributors

PersonTokensPropCommitsCommitProp
miklos szeredimiklos szeredi6798.53%266.67%
al viroal viro11.47%133.33%
Total68100.00%3100.00%

EXPORT_SYMBOL(kernel_write);
ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int nr_pages; unsigned int nr_freed; size_t offset; struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; ssize_t res; size_t this_len; int error; int i; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &default_pipe_buf_ops, .spd_release = spd_release_page, }; if (splice_grow_spd(pipe, &spd)) return -ENOMEM; res = -ENOMEM; vec = __vec; if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); if (!vec) goto shrink_ret; } offset = *ppos & ~PAGE_MASK; nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { struct page *page; page = alloc_page(GFP_USER); error = -ENOMEM; if (!page) goto err; this_len = min_t(size_t, len, PAGE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; } res = kernel_readv(in, vec, spd.nr_pages, *ppos); if (res < 0) { error = res; goto err; } error = 0; if (!res) goto err; nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); spd.partial[i].offset = 0; spd.partial[i].len = this_len; if (!this_len) { __free_page(spd.pages[i]); spd.pages[i] = NULL; nr_freed++; } res -= this_len; } spd.nr_pages -= nr_freed; res = splice_to_pipe(pipe, &spd); if (res > 0) *ppos += res; shrink_ret: if (vec != __vec) kfree(vec); splice_shrink_spd(&spd); return res; err: for (i = 0; i < spd.nr_pages; i++) __free_page(spd.pages[i]); res = error; goto shrink_ret; }

Contributors

PersonTokensPropCommitsCommitProp
miklos szeredimiklos szeredi39776.64%116.67%
jens axboejens axboe9718.73%233.33%
eric dumazeteric dumazet142.70%116.67%
andrew mortonandrew morton61.16%116.67%
kirill a. shutemovkirill a. shutemov40.77%116.67%
Total518100.00%6100.00%

EXPORT_SYMBOL(default_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. */
static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->u.file; loff_t pos = sd->pos; int more; if (!likely(file->f_op->sendpage)) return -EINVAL; more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; if (sd->len < sd->total_len && pipe->nrbufs > 1) more |= MSG_SENDPAGE_NOTLAST; return file->f_op->sendpage(file, buf->page, buf->offset, sd->len, &pos, more); }

Contributors

PersonTokensPropCommitsCommitProp
jens axboejens axboe8974.79%666.67%
eric dumazeteric dumazet1714.29%222.22%
michal miroslawmichal miroslaw1310.92%111.11%
Total119100.00%9100.00%


static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->wait)) wake_up_interruptible(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); }

Contributors

PersonTokensPropCommitsCommitProp
miklos szeredimiklos szeredi44100.00%1100.00%
Total44100.00%1100.00%

/** * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. It returns when there's no more buffers left in * the pipe or if the requested number of bytes (@sd->total_len) * have been copied. It returns a positive number (one) if the * pipe needs to be filled with more data, zero if the required * number of bytes have been copied and -errno on error. * * This, together with splice_from_pipe_{begin,end,next}, may be * used to implement the functionality of __splice_from_pipe() when * locking is required around copying the pipe buffers to the * destination. */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; while (pipe->nrbufs) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; const struct pipe_buf_operations *ops = buf->ops; sd->len = buf->len; if (sd->len > sd->total_len) sd->len = sd->total_len; ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret;