/*
 * File operations used by nfsd. Some of these have been ripped from
 * other parts of the kernel because they weren't exported, others
 * are partial duplicates with added or changed functionality.
 *
 * Note that several functions dget() the dentry upon which they want
 * to act, most notably those that create directory entries. Response
 * dentries are dput()'d if necessary in the release callback.
 * So if you notice code paths that apparently fail to dput() the
 * dentry, don't worry--they have been taken care of.
 *
 * Copyright (C) 1995-1999 Olaf Kirch
 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi
 */

#include <linux/fs.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/falloc.h>
#include <linux/fcntl.h>
#include <linux/namei.h>
#include <linux/delay.h>
#include <linux/fsnotify.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
#include <linux/jhash.h>
#include <linux/ima.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>

#include "xdr3.h"
#endif /* CONFIG_NFSD_V3 */

#include "../internal.h"
#include "acl.h"
#include "idmap.h"
#endif /* CONFIG_NFSD_V4 */

#include "nfsd.h"
#include "vfs.h"
#include "trace.h"


/*
 * This is a cache of readahead params that helps us choose the proper
 * readahead strategy. Initially, we set all readahead parameters to 0
 * and let the VFS handle things.
 * If you increase the number of cached files very much, you'll need to
 * add a hash table here.
 */

struct raparms {
struct raparms		*p_next;
unsigned int		p_count;
ino_t			p_ino;
dev_t			p_dev;
int			p_set;
struct file_ra_state	p_ra;
unsigned int		p_hindex;

struct raparm_hbucket {
struct raparms		*pb_head;
spinlock_t		pb_lock;




static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];

/*
 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
 * a mount point.
 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
 * or nfs_ok having possibly changed *dpp and *expp.
 */

int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp) { struct svc_export *exp = *expp, *exp2 = NULL; struct dentry *dentry = *dpp; struct path path = {.mnt = mntget(exp->ex_path.mnt), .dentry = dget(dentry)}; int err = 0; err = follow_down(&path); if (err < 0) goto out; exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { err = PTR_ERR(exp2); /* * We normally allow NFS clients to continue * "underneath" a mountpoint that is not exported. * The exception is V4ROOT, where no traversal is ever * allowed without an explicit export of the new * directory. */ if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) err = 0; path_put(&path); goto out; } if (nfsd_v4client(rqstp) || (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ /* * This is subtle: path.dentry is *not* on path.mnt * at this point. The only reason we are safe is that * original mnt is pinned down by exp, so we should * put path *before* putting exp */ *dpp = path.dentry; path.dentry = dentry; *expp = exp2; exp2 = exp; } path_put(&path); exp_put(exp2); out: return err; }


/*
 * Move @path to the parent of its current location.
 *
 * While the dentry is the root of its mount, follow_up() is called to
 * step to the mount below; the loop stops as soon as the dentry is not
 * a mount root or follow_up() returns zero.  The dentry reference held
 * in @path is then exchanged for one on its parent.
 */
static void follow_to_parent(struct path *path)
{
	struct dentry *parent;

	for (;;) {
		if (path->dentry != path->mnt->mnt_root)
			break;
		if (!follow_up(path))
			break;
	}

	parent = dget_parent(path->dentry);
	dput(path->dentry);
	path->dentry = parent;
}


static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) { struct svc_export *exp2; struct path path = {.mnt = mntget((*exp)->ex_path.mnt), .dentry = dget(dparent)}; follow_to_parent(&path); exp2 = rqst_exp_parent(rqstp, &path); if (PTR_ERR(exp2) == -ENOENT) { *dentryp = dget(dparent); } else if (IS_ERR(exp2)) { path_put(&path); return PTR_ERR(exp2); } else { *dentryp = dget(path.dentry); exp_put(*exp); *exp = exp2; } path_put(&path); return 0; }


/*
 * Decide whether nfsd should treat @dentry as a mount point.
 *
 * True for real mount points and for NFS junctions.  In addition, on a
 * V4ROOT export every positive dentry counts: for nfsd purposes V4ROOT
 * exports are treated as though there was an export at *every*
 * directory.
 *
 * Returns non-zero if @dentry should be handled as a mount point.
 */
int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
{
	if (d_mountpoint(dentry) || nfsd4_is_junction(dentry))
		return 1;

	if (exp->ex_flags & NFSEXP_V4ROOT)
		return d_inode(dentry) != NULL;

	return 0;
}


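/*
 * Contributor statistics (residue of the page this listing was
 * extracted from; the meaning of the numeric columns is not stated):
 *   j. bruce fields     44   77.19%   1   33.33%
 *   trond myklebust     10   17.54%   1   33.33%
 */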
__be32 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, unsigned int len, struct svc_export **exp_ret, struct dentry **dentry_ret) { struct svc_export *exp; struct dentry *dparent; struct dentry *dentry; int host_err; dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); dparent = fhp->fh_dentry; exp = exp_get(fhp->fh_export); /* Lookup the name, but don't follow links */ if (isdotent(name, len)) { if (len==1) dentry = dget(dparent); else if (dparent != exp->ex_path.dentry) dentry = dget_parent(dparent); else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) dentry = dget(dparent); /* .. == . just like at / */ else { /* checking mountpoint crossing is very different when stepping up */ host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); if (host_err) goto out_nfserr; } } else { /* * In the nfsd4_open() case, this may be held across * subsequent open and delegation acquisition which may * need to take the child's i_mutex: */ fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = lookup_one_len(name, dparent, len); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_nfserr; if (nfsd_mountpoint(dentry, exp)) { /* * We don't need the i_mutex after all. It's * still possible we could open this (regular * files can be mountpoints too), but the * i_mutex is just there to prevent renames of * something that we might be about to delegate, * and a mountpoint won't be renamed: */ fh_unlock(fhp); if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { dput(dentry); goto out_nfserr; } } } *dentry_ret = dentry; *exp_ret = exp; return 0; out_nfserr: exp_put(exp); return nfserrno(host_err); }


/*
 * Contributor statistics (residue of the page this listing was
 * extracted from; the meaning of the numeric columns is not stated):
 *   j. bruce fields     64   22.54%   7   18.92%
 *   neil brown          33   11.62%   6   16.22%
 *   al viro             17    5.99%   4   10.81%
 *   linus torvalds      14    4.93%   2    5.41%
 *   andrew morton        3    1.06%   2    5.41%
 *   jan blunck           3    1.06%   1    2.70%
 *   kinglong mee         2    0.70%   1    2.70%
 */
/* * Look up one component of a pathname. * N.B. After this call _both_ fhp and resfh need an fh_put * * If the lookup would cross a mountpoint, and the mounted filesystem * is exported to the client with NFSEXP_NOHIDE, then the lookup is * accepted as it stands and the mounted directory is * returned. Otherwise the covered directory is returned. * NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 * NeilBrown <> */
__be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, unsigned int len, struct svc_fh *resfh) { struct svc_export *exp; struct dentry *dentry; __be32 err; err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); if (err) return err; err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); if (err) return err; err = check_nfsd_access(exp, rqstp); if (err) goto out; /* * Note: we compose the file handle now, but as the * dentry may be negative, it may need to be updated. */ err = fh_compose(resfh, exp, dentry, fhp); if (!err && d_really_is_negative(dentry)) err = nfserr_noent; out: dput(dentry); exp_put(exp); return err; }


/*
 * Contributor statistics (residue of the page this listing was
 * extracted from; the meaning of the numeric columns is not stated):
 *   j. bruce fields     86   58.50%   2   14.29%
 *   andy adamson        18   12.24%   1    7.14%
 *   andrew morton       10    6.80%   2   14.29%
 *   david howells        3    2.04%   1    7.14%
 *   linus torvalds       2    1.36%   1    7.14%
 */
/*
 * Commit metadata changes for the file behind @fhp to stable storage.
 *
 * Does nothing (returns 0) unless the export is marked sync.  Otherwise
 * the filesystem's own commit_metadata export operation is used when it
 * provides one, with sync_inode_metadata() as the fallback.
 *
 * Returns the host (negative errno style) result of whichever was called.
 */
static int
commit_metadata(struct svc_fh *fhp)
{
	struct inode *inode = d_inode(fhp->fh_dentry);
	const struct export_operations *ops = inode->i_sb->s_export_op;

	if (!EX_ISSYNC(fhp->fh_export))
		return 0;

	return ops->commit_metadata ? ops->commit_metadata(inode)
				    : sync_inode_metadata(inode, 1);
}


/* * Go over the attributes and take care of the small differences between * NFS semantics and what Linux expects. */
static void nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { iap->ia_mode &= S_IALLUGO; iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); } /* Revoke setuid/setgid on chown */ if (!S_ISDIR(inode->i_mode) && ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ iap->ia_mode &= ~S_ISUID; if (iap->ia_mode & S_IXGRP) iap->ia_mode &= ~S_ISGID; } else { /* set ATTR_KILL_* bits and let VFS handle it */ iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); } } }


static __be32 nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { struct inode *inode = d_inode(fhp->fh_dentry); int host_err; if (iap->ia_size < inode->i_size) { __be32 err; err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); if (err) return err; } host_err = get_write_access(inode); if (host_err) goto out_nfserrno; host_err = locks_verify_truncate(inode, NULL, iap->ia_size); if (host_err) goto out_put_write_access; return 0; out_put_write_access: put_write_access(inode); out_nfserrno: return nfserrno(host_err); }


/*
 * nfsd_setattr - apply a set of attribute changes to a file
 * @rqstp:       request being served
 * @fhp:         file handle of the object; verified here
 * @iap:         requested attributes; ia_valid and ia_mode may be
 *               adjusted in place before being handed to notify_change()
 * @check_guard: non-zero to compare @guardtime against the inode's ctime
 * @guardtime:   expected ctime (seconds) when @check_guard is set
 *
 * After this call fhp needs an fh_put.
 *
 * Returns nfs_ok, nfserr_notsync when the guard does not match, or the
 * nfs status of whichever step failed.
 */
__be32
nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
	     int check_guard, time_t guardtime)
{
	struct dentry	*dentry;
	struct inode	*inode;
	int		accmode = NFSD_MAY_SATTR;
	umode_t		ftype = 0;
	__be32		err;
	int		host_err;
	bool		get_write_count;
	int		size_change = 0;	/* set once write access has been taken */

	/* Time and size updates additionally need write permission. */
	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
	/* A size change restricts fh_verify() to regular files. */
	if (iap->ia_valid & ATTR_SIZE)
		ftype = S_IFREG;

	/*
	 * Callers that do fh_verify should do the fh_want_write themselves;
	 * an unset fh_dentry tells us the handle has not been verified yet,
	 * so the fh_want_write falls to us.
	 */
	get_write_count = !fhp->fh_dentry;

	/* Get inode */
	err = fh_verify(rqstp, fhp, ftype, accmode);
	if (err)
		goto out;
	if (get_write_count) {
		host_err = fh_want_write(fhp);
		if (host_err)
			return nfserrno(host_err);
	}

	dentry = fhp->fh_dentry;
	inode = d_inode(dentry);

	/* Ignore any mode updates on symlinks */
	if (S_ISLNK(inode->i_mode))
		iap->ia_valid &= ~ATTR_MODE;

	/* Nothing left to change: err is still zero from fh_verify(). */
	if (!iap->ia_valid)
		goto out;

	nfsd_sanitize_attrs(inode, iap);

	/*
	 * The size case is special, it changes the file in addition to the
	 * attributes.
	 */
	if (iap->ia_valid & ATTR_SIZE) {
		err = nfsd_get_write_access(rqstp, fhp, iap);
		if (err)
			goto out;
		size_change = 1;

		/*
		 * RFC5661, Section 18.30.4:
		 *   Changing the size of a file with SETATTR indirectly
		 *   changes the time_modify and change attributes.
		 *
		 * (and similar for the older RFCs)
		 */
		if (iap->ia_size != i_size_read(inode))
			iap->ia_valid |= ATTR_MTIME;
	}

	iap->ia_valid |= ATTR_CTIME;

	/*
	 * Guard mismatch: bail out through the label that releases the
	 * write access taken above.  err is non-zero on this path, so the
	 * commit_metadata() call below is skipped.
	 */
	if (check_guard && guardtime != inode->i_ctime.tv_sec) {
		err = nfserr_notsync;
		goto out_put_write_access;
	}

	fh_lock(fhp);
	host_err = notify_change(dentry, iap, NULL);
	fh_unlock(fhp);
	err = nfserrno(host_err);

out_put_write_access:
	if (size_change)
		put_write_access(inode);
	/* Only push metadata to stable storage if the change succeeded. */
	if (!err)
		err = nfserrno(commit_metadata(fhp));
out:
	return err;
}


/*
 * Contributor statistics (residue of the page this listing was
 * extracted from; the meaning of the numeric columns is not stated):
 *   christoph hellwig  185   56.40%   5   29.41%
 *   j. bruce fields    113   34.45%   3   17.65%
 *   al viro              8    2.44%   2   11.76%
 *   david howells        3    0.91%   1    5.88%
 *   linus torvalds       1    0.30%   1    5.88%
 */
#if defined(CONFIG_NFSD_V4)
/*
 * NFS junction information is stored in this extended attribute.
 */
#define NFSD_JUNCTION_XATTR_NAME	XATTR_TRUSTED_PREFIX "junction.nfs"

/**
 * nfsd4_is_junction - Test if an object could be an NFS junction
 *
 * @dentry: object to test
 *
 * An object qualifies when it has an inode, has no execute bit set,
 * has the sticky bit set, and carries a non-empty junction xattr.
 *
 * Returns 1 if "dentry" appears to contain NFS junction information.
 * Otherwise 0 is returned.
 */
int nfsd4_is_junction(struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (!inode)
		return 0;

	/* Cheap mode-bit tests first; only then pay for the xattr query. */
	if ((inode->i_mode & S_IXUGO) || !(inode->i_mode & S_ISVTX))
		return 0;

	/* A NULL/0 buffer asks only for the attribute's size. */
	return vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) > 0;
}


__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, struct xdr_netobj *label) { __be32 error; int host_error; struct dentry *dentry; error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); if (error) return error; dentry = fhp->fh_dentry; inode_lock(d_inode(dentry)); host_error = security_inode_setsecctx(dentry, label->data, label->len); inode_unlock(d_inode(dentry)); return nfserrno(host_error); }


__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, struct xdr_netobj *label) { return nfserr_notsupp; }


/*
 * nfsd4_clone_file_range - clone a byte range from one open file to another
 * @src, @src_pos: source file and offset
 * @dst, @dst_pos: destination file and offset
 * @count:         number of bytes
 *
 * Thin wrapper: all arguments are passed straight to
 * do_clone_file_range() and its result is translated with nfserrno().
 */
__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
		u64 dst_pos, u64 count)
{
	return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count));
}


/*
 * nfsd_copy_file_range - copy a byte range between two open files
 * @src, @src_pos: source file and offset
 * @dst, @dst_pos: destination file and offset
 * @count:         number of bytes requested
 *
 * At most 4MB is copied per call, so a single request cannot block an
 * nfsd thread and a client rpc slot indefinitely.  The figure is
 * somewhat arbitrary: it could instead be derived from r/wsize, made
 * tunable, replaced by a time limit, or made unnecessary by an
 * asynchronous copy.  In theory a client could also recognize a limit
 * like this and pipeline multiple COPY requests.
 *
 * Returns whatever vfs_copy_file_range() returns for the capped length.
 */
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
			     u64 dst_pos, u64 count)
{
	u64 capped = min_t(u64, count, 1 << 22);

	return vfs_copy_file_range(src, src_pos, dst, dst_pos, capped, 0);
}


/*
 * nfsd4_vfs_fallocate - fallocate on behalf of an NFSv4 client
 * @rqstp:        request being served (not used here)
 * @fhp:          file handle, used for the metadata commit
 * @file:         open file to operate on
 * @offset, @len: byte range
 * @flags:        fallocate mode flags, passed through to vfs_fallocate()
 *
 * Only regular files are accepted; anything else yields nfserr_inval.
 * After a successful vfs_fallocate() the metadata is committed.
 */
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
			   struct file *file, loff_t offset, loff_t len,
			   int flags)
{
	int host_err;

	if (!S_ISREG(file_inode(file)->i_mode))
		return nfserr_inval;

	host_err = vfs_fallocate(file, flags, offset, len);
	if (host_err)
		return nfserrno(host_err);

	return nfserrno(commit_metadata(fhp));
}


#endif /* defined(CONFIG_NFSD_V4) */

#ifdef CONFIG_NFSD_V3
/*
 * Check server access rights to a file system object
 *
 * Each table below maps an NFSv3 ACCESS bit to the NFSD_MAY_* mask that
 * nfsd_access() passes to nfsd_permission() for it.  A table ends with
 * an all-zero entry.
 */
struct accessmap {
	u32		access;		/* NFS3_ACCESS_* bit */
	int		how;		/* NFSD_MAY_* mask to test for that bit */
};

/* Used for regular files. */
static struct accessmap	nfs3_regaccess[] = {
    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_TRUNC	},
    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE			},

    {	0,			0				}
};

/* Used for directories. */
static struct accessmap	nfs3_diraccess[] = {
    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
    {	NFS3_ACCESS_LOOKUP,	NFSD_MAY_EXEC			},
    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC},
    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_EXEC|NFSD_MAY_WRITE	},
    {	NFS3_ACCESS_DELETE,	NFSD_MAY_REMOVE			},

    {	0,			0				}
};

/* Used for everything that is neither a regular file nor a directory. */
static struct accessmap	nfs3_anyaccess[] = {
	/* Some clients - Solaris 2.6 at least - make an access call
	 * to the server to check access for things like /dev/null
	 * (which, really, the server doesn't care about).  So we
	 * provide simple access checking for them, looking mainly at
	 * mode bits, and we make sure to ignore read-only filesystem
	 * checks.
	 */
    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},
    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},

    {	0,			0				}
};
/*
 * nfsd_access - work out which of the requested access bits are granted
 * @rqstp:     request being served
 * @fhp:       file handle of the object; verified here
 * @access:    in: NFS3_ACCESS_* bits the client asks about;
 *             out: the subset that was granted
 * @supported: optional out: the subset of requested bits that this
 *             object type has a table entry for
 *
 * The table used depends on whether the object is a regular file, a
 * directory, or anything else.  A permission failure of the "not
 * allowed" kind (rofs, acces, perm) just leaves the bit clear; any
 * other failure aborts and is returned, with *access and *supported
 * left untouched.
 */
__be32
nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
{
	struct accessmap *entry;
	struct svc_export *export;
	struct dentry *dentry;
	u32 requested, granted = 0, checked = 0;
	__be32 error;

	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
	if (error)
		return error;

	export = fhp->fh_export;
	dentry = fhp->fh_dentry;

	if (d_is_reg(dentry))
		entry = nfs3_regaccess;
	else if (d_is_dir(dentry))
		entry = nfs3_diraccess;
	else
		entry = nfs3_anyaccess;

	requested = *access;
	for (; entry->access; entry++) {
		__be32 perm;

		if (!(entry->access & requested))
			continue;

		checked |= entry->access;
		perm = nfsd_permission(rqstp, export, dentry, entry->how);

		if (perm == nfs_ok) {
			granted |= entry->access;
		} else if (perm != nfserr_rofs &&
			   perm != nfserr_acces &&
			   perm != nfserr_perm) {
			/* A real error, as opposed to "access not allowed". */
			return perm;
		}
		/* Otherwise: simply don't "or" in the access bit. */
	}

	*access = granted;
	if (supported)
		*supported = checked;

	return error;
}


#endif /* CONFIG_NFSD_V3 */
/*
 * Break any lease on @inode that conflicts with the access being opened.
 *
 * @access is an NFSD_MAY_* mask.  Nothing is done when it contains
 * NFSD_MAY_NOT_BREAK_LEASE.  Otherwise break_lease() is called in
 * non-blocking mode, for writing if NFSD_MAY_WRITE is set and for
 * reading if not.
 *
 * Returns 0 or the host error from break_lease().
 */
static int nfsd_open_break_lease(struct inode *inode, int access)
{
	unsigned int open_mode = O_RDONLY;

	if (access & NFSD_MAY_NOT_BREAK_LEASE)
		return 0;

	if (access & NFSD_MAY_WRITE)
		open_mode = O_WRONLY;

	return break_lease(inode, open_mode | O_NONBLOCK);
}


/*
 * nfsd_open - open an existing file or directory
 * @rqstp:     request being served
 * @fhp:       file handle of the object; verified here
 * @type:      file type passed to fh_verify()
 * @may_flags: NFSD_MAY_* mask giving the kind of open (read/write/lock)
 *             plus additional flags
 * @filp:      out: the opened file, written only on success
 *
 * N.B. After this call fhp needs an fh_put.
 *
 * Returns nfs_ok, nfserr_perm for objects that must not be opened this
 * way, the fh_verify() status, or a translated host error.
 */
__be32
nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
	  int may_flags, struct file **filp)
{
	struct path path;
	struct inode *inode;
	struct file *file;
	int open_flags;
	__be32 err;
	int host_err = 0;

	validate_process_creds();

	/*
	 * If we get here, then the client has already done an "open",
	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
	 * in case a chmod has now revoked permission.
	 *
	 * Arguably we should also allow the owner override for
	 * directories, but we never have and it doesn't seem to have
	 * caused anyone a problem.  If we were to change this, note
	 * also that our filldir callbacks would need a variant of
	 * lookup_one_len that doesn't check permissions.
	 */
	if (type == S_IFREG)
		may_flags |= NFSD_MAY_OWNER_OVERRIDE;

	err = fh_verify(rqstp, fhp, type, may_flags);
	if (err)
		goto out;

	path.mnt = fhp->fh_export->ex_path.mnt;
	path.dentry = fhp->fh_dentry;
	inode = d_inode(path.dentry);

	/*
	 * Refuse with nfserr_perm:
	 *  - write access to files with the append-only bit set;
	 *  - regular files (and only those) that might have mandatory
	 *    locks on them, because there is no way to know whether the
	 *    accessor holds the lock;
	 *  - inodes without file operations.
	 */
	if ((IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) ||
	    (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) ||
	    !inode->i_fop) {
		err = nfserr_perm;
		goto out;
	}

	host_err = nfsd_open_break_lease(inode, may_flags);
	if (host_err) /* NOMEM or WOULDBLOCK */
		goto out_nfserr;

	if (!(may_flags & NFSD_MAY_WRITE))
		open_flags = O_RDONLY|O_LARGEFILE;
	else if (may_flags & NFSD_MAY_READ)
		open_flags = O_RDWR|O_LARGEFILE;
	else
		open_flags = O_WRONLY|O_LARGEFILE;

	file = dentry_open(&path, open_flags, current_cred());
	if (IS_ERR(file)) {
		host_err = PTR_ERR(file);
		goto out_nfserr;
	}

	host_err = ima_file_check(file, may_flags, 0);
	if (host_err) {
		fput(file);
		goto out_nfserr;
	}

	/* Exactly one of the two hash-width modes is always set. */
	if (may_flags & NFSD_MAY_64BIT_COOKIE)
		file->f_mode |= FMODE_64BITHASH;
	else
		file->f_mode |= FMODE_32BITHASH;

	*filp = file;
	/* Success falls through with host_err == 0. */
out_nfserr:
	err = nfserrno(host_err);
out:
	validate_process_creds();
	return err;
}


