Release 4.7 fs/eventpoll.c
/*
* fs/eventpoll.c (Efficient event retrieval implementation)
* Copyright (C) 2001,...,2009 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Davide Libenzi <davidel@xmailserver.org>
*
*/
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
/*
* LOCKING:
* There are three levels of locking required by epoll:
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
* 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
* We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
* a spinlock. During the event transfer loop (from kernel to
* user space) we could end up sleeping due to a copy_to_user(), so
* we need a lock that will allow us to sleep. This lock is a
* mutex (ep->mtx). It is acquired during the event transfer loop,
* during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
* Then we also need a global mutex to serialize eventpoll_release_file()
* and ep_free().
* This mutex is acquired by ep_free() during the epoll file
* cleanup path and it is also acquired by eventpoll_release_file()
* if a file has been pushed inside an epoll set and it is then
* close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
* It is also acquired when inserting an epoll fd onto another epoll
* fd. We do this so that we walk the epoll tree and ensure that this
* insertion does not create a cycle of epoll file descriptors, which
* could lead to deadlock. We need a global mutex to prevent two
* simultaneous inserts (A into B and B into A) from racing and
* constructing a cycle without either insert observing that it is
* going to.
* It is necessary to acquire multiple "ep->mtx"es at once in the
* case when one epoll fd is added to another. In this case, we
* always acquire the locks in the order of nesting (i.e. after
* epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
* before e2->mtx). Since we disallow cycles of epoll file
* descriptors, this ensures that the mutexes are well-ordered. In
* order to communicate this nesting to lockdep, when walking a tree
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
* mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
* a better scalability.
*/
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
#define EP_UNACTIVE_PTR ((void *) -1L)
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
/*
* Key that identifies a monitored target: the (file, fd) pair filled in by
* ep_set_ffd() and ordered by ep_cmp_ffd(). Embedded in "struct epitem".
*/
struct epoll_filefd {
/* Target file; ep_cmp_ffd() compares it first, by pointer value */
struct file *file;
/* File descriptor number; tie-breaker when the file pointers are equal */
int fd;
}
__packed;
/*
* Structure used to track possible nested calls, for too deep recursions
* and loop cycles. ep_call_nested() keeps one of these on its stack for the
* duration of each call.
*/
struct nested_call_node {
/* Links this node into "struct nested_calls"->tasks_call_list */
struct list_head llink;
/* Identifies the call; a repeated cookie within one ctx is refused */
void *cookie;
/* Context of the call; only nodes with an equal ctx are compared */
void *ctx;
};
/*
* This structure is used as collector for nested calls, to check for
* maximum recursion depth and loop cycles.
*/
struct nested_calls {
/* The "struct nested_call_node" entries of the calls currently in progress */
struct list_head tasks_call_list;
/* Protects tasks_call_list; taken with interrupts saved/disabled */
spinlock_t lock;
};
/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the "rbr" RB tree.
* Avoid increasing the size of this struct, there can be many thousands
* of these on a server and we do not want this to take another cache line.
*/
struct epitem {
/* The two members share storage: rcu is only used once rbn no longer is */
union {
/* RB tree node that links this structure to the eventpoll RB tree */
struct rb_node rbn;
/* Used by ep_remove() to free the struct epitem through call_rcu() */
struct rcu_head rcu;
};
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
/*
* Works together with "struct eventpoll"->ovflist in keeping the
* singly linked chain of items.
*/
struct epitem *next;
/* The file descriptor information this item refers to (the RB tree key) */
struct epoll_filefd ffd;
/* Number of active wait queues attached to poll operations */
int nwait;
/* List containing the poll wait queues ("struct eppoll_entry") */
struct list_head pwqlist;
/* The "container" of this item */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
/* The structure that describes the interested events and the source fd */
struct epoll_event event;
};
/*
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
*/
struct eventpoll {
/* Protects the access to this structure */
spinlock_t lock;
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
struct mutex mtx;
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* List of ready file descriptors */
struct list_head rdllist;
/* RB tree root used to store monitored fd structs */
struct rb_root rbr;
/*
* This is a singly linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
* holding ->lock. It holds EP_UNACTIVE_PTR while no transfer is running.
*/
struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
/*
* NOTE(review): not assigned anywhere in this part of the file;
* presumably the epoll file this structure belongs to - confirm.
*/
struct file *file;
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
};
/*
* Wait structure used by the poll hooks. ep_unregister_pollwait() unlinks
* and frees these one by one (they come from pwq_cache).
*/
struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
struct list_head llink;
/* The "base" pointer is set to the container "struct epitem" */
struct epitem *base;
/*
* Wait queue item that will be linked to the target file wait
* queue head.
*/
wait_queue_t wait;
/*
* The wait queue head that linked the "wait" wait queue item. Read under
* rcu_read_lock() in ep_remove_wait_queue(), where it may be NULL.
*/
wait_queue_head_t *whead;
};
/*
* Wrapper struct used by poll queueing: lets ep_item_from_epqueue() get from
* a poll_table pointer back to the epitem being queued.
*/
struct ep_pqueue {
/* The poll table; container_of() on it yields this wrapper */
poll_table pt;
/* The item the poll table is being used for */
struct epitem *epi;
};
/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
/* NOTE(review): presumably the capacity of @events, in entries - confirm */
int maxevents;
/* User space buffer that receives the events */
struct epoll_event __user *events;
};
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
/*
* Maximum number of epoll watched descriptors, per user. Exported through
* the "max_user_watches" sysctl entry when CONFIG_SYSCTL is enabled.
*/
static long max_user_watches __read_mostly;
/*
* This mutex is used to serialize ep_free() and eventpoll_release_file().
* It is the outermost lock of the file: taken before any ep->mtx.
*/
static DEFINE_MUTEX(epmutex);
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;
/* Used for safe wake up implementation (see ep_poll_safewake()) */
static struct nested_calls poll_safewake_ncalls;
/* Used to call file's f_op->poll() under the nested calls boundaries */
static struct nested_calls poll_readywalk_ncalls;
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
static LIST_HEAD(visited_list);
/*
* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
*/
static LIST_HEAD(tfile_check_list);
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
/* Bounds handed to the sysctl handler through .extra1 and .extra2 below */
static long zero;
static long long_max = LONG_MAX;
/*
* Sysctl table with a single entry, "max_user_watches" (mode 0644), backed
* by the max_user_watches variable and handled by proc_doulongvec_minmax
* with 0 and LONG_MAX as its extra1/extra2 values. The empty initializer
* at the end is the last element of the array.
*/
struct ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
.maxlen = sizeof(max_user_watches),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &zero,
.extra2 = &long_max,
},
{ }
};
#endif /* CONFIG_SYSCTL */
static const struct file_operations eventpoll_fops;

/*
 * Tell whether @f is an epoll file, which is the case exactly when its
 * f_op table is eventpoll_fops. Returns 1 if so, 0 otherwise.
 */
static inline int is_file_epoll(struct file *f)
{
	if (f->f_op != &eventpoll_fops)
		return 0;
	return 1;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
jason baron | jason baron | 20 | 100.00% | 1 | 100.00% |
| Total | 20 | 100.00% | 1 | 100.00% |
/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
struct file *file, int fd)
{
ffd->file = file;
ffd->fd = fd;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 29 | 90.62% | 3 | 75.00% |
andrew morton | andrew morton | 3 | 9.38% | 1 | 25.00% |
| Total | 32 | 100.00% | 4 | 100.00% |
/*
 * Compare two RB tree keys. The file pointers decide first (+1 / -1);
 * when they are equal the result is the difference of the fd numbers,
 * so 0 means both keys are the same (file, fd) pair.
 */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	if (p1->file > p2->file)
		return +1;
	if (p1->file < p2->file)
		return -1;
	return p1->fd - p2->fd;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 52 | 100.00% | 2 | 100.00% |
| Total | 52 | 100.00% | 2 | 100.00% |
/* Returns 1 when the list node @p is currently linked, 0 when it is not */
static inline int ep_is_linked(struct list_head *p)
{
	return list_empty(p) ? 0 : 1;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 19 | 100.00% | 3 | 100.00% |
| Total | 19 | 100.00% | 3 | 100.00% |
/* Get the "struct eppoll_entry" that embeds the wait queue item @p */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
{
	struct eppoll_entry *pwq = container_of(p, struct eppoll_entry, wait);

	return pwq;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
oleg nesterov | oleg nesterov | 24 | 100.00% | 1 | 100.00% |
| Total | 24 | 100.00% | 1 | 100.00% |
/*
 * Get the "struct epitem" from a wait queue pointer: step from the wait
 * queue item to its enclosing eppoll_entry, then follow ->base.
 */
static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
{
	struct eppoll_entry *pwq = container_of(p, struct eppoll_entry, wait);

	return pwq->base;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 26 | 100.00% | 4 | 100.00% |
| Total | 26 | 100.00% | 4 | 100.00% |
/*
 * Get the "struct epitem" from an epoll queue wrapper: @p is the poll_table
 * embedded in a "struct ep_pqueue", whose ->epi is returned.
 */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	struct ep_pqueue *epq = container_of(p, struct ep_pqueue, pt);

	return epq->epi;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 26 | 100.00% | 4 | 100.00% |
| Total | 26 | 100.00% | 4 | 100.00% |
/*
 * Tells if the epoll_ctl(2) operation needs an event copy from userspace.
 * Every operation except EPOLL_CTL_DEL does: returns 0 for EPOLL_CTL_DEL
 * and 1 for anything else.
 */
static inline int ep_op_has_event(int op)
{
	if (op == EPOLL_CTL_DEL)
		return 0;
	return 1;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 13 | 86.67% | 3 | 75.00% |
andrew morton | andrew morton | 2 | 13.33% | 1 | 25.00% |
| Total | 15 | 100.00% | 4 | 100.00% |
/* Prepare a nested calls collector for use: fresh lock, empty call list */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	spin_lock_init(&ncalls->lock);
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 27 | 100.00% | 3 | 100.00% |
| Total | 27 | 100.00% | 3 | 100.00% |
/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Events count as available when the ready list is not empty, or when
 * ep->ovflist holds anything other than EP_UNACTIVE_PTR.
 *
 * Returns: Returns 1 if ready events might be available, or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	if (!list_empty(&ep->rdllist))
		return 1;
	return ep->ovflist != EP_UNACTIVE_PTR;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 28 | 100.00% | 1 | 100.00% |
| Total | 28 | 100.00% | 1 | 100.00% |
/**
* ep_call_nested - Perform a bound (possibly) nested call, by checking
* that the recursion limit is not exceeded, and that
* the same nested call (by the meaning of same cookie) is
* not re-entered.
*
* @ncalls: Pointer to the nested_calls structure to be used for this call.
* @max_nests: Maximum number of allowed nesting calls.
* @nproc: Nested call core function pointer. It is invoked with @priv,
* @cookie and the number of calls already in progress for @ctx.
* @priv: Opaque data to be passed to the @nproc callback.
* @cookie: Cookie to be used to identify this nested call.
* @ctx: This instance context.
*
* Returns: Returns the code returned by the @nproc callback, or -1 if
* the maximum recursion limit has been exceeded or a call with the
* same @cookie is already in progress for @ctx (loop detected).
* In the -1 case @nproc is not called.
*/
static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
int (*nproc)(void *, void *, int), void *priv,
void *cookie, void *ctx)
{
int error, call_nests = 0;
unsigned long flags;
struct list_head *lsthead = &ncalls->tasks_call_list;
struct nested_call_node *tncur;
/* Lives on this stack frame; linked into the list only while @nproc runs */
struct nested_call_node tnode;
spin_lock_irqsave(&ncalls->lock, flags);
/*
* Try to see if the current task is already inside this wakeup call.
* We use a list here, since the population inside this set is always
* very much limited.
*/
list_for_each_entry(tncur, lsthead, llink) {
/*
* Only entries of our own context matter. call_nests is bumped
* once for each of them that carries a different cookie.
*/
if (tncur->ctx == ctx &&
(tncur->cookie == cookie || ++call_nests > max_nests)) {
/*
* Oops ... loop detected or maximum nest level reached.
* We abort this wake by breaking the cycle itself.
*/
error = -1;
goto out_unlock;
}
}
/* Add the current task and cookie to the list */
tnode.ctx = ctx;
tnode.cookie = cookie;
list_add(&tnode.llink, lsthead);
/* The lock is not held across the callback */
spin_unlock_irqrestore(&ncalls->lock, flags);
/* Call the nested function */
error = (*nproc)(priv, cookie, call_nests);
/* Remove the current task from the list */
spin_lock_irqsave(&ncalls->lock, flags);
list_del(&tnode.llink);
out_unlock:
spin_unlock_irqrestore(&ncalls->lock, flags);
return error;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 193 | 93.69% | 9 | 75.00% |
tony battersby | tony battersby | 8 | 3.88% | 1 | 8.33% |
matthias kaehlcke | matthias kaehlcke | 4 | 1.94% | 1 | 8.33% |
peter zijlstra | peter zijlstra | 1 | 0.49% | 1 | 8.33% |
| Total | 206 | 100.00% | 12 | 100.00% |
/*
* As described in commit 0ccf831cb lockdep: annotate epoll
* the use of wait queues used by epoll is done in a very controlled
* manner. Wake ups can nest inside each other, but are never done
* with the same locking. For example:
*
* dfd = socket(...);
* efd1 = epoll_create();
* efd2 = epoll_create();
* epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
* epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
*
* When a packet arrives to the device underneath "dfd", the net code will
* issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
* callback wakeup entry on that queue, and the wake_up() performed by the
* "dfd" net code will end up in ep_poll_callback(). At this point epoll
* (efd1) notices that it may have some event ready, so it needs to wake up
* the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
* that ends up in another wake_up(), after having checked about the
* recursion constraints. That is, no more than EP_MAX_NESTS nested wake ups,
* to avoid stack blasting.
*
* When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
* this special case of epoll.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Lockdep-aware wake up: take the wait queue head lock with the given
 * lockdep @subclass, then wake the pollers with the lock already held.
 */
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	unsigned long irqflags;

	spin_lock_irqsave_nested(&wqueue->lock, irqflags, subclass);
	wake_up_locked_poll(wqueue, events);
	spin_unlock_irqrestore(&wqueue->lock, irqflags);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 51 | 100.00% | 1 | 100.00% |
| Total | 51 | 100.00% | 1 | 100.00% |
#else
/*
* Variant used when CONFIG_DEBUG_LOCK_ALLOC is off: a plain wake_up_poll()
* on @wqueue with @events. @subclass is accepted only to keep the same
* signature as the lockdep variant and is not used.
*/
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
unsigned long events, int subclass)
{
wake_up_poll(wqueue, events);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 25 | 100.00% | 1 | 100.00% |
| Total | 25 | 100.00% | 1 | 100.00% |
#endif
static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
{
ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
1 + call_nests);
return 0;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 35 | 100.00% | 2 | 100.00% |
| Total | 35 | 100.00% | 2 | 100.00% |
/*
 * Wake up the poll wait list @wq safely. With callback based wake ups the
 * poll callback can be re-entered from inside the wake_up() issued on the
 * poll wait queue head. The wake up therefore goes through
 * ep_call_nested(), which refuses to re-enter the same wait queue head
 * and allows at most EP_MAX_NESTS levels of nesting, so that hierarchies
 * of epoll file descriptors up to EP_MAX_NESTS deep keep working.
 * The context used to tell nested calls apart is the current CPU number,
 * obtained with get_cpu() and released with put_cpu() after the call.
 */
static void ep_poll_safewake(wait_queue_head_t *wq)
{
	void *cpu_ctx = (void *) (long) get_cpu();

	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
		       ep_poll_wakeup_proc, NULL, wq, cpu_ctx);
	put_cpu();
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 42 | 100.00% | 3 | 100.00% |
| Total | 42 | 100.00% | 3 | 100.00% |
/*
 * Detach @pwq's wait queue item from the wait queue head it was linked to.
 * Does nothing when the recorded head pointer is NULL.
 */
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *head;

	rcu_read_lock();
	/* If it is cleared by POLLFREE, it should be rcu-safe */
	head = rcu_dereference(pwq->whead);
	if (!head)
		goto out_unlock;
	remove_wait_queue(head, &pwq->wait);
out_unlock:
	rcu_read_unlock();
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
oleg nesterov | oleg nesterov | 45 | 100.00% | 1 | 100.00% |
| Total | 45 | 100.00% | 1 | 100.00% |
/*
 * Unregister every poll callback of @epi from its target file: each
 * eppoll_entry on epi->pwqlist is unlinked, removed from its wait queue
 * and handed back to pwq_cache. Must be called with "mtx" held (or
 * "epmutex" if called from ep_free).
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct list_head *entries = &epi->pwqlist;

	if (list_empty(entries))
		return;

	do {
		struct eppoll_entry *entry =
			list_first_entry(entries, struct eppoll_entry, llink);

		list_del(&entry->llink);
		ep_remove_wait_queue(entry);
		kmem_cache_free(pwq_cache, entry);
	} while (!list_empty(entries));
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 69 | 94.52% | 4 | 57.14% |
andrew morton | andrew morton | 2 | 2.74% | 1 | 14.29% |
tony battersby | tony battersby | 1 | 1.37% | 1 | 14.29% |
oleg nesterov | oleg nesterov | 1 | 1.37% | 1 | 14.29% |
| Total | 73 | 100.00% | 7 | 100.00% |
/*
 * Fetch the wakeup source of @epi (may be NULL).
 * Call only when ep->mtx is held: that is the condition given to
 * rcu_dereference_check().
 */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws;

	ws = rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
	return ws;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
eric wong | eric wong | 32 | 100.00% | 1 | 100.00% |
| Total | 32 | 100.00% | 1 | 100.00% |
/*
 * Call __pm_stay_awake() on the wakeup source of @epi, if it has one.
 * Call only when ep->mtx is held.
 */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (!ws)
		return;
	__pm_stay_awake(ws);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
eric wong | eric wong | 31 | 100.00% | 1 | 100.00% |
| Total | 31 | 100.00% | 1 | 100.00% |
/* True when @epi has a wakeup source attached; the pointer is only tested */
static inline bool ep_has_wakeup_source(struct epitem *epi)
{
	return !!rcu_access_pointer(epi->ws);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
eric wong | eric wong | 24 | 100.00% | 1 | 100.00% |
| Total | 24 | 100.00% | 1 | 100.00% |
/*
 * Same job as ep_pm_stay_awake(), for callers that cannot hold ep->mtx
 * (ep_poll_callback): the wakeup source pointer is read inside an RCU
 * read-side section instead.
 */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *src;

	rcu_read_lock();
	src = rcu_dereference(epi->ws);
	if (!src) {
		rcu_read_unlock();
		return;
	}
	__pm_stay_awake(src);
	rcu_read_unlock();
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
eric wong | eric wong | 41 | 100.00% | 1 | 100.00% |
| Total | 41 | 100.00% | 1 | 100.00% |
/**
* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
* O(NumReady) performance.
*
* @ep: Pointer to the epoll private data structure.
* @sproc: Pointer to the scan callback. It is called once, with @ep, a
* private list holding the items taken off the ready list, and @priv.
* @priv: Private opaque data passed to the @sproc callback.
* @depth: The current depth of recursive f_op->poll calls. It is used as
* the lockdep subclass when ep->mtx is taken here.
* @ep_locked: caller already holds ep->mtx; when true, this function neither
* takes nor releases it.
*
* Returns: The same integer error code returned by the @sproc callback.
*/
static int ep_scan_ready_list(struct eventpoll *ep,
int (*sproc)(struct eventpoll *,
struct list_head *, void *),
void *priv, int depth, bool ep_locked)
{
int error, pwake = 0;
unsigned long flags;
struct epitem *epi, *nepi;
/* Private list the ready items are moved to while @sproc runs */
LIST_HEAD(txlist);
/*
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
if (!ep_locked)
mutex_lock_nested(&ep->mtx, depth);
/*
* Steal the ready list, and re-init the original one to the
* empty list. Also, set ep->ovflist to NULL so that events
* happening while looping w/out locks, are not lost. We cannot
* have the poll callback to queue directly on ep->rdllist,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
spin_lock_irqsave(&ep->lock, flags);
list_splice_init(&ep->rdllist, &txlist);
ep->ovflist = NULL;
spin_unlock_irqrestore(&ep->lock, flags);
/*
* Now call the callback function. ep->lock is not held here.
*/
error = (*sproc)(ep, &txlist, priv);
spin_lock_irqsave(&ep->lock, flags);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here. Each item's
* ->next is reset to EP_UNACTIVE_PTR as it is taken off the chain.
*/
for (nepi = ep->ovflist; (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
* During the "sproc" callback execution time, items are
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
ep->ovflist = EP_UNACTIVE_PTR;
/*
* Quickly re-inject items left on "txlist".
*/
list_splice(&txlist, &ep->rdllist);
/* The scan is over: relax the wakeup source that covers it */
__pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
/*
* Wake up (if active) both the eventpoll wait list and
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
spin_unlock_irqrestore(&ep->lock, flags);
if (!ep_locked)
mutex_unlock(&ep->mtx);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
return error;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 281 | 88.92% | 13 | 68.42% |
arve hjonnevag | arve hjonnevag | 13 | 4.11% | 1 | 5.26% |
jason baron | jason baron | 13 | 4.11% | 1 | 5.26% |
nelson elhage | nelson elhage | 6 | 1.90% | 1 | 5.26% |
eric wong | eric wong | 1 | 0.32% | 1 | 5.26% |
andrew morton | andrew morton | 1 | 0.32% | 1 | 5.26% |
tony battersby | tony battersby | 1 | 0.32% | 1 | 5.26% |
| Total | 316 | 100.00% | 19 | 100.00% |
/*
 * RCU callback queued by ep_remove(): @head is the "rcu" member of a
 * "struct epitem", which is returned to epi_cache.
 */
static void epi_rcu_free(struct rcu_head *head)
{
	kmem_cache_free(epi_cache, container_of(head, struct epitem, rcu));
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
jason baron | jason baron | 33 | 100.00% | 1 | 100.00% |
| Total | 33 | 100.00% | 1 | 100.00% |
/*
* Removes a "struct epitem" from the eventpoll RB tree and deallocates
* all the associated resources. Must be called with "mtx" held.
*
* The item is unhooked from the target file's wait queues, from the file's
* list of epoll hooks, from the RB tree and from the ready list, its wakeup
* source is unregistered, and the memory is released through call_rcu().
* The owning user's epoll_watches count is decremented. Always returns 0.
*/
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
unsigned long flags;
struct file *file = epi->ffd.file;
/*
* Removes poll wait queue hooks. We _have_ to do this without holding
* the "ep->lock" otherwise a deadlock might occur. This because of the
* sequence of the lock acquisition. Here we do "ep->lock" then the wait
* queue head lock when unregistering the wait queue. The wakeup callback
* will run by holding the wait queue head lock and will call our callback
* that will try to get "ep->lock".
*/
ep_unregister_pollwait(ep, epi);
/* Remove the current item from the list of epoll hooks */
spin_lock(&file->f_lock);
list_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
rb_erase(&epi->rbn, &ep->rbr);
/* Take the item off the ready list, if it is on it */
spin_lock_irqsave(&ep->lock, flags);
if (ep_is_linked(&epi->rdllink))
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
* At this point it is safe to free the eventpoll item. Use the union
* field epi->rcu, since we are trying to minimize the size of
* 'struct epitem'. The 'rbn' field is no longer in use. Protected by
* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
* use of the rbn field.
*/
call_rcu(&epi->rcu, epi_rcu_free);
/* One watch less for the user that created the eventpoll */
atomic_long_dec(&ep->user->epoll_watches);
return 0;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 124 | 84.35% | 5 | 50.00% |
jason baron | jason baron | 8 | 5.44% | 1 | 10.00% |
andrew morton | andrew morton | 6 | 4.08% | 1 | 10.00% |
arve hjonnevag | arve hjonnevag | 5 | 3.40% | 1 | 10.00% |
eric wong | eric wong | 3 | 2.04% | 1 | 10.00% |
robin holt | robin holt | 1 | 0.68% | 1 | 10.00% |
| Total | 147 | 100.00% | 10 | 100.00% |
/*
* Tears down an eventpoll structure: wakes up whoever waits on its poll
* wait queue, unregisters the poll callbacks of every item, removes every
* item from the RB tree, then releases the mutex, the user reference, the
* wakeup source and finally the structure itself. Called from
* ep_eventpoll_release().
*/
static void ep_free(struct eventpoll *ep)
{
struct rb_node *rbp;
struct epitem *epi;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
ep_poll_safewake(&ep->poll_wait);
/*
* We need to lock this because we could be hit by
* eventpoll_release_file() while we're freeing the "struct eventpoll".
* We do not need to hold "ep->mtx" here because the epoll file
* is on the way to be removed and no one has references to it
* anymore. The only hit might come from eventpoll_release_file() but
* holding "epmutex" is sufficient here.
*/
mutex_lock(&epmutex);
/*
* Walks through the whole tree by unregistering poll callbacks.
* Nothing is erased from the tree in this pass, so a plain in-order
* walk is used.
*/
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_unregister_pollwait(ep, epi);
cond_resched();
}
/*
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
* us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
mutex_lock(&ep->mtx);
/* ep_remove() erases the node, so restart from rb_first() every time */
while ((rbp = rb_first(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
cond_resched();
}
mutex_unlock(&ep->mtx);
mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx);
free_uid(ep->user);
wakeup_source_unregister(ep->ws);
kfree(ep);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 146 | 79.78% | 7 | 58.33% |
eric wong | eric wong | 17 | 9.29% | 1 | 8.33% |
arve hjonnevag | arve hjonnevag | 7 | 3.83% | 1 | 8.33% |
eric dumazet | eric dumazet | 6 | 3.28% | 1 | 8.33% |
andrew morton | andrew morton | 6 | 3.28% | 1 | 8.33% |
lucas de marchi | lucas de marchi | 1 | 0.55% | 1 | 8.33% |
| Total | 183 | 100.00% | 12 | 100.00% |
/*
 * ->release() handler of the epoll file: frees the eventpoll stored in
 * file->private_data, if there is one. Always returns 0.
 */
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (!ep)
		return 0;

	ep_free(ep);
	return 0;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 36 | 97.30% | 5 | 83.33% |
jesper juhl | jesper juhl | 1 | 2.70% | 1 | 16.67% |
| Total | 37 | 100.00% | 6 | 100.00% |
/*
 * Poll the target file of @epi. The item's event mask is stored in
 * pt->_key before the call, and the mask returned by the file's ->poll()
 * is restricted to the events the item is interested in.
 */
static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
{
	struct file *target = epi->ffd.file;
	unsigned int revents;

	pt->_key = epi->event.events;
	revents = target->f_op->poll(target, pt);

	return revents & epi->event.events;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
eric wong | eric wong | 53 | 100.00% | 1 | 100.00% |
| Total | 53 | 100.00% | 1 | 100.00% |
/*
 * ep_scan_ready_list() callback that tells whether the epoll set is
 * readable. Walks the items on @head and re-polls each one, with a poll
 * table whose queueing function is NULL. Returns POLLIN | POLLRDNORM at
 * the first item that reports one of its requested events, 0 when no item
 * does. @ep and @priv are unused.
 */
static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *item, *next;
	poll_table pt;

	init_poll_funcptr(&pt, NULL);

	list_for_each_entry_safe(item, next, head, rdllink) {
		if (ep_item_poll(item, &pt))
			return POLLIN | POLLRDNORM;

		/*
		 * The poll callback dropped this item into the ready list,
		 * but none of the events the caller asked for is pending.
		 * Take it off the list again.
		 */
		__pm_relax(ep_wakeup_source(item));
		list_del_init(&item->rdllink);
	}

	return 0;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 67 | 75.28% | 6 | 60.00% |
hans verkuil | hans verkuil | 13 | 14.61% | 1 | 10.00% |
arve hjonnevag | arve hjonnevag | 5 | 5.62% | 1 | 10.00% |
eric wong | eric wong | 4 | 4.49% | 2 | 20.00% |
| Total | 89 | 100.00% | 10 | 100.00% |
/* Forward declaration: ep_eventpoll_poll() compares a poll table's _qproc with it */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt);
/* Argument bundle passed by ep_eventpoll_poll() to ep_poll_readyevents_proc() */
struct readyevents_arg {
/* The eventpoll whose ready list is to be scanned */
struct eventpoll *ep;
/* Forwarded as "ep_locked": true when ep->mtx is already held by the caller */
bool locked;
};
/*
 * ep_call_nested() callback used by ep_eventpoll_poll(). @priv points to a
 * "struct readyevents_arg"; the ready list of its eventpoll is scanned
 * with ep_read_events_proc(), one level deeper than @call_nests.
 * @cookie is unused. Returns what ep_scan_ready_list() returns.
 */
static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
	struct readyevents_arg *arg = priv;
	int depth = call_nests + 1;

	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
				  depth, arg->locked);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 26 | 59.09% | 4 | 66.67% |
jason baron | jason baron | 14 | 31.82% | 1 | 16.67% |
nelson elhage | nelson elhage | 4 | 9.09% | 1 | 16.67% |
| Total | 44 | 100.00% | 6 | 100.00% |
/*
 * ->poll() handler of the epoll file. Registers the caller on the
 * eventpoll's poll wait queue, then reports whether ready events are
 * really present. Returns the mask computed by the ready list scan, or 0
 * when ep_call_nested() refused the call (it returned -1).
 */
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	struct eventpoll *ep = file->private_data;
	/*
	 * During ep_insert() we already hold the ep->mtx for the tfile.
	 * Prevent re-acquisition.
	 */
	struct readyevents_arg arg = {
		.ep = ep,
		.locked = wait && (wait->_qproc == ep_ptable_queue_proc),
	};
	int pollflags;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * The check for really available events walks the ready list and
	 * calls f_op->poll() on the listed files, which could re-enter
	 * here. It is therefore done under ep_call_nested() supervision.
	 */
	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
				   ep_poll_readyevents_proc, &arg, ep, current);
	if (pollflags == -1)
		return 0;

	return pollflags;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 69 | 71.88% | 7 | 87.50% |
jason baron | jason baron | 27 | 28.12% | 1 | 12.50% |
| Total | 96 | 100.00% | 8 | 100.00% |
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler of the epoll file. Writes one line per monitored
 * item into @m: the target fd, the event mask and the user data. The RB
 * tree of the eventpoll stored in f->private_data is walked with ep->mtx
 * held, and the walk stops early once the seq_file has overflowed.
 */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);

		/*
		 * "%llx" consumes an unsigned long long, so the data is cast
		 * to exactly that type rather than to a signed long long.
		 */
		seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
			   epi->ffd.fd, epi->event.events,
			   (unsigned long long)epi->event.data);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
cyrill gorcunov | cyrill gorcunov | 116 | 95.87% | 1 | 50.00% |
joe perches | joe perches | 5 | 4.13% | 1 | 50.00% |
| Total | 121 | 100.00% | 2 | 100.00% |
#endif
/*
* File callbacks that implement the eventpoll file behaviour. This is the
* table is_file_epoll() compares a file's f_op against. The fdinfo hook is
* only present when CONFIG_PROC_FS is enabled.
*/
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = ep_show_fdinfo,
#endif
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
.llseek = noop_llseek,
};
/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that are
 * closed without being removed from the eventpoll interface.
 *
 * Every epitem hooked on @file is removed from the eventpoll that owns it.
 */
void eventpoll_release_file(struct file *file)
{
	struct epitem *item, *tmp;

	/*
	 * "file->f_lock" is not taken around the walk. We are in the
	 * "struct file" cleanup path, so no one is using this file anymore:
	 * epoll_ctl(), for example, cannot hit here, since the file counter
	 * already went to zero and fget() would fail. Besides, ep_remove()
	 * acquires "file->f_lock" itself, so it cannot be held here.
	 *
	 * The only hit might come from ep_free(), and holding "epmutex"
	 * serializes against it. "ep->mtx" is acquired after "epmutex"
	 * because ep_remove() requires it when called from anywhere but
	 * ep_free().
	 */
	mutex_lock(&epmutex);
	list_for_each_entry_safe(item, tmp, &file->f_ep_links, fllink) {
		struct eventpoll *owner = item->ep;

		mutex_lock_nested(&owner->mtx, 0);
		ep_remove(owner, item);
		mutex_unlock(&owner->mtx);
	}
	mutex_unlock(&epmutex);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 65 | 82.28% | 6 | 60.00% |
konstantin khlebnikov | konstantin khlebnikov | 6 | 7.59% | 1 | 10.00% |
jason baron | jason baron | 4 | 5.06% | 1 | 10.00% |
nelson elhage | nelson elhage | 3 | 3.80% | 1 | 10.00% |
lucas de marchi | lucas de marchi | 1 | 1.27% | 1 | 10.00% |
| Total | 79 | 100.00% | 10 | 100.00% |
/*
 * Allocate and initialize a "struct eventpoll" on behalf of the current
 * user. On success the new structure is stored in *@pep and 0 is returned.
 * On allocation failure -ENOMEM is returned, the user reference taken here
 * is dropped again and *@pep is left untouched.
 */
static int ep_alloc(struct eventpoll **pep)
{
	struct user_struct *user = get_current_user();
	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

	if (unlikely(!ep)) {
		free_uid(user);
		return -ENOMEM;
	}

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;
	/* No event transfer is in progress yet */
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;
	return 0;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
davide libenzi | davide libenzi | 135 | 100.00% | 8 | 100.00% |
| Total | 135 | 100.00% | 8 | 100.00% |
/*
* Search the file inside the eventpoll tree. The RB tree operations
* are protected by the "mtx" mutex, and ep_find() must be called with
* "mtx" held.
*/
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
int kcmp;
struct rb_node *rbp;