Release 4.15 kernel/futex.c
/*
* Fast Userspace Mutexes (which I call "Futexes!").
* (C) Rusty Russell, IBM 2002
*
* Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
*
* Removed page pinning, fix privately mapped COW pages and other cleanups
* (C) Copyright 2003, 2004 Jamie Lokier
*
* Robust futex support started by Ingo Molnar
* (C) Copyright 2006 Red Hat Inc, All Rights Reserved
* Thanks to Thomas Gleixner for suggestions, analysis and fixes.
*
* PI-futex support started by Ingo Molnar and Thomas Gleixner
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
* PRIVATE futexes by Eric Dumazet
* Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
*
* Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
* Copyright (C) IBM Corporation, 2009
* Thanks to Thomas Gleixner for conceptual design and careful reviews.
*
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
*
* "The futexes are also cursed."
* "But they come in a choice of three flavours!"
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
#include <linux/fault-inject.h>
#include <asm/futex.h>
#include "locking/rtmutex_common.h"
/*
* READ this before attempting to hack on futexes!
*
* Basic futex operation and ordering guarantees
* =============================================
*
* The waiter reads the futex value in user space and calls
* futex_wait(). This function computes the hash bucket and acquires
* the hash bucket lock. After that it reads the futex user space value
* again and verifies that the data has not changed. If it has not changed
* it enqueues itself into the hash bucket, releases the hash bucket lock
* and schedules.
*
* The waker side modifies the user space value of the futex and calls
* futex_wake(). This function computes the hash bucket and acquires the
* hash bucket lock. Then it looks for waiters on that futex in the hash
* bucket and wakes them.
*
* In futex wake up scenarios where no tasks are blocked on a futex, taking
* the hb spinlock can be avoided and simply return. In order for this
* optimization to work, ordering guarantees must exist so that the waiter
* being added to the list is acknowledged when the list is concurrently being
* checked by the waker, avoiding scenarios like the following:
*
* CPU 0 CPU 1
* val = *futex;
* sys_futex(WAIT, futex, val);
* futex_wait(futex, val);
* uval = *futex;
* *futex = newval;
* sys_futex(WAKE, futex);
* futex_wake(futex);
* if (queue_empty())
* return;
* if (uval == val)
* lock(hash_bucket(futex));
* queue();
* unlock(hash_bucket(futex));
* schedule();
*
* This would cause the waiter on CPU 0 to wait forever because it
* missed the transition of the user space value from val to newval
* and the waker did not find the waiter in the hash bucket queue.
*
* The correct serialization ensures that a waiter either observes
* the changed user space value before blocking or is woken by a
* concurrent waker:
*
* CPU 0 CPU 1
* val = *futex;
* sys_futex(WAIT, futex, val);
* futex_wait(futex, val);
*
* waiters++; (a)
* smp_mb(); (A) <-- paired with -.
* |
* lock(hash_bucket(futex)); |
* |
* uval = *futex; |
* | *futex = newval;
* | sys_futex(WAKE, futex);
* | futex_wake(futex);
* |
* `--------> smp_mb(); (B)
* if (uval == val)
* queue();
* unlock(hash_bucket(futex));
* schedule(); if (waiters)
* lock(hash_bucket(futex));
* else wake_waiters(futex);
* waiters--; (b) unlock(hash_bucket(futex));
*
* Where (A) orders the waiters increment and the futex value read through
* atomic operations (see hb_waiters_inc) and where (B) orders the write
* to futex and the waiters read -- this is done by the barriers for both
* shared and private futexes in get_futex_key_refs().
*
* This yields the following case (where X:=waiters, Y:=futex):
*
* X = Y = 0
*
* w[X]=1 w[Y]=1
* MB MB
* r[Y]=y r[X]=x
*
* Which guarantees that x==0 && y==0 is impossible; which translates back into
* the guarantee that we cannot both miss the futex variable change and the
* enqueue.
*
* Note that a new waiter is accounted for in (a) even when it is possible that
* the wait call can return error, in which case we backtrack from it in (b).
* Refer to the comment in queue_lock().
*
* Similarly, in order to account for waiters being requeued on another
* address we always increment the waiters for the destination bucket before
* acquiring the lock. It then decrements them again after releasing it -
* the code that actually moves the futex(es) between hash buckets (requeue_futex)
* will do the additional required waiter count housekeeping. This is done for
* double_lock_hb() and double_unlock_hb(), respectively.
*/
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
#endif
/*
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
#ifdef CONFIG_MMU
# define FLAGS_SHARED 0x01
#else
/*
* NOMMU does not have per process address space. Let the compiler optimize
* code away.
*/
# define FLAGS_SHARED 0x00
#endif
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
/*
* Priority Inheritance state:
*/
struct futex_pi_state {
/*
* list of 'owned' pi_state instances - these have to be
* cleaned up in do_exit() if the task exits prematurely:
*/
struct list_head list;
/*
* The PI object:
*/
struct rt_mutex pi_mutex;
struct task_struct *owner;
atomic_t refcount;
union futex_key key;
} __randomize_layout;
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakeup is always to make the first condition true, then
* the second.
*
* PI futexes are typically woken before they are removed from the hash list via
* the rt_mutex code. See unqueue_me_pi().
*/
struct futex_q {
struct plist_node list;
struct task_struct *task;
spinlock_t *lock_ptr;
union futex_key key;
struct futex_pi_state *pi_state;
struct rt_mutex_waiter *rt_waiter;
union futex_key *requeue_pi_key;
u32 bitset;
} __randomize_layout;
static const struct futex_q futex_q_init = {
/* list gets initialized in queue_me()*/
.key = FUTEX_KEY_INIT,
.bitset = FUTEX_BITSET_MATCH_ANY
};
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
* waiting on a futex.
*/
struct futex_hash_bucket {
atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
} ____cacheline_aligned_in_smp;
/*
* The base of the bucket array and its size are always used together
* (after initialization only in hash_futex()), so ensure that they
* reside in the same cacheline.
*/
static struct {
struct futex_hash_bucket *queues;
unsigned long hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)
/*
* Fault injections for futexes.
*/
#ifdef CONFIG_FAIL_FUTEX
static struct {
struct fault_attr attr;
bool ignore_private;
} fail_futex = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_private = false,
};
static int __init setup_fail_futex(char *str)
{
return setup_fault_attr(&fail_futex.attr, str);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Davidlohr Bueso A | 22 | 100.00% | 1 | 100.00% |
Total | 22 | 100.00% | 1 | 100.00% |
__setup("fail_futex=", setup_fail_futex);
static bool should_fail_futex(bool fshared)
{
if (fail_futex.ignore_private && !fshared)
return false;
return should_fail(&fail_futex.attr, 1);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Davidlohr Bueso A | 31 | 96.88% | 1 | 50.00% |
Fengguang Wu | 1 | 3.12% | 1 | 50.00% |
Total | 32 | 100.00% | 2 | 100.00% |
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init fail_futex_debugfs(void)
{
umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_futex", NULL,
&fail_futex.attr);
if (IS_ERR(dir))
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-private", mode, dir,
&fail_futex.ignore_private)) {
debugfs_remove_recursive(dir);
return -ENOMEM;
}
return 0;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Davidlohr Bueso A | 81 | 100.00% | 1 | 100.00% |
Total | 81 | 100.00% | 1 | 100.00% |
late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
#else
static inline bool should_fail_futex(bool fshared)
{
return false;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Davidlohr Bueso A | 13 | 100.00% | 1 | 100.00% |
Total | 13 | 100.00% | 1 | 100.00% |
#endif /* CONFIG_FAIL_FUTEX */
static inline void futex_get_mm(union futex_key *key)
{
mmgrab(key->private.mm);
/*
* Ensure futex_get_mm() implies a full barrier such that
* get_futex_key() implies a full barrier. This is relied upon
* as smp_mb(); (B), see the ordering comment above.
*/
smp_mb__after_atomic();
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Davidlohr Bueso A | 23 | 92.00% | 2 | 50.00% |
Peter Zijlstra | 1 | 4.00% | 1 | 25.00% |
Vegard Nossum | 1 | 4.00% | 1 | 25.00% |
Total | 25 | 100.00% | 4 | 100.00% |
/*
* Reflects a new waiter being added to the waitqueue.
*/
static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
atomic_inc(&hb->waiters);
/*
* Full barrier (A), see the ordering comment above.
*/
smp_mb__after_atomic();
#endif
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Davidlohr Bueso A | 19 | 65.52% | 1 | 33.33% |
Linus Torvalds | 9 | 31.03% | 1 | 33.33% |
Peter Zijlstra | 1 | 3.45% | 1 | 33.33% |
Total | 29 | 100.00% | 3 | 100.00% |
/*
* Reflects a waiter being removed from the waitqueue by wakeup
* paths.
*/
static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
atomic_dec(&hb->waiters);
#endif
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Linus Torvalds | 25 | 100.00% | 1 | 100.00% |
Total | 25 | 100.00% | 1 | 100.00% |
static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
return atomic_read(&hb->waiters);
#else
return 1;
#endif
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Linus Torvalds | 17 | 54.84% | 1 | 50.00% |
Davidlohr Bueso A | 14 | 45.16% | 1 | 50.00% |
Total | 31 | 100.00% | 2 | 100.00% |
/**
* hash_futex - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
*
* We hash on the keys returned from get_futex_key (see below) and return the
* corresponding hash bucket in the global hash.
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
key->both.offset);
return &futex_queues[hash & (futex_hashsize - 1)];
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Rusty Russell | 50 | 70.42% | 3 | 42.86% |
Jamie Lokier | 16 | 22.54% | 1 | 14.29% |
Ingo Molnar | 2 | 2.82% | 1 | 14.29% |
Hugh Dickins | 2 | 2.82% | 1 | 14.29% |
Davidlohr Bueso A | 1 | 1.41% | 1 | 14.29% |
Total | 71 | 100.00% | 7 | 100.00% |
/**
* match_futex - Check whether two futex keys are equal
* @key1: Pointer to key1
* @key2: Pointer to key2
*
* Return 1 if two futex_keys are equal, 0 otherwise.
*/
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
return (key1 && key2
&& key1->both.word == key2->both.word
&& key1->both.ptr == key2->both.ptr
&& key1->both.offset == key2->both.offset);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Jamie Lokier | 43 | 71.67% | 1 | 20.00% |
Rusty Russell | 7 | 11.67% | 1 | 20.00% |
Hugh Dickins | 4 | 6.67% | 1 | 20.00% |
Darren Hart | 4 | 6.67% | 1 | 20.00% |
Ingo Molnar | 2 | 3.33% | 1 | 20.00% |
Total | 60 | 100.00% | 5 | 100.00% |
/*
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
*/
static void get_futex_key_refs(union futex_key *key)
{
if (!key->both.ptr)
return;
/*
* On MMU less systems futexes are always "private" as there is no per
* process address space. We need the smp wmb nevertheless - yes,
* arch/blackfin has MMU less SMP ...
*/
if (!IS_ENABLED(CONFIG_MMU)) {
smp_mb(); /* explicit smp_mb(); (B) */
return;
}
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
ihold(key->shared.inode); /* implies smp_mb(); (B) */
break;
case FUT_OFF_MMSHARED:
futex_get_mm(key); /* implies smp_mb(); (B) */
break;
default:
/*
* Private futexes do not hold reference on an inode or
* mm, therefore the only purpose of calling get_futex_key_refs
* is because we need the barrier for the lockless waiter check.
*/
smp_mb(); /* explicit smp_mb(); (B) */
}
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Peter Zijlstra | 57 | 68.67% | 1 | 14.29% |
Thomas Gleixner | 16 | 19.28% | 1 | 14.29% |
Davidlohr Bueso A | 5 | 6.02% | 3 | 42.86% |
Catalin Marinas | 4 | 4.82% | 1 | 14.29% |
Al Viro | 1 | 1.20% | 1 | 14.29% |
Total | 83 | 100.00% | 7 | 100.00% |
/*
* Drop a reference to the resource addressed by a key.
* The hash bucket spinlock must not be held. This is
* a no-op for private futexes, see comment in the get
* counterpart.
*/
static void drop_futex_key_refs(union futex_key *key)
{
if (!key->both.ptr) {
/* If we're here then we tried to put a key we failed to get */
WARN_ON_ONCE(1);
return;
}
if (!IS_ENABLED(CONFIG_MMU))
return;
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
iput(key->shared.inode);
break;
case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
break;
}
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Peter Zijlstra | 63 | 78.75% | 1 | 33.33% |
Thomas Gleixner | 9 | 11.25% | 1 | 33.33% |
Darren Hart | 8 | 10.00% | 1 | 33.33% |
Total | 80 | 100.00% | 3 | 100.00% |
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
* Return: a negative error code or 0
*
* The key words are stored in @key on success.
*
* For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct page *page, *tail;
struct address_space *mapping;
int err, ro = 0;
/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
if (unlikely((address % sizeof(u32)) != 0))
return -EINVAL;
address -= key->both.offset;
if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
* virtual address, we dont even have to find the underlying vma.
* Note : We do have to check 'uaddr' is a valid user address,
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
get_futex_key_refs(key); /* implies smp_mb(); (B) */
return 0;
}
again:
/* Ignore any VERIFY_READ mapping (futex common case) */
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
err = get_user_pages_fast(address, 1, 1, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
*/
if (err == -EFAULT && rw == VERIFY_READ) {
err = get_user_pages_fast(address, 1, 0, &page);
ro = 1;
}
if (err < 0)
return err;
else
err = 0;
/*
* The treatment of mapping from this point on is critical. The page
* lock protects many things but in this context the page lock
* stabilizes mapping, prevents inode freeing in the shared
* file-backed region case and guards against movement to swap cache.
*
* Strictly speaking the page lock is not needed in all cases being
* considered here and page lock forces unnecessarily serialization
* From this point on, mapping will be re-verified if necessary and
* page lock will be acquired only if it is unavoidable
*
* Mapping checks require the head page for any compound page so the
* head page and mapping is looked up now. For anonymous pages, it
* does not matter if the page splits in the future as the key is
* based on the address. For filesystem-backed pages, the tail is
* required as the index of the page determines the key. For
* base pages, there is no tail page and tail == page.
*/
tail = page;
page = compound_head(page);
mapping = READ_ONCE(page->mapping);
/*
* If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
* found it, but truncated or holepunched or subjected to
* invalidate_complete_page2 before we got the page lock (also
* cases which we are happy to fail). And we hold a reference,
* so refcount care in invalidate_complete_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
* an unlikely race, but we do need to retry for page->mapping.
*/
if (unlikely(!mapping)) {
int shmem_swizzled;
/*
* Page lock is required to identify which special case above
* applies. If this is really a shmem page then the page lock
* will prevent unexpected transitions.
*/
lock_page(page);
shmem_swizzled = PageSwapCache(page) || page->mapping;
unlock_page(page);
put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
}
/*
* Private mappings are handled in a simple way.
*
* If the futex key is stored on an anonymous page, then the associated
* object is the mm which is implicitly pinned by the calling process.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
*/
if (unlikely(should_fail_futex(fshared)) || ro) {
err = -EFAULT;
goto out;
}
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
get_futex_key_refs(key); /* implies smp_mb(); (B) */
} else {
struct inode *inode;
/*
* The associated futex object in this case is the inode and
* the page->mapping must be traversed. Ordinarily this should
* be stabilised under page lock but it's not strictly
* necessary in this case as we just want to pin the inode, not
* update the radix tree or anything like that.
*
* The RCU read lock is taken as the inode is finally freed
* under RCU. If the mapping still matches expectations then the
* mapping->host can be safely accessed as being a valid inode.
*/
rcu_read_lock();
if (READ_ONCE(page->mapping) != mapping) {
rcu_read_unlock();
put_page(page);
goto again;
}
inode = READ_ONCE(mapping->host);
if (!inode) {
rcu_read_unlock();
put_page(page);
goto again;
}
/*
* Take a reference unless it is about to be freed. Previously
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
* truncated in parallel which is almost certainly an
* application bug. In such a case, just retry.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
put_page(page);
goto again;
}
/* Should be impossible but lets be paranoid for now */
if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
err = -EFAULT;
rcu_read_unlock();
iput(inode);
goto out;
}
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = inode;
key->shared.pgoff = basepage_index(tail);
rcu_read_unlock();
}
out:
put_page(page);
return err;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Mel Gorman | 158 | 29.15% | 3 | 11.54% |
Jamie Lokier | 86 | 15.87% | 2 | 7.69% |
Shawn Bohrer | 69 | 12.73% | 1 | 3.85% |
Ingo Molnar | 60 | 11.07% | 5 | 19.23% |
Eric Dumazet | 54 | 9.96% | 1 | 3.85% |
Davidlohr Bueso A | 38 | 7.01% | 2 | 7.69% |
Peter Zijlstra | 32 | 5.90% | 4 | 15.38% |
Hugh Dickins | 13 | 2.40% | 2 | 7.69% |
Kirill A. Shutemov | 12 | 2.21% | 1 | 3.85% |
Linus Torvalds | 8 | 1.48% | 1 | 3.85% |
Andrea Arcangeli | 7 | 1.29% | 1 | 3.85% |
Zhang Yi | 3 | 0.55% | 1 | 3.85% |
Adrian Bunk | 1 | 0.18% | 1 | 3.85% |
Motohiro Kosaki | 1 | 0.18% | 1 | 3.85% |
Total | 542 | 100.00% | 26 | 100.00% |
static inline void put_futex_key(union futex_key *key)
{
drop_futex_key_refs(key);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Jamie Lokier | 13 | 76.47% | 1 | 33.33% |
Peter Zijlstra | 3 | 17.65% | 1 | 33.33% |
Adrian Bunk | 1 | 5.88% | 1 | 33.33% |
Total | 17 | 100.00% | 3 | 100.00% |
/**
* fault_in_user_writeable() - Fault in user address and verify RW access
* @uaddr: pointer to faulting user space address
*
* Slow path to fixup the fault we just took in the atomic write
* access to @uaddr.
*
* We have no generic implementation of a non-destructive write to the
* user address. We know that we faulted in the atomic pagefault
* disabled section so we can as well avoid the #PF overhead by
* calling get_user_pages() right away.
*/
static int fault_in_user_writeable(u32 __user *uaddr)
{
struct mm_struct *mm = current->mm;
int ret;
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Thomas Gleixner | 34 | 50.75% | 1 | 25.00% |
Andi Kleen | 29 | 43.28% | 1 | 25.00% |
Dominik Dingel | 2 | 2.99% | 1 | 25.00% |
Benjamin Herrenschmidt | 2 | 2.99% | 1 | 25.00% |
Total | 67 | 100.00% | 4 | 100.00% |
/**
* futex_top_waiter() - Return the highest priority waiter on a futex
* @hb: the hash bucket the futex_q's reside in
* @key: the futex key (to distinguish it from other futex futex_q's)
*
* Must be called with the hb lock held.
*/
static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
union futex_key *key)
{
struct futex_q *this;
plist_for_each_entry(this, &hb->chain, list) {
if (match_futex(&this->key, key))
return this;
}
return NULL;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Darren Hart | 51 | 100.00% | 1 | 100.00% |
Total | 51 | 100.00% | 1 | 100.00% |
static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
u32 uval, u32 newval)
{
int ret;
pagefault_disable();
ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
pagefault_enable();
return ret;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Thomas Gleixner | 35 | 76.09% | 1 | 50.00% |
Michel Lespinasse | 11 | 23.91% | 1 | 50.00% |
Total | 46 | 100.00% | 2 | 100.00% |
static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
int ret;
pagefault_disable();
ret = __get_user(*dest, from);
pagefault_enable();
return ret ? -EFAULT : 0;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Olof Johansson | 36 | 85.71% | 1 | 25.00% |
Ingo Molnar | 2 | 4.76% | 1 | 25.00% |
Peter Zijlstra | 2 | 4.76% | 1 | 25.00% |
Linus Torvalds | 2 | 4.76% | 1 | 25.00% |
Total | 42 | 100.00% | 4 | 100.00% |
/*
* PI code:
*/
static int refill_pi_state_cache(void)
{
struct futex_pi_state *pi_state;
if (likely(current->pi_state_cache))
return 0;
pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
if (!pi_state)
return -ENOMEM;
INIT_LIST_HEAD(&pi_state->list);
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
atomic_set(&pi_state->refcount, 1);
pi_state->key = FUTEX_KEY_INIT;
current->pi_state_cache = pi_state;
return 0;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Ingo Molnar | 52 | 59.77% | 4 | 36.36% |
Rusty Russell | 14 | 16.09% | 4 | 36.36% |
Jamie Lokier | 14 | 16.09% | 1 | 9.09% |
Peter Zijlstra | 6 | 6.90% | 1 | 9.09% |
Burman Yan | 1 | 1.15% | 1 | 9.09% |
Total | 87 | 100.00% | 11 | 100.00% |
static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
WARN_ON(!pi_state);
current->pi_state_cache = NULL;
return pi_state;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Ingo Molnar | 22 | 64.71% | 2 | 66.67% |
Jakub Jelínek | 12 | 35.29% | 1 | 33.33% |
Total | 34 | 100.00% | 3 | 100.00% |
static void get_pi_state(struct futex_pi_state *pi_state)
{
WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Peter Zijlstra | 23 | 100.00% | 1 | 100.00% |
Total | 23 | 100.00% | 1 | 100.00% |
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
*/
static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
if (!atomic_dec_and_test(&pi_state->refcount))
return;
/*
* If pi_state->owner is NULL, the owner is most probably dying
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
struct task_struct *owner;
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
owner = pi_state->owner;
if (owner) {
raw_spin_lock(&owner->pi_lock);
list_del_init(&pi_state->list);
raw_spin_unlock(&owner->pi_lock);
}
rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
}
if (current->pi_state_cache) {
kfree(pi_state);
} else {
/*
* pi_state->list is already empty.
* clear pi_state->owner.
* refcount is at 0 - put it back to 1.
*/
pi_state->owner = NULL;
atomic_set(&pi_state->refcount, 1);
current->pi_state_cache = pi_state;
}
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Ingo Molnar | 65 | 43.92% | 1 | 16.67% |
Jakub Jelínek | 38 | 25.68% | 1 | 16.67% |
Peter Zijlstra | 37 | 25.00% | 1 | 16.67% |
Brian Silverman | 6 | 4.05% | 1 | 16.67% |
Thomas Gleixner | 2 | 1.35% | 2 | 33.33% |
Total | 148 | 100.00% | 6 | 100.00% |
/*
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
rcu_read_lock();
p = find_task_by_vpid(pid);
if (p)
get_task_struct(p);
rcu_read_unlock();
return p;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Ingo Molnar | 32 | 78.05% | 1 | 16.67% |
Oleg Nesterov | 4 | 9.76% | 1 | 16.67% |
David Howells | 3 | 7.32% | 2 | 33.33% |
Jakub Jelínek | 1 | 2.44% | 1 | 16.67% |
Pavel Emelyanov | 1 | 2.44% | 1 | 16.67% |
Total | 41 | 100.00% | 6 | 100.00% |
#ifdef CONFIG_FUTEX_PI
/*
* This task is holding PI mutexes at exit time => bad.
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*/
void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
if (!futex_cmpxchg_enabled)
return;
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
* versus waiters unqueueing themselves:
*/
raw_spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
hb = hash_futex(&key);
/*
* We can race against put_pi_state() removing itself from the
* list (a waiter going away). put_pi_state() will first
* decrement the reference count and then modify the list, so
* its possible to see the list entry but fail this reference
* acquire.
*
* In that case; drop the locks to let put_pi_state() make
* progress and retry the loop.
*/
if (!atomic_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
raw_spin_lock(&curr->pi_lock);
/*
* We dropped the pi-lock, so re-check whether this
* task still owns the PI-state:
*/
if (head->next != next) {
/* retain curr->pi_lock for the loop invariant */
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
put_pi_state(pi_state);
continue;
}
WARN_ON(pi_state->owner != curr);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Ingo Molnar | 175 | 60.55% | 3 | 30.00% |
Peter Zijlstra | 86 | 29.76% | 4 | 40.00% |
Jakub Jelínek | 17 | 5.88% | 1 | 10.00% |
Thomas Gleixner | 11 | 3.81% | 2 | 20.00% |
Total | 289 | 100.00% | 10 | 100.00% |
#endif
/*
* We need to check the following states:
*
* Waiter | pi_state | pi->owner | uTID | uODIED | ?
*
* [1] NULL | --- | --- | 0 | 0/1 | Valid
* [2] NULL | --- | --- | >0 | 0/1 | Valid
*
* [3] Found | NULL | -- | Any | 0/1 | Invalid
*
* [4] Found | Found | NULL | 0 | 1 | Valid
* [5] Found | Found | NULL | >0 | 1 | Invalid
*
* [6] Found | Found | task | 0 | 1 | Valid
*
* [7] Found | Found | NULL | Any | 0 | Invalid
*
* [8] Found | Found | task | ==taskTID | 0/1 | Valid
* [9] Found | Found | task | 0 | 0 | Invalid
* [10] Found | Found | task | !=taskTID | 0/1 | Invalid
*
* [1] Indicates that the kernel can acquire the futex atomically. We
* came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
*
* [2] Valid, if TID does not belong to a kernel thread. If no matching
* thread is found then it indicates that the owner TID has died.
*
* [3] Invalid. The waiter is queued on a non PI futex
*
* [4] Valid state after exit_robust_list(), which sets the user space
* value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
*
* [5] The user space value got manipulated between exit_robust_list()
* and exit_pi_state_list()
*
* [6] Valid state after exit_pi_state_list() which sets the new owner in
* the pi_state but cannot access the user space value.
*
* [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
*
* [8] Owner and user space value match
*
* [9] There is no transient state which sets the user space TID to 0
* except exit_robust_list(), but this is indicated by the
* FUTEX_OWNER_DIED bit. See [4]
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
*
*
* Serialization and lifetime rules:
*
* hb->lock:
*
* hb -> futex_q, relation
* futex_q -> pi_state, relation
*
* (cannot be raw because hb can contain arbitrary amount
* of futex_q's)
*
* pi_mutex->wait_lock:
*
* {uval, pi_state}
*
* (and pi_mutex 'obviously')
*
* p->pi_lock:
*
* p->pi_state_list -> pi_state->list, relation
*
* pi_state->refcount:
*
* pi_state lifetime
*
*
* Lock order:
*
* hb->lock
* pi_mutex->wait_lock
* p->pi_lock
*
*/
/*
* Validate that the existing waiter has a pi_state and sanity check
* the pi_state against the user space value. If correct, attach to
* it.
*/
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
u32 uval2;
int ret;
/*
* Userspace might have messed up non-PI and PI futexes [3]
*/
if (unlikely(!pi_state))
return -EINVAL;
/*
* We get here with hb->lock held, and having found a
* futex_top_waiter(). This means that futex_lock_pi() of said futex_q
* has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
* which in turn means that futex_lock_pi() still has a reference on
* our pi_state.
*
* The waiter holding a reference on @pi_state also protects against
* the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
* and futex_wait_requeue_pi() as it cannot go to 0 and consequently
* free pi_state before we can take a reference ourselves.
*/
WARN_ON(!atomic_read(&pi_state->refcount));
/*
* Now that we have a pi_state, we can acquire wait_lock
* and do the state validation.
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Since {uval, pi_state} is serialized by wait_lock, and our current
* uval was read without holding it, it can have changed. Verify it
* still is what we expect it to be, otherwise retry the entire
* operation.
*/
if (get_futex_value_locked(&uval2, uaddr))
goto out_efault;
if (uval != uval2)
goto out_eagain;
/*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
/*
* exit_pi_state_list sets owner to NULL and wakes the
* topmost waiter. The task which acquires the
* pi_state->rt_mutex will fixup owner.
*/
if (!pi_state->owner) {
/*
* No pi state owner, but the user space TID
* is not 0. Inconsistent state. [5]
*/
if (pid)
goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
goto out_attach;
}
/*
* If TID is 0, then either the dying owner has not
* yet executed exit_pi_state_list() or some waiter
* acquired the rtmutex in the pi state, but did not
* yet fixup the TID in user space.
*
* Take a ref on the state and return success. [6]
*/
if (!pid)
goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
goto out_einval;
}
/*
* Bail out if user space manipulated the futex value. If pi
* state exists then the owner TID must be the same as the
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
goto out_einval;
out_attach:
get_pi_state(pi_state);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
out_einval:
ret = -EINVAL;
goto out_error;
out_eagain:
ret = -EAGAIN;
goto out_error;
out_efault:
ret = -EFAULT;
goto out_error;
out_error:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Peter Zijlstra | 114 | 48.10% | 4 | 30.77% |
Thomas Gleixner | 69 | 29.11% | 5 | 38.46% |
Ingo Molnar | 37 | 15.61% | 2 | 15.38% |
Alexey Kuznetsov | 10 | 4.22% | 1 | 7.69% |
Pierre Peiffer | 7 | 2.95% | 1 | 7.69% |
Total | 237 | 100.00% | 13 | 100.00% |
/*
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
*/
static int attach_to_pi_owner(u32 uval, union futex_key *key,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
struct futex_pi_state *pi_state;
struct task_struct *p;
/*
* We are the first waiter - try to look up the real owner and attach
* the new pi_state to it, but bail out when TID = 0 [1]
*/
if (!pid)
return -ESRCH;
p = futex_find_get_task(pid);
if (!p)
return -ESRCH;
if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
return -EPERM;
}
/*
* We need to look at the task state flags to figure out,
* whether the task is exiting. To protect against the do_exit
* change of the task flags, we do this protected by
* p->pi_lock:
*/
raw_spin_lock_irq(&p->pi_lock);
if (unlikely(p->flags & PF_EXITING)) {
/*
* The task is on the way out. When PF_EXITPIDONE is
* set, we know that the task has finished the
* cleanup:
*/
int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
return ret;
}
/*
* No existing pi state. First waiter. [2]
*
* This creates pi_state, we have hb->lock held, this means nothing can
* observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
/*
* Initialize the pi_mutex in locked state and make @p
* the owner of it:
*/
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
/* Store the key for possible exit cleanups: */
pi_state->key = *key;
WARN_ON(!list_empty(&