cregit-Linux: how code gets into the kernel

Release 4.15 kernel/futex.c

Directory: kernel
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
#include <linux/fault-inject.h>

#include <asm/futex.h>

#include "locking/rtmutex_common.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wakeup scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and the waker can simply return. In order for this
 * optimization to work, ordering guarantees must exist so that the waiter
 * being added to the list is acknowledged when the list is concurrently being
 * checked by the waker, avoiding scenarios like the following:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                     *futex = newval;
 *                                     sys_futex(WAKE, futex);
 *                                       futex_wake(futex);
 *                                       if (queue_empty())
 *                                         return;
 *   if (uval == val)
 *      lock(hash_bucket(futex));
 *      queue();
 *     unlock(hash_bucket(futex));
 *     schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                                 CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |        *futex = newval;
 *                                  |        sys_futex(WAKE, futex);
 *                                  |          futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                         if (waiters)
 *                                           lock(hash_bucket(futex));
 *   else                                    wake_waiters(futex);
 *     waiters--; (b)                        unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see hb_waiters_inc) and where (B) orders the write
 * to futex and the waiters read -- this is done by the barriers for both
 * shared and private futexes in get_futex_key_refs().
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *      X = Y = 0
 *
 *      w[X]=1          w[Y]=1
 *      MB              MB
 *      r[Y]=y          r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return error, in which case we backtrack from it in (b).
 * Refer to the comment in queue_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock, and decrement them again after releasing it -
 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done for
 * double_lock_hb() and double_unlock_hb(), respectively.
 */
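
/*
 * --- Illustrative example (not part of futex.c) ---
 * The protocol described above has a direct user-space counterpart: the
 * waiter only blocks if the futex word still holds the value it read, and
 * the waker publishes the new value before issuing the wake. A minimal
 * user-space sketch of that pattern (error handling omitted):
 */
#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_uint futex_word;		/* the user-space futex value */

static long futex(atomic_uint *uaddr, int op, unsigned int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void waiter(void)
{
	unsigned int val = atomic_load(&futex_word);

	/* The kernel re-checks *uaddr == val under the hb lock before sleeping. */
	futex(&futex_word, FUTEX_WAIT_PRIVATE, val);
}

static void waker(void)
{
	atomic_store(&futex_word, 1);			/* write the new value first ... */
	futex(&futex_word, FUTEX_WAKE_PRIVATE, 1);	/* ... then wake one waiter */
}
/* --- end example --- */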

#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
#endif

/*
 * Futex flags used to encode options to functions and preserve them across
 * restarts.
 */
#ifdef CONFIG_MMU
# define FLAGS_SHARED		0x01
#else
/*
 * NOMMU does not have per process address space. Let the compiler optimize
 * code away.
 */
# define FLAGS_SHARED		0x00
#endif
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04

/*
 * Priority Inheritance state:
 */

struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex pi_mutex;

	struct task_struct *owner;
	atomic_t refcount;

	union futex_key key;
} __randomize_layout;

/**
 * struct futex_q - The hashed futex queue entry, one per waiting task
 * @list:               priority-sorted list of tasks waiting on this futex
 * @task:               the task waiting on the futex
 * @lock_ptr:           the hash bucket lock
 * @key:                the key the futex is hashed on
 * @pi_state:           optional priority inheritance state
 * @rt_waiter:          rt_waiter storage for use with requeue_pi
 * @requeue_pi_key:     the requeue_pi target futex key
 * @bitset:             bitset for the optional bitmasked wakeup
 *
 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
 * the rt_mutex code. See unqueue_me_pi().
 */

struct futex_q {
	struct plist_node list;

	struct task_struct *task;
	spinlock_t *lock_ptr;
	union futex_key key;
	struct futex_pi_state *pi_state;
	struct rt_mutex_waiter *rt_waiter;
	union futex_key *requeue_pi_key;
	u32 bitset;
} __randomize_layout;


static const struct futex_q futex_q_init = {
	/* list gets initialized in queue_me()*/
	.key = FUTEX_KEY_INIT,
	.bitset = FUTEX_BITSET_MATCH_ANY
};
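
/*
 * --- Illustrative example (not part of futex.c) ---
 * The kernel-doc above defines when a futex_q counts as woken:
 * plist_node_empty(&q->list) || !q->lock_ptr. A hypothetical helper that
 * simply encodes that documented condition would look like this:
 */
static inline bool futex_q_is_woken(struct futex_q *q)
{
	return plist_node_empty(&q->list) || !q->lock_ptr;
}
/* --- end example --- */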

/*
 * Hash buckets are shared by all the futex_keys that hash to the same
 * location.  Each key may have multiple futex_q structures, one for each task
 * waiting on a futex.
 */

struct futex_hash_bucket {
	atomic_t waiters;
	spinlock_t lock;
	struct plist_head chain;
} ____cacheline_aligned_in_smp;

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in hash_futex()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long            hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));

#define futex_queues   (__futex_data.queues)

#define futex_hashsize (__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Davidlohr Bueso A         22       100.00%   1         100.00%
Total                     22       100.00%   1         100.00%

__setup("fail_futex=", setup_fail_futex);
static bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Davidlohr Bueso A         31       96.88%    1         50.00%
Fengguang Wu              1        3.12%     1         50.00%
Total                     32       100.00%   2         100.00%

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	if (!debugfs_create_bool("ignore-private", mode, dir,
				 &fail_futex.ignore_private)) {
		debugfs_remove_recursive(dir);
		return -ENOMEM;
	}

	return 0;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Davidlohr Bueso A         81       100.00%   1         100.00%
Total                     81       100.00%   1         100.00%

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else
static inline bool should_fail_futex(bool fshared)
{
	return false;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Davidlohr Bueso A         13       100.00%   1         100.00%
Total                     13       100.00%   1         100.00%

#endif /* CONFIG_FAIL_FUTEX */
static inline void futex_get_mm(union futex_key *key)
{
	mmgrab(key->private.mm);
	/*
	 * Ensure futex_get_mm() implies a full barrier such that
	 * get_futex_key() implies a full barrier. This is relied upon
	 * as smp_mb(); (B), see the ordering comment above.
	 */
	smp_mb__after_atomic();
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Davidlohr Bueso A         23       92.00%    2         50.00%
Peter Zijlstra            1        4.00%     1         25.00%
Vegard Nossum             1        4.00%     1         25.00%
Total                     25       100.00%   4         100.00%

/*
 * Reflects a new waiter being added to the waitqueue.
 */
static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_inc(&hb->waiters);
	/*
	 * Full barrier (A), see the ordering comment above.
	 */
	smp_mb__after_atomic();
#endif
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Davidlohr Bueso A         19       65.52%    1         33.33%
Linus Torvalds            9        31.03%    1         33.33%
Peter Zijlstra            1        3.45%     1         33.33%
Total                     29       100.00%   3         100.00%

/*
 * Reflects a waiter being removed from the waitqueue by wakeup
 * paths.
 */
static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_dec(&hb->waiters);
#endif
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Linus Torvalds            25       100.00%   1         100.00%
Total                     25       100.00%   1         100.00%


static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	return atomic_read(&hb->waiters);
#else
	return 1;
#endif
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Linus Torvalds            17       54.84%    1         50.00%
Davidlohr Bueso A         14       45.16%    1         50.00%
Total                     31       100.00%   2         100.00%

/**
 * hash_futex - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32*)&key->both.word,
			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Rusty Russell             50       70.42%    3         42.86%
Jamie Lokier              16       22.54%    1         14.29%
Ingo Molnar               2        2.82%     1         14.29%
Hugh Dickins              2        2.82%     1         14.29%
Davidlohr Bueso A         1        1.41%     1         14.29%
Total                     71       100.00%   7         100.00%
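
/*
 * --- Illustrative example (not part of futex.c) ---
 * The bucket lookup in hash_futex() relies on futex_hashsize being a power
 * of two, so masking with (futex_hashsize - 1) is equivalent to taking the
 * hash modulo the table size. A standalone sketch of that arithmetic (the
 * names here are illustrative, not kernel symbols):
 */
#include <assert.h>
#include <stdint.h>

static unsigned long bucket_index(uint32_t hash, unsigned long table_size)
{
	assert((table_size & (table_size - 1)) == 0);	/* power of two */
	return hash & (table_size - 1);			/* same as hash % table_size */
}
/* --- end example --- */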

/**
 * match_futex - Check whether two futex keys are equal
 * @key1:	Pointer to key1
 * @key2:	Pointer to key2
 *
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1 && key2
		&& key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Jamie Lokier              43       71.67%    1         20.00%
Rusty Russell             7        11.67%    1         20.00%
Hugh Dickins              4        6.67%     1         20.00%
Darren Hart               4        6.67%     1         20.00%
Ingo Molnar               2        3.33%     1         20.00%
Total                     60       100.00%   5         100.00%

/*
 * Take a reference to the resource addressed by a key.
 * Can be called while holding spinlocks.
 */
static void get_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr)
		return;

	/*
	 * On MMU less systems futexes are always "private" as there is no per
	 * process address space. We need the smp wmb nevertheless - yes,
	 * arch/blackfin has MMU less SMP ...
	 */
	if (!IS_ENABLED(CONFIG_MMU)) {
		smp_mb(); /* explicit smp_mb(); (B) */
		return;
	}

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		ihold(key->shared.inode); /* implies smp_mb(); (B) */
		break;
	case FUT_OFF_MMSHARED:
		futex_get_mm(key); /* implies smp_mb(); (B) */
		break;
	default:
		/*
		 * Private futexes do not hold reference on an inode or
		 * mm, therefore the only purpose of calling get_futex_key_refs
		 * is because we need the barrier for the lockless waiter check.
		 */
		smp_mb(); /* explicit smp_mb(); (B) */
	}
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Peter Zijlstra            57       68.67%    1         14.29%
Thomas Gleixner           16       19.28%    1         14.29%
Davidlohr Bueso A         5        6.02%     3         42.86%
Catalin Marinas           4        4.82%     1         14.29%
Al Viro                   1        1.20%     1         14.29%
Total                     83       100.00%   7         100.00%

/*
 * Drop a reference to the resource addressed by a key.
 * The hash bucket spinlock must not be held. This is
 * a no-op for private futexes, see comment in the get
 * counterpart.
 */
static void drop_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr) {
		/* If we're here then we tried to put a key we failed to get */
		WARN_ON_ONCE(1);
		return;
	}

	if (!IS_ENABLED(CONFIG_MMU))
		return;

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		iput(key->shared.inode);
		break;
	case FUT_OFF_MMSHARED:
		mmdrop(key->private.mm);
		break;
	}
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Peter Zijlstra            63       78.75%    1         33.33%
Thomas Gleixner           9        11.25%    1         33.33%
Darren Hart               8        10.00%    1         33.33%
Total                     80       100.00%   3         100.00%
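
/*
 * --- Illustrative example (not part of futex.c) ---
 * get_futex_key_refs()/drop_futex_key_refs() dispatch on the low bits of
 * key->both.offset, using the FUT_OFF_INODE and FUT_OFF_MMSHARED constants
 * from <linux/futex.h>. A hypothetical helper (not a kernel function) that
 * classifies a key the same way:
 */
static const char *futex_key_type(const union futex_key *key)
{
	switch (key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:	return "shared, file-backed (inode pinned)";
	case FUT_OFF_MMSHARED:	return "shared, anonymous (mm pinned)";
	default:		return "private (no reference held)";
	}
}
/* --- end example --- */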

/** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: VERIFY_READ, * VERIFY_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * * lock_page() might sleep, the caller should not hold a spinlock. */
static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page, *tail; struct address_space *mapping; int err, ro = 0; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { key->private.mm = mm; key->private.address = address; get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(fshared))) return -EFAULT; err = get_user_pages_fast(address, 1, 1, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == VERIFY_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The page * lock protects many things but in this context the page lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the page lock is not needed in all cases being * considered here and page lock forces unnecessarily serialization * From this point on, mapping will be re-verified if necessary and * page lock will be acquired only if it is unavoidable * * Mapping checks require the head page for any compound page so the * head page and mapping is looked up now. For anonymous pages, it * does not matter if the page splits in the future as the key is * based on the address. For filesystem-backed pages, the tail is * required as the index of the page determines the key. For * base pages, there is no tail page and tail == page. */ tail = page; page = compound_head(page); mapping = READ_ONCE(page->mapping); /* * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the page lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_complete_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for page->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* * Page lock is required to identify which special case above * applies. If this is really a shmem page then the page lock * will prevent unexpected transitions. */ lock_page(page); shmem_swizzled = PageSwapCache(page) || page->mapping; unlock_page(page); put_page(page); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. 
* * If the futex key is stored on an anonymous page, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (PageAnon(page)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(fshared)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; get_futex_key_refs(key); /* implies smp_mb(); (B) */ } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the page->mapping must be traversed. Ordinarily this should * be stabilised under page lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update the radix tree or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(page->mapping) != mapping) { rcu_read_unlock(); put_page(page); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); put_page(page); goto again; } /* * Take a reference unless it is about to be freed. Previously * this reference was taken by ihold under the page lock * pinning the inode in place so i_lock was unnecessary. The * only way for this check to fail is if the inode was * truncated in parallel which is almost certainly an * application bug. In such a case, just retry. * * We are not calling into get_futex_key_refs() in file-backed * cases, therefore a successful atomic_inc return below will * guarantee that get_futex_key() will still imply smp_mb(); (B). */ if (!atomic_inc_not_zero(&inode->i_count)) { rcu_read_unlock(); put_page(page); goto again; } /* Should be impossible but lets be paranoid for now */ if (WARN_ON_ONCE(inode->i_mapping != mapping)) { err = -EFAULT; rcu_read_unlock(); iput(inode); goto out; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.inode = inode; key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } out: put_page(page); return err; }

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Mel Gorman                158      29.15%    3         11.54%
Jamie Lokier              86       15.87%    2         7.69%
Shawn Bohrer              69       12.73%    1         3.85%
Ingo Molnar               60       11.07%    5         19.23%
Eric Dumazet              54       9.96%     1         3.85%
Davidlohr Bueso A         38       7.01%     2         7.69%
Peter Zijlstra            32       5.90%     4         15.38%
Hugh Dickins              13       2.40%     2         7.69%
Kirill A. Shutemov        12       2.21%     1         3.85%
Linus Torvalds            8        1.48%     1         3.85%
Andrea Arcangeli          7        1.29%     1         3.85%
Zhang Yi                  3        0.55%     1         3.85%
Adrian Bunk               1        0.18%     1         3.85%
Motohiro Kosaki           1        0.18%     1         3.85%
Total                     542      100.00%   26        100.00%
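
/*
 * --- Illustrative example (not part of futex.c) ---
 * get_futex_key() keys private futexes on (mm, address) and shared
 * file-backed futexes on (inode, page offset). From user space the
 * distinction is whether the PRIVATE futex ops are used and whether the
 * futex word lives in a shared mapping. A sketch (error handling omitted;
 * fd is assumed to refer to a file both processes open):
 */
#include <linux/futex.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t private_word;		/* keyed on current->mm + address */

static uint32_t *make_shared_word(int fd)
{
	/* A word in a MAP_SHARED mapping is keyed on inode + page index. */
	return mmap(NULL, sizeof(uint32_t), PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
}

static long wait_private(uint32_t val)
{
	return syscall(SYS_futex, &private_word, FUTEX_WAIT_PRIVATE, val,
		       NULL, NULL, 0);
}

static long wait_shared(uint32_t *shared_word, uint32_t val)
{
	/* No FUTEX_PRIVATE_FLAG: the kernel takes the shared-key path above. */
	return syscall(SYS_futex, shared_word, FUTEX_WAIT, val,
		       NULL, NULL, 0);
}
/* --- end example --- */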


static inline void put_futex_key(union futex_key *key)
{
	drop_futex_key_refs(key);
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Jamie Lokier              13       76.47%    1         33.33%
Peter Zijlstra            3        17.65%    1         33.33%
Adrian Bunk               1        5.88%     1         33.33%
Total                     17       100.00%   3         100.00%

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
static int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	up_read(&mm->mmap_sem);

	return ret < 0 ? ret : 0;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Thomas Gleixner           34       50.75%    1         25.00%
Andi Kleen                29       43.28%    1         25.00%
Dominik Dingel            2        2.99%     1         25.00%
Benjamin Herrenschmidt    2        2.99%     1         25.00%
Total                     67       100.00%   4         100.00%

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
					union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (match_futex(&this->key, key))
			return this;
	}
	return NULL;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Darren Hart               51       100.00%   1         100.00%
Total                     51       100.00%   1         100.00%


static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
				      u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Thomas Gleixner           35       76.09%    1         50.00%
Michel Lespinasse         11       23.91%    1         50.00%
Total                     46       100.00%   2         100.00%


static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Olof Johansson            36       85.71%    1         25.00%
Ingo Molnar               2        4.76%     1         25.00%
Peter Zijlstra            2        4.76%     1         25.00%
Linus Torvalds            2        4.76%     1         25.00%
Total                     42       100.00%   4         100.00%

/*
 * PI code:
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	atomic_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Ingo Molnar               52       59.77%    4         36.36%
Rusty Russell             14       16.09%    4         36.36%
Jamie Lokier              14       16.09%    1         9.09%
Peter Zijlstra            6        6.90%     1         9.09%
Burman Yan                1        1.15%     1         9.09%
Total                     87       100.00%   11        100.00%


static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Ingo Molnar               22       64.71%    2         66.67%
Jakub Jelínek             12       35.29%    1         33.33%
Total                     34       100.00%   3         100.00%


static void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Peter Zijlstra            23       100.00%   1         100.00%
Total                     23       100.00%   1         100.00%

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
static void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		struct task_struct *owner;

		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		owner = pi_state->owner;
		if (owner) {
			raw_spin_lock(&owner->pi_lock);
			list_del_init(&pi_state->list);
			raw_spin_unlock(&owner->pi_lock);
		}
		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		atomic_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Ingo Molnar               65       43.92%    1         16.67%
Jakub Jelínek             38       25.68%    1         16.67%
Peter Zijlstra            37       25.00%    1         16.67%
Brian Silverman           6        4.05%     1         16.67%
Thomas Gleixner           2        1.35%     2         33.33%
Total                     148      100.00%   6         100.00%
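
/*
 * --- Illustrative example (not part of futex.c) ---
 * get_pi_state()/put_pi_state() follow the "increment only if not already
 * zero" refcount discipline that exit_pi_state_list() below relies on: a
 * lookup that loses the race against the final put must fail rather than
 * resurrect the object. A standalone sketch of that pattern with C11
 * atomics (names are illustrative, not kernel symbols):
 */
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcount;	/* 0 means the object is being torn down */
};

/* Succeeds only while at least one reference is still held. */
static bool obj_tryget(struct obj *o)
{
	int old = atomic_load(&o->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
			return true;	/* took a reference */
	}
	return false;			/* lost the race against the final put */
}
/* --- end example --- */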

/*
 * Look up the task based on what TID userspace gave us.
 * We dont trust it.
 */
static struct task_struct *futex_find_get_task(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		get_task_struct(p);

	rcu_read_unlock();

	return p;
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Ingo Molnar               32       78.05%    1         16.67%
Oleg Nesterov             4        9.76%     1         16.67%
David Howells             3        7.32%     2         33.33%
Jakub Jelínek             1        2.44%     1         16.67%
Pavel Emelyanov           1        2.44%     1         16.67%
Total                     41       100.00%   6         100.00%

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * its possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case; drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!atomic_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Ingo Molnar               175      60.55%    3         30.00%
Peter Zijlstra            86       29.76%    4         40.00%
Jakub Jelínek             17       5.88%     1         10.00%
Thomas Gleixner           11       3.81%     2         20.00%
Total                     289      100.00%   10        100.00%
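
/*
 * --- Illustrative example (not part of futex.c) ---
 * The PI machinery above implements the kernel side of the FUTEX_LOCK_PI
 * protocol documented in futex(2): user space tries to cmpxchg its TID
 * into a zero futex word and only enters the kernel on contention, where
 * FUTEX_WAITERS/FUTEX_OWNER_DIED are managed in the word's high bits. A
 * user-space sketch of that protocol (tid is the caller's kernel TID from
 * gettid(); robust-list handling omitted):
 */
#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex_pi(atomic_uint *uaddr, int op)
{
	return syscall(SYS_futex, uaddr, op, 0, NULL, NULL, 0);
}

static void pi_lock(atomic_uint *word, unsigned int tid)
{
	unsigned int expected = 0;

	/* Uncontended fast path: 0 -> TID entirely in user space. */
	if (atomic_compare_exchange_strong(word, &expected, tid))
		return;
	/* Contended: the kernel sets FUTEX_WAITERS and boosts the owner. */
	futex_pi(word, FUTEX_LOCK_PI);
}

static void pi_unlock(atomic_uint *word, unsigned int tid)
{
	unsigned int expected = tid;

	/* Fast path only if we still own the word and there are no waiters. */
	if (atomic_compare_exchange_strong(word, &expected, 0))
		return;
	futex_pi(word, FUTEX_UNLOCK_PI);
}
/* --- end example --- */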

#endif /* * We need to check the following states: * * Waiter | pi_state | pi->owner | uTID | uODIED | ? * * [1] NULL | --- | --- | 0 | 0/1 | Valid * [2] NULL | --- | --- | >0 | 0/1 | Valid * * [3] Found | NULL | -- | Any | 0/1 | Invalid * * [4] Found | Found | NULL | 0 | 1 | Valid * [5] Found | Found | NULL | >0 | 1 | Invalid * * [6] Found | Found | task | 0 | 1 | Valid * * [7] Found | Found | NULL | Any | 0 | Invalid * * [8] Found | Found | task | ==taskTID | 0/1 | Valid * [9] Found | Found | task | 0 | 0 | Invalid * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. * * [3] Invalid. The waiter is queued on a non PI futex * * [4] Valid state after exit_robust_list(), which sets the user space * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. * * [5] The user space value got manipulated between exit_robust_list() * and exit_pi_state_list() * * [6] Valid state after exit_pi_state_list() which sets the new owner in * the pi_state but cannot access the user space value. * * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. * * [8] Owner and user space value match * * [9] There is no transient state which sets the user space TID to 0 * except exit_robust_list(), but this is indicated by the * FUTEX_OWNER_DIED bit. See [4] * * [10] There is no transient state which leaves owner and user space * TID out of sync. * * * Serialization and lifetime rules: * * hb->lock: * * hb -> futex_q, relation * futex_q -> pi_state, relation * * (cannot be raw because hb can contain arbitrary amount * of futex_q's) * * pi_mutex->wait_lock: * * {uval, pi_state} * * (and pi_mutex 'obviously') * * p->pi_lock: * * p->pi_state_list -> pi_state->list, relation * * pi_state->refcount: * * pi_state lifetime * * * Lock order: * * hb->lock * pi_mutex->wait_lock * p->pi_lock * */ /* * Validate that the existing waiter has a pi_state and sanity check * the pi_state against the user space value. If correct, attach to * it. */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; u32 uval2; int ret; /* * Userspace might have messed up non-PI and PI futexes [3] */ if (unlikely(!pi_state)) return -EINVAL; /* * We get here with hb->lock held, and having found a * futex_top_waiter(). This means that futex_lock_pi() of said futex_q * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), * which in turn means that futex_lock_pi() still has a reference on * our pi_state. * * The waiter holding a reference on @pi_state also protects against * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() * and futex_wait_requeue_pi() as it cannot go to 0 and consequently * free pi_state before we can take a reference ourselves. */ WARN_ON(!atomic_read(&pi_state->refcount)); /* * Now that we have a pi_state, we can acquire wait_lock * and do the state validation. */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* * Since {uval, pi_state} is serialized by wait_lock, and our current * uval was read without holding it, it can have changed. Verify it * still is what we expect it to be, otherwise retry the entire * operation. */ if (get_futex_value_locked(&uval2, uaddr)) goto out_efault; if (uval != uval2) goto out_eagain; /* * Handle the owner died case: */ if (uval & FUTEX_OWNER_DIED) { /* * exit_pi_state_list sets owner to NULL and wakes the * topmost waiter. The task which acquires the * pi_state->rt_mutex will fixup owner. */ if (!pi_state->owner) { /* * No pi state owner, but the user space TID * is not 0. Inconsistent state. [5] */ if (pid) goto out_einval; /* * Take a ref on the state and return success. [4] */ goto out_attach; } /* * If TID is 0, then either the dying owner has not * yet executed exit_pi_state_list() or some waiter * acquired the rtmutex in the pi state, but did not * yet fixup the TID in user space. * * Take a ref on the state and return success. [6] */ if (!pid) goto out_attach; } else { /* * If the owner died bit is not set, then the pi_state * must have an owner. [7] */ if (!pi_state->owner) goto out_einval; } /* * Bail out if user space manipulated the futex value. If pi * state exists then the owner TID must be the same as the * user space TID. [9/10] */ if (pid != task_pid_vnr(pi_state->owner)) goto out_einval; out_attach: get_pi_state(pi_state); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); *ps = pi_state; return 0; out_einval: ret = -EINVAL; goto out_error; out_eagain: ret = -EAGAIN; goto out_error; out_efault: ret = -EFAULT; goto out_error; out_error: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; }

Contributors

Person                    Tokens   Prop      Commits   CommitProp
Peter Zijlstra            114      48.10%    4         30.77%
Thomas Gleixner           69       29.11%    5         38.46%
Ingo Molnar               37       15.61%    2         15.38%
Alexey Kuznetsov          10       4.22%     1         7.69%
Pierre Peiffer            7        2.95%     1         7.69%
Total                     237      100.00%   13        100.00%

/* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. */
static int attach_to_pi_owner(u32 uval, union futex_key *key, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; struct futex_pi_state *pi_state; struct task_struct *p; /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 [1] */ if (!pid) return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; if (unlikely(p->flags & PF_KTHREAD)) { put_task_struct(p); return -EPERM; } /* * We need to look at the task state flags to figure out, * whether the task is exiting. To protect against the do_exit * change of the task flags, we do this protected by * p->pi_lock: */ raw_spin_lock_irq(&p->pi_lock); if (unlikely(p->flags & PF_EXITING)) { /* * The task is on the way out. When PF_EXITPIDONE is * set, we know that the task has finished the * cleanup: */ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); return ret; } /* * No existing pi state. First waiter. [2] * * This creates pi_state, we have hb->lock held, this means nothing can * observe this state, wait_lock is irrelevant. */ pi_state = alloc_pi_state(); /* * Initialize the pi_mutex in locked state and make @p * the owner of it: */ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); /* Store the key for possible exit cleanups: */ pi_state->key = *key; WARN_ON(!list_empty(&