Release 4.7 fs/fs-writeback.c
/*
* fs/fs-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
*
* Contains all the functions related to writing back and waiting
* upon dirty inodes against superblocks, and writing back dirty
* pages against inodes. ie: data writeback. Writeout of the
* inode itself is not handled here.
*
* 10Apr2002 Andrew Morton
* Split out of fs/inode.c
* Additions for address_space-based writeback
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"
/*
* 4MB minimal write chunk size
*/
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
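The define above is just unit conversion: 4096 KB of data expressed in pages, where PAGE_SHIFT - 10 is log2 of the page size in KB. A minimal check of the arithmetic, assuming the common 4 KB page size (PAGE_SHIFT == 12); the TEST_* names are invented for illustration and are not part of this file:

#include <assert.h>
#include <stdio.h>

#define TEST_PAGE_SHIFT 12	/* assumed 4 KB pages */
#define TEST_MIN_WRITEBACK_PAGES (4096UL >> (TEST_PAGE_SHIFT - 10))

int main(void)
{
	/* 4096 KB >> 2 == 1024 pages, and 1024 pages * 4 KB == 4 MB */
	assert(TEST_MIN_WRITEBACK_PAGES == 1024);
	printf("minimal writeback chunk: %lu pages\n", TEST_MIN_WRITEBACK_PAGES);
	return 0;
}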
struct wb_completion {
atomic_t cnt;
};
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct wb_completion *done; /* set if the caller waits */
};
/*
* If one wants to wait for one or more wb_writeback_works, each work's
* ->done should be set to a wb_completion defined using the following
* macro. Once all work items are issued with wb_queue_work(), the caller
* can wait for the completion of all using wb_wait_for_completion(). Work
* items which are waited upon aren't freed automatically on completion.
*/
#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
struct wb_completion cmpl = { \
.cnt = ATOMIC_INIT(1), \
}
/*
* If an inode is constantly having its pages dirtied, but then the
* updates stop dirtytime_expire_interval seconds in the past, it's
* possible for the worst case time between when an inode has its
* timestamps updated and when they finally get written out to be two
* dirtytime_expire_intervals. We set the default to 12 hours (in
* seconds), which means most of the time inodes will have their
* timestamps written to disk after 12 hours, but in the worst case a
* few inodes might not have their timestamps updated for 24 hours.
*/
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
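A rough timeline of the worst case described above, using the 12-hour default (illustrative only, not additional behaviour):

/*
 * t = 0h   timestamps updated, just after an expiry sweep has run
 * t = 12h  next sweep: the update may still be too recent to be expired
 * t = 24h  following sweep writes the timestamps out, i.e. up to two
 *          dirtytime_expire_intervals after the update
 */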
static inline struct inode *wb_inode(struct list_head *head)
{
return list_entry(head, struct inode, i_io_list);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| nick piggin | 24 | 96.00% | 1 | 50.00% |
| dave chinner | 1 | 4.00% | 1 | 50.00% |
| Total | 25 | 100.00% | 2 | 100.00% |
/*
* Include the creation of the trace points after defining the
* wb_writeback_work structure and inline functions so that the definition
* remains local to this file.
*/
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
if (wb_has_dirty_io(wb)) {
return false;
} else {
set_bit(WB_has_dirty_io, &wb->state);
WARN_ON_ONCE(!wb->avg_write_bandwidth);
atomic_long_add(wb->avg_write_bandwidth,
&wb->bdi->tot_write_bandwidth);
return true;
}
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 61 | 100.00% | 3 | 100.00% |
| Total | 61 | 100.00% | 3 | 100.00% |
static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
clear_bit(WB_has_dirty_io, &wb->state);
WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
&wb->bdi->tot_write_bandwidth) < 0);
}
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 73 | 100.00% | 3 | 100.00% |
| Total | 73 | 100.00% | 3 | 100.00% |
/**
* inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
* @inode: inode to be moved
* @wb: target bdi_writeback
* @head: one of @wb->b_{dirty|io|more_io}
*
* Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
* Returns %true if @inode is the first occupant of the !dirty_time IO
* lists; otherwise, %false.
*/
static bool inode_io_list_move_locked(struct inode *inode,
struct bdi_writeback *wb,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock);
list_move(&inode->i_io_list, head);
/* dirty_time doesn't count as dirty_io until expiration */
if (head != &wb->b_dirty_time)
return wb_io_lists_populated(wb);
wb_io_lists_depopulated(wb);
return false;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 61 | 96.83% | 1 | 50.00% |
| dave chinner | 2 | 3.17% | 1 | 50.00% |
| Total | 63 | 100.00% | 2 | 100.00% |
/**
* inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
* @inode: inode to be removed
* @wb: bdi_writeback @inode is being removed from
*
* Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
* clear %WB_has_dirty_io if all are empty afterwards.
*/
static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 35 | 94.59% | 1 | 50.00% |
| dave chinner | 2 | 5.41% | 1 | 50.00% |
| Total | 37 | 100.00% | 2 | 100.00% |
static void wb_wakeup(struct bdi_writeback *wb)
{
spin_lock_bh(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
spin_unlock_bh(&wb->work_lock);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| jan kara | 40 | 78.43% | 1 | 33.33% |
| tejun heo | 11 | 21.57% | 2 | 66.67% |
| Total | 51 | 100.00% | 3 | 100.00% |
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
trace_writeback_queue(wb, work);
spin_lock_bh(&wb->work_lock);
if (!test_bit(WB_registered, &wb->state))
goto out_unlock;
if (work->done)
atomic_inc(&work->done->cnt);
list_add_tail(&work->list, &wb->work_list);
mod_delayed_work(bdi_wq, &wb->dwork, 0);
out_unlock:
spin_unlock_bh(&wb->work_lock);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| jan kara | 32 | 32.65% | 1 | 6.67% |
| tejun heo | 29 | 29.59% | 4 | 26.67% |
| andrew morton | 18 | 18.37% | 4 | 26.67% |
| jens axboe | 9 | 9.18% | 1 | 6.67% |
| david chinner | 6 | 6.12% | 1 | 6.67% |
| artem bityutskiy | 2 | 2.04% | 2 | 13.33% |
| pavel emelianov | 1 | 1.02% | 1 | 6.67% |
| christoph hellwig | 1 | 1.02% | 1 | 6.67% |
| Total | 98 | 100.00% | 15 | 100.00% |
/**
* wb_wait_for_completion - wait for completion of bdi_writeback_works
* @bdi: bdi work items were issued to
* @done: target wb_completion
*
* Wait for one or more work items issued to @bdi with their ->done field
* set to @done, which should have been defined with
* DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
* work items are completed. Work items which are waited upon aren't freed
* automatically on completion.
*/
static void wb_wait_for_completion(struct backing_dev_info *bdi,
struct wb_completion *done)
{
atomic_dec(&done->cnt); /* put down the initial count */
wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 41 | 100.00% | 1 | 100.00% |
| Total | 41 | 100.00% | 1 | 100.00% |
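A condensed sketch of the calling pattern that the comment above DEFINE_WB_COMPLETION_ONSTACK() describes, mirroring how bdi_split_work_to_wbs() uses it later in this file; the function name is invented and the snippet is illustrative, not part of fs-writeback.c:

static void example_queue_and_wait(struct backing_dev_info *bdi,
				   struct bdi_writeback *wb,
				   struct wb_writeback_work *base_work)
{
	DEFINE_WB_COMPLETION_ONSTACK(done);	/* cnt starts at 1 */
	struct wb_writeback_work work = *base_work;

	work.auto_free = 0;	/* waited-upon work items aren't freed on completion */
	work.done = &done;	/* wb_queue_work() bumps done.cnt for this item */
	wb_queue_work(wb, &work);

	/* drops the initial count and sleeps until all queued items complete */
	wb_wait_for_completion(bdi, &done);
}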
#ifdef CONFIG_CGROUP_WRITEBACK
/* parameters for foreign inode detection, see wbc_detach_inode() */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect upto 5 slots */
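For reference, evaluating the constants above (times are kept in 1/2^13-second units, so 1 second is 8192 units):

/*
 * WB_FRN_TIME_PERIOD    = 2 << 13    = 16384 units = 2s
 * WB_FRN_HIST_UNIT      = 16384 / 16 =  1024 units = 125ms per history slot
 * WB_FRN_HIST_THR_SLOTS = 16 / 2     =     8 slots, i.e. roughly 1s of foreign
 *                                          IO time per 2s period before a switch
 * WB_FRN_HIST_MAX_SLOTS = 8 / 2 + 1  =     5 slots shifted in per round at most
 */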
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
if (inode_cgwb_enabled(inode)) {
struct cgroup_subsys_state *memcg_css;
if (page) {
memcg_css = mem_cgroup_css_from_page(page);
wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
} else {
/* must pin memcg_css, see wb_get_create() */
memcg_css = task_get_css(current, memory_cgrp_id);
wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
}
}
if (!wb)
wb = &bdi->wb;
/*
* There may be multiple instances of this function racing to
* update the same inode. Use cmpxchg() to tell the winner.
*/
if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
wb_put(wb);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 134 | 100.00% | 1 | 100.00% |
| Total | 134 | 100.00% | 1 | 100.00% |
/**
* locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
* @inode: inode of interest with i_lock held
*
* Returns @inode's wb with its list_lock held. @inode->i_lock must be
* held on entry and is released on return. The returned wb is guaranteed
* to stay @inode's associated wb until its list_lock is released.
*/
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
__acquires(&wb->list_lock)
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 14 | 100.00% | 1 | 100.00% |
| Total | 14 | 100.00% | 1 | 100.00% |
{
while (true) {
struct bdi_writeback *wb = inode_to_wb(inode);
/*
* inode_to_wb() association is protected by both
* @inode->i_lock and @wb->list_lock but list_lock nests
* outside i_lock. Drop i_lock and verify that the
* association hasn't changed after acquiring list_lock.
*/
wb_get(wb);
spin_unlock(&inode->i_lock);
spin_lock(&wb->list_lock);
/* i_wb may have changed in between, can't use inode_to_wb() */
if (likely(wb == inode->i_wb)) {
wb_put(wb); /* @inode already has ref */
return wb;
}
spin_unlock(&wb->list_lock);
wb_put(wb);
cpu_relax();
spin_lock(&inode->i_lock);
}
}
/**
* inode_to_wb_and_lock_list - determine an inode's wb and lock it
* @inode: inode of interest
*
* Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
* on entry.
*/
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
__acquires(&wb->list_lock)
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 14 | 100.00% | 1 | 100.00% |
| Total | 14 | 100.00% | 1 | 100.00% |
{
spin_lock(&inode->i_lock);
return locked_inode_to_wb_and_lock_list(inode);
}
struct inode_switch_wbs_context {
struct inode *inode;
struct bdi_writeback *new_wb;
struct rcu_head rcu_head;
struct work_struct work;
};
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
struct inode_switch_wbs_context *isw =
container_of(work, struct inode_switch_wbs_context, work);
struct inode *inode = isw->inode;
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
struct radix_tree_iter iter;
bool switched = false;
void **slot;
/*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
* synchronizing against mapping->tree_lock.
*
* Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
if (old_wb < new_wb) {
spin_lock(&old_wb->list_lock);
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
} else {
spin_lock(&new_wb->list_lock);
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
spin_lock(&inode->i_lock);
spin_lock_irq(&mapping->tree_lock);
/*
* Once I_FREEING is visible under i_lock, the eviction path owns
* the inode and we shouldn't modify ->i_io_list.
*/
if (unlikely(inode->i_state & I_FREEING))
goto skip_switch;
/*
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
* pages actually under writeback.
*/
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
PAGECACHE_TAG_DIRTY) {
struct page *page = radix_tree_deref_slot_protected(slot,
&mapping->tree_lock);
if (likely(page) && PageDirty(page)) {
__dec_wb_stat(old_wb, WB_RECLAIMABLE);
__inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
PAGECACHE_TAG_WRITEBACK) {
struct page *page = radix_tree_deref_slot_protected(slot,
&mapping->tree_lock);
if (likely(page)) {
WARN_ON_ONCE(!PageWriteback(page));
__dec_wb_stat(old_wb, WB_WRITEBACK);
__inc_wb_stat(new_wb, WB_WRITEBACK);
}
}
wb_get(new_wb);
/*
* Transfer to @new_wb's IO list if necessary. The specific list
* @inode was on is ignored and the inode is put on ->b_dirty which
* is always correct including from ->b_dirty_time. The transfer
* preserves @inode->dirtied_when ordering.
*/
if (!list_empty(&inode->i_io_list)) {
struct inode *pos;
inode_io_list_del_locked(inode, old_wb);
inode->i_wb = new_wb;
list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
if (time_after_eq(inode->dirtied_when,
pos->dirtied_when))
break;
inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
} else {
inode->i_wb = new_wb;
}
/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
inode->i_wb_frn_winner = 0;
inode->i_wb_frn_avg_time = 0;
inode->i_wb_frn_history = 0;
switched = true;
skip_switch:
/*
* Paired with load_acquire in unlocked_inode_to_wb_begin() and
* ensures that the new wb is visible if they see !I_WB_SWITCH.
*/
smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
spin_unlock_irq(&mapping->tree_lock);
spin_unlock(&inode->i_lock);
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
if (switched) {
wb_wakeup(new_wb);
wb_put(old_wb);
}
wb_put(new_wb);
iput(inode);
kfree(isw);
atomic_dec(&isw_nr_in_flight);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 459 | 98.71% | 4 | 80.00% |
| dave chinner | 6 | 1.29% | 1 | 20.00% |
| Total | 465 | 100.00% | 5 | 100.00% |
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
{
struct inode_switch_wbs_context *isw = container_of(rcu_head,
struct inode_switch_wbs_context, rcu_head);
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
queue_work(isw_wq, &isw->work);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 47 | 100.00% | 2 | 100.00% |
| Total | 47 | 100.00% | 2 | 100.00% |
/**
* inode_switch_wbs - change the wb association of an inode
* @inode: target inode
* @new_wb_id: ID of the new wb
*
* Switch @inode's wb association to the wb identified by @new_wb_id. The
* switching is performed asynchronously and may fail silently.
*/
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
/* noop if seems to be already in progress */
if (inode->i_state & I_WB_SWITCH)
return;
isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
if (!isw)
return;
/* find and pin the new wb */
rcu_read_lock();
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
if (memcg_css)
isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
rcu_read_unlock();
if (!isw->new_wb)
goto out_free;
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
inode->i_state & (I_WB_SWITCH | I_FREEING) ||
inode_to_wb(inode) == isw->new_wb) {
spin_unlock(&inode->i_lock);
goto out_free;
}
inode->i_state |= I_WB_SWITCH;
__iget(inode);
spin_unlock(&inode->i_lock);
isw->inode = inode;
atomic_inc(&isw_nr_in_flight);
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
* tree_lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
return;
out_free:
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 219 | 97.77% | 2 | 66.67% |
| tahsin erdogan | 5 | 2.23% | 1 | 33.33% |
| Total | 224 | 100.00% | 3 | 100.00% |
/**
* wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
* @wbc: writeback_control of interest
* @inode: target inode
*
* @inode is locked and about to be written back under the control of @wbc.
* Record @inode's writeback context into @wbc and unlock the i_lock. On
* writeback completion, wbc_detach_inode() should be called. This is used
* to track the cgroup writeback context.
*/
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
struct inode *inode)
{
if (!inode_cgwb_enabled(inode)) {
spin_unlock(&inode->i_lock);
return;
}
wbc->wb = inode_to_wb(inode);
wbc->inode = inode;
wbc->wb_id = wbc->wb->memcg_css->id;
wbc->wb_lcand_id = inode->i_wb_frn_winner;
wbc->wb_tcand_id = 0;
wbc->wb_bytes = 0;
wbc->wb_lcand_bytes = 0;
wbc->wb_tcand_bytes = 0;
wb_get(wbc->wb);
spin_unlock(&inode->i_lock);
/*
* A dying wb indicates that the memcg-blkcg mapping has changed
* and a new wb is already serving the memcg. Switch immediately.
*/
if (unlikely(wb_dying(wbc->wb)))
inode_switch_wbs(inode, wbc->wb_id);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 130 | 100.00% | 4 | 100.00% |
| Total | 130 | 100.00% | 4 | 100.00% |
/**
* wbc_detach_inode - disassociate wbc from inode and perform foreign detection
* @wbc: writeback_control of the just finished writeback
*
* To be called after a writeback attempt of an inode finishes and undoes
* wbc_attach_and_unlock_inode(). Can be called under any context.
*
* As concurrent write sharing of an inode is expected to be very rare and
* memcg only tracks page ownership on a first-use basis, severely confining
* the usefulness of such sharing, cgroup writeback tracks ownership
* per-inode. While the support for concurrent write sharing of an inode
* is deemed unnecessary, an inode being written to by different cgroups at
* different points in time is a lot more common, and, more importantly,
* charging only by first-use can too readily lead to grossly incorrect
* behaviors (single foreign page can lead to gigabytes of writeback to be
* incorrectly attributed).
*
* To resolve this issue, cgroup writeback detects the majority dirtier of
* an inode and transfers the ownership to it. To avoid unnecessary
* oscillation, the detection mechanism keeps track of history and gives
* out the switch verdict only if the foreign usage pattern is stable over
* a certain amount of time and/or writeback attempts.
*
* On each writeback attempt, @wbc tries to detect the majority writer
* using Boyer-Moore majority vote algorithm. In addition to the byte
* count from the majority voting, it also counts the bytes written for the
* current wb and the last round's winner wb (max of last round's current
* wb, the winner from two rounds ago, and the last round's majority
* candidate). Keeping track of the historical winner helps the algorithm
* to semi-reliably detect the most active writer even when it's not the
* absolute majority.
*
* Once the winner of the round is determined, whether the winner is
* foreign or not and how much IO time the round consumed is recorded in
* inode->i_wb_frn_history. If the amount of recorded foreign IO time is
* over a certain threshold, the switch verdict is given.
*/
void wbc_detach_inode(struct writeback_control *wbc)
{
struct bdi_writeback *wb = wbc->wb;
struct inode *inode = wbc->inode;
unsigned long avg_time, max_bytes, max_time;
u16 history;
int max_id;
if (!wb)
return;
history = inode->i_wb_frn_history;
avg_time = inode->i_wb_frn_avg_time;
/* pick the winner of this round */
if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
wbc->wb_bytes >= wbc->wb_tcand_bytes) {
max_id = wbc->wb_id;
max_bytes = wbc->wb_bytes;
} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
max_id = wbc->wb_lcand_id;
max_bytes = wbc->wb_lcand_bytes;
} else {
max_id = wbc->wb_tcand_id;
max_bytes = wbc->wb_tcand_bytes;
}
/*
* Calculate the amount of IO time the winner consumed and fold it
* into the running average kept per inode. If the consumed IO
* time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
* deciding whether to switch or not. This is to prevent one-off
* small dirtiers from skewing the verdict.
*/
max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
wb->avg_write_bandwidth);
if (avg_time)
avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
(avg_time >> WB_FRN_TIME_AVG_SHIFT);
else
avg_time = max_time; /* immediate catch up on first run */
if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
int slots;
/*
* The switch verdict is reached if foreign wb's consume
* more than a certain proportion of IO time in a
* WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot
* history mask where each bit represents one sixteenth of
* the period. Determine the number of slots to shift into
* history from @max_time.
*/
slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
(unsigned long)WB_FRN_HIST_MAX_SLOTS);
history <<= slots;
if (wbc->wb_id != max_id)
history |= (1U << slots) - 1;
/*
* Switch if the current wb isn't the consistent winner.
* If there are multiple closely competing dirtiers, the
* inode may switch across them repeatedly over time, which
* is okay. The main goal is avoiding keeping an inode on
* the wrong wb for an extended period of time.
*/
if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
inode_switch_wbs(inode, max_id);
}
/*
* Multiple instances of this function may race to update the
* following fields but we don't mind occasional inaccuracies.
*/
inode->i_wb_frn_winner = max_id;
inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
inode->i_wb_frn_history = history;
wb_put(wbc->wb);
wbc->wb = NULL;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 287 | 100.00% | 4 | 100.00% |
| Total | 287 | 100.00% | 4 | 100.00% |
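A worked example of the history update in wbc_detach_inode(), with invented numbers, assuming 4 KB pages and taking wb->avg_write_bandwidth to be in pages per second: suppose a round wrote max_bytes = 2MB while the wb averages 4MB/s (1024 pages/s), and the round passes the avg_time / WB_FRN_TIME_CUT_DIV cut:

/*
 * max_time = DIV_ROUND_UP((2MB >> PAGE_SHIFT) << 13, 1024)
 *          = DIV_ROUND_UP(512 << 13, 1024) = 4096 units   (0.5s of IO time)
 * slots    = min(DIV_ROUND_UP(4096, WB_FRN_HIST_UNIT), 5) = min(4, 5) = 4
 *
 * If the round's winner is foreign (max_id != wbc->wb_id), history is shifted
 * left by 4 and the low 4 bits are set; once foreign rounds have set more than
 * WB_FRN_HIST_THR_SLOTS of the 16 bits, the inode is switched to the winner.
 */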
/**
* wbc_account_io - account IO issued during writeback
* @wbc: writeback_control of the writeback in progress
* @page: page being written out
* @bytes: number of bytes being written out
*
* @bytes from @page are about to be written out during the writeback
* controlled by @wbc. Keep the book for foreign inode detection. See
* wbc_detach_inode().
*/
void wbc_account_io(struct writeback_control *wbc, struct page *page,
size_t bytes)
{
int id;
/*
* pageout() path doesn't attach @wbc to the inode being written
* out. This is intentional as we don't want the function to block
* behind a slow cgroup. Ultimately, we want pageout() to kick off
* regular writeback instead of writing things out itself.
*/
if (!wbc->wb)
return;
id = mem_cgroup_css_from_page(page)->id;
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
return;
}
if (id == wbc->wb_lcand_id)
wbc->wb_lcand_bytes += bytes;
/* Boyer-Moore majority vote algorithm */
if (!wbc->wb_tcand_bytes)
wbc->wb_tcand_id = id;
if (id == wbc->wb_tcand_id)
wbc->wb_tcand_bytes += bytes;
else
wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 112 | 100.00% | 1 | 100.00% |
| Total | 112 | 100.00% | 1 | 100.00% |
EXPORT_SYMBOL_GPL(wbc_account_io);
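A hypothetical trace of the candidate bookkeeping in wbc_account_io(), for pages owned by cgroups other than the attached wb's cgroup or the last round's winner (IDs and sizes are made up):

/*
 * account 8K from cgroup 7:  wb_tcand_bytes == 0 -> wb_tcand_id = 7, bytes = 8K
 * account 4K from cgroup 9:  id != wb_tcand_id   -> bytes = 8K - 4K = 4K
 * account 8K from cgroup 9:  id != wb_tcand_id   -> bytes = 4K - min(8K, 4K) = 0
 * account 4K from cgroup 9:  wb_tcand_bytes == 0 -> wb_tcand_id = 9, bytes = 4K
 *
 * The surviving candidate is only a hint; wbc_detach_inode() still compares
 * wb_tcand_bytes against wb_bytes and wb_lcand_bytes to pick the round's winner.
 */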
/**
* inode_congested - test whether an inode is congested
* @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
* bits to test and the return value is the mask of set bits.
*
* If cgroup writeback is enabled for @inode, the congestion state is
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
*
* @inode is allowed to be NULL as this function is often called on
* mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
/*
* Once set, ->i_wb never becomes NULL while the inode is alive.
* Start transaction iff ->i_wb is visible.
*/
if (inode && inode_to_wb_is_valid(inode)) {
struct bdi_writeback *wb;
bool locked, congested;
wb = unlocked_inode_to_wb_begin(inode, &locked);
congested = wb_congested(wb, cong_bits);
unlocked_inode_to_wb_end(inode, locked);
return congested;
}
return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 78 | 100.00% | 3 | 100.00% |
| Total | 78 | 100.00% | 3 | 100.00% |
EXPORT_SYMBOL_GPL(inode_congested);
/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
*
* Split @wb's portion of @nr_pages according to @wb's write bandwidth in
* relation to the total write bandwidth of all wb's w/ dirty inodes on
* @wb->bdi.
*/
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
unsigned long this_bw = wb->avg_write_bandwidth;
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
if (nr_pages == LONG_MAX)
return LONG_MAX;
/*
* This may be called on clean wb's and proportional distribution
* may not make sense, just use the original @nr_pages in those
* cases. In general, we wanna err on the side of writing more.
*/
if (!tot_bw || this_bw >= tot_bw)
return nr_pages;
else
return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 72 | 100.00% | 1 | 100.00% |
| Total | 72 | 100.00% | 1 | 100.00% |
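A worked example of the proportional split above, with invented numbers:

/*
 * this_bw = 25MB/s, tot_bw = 100MB/s, nr_pages = 1024
 *   -> DIV_ROUND_UP(1024 * 25, 100) = 256 pages for this wb.
 * If tot_bw is 0, or this wb accounts for all of it, nr_pages is used as-is.
 */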
/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
* @base_work: wb_writeback_work to issue
* @skip_if_busy: skip wb's which already have writeback in progress
*
* Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
* have dirty inodes. If @base_work->nr_pages isn't %LONG_MAX, it's
* distributed to the busy wbs according to each wb's proportion in the
* total active write bandwidth of @bdi.
*/
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
struct bdi_writeback *last_wb = NULL;
struct bdi_writeback *wb = list_entry(&bdi->wb_list,
struct bdi_writeback, bdi_node);
might_sleep();
restart:
rcu_read_lock();
list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
struct wb_writeback_work fallback_work;
struct wb_writeback_work *work;
long nr_pages;
if (last_wb) {
wb_put(last_wb);
last_wb = NULL;
}
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
list_empty(&wb->b_dirty_time)))
continue;
if (skip_if_busy && writeback_in_progress(wb))
continue;
nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
*work = *base_work;
work->nr_pages = nr_pages;
work->auto_free = 1;
wb_queue_work(wb, work);
continue;
}
/* alloc failed, execute synchronously using on-stack fallback */
work = &fallback_work;
*work = *base_work;
work->nr_pages = nr_pages;
work->auto_free = 0;
work->done = &fallback_work_done;
wb_queue_work(wb, work);
/*
* Pin @wb so that it stays on @bdi->wb_list. This allows
* continuing iteration from @wb after dropping and
* regrabbing rcu read lock.
*/
wb_get(wb);
last_wb = wb;
rcu_read_unlock();
wb_wait_for_completion(bdi, &fallback_work_done);
goto restart;
}
rcu_read_unlock();
if (last_wb)
wb_put(last_wb);
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 260 | 100.00% | 5 | 100.00% |
| Total | 260 | 100.00% | 5 | 100.00% |
/**
* cgroup_writeback_umount - flush inode wb switches for umount
*
* This function is called when a super_block is about to be destroyed and
* flushes in-flight inode wb switches. An inode wb switch goes through
* RCU and then workqueue, so the two need to be flushed in order to ensure
* that all previously scheduled switches are finished. As wb switches are
* rare occurrences and synchronize_rcu() can take a while, perform
* flushing iff wb switches are in flight.
*/
void cgroup_writeback_umount(void)
{
if (atomic_read(&isw_nr_in_flight)) {
synchronize_rcu();
flush_workqueue(isw_wq);
}
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 25 | 100.00% | 1 | 100.00% |
| Total | 25 | 100.00% | 1 | 100.00% |
static int __init cgroup_writeback_init(void)
{
isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
if (!isw_wq)
return -ENOMEM;
return 0;
}
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 32 | 100.00% | 1 | 100.00% |
| Total | 32 | 100.00% | 1 | 100.00% |
fs_initcall(cgroup_writeback_init);
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
__acquires(&wb->list_lock)
Contributors
| Person | Tokens | Prop | Commits | CommitProp |
| tejun heo | 14 | 100.00% | 1 | 100.00% |
| Total | 14 | 100.00% | 1 | 100.00% |
{
struct bdi_writeback *wb = inode_to_wb(inode);
spin_unlock(&inode->