cregit-Linux how code gets into the kernel

Release 4.18 fs/fs-writeback.c

Directory: fs
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002    Andrew Morton
 *              Split out of fs/inode.c
 *              Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */

#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))


struct wb_completion {
	
atomic_t		cnt;
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */

struct wb_writeback_work {
	
long nr_pages;
	
struct super_block *sb;
	
unsigned long *older_than_this;
	
enum writeback_sync_modes sync_mode;
	
unsigned int tagged_writepages:1;
	
unsigned int for_kupdate:1;
	
unsigned int range_cyclic:1;
	
unsigned int for_background:1;
	
unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	
unsigned int auto_free:1;	/* free on completion */
	
enum wb_reason reason;		/* why was writeback initiated? */

	
struct list_head list;		/* pending work list */
	
struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 */

#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
	struct wb_completion cmpl = {                                   \
                .cnt            = ATOMIC_INIT(1),                       \
        }


/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not their timestamps updated for 24 hours.
 */

unsigned int dirtytime_expire_interval = 12 * 60 * 60;


static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_io_list); }

Contributors

PersonTokensPropCommitsCommitProp
Nicholas Piggin2496.00%150.00%
Dave Chinner14.00%150.00%
Total25100.00%2100.00%

/* * Include the creation of the trace points after defining the * wb_writeback_work structure and inline functions so that the definition * remains local to this file. */ #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
static bool wb_io_lists_populated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb)) { return false; } else { set_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(!wb->avg_write_bandwidth); atomic_long_add(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth); return true; } }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo61100.00%3100.00%
Total61100.00%3100.00%


static void wb_io_lists_depopulated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { clear_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth) < 0); } }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo73100.00%3100.00%
Total73100.00%3100.00%

/** * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io|dirty_time} * * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. */
static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) return wb_io_lists_populated(wb); wb_io_lists_depopulated(wb); return false; }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo6196.83%150.00%
Dave Chinner23.17%150.00%
Total63100.00%2100.00%

/** * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list * @inode: inode to be removed * @wb: bdi_writeback @inode is being removed from * * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and * clear %WB_has_dirty_io if all are empty afterwards. */
static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo3594.59%150.00%
Dave Chinner25.41%150.00%
Total37100.00%2100.00%


static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); spin_unlock_bh(&wb->work_lock); }

Contributors

PersonTokensPropCommitsCommitProp
Jan Kara4078.43%133.33%
Tejun Heo1121.57%266.67%
Total51100.00%3100.00%


static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { struct wb_completion *done = work->done; if (work->auto_free) kfree(work); if (done && atomic_dec_and_test(&done->cnt)) wake_up_all(&wb->bdi->wb_waitq); }

Contributors

PersonTokensPropCommitsCommitProp
Tahsin Erdogan3153.45%110.00%
Andrew Morton1017.24%330.00%
Jens Axboe712.07%110.00%
Tejun Heo46.90%220.00%
David Chinner46.90%110.00%
Jan Kara11.72%110.00%
Christoph Hellwig11.72%110.00%
Total58100.00%10100.00%


static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { trace_writeback_queue(wb, work); if (work->done) atomic_inc(&work->done->cnt); spin_lock_bh(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) { list_add_tail(&work->list, &wb->work_list); mod_delayed_work(bdi_wq, &wb->dwork, 0); } else finish_writeback_work(wb, work); spin_unlock_bh(&wb->work_lock); }

Contributors

PersonTokensPropCommitsCommitProp
Tahsin Erdogan5250.98%110.00%
Jan Kara2322.55%110.00%
Tejun Heo1716.67%330.00%
Andrew Morton98.82%440.00%
Artem B. Bityutskiy10.98%110.00%
Total102100.00%10100.00%

/** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @bdi: bdi work items were issued to * @done: target wb_completion * * Wait for one or more work items issued to @bdi with their ->done field * set to @done, which should have been defined with * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such * work items are completed. Work items which are waited upon aren't freed * automatically on completion. */
static void wb_wait_for_completion(struct backing_dev_info *bdi, struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo41100.00%1100.00%
Total41100.00%1100.00%

#ifdef CONFIG_CGROUP_WRITEBACK /* parameters for foreign inode detection, see wb_detach_inode() */ #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ #define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) /* each slot's duration is 2s / 16 */ #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) /* if foreign slots >= 8, switch */ #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq;
void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; if (page) { memcg_css = mem_cgroup_css_from_page(page); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); } } if (!wb) wb = &bdi->wb; /* * There may be multiple instances of this function racing to * update the same inode. Use cmpxchg() to tell the winner. */ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo134100.00%1100.00%
Total134100.00%1100.00%

/** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * * Returns @inode's wb with its list_lock held. @inode->i_lock must be * held on entry and is released on return. The returned wb is guaranteed * to stay @inode's associated wb until its list_lock is released. */
static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo14100.00%1100.00%
Total14100.00%1100.00%

) __acquires(&wb->list_lock) { while (true) { struct bdi_writeback *wb = inode_to_wb(inode); /* * inode_to_wb() association is protected by both * @inode->i_lock and @wb->list_lock but list_lock nests * outside i_lock. Drop i_lock and verify that the * association hasn't changed after acquiring list_lock. */ wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); /* i_wb may have changed inbetween, can't use inode_to_wb() */ if (likely(wb == inode->i_wb)) { wb_put(wb); /* @inode already has ref */ return wb; } spin_unlock(&wb->list_lock); wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } } /** * inode_to_wb_and_lock_list - determine an inode's wb and lock it * @inode: inode of interest * * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held * on entry. */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo14100.00%1100.00%
Total14100.00%1100.00%

) { spin_lock(&inode->i_lock); return locked_inode_to_wb_and_lock_list(inode); } struct inode_switch_wbs_context { struct inode *inode; struct bdi_writeback *new_wb; struct rcu_head rcu_head; struct work_struct work; };
static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; struct radix_tree_iter iter; bool switched = false; void **slot; /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be * synchronizing against the i_pages lock. * * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&new_wb->list_lock); spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING is visible under i_lock, the eviction path owns * the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to * pages actually under writeback. */ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_DIRTY) { struct page *page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); if (likely(page) && PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_WRITEBACK) { struct page *page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); dec_wb_stat(old_wb, WB_WRITEBACK); inc_wb_stat(new_wb, WB_WRITEBACK); } } wb_get(new_wb); /* * Transfer to @new_wb's IO list if necessary. The specific list * @inode was on is ignored and the inode is put on ->b_dirty which * is always correct including from ->b_dirty_time. The transfer * preserves @inode->dirtied_when ordering. */ if (!list_empty(&inode->i_io_list)) { struct inode *pos; inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode->i_wb = new_wb; } /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; switched = true; skip_switch: /* * Paired with load_acquire in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); if (switched) { wb_wakeup(new_wb); wb_put(old_wb); } wb_put(new_wb); iput(inode); kfree(isw); atomic_dec(&isw_nr_in_flight); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo44594.88%457.14%
Matthew Wilcox142.99%114.29%
Dave Chinner61.28%114.29%
Nikolay Borisov40.85%114.29%
Total469100.00%7100.00%


static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) { struct inode_switch_wbs_context *isw = container_of(rcu_head, struct inode_switch_wbs_context, rcu_head); /* needs to grab bh-unsafe locks, bounce to work item */ INIT_WORK(&isw->work, inode_switch_wbs_work_fn); queue_work(isw_wq, &isw->work); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo47100.00%2100.00%
Total47100.00%2100.00%

/** * inode_switch_wbs - change the wb association of an inode * @inode: target inode * @new_wb_id: ID of the new wb * * Switch @inode's wb association to the wb identified by @new_wb_id. The * switching is performed asynchronously and may fail silently. */
static void inode_switch_wbs(struct inode *inode, int new_wb_id) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) return; isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) return; /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); if (memcg_css) isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); rcu_read_unlock(); if (!isw->new_wb) goto out_free; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); goto out_free; } inode->i_state |= I_WB_SWITCH; __iget(inode); spin_unlock(&inode->i_lock); isw->inode = inode; atomic_inc(&isw_nr_in_flight); /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); return; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo21796.88%240.00%
Tahsin Erdogan52.23%120.00%
Matthew Wilcox10.45%120.00%
Linus Torvalds10.45%120.00%
Total224100.00%5100.00%

/** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest * @inode: target inode * * @inode is locked and about to be written back under the control of @wbc. * Record @inode's writeback context into @wbc and unlock the i_lock. On * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); return; } wbc->wb = inode_to_wb(inode); wbc->inode = inode; wbc->wb_id = wbc->wb->memcg_css->id; wbc->wb_lcand_id = inode->i_wb_frn_winner; wbc->wb_tcand_id = 0; wbc->wb_bytes = 0; wbc->wb_lcand_bytes = 0; wbc->wb_tcand_bytes = 0; wb_get(wbc->wb); spin_unlock(&inode->i_lock); /* * A dying wb indicates that the memcg-blkcg mapping has changed * and a new wb is already serving the memcg. Switch immediately. */ if (unlikely(wb_dying(wbc->wb))) inode_switch_wbs(inode, wbc->wb_id); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo130100.00%4100.00%
Total130100.00%4100.00%

/** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection * @wbc: writeback_control of the just finished writeback * * To be called after a writeback attempt of an inode finishes and undoes * wbc_attach_and_unlock_inode(). Can be called under any context. * * As concurrent write sharing of an inode is expected to be very rare and * memcg only tracks page ownership on first-use basis severely confining * the usefulness of such sharing, cgroup writeback tracks ownership * per-inode. While the support for concurrent write sharing of an inode * is deemed unnecessary, an inode being written to by different cgroups at * different points in time is a lot more common, and, more importantly, * charging only by first-use can too readily lead to grossly incorrect * behaviors (single foreign page can lead to gigabytes of writeback to be * incorrectly attributed). * * To resolve this issue, cgroup writeback detects the majority dirtier of * an inode and transfers the ownership to it. To avoid unnnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. * * On each writeback attempt, @wbc tries to detect the majority writer * using Boyer-Moore majority vote algorithm. In addition to the byte * count from the majority voting, it also counts the bytes written for the * current wb and the last round's winner wb (max of last round's current * wb, the winner from two rounds ago, and the last round's majority * candidate). Keeping track of the historical winner helps the algorithm * to semi-reliably detect the most active writer even when it's not the * absolute majority. * * Once the winner of the round is determined, whether the winner is * foreign or not and how much IO time the round consumed is recorded in * inode->i_wb_frn_history. If the amount of recorded foreign IO time is * over a certain threshold, the switch verdict is given. */
void wbc_detach_inode(struct writeback_control *wbc) { struct bdi_writeback *wb = wbc->wb; struct inode *inode = wbc->inode; unsigned long avg_time, max_bytes, max_time; u16 history; int max_id; if (!wb) return; history = inode->i_wb_frn_history; avg_time = inode->i_wb_frn_avg_time; /* pick the winner of this round */ if (wbc->wb_bytes >= wbc->wb_lcand_bytes && wbc->wb_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_id; max_bytes = wbc->wb_bytes; } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_lcand_id; max_bytes = wbc->wb_lcand_bytes; } else { max_id = wbc->wb_tcand_id; max_bytes = wbc->wb_tcand_bytes; } /* * Calculate the amount of IO time the winner consumed and fold it * into the running average kept per inode. If the consumed IO * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for * deciding whether to switch or not. This is to prevent one-off * small dirtiers from skewing the verdict. */ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, wb->avg_write_bandwidth); if (avg_time) avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - (avg_time >> WB_FRN_TIME_AVG_SHIFT); else avg_time = max_time; /* immediate catch up on first run */ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { int slots; /* * The switch verdict is reached if foreign wb's consume * more than a certain proportion of IO time in a * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot * history mask where each bit represents one sixteenth of * the period. Determine the number of slots to shift into * history from @max_time. */ slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), (unsigned long)WB_FRN_HIST_MAX_SLOTS); history <<= slots; if (wbc->wb_id != max_id) history |= (1U << slots) - 1; /* * Switch if the current wb isn't the consistent winner. * If there are multiple closely competing dirtiers, the * inode may switch across them repeatedly over time, which * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. */ if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) inode_switch_wbs(inode, max_id); } /* * Multiple instances of this function may race to update the * following fields but we don't mind occassional inaccuracies. */ inode->i_wb_frn_winner = max_id; inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); inode->i_wb_frn_history = history; wb_put(wbc->wb); wbc->wb = NULL; }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo287100.00%4100.00%
Total287100.00%4100.00%

/** * wbc_account_io - account IO issued during writeback * @wbc: writeback_control of the writeback in progress * @page: page being written out * @bytes: number of bytes being written out * * @bytes from @page are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */
void wbc_account_io(struct writeback_control *wbc, struct page *page, size_t bytes) { int id; /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (!wbc->wb) return; id = mem_cgroup_css_from_page(page)->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; return; } if (id == wbc->wb_lcand_id) wbc->wb_lcand_bytes += bytes; /* Boyer-Moore majority vote algorithm */ if (!wbc->wb_tcand_bytes) wbc->wb_tcand_id = id; if (id == wbc->wb_tcand_id) wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo112100.00%1100.00%
Total112100.00%1100.00%

EXPORT_SYMBOL_GPL(wbc_account_io); /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion (may be NULL) * @cong_bits: mask of WB_[a]sync_congested bits to test * * Tests whether @inode is congested. @cong_bits is the mask of congestion * bits to test and the return value is the mask of set bits. * * If cgroup writeback is enabled for @inode, the congestion state is * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg * associated with @inode is congested; otherwise, the root wb's congestion * state is used. * * @inode is allowed to be NULL as this function is often called on * mapping->host which is NULL for the swapper space. */
int inode_congested(struct inode *inode, int cong_bits) { /* * Once set, ->i_wb never becomes NULL while the inode is alive. * Start transaction iff ->i_wb is visible. */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; struct wb_lock_cookie lock_cookie = {}; bool congested; wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo7489.16%375.00%
Greg Thelen910.84%125.00%
Total83100.00%4100.00%

EXPORT_SYMBOL_GPL(inode_congested); /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi * * Split @wb's portion of @nr_pages according to @wb's write bandwidth in * relation to the total write bandwidth of all wb's w/ dirty inodes on * @wb->bdi. */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); if (nr_pages == LONG_MAX) return LONG_MAX; /* * This may be called on clean wb's and proportional distribution * may not make sense, just use the original @nr_pages in those * cases. In general, we wanna err on the side of writing more. */ if (!tot_bw || this_bw >= tot_bw) return nr_pages; else return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); }

Contributors

PersonTokensPropCommitsCommitProp
Tejun Heo72100.00%1100.00%
Total72100.00%1100.00%

/** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue * @skip_if_busy: skip wb's which already have writeback in progress * * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's * distributed to the busy wbs according to each wb's proportion in the * total active write bandwidth of @bdi. */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { struct bdi_writeback *last_wb = NULL; struct bdi_writeback *wb = list_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); might_sleep(); restart: rcu_read_lock(); list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; if (last_wb) { wb_put(last_wb); last_wb = NULL; } /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || list_empty(&wb->b_dirty_time))) continue; if (skip_if_busy && writeback_in_progress(wb)) continue; nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 1; wb_queue_work(wb, work); continue; } /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work =