Release 4.12 block/bfq-iosched.c
/*
* Budget Fair Queueing (BFQ) I/O scheduler.
*
* Based on ideas and code from CFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
* Arianna Avanzini <avanzini@google.com>
*
* Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BFQ is a proportional-share I/O scheduler, with some extra
* low-latency capabilities. BFQ also supports full hierarchical
* scheduling through cgroups. Next paragraphs provide an introduction
* on BFQ inner workings. Details on BFQ benefits, usage and
* limitations can be found in Documentation/block/bfq-iosched.txt.
*
* BFQ is a proportional-share storage-I/O scheduling algorithm based
* on the slice-by-slice service scheme of CFQ. But BFQ assigns
* budgets, measured in number of sectors, to processes instead of
* time slices. The device is not granted to the in-service process
* for a given time slice, but until it has exhausted its assigned
* budget. This change from the time to the service domain enables BFQ
* to distribute the device throughput among processes as desired,
* without any distortion due to throughput fluctuations, or to device
* internal queueing. BFQ uses an ad hoc internal scheduler, called
* B-WF2Q+, to schedule processes according to their budgets. More
* precisely, BFQ schedules queues associated with processes. Each
* process/queue is assigned a user-configurable weight, and B-WF2Q+
* guarantees that each queue receives a fraction of the throughput
* proportional to its weight. Thanks to the accurate policy of
* B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
* processes issuing sequential requests (to boost the throughput),
* and yet guarantee a low latency to interactive and soft real-time
* applications.
*
* In particular, to provide these low-latency guarantees, BFQ
* explicitly privileges the I/O of two classes of time-sensitive
* applications: interactive and soft real-time. This feature enables
* BFQ to provide applications in these classes with a very low
* latency. Finally, BFQ also features additional heuristics for
* preserving both a low latency and a high throughput on NCQ-capable,
* rotational or flash-based devices, and to get the job done quickly
* for applications consisting in many I/O-bound processes.
*
* NOTE: if the main or only goal, with a given device, is to achieve
* the maximum-possible throughput at all times, then do switch off
* all low-latency heuristics for that device, by setting low_latency
* to 0.
*
* BFQ is described in [1], where also a reference to the initial, more
* theoretical paper on BFQ can be found. The interested reader can find
* in the latter paper full details on the main algorithm, as well as
* formulas of the guarantees and formal proofs of all the properties.
* With respect to the version of BFQ presented in these papers, this
* implementation adds a few more heuristics, such as the one that
* guarantees a low latency to soft real-time applications, and a
* hierarchical extension based on H-WF2Q+.
*
* B-WF2Q+ is based on WF2Q+, which is described in [2], together with
* H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
* with O(log N) complexity derives from the one introduced with EEVDF
* in [3].
*
* [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
* Scheduler", Proceedings of the First Workshop on Mobile System
* Technologies (MST-2015), May 2015.
* http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
*
* [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
* Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
* Oct 1997.
*
* http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
*
* [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
* First: A Flexible and Accurate Mechanism for Proportional Share
* Resource Allocation", technical report.
*
* http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
#define BFQ_BFQQ_FNS(name) \
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
{ \
__set_bit(BFQQF_##name, &(bfqq)->flags); \
} \
void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
{ \
__clear_bit(BFQQF_##name, &(bfqq)->flags); \
} \
int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
{ \
return test_bit(BFQQF_##name, &(bfqq)->flags); \
}
BFQ_BFQQ_FNS(just_created);
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
\
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;
/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;
/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;
/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;
/*
* Async to sync throughput distribution is controlled as follows:
* when an async request is served, the entity is charged the number
* of sectors of the request, multiplied by the factor below
*/
static const int bfq_async_charge_factor = 10;
/* Default timeout values, in jiffies, approximating CFQ defaults. */
const int bfq_timeout = HZ / 8;
static struct kmem_cache *bfq_pool;
/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD 4
#define BFQ_HW_QUEUE_SAMPLES 32
#define BFQQ_SEEK_THR (sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES 32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT 16
/*
* By default, BFQ computes the duration of the weight raising for
* interactive applications automatically, using the following formula:
* duration = (R / r) * T, where r is the peak rate of the device, and
* R and T are two reference parameters.
* In particular, R is the peak rate of the reference device (see below),
* and T is a reference time: given the systems that are likely to be
* installed on the reference device according to its speed class, T is
* about the maximum time needed, under BFQ and while reading two files in
* parallel, to load typical large applications on these systems.
* In practice, the slower/faster the device at hand is, the more/less it
* takes to load applications with respect to the reference device.
* Accordingly, the longer/shorter BFQ grants weight raising to interactive
* applications.
*
* BFQ uses four different reference pairs (R, T), depending on:
* . whether the device is rotational or non-rotational;
* . whether the device is slow, such as old or portable HDDs, as well as
* SD cards, or fast, such as newer HDDs and SSDs.
*
* The device's speed class is dynamically (re)detected in
* bfq_update_peak_rate() every time the estimated peak rate is updated.
*
* In the following definitions, R_slow[0]/R_fast[0] and
* T_slow[0]/T_fast[0] are the reference values for a slow/fast
* rotational device, whereas R_slow[1]/R_fast[1] and
* T_slow[1]/T_fast[1] are the reference values for a slow/fast
* non-rotational device. Finally, device_speed_thresh are the
* thresholds used to switch between speed classes. The reference
* rates are not the actual peak rates of the devices used as a
* reference, but slightly lower values. The reason for using these
* slightly lower values is that the peak-rate estimator tends to
* yield slightly lower values than the actual peak rate (it can yield
* the actual peak rate only if there is only one process doing I/O,
* and the process does sequential I/O).
*
* Both the reference peak rates and the thresholds are measured in
* sectors/usec, left-shifted by BFQ_RATE_SHIFT.
*/
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};
/*
* To improve readability, a conversion function is used to initialize the
* following arrays, which entails that they can be initialized only in a
* function.
*/
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
return bic->bfqq[is_sync];
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 21 | 91.30% | 2 | 66.67% |
Arianna Avanzini | 2 | 8.70% | 1 | 33.33% |
Total | 23 | 100.00% | 3 | 100.00% |
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
{
bic->bfqq[is_sync] = bfqq;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 19 | 70.37% | 2 | 66.67% |
Arianna Avanzini | 8 | 29.63% | 1 | 33.33% |
Total | 27 | 100.00% | 3 | 100.00% |
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
return bic->icq.q->elevator->elevator_data;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 12 | 52.17% | 1 | 50.00% |
Arianna Avanzini | 11 | 47.83% | 1 | 50.00% |
Total | 23 | 100.00% | 2 | 100.00% |
/**
* icq_to_bic - convert iocontext queue structure to bfq_io_cq.
* @icq: the iocontext queue.
*/
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
/* bic->icq is the first member, %NULL will convert to %NULL */
return container_of(icq, struct bfq_io_cq, icq);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 14 | 56.00% | 1 | 50.00% |
Arianna Avanzini | 11 | 44.00% | 1 | 50.00% |
Total | 25 | 100.00% | 2 | 100.00% |
/**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
* @bfqd: the lookup key.
* @ioc: the io_context of the process doing I/O.
* @q: the request queue.
*/
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
struct io_context *ioc,
struct request_queue *q)
{
if (ioc) {
unsigned long flags;
struct bfq_io_cq *icq;
spin_lock_irqsave(q->queue_lock, flags);
icq = icq_to_bic(ioc_lookup_icq(ioc, q));
spin_unlock_irqrestore(q->queue_lock, flags);
return icq;
}
return NULL;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 49 | 66.22% | 2 | 66.67% |
Arianna Avanzini | 25 | 33.78% | 1 | 33.33% |
Total | 74 | 100.00% | 3 | 100.00% |
/*
* Scheduler run of queue, if there are requests pending and no one in the
* driver that will restart queueing.
*/
void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
if (bfqd->queued != 0) {
bfq_log(bfqd, "schedule dispatch");
blk_mq_run_hw_queues(bfqd->queue, true);
}
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 36 | 100.00% | 1 | 100.00% |
Total | 36 | 100.00% | 1 | 100.00% |
#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define bfq_sample_valid(samples) ((samples) > 80)
/*
* Lifted from AS - choose which of rq1 and rq2 that is best served now.
* We choose the request that is closesr to the head right now. Distance
* behind the head is penalized and only allowed to a certain extent.
*/
static struct request *bfq_choose_req(struct bfq_data *bfqd,
struct request *rq1,
struct request *rq2,
sector_t last)
{
sector_t s1, s2, d1 = 0, d2 = 0;
unsigned long back_max;
#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
if (!rq1 || rq1 == rq2)
return rq2;
if (!rq2)
return rq1;
if (rq_is_sync(rq1) && !rq_is_sync(rq2))
return rq1;
else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
return rq2;
if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
return rq1;
else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
return rq2;
s1 = blk_rq_pos(rq1);
s2 = blk_rq_pos(rq2);
/*
* By definition, 1KiB is 2 sectors.
*/
back_max = bfqd->bfq_back_max * 2;
/*
* Strict one way elevator _except_ in the case where we allow
* short backward seeks which are biased as twice the cost of a
* similar forward seek.
*/
if (s1 >= last)
d1 = s1 - last;
else if (s1 + back_max >= last)
d1 = (last - s1) * bfqd->bfq_back_penalty;
else
wrap |= BFQ_RQ1_WRAP;
if (s2 >= last)
d2 = s2 - last;
else if (s2 + back_max >= last)
d2 = (last - s2) * bfqd->bfq_back_penalty;
else
wrap |= BFQ_RQ2_WRAP;
/* Found required data */
/*
* By doing switch() on the bit mask "wrap" we avoid having to
* check two variables for all permutations: --> faster!
*/
switch (wrap) {
case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
if (d1 < d2)
return rq1;
else if (d2 < d1)
return rq2;
if (s1 >= s2)
return rq1;
else
return rq2;
case BFQ_RQ2_WRAP:
return rq1;
case BFQ_RQ1_WRAP:
return rq2;
case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
default:
/*
* Since both rqs are wrapped,
* start with the one that's further behind head
* (--> only *one* back seek required),
* since back seek takes more time than forward.
*/
if (s1 <= s2)
return rq1;
else
return rq2;
}
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 335 | 100.00% | 1 | 100.00% |
Total | 335 | 100.00% | 1 | 100.00% |
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
sector_t sector, struct rb_node **ret_parent,
struct rb_node ***rb_link)
{
struct rb_node **p, *parent;
struct bfq_queue *bfqq = NULL;
parent = NULL;
p = &root->rb_node;
while (*p) {
struct rb_node **n;
parent = *p;
bfqq = rb_entry(parent, struct bfq_queue, pos_node);
/*
* Sort strictly based on sector. Smallest to the left,
* largest to the right.
*/
if (sector > blk_rq_pos(bfqq->next_rq))
n = &(*p)->rb_right;
else if (sector < blk_rq_pos(bfqq->next_rq))
n = &(*p)->rb_left;
else
break;
p = n;
bfqq = NULL;
}
*ret_parent = parent;
if (rb_link)
*rb_link = p;
bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
(unsigned long long)sector,
bfqq ? bfqq->pid : 0);
return bfqq;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 156 | 84.78% | 2 | 66.67% |
Paolo Valente | 28 | 15.22% | 1 | 33.33% |
Total | 184 | 100.00% | 3 | 100.00% |
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
struct rb_node **p, *parent;
struct bfq_queue *__bfqq;
if (bfqq->pos_root) {
rb_erase(&bfqq->pos_node, bfqq->pos_root);
bfqq->pos_root = NULL;
}
if (bfq_class_idle(bfqq))
return;
if (!bfqq->next_rq)
return;
bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
blk_rq_pos(bfqq->next_rq), &parent, &p);
if (!__bfqq) {
rb_link_node(&bfqq->pos_node, parent, p);
rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
} else
bfqq->pos_root = NULL;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 145 | 100.00% | 1 | 100.00% |
Total | 145 | 100.00% | 1 | 100.00% |
/*
* Tell whether there are active queues or groups with differentiated weights.
*/
static bool bfq_differentiated_weights(struct bfq_data *bfqd)
{
/*
* For weights to differ, at least one of the trees must contain
* at least two nodes.
*/
return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
(bfqd->queue_weights_tree.rb_node->rb_left ||
bfqd->queue_weights_tree.rb_node->rb_right)
#ifdef CONFIG_BFQ_GROUP_IOSCHED
) ||
(!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
(bfqd->group_weights_tree.rb_node->rb_left ||
bfqd->group_weights_tree.rb_node->rb_right)
#endif
);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 76 | 100.00% | 1 | 100.00% |
Total | 76 | 100.00% | 1 | 100.00% |
/*
* The following function returns true if every queue must receive the
* same share of the throughput (this condition is used when deciding
* whether idling may be disabled, see the comments in the function
* bfq_bfqq_may_idle()).
*
* Such a scenario occurs when:
* 1) all active queues have the same weight,
* 2) all active groups at the same level in the groups tree have the same
* weight,
* 3) all active groups at the same level in the groups tree have the same
* number of children.
*
* Unfortunately, keeping the necessary state for evaluating exactly the
* above symmetry conditions would be quite complex and time-consuming.
* Therefore this function evaluates, instead, the following stronger
* sub-conditions, for which it is much easier to maintain the needed
* state:
* 1) all active queues have the same weight,
* 2) all active groups have the same weight,
* 3) all active groups have at most one active child each.
* In particular, the last two conditions are always true if hierarchical
* support and the cgroups interface are not enabled, thus no state needs
* to be maintained in this case.
*/
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
return !bfq_differentiated_weights(bfqd);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 18 | 100.00% | 1 | 100.00% |
Total | 18 | 100.00% | 1 | 100.00% |
/*
* If the weight-counter tree passed as input contains no counter for
* the weight of the input entity, then add that counter; otherwise just
* increment the existing counter.
*
* Note that weight-counter trees contain few nodes in mostly symmetric
* scenarios. For example, if all queues have the same weight, then the
* weight-counter tree for the queues may contain at most one node.
* This holds even if low_latency is on, because weight-raised queues
* are not inserted in the tree.
* In most scenarios, the rate at which nodes are created/destroyed
* should be low too.
*/
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
struct rb_root *root)
{
struct rb_node **new = &(root->rb_node), *parent = NULL;
/*
* Do not insert if the entity is already associated with a
* counter, which happens if:
* 1) the entity is associated with a queue,
* 2) a request arrival has caused the queue to become both
* non-weight-raised, and hence change its weight, and
* backlogged; in this respect, each of the two events
* causes an invocation of this function,
* 3) this is the invocation of this function caused by the
* second event. This second invocation is actually useless,
* and we handle this fact by exiting immediately. More
* efficient or clearer solutions might possibly be adopted.
*/
if (entity->weight_counter)
return;
while (*new) {
struct bfq_weight_counter *__counter = container_of(*new,
struct bfq_weight_counter,
weights_node);
parent = *new;
if (entity->weight == __counter->weight) {
entity->weight_counter = __counter;
goto inc_counter;
}
if (entity->weight < __counter->weight)
new = &((*new)->rb_left);
else
new = &((*new)->rb_right);
}
entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
GFP_ATOMIC);
/*
* In the unlucky event of an allocation failure, we just
* exit. This will cause the weight of entity to not be
* considered in bfq_differentiated_weights, which, in its
* turn, causes the scenario to be deemed wrongly symmetric in
* case entity's weight would have been the only weight making
* the scenario asymmetric. On the bright side, no unbalance
* will however occur when entity becomes inactive again (the
* invocation of this function is triggered by an activation
* of entity). In fact, bfq_weights_tree_remove does nothing
* if !entity->weight_counter.
*/
if (unlikely(!entity->weight_counter))
return;
entity->weight_counter->weight = entity->weight;
rb_link_node(&entity->weight_counter->weights_node, parent, new);
rb_insert_color(&entity->weight_counter->weights_node, root);
inc_counter:
entity->weight_counter->num_active++;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 195 | 96.53% | 2 | 66.67% |
Paolo Valente | 7 | 3.47% | 1 | 33.33% |
Total | 202 | 100.00% | 3 | 100.00% |
/*
* Decrement the weight counter associated with the entity, and, if the
* counter reaches 0, remove the counter from the tree.
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
struct rb_root *root)
{
if (!entity->weight_counter)
return;
entity->weight_counter->num_active--;
if (entity->weight_counter->num_active > 0)
goto reset_entity_pointer;
rb_erase(&entity->weight_counter->weights_node, root);
kfree(entity->weight_counter);
reset_entity_pointer:
entity->weight_counter = NULL;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 55 | 73.33% | 2 | 66.67% |
Paolo Valente | 20 | 26.67% | 1 | 33.33% |
Total | 75 | 100.00% | 3 | 100.00% |
/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
struct request *last)
{
struct request *rq;
if (bfq_bfqq_fifo_expire(bfqq))
return NULL;
bfq_mark_bfqq_fifo_expire(bfqq);
rq = rq_entry_fifo(bfqq->fifo.next);
if (rq == last || ktime_get_ns() < rq->fifo_time)
return NULL;
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
return rq;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 49 | 60.49% | 1 | 25.00% |
Paolo Valente | 32 | 39.51% | 3 | 75.00% |
Total | 81 | 100.00% | 4 | 100.00% |
static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
struct request *last)
{
struct rb_node *rbnext = rb_next(&last->rb_node);
struct rb_node *rbprev = rb_prev(&last->rb_node);
struct request *next, *prev = NULL;
/* Follow expired path, else get first next available. */
next = bfq_check_fifo(bfqq, last);
if (next)
return next;
if (rbprev)
prev = rb_entry_rq(rbprev);
if (rbnext)
next = rb_entry_rq(rbnext);
else {
rbnext = rb_first(&bfqq->sort_list);
if (rbnext && rbnext != &last->rb_node)
next = rb_entry_rq(rbnext);
}
return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 119 | 82.64% | 1 | 50.00% |
Paolo Valente | 25 | 17.36% | 1 | 50.00% |
Total | 144 | 100.00% | 2 | 100.00% |
/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq,
struct bfq_queue *bfqq)
{
if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
return blk_rq_sectors(rq);
/*
* If there are no weight-raised queues, then amplify service
* by just the async charge factor; otherwise amplify service
* by twice the async charge factor, to further reduce latency
* for weight-raised queues.
*/
if (bfqq->bfqd->wr_busy_queues == 0)
return blk_rq_sectors(rq) * bfq_async_charge_factor;
return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 65 | 100.00% | 1 | 100.00% |
Total | 65 | 100.00% | 1 | 100.00% |
/**
* bfq_updated_next_req - update the queue after a new next_rq selection.
* @bfqd: the device data the queue belongs to.
* @bfqq: the queue to update.
*
* If the first request of a queue changes we make sure that the queue
* has enough budget to serve at least its first request (if the
* request has grown). We do this because if the queue has not enough
* budget for its first request, it has to go through two dispatch
* rounds to actually get it dispatched.
*/
static void bfq_updated_next_req(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
struct request *next_rq = bfqq->next_rq;
unsigned long new_budget;
if (!next_rq)
return;
if (bfqq == bfqd->in_service_queue)
/*
* In order not to break guarantees, budgets cannot be
* changed after an entity has been selected.
*/
return;
new_budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(next_rq, bfqq));
if (entity->budget != new_budget) {
entity->budget = new_budget;
bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
new_budget);
bfq_requeue_bfqq(bfqd, bfqq);
}
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Paolo Valente | 60 | 60.00% | 1 | 33.33% |
Arianna Avanzini | 40 | 40.00% | 2 | 66.67% |
Total | 100 | 100.00% | 3 | 100.00% |
static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
if (bic->saved_idle_window)
bfq_mark_bfqq_idle_window(bfqq);
else
bfq_clear_bfqq_idle_window(bfqq);
if (bic->saved_IO_bound)
bfq_mark_bfqq_IO_bound(bfqq);
else
bfq_clear_bfqq_IO_bound(bfqq);
bfqq->ttime = bic->saved_ttime;
bfqq->wr_coeff = bic->saved_wr_coeff;
bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
time_is_before_jiffies(bfqq->last_wr_start_finish +
bfqq->wr_cur_max_time))) {
bfq_log_bfqq(bfqq->bfqd, bfqq,
"resume state: switching off wr");
bfqq->wr_coeff = 1;
}
/* make sure weight will be updated, however we got here */
bfqq->entity.prio_changed = 1;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 130 | 90.28% | 2 | 66.67% |
Paolo Valente | 14 | 9.72% | 1 | 33.33% |
Total | 144 | 100.00% | 3 | 100.00% |
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 26 | 100.00% | 1 | 100.00% |
Total | 26 | 100.00% | 1 | 100.00% |
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
struct bfq_queue *item;
struct hlist_node *n;
hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
hlist_del_init(&item->burst_list_node);
hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
bfqd->burst_size = 1;
bfqd->burst_parent_entity = bfqq->entity.parent;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 73 | 100.00% | 2 | 100.00% |
Total | 73 | 100.00% | 2 | 100.00% |
/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
/* Increment burst size to take into account also bfqq */
bfqd->burst_size++;
if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
struct bfq_queue *pos, *bfqq_item;
struct hlist_node *n;
/*
* Enough queues have been activated shortly after each
* other to consider this burst as large.
*/
bfqd->large_burst = true;
/*
* We can now mark all queues in the burst list as
* belonging to a large burst.
*/
hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
burst_list_node)
bfq_mark_bfqq_in_large_burst(bfqq_item);
bfq_mark_bfqq_in_large_burst(bfqq);
/*
* From now on, and until the current burst finishes, any
* new queue being activated shortly after the last queue
* was inserted in the burst can be immediately marked as
* belonging to a large burst. So the burst list is not
* needed any more. Remove it.
*/
hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
burst_list_node)
hlist_del_init(&pos->burst_list_node);
} else /*
* Burst not yet large: add bfqq to the burst list. Do
* not increment the ref counter for bfqq, because bfqq
* is removed from the burst list before freeing bfqq
* in put_queue.
*/
hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 97 | 90.65% | 2 | 66.67% |
Paolo Valente | 10 | 9.35% | 1 | 33.33% |
Total | 107 | 100.00% | 3 | 100.00% |
/*
* If many queues belonging to the same group happen to be created
* shortly after each other, then the processes associated with these
* queues have typically a common goal. In particular, bursts of queue
* creations are usually caused by services or applications that spawn
* many parallel threads/processes. Examples are systemd during boot,
* or git grep. To help these processes get their job done as soon as
* possible, it is usually better to not grant either weight-raising
* or device idling to their queues.
*
* In this comment we describe, firstly, the reasons why this fact
* holds, and, secondly, the next function, which implements the main
* steps needed to properly mark these queues so that they can then be
* treated in a different way.
*
* The above services or applications benefit mostly from a high
* throughput: the quicker the requests of the activated queues are
* cumulatively served, the sooner the target job of these queues gets
* completed. As a consequence, weight-raising any of these queues,
* which also implies idling the device for it, is almost always
* counterproductive. In most cases it just lowers throughput.
*
* On the other hand, a burst of queue creations may be caused also by
* the start of an application that does not consist of a lot of
* parallel I/O-bound threads. In fact, with a complex application,
* several short processes may need to be executed to start-up the
* application. In this respect, to start an application as quickly as
* possible, the best thing to do is in any case to privilege the I/O
* related to the application with respect to all other
* I/O. Therefore, the best strategy to start as quickly as possible
* an application that causes a burst of queue creations is to
* weight-raise all the queues created during the burst. This is the
* exact opposite of the best strategy for the other type of bursts.
*
* In the end, to take the best action for each of the two cases, the
* two types of bursts need to be distinguished. Fortunately, this
* seems relatively easy, by looking at the sizes of the bursts. In
* particular, we found a threshold such that only bursts with a
* larger size than that threshold are apparently caused by
* services or commands such as systemd or git grep. For brevity,
* hereafter we call just 'large' these bursts. BFQ *does not*
* weight-raise queues whose creation occurs in a large burst. In
* addition, for each of these queues BFQ performs or does not perform
* idling depending on which choice boosts the throughput more. The
* exact choice depends on the device and request pattern at
* hand.
*
* Unfortunately, false positives may occur while an interactive task
* is starting (e.g., an application is being started). The
* consequence is that the queues associated with the task do not
* enjoy weight raising as expected. Fortunately these false positives
* are very rare. They typically occur if some service happens to
* start doing I/O exactly when the interactive task starts.
*
* Turning back to the next function, it implements all the steps
* needed to detect the occurrence of a large burst and to properly
* mark all the queues belonging to it (so that they can then be
* treated in a different way). This goal is achieved by maintaining a
* "burst list" that holds, temporarily, the queues that belong to the
* burst in progress. The list is then used to mark these queues as
* belonging to a large burst if the burst does become large. The main
* steps are the following.
*
* . when the very first queue is created, the queue is inserted into the
* list (as it could be the first queue in a possible burst)
*
* . if the current burst has not yet become large, and a queue Q that does
* not yet belong to the burst is activated shortly after the last time
* at which a new queue entered the burst list, then the function appends
* Q to the burst list
*
* . if, as a consequence of the previous step, the burst size reaches
* the large-burst threshold, then
*
* . all the queues in the burst list are marked as belonging to a
* large burst
*
* . the burst list is deleted; in fact, the burst list already served
* its purpose (keeping temporarily track of the queues in a burst,
* so as to be able to mark them as belonging to a large burst in the
* previous sub-step), and now is not needed any more
*
* . the device enters a large-burst mode
*
* . if a queue Q that does not belong to the burst is created while
* the device is in large-burst mode and shortly after the last time
* at which a queue either entered the burst list or was marked as
* belonging to the current large burst, then Q is immediately marked
* as belonging to a large burst.
*
* . if a queue Q that does not belong to the burst is created a while
* later, i.e., not shortly after, than the last time at which a queue
* either entered the burst list or was marked as belonging to the
* current large burst, then the current burst is deemed as finished and:
*
* . the large-burst mode is reset if set
*
* . the burst list is emptied
*
* . Q is inserted in the burst list, as Q may be the first queue
* in a possible new burst (then the burst list contains just Q
* after this step).
*/
static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
/*
* If bfqq is already in the burst list or is part of a large
* burst, or finally has just been split, then there is
* nothing else to do.
*/
if (!hlist_unhashed(&bfqq->burst_list_node) ||
bfq_bfqq_in_large_burst(bfqq) ||
time_is_after_eq_jiffies(bfqq->split_time +
msecs_to_jiffies(10)))
return;
/*
* If bfqq's creation happens late enough, or bfqq belongs to
* a different group than the burst group, then the current
* burst is finished, and related data structures must be
* reset.
*
* In this respect, consider the special case where bfqq is
* the very first queue created after BFQ is selected for this
* device. In this case, last_ins_in_burst and
* burst_parent_entity are not yet significant when we get
* here. But it is easy to verify that, whether or not the
* following condition is true, bfqq will end up being
* inserted into the burst list. In particular the list will
* happen to contain only bfqq. And this is exactly what has
* to happen, as bfqq may be the first queue of the first
* burst.
*/
if (time_is_before_jiffies(bfqd->last_ins_in_burst +
bfqd->bfq_burst_interval) ||
bfqq->entity.parent != bfqd->burst_parent_entity) {
bfqd->large_burst = false;
bfq_reset_burst_list(bfqd, bfqq);
goto end;
}
/*
* If we get here, then bfqq is being activated shortly after the
* last queue. So, if the current burst is also large, we can mark
* bfqq as belonging to this large burst immediately.
*/
if (bfqd->large_burst) {
bfq_mark_bfqq_in_large_burst(bfqq);
goto end;
}
/*
* If we get here, then a large-burst state has not yet been
* reached, but bfqq is being activated shortly after the last
* queue. Then we add bfqq to the burst.
*/
bfq_add_to_burst(bfqd, bfqq);
end:
/*
* At this point, bfqq either has been added to the current
* burst or has caused the current burst to terminate and a
* possible new burst to start. In particular, in the second
* case, bfqq has become the first queue in the possible new
* burst. In both cases last_ins_in_burst needs to be moved
* forward.
*/
bfqd->last_ins_in_burst = jiffies;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 109 | 89.34% | 1 | 50.00% |
Paolo Valente | 13 | 10.66% | 1 | 50.00% |
Total | 122 | 100.00% | 2 | 100.00% |
static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
return entity->budget - entity->service;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 30 | 100.00% | 1 | 100.00% |
Total | 30 | 100.00% | 1 | 100.00% |
/*
* If enough samples have been computed, return the current max budget
* stored in bfqd, which is dynamically updated according to the
* estimated disk peak rate; otherwise return the default max budget
*/
static int bfq_max_budget(struct bfq_data *bfqd)
{
if (bfqd->budgets_assigned < bfq_stats_min_budgets)
return bfq_default_max_budget;
else
return bfqd->bfq_max_budget;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 28 | 100.00% | 1 | 100.00% |
Total | 28 | 100.00% | 1 | 100.00% |
/*
* Return min budget, which is a fraction of the current or default
* max budget (trying with 1/32)
*/
static int bfq_min_budget(struct bfq_data *bfqd)
{
if (bfqd->budgets_assigned < bfq_stats_min_budgets)
return bfq_default_max_budget / 32;
else
return bfqd->bfq_max_budget / 32;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Arianna Avanzini | 32 | 100.00% | 1 | 100.00% |
Total | 32 | 100.00% | 1 | 100.00% |
/*
* The next function, invoked after the input queue bfqq switches from
* idle to busy, updates the budget of bfqq. The function also tells
* whether the in-service queue should be expired, by returning
* true. The purpose of expiring the in-service queue is to give bfqq
* the chance to possibly preempt the in-service queue, and the reason
* for preempting the in-service queue is to achieve one of the two
* goals below.
*
* 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
* expired because it has remained idle. In particular, bfqq may have
* expired for one of the following two reasons:
*
* - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
* and did not make it to issue a new request before its last
* request was served;
*
* - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
* a new request before the expiration of the idling-time.
*
* Even if bfqq has expired for one of the above reasons, the process
* associated with the queue may be however issuing requests greedily,
* and thus be sensitive to the bandwidth it receives (bfqq may have
* remained idle for other reasons: CPU high load, bfqq not enjoying
* idling, I/O throttling somewhere in the path from the process to
* the I/O scheduler, ...). But if, after every expiration for one of
* the above two reasons, bfqq has to wait for the service of at least
* one full budget of another queue before being served again, then
* bfqq is likely to get a much lower bandwidth or resource time than
* its reserved ones. To address this issue, two countermeasures need
* to be taken.
*
* First, the budget and the timestamps of bfqq need to be updated in
* a special way on bfqq reactivation: they need to be updated as if
* bfqq did not remain idle and did not expire. In fact, if they are
* computed as if bfqq expired and remained idle until reactivation,
* then the process associated with bfqq is treated as if, instead of
* being greedy, it stopped issuing requests when bfqq remained idle,
* and restarts issuing requests only on this reactivation. In other
* words, the scheduler does not help the process recover the "service
* hole" between bfqq expiration and reactivation. As a consequence,
* the process receives a lower bandwidth than its reserved one. In
* contrast, to recover this hole, the budget must be updated as if
* bfqq was not expired at all before this reactivation, i.e., it must
* be set to the value of the remaining budget when bfqq was
* expired. Along the same line, timestamps need to be assigned the
* value they had the last time bfqq was selected for service, i.e.,
* before last expiration. Thus timestamps need to be back-shifted
* with respect to their normal computation (see [1] for more details
* on this tricky aspect).
*
* Secondly, to allow the process to recover the hole, the in-service
* queue must be expired too, to give bfqq the chance to preempt it
* immediately. In fact, if bfqq has to wait for a full budget of the
* in-service queue to be completed, then it may become impossible to
* let the process recover the hole, even if the back-shifted
* timestamps of bfqq are lower than those of the in-service queue. If
* this happens for most or all of the holes, then the process may not
* receive its reserved bandwidth. In this respect, it is worth noting
* that, being the service of outstanding requests unpreemptible, a
* little fraction of the holes may however be unrecoverable, thereby
* causing a little loss of bandwidth.
*
* The last important point is detecting whether bfqq does need this
* bandwidth recovery. In this respect, the next function deems the
* process associated with bfqq greedy, and thus allows it to recover
* the hole, if: 1) the process is waiting for the arrival of a new
* request (which implies that bfqq expired for one of the above two
* reasons), and 2) such a request has arrived soon. The first
* condition is controlled through the flag non_blocking_wait_rq,
* while the second through the flag arrived_in_time. If both
* conditions hold, then the function computes the budget in the
* above-described special way, and signals that the in-service queue
* should be expired. Timestamp back-shifting is done later in
* __bfq_activate_entity.
*
* 2. Reduce latency. Even if timestamps are not backshifted to let
* the process associated with bfqq recover a service hole, bfqq may
* however happen to have, after being (re)activated, a lower finish
* timestamp than the in-service queue. That is, the next budget of
* bfqq may have to be completed before the one of the in-service
* queue. If this is the case, then preempting the in-service queue
* allows this goal to be achieved, apart from the unpreemptible,
* outstanding requests mentioned above.
*
* Unfortunately, regardless of which of the above two goals one wants
* to achieve, service trees need first to be updated to know whether
* the in-service queue must be preempted. To have service trees
* correctly updated, the in-service queue must be expired and
* rescheduled, and bfqq must be scheduled too. This is one of the
* most costly operations (in future versions, the scheduling
* mechanism may be re-designed in such a way to make it possible to
* know whether preemption is needed without needing to update service
* trees). In addition, queue preemptions almost always cause random
* I/O, and thus loss of throughput. Because of these facts, the next
* function adopts the following simple scheme to avoid both costly
* operations and too frequent preemptions: it requests the expiration
* of the in-service queue (unconditionally) only for queues that need
* to recover a hole, or that either are weight-raised or deserve to
* be weight-raised.
*/
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
bool arrived_in_time,
bool wr_or_deserves_wr)
{
struct bfq_entity *entity = &bfqq->entity;
if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
/*
* We do not clear the flag non_blocking_wait_rq here, as
* the latter is used in bfq_activate_bfqq to signal
* that timestamps need to be back-shifted (and is
* cleared right after).
*/
/*
* In next assignment we rely on that either
* entity->service or entity->budget are not updated
* on expiration if bfqq is empty (see
* __bfq_bfqq_recalc_budget). Thus both quantities
* remain unchanged after such an expiration, and the
* following statement therefore assigns to
* entity->budget the remaining budget on such an
* expiration. For clarity, entity->service is not
* updated on expiration in any case, and, in normal
* operation, is reset only when bfqq is selected for
* service (see bfq_get_next_queue).
*/
entity->budget = min_t(unsigned long,
bfq_bfqq_budget_left(bfqq),
bfqq->max_budget);
return true;
}
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(bfqq->next_rq, bfqq)