cregit-Linux how code gets into the kernel

Release 4.18 block/bfq-iosched.c

Directory: block
/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2 of the
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. Next paragraphs provide an introduction
 * on BFQ inner workings. Details on BFQ benefits, usage and
 * limitations can be found in Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. In more detail, BFQ
 * behaves this way if the low_latency parameter is set (default
 * configuration). This feature enables BFQ to provide applications in
 * these classes with a very low latency.
 *
 * To implement this feature, BFQ constantly tries to detect whether
 * the I/O requests in a bfq_queue come from an interactive or a soft
 * real-time application. For brevity, in these cases, the queue is
 * said to be interactive or soft real-time. In both cases, BFQ
 * privileges the service of the queue, over that of non-interactive
 * and non-soft-real-time queues. This privileging is performed,
 * mainly, by raising the weight of the queue. So, for brevity, we
 * call just weight-raising periods the time periods during which a
 * queue is privileged, because deemed interactive or soft real-time.
 *
 * The detection of soft real-time queues/applications is described in
 * detail in the comments on the function
 * bfq_bfqq_softrt_next_start. On the other hand, the detection of an
 * interactive queue works as follows: a queue is deemed interactive
 * if it is constantly non empty only for a limited time interval,
 * after which it does become empty. The queue may be deemed
 * interactive again (for a limited time), if it restarts being
 * constantly non empty, provided that this happens only after the
 * queue has remained empty for a given minimum idle time.
 *
 * By default, BFQ computes automatically the above maximum time
 * interval, i.e., the time interval after which a constantly
 * non-empty queue stops being deemed interactive. Since a queue is
 * weight-raised while it is deemed interactive, this maximum time
 * interval happens to coincide with the (maximum) duration of the
 * weight-raising for interactive queues.
 *
 * Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting in many I/O-bound processes.
 *
 * NOTE: if the main or only goal, with a given device, is to achieve
 * the maximum-possible throughput at all times, then do switch off
 * all low-latency heuristics for that device, by setting low_latency
 * to 0.
 *
 * BFQ is described in [1], where also a reference to the initial,
 * more theoretical paper on BFQ can be found. The interested reader
 * can find in the latter paper full details on the main algorithm, as
 * well as formulas of the guarantees and formal proofs of all the
 * properties.  With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * ones that guarantee a low latency to interactive and soft real-time
 * applications, and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *
 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *
 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
#include "blk-wbt.h"


#define BFQ_BFQQ_FNS(name)						\
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)                       \
{                                                                       \
        __set_bit(BFQQF_##name, &(bfqq)->flags);                        \
}                                                                       \
void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)                      \
{                                                                       \
        __clear_bit(BFQQF_##name, &(bfqq)->flags);              \
}                                                                       \
int bfq_bfqq_##name(const struct bfq_queue *bfqq)                       \
{                                                                       \
        return test_bit(BFQQF_##name, &(bfqq)->flags);          \
}

BFQ_BFQQ_FNS(just_created);
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(has_short_ttime);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS						\

/* Expiration time of sync (0) and async (1) requests, in ns. */

static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */

static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */

static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */

static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */

static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */

static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */

static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */

const int bfq_timeout = HZ / 8;

/*
 * Time limit for merging (see comments in bfq_setup_cooperator). Set
 * to the slowest value that, in our tests, proved to be effective in
 * removing false positives, while not causing true positives to miss
 * queue merging.
 *
 * As can be deduced from the low time limit below, queue merging, if
 * successful, happens at the very beggining of the I/O of the involved
 * cooperating processes, as a consequence of the arrival of the very
 * first requests from each cooperator.  After that, there is very
 * little chance to find cooperators.
 */

static const unsigned long bfq_merge_time_limit = HZ/10;


static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */

#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */

#define BFQ_HW_QUEUE_THRESHOLD	4

#define BFQ_HW_QUEUE_SAMPLES	32


#define BFQQ_SEEK_THR		(sector_t)(8 * 100)

#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)

#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)

#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)

/* Min number of samples required to perform peak-rate update */

#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */

#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */

#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/*
 * Shift used for peak-rate fixed precision calculations.
 * With
 * - the current shift: 16 positions
 * - the current type used to store rate: u32
 * - the current unit of measure for rate: [sectors/usec], or, more precisely,
 *   [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift,
 * the range of rates that can be stored is
 * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec =
 * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec =
 * [15, 65G] sectors/sec
 * Which, assuming a sector size of 512B, corresponds to a range of
 * [7.5K, 33T] B/sec
 */

#define BFQ_RATE_SHIFT		16

/*
 * When configured for computing the duration of the weight-raising
 * for interactive queues automatically (see the comments at the
 * beginning of this file), BFQ does it using the following formula:
 * duration = (ref_rate / r) * ref_wr_duration,
 * where r is the peak rate of the device, and ref_rate and
 * ref_wr_duration are two reference parameters.  In particular,
 * ref_rate is the peak rate of the reference storage device (see
 * below), and ref_wr_duration is about the maximum time needed, with
 * BFQ and while reading two files in parallel, to load typical large
 * applications on the reference device (see the comments on
 * max_service_from_wr below, for more details on how ref_wr_duration
 * is obtained).  In practice, the slower/faster the device at hand
 * is, the more/less it takes to load applications with respect to the
 * reference device.  Accordingly, the longer/shorter BFQ grants
 * weight raising to interactive applications.
 *
 * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
 * depending on whether the device is rotational or non-rotational.
 *
 * In the following definitions, ref_rate[0] and ref_wr_duration[0]
 * are the reference values for a rotational device, whereas
 * ref_rate[1] and ref_wr_duration[1] are the reference values for a
 * non-rotational device. The reference rates are not the actual peak
 * rates of the devices used as a reference, but slightly lower
 * values. The reason for using slightly lower values is that the
 * peak-rate estimator tends to yield slightly lower values than the
 * actual peak rate (it can yield the actual peak rate only if there
 * is only one process doing I/O, and the process does sequential
 * I/O).
 *
 * The reference peak rates are measured in sectors/usec, left-shifted
 * by BFQ_RATE_SHIFT.
 */

static int ref_rate[2] = {14000, 33000};
/*
 * To improve readability, a conversion function is used to initialize
 * the following array, which entails that the array can be
 * initialized only in a function.
 */

static int ref_wr_duration[2];

/*
 * BFQ uses the above-detailed, time-based weight-raising mechanism to
 * privilege interactive tasks. This mechanism is vulnerable to the
 * following false positives: I/O-bound applications that will go on
 * doing I/O for much longer than the duration of weight
 * raising. These applications have basically no benefit from being
 * weight-raised at the beginning of their I/O. On the opposite end,
 * while being weight-raised, these applications
 * a) unjustly steal throughput to applications that may actually need
 * low latency;
 * b) make BFQ uselessly perform device idling; device idling results
 * in loss of device throughput with most flash-based storage, and may
 * increase latencies when used purposelessly.
 *
 * BFQ tries to reduce these problems, by adopting the following
 * countermeasure. To introduce this countermeasure, we need first to
 * finish explaining how the duration of weight-raising for
 * interactive tasks is computed.
 *
 * For a bfq_queue deemed as interactive, the duration of weight
 * raising is dynamically adjusted, as a function of the estimated
 * peak rate of the device, so as to be equal to the time needed to
 * execute the 'largest' interactive task we benchmarked so far. By
 * largest task, we mean the task for which each involved process has
 * to do more I/O than for any of the other tasks we benchmarked. This
 * reference interactive task is the start-up of LibreOffice Writer,
 * and in this task each process/bfq_queue needs to have at most ~110K
 * sectors transferred.
 *
 * This last piece of information enables BFQ to reduce the actual
 * duration of weight-raising for at least one class of I/O-bound
 * applications: those doing sequential or quasi-sequential I/O. An
 * example is file copy. In fact, once started, the main I/O-bound
 * processes of these applications usually consume the above 110K
 * sectors in much less time than the processes of an application that
 * is starting, because these I/O-bound processes will greedily devote
 * almost all their CPU cycles only to their target,
 * throughput-friendly I/O operations. This is even more true if BFQ
 * happens to be underestimating the device peak rate, and thus
 * overestimating the duration of weight raising. But, according to
 * our measurements, once transferred 110K sectors, these processes
 * have no right to be weight-raised any longer.
 *
 * Basing on the last consideration, BFQ ends weight-raising for a
 * bfq_queue if the latter happens to have received an amount of
 * service at least equal to the following constant. The constant is
 * set to slightly more than 110K, to have a minimum safety margin.
 *
 * This early ending of weight-raising reduces the amount of time
 * during which interactive false positives cause the two problems
 * described at the beginning of these comments.
 */

static const unsigned long max_service_from_wr = 120000;


#define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])

#define RQ_BFQQ(rq)		((rq)->elv.priv[1])


struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) { return bic->bfqq[is_sync]; }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente2191.30%266.67%
Arianna Avanzini28.70%133.33%
Total23100.00%3100.00%


void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) { bic->bfqq[is_sync] = bfqq; }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente1970.37%266.67%
Arianna Avanzini829.63%133.33%
Total27100.00%3100.00%


struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) { return bic->icq.q->elevator->elevator_data; }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente1252.17%150.00%
Arianna Avanzini1147.83%150.00%
Total23100.00%2100.00%

/** * icq_to_bic - convert iocontext queue structure to bfq_io_cq. * @icq: the iocontext queue. */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) { /* bic->icq is the first member, %NULL will convert to %NULL */ return container_of(icq, struct bfq_io_cq, icq); }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente1456.00%150.00%
Arianna Avanzini1144.00%150.00%
Total25100.00%2100.00%

/** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. * @bfqd: the lookup key. * @ioc: the io_context of the process doing I/O. * @q: the request queue. */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, struct io_context *ioc, struct request_queue *q) { if (ioc) { unsigned long flags; struct bfq_io_cq *icq; spin_lock_irqsave(q->queue_lock, flags); icq = icq_to_bic(ioc_lookup_icq(ioc, q)); spin_unlock_irqrestore(q->queue_lock, flags); return icq; } return NULL; }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente4966.22%266.67%
Arianna Avanzini2533.78%133.33%
Total74100.00%3100.00%

/* * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. */
void bfq_schedule_dispatch(struct bfq_data *bfqd) { if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); blk_mq_run_hw_queues(bfqd->queue, true); } }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente36100.00%1100.00%
Total36100.00%1100.00%

#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) /* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closesr to the head right now. Distance * behind the head is penalized and only allowed to a certain extent. */
static struct request *bfq_choose_req(struct bfq_data *bfqd, struct request *rq1, struct request *rq2, sector_t last) { sector_t s1, s2, d1 = 0, d2 = 0; unsigned long back_max; #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ if (!rq1 || rq1 == rq2) return rq2; if (!rq2) return rq1; if (rq_is_sync(rq1) && !rq_is_sync(rq2)) return rq1; else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) return rq2; if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) return rq1; else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) return rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); /* * By definition, 1KiB is 2 sectors. */ back_max = bfqd->bfq_back_max * 2; /* * Strict one way elevator _except_ in the case where we allow * short backward seeks which are biased as twice the cost of a * similar forward seek. */ if (s1 >= last) d1 = s1 - last; else if (s1 + back_max >= last) d1 = (last - s1) * bfqd->bfq_back_penalty; else wrap |= BFQ_RQ1_WRAP; if (s2 >= last) d2 = s2 - last; else if (s2 + back_max >= last) d2 = (last - s2) * bfqd->bfq_back_penalty; else wrap |= BFQ_RQ2_WRAP; /* Found required data */ /* * By doing switch() on the bit mask "wrap" we avoid having to * check two variables for all permutations: --> faster! */ switch (wrap) { case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ if (d1 < d2) return rq1; else if (d2 < d1) return rq2; if (s1 >= s2) return rq1; else return rq2; case BFQ_RQ2_WRAP: return rq1; case BFQ_RQ1_WRAP: return rq2; case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */ default: /* * Since both rqs are wrapped, * start with the one that's further behind head * (--> only *one* back seek required), * since back seek takes more time than forward. */ if (s1 <= s2) return rq1; else return rq2; } }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente335100.00%1100.00%
Total335100.00%1100.00%

/* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. * Limit depths of async I/O and sync writes so as to counter both * problems. */
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; if (op_is_sync(op) && !op_is_write(op)) return; data->shallow_depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", __func__, bfqd->wr_busy_queues, op_is_sync(op), data->shallow_depth); }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente85100.00%1100.00%
Total85100.00%1100.00%


static struct bfq_queue * bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, struct rb_node ***rb_link) { struct rb_node **p, *parent; struct bfq_queue *bfqq = NULL; parent = NULL; p = &root->rb_node; while (*p) { struct rb_node **n; parent = *p; bfqq = rb_entry(parent, struct bfq_queue, pos_node); /* * Sort strictly based on sector. Smallest to the left, * largest to the right. */ if (sector > blk_rq_pos(bfqq->next_rq)) n = &(*p)->rb_right; else if (sector < blk_rq_pos(bfqq->next_rq)) n = &(*p)->rb_left; else break; p = n; bfqq = NULL; } *ret_parent = parent; if (rb_link) *rb_link = p; bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", (unsigned long long)sector, bfqq ? bfqq->pid : 0); return bfqq; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini15684.78%266.67%
Paolo Valente2815.22%133.33%
Total184100.00%3100.00%


static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) { return bfqq->service_from_backlogged > 0 && time_is_before_jiffies(bfqq->first_IO_time + bfq_merge_time_limit); }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente27100.00%1100.00%
Total27100.00%1100.00%


void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; struct bfq_queue *__bfqq; if (bfqq->pos_root) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } /* * bfqq cannot be merged any longer (see comments in * bfq_setup_cooperator): no point in adding bfqq into the * position tree. */ if (bfq_too_late_for_merging(bfqq)) return; if (bfq_class_idle(bfqq)) return; if (!bfqq->next_rq) return; bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); if (!__bfqq) { rb_link_node(&bfqq->pos_node, parent, p); rb_insert_color(&bfqq->pos_node, bfqq->pos_root); } else bfqq->pos_root = NULL; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini14594.16%150.00%
Paolo Valente95.84%150.00%
Total154100.00%2100.00%

/* * Tell whether there are active queues or groups with differentiated weights. */
static bool bfq_differentiated_weights(struct bfq_data *bfqd) { /* * For weights to differ, at least one of the trees must contain * at least two nodes. */ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && (bfqd->queue_weights_tree.rb_node->rb_left || bfqd->queue_weights_tree.rb_node->rb_right) #ifdef CONFIG_BFQ_GROUP_IOSCHED ) || (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && (bfqd->group_weights_tree.rb_node->rb_left || bfqd->group_weights_tree.rb_node->rb_right) #endif ); }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini76100.00%1100.00%
Total76100.00%1100.00%

/* * The following function returns true if every queue must receive the * same share of the throughput (this condition is used when deciding * whether idling may be disabled, see the comments in the function * bfq_bfqq_may_idle()). * * Such a scenario occurs when: * 1) all active queues have the same weight, * 2) all active groups at the same level in the groups tree have the same * weight, * 3) all active groups at the same level in the groups tree have the same * number of children. * * Unfortunately, keeping the necessary state for evaluating exactly the * above symmetry conditions would be quite complex and time-consuming. * Therefore this function evaluates, instead, the following stronger * sub-conditions, for which it is much easier to maintain the needed * state: * 1) all active queues have the same weight, * 2) all active groups have the same weight, * 3) all active groups have at most one active child each. * In particular, the last two conditions are always true if hierarchical * support and the cgroups interface are not enabled, thus no state needs * to be maintained in this case. */
static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { return !bfq_differentiated_weights(bfqd); }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini18100.00%1100.00%
Total18100.00%1100.00%

/* * If the weight-counter tree passed as input contains no counter for * the weight of the input entity, then add that counter; otherwise just * increment the existing counter. * * Note that weight-counter trees contain few nodes in mostly symmetric * scenarios. For example, if all queues have the same weight, then the * weight-counter tree for the queues may contain at most one node. * This holds even if low_latency is on, because weight-raised queues * are not inserted in the tree. * In most scenarios, the rate at which nodes are created/destroyed * should be low too. */
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root) { struct rb_node **new = &(root->rb_node), *parent = NULL; /* * Do not insert if the entity is already associated with a * counter, which happens if: * 1) the entity is associated with a queue, * 2) a request arrival has caused the queue to become both * non-weight-raised, and hence change its weight, and * backlogged; in this respect, each of the two events * causes an invocation of this function, * 3) this is the invocation of this function caused by the * second event. This second invocation is actually useless, * and we handle this fact by exiting immediately. More * efficient or clearer solutions might possibly be adopted. */ if (entity->weight_counter) return; while (*new) { struct bfq_weight_counter *__counter = container_of(*new, struct bfq_weight_counter, weights_node); parent = *new; if (entity->weight == __counter->weight) { entity->weight_counter = __counter; goto inc_counter; } if (entity->weight < __counter->weight) new = &((*new)->rb_left); else new = &((*new)->rb_right); } entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), GFP_ATOMIC); /* * In the unlucky event of an allocation failure, we just * exit. This will cause the weight of entity to not be * considered in bfq_differentiated_weights, which, in its * turn, causes the scenario to be deemed wrongly symmetric in * case entity's weight would have been the only weight making * the scenario asymmetric. On the bright side, no unbalance * will however occur when entity becomes inactive again (the * invocation of this function is triggered by an activation * of entity). In fact, bfq_weights_tree_remove does nothing * if !entity->weight_counter. */ if (unlikely(!entity->weight_counter)) return; entity->weight_counter->weight = entity->weight; rb_link_node(&entity->weight_counter->weights_node, parent, new); rb_insert_color(&entity->weight_counter->weights_node, root); inc_counter: entity->weight_counter->num_active++; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini19596.53%266.67%
Paolo Valente73.47%133.33%
Total202100.00%3100.00%

/* * Decrement the weight counter associated with the entity, and, if the * counter reaches 0, remove the counter from the tree. * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */
void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root) { if (!entity->weight_counter) return; entity->weight_counter->num_active--; if (entity->weight_counter->num_active > 0) goto reset_entity_pointer; rb_erase(&entity->weight_counter->weights_node, root); kfree(entity->weight_counter); reset_entity_pointer: entity->weight_counter = NULL; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini5573.33%266.67%
Paolo Valente2026.67%133.33%
Total75100.00%3100.00%

/* * Return expired entry, or NULL to just start from scratch in rbtree. */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq, struct request *last) { struct request *rq; if (bfq_bfqq_fifo_expire(bfqq)) return NULL; bfq_mark_bfqq_fifo_expire(bfqq); rq = rq_entry_fifo(bfqq->fifo.next); if (rq == last || ktime_get_ns() < rq->fifo_time) return NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); return rq; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini4960.49%125.00%
Paolo Valente3239.51%375.00%
Total81100.00%4100.00%


static struct request *bfq_find_next_rq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *last) { struct rb_node *rbnext = rb_next(&last->rb_node); struct rb_node *rbprev = rb_prev(&last->rb_node); struct request *next, *prev = NULL; /* Follow expired path, else get first next available. */ next = bfq_check_fifo(bfqq, last); if (next) return next; if (rbprev) prev = rb_entry_rq(rbprev); if (rbnext) next = rb_entry_rq(rbnext); else { rbnext = rb_first(&bfqq->sort_list); if (rbnext && rbnext != &last->rb_node) next = rb_entry_rq(rbnext); } return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini11982.64%150.00%
Paolo Valente2517.36%150.00%
Total144100.00%2100.00%

/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) return blk_rq_sectors(rq); /* * If there are no weight-raised queues, then amplify service * by just the async charge factor; otherwise amplify service * by twice the async charge factor, to further reduce latency * for weight-raised queues. */ if (bfqq->bfqd->wr_busy_queues == 0) return blk_rq_sectors(rq) * bfq_async_charge_factor; return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini65100.00%1100.00%
Total65100.00%1100.00%

/** * bfq_updated_next_req - update the queue after a new next_rq selection. * @bfqd: the device data the queue belongs to. * @bfqq: the queue to update. * * If the first request of a queue changes we make sure that the queue * has enough budget to serve at least its first request (if the * request has grown). We do this because if the queue has not enough * budget for its first request, it has to go through two dispatch * rounds to actually get it dispatched. */
static void bfq_updated_next_req(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; struct request *next_rq = bfqq->next_rq; unsigned long new_budget; if (!next_rq) return; if (bfqq == bfqd->in_service_queue) /* * In order not to break guarantees, budgets cannot be * changed after an entity has been selected. */ return; new_budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); if (entity->budget != new_budget) { entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); bfq_requeue_bfqq(bfqd, bfqq, false); } }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente6260.78%250.00%
Arianna Avanzini4039.22%250.00%
Total102100.00%4100.00%


static unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; if (bfqd->bfq_wr_max_time > 0) return bfqd->bfq_wr_max_time; dur = bfqd->rate_dur_prod; do_div(dur, bfqd->peak_rate); /* * Limit duration between 3 and 25 seconds. The upper limit * has been conservatively set after the following worst case: * on a QEMU/KVM virtual machine * - running in a slow PC * - with a virtual disk stacked on a slow low-end 5400rpm HDD * - serving a heavy I/O workload, such as the sequential reading * of several files * mplayer took 23 seconds to start, if constantly weight-raised. * * As for higher values than that accomodating the above bad * scenario, tests show that higher values would often yield * the opposite of the desired result, i.e., would worsen * responsiveness by allowing non-interactive applications to * preserve weight raising for too long. * * On the other end, lower values than 3 seconds make it * difficult for most interactive tasks to complete their jobs * before weight-raising finishes. */ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente5388.33%266.67%
Davide Sapienza711.67%133.33%
Total60100.00%3100.00%

/* switch back from soft real-time to interactive weight raising */
static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, struct bfq_data *bfqd) { bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente41100.00%1100.00%
Total41100.00%1100.00%


static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) { unsigned int old_wr_coeff = bfqq->wr_coeff; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); if (bic->saved_has_short_ttime) bfq_mark_bfqq_has_short_ttime(bfqq); else bfq_clear_bfqq_has_short_ttime(bfqq); if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); bfqq->ttime = bic->saved_ttime; bfqq->wr_coeff = bic->saved_wr_coeff; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && !bfq_bfqq_in_large_burst(bfqq) && time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + bfq_wr_duration(bfqd))) { switch_back_to_interactive_wr(bfqq, bfqd); } else { bfqq->wr_coeff = 1; bfq_log_bfqq(bfqq->bfqd, bfqq, "resume state: switching off wr"); } } /* make sure weight will be updated, however we got here */ bfqq->entity.prio_changed = 1; if (likely(!busy)) return; if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) bfqd->wr_busy_queues--; }

Contributors

PersonTokensPropCommitsCommitProp
Paolo Valente13051.18%466.67%
Arianna Avanzini12448.82%233.33%
Total254100.00%6100.00%


static int bfqq_process_refs(struct bfq_queue *bfqq) { return bfqq->ref - bfqq->allocated - bfqq->entity.on_st; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini26100.00%1100.00%
Total26100.00%1100.00%

/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_queue *item; struct hlist_node *n; hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&item->burst_list_node); hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); bfqd->burst_size = 1; bfqd->burst_parent_entity = bfqq->entity.parent; }

Contributors

PersonTokensPropCommitsCommitProp
Arianna Avanzini73100.00%2100.00%
Total73100.00%2100.00%

/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* Increment burst size to take into account also bfqq */ bfqd->burst_size++; if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { struct bfq_queue *pos, *bfqq_item; struct hlist_node *n; /* * Enough queues have been activated shortly after each * other to consider this burst as large. */ bfqd->large_burst = true; /* * We can now mark all queues in the burst list as * belonging to a large burst. */ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, burst_list_node) bfq_mark_bfqq_in_large_burst(bfqq_item); bfq_mark_bfqq_in_large_burst(bfqq); /* * From now on, and until the current burst finishes, any * new queue being activated shortly after the last queue * was inserted in the burst can be immediately marked as * belonging to a large burst. So the burst list is not * needed any more. Remove it. */ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&