cregit-Linux how code gets into the kernel

Release 4.12 block/bfq-iosched.c

Directory: block
 * Budget Fair Queueing (BFQ) I/O scheduler.
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <>
 * Copyright (C) 2008 Fabio Checconi <>
 *                    Paolo Valente <>
 * Copyright (C) 2010 Paolo Valente <>
 *                    Arianna Avanzini <>
 * Copyright (C) 2017 Paolo Valente <>
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2 of the
 *  License, or (at your option) any later version.
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  General Public License for more details.
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. Next paragraphs provide an introduction
 * on BFQ inner workings. Details on BFQ benefits, usage and
 * limitations can be found in Documentation/block/bfq-iosched.txt.
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting in many I/O-bound processes.
 * NOTE: if the main or only goal, with a given device, is to achieve
 * the maximum-possible throughput at all times, then do switch off
 * all low-latency heuristics for that device, by setting low_latency
 * to 0.
 * BFQ is described in [1], where also a reference to the initial, more
 * theoretical paper on BFQ can be found. The interested reader can find
 * in the latter paper full details on the main algorithm, as well as
 * formulas of the guarantees and formal proofs of all the properties.
 * With respect to the version of BFQ presented in these papers, this
 * implementation adds a few more heuristics, such as the one that
 * guarantees a low latency to soft real-time applications, and a
 * hierarchical extension based on H-WF2Q+.
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include "bfq-iosched.h"

#define BFQ_BFQQ_FNS(name)						\
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)                       \
{                                                                       \
        __set_bit(BFQQF_##name, &(bfqq)->flags);                        \
}                                                                       \
void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)                      \
{                                                                       \
        __clear_bit(BFQQF_##name, &(bfqq)->flags);              \
}                                                                       \
int bfq_bfqq_##name(const struct bfq_queue *bfqq)                       \
{                                                                       \
        return test_bit(BFQQF_##name, &(bfqq)->flags);          \













#undef BFQ_BFQQ_FNS						

/* Expiration time of sync (0) and async (1) requests, in ns. */

static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */

static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */

static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */

static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */

static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */

static const int bfq_default_max_budget = 16 * 1024;

 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below

static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */

const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */

#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */



#define BFQQ_SEEK_THR		(sector_t)(8 * 100)

#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)

#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)

#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min number of samples required to perform peak-rate update */

/* Min observation time interval required to perform a peak-rate update (ns) */

/* Target observation time interval for a peak-rate update (ns) */


/* Shift used for peak rate fixed precision calculations. */

#define BFQ_RATE_SHIFT		16

 * By default, BFQ computes the duration of the weight raising for
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
 * In particular, R is the peak rate of the reference device (see below),
 * and T is a reference time: given the systems that are likely to be
 * installed on the reference device according to its speed class, T is
 * about the maximum time needed, under BFQ and while reading two files in
 * parallel, to load typical large applications on these systems.
 * In practice, the slower/faster the device at hand is, the more/less it
 * takes to load applications with respect to the reference device.
 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
 * applications.
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
 * . whether the device is slow, such as old or portable HDDs, as well as
 *   SD cards, or fast, such as newer HDDs and SSDs.
 * The device's speed class is dynamically (re)detected in
 * bfq_update_peak_rate() every time the estimated peak rate is updated.
 * In the following definitions, R_slow[0]/R_fast[0] and
 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
 * rotational device, whereas R_slow[1]/R_fast[1] and
 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
 * non-rotational device. Finally, device_speed_thresh are the
 * thresholds used to switch between speed classes. The reference
 * rates are not the actual peak rates of the devices used as a
 * reference, but slightly lower values. The reason for using these
 * slightly lower values is that the peak-rate estimator tends to
 * yield slightly lower values than the actual peak rate (it can yield
 * the actual peak rate only if there is only one process doing I/O,
 * and the process does sequential I/O).
 * Both the reference peak rates and the thresholds are measured in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.

static int R_slow[2] = {1000, 10700};

static int R_fast[2] = {14000, 33000};
 * To improve readability, a conversion function is used to initialize the
 * following arrays, which entails that they can be initialized only in a
 * function.

static int T_slow[2];

static int T_fast[2];

static int device_speed_thresh[2];

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])

#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) { return bic->bfqq[is_sync]; }


Paolo Valente2191.30%266.67%
Arianna Avanzini28.70%133.33%

void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) { bic->bfqq[is_sync] = bfqq; }


Paolo Valente1970.37%266.67%
Arianna Avanzini829.63%133.33%

struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) { return bic->icq.q->elevator->elevator_data; }


Paolo Valente1252.17%150.00%
Arianna Avanzini1147.83%150.00%

/** * icq_to_bic - convert iocontext queue structure to bfq_io_cq. * @icq: the iocontext queue. */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) { /* bic->icq is the first member, %NULL will convert to %NULL */ return container_of(icq, struct bfq_io_cq, icq); }


Paolo Valente1456.00%150.00%
Arianna Avanzini1144.00%150.00%

/** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. * @bfqd: the lookup key. * @ioc: the io_context of the process doing I/O. * @q: the request queue. */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, struct io_context *ioc, struct request_queue *q) { if (ioc) { unsigned long flags; struct bfq_io_cq *icq; spin_lock_irqsave(q->queue_lock, flags); icq = icq_to_bic(ioc_lookup_icq(ioc, q)); spin_unlock_irqrestore(q->queue_lock, flags); return icq; } return NULL; }


Paolo Valente4966.22%266.67%
Arianna Avanzini2533.78%133.33%

/* * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. */
void bfq_schedule_dispatch(struct bfq_data *bfqd) { if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); blk_mq_run_hw_queues(bfqd->queue, true); } }


Paolo Valente36100.00%1100.00%

#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) /* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closesr to the head right now. Distance * behind the head is penalized and only allowed to a certain extent. */
static struct request *bfq_choose_req(struct bfq_data *bfqd, struct request *rq1, struct request *rq2, sector_t last) { sector_t s1, s2, d1 = 0, d2 = 0; unsigned long back_max; #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ if (!rq1 || rq1 == rq2) return rq2; if (!rq2) return rq1; if (rq_is_sync(rq1) && !rq_is_sync(rq2)) return rq1; else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) return rq2; if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) return rq1; else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) return rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); /* * By definition, 1KiB is 2 sectors. */ back_max = bfqd->bfq_back_max * 2; /* * Strict one way elevator _except_ in the case where we allow * short backward seeks which are biased as twice the cost of a * similar forward seek. */ if (s1 >= last) d1 = s1 - last; else if (s1 + back_max >= last) d1 = (last - s1) * bfqd->bfq_back_penalty; else wrap |= BFQ_RQ1_WRAP; if (s2 >= last) d2 = s2 - last; else if (s2 + back_max >= last) d2 = (last - s2) * bfqd->bfq_back_penalty; else wrap |= BFQ_RQ2_WRAP; /* Found required data */ /* * By doing switch() on the bit mask "wrap" we avoid having to * check two variables for all permutations: --> faster! */ switch (wrap) { case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ if (d1 < d2) return rq1; else if (d2 < d1) return rq2; if (s1 >= s2) return rq1; else return rq2; case BFQ_RQ2_WRAP: return rq1; case BFQ_RQ1_WRAP: return rq2; case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */ default: /* * Since both rqs are wrapped, * start with the one that's further behind head * (--> only *one* back seek required), * since back seek takes more time than forward. */ if (s1 <= s2) return rq1; else return rq2; } }


Paolo Valente335100.00%1100.00%

static struct bfq_queue * bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, struct rb_node ***rb_link) { struct rb_node **p, *parent; struct bfq_queue *bfqq = NULL; parent = NULL; p = &root->rb_node; while (*p) { struct rb_node **n; parent = *p; bfqq = rb_entry(parent, struct bfq_queue, pos_node); /* * Sort strictly based on sector. Smallest to the left, * largest to the right. */ if (sector > blk_rq_pos(bfqq->next_rq)) n = &(*p)->rb_right; else if (sector < blk_rq_pos(bfqq->next_rq)) n = &(*p)->rb_left; else break; p = n; bfqq = NULL; } *ret_parent = parent; if (rb_link) *rb_link = p; bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", (unsigned long long)sector, bfqq ? bfqq->pid : 0); return bfqq; }


Arianna Avanzini15684.78%266.67%
Paolo Valente2815.22%133.33%

void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; struct bfq_queue *__bfqq; if (bfqq->pos_root) { rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } if (bfq_class_idle(bfqq)) return; if (!bfqq->next_rq) return; bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); if (!__bfqq) { rb_link_node(&bfqq->pos_node, parent, p); rb_insert_color(&bfqq->pos_node, bfqq->pos_root); } else bfqq->pos_root = NULL; }


Arianna Avanzini145100.00%1100.00%

/* * Tell whether there are active queues or groups with differentiated weights. */
static bool bfq_differentiated_weights(struct bfq_data *bfqd) { /* * For weights to differ, at least one of the trees must contain * at least two nodes. */ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && (bfqd->queue_weights_tree.rb_node->rb_left || bfqd->queue_weights_tree.rb_node->rb_right) #ifdef CONFIG_BFQ_GROUP_IOSCHED ) || (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && (bfqd->group_weights_tree.rb_node->rb_left || bfqd->group_weights_tree.rb_node->rb_right) #endif ); }


Arianna Avanzini76100.00%1100.00%

/* * The following function returns true if every queue must receive the * same share of the throughput (this condition is used when deciding * whether idling may be disabled, see the comments in the function * bfq_bfqq_may_idle()). * * Such a scenario occurs when: * 1) all active queues have the same weight, * 2) all active groups at the same level in the groups tree have the same * weight, * 3) all active groups at the same level in the groups tree have the same * number of children. * * Unfortunately, keeping the necessary state for evaluating exactly the * above symmetry conditions would be quite complex and time-consuming. * Therefore this function evaluates, instead, the following stronger * sub-conditions, for which it is much easier to maintain the needed * state: * 1) all active queues have the same weight, * 2) all active groups have the same weight, * 3) all active groups have at most one active child each. * In particular, the last two conditions are always true if hierarchical * support and the cgroups interface are not enabled, thus no state needs * to be maintained in this case. */
static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { return !bfq_differentiated_weights(bfqd); }


Arianna Avanzini18100.00%1100.00%

/* * If the weight-counter tree passed as input contains no counter for * the weight of the input entity, then add that counter; otherwise just * increment the existing counter. * * Note that weight-counter trees contain few nodes in mostly symmetric * scenarios. For example, if all queues have the same weight, then the * weight-counter tree for the queues may contain at most one node. * This holds even if low_latency is on, because weight-raised queues * are not inserted in the tree. * In most scenarios, the rate at which nodes are created/destroyed * should be low too. */
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root) { struct rb_node **new = &(root->rb_node), *parent = NULL; /* * Do not insert if the entity is already associated with a * counter, which happens if: * 1) the entity is associated with a queue, * 2) a request arrival has caused the queue to become both * non-weight-raised, and hence change its weight, and * backlogged; in this respect, each of the two events * causes an invocation of this function, * 3) this is the invocation of this function caused by the * second event. This second invocation is actually useless, * and we handle this fact by exiting immediately. More * efficient or clearer solutions might possibly be adopted. */ if (entity->weight_counter) return; while (*new) { struct bfq_weight_counter *__counter = container_of(*new, struct bfq_weight_counter, weights_node); parent = *new; if (entity->weight == __counter->weight) { entity->weight_counter = __counter; goto inc_counter; } if (entity->weight < __counter->weight) new = &((*new)->rb_left); else new = &((*new)->rb_right); } entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), GFP_ATOMIC); /* * In the unlucky event of an allocation failure, we just * exit. This will cause the weight of entity to not be * considered in bfq_differentiated_weights, which, in its * turn, causes the scenario to be deemed wrongly symmetric in * case entity's weight would have been the only weight making * the scenario asymmetric. On the bright side, no unbalance * will however occur when entity becomes inactive again (the * invocation of this function is triggered by an activation * of entity). In fact, bfq_weights_tree_remove does nothing * if !entity->weight_counter. */ if (unlikely(!entity->weight_counter)) return; entity->weight_counter->weight = entity->weight; rb_link_node(&entity->weight_counter->weights_node, parent, new); rb_insert_color(&entity->weight_counter->weights_node, root); inc_counter: entity->weight_counter->num_active++; }


Arianna Avanzini19596.53%266.67%
Paolo Valente73.47%133.33%

/* * Decrement the weight counter associated with the entity, and, if the * counter reaches 0, remove the counter from the tree. * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */
void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root) { if (!entity->weight_counter) return; entity->weight_counter->num_active--; if (entity->weight_counter->num_active > 0) goto reset_entity_pointer; rb_erase(&entity->weight_counter->weights_node, root); kfree(entity->weight_counter); reset_entity_pointer: entity->weight_counter = NULL; }


Arianna Avanzini5573.33%266.67%
Paolo Valente2026.67%133.33%

/* * Return expired entry, or NULL to just start from scratch in rbtree. */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq, struct request *last) { struct request *rq; if (bfq_bfqq_fifo_expire(bfqq)) return NULL; bfq_mark_bfqq_fifo_expire(bfqq); rq = rq_entry_fifo(bfqq->; if (rq == last || ktime_get_ns() < rq->fifo_time) return NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); return rq; }


Arianna Avanzini4960.49%125.00%
Paolo Valente3239.51%375.00%

static struct request *bfq_find_next_rq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *last) { struct rb_node *rbnext = rb_next(&last->rb_node); struct rb_node *rbprev = rb_prev(&last->rb_node); struct request *next, *prev = NULL; /* Follow expired path, else get first next available. */ next = bfq_check_fifo(bfqq, last); if (next) return next; if (rbprev) prev = rb_entry_rq(rbprev); if (rbnext) next = rb_entry_rq(rbnext); else { rbnext = rb_first(&bfqq->sort_list); if (rbnext && rbnext != &last->rb_node) next = rb_entry_rq(rbnext); } return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); }


Arianna Avanzini11982.64%150.00%
Paolo Valente2517.36%150.00%

/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) return blk_rq_sectors(rq); /* * If there are no weight-raised queues, then amplify service * by just the async charge factor; otherwise amplify service * by twice the async charge factor, to further reduce latency * for weight-raised queues. */ if (bfqq->bfqd->wr_busy_queues == 0) return blk_rq_sectors(rq) * bfq_async_charge_factor; return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; }


Arianna Avanzini65100.00%1100.00%

/** * bfq_updated_next_req - update the queue after a new next_rq selection. * @bfqd: the device data the queue belongs to. * @bfqq: the queue to update. * * If the first request of a queue changes we make sure that the queue * has enough budget to serve at least its first request (if the * request has grown). We do this because if the queue has not enough * budget for its first request, it has to go through two dispatch * rounds to actually get it dispatched. */
static void bfq_updated_next_req(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; struct request *next_rq = bfqq->next_rq; unsigned long new_budget; if (!next_rq) return; if (bfqq == bfqd->in_service_queue) /* * In order not to break guarantees, budgets cannot be * changed after an entity has been selected. */ return; new_budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); if (entity->budget != new_budget) { entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); bfq_requeue_bfqq(bfqd, bfqq); } }


Paolo Valente6060.00%133.33%
Arianna Avanzini4040.00%266.67%

static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { if (bic->saved_idle_window) bfq_mark_bfqq_idle_window(bfqq); else bfq_clear_bfqq_idle_window(bfqq); if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); bfqq->ttime = bic->saved_ttime; bfqq->wr_coeff = bic->saved_wr_coeff; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { bfq_log_bfqq(bfqq->bfqd, bfqq, "resume state: switching off wr"); bfqq->wr_coeff = 1; } /* make sure weight will be updated, however we got here */ bfqq->entity.prio_changed = 1; }


Arianna Avanzini13090.28%266.67%
Paolo Valente149.72%133.33%

static int bfqq_process_refs(struct bfq_queue *bfqq) { return bfqq->ref - bfqq->allocated - bfqq->entity.on_st; }


Arianna Avanzini26100.00%1100.00%

/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_queue *item; struct hlist_node *n; hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&item->burst_list_node); hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); bfqd->burst_size = 1; bfqd->burst_parent_entity = bfqq->entity.parent; }


Arianna Avanzini73100.00%2100.00%

/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* Increment burst size to take into account also bfqq */ bfqd->burst_size++; if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { struct bfq_queue *pos, *bfqq_item; struct hlist_node *n; /* * Enough queues have been activated shortly after each * other to consider this burst as large. */ bfqd->large_burst = true; /* * We can now mark all queues in the burst list as * belonging to a large burst. */ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, burst_list_node) bfq_mark_bfqq_in_large_burst(bfqq_item); bfq_mark_bfqq_in_large_burst(bfqq); /* * From now on, and until the current burst finishes, any * new queue being activated shortly after the last queue * was inserted in the burst can be immediately marked as * belonging to a large burst. So the burst list is not * needed any more. Remove it. */ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, burst_list_node) hlist_del_init(&pos->burst_list_node); } else /* * Burst not yet large: add bfqq to the burst list. Do * not increment the ref counter for bfqq, because bfqq * is removed from the burst list before freeing bfqq * in put_queue. */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); }


Arianna Avanzini9790.65%266.67%
Paolo Valente109.35%133.33%

/* * If many queues belonging to the same group happen to be created * shortly after each other, then the processes associated with these * queues have typically a common goal. In particular, bursts of queue * creations are usually caused by services or applications that spawn * many parallel threads/processes. Examples are systemd during boot, * or git grep. To help these processes get their job done as soon as * possible, it is usually better to not grant either weight-raising * or device idling to their queues. * * In this comment we describe, firstly, the reasons why this fact * holds, and, secondly, the next function, which implements the main * steps needed to properly mark these queues so that they can then be * treated in a different way. * * The above services or applications benefit mostly from a high * throughput: the quicker the requests of the activated queues are * cumulatively served, the sooner the target job of these queues gets * completed. As a consequence, weight-raising any of these queues, * which also implies idling the device for it, is almost always * counterproductive. In most cases it just lowers throughput. * * On the other hand, a burst of queue creations may be caused also by * the start of an application that does not consist of a lot of * parallel I/O-bound threads. In fact, with a complex application, * several short processes may need to be executed to start-up the * application. In this respect, to start an application as quickly as * possible, the best thing to do is in any case to privilege the I/O * related to the application with respect to all other * I/O. Therefore, the best strategy to start as quickly as possible * an application that causes a burst of queue creations is to * weight-raise all the queues created during the burst. This is the * exact opposite of the best strategy for the other type of bursts. * * In the end, to take the best action for each of the two cases, the * two types of bursts need to be distinguished. Fortunately, this * seems relatively easy, by looking at the sizes of the bursts. In * particular, we found a threshold such that only bursts with a * larger size than that threshold are apparently caused by * services or commands such as systemd or git grep. For brevity, * hereafter we call just 'large' these bursts. BFQ *does not* * weight-raise queues whose creation occurs in a large burst. In * addition, for each of these queues BFQ performs or does not perform * idling depending on which choice boosts the throughput more. The * exact choice depends on the device and request pattern at * hand. * * Unfortunately, false positives may occur while an interactive task * is starting (e.g., an application is being started). The * consequence is that the queues associated with the task do not * enjoy weight raising as expected. Fortunately these false positives * are very rare. They typically occur if some service happens to * start doing I/O exactly when the interactive task starts. * * Turning back to the next function, it implements all the steps * needed to detect the occurrence of a large burst and to properly * mark all the queues belonging to it (so that they can then be * treated in a different way). This goal is achieved by maintaining a * "burst list" that holds, temporarily, the queues that belong to the * burst in progress. The list is then used to mark these queues as * belonging to a large burst if the burst does become large. The main * steps are the following. * * . when the very first queue is created, the queue is inserted into the * list (as it could be the first queue in a possible burst) * * . if the current burst has not yet become large, and a queue Q that does * not yet belong to the burst is activated shortly after the last time * at which a new queue entered the burst list, then the function appends * Q to the burst list * * . if, as a consequence of the previous step, the burst size reaches * the large-burst threshold, then * * . all the queues in the burst list are marked as belonging to a * large burst * * . the burst list is deleted; in fact, the burst list already served * its purpose (keeping temporarily track of the queues in a burst, * so as to be able to mark them as belonging to a large burst in the * previous sub-step), and now is not needed any more * * . the device enters a large-burst mode * * . if a queue Q that does not belong to the burst is created while * the device is in large-burst mode and shortly after the last time * at which a queue either entered the burst list or was marked as * belonging to the current large burst, then Q is immediately marked * as belonging to a large burst. * * . if a queue Q that does not belong to the burst is created a while * later, i.e., not shortly after, than the last time at which a queue * either entered the burst list or was marked as belonging to the * current large burst, then the current burst is deemed as finished and: * * . the large-burst mode is reset if set * * . the burst list is emptied * * . Q is inserted in the burst list, as Q may be the first queue * in a possible new burst (then the burst list contains just Q * after this step). */
static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* * If bfqq is already in the burst list or is part of a large * burst, or finally has just been split, then there is * nothing else to do. */ if (!hlist_unhashed(&bfqq->burst_list_node) || bfq_bfqq_in_large_burst(bfqq) || time_is_after_eq_jiffies(bfqq->split_time + msecs_to_jiffies(10))) return; /* * If bfqq's creation happens late enough, or bfqq belongs to * a different group than the burst group, then the current * burst is finished, and related data structures must be * reset. * * In this respect, consider the special case where bfqq is * the very first queue created after BFQ is selected for this * device. In this case, last_ins_in_burst and * burst_parent_entity are not yet significant when we get * here. But it is easy to verify that, whether or not the * following condition is true, bfqq will end up being * inserted into the burst list. In particular the list will * happen to contain only bfqq. And this is exactly what has * to happen, as bfqq may be the first queue of the first * burst. */ if (time_is_before_jiffies(bfqd->last_ins_in_burst + bfqd->bfq_burst_interval) || bfqq->entity.parent != bfqd->burst_parent_entity) { bfqd->large_burst = false; bfq_reset_burst_list(bfqd, bfqq); goto end; } /* * If we get here, then bfqq is being activated shortly after the * last queue. So, if the current burst is also large, we can mark * bfqq as belonging to this large burst immediately. */ if (bfqd->large_burst) { bfq_mark_bfqq_in_large_burst(bfqq); goto end; } /* * If we get here, then a large-burst state has not yet been * reached, but bfqq is being activated shortly after the last * queue. Then we add bfqq to the burst. */ bfq_add_to_burst(bfqd, bfqq); end: /* * At this point, bfqq either has been added to the current * burst or has caused the current burst to terminate and a * possible new burst to start. In particular, in the second * case, bfqq has become the first queue in the possible new * burst. In both cases last_ins_in_burst needs to be moved * forward. */ bfqd->last_ins_in_burst = jiffies; }


Arianna Avanzini10989.34%150.00%
Paolo Valente1310.66%150.00%

static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; return entity->budget - entity->service; }


Arianna Avanzini30100.00%1100.00%

/* * If enough samples have been computed, return the current max budget * stored in bfqd, which is dynamically updated according to the * estimated disk peak rate; otherwise return the default max budget */
static int bfq_max_budget(struct bfq_data *bfqd) { if (bfqd->budgets_assigned < bfq_stats_min_budgets) return bfq_default_max_budget; else return bfqd->bfq_max_budget; }


Arianna Avanzini28100.00%1100.00%

/* * Return min budget, which is a fraction of the current or default * max budget (trying with 1/32) */
static int bfq_min_budget(struct bfq_data *bfqd) { if (bfqd->budgets_assigned < bfq_stats_min_budgets) return bfq_default_max_budget / 32; else return bfqd->bfq_max_budget / 32; }


Arianna Avanzini32100.00%1100.00%

/* * The next function, invoked after the input queue bfqq switches from * idle to busy, updates the budget of bfqq. The function also tells * whether the in-service queue should be expired, by returning * true. The purpose of expiring the in-service queue is to give bfqq * the chance to possibly preempt the in-service queue, and the reason * for preempting the in-service queue is to achieve one of the two * goals below. * * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has * expired because it has remained idle. In particular, bfqq may have * expired for one of the following two reasons: * * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling * and did not make it to issue a new request before its last * request was served; * * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue * a new request before the expiration of the idling-time. * * Even if bfqq has expired for one of the above reasons, the process * associated with the queue may be however issuing requests greedily, * and thus be sensitive to the bandwidth it receives (bfqq may have * remained idle for other reasons: CPU high load, bfqq not enjoying * idling, I/O throttling somewhere in the path from the process to * the I/O scheduler, ...). But if, after every expiration for one of * the above two reasons, bfqq has to wait for the service of at least * one full budget of another queue before being served again, then * bfqq is likely to get a much lower bandwidth or resource time than * its reserved ones. To address this issue, two countermeasures need * to be taken. * * First, the budget and the timestamps of bfqq need to be updated in * a special way on bfqq reactivation: they need to be updated as if * bfqq did not remain idle and did not expire. In fact, if they are * computed as if bfqq expired and remained idle until reactivation, * then the process associated with bfqq is treated as if, instead of * being greedy, it stopped issuing requests when bfqq remained idle, * and restarts issuing requests only on this reactivation. In other * words, the scheduler does not help the process recover the "service * hole" between bfqq expiration and reactivation. As a consequence, * the process receives a lower bandwidth than its reserved one. In * contrast, to recover this hole, the budget must be updated as if * bfqq was not expired at all before this reactivation, i.e., it must * be set to the value of the remaining budget when bfqq was * expired. Along the same line, timestamps need to be assigned the * value they had the last time bfqq was selected for service, i.e., * before last expiration. Thus timestamps need to be back-shifted * with respect to their normal computation (see [1] for more details * on this tricky aspect). * * Secondly, to allow the process to recover the hole, the in-service * queue must be expired too, to give bfqq the chance to preempt it * immediately. In fact, if bfqq has to wait for a full budget of the * in-service queue to be completed, then it may become impossible to * let the process recover the hole, even if the back-shifted * timestamps of bfqq are lower than those of the in-service queue. If * this happens for most or all of the holes, then the process may not * receive its reserved bandwidth. In this respect, it is worth noting * that, being the service of outstanding requests unpreemptible, a * little fraction of the holes may however be unrecoverable, thereby * causing a little loss of bandwidth. * * The last important point is detecting whether bfqq does need this * bandwidth recovery. In this respect, the next function deems the * process associated with bfqq greedy, and thus allows it to recover * the hole, if: 1) the process is waiting for the arrival of a new * request (which implies that bfqq expired for one of the above two * reasons), and 2) such a request has arrived soon. The first * condition is controlled through the flag non_blocking_wait_rq, * while the second through the flag arrived_in_time. If both * conditions hold, then the function computes the budget in the * above-described special way, and signals that the in-service queue * should be expired. Timestamp back-shifting is done later in * __bfq_activate_entity. * * 2. Reduce latency. Even if timestamps are not backshifted to let * the process associated with bfqq recover a service hole, bfqq may * however happen to have, after being (re)activated, a lower finish * timestamp than the in-service queue. That is, the next budget of * bfqq may have to be completed before the one of the in-service * queue. If this is the case, then preempting the in-service queue * allows this goal to be achieved, apart from the unpreemptible, * outstanding requests mentioned above. * * Unfortunately, regardless of which of the above two goals one wants * to achieve, service trees need first to be updated to know whether * the in-service queue must be preempted. To have service trees * correctly updated, the in-service queue must be expired and * rescheduled, and bfqq must be scheduled too. This is one of the * most costly operations (in future versions, the scheduling * mechanism may be re-designed in such a way to make it possible to * know whether preemption is needed without needing to update service * trees). In addition, queue preemptions almost always cause random * I/O, and thus loss of throughput. Because of these facts, the next * function adopts the following simple scheme to avoid both costly * operations and too frequent preemptions: it requests the expiration * of the in-service queue (unconditionally) only for queues that need * to recover a hole, or that either are weight-raised or deserve to * be weight-raised. */
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool arrived_in_time, bool wr_or_deserves_wr) { struct bfq_entity *entity = &bfqq->entity; if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { /* * We do not clear the flag non_blocking_wait_rq here, as * the latter is used in bfq_activate_bfqq to signal * that timestamps need to be back-shifted (and is * cleared right after). */ /* * In next assignment we rely on that either * entity->service or entity->budget are not updated * on expiration if bfqq is empty (see * __bfq_bfqq_recalc_budget). Thus both quantities * remain unchanged after such an expiration, and the * following statement therefore assigns to * entity->budget the remaining budget on such an * expiration. For clarity, entity->service is not * updated on expiration in any case, and, in normal * operation, is reset only when bfqq is selected for * service (see bfq_get_next_queue). */ entity->budget = min_t(unsigned long, bfq_bfqq_budget_left(bfqq), bfqq->max_budget); return true; } entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)