72ef799b3f
Before merging a bio into an existing request, io scheduler is called to get its approval first. However, the requests that come from a plug flush may get merged by block layer without consulting with io scheduler. In case of CFQ, this can cause fairness problems. For instance, if a request gets merged into a low weight cgroup's request, high weight cgroup now will depend on low weight cgroup to get scheduled. If high weigt cgroup needs that io request to complete before submitting more requests, then it will also lose its timeslice. Following script demonstrates the problem. Group g1 has a low weight, g2 and g3 have equal high weights but g2's requests are adjacent to g1's requests so they are subject to merging. Due to these merges, g2 gets poor disk time allocation. cat > cfq-merge-repro.sh << "EOF" #!/bin/bash set -e IO_ROOT=/mnt-cgroup/io mkdir -p $IO_ROOT if ! mount | grep -qw $IO_ROOT; then mount -t cgroup none -oblkio $IO_ROOT fi cd $IO_ROOT for i in g1 g2 g3; do if [ -d $i ]; then rmdir $i fi done mkdir g1 && echo 10 > g1/blkio.weight mkdir g2 && echo 495 > g2/blkio.weight mkdir g3 && echo 495 > g3/blkio.weight RUNTIME=10 (echo $BASHPID > g1/cgroup.procs && fio --readonly --name name1 --filename /dev/sdb \ --rw read --size 64k --bs 64k --time_based \ --runtime=$RUNTIME --offset=0k &> /dev/null)& (echo $BASHPID > g2/cgroup.procs && fio --readonly --name name1 --filename /dev/sdb \ --rw read --size 64k --bs 64k --time_based \ --runtime=$RUNTIME --offset=64k &> /dev/null)& (echo $BASHPID > g3/cgroup.procs && fio --readonly --name name1 --filename /dev/sdb \ --rw read --size 64k --bs 64k --time_based \ --runtime=$RUNTIME --offset=256k &> /dev/null)& sleep $((RUNTIME+1)) for i in g1 g2 g3; do echo ---- $i ---- cat $i/blkio.time done EOF # ./cfq-merge-repro.sh ---- g1 ---- 8:16 162 ---- g2 ---- 8:16 165 ---- g3 ---- 8:16 686 After applying the patch: # ./cfq-merge-repro.sh ---- g1 ---- 8:16 90 ---- g2 ---- 8:16 445 ---- g3 ---- 8:16 471 Signed-off-by: Tahsin Erdogan <tahsin@google.com> Signed-off-by: Jens Axboe <axboe@fb.com>
475 lines
11 KiB
C
475 lines
11 KiB
C
/*
|
|
* Deadline i/o scheduler.
|
|
*
|
|
* Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/elevator.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/module.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/init.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/rbtree.h>
|
|
|
|
/*
|
|
* See Documentation/block/deadline-iosched.txt
|
|
*/
|
|
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
|
|
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
|
|
static const int writes_starved = 2; /* max times reads can starve a write */
|
|
static const int fifo_batch = 16; /* # of sequential requests treated as one
|
|
by the above parameters. For throughput. */
|
|
|
|
struct deadline_data {
|
|
/*
|
|
* run time data
|
|
*/
|
|
|
|
/*
|
|
* requests (deadline_rq s) are present on both sort_list and fifo_list
|
|
*/
|
|
struct rb_root sort_list[2];
|
|
struct list_head fifo_list[2];
|
|
|
|
/*
|
|
* next in sort order. read, write or both are NULL
|
|
*/
|
|
struct request *next_rq[2];
|
|
unsigned int batching; /* number of sequential requests made */
|
|
unsigned int starved; /* times reads have starved writes */
|
|
|
|
/*
|
|
* settings that change how the i/o scheduler behaves
|
|
*/
|
|
int fifo_expire[2];
|
|
int fifo_batch;
|
|
int writes_starved;
|
|
int front_merges;
|
|
};
|
|
|
|
static void deadline_move_request(struct deadline_data *, struct request *);
|
|
|
|
static inline struct rb_root *
|
|
deadline_rb_root(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
return &dd->sort_list[rq_data_dir(rq)];
|
|
}
|
|
|
|
/*
|
|
* get the request after `rq' in sector-sorted order
|
|
*/
|
|
static inline struct request *
|
|
deadline_latter_request(struct request *rq)
|
|
{
|
|
struct rb_node *node = rb_next(&rq->rb_node);
|
|
|
|
if (node)
|
|
return rb_entry_rq(node);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static void
|
|
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
struct rb_root *root = deadline_rb_root(dd, rq);
|
|
|
|
elv_rb_add(root, rq);
|
|
}
|
|
|
|
static inline void
|
|
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
const int data_dir = rq_data_dir(rq);
|
|
|
|
if (dd->next_rq[data_dir] == rq)
|
|
dd->next_rq[data_dir] = deadline_latter_request(rq);
|
|
|
|
elv_rb_del(deadline_rb_root(dd, rq), rq);
|
|
}
|
|
|
|
/*
|
|
* add rq to rbtree and fifo
|
|
*/
|
|
static void
|
|
deadline_add_request(struct request_queue *q, struct request *rq)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
const int data_dir = rq_data_dir(rq);
|
|
|
|
deadline_add_rq_rb(dd, rq);
|
|
|
|
/*
|
|
* set expire time and add to fifo list
|
|
*/
|
|
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
|
|
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
|
|
}
|
|
|
|
/*
|
|
* remove rq from rbtree and fifo.
|
|
*/
|
|
static void deadline_remove_request(struct request_queue *q, struct request *rq)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
|
|
rq_fifo_clear(rq);
|
|
deadline_del_rq_rb(dd, rq);
|
|
}
|
|
|
|
static int
|
|
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
struct request *__rq;
|
|
int ret;
|
|
|
|
/*
|
|
* check for front merge
|
|
*/
|
|
if (dd->front_merges) {
|
|
sector_t sector = bio_end_sector(bio);
|
|
|
|
__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
|
|
if (__rq) {
|
|
BUG_ON(sector != blk_rq_pos(__rq));
|
|
|
|
if (elv_bio_merge_ok(__rq, bio)) {
|
|
ret = ELEVATOR_FRONT_MERGE;
|
|
goto out;
|
|
}
|
|
}
|
|
}
|
|
|
|
return ELEVATOR_NO_MERGE;
|
|
out:
|
|
*req = __rq;
|
|
return ret;
|
|
}
|
|
|
|
static void deadline_merged_request(struct request_queue *q,
|
|
struct request *req, int type)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
|
|
/*
|
|
* if the merge was a front merge, we need to reposition request
|
|
*/
|
|
if (type == ELEVATOR_FRONT_MERGE) {
|
|
elv_rb_del(deadline_rb_root(dd, req), req);
|
|
deadline_add_rq_rb(dd, req);
|
|
}
|
|
}
|
|
|
|
static void
|
|
deadline_merged_requests(struct request_queue *q, struct request *req,
|
|
struct request *next)
|
|
{
|
|
/*
|
|
* if next expires before rq, assign its expire time to rq
|
|
* and move into next position (next will be deleted) in fifo
|
|
*/
|
|
if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
|
|
if (time_before((unsigned long)next->fifo_time,
|
|
(unsigned long)req->fifo_time)) {
|
|
list_move(&req->queuelist, &next->queuelist);
|
|
req->fifo_time = next->fifo_time;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* kill knowledge of next, this one is a goner
|
|
*/
|
|
deadline_remove_request(q, next);
|
|
}
|
|
|
|
/*
|
|
* move request from sort list to dispatch queue.
|
|
*/
|
|
static inline void
|
|
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
struct request_queue *q = rq->q;
|
|
|
|
deadline_remove_request(q, rq);
|
|
elv_dispatch_add_tail(q, rq);
|
|
}
|
|
|
|
/*
|
|
* move an entry to dispatch queue
|
|
*/
|
|
static void
|
|
deadline_move_request(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
const int data_dir = rq_data_dir(rq);
|
|
|
|
dd->next_rq[READ] = NULL;
|
|
dd->next_rq[WRITE] = NULL;
|
|
dd->next_rq[data_dir] = deadline_latter_request(rq);
|
|
|
|
/*
|
|
* take it off the sort and fifo list, move
|
|
* to dispatch queue
|
|
*/
|
|
deadline_move_to_dispatch(dd, rq);
|
|
}
|
|
|
|
/*
|
|
* deadline_check_fifo returns 0 if there are no expired requests on the fifo,
|
|
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
|
|
*/
|
|
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
|
|
{
|
|
struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
|
|
|
|
/*
|
|
* rq is expired!
|
|
*/
|
|
if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* deadline_dispatch_requests selects the best request according to
|
|
* read/write expire, fifo_batch, etc
|
|
*/
|
|
static int deadline_dispatch_requests(struct request_queue *q, int force)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
const int reads = !list_empty(&dd->fifo_list[READ]);
|
|
const int writes = !list_empty(&dd->fifo_list[WRITE]);
|
|
struct request *rq;
|
|
int data_dir;
|
|
|
|
/*
|
|
* batches are currently reads XOR writes
|
|
*/
|
|
if (dd->next_rq[WRITE])
|
|
rq = dd->next_rq[WRITE];
|
|
else
|
|
rq = dd->next_rq[READ];
|
|
|
|
if (rq && dd->batching < dd->fifo_batch)
|
|
/* we have a next request are still entitled to batch */
|
|
goto dispatch_request;
|
|
|
|
/*
|
|
* at this point we are not running a batch. select the appropriate
|
|
* data direction (read / write)
|
|
*/
|
|
|
|
if (reads) {
|
|
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
|
|
|
|
if (writes && (dd->starved++ >= dd->writes_starved))
|
|
goto dispatch_writes;
|
|
|
|
data_dir = READ;
|
|
|
|
goto dispatch_find_request;
|
|
}
|
|
|
|
/*
|
|
* there are either no reads or writes have been starved
|
|
*/
|
|
|
|
if (writes) {
|
|
dispatch_writes:
|
|
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
|
|
|
|
dd->starved = 0;
|
|
|
|
data_dir = WRITE;
|
|
|
|
goto dispatch_find_request;
|
|
}
|
|
|
|
return 0;
|
|
|
|
dispatch_find_request:
|
|
/*
|
|
* we are not running a batch, find best request for selected data_dir
|
|
*/
|
|
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
|
|
/*
|
|
* A deadline has expired, the last request was in the other
|
|
* direction, or we have run out of higher-sectored requests.
|
|
* Start again from the request with the earliest expiry time.
|
|
*/
|
|
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
|
|
} else {
|
|
/*
|
|
* The last req was the same dir and we have a next request in
|
|
* sort order. No expired requests so continue on from here.
|
|
*/
|
|
rq = dd->next_rq[data_dir];
|
|
}
|
|
|
|
dd->batching = 0;
|
|
|
|
dispatch_request:
|
|
/*
|
|
* rq is the selected appropriate request.
|
|
*/
|
|
dd->batching++;
|
|
deadline_move_request(dd, rq);
|
|
|
|
return 1;
|
|
}
|
|
|
|
static void deadline_exit_queue(struct elevator_queue *e)
|
|
{
|
|
struct deadline_data *dd = e->elevator_data;
|
|
|
|
BUG_ON(!list_empty(&dd->fifo_list[READ]));
|
|
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
|
|
|
|
kfree(dd);
|
|
}
|
|
|
|
/*
|
|
* initialize elevator private data (deadline_data).
|
|
*/
|
|
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
|
|
{
|
|
struct deadline_data *dd;
|
|
struct elevator_queue *eq;
|
|
|
|
eq = elevator_alloc(q, e);
|
|
if (!eq)
|
|
return -ENOMEM;
|
|
|
|
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
|
|
if (!dd) {
|
|
kobject_put(&eq->kobj);
|
|
return -ENOMEM;
|
|
}
|
|
eq->elevator_data = dd;
|
|
|
|
INIT_LIST_HEAD(&dd->fifo_list[READ]);
|
|
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
|
|
dd->sort_list[READ] = RB_ROOT;
|
|
dd->sort_list[WRITE] = RB_ROOT;
|
|
dd->fifo_expire[READ] = read_expire;
|
|
dd->fifo_expire[WRITE] = write_expire;
|
|
dd->writes_starved = writes_starved;
|
|
dd->front_merges = 1;
|
|
dd->fifo_batch = fifo_batch;
|
|
|
|
spin_lock_irq(q->queue_lock);
|
|
q->elevator = eq;
|
|
spin_unlock_irq(q->queue_lock);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* sysfs parts below
|
|
*/
|
|
|
|
static ssize_t
|
|
deadline_var_show(int var, char *page)
|
|
{
|
|
return sprintf(page, "%d\n", var);
|
|
}
|
|
|
|
static ssize_t
|
|
deadline_var_store(int *var, const char *page, size_t count)
|
|
{
|
|
char *p = (char *) page;
|
|
|
|
*var = simple_strtol(p, &p, 10);
|
|
return count;
|
|
}
|
|
|
|
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
|
|
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
|
|
{ \
|
|
struct deadline_data *dd = e->elevator_data; \
|
|
int __data = __VAR; \
|
|
if (__CONV) \
|
|
__data = jiffies_to_msecs(__data); \
|
|
return deadline_var_show(__data, (page)); \
|
|
}
|
|
SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
|
|
SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
|
|
SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
|
|
SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
|
|
SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
|
|
#undef SHOW_FUNCTION
|
|
|
|
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
|
|
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
|
|
{ \
|
|
struct deadline_data *dd = e->elevator_data; \
|
|
int __data; \
|
|
int ret = deadline_var_store(&__data, (page), count); \
|
|
if (__data < (MIN)) \
|
|
__data = (MIN); \
|
|
else if (__data > (MAX)) \
|
|
__data = (MAX); \
|
|
if (__CONV) \
|
|
*(__PTR) = msecs_to_jiffies(__data); \
|
|
else \
|
|
*(__PTR) = __data; \
|
|
return ret; \
|
|
}
|
|
STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
|
|
STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
|
|
STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
|
|
STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
|
|
STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
|
|
#undef STORE_FUNCTION
|
|
|
|
#define DD_ATTR(name) \
|
|
__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
|
|
deadline_##name##_store)
|
|
|
|
static struct elv_fs_entry deadline_attrs[] = {
|
|
DD_ATTR(read_expire),
|
|
DD_ATTR(write_expire),
|
|
DD_ATTR(writes_starved),
|
|
DD_ATTR(front_merges),
|
|
DD_ATTR(fifo_batch),
|
|
__ATTR_NULL
|
|
};
|
|
|
|
static struct elevator_type iosched_deadline = {
|
|
.ops = {
|
|
.elevator_merge_fn = deadline_merge,
|
|
.elevator_merged_fn = deadline_merged_request,
|
|
.elevator_merge_req_fn = deadline_merged_requests,
|
|
.elevator_dispatch_fn = deadline_dispatch_requests,
|
|
.elevator_add_req_fn = deadline_add_request,
|
|
.elevator_former_req_fn = elv_rb_former_request,
|
|
.elevator_latter_req_fn = elv_rb_latter_request,
|
|
.elevator_init_fn = deadline_init_queue,
|
|
.elevator_exit_fn = deadline_exit_queue,
|
|
},
|
|
|
|
.elevator_attrs = deadline_attrs,
|
|
.elevator_name = "deadline",
|
|
.elevator_owner = THIS_MODULE,
|
|
};
|
|
|
|
static int __init deadline_init(void)
|
|
{
|
|
return elv_register(&iosched_deadline);
|
|
}
|
|
|
|
static void __exit deadline_exit(void)
|
|
{
|
|
elv_unregister(&iosched_deadline);
|
|
}
|
|
|
|
module_init(deadline_init);
|
|
module_exit(deadline_exit);
|
|
|
|
MODULE_AUTHOR("Jens Axboe");
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_DESCRIPTION("deadline IO scheduler");
|