Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps

 

 

Linux kernel RSS feed   Index | Next | Previous | View Threaded


vgoyal at redhat

Nov 3, 2009, 3:43 PM

Post #1 of 18 (465 views)
Permalink
[PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps

o Currently CFQ provides priority scaled time slices to processes. If a process
does not use the time slice, either because process did not have sufficient
IO to do or because think time of process is large and CFQ decided to disable
idling, then processes looses it time slice share.

o This works well in flat setup where fair share of a process can be achieved
in one go (by scaled time slices), and CFQ does not have to time stamp the
queue. But once IO groups are introduced, it does not work very well.
Consider following case.

root
/ \
G1 G2
| |
T1 T2

Here G1 and G2 are two groups of weights 100 each and T1 and T2 are two
tasks of prio 0 and 4 each. Now both groups should get 50% of disk time.
CFQ assigns slice length of 180ms to T1 (prio 0) and slice length of 100ms
to T2 (prio4). Now plain round robin of scaled slices does not work at
group level.

o One possible way to handle this is implement CFS like time stamping of the
cfq queues and keep track of vtime. Next queue for execution will be selected
based on the one who got lowest vtime. This patch implemented time stamping
mechanism of cfq queues based on disk time used.

o min_vdisktime represents the minimum vdisktime of the queue, either being
serviced or leftmost element on the serviec tree.

o Previously CFQ had one service tree where queues of all theree prio classes
were being queued. One side affect of this time stamping approach is that
now single tree approach might not work and we need to keep separate service
trees for three prio classes.

o Some parts of code of this patch are taken from CFS and BFQ.

Signed-off-by: Vivek Goyal <vgoyal [at] redhat>
---
block/cfq-iosched.c | 480 +++++++++++++++++++++++++++++++++++----------------
1 files changed, 335 insertions(+), 145 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 069a610..58ac8b7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -28,6 +28,8 @@ static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;

+#define IO_IOPRIO_CLASSES 3
+
/*
* offset from end of service tree
*/
@@ -64,11 +66,17 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
* move this into the elevator for the rq sorting as well.
*/
-struct cfq_rb_root {
+struct cfq_service_tree {
struct rb_root rb;
struct rb_node *left;
+ u64 min_vdisktime;
+ struct cfq_queue *active;
+};
+#define CFQ_RB_ROOT (struct cfq_service_tree) { RB_ROOT, NULL, 0, NULL}
+
+struct cfq_sched_data {
+ struct cfq_service_tree service_tree[IO_IOPRIO_CLASSES];
};
-#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, }

/*
* Per process-grouping structure
@@ -83,7 +91,9 @@ struct cfq_queue {
/* service_tree member */
struct rb_node rb_node;
/* service_tree key */
- unsigned long rb_key;
+ u64 vdisktime;
+ /* service tree we belong to */
+ struct cfq_service_tree *st;
/* prio tree member */
struct rb_node p_node;
/* prio tree root we belong to, if any */
@@ -99,8 +109,9 @@ struct cfq_queue {
/* fifo list of requests in sort_list */
struct list_head fifo;

+ /* time when first request from queue completed and slice started. */
+ unsigned long slice_start;
unsigned long slice_end;
- long slice_resid;
unsigned int slice_dispatch;

/* pending metadata requests */
@@ -111,6 +122,7 @@ struct cfq_queue {
/* io prio of this group */
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
+ bool ioprio_class_changed;

pid_t pid;
};
@@ -124,7 +136,7 @@ struct cfq_data {
/*
* rr list of queues with requests and the count of them
*/
- struct cfq_rb_root service_tree;
+ struct cfq_sched_data sched_data;

/*
* Each priority tree is sorted by next_request position. These
@@ -234,6 +246,67 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
struct io_context *, gfp_t);
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
struct io_context *);
+static void cfq_put_queue(struct cfq_queue *cfqq);
+static struct cfq_queue *__cfq_get_next_queue(struct cfq_service_tree *st);
+
+static inline void
+init_cfqq_service_tree(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ unsigned short idx = cfqq->ioprio_class - 1;
+
+ BUG_ON(idx >= IO_IOPRIO_CLASSES);
+
+ cfqq->st = &cfqd->sched_data.service_tree[idx];
+}
+
+static inline s64
+cfqq_key(struct cfq_service_tree *st, struct cfq_queue *cfqq)
+{
+ return cfqq->vdisktime - st->min_vdisktime;
+}
+
+static inline u64
+cfq_delta_fair(unsigned long delta, struct cfq_queue *cfqq)
+{
+ const int base_slice = cfqq->cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
+
+ return delta + (base_slice/CFQ_SLICE_SCALE * (cfqq->ioprio - 4));
+}
+
+static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
+{
+ s64 delta = (s64)(vdisktime - min_vdisktime);
+ if (delta > 0)
+ min_vdisktime = vdisktime;
+
+ return min_vdisktime;
+}
+
+static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
+{
+ s64 delta = (s64)(vdisktime - min_vdisktime);
+ if (delta < 0)
+ min_vdisktime = vdisktime;
+
+ return min_vdisktime;
+}
+
+static void update_min_vdisktime(struct cfq_service_tree *st)
+{
+ u64 vdisktime = st->min_vdisktime;
+
+ if (st->active)
+ vdisktime = st->active->vdisktime;
+
+ if (st->left) {
+ struct cfq_queue *cfqq = rb_entry(st->left, struct cfq_queue,
+ rb_node);
+
+ vdisktime = min_vdisktime(vdisktime, cfqq->vdisktime);
+ }
+
+ st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
+}

static inline int rq_in_driver(struct cfq_data *cfqd)
{
@@ -277,7 +350,7 @@ static int cfq_queue_empty(struct request_queue *q)
{
struct cfq_data *cfqd = q->elevator->elevator_data;

- return !cfqd->busy_queues;
+ return !cfqd->rq_queued;
}

/*
@@ -304,6 +377,7 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
+ cfqq->slice_start = jiffies;
cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}
@@ -419,33 +493,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
}

/*
- * The below is leftmost cache rbtree addon
- */
-static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
-{
- if (!root->left)
- root->left = rb_first(&root->rb);
-
- if (root->left)
- return rb_entry(root->left, struct cfq_queue, rb_node);
-
- return NULL;
-}
-
-static void rb_erase_init(struct rb_node *n, struct rb_root *root)
-{
- rb_erase(n, root);
- RB_CLEAR_NODE(n);
-}
-
-static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
-{
- if (root->left == n)
- root->left = NULL;
- rb_erase_init(n, &root->rb);
-}
-
-/*
* would be nice to take fifo expire time into account as well
*/
static struct request *
@@ -472,102 +519,192 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
return cfq_choose_req(cfqd, next, prev);
}

-static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
- struct cfq_queue *cfqq)
-{
- /*
- * just an approximation, should be ok.
- */
- return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
- cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
-}
-
-/*
- * The cfqd->service_tree holds all pending cfq_queue's that have
- * requests waiting to be processed. It is sorted in the order that
- * we will service the queues.
- */
-static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
- bool add_front)
+static void
+place_cfqq(struct cfq_service_tree *st, struct cfq_queue *cfqq, int add_front)
{
- struct rb_node **p, *parent;
+ u64 vdisktime = st->min_vdisktime;
+ struct rb_node *parent;
struct cfq_queue *__cfqq;
- unsigned long rb_key;
- int left;

if (cfq_class_idle(cfqq)) {
- rb_key = CFQ_IDLE_DELAY;
- parent = rb_last(&cfqd->service_tree.rb);
+ vdisktime = CFQ_IDLE_DELAY;
+ parent = rb_last(&st->rb);
if (parent && parent != &cfqq->rb_node) {
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
- rb_key += __cfqq->rb_key;
+ vdisktime += __cfqq->vdisktime;
} else
- rb_key += jiffies;
+ vdisktime += st->min_vdisktime;
} else if (!add_front) {
- /*
- * Get our rb key offset. Subtract any residual slice
- * value carried from last service. A negative resid
- * count indicates slice overrun, and this should position
- * the next service time further away in the tree.
- */
- rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
- rb_key -= cfqq->slice_resid;
- cfqq->slice_resid = 0;
- } else {
- rb_key = -HZ;
- __cfqq = cfq_rb_first(&cfqd->service_tree);
- rb_key += __cfqq ? __cfqq->rb_key : jiffies;
+ parent = rb_last(&st->rb);
+ if (parent && parent != &cfqq->rb_node) {
+ __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+ vdisktime = __cfqq->vdisktime;
+ }
}

- if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+ cfqq->vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
+}
+
+static inline void cfqq_update_ioprio_class(struct cfq_queue *cfqq)
+{
+ if (unlikely(cfqq->ioprio_class_changed)) {
+ struct cfq_data *cfqd = cfqq->cfqd;
+
/*
- * same position, nothing more to do
+ * Re-initialize the service tree pointer as ioprio class
+ * change will lead to service tree change.
*/
- if (rb_key == cfqq->rb_key)
- return;
-
- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+ init_cfqq_service_tree(cfqd, cfqq);
+ cfqq->ioprio_class_changed = 0;
+ cfqq->vdisktime = 0;
}
+}

- left = 1;
- parent = NULL;
- p = &cfqd->service_tree.rb.rb_node;
- while (*p) {
- struct rb_node **n;
+static void __dequeue_cfqq(struct cfq_service_tree *st, struct cfq_queue *cfqq)
+{
+ /* Node is not on tree */
+ if (RB_EMPTY_NODE(&cfqq->rb_node))
+ return;

- parent = *p;
+ if (st->left == &cfqq->rb_node)
+ st->left = rb_next(&cfqq->rb_node);
+
+ rb_erase(&cfqq->rb_node, &st->rb);
+ RB_CLEAR_NODE(&cfqq->rb_node);
+}
+
+static void dequeue_cfqq(struct cfq_queue *cfqq)
+{
+ struct cfq_service_tree *st = cfqq->st;
+
+ if (st->active == cfqq)
+ st->active = NULL;
+
+ __dequeue_cfqq(st, cfqq);
+}
+
+static void __enqueue_cfqq(struct cfq_service_tree *st, struct cfq_queue *cfqq,
+ int add_front)
+{
+ struct rb_node **node = &st->rb.rb_node;
+ struct rb_node *parent = NULL;
+ struct cfq_queue *__cfqq;
+ s64 key = cfqq_key(st, cfqq);
+ int leftmost = 1;
+
+ while (*node != NULL) {
+ parent = *node;
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

- /*
- * sort RT queues first, we always want to give
- * preference to them. IDLE queues goes to the back.
- * after that, sort on the next service time.
- */
- if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
- n = &(*p)->rb_left;
- else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
- n = &(*p)->rb_right;
- else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
- n = &(*p)->rb_left;
- else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
- n = &(*p)->rb_right;
- else if (time_before(rb_key, __cfqq->rb_key))
- n = &(*p)->rb_left;
- else
- n = &(*p)->rb_right;
+ if (key < cfqq_key(st, __cfqq) ||
+ (add_front && (key == cfqq_key(st, __cfqq)))) {
+ node = &parent->rb_left;
+ } else {
+ node = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ /*
+ * Maintain a cache of leftmost tree entries (it is frequently
+ * used)
+ */
+ if (leftmost)
+ st->left = &cfqq->rb_node;

- if (n == &(*p)->rb_right)
- left = 0;
+ rb_link_node(&cfqq->rb_node, parent, node);
+ rb_insert_color(&cfqq->rb_node, &st->rb);
+}

- p = n;
+static void enqueue_cfqq(struct cfq_queue *cfqq)
+{
+ cfqq_update_ioprio_class(cfqq);
+ place_cfqq(cfqq->st, cfqq, 0);
+ __enqueue_cfqq(cfqq->st, cfqq, 0);
+}
+
+/* Requeue a cfqq which is already on the service tree */
+static void requeue_cfqq(struct cfq_queue *cfqq, int add_front)
+{
+ struct cfq_service_tree *st = cfqq->st;
+ struct cfq_queue *next_cfqq;
+
+ if (add_front) {
+ next_cfqq = __cfq_get_next_queue(st);
+ if (next_cfqq && next_cfqq == cfqq)
+ return;
+ }
+
+ __dequeue_cfqq(st, cfqq);
+ place_cfqq(st, cfqq, add_front);
+ __enqueue_cfqq(st, cfqq, add_front);
+}
+
+static void __cfqq_served(struct cfq_queue *cfqq, unsigned long served)
+{
+ /*
+ * Can't update entity disk time while it is on sorted rb-tree
+ * as vdisktime is used as key.
+ */
+ __dequeue_cfqq(cfqq->st, cfqq);
+ cfqq->vdisktime += cfq_delta_fair(served, cfqq);
+ update_min_vdisktime(cfqq->st);
+ __enqueue_cfqq(cfqq->st, cfqq, 0);
+}
+
+static void cfqq_served(struct cfq_queue *cfqq, unsigned long served)
+{
+ /*
+ * We don't want to charge more than allocated slice otherwise this
+ * queue can miss one dispatch round doubling max latencies. On the
+ * other hand we don't want to charge less than allocated slice as
+ * we stick to CFQ theme of queue loosing its share if it does not
+ * use the slice and moves to the back of service tree (almost).
+ */
+ served = cfq_prio_to_slice(cfqq->cfqd, cfqq);
+ __cfqq_served(cfqq, served);
+
+ /* If cfqq prio class has changed, take that into account */
+ if (unlikely(cfqq->ioprio_class_changed)) {
+ dequeue_cfqq(cfqq);
+ enqueue_cfqq(cfqq);
}
+}
+
+/*
+ * Handles three operations.
+ * Addition of a new queue to service tree, when a new request comes in.
+ * Resorting of an expiring queue (used after slice expired)
+ * Requeuing a queue at the front (used during preemption).
+ */
+static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ bool add_front, unsigned long service)
+{
+ if (RB_EMPTY_NODE(&cfqq->rb_node)) {
+ /* Its a new queue. Add it to service tree */
+ enqueue_cfqq(cfqq);
+ return;
+ }
+
+ if (service) {
+ /*
+ * This queue just got served. Compute the new key and requeue
+ * in the service tree
+ */
+ cfqq_served(cfqq, service);

- if (left)
- cfqd->service_tree.left = &cfqq->rb_node;
+ /*
+ * Requeue async ioq so that these will be again placed at the
+ * end of service tree giving a chance to sync queues.
+ * TODO: Handle this case in a better manner.
+ */
+ if (!cfq_cfqq_sync(cfqq))
+ requeue_cfqq(cfqq, 0);
+ return;
+ }

- cfqq->rb_key = rb_key;
- rb_link_node(&cfqq->rb_node, parent, p);
- rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
+ /* Just requeuing an existing queue, used during preemption */
+ requeue_cfqq(cfqq, add_front);
}

static struct cfq_queue *
@@ -634,13 +771,14 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/*
* Update cfqq's position in the service tree.
*/
-static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ unsigned long service)
{
/*
* Resorting requires the cfqq to be on the RR list already.
*/
if (cfq_cfqq_on_rr(cfqq)) {
- cfq_service_tree_add(cfqd, cfqq, 0);
+ cfq_service_tree_add(cfqd, cfqq, 0, service);
cfq_prio_tree_add(cfqd, cfqq);
}
}
@@ -656,7 +794,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfq_mark_cfqq_on_rr(cfqq);
cfqd->busy_queues++;

- cfq_resort_rr_list(cfqd, cfqq);
+ cfq_resort_rr_list(cfqd, cfqq, 0);
}

/*
@@ -669,8 +807,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(!cfq_cfqq_on_rr(cfqq));
cfq_clear_cfqq_on_rr(cfqq);

- if (!RB_EMPTY_NODE(&cfqq->rb_node))
- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+ dequeue_cfqq(cfqq);
if (cfqq->p_root) {
rb_erase(&cfqq->p_node, cfqq->p_root);
cfqq->p_root = NULL;
@@ -686,7 +823,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void cfq_del_rq_rb(struct request *rq)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
- struct cfq_data *cfqd = cfqq->cfqd;
const int sync = rq_is_sync(rq);

BUG_ON(!cfqq->queued[sync]);
@@ -694,8 +830,17 @@ static void cfq_del_rq_rb(struct request *rq)

elv_rb_del(&cfqq->sort_list, rq);

- if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
- cfq_del_cfqq_rr(cfqd, cfqq);
+ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
+ /*
+ * Queue will be deleted from service tree when we actually
+ * expire it later. Right now just remove it from prio tree
+ * as it is empty.
+ */
+ if (cfqq->p_root) {
+ rb_erase(&cfqq->p_node, cfqq->p_root);
+ cfqq->p_root = NULL;
+ }
+ }
}

static void cfq_add_rq_rb(struct request *rq)
@@ -869,6 +1014,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
{
if (cfqq) {
cfq_log_cfqq(cfqd, cfqq, "set_active");
+ cfqq->slice_start = 0;
cfqq->slice_end = 0;
cfqq->slice_dispatch = 0;

@@ -888,10 +1034,11 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
* current cfqq expired its slice (or was too idle), select new one
*/
static void
-__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
- bool timed_out)
+__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
+ long slice_used = 0;
+
+ cfq_log_cfqq(cfqd, cfqq, "slice expired");

if (cfq_cfqq_wait_request(cfqq))
del_timer(&cfqd->idle_slice_timer);
@@ -899,14 +1046,20 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfq_clear_cfqq_wait_request(cfqq);

/*
- * store what was left of this slice, if the queue idled/timed out
+ * Queue got expired before even a single request completed or
+ * got expired immediately after first request completion.
*/
- if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
- cfqq->slice_resid = cfqq->slice_end - jiffies;
- cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
- }
+ if (!cfqq->slice_end || cfqq->slice_start == jiffies)
+ slice_used = 1;
+ else
+ slice_used = jiffies - cfqq->slice_start;

- cfq_resort_rr_list(cfqd, cfqq);
+ cfq_log_cfqq(cfqd, cfqq, "sl_used=%ld", slice_used);
+
+ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
+ cfq_del_cfqq_rr(cfqd, cfqq);
+
+ cfq_resort_rr_list(cfqd, cfqq, slice_used);

if (cfqq == cfqd->active_queue)
cfqd->active_queue = NULL;
@@ -917,12 +1070,22 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
}

-static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
+static inline void cfq_slice_expired(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;

if (cfqq)
- __cfq_slice_expired(cfqd, cfqq, timed_out);
+ __cfq_slice_expired(cfqd, cfqq);
+}
+
+static struct cfq_queue *__cfq_get_next_queue(struct cfq_service_tree *st)
+{
+ struct rb_node *left = st->left;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct cfq_queue, rb_node);
}

/*
@@ -931,10 +1094,24 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
*/
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
- if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
+ struct cfq_sched_data *sd = &cfqd->sched_data;
+ struct cfq_service_tree *st = sd->service_tree;
+ struct cfq_queue *cfqq = NULL;
+ int i;
+
+ if (!cfqd->rq_queued)
return NULL;

- return cfq_rb_first(&cfqd->service_tree);
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) {
+ cfqq = __cfq_get_next_queue(st);
+ if (cfqq) {
+ st->active = cfqq;
+ update_min_vdisktime(cfqq->st);
+ break;
+ }
+ }
+
+ return cfqq;
}

/*
@@ -1181,6 +1358,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
if (!cfqq)
goto new_queue;

+ if (!cfqd->rq_queued)
+ return NULL;
+
/*
* The active queue has run out of time, expire it and select new.
*/
@@ -1216,7 +1396,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
}

expire:
- cfq_slice_expired(cfqd, 0);
+ cfq_slice_expired(cfqd);
new_queue:
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
@@ -1233,6 +1413,10 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
}

BUG_ON(!list_empty(&cfqq->fifo));
+
+ /* By default cfqq is not expired if it is empty. Do it explicitly */
+ __cfq_slice_expired(cfqq->cfqd, cfqq);
+
return dispatched;
}

@@ -1245,10 +1429,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
struct cfq_queue *cfqq;
int dispatched = 0;

- while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
+ while ((cfqq = cfq_get_next_queue(cfqd)) != NULL)
dispatched += __cfq_forced_dispatch_cfqq(cfqq);

- cfq_slice_expired(cfqd, 0);
+ cfq_slice_expired(cfqd);

BUG_ON(cfqd->busy_queues);

@@ -1391,7 +1575,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
cfq_class_idle(cfqq))) {
cfqq->slice_end = jiffies + 1;
- cfq_slice_expired(cfqd, 0);
+ cfq_slice_expired(cfqd);
}

cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
@@ -1416,13 +1600,14 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
cfq_log_cfqq(cfqd, cfqq, "put_queue");
BUG_ON(rb_first(&cfqq->sort_list));
BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
- BUG_ON(cfq_cfqq_on_rr(cfqq));

if (unlikely(cfqd->active_queue == cfqq)) {
- __cfq_slice_expired(cfqd, cfqq, 0);
+ __cfq_slice_expired(cfqd, cfqq);
cfq_schedule_dispatch(cfqd);
}

+ BUG_ON(cfq_cfqq_on_rr(cfqq));
+
kmem_cache_free(cfq_pool, cfqq);
}

@@ -1514,7 +1699,7 @@ static void cfq_free_io_context(struct io_context *ioc)
static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
if (unlikely(cfqq == cfqd->active_queue)) {
- __cfq_slice_expired(cfqd, cfqq, 0);
+ __cfq_slice_expired(cfqd, cfqq);
cfq_schedule_dispatch(cfqd);
}

@@ -1634,6 +1819,8 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
break;
}

+ if (cfqq->org_ioprio_class != cfqq->ioprio_class)
+ cfqq->ioprio_class_changed = 1;
/*
* keep track of original prio settings in case we have to temporarily
* elevate the priority of this queue
@@ -2079,7 +2266,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
cfq_log_cfqq(cfqd, cfqq, "preempt");
- cfq_slice_expired(cfqd, 1);
+ cfq_slice_expired(cfqd);

/*
* Put the new queue at the front of the of the current list,
@@ -2087,7 +2274,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
*/
BUG_ON(!cfq_cfqq_on_rr(cfqq));

- cfq_service_tree_add(cfqd, cfqq, 1);
+ cfq_service_tree_add(cfqd, cfqq, 1, 0);

cfqq->slice_end = 0;
cfq_mark_cfqq_slice_new(cfqq);
@@ -2229,7 +2416,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
* of idling.
*/
if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
- cfq_slice_expired(cfqd, 1);
+ cfq_slice_expired(cfqd);
else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
sync && !rq_noidle(rq))
cfq_arm_slice_timer(cfqd);
@@ -2250,16 +2437,20 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
* boost idle prio on transactions that would lock out other
* users of the filesystem
*/
- if (cfq_class_idle(cfqq))
+ if (cfq_class_idle(cfqq)) {
cfqq->ioprio_class = IOPRIO_CLASS_BE;
+ cfqq->ioprio_class_changed = 1;
+ }
if (cfqq->ioprio > IOPRIO_NORM)
cfqq->ioprio = IOPRIO_NORM;
} else {
/*
* check if we need to unboost the queue
*/
- if (cfqq->ioprio_class != cfqq->org_ioprio_class)
+ if (cfqq->ioprio_class != cfqq->org_ioprio_class) {
cfqq->ioprio_class = cfqq->org_ioprio_class;
+ cfqq->ioprio_class_changed = 1;
+ }
if (cfqq->ioprio != cfqq->org_ioprio)
cfqq->ioprio = cfqq->org_ioprio;
}
@@ -2391,7 +2582,6 @@ static void cfq_idle_slice_timer(unsigned long data)
struct cfq_data *cfqd = (struct cfq_data *) data;
struct cfq_queue *cfqq;
unsigned long flags;
- int timed_out = 1;

cfq_log(cfqd, "idle timer fired");

@@ -2399,7 +2589,6 @@ static void cfq_idle_slice_timer(unsigned long data)

cfqq = cfqd->active_queue;
if (cfqq) {
- timed_out = 0;

/*
* We saw a request before the queue expired, let it through
@@ -2427,7 +2616,7 @@ static void cfq_idle_slice_timer(unsigned long data)
goto out_kick;
}
expire:
- cfq_slice_expired(cfqd, timed_out);
+ cfq_slice_expired(cfqd);
out_kick:
cfq_schedule_dispatch(cfqd);
out_cont:
@@ -2465,7 +2654,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
spin_lock_irq(q->queue_lock);

if (cfqd->active_queue)
- __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
+ __cfq_slice_expired(cfqd, cfqd->active_queue);

while (!list_empty(&cfqd->cic_list)) {
struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
@@ -2493,7 +2682,8 @@ static void *cfq_init_queue(struct request_queue *q)
if (!cfqd)
return NULL;

- cfqd->service_tree = CFQ_RB_ROOT;
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+ cfqd->sched_data.service_tree[i] = CFQ_RB_ROOT;

/*
* Not strictly needed (since RB_ROOT just clears the node and we
--
1.6.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jmoyer at redhat

Nov 4, 2009, 6:30 AM

Post #2 of 18 (444 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

Vivek Goyal <vgoyal [at] redhat> writes:

> o Currently CFQ provides priority scaled time slices to processes. If a process
> does not use the time slice, either because process did not have sufficient
> IO to do or because think time of process is large and CFQ decided to disable
> idling, then processes looses it time slice share.
^^^^^^
loses

> o One possible way to handle this is implement CFS like time stamping of the
> cfq queues and keep track of vtime. Next queue for execution will be selected
> based on the one who got lowest vtime. This patch implemented time stamping
> mechanism of cfq queues based on disk time used.
>
> o min_vdisktime represents the minimum vdisktime of the queue, either being
^^^^^
> serviced or leftmost element on the serviec tree.

queue or service tree? The latter seems to make more sense to me.

> +static inline u64
> +cfq_delta_fair(unsigned long delta, struct cfq_queue *cfqq)
> +{
> + const int base_slice = cfqq->cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
> +
> + return delta + (base_slice/CFQ_SLICE_SCALE * (cfqq->ioprio - 4));
> +}

cfq_scale_delta might be a better name.


> +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
> +{
> + s64 delta = (s64)(vdisktime - min_vdisktime);
> + if (delta > 0)
> + min_vdisktime = vdisktime;
> +
> + return min_vdisktime;
> +}
> +
> +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
> +{
> + s64 delta = (s64)(vdisktime - min_vdisktime);
> + if (delta < 0)
> + min_vdisktime = vdisktime;
> +
> + return min_vdisktime;
> +}

Is there a reason you've reimplemented min and max?

> + /*
> + * Maintain a cache of leftmost tree entries (it is frequently
> + * used)
> + */

You make it sound like there is a cache of more than one entry. Please
fix the comment.

> +static void cfqq_served(struct cfq_queue *cfqq, unsigned long served)
> +{
> + /*
> + * We don't want to charge more than allocated slice otherwise this
> + * queue can miss one dispatch round doubling max latencies. On the
> + * other hand we don't want to charge less than allocated slice as
> + * we stick to CFQ theme of queue loosing its share if it does not
^^^^^^^
losing


> +/*
> + * Handles three operations.
> + * Addition of a new queue to service tree, when a new request comes in.
> + * Resorting of an expiring queue (used after slice expired)
> + * Requeuing a queue at the front (used during preemption).
> + */
> +static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> + bool add_front, unsigned long service)

service? Can we come up with a better name that actually hints at what
this is? service_time, maybe?


Mostly this looks pretty good and is fairly easy to read.

Cheers,
Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 4, 2009, 8:37 AM

Post #3 of 18 (441 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 04, 2009 at 09:30:34AM -0500, Jeff Moyer wrote:
> Vivek Goyal <vgoyal [at] redhat> writes:
>

Thanks for the review Jeff.

> > o Currently CFQ provides priority scaled time slices to processes. If a process
> > does not use the time slice, either because process did not have sufficient
> > IO to do or because think time of process is large and CFQ decided to disable
> > idling, then processes looses it time slice share.
> ^^^^^^
> loses
>

Will fix it.

> > o One possible way to handle this is implement CFS like time stamping of the
> > cfq queues and keep track of vtime. Next queue for execution will be selected
> > based on the one who got lowest vtime. This patch implemented time stamping
> > mechanism of cfq queues based on disk time used.
> >
> > o min_vdisktime represents the minimum vdisktime of the queue, either being
> ^^^^^
> > serviced or leftmost element on the serviec tree.
>
> queue or service tree? The latter seems to make more sense to me.

Yes, it should be service tree. Will fix it.

>
> > +static inline u64
> > +cfq_delta_fair(unsigned long delta, struct cfq_queue *cfqq)
> > +{
> > + const int base_slice = cfqq->cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
> > +
> > + return delta + (base_slice/CFQ_SLICE_SCALE * (cfqq->ioprio - 4));
> > +}
>
> cfq_scale_delta might be a better name.
>

cfq_scale_delta sounds good. Will use it in next version.

>
> > +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
> > +{
> > + s64 delta = (s64)(vdisktime - min_vdisktime);
> > + if (delta > 0)
> > + min_vdisktime = vdisktime;
> > +
> > + return min_vdisktime;
> > +}
> > +
> > +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
> > +{
> > + s64 delta = (s64)(vdisktime - min_vdisktime);
> > + if (delta < 0)
> > + min_vdisktime = vdisktime;
> > +
> > + return min_vdisktime;
> > +}
>
> Is there a reason you've reimplemented min and max?

I think you are referring to min_t and max_t. Will these macros take care
of wrapping too?

For example, if I used min_t(u64, A, B), then unsigned comparision will
not work right wrapping has just taken place for any of the A or B. So if
A=-1 and B=2, then min_t() would return B as minimum. This is not right
in our case.

If we do signed comparison (min_t(s64, A, B)), that also seems to be
broken in another case where a value of variable moves from 63bits to 64bits,
(A=0x7fffffffffffffff, B=0x8000000000000000). Above will return B as minimum but
in our scanario, vdisktime will progress from 0x7fffffffffffffff to
0x8000000000000000 and A should be returned as minimum (unsigned
comparison).

Hence I took these difnitions from CFS.

>
> > + /*
> > + * Maintain a cache of leftmost tree entries (it is frequently
> > + * used)
> > + */
>
> You make it sound like there is a cache of more than one entry. Please
> fix the comment.

Will fix it.

>
> > +static void cfqq_served(struct cfq_queue *cfqq, unsigned long served)
> > +{
> > + /*
> > + * We don't want to charge more than allocated slice otherwise this
> > + * queue can miss one dispatch round doubling max latencies. On the
> > + * other hand we don't want to charge less than allocated slice as
> > + * we stick to CFQ theme of queue loosing its share if it does not
> ^^^^^^^
> losing
>

Will fix it.

>
> > +/*
> > + * Handles three operations.
> > + * Addition of a new queue to service tree, when a new request comes in.
> > + * Resorting of an expiring queue (used after slice expired)
> > + * Requeuing a queue at the front (used during preemption).
> > + */
> > +static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> > + bool add_front, unsigned long service)
>
> service? Can we come up with a better name that actually hints at what
> this is? service_time, maybe?

Ok, service_time sounds good. Will change it.

>
>
> Mostly this looks pretty good and is fairly easy to read.

Thanks
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


czoccolo at gmail

Nov 4, 2009, 9:59 AM

Post #4 of 18 (442 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 4, 2009 at 5:37 PM, Vivek Goyal <vgoyal [at] redhat> wrote:
> On Wed, Nov 04, 2009 at 09:30:34AM -0500, Jeff Moyer wrote:
>> Vivek Goyal <vgoyal [at] redhat> writes:
>>
>
> Thanks for the review Jeff.
>
>> > o Currently CFQ provides priority scaled time slices to processes. If a process
>> >   does not use the time slice, either because process did not have sufficient
>> >   IO to do or because think time of process is large and CFQ decided to disable
>> >   idling, then processes looses it time slice share.
>>                            ^^^^^^
>> loses
>>
should be 'process loses'

>> > +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
>> > +{
>> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
>> > +   if (delta > 0)
>> > +           min_vdisktime = vdisktime;
>> > +
>> > +   return min_vdisktime;
>> > +}
>> > +
>> > +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
>> > +{
>> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
>> > +   if (delta < 0)
>> > +           min_vdisktime = vdisktime;
>> > +
>> > +   return min_vdisktime;
>> > +}
>>
>> Is there a reason you've reimplemented min and max?
>
> I think you are referring to min_t and max_t. Will these macros take care
> of wrapping too?
>
> For example, if I used min_t(u64, A, B), then unsigned comparision will
> not work right wrapping has just taken place for any of the A or B. So if
> A=-1 and B=2, then min_t() would return B as minimum. This is not right
> in our case.
>
> If we do signed comparison (min_t(s64, A, B)), that also seems to be
> broken in another case where a value of variable moves from 63bits to 64bits,
> (A=0x7fffffffffffffff, B=0x8000000000000000). Above will return B as minimum but
> in our scanario, vdisktime will progress from 0x7fffffffffffffff to
> 0x8000000000000000 and A should be returned as minimum (unsigned
> comparison).
>
> Hence I took these difnitions from CFS.
If those are times (measured in jiffies), why are you using u64? You
could use unsigned long and time_before/time_after, that perform the
proper wrap checking.


Corrado
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 4, 2009, 10:54 AM

Post #5 of 18 (437 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 04, 2009 at 06:59:45PM +0100, Corrado Zoccolo wrote:
> On Wed, Nov 4, 2009 at 5:37 PM, Vivek Goyal <vgoyal [at] redhat> wrote:
> > On Wed, Nov 04, 2009 at 09:30:34AM -0500, Jeff Moyer wrote:
> >> Vivek Goyal <vgoyal [at] redhat> writes:
> >>
> >
> > Thanks for the review Jeff.
> >
> >> > o Currently CFQ provides priority scaled time slices to processes. If a process
> >> >   does not use the time slice, either because process did not have sufficient
> >> >   IO to do or because think time of process is large and CFQ decided to disable
> >> >   idling, then processes looses it time slice share.
> >>                            ^^^^^^
> >> loses
> >>
> should be 'process loses'
>

Will fix it.

> >> > +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
> >> > +{
> >> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
> >> > +   if (delta > 0)
> >> > +           min_vdisktime = vdisktime;
> >> > +
> >> > +   return min_vdisktime;
> >> > +}
> >> > +
> >> > +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
> >> > +{
> >> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
> >> > +   if (delta < 0)
> >> > +           min_vdisktime = vdisktime;
> >> > +
> >> > +   return min_vdisktime;
> >> > +}
> >>
> >> Is there a reason you've reimplemented min and max?
> >
> > I think you are referring to min_t and max_t. Will these macros take care
> > of wrapping too?
> >
> > For example, if I used min_t(u64, A, B), then unsigned comparision will
> > not work right wrapping has just taken place for any of the A or B. So if
> > A=-1 and B=2, then min_t() would return B as minimum. This is not right
> > in our case.
> >
> > If we do signed comparison (min_t(s64, A, B)), that also seems to be
> > broken in another case where a value of variable moves from 63bits to 64bits,
> > (A=0x7fffffffffffffff, B=0x8000000000000000). Above will return B as minimum but
> > in our scanario, vdisktime will progress from 0x7fffffffffffffff to
> > 0x8000000000000000 and A should be returned as minimum (unsigned
> > comparison).
> >
> > Hence I took these difnitions from CFS.
> If those are times (measured in jiffies), why are you using u64? You
> could use unsigned long and time_before/time_after, that perform the
> proper wrap checking.
>

This is virtual time and not exactly the jiffies. This can run faster than
real time. In current patchset there are two reasons for that.

- We assign minimum time slice used by queue as 1ms and queue if we expire
the queue immediately after dispatching a request. So if we have really
fast hardware and we decide not to idle, we will be doing queue switch
very fast assigning each queue 1ms as slice used and vtime will be
progressing much faster than real time.

- We shift real time by CFQ_SERVICE_SHIFT so that theoritically one can
see service difference between weights x and x+1 for all values in the
weight range of 1 to 1000. and not loose small difference in the division
part.

vtime calculation is as follows.

vtime = (slice_used << CFQ_SERVICE_SHIFT )* DEFAUTL_WEIGHT/cfqe->weight

minimum value of slice_used is 1 jiffy. DEFAULT WEIGHT is 500. So if one
wants to get different vtime values for queue weights of 998 and 999, we
need to shift slice used by atleast 12 bits.

We are giving 12 bits shift to the real time, we are left with 20 bits on 32
bit hardware. So even if vtime does not run faster, in 1M jiffies (~ 1000
seconds) we will wrap around. I did some 4K sized direct IO on one of the
SSD and it achieve 7K io per second. In the worst case if these IO were
coming from two queues and we were interleaving the requests between these
then we will do 7 queue switch in 1ms. That means vtime travelling 7 times
faster and wrap around will take place in 1000/7 ~= 130 seconds.

I thought that on 32bit hardware we are really close to pushing the
limits, hence I thought of continuing to use 64bit vdisktime/key instead
of a unsigned long one.

Thanks
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


czoccolo at gmail

Nov 4, 2009, 1:18 PM

Post #6 of 18 (435 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

Hi Vivek,
On Wed, Nov 4, 2009 at 12:43 AM, Vivek Goyal <vgoyal [at] redhat> wrote:
> o Previously CFQ had one service tree where queues of all theree prio classes
>  were being queued. One side affect of this time stamping approach is that
>  now single tree approach might not work and we need to keep separate service
>  trees for three prio classes.
>
Single service tree is no longer true in cfq for-2.6.33.
Now we have a matrix of service trees, with first dimension being the
priority class, and second dimension being the workload type
(synchronous idle, synchronous no-idle, async).
You can have a look at the series: http://lkml.org/lkml/2009/10/26/482 .
It may have other interesting influences on your work, as the idle
introduced at the end of the synchronous no-idle tree, that provides
fairness also for seeky or high-think-time queues.

Corrado
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 4, 2009, 2:25 PM

Post #7 of 18 (435 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 04, 2009 at 10:18:15PM +0100, Corrado Zoccolo wrote:
> Hi Vivek,
> On Wed, Nov 4, 2009 at 12:43 AM, Vivek Goyal <vgoyal [at] redhat> wrote:
> > o Previously CFQ had one service tree where queues of all theree prio classes
> >  were being queued. One side affect of this time stamping approach is that
> >  now single tree approach might not work and we need to keep separate service
> >  trees for three prio classes.
> >
> Single service tree is no longer true in cfq for-2.6.33.
> Now we have a matrix of service trees, with first dimension being the
> priority class, and second dimension being the workload type
> (synchronous idle, synchronous no-idle, async).
> You can have a look at the series: http://lkml.org/lkml/2009/10/26/482 .
> It may have other interesting influences on your work, as the idle
> introduced at the end of the synchronous no-idle tree, that provides
> fairness also for seeky or high-think-time queues.
>

Thanks. I am looking at your patches right now. Got one question about
following commit.

****************************************************************
commit a6d44e982d3734583b3b4e1d36921af8cfd61fc0
Author: Corrado Zoccolo <czoccolo [at] gmail>
Date: Mon Oct 26 22:45:11 2009 +0100

cfq-iosched: enable idling for last queue on priority class

cfq can disable idling for queues in various circumstances.
When workloads of different priorities are competing, if the higher
priority queue has idling disabled, lower priority queues may steal
its disk share. For example, in a scenario with an RT process
performing seeky reads vs a BE process performing sequential reads,
on an NCQ enabled hardware, with low_latency unset,
the RT process will dispatch only the few pending requests every full
slice of service for the BE process.

The patch solves this issue by always performing idle on the last
queue at a given priority class > idle. If the same process, or one
that can pre-empt it (so at the same priority or higher), submits a
new request within the idle window, the lower priority queue won't
dispatch, saving the disk bandwidth for higher priority ones.

Note: this doesn't touch the non_rotational + NCQ case (no hardware
to test if this is a benefit in that case).
*************************************************************************

Not able to understand the logic of waiting for last queue in prio
class. This whole patch series seems to be about low latencies. So why
would not somebody set "low_latency" in IO scheduler? And if somebody
sets "low_latencies" then we will enable idling on random seeky reader
also. So problem will not exist.

On top of that, even if we don't idle for RT reader, we will always
preempt BE reader immediately and get the disk. The only side affect
is that on rotational media, disk head might have moved and bring the
overall throughput down.

So my concern is that with this idling on last queue, we are targetting
fairness issue for the random seeky readers with thinktime with-in 8ms.
That can be easily solved by setting low_latency=1. Why are we going
to this lenth then?

Thanks
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 4, 2009, 3:22 PM

Post #8 of 18 (435 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 04, 2009 at 10:18:15PM +0100, Corrado Zoccolo wrote:
> Hi Vivek,
> On Wed, Nov 4, 2009 at 12:43 AM, Vivek Goyal <vgoyal [at] redhat> wrote:
> > o Previously CFQ had one service tree where queues of all theree prio classes
> >  were being queued. One side affect of this time stamping approach is that
> >  now single tree approach might not work and we need to keep separate service
> >  trees for three prio classes.
> >
> Single service tree is no longer true in cfq for-2.6.33.
> Now we have a matrix of service trees, with first dimension being the
> priority class, and second dimension being the workload type
> (synchronous idle, synchronous no-idle, async).
> You can have a look at the series: http://lkml.org/lkml/2009/10/26/482 .
> It may have other interesting influences on your work, as the idle
> introduced at the end of the synchronous no-idle tree, that provides
> fairness also for seeky or high-think-time queues.
>

Hi Corrado,

Had one more question. Now with dynamic slice length (reduce slice length
to meet target latency), don't wee see reduced throughput on rotational
media with sequential workload?

I saw some you posted numbers for SSD. Do you have some numbers for
rotational media also?

Thanks
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 4, 2009, 4:05 PM

Post #9 of 18 (448 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 04, 2009 at 10:18:15PM +0100, Corrado Zoccolo wrote:
> Hi Vivek,
> On Wed, Nov 4, 2009 at 12:43 AM, Vivek Goyal <vgoyal [at] redhat> wrote:
> > o Previously CFQ had one service tree where queues of all theree prio classes
> >  were being queued. One side affect of this time stamping approach is that
> >  now single tree approach might not work and we need to keep separate service
> >  trees for three prio classes.
> >
> Single service tree is no longer true in cfq for-2.6.33.
> Now we have a matrix of service trees, with first dimension being the
> priority class, and second dimension being the workload type
> (synchronous idle, synchronous no-idle, async).
> You can have a look at the series: http://lkml.org/lkml/2009/10/26/482 .
> It may have other interesting influences on your work, as the idle
> introduced at the end of the synchronous no-idle tree, that provides
> fairness also for seeky or high-think-time queues.
>

I am sorry that I am asking questions about a different patchset in this
mail. I don't have ready access to other mail thread currently.

I am looking at your patchset and trying to understand how have you
ensured fairness for different priority level queues.

Following seems to be the key piece of code which determines the slice
length of the queue dynamically.


static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
if (cfqd->cfq_latency) {
/* interested queues (we consider only the ones with the same
* priority class) */
unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
unsigned sync_slice = cfqd->cfq_slice[1];
unsigned expect_latency = sync_slice * iq;
if (expect_latency > cfq_target_latency) {
unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
/* scale low_slice according to IO priority
* and sync vs async */
unsigned low_slice =
min(slice, base_low_slice * slice / sync_slice);
/* the adapted slice value is scaled to fit all iqs
* into the target latency */
slice = max(slice * cfq_target_latency / expect_latency,
low_slice);
}
}
cfqq->slice_end = jiffies + slice;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}

A question.

- expect_latency seems to be being calculated based on based slice lenth
for sync queues (100ms). This will give right number only if all the
queues in the system were of prio 4. What if there are 3 prio 0 queues.
They will/should get 180ms slice each resulting in max latency of 540 ms
but we will be calculating expect_latency to = 100 * 3 =300 ms which is
less than cfq_target_latency and we will not adjust slice length?

- With "no-idle" group, who benefits? As I said, all these optimizations
seems to be for low latency. In that case user will set "low_latency"
tunable in CFQ. If that's the case, then we will anyway enable idling
random seeky processes having think time less than 8ms. So they get
their fair share.

I guess this will provide benefit if user has not set "low_latency" and
in that case we will not enable idle on random seeky readers and we will
gain in terms of throughput on NCQ hardware because we dispatch from
other no-idle queues and then idle on the no-idle group.

Time for some testing...

Thanks
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


dpshah at google

Nov 4, 2009, 6:44 PM

Post #10 of 18 (436 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 4, 2009 at 8:37 AM, Vivek Goyal <vgoyal [at] redhat> wrote:
> On Wed, Nov 04, 2009 at 09:30:34AM -0500, Jeff Moyer wrote:
>> Vivek Goyal <vgoyal [at] redhat> writes:
>>
>
> Thanks for the review Jeff.
>
>> > o Currently CFQ provides priority scaled time slices to processes. If a process
>> >   does not use the time slice, either because process did not have sufficient
>> >   IO to do or because think time of process is large and CFQ decided to disable
>> >   idling, then processes looses it time slice share.
>>                            ^^^^^^
>> loses
>>
>
> Will fix it.
>
>> > o One possible way to handle this is implement CFS like time stamping of the
>> >   cfq queues and keep track of vtime. Next queue for execution will be selected
>> >   based on the one who got lowest vtime. This patch implemented time stamping
>> >   mechanism of cfq queues based on disk time used.
>> >
>> > o min_vdisktime represents the minimum vdisktime of the queue, either being
>>                                                           ^^^^^
>> >   serviced or leftmost element on the serviec tree.
>>
>> queue or service tree?  The latter seems to make more sense to me.
>
> Yes, it should be service tree. Will fix it.
>
>>
>> > +static inline u64
>> > +cfq_delta_fair(unsigned long delta, struct cfq_queue *cfqq)
>> > +{
>> > +   const int base_slice = cfqq->cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
>> > +
>> > +   return delta + (base_slice/CFQ_SLICE_SCALE * (cfqq->ioprio - 4));
>> > +}
>>
>> cfq_scale_delta might be a better name.
>>
>
> cfq_scale_delta sounds good. Will use it in next version.
>
>>
>> > +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
>> > +{
>> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
>> > +   if (delta > 0)
>> > +           min_vdisktime = vdisktime;
>> > +
>> > +   return min_vdisktime;
>> > +}
>> > +
>> > +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
>> > +{
>> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
>> > +   if (delta < 0)
>> > +           min_vdisktime = vdisktime;
>> > +
>> > +   return min_vdisktime;
>> > +}
>>
>> Is there a reason you've reimplemented min and max?
>
> I think you are referring to min_t and max_t. Will these macros take care
> of wrapping too?
>
> For example, if I used min_t(u64, A, B), then unsigned comparision will
> not work right wrapping has just taken place for any of the A or B. So if
> A=-1 and B=2, then min_t() would return B as minimum. This is not right
> in our case.
>
> If we do signed comparison (min_t(s64, A, B)), that also seems to be
> broken in another case where a value of variable moves from 63bits to 64bits,
> (A=0x7fffffffffffffff, B=0x8000000000000000). Above will return B as minimum but
> in our scanario, vdisktime will progress from 0x7fffffffffffffff to
> 0x8000000000000000 and A should be returned as minimum (unsigned
> comparison).

Can you define and use u64 versions of time_before() and time_after()
(from include/linux/jiffies.h) for your comparisons? These take care
of wrapping as well. Maybe call them timestamp_before()/after().

>
> Hence I took these difnitions from CFS.

Also if these are exactly the same and you decide to continue using
these, can we move them to a common header file (time.h or maybe add a
vtime.h) and reuse?

>
>>
>> > +   /*
>> > +    * Maintain a cache of leftmost tree entries (it is frequently
>> > +    * used)
>> > +    */
>>
>> You make it sound like there is a cache of more than one entry.  Please
>> fix the comment.
>
> Will fix it.
>
>>
>> > +static void cfqq_served(struct cfq_queue *cfqq, unsigned long served)
>> > +{
>> > +   /*
>> > +    * We don't want to charge more than allocated slice otherwise this
>> > +    * queue can miss one dispatch round doubling max latencies. On the
>> > +    * other hand we don't want to charge less than allocated slice as
>> > +    * we stick to CFQ theme of queue loosing its share if it does not
>>                                           ^^^^^^^
>> losing
>>
>
> Will fix it.
>
>>
>> > +/*
>> > + * Handles three operations.
>> > + * Addition of a new queue to service tree, when a new request comes in.
>> > + * Resorting of an expiring queue (used after slice expired)
>> > + * Requeuing a queue at the front (used during preemption).
>> > + */
>> > +static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> > +                           bool add_front, unsigned long service)
>>
>> service?  Can we come up with a better name that actually hints at what
>> this is?  service_time, maybe?
>
> Ok, service_time sounds good. Will change it.
>
>>
>>
>> Mostly this looks pretty good and is fairly easy to read.
>
> Thanks
> Vivek
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


czoccolo at gmail

Nov 5, 2009, 12:27 AM

Post #11 of 18 (433 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

Hi Vivek,
let me answer all your questions in a single mail.

On Thu, Nov 5, 2009 at 12:22 AM, Vivek Goyal <vgoyal [at] redhat> wrote:
> Hi Corrado,
>
> Had one more question. Now with dynamic slice length (reduce slice length
> to meet target latency), don't wee see reduced throughput on rotational
> media with sequential workload?
>
Yes. This is the main reason for disabling dynamic slice length when
low_latency is not set. In this way, on servers where low latency is
not a must (but still desirable), this feature can be disabled, while
the others, that have positive impact on throughput, will not be
disabled.

> I saw some you posted numbers for SSD. Do you have some numbers for
> rotational media also?
Yes. I posted it in the first RFC for this patch, outside the series:
http://lkml.org/lkml/2009/9/3/87

The other patches in the series do not affect sequential bandwidth,
but can improve random read BW in case of NCQ hardware, regardless of
it being rotational, SSD, or SAN.

> I am looking at your patchset and trying to understand how have you
> ensured fairness for different priority level queues.
>
> Following seems to be the key piece of code which determines the slice
> length of the queue dynamically.
>
>
> static inline void
> cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> { [snipped code] }
>
> A question.
>
> - expect_latency seems to be being calculated based on based slice lenth
> for sync queues (100ms). This will give right number only if all the
> queues in the system were of prio 4. What if there are 3 prio 0 queues.
> They will/should get 180ms slice each resulting in max latency of 540 ms
> but we will be calculating expect_latency to = 100 * 3 =300 ms which is
> less than cfq_target_latency and we will not adjust slice length?
>
Yes. Those are soft latencies, so we don't *guarantee* 300ms. On an
average system, where the average slice length is 100ms, we will go
pretty close (but since CFQ doesn't count the first seek in the time
slice, we can still be some tenths of ms off), but if you have a
different distribution of priorities, then this will not be
guaranteed.

> - With "no-idle" group, who benefits? As I said, all these optimizations
> seems to be for low latency. In that case user will set "low_latency"
> tunable in CFQ. If that's the case, then we will anyway enable idling
> random seeky processes having think time less than 8ms. So they get
> their fair share.
My patch changes the meaning for low_latency. As we discussed some
months ago, I always thought that the solution of idling for seeky
processes was sub-optimal. With the new code, regardless of
low_latency settings, we won't idle between 'no-idle' queues. We will
idle only at the end of the no-idle tree, if we still have not reached
workload_expires. This provides fairness between 'no-idle' and normal
sync queues.
>
> I guess this will provide benefit if user has not set "low_latency" and
> in that case we will not enable idle on random seeky readers and we will
> gain in terms of throughput on NCQ hardware because we dispatch from
> other no-idle queues and then idle on the no-idle group.
It will improve both latency and bandwidth, and as I said, it is now
not limited to just low_latency not set. After my patch series,
low_latency will control just 2 things:
* the dynamic timeslice adaption
* the dynamic threshold for number of writes dispatched

Thanks
Corrado
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


czoccolo at gmail

Nov 5, 2009, 12:36 AM

Post #12 of 18 (433 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

Hi Vivek,
On Wed, Nov 4, 2009 at 11:25 PM, Vivek Goyal <vgoyal [at] redhat> wrote:
> Thanks. I am looking at your patches right now. Got one question about
> following commit.
>
> ****************************************************************
> commit a6d44e982d3734583b3b4e1d36921af8cfd61fc0
> Author: Corrado Zoccolo <czoccolo [at] gmail>
> Date:   Mon Oct 26 22:45:11 2009 +0100
>
>    cfq-iosched: enable idling for last queue on priority class
>
>    cfq can disable idling for queues in various circumstances.
>    When workloads of different priorities are competing, if the higher
>    priority queue has idling disabled, lower priority queues may steal
>    its disk share. For example, in a scenario with an RT process
>    performing seeky reads vs a BE process performing sequential reads,
>    on an NCQ enabled hardware, with low_latency unset,
>    the RT process will dispatch only the few pending requests every full
>    slice of service for the BE process.
>
>    The patch solves this issue by always performing idle on the last
>    queue at a given priority class > idle. If the same process, or one
>    that can pre-empt it (so at the same priority or higher), submits a
>    new request within the idle window, the lower priority queue won't
>    dispatch, saving the disk bandwidth for higher priority ones.
>
>    Note: this doesn't touch the non_rotational + NCQ case (no hardware
>    to test if this is a benefit in that case).
> *************************************************************************
>
[snipping questions I answered in the combo mail]
> On top of that, even if we don't idle for RT reader, we will always
> preempt BE reader immediately and get the disk. The only side affect
> is that on rotational media, disk head might have moved and bring the
> overall throughput down.
You bring down throughput, and also increase latency, not only on
rotational media, so you may not want to enable it on servers.
Without low_latency, I saw this bug in current 'fairness' policy in
CFQ, so this patch fixes it.
>
> So my concern is that with this idling on last queue, we are targetting
> fairness issue for the random seeky readers with thinktime with-in 8ms.
> That can be easily solved by setting low_latency=1. Why are we going
> to this lenth then?
Maybe on the servers where you want to run RT tasks you don't want the
aforementioned drawbacks of low_latency.
Since I was going to change the implications of low_latency in
following patches, I fixed the 'bug' here, so I was free to change the
implementation in the following, without reintroducing this bug (it
was present for long, before being fixed by the introduction of
low_latency).

Thanks
Corrado
>
> Thanks
> Vivek
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 5, 2009, 6:39 AM

Post #13 of 18 (432 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 04, 2009 at 06:44:28PM -0800, Divyesh Shah wrote:
> On Wed, Nov 4, 2009 at 8:37 AM, Vivek Goyal <vgoyal [at] redhat> wrote:
> > On Wed, Nov 04, 2009 at 09:30:34AM -0500, Jeff Moyer wrote:
> >> Vivek Goyal <vgoyal [at] redhat> writes:
> >>
> >
> > Thanks for the review Jeff.
> >
> >> > o Currently CFQ provides priority scaled time slices to processes. If a process
> >> >   does not use the time slice, either because process did not have sufficient
> >> >   IO to do or because think time of process is large and CFQ decided to disable
> >> >   idling, then processes looses it time slice share.
> >>                            ^^^^^^
> >> loses
> >>
> >
> > Will fix it.
> >
> >> > o One possible way to handle this is implement CFS like time stamping of the
> >> >   cfq queues and keep track of vtime. Next queue for execution will be selected
> >> >   based on the one who got lowest vtime. This patch implemented time stamping
> >> >   mechanism of cfq queues based on disk time used.
> >> >
> >> > o min_vdisktime represents the minimum vdisktime of the queue, either being
> >>                                                           ^^^^^
> >> >   serviced or leftmost element on the serviec tree.
> >>
> >> queue or service tree?  The latter seems to make more sense to me.
> >
> > Yes, it should be service tree. Will fix it.
> >
> >>
> >> > +static inline u64
> >> > +cfq_delta_fair(unsigned long delta, struct cfq_queue *cfqq)
> >> > +{
> >> > +   const int base_slice = cfqq->cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
> >> > +
> >> > +   return delta + (base_slice/CFQ_SLICE_SCALE * (cfqq->ioprio - 4));
> >> > +}
> >>
> >> cfq_scale_delta might be a better name.
> >>
> >
> > cfq_scale_delta sounds good. Will use it in next version.
> >
> >>
> >> > +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
> >> > +{
> >> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
> >> > +   if (delta > 0)
> >> > +           min_vdisktime = vdisktime;
> >> > +
> >> > +   return min_vdisktime;
> >> > +}
> >> > +
> >> > +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
> >> > +{
> >> > +   s64 delta = (s64)(vdisktime - min_vdisktime);
> >> > +   if (delta < 0)
> >> > +           min_vdisktime = vdisktime;
> >> > +
> >> > +   return min_vdisktime;
> >> > +}
> >>
> >> Is there a reason you've reimplemented min and max?
> >
> > I think you are referring to min_t and max_t. Will these macros take care
> > of wrapping too?
> >
> > For example, if I used min_t(u64, A, B), then unsigned comparision will
> > not work right wrapping has just taken place for any of the A or B. So if
> > A=-1 and B=2, then min_t() would return B as minimum. This is not right
> > in our case.
> >
> > If we do signed comparison (min_t(s64, A, B)), that also seems to be
> > broken in another case where a value of variable moves from 63bits to 64bits,
> > (A=0x7fffffffffffffff, B=0x8000000000000000). Above will return B as minimum but
> > in our scanario, vdisktime will progress from 0x7fffffffffffffff to
> > 0x8000000000000000 and A should be returned as minimum (unsigned
> > comparison).
>
> Can you define and use u64 versions of time_before() and time_after()
> (from include/linux/jiffies.h) for your comparisons? These take care
> of wrapping as well. Maybe call them timestamp_before()/after().
>
> >
> > Hence I took these difnitions from CFS.
>
> Also if these are exactly the same and you decide to continue using
> these, can we move them to a common header file (time.h or maybe add a
> vtime.h) and reuse?
>

Ok. I will look into it. Sharing the function between CFS scheduler and
CFQ scheduler.

Thanks
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


guijianfeng at cn

Nov 10, 2009, 4:48 PM

Post #14 of 18 (415 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

Vivek Goyal wrote:
...
>
> @@ -1245,10 +1429,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
> struct cfq_queue *cfqq;
> int dispatched = 0;
>
> - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
> + while ((cfqq = cfq_get_next_queue(cfqd)) != NULL)
> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
>
> - cfq_slice_expired(cfqd, 0);
> + cfq_slice_expired(cfqd);
>
> BUG_ON(cfqd->busy_queues);
>
> @@ -1391,7 +1575,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
> cfq_class_idle(cfqq))) {
> cfqq->slice_end = jiffies + 1;
> - cfq_slice_expired(cfqd, 0);
> + cfq_slice_expired(cfqd);

Hi Vivek,

I think here you should make sure that when updating cfqq->slice_end, cfqq->slice_end doesn't
equal to 0. Because if cfqq->slice_end == 0, cfq_slice_expired() just charge for 1 jiffy, but
if cfqq->slice_end is updated when it equals to 0(first request still in the air), at that time
cfqq->slice_start == 0, and slice_used is charged as "jiffies - cfqq->slice_start". Following
patch fixes this bug.

Signed-off-by: Gui Jianfeng <guijianfeng [at] cn>
---
block/cfq-iosched.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f23d713..12afc14 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1999,7 +1999,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
cfq_class_idle(cfqq))) {
- cfqq->slice_end = jiffies + 1;
+ if (cfqq->slice_end)
+ cfqq->slice_end = jiffies + 1;
cfq_slice_expired(cfqd);
}

--
1.5.4.rc3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 12, 2009, 3:07 PM

Post #15 of 18 (412 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Wed, Nov 11, 2009 at 08:48:09AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> ...
> >
> > @@ -1245,10 +1429,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
> > struct cfq_queue *cfqq;
> > int dispatched = 0;
> >
> > - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
> > + while ((cfqq = cfq_get_next_queue(cfqd)) != NULL)
> > dispatched += __cfq_forced_dispatch_cfqq(cfqq);
> >
> > - cfq_slice_expired(cfqd, 0);
> > + cfq_slice_expired(cfqd);
> >
> > BUG_ON(cfqd->busy_queues);
> >
> > @@ -1391,7 +1575,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
> > cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
> > cfq_class_idle(cfqq))) {
> > cfqq->slice_end = jiffies + 1;
> > - cfq_slice_expired(cfqd, 0);
> > + cfq_slice_expired(cfqd);
>
> Hi Vivek,
>
> I think here you should make sure that when updating cfqq->slice_end, cfqq->slice_end doesn't
> equal to 0. Because if cfqq->slice_end == 0, cfq_slice_expired() just charge for 1 jiffy, but
> if cfqq->slice_end is updated when it equals to 0(first request still in the air), at that time
> cfqq->slice_start == 0, and slice_used is charged as "jiffies - cfqq->slice_start". Following
> patch fixes this bug.
>

Hi Gui,

This can happen only once during a one wrap around cycle of jiffies. That
too depends in case we are hitting jiffies+1 as 0 or not.

So I would not worry much about it right now.

In fact, not updating slice_end, will make idle or async queue slice last
much longer than it should have.

Thanks
Vivek


> Signed-off-by: Gui Jianfeng <guijianfeng [at] cn>
> ---
> block/cfq-iosched.c | 3 ++-
> 1 files changed, 2 insertions(+), 1 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index f23d713..12afc14 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -1999,7 +1999,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
> if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
> cfq_class_idle(cfqq))) {
> - cfqq->slice_end = jiffies + 1;
> + if (cfqq->slice_end)
> + cfqq->slice_end = jiffies + 1;
> cfq_slice_expired(cfqd);
> }
>
> --
> 1.5.4.rc3
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


guijianfeng at cn

Nov 12, 2009, 4:59 PM

Post #16 of 18 (414 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

Vivek Goyal wrote:
> On Wed, Nov 11, 2009 at 08:48:09AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>> ...
>>>
>>> @@ -1245,10 +1429,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
>>> struct cfq_queue *cfqq;
>>> int dispatched = 0;
>>>
>>> - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
>>> + while ((cfqq = cfq_get_next_queue(cfqd)) != NULL)
>>> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
>>>
>>> - cfq_slice_expired(cfqd, 0);
>>> + cfq_slice_expired(cfqd);
>>>
>>> BUG_ON(cfqd->busy_queues);
>>>
>>> @@ -1391,7 +1575,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
>>> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
>>> cfq_class_idle(cfqq))) {
>>> cfqq->slice_end = jiffies + 1;
>>> - cfq_slice_expired(cfqd, 0);
>>> + cfq_slice_expired(cfqd);
>> Hi Vivek,
>>
>> I think here you should make sure that when updating cfqq->slice_end, cfqq->slice_end doesn't
>> equal to 0. Because if cfqq->slice_end == 0, cfq_slice_expired() just charge for 1 jiffy, but
>> if cfqq->slice_end is updated when it equals to 0(first request still in the air), at that time
>> cfqq->slice_start == 0, and slice_used is charged as "jiffies - cfqq->slice_start". Following
>> patch fixes this bug.
>>
>
> Hi Gui,
>
> This can happen only once during a one wrap around cycle of jiffies. That
> too depends in case we are hitting jiffies+1 as 0 or not.
>
> So I would not worry much about it right now.
>
> In fact, not updating slice_end, will make idle or async queue slice last
> much longer than it should have.

I don't think so Vivek, this bug can be easily trigger by creating two cgroup and run a idle
task in one group, then run a normal task in the other group. When the idle task sends out its
first request, this bug occurs. I can reproduce this bug every time by the following script.

#!/bin/sh

mkdir /cgroup
mount -t cgroup -o blkio io /cgroup
mkdir /cgroup/tst1
mkdir /cgroup/tst2

dd if=/dev/sdb2 of=/dev/null &
pid1=$!
echo $pid1 > /cgroup/tst1/tasks

dd if=/dev/sdb3 of=/dev/null &
pid2=$!
ionice -c3 -p$pid2
echo $pid2 > /cgroup/tst2/tasks

sleep 5

cat /cgroup/tst1/blkio.time
cat /cgroup/tst2/blkio.time

killall -9 dd
sleep 1

rmdir /cgroup/tst1
rmdir /cgroup/tst2
umount /cgroup
rmdir /cgroup


>
> Thanks
> Vivek
>
>
>> Signed-off-by: Gui Jianfeng <guijianfeng [at] cn>
>> ---
>> block/cfq-iosched.c | 3 ++-
>> 1 files changed, 2 insertions(+), 1 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index f23d713..12afc14 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -1999,7 +1999,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
>> if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
>> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
>> cfq_class_idle(cfqq))) {
>> - cfqq->slice_end = jiffies + 1;
>> + if (cfqq->slice_end)
>> + cfqq->slice_end = jiffies + 1;
>> cfq_slice_expired(cfqd);
>> }
>>
>> --
>> 1.5.4.rc3
>
>
>

--
Regards
Gui Jianfeng

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


vgoyal at redhat

Nov 12, 2009, 5:24 PM

Post #17 of 18 (412 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

On Fri, Nov 13, 2009 at 08:59:08AM +0800, Gui Jianfeng wrote:
>
>
> Vivek Goyal wrote:
> > On Wed, Nov 11, 2009 at 08:48:09AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >> ...
> >>>
> >>> @@ -1245,10 +1429,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
> >>> struct cfq_queue *cfqq;
> >>> int dispatched = 0;
> >>>
> >>> - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
> >>> + while ((cfqq = cfq_get_next_queue(cfqd)) != NULL)
> >>> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
> >>>
> >>> - cfq_slice_expired(cfqd, 0);
> >>> + cfq_slice_expired(cfqd);
> >>>
> >>> BUG_ON(cfqd->busy_queues);
> >>>
> >>> @@ -1391,7 +1575,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
> >>> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
> >>> cfq_class_idle(cfqq))) {
> >>> cfqq->slice_end = jiffies + 1;
> >>> - cfq_slice_expired(cfqd, 0);
> >>> + cfq_slice_expired(cfqd);
> >> Hi Vivek,
> >>
> >> I think here you should make sure that when updating cfqq->slice_end, cfqq->slice_end doesn't
> >> equal to 0. Because if cfqq->slice_end == 0, cfq_slice_expired() just charge for 1 jiffy, but
> >> if cfqq->slice_end is updated when it equals to 0(first request still in the air), at that time
> >> cfqq->slice_start == 0, and slice_used is charged as "jiffies - cfqq->slice_start". Following
> >> patch fixes this bug.
> >>
> >
> > Hi Gui,
> >
> > This can happen only once during a one wrap around cycle of jiffies. That
> > too depends in case we are hitting jiffies+1 as 0 or not.
> >
> > So I would not worry much about it right now.
> >
> > In fact, not updating slice_end, will make idle or async queue slice last
> > much longer than it should have.
>
> I don't think so Vivek, this bug can be easily trigger by creating two cgroup and run a idle
> task in one group, then run a normal task in the other group. When the idle task sends out its
> first request, this bug occurs. I can reproduce this bug every time by the following script.
>

Oh.., sorry, Looks like I read your mail too fast. So you are saying that
in this case we should be charging 1 ms but instead we will be charging
(jiffies - 0), which might be too huge a number and then a particular
group will not be scheduled for a long time?

How about changing the charging code to also check if slice_start == 0? So
in my V2 I will change the cfq_cfqq_slice_usage() to also check for
slice_start to make sure whether a slice has actually started or not.

if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
charge_1ms;
else
charge_based_on_time_elapsed;

Thanks
Vivek

> #!/bin/sh
>
> mkdir /cgroup
> mount -t cgroup -o blkio io /cgroup
> mkdir /cgroup/tst1
> mkdir /cgroup/tst2
>
> dd if=/dev/sdb2 of=/dev/null &
> pid1=$!
> echo $pid1 > /cgroup/tst1/tasks
>
> dd if=/dev/sdb3 of=/dev/null &
> pid2=$!
> ionice -c3 -p$pid2
> echo $pid2 > /cgroup/tst2/tasks
>
> sleep 5
>
> cat /cgroup/tst1/blkio.time
> cat /cgroup/tst2/blkio.time
>
> killall -9 dd
> sleep 1
>
> rmdir /cgroup/tst1
> rmdir /cgroup/tst2
> umount /cgroup
> rmdir /cgroup
>
>
> >
> > Thanks
> > Vivek
> >
> >
> >> Signed-off-by: Gui Jianfeng <guijianfeng [at] cn>
> >> ---
> >> block/cfq-iosched.c | 3 ++-
> >> 1 files changed, 2 insertions(+), 1 deletions(-)
> >>
> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >> index f23d713..12afc14 100644
> >> --- a/block/cfq-iosched.c
> >> +++ b/block/cfq-iosched.c
> >> @@ -1999,7 +1999,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
> >> if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
> >> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
> >> cfq_class_idle(cfqq))) {
> >> - cfqq->slice_end = jiffies + 1;
> >> + if (cfqq->slice_end)
> >> + cfqq->slice_end = jiffies + 1;
> >> cfq_slice_expired(cfqd);
> >> }
> >>
> >> --
> >> 1.5.4.rc3
> >
> >
> >
>
> --
> Regards
> Gui Jianfeng
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


guijianfeng at cn

Nov 12, 2009, 6:05 PM

Post #18 of 18 (417 views)
Permalink
Re: [PATCH 02/20] blkio: Change CFQ to use CFS like queue time stamps [In reply to]

Vivek Goyal wrote:
> On Fri, Nov 13, 2009 at 08:59:08AM +0800, Gui Jianfeng wrote:
>>
>> Vivek Goyal wrote:
>>> On Wed, Nov 11, 2009 at 08:48:09AM +0800, Gui Jianfeng wrote:
>>>> Vivek Goyal wrote:
>>>> ...
>>>>>
>>>>> @@ -1245,10 +1429,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
>>>>> struct cfq_queue *cfqq;
>>>>> int dispatched = 0;
>>>>>
>>>>> - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
>>>>> + while ((cfqq = cfq_get_next_queue(cfqd)) != NULL)
>>>>> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
>>>>>
>>>>> - cfq_slice_expired(cfqd, 0);
>>>>> + cfq_slice_expired(cfqd);
>>>>>
>>>>> BUG_ON(cfqd->busy_queues);
>>>>>
>>>>> @@ -1391,7 +1575,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
>>>>> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
>>>>> cfq_class_idle(cfqq))) {
>>>>> cfqq->slice_end = jiffies + 1;
>>>>> - cfq_slice_expired(cfqd, 0);
>>>>> + cfq_slice_expired(cfqd);
>>>> Hi Vivek,
>>>>
>>>> I think here you should make sure that when updating cfqq->slice_end, cfqq->slice_end doesn't
>>>> equal to 0. Because if cfqq->slice_end == 0, cfq_slice_expired() just charge for 1 jiffy, but
>>>> if cfqq->slice_end is updated when it equals to 0(first request still in the air), at that time
>>>> cfqq->slice_start == 0, and slice_used is charged as "jiffies - cfqq->slice_start". Following
>>>> patch fixes this bug.
>>>>
>>> Hi Gui,
>>>
>>> This can happen only once during a one wrap around cycle of jiffies. That
>>> too depends in case we are hitting jiffies+1 as 0 or not.
>>>
>>> So I would not worry much about it right now.
>>>
>>> In fact, not updating slice_end, will make idle or async queue slice last
>>> much longer than it should have.
>> I don't think so Vivek, this bug can be easily trigger by creating two cgroup and run a idle
>> task in one group, then run a normal task in the other group. When the idle task sends out its
>> first request, this bug occurs. I can reproduce this bug every time by the following script.
>>
>
> Oh.., sorry, Looks like I read your mail too fast. So you are saying that
> in this case we should be charging 1 ms but instead we will be charging
> (jiffies - 0), which might be too huge a number and then a particular
> group will not be scheduled for a long time?

Yes, that's it.

>
> How about changing the charging code to also check if slice_start == 0? So
> in my V2 I will change the cfq_cfqq_slice_usage() to also check for
> slice_start to make sure whether a slice has actually started or not.
>
> if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
> charge_1ms;
> else
> charge_based_on_time_elapsed;

I think this change should also work :)

Thanks
Gui

>
> Thanks
> Vivek
>
>> #!/bin/sh
>>
>> mkdir /cgroup
>> mount -t cgroup -o blkio io /cgroup
>> mkdir /cgroup/tst1
>> mkdir /cgroup/tst2
>>
>> dd if=/dev/sdb2 of=/dev/null &
>> pid1=$!
>> echo $pid1 > /cgroup/tst1/tasks
>>
>> dd if=/dev/sdb3 of=/dev/null &
>> pid2=$!
>> ionice -c3 -p$pid2
>> echo $pid2 > /cgroup/tst2/tasks
>>
>> sleep 5
>>
>> cat /cgroup/tst1/blkio.time
>> cat /cgroup/tst2/blkio.time
>>
>> killall -9 dd
>> sleep 1
>>
>> rmdir /cgroup/tst1
>> rmdir /cgroup/tst2
>> umount /cgroup
>> rmdir /cgroup
>>
>>
>>> Thanks
>>> Vivek
>>>
>>>
>>>> Signed-off-by: Gui Jianfeng <guijianfeng [at] cn>
>>>> ---
>>>> block/cfq-iosched.c | 3 ++-
>>>> 1 files changed, 2 insertions(+), 1 deletions(-)
>>>>
>>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>>> index f23d713..12afc14 100644
>>>> --- a/block/cfq-iosched.c
>>>> +++ b/block/cfq-iosched.c
>>>> @@ -1999,7 +1999,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
>>>> if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
>>>> cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
>>>> cfq_class_idle(cfqq))) {
>>>> - cfqq->slice_end = jiffies + 1;
>>>> + if (cfqq->slice_end)
>>>> + cfqq->slice_end = jiffies + 1;
>>>> cfq_slice_expired(cfqd);
>>>> }
>>>>
>>>> --
>>>> 1.5.4.rc3
>>>
>>>
>> --
>> Regards
>> Gui Jianfeng
>
>
>

--
Regards
Gui Jianfeng

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.