ia64/xen-unstable

view xen/common/sched_bvt.c @ 9776:72f9c751d3ea

Replace &foo[0] with foo where the latter seems cleaner
(which is usually, and particularly when it's an argument
to one of the bitops functions).

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Apr 19 18:32:20 2006 +0100 (2006-04-19)
parents 2303fb4682e7
children 6993a0f91efc
line source
1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: CPU scheduling
12 * implements A Borrowed Virtual Time scheduler.
13 * (see Duda & Cheriton SOSP'99)
14 */
16 #include <xen/config.h>
17 #include <xen/init.h>
18 #include <xen/lib.h>
19 #include <xen/sched.h>
20 #include <xen/delay.h>
21 #include <xen/event.h>
22 #include <xen/time.h>
23 #include <xen/timer.h>
24 #include <xen/perfc.h>
25 #include <xen/sched-if.h>
26 #include <xen/softirq.h>
28 /* all per-domain BVT-specific scheduling info is stored here */
/* Per-VCPU BVT scheduling state; lives inside bvt_dom_info.vcpu_inf[]. */
struct bvt_vcpu_info
{
    struct list_head run_list;    /* runqueue list pointers */
    u32 avt;                      /* actual virtual time */
    u32 evt;                      /* effective virtual time (avt minus warp) */
    int migrated;                 /* migrated to a new CPU */
    struct vcpu *vcpu;            /* back-pointer to the owning vcpu */
    struct bvt_dom_info *inf;     /* back-pointer to the per-domain info */
};
/* Per-domain BVT state, shared by all of the domain's vcpus. */
struct bvt_dom_info
{
    struct domain *domain;        /* domain this info belongs to */
    u32 mcu_advance;              /* inverse of weight */
    int warpback;                 /* warp? */
    int warp;                     /* warp set and within the warp
                                     limits*/
    s32 warp_value;               /* virtual time warp */
    s_time_t warpl;               /* warp limit */
    struct timer warp_timer;      /* deals with warpl */
    s_time_t warpu;               /* unwarp time requirement */
    struct timer unwarp_timer;    /* deals with warpu */

    /* One slot per possible vcpu; slot i backs vcpu i's sched_priv. */
    struct bvt_vcpu_info vcpu_inf[MAX_VIRT_CPUS];
};
/* Per-physical-CPU BVT state: the runqueue and system virtual time. */
struct bvt_cpu_info
{
    struct list_head runqueue;    /* list of runnable bvt_vcpu_info.run_list */
    unsigned long svt;            /* system virtual time (min avt on this CPU) */
};
/* Accessors for the private state hung off domains, vcpus and CPUs. */
#define BVT_INFO(p)   ((struct bvt_dom_info *)(p)->sched_priv)
#define EBVT_INFO(p)  ((struct bvt_vcpu_info *)(p)->sched_priv)
#define CPU_INFO(cpu) ((struct bvt_cpu_info *)(schedule_data[cpu]).sched_priv)
#define RUNLIST(p)    ((struct list_head *)&(EBVT_INFO(p)->run_list))
#define RUNQUEUE(cpu) ((struct list_head *)&(CPU_INFO(cpu)->runqueue))
#define CPU_SVT(cpu)  (CPU_INFO(cpu)->svt)

/* Scheduler tunables. */
#define MCU            (s32)MICROSECS(100)  /* Minimum unit */
#define MCU_ADVANCE    10                   /* default weight */
#define TIME_SLOP      (s32)MICROSECS(50)   /* allow time to slip a bit */
#define CTX_MIN        (s32)MICROSECS(10)   /* Low limit for ctx_allow */
static s32 ctx_allow = (s32)MILLISECS(5);   /* context switch allowance */
74 static inline void __add_to_runqueue_head(struct vcpu *d)
75 {
76 list_add(RUNLIST(d), RUNQUEUE(d->processor));
77 }
79 static inline void __add_to_runqueue_tail(struct vcpu *d)
80 {
81 list_add_tail(RUNLIST(d), RUNQUEUE(d->processor));
82 }
84 static inline void __del_from_runqueue(struct vcpu *d)
85 {
86 struct list_head *runlist = RUNLIST(d);
87 list_del(runlist);
88 runlist->next = NULL;
89 }
91 static inline int __task_on_runqueue(struct vcpu *d)
92 {
93 return (RUNLIST(d))->next != NULL;
94 }
/* Warp/unwarp timer functions */

/*
 * Fires when a warped domain reaches its warp limit (warpl): clears the
 * warp and arms the unwarp timer that gates when warping may resume.
 * All state is protected by vcpu0's schedule lock.
 */
static void warp_timer_fn(void *data)
{
    struct bvt_dom_info *inf = data;
    struct vcpu *v = inf->domain->vcpu[0];

    vcpu_schedule_lock_irq(v);

    inf->warp = 0;

    /* unwarp equal to zero => stop warping */
    if ( inf->warpu == 0 )
    {
        inf->warpback = 0;
        cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);
    }

    /* NOTE(review): the unwarp timer is armed even when warpu == 0; the
     * handler then sees warpback == 0 and does nothing, so this is only a
     * spurious immediate expiry — confirm it is intentional. */
    set_timer(&inf->unwarp_timer, NOW() + inf->warpu);

    vcpu_schedule_unlock_irq(v);
}
119 static void unwarp_timer_fn(void *data)
120 {
121 struct bvt_dom_info *inf = data;
122 struct vcpu *v = inf->domain->vcpu[0];
124 vcpu_schedule_lock_irq(v);
126 if ( inf->warpback )
127 {
128 inf->warp = 1;
129 cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);
130 }
132 vcpu_schedule_unlock_irq(v);
133 }
135 static inline u32 calc_avt(struct vcpu *v, s_time_t now)
136 {
137 u32 ranfor, mcus;
138 struct bvt_dom_info *inf = BVT_INFO(v->domain);
139 struct bvt_vcpu_info *einf = EBVT_INFO(v);
141 ranfor = (u32)(now - v->runstate.state_entry_time);
142 mcus = (ranfor + MCU - 1)/MCU;
144 return einf->avt + mcus * inf->mcu_advance;
145 }
147 /*
148 * Calculate the effective virtual time for a domain. Take into account
149 * warping limits
150 */
151 static inline u32 calc_evt(struct vcpu *d, u32 avt)
152 {
153 struct bvt_dom_info *inf = BVT_INFO(d->domain);
154 /* TODO The warp routines need to be rewritten GM */
156 if ( inf->warp )
157 return avt - inf->warp_value;
158 else
159 return avt;
160 }
162 /**
163 * bvt_alloc_task - allocate BVT private structures for a task
164 * @p: task to allocate private structures for
165 *
166 * Returns non-zero on failure.
167 */
168 static int bvt_alloc_task(struct vcpu *v)
169 {
170 struct domain *d = v->domain;
171 struct bvt_dom_info *inf;
173 if ( (d->sched_priv == NULL) )
174 {
175 if ( (d->sched_priv = xmalloc(struct bvt_dom_info)) == NULL )
176 return -1;
177 memset(d->sched_priv, 0, sizeof(struct bvt_dom_info));
178 }
180 inf = BVT_INFO(d);
182 v->sched_priv = &inf->vcpu_inf[v->vcpu_id];
184 inf->vcpu_inf[v->vcpu_id].inf = BVT_INFO(d);
185 inf->vcpu_inf[v->vcpu_id].vcpu = v;
187 if ( v->vcpu_id == 0 )
188 {
189 inf->mcu_advance = MCU_ADVANCE;
190 inf->domain = v->domain;
191 inf->warpback = 0;
192 /* Set some default values here. */
193 inf->warp = 0;
194 inf->warp_value = 0;
195 inf->warpl = MILLISECS(2000);
196 inf->warpu = MILLISECS(1000);
197 /* Initialise the warp timers. */
198 init_timer(&inf->warp_timer, warp_timer_fn, inf, v->processor);
199 init_timer(&inf->unwarp_timer, unwarp_timer_fn, inf, v->processor);
200 }
202 return 0;
203 }
205 /*
206 * Add and remove a domain
207 */
208 static void bvt_add_task(struct vcpu *v)
209 {
210 struct bvt_vcpu_info *einf = EBVT_INFO(v);
212 /* Allocate per-CPU context if this is the first domain to be added. */
213 if ( CPU_INFO(v->processor) == NULL )
214 {
215 schedule_data[v->processor].sched_priv = xmalloc(struct bvt_cpu_info);
216 BUG_ON(CPU_INFO(v->processor) == NULL);
217 INIT_LIST_HEAD(RUNQUEUE(v->processor));
218 CPU_SVT(v->processor) = 0;
219 }
221 if ( is_idle_vcpu(v) )
222 {
223 einf->avt = einf->evt = ~0U;
224 BUG_ON(__task_on_runqueue(v));
225 __add_to_runqueue_head(v);
226 }
227 else
228 {
229 /* Set avt and evt to system virtual time. */
230 einf->avt = CPU_SVT(v->processor);
231 einf->evt = CPU_SVT(v->processor);
232 }
233 }
/*
 * Make a vcpu runnable again: requeue it, refresh its virtual times, and
 * either preempt the running vcpu (if the waker has the smaller EVT) or
 * pull the scheduler timer forward so the waker runs no later than its
 * EVT entitles it to.
 */
static void bvt_wake(struct vcpu *v)
{
    struct bvt_vcpu_info *einf = EBVT_INFO(v);
    struct vcpu *curr;
    s_time_t now, r_time;
    int cpu = v->processor;
    u32 curr_evt;

    if ( unlikely(__task_on_runqueue(v)) )
        return;

    __add_to_runqueue_head(v);

    now = NOW();

    /* Set the BVT parameters. AVT should always be updated
       if CPU migration occurred. Never let avt lag behind SVT, or a
       long-sleeping domain would monopolise the CPU on wakeup. */
    if ( (einf->avt < CPU_SVT(cpu)) || einf->migrated )
    {
        einf->avt = CPU_SVT(cpu);
        einf->migrated = 0;
    }

    /* Deal with warping here. */
    einf->evt = calc_evt(v, einf->avt);

    curr = schedule_data[cpu].curr;
    curr_evt = calc_evt(curr, calc_avt(curr, now));
    /* Calculate the time the current domain would run assuming
       the second smallest evt is of the newly woken domain */
    r_time = curr->runstate.state_entry_time +
        ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) +
        ctx_allow;

    if ( is_idle_vcpu(curr) || (einf->evt <= curr_evt) )
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    else if ( schedule_data[cpu].s_timer.expires > r_time )
        set_timer(&schedule_data[cpu].s_timer, r_time);
}
276 static void bvt_sleep(struct vcpu *v)
277 {
278 if ( schedule_data[v->processor].curr == v )
279 cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);
280 else if ( __task_on_runqueue(v) )
281 __del_from_runqueue(v);
282 }
/*
 * Restrict vcpu @v to the CPUs in @affinity and move it there.
 * A vcpu cannot migrate itself: if @v is the caller, succeed only when
 * the current CPU is already in the mask, else return -EBUSY.
 */
static int bvt_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    if ( v == current )
        return cpu_isset(v->processor, *affinity) ? 0 : -EBUSY;

    /* Pause while we rewrite v's placement so it cannot be running. */
    vcpu_pause(v);
    v->cpu_affinity = *affinity;
    v->processor = first_cpu(v->cpu_affinity);
    /* Flag the move so bvt_wake() resyncs avt with the new CPU's SVT. */
    EBVT_INFO(v)->migrated = 1;
    vcpu_unpause(v);

    return 0;
}
/**
 * bvt_free_task - free BVT private structures for a task
 * @d: task
 */
static void bvt_free_task(struct domain *d)
{
    struct bvt_dom_info *inf = BVT_INFO(d);

    ASSERT(inf != NULL);

    /* The timers live inside inf: kill them before freeing the memory. */
    kill_timer(&inf->warp_timer);
    kill_timer(&inf->unwarp_timer);

    xfree(inf);
}
316 /* Control the scheduler. */
317 static int bvt_ctl(struct sched_ctl_cmd *cmd)
318 {
319 struct bvt_ctl *params = &cmd->u.bvt;
321 if ( cmd->direction == SCHED_INFO_PUT )
322 ctx_allow = params->ctx_allow;
323 else
324 {
325 if ( ctx_allow < CTX_MIN )
326 ctx_allow = CTX_MIN;
327 params->ctx_allow = ctx_allow;
328 }
330 return 0;
331 }
/* Adjust scheduling parameters for a given domain (get or set). */
static int bvt_adjdom(
    struct domain *d, struct sched_adjdom_cmd *cmd)
{
    struct bvt_adjdom *params = &cmd->u.bvt;

    if ( cmd->direction == SCHED_INFO_PUT )
    {
        u32 mcu_adv = params->mcu_adv;
        u32 warpback = params->warpback;
        s32 warpvalue = params->warpvalue;
        s_time_t warpl = params->warpl;
        s_time_t warpu = params->warpu;

        struct bvt_dom_info *inf = BVT_INFO(d);

        /* Sanity -- this can avoid divide-by-zero. */
        if ( (mcu_adv == 0) || (warpl < 0) || (warpu < 0) )
            return -EINVAL;

        inf->mcu_advance = mcu_adv;
        inf->warpback = warpback;
        /* The warp should be the same as warpback */
        inf->warp = warpback;
        inf->warp_value = warpvalue;
        inf->warpl = MILLISECS(warpl);
        inf->warpu = MILLISECS(warpu);

        /* If the unwarp timer is set up it needs to be removed */
        stop_timer(&inf->unwarp_timer);
        /* If we stop warping the warp timer needs to be removed */
        if ( !warpback )
            stop_timer(&inf->warp_timer);
    }
    else if ( cmd->direction == SCHED_INFO_GET )
    {
        /* Report the current per-domain parameters back to the caller.
         * NOTE(review): warpl/warpu are returned in internal (ns) units
         * although PUT converts via MILLISECS() — confirm asymmetry. */
        struct bvt_dom_info *inf = BVT_INFO(d);
        params->mcu_adv = inf->mcu_advance;
        params->warpvalue = inf->warp_value;
        params->warpback = inf->warpback;
        params->warpl = inf->warpl;
        params->warpu = inf->warpu;
    }

    return 0;
}
/*
 * The main function
 * - deschedule the current domain.
 * - pick a new domain.
 *   i.e., the domain with lowest EVT.
 *   The runqueue should be ordered by EVT so that is easy.
 */
static struct task_slice bvt_do_schedule(s_time_t now)
{
    struct domain *d;
    struct vcpu *prev = current, *next = NULL, *next_prime, *ed;
    int cpu = prev->processor;
    s32 r_time;     /* time for new dom to run */
    u32 next_evt, next_prime_evt, min_avt;
    struct bvt_dom_info *prev_inf = BVT_INFO(prev->domain);
    struct bvt_vcpu_info *prev_einf = EBVT_INFO(prev);
    struct bvt_vcpu_info *p_einf = NULL;
    struct bvt_vcpu_info *next_einf = NULL;
    struct bvt_vcpu_info *next_prime_einf = NULL;
    struct task_slice ret;

    ASSERT(prev->sched_priv != NULL);
    ASSERT(prev_einf != NULL);
    ASSERT(__task_on_runqueue(prev));

    if ( likely(!is_idle_vcpu(prev)) )
    {
        /* Charge the outgoing vcpu for the time it just ran. */
        prev_einf->avt = calc_avt(prev, now);
        prev_einf->evt = calc_evt(prev, prev_einf->avt);

        if(prev_inf->warpback && prev_inf->warpl > 0)
            stop_timer(&prev_inf->warp_timer);

        __del_from_runqueue(prev);

        /* Requeue at the tail if still runnable (round-robin among equals). */
        if ( vcpu_runnable(prev) )
            __add_to_runqueue_tail(prev);
    }

    /* We should at least have the idle task */
    ASSERT(!list_empty(RUNQUEUE(cpu)));

    /*
     * scan through the run queue and pick the task with the lowest evt
     * *and* the task the second lowest evt.
     * this code is O(n) but we expect n to be small.
     */
    next_einf = EBVT_INFO(schedule_data[cpu].idle);
    next_prime_einf = NULL;

    next_evt = ~0U;
    next_prime_evt = ~0U;
    min_avt = ~0U;

    list_for_each_entry ( p_einf, RUNQUEUE(cpu), run_list )
    {
        if ( p_einf->evt < next_evt )
        {
            /* New minimum: old winner becomes the runner-up. */
            next_prime_einf = next_einf;
            next_prime_evt = next_evt;
            next_einf = p_einf;
            next_evt = p_einf->evt;
        }
        else if ( next_prime_evt == ~0U )
        {
            next_prime_evt = p_einf->evt;
            next_prime_einf = p_einf;
        }
        else if ( p_einf->evt < next_prime_evt )
        {
            next_prime_evt = p_einf->evt;
            next_prime_einf = p_einf;
        }

        /* Determine system virtual time. */
        if ( p_einf->avt < min_avt )
            min_avt = p_einf->avt;
    }

    /* Start the warp-limit clock for the chosen (warped) vcpu. */
    if ( next_einf->inf->warp && next_einf->inf->warpl > 0 )
        set_timer(&next_einf->inf->warp_timer, now + next_einf->inf->warpl);

    /* Extract the domain pointers from the dom infos */
    next = next_einf->vcpu;
    next_prime = next_prime_einf->vcpu;

    /* Update system virtual time. */
    if ( min_avt != ~0U )
        CPU_SVT(cpu) = min_avt;

    /* check for virtual time overrun on this cpu */
    if ( CPU_SVT(cpu) >= 0xf0000000 )
    {
        ASSERT(!local_irq_is_enabled());

        write_lock(&domlist_lock);

        /* Rebase every vcpu on this CPU so relative EVT order is kept. */
        for_each_domain ( d )
        {
            for_each_vcpu (d, ed) {
                if ( ed->processor == cpu )
                {
                    p_einf = EBVT_INFO(ed);
                    p_einf->evt -= 0xe0000000;
                    p_einf->avt -= 0xe0000000;
                }
            }
        }

        write_unlock(&domlist_lock);

        CPU_SVT(cpu) -= 0xe0000000;
    }

    /* work out time for next run through scheduler */
    if ( is_idle_vcpu(next) )
    {
        r_time = ctx_allow;
        goto sched_done;
    }

    if ( (next_prime == NULL) || is_idle_vcpu(next_prime) )
    {
        /* We have only one runnable task besides the idle task. */
        r_time = 10 * ctx_allow; /* RN: random constant */
        goto sched_done;
    }

    /*
     * If we are here then we have two runnable tasks.
     * Work out how long 'next' can run till its evt is greater than
     * 'next_prime's evt. Take context switch allowance into account.
     */
    ASSERT(next_prime_einf->evt >= next_einf->evt);

    r_time = ((next_prime_einf->evt - next_einf->evt)/next_einf->inf->mcu_advance)
        + ctx_allow;

    ASSERT(r_time >= ctx_allow);

 sched_done:
    ret.task = next;
    ret.time = r_time;
    return ret;
}
529 static void bvt_dump_runq_el(struct vcpu *p)
530 {
531 struct bvt_vcpu_info *inf = EBVT_INFO(p);
533 printk("mcua=%d ev=0x%08X av=0x%08X ",
534 inf->inf->mcu_advance, inf->evt, inf->avt);
535 }
/* Dump the global BVT tunables (MCU size and context-switch allowance). */
static void bvt_dump_settings(void)
{
    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow );
}
542 static void bvt_dump_cpu_state(int i)
543 {
544 struct list_head *queue;
545 int loop = 0;
546 struct bvt_vcpu_info *vcpu_inf;
547 struct vcpu *v;
549 printk("svt=0x%08lX ", CPU_SVT(i));
551 queue = RUNQUEUE(i);
552 printk("QUEUE rq %lx n: %lx, p: %lx\n", (unsigned long)queue,
553 (unsigned long) queue->next, (unsigned long) queue->prev);
555 list_for_each_entry ( vcpu_inf, queue, run_list )
556 {
557 v = vcpu_inf->vcpu;
558 printk("%3d: %u has=%c ", loop++, v->domain->domain_id,
559 test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F');
560 bvt_dump_runq_el(v);
561 printk(" l: %p n: %p p: %p\n",
562 &vcpu_inf->run_list, vcpu_inf->run_list.next,
563 vcpu_inf->run_list.prev);
564 }
565 }
/* Scheduler-ops table hooking BVT into the generic scheduler framework. */
struct scheduler sched_bvt_def = {
    .name     = "Borrowed Virtual Time",
    .opt_name = "bvt",
    .sched_id = SCHED_BVT,

    .alloc_task     = bvt_alloc_task,
    .add_task       = bvt_add_task,
    .free_task      = bvt_free_task,
    .do_schedule    = bvt_do_schedule,
    .control        = bvt_ctl,
    .adjdom         = bvt_adjdom,
    .dump_settings  = bvt_dump_settings,
    .dump_cpu_state = bvt_dump_cpu_state,
    .sleep          = bvt_sleep,
    .wake           = bvt_wake,
    .set_affinity   = bvt_set_affinity
};
585 /*
586 * Local variables:
587 * mode: C
588 * c-set-style: "BSD"
589 * c-basic-offset: 4
590 * tab-width: 4
591 * indent-tabs-mode: nil
592 * End:
593 */