ia64/xen-unstable

view xen/common/sched_bvt.c @ 6832:5959fae4722a

Set NE bit for VMX guest CR0. VMCS guest CR0.NE bit must
be set, else it will cause "vm-entry failed".

Signed-off-by: Chengyuan Li <chengyuan.li@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Sep 14 13:37:50 2005 +0000 (2005-09-14)
parents 8651a99cdc09
children 6b18f820f6a7
line source
1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: CPU scheduling
12 * implements A Borrowed Virtual Time scheduler.
13 * (see Duda & Cheriton SOSP'99)
14 */
16 #include <xen/config.h>
17 #include <xen/init.h>
18 #include <xen/lib.h>
19 #include <xen/sched.h>
20 #include <xen/delay.h>
21 #include <xen/event.h>
22 #include <xen/time.h>
23 #include <xen/ac_timer.h>
24 #include <xen/perfc.h>
25 #include <xen/sched-if.h>
26 #include <xen/softirq.h>
/* Per-VCPU BVT-specific scheduling state (embedded in struct bvt_dom_info). */
struct bvt_vcpu_info
{
    struct list_head run_list;    /* runqueue list pointers */
    u32              avt;         /* actual virtual time */
    u32              evt;         /* effective virtual time (avt minus warp) */
    struct vcpu *vcpu;            /* back-pointer to the owning VCPU */
    struct bvt_dom_info *inf;     /* back-pointer to the per-domain info */
};
/* Per-domain BVT-specific scheduling info; holds the per-VCPU slots. */
struct bvt_dom_info
{
    struct domain       *domain;      /* domain this info belongs to */
    u32                 mcu_advance;  /* inverse of weight */
    int                 warpback;     /* warp? */
    int                 warp;         /* warp set and within the warp
                                         limits*/
    s32                 warp_value;   /* virtual time warp */
    s_time_t            warpl;        /* warp limit */
    struct ac_timer     warp_timer;   /* deals with warpl */
    s_time_t            warpu;        /* unwarp time requirement */
    struct ac_timer     unwarp_timer; /* deals with warpu */

    struct bvt_vcpu_info vcpu_inf[MAX_VIRT_CPUS]; /* per-VCPU state slots */
};
/* Per-physical-CPU BVT state. */
struct bvt_cpu_info
{
    struct list_head runqueue; /* runnable VCPUs on this CPU */
    unsigned long    svt;      /* system virtual time: min AVT seen on this
                                  CPU's runqueue (updated in do_schedule) */
};
/* Accessors for scheduler-private data hung off domains, VCPUs and CPUs. */
#define BVT_INFO(p)   ((struct bvt_dom_info *)(p)->sched_priv)
#define EBVT_INFO(p)  ((struct bvt_vcpu_info *)(p)->sched_priv)
#define CPU_INFO(cpu) ((struct bvt_cpu_info *)(schedule_data[cpu]).sched_priv)
#define RUNLIST(p)    ((struct list_head *)&(EBVT_INFO(p)->run_list))
#define RUNQUEUE(cpu) ((struct list_head *)&(CPU_INFO(cpu)->runqueue))
#define CPU_SVT(cpu)  (CPU_INFO(cpu)->svt)

#define MCU            (s32)MICROSECS(100)  /* Minimum unit */
#define MCU_ADVANCE    10                   /* default weight */
#define TIME_SLOP      (s32)MICROSECS(50)   /* allow time to slip a bit */
static s32 ctx_allow = (s32)MILLISECS(5);   /* context switch allowance */
72 static inline void __add_to_runqueue_head(struct vcpu *d)
73 {
74 list_add(RUNLIST(d), RUNQUEUE(d->processor));
75 }
77 static inline void __add_to_runqueue_tail(struct vcpu *d)
78 {
79 list_add_tail(RUNLIST(d), RUNQUEUE(d->processor));
80 }
82 static inline void __del_from_runqueue(struct vcpu *d)
83 {
84 struct list_head *runlist = RUNLIST(d);
85 list_del(runlist);
86 runlist->next = NULL;
87 }
89 static inline int __task_on_runqueue(struct vcpu *d)
90 {
91 return (RUNLIST(d))->next != NULL;
92 }
95 /* Warp/unwarp timer functions */
96 static void warp_timer_fn(void *data)
97 {
98 struct bvt_dom_info *inf = data;
99 unsigned int cpu = inf->domain->vcpu[0]->processor;
101 spin_lock_irq(&schedule_data[cpu].schedule_lock);
103 inf->warp = 0;
105 /* unwarp equal to zero => stop warping */
106 if ( inf->warpu == 0 )
107 {
108 inf->warpback = 0;
109 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
110 }
112 set_ac_timer(&inf->unwarp_timer, NOW() + inf->warpu);
114 spin_unlock_irq(&schedule_data[cpu].schedule_lock);
115 }
117 static void unwarp_timer_fn(void *data)
118 {
119 struct bvt_dom_info *inf = data;
120 unsigned int cpu = inf->domain->vcpu[0]->processor;
122 spin_lock_irq(&schedule_data[cpu].schedule_lock);
124 if ( inf->warpback )
125 {
126 inf->warp = 1;
127 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
128 }
130 spin_unlock_irq(&schedule_data[cpu].schedule_lock);
131 }
133 static inline u32 calc_avt(struct vcpu *d, s_time_t now)
134 {
135 u32 ranfor, mcus;
136 struct bvt_dom_info *inf = BVT_INFO(d->domain);
137 struct bvt_vcpu_info *einf = EBVT_INFO(d);
139 ranfor = (u32)(now - d->lastschd);
140 mcus = (ranfor + MCU - 1)/MCU;
142 return einf->avt + mcus * inf->mcu_advance;
143 }
145 /*
146 * Calculate the effective virtual time for a domain. Take into account
147 * warping limits
148 */
149 static inline u32 calc_evt(struct vcpu *d, u32 avt)
150 {
151 struct bvt_dom_info *inf = BVT_INFO(d->domain);
152 /* TODO The warp routines need to be rewritten GM */
154 if ( inf->warp )
155 return avt - inf->warp_value;
156 else
157 return avt;
158 }
160 /**
161 * bvt_alloc_task - allocate BVT private structures for a task
162 * @p: task to allocate private structures for
163 *
164 * Returns non-zero on failure.
165 */
166 static int bvt_alloc_task(struct vcpu *v)
167 {
168 struct domain *d = v->domain;
170 if ( (d->sched_priv == NULL) )
171 {
172 if ( (d->sched_priv = xmalloc(struct bvt_dom_info)) == NULL )
173 return -1;
174 memset(d->sched_priv, 0, sizeof(struct bvt_dom_info));
175 }
177 v->sched_priv = &BVT_INFO(d)->vcpu_inf[v->vcpu_id];
179 BVT_INFO(d)->vcpu_inf[v->vcpu_id].inf = BVT_INFO(d);
180 BVT_INFO(d)->vcpu_inf[v->vcpu_id].vcpu = v;
182 return 0;
183 }
/*
 * Add and remove a domain
 */

/*
 * bvt_add_task - register VCPU @v with the BVT scheduler.
 *
 * Lazily allocates the per-CPU runqueue on the first task added to a
 * given CPU, initialises the per-domain defaults and warp timers when
 * VCPU 0 arrives, then seeds the VCPU's virtual times.  The idle task
 * is queued immediately with maximal (never-preferred) virtual times.
 */
static void bvt_add_task(struct vcpu *v)
{
    struct bvt_dom_info *inf = BVT_INFO(v->domain);
    struct bvt_vcpu_info *einf = EBVT_INFO(v);
    ASSERT(inf != NULL);
    ASSERT(v != NULL);

    /* Allocate per-CPU context if this is the first domain to be added. */
    if ( CPU_INFO(v->processor) == NULL )
    {
        schedule_data[v->processor].sched_priv = xmalloc(struct bvt_cpu_info);
        BUG_ON(CPU_INFO(v->processor) == NULL);
        INIT_LIST_HEAD(RUNQUEUE(v->processor));
        CPU_SVT(v->processor) = 0;
    }

    /* Per-domain state is initialised exactly once, by the boot VCPU. */
    if ( v->vcpu_id == 0 )
    {
        inf->mcu_advance = MCU_ADVANCE;
        inf->domain = v->domain;
        inf->warpback = 0;
        /* Set some default values here. */
        inf->warp = 0;
        inf->warp_value = 0;
        inf->warpl = MILLISECS(2000);
        inf->warpu = MILLISECS(1000);
        /* Initialise the warp timers. */
        init_ac_timer(&inf->warp_timer, warp_timer_fn, inf, v->processor);
        init_ac_timer(&inf->unwarp_timer, unwarp_timer_fn, inf, v->processor);
    }

    einf->vcpu = v;

    if ( is_idle_task(v->domain) )
    {
        /* Idle must only run when nothing else can: give it the max EVT. */
        einf->avt = einf->evt = ~0U;
        BUG_ON(__task_on_runqueue(v));
        __add_to_runqueue_head(v);
    }
    else
    {
        /* Set avt and evt to system virtual time. */
        einf->avt = CPU_SVT(v->processor);
        einf->evt = CPU_SVT(v->processor);
    }
}
/*
 * bvt_wake - put a sleeping VCPU back on its CPU's runqueue.
 *
 * The woken VCPU's AVT is clamped up to the CPU's system virtual time so
 * a long sleeper cannot hoard virtual-time credit.  If the woken EVT is
 * no greater than the running task's, a reschedule is forced; otherwise
 * the scheduler timer is shortened to the point where the woken task
 * would win the CPU.
 */
static void bvt_wake(struct vcpu *v)
{
    struct bvt_vcpu_info *einf = EBVT_INFO(v);
    struct vcpu *curr;
    s_time_t now, r_time;
    int cpu = v->processor;
    u32 curr_evt;

    if ( unlikely(__task_on_runqueue(v)) )
        return;

    __add_to_runqueue_head(v);

    now = NOW();

    /* Set the BVT parameters. AVT should always be updated
       if CPU migration occurred. */
    if ( einf->avt < CPU_SVT(cpu) ||
         unlikely(test_bit(_VCPUF_cpu_migrated, &v->vcpu_flags)) )
        einf->avt = CPU_SVT(cpu);

    /* Deal with warping here. */
    einf->evt = calc_evt(v, einf->avt);

    curr = schedule_data[cpu].curr;
    curr_evt = calc_evt(curr, calc_avt(curr, now));
    /* Calculate the time the current domain would run assuming
       the second smallest evt is of the newly woken domain.
       NOTE(review): the u32 subtraction below wraps when
       einf->evt < curr_evt, but that case always takes the
       immediate-preemption branch, so r_time is never used then. */
    r_time = curr->lastschd +
        ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) +
        ctx_allow;

    if ( is_idle_task(curr->domain) || (einf->evt <= curr_evt) )
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    else if ( schedule_data[cpu].s_timer.expires > r_time )
        set_ac_timer(&schedule_data[cpu].s_timer, r_time);
}
274 static void bvt_sleep(struct vcpu *v)
275 {
276 if ( test_bit(_VCPUF_running, &v->vcpu_flags) )
277 cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);
278 else if ( __task_on_runqueue(v) )
279 __del_from_runqueue(v);
280 }
/**
 * bvt_free_task - free BVT private structures for a task
 * @d: task
 *
 * The per-VCPU slots are embedded in the per-domain structure, so a
 * single xfree releases everything allocated by bvt_alloc_task.
 */
static void bvt_free_task(struct domain *d)
{
    ASSERT(d->sched_priv != NULL);
    xfree(d->sched_priv);
}
292 /* Control the scheduler. */
293 static int bvt_ctl(struct sched_ctl_cmd *cmd)
294 {
295 struct bvt_ctl *params = &cmd->u.bvt;
297 if ( cmd->direction == SCHED_INFO_PUT )
298 ctx_allow = params->ctx_allow;
299 else
300 params->ctx_allow = ctx_allow;
302 return 0;
303 }
/*
 * Adjust scheduling parameters for a given domain.
 *
 * SCHED_INFO_PUT validates and installs the new weight/warp parameters
 * and tears down any pending warp/unwarp timers as required;
 * SCHED_INFO_GET reports the current values.
 */
static int bvt_adjdom(
    struct domain *d, struct sched_adjdom_cmd *cmd)
{
    struct bvt_adjdom *params = &cmd->u.bvt;

    if ( cmd->direction == SCHED_INFO_PUT )
    {
        u32 mcu_adv = params->mcu_adv;
        u32 warpback = params->warpback;
        s32 warpvalue = params->warpvalue;
        s_time_t warpl = params->warpl;
        s_time_t warpu = params->warpu;

        struct bvt_dom_info *inf = BVT_INFO(d);

        /* Sanity -- this can avoid divide-by-zero. */
        if ( (mcu_adv == 0) || (warpl < 0) || (warpu < 0) )
            return -EINVAL;

        inf->mcu_advance = mcu_adv;
        inf->warpback = warpback;
        /* The warp should be the same as warpback */
        inf->warp = warpback;
        inf->warp_value = warpvalue;
        inf->warpl = MILLISECS(warpl);
        inf->warpu = MILLISECS(warpu);

        /* If the unwarp timer set up it needs to be removed */
        rem_ac_timer(&inf->unwarp_timer);
        /* If we stop warping the warp timer needs to be removed */
        if ( !warpback )
            rem_ac_timer(&inf->warp_timer);
    }
    else if ( cmd->direction == SCHED_INFO_GET )
    {
        struct bvt_dom_info *inf = BVT_INFO(d);
        params->mcu_adv = inf->mcu_advance;
        params->warpvalue = inf->warp_value;
        params->warpback = inf->warpback;
        /* NOTE(review): PUT scales warpl/warpu through MILLISECS() but GET
           returns the stored (scaled) values unconverted, so the units are
           asymmetric across a PUT/GET round trip — confirm against the
           userland tools before changing. */
        params->warpl = inf->warpl;
        params->warpu = inf->warpu;
    }

    return 0;
}
/*
 * The main function
 * - deschedule the current domain.
 * - pick a new domain.
 *   i.e., the domain with lowest EVT.
 *   The runqueue should be ordered by EVT so that is easy.
 */
static struct task_slice bvt_do_schedule(s_time_t now)
{
    struct domain *d;
    struct vcpu *prev = current, *next = NULL, *next_prime, *ed;
    int cpu = prev->processor;
    s32 r_time;     /* time for new dom to run */
    u32 next_evt, next_prime_evt, min_avt;
    struct bvt_dom_info *prev_inf = BVT_INFO(prev->domain);
    struct bvt_vcpu_info *prev_einf = EBVT_INFO(prev);
    struct bvt_vcpu_info *p_einf = NULL;
    struct bvt_vcpu_info *next_einf = NULL;
    struct bvt_vcpu_info *next_prime_einf = NULL;
    struct task_slice ret;

    ASSERT(prev->sched_priv != NULL);
    ASSERT(prev_einf != NULL);
    ASSERT(__task_on_runqueue(prev));

    /* Charge the outgoing task for its run and requeue it if runnable. */
    if ( likely(!is_idle_task(prev->domain)) )
    {
        prev_einf->avt = calc_avt(prev, now);
        prev_einf->evt = calc_evt(prev, prev_einf->avt);

        if(prev_inf->warpback && prev_inf->warpl > 0)
            rem_ac_timer(&prev_inf->warp_timer);

        __del_from_runqueue(prev);

        if ( domain_runnable(prev) )
            __add_to_runqueue_tail(prev);
    }

    /* We should at least have the idle task */
    ASSERT(!list_empty(RUNQUEUE(cpu)));

    /*
     * scan through the run queue and pick the task with the lowest evt
     * *and* the task the second lowest evt.
     * this code is O(n) but we expect n to be small.
     */
    next_einf       = EBVT_INFO(schedule_data[cpu].idle);
    next_prime_einf  = NULL;

    next_evt       = ~0U;
    next_prime_evt = ~0U;
    min_avt        = ~0U;

    list_for_each_entry ( p_einf, RUNQUEUE(cpu), run_list )
    {
        if ( p_einf->evt < next_evt )
        {
            /* New minimum: previous winner becomes the runner-up. */
            next_prime_einf  = next_einf;
            next_prime_evt  = next_evt;
            next_einf        = p_einf;
            next_evt        = p_einf->evt;
        }
        else if ( next_prime_evt == ~0U )
        {
            /* First candidate for runner-up. */
            next_prime_evt  = p_einf->evt;
            next_prime_einf  = p_einf;
        }
        else if ( p_einf->evt < next_prime_evt )
        {
            next_prime_evt  = p_einf->evt;
            next_prime_einf  = p_einf;
        }

        /* Determine system virtual time. */
        if ( p_einf->avt < min_avt )
            min_avt = p_einf->avt;
    }

    /* Winner is warping: bound the warp with warp_timer. */
    if ( next_einf->inf->warp && next_einf->inf->warpl > 0 )
        set_ac_timer(&next_einf->inf->warp_timer, now + next_einf->inf->warpl);

    /* Extract the domain pointers from the dom infos */
    next        = next_einf->vcpu;
    next_prime  = next_prime_einf->vcpu;

    /* Update system virtual time. */
    if ( min_avt != ~0U )
        CPU_SVT(cpu) = min_avt;

    /* check for virtual time overrun on this cpu */
    if ( CPU_SVT(cpu) >= 0xf0000000 )
    {
        ASSERT(!local_irq_is_enabled());

        write_lock(&domlist_lock);

        /* Rebase every AVT/EVT on this CPU to avoid u32 wraparound. */
        for_each_domain ( d )
        {
            for_each_vcpu (d, ed) {
                if ( ed->processor == cpu )
                {
                    p_einf = EBVT_INFO(ed);
                    p_einf->evt -= 0xe0000000;
                    p_einf->avt -= 0xe0000000;
                }
            }
        }

        write_unlock(&domlist_lock);

        CPU_SVT(cpu) -= 0xe0000000;
    }

    /* work out time for next run through scheduler */
    if ( is_idle_task(next->domain) )
    {
        r_time = ctx_allow;
        goto sched_done;
    }

    if ( (next_prime == NULL) || is_idle_task(next_prime->domain) )
    {
        /* We have only one runnable task besides the idle task. */
        r_time = 10 * ctx_allow; /* RN: random constant */
        goto sched_done;
    }

    /*
     * If we are here then we have two runnable tasks.
     * Work out how long 'next' can run till its evt is greater than
     * 'next_prime's evt. Take context switch allowance into account.
     */
    ASSERT(next_prime_einf->evt >= next_einf->evt);

    r_time = ((next_prime_einf->evt - next_einf->evt)/next_einf->inf->mcu_advance)
        + ctx_allow;

    ASSERT(r_time >= ctx_allow);

 sched_done:
    ret.task = next;
    ret.time = r_time;
    return ret;
}
501 static void bvt_dump_runq_el(struct vcpu *p)
502 {
503 struct bvt_vcpu_info *inf = EBVT_INFO(p);
505 printk("mcua=%d ev=0x%08X av=0x%08X ",
506 inf->inf->mcu_advance, inf->evt, inf->avt);
507 }
/* Print the global BVT tunables (minimum charging unit and ctx allowance). */
static void bvt_dump_settings(void)
{
    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow );
}
/*
 * Dump CPU @i's BVT state: system virtual time, the runqueue head's
 * link pointers, and one line per queued VCPU.
 */
static void bvt_dump_cpu_state(int i)
{
    struct list_head *queue;
    int loop = 0;
    struct bvt_vcpu_info *vcpu_inf;
    struct vcpu *v;

    printk("svt=0x%08lX ", CPU_SVT(i));

    queue = RUNQUEUE(i);
    printk("QUEUE rq %lx n: %lx, p: %lx\n", (unsigned long)queue,
           (unsigned long) queue->next, (unsigned long) queue->prev);

    list_for_each_entry ( vcpu_inf, queue, run_list )
    {
        v = vcpu_inf->vcpu;
        printk("%3d: %u has=%c ", loop++, v->domain->domain_id,
               test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F');
        bvt_dump_runq_el(v);
        printk("c=0x%X%08X\n", (u32)(v->cpu_time>>32), (u32)v->cpu_time);
        printk(" l: %p n: %p p: %p\n",
               &vcpu_inf->run_list, vcpu_inf->run_list.next,
               vcpu_inf->run_list.prev);
    }
}
/* BVT scheduler entry points, registered with the generic scheduler core. */
struct scheduler sched_bvt_def = {
    .name     = "Borrowed Virtual Time",
    .opt_name = "bvt",
    .sched_id = SCHED_BVT,

    .alloc_task     = bvt_alloc_task,
    .add_task       = bvt_add_task,
    .free_task      = bvt_free_task,
    .do_schedule    = bvt_do_schedule,
    .control        = bvt_ctl,
    .adjdom         = bvt_adjdom,
    .dump_settings  = bvt_dump_settings,
    .dump_cpu_state = bvt_dump_cpu_state,
    .sleep          = bvt_sleep,
    .wake           = bvt_wake,
};
557 /*
558 * Local variables:
559 * mode: C
560 * c-set-style: "BSD"
561 * c-basic-offset: 4
562 * tab-width: 4
563 * indent-tabs-mode: nil
564 * End:
565 */