ia64/xen-unstable: xen/common/schedule.c @ 6832:5959fae4722a

Set NE bit for VMX guest CR0. The VMCS guest CR0.NE bit must be set;
otherwise VM entry fails ("vm-entry failed").

Signed-off-by: Chengyuan Li <chengyuan.li@intel.com>

Author:   kaf24@firebug.cl.cam.ac.uk
Date:     Wed Sep 14 13:37:50 2005 +0000 (2005-09-14)
Parents:  3a34bcb7c28b
Children: 9af349b055e5 3233e7ecfa9f
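The change described above lands in the VMX guest-state setup code, not in this
file. As an illustrative sketch only (not the actual patch), forcing CR0.NE when
the guest's CR0 value is written into the VMCS could look roughly like the
following; guest_requested_cr0 is a hypothetical variable, and X86_CR0_NE,
GUEST_CR0 and __vmwrite() are assumed here to match the names used by Xen's VMX
code:

    /* Hypothetical sketch -- not part of schedule.c or of this changeset. */
    unsigned long cr0 = guest_requested_cr0;  /* CR0 value the guest asked for */
    cr0 |= X86_CR0_NE;     /* NE (bit 5) must be 1 in VMX non-root operation, */
                           /* or the VM entry fails.                          */
    __vmwrite(GUEST_CR0, cr0);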
Line source (xen/common/schedule.c):
/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004 - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */
/*#define WAKE_HISTO*/
/*#define BLOCKTIME_HISTO*/

#if defined(WAKE_HISTO)
#define BUCKETS 31
#elif defined(BLOCKTIME_HISTO)
#define BUCKETS 200
#endif
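
/*
 * Editorial note: enabling one of the options above compiles in the scheduler
 * histograms dumped by print_sched_histo() at the bottom of this file --
 * WAKE_HISTO records wake-to-run latency in 1ms buckets, BLOCKTIME_HISTO
 * records (roughly) time since last deschedule in 10ms buckets.
 */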
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/ac_timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <public/sched_ctl.h>
extern void arch_getdomaininfo_ctxt(struct vcpu *,
                                    struct vcpu_guest_context *);

/* opt_sched: scheduler - default to SEDF */
static char opt_sched[10] = "sedf";
string_param("sched", opt_sched);
#define TIME_SLOP (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void dom_timer_fn(void *data);

/* This is global for now so that private implementations can reach it. */
struct schedule_data schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
extern struct scheduler sched_sedf_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    &sched_sedf_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

#define SCHED_OP(fn, ...)                              \
    (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )        \
      : (typeof(ops.fn(__VA_ARGS__)))0 )
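
/*
 * Example (illustrative): SCHED_OP(sleep, v) expands to
 *     ( ops.sleep != NULL ) ? ops.sleep(v) : (typeof(ops.sleep(v)))0
 * so a scheduler that leaves a hook unset gets a harmless no-op whose result
 * is a zero of the hook's return type.
 */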
/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct ac_timer t_timer[NR_CPUS];
void free_domain_struct(struct domain *d)
{
    int i;

    SCHED_OP(free_task, d);
    /* vcpu 0 has to be the last one freed. */
    for ( i = MAX_VIRT_CPUS-1; i >= 0; i-- )
        if ( d->vcpu[i] )
            arch_free_vcpu_struct(d->vcpu[i]);

    xfree(d);
}
struct vcpu *alloc_vcpu_struct(
    struct domain *d, unsigned long vcpu)
{
    struct vcpu *v, *vc;

    ASSERT( d->vcpu[vcpu] == NULL );

    if ( (v = arch_alloc_vcpu_struct()) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    d->vcpu[vcpu] = v;
    v->domain = d;
    v->vcpu_id = vcpu;

    if ( SCHED_OP(alloc_task, v) < 0 )
        goto out;

    if ( vcpu != 0 )
    {
        v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];

        /* Insert the new VCPU into the domain's list, sorted by vcpu_id. */
        for_each_vcpu( d, vc )
        {
            if ( (vc->next_in_list == NULL) ||
                 (vc->next_in_list->vcpu_id > vcpu) )
                break;
        }
        v->next_in_list  = vc->next_in_list;
        vc->next_in_list = v;

        /* Place the new VCPU on the next online CPU after its predecessor's,
         * inheriting any CPU pinning. */
        if ( test_bit(_VCPUF_cpu_pinned, &vc->vcpu_flags) )
        {
            v->processor = (vc->processor + 1) % num_online_cpus();
            set_bit(_VCPUF_cpu_pinned, &v->vcpu_flags);
        }
        else
        {
            v->processor = (vc->processor + 1) % num_online_cpus();
        }
    }

    return v;

 out:
    d->vcpu[vcpu] = NULL;
    arch_free_vcpu_struct(v);

    return NULL;
}
struct domain *alloc_domain_struct(void)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));

    if ( alloc_vcpu_struct(d, 0) == NULL )
        goto out;

    return d;

 out:
    xfree(d);
    return NULL;
}
/*
 * Add and remove a domain
 */
void sched_add_domain(struct vcpu *v)
{
    struct domain *d = v->domain;

    /* Initialise the per-domain timer. */
    init_ac_timer(&v->timer, dom_timer_fn, v, v->processor);

    if ( is_idle_task(d) )
    {
        schedule_data[v->processor].curr = v;
        schedule_data[v->processor].idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }
    else
    {
        /* Must be unpaused by control software to start execution. */
        set_bit(_VCPUF_ctrl_pause, &v->vcpu_flags);
    }

    SCHED_OP(add_task, v);
    TRACE_2D(TRC_SCHED_DOM_ADD, d->domain_id, v->vcpu_id);
}
void sched_rem_domain(struct vcpu *v)
{
    rem_ac_timer(&v->timer);
    SCHED_OP(rem_task, v);
    TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id);
}
void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(!domain_runnable(v)) )
        SCHED_OP(sleep, v);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}
void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    /*
     * We can be sure that the VCPU is finally descheduled after the running
     * flag is cleared and the scheduler lock is released. We also check that
     * the domain continues to be unrunnable, in case someone else wakes it.
     */
    while ( !domain_runnable(v) &&
            (test_bit(_VCPUF_running, &v->vcpu_flags) ||
             spin_is_locked(&schedule_data[v->processor].schedule_lock)) )
        cpu_relax();

    sync_vcpu_execstate(v);
}
void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags);
    if ( likely(domain_runnable(v)) )
    {
        SCHED_OP(wake, v);
#ifdef WAKE_HISTO
        v->wokenup = NOW();
#endif
    }
    clear_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}
/* Block the currently-executing domain until a pertinent event occurs. */
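/*
 * Ordering note (editorial): event delivery is re-enabled and the blocked
 * flag set *before* the pending-event check below, so an event that arrives
 * in that window is observed and simply cancels the block rather than being
 * lost while the VCPU sleeps (the classic "wakeup waiting" race).
 */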
long do_block(void)
{
    struct vcpu *v = current;

    v->vcpu_info->evtchn_upcall_mask = 0;   /* re-enable event delivery */
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( event_pending(v) )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}
/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}
/* Mark target vcpu as non-runnable so it is not scheduled. */
static long do_vcpu_down(int vcpu)
{
    struct vcpu *target;

    if ( vcpu >= MAX_VIRT_CPUS )
        return -EINVAL;

    target = current->domain->vcpu[vcpu];
    if ( target == NULL )
        return -ESRCH;
    set_bit(_VCPUF_down, &target->vcpu_flags);

    return 0;
}
/* Mark target vcpu as runnable and wake it. */
static long do_vcpu_up(int vcpu)
{
    struct vcpu *target;

    if ( vcpu >= MAX_VIRT_CPUS )
        return -EINVAL;

    target = current->domain->vcpu[vcpu];
    if ( target == NULL )
        return -ESRCH;
    clear_bit(_VCPUF_down, &target->vcpu_flags);
    /* Wake vcpu. */
    vcpu_wake(target);

    return 0;
}
static long do_vcpu_pickle(int vcpu, unsigned long arg)
{
    struct vcpu *v;
    vcpu_guest_context_t *c;
    int ret = 0;

    if ( vcpu >= MAX_VIRT_CPUS )
        return -EINVAL;
    v = current->domain->vcpu[vcpu];
    if ( !v )
        return -ESRCH;
    /* Don't pickle vcpus which are currently running. */
    if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
        return -EBUSY;
    c = xmalloc(vcpu_guest_context_t);
    if ( !c )
        return -ENOMEM;
    arch_getdomaininfo_ctxt(v, c);
    if ( copy_to_user((vcpu_guest_context_t *)arg,
                      (const vcpu_guest_context_t *)c, sizeof(*c)) )
        ret = -EFAULT;
    xfree(c);
    return ret;
}
/*
 * Demultiplex scheduler-related hypercalls.
 */
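/*
 * Encoding note (editorial, as implied by the masks and shifts used below):
 * the low bits of 'op' select the command (SCHEDOP_cmdmask) while the high
 * bits carry a command-specific payload -- a shutdown reason code at
 * SCHEDOP_reasonshift or a VCPU number at SCHEDOP_vcpushift. For example,
 * op = SCHEDOP_vcpu_up | (3 << SCHEDOP_vcpushift) asks for VCPU 3 of the
 * calling domain to be brought up.
 */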
long do_sched_op(unsigned long op, unsigned long arg)
{
    long ret = 0;

    switch ( op & SCHEDOP_cmdmask )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 (op >> SCHEDOP_reasonshift));
        domain_shutdown((u8)(op >> SCHEDOP_reasonshift));
        break;
    }
    case SCHEDOP_vcpu_down:
    {
        ret = do_vcpu_down((int)(op >> SCHEDOP_vcpushift));
        break;
    }
    case SCHEDOP_vcpu_up:
    {
        ret = do_vcpu_up((int)(op >> SCHEDOP_vcpushift));
        break;
    }
    case SCHEDOP_vcpu_pickle:
    {
        ret = do_vcpu_pickle((int)(op >> SCHEDOP_vcpushift), arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;

    if ( timeout == 0 )
        rem_ac_timer(&v->timer);
    else
        set_ac_timer(&v->timer, timeout);

    return 0;
}
/** sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}
long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}
/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;
    struct vcpu *v;
    int cpu;
#if NR_CPUS <= 32
    unsigned long      have_lock;
#else
    unsigned long long have_lock;
#endif
    int succ;

#define __set_cpu_bit(cpu, data)   data |= ((typeof(data))1)<<cpu
#define __get_cpu_bit(cpu, data)   (data & ((typeof(data))1)<<cpu)
#define __clear_cpu_bits(data)     data = ((typeof(data))0)

    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    if ( cmd->direction != SCHED_INFO_PUT && cmd->direction != SCHED_INFO_GET )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    /* Acquire locks on all CPUs on which vcpus of this domain run. */
    do {
        succ = 0;
        __clear_cpu_bits(have_lock);
        for_each_vcpu ( d, v )
        {
            cpu = v->processor;
            if ( !__get_cpu_bit(cpu, have_lock) )
            {
                /* We don't hold this CPU's lock yet: try to acquire it. */
                if ( spin_trylock(&schedule_data[cpu].schedule_lock) )
                {
                    /* We got this lock! */
                    __set_cpu_bit(cpu, have_lock);
                    succ = 1;
                }
                else
                {
                    /* We didn't get this lock -> free all other locks too! */
                    for ( cpu = 0; cpu < NR_CPUS; cpu++ )
                        if ( __get_cpu_bit(cpu, have_lock) )
                            spin_unlock(&schedule_data[cpu].schedule_lock);
                    /* ...and start again from the beginning. */
                    succ = 0;
                    /* Leave the for_each_vcpu loop. */
                    break;
                }
            }
        }
    } while ( !succ );
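
    /*
     * Editorial note: at this point the schedule_lock of every CPU running
     * one of this domain's VCPUs is held, so the scheduler-specific adjdom
     * handler below can safely update its per-CPU state.
     */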
    SCHED_OP(adjdom, d, cmd);

    for ( cpu = 0; cpu < NR_CPUS; cpu++ )
        if ( __get_cpu_bit(cpu, have_lock) )
            spin_unlock(&schedule_data[cpu].schedule_lock);
    __clear_cpu_bits(have_lock);

    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
    put_domain(d);
    return 0;
}
/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct vcpu       *prev = current, *next = NULL;
    int                cpu = prev->processor;
    s_time_t           now;
    struct task_slice  next_slice;
    s32                r_time;     /* time for new dom to run */

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    now = NOW();

    rem_ac_timer(&schedule_data[cpu].s_timer);

    ASSERT(!in_irq());

    prev->cpu_time += now - prev->lastschd;

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    next->lastschd = now;

    set_ac_timer(&schedule_data[cpu].s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    clear_bit(_VCPUF_running, &prev->vcpu_flags);
    set_bit(_VCPUF_running, &next->vcpu_flags);

    perfc_incrc(sched_ctx);

#if defined(WAKE_HISTO)
    if ( !is_idle_task(next->domain) && next->wokenup )
    {
        ulong diff = (ulong)(now - next->wokenup);
        diff /= (ulong)MILLISECS(1);
        if ( diff <= BUCKETS-2 )
            schedule_data[cpu].hist[diff]++;
        else
            schedule_data[cpu].hist[BUCKETS-1]++;
    }
    next->wokenup = (s_time_t)0;
#elif defined(BLOCKTIME_HISTO)
    prev->lastdeschd = now;
    if ( !is_idle_task(next->domain) )
    {
        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
        if ( diff <= BUCKETS-2 )
            schedule_data[cpu].hist[diff]++;
        else
            schedule_data[cpu].hist[BUCKETS-1]++;
    }
#endif

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_task(next->domain) )
    {
        update_dom_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_guest_virq(next, VIRQ_TIMER);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    context_switch_finalise(next);
}
/* No locking needed -- pointer comparison is safe :-) */
int idle_cpu(int cpu)
{
    struct vcpu *p = schedule_data[cpu].curr;
    return p == idle_task[cpu];
}
/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/
/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}
/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu  *v   = current;
    unsigned int  cpu = v->processor;

    schedule_data[cpu].tick++;

    if ( !is_idle_task(v->domain) )
    {
        update_dom_time(v);
        send_guest_virq(v, VIRQ_TIMER);
    }

    page_scrub_schedule_work();

    set_ac_timer(&t_timer[cpu], NOW() + MILLISECS(10));
}
/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(void *data)
{
    struct vcpu *v = data;

    update_dom_time(v);
    send_guest_virq(v, VIRQ_TIMER);
}
/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        init_ac_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i);
        init_ac_timer(&t_timer[i], t_timer_fn, NULL, i);
    }

    schedule_data[0].curr = idle_task[0];
    schedule_data[0].idle = idle_task[0];

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    BUG_ON(SCHED_OP(alloc_task, idle_task[0]) < 0);

    sched_add_domain(idle_task[0]);
}
/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
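/*
 * Editorial note: t_timer_fn() arms the periodic tick for whichever CPU it
 * runs on, so the calls below run it once locally and then, via
 * smp_call_function(), once on every other online CPU.
 */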
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}
void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}
#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)

void print_sched_histo(unsigned char key)
{
    int i, j, k;
    for_each_online_cpu ( k )
    {
        j = 0;
        printk("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
        for ( i = 0; i < BUCKETS; i++ )
        {
            if ( schedule_data[k].hist[i] != 0 )
            {
                if ( i < BUCKETS-1 )
                    printk("%2d:[%7u] ", i, schedule_data[k].hist[i]);
                else
                    printk(" >:[%7u] ", schedule_data[k].hist[i]);
                if ( !(++j % 5) )
                    printk("\n");
            }
        }
        printk("\n");
    }
}

void reset_sched_histo(unsigned char key)
{
    int i, j;
    for ( j = 0; j < NR_CPUS; j++ )
        for ( i = 0; i < BUCKETS; i++ )
            schedule_data[j].hist[i] = 0;
}

#else

void print_sched_histo(unsigned char key) { }
void reset_sched_histo(unsigned char key) { }

#endif
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */