ia64/xen-unstable

xen/common/schedule.c @ 19313:cfacba42091c

Improve vcpu_migration_delay handling.

Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>

author    Keir Fraser <keir.fraser@citrix.com>
date      Wed Mar 11 10:12:14 2009 +0000 (2009-03-11)
parents   d035b66b5b4d
children  e5bed83d5180
/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/multicall.h>
#include <public/sched.h>
#include <xsm/xsm.h>

/* opt_sched: scheduler - default to credit */
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);

#define TIME_SLOP (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void vcpu_periodic_timer_fn(void *data);
static void vcpu_singleshot_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);

extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
    (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )           \
      : (typeof(ops.fn(__VA_ARGS__)))0 )
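/*
 * Illustrative note (not in the original source): SCHED_OP() dispatches to an
 * optional hook of the active scheduler.  For example, SCHED_OP(init_vcpu, v)
 * expands to
 *     (( ops.init_vcpu != NULL ) ? ops.init_vcpu(v)
 *                                : (typeof(ops.init_vcpu(v)))0 )
 * so a scheduler that leaves a hook NULL behaves as if the hook returned 0.
 */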
static inline void trace_runstate_change(struct vcpu *v, int new_state)
{
    struct { uint32_t vcpu:16, domain:16; } d;
    uint32_t event;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    event = TRC_SCHED_RUNSTATE_CHANGE;
    event |= ( v->runstate.state & 0x3 ) << 8;
    event |= ( new_state & 0x3 ) << 4;

    __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
}

static inline void trace_continue_running(struct vcpu *v)
{
    struct { uint32_t vcpu:16, domain:16; } d;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d),
                (unsigned char *)&d);
}

static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    s_time_t delta;

    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));

    trace_runstate_change(v, new_state);

    delta = new_entry_time - v->runstate.state_entry_time;
    if ( delta > 0 )
    {
        v->runstate.time[v->runstate.state] += delta;
        v->runstate.state_entry_time = new_entry_time;
    }

    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    s_time_t delta;

    if ( unlikely(v != current) )
        vcpu_schedule_lock_irq(v);

    memcpy(runstate, &v->runstate, sizeof(*runstate));
    delta = NOW() - runstate->state_entry_time;
    if ( delta > 0 )
        runstate->time[runstate->state] += delta;

    if ( unlikely(v != current) )
        vcpu_schedule_unlock_irq(v);
}

uint64_t get_cpu_idle_time(unsigned int cpu)
{
    struct vcpu_runstate_info state;
    struct vcpu *v;

    if ( (v = idle_vcpu[cpu]) == NULL )
        return 0;

    vcpu_runstate_get(v, &state);
    return state.time[RUNSTATE_running];
}
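#if 0
/*
 * Illustrative sketch only -- not part of the original file.  It shows how
 * the accessor above could be used: this hypothetical helper returns the
 * approximate fraction, in permille, of wall-clock time that a CPU has spent
 * in its idle vCPU since boot.
 */
static unsigned int cpu_idle_permille(unsigned int cpu)
{
    uint64_t idle = get_cpu_idle_time(cpu);   /* ns accumulated in idle vCPU */
    s_time_t now  = NOW();                    /* ns of system time since boot */

    return (now > 0) ? (unsigned int)(idle / ((uint64_t)now / 1000 + 1)) : 0;
}
#endif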
int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || d->is_pinned )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-vcpu timers. */
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
               v, v->processor);
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
               v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn,
               v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        per_cpu(schedule_data, v->processor).idle = v;
        v->is_running = 1;
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}

void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->periodic_timer);
    kill_timer(&v->singleshot_timer);
    kill_timer(&v->poll_timer);
    SCHED_OP(destroy_vcpu, v);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(destroy_domain, d);
}
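/*
 * Note: vcpu_sleep_nosync() only marks the VCPU as asleep and notifies the
 * scheduler; vcpu_sleep_sync() below additionally waits until the VCPU has
 * actually stopped running on its CPU and its execution state has been
 * synchronised.
 */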
void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && v->is_running )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

void vcpu_unblock(struct vcpu *v)
{
    if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
        return;

    /* Polling period ends when a VCPU is unblocked. */
    if ( unlikely(v->poll_evtchn != 0) )
    {
        v->poll_evtchn = 0;
        /*
         * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
         * this VCPU (and it then going back to sleep on poll_mask).
         * Test-and-clear is idiomatic and ensures clear_bit not reordered.
         */
        if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
            clear_bit(_VPF_blocked, &v->pause_flags);
    }

    vcpu_wake(v);
}

static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    /*
     * NB. Check of v->running happens /after/ setting migration flag
     * because they both happen in (different) spinlock regions, and those
     * regions are strictly serialised.
     */
    if ( v->is_running ||
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Switch to new CPU, then unlock old CPU. */
    old_cpu = v->processor;
    v->processor = SCHED_OP(pick_cpu, v);
    spin_unlock_irqrestore(
        &per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    /* Wake on new CPU. */
    vcpu_wake(v);
}

/*
 * Force a VCPU through a deschedule/reschedule path.
 * For example, using this when setting the periodic timer period means that
 * most periodic-timer state need only be touched from within the scheduler
 * which can thus be done without need for synchronisation.
 */
void vcpu_force_reschedule(struct vcpu *v)
{
    vcpu_schedule_lock_irq(v);
    if ( v->is_running )
        set_bit(_VPF_migrating, &v->pause_flags);
    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }
}
/*
 * This function is used by cpu_hotplug code from stop_machine context.
 * Hence we can avoid needing to take the schedule_lock.
 */
void cpu_disable_scheduler(void)
{
    struct domain *d;
    struct vcpu *v;
    unsigned int cpu = smp_processor_id();

    for_each_domain ( d )
    {
        for_each_vcpu ( d, v )
        {
            if ( is_idle_vcpu(v) )
                continue;

            if ( (cpus_weight(v->cpu_affinity) == 1) &&
                 cpu_isset(cpu, v->cpu_affinity) )
            {
                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                       v->domain->domain_id, v->vcpu_id);
                cpus_setall(v->cpu_affinity);
            }

            /*
             * Migrate single-shot timers to CPU0. A new cpu will automatically
             * be chosen when the timer is next re-set.
             */
            if ( v->singleshot_timer.cpu == cpu )
                migrate_timer(&v->singleshot_timer, 0);

            if ( v->processor == cpu )
            {
                set_bit(_VPF_migrating, &v->pause_flags);
                vcpu_sleep_nosync(v);
                vcpu_migrate(v);
            }
        }
    }
}

static int __vcpu_set_affinity(
    struct vcpu *v, cpumask_t *affinity,
    bool_t old_lock_status, bool_t new_lock_status)
{
    cpumask_t online_affinity, old_affinity;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irq(v);

    if ( v->affinity_locked != old_lock_status )
    {
        BUG_ON(!v->affinity_locked);
        vcpu_schedule_unlock_irq(v);
        return -EBUSY;
    }

    v->affinity_locked = new_lock_status;

    old_affinity = v->cpu_affinity;
    v->cpu_affinity = *affinity;
    *affinity = old_affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VPF_migrating, &v->pause_flags);

    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}
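/*
 * The (old_lock_status, new_lock_status) pairs passed to __vcpu_set_affinity()
 * by the wrappers below encode the affinity-lock protocol:
 *   vcpu_set_affinity()            (0,0) - ordinary update, affinity unlocked
 *   vcpu_lock_affinity()           (0,1) - take the lock while updating
 *   vcpu_locked_change_affinity()  (1,1) - update while already locked
 *   vcpu_unlock_affinity()         (1,0) - final update, dropping the lock
 */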
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    if ( v->domain->is_pinned )
        return -EINVAL;
    return __vcpu_set_affinity(v, affinity, 0, 0);
}

int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
{
    return __vcpu_set_affinity(v, affinity, 0, 1);
}

int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
{
    return __vcpu_set_affinity(v, affinity, 1, 1);
}

void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;

    /* Do not fail if no CPU in old affinity mask is online. */
    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        *affinity = cpu_online_map;

    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
        BUG();
}

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VPF_blocked, &v->pause_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VPF_blocked, &v->pause_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

    return 0;
}
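/*
 * SCHEDOP_poll: block the calling VCPU until one of the guest-supplied event
 * channel ports is pending, or until the (optional, absolute) timeout expires.
 * v->poll_evtchn is 0 when not polling, -1 while polling a set of ports, and
 * holds the port number when exactly one port is being polled.
 */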
static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    struct domain *d = v->domain;
    evtchn_port_t  port;
    long           rc;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    set_bit(_VPF_blocked, &v->pause_flags);
    v->poll_evtchn = -1;
    set_bit(v->vcpu_id, d->poll_mask);

#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    smp_mb();

    /*
     * Someone may have seen we are blocked but not that we are polling, or
     * vice versa. We are certainly being woken, so clean up and bail. Beyond
     * this point others can be guaranteed to clean up for us if they wake us.
     */
    rc = 0;
    if ( (v->poll_evtchn == 0) ||
         !test_bit(_VPF_blocked, &v->pause_flags) ||
         !test_bit(v->vcpu_id, d->poll_mask) )
        goto out;
#endif

    rc = 0;
    if ( local_events_need_delivery() )
        goto out;

    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
            goto out;
    }

    if ( sched_poll->nr_ports == 1 )
        v->poll_evtchn = port;

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);

    return 0;

 out:
    v->poll_evtchn = 0;
    clear_bit(v->vcpu_id, d->poll_mask);
    clear_bit(_VPF_blocked, &v->pause_flags);
    return rc;
}
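#if 0
/*
 * Illustrative guest-side sketch only -- not part of this file.  It shows one
 * way a guest might invoke SCHEDOP_poll for a single event channel, assuming
 * the public sched.h definitions and a guest-provided HYPERVISOR_sched_op()
 * hypercall wrapper.  'timeout' is an absolute system time in nanoseconds
 * (0 means wait indefinitely), matching how do_poll() above programs the
 * poll timer.
 */
static int poll_single_port(evtchn_port_t port, uint64_t timeout)
{
    struct sched_poll poll;

    set_xen_guest_handle(poll.ports, &port);  /* array of exactly one port */
    poll.nr_ports = 1;
    poll.timeout  = timeout;

    return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
}
#endif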
/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);
    return 0;
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

typedef long ret_t;

#endif /* !COMPAT */

ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        if ( !IS_PRIV_FOR(current->domain, d) )
        {
            rcu_unlock_domain(d);
            return -EPERM;
        }

        ret = xsm_schedop_shutdown(current->domain, d);
        if ( ret )
        {
            rcu_unlock_domain(d);
            return ret;
        }

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);

        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
#ifndef COMPAT

/* Per-vcpu oneshot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->singleshot_timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middleground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO,
                 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
                 v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
    }
    else
    {
        if ( v->singleshot_timer.cpu != smp_processor_id() )
        {
            stop_timer(&v->singleshot_timer);
            v->singleshot_timer.cpu = smp_processor_id();
        }

        set_timer(&v->singleshot_timer, timeout);
    }

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;
    long ret;

    if ( (op->sched_id != ops.sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return ret;
}
static void vcpu_periodic_timer_work(struct vcpu *v)
{
    s_time_t now = NOW();
    uint64_t periodic_next_event;

    ASSERT(!active_timer(&v->periodic_timer));

    if ( v->periodic_period == 0 )
        return;

    periodic_next_event = v->periodic_last_event + v->periodic_period;

    /* The timer subsystem may call us up to TIME_SLOP ahead of deadline. */
    if ( (now + TIME_SLOP) > periodic_next_event )
    {
        send_timer_event(v);
        v->periodic_last_event = now;
        periodic_next_event = now + v->periodic_period;
    }

    v->periodic_timer.cpu = smp_processor_id();
    set_timer(&v->periodic_timer, periodic_next_event);
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void schedule(void)
{
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct schedule_data *sd;
    struct task_slice     next_slice;
    s32                   r_time;     /* time for new dom to run */

    ASSERT(!in_irq());
    ASSERT(this_cpu(mc_state).flags == 0);

    perfc_incr(sched_run);

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    sd->curr = next;

    set_timer(&sd->s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&sd->schedule_lock);
        trace_continue_running(next);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);
    prev->last_run_time = now;

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incr(sched_ctx);

    stop_timer(&prev->periodic_timer);

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    vcpu_periodic_timer_work(next);

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}
void context_saved(struct vcpu *prev)
{
    /* Clear running flag /after/ writing context to memory. */
    smp_wmb();

    prev->is_running = 0;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
        vcpu_migrate(prev);
}

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incr(sched_irq);
}

/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
static void vcpu_periodic_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_periodic_timer_work(v);
}

/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
static void vcpu_singleshot_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;

    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
        vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for_each_cpu ( i )
    {
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}

void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
    }

    local_irq_restore(flags);
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */