xen/common/schedule.c @ 19835:edfdeb150f27 (xen-unstable)

/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/multicall.h>
#include <public/sched.h>
#include <xsm/xsm.h>

/* opt_sched: scheduler - default to credit */
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);

/*
 * If sched_smt_power_savings is set, the scheduler will give preference to a
 * partially idle package over a fully idle package when picking a pCPU on
 * which to schedule a vCPU.
 */
int sched_smt_power_savings = 0;
boolean_param("sched_smt_power_savings", sched_smt_power_savings);

#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void vcpu_periodic_timer_fn(void *data);
static void vcpu_singleshot_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);

extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
    (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )           \
     : (typeof(ops.fn(__VA_ARGS__)))0 )
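
/*
 * Illustrative sketch (not part of the original file): all callers below go
 * through SCHED_OP() so that a scheduler may leave any hook NULL.  For
 * example, SCHED_OP(init_vcpu, v) expands roughly to
 *
 *     ((ops.init_vcpu != NULL) ? ops.init_vcpu(v)
 *                              : (typeof(ops.init_vcpu(v)))0)
 *
 * i.e. a missing hook behaves as a call returning 0 (or a no-op for hooks
 * returning void), so a scheduler implementation only needs to fill in the
 * hooks it actually cares about.
 */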

static inline void trace_runstate_change(struct vcpu *v, int new_state)
{
    struct { uint32_t vcpu:16, domain:16; } d;
    uint32_t event;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    event = TRC_SCHED_RUNSTATE_CHANGE;
    event |= ( v->runstate.state & 0x3 ) << 8;
    event |= ( new_state & 0x3 ) << 4;

    __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
}

static inline void trace_continue_running(struct vcpu *v)
{
    struct { uint32_t vcpu:16, domain:16; } d;

    if ( likely(!tb_init_done) )
        return;

    d.vcpu = v->vcpu_id;
    d.domain = v->domain->domain_id;

    __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d),
                (unsigned char *)&d);
}

static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    s_time_t delta;

    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));

    trace_runstate_change(v, new_state);

    delta = new_entry_time - v->runstate.state_entry_time;
    if ( delta > 0 )
    {
        v->runstate.time[v->runstate.state] += delta;
        v->runstate.state_entry_time = new_entry_time;
    }

    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    s_time_t delta;

    if ( unlikely(v != current) )
        vcpu_schedule_lock_irq(v);

    memcpy(runstate, &v->runstate, sizeof(*runstate));
    delta = NOW() - runstate->state_entry_time;
    if ( delta > 0 )
        runstate->time[runstate->state] += delta;

    if ( unlikely(v != current) )
        vcpu_schedule_unlock_irq(v);
}

uint64_t get_cpu_idle_time(unsigned int cpu)
{
    struct vcpu_runstate_info state;
    struct vcpu *v;

    if ( (v = idle_vcpu[cpu]) == NULL )
        return 0;

    vcpu_runstate_get(v, &state);
    return state.time[RUNSTATE_running];
}
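
/*
 * Usage sketch (illustrative only, not part of the original file): the value
 * returned above is the cumulative time, in nanoseconds, that the idle VCPU
 * has spent in RUNSTATE_running on the given CPU.  A consumer can sample it
 * twice and derive an idle percentage for the interval, e.g.:
 *
 *     uint64_t idle0 = get_cpu_idle_time(cpu);
 *     s_time_t t0    = NOW();
 *     ... some interval passes ...
 *     uint64_t idle1 = get_cpu_idle_time(cpu);
 *     s_time_t t1    = NOW();
 *     unsigned int idle_pct = (unsigned int)((idle1 - idle0) * 100 / (t1 - t0));
 */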

int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || d->is_pinned )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-vcpu timers. */
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
               v, v->processor);
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
               v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn,
               v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        per_cpu(schedule_data, v->processor).idle = v;
        v->is_running = 1;
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}

void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->periodic_timer);
    kill_timer(&v->singleshot_timer);
    kill_timer(&v->poll_timer);
    SCHED_OP(destroy_vcpu, v);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(destroy_domain, d);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && v->is_running )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

void vcpu_unblock(struct vcpu *v)
{
    if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
        return;

    /* Polling period ends when a VCPU is unblocked. */
    if ( unlikely(v->poll_evtchn != 0) )
    {
        v->poll_evtchn = 0;
        /*
         * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
         * this VCPU (and it then going back to sleep on poll_mask).
         * Test-and-clear is idiomatic and ensures clear_bit not reordered.
         */
        if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
            clear_bit(_VPF_blocked, &v->pause_flags);
    }

    vcpu_wake(v);
}

static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    /*
     * NB. Check of v->running happens /after/ setting migration flag
     * because they both happen in (different) spinlock regions, and those
     * regions are strictly serialised.
     */
    if ( v->is_running ||
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Switch to new CPU, then unlock old CPU. */
    old_cpu = v->processor;
    v->processor = SCHED_OP(pick_cpu, v);
    spin_unlock_irqrestore(
        &per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    /* Wake on new CPU. */
    vcpu_wake(v);
}

/*
 * Force a VCPU through a deschedule/reschedule path.
 * For example, using this when setting the periodic timer period means that
 * most periodic-timer state need only be touched from within the scheduler
 * which can thus be done without need for synchronisation.
 */
void vcpu_force_reschedule(struct vcpu *v)
{
    vcpu_schedule_lock_irq(v);
    if ( v->is_running )
        set_bit(_VPF_migrating, &v->pause_flags);
    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }
}

/*
 * This function is used by cpu_hotplug code from stop_machine context.
 * Hence we can avoid needing to take the
 */
void cpu_disable_scheduler(void)
{
    struct domain *d;
    struct vcpu *v;
    unsigned int cpu = smp_processor_id();

    for_each_domain ( d )
    {
        for_each_vcpu ( d, v )
        {
            if ( is_idle_vcpu(v) )
                continue;

            if ( (cpus_weight(v->cpu_affinity) == 1) &&
                 cpu_isset(cpu, v->cpu_affinity) )
            {
                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                       v->domain->domain_id, v->vcpu_id);
                cpus_setall(v->cpu_affinity);
            }

            /*
             * Migrate single-shot timers to CPU0. A new cpu will automatically
             * be chosen when the timer is next re-set.
             */
            if ( v->singleshot_timer.cpu == cpu )
                migrate_timer(&v->singleshot_timer, 0);

            if ( v->processor == cpu )
            {
                set_bit(_VPF_migrating, &v->pause_flags);
                vcpu_sleep_nosync(v);
                vcpu_migrate(v);
            }
        }
    }
}

static int __vcpu_set_affinity(
    struct vcpu *v, cpumask_t *affinity,
    bool_t old_lock_status, bool_t new_lock_status)
{
    cpumask_t online_affinity, old_affinity;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irq(v);

    if ( v->affinity_locked != old_lock_status )
    {
        BUG_ON(!v->affinity_locked);
        vcpu_schedule_unlock_irq(v);
        return -EBUSY;
    }

    v->affinity_locked = new_lock_status;

    old_affinity = v->cpu_affinity;
    v->cpu_affinity = *affinity;
    *affinity = old_affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VPF_migrating, &v->pause_flags);

    vcpu_schedule_unlock_irq(v);

    if ( test_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}

int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    if ( v->domain->is_pinned )
        return -EINVAL;
    return __vcpu_set_affinity(v, affinity, 0, 0);
}

int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
{
    return __vcpu_set_affinity(v, affinity, 0, 1);
}

int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
{
    return __vcpu_set_affinity(v, affinity, 1, 1);
}

void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;

    /* Do not fail if no CPU in old affinity mask is online. */
    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        *affinity = cpu_online_map;

    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
        BUG();
}
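
/*
 * Usage sketch (illustrative only; not part of the original file).  Note that
 * __vcpu_set_affinity() hands the *previous* mask back through 'affinity',
 * which is what lets vcpu_lock_affinity()/vcpu_unlock_affinity() be used as a
 * save/restore pair around code that must temporarily pin a VCPU:
 *
 *     cpumask_t mask = cpumask_of_cpu(target_cpu);   // mask to pin to
 *     if ( vcpu_lock_affinity(v, &mask) == 0 )       // 'mask' now holds the
 *     {                                              //  old affinity
 *         ... run with v restricted to target_cpu ...
 *         vcpu_unlock_affinity(v, &mask);            // restore old affinity
 *     }
 *
 * While locked, vcpu_locked_change_affinity() may be used to move the pin,
 * and plain vcpu_set_affinity() calls fail with -EBUSY.
 */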

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VPF_blocked, &v->pause_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VPF_blocked, &v->pause_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

    return 0;
}

static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    struct domain *d = v->domain;
    evtchn_port_t  port;
    long           rc;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    set_bit(_VPF_blocked, &v->pause_flags);
    v->poll_evtchn = -1;
    set_bit(v->vcpu_id, d->poll_mask);

#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    smp_mb();

    /*
     * Someone may have seen we are blocked but not that we are polling, or
     * vice versa. We are certainly being woken, so clean up and bail. Beyond
     * this point others can be guaranteed to clean up for us if they wake us.
     */
    rc = 0;
    if ( (v->poll_evtchn == 0) ||
         !test_bit(_VPF_blocked, &v->pause_flags) ||
         !test_bit(v->vcpu_id, d->poll_mask) )
        goto out;
#endif

    rc = 0;
    if ( local_events_need_delivery() )
        goto out;

    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
            goto out;
    }

    if ( sched_poll->nr_ports == 1 )
        v->poll_evtchn = port;

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);

    return 0;

 out:
    v->poll_evtchn = 0;
    clear_bit(v->vcpu_id, d->poll_mask);
    clear_bit(_VPF_blocked, &v->pause_flags);
    return rc;
}
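
/*
 * Guest-side usage sketch (illustrative only; HYPERVISOR_sched_op() is the
 * usual guest hypercall wrapper, not something defined in this file).  Using
 * the struct sched_poll layout from public/sched.h, a guest waits on a single
 * event channel roughly like this:
 *
 *     struct sched_poll poll;
 *     evtchn_port_t port = ...;           // port to wait on
 *
 *     set_xen_guest_handle(poll.ports, &port);
 *     poll.nr_ports = 1;                  // single port => poll_evtchn fast path
 *     poll.timeout  = 0;                  // 0 => no timeout; otherwise an
 *                                         //   absolute Xen system time in ns
 *     (void)HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
 */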

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);
    return 0;
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

typedef long ret_t;

#endif /* !COMPAT */

ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        if ( !IS_PRIV_FOR(current->domain, d) )
        {
            rcu_unlock_domain(d);
            return -EPERM;
        }

        ret = xsm_schedop_shutdown(current->domain, d);
        if ( ret )
        {
            rcu_unlock_domain(d);
            return ret;
        }

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);

        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
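
/*
 * Guest-side usage sketch (illustrative only; HYPERVISOR_sched_op() is the
 * usual guest hypercall wrapper).  A guest powers itself off with the
 * SCHEDOP_shutdown subcommand handled above:
 *
 *     struct sched_shutdown shutdown = { .reason = SHUTDOWN_poweroff };
 *     (void)HYPERVISOR_sched_op(SCHEDOP_shutdown, &shutdown);
 *
 * SHUTDOWN_reboot, SHUTDOWN_suspend and SHUTDOWN_crash are the other reason
 * codes defined in public/sched.h.
 */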

#ifndef COMPAT

/* Per-vcpu oneshot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->singleshot_timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middleground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO,
                 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
                 v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
    }
    else
    {
        if ( v->singleshot_timer.cpu != smp_processor_id() )
        {
            stop_timer(&v->singleshot_timer);
            v->singleshot_timer.cpu = smp_processor_id();
        }

        set_timer(&v->singleshot_timer, timeout);
    }

    return 0;
}
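
/*
 * Worked check of the threshold above (illustrative note, not code from the
 * original file): the "huge timeout" test fires when the positive relative
 * offset has any bit at position 50 or above set, i.e. offset >= 2^50 ns.
 *
 *     2^50 ns = 1,125,899,906,842,624 ns
 *             ~ 1,125,900 s
 *             ~ 13.03 days
 *
 * which matches the "about 13 days" figure quoted in the comment.
 */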

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;
    long ret;

    if ( (op->sched_id != ops.sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    if ( (ret = SCHED_OP(adjust, d, op)) == 0 )
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return ret;
}
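
/*
 * Caller-side sketch (illustrative only; field and constant names are as
 * declared in public/domctl.h of this era and should be treated as a hedged
 * sketch rather than a definitive reference).  The toolstack reaches
 * sched_adjust() via the XEN_DOMCTL_scheduler_op domctl; for the credit
 * scheduler the payload looks roughly like:
 *
 *     struct xen_domctl_scheduler_op op = {
 *         .sched_id        = XEN_SCHEDULER_CREDIT,
 *         .cmd             = XEN_DOMCTL_SCHEDOP_putinfo,
 *         .u.credit.weight = 512,   // relative share vs. other domains
 *         .u.credit.cap    = 0,     // 0 => no upper bound on CPU usage
 *     };
 *
 * sched_adjust() validates sched_id/cmd and forwards the op to the active
 * scheduler via SCHED_OP(adjust, d, op).
 */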

static void vcpu_periodic_timer_work(struct vcpu *v)
{
    s_time_t now = NOW();
    uint64_t periodic_next_event;

    ASSERT(!active_timer(&v->periodic_timer));

    if ( v->periodic_period == 0 )
        return;

    periodic_next_event = v->periodic_last_event + v->periodic_period;

    /* The timer subsystem may call us up to TIME_SLOP ahead of deadline. */
    if ( (now + TIME_SLOP) > periodic_next_event )
    {
        send_timer_event(v);
        v->periodic_last_event = now;
        periodic_next_event = now + v->periodic_period;
    }

    v->periodic_timer.cpu = smp_processor_id();
    set_timer(&v->periodic_timer, periodic_next_event);
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void schedule(void)
{
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct schedule_data *sd;
    struct task_slice     next_slice;

    ASSERT(!in_irq());
    ASSERT(this_cpu(mc_state).flags == 0);

    perfc_incr(sched_run);

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    next = next_slice.task;

    sd->curr = next;

    if ( next_slice.time >= 0 ) /* -ve means no limit */
        set_timer(&sd->s_timer, now + next_slice.time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&sd->schedule_lock);
        trace_continue_running(next);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             next_slice.time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);
    prev->last_run_time = now;

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incr(sched_ctx);

    stop_timer(&prev->periodic_timer);

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    vcpu_periodic_timer_work(next);

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}

void context_saved(struct vcpu *prev)
{
    /* Clear running flag /after/ writing context to memory. */
    smp_wmb();

    prev->is_running = 0;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
        vcpu_migrate(prev);
}

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incr(sched_irq);
}

/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
static void vcpu_periodic_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_periodic_timer_work(v);
}

/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
static void vcpu_singleshot_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;

    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
        vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for_each_cpu ( i )
    {
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}
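
/*
 * Selection sketch (illustrative note, not part of the original file): the
 * scheduler chosen above is driven by the "sched=" boot parameter declared
 * near the top of this file, e.g. on the Xen command line:
 *
 *     sched=credit    (the default)
 *     sched=sedf
 *
 * If the name matches no entry in schedulers[], the loop runs off the end of
 * the list, the "Could not find scheduler" message is printed, and ops is
 * left holding the last scheduler tried (sched_credit_def here), which is
 * what the subsequent "Using scheduler:" line then reports.
 */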

void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("sched_smt_power_savings: %s\n",
           sched_smt_power_savings? "enabled":"disabled");
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
    }

    local_irq_restore(flags);
}

void sched_tick_suspend(void)
{
    SCHED_OP(tick_suspend);
}

void sched_tick_resume(void)
{
    SCHED_OP(tick_resume);
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */