ia64/xen-unstable: xen/common/schedule.c @ 14196:9d36026b1b43

xen: Cleanups and bug fixes after the rcu_lock_domain patch.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kfraser@localhost.localdomain
date     Thu Mar 01 11:38:55 2007 +0000
parents  271ffb1c12eb
children 215b799fa181

/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/multicall.h>
#include <public/sched.h>

/* opt_sched: scheduler - default to credit */
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);

/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
static unsigned int opt_dom0_vcpus_pin;
boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);

#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void vcpu_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);

extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
    (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )           \
      : (typeof(ops.fn(__VA_ARGS__)))0 )

/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static DEFINE_PER_CPU(struct timer, t_timer);
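
/*
 * Charge the time accrued in the outgoing runstate to that state and record
 * the new state and its entry time. The caller must hold the VCPU's
 * schedule lock.
 */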
static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));

    v->runstate.time[v->runstate.state] +=
        new_entry_time - v->runstate.state_entry_time;
    v->runstate.state_entry_time = new_entry_time;
    v->runstate.state = new_state;
}
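
/*
 * Snapshot a VCPU's runstate info, folding in the time accrued in its
 * current state. Reading the current VCPU needs no lock; any other VCPU is
 * read under its schedule lock.
 */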
void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    if ( likely(v == current) )
    {
        /* Fast lock-free path. */
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        ASSERT(runstate->state == RUNSTATE_running);
        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
    }
    else
    {
        vcpu_schedule_lock_irq(v);
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
        vcpu_schedule_unlock_irq(v);
    }
}
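
/*
 * Prepare a new VCPU for scheduling: choose its processor and affinity,
 * initialise its per-VCPU timers, and hand it to the scheduler's init_vcpu
 * hook. Idle VCPUs are marked as already running on their CPU.
 */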
int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-VCPU timers. */
    init_timer(&v->timer, vcpu_timer_fn, v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn, v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        per_cpu(schedule_data, v->processor).idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}
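
/* Tear down a VCPU's timers and its scheduler-private state. */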
void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->timer);
    kill_timer(&v->poll_timer);
    SCHED_OP(destroy_vcpu, v);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(destroy_domain, d);
}
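
/*
 * Put a VCPU to sleep if it is no longer runnable. Does not wait for the
 * VCPU to actually stop running on its current CPU.
 */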
void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}
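
/*
 * As vcpu_sleep_nosync(), but additionally spin until the VCPU has been
 * descheduled and its execution state synchronised.
 */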
void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && test_bit(_VCPUF_running, &v->vcpu_flags) )
        cpu_relax();

    sync_vcpu_execstate(v);
}
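
/*
 * Wake a VCPU: if it is runnable, account it as runnable and notify the
 * scheduler; a non-runnable, non-blocked VCPU is accounted as offline.
 */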
void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}
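
/*
 * Move a VCPU flagged for migration to the CPU chosen by the scheduler's
 * pick_cpu hook, then wake it there. Bails out if the VCPU is still running
 * or is no longer flagged as migrating.
 */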
static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( test_bit(_VCPUF_running, &v->vcpu_flags) ||
         !test_and_clear_bit(_VCPUF_migrating, &v->vcpu_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Switch to new CPU, then unlock old CPU. */
    old_cpu = v->processor;
    v->processor = SCHED_OP(pick_cpu, v);
    spin_unlock_irqrestore(
        &per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    /* Wake on new CPU. */
    vcpu_wake(v);
}
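
/*
 * Update a VCPU's CPU affinity. If its current processor falls outside the
 * new mask, the VCPU is put to sleep and migrated. Not permitted for dom0
 * VCPUs when dom0_vcpus_pin is in effect.
 */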
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;
    unsigned long flags;

    if ( (v->domain->domain_id == 0) && opt_dom0_vcpus_pin )
        return -EINVAL;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irqsave(v, flags);

    v->cpu_affinity = *affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VCPUF_migrating, &v->vcpu_flags);

    vcpu_schedule_unlock_irqrestore(v, flags);

    if ( test_bit(_VCPUF_migrating, &v->vcpu_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

    return 0;
}
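
/*
 * SCHEDOP_poll: block until one of the given event-channel ports is pending,
 * or until the optional timeout expires. Returns immediately if any port is
 * already pending.
 */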
static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    struct domain *d = v->domain;
    evtchn_port_t  port;
    long           rc = 0;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    /* These operations must occur in order. */
    set_bit(_VCPUF_blocked, &v->vcpu_flags);
    set_bit(_VCPUF_polling, &v->vcpu_flags);
    set_bit(_DOMF_polling, &d->domain_flags);

    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, shared_info_addr(d, evtchn_pending)) )
            goto out;
    }

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);

    return 0;

 out:
    clear_bit(_VCPUF_polling, &v->vcpu_flags);
    clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    return rc;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    raise_softirq(SCHEDULE_SOFTIRQ);
    return 0;
}
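
/* Legacy scheduler hypercall: command plus a single immediate argument. */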
long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

typedef long ret_t;

#endif /* !COMPAT */
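
/*
 * Main scheduler hypercall entry point: the argument is a guest handle to a
 * command-specific structure. This function sits outside the !COMPAT region
 * so that compat/schedule.c can re-include it for 32-bit compat guests.
 */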
ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        if ( !IS_PRIV(current->domain) )
            return -EPERM;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);
        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

#ifndef COMPAT

/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middleground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO, "Warning: huge timeout set by domain %d (vcpu %d):"
                 " %"PRIx64"\n",
                 v->domain->domain_id, v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->timer, NOW() + MILLISECS(100));
    }
    else
    {
        set_timer(&v->timer, timeout);
    }

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;

    if ( (op->sched_id != ops.sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    SCHED_OP(adjust, d, op);
    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void schedule(void)
{
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct schedule_data *sd;
    struct task_slice     next_slice;
    s32                   r_time;     /* time for new dom to run */

    ASSERT(!in_irq());
    ASSERT(this_cpu(mc_state).flags == 0);

    perfc_incrc(sched_run);

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    sd->curr = next;

    set_timer(&sd->s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&sd->schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
    set_bit(_VCPUF_running, &next->vcpu_flags);

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incrc(sched_ctx);

    prev->sleep_tick = sd->tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_vcpu(next) )
    {
        update_vcpu_system_time(next);
        if ( next->sleep_tick != sd->tick )
            send_timer_event(next);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}
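
/*
 * Called once context_switch() has saved the previous VCPU's state: clears
 * its running flag and completes any pending migration.
 */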
void context_saved(struct vcpu *prev)
{
    clear_bit(_VCPUF_running, &prev->vcpu_flags);

    if ( unlikely(test_bit(_VCPUF_migrating, &prev->vcpu_flags)) )
        vcpu_migrate(prev);
}

/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - v->timer: per VCPU one-shot timer to deliver guest-specified timeouts
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu *v = current;

    this_cpu(schedule_data).tick++;

    if ( !is_idle_vcpu(v) )
    {
        update_vcpu_system_time(v);
        send_timer_event(v);
    }

    page_scrub_schedule_work();

    SCHED_OP(tick, smp_processor_id());

    set_timer(&this_cpu(t_timer), NOW() + MILLISECS(10));
}

/* Per-VCPU timer function: sends a virtual timer interrupt. */
static void vcpu_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;
    if ( test_and_clear_bit(_VCPUF_polling, &v->vcpu_flags) )
        vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for_each_cpu ( i )
    {
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
        init_timer(&per_cpu(t_timer, i), t_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}
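
/* Keyhandler: dump scheduler settings and per-CPU state for all online CPUs. */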
void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
    }

    local_irq_restore(flags);
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */