ia64/xen-unstable

xen/common/schedule.c @ 10794:79f4c91c5628

Fix Linux so that it does not set a timeout if there are no pending
timers. Fix Xen so that it does not immediately fire a timer event if
it sees a very long timeout -- sometimes this means that there are
no pending timers.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Jul 25 17:01:49 2006 +0100 (2006-07-25)
parents 3dfc2583a4f1
children 5e8c254c9dcd
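
The hypervisor half of this change is the 100ms clamp in do_set_timer_op() below; the guest half is simply to stop arming the one-shot timer when nothing is pending. As a rough illustration only (not the actual Linux patch), a guest-side reprogramming path might look like the sketch below, assuming the HYPERVISOR_set_timer_op() hypercall wrapper used by paravirtualized Linux guests; the helper name and parameters are made up for illustration.

/*
 * Illustrative sketch only -- not code from this changeset or from Linux.
 * A guest with no pending timers cancels the Xen one-shot timer by passing
 * a zero timeout (handled by the "timeout == 0" case in do_set_timer_op()
 * below) instead of passing a far-future deadline.
 */
static void guest_reprogram_timer(int have_pending_timers, uint64_t deadline_ns)
{
    if ( !have_pending_timers )
        HYPERVISOR_set_timer_op(0);           /* cancel: no event needed */
    else
        HYPERVISOR_set_timer_op(deadline_ns); /* one-shot event at this absolute system time */
}

With that guest behaviour in place, the 100ms clamp below only matters for older guests that still pass huge timeouts when they have no pending timers.
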
/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/guest_access.h>
#include <public/sched.h>
#include <public/sched_ctl.h>

extern void arch_getdomaininfo_ctxt(struct vcpu *,
                                    struct vcpu_guest_context *);
/* opt_sched: scheduler - default to credit */
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);

#define TIME_SLOP (s32)MICROSECS(50)    /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void vcpu_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
struct schedule_data schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

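/*
 * Invoke the active scheduler's hook if it is implemented; otherwise
 * evaluate to a zero of the hook's return type.
 */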
#define SCHED_OP(fn, ...)                                 \
    (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )           \
      : (typeof(ops.fn(__VA_ARGS__)))0 )

/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct timer t_timer[NR_CPUS];

static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&schedule_data[v->processor].schedule_lock));

    v->runstate.time[v->runstate.state] +=
        new_entry_time - v->runstate.state_entry_time;
    v->runstate.state_entry_time = new_entry_time;
    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    if ( likely(v == current) )
    {
        /* Fast lock-free path. */
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        ASSERT(runstate->state == RUNSTATE_running);
        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
    }
    else
    {
        vcpu_schedule_lock_irq(v);
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
        vcpu_schedule_unlock_irq(v);
    }
}

int sched_init_vcpu(struct vcpu *v)
{
    /* Initialise the per-domain timers. */
    init_timer(&v->timer, vcpu_timer_fn, v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn, v, v->processor);

    if ( is_idle_vcpu(v) )
    {
        schedule_data[v->processor].curr = v;
        schedule_data[v->processor].idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}

void sched_destroy_domain(struct domain *d)
{
    struct vcpu *v;

    for_each_vcpu ( d, v )
    {
        kill_timer(&v->timer);
        kill_timer(&v->poll_timer);
        TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id);
    }

    SCHED_OP(destroy_domain, d);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && test_bit(_VCPUF_running, &v->vcpu_flags) )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    return SCHED_OP(set_affinity, v, affinity);
}

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}

static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu *v = current;
    evtchn_port_t port;
    long rc = 0;
    unsigned int i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    /* These operations must occur in order. */
    set_bit(_VCPUF_blocked, &v->vcpu_flags);
    set_bit(_VCPUF_polling, &v->vcpu_flags);
    set_bit(_DOMF_polling, &v->domain->domain_flags);

    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS )
            goto out;

        rc = 0;
        if ( test_bit(port, v->domain->shared_info->evtchn_pending) )
            goto out;
    }

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
    __enter_scheduler();

    stop_timer(&v->poll_timer);

 out:
    clear_bit(_VCPUF_polling, &v->vcpu_flags);
    clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    return rc;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

long do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        if ( !IS_PRIV(current->domain) )
            return -EPERM;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = find_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);
        put_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middleground of triggering a timer event in 100ms.
         */
        DPRINTK("Warning: huge timeout set by domain %d (vcpu %d):"
                " %"PRIx64"\n",
                v->domain->domain_id, v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->timer, NOW() + MILLISECS(100));
    }
    else
    {
        set_timer(&v->timer, timeout);
    }

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}


/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;
    struct vcpu *v;

    if ( (cmd->sched_id != ops.sched_id) ||
         ((cmd->direction != SCHED_INFO_PUT) &&
          (cmd->direction != SCHED_INFO_GET)) )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    SCHED_OP(adjdom, d, cmd);
    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    put_domain(d);

    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct vcpu *prev = current, *next = NULL;
    int cpu = smp_processor_id();
    s_time_t now = NOW();
    struct task_slice next_slice;
    s32 r_time;     /* time for new dom to run */

    ASSERT(!in_irq());

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    stop_timer(&schedule_data[cpu].s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    set_timer(&schedule_data[cpu].s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
    set_bit(_VCPUF_running, &next->vcpu_flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    perfc_incrc(sched_ctx);

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_vcpu(next) )
    {
        update_vcpu_system_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_timer_event(next);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}


/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu *v = current;
    unsigned int cpu = smp_processor_id();

    schedule_data[cpu].tick++;

    if ( !is_idle_vcpu(v) )
    {
        update_vcpu_system_time(v);
        send_timer_event(v);
    }

    page_scrub_schedule_work();

    SCHED_OP(tick, cpu);

    set_timer(&t_timer[cpu], NOW() + MILLISECS(10));
}

/* Per-VCPU timer function: sends a virtual timer interrupt. */
static void vcpu_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;
    if ( test_and_clear_bit(_VCPUF_polling, &v->vcpu_flags) )
        vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        init_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i);
        init_timer(&t_timer[i], t_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}

void dump_runq(unsigned char key)
{
    s_time_t now = NOW();
    int i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state,i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */