ia64/xen-unstable

xen/common/schedule.c @ 9776:72f9c751d3ea

Replace &foo[0] with foo where the latter seems cleaner
(which is usually, and particularly when it's an argument
to one of the bitops functions).

Signed-off-by: Keir Fraser <keir@xensource.com>
author   kaf24@firebug.cl.cam.ac.uk
date     Wed Apr 19 18:32:20 2006 +0100
parents  b128f55ca05c
children 3145b215598c
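
As a quick illustration of the pattern this changeset describes (a minimal
sketch; the array name and bit index below are made up, not taken from the
actual diff): for an array, &foo[0] and foo evaluate to the same pointer, so
the shorter spelling can be handed straight to the bitops helpers.

    /* hypothetical bitmap, two machine words long */
    unsigned long foo[2];

    set_bit(0, &foo[0]);   /* before: explicit address of the first element */
    set_bit(0, foo);       /* after: the array decays to the same address   */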

/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/guest_access.h>
#include <public/sched.h>
#include <public/sched_ctl.h>

extern void arch_getdomaininfo_ctxt(struct vcpu *,
                                    struct vcpu_guest_context *);
/* opt_sched: scheduler - default to SEDF */
static char opt_sched[10] = "sedf";
string_param("sched", opt_sched);

#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void dom_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
struct schedule_data schedule_data[NR_CPUS];

extern struct scheduler sched_bvt_def;
extern struct scheduler sched_sedf_def;
static struct scheduler *schedulers[] = {
    &sched_bvt_def,
    &sched_sedf_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

/* Invoke the chosen scheduler's hook if it is implemented; otherwise
 * evaluate to zero of the hook's return type. */
#define SCHED_OP(fn, ...)                                 \
         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
          : (typeof(ops.fn(__VA_ARGS__)))0 )

/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static struct timer t_timer[NR_CPUS];

static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&schedule_data[v->processor].schedule_lock));

    v->runstate.time[v->runstate.state] +=
        new_entry_time - v->runstate.state_entry_time;
    v->runstate.state_entry_time = new_entry_time;
    v->runstate.state = new_state;
}

void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    if ( likely(v == current) )
    {
        /* Fast lock-free path. */
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        ASSERT(runstate->state == RUNSTATE_running);
        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
    }
    else
    {
        vcpu_schedule_lock_irq(v);
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
        vcpu_schedule_unlock_irq(v);
    }
}

struct domain *alloc_domain(void)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) != NULL )
        memset(d, 0, sizeof(*d));

    return d;
}

void free_domain(struct domain *d)
{
    struct vcpu *v;
    int i;

    for_each_vcpu ( d, v )
        sched_rem_domain(v);

    SCHED_OP(free_task, d);

    for ( i = MAX_VIRT_CPUS-1; i >= 0; i-- )
        if ( (v = d->vcpu[i]) != NULL )
            free_vcpu_struct(v);

    xfree(d);
}

struct vcpu *alloc_vcpu(
    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
{
    struct vcpu *v;

    BUG_ON(d->vcpu[vcpu_id] != NULL);

    if ( (v = alloc_vcpu_struct(d, vcpu_id)) == NULL )
        return NULL;

    v->domain = d;
    v->vcpu_id = vcpu_id;
    v->processor = cpu_id;
    atomic_set(&v->pausecnt, 0);
    v->vcpu_info = &d->shared_info->vcpu_info[vcpu_id];

    v->cpu_affinity = is_idle_domain(d) ?
        cpumask_of_cpu(cpu_id) : CPU_MASK_ALL;

    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
    v->runstate.state_entry_time = NOW();

    if ( (vcpu_id != 0) && !is_idle_domain(d) )
        set_bit(_VCPUF_down, &v->vcpu_flags);

    if ( SCHED_OP(alloc_task, v) < 0 )
    {
        free_vcpu_struct(v);
        return NULL;
    }

    d->vcpu[vcpu_id] = v;
    if ( vcpu_id != 0 )
        d->vcpu[v->vcpu_id-1]->next_in_list = v;

    sched_add_domain(v);

    return v;
}

void sched_add_domain(struct vcpu *v)
{
    /* Initialise the per-domain timers. */
    init_timer(&v->timer, dom_timer_fn, v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn, v, v->processor);

    if ( is_idle_vcpu(v) )
    {
        schedule_data[v->processor].curr = v;
        schedule_data[v->processor].idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }

    SCHED_OP(add_task, v);
    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
}

void sched_rem_domain(struct vcpu *v)
{
    kill_timer(&v->timer);
    kill_timer(&v->poll_timer);

    SCHED_OP(rem_task, v);
    TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && test_bit(_VCPUF_running, &v->vcpu_flags) )
        cpu_relax();

    sync_vcpu_execstate(v);
}

void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    return SCHED_OP(set_affinity, v, affinity);
}

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    v->vcpu_info->evtchn_upcall_mask = 0;
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( event_pending(v) )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}

static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu  *v = current;
    evtchn_port_t port;
    long          rc = 0;
    unsigned int  i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    /* Ensure that upcalls are disabled: tested by evtchn_set_pending(). */
    if ( !v->vcpu_info->evtchn_upcall_mask )
        return -EINVAL;

    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS )
            goto out;

        rc = 0;
        if ( evtchn_pending(v->domain, port) )
            goto out;
    }

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
    __enter_scheduler();

    stop_timer(&v->poll_timer);

 out:
    clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    return rc;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

long do_sched_op(int cmd, GUEST_HANDLE(void) arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        if ( !IS_PRIV(current->domain) )
            return -EPERM;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = find_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);
        put_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;

    if ( timeout == 0 )
        stop_timer(&v->timer);
    else
        set_timer(&v->timer, timeout);

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

long sched_ctl(struct sched_ctl_cmd *cmd)
{
    if ( cmd->sched_id != ops.sched_id )
        return -EINVAL;

    SCHED_OP(control, cmd);
    TRACE_0D(TRC_SCHED_CTL);
    return 0;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
    struct domain *d;
    struct vcpu *v;

    if ( (cmd->sched_id != ops.sched_id) ||
         ((cmd->direction != SCHED_INFO_PUT) &&
          (cmd->direction != SCHED_INFO_GET)) )
        return -EINVAL;

    d = find_domain_by_id(cmd->domain);
    if ( d == NULL )
        return -ESRCH;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    SCHED_OP(adjdom, d, cmd);
    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    put_domain(d);

    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct vcpu        *prev = current, *next = NULL;
    int                 cpu = smp_processor_id();
    s_time_t            now = NOW();
    struct task_slice   next_slice;
    s32                 r_time;     /* time for new dom to run */

    ASSERT(!in_irq());

    perfc_incrc(sched_run);

    spin_lock_irq(&schedule_data[cpu].schedule_lock);

    stop_timer(&schedule_data[cpu].s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    schedule_data[cpu].curr = next;

    set_timer(&schedule_data[cpu].s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
    set_bit(_VCPUF_running, &next->vcpu_flags);

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    perfc_incrc(sched_ctx);

    prev->sleep_tick = schedule_data[cpu].tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_vcpu(next) )
    {
        update_vcpu_system_time(next);
        if ( next->sleep_tick != schedule_data[cpu].tick )
            send_timer_event(next);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}

/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu  *v   = current;
    unsigned int  cpu = smp_processor_id();

    schedule_data[cpu].tick++;

    if ( !is_idle_vcpu(v) )
    {
        update_vcpu_system_time(v);
        send_timer_event(v);
    }

    page_scrub_schedule_work();

    set_timer(&t_timer[cpu], NOW() + MILLISECS(10));
}

/* Domain timer function, sends a virtual timer interrupt to domain */
static void dom_timer_fn(void *data)
{
    struct vcpu *v = data;

    update_vcpu_system_time(v);
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;
    vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i, rc;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for ( i = 0; i < NR_CPUS; i++ )
    {
        spin_lock_init(&schedule_data[i].schedule_lock);
        init_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i);
        init_timer(&t_timer[i], t_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);

    if ( idle_vcpu[0] != NULL )
    {
        schedule_data[0].curr = idle_vcpu[0];
        schedule_data[0].idle = idle_vcpu[0];

        rc = SCHED_OP(alloc_task, idle_vcpu[0]);
        BUG_ON(rc < 0);

        sched_add_domain(idle_vcpu[0]);
    }
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}

void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&schedule_data[i].schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&schedule_data[i].schedule_lock);
    }

    local_irq_restore(flags);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */