ia64/xen-unstable

xen/common/schedule.c @ 12390:e28beea6d228

[IA64] Fix time services of EFI emulation

This patch serializes the execution of the following EFI runtime services:
- GetTime
- SetTime
- GetWakeupTime
- SetWakeupTime

Linux/ia64 uses similar spinlocks in the EFI RTC driver; a sketch of this locking pattern follows the changeset metadata below.

Signed-off-by: Masaki Kanno <kanno.masaki@jp.fujitsu.com>
author awilliam@xenbuild.aw
date Fri Nov 10 12:03:19 2006 -0700 (2006-11-10)
parents bb6cd7ba259b
children cf98903ebb22
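
The hunks of this changeset live in the IA64 EFI emulation code, not in the file shown below; the description above only states the locking scheme. As a rough illustration of that scheme (serializing the emulated time services behind a single spinlock, as Linux's EFI RTC driver does), a minimal sketch might look like the following. The names efi_time_lock, efi_emul_get_time and efi_emul_get_time_raw are hypothetical, not identifiers from the actual patch.

/*
 * Illustrative sketch only; not the actual patch. One lock serializes
 * the emulated GetTime/SetTime/GetWakeupTime/SetWakeupTime services so
 * that concurrent vCPUs cannot interleave the underlying RTC accesses.
 */
#include <xen/spinlock.h>

static spinlock_t efi_time_lock = SPIN_LOCK_UNLOCKED;  /* hypothetical lock */

/* Hypothetical unserialized worker provided by the EFI emulation layer. */
extern unsigned long efi_emul_get_time_raw(void *tm, void *tc);

unsigned long efi_emul_get_time(void *tm, void *tc)
{
    unsigned long status, flags;

    /* The SetTime and wakeup-time wrappers would take the same lock. */
    spin_lock_irqsave(&efi_time_lock, flags);
    status = efi_emul_get_time_raw(tm, tc);
    spin_unlock_irqrestore(&efi_time_lock, flags);

    return status;
}

Wrapping each of the four services in the same way is what gives the serialization the description asks for.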
line source
/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 *
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <public/sched.h>

extern void arch_getdomaininfo_ctxt(struct vcpu *,
                                    struct vcpu_guest_context *);
/* opt_sched: scheduler - default to credit */
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);

/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
static unsigned int opt_dom0_vcpus_pin;
boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);

#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void t_timer_fn(void *unused);
static void vcpu_timer_fn(void *data);
static void poll_timer_fn(void *data);

/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct schedule_data, schedule_data);

extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = {
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};

static void __enter_scheduler(void);

static struct scheduler ops;

#define SCHED_OP(fn, ...)                                 \
         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
          : (typeof(ops.fn(__VA_ARGS__)))0 )

/* Per-CPU periodic timer sends an event to the currently-executing domain. */
static DEFINE_PER_CPU(struct timer, t_timer);

/* Charge elapsed time to the old runstate and switch the VCPU to a new one. */
static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));

    v->runstate.time[v->runstate.state] +=
        new_entry_time - v->runstate.state_entry_time;
    v->runstate.state_entry_time = new_entry_time;
    v->runstate.state = new_state;
}

/* Snapshot a VCPU's runstate, including time accrued in its current state. */
void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    if ( likely(v == current) )
    {
        /* Fast lock-free path. */
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        ASSERT(runstate->state == RUNSTATE_running);
        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
    }
    else
    {
        vcpu_schedule_lock_irq(v);
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
        vcpu_schedule_unlock_irq(v);
    }
}

int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
    struct domain *d = v->domain;

    /*
     * Initialize processor and affinity settings. The idler, and potentially
     * domain-0 VCPUs, are pinned onto their respective physical CPUs.
     */
    v->processor = processor;
    if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) )
        v->cpu_affinity = cpumask_of_cpu(processor);
    else
        cpus_setall(v->cpu_affinity);

    /* Initialise the per-domain timers. */
    init_timer(&v->timer, vcpu_timer_fn, v, v->processor);
    init_timer(&v->poll_timer, poll_timer_fn, v, v->processor);

    /* Idle VCPUs are scheduled immediately. */
    if ( is_idle_domain(d) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        per_cpu(schedule_data, v->processor).idle = v;
        set_bit(_VCPUF_running, &v->vcpu_flags);
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}

void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->timer);
    kill_timer(&v->poll_timer);
    SCHED_OP(destroy_vcpu, v);
}

int sched_init_domain(struct domain *d)
{
    return SCHED_OP(init_domain, d);
}

void sched_destroy_domain(struct domain *d)
{
    SCHED_OP(destroy_domain, d);
}

/* Put a VCPU to sleep, without waiting for it to be descheduled. */
void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(!vcpu_runnable(v)) )
    {
        if ( v->runstate.state == RUNSTATE_runnable )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());

        SCHED_OP(sleep, v);
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

/* As vcpu_sleep_nosync(), but also wait until the VCPU is no longer running. */
void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    while ( !vcpu_runnable(v) && test_bit(_VCPUF_running, &v->vcpu_flags) )
        cpu_relax();

    sync_vcpu_execstate(v);
}

/* Wake a VCPU: update its runstate and notify the scheduler if it can run. */
void vcpu_wake(struct vcpu *v)
{
    unsigned long flags;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( likely(vcpu_runnable(v)) )
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }
    else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
    {
        if ( v->runstate.state == RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }

    vcpu_schedule_unlock_irqrestore(v, flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

/* Move a migrating, descheduled VCPU to the CPU chosen by the scheduler. */
static void vcpu_migrate(struct vcpu *v)
{
    unsigned long flags;
    int old_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    if ( test_bit(_VCPUF_running, &v->vcpu_flags) ||
         !test_and_clear_bit(_VCPUF_migrating, &v->vcpu_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /* Switch to new CPU, then unlock old CPU. */
    old_cpu = v->processor;
    v->processor = SCHED_OP(pick_cpu, v);
    spin_unlock_irqrestore(
        &per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    /* Wake on new CPU. */
    vcpu_wake(v);
}

/* Set a VCPU's CPU affinity, migrating it away from a now-disallowed CPU. */
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t online_affinity;
    unsigned long flags;

    if ( (v->domain->domain_id == 0) && opt_dom0_vcpus_pin )
        return -EINVAL;

    cpus_and(online_affinity, *affinity, cpu_online_map);
    if ( cpus_empty(online_affinity) )
        return -EINVAL;

    vcpu_schedule_lock_irqsave(v, flags);

    v->cpu_affinity = *affinity;
    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VCPUF_migrating, &v->vcpu_flags);

    vcpu_schedule_unlock_irqrestore(v, flags);

    if ( test_bit(_VCPUF_migrating, &v->vcpu_flags) )
    {
        vcpu_sleep_nosync(v);
        vcpu_migrate(v);
    }

    return 0;
}

/* Block the currently-executing domain until a pertinent event occurs. */
static long do_block(void)
{
    struct vcpu *v = current;

    local_event_delivery_enable();
    set_bit(_VCPUF_blocked, &v->vcpu_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( local_events_need_delivery() )
    {
        clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    }
    else
    {
        TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
        __enter_scheduler();
    }

    return 0;
}

/* SCHEDOP_poll: block until one of a set of event-channel ports is pending,
 * or until an optional timeout expires. */
static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    evtchn_port_t  port;
    long           rc = 0;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    /* These operations must occur in order. */
    set_bit(_VCPUF_blocked, &v->vcpu_flags);
    set_bit(_VCPUF_polling, &v->vcpu_flags);
    set_bit(_DOMF_polling, &v->domain->domain_flags);

    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS )
            goto out;

        rc = 0;
        if ( test_bit(port, v->domain->shared_info->evtchn_pending) )
            goto out;
    }

    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
    __enter_scheduler();

    stop_timer(&v->poll_timer);

 out:
    clear_bit(_VCPUF_polling, &v->vcpu_flags);
    clear_bit(_VCPUF_blocked, &v->vcpu_flags);
    return rc;
}

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    __enter_scheduler();
    return 0;
}

long do_sched_op_compat(int cmd, unsigned long arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

long do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    long ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        if ( !IS_PRIV(current->domain) )
            return -EPERM;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = find_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);
        put_domain(d);
        ret = 0;

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}

/* Per-domain one-shot-timer hypercall. */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t offset = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->timer);
    }
    else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
              unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
    {
        /*
         * Linux workaround: occasionally we will see timeouts a long way in
         * the future due to wrapping in Linux's jiffy time handling. We check
         * for timeouts wrapped negative, and for positive timeouts more than
         * about 13 days in the future (2^50ns). The correct fix is to trigger
         * an interrupt immediately (since Linux in fact has pending work to
         * do in this situation). However, older guests also set a long timeout
         * when they have *no* pending timers at all: setting an immediate
         * timeout in this case can burn a lot of CPU. We therefore go for a
         * reasonable middleground of triggering a timer event in 100ms.
         */
        gdprintk(XENLOG_INFO, "Warning: huge timeout set by domain %d (vcpu %d):"
                 " %"PRIx64"\n",
                 v->domain->domain_id, v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->timer, NOW() + MILLISECS(100));
    }
    else
    {
        set_timer(&v->timer, timeout);
    }

    return 0;
}

/* sched_id - fetch ID of current scheduler */
int sched_id(void)
{
    return ops.sched_id;
}

/* Adjust scheduling parameter for a given domain. */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;

    if ( (op->sched_id != ops.sched_id) ||
         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
        return -EINVAL;

    /*
     * Most VCPUs we can simply pause. If we are adjusting this VCPU then
     * we acquire the local schedule_lock to guard against concurrent updates.
     *
     * We only acquire the local schedule lock after we have paused all other
     * VCPUs in this domain. There are two reasons for this:
     * 1- We don't want to hold up interrupts as pausing a VCPU can
     *    trigger a tlb shootdown.
     * 2- Pausing other VCPUs involves briefly locking the schedule
     *    lock of the CPU they are running on. This CPU could be the
     *    same as ours.
     */

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_pause(v);
    }

    if ( d == current->domain )
        vcpu_schedule_lock_irq(current);

    SCHED_OP(adjust, d, op);
    TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( d == current->domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v != current )
            vcpu_unpause(v);
    }

    return 0;
}

/*
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 */
static void __enter_scheduler(void)
{
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct schedule_data *sd;
    struct task_slice     next_slice;
    s32                   r_time;     /* time for new dom to run */

    ASSERT(!in_irq());

    perfc_incrc(sched_run);

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    stop_timer(&sd->s_timer);

    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    sd->curr = next;

    set_timer(&sd->s_timer, now + r_time);

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&sd->schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
    set_bit(_VCPUF_running, &next->vcpu_flags);

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incrc(sched_ctx);

    prev->sleep_tick = sd->tick;

    /* Ensure that the domain has an up-to-date time base. */
    if ( !is_idle_vcpu(next) )
    {
        update_vcpu_system_time(next);
        if ( next->sleep_tick != sd->tick )
            send_timer_event(next);
    }

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
}

/* 'prev' has been fully switched out: clear running, finish any migration. */
void context_saved(struct vcpu *prev)
{
    clear_bit(_VCPUF_running, &prev->vcpu_flags);

    if ( unlikely(test_bit(_VCPUF_migrating, &prev->vcpu_flags)) )
        vcpu_migrate(prev);
}

/****************************************************************************
 * Timers: the scheduler utilises a number of timers
 * - s_timer: per CPU timer for preemption and scheduling decisions
 * - t_timer: per CPU periodic timer to send timer interrupt to current dom
 * - dom_timer: per domain timer to specify timeout values
 ****************************************************************************/

/* The scheduler timer: force a run through the scheduler */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incrc(sched_irq);
}

/* Periodic tick timer: send timer event to current domain */
static void t_timer_fn(void *unused)
{
    struct vcpu *v = current;

    this_cpu(schedule_data).tick++;

    if ( !is_idle_vcpu(v) )
    {
        update_vcpu_system_time(v);
        send_timer_event(v);
    }

    page_scrub_schedule_work();

    SCHED_OP(tick, smp_processor_id());

    set_timer(&this_cpu(t_timer), NOW() + MILLISECS(10));
}

/* Per-VCPU timer function: sends a virtual timer interrupt. */
static void vcpu_timer_fn(void *data)
{
    struct vcpu *v = data;
    send_timer_event(v);
}

/* SCHEDOP_poll timeout callback. */
static void poll_timer_fn(void *data)
{
    struct vcpu *v = data;
    if ( test_and_clear_bit(_VCPUF_polling, &v->vcpu_flags) )
        vcpu_unblock(v);
}

/* Initialise the data structures. */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, __enter_scheduler);

    for_each_cpu ( i )
    {
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
        init_timer(&per_cpu(t_timer, i), t_timer_fn, NULL, i);
    }

    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}

/*
 * Start a scheduler for each CPU
 * This has to be done *after* the timers, e.g., APICs, have been initialised
 */
void schedulers_start(void)
{
    t_timer_fn(0);
    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
}

void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           i;
    unsigned long flags;

    local_irq_save(flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);

    for_each_online_cpu ( i )
    {
        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
        printk("CPU[%02d] ", i);
        SCHED_OP(dump_cpu_state, i);
        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
    }

    local_irq_restore(flags);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */