ia64/xen-unstable

view xen/common/domain.c @ 17337:b667e220e556

x86, hvm: MMIO emulations should defer domain shutdown requests until
the relevant instruction has been fully emulated (which may require
multiple round trips to qemu-dm).
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author:   Keir Fraser <keir.fraser@citrix.com>
date:     Thu Mar 27 11:39:57 2008 +0000 (2008-03-27)
parents:  b2a3fe7f5591
children: 4e2e98c2098e
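
The interface this changeset builds on is the pair vcpu_start_shutdown_deferral() / vcpu_end_shutdown_deferral() defined in this file. The sketch below is illustrative only: handle_mmio_round_trip() is a hypothetical helper, not the actual x86 emulator entry point. It shows the intended calling pattern, assuming the declarations from this file and xen/sched.h: open a deferral window before starting a multi-part emulation, and close it only after the instruction has been fully emulated.

/* Illustrative sketch only; handle_mmio_round_trip() is hypothetical. */
static void handle_mmio_round_trip(struct vcpu *v)
{
    /*
     * vcpu_start_shutdown_deferral() returns 0 if the domain is already
     * shutting down; abandon the emulation, as the vcpu is about to be
     * paused for shutdown.
     */
    if ( !vcpu_start_shutdown_deferral(v) )
        return;

    /* ... issue the I/O request to qemu-dm and complete the emulation ... */

    /* Deferral ends only once the instruction has been fully emulated. */
    vcpu_end_shutdown_deferral(v);
}
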
line source

/******************************************************************************
 * domain.c
 *
 * Generic domain-handling functions.
 */

#include <xen/config.h>
#include <xen/compat.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/mm.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/console.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/rangeset.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h>
#include <xen/delay.h>
#include <xen/shutdown.h>
#include <xen/percpu.h>
#include <xen/multicall.h>
#include <xen/rcupdate.h>
#include <asm/debugger.h>
#include <public/sched.h>
#include <public/vcpu.h>
#include <xsm/xsm.h>

/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
static unsigned int opt_dom0_vcpus_pin;
boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);

enum cpufreq_controller cpufreq_controller;
static void __init setup_cpufreq_option(char *str)
{
    if ( !strcmp(str, "dom0-kernel") )
    {
        cpufreq_controller = FREQCTL_dom0_kernel;
        opt_dom0_vcpus_pin = 1;
    }
}
custom_param("cpufreq", setup_cpufreq_option);

/* Protect updates/reads (resp.) of domain_list and domain_hash. */
DEFINE_SPINLOCK(domlist_update_lock);
DEFINE_RCU_READ_LOCK(domlist_read_lock);

#define DOMAIN_HASH_SIZE 256
#define DOMAIN_HASH(_id) ((int)(_id)&(DOMAIN_HASH_SIZE-1))
static struct domain *domain_hash[DOMAIN_HASH_SIZE];
struct domain *domain_list;

struct domain *dom0;

struct vcpu *idle_vcpu[NR_CPUS] __read_mostly;

int current_domain_id(void)
{
    return current->domain->domain_id;
}

struct domain *alloc_domain(domid_t domid)
{
    struct domain *d;

    if ( (d = xmalloc(struct domain)) == NULL )
        return NULL;

    memset(d, 0, sizeof(*d));
    d->domain_id = domid;

    if ( xsm_alloc_security_domain(d) != 0 )
    {
        free_domain(d);
        return NULL;
    }

    atomic_set(&d->refcnt, 1);
    spin_lock_init(&d->big_lock);
    spin_lock_init(&d->page_alloc_lock);
    spin_lock_init(&d->shutdown_lock);
    spin_lock_init(&d->hypercall_deadlock_mutex);
    INIT_LIST_HEAD(&d->page_list);
    INIT_LIST_HEAD(&d->xenpage_list);

    return d;
}

void free_domain(struct domain *d)
{
    xsm_free_security_domain(d);
    xfree(d);
}

static void __domain_finalise_shutdown(struct domain *d)
{
    struct vcpu *v;

    BUG_ON(!spin_is_locked(&d->shutdown_lock));

    if ( d->is_shut_down )
        return;

    for_each_vcpu ( d, v )
        if ( !v->paused_for_shutdown )
            return;

    d->is_shut_down = 1;
    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
}

static void vcpu_check_shutdown(struct vcpu *v)
{
    struct domain *d = v->domain;

    spin_lock(&d->shutdown_lock);

    if ( d->is_shutting_down )
    {
        if ( !v->paused_for_shutdown )
            vcpu_pause_nosync(v);
        v->paused_for_shutdown = 1;
        v->defer_shutdown = 0;
        __domain_finalise_shutdown(d);
    }

    spin_unlock(&d->shutdown_lock);
}

struct vcpu *alloc_vcpu(
    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
{
    struct vcpu *v;

    BUG_ON(d->vcpu[vcpu_id] != NULL);

    if ( (v = alloc_vcpu_struct()) == NULL )
        return NULL;

    v->domain = d;
    v->vcpu_id = vcpu_id;

    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
    v->runstate.state_entry_time = NOW();

    if ( !is_idle_domain(d) )
    {
        set_bit(_VPF_down, &v->pause_flags);
        v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
    }

    if ( sched_init_vcpu(v, cpu_id) != 0 )
    {
        free_vcpu_struct(v);
        return NULL;
    }

    if ( vcpu_initialise(v) != 0 )
    {
        sched_destroy_vcpu(v);
        free_vcpu_struct(v);
        return NULL;
    }

    d->vcpu[vcpu_id] = v;
    if ( vcpu_id != 0 )
        d->vcpu[v->vcpu_id-1]->next_in_list = v;

    /* Must be called after making new vcpu visible to for_each_vcpu(). */
    vcpu_check_shutdown(v);

    return v;
}

struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
{
    struct domain *d;
    struct vcpu *v;
    unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;

    if ( (v = idle_vcpu[cpu_id]) != NULL )
        return v;

    d = (vcpu_id == 0) ?
        domain_create(IDLE_DOMAIN_ID, 0, 0) :
        idle_vcpu[cpu_id - vcpu_id]->domain;
    BUG_ON(d == NULL);

    v = alloc_vcpu(d, vcpu_id, cpu_id);
    idle_vcpu[cpu_id] = v;

    return v;
}

struct domain *domain_create(
    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
{
    struct domain *d, **pd;
    enum { INIT_evtchn = 1, INIT_gnttab = 2, INIT_arch = 8 };
    int init_status = 0;

    if ( (d = alloc_domain(domid)) == NULL )
        return NULL;

    if ( domcr_flags & DOMCRF_hvm )
        d->is_hvm = 1;

    if ( (domid == 0) && opt_dom0_vcpus_pin )
        d->is_pinned = 1;

    rangeset_domain_initialise(d);

    if ( !is_idle_domain(d) )
    {
        if ( xsm_domain_create(d, ssidref) != 0 )
            goto fail;

        d->is_paused_by_controller = 1;
        atomic_inc(&d->pause_count);

        if ( evtchn_init(d) != 0 )
            goto fail;
        init_status |= INIT_evtchn;

        if ( grant_table_create(d) != 0 )
            goto fail;
        init_status |= INIT_gnttab;
    }

    if ( arch_domain_create(d, domcr_flags) != 0 )
        goto fail;
    init_status |= INIT_arch;

    d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex);
    d->irq_caps = rangeset_new(d, "Interrupts", 0);
    if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) )
        goto fail;

    if ( sched_init_domain(d) != 0 )
        goto fail;

    if ( !is_idle_domain(d) )
    {
        spin_lock(&domlist_update_lock);
        pd = &domain_list; /* NB. domain_list maintained in order of domid. */
        for ( pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list )
            if ( (*pd)->domain_id > d->domain_id )
                break;
        d->next_in_list = *pd;
        d->next_in_hashbucket = domain_hash[DOMAIN_HASH(domid)];
        rcu_assign_pointer(*pd, d);
        rcu_assign_pointer(domain_hash[DOMAIN_HASH(domid)], d);
        spin_unlock(&domlist_update_lock);
    }

    return d;

 fail:
    d->is_dying = DOMDYING_dead;
    atomic_set(&d->refcnt, DOMAIN_DESTROYED);
    if ( init_status & INIT_arch )
        arch_domain_destroy(d);
    if ( init_status & INIT_gnttab )
        grant_table_destroy(d);
    if ( init_status & INIT_evtchn )
        evtchn_destroy(d);
    rangeset_domain_destroy(d);
    free_domain(d);
    return NULL;
}

struct domain *get_domain_by_id(domid_t dom)
{
    struct domain *d;

    rcu_read_lock(&domlist_read_lock);

    for ( d = rcu_dereference(domain_hash[DOMAIN_HASH(dom)]);
          d != NULL;
          d = rcu_dereference(d->next_in_hashbucket) )
    {
        if ( d->domain_id == dom )
        {
            if ( unlikely(!get_domain(d)) )
                d = NULL;
            break;
        }
    }

    rcu_read_unlock(&domlist_read_lock);

    return d;
}

struct domain *rcu_lock_domain_by_id(domid_t dom)
{
    struct domain *d;

    rcu_read_lock(&domlist_read_lock);

    for ( d = rcu_dereference(domain_hash[DOMAIN_HASH(dom)]);
          d != NULL;
          d = rcu_dereference(d->next_in_hashbucket) )
    {
        if ( d->domain_id == dom )
            return d;
    }

    rcu_read_unlock(&domlist_read_lock);

    return NULL;
}

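/*
 * domain_kill() can be preempted: domain_relinquish_resources() may return
 * -EAGAIN, which is propagated to the caller, and a subsequent call resumes
 * from the DOMDYING_dying state until the domain reaches DOMDYING_dead.
 */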
int domain_kill(struct domain *d)
{
    int rc = 0;

    if ( d == current->domain )
        return -EINVAL;

    /* Protected by domctl_lock. */
    switch ( d->is_dying )
    {
    case DOMDYING_alive:
        domain_pause(d);
        d->is_dying = DOMDYING_dying;
        evtchn_destroy(d);
        gnttab_release_mappings(d);
        /* fallthrough */
    case DOMDYING_dying:
        rc = domain_relinquish_resources(d);
        page_scrub_kick();
        if ( rc != 0 )
        {
            BUG_ON(rc != -EAGAIN);
            break;
        }
        d->is_dying = DOMDYING_dead;
        put_domain(d);
        send_guest_global_virq(dom0, VIRQ_DOM_EXC);
        /* fallthrough */
    case DOMDYING_dead:
        break;
    }

    return rc;
}

void __domain_crash(struct domain *d)
{
    if ( d->is_shutting_down )
    {
        /* Print nothing: the domain is already shutting down. */
    }
    else if ( d == current->domain )
    {
        printk("Domain %d (vcpu#%d) crashed on cpu#%d:\n",
               d->domain_id, current->vcpu_id, smp_processor_id());
        show_execution_state(guest_cpu_user_regs());
    }
    else
    {
        printk("Domain %d reported crashed by domain %d on cpu#%d:\n",
               d->domain_id, current->domain->domain_id, smp_processor_id());
    }

    domain_shutdown(d, SHUTDOWN_crash);
}

void __domain_crash_synchronous(void)
{
    __domain_crash(current->domain);

    /*
     * Flush multicall state before dying if a multicall is in progress.
     * This shouldn't be necessary, but some architectures are calling
     * domain_crash_synchronous() when they really shouldn't (i.e., from
     * within hypercall context).
     */
    if ( this_cpu(mc_state).flags != 0 )
    {
        dprintk(XENLOG_ERR,
                "FIXME: synchronous domain crash during a multicall!\n");
        this_cpu(mc_state).flags = 0;
    }

    vcpu_end_shutdown_deferral(current);

    for ( ; ; )
        do_softirq();
}

void domain_shutdown(struct domain *d, u8 reason)
{
    struct vcpu *v;

    if ( d->domain_id == 0 )
        dom0_shutdown(reason);

    spin_lock(&d->shutdown_lock);

    if ( d->is_shutting_down )
    {
        spin_unlock(&d->shutdown_lock);
        return;
    }

    d->is_shutting_down = 1;
    d->shutdown_code = reason;

    smp_mb(); /* set shutdown status /then/ check for per-cpu deferrals */

    for_each_vcpu ( d, v )
    {
        if ( v->defer_shutdown )
            continue;
        vcpu_pause_nosync(v);
        v->paused_for_shutdown = 1;
    }

    __domain_finalise_shutdown(d);

    spin_unlock(&d->shutdown_lock);
}

void domain_resume(struct domain *d)
{
    struct vcpu *v;

    /*
     * Some code paths assume that shutdown status does not get reset under
     * their feet (e.g., some assertions make this assumption).
     */
    domain_pause(d);

    spin_lock(&d->shutdown_lock);

    d->is_shutting_down = d->is_shut_down = 0;

    for_each_vcpu ( d, v )
    {
        if ( v->paused_for_shutdown )
            vcpu_unpause(v);
        v->paused_for_shutdown = 0;
    }

    spin_unlock(&d->shutdown_lock);

    domain_unpause(d);
}

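/*
 * Shutdown deferral: a vcpu performing a multi-part operation (such as an
 * MMIO emulation that needs round trips to qemu-dm) may call
 * vcpu_start_shutdown_deferral() to avoid being paused mid-operation by
 * domain_shutdown(), and vcpu_end_shutdown_deferral() once done.  A return
 * value of 0 from vcpu_start_shutdown_deferral() means the domain is
 * already shutting down and the deferral was not granted.
 */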
int vcpu_start_shutdown_deferral(struct vcpu *v)
{
    if ( v->defer_shutdown )
        return 1;

    v->defer_shutdown = 1;
    smp_mb(); /* set deferral status /then/ check for shutdown */
    if ( unlikely(v->domain->is_shutting_down) )
        vcpu_check_shutdown(v);

    return v->defer_shutdown;
}

void vcpu_end_shutdown_deferral(struct vcpu *v)
{
    v->defer_shutdown = 0;
    smp_mb(); /* clear deferral status /then/ check for shutdown */
    if ( unlikely(v->domain->is_shutting_down) )
        vcpu_check_shutdown(v);
}

void domain_pause_for_debugger(void)
{
    struct domain *d = current->domain;
    struct vcpu *v;

    atomic_inc(&d->pause_count);
    if ( test_and_set_bool(d->is_paused_by_controller) )
        domain_unpause(d); /* race-free atomic_dec(&d->pause_count) */

    for_each_vcpu ( d, v )
        vcpu_sleep_nosync(v);

    send_guest_global_virq(dom0, VIRQ_DEBUGGER);
}

/* Complete domain destroy after RCU readers are not holding old references. */
static void complete_domain_destroy(struct rcu_head *head)
{
    struct domain *d = container_of(head, struct domain, rcu);
    struct vcpu *v;
    int i;

    for ( i = MAX_VIRT_CPUS-1; i >= 0; i-- )
    {
        if ( (v = d->vcpu[i]) == NULL )
            continue;
        vcpu_destroy(v);
        sched_destroy_vcpu(v);
    }

    rangeset_domain_destroy(d);

    grant_table_destroy(d);

    arch_domain_destroy(d);

    sched_destroy_domain(d);

    for ( i = MAX_VIRT_CPUS-1; i >= 0; i-- )
        if ( (v = d->vcpu[i]) != NULL )
            free_vcpu_struct(v);

    if (d->target)
        put_domain(d->target);

    free_domain(d);

    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
}

/* Release resources belonging to task @p. */
void domain_destroy(struct domain *d)
{
    struct domain **pd;
    atomic_t old, new;

    BUG_ON(!d->is_dying);

    /* May be already destroyed, or get_domain() can race us. */
    _atomic_set(old, 0);
    _atomic_set(new, DOMAIN_DESTROYED);
    old = atomic_compareandswap(old, new, &d->refcnt);
    if ( _atomic_read(old) != 0 )
        return;

    /* Delete from task list and task hashtable. */
    spin_lock(&domlist_update_lock);
    pd = &domain_list;
    while ( *pd != d )
        pd = &(*pd)->next_in_list;
    rcu_assign_pointer(*pd, d->next_in_list);
    pd = &domain_hash[DOMAIN_HASH(d->domain_id)];
    while ( *pd != d )
        pd = &(*pd)->next_in_hashbucket;
    rcu_assign_pointer(*pd, d->next_in_hashbucket);
    spin_unlock(&domlist_update_lock);

    /* Schedule RCU asynchronous completion of domain destroy. */
    call_rcu(&d->rcu, complete_domain_destroy);
}

void vcpu_pause(struct vcpu *v)
{
    ASSERT(v != current);
    atomic_inc(&v->pause_count);
    vcpu_sleep_sync(v);
}

void vcpu_pause_nosync(struct vcpu *v)
{
    atomic_inc(&v->pause_count);
    vcpu_sleep_nosync(v);
}

void vcpu_unpause(struct vcpu *v)
{
    if ( atomic_dec_and_test(&v->pause_count) )
        vcpu_wake(v);
}

void domain_pause(struct domain *d)
{
    struct vcpu *v;

    ASSERT(d != current->domain);

    atomic_inc(&d->pause_count);

    for_each_vcpu( d, v )
        vcpu_sleep_sync(v);
}

void domain_unpause(struct domain *d)
{
    struct vcpu *v;

    if ( atomic_dec_and_test(&d->pause_count) )
        for_each_vcpu( d, v )
            vcpu_wake(v);
}

void domain_pause_by_systemcontroller(struct domain *d)
{
    domain_pause(d);
    if ( test_and_set_bool(d->is_paused_by_controller) )
        domain_unpause(d);
}

void domain_unpause_by_systemcontroller(struct domain *d)
{
    if ( test_and_clear_bool(d->is_paused_by_controller) )
        domain_unpause(d);
}

int boot_vcpu(struct domain *d, int vcpuid, vcpu_guest_context_u ctxt)
{
    struct vcpu *v = d->vcpu[vcpuid];

    BUG_ON(v->is_initialised);

    return arch_set_info_guest(v, ctxt);
}

int vcpu_reset(struct vcpu *v)
{
    struct domain *d = v->domain;
    int rc;

    domain_pause(d);
    LOCK_BIGLOCK(d);

    rc = arch_vcpu_reset(v);
    if ( rc != 0 )
        goto out;

    set_bit(_VPF_down, &v->pause_flags);

    v->fpu_initialised = 0;
    v->fpu_dirtied = 0;
    v->is_polling = 0;
    v->is_initialised = 0;
    v->nmi_pending = 0;
    v->nmi_masked = 0;
    clear_bit(_VPF_blocked, &v->pause_flags);

 out:
    UNLOCK_BIGLOCK(v->domain);
    domain_unpause(d);

    return rc;
}

long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
{
    struct domain *d = current->domain;
    struct vcpu *v;
    struct vcpu_guest_context *ctxt;
    long rc = 0;

    if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) )
        return -EINVAL;

    if ( (v = d->vcpu[vcpuid]) == NULL )
        return -ENOENT;

    switch ( cmd )
    {
    case VCPUOP_initialise:
        if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
            return -ENOMEM;

        if ( copy_from_guest(ctxt, arg, 1) )
        {
            xfree(ctxt);
            return -EFAULT;
        }

        LOCK_BIGLOCK(d);
        rc = -EEXIST;
        if ( !v->is_initialised )
            rc = boot_vcpu(d, vcpuid, ctxt);
        UNLOCK_BIGLOCK(d);

        xfree(ctxt);
        break;

    case VCPUOP_up:
        if ( !v->is_initialised )
            return -EINVAL;

        if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
            vcpu_wake(v);

        break;

    case VCPUOP_down:
        if ( !test_and_set_bit(_VPF_down, &v->pause_flags) )
            vcpu_sleep_nosync(v);
        break;

    case VCPUOP_is_up:
        rc = !test_bit(_VPF_down, &v->pause_flags);
        break;

    case VCPUOP_get_runstate_info:
    {
        struct vcpu_runstate_info runstate;
        vcpu_runstate_get(v, &runstate);
        if ( copy_to_guest(arg, &runstate, 1) )
            rc = -EFAULT;
        break;
    }

    case VCPUOP_set_periodic_timer:
    {
        struct vcpu_set_periodic_timer set;

        if ( copy_from_guest(&set, arg, 1) )
            return -EFAULT;

        if ( set.period_ns < MILLISECS(1) )
            return -EINVAL;

        v->periodic_period = set.period_ns;
        vcpu_force_reschedule(v);

        break;
    }

    case VCPUOP_stop_periodic_timer:
        v->periodic_period = 0;
        vcpu_force_reschedule(v);
        break;

    case VCPUOP_set_singleshot_timer:
    {
        struct vcpu_set_singleshot_timer set;

        if ( v != current )
            return -EINVAL;

        if ( copy_from_guest(&set, arg, 1) )
            return -EFAULT;

        if ( (set.flags & VCPU_SSHOTTMR_future) &&
             (set.timeout_abs_ns < NOW()) )
            return -ETIME;

        if ( v->singleshot_timer.cpu != smp_processor_id() )
        {
            stop_timer(&v->singleshot_timer);
            v->singleshot_timer.cpu = smp_processor_id();
        }

        set_timer(&v->singleshot_timer, set.timeout_abs_ns);

        break;
    }

    case VCPUOP_stop_singleshot_timer:
        if ( v != current )
            return -EINVAL;

        stop_timer(&v->singleshot_timer);

        break;

    case VCPUOP_send_nmi:
        if ( !guest_handle_is_null(arg) )
            return -EINVAL;

        if ( !test_and_set_bool(v->nmi_pending) )
            vcpu_kick(v);

        break;

    default:
        rc = arch_do_vcpu_op(cmd, v, arg);
        break;
    }

    return rc;
}

long vm_assist(struct domain *p, unsigned int cmd, unsigned int type)
{
    if ( type > MAX_VMASST_TYPE )
        return -EINVAL;

    switch ( cmd )
    {
    case VMASST_CMD_enable:
        set_bit(type, &p->vm_assist);
        return 0;
    case VMASST_CMD_disable:
        clear_bit(type, &p->vm_assist);
        return 0;
    }

    return -ENOSYS;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */