ia64/xen-unstable: xen/arch/x86/domain.c @ 18806:ed8524f4a044

x86: Re-initialise HPET on resume from S3

Signed-off-by: Guanqun Lu <guanqun.lu@intel.com>
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Tue Nov 18 15:55:14 2008 +0000
parents  8de4b4e9a435
children c820bf73a914
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <asm/regs.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/system.h>
38 #include <asm/io.h>
39 #include <asm/processor.h>
40 #include <asm/desc.h>
41 #include <asm/i387.h>
42 #include <asm/mpspec.h>
43 #include <asm/ldt.h>
44 #include <asm/hypercall.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/support.h>
47 #include <asm/debugreg.h>
48 #include <asm/msr.h>
49 #include <asm/nmi.h>
50 #include <xen/numa.h>
51 #include <xen/iommu.h>
52 #ifdef CONFIG_COMPAT
53 #include <compat/vcpu.h>
54 #endif
56 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
57 DEFINE_PER_CPU(u64, efer);
58 DEFINE_PER_CPU(unsigned long, cr4);
60 static void default_idle(void);
61 void (*pm_idle) (void) = default_idle;
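/* Hook for the idle routine invoked from idle_loop(); defaults to default_idle(). */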
63 static void paravirt_ctxt_switch_from(struct vcpu *v);
64 static void paravirt_ctxt_switch_to(struct vcpu *v);
66 static void vcpu_destroy_pagetables(struct vcpu *v);
68 static void continue_idle_domain(struct vcpu *v)
69 {
70 reset_stack_and_jump(idle_loop);
71 }
73 static void continue_nonidle_domain(struct vcpu *v)
74 {
75 reset_stack_and_jump(ret_from_intr);
76 }
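/*
 * Default idle routine: disable interrupts, then halt via safe_halt() if no
 * softirq work is pending on this CPU; otherwise re-enable interrupts so
 * idle_loop() can process the pending softirqs.
 */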
78 static void default_idle(void)
79 {
80 local_irq_disable();
81 if ( !softirq_pending(smp_processor_id()) )
82 safe_halt();
83 else
84 local_irq_enable();
85 }
87 static void play_dead(void)
88 {
89 /*
90 * Flush pending softirqs if any. They can be queued up before this CPU
91 * was taken out of cpu_online_map in __cpu_disable().
92 */
93 do_softirq();
95 /* This must be done before dead CPU ack */
96 cpu_exit_clear();
97 hvm_cpu_down();
98 wbinvd();
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
103 /* With physical CPU hotplug, we should halt the cpu. */
104 local_irq_disable();
105 for ( ; ; )
106 halt();
107 }
109 void idle_loop(void)
110 {
111 for ( ; ; )
112 {
113 if ( cpu_is_offline(smp_processor_id()) )
114 play_dead();
115 page_scrub_schedule_work();
116 (*pm_idle)();
117 do_softirq();
118 }
119 }
121 void startup_cpu_idle_loop(void)
122 {
123 struct vcpu *v = current;
125 ASSERT(is_idle_vcpu(v));
126 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
127 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
129 reset_stack_and_jump(idle_loop);
130 }
132 void dump_pageframe_info(struct domain *d)
133 {
134 struct page_info *page;
136 printk("Memory pages belonging to domain %u:\n", d->domain_id);
138 if ( d->tot_pages >= 10 )
139 {
140 printk(" DomPage list too long to display\n");
141 }
142 else
143 {
144 list_for_each_entry ( page, &d->page_list, list )
145 {
146 printk(" DomPage %p: caf=%08x, taf=%" PRtype_info "\n",
147 _p(page_to_mfn(page)),
148 page->count_info, page->u.inuse.type_info);
149 }
150 }
152 list_for_each_entry ( page, &d->xenpage_list, list )
153 {
154 printk(" XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
155 _p(page_to_mfn(page)),
156 page->count_info, page->u.inuse.type_info);
157 }
158 }
160 struct vcpu *alloc_vcpu_struct(void)
161 {
162 struct vcpu *v;
163 if ( (v = xmalloc(struct vcpu)) != NULL )
164 memset(v, 0, sizeof(*v));
165 return v;
166 }
168 void free_vcpu_struct(struct vcpu *v)
169 {
170 xfree(v);
171 }
173 #ifdef CONFIG_COMPAT
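/*
 * Build the monitor L4 pagetable used while a 32-on-64 PV vcpu is running:
 * a copy of idle_pg_table with the guest slot cleared, the linear-pagetable
 * slot pointing back at this page, and the per-domain slot pointing at the
 * domain's mm_perdomain_l3.
 */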
175 static int setup_compat_l4(struct vcpu *v)
176 {
177 struct page_info *pg;
178 l4_pgentry_t *l4tab;
180 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
181 if ( pg == NULL )
182 return -ENOMEM;
184 /* This page needs to look like a pagetable so that it can be shadowed */
185 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
187 l4tab = page_to_virt(pg);
188 copy_page(l4tab, idle_pg_table);
189 l4tab[0] = l4e_empty();
190 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
191 l4e_from_page(pg, __PAGE_HYPERVISOR);
192 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
193 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
194 __PAGE_HYPERVISOR);
196 v->arch.guest_table = pagetable_from_page(pg);
197 v->arch.guest_table_user = v->arch.guest_table;
199 return 0;
200 }
202 static void release_compat_l4(struct vcpu *v)
203 {
204 free_domheap_page(pagetable_get_page(v->arch.guest_table));
205 v->arch.guest_table = pagetable_null();
206 v->arch.guest_table_user = pagetable_null();
207 }
209 static inline int may_switch_mode(struct domain *d)
210 {
211 return (!is_hvm_domain(d) && (d->tot_pages == 0));
212 }
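/*
 * switch_native()/switch_compat() flip a PV domain between native 64-bit
 * and 32-on-64 mode.  This is only permitted while the domain is still
 * empty (see may_switch_mode() above), i.e. before any memory has been
 * allocated to it.
 */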
214 int switch_native(struct domain *d)
215 {
216 unsigned int vcpuid;
218 if ( d == NULL )
219 return -EINVAL;
220 if ( !may_switch_mode(d) )
221 return -EACCES;
222 if ( !is_pv_32on64_domain(d) )
223 return 0;
225 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
227 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
228 {
229 if (d->vcpu[vcpuid])
230 release_compat_l4(d->vcpu[vcpuid]);
231 }
233 return 0;
234 }
236 int switch_compat(struct domain *d)
237 {
238 unsigned int vcpuid;
240 if ( d == NULL )
241 return -EINVAL;
242 if ( !may_switch_mode(d) )
243 return -EACCES;
244 if ( is_pv_32on64_domain(d) )
245 return 0;
247 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
249 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
250 {
251 if ( (d->vcpu[vcpuid] != NULL) &&
252 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
253 goto undo_and_fail;
254 }
256 domain_set_alloc_bitsize(d);
258 return 0;
260 undo_and_fail:
261 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
262 while ( vcpuid-- != 0 )
263 {
264 if ( d->vcpu[vcpuid] != NULL )
265 release_compat_l4(d->vcpu[vcpuid]);
266 }
267 return -ENOMEM;
268 }
270 #else
271 #define setup_compat_l4(v) 0
272 #define release_compat_l4(v) ((void)0)
273 #endif
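/*
 * Per-vcpu initialisation.  HVM vcpus are handed off to
 * hvm_vcpu_initialise(); PV vcpus get a 100Hz periodic timer, an emulated
 * PIT on VCPU0, and the paravirt context-switch hooks.  Idle vcpus run
 * directly on idle_pg_table.
 */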
275 int vcpu_initialise(struct vcpu *v)
276 {
277 struct domain *d = v->domain;
278 int rc;
280 v->arch.vcpu_info_mfn = INVALID_MFN;
282 v->arch.flags = TF_kernel_mode;
284 #if defined(__i386__)
285 mapcache_vcpu_init(v);
286 #endif
288 pae_l3_cache_init(&v->arch.pae_l3_cache);
290 paging_vcpu_init(v);
292 if ( is_hvm_domain(d) )
293 {
294 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
295 return rc;
296 }
297 else
298 {
299 /* PV guests by default have a 100Hz ticker. */
300 if ( !is_idle_domain(d) )
301 v->periodic_period = MILLISECS(10);
303 /* PV guests get an emulated PIT too for video BIOSes to use. */
304 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
305 pit_init(v, cpu_khz);
307 v->arch.schedule_tail = continue_nonidle_domain;
308 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
309 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
311 if ( is_idle_domain(d) )
312 {
313 v->arch.schedule_tail = continue_idle_domain;
314 v->arch.cr3 = __pa(idle_pg_table);
315 }
317 v->arch.guest_context.ctrlreg[4] =
318 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
319 }
321 v->arch.perdomain_ptes =
322 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
324 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
325 }
327 void vcpu_destroy(struct vcpu *v)
328 {
329 if ( is_pv_32on64_vcpu(v) )
330 release_compat_l4(v);
332 if ( is_hvm_vcpu(v) )
333 hvm_vcpu_destroy(v);
334 }
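/*
 * Architecture-specific domain creation: allocate the per-domain mapping
 * pagetable (plus the covering L2/L3 on x86-64), initialise paging and the
 * IOMMU, and set up the shared_info page and an empty CPUID policy.
 */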
336 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
337 {
338 #ifdef __x86_64__
339 struct page_info *pg;
340 #endif
341 int i, pdpt_order, paging_initialised = 0;
342 int rc = -ENOMEM;
344 d->arch.hvm_domain.hap_enabled =
345 is_hvm_domain(d) &&
346 hvm_funcs.hap_supported &&
347 (domcr_flags & DOMCRF_hap);
349 INIT_LIST_HEAD(&d->arch.pdev_list);
351 d->arch.relmem = RELMEM_not_started;
352 INIT_LIST_HEAD(&d->arch.relmem_list);
354 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
355 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
356 if ( d->arch.mm_perdomain_pt == NULL )
357 goto fail;
358 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
360 #if defined(__i386__)
362 mapcache_domain_init(d);
364 #else /* __x86_64__ */
366 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
367 if ( pg == NULL )
368 goto fail;
369 d->arch.mm_perdomain_l2 = page_to_virt(pg);
370 clear_page(d->arch.mm_perdomain_l2);
371 for ( i = 0; i < (1 << pdpt_order); i++ )
372 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
373 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
374 __PAGE_HYPERVISOR);
376 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
377 if ( pg == NULL )
378 goto fail;
379 d->arch.mm_perdomain_l3 = page_to_virt(pg);
380 clear_page(d->arch.mm_perdomain_l3);
381 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
382 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
383 __PAGE_HYPERVISOR);
385 #endif /* __x86_64__ */
387 #ifdef CONFIG_COMPAT
388 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
389 #endif
391 if ( (rc = paging_domain_init(d)) != 0 )
392 goto fail;
393 paging_initialised = 1;
395 if ( !is_idle_domain(d) )
396 {
397 d->arch.ioport_caps =
398 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
399 rc = -ENOMEM;
400 if ( d->arch.ioport_caps == NULL )
401 goto fail;
403 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
404 goto fail;
406 clear_page(d->shared_info);
407 share_xen_page_with_guest(
408 virt_to_page(d->shared_info), d, XENSHARE_writable);
410 if ( (rc = iommu_domain_init(d)) != 0 )
411 goto fail;
412 }
414 if ( is_hvm_domain(d) )
415 {
416 if ( (rc = hvm_domain_initialise(d)) != 0 )
417 {
418 iommu_domain_destroy(d);
419 goto fail;
420 }
421 }
422 else
423 {
424 /* 32-bit PV guest by default only if Xen is not 64-bit. */
425 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
426 (CONFIG_PAGING_LEVELS != 4);
427 }
429 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
430 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
431 {
432 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
433 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
434 }
436 return 0;
438 fail:
439 d->is_dying = DOMDYING_dead;
440 free_xenheap_page(d->shared_info);
441 if ( paging_initialised )
442 paging_final_teardown(d);
443 #ifdef __x86_64__
444 if ( d->arch.mm_perdomain_l2 )
445 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
446 if ( d->arch.mm_perdomain_l3 )
447 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
448 #endif
449 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
450 return rc;
451 }
453 void arch_domain_destroy(struct domain *d)
454 {
455 if ( is_hvm_domain(d) )
456 hvm_domain_destroy(d);
458 pci_release_devices(d);
459 free_domain_pirqs(d);
460 if ( !is_idle_domain(d) )
461 iommu_domain_destroy(d);
463 paging_final_teardown(d);
465 free_xenheap_pages(
466 d->arch.mm_perdomain_pt,
467 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
469 #ifdef __x86_64__
470 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
471 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
472 #endif
474 free_xenheap_page(d->shared_info);
475 }
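/*
 * Sanitise a PV guest's requested CR4: only the TSD bit (and DE, where the
 * CPU supports it) may differ from Xen's own settings; any other requested
 * change is logged and overridden with the hypervisor's value.
 */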
477 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
478 {
479 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
481 hv_cr4_mask = ~X86_CR4_TSD;
482 if ( cpu_has_de )
483 hv_cr4_mask &= ~X86_CR4_DE;
485 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
486 gdprintk(XENLOG_WARNING,
487 "Attempt to change CR4 flags %08lx -> %08lx\n",
488 hv_cr4, guest_cr4);
490 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
491 }
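/*
 * Load a complete initial register state into a vcpu.  For PV vcpus this
 * validates the supplied selectors, LDT and GDT, and takes references on
 * the new pagetable base(s); HVM vcpus are handed to hvm_set_info_guest().
 */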
493 /* This is called by arch_final_setup_guest and do_boot_vcpu */
494 int arch_set_info_guest(
495 struct vcpu *v, vcpu_guest_context_u c)
496 {
497 struct domain *d = v->domain;
498 unsigned long cr3_pfn = INVALID_MFN;
499 unsigned long flags, cr4;
500 int i, rc = 0, compat;
502 /* The context is a compat-mode one if the target domain is compat-mode;
503 * we expect the tools to DTRT even in compat-mode callers. */
504 compat = is_pv_32on64_domain(d);
506 #ifdef CONFIG_COMPAT
507 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
508 #else
509 #define c(fld) (c.nat->fld)
510 #endif
511 flags = c(flags);
513 if ( !is_hvm_vcpu(v) )
514 {
515 if ( !compat )
516 {
517 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
518 fixup_guest_stack_selector(d, c.nat->kernel_ss);
519 fixup_guest_code_selector(d, c.nat->user_regs.cs);
520 #ifdef __i386__
521 fixup_guest_code_selector(d, c.nat->event_callback_cs);
522 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
523 #endif
525 for ( i = 0; i < 256; i++ )
526 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
528 /* LDT safety checks. */
529 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
530 (c.nat->ldt_ents > 8192) ||
531 !array_access_ok(c.nat->ldt_base,
532 c.nat->ldt_ents,
533 LDT_ENTRY_SIZE) )
534 return -EINVAL;
535 }
536 #ifdef CONFIG_COMPAT
537 else
538 {
539 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
540 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
541 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
542 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
543 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
545 for ( i = 0; i < 256; i++ )
546 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
548 /* LDT safety checks. */
549 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
550 (c.cmp->ldt_ents > 8192) ||
551 !compat_array_access_ok(c.cmp->ldt_base,
552 c.cmp->ldt_ents,
553 LDT_ENTRY_SIZE) )
554 return -EINVAL;
555 }
556 #endif
557 }
559 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
561 v->arch.flags &= ~TF_kernel_mode;
562 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
563 v->arch.flags |= TF_kernel_mode;
565 if ( !compat )
566 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
567 #ifdef CONFIG_COMPAT
568 else
569 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
570 #endif
572 v->arch.guest_context.user_regs.eflags |= 2;
574 if ( is_hvm_vcpu(v) )
575 {
576 hvm_set_info_guest(v);
577 goto out;
578 }
580 /* Only CR0.TS is modifiable by guest or admin. */
581 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
582 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
584 init_int80_direct_trap(v);
586 /* IOPL privileges are virtualised. */
587 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
588 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
590 /* Ensure real hardware interrupts are enabled. */
591 v->arch.guest_context.user_regs.eflags |= EF_IE;
593 cr4 = v->arch.guest_context.ctrlreg[4];
594 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
595 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
597 memset(v->arch.guest_context.debugreg, 0,
598 sizeof(v->arch.guest_context.debugreg));
599 for ( i = 0; i < 8; i++ )
600 (void)set_debugreg(v, i, c(debugreg[i]));
602 if ( v->is_initialised )
603 goto out;
605 if ( v->vcpu_id == 0 )
606 d->vm_assist = c(vm_assist);
608 if ( !compat )
609 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
610 #ifdef CONFIG_COMPAT
611 else
612 {
613 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
614 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
616 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
617 return -EINVAL;
618 for ( i = 0; i < n; ++i )
619 gdt_frames[i] = c.cmp->gdt_frames[i];
620 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
621 }
622 #endif
623 if ( rc != 0 )
624 return rc;
626 if ( !compat )
627 {
628 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
630 if ( !mfn_valid(cr3_pfn) ||
631 (paging_mode_refcounts(d)
632 ? !get_page(mfn_to_page(cr3_pfn), d)
633 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
634 PGT_base_page_table)) )
635 {
636 destroy_gdt(v);
637 return -EINVAL;
638 }
640 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
642 #ifdef __x86_64__
643 if ( c.nat->ctrlreg[1] )
644 {
645 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
647 if ( !mfn_valid(cr3_pfn) ||
648 (paging_mode_refcounts(d)
649 ? !get_page(mfn_to_page(cr3_pfn), d)
650 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
651 PGT_base_page_table)) )
652 {
653 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
654 v->arch.guest_table = pagetable_null();
655 if ( paging_mode_refcounts(d) )
656 put_page(mfn_to_page(cr3_pfn));
657 else
658 put_page_and_type(mfn_to_page(cr3_pfn));
659 destroy_gdt(v);
660 return -EINVAL;
661 }
663 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
664 }
665 #endif
666 }
667 #ifdef CONFIG_COMPAT
668 else
669 {
670 l4_pgentry_t *l4tab;
672 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
674 if ( !mfn_valid(cr3_pfn) ||
675 (paging_mode_refcounts(d)
676 ? !get_page(mfn_to_page(cr3_pfn), d)
677 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
678 PGT_l3_page_table)) )
679 {
680 destroy_gdt(v);
681 return -EINVAL;
682 }
684 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
685 *l4tab = l4e_from_pfn(
686 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
687 }
688 #endif
690 if ( v->vcpu_id == 0 )
691 update_domain_wallclock_time(d);
693 /* Don't redo final setup */
694 v->is_initialised = 1;
696 if ( paging_mode_enabled(d) )
697 paging_update_paging_modes(v);
699 update_cr3(v);
701 out:
702 if ( flags & VGCF_online )
703 clear_bit(_VPF_down, &v->pause_flags);
704 else
705 set_bit(_VPF_down, &v->pause_flags);
706 return 0;
707 #undef c
708 }
710 void arch_vcpu_reset(struct vcpu *v)
711 {
712 if ( !is_hvm_vcpu(v) )
713 {
714 destroy_gdt(v);
715 vcpu_destroy_pagetables(v);
716 }
717 else
718 {
719 vcpu_end_shutdown_deferral(v);
720 }
721 }
723 /*
724 * Unmap the vcpu info page if the guest decided to place it somewhere
725 * else. This is only used from arch_domain_destroy, so there's no
726 * need to do anything clever.
727 */
728 static void
729 unmap_vcpu_info(struct vcpu *v)
730 {
731 struct domain *d = v->domain;
732 unsigned long mfn;
734 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
735 return;
737 mfn = v->arch.vcpu_info_mfn;
738 unmap_domain_page_global(v->vcpu_info);
740 v->vcpu_info = (void *)&shared_info(d, vcpu_info[v->vcpu_id]);
741 v->arch.vcpu_info_mfn = INVALID_MFN;
743 put_page_and_type(mfn_to_page(mfn));
744 }
746 /*
747 * Map a guest page in and point the vcpu_info pointer at it. This
748 * makes sure that the vcpu_info is always pointing at a valid piece
749 * of memory, and it sets a pending event to make sure that a pending
750 * event doesn't get missed.
751 */
752 static int
753 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
754 {
755 struct domain *d = v->domain;
756 void *mapping;
757 vcpu_info_t *new_info;
758 int i;
760 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
761 return -EINVAL;
763 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
764 return -EINVAL;
766 /* Run this command on yourself or on other offline VCPUS. */
767 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
768 return -EINVAL;
770 mfn = gmfn_to_mfn(d, mfn);
771 if ( !mfn_valid(mfn) ||
772 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
773 return -EINVAL;
775 mapping = map_domain_page_global(mfn);
776 if ( mapping == NULL )
777 {
778 put_page_and_type(mfn_to_page(mfn));
779 return -ENOMEM;
780 }
782 new_info = (vcpu_info_t *)(mapping + offset);
784 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
786 v->vcpu_info = new_info;
787 v->arch.vcpu_info_mfn = mfn;
789 /* Set new vcpu_info pointer /before/ setting pending flags. */
790 wmb();
792 /*
793 * Mark everything as being pending just to make sure nothing gets
794 * lost. The domain will get a spurious event, but it can cope.
795 */
796 vcpu_info(v, evtchn_upcall_pending) = 1;
797 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
798 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
800 return 0;
801 }
803 long
804 arch_do_vcpu_op(
805 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
806 {
807 long rc = 0;
809 switch ( cmd )
810 {
811 case VCPUOP_register_runstate_memory_area:
812 {
813 struct vcpu_register_runstate_memory_area area;
814 struct vcpu_runstate_info runstate;
816 rc = -EFAULT;
817 if ( copy_from_guest(&area, arg, 1) )
818 break;
820 if ( !guest_handle_okay(area.addr.h, 1) )
821 break;
823 rc = 0;
824 runstate_guest(v) = area.addr.h;
826 if ( v == current )
827 {
828 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
829 }
830 else
831 {
832 vcpu_runstate_get(v, &runstate);
833 __copy_to_guest(runstate_guest(v), &runstate, 1);
834 }
836 break;
837 }
839 case VCPUOP_register_vcpu_info:
840 {
841 struct domain *d = v->domain;
842 struct vcpu_register_vcpu_info info;
844 rc = -EFAULT;
845 if ( copy_from_guest(&info, arg, 1) )
846 break;
848 domain_lock(d);
849 rc = map_vcpu_info(v, info.mfn, info.offset);
850 domain_unlock(d);
852 break;
853 }
855 case VCPUOP_get_physid:
856 {
857 struct vcpu_get_physid cpu_id;
859 rc = -EINVAL;
860 if ( !v->domain->is_pinned )
861 break;
863 cpu_id.phys_id =
864 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
865 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
867 rc = -EFAULT;
868 if ( copy_to_guest(arg, &cpu_id, 1) )
869 break;
871 rc = 0;
872 break;
873 }
875 default:
876 rc = -ENOSYS;
877 break;
878 }
880 return rc;
881 }
883 #ifdef __x86_64__
885 #define loadsegment(seg,value) ({ \
886 int __r = 1; \
887 asm volatile ( \
888 "1: movl %k1,%%" #seg "\n2:\n" \
889 ".section .fixup,\"ax\"\n" \
890 "3: xorl %k0,%k0\n" \
891 " movl %k0,%%" #seg "\n" \
892 " jmp 2b\n" \
893 ".previous\n" \
894 ".section __ex_table,\"a\"\n" \
895 " .align 8\n" \
896 " .quad 1b,3b\n" \
897 ".previous" \
898 : "=r" (__r) : "r" (value), "0" (__r) );\
899 __r; })
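/*
 * loadsegment() evaluates to 1 on success.  If loading the selector faults
 * (e.g. it is no longer valid), the .fixup handler loads a NULL selector
 * instead and the macro evaluates to 0, steering load_segments() onto the
 * failsafe-callback path below.
 */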
901 /*
902 * save_segments() writes a mask of segments which are dirty (non-zero),
903 * allowing load_segments() to avoid some expensive segment loads and
904 * MSR writes.
905 */
906 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
907 #define DIRTY_DS 0x01
908 #define DIRTY_ES 0x02
909 #define DIRTY_FS 0x04
910 #define DIRTY_GS 0x08
911 #define DIRTY_FS_BASE 0x10
912 #define DIRTY_GS_BASE_USER 0x20
914 static void load_segments(struct vcpu *n)
915 {
916 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
917 int all_segs_okay = 1;
918 unsigned int dirty_segment_mask, cpu = smp_processor_id();
920 /* Load and clear the dirty segment mask. */
921 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
922 per_cpu(dirty_segment_mask, cpu) = 0;
924 /* Either selector != 0 ==> reload. */
925 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
926 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
928 /* Either selector != 0 ==> reload. */
929 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
930 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
932 /*
933 * Either selector != 0 ==> reload.
934 * Also reload to reset FS_BASE if it was non-zero.
935 */
936 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
937 nctxt->user_regs.fs) )
938 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
940 /*
941 * Either selector != 0 ==> reload.
942 * Also reload to reset GS_BASE if it was non-zero.
943 */
944 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
945 nctxt->user_regs.gs) )
946 {
947 /* Reset GS_BASE with user %gs? */
948 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
949 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
950 }
952 if ( !is_pv_32on64_domain(n->domain) )
953 {
954 /* This can only be non-zero if selector is NULL. */
955 if ( nctxt->fs_base )
956 wrmsr(MSR_FS_BASE,
957 nctxt->fs_base,
958 nctxt->fs_base>>32);
960 /* Most kernels have non-zero GS base, so don't bother testing. */
961 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
962 wrmsr(MSR_SHADOW_GS_BASE,
963 nctxt->gs_base_kernel,
964 nctxt->gs_base_kernel>>32);
966 /* This can only be non-zero if selector is NULL. */
967 if ( nctxt->gs_base_user )
968 wrmsr(MSR_GS_BASE,
969 nctxt->gs_base_user,
970 nctxt->gs_base_user>>32);
972 /* If in kernel mode then switch the GS bases around. */
973 if ( (n->arch.flags & TF_kernel_mode) )
974 asm volatile ( "swapgs" );
975 }
977 if ( unlikely(!all_segs_okay) )
978 {
979 struct cpu_user_regs *regs = guest_cpu_user_regs();
980 unsigned long *rsp =
981 (n->arch.flags & TF_kernel_mode) ?
982 (unsigned long *)regs->rsp :
983 (unsigned long *)nctxt->kernel_sp;
984 unsigned long cs_and_mask, rflags;
986 if ( is_pv_32on64_domain(n->domain) )
987 {
988 unsigned int *esp = ring_1(regs) ?
989 (unsigned int *)regs->rsp :
990 (unsigned int *)nctxt->kernel_sp;
991 unsigned int cs_and_mask, eflags;
992 int ret = 0;
994 /* CS longword also contains full evtchn_upcall_mask. */
995 cs_and_mask = (unsigned short)regs->cs |
996 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
997 /* Fold upcall mask into RFLAGS.IF. */
998 eflags = regs->_eflags & ~X86_EFLAGS_IF;
999 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1001 if ( !ring_1(regs) )
1002 {
1003 ret = put_user(regs->ss, esp-1);
1004 ret |= put_user(regs->_esp, esp-2);
1005 esp -= 2;
1006 }
1008 if ( ret |
1009 put_user(eflags, esp-1) |
1010 put_user(cs_and_mask, esp-2) |
1011 put_user(regs->_eip, esp-3) |
1012 put_user(nctxt->user_regs.gs, esp-4) |
1013 put_user(nctxt->user_regs.fs, esp-5) |
1014 put_user(nctxt->user_regs.es, esp-6) |
1015 put_user(nctxt->user_regs.ds, esp-7) )
1016 {
1017 gdprintk(XENLOG_ERR, "Error while creating compat "
1018 "failsafe callback frame.\n");
1019 domain_crash(n->domain);
1020 }
1022 if ( test_bit(_VGCF_failsafe_disables_events,
1023 &n->arch.guest_context.flags) )
1024 vcpu_info(n, evtchn_upcall_mask) = 1;
1026 regs->entry_vector = TRAP_syscall;
1027 regs->_eflags &= 0xFFFCBEFFUL;
1028 regs->ss = FLAT_COMPAT_KERNEL_SS;
1029 regs->_esp = (unsigned long)(esp-7);
1030 regs->cs = FLAT_COMPAT_KERNEL_CS;
1031 regs->_eip = nctxt->failsafe_callback_eip;
1032 return;
1033 }
1035 if ( !(n->arch.flags & TF_kernel_mode) )
1036 toggle_guest_mode(n);
1037 else
1038 regs->cs &= ~3;
1040 /* CS longword also contains full evtchn_upcall_mask. */
1041 cs_and_mask = (unsigned long)regs->cs |
1042 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1044 /* Fold upcall mask into RFLAGS.IF. */
1045 rflags = regs->rflags & ~X86_EFLAGS_IF;
1046 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1048 if ( put_user(regs->ss, rsp- 1) |
1049 put_user(regs->rsp, rsp- 2) |
1050 put_user(rflags, rsp- 3) |
1051 put_user(cs_and_mask, rsp- 4) |
1052 put_user(regs->rip, rsp- 5) |
1053 put_user(nctxt->user_regs.gs, rsp- 6) |
1054 put_user(nctxt->user_regs.fs, rsp- 7) |
1055 put_user(nctxt->user_regs.es, rsp- 8) |
1056 put_user(nctxt->user_regs.ds, rsp- 9) |
1057 put_user(regs->r11, rsp-10) |
1058 put_user(regs->rcx, rsp-11) )
1059 {
1060 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1061 "callback frame.\n");
1062 domain_crash(n->domain);
1063 }
1065 if ( test_bit(_VGCF_failsafe_disables_events,
1066 &n->arch.guest_context.flags) )
1067 vcpu_info(n, evtchn_upcall_mask) = 1;
1069 regs->entry_vector = TRAP_syscall;
1070 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1071 X86_EFLAGS_NT|X86_EFLAGS_TF);
1072 regs->ss = FLAT_KERNEL_SS;
1073 regs->rsp = (unsigned long)(rsp-11);
1074 regs->cs = FLAT_KERNEL_CS;
1075 regs->rip = nctxt->failsafe_callback_eip;
1076 }
1077 }
1079 static void save_segments(struct vcpu *v)
1080 {
1081 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1082 struct cpu_user_regs *regs = &ctxt->user_regs;
1083 unsigned int dirty_segment_mask = 0;
1085 regs->ds = read_segment_register(ds);
1086 regs->es = read_segment_register(es);
1087 regs->fs = read_segment_register(fs);
1088 regs->gs = read_segment_register(gs);
1090 if ( regs->ds )
1091 dirty_segment_mask |= DIRTY_DS;
1093 if ( regs->es )
1094 dirty_segment_mask |= DIRTY_ES;
1096 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1097 {
1098 dirty_segment_mask |= DIRTY_FS;
1099 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1100 }
1101 else if ( ctxt->fs_base )
1102 {
1103 dirty_segment_mask |= DIRTY_FS_BASE;
1104 }
1106 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1107 {
1108 dirty_segment_mask |= DIRTY_GS;
1109 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1110 }
1111 else if ( ctxt->gs_base_user )
1112 {
1113 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1114 }
1116 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1117 }
1119 #define switch_kernel_stack(v) ((void)0)
1121 #elif defined(__i386__)
1123 #define load_segments(n) ((void)0)
1124 #define save_segments(p) ((void)0)
1126 static inline void switch_kernel_stack(struct vcpu *v)
1127 {
1128 struct tss_struct *tss = &init_tss[smp_processor_id()];
1129 tss->esp1 = v->arch.guest_context.kernel_sp;
1130 tss->ss1 = v->arch.guest_context.kernel_ss;
1131 }
1133 #endif /* __i386__ */
1135 static void paravirt_ctxt_switch_from(struct vcpu *v)
1136 {
1137 save_segments(v);
1139 /*
1140 * Disable debug breakpoints. We do this aggressively because if we switch
1141 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1142 * inside Xen, before we get a chance to reload DR7, and this cannot always
1143 * safely be handled.
1144 */
1145 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1146 write_debugreg(7, 0);
1147 }
1149 static void paravirt_ctxt_switch_to(struct vcpu *v)
1150 {
1151 unsigned long cr4;
1153 set_int80_direct_trap(v);
1154 switch_kernel_stack(v);
1156 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1157 if ( unlikely(cr4 != read_cr4()) )
1158 write_cr4(cr4);
1160 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1161 {
1162 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1163 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1164 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1165 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1166 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1167 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1168 }
1169 }
1171 static inline int need_full_gdt(struct vcpu *v)
1172 {
1173 return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
1174 }
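/*
 * The real (non-lazy) context switch: save the outgoing vcpu's register
 * frame, FPU and segment state, load the incoming vcpu's state, update the
 * dirty cpumasks, and switch GDT and page tables as needed.
 */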
1176 static void __context_switch(void)
1178 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1179 unsigned int cpu = smp_processor_id();
1180 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1181 struct vcpu *n = current;
1182 struct desc_struct *gdt;
1183 struct desc_ptr gdt_desc;
1185 ASSERT(p != n);
1186 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1188 if ( !is_idle_vcpu(p) )
1190 memcpy(&p->arch.guest_context.user_regs,
1191 stack_regs,
1192 CTXT_SWITCH_STACK_BYTES);
1193 unlazy_fpu(p);
1194 p->arch.ctxt_switch_from(p);
1197 if ( !is_idle_vcpu(n) )
1199 memcpy(stack_regs,
1200 &n->arch.guest_context.user_regs,
1201 CTXT_SWITCH_STACK_BYTES);
1202 n->arch.ctxt_switch_to(n);
1205 if ( p->domain != n->domain )
1206 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1207 cpu_set(cpu, n->vcpu_dirty_cpumask);
1209 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1210 per_cpu(compat_gdt_table, cpu);
1211 if ( need_full_gdt(n) )
1213 struct page_info *page = virt_to_page(gdt);
1214 unsigned int i;
1215 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1216 l1e_write(n->domain->arch.mm_perdomain_pt +
1217 (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
1218 FIRST_RESERVED_GDT_PAGE + i,
1219 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1222 if ( need_full_gdt(p) &&
1223 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
1225 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1226 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1227 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1230 write_ptbase(n);
1232 if ( need_full_gdt(n) &&
1233 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
1235 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1236 gdt_desc.base = GDT_VIRT_START(n);
1237 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1240 if ( p->domain != n->domain )
1241 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1242 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1244 per_cpu(curr_vcpu, cpu) = n;
1248 void context_switch(struct vcpu *prev, struct vcpu *next)
1250 unsigned int cpu = smp_processor_id();
1251 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1253 ASSERT(local_irq_is_enabled());
1255 /* Allow at most one CPU at a time to be dirty. */
1256 ASSERT(cpus_weight(dirty_mask) <= 1);
1257 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1259 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1260 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1261 flush_tlb_mask(next->vcpu_dirty_cpumask);
1264 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1265 pt_save_timer(prev);
1267 local_irq_disable();
1269 set_current(next);
1271 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1273 local_irq_enable();
1275 else
1277 __context_switch();
1279 #ifdef CONFIG_COMPAT
1280 if ( !is_hvm_vcpu(next) &&
1281 (is_idle_vcpu(prev) ||
1282 is_hvm_vcpu(prev) ||
1283 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1285 uint64_t efer = read_efer();
1286 if ( !(efer & EFER_SCE) )
1287 write_efer(efer | EFER_SCE);
1289 #endif
1291 /* Re-enable interrupts before restoring state which may fault. */
1292 local_irq_enable();
1294 if ( !is_hvm_vcpu(next) )
1296 load_LDT(next);
1297 load_segments(next);
1301 context_saved(prev);
1303 /* Update per-VCPU guest runstate shared memory area (if registered). */
1304 if ( !guest_handle_is_null(runstate_guest(next)) )
1306 if ( !is_pv_32on64_domain(next->domain) )
1307 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1308 #ifdef CONFIG_COMPAT
1309 else
1311 struct compat_vcpu_runstate_info info;
1313 XLAT_vcpu_runstate_info(&info, &next->runstate);
1314 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1316 #endif
1319 schedule_tail(next);
1320 BUG();
1323 void continue_running(struct vcpu *same)
1324 {
1325 schedule_tail(same);
1326 BUG();
1327 }
1329 int __sync_lazy_execstate(void)
1330 {
1331 unsigned long flags;
1332 int switch_required;
1334 local_irq_save(flags);
1336 switch_required = (this_cpu(curr_vcpu) != current);
1338 if ( switch_required )
1339 {
1340 ASSERT(current == idle_vcpu[smp_processor_id()]);
1341 __context_switch();
1342 }
1344 local_irq_restore(flags);
1346 return switch_required;
1347 }
1349 void sync_vcpu_execstate(struct vcpu *v)
1350 {
1351 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1352 (void)__sync_lazy_execstate();
1354 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1355 flush_tlb_mask(v->vcpu_dirty_cpumask);
1356 }
1358 struct migrate_info {
1359 long (*func)(void *data);
1360 void *data;
1361 void (*saved_schedule_tail)(struct vcpu *);
1362 cpumask_t saved_affinity;
1363 unsigned int nest;
1364 };
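/*
 * Book-keeping for continue_hypercall_on_cpu(): the deferred hypercall
 * body, the vcpu's saved affinity and schedule_tail hook, and a nesting
 * count so a continued hypercall may itself be continued on another CPU.
 */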
1366 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1367 {
1368 struct cpu_user_regs *regs = guest_cpu_user_regs();
1369 struct migrate_info *info = v->arch.continue_info;
1370 cpumask_t mask = info->saved_affinity;
1371 void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
1373 regs->eax = info->func(info->data);
1375 if ( info->nest-- == 0 )
1376 {
1377 xfree(info);
1378 v->arch.schedule_tail = saved_schedule_tail;
1379 v->arch.continue_info = NULL;
1380 vcpu_unlock_affinity(v, &mask);
1381 }
1383 (*saved_schedule_tail)(v);
1384 }
1386 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1388 struct vcpu *v = current;
1389 struct migrate_info *info;
1390 cpumask_t mask = cpumask_of_cpu(cpu);
1391 int rc;
1393 if ( cpu == smp_processor_id() )
1394 return func(data);
1396 info = v->arch.continue_info;
1397 if ( info == NULL )
1399 info = xmalloc(struct migrate_info);
1400 if ( info == NULL )
1401 return -ENOMEM;
1403 rc = vcpu_lock_affinity(v, &mask);
1404 if ( rc )
1406 xfree(info);
1407 return rc;
1410 info->saved_schedule_tail = v->arch.schedule_tail;
1411 info->saved_affinity = mask;
1412 info->nest = 0;
1414 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1415 v->arch.continue_info = info;
1417 else
1419 BUG_ON(info->nest != 0);
1420 rc = vcpu_locked_change_affinity(v, &mask);
1421 if ( rc )
1422 return rc;
1423 info->nest++;
1426 info->func = func;
1427 info->data = data;
1429 /* Dummy return value will be overwritten by new schedule_tail. */
1430 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1431 return 0;
1434 #define next_arg(fmt, args) ({ \
1435 unsigned long __arg; \
1436 switch ( *(fmt)++ ) \
1437 { \
1438 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1439 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1440 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1441 default: __arg = 0; BUG(); \
1442 } \
1443 __arg; \
1444 })
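/*
 * Continuation format characters accepted by next_arg(): 'i' for an
 * unsigned int argument, 'l' for an unsigned long, 'h' for a guest handle
 * (passed as a void pointer).
 */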
1446 DEFINE_PER_CPU(char, hc_preempted);
1448 unsigned long hypercall_create_continuation(
1449 unsigned int op, const char *format, ...)
1451 struct mc_state *mcs = &this_cpu(mc_state);
1452 struct cpu_user_regs *regs;
1453 const char *p = format;
1454 unsigned long arg;
1455 unsigned int i;
1456 va_list args;
1458 va_start(args, format);
1460 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1462 __set_bit(_MCSF_call_preempted, &mcs->flags);
1464 for ( i = 0; *p != '\0'; i++ )
1465 mcs->call.args[i] = next_arg(p, args);
1466 if ( is_pv_32on64_domain(current->domain) )
1468 for ( ; i < 6; i++ )
1469 mcs->call.args[i] = 0;
1472 else
1474 regs = guest_cpu_user_regs();
1475 regs->eax = op;
1476 /*
1477 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1478 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1479 */
1480 if ( !is_hvm_vcpu(current) )
1481 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1483 #ifdef __x86_64__
1484 if ( !is_hvm_vcpu(current) ?
1485 !is_pv_32on64_vcpu(current) :
1486 (hvm_guest_x86_mode(current) == 8) )
1488 for ( i = 0; *p != '\0'; i++ )
1490 arg = next_arg(p, args);
1491 switch ( i )
1493 case 0: regs->rdi = arg; break;
1494 case 1: regs->rsi = arg; break;
1495 case 2: regs->rdx = arg; break;
1496 case 3: regs->r10 = arg; break;
1497 case 4: regs->r8 = arg; break;
1498 case 5: regs->r9 = arg; break;
1502 else
1503 #endif
1505 if ( supervisor_mode_kernel )
1506 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1508 for ( i = 0; *p != '\0'; i++ )
1510 arg = next_arg(p, args);
1511 switch ( i )
1513 case 0: regs->ebx = arg; break;
1514 case 1: regs->ecx = arg; break;
1515 case 2: regs->edx = arg; break;
1516 case 3: regs->esi = arg; break;
1517 case 4: regs->edi = arg; break;
1518 case 5: regs->ebp = arg; break;
1523 this_cpu(hc_preempted) = 1;
1526 va_end(args);
1528 return op;
1531 #ifdef CONFIG_COMPAT
1532 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1534 int rc = 0;
1535 struct mc_state *mcs = &this_cpu(mc_state);
1536 struct cpu_user_regs *regs;
1537 unsigned int i, cval = 0;
1538 unsigned long nval = 0;
1539 va_list args;
1541 BUG_ON(*id > 5);
1542 BUG_ON(mask & (1U << *id));
1544 va_start(args, mask);
1546 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1548 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1549 return 0;
1550 for ( i = 0; i < 6; ++i, mask >>= 1 )
1552 if ( mask & 1 )
1554 nval = va_arg(args, unsigned long);
1555 cval = va_arg(args, unsigned int);
1556 if ( cval == nval )
1557 mask &= ~1U;
1558 else
1559 BUG_ON(nval == (unsigned int)nval);
1561 else if ( id && *id == i )
1563 *id = mcs->call.args[i];
1564 id = NULL;
1566 if ( (mask & 1) && mcs->call.args[i] == nval )
1568 mcs->call.args[i] = cval;
1569 ++rc;
1571 else
1572 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1575 else
1577 regs = guest_cpu_user_regs();
1578 for ( i = 0; i < 6; ++i, mask >>= 1 )
1580 unsigned long *reg;
1582 switch ( i )
1584 case 0: reg = &regs->ebx; break;
1585 case 1: reg = &regs->ecx; break;
1586 case 2: reg = &regs->edx; break;
1587 case 3: reg = &regs->esi; break;
1588 case 4: reg = &regs->edi; break;
1589 case 5: reg = &regs->ebp; break;
1590 default: BUG(); reg = NULL; break;
1592 if ( (mask & 1) )
1594 nval = va_arg(args, unsigned long);
1595 cval = va_arg(args, unsigned int);
1596 if ( cval == nval )
1597 mask &= ~1U;
1598 else
1599 BUG_ON(nval == (unsigned int)nval);
1601 else if ( id && *id == i )
1603 *id = *reg;
1604 id = NULL;
1606 if ( (mask & 1) && *reg == nval )
1608 *reg = cval;
1609 ++rc;
1611 else
1612 BUG_ON(*reg != (unsigned int)*reg);
1616 va_end(args);
1618 return rc;
1620 #endif
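/*
 * Drop this domain's references to every page on the given list, forcibly
 * invalidating still-validated top-level pagetables of the given type so
 * that circular 'linear pagetable' references cannot pin pages.  Returns
 * -EAGAIN/-EINTR if the operation should be continued later.
 */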
1622 static int relinquish_memory(
1623 struct domain *d, struct list_head *list, unsigned long type)
1625 struct list_head *ent;
1626 struct page_info *page;
1627 unsigned long x, y;
1628 int ret = 0;
1630 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1631 spin_lock_recursive(&d->page_alloc_lock);
1633 ent = list->next;
1634 while ( ent != list )
1636 page = list_entry(ent, struct page_info, list);
1638 /* Grab a reference to the page so it won't disappear from under us. */
1639 if ( unlikely(!get_page(page, d)) )
1641 /* Couldn't get a reference -- someone is freeing this page. */
1642 ent = ent->next;
1643 list_move_tail(&page->list, &d->arch.relmem_list);
1644 continue;
1647 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1648 ret = put_page_and_type_preemptible(page, 1);
1649 switch ( ret )
1651 case 0:
1652 break;
1653 case -EAGAIN:
1654 case -EINTR:
1655 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1656 put_page(page);
1657 goto out;
1658 default:
1659 BUG();
1662 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1663 put_page(page);
1665 /*
1666 * Forcibly invalidate top-most, still valid page tables at this point
1667 * to break circular 'linear page table' references as well as clean up
1668 * partially validated pages. This is okay because MMU structures are
1669 * not shared across domains and this domain is now dead. Thus top-most
1670 * valid tables are not in use so a non-zero count means circular
1671 * reference or partially validated.
1672 */
1673 y = page->u.inuse.type_info;
1674 for ( ; ; )
1676 x = y;
1677 if ( likely((x & PGT_type_mask) != type) ||
1678 likely(!(x & (PGT_validated|PGT_partial))) )
1679 break;
1681 y = cmpxchg(&page->u.inuse.type_info, x,
1682 x & ~(PGT_validated|PGT_partial));
1683 if ( likely(y == x) )
1685 /* No need for atomic update of type_info here: noone else updates it. */
1686 switch ( ret = free_page_type(page, x, 1) )
1688 case 0:
1689 break;
1690 case -EINTR:
1691 page->u.inuse.type_info |= PGT_validated;
1692 if ( x & PGT_partial )
1693 put_page(page);
1694 put_page(page);
1695 ret = -EAGAIN;
1696 goto out;
1697 case -EAGAIN:
1698 page->u.inuse.type_info |= PGT_partial;
1699 if ( x & PGT_partial )
1700 put_page(page);
1701 goto out;
1702 default:
1703 BUG();
1705 if ( x & PGT_partial )
1707 page->u.inuse.type_info--;
1708 put_page(page);
1710 break;
1714 /* Follow the list chain and /then/ potentially free the page. */
1715 ent = ent->next;
1716 list_move_tail(&page->list, &d->arch.relmem_list);
1717 put_page(page);
1719 if ( hypercall_preempt_check() )
1721 ret = -EAGAIN;
1722 goto out;
1726 list_splice_init(&d->arch.relmem_list, list);
1728 out:
1729 spin_unlock_recursive(&d->page_alloc_lock);
1730 return ret;
1733 static void vcpu_destroy_pagetables(struct vcpu *v)
1734 {
1735 struct domain *d = v->domain;
1736 unsigned long pfn;
1738 #ifdef __x86_64__
1739 if ( is_pv_32on64_vcpu(v) )
1740 {
1741 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1742 __va(pagetable_get_paddr(v->arch.guest_table)));
1744 if ( pfn != 0 )
1745 {
1746 if ( paging_mode_refcounts(d) )
1747 put_page(mfn_to_page(pfn));
1748 else
1749 put_page_and_type(mfn_to_page(pfn));
1750 }
1752 l4e_write(
1753 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1754 l4e_empty());
1756 v->arch.cr3 = 0;
1757 return;
1758 }
1759 #endif
1761 pfn = pagetable_get_pfn(v->arch.guest_table);
1762 if ( pfn != 0 )
1763 {
1764 if ( paging_mode_refcounts(d) )
1765 put_page(mfn_to_page(pfn));
1766 else
1767 put_page_and_type(mfn_to_page(pfn));
1768 v->arch.guest_table = pagetable_null();
1769 }
1771 #ifdef __x86_64__
1772 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1773 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1774 if ( pfn != 0 )
1775 {
1776 if ( !is_pv_32bit_vcpu(v) )
1777 {
1778 if ( paging_mode_refcounts(d) )
1779 put_page(mfn_to_page(pfn));
1780 else
1781 put_page_and_type(mfn_to_page(pfn));
1782 }
1783 v->arch.guest_table_user = pagetable_null();
1784 }
1785 #endif
1787 v->arch.cr3 = 0;
1788 }
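/*
 * Teardown state machine driven by d->arch.relmem: tear down paging
 * assistance, drop pagetable/GDT/vcpu_info references, then relinquish
 * Xen-heap pages and guest pages level by level, resuming where it left
 * off whenever relinquish_memory() asks to be continued.
 */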
1790 int domain_relinquish_resources(struct domain *d)
1792 int ret;
1793 struct vcpu *v;
1795 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1797 switch ( d->arch.relmem )
1799 case RELMEM_not_started:
1800 /* Tear down paging-assistance stuff. */
1801 paging_teardown(d);
1803 for_each_vcpu ( d, v )
1805 /* Drop the in-use references to page-table bases. */
1806 vcpu_destroy_pagetables(v);
1808 /*
1809 * Relinquish GDT mappings. No need for explicit unmapping of the
1810 * LDT as it automatically gets squashed with the guest mappings.
1811 */
1812 destroy_gdt(v);
1814 unmap_vcpu_info(v);
1817 d->arch.relmem = RELMEM_xen;
1818 /* fallthrough */
1820 /* Relinquish every page of memory. */
1821 case RELMEM_xen:
1822 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1823 if ( ret )
1824 return ret;
1825 #if CONFIG_PAGING_LEVELS >= 4
1826 d->arch.relmem = RELMEM_l4;
1827 /* fallthrough */
1829 case RELMEM_l4:
1830 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1831 if ( ret )
1832 return ret;
1833 #endif
1834 #if CONFIG_PAGING_LEVELS >= 3
1835 d->arch.relmem = RELMEM_l3;
1836 /* fallthrough */
1838 case RELMEM_l3:
1839 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1840 if ( ret )
1841 return ret;
1842 #endif
1843 d->arch.relmem = RELMEM_l2;
1844 /* fallthrough */
1846 case RELMEM_l2:
1847 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1848 if ( ret )
1849 return ret;
1850 d->arch.relmem = RELMEM_done;
1851 /* fallthrough */
1853 case RELMEM_done:
1854 break;
1856 default:
1857 BUG();
1860 /* Free page used by xen oprofile buffer. */
1861 free_xenoprof_pages(d);
1863 if ( is_hvm_domain(d) )
1864 hvm_domain_relinquish_resources(d);
1866 return 0;
1869 void arch_dump_domain_info(struct domain *d)
1870 {
1871 paging_dump_domain_info(d);
1872 }
1874 void arch_dump_vcpu_info(struct vcpu *v)
1875 {
1876 paging_dump_vcpu_info(v);
1877 }
1879 void domain_cpuid(
1880 struct domain *d,
1881 unsigned int input,
1882 unsigned int sub_input,
1883 unsigned int *eax,
1884 unsigned int *ebx,
1885 unsigned int *ecx,
1886 unsigned int *edx)
1887 {
1888 cpuid_input_t *cpuid;
1889 int i;
1891 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
1892 {
1893 cpuid = &d->arch.cpuids[i];
1895 if ( (cpuid->input[0] == input) &&
1896 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
1897 (cpuid->input[1] == sub_input)) )
1898 {
1899 *eax = cpuid->eax;
1900 *ebx = cpuid->ebx;
1901 *ecx = cpuid->ecx;
1902 *edx = cpuid->edx;
1903 return;
1904 }
1905 }
1907 *eax = *ebx = *ecx = *edx = 0;
1908 }
1910 void vcpu_kick(struct vcpu *v)
1911 {
1912 /*
1913 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
1914 * pending flag. These values may fluctuate (after all, we hold no
1915 * locks) but the key insight is that each change will cause
1916 * evtchn_upcall_pending to be polled.
1917 *
1918 * NB2. We save the running flag across the unblock to avoid a needless
1919 * IPI for domains that we IPI'd to unblock.
1920 */
1921 bool_t running = v->is_running;
1922 vcpu_unblock(v);
1923 if ( running && (in_irq() || (v != current)) )
1924 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
1925 }
1927 void vcpu_mark_events_pending(struct vcpu *v)
1928 {
1929 int already_pending = test_and_set_bit(
1930 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
1932 if ( already_pending )
1933 return;
1935 if ( is_hvm_vcpu(v) )
1936 hvm_assert_evtchn_irq(v);
1937 else
1938 vcpu_kick(v);
1939 }
1941 static void vcpu_kick_softirq(void)
1942 {
1943 /*
1944 * Nothing to do here: we merely prevent notifiers from racing with checks
1945 * executed on return to guest context with interrupts enabled. See, for
1946 * example, xxx_intr_assist() executed on return to HVM guest context.
1947 */
1948 }
1950 static int __init init_vcpu_kick_softirq(void)
1951 {
1952 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
1953 return 0;
1954 }
1955 __initcall(init_vcpu_kick_softirq);
1958 /*
1959 * Local variables:
1960 * mode: C
1961 * c-set-style: "BSD"
1962 * c-basic-offset: 4
1963 * tab-width: 4
1964 * indent-tabs-mode: nil
1965 * End:
1966 */