ia64/xen-unstable

xen/arch/x86/domain.c @ 18432:1e98ea5c8604

x86: Fix guest_handle_okay/guest_handle_subrange_okay

The guest handle checks should use paging_* predicates, not shadow_*.
Also tidy up a few places where p2m definitions were being imported
via asm/guest_access.h -> asm/shadow.h -> asm/p2m.h

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Sep 03 14:16:35 2008 +0100 (2008-09-03)
parents 021189f8cd78
children 34aed15ba9df
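
The macros named in the commit message are the guest-handle validation checks in xen/include/asm-x86/guest_access.h. As a rough sketch of the kind of change described (illustrative only, not the verbatim patch; the exact definitions in that header may differ in detail), the fix swaps the shadow_* predicate for its paging_* equivalent, so that guests running with external (HVM) paging are still exempted from the range check:

    /* Sketch, assuming the definitions in asm-x86/guest_access.h. */
    #include <asm/paging.h>   /* previously pulled in via asm/shadow.h */

    /* Handle is usable if the guest has external (HVM) paging, or the
     * whole array lies within guest-accessible virtual address space. */
    #define guest_handle_okay(hnd, nr)                      \
        (paging_mode_external(current->domain) ||           \
         array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))

    /* Same check, restricted to elements [first, last] of the array. */
    #define guest_handle_subrange_okay(hnd, first, last)    \
        (paging_mode_external(current->domain) ||           \
         array_access_ok((hnd).p + (first),                 \
                         (last) - (first) + 1,              \
                         sizeof(*(hnd).p)))
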
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <asm/regs.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/system.h>
38 #include <asm/io.h>
39 #include <asm/processor.h>
40 #include <asm/desc.h>
41 #include <asm/i387.h>
42 #include <asm/mpspec.h>
43 #include <asm/ldt.h>
44 #include <asm/hypercall.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/support.h>
47 #include <asm/debugreg.h>
48 #include <asm/msr.h>
49 #include <asm/nmi.h>
50 #include <xen/numa.h>
51 #include <xen/iommu.h>
52 #ifdef CONFIG_COMPAT
53 #include <compat/vcpu.h>
54 #endif
56 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
57 DEFINE_PER_CPU(u64, efer);
58 DEFINE_PER_CPU(unsigned long, cr4);
60 static void default_idle(void);
61 void (*pm_idle) (void) = default_idle;
63 static void paravirt_ctxt_switch_from(struct vcpu *v);
64 static void paravirt_ctxt_switch_to(struct vcpu *v);
66 static void vcpu_destroy_pagetables(struct vcpu *v);
68 static void continue_idle_domain(struct vcpu *v)
69 {
70 reset_stack_and_jump(idle_loop);
71 }
73 static void continue_nonidle_domain(struct vcpu *v)
74 {
75 reset_stack_and_jump(ret_from_intr);
76 }
78 static void default_idle(void)
79 {
80 local_irq_disable();
81 if ( !softirq_pending(smp_processor_id()) )
82 safe_halt();
83 else
84 local_irq_enable();
85 }
87 static void play_dead(void)
88 {
89 /* This must be done before dead CPU ack */
90 cpu_exit_clear();
91 hvm_cpu_down();
92 wbinvd();
93 mb();
94 /* Ack it */
95 __get_cpu_var(cpu_state) = CPU_DEAD;
97 /* With physical CPU hotplug, we should halt the cpu. */
98 local_irq_disable();
99 for ( ; ; )
100 halt();
101 }
103 void idle_loop(void)
104 {
105 for ( ; ; )
106 {
107 if ( cpu_is_offline(smp_processor_id()) )
108 play_dead();
109 page_scrub_schedule_work();
110 (*pm_idle)();
111 do_softirq();
112 }
113 }
115 void startup_cpu_idle_loop(void)
116 {
117 struct vcpu *v = current;
119 ASSERT(is_idle_vcpu(v));
120 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
121 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
123 reset_stack_and_jump(idle_loop);
124 }
126 void dump_pageframe_info(struct domain *d)
127 {
128 struct page_info *page;
130 printk("Memory pages belonging to domain %u:\n", d->domain_id);
132 if ( d->tot_pages >= 10 )
133 {
134 printk(" DomPage list too long to display\n");
135 }
136 else
137 {
138 list_for_each_entry ( page, &d->page_list, list )
139 {
140 printk(" DomPage %p: caf=%08x, taf=%" PRtype_info "\n",
141 _p(page_to_mfn(page)),
142 page->count_info, page->u.inuse.type_info);
143 }
144 }
146 list_for_each_entry ( page, &d->xenpage_list, list )
147 {
148 printk(" XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
149 _p(page_to_mfn(page)),
150 page->count_info, page->u.inuse.type_info);
151 }
152 }
154 struct vcpu *alloc_vcpu_struct(void)
155 {
156 struct vcpu *v;
157 if ( (v = xmalloc(struct vcpu)) != NULL )
158 memset(v, 0, sizeof(*v));
159 return v;
160 }
162 void free_vcpu_struct(struct vcpu *v)
163 {
164 xfree(v);
165 }
167 #ifdef CONFIG_COMPAT
169 static int setup_compat_l4(struct vcpu *v)
170 {
171 struct page_info *pg = alloc_domheap_page(NULL, 0);
172 l4_pgentry_t *l4tab;
174 if ( pg == NULL )
175 return -ENOMEM;
177 /* This page needs to look like a pagetable so that it can be shadowed */
178 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
180 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
181 l4tab[0] = l4e_empty();
182 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
183 l4e_from_page(pg, __PAGE_HYPERVISOR);
184 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
185 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
186 __PAGE_HYPERVISOR);
188 v->arch.guest_table = pagetable_from_page(pg);
189 v->arch.guest_table_user = v->arch.guest_table;
191 return 0;
192 }
194 static void release_compat_l4(struct vcpu *v)
195 {
196 free_domheap_page(pagetable_get_page(v->arch.guest_table));
197 v->arch.guest_table = pagetable_null();
198 v->arch.guest_table_user = pagetable_null();
199 }
201 static inline int may_switch_mode(struct domain *d)
202 {
203 return (!is_hvm_domain(d) && (d->tot_pages == 0));
204 }
206 int switch_native(struct domain *d)
207 {
208 l1_pgentry_t gdt_l1e;
209 unsigned int vcpuid;
211 if ( d == NULL )
212 return -EINVAL;
213 if ( !may_switch_mode(d) )
214 return -EACCES;
215 if ( !is_pv_32on64_domain(d) )
216 return 0;
218 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
220 /* switch gdt */
221 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
222 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
223 {
224 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
225 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
226 if (d->vcpu[vcpuid])
227 release_compat_l4(d->vcpu[vcpuid]);
228 }
230 return 0;
231 }
233 int switch_compat(struct domain *d)
234 {
235 l1_pgentry_t gdt_l1e;
236 unsigned int vcpuid;
238 if ( d == NULL )
239 return -EINVAL;
240 if ( !may_switch_mode(d) )
241 return -EACCES;
242 if ( is_pv_32on64_domain(d) )
243 return 0;
245 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
247 /* switch gdt */
248 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
249 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
250 {
251 if ( (d->vcpu[vcpuid] != NULL) &&
252 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
253 goto undo_and_fail;
254 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
255 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
256 }
258 domain_set_alloc_bitsize(d);
260 return 0;
262 undo_and_fail:
263 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
264 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
265 while ( vcpuid-- != 0 )
266 {
267 if ( d->vcpu[vcpuid] != NULL )
268 release_compat_l4(d->vcpu[vcpuid]);
269 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
270 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
271 }
272 return -ENOMEM;
273 }
275 #else
276 #define setup_compat_l4(v) 0
277 #define release_compat_l4(v) ((void)0)
278 #endif
280 int vcpu_initialise(struct vcpu *v)
281 {
282 struct domain *d = v->domain;
283 int rc;
285 v->arch.vcpu_info_mfn = INVALID_MFN;
287 v->arch.flags = TF_kernel_mode;
289 #if defined(__i386__)
290 mapcache_vcpu_init(v);
291 #endif
293 pae_l3_cache_init(&v->arch.pae_l3_cache);
295 paging_vcpu_init(v);
297 if ( is_hvm_domain(d) )
298 {
299 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
300 return rc;
301 }
302 else
303 {
304 /* PV guests by default have a 100Hz ticker. */
305 if ( !is_idle_domain(d) )
306 v->periodic_period = MILLISECS(10);
308 /* PV guests get an emulated PIT too for video BIOSes to use. */
309 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
310 pit_init(v, cpu_khz);
312 v->arch.schedule_tail = continue_nonidle_domain;
313 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
314 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
316 if ( is_idle_domain(d) )
317 {
318 v->arch.schedule_tail = continue_idle_domain;
319 v->arch.cr3 = __pa(idle_pg_table);
320 }
322 v->arch.guest_context.ctrlreg[4] =
323 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
324 }
326 v->arch.perdomain_ptes =
327 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
329 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
330 }
332 void vcpu_destroy(struct vcpu *v)
333 {
334 if ( is_pv_32on64_vcpu(v) )
335 release_compat_l4(v);
337 if ( is_hvm_vcpu(v) )
338 hvm_vcpu_destroy(v);
339 }
341 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
342 {
343 #ifdef __x86_64__
344 struct page_info *pg;
345 #endif
346 l1_pgentry_t gdt_l1e;
347 int i, vcpuid, pdpt_order, paging_initialised = 0;
348 int rc = -ENOMEM;
350 d->arch.hvm_domain.hap_enabled =
351 is_hvm_domain(d) &&
352 hvm_funcs.hap_supported &&
353 (domcr_flags & DOMCRF_hap);
355 INIT_LIST_HEAD(&d->arch.pdev_list);
357 d->arch.relmem = RELMEM_not_started;
358 INIT_LIST_HEAD(&d->arch.relmem_list);
360 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
361 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
362 if ( d->arch.mm_perdomain_pt == NULL )
363 goto fail;
364 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
366 /*
367 * Map Xen segments into every VCPU's GDT, irrespective of whether every
368 * VCPU will actually be used. This avoids an NMI race during context
369 * switch: if we take an interrupt after switching CR3 but before switching
370 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
371 * try to load CS from an invalid table.
372 */
373 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
374 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
375 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
376 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
378 #if defined(__i386__)
380 mapcache_domain_init(d);
382 #else /* __x86_64__ */
384 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
385 if ( pg == NULL )
386 goto fail;
387 d->arch.mm_perdomain_l2 = page_to_virt(pg);
388 clear_page(d->arch.mm_perdomain_l2);
389 for ( i = 0; i < (1 << pdpt_order); i++ )
390 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
391 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
392 __PAGE_HYPERVISOR);
394 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
395 if ( pg == NULL )
396 goto fail;
397 d->arch.mm_perdomain_l3 = page_to_virt(pg);
398 clear_page(d->arch.mm_perdomain_l3);
399 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
400 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
401 __PAGE_HYPERVISOR);
403 #endif /* __x86_64__ */
405 #ifdef CONFIG_COMPAT
406 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
407 #endif
409 if ( (rc = paging_domain_init(d)) != 0 )
410 goto fail;
411 paging_initialised = 1;
413 if ( !is_idle_domain(d) )
414 {
415 d->arch.ioport_caps =
416 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
417 rc = -ENOMEM;
418 if ( d->arch.ioport_caps == NULL )
419 goto fail;
421 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
422 goto fail;
424 clear_page(d->shared_info);
425 share_xen_page_with_guest(
426 virt_to_page(d->shared_info), d, XENSHARE_writable);
428 if ( (rc = iommu_domain_init(d)) != 0 )
429 goto fail;
430 }
432 spin_lock_init(&d->arch.irq_lock);
434 if ( is_hvm_domain(d) )
435 {
436 if ( (rc = hvm_domain_initialise(d)) != 0 )
437 {
438 iommu_domain_destroy(d);
439 goto fail;
440 }
441 }
442 else
443 {
444 /* 32-bit PV guest by default only if Xen is not 64-bit. */
445 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
446 (CONFIG_PAGING_LEVELS != 4);
447 }
449 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
450 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
451 {
452 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
453 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
454 }
456 return 0;
458 fail:
459 d->is_dying = DOMDYING_dead;
460 free_xenheap_page(d->shared_info);
461 if ( paging_initialised )
462 paging_final_teardown(d);
463 #ifdef __x86_64__
464 if ( d->arch.mm_perdomain_l2 )
465 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
466 if ( d->arch.mm_perdomain_l3 )
467 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
468 #endif
469 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
470 return rc;
471 }
473 void arch_domain_destroy(struct domain *d)
474 {
475 if ( is_hvm_domain(d) )
476 hvm_domain_destroy(d);
478 pci_release_devices(d);
479 if ( !is_idle_domain(d) )
480 iommu_domain_destroy(d);
482 paging_final_teardown(d);
484 free_xenheap_pages(
485 d->arch.mm_perdomain_pt,
486 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
488 #ifdef __x86_64__
489 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
490 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
491 #endif
493 free_xenheap_page(d->shared_info);
494 }
496 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
497 {
498 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
500 hv_cr4_mask = ~X86_CR4_TSD;
501 if ( cpu_has_de )
502 hv_cr4_mask &= ~X86_CR4_DE;
504 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
505 gdprintk(XENLOG_WARNING,
506 "Attempt to change CR4 flags %08lx -> %08lx\n",
507 hv_cr4, guest_cr4);
509 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
510 }
512 /* This is called by arch_final_setup_guest and do_boot_vcpu */
513 int arch_set_info_guest(
514 struct vcpu *v, vcpu_guest_context_u c)
515 {
516 struct domain *d = v->domain;
517 unsigned long cr3_pfn = INVALID_MFN;
518 unsigned long flags, cr4;
519 int i, rc = 0, compat;
521 /* The context is a compat-mode one if the target domain is compat-mode;
522 * we expect the tools to DTRT even in compat-mode callers. */
523 compat = is_pv_32on64_domain(d);
525 #ifdef CONFIG_COMPAT
526 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
527 #else
528 #define c(fld) (c.nat->fld)
529 #endif
530 flags = c(flags);
532 if ( !is_hvm_vcpu(v) )
533 {
534 if ( !compat )
535 {
536 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
537 fixup_guest_stack_selector(d, c.nat->kernel_ss);
538 fixup_guest_code_selector(d, c.nat->user_regs.cs);
539 #ifdef __i386__
540 fixup_guest_code_selector(d, c.nat->event_callback_cs);
541 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
542 #endif
544 for ( i = 0; i < 256; i++ )
545 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
547 /* LDT safety checks. */
548 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
549 (c.nat->ldt_ents > 8192) ||
550 !array_access_ok(c.nat->ldt_base,
551 c.nat->ldt_ents,
552 LDT_ENTRY_SIZE) )
553 return -EINVAL;
554 }
555 #ifdef CONFIG_COMPAT
556 else
557 {
558 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
559 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
560 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
561 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
562 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
564 for ( i = 0; i < 256; i++ )
565 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
567 /* LDT safety checks. */
568 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
569 (c.cmp->ldt_ents > 8192) ||
570 !compat_array_access_ok(c.cmp->ldt_base,
571 c.cmp->ldt_ents,
572 LDT_ENTRY_SIZE) )
573 return -EINVAL;
574 }
575 #endif
576 }
578 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
580 v->arch.flags &= ~TF_kernel_mode;
581 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
582 v->arch.flags |= TF_kernel_mode;
584 if ( !compat )
585 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
586 #ifdef CONFIG_COMPAT
587 else
588 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
589 #endif
591 v->arch.guest_context.user_regs.eflags |= 2;
593 if ( is_hvm_vcpu(v) )
594 goto out;
596 /* Only CR0.TS is modifiable by guest or admin. */
597 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
598 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
600 init_int80_direct_trap(v);
602 /* IOPL privileges are virtualised. */
603 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
604 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
606 /* Ensure real hardware interrupts are enabled. */
607 v->arch.guest_context.user_regs.eflags |= EF_IE;
609 cr4 = v->arch.guest_context.ctrlreg[4];
610 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
611 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
613 memset(v->arch.guest_context.debugreg, 0,
614 sizeof(v->arch.guest_context.debugreg));
615 for ( i = 0; i < 8; i++ )
616 (void)set_debugreg(v, i, c(debugreg[i]));
618 if ( v->is_initialised )
619 goto out;
621 if ( v->vcpu_id == 0 )
622 d->vm_assist = c(vm_assist);
624 if ( !compat )
625 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
626 #ifdef CONFIG_COMPAT
627 else
628 {
629 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
630 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
632 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
633 return -EINVAL;
634 for ( i = 0; i < n; ++i )
635 gdt_frames[i] = c.cmp->gdt_frames[i];
636 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
637 }
638 #endif
639 if ( rc != 0 )
640 return rc;
642 if ( !compat )
643 {
644 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
646 if ( !mfn_valid(cr3_pfn) ||
647 (paging_mode_refcounts(d)
648 ? !get_page(mfn_to_page(cr3_pfn), d)
649 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
650 PGT_base_page_table)) )
651 {
652 destroy_gdt(v);
653 return -EINVAL;
654 }
656 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
658 #ifdef __x86_64__
659 if ( c.nat->ctrlreg[1] )
660 {
661 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
663 if ( !mfn_valid(cr3_pfn) ||
664 (paging_mode_refcounts(d)
665 ? !get_page(mfn_to_page(cr3_pfn), d)
666 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
667 PGT_base_page_table)) )
668 {
669 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
670 v->arch.guest_table = pagetable_null();
671 if ( paging_mode_refcounts(d) )
672 put_page(mfn_to_page(cr3_pfn));
673 else
674 put_page_and_type(mfn_to_page(cr3_pfn));
675 destroy_gdt(v);
676 return -EINVAL;
677 }
679 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
680 }
681 #endif
682 }
683 #ifdef CONFIG_COMPAT
684 else
685 {
686 l4_pgentry_t *l4tab;
688 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
690 if ( !mfn_valid(cr3_pfn) ||
691 (paging_mode_refcounts(d)
692 ? !get_page(mfn_to_page(cr3_pfn), d)
693 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
694 PGT_l3_page_table)) )
695 {
696 destroy_gdt(v);
697 return -EINVAL;
698 }
700 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
701 *l4tab = l4e_from_pfn(
702 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
703 }
704 #endif
706 if ( v->vcpu_id == 0 )
707 update_domain_wallclock_time(d);
709 /* Don't redo final setup */
710 v->is_initialised = 1;
712 if ( paging_mode_enabled(d) )
713 paging_update_paging_modes(v);
715 update_cr3(v);
717 out:
718 if ( flags & VGCF_online )
719 clear_bit(_VPF_down, &v->pause_flags);
720 else
721 set_bit(_VPF_down, &v->pause_flags);
722 return 0;
723 #undef c
724 }
726 void arch_vcpu_reset(struct vcpu *v)
727 {
728 if ( !is_hvm_vcpu(v) )
729 {
730 destroy_gdt(v);
731 vcpu_destroy_pagetables(v);
732 }
733 else
734 {
735 vcpu_end_shutdown_deferral(v);
736 }
737 }
739 /*
740 * Unmap the vcpu info page if the guest decided to place it somewhere
741 * else. This is only used from arch_domain_destroy, so there's no
742 * need to do anything clever.
743 */
744 static void
745 unmap_vcpu_info(struct vcpu *v)
746 {
747 struct domain *d = v->domain;
748 unsigned long mfn;
750 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
751 return;
753 mfn = v->arch.vcpu_info_mfn;
754 unmap_domain_page_global(v->vcpu_info);
756 v->vcpu_info = (void *)&shared_info(d, vcpu_info[v->vcpu_id]);
757 v->arch.vcpu_info_mfn = INVALID_MFN;
759 put_page_and_type(mfn_to_page(mfn));
760 }
762 /*
763 * Map a guest page in and point the vcpu_info pointer at it. This
764 * makes sure that the vcpu_info is always pointing at a valid piece
765 * of memory, and it sets a pending event to make sure that a pending
766 * event doesn't get missed.
767 */
768 static int
769 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
770 {
771 struct domain *d = v->domain;
772 void *mapping;
773 vcpu_info_t *new_info;
774 int i;
776 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
777 return -EINVAL;
779 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
780 return -EINVAL;
782 /* Run this command on yourself or on other offline VCPUS. */
783 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
784 return -EINVAL;
786 mfn = gmfn_to_mfn(d, mfn);
787 if ( !mfn_valid(mfn) ||
788 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
789 return -EINVAL;
791 mapping = map_domain_page_global(mfn);
792 if ( mapping == NULL )
793 {
794 put_page_and_type(mfn_to_page(mfn));
795 return -ENOMEM;
796 }
798 new_info = (vcpu_info_t *)(mapping + offset);
800 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
802 v->vcpu_info = new_info;
803 v->arch.vcpu_info_mfn = mfn;
805 /* Set new vcpu_info pointer /before/ setting pending flags. */
806 wmb();
808 /*
809 * Mark everything as being pending just to make sure nothing gets
810 * lost. The domain will get a spurious event, but it can cope.
811 */
812 vcpu_info(v, evtchn_upcall_pending) = 1;
813 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
814 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
816 return 0;
817 }
819 long
820 arch_do_vcpu_op(
821 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
822 {
823 long rc = 0;
825 switch ( cmd )
826 {
827 case VCPUOP_register_runstate_memory_area:
828 {
829 struct vcpu_register_runstate_memory_area area;
830 struct vcpu_runstate_info runstate;
832 rc = -EFAULT;
833 if ( copy_from_guest(&area, arg, 1) )
834 break;
836 if ( !guest_handle_okay(area.addr.h, 1) )
837 break;
839 rc = 0;
840 runstate_guest(v) = area.addr.h;
842 if ( v == current )
843 {
844 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
845 }
846 else
847 {
848 vcpu_runstate_get(v, &runstate);
849 __copy_to_guest(runstate_guest(v), &runstate, 1);
850 }
852 break;
853 }
855 case VCPUOP_register_vcpu_info:
856 {
857 struct domain *d = v->domain;
858 struct vcpu_register_vcpu_info info;
860 rc = -EFAULT;
861 if ( copy_from_guest(&info, arg, 1) )
862 break;
864 domain_lock(d);
865 rc = map_vcpu_info(v, info.mfn, info.offset);
866 domain_unlock(d);
868 break;
869 }
871 case VCPUOP_get_physid:
872 {
873 struct vcpu_get_physid cpu_id;
875 rc = -EINVAL;
876 if ( !v->domain->is_pinned )
877 break;
879 cpu_id.phys_id =
880 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
881 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
883 rc = -EFAULT;
884 if ( copy_to_guest(arg, &cpu_id, 1) )
885 break;
887 rc = 0;
888 break;
889 }
891 default:
892 rc = -ENOSYS;
893 break;
894 }
896 return rc;
897 }
899 #ifdef __x86_64__
901 #define loadsegment(seg,value) ({ \
902 int __r = 1; \
903 asm volatile ( \
904 "1: movl %k1,%%" #seg "\n2:\n" \
905 ".section .fixup,\"ax\"\n" \
906 "3: xorl %k0,%k0\n" \
907 " movl %k0,%%" #seg "\n" \
908 " jmp 2b\n" \
909 ".previous\n" \
910 ".section __ex_table,\"a\"\n" \
911 " .align 8\n" \
912 " .quad 1b,3b\n" \
913 ".previous" \
914 : "=r" (__r) : "r" (value), "0" (__r) );\
915 __r; })
917 /*
918 * save_segments() writes a mask of segments which are dirty (non-zero),
919 * allowing load_segments() to avoid some expensive segment loads and
920 * MSR writes.
921 */
922 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
923 #define DIRTY_DS 0x01
924 #define DIRTY_ES 0x02
925 #define DIRTY_FS 0x04
926 #define DIRTY_GS 0x08
927 #define DIRTY_FS_BASE 0x10
928 #define DIRTY_GS_BASE_USER 0x20
930 static void load_segments(struct vcpu *n)
931 {
932 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
933 int all_segs_okay = 1;
934 unsigned int dirty_segment_mask, cpu = smp_processor_id();
936 /* Load and clear the dirty segment mask. */
937 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
938 per_cpu(dirty_segment_mask, cpu) = 0;
940 /* Either selector != 0 ==> reload. */
941 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
942 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
944 /* Either selector != 0 ==> reload. */
945 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
946 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
948 /*
949 * Either selector != 0 ==> reload.
950 * Also reload to reset FS_BASE if it was non-zero.
951 */
952 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
953 nctxt->user_regs.fs) )
954 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
956 /*
957 * Either selector != 0 ==> reload.
958 * Also reload to reset GS_BASE if it was non-zero.
959 */
960 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
961 nctxt->user_regs.gs) )
962 {
963 /* Reset GS_BASE with user %gs? */
964 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
965 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
966 }
968 if ( !is_pv_32on64_domain(n->domain) )
969 {
970 /* This can only be non-zero if selector is NULL. */
971 if ( nctxt->fs_base )
972 wrmsr(MSR_FS_BASE,
973 nctxt->fs_base,
974 nctxt->fs_base>>32);
976 /* Most kernels have non-zero GS base, so don't bother testing. */
977 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
978 wrmsr(MSR_SHADOW_GS_BASE,
979 nctxt->gs_base_kernel,
980 nctxt->gs_base_kernel>>32);
982 /* This can only be non-zero if selector is NULL. */
983 if ( nctxt->gs_base_user )
984 wrmsr(MSR_GS_BASE,
985 nctxt->gs_base_user,
986 nctxt->gs_base_user>>32);
988 /* If in kernel mode then switch the GS bases around. */
989 if ( (n->arch.flags & TF_kernel_mode) )
990 asm volatile ( "swapgs" );
991 }
993 if ( unlikely(!all_segs_okay) )
994 {
995 struct cpu_user_regs *regs = guest_cpu_user_regs();
996 unsigned long *rsp =
997 (n->arch.flags & TF_kernel_mode) ?
998 (unsigned long *)regs->rsp :
999 (unsigned long *)nctxt->kernel_sp;
1000 unsigned long cs_and_mask, rflags;
1002 if ( is_pv_32on64_domain(n->domain) )
1003 {
1004 unsigned int *esp = ring_1(regs) ?
1005 (unsigned int *)regs->rsp :
1006 (unsigned int *)nctxt->kernel_sp;
1007 unsigned int cs_and_mask, eflags;
1008 int ret = 0;
1010 /* CS longword also contains full evtchn_upcall_mask. */
1011 cs_and_mask = (unsigned short)regs->cs |
1012 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1013 /* Fold upcall mask into RFLAGS.IF. */
1014 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1015 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1017 if ( !ring_1(regs) )
1018 {
1019 ret = put_user(regs->ss, esp-1);
1020 ret |= put_user(regs->_esp, esp-2);
1021 esp -= 2;
1022 }
1024 if ( ret |
1025 put_user(eflags, esp-1) |
1026 put_user(cs_and_mask, esp-2) |
1027 put_user(regs->_eip, esp-3) |
1028 put_user(nctxt->user_regs.gs, esp-4) |
1029 put_user(nctxt->user_regs.fs, esp-5) |
1030 put_user(nctxt->user_regs.es, esp-6) |
1031 put_user(nctxt->user_regs.ds, esp-7) )
1032 {
1033 gdprintk(XENLOG_ERR, "Error while creating compat "
1034 "failsafe callback frame.\n");
1035 domain_crash(n->domain);
1036 }
1038 if ( test_bit(_VGCF_failsafe_disables_events,
1039 &n->arch.guest_context.flags) )
1040 vcpu_info(n, evtchn_upcall_mask) = 1;
1042 regs->entry_vector = TRAP_syscall;
1043 regs->_eflags &= 0xFFFCBEFFUL;
1044 regs->ss = FLAT_COMPAT_KERNEL_SS;
1045 regs->_esp = (unsigned long)(esp-7);
1046 regs->cs = FLAT_COMPAT_KERNEL_CS;
1047 regs->_eip = nctxt->failsafe_callback_eip;
1048 return;
1049 }
1051 if ( !(n->arch.flags & TF_kernel_mode) )
1052 toggle_guest_mode(n);
1053 else
1054 regs->cs &= ~3;
1056 /* CS longword also contains full evtchn_upcall_mask. */
1057 cs_and_mask = (unsigned long)regs->cs |
1058 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1060 /* Fold upcall mask into RFLAGS.IF. */
1061 rflags = regs->rflags & ~X86_EFLAGS_IF;
1062 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1064 if ( put_user(regs->ss, rsp- 1) |
1065 put_user(regs->rsp, rsp- 2) |
1066 put_user(rflags, rsp- 3) |
1067 put_user(cs_and_mask, rsp- 4) |
1068 put_user(regs->rip, rsp- 5) |
1069 put_user(nctxt->user_regs.gs, rsp- 6) |
1070 put_user(nctxt->user_regs.fs, rsp- 7) |
1071 put_user(nctxt->user_regs.es, rsp- 8) |
1072 put_user(nctxt->user_regs.ds, rsp- 9) |
1073 put_user(regs->r11, rsp-10) |
1074 put_user(regs->rcx, rsp-11) )
1075 {
1076 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1077 "callback frame.\n");
1078 domain_crash(n->domain);
1079 }
1081 if ( test_bit(_VGCF_failsafe_disables_events,
1082 &n->arch.guest_context.flags) )
1083 vcpu_info(n, evtchn_upcall_mask) = 1;
1085 regs->entry_vector = TRAP_syscall;
1086 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1087 X86_EFLAGS_NT|X86_EFLAGS_TF);
1088 regs->ss = FLAT_KERNEL_SS;
1089 regs->rsp = (unsigned long)(rsp-11);
1090 regs->cs = FLAT_KERNEL_CS;
1091 regs->rip = nctxt->failsafe_callback_eip;
1092 }
1093 }
1095 static void save_segments(struct vcpu *v)
1096 {
1097 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1098 struct cpu_user_regs *regs = &ctxt->user_regs;
1099 unsigned int dirty_segment_mask = 0;
1101 regs->ds = read_segment_register(ds);
1102 regs->es = read_segment_register(es);
1103 regs->fs = read_segment_register(fs);
1104 regs->gs = read_segment_register(gs);
1106 if ( regs->ds )
1107 dirty_segment_mask |= DIRTY_DS;
1109 if ( regs->es )
1110 dirty_segment_mask |= DIRTY_ES;
1112 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1113 {
1114 dirty_segment_mask |= DIRTY_FS;
1115 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1116 }
1117 else if ( ctxt->fs_base )
1118 {
1119 dirty_segment_mask |= DIRTY_FS_BASE;
1120 }
1122 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1123 {
1124 dirty_segment_mask |= DIRTY_GS;
1125 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1126 }
1127 else if ( ctxt->gs_base_user )
1128 {
1129 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1130 }
1132 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1133 }
1135 #define switch_kernel_stack(v) ((void)0)
1137 #elif defined(__i386__)
1139 #define load_segments(n) ((void)0)
1140 #define save_segments(p) ((void)0)
1142 static inline void switch_kernel_stack(struct vcpu *v)
1143 {
1144 struct tss_struct *tss = &init_tss[smp_processor_id()];
1145 tss->esp1 = v->arch.guest_context.kernel_sp;
1146 tss->ss1 = v->arch.guest_context.kernel_ss;
1147 }
1149 #endif /* __i386__ */
1151 static void paravirt_ctxt_switch_from(struct vcpu *v)
1152 {
1153 save_segments(v);
1155 /*
1156 * Disable debug breakpoints. We do this aggressively because if we switch
1157 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1158 * inside Xen, before we get a chance to reload DR7, and this cannot always
1159 * safely be handled.
1160 */
1161 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1162 write_debugreg(7, 0);
1163 }
1165 static void paravirt_ctxt_switch_to(struct vcpu *v)
1166 {
1167 unsigned long cr4;
1169 set_int80_direct_trap(v);
1170 switch_kernel_stack(v);
1172 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1173 if ( unlikely(cr4 != read_cr4()) )
1174 write_cr4(cr4);
1176 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1177 {
1178 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1179 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1180 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1181 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1182 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1183 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1184 }
1185 }
1187 static void __context_switch(void)
1188 {
1189 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1190 unsigned int cpu = smp_processor_id();
1191 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1192 struct vcpu *n = current;
1194 ASSERT(p != n);
1195 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1197 if ( !is_idle_vcpu(p) )
1198 {
1199 memcpy(&p->arch.guest_context.user_regs,
1200 stack_regs,
1201 CTXT_SWITCH_STACK_BYTES);
1202 unlazy_fpu(p);
1203 p->arch.ctxt_switch_from(p);
1204 }
1206 if ( !is_idle_vcpu(n) )
1207 {
1208 memcpy(stack_regs,
1209 &n->arch.guest_context.user_regs,
1210 CTXT_SWITCH_STACK_BYTES);
1211 n->arch.ctxt_switch_to(n);
1212 }
1214 if ( p->domain != n->domain )
1215 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1216 cpu_set(cpu, n->vcpu_dirty_cpumask);
1218 write_ptbase(n);
1220 if ( p->vcpu_id != n->vcpu_id )
1221 {
1222 char gdt_load[10];
1223 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1224 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1225 asm volatile ( "lgdt %0" : "=m" (gdt_load) );
1226 }
1228 if ( p->domain != n->domain )
1229 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1230 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1232 per_cpu(curr_vcpu, cpu) = n;
1233 }
1236 void context_switch(struct vcpu *prev, struct vcpu *next)
1237 {
1238 unsigned int cpu = smp_processor_id();
1239 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1241 ASSERT(local_irq_is_enabled());
1243 /* Allow at most one CPU at a time to be dirty. */
1244 ASSERT(cpus_weight(dirty_mask) <= 1);
1245 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1246 {
1247 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1248 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1249 flush_tlb_mask(next->vcpu_dirty_cpumask);
1250 }
1252 local_irq_disable();
1254 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1255 pt_save_timer(prev);
1257 set_current(next);
1259 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1260 {
1261 local_irq_enable();
1262 }
1263 else
1264 {
1265 __context_switch();
1267 #ifdef CONFIG_COMPAT
1268 if ( !is_hvm_vcpu(next) &&
1269 (is_idle_vcpu(prev) ||
1270 is_hvm_vcpu(prev) ||
1271 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1272 {
1273 uint64_t efer = read_efer();
1274 if ( !(efer & EFER_SCE) )
1275 write_efer(efer | EFER_SCE);
1276 flush_tlb_one_local(GDT_VIRT_START(next) +
1277 FIRST_RESERVED_GDT_BYTE);
1278 }
1279 #endif
1281 /* Re-enable interrupts before restoring state which may fault. */
1282 local_irq_enable();
1284 if ( !is_hvm_vcpu(next) )
1285 {
1286 load_LDT(next);
1287 load_segments(next);
1288 }
1289 }
1291 context_saved(prev);
1293 /* Update per-VCPU guest runstate shared memory area (if registered). */
1294 if ( !guest_handle_is_null(runstate_guest(next)) )
1295 {
1296 if ( !is_pv_32on64_domain(next->domain) )
1297 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1298 #ifdef CONFIG_COMPAT
1299 else
1300 {
1301 struct compat_vcpu_runstate_info info;
1303 XLAT_vcpu_runstate_info(&info, &next->runstate);
1304 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1305 }
1306 #endif
1307 }
1309 schedule_tail(next);
1310 BUG();
1311 }
1313 void continue_running(struct vcpu *same)
1314 {
1315 schedule_tail(same);
1316 BUG();
1317 }
1319 int __sync_lazy_execstate(void)
1320 {
1321 unsigned long flags;
1322 int switch_required;
1324 local_irq_save(flags);
1326 switch_required = (this_cpu(curr_vcpu) != current);
1328 if ( switch_required )
1329 {
1330 ASSERT(current == idle_vcpu[smp_processor_id()]);
1331 __context_switch();
1332 }
1334 local_irq_restore(flags);
1336 return switch_required;
1337 }
1339 void sync_vcpu_execstate(struct vcpu *v)
1340 {
1341 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1342 (void)__sync_lazy_execstate();
1344 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1345 flush_tlb_mask(v->vcpu_dirty_cpumask);
1346 }
1348 struct migrate_info {
1349 long (*func)(void *data);
1350 void *data;
1351 void (*saved_schedule_tail)(struct vcpu *);
1352 cpumask_t saved_affinity;
1353 };
1355 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1356 {
1357 struct cpu_user_regs *regs = guest_cpu_user_regs();
1358 struct migrate_info *info = v->arch.continue_info;
1359 cpumask_t mask = info->saved_affinity;
1361 regs->eax = info->func(info->data);
1363 v->arch.schedule_tail = info->saved_schedule_tail;
1364 v->arch.continue_info = NULL;
1366 xfree(info);
1368 vcpu_unlock_affinity(v, &mask);
1369 schedule_tail(v);
1370 }
1372 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1373 {
1374 struct vcpu *v = current;
1375 struct migrate_info *info;
1376 int rc;
1378 if ( cpu == smp_processor_id() )
1379 return func(data);
1381 info = xmalloc(struct migrate_info);
1382 if ( info == NULL )
1383 return -ENOMEM;
1385 info->func = func;
1386 info->data = data;
1387 info->saved_schedule_tail = v->arch.schedule_tail;
1388 info->saved_affinity = cpumask_of_cpu(cpu);
1390 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1391 v->arch.continue_info = info;
1393 rc = vcpu_lock_affinity(v, &info->saved_affinity);
1394 if ( rc )
1395 {
1396 v->arch.schedule_tail = info->saved_schedule_tail;
1397 v->arch.continue_info = NULL;
1398 xfree(info);
1399 return rc;
1400 }
1402 /* Dummy return value will be overwritten by new schedule_tail. */
1403 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1404 return 0;
1405 }
1407 #define next_arg(fmt, args) ({ \
1408 unsigned long __arg; \
1409 switch ( *(fmt)++ ) \
1410 { \
1411 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1412 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1413 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1414 default: __arg = 0; BUG(); \
1415 } \
1416 __arg; \
1417 })
1419 DEFINE_PER_CPU(char, hc_preempted);
1421 unsigned long hypercall_create_continuation(
1422 unsigned int op, const char *format, ...)
1423 {
1424 struct mc_state *mcs = &this_cpu(mc_state);
1425 struct cpu_user_regs *regs;
1426 const char *p = format;
1427 unsigned long arg;
1428 unsigned int i;
1429 va_list args;
1431 va_start(args, format);
1433 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1434 {
1435 __set_bit(_MCSF_call_preempted, &mcs->flags);
1437 for ( i = 0; *p != '\0'; i++ )
1438 mcs->call.args[i] = next_arg(p, args);
1439 if ( is_pv_32on64_domain(current->domain) )
1440 {
1441 for ( ; i < 6; i++ )
1442 mcs->call.args[i] = 0;
1443 }
1444 }
1445 else
1446 {
1447 regs = guest_cpu_user_regs();
1448 regs->eax = op;
1449 /*
1450 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1451 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1452 */
1453 if ( !is_hvm_vcpu(current) )
1454 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1456 #ifdef __x86_64__
1457 if ( !is_hvm_vcpu(current) ?
1458 !is_pv_32on64_vcpu(current) :
1459 (hvm_guest_x86_mode(current) == 8) )
1460 {
1461 for ( i = 0; *p != '\0'; i++ )
1462 {
1463 arg = next_arg(p, args);
1464 switch ( i )
1465 {
1466 case 0: regs->rdi = arg; break;
1467 case 1: regs->rsi = arg; break;
1468 case 2: regs->rdx = arg; break;
1469 case 3: regs->r10 = arg; break;
1470 case 4: regs->r8 = arg; break;
1471 case 5: regs->r9 = arg; break;
1472 }
1473 }
1474 }
1475 else
1476 #endif
1477 {
1478 if ( supervisor_mode_kernel )
1479 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1481 for ( i = 0; *p != '\0'; i++ )
1482 {
1483 arg = next_arg(p, args);
1484 switch ( i )
1485 {
1486 case 0: regs->ebx = arg; break;
1487 case 1: regs->ecx = arg; break;
1488 case 2: regs->edx = arg; break;
1489 case 3: regs->esi = arg; break;
1490 case 4: regs->edi = arg; break;
1491 case 5: regs->ebp = arg; break;
1492 }
1493 }
1494 }
1496 this_cpu(hc_preempted) = 1;
1497 }
1499 va_end(args);
1501 return op;
1502 }
1504 #ifdef CONFIG_COMPAT
1505 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1506 {
1507 int rc = 0;
1508 struct mc_state *mcs = &this_cpu(mc_state);
1509 struct cpu_user_regs *regs;
1510 unsigned int i, cval = 0;
1511 unsigned long nval = 0;
1512 va_list args;
1514 BUG_ON(*id > 5);
1515 BUG_ON(mask & (1U << *id));
1517 va_start(args, mask);
1519 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1520 {
1521 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1522 return 0;
1523 for ( i = 0; i < 6; ++i, mask >>= 1 )
1524 {
1525 if ( mask & 1 )
1526 {
1527 nval = va_arg(args, unsigned long);
1528 cval = va_arg(args, unsigned int);
1529 if ( cval == nval )
1530 mask &= ~1U;
1531 else
1532 BUG_ON(nval == (unsigned int)nval);
1533 }
1534 else if ( id && *id == i )
1535 {
1536 *id = mcs->call.args[i];
1537 id = NULL;
1538 }
1539 if ( (mask & 1) && mcs->call.args[i] == nval )
1540 {
1541 mcs->call.args[i] = cval;
1542 ++rc;
1543 }
1544 else
1545 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1546 }
1547 }
1548 else
1549 {
1550 regs = guest_cpu_user_regs();
1551 for ( i = 0; i < 6; ++i, mask >>= 1 )
1552 {
1553 unsigned long *reg;
1555 switch ( i )
1556 {
1557 case 0: reg = &regs->ebx; break;
1558 case 1: reg = &regs->ecx; break;
1559 case 2: reg = &regs->edx; break;
1560 case 3: reg = &regs->esi; break;
1561 case 4: reg = &regs->edi; break;
1562 case 5: reg = &regs->ebp; break;
1563 default: BUG(); reg = NULL; break;
1564 }
1565 if ( (mask & 1) )
1566 {
1567 nval = va_arg(args, unsigned long);
1568 cval = va_arg(args, unsigned int);
1569 if ( cval == nval )
1570 mask &= ~1U;
1571 else
1572 BUG_ON(nval == (unsigned int)nval);
1573 }
1574 else if ( id && *id == i )
1575 {
1576 *id = *reg;
1577 id = NULL;
1578 }
1579 if ( (mask & 1) && *reg == nval )
1580 {
1581 *reg = cval;
1582 ++rc;
1583 }
1584 else
1585 BUG_ON(*reg != (unsigned int)*reg);
1586 }
1587 }
1589 va_end(args);
1591 return rc;
1592 }
1593 #endif
1595 static int relinquish_memory(
1596 struct domain *d, struct list_head *list, unsigned long type)
1597 {
1598 struct list_head *ent;
1599 struct page_info *page;
1600 unsigned long x, y;
1601 int ret = 0;
1603 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1604 spin_lock_recursive(&d->page_alloc_lock);
1606 ent = list->next;
1607 while ( ent != list )
1608 {
1609 page = list_entry(ent, struct page_info, list);
1611 /* Grab a reference to the page so it won't disappear from under us. */
1612 if ( unlikely(!get_page(page, d)) )
1613 {
1614 /* Couldn't get a reference -- someone is freeing this page. */
1615 ent = ent->next;
1616 list_move_tail(&page->list, &d->arch.relmem_list);
1617 continue;
1618 }
1620 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1621 put_page_and_type(page);
1623 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1624 put_page(page);
1626 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1627 /*
1628 * Forcibly drop reference counts of page tables above top most (which
1629 * were skipped to prevent long latencies due to deep recursion - see
1630 * the special treatment in free_lX_table()).
1631 */
1632 y = page->u.inuse.type_info;
1633 if ( (type < PGT_root_page_table) &&
1634 unlikely(((y + PGT_type_mask) &
1635 (PGT_type_mask|PGT_validated)) == type) )
1636 {
1637 BUG_ON((y & PGT_count_mask) >=
1638 (page->count_info & PGC_count_mask));
1639 while ( y & PGT_count_mask )
1640 {
1641 put_page_and_type(page);
1642 y = page->u.inuse.type_info;
1643 }
1644 }
1645 #endif
1647 /*
1648 * Forcibly invalidate top-most, still valid page tables at this point
1649 * to break circular 'linear page table' references as well as clean up
1650 * partially validated pages. This is okay because MMU structures are
1651 * not shared across domains and this domain is now dead. Thus top-most
1652 * valid tables are not in use so a non-zero count means circular
1653 * reference or partially validated.
1654 */
1655 y = page->u.inuse.type_info;
1656 for ( ; ; )
1657 {
1658 x = y;
1659 if ( likely((x & PGT_type_mask) != type) ||
1660 likely(!(x & (PGT_validated|PGT_partial))) )
1661 break;
1663 y = cmpxchg(&page->u.inuse.type_info, x,
1664 x & ~(PGT_validated|PGT_partial));
1665 if ( likely(y == x) )
1666 {
1667 if ( free_page_type(page, x, 0) != 0 )
1668 BUG();
1669 break;
1670 }
1671 }
1673 /* Follow the list chain and /then/ potentially free the page. */
1674 ent = ent->next;
1675 list_move_tail(&page->list, &d->arch.relmem_list);
1676 put_page(page);
1678 if ( hypercall_preempt_check() )
1679 {
1680 ret = -EAGAIN;
1681 goto out;
1682 }
1683 }
1685 list_splice_init(&d->arch.relmem_list, list);
1687 out:
1688 spin_unlock_recursive(&d->page_alloc_lock);
1689 return ret;
1690 }
1692 static void vcpu_destroy_pagetables(struct vcpu *v)
1693 {
1694 struct domain *d = v->domain;
1695 unsigned long pfn;
1697 #ifdef __x86_64__
1698 if ( is_pv_32on64_vcpu(v) )
1699 {
1700 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1701 __va(pagetable_get_paddr(v->arch.guest_table)));
1703 if ( pfn != 0 )
1704 {
1705 if ( paging_mode_refcounts(d) )
1706 put_page(mfn_to_page(pfn));
1707 else
1708 put_page_and_type(mfn_to_page(pfn));
1709 }
1711 l4e_write(
1712 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1713 l4e_empty());
1715 v->arch.cr3 = 0;
1716 return;
1717 }
1718 #endif
1720 pfn = pagetable_get_pfn(v->arch.guest_table);
1721 if ( pfn != 0 )
1722 {
1723 if ( paging_mode_refcounts(d) )
1724 put_page(mfn_to_page(pfn));
1725 else
1726 put_page_and_type(mfn_to_page(pfn));
1727 v->arch.guest_table = pagetable_null();
1728 }
1730 #ifdef __x86_64__
1731 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1732 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1733 if ( pfn != 0 )
1734 {
1735 if ( !is_pv_32bit_vcpu(v) )
1736 {
1737 if ( paging_mode_refcounts(d) )
1738 put_page(mfn_to_page(pfn));
1739 else
1740 put_page_and_type(mfn_to_page(pfn));
1741 }
1742 v->arch.guest_table_user = pagetable_null();
1743 }
1744 #endif
1746 v->arch.cr3 = 0;
1747 }
1749 int domain_relinquish_resources(struct domain *d)
1750 {
1751 int ret;
1752 struct vcpu *v;
1754 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1756 switch ( d->arch.relmem )
1757 {
1758 case RELMEM_not_started:
1759 /* Tear down paging-assistance stuff. */
1760 paging_teardown(d);
1762 for_each_vcpu ( d, v )
1763 {
1764 /* Drop the in-use references to page-table bases. */
1765 vcpu_destroy_pagetables(v);
1767 /*
1768 * Relinquish GDT mappings. No need for explicit unmapping of the
1769 * LDT as it automatically gets squashed with the guest mappings.
1770 */
1771 destroy_gdt(v);
1773 unmap_vcpu_info(v);
1774 }
1776 d->arch.relmem = RELMEM_xen;
1777 /* fallthrough */
1779 /* Relinquish every page of memory. */
1780 case RELMEM_xen:
1781 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1782 if ( ret )
1783 return ret;
1784 #if CONFIG_PAGING_LEVELS >= 4
1785 d->arch.relmem = RELMEM_l4;
1786 /* fallthrough */
1788 case RELMEM_l4:
1789 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1790 if ( ret )
1791 return ret;
1792 #endif
1793 #if CONFIG_PAGING_LEVELS >= 3
1794 d->arch.relmem = RELMEM_l3;
1795 /* fallthrough */
1797 case RELMEM_l3:
1798 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1799 if ( ret )
1800 return ret;
1801 #endif
1802 d->arch.relmem = RELMEM_l2;
1803 /* fallthrough */
1805 case RELMEM_l2:
1806 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1807 if ( ret )
1808 return ret;
1809 d->arch.relmem = RELMEM_done;
1810 /* fallthrough */
1812 case RELMEM_done:
1813 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1814 ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
1815 if ( ret )
1816 return ret;
1817 #endif
1818 break;
1820 default:
1821 BUG();
1822 }
1824 /* Free page used by xen oprofile buffer. */
1825 free_xenoprof_pages(d);
1827 if ( is_hvm_domain(d) )
1828 hvm_domain_relinquish_resources(d);
1830 return 0;
1831 }
1833 void arch_dump_domain_info(struct domain *d)
1834 {
1835 paging_dump_domain_info(d);
1836 }
1838 void arch_dump_vcpu_info(struct vcpu *v)
1839 {
1840 paging_dump_vcpu_info(v);
1841 }
1843 void domain_cpuid(
1844 struct domain *d,
1845 unsigned int input,
1846 unsigned int sub_input,
1847 unsigned int *eax,
1848 unsigned int *ebx,
1849 unsigned int *ecx,
1850 unsigned int *edx)
1851 {
1852 cpuid_input_t *cpuid;
1853 int i;
1855 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
1856 {
1857 cpuid = &d->arch.cpuids[i];
1859 if ( (cpuid->input[0] == input) &&
1860 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
1861 (cpuid->input[1] == sub_input)) )
1862 {
1863 *eax = cpuid->eax;
1864 *ebx = cpuid->ebx;
1865 *ecx = cpuid->ecx;
1866 *edx = cpuid->edx;
1867 return;
1868 }
1869 }
1871 *eax = *ebx = *ecx = *edx = 0;
1872 }
1874 /*
1875 * Local variables:
1876 * mode: C
1877 * c-set-style: "BSD"
1878 * c-basic-offset: 4
1879 * tab-width: 4
1880 * indent-tabs-mode: nil
1881 * End:
1882 */
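
As a usage note on the continuation machinery defined above: hypercall_create_continuation() re-encodes the hypercall number and arguments (via next_arg(), where 'i' is an unsigned int, 'l' an unsigned long and 'h' a guest handle) so that the guest re-issues the same hypercall when it next runs. A long-running handler elsewhere in Xen typically uses it roughly as in the sketch below; the handler name, work loop and hypercall number are hypothetical, only hypercall_preempt_check() and hypercall_create_continuation() are real:

    /* Hypothetical preemptible hypercall handler (illustration only). */
    static long some_long_running_op(unsigned long start,
                                     XEN_GUEST_HANDLE(void) arg)
    {
        while ( start < some_total_work )        /* assumed bound */
        {
            process_one_item(start++);           /* assumed helper */

            /* If Xen needs to preempt this operation, arrange for the guest
             * to re-enter the hypercall at 'start' and bail out now. */
            if ( hypercall_preempt_check() )
                return hypercall_create_continuation(
                    __HYPERVISOR_some_op, "lh", start, arg);
        }
        return 0;
    }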