ia64/xen-unstable: xen/arch/x86/domain.c @ 19108:839bece33989

x86: VCPU structure must reside below 4GB, since it contains embedded
PAE mode PDPTEs.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Wed Jan 28 17:05:18 2009 +0000 (2009-01-28)
Parents:  696351cde9a4
Children: 0858f961c77a
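
The change this revision records is the MEMF_bits(32) restriction passed to
the xenheap allocator when a vcpu structure is allocated, so that the PAE
PDPTEs embedded in it can legally be used as a shadow CR3. A minimal sketch
of the pattern, mirroring alloc_vcpu_struct() in the listing below:

    struct vcpu *v;

    /* Ask for memory below 4GB so the embedded PDPTEs are reachable. */
    v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
    if ( v != NULL )
        memset(v, 0, sizeof(*v));
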
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <asm/regs.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/system.h>
38 #include <asm/io.h>
39 #include <asm/processor.h>
40 #include <asm/desc.h>
41 #include <asm/i387.h>
42 #include <asm/mpspec.h>
43 #include <asm/ldt.h>
44 #include <asm/hypercall.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/support.h>
47 #include <asm/debugreg.h>
48 #include <asm/msr.h>
49 #include <asm/nmi.h>
50 #include <xen/numa.h>
51 #include <xen/iommu.h>
52 #ifdef CONFIG_COMPAT
53 #include <compat/vcpu.h>
54 #endif
56 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
57 DEFINE_PER_CPU(u64, efer);
58 DEFINE_PER_CPU(unsigned long, cr4);
60 static void default_idle(void);
61 void (*pm_idle) (void) = default_idle;
63 static void paravirt_ctxt_switch_from(struct vcpu *v);
64 static void paravirt_ctxt_switch_to(struct vcpu *v);
66 static void vcpu_destroy_pagetables(struct vcpu *v);
68 static void continue_idle_domain(struct vcpu *v)
69 {
70 reset_stack_and_jump(idle_loop);
71 }
73 static void continue_nonidle_domain(struct vcpu *v)
74 {
75 reset_stack_and_jump(ret_from_intr);
76 }
78 static void default_idle(void)
79 {
80 local_irq_disable();
81 if ( !softirq_pending(smp_processor_id()) )
82 safe_halt();
83 else
84 local_irq_enable();
85 }
87 static void play_dead(void)
88 {
89 /*
90 * Flush pending softirqs if any. They can be queued up before this CPU
91 * was taken out of cpu_online_map in __cpu_disable().
92 */
93 do_softirq();
95 /* This must be done before dead CPU ack */
96 cpu_exit_clear();
97 hvm_cpu_down();
98 wbinvd();
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
103 /* With physical CPU hotplug, we should halt the cpu. */
104 local_irq_disable();
105 for ( ; ; )
106 halt();
107 }
109 void idle_loop(void)
110 {
111 for ( ; ; )
112 {
113 if ( cpu_is_offline(smp_processor_id()) )
114 play_dead();
115 page_scrub_schedule_work();
116 (*pm_idle)();
117 do_softirq();
118 }
119 }
121 void startup_cpu_idle_loop(void)
122 {
123 struct vcpu *v = current;
125 ASSERT(is_idle_vcpu(v));
126 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
127 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
129 reset_stack_and_jump(idle_loop);
130 }
132 void dump_pageframe_info(struct domain *d)
133 {
134 struct page_info *page;
136 printk("Memory pages belonging to domain %u:\n", d->domain_id);
138 if ( d->tot_pages >= 10 )
139 {
140 printk(" DomPage list too long to display\n");
141 }
142 else
143 {
144 list_for_each_entry ( page, &d->page_list, list )
145 {
146 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
147 _p(page_to_mfn(page)),
148 page->count_info, page->u.inuse.type_info);
149 }
150 }
152 if ( is_hvm_domain(d) )
153 {
154 p2m_pod_dump_data(d);
155 }
157 list_for_each_entry ( page, &d->xenpage_list, list )
158 {
159 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
160 _p(page_to_mfn(page)),
161 page->count_info, page->u.inuse.type_info);
162 }
163 }
165 struct vcpu *alloc_vcpu_struct(void)
166 {
167 struct vcpu *v;
168 /*
169 * This structure contains embedded PAE PDPTEs, used when an HVM guest
170 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
171 * may require that the shadow CR3 points below 4GB, and hence the whole
172 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
173 */
174 v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
175 if ( v != NULL )
176 memset(v, 0, sizeof(*v));
177 return v;
178 }
180 void free_vcpu_struct(struct vcpu *v)
181 {
182 free_xenheap_pages(v, get_order_from_bytes(sizeof(*v)));
183 }
185 #ifdef CONFIG_COMPAT
187 static int setup_compat_l4(struct vcpu *v)
188 {
189 struct page_info *pg;
190 l4_pgentry_t *l4tab;
192 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
193 if ( pg == NULL )
194 return -ENOMEM;
196 /* This page needs to look like a pagetable so that it can be shadowed */
197 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
199 l4tab = page_to_virt(pg);
200 copy_page(l4tab, idle_pg_table);
201 l4tab[0] = l4e_empty();
202 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
203 l4e_from_page(pg, __PAGE_HYPERVISOR);
204 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
205 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
206 __PAGE_HYPERVISOR);
208 v->arch.guest_table = pagetable_from_page(pg);
209 v->arch.guest_table_user = v->arch.guest_table;
211 return 0;
212 }
214 static void release_compat_l4(struct vcpu *v)
215 {
216 free_domheap_page(pagetable_get_page(v->arch.guest_table));
217 v->arch.guest_table = pagetable_null();
218 v->arch.guest_table_user = pagetable_null();
219 }
221 static inline int may_switch_mode(struct domain *d)
222 {
223 return (!is_hvm_domain(d) && (d->tot_pages == 0));
224 }
226 int switch_native(struct domain *d)
227 {
228 unsigned int vcpuid;
230 if ( d == NULL )
231 return -EINVAL;
232 if ( !may_switch_mode(d) )
233 return -EACCES;
234 if ( !is_pv_32on64_domain(d) )
235 return 0;
237 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
239 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
240 {
241 if (d->vcpu[vcpuid])
242 release_compat_l4(d->vcpu[vcpuid]);
243 }
245 return 0;
246 }
248 int switch_compat(struct domain *d)
249 {
250 unsigned int vcpuid;
252 if ( d == NULL )
253 return -EINVAL;
254 if ( !may_switch_mode(d) )
255 return -EACCES;
256 if ( is_pv_32on64_domain(d) )
257 return 0;
259 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
261 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
262 {
263 if ( (d->vcpu[vcpuid] != NULL) &&
264 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
265 goto undo_and_fail;
266 }
268 domain_set_alloc_bitsize(d);
270 return 0;
272 undo_and_fail:
273 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
274 while ( vcpuid-- != 0 )
275 {
276 if ( d->vcpu[vcpuid] != NULL )
277 release_compat_l4(d->vcpu[vcpuid]);
278 }
279 return -ENOMEM;
280 }
282 #else
283 #define setup_compat_l4(v) 0
284 #define release_compat_l4(v) ((void)0)
285 #endif
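
/*
 * Note added for clarity (not part of the original changeset): in this tree
 * switch_compat()/switch_native() are reached from the toolstack's
 * address-size domctl while the domain is still empty, which is why
 * may_switch_mode() insists on tot_pages == 0 and why a failure can simply
 * unwind the per-vcpu compat L4 tables built so far.
 */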
287 int vcpu_initialise(struct vcpu *v)
288 {
289 struct domain *d = v->domain;
290 int rc;
292 v->arch.vcpu_info_mfn = INVALID_MFN;
294 v->arch.flags = TF_kernel_mode;
296 #if defined(__i386__)
297 mapcache_vcpu_init(v);
298 #endif
300 pae_l3_cache_init(&v->arch.pae_l3_cache);
302 paging_vcpu_init(v);
304 if ( is_hvm_domain(d) )
305 {
306 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
307 return rc;
308 }
309 else
310 {
311 /* PV guests by default have a 100Hz ticker. */
312 if ( !is_idle_domain(d) )
313 v->periodic_period = MILLISECS(10);
315 /* PV guests get an emulated PIT too for video BIOSes to use. */
316 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
317 pit_init(v, cpu_khz);
319 v->arch.schedule_tail = continue_nonidle_domain;
320 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
321 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
323 if ( is_idle_domain(d) )
324 {
325 v->arch.schedule_tail = continue_idle_domain;
326 v->arch.cr3 = __pa(idle_pg_table);
327 }
329 v->arch.guest_context.ctrlreg[4] =
330 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
331 }
333 v->arch.perdomain_ptes =
334 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
336 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
337 }
339 void vcpu_destroy(struct vcpu *v)
340 {
341 if ( is_pv_32on64_vcpu(v) )
342 release_compat_l4(v);
344 if ( is_hvm_vcpu(v) )
345 hvm_vcpu_destroy(v);
346 }
348 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
349 {
350 #ifdef __x86_64__
351 struct page_info *pg;
352 #endif
353 int i, pdpt_order, paging_initialised = 0;
354 int rc = -ENOMEM;
356 d->arch.hvm_domain.hap_enabled =
357 is_hvm_domain(d) &&
358 hvm_funcs.hap_supported &&
359 (domcr_flags & DOMCRF_hap);
361 INIT_LIST_HEAD(&d->arch.pdev_list);
363 d->arch.relmem = RELMEM_not_started;
364 INIT_LIST_HEAD(&d->arch.relmem_list);
366 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
367 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order, 0);
368 if ( d->arch.mm_perdomain_pt == NULL )
369 goto fail;
370 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
372 #if defined(__i386__)
374 mapcache_domain_init(d);
376 #else /* __x86_64__ */
378 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
379 if ( pg == NULL )
380 goto fail;
381 d->arch.mm_perdomain_l2 = page_to_virt(pg);
382 clear_page(d->arch.mm_perdomain_l2);
383 for ( i = 0; i < (1 << pdpt_order); i++ )
384 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
385 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
386 __PAGE_HYPERVISOR);
388 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
389 if ( pg == NULL )
390 goto fail;
391 d->arch.mm_perdomain_l3 = page_to_virt(pg);
392 clear_page(d->arch.mm_perdomain_l3);
393 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
394 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
395 __PAGE_HYPERVISOR);
397 #endif /* __x86_64__ */
399 #ifdef CONFIG_COMPAT
400 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
401 #endif
403 if ( (rc = paging_domain_init(d)) != 0 )
404 goto fail;
405 paging_initialised = 1;
407 if ( !is_idle_domain(d) )
408 {
409 d->arch.ioport_caps =
410 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
411 rc = -ENOMEM;
412 if ( d->arch.ioport_caps == NULL )
413 goto fail;
415 /*
416 * The shared_info machine address must fit in a 32-bit field within a
417 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
418 */
419 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
420 goto fail;
422 clear_page(d->shared_info);
423 share_xen_page_with_guest(
424 virt_to_page(d->shared_info), d, XENSHARE_writable);
426 if ( (rc = iommu_domain_init(d)) != 0 )
427 goto fail;
428 }
430 if ( is_hvm_domain(d) )
431 {
432 if ( (rc = hvm_domain_initialise(d)) != 0 )
433 {
434 iommu_domain_destroy(d);
435 goto fail;
436 }
437 }
438 else
439 {
440 /* 32-bit PV guest by default only if Xen is not 64-bit. */
441 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
442 (CONFIG_PAGING_LEVELS != 4);
443 }
445 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
446 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
447 {
448 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
449 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
450 }
452 return 0;
454 fail:
455 d->is_dying = DOMDYING_dead;
456 free_xenheap_page(d->shared_info);
457 if ( paging_initialised )
458 paging_final_teardown(d);
459 #ifdef __x86_64__
460 if ( d->arch.mm_perdomain_l2 )
461 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
462 if ( d->arch.mm_perdomain_l3 )
463 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
464 #endif
465 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
466 return rc;
467 }
469 void arch_domain_destroy(struct domain *d)
470 {
471 if ( is_hvm_domain(d) )
472 hvm_domain_destroy(d);
474 pci_release_devices(d);
475 free_domain_pirqs(d);
476 if ( !is_idle_domain(d) )
477 iommu_domain_destroy(d);
479 paging_final_teardown(d);
481 free_xenheap_pages(
482 d->arch.mm_perdomain_pt,
483 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
485 #ifdef __x86_64__
486 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
487 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
488 #endif
490 free_xenheap_page(d->shared_info);
491 }
493 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
494 {
495 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
497 hv_cr4_mask = ~X86_CR4_TSD;
498 if ( cpu_has_de )
499 hv_cr4_mask &= ~X86_CR4_DE;
501 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
502 gdprintk(XENLOG_WARNING,
503 "Attempt to change CR4 flags %08lx -> %08lx\n",
504 hv_cr4, guest_cr4);
506 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
507 }
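
/*
 * Worked example (added for illustration, not part of the original file):
 * with cpu_has_de set, the guest-controllable CR4 bits are only TSD and DE.
 * If a guest writes a CR4 value that differs from Xen's in, say, the PAE
 * bit, the warning above fires and the returned value keeps Xen's PAE
 * setting while honouring the guest's TSD/DE choice.
 */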
509 /* This is called by arch_final_setup_guest and do_boot_vcpu */
510 int arch_set_info_guest(
511 struct vcpu *v, vcpu_guest_context_u c)
512 {
513 struct domain *d = v->domain;
514 unsigned long cr3_pfn = INVALID_MFN;
515 unsigned long flags, cr4;
516 int i, rc = 0, compat;
518 /* The context is a compat-mode one if the target domain is compat-mode;
519 * we expect the tools to DTRT even in compat-mode callers. */
520 compat = is_pv_32on64_domain(d);
522 #ifdef CONFIG_COMPAT
523 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
524 #else
525 #define c(fld) (c.nat->fld)
526 #endif
527 flags = c(flags);
529 if ( !is_hvm_vcpu(v) )
530 {
531 if ( !compat )
532 {
533 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
534 fixup_guest_stack_selector(d, c.nat->kernel_ss);
535 fixup_guest_code_selector(d, c.nat->user_regs.cs);
536 #ifdef __i386__
537 fixup_guest_code_selector(d, c.nat->event_callback_cs);
538 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
539 #endif
541 for ( i = 0; i < 256; i++ )
542 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
544 /* LDT safety checks. */
545 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
546 (c.nat->ldt_ents > 8192) ||
547 !array_access_ok(c.nat->ldt_base,
548 c.nat->ldt_ents,
549 LDT_ENTRY_SIZE) )
550 return -EINVAL;
551 }
552 #ifdef CONFIG_COMPAT
553 else
554 {
555 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
556 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
557 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
558 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
559 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
561 for ( i = 0; i < 256; i++ )
562 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
564 /* LDT safety checks. */
565 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
566 (c.cmp->ldt_ents > 8192) ||
567 !compat_array_access_ok(c.cmp->ldt_base,
568 c.cmp->ldt_ents,
569 LDT_ENTRY_SIZE) )
570 return -EINVAL;
571 }
572 #endif
573 }
575 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
577 v->arch.flags &= ~TF_kernel_mode;
578 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
579 v->arch.flags |= TF_kernel_mode;
581 if ( !compat )
582 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
583 #ifdef CONFIG_COMPAT
584 else
585 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
586 #endif
588 v->arch.guest_context.user_regs.eflags |= 2;
590 if ( is_hvm_vcpu(v) )
591 {
592 hvm_set_info_guest(v);
593 goto out;
594 }
596 /* Only CR0.TS is modifiable by guest or admin. */
597 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
598 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
600 init_int80_direct_trap(v);
602 /* IOPL privileges are virtualised. */
603 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
604 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
606 /* Ensure real hardware interrupts are enabled. */
607 v->arch.guest_context.user_regs.eflags |= EF_IE;
609 cr4 = v->arch.guest_context.ctrlreg[4];
610 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
611 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
613 memset(v->arch.guest_context.debugreg, 0,
614 sizeof(v->arch.guest_context.debugreg));
615 for ( i = 0; i < 8; i++ )
616 (void)set_debugreg(v, i, c(debugreg[i]));
618 if ( v->is_initialised )
619 goto out;
621 if ( v->vcpu_id == 0 )
622 d->vm_assist = c(vm_assist);
624 if ( !compat )
625 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
626 #ifdef CONFIG_COMPAT
627 else
628 {
629 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
630 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
632 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
633 return -EINVAL;
634 for ( i = 0; i < n; ++i )
635 gdt_frames[i] = c.cmp->gdt_frames[i];
636 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
637 }
638 #endif
639 if ( rc != 0 )
640 return rc;
642 if ( !compat )
643 {
644 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
646 if ( !mfn_valid(cr3_pfn) ||
647 (paging_mode_refcounts(d)
648 ? !get_page(mfn_to_page(cr3_pfn), d)
649 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
650 PGT_base_page_table)) )
651 {
652 destroy_gdt(v);
653 return -EINVAL;
654 }
656 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
658 #ifdef __x86_64__
659 if ( c.nat->ctrlreg[1] )
660 {
661 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
663 if ( !mfn_valid(cr3_pfn) ||
664 (paging_mode_refcounts(d)
665 ? !get_page(mfn_to_page(cr3_pfn), d)
666 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
667 PGT_base_page_table)) )
668 {
669 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
670 v->arch.guest_table = pagetable_null();
671 if ( paging_mode_refcounts(d) )
672 put_page(mfn_to_page(cr3_pfn));
673 else
674 put_page_and_type(mfn_to_page(cr3_pfn));
675 destroy_gdt(v);
676 return -EINVAL;
677 }
679 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
680 }
681 #endif
682 }
683 #ifdef CONFIG_COMPAT
684 else
685 {
686 l4_pgentry_t *l4tab;
688 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
690 if ( !mfn_valid(cr3_pfn) ||
691 (paging_mode_refcounts(d)
692 ? !get_page(mfn_to_page(cr3_pfn), d)
693 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
694 PGT_l3_page_table)) )
695 {
696 destroy_gdt(v);
697 return -EINVAL;
698 }
700 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
701 *l4tab = l4e_from_pfn(
702 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
703 }
704 #endif
706 if ( v->vcpu_id == 0 )
707 update_domain_wallclock_time(d);
709 /* Don't redo final setup */
710 v->is_initialised = 1;
712 if ( paging_mode_enabled(d) )
713 paging_update_paging_modes(v);
715 update_cr3(v);
717 out:
718 if ( flags & VGCF_online )
719 clear_bit(_VPF_down, &v->pause_flags);
720 else
721 set_bit(_VPF_down, &v->pause_flags);
722 return 0;
723 #undef c
724 }
726 void arch_vcpu_reset(struct vcpu *v)
727 {
728 if ( !is_hvm_vcpu(v) )
729 {
730 destroy_gdt(v);
731 vcpu_destroy_pagetables(v);
732 }
733 else
734 {
735 vcpu_end_shutdown_deferral(v);
736 }
737 }
739 /*
740 * Unmap the vcpu info page if the guest decided to place it somewhere
741 * else. This is only used from arch_domain_destroy, so there's no
742 * need to do anything clever.
743 */
744 static void
745 unmap_vcpu_info(struct vcpu *v)
746 {
747 struct domain *d = v->domain;
748 unsigned long mfn;
750 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
751 return;
753 mfn = v->arch.vcpu_info_mfn;
754 unmap_domain_page_global(v->vcpu_info);
756 v->vcpu_info = (void *)&shared_info(d, vcpu_info[v->vcpu_id]);
757 v->arch.vcpu_info_mfn = INVALID_MFN;
759 put_page_and_type(mfn_to_page(mfn));
760 }
762 /*
763 * Map a guest page in and point the vcpu_info pointer at it. This
764 * makes sure that the vcpu_info is always pointing at a valid piece
765 * of memory, and it sets a pending event to make sure that a pending
766 * event doesn't get missed.
767 */
768 static int
769 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
770 {
771 struct domain *d = v->domain;
772 void *mapping;
773 vcpu_info_t *new_info;
774 int i;
776 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
777 return -EINVAL;
779 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
780 return -EINVAL;
782 /* Run this command on yourself or on other offline VCPUS. */
783 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
784 return -EINVAL;
786 mfn = gmfn_to_mfn(d, mfn);
787 if ( !mfn_valid(mfn) ||
788 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
789 return -EINVAL;
791 mapping = map_domain_page_global(mfn);
792 if ( mapping == NULL )
793 {
794 put_page_and_type(mfn_to_page(mfn));
795 return -ENOMEM;
796 }
798 new_info = (vcpu_info_t *)(mapping + offset);
800 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
802 v->vcpu_info = new_info;
803 v->arch.vcpu_info_mfn = mfn;
805 /* Set new vcpu_info pointer /before/ setting pending flags. */
806 wmb();
808 /*
809 * Mark everything as being pending just to make sure nothing gets
810 * lost. The domain will get a spurious event, but it can cope.
811 */
812 vcpu_info(v, evtchn_upcall_pending) = 1;
813 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
814 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
816 return 0;
817 }
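
/*
 * Guest-side usage sketch (illustrative; vcpu_register_vcpu_info is the
 * public interface, the per-CPU variable and helpers are the usual guest
 * ones and are assumptions here):
 *
 *     struct vcpu_register_vcpu_info info = {
 *         .mfn    = virt_to_mfn(&per_cpu_vcpu_info[cpu]),
 *         .offset = offset_in_page(&per_cpu_vcpu_info[cpu]),
 *     };
 *     rc = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
 *
 * The hypervisor side of that call is arch_do_vcpu_op() below, which
 * invokes map_vcpu_info() under the domain lock.
 */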
819 long
820 arch_do_vcpu_op(
821 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
822 {
823 long rc = 0;
825 switch ( cmd )
826 {
827 case VCPUOP_register_runstate_memory_area:
828 {
829 struct vcpu_register_runstate_memory_area area;
830 struct vcpu_runstate_info runstate;
832 rc = -EFAULT;
833 if ( copy_from_guest(&area, arg, 1) )
834 break;
836 if ( !guest_handle_okay(area.addr.h, 1) )
837 break;
839 rc = 0;
840 runstate_guest(v) = area.addr.h;
842 if ( v == current )
843 {
844 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
845 }
846 else
847 {
848 vcpu_runstate_get(v, &runstate);
849 __copy_to_guest(runstate_guest(v), &runstate, 1);
850 }
852 break;
853 }
855 case VCPUOP_register_vcpu_info:
856 {
857 struct domain *d = v->domain;
858 struct vcpu_register_vcpu_info info;
860 rc = -EFAULT;
861 if ( copy_from_guest(&info, arg, 1) )
862 break;
864 domain_lock(d);
865 rc = map_vcpu_info(v, info.mfn, info.offset);
866 domain_unlock(d);
868 break;
869 }
871 case VCPUOP_get_physid:
872 {
873 struct vcpu_get_physid cpu_id;
875 rc = -EINVAL;
876 if ( !v->domain->is_pinned )
877 break;
879 cpu_id.phys_id =
880 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
881 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
883 rc = -EFAULT;
884 if ( copy_to_guest(arg, &cpu_id, 1) )
885 break;
887 rc = 0;
888 break;
889 }
891 default:
892 rc = -ENOSYS;
893 break;
894 }
896 return rc;
897 }
899 #ifdef __x86_64__
901 #define loadsegment(seg,value) ({ \
902 int __r = 1; \
903 asm volatile ( \
904 "1: movl %k1,%%" #seg "\n2:\n" \
905 ".section .fixup,\"ax\"\n" \
906 "3: xorl %k0,%k0\n" \
907 " movl %k0,%%" #seg "\n" \
908 " jmp 2b\n" \
909 ".previous\n" \
910 ".section __ex_table,\"a\"\n" \
911 " .align 8\n" \
912 " .quad 1b,3b\n" \
913 ".previous" \
914 : "=r" (__r) : "r" (value), "0" (__r) );\
915 __r; })
917 /*
918 * save_segments() writes a mask of segments which are dirty (non-zero),
919 * allowing load_segments() to avoid some expensive segment loads and
920 * MSR writes.
921 */
922 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
923 #define DIRTY_DS 0x01
924 #define DIRTY_ES 0x02
925 #define DIRTY_FS 0x04
926 #define DIRTY_GS 0x08
927 #define DIRTY_FS_BASE 0x10
928 #define DIRTY_GS_BASE_USER 0x20
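
/*
 * Flow summary (added for clarity): save_segments() below records, at
 * switch-out, which selectors/bases of the outgoing vcpu were non-zero;
 * load_segments() at switch-in then skips reloading any segment for which
 * the outgoing state was clean and the incoming selector is null, which is
 * the common case for 64-bit PV guests.
 */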
930 static void load_segments(struct vcpu *n)
931 {
932 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
933 int all_segs_okay = 1;
934 unsigned int dirty_segment_mask, cpu = smp_processor_id();
936 /* Load and clear the dirty segment mask. */
937 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
938 per_cpu(dirty_segment_mask, cpu) = 0;
940 /* Either selector != 0 ==> reload. */
941 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
942 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
944 /* Either selector != 0 ==> reload. */
945 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
946 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
948 /*
949 * Either selector != 0 ==> reload.
950 * Also reload to reset FS_BASE if it was non-zero.
951 */
952 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
953 nctxt->user_regs.fs) )
954 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
956 /*
957 * Either selector != 0 ==> reload.
958 * Also reload to reset GS_BASE if it was non-zero.
959 */
960 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
961 nctxt->user_regs.gs) )
962 {
963 /* Reset GS_BASE with user %gs? */
964 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
965 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
966 }
968 if ( !is_pv_32on64_domain(n->domain) )
969 {
970 /* This can only be non-zero if selector is NULL. */
971 if ( nctxt->fs_base )
972 wrmsr(MSR_FS_BASE,
973 nctxt->fs_base,
974 nctxt->fs_base>>32);
976 /* Most kernels have non-zero GS base, so don't bother testing. */
977 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
978 wrmsr(MSR_SHADOW_GS_BASE,
979 nctxt->gs_base_kernel,
980 nctxt->gs_base_kernel>>32);
982 /* This can only be non-zero if selector is NULL. */
983 if ( nctxt->gs_base_user )
984 wrmsr(MSR_GS_BASE,
985 nctxt->gs_base_user,
986 nctxt->gs_base_user>>32);
988 /* If in kernel mode then switch the GS bases around. */
989 if ( (n->arch.flags & TF_kernel_mode) )
990 asm volatile ( "swapgs" );
991 }
993 if ( unlikely(!all_segs_okay) )
994 {
995 struct cpu_user_regs *regs = guest_cpu_user_regs();
996 unsigned long *rsp =
997 (n->arch.flags & TF_kernel_mode) ?
998 (unsigned long *)regs->rsp :
999 (unsigned long *)nctxt->kernel_sp;
1000 unsigned long cs_and_mask, rflags;
1002 if ( is_pv_32on64_domain(n->domain) )
1004 unsigned int *esp = ring_1(regs) ?
1005 (unsigned int *)regs->rsp :
1006 (unsigned int *)nctxt->kernel_sp;
1007 unsigned int cs_and_mask, eflags;
1008 int ret = 0;
1010 /* CS longword also contains full evtchn_upcall_mask. */
1011 cs_and_mask = (unsigned short)regs->cs |
1012 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1013 /* Fold upcall mask into RFLAGS.IF. */
1014 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1015 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1017 if ( !ring_1(regs) )
1019 ret = put_user(regs->ss, esp-1);
1020 ret |= put_user(regs->_esp, esp-2);
1021 esp -= 2;
1024 if ( ret |
1025 put_user(eflags, esp-1) |
1026 put_user(cs_and_mask, esp-2) |
1027 put_user(regs->_eip, esp-3) |
1028 put_user(nctxt->user_regs.gs, esp-4) |
1029 put_user(nctxt->user_regs.fs, esp-5) |
1030 put_user(nctxt->user_regs.es, esp-6) |
1031 put_user(nctxt->user_regs.ds, esp-7) )
1033 gdprintk(XENLOG_ERR, "Error while creating compat "
1034 "failsafe callback frame.\n");
1035 domain_crash(n->domain);
1038 if ( test_bit(_VGCF_failsafe_disables_events,
1039 &n->arch.guest_context.flags) )
1040 vcpu_info(n, evtchn_upcall_mask) = 1;
1042 regs->entry_vector = TRAP_syscall;
1043 regs->_eflags &= 0xFFFCBEFFUL;
1044 regs->ss = FLAT_COMPAT_KERNEL_SS;
1045 regs->_esp = (unsigned long)(esp-7);
1046 regs->cs = FLAT_COMPAT_KERNEL_CS;
1047 regs->_eip = nctxt->failsafe_callback_eip;
1048 return;
1051 if ( !(n->arch.flags & TF_kernel_mode) )
1052 toggle_guest_mode(n);
1053 else
1054 regs->cs &= ~3;
1056 /* CS longword also contains full evtchn_upcall_mask. */
1057 cs_and_mask = (unsigned long)regs->cs |
1058 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1060 /* Fold upcall mask into RFLAGS.IF. */
1061 rflags = regs->rflags & ~X86_EFLAGS_IF;
1062 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1064 if ( put_user(regs->ss, rsp- 1) |
1065 put_user(regs->rsp, rsp- 2) |
1066 put_user(rflags, rsp- 3) |
1067 put_user(cs_and_mask, rsp- 4) |
1068 put_user(regs->rip, rsp- 5) |
1069 put_user(nctxt->user_regs.gs, rsp- 6) |
1070 put_user(nctxt->user_regs.fs, rsp- 7) |
1071 put_user(nctxt->user_regs.es, rsp- 8) |
1072 put_user(nctxt->user_regs.ds, rsp- 9) |
1073 put_user(regs->r11, rsp-10) |
1074 put_user(regs->rcx, rsp-11) )
1076 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1077 "callback frame.\n");
1078 domain_crash(n->domain);
1081 if ( test_bit(_VGCF_failsafe_disables_events,
1082 &n->arch.guest_context.flags) )
1083 vcpu_info(n, evtchn_upcall_mask) = 1;
1085 regs->entry_vector = TRAP_syscall;
1086 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1087 X86_EFLAGS_NT|X86_EFLAGS_TF);
1088 regs->ss = FLAT_KERNEL_SS;
1089 regs->rsp = (unsigned long)(rsp-11);
1090 regs->cs = FLAT_KERNEL_CS;
1091 regs->rip = nctxt->failsafe_callback_eip;
1095 static void save_segments(struct vcpu *v)
1097 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1098 struct cpu_user_regs *regs = &ctxt->user_regs;
1099 unsigned int dirty_segment_mask = 0;
1101 regs->ds = read_segment_register(ds);
1102 regs->es = read_segment_register(es);
1103 regs->fs = read_segment_register(fs);
1104 regs->gs = read_segment_register(gs);
1106 if ( regs->ds )
1107 dirty_segment_mask |= DIRTY_DS;
1109 if ( regs->es )
1110 dirty_segment_mask |= DIRTY_ES;
1112 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1114 dirty_segment_mask |= DIRTY_FS;
1115 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1117 else if ( ctxt->fs_base )
1119 dirty_segment_mask |= DIRTY_FS_BASE;
1122 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1124 dirty_segment_mask |= DIRTY_GS;
1125 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1127 else if ( ctxt->gs_base_user )
1129 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1132 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1135 #define switch_kernel_stack(v) ((void)0)
1137 #elif defined(__i386__)
1139 #define load_segments(n) ((void)0)
1140 #define save_segments(p) ((void)0)
1142 static inline void switch_kernel_stack(struct vcpu *v)
1144 struct tss_struct *tss = &init_tss[smp_processor_id()];
1145 tss->esp1 = v->arch.guest_context.kernel_sp;
1146 tss->ss1 = v->arch.guest_context.kernel_ss;
1149 #endif /* __i386__ */
1151 static void paravirt_ctxt_switch_from(struct vcpu *v)
1153 save_segments(v);
1155 /*
1156 * Disable debug breakpoints. We do this aggressively because if we switch
1157 * to an HVM guest we may load DR0-DR3 with values that can cause #DB
1158 * inside Xen, before we get a chance to reload DR7, and this cannot always
1159 * safely be handled.
1160 */
1161 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1162 write_debugreg(7, 0);
1165 static void paravirt_ctxt_switch_to(struct vcpu *v)
1167 unsigned long cr4;
1169 set_int80_direct_trap(v);
1170 switch_kernel_stack(v);
1172 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1173 if ( unlikely(cr4 != read_cr4()) )
1174 write_cr4(cr4);
1176 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1178 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1179 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1180 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1181 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1182 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1183 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1187 static inline int need_full_gdt(struct vcpu *v)
1189 return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
1192 static void __context_switch(void)
1194 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1195 unsigned int cpu = smp_processor_id();
1196 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1197 struct vcpu *n = current;
1198 struct desc_struct *gdt;
1199 struct desc_ptr gdt_desc;
1201 ASSERT(p != n);
1202 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1204 if ( !is_idle_vcpu(p) )
1206 memcpy(&p->arch.guest_context.user_regs,
1207 stack_regs,
1208 CTXT_SWITCH_STACK_BYTES);
1209 unlazy_fpu(p);
1210 p->arch.ctxt_switch_from(p);
1213 if ( !is_idle_vcpu(n) )
1215 memcpy(stack_regs,
1216 &n->arch.guest_context.user_regs,
1217 CTXT_SWITCH_STACK_BYTES);
1218 n->arch.ctxt_switch_to(n);
1221 if ( p->domain != n->domain )
1222 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1223 cpu_set(cpu, n->vcpu_dirty_cpumask);
1225 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1226 per_cpu(compat_gdt_table, cpu);
1227 if ( need_full_gdt(n) )
1229 struct page_info *page = virt_to_page(gdt);
1230 unsigned int i;
1231 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1232 l1e_write(n->domain->arch.mm_perdomain_pt +
1233 (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
1234 FIRST_RESERVED_GDT_PAGE + i,
1235 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1238 if ( need_full_gdt(p) &&
1239 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
1241 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1242 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1243 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1246 write_ptbase(n);
1248 if ( need_full_gdt(n) &&
1249 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
1251 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1252 gdt_desc.base = GDT_VIRT_START(n);
1253 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1256 if ( p->domain != n->domain )
1257 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1258 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1260 per_cpu(curr_vcpu, cpu) = n;
1264 void context_switch(struct vcpu *prev, struct vcpu *next)
1266 unsigned int cpu = smp_processor_id();
1267 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1269 ASSERT(local_irq_is_enabled());
1271 /* Allow at most one CPU at a time to be dirty. */
1272 ASSERT(cpus_weight(dirty_mask) <= 1);
1273 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1275 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1276 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1277 flush_tlb_mask(next->vcpu_dirty_cpumask);
1280 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1281 pt_save_timer(prev);
1283 local_irq_disable();
1285 set_current(next);
1287 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1289 local_irq_enable();
1291 else
1293 __context_switch();
1295 #ifdef CONFIG_COMPAT
1296 if ( !is_hvm_vcpu(next) &&
1297 (is_idle_vcpu(prev) ||
1298 is_hvm_vcpu(prev) ||
1299 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1301 uint64_t efer = read_efer();
1302 if ( !(efer & EFER_SCE) )
1303 write_efer(efer | EFER_SCE);
1305 #endif
1307 /* Re-enable interrupts before restoring state which may fault. */
1308 local_irq_enable();
1310 if ( !is_hvm_vcpu(next) )
1312 load_LDT(next);
1313 load_segments(next);
1317 context_saved(prev);
1319 /* Update per-VCPU guest runstate shared memory area (if registered). */
1320 if ( !guest_handle_is_null(runstate_guest(next)) )
1322 if ( !is_pv_32on64_domain(next->domain) )
1323 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1324 #ifdef CONFIG_COMPAT
1325 else
1327 struct compat_vcpu_runstate_info info;
1329 XLAT_vcpu_runstate_info(&info, &next->runstate);
1330 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1332 #endif
1335 schedule_tail(next);
1336 BUG();
1339 void continue_running(struct vcpu *same)
1341 schedule_tail(same);
1342 BUG();
1345 int __sync_lazy_execstate(void)
1347 unsigned long flags;
1348 int switch_required;
1350 local_irq_save(flags);
1352 switch_required = (this_cpu(curr_vcpu) != current);
1354 if ( switch_required )
1356 ASSERT(current == idle_vcpu[smp_processor_id()]);
1357 __context_switch();
1360 local_irq_restore(flags);
1362 return switch_required;
1365 void sync_vcpu_execstate(struct vcpu *v)
1367 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1368 (void)__sync_lazy_execstate();
1370 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1371 flush_tlb_mask(v->vcpu_dirty_cpumask);
1374 struct migrate_info {
1375 long (*func)(void *data);
1376 void *data;
1377 void (*saved_schedule_tail)(struct vcpu *);
1378 cpumask_t saved_affinity;
1379 unsigned int nest;
1380 };
1382 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1384 struct cpu_user_regs *regs = guest_cpu_user_regs();
1385 struct migrate_info *info = v->arch.continue_info;
1386 cpumask_t mask = info->saved_affinity;
1387 void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
1389 regs->eax = info->func(info->data);
1391 if ( info->nest-- == 0 )
1393 xfree(info);
1394 v->arch.schedule_tail = saved_schedule_tail;
1395 v->arch.continue_info = NULL;
1396 vcpu_unlock_affinity(v, &mask);
1399 (*saved_schedule_tail)(v);
1402 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1404 struct vcpu *v = current;
1405 struct migrate_info *info;
1406 cpumask_t mask = cpumask_of_cpu(cpu);
1407 int rc;
1409 if ( cpu == smp_processor_id() )
1410 return func(data);
1412 info = v->arch.continue_info;
1413 if ( info == NULL )
1415 info = xmalloc(struct migrate_info);
1416 if ( info == NULL )
1417 return -ENOMEM;
1419 rc = vcpu_lock_affinity(v, &mask);
1420 if ( rc )
1422 xfree(info);
1423 return rc;
1426 info->saved_schedule_tail = v->arch.schedule_tail;
1427 info->saved_affinity = mask;
1428 info->nest = 0;
1430 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1431 v->arch.continue_info = info;
1433 else
1435 BUG_ON(info->nest != 0);
1436 rc = vcpu_locked_change_affinity(v, &mask);
1437 if ( rc )
1438 return rc;
1439 info->nest++;
1442 info->func = func;
1443 info->data = data;
1445 /* Dummy return value will be overwritten by new schedule_tail. */
1446 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1447 return 0;
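
/*
 * Usage sketch (illustrative, hypothetical call site): a hypercall that
 * must run on a specific physical CPU defers itself with
 *
 *     static long do_work_on_cpu(void *data) { ... }
 *     ...
 *     return continue_hypercall_on_cpu(target_cpu, do_work_on_cpu, data);
 *
 * The dummy return value is overwritten by
 * continue_hypercall_on_cpu_helper() once the vcpu has been moved to
 * target_cpu via the temporary schedule_tail/affinity switch above.
 */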
1450 #define next_arg(fmt, args) ({ \
1451 unsigned long __arg; \
1452 switch ( *(fmt)++ ) \
1453 { \
1454 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1455 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1456 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1457 default: __arg = 0; BUG(); \
1458 } \
1459 __arg; \
1460 })
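
/*
 * Format characters consumed by next_arg() above, as used by
 * hypercall_create_continuation() below: 'i' = unsigned int,
 * 'l' = unsigned long, 'h' = guest handle.  A typical (illustrative)
 * caller re-queues itself with something like:
 *
 *     return hypercall_create_continuation(__HYPERVISOR_memory_op, "lh",
 *                                          cmd, arg);
 */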
1462 DEFINE_PER_CPU(char, hc_preempted);
1464 unsigned long hypercall_create_continuation(
1465 unsigned int op, const char *format, ...)
1467 struct mc_state *mcs = &this_cpu(mc_state);
1468 struct cpu_user_regs *regs;
1469 const char *p = format;
1470 unsigned long arg;
1471 unsigned int i;
1472 va_list args;
1474 va_start(args, format);
1476 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1478 __set_bit(_MCSF_call_preempted, &mcs->flags);
1480 for ( i = 0; *p != '\0'; i++ )
1481 mcs->call.args[i] = next_arg(p, args);
1482 if ( is_pv_32on64_domain(current->domain) )
1484 for ( ; i < 6; i++ )
1485 mcs->call.args[i] = 0;
1488 else
1490 regs = guest_cpu_user_regs();
1491 regs->eax = op;
1492 /*
1493 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1494 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1495 */
1496 if ( !is_hvm_vcpu(current) )
1497 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1499 #ifdef __x86_64__
1500 if ( !is_hvm_vcpu(current) ?
1501 !is_pv_32on64_vcpu(current) :
1502 (hvm_guest_x86_mode(current) == 8) )
1504 for ( i = 0; *p != '\0'; i++ )
1506 arg = next_arg(p, args);
1507 switch ( i )
1509 case 0: regs->rdi = arg; break;
1510 case 1: regs->rsi = arg; break;
1511 case 2: regs->rdx = arg; break;
1512 case 3: regs->r10 = arg; break;
1513 case 4: regs->r8 = arg; break;
1514 case 5: regs->r9 = arg; break;
1518 else
1519 #endif
1521 if ( supervisor_mode_kernel )
1522 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1524 for ( i = 0; *p != '\0'; i++ )
1526 arg = next_arg(p, args);
1527 switch ( i )
1529 case 0: regs->ebx = arg; break;
1530 case 1: regs->ecx = arg; break;
1531 case 2: regs->edx = arg; break;
1532 case 3: regs->esi = arg; break;
1533 case 4: regs->edi = arg; break;
1534 case 5: regs->ebp = arg; break;
1539 this_cpu(hc_preempted) = 1;
1542 va_end(args);
1544 return op;
1547 #ifdef CONFIG_COMPAT
1548 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1550 int rc = 0;
1551 struct mc_state *mcs = &this_cpu(mc_state);
1552 struct cpu_user_regs *regs;
1553 unsigned int i, cval = 0;
1554 unsigned long nval = 0;
1555 va_list args;
1557 BUG_ON(*id > 5);
1558 BUG_ON(mask & (1U << *id));
1560 va_start(args, mask);
1562 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1564 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1565 return 0;
1566 for ( i = 0; i < 6; ++i, mask >>= 1 )
1568 if ( mask & 1 )
1570 nval = va_arg(args, unsigned long);
1571 cval = va_arg(args, unsigned int);
1572 if ( cval == nval )
1573 mask &= ~1U;
1574 else
1575 BUG_ON(nval == (unsigned int)nval);
1577 else if ( id && *id == i )
1579 *id = mcs->call.args[i];
1580 id = NULL;
1582 if ( (mask & 1) && mcs->call.args[i] == nval )
1584 mcs->call.args[i] = cval;
1585 ++rc;
1587 else
1588 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1591 else
1593 regs = guest_cpu_user_regs();
1594 for ( i = 0; i < 6; ++i, mask >>= 1 )
1596 unsigned long *reg;
1598 switch ( i )
1600 case 0: reg = &regs->ebx; break;
1601 case 1: reg = &regs->ecx; break;
1602 case 2: reg = &regs->edx; break;
1603 case 3: reg = &regs->esi; break;
1604 case 4: reg = &regs->edi; break;
1605 case 5: reg = &regs->ebp; break;
1606 default: BUG(); reg = NULL; break;
1608 if ( (mask & 1) )
1610 nval = va_arg(args, unsigned long);
1611 cval = va_arg(args, unsigned int);
1612 if ( cval == nval )
1613 mask &= ~1U;
1614 else
1615 BUG_ON(nval == (unsigned int)nval);
1617 else if ( id && *id == i )
1619 *id = *reg;
1620 id = NULL;
1622 if ( (mask & 1) && *reg == nval )
1624 *reg = cval;
1625 ++rc;
1627 else
1628 BUG_ON(*reg != (unsigned int)*reg);
1632 va_end(args);
1634 return rc;
1636 #endif
1638 static int relinquish_memory(
1639 struct domain *d, struct list_head *list, unsigned long type)
1641 struct list_head *ent;
1642 struct page_info *page;
1643 unsigned long x, y;
1644 int ret = 0;
1646 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1647 spin_lock_recursive(&d->page_alloc_lock);
1649 ent = list->next;
1650 while ( ent != list )
1652 page = list_entry(ent, struct page_info, list);
1654 /* Grab a reference to the page so it won't disappear from under us. */
1655 if ( unlikely(!get_page(page, d)) )
1657 /* Couldn't get a reference -- someone is freeing this page. */
1658 ent = ent->next;
1659 list_move_tail(&page->list, &d->arch.relmem_list);
1660 continue;
1663 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1664 ret = put_page_and_type_preemptible(page, 1);
1665 switch ( ret )
1667 case 0:
1668 break;
1669 case -EAGAIN:
1670 case -EINTR:
1671 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1672 put_page(page);
1673 goto out;
1674 default:
1675 BUG();
1678 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1679 put_page(page);
1681 /*
1682 * Forcibly invalidate top-most, still valid page tables at this point
1683 * to break circular 'linear page table' references as well as clean up
1684 * partially validated pages. This is okay because MMU structures are
1685 * not shared across domains and this domain is now dead. Thus top-most
1686 * valid tables are not in use so a non-zero count means circular
1687 * reference or partially validated.
1688 */
1689 y = page->u.inuse.type_info;
1690 for ( ; ; )
1692 x = y;
1693 if ( likely((x & PGT_type_mask) != type) ||
1694 likely(!(x & (PGT_validated|PGT_partial))) )
1695 break;
1697 y = cmpxchg(&page->u.inuse.type_info, x,
1698 x & ~(PGT_validated|PGT_partial));
1699 if ( likely(y == x) )
1701 /* No need for atomic update of type_info here: no one else updates it. */
1702 switch ( ret = free_page_type(page, x, 1) )
1704 case 0:
1705 break;
1706 case -EINTR:
1707 page->u.inuse.type_info |= PGT_validated;
1708 if ( x & PGT_partial )
1709 put_page(page);
1710 put_page(page);
1711 ret = -EAGAIN;
1712 goto out;
1713 case -EAGAIN:
1714 page->u.inuse.type_info |= PGT_partial;
1715 if ( x & PGT_partial )
1716 put_page(page);
1717 goto out;
1718 default:
1719 BUG();
1721 if ( x & PGT_partial )
1723 page->u.inuse.type_info--;
1724 put_page(page);
1726 break;
1730 /* Follow the list chain and /then/ potentially free the page. */
1731 ent = ent->next;
1732 list_move_tail(&page->list, &d->arch.relmem_list);
1733 put_page(page);
1735 if ( hypercall_preempt_check() )
1737 ret = -EAGAIN;
1738 goto out;
1742 list_splice_init(&d->arch.relmem_list, list);
1744 out:
1745 spin_unlock_recursive(&d->page_alloc_lock);
1746 return ret;
1749 static void vcpu_destroy_pagetables(struct vcpu *v)
1751 struct domain *d = v->domain;
1752 unsigned long pfn;
1754 #ifdef __x86_64__
1755 if ( is_pv_32on64_vcpu(v) )
1757 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1758 __va(pagetable_get_paddr(v->arch.guest_table)));
1760 if ( pfn != 0 )
1762 if ( paging_mode_refcounts(d) )
1763 put_page(mfn_to_page(pfn));
1764 else
1765 put_page_and_type(mfn_to_page(pfn));
1768 l4e_write(
1769 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1770 l4e_empty());
1772 v->arch.cr3 = 0;
1773 return;
1775 #endif
1777 pfn = pagetable_get_pfn(v->arch.guest_table);
1778 if ( pfn != 0 )
1780 if ( paging_mode_refcounts(d) )
1781 put_page(mfn_to_page(pfn));
1782 else
1783 put_page_and_type(mfn_to_page(pfn));
1784 v->arch.guest_table = pagetable_null();
1787 #ifdef __x86_64__
1788 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1789 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1790 if ( pfn != 0 )
1792 if ( !is_pv_32bit_vcpu(v) )
1794 if ( paging_mode_refcounts(d) )
1795 put_page(mfn_to_page(pfn));
1796 else
1797 put_page_and_type(mfn_to_page(pfn));
1799 v->arch.guest_table_user = pagetable_null();
1801 #endif
1803 v->arch.cr3 = 0;
1806 int domain_relinquish_resources(struct domain *d)
1808 int ret;
1809 struct vcpu *v;
1811 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1813 switch ( d->arch.relmem )
1815 case RELMEM_not_started:
1816 /* Tear down paging-assistance stuff. */
1817 paging_teardown(d);
1819 for_each_vcpu ( d, v )
1821 /* Drop the in-use references to page-table bases. */
1822 vcpu_destroy_pagetables(v);
1824 /*
1825 * Relinquish GDT mappings. No need for explicit unmapping of the
1826 * LDT as it automatically gets squashed with the guest mappings.
1827 */
1828 destroy_gdt(v);
1830 unmap_vcpu_info(v);
1833 if ( d->arch.pirq_eoi_map != NULL )
1835 unmap_domain_page_global(d->arch.pirq_eoi_map);
1836 put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1837 d->arch.pirq_eoi_map = NULL;
1840 d->arch.relmem = RELMEM_xen;
1841 /* fallthrough */
1843 /* Relinquish every page of memory. */
1844 case RELMEM_xen:
1845 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1846 if ( ret )
1847 return ret;
1848 #if CONFIG_PAGING_LEVELS >= 4
1849 d->arch.relmem = RELMEM_l4;
1850 /* fallthrough */
1852 case RELMEM_l4:
1853 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1854 if ( ret )
1855 return ret;
1856 #endif
1857 #if CONFIG_PAGING_LEVELS >= 3
1858 d->arch.relmem = RELMEM_l3;
1859 /* fallthrough */
1861 case RELMEM_l3:
1862 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1863 if ( ret )
1864 return ret;
1865 #endif
1866 d->arch.relmem = RELMEM_l2;
1867 /* fallthrough */
1869 case RELMEM_l2:
1870 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1871 if ( ret )
1872 return ret;
1873 d->arch.relmem = RELMEM_done;
1874 /* fallthrough */
1876 case RELMEM_done:
1877 break;
1879 default:
1880 BUG();
1883 if ( is_hvm_domain(d) )
1884 hvm_domain_relinquish_resources(d);
1886 return 0;
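
/*
 * Note added for clarity: teardown is restartable.  d->arch.relmem advances
 * RELMEM_not_started -> RELMEM_xen -> RELMEM_l4 -> RELMEM_l3 -> RELMEM_l2
 * -> RELMEM_done, and relinquish_memory() bails out with -EAGAIN whenever
 * hypercall_preempt_check() fires, so domain_kill() can call back in here
 * until 0 is finally returned.
 */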
1889 void arch_dump_domain_info(struct domain *d)
1891 paging_dump_domain_info(d);
1894 void arch_dump_vcpu_info(struct vcpu *v)
1896 paging_dump_vcpu_info(v);
1899 void domain_cpuid(
1900 struct domain *d,
1901 unsigned int input,
1902 unsigned int sub_input,
1903 unsigned int *eax,
1904 unsigned int *ebx,
1905 unsigned int *ecx,
1906 unsigned int *edx)
1908 cpuid_input_t *cpuid;
1909 int i;
1911 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
1913 cpuid = &d->arch.cpuids[i];
1915 if ( (cpuid->input[0] == input) &&
1916 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
1917 (cpuid->input[1] == sub_input)) )
1919 *eax = cpuid->eax;
1920 *ebx = cpuid->ebx;
1921 *ecx = cpuid->ecx;
1922 *edx = cpuid->edx;
1923 return;
1927 *eax = *ebx = *ecx = *edx = 0;
1930 void vcpu_kick(struct vcpu *v)
1932 /*
1933 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
1934 * pending flag. These values may fluctuate (after all, we hold no
1935 * locks) but the key insight is that each change will cause
1936 * evtchn_upcall_pending to be polled.
1938 * NB2. We save the running flag across the unblock to avoid a needless
1939 * IPI for domains that we IPI'd to unblock.
1940 */
1941 bool_t running = v->is_running;
1942 vcpu_unblock(v);
1943 if ( running && (in_irq() || (v != current)) )
1944 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
1947 void vcpu_mark_events_pending(struct vcpu *v)
1949 int already_pending = test_and_set_bit(
1950 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
1952 if ( already_pending )
1953 return;
1955 if ( is_hvm_vcpu(v) )
1956 hvm_assert_evtchn_irq(v);
1957 else
1958 vcpu_kick(v);
1961 static void vcpu_kick_softirq(void)
1963 /*
1964 * Nothing to do here: we merely prevent notifiers from racing with checks
1965 * executed on return to guest context with interrupts enabled. See, for
1966 * example, xxx_intr_assist() executed on return to HVM guest context.
1967 */
1970 static int __init init_vcpu_kick_softirq(void)
1972 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
1973 return 0;
1975 __initcall(init_vcpu_kick_softirq);
1978 /*
1979 * Local variables:
1980 * mode: C
1981 * c-set-style: "BSD"
1982 * c-basic-offset: 4
1983 * tab-width: 4
1984 * indent-tabs-mode: nil
1985 * End:
1986 */