ia64/xen-unstable

xen/arch/x86/domain.c @ 18743:ae100f264f6a

x86: Fix relinquish_memory() for PGT_partial pages.

Original patch by Jan Beulich.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Oct 29 13:09:37 2008 +0000 (2008-10-29)
parents bec755616e8e
children 9e5cf6778a6d
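
The change in this revision concerns relinquish_memory()'s forcible invalidation of page-table pages (the full function appears later in this listing): when the validated/partial bits are cleared and the type is freed, a page that still had PGT_partial set carries an extra type reference that has to be dropped as well. The stand-alone C sketch below only illustrates that pattern; the flag values, the fake_page structure and fake_free_page_type() are simplified stand-ins invented for the example, not code or constants from the Xen tree.

/* Illustrative sketch only: simplified stand-ins for Xen's type_info flags. */
#include <stdint.h>
#include <stdio.h>

#define PGT_type_mask   0xf0000000u   /* bits holding the page type (simplified) */
#define PGT_validated   0x08000000u   /* type validation completed */
#define PGT_partial     0x04000000u   /* (de)validation was preempted part-way */

struct fake_page { uint32_t type_info; };

/* Stand-in for free_page_type(); always succeeds in this sketch. */
static int fake_free_page_type(struct fake_page *pg) { (void)pg; return 0; }

/*
 * Mirror of the invalidation step: strip the validated/partial bits, free
 * the type, and, if PGT_partial was set, drop the extra type reference that
 * the flag represented.  Single-threaded sketch, so a plain store stands in
 * for the hypervisor's cmpxchg() loop.
 */
static void invalidate(struct fake_page *pg, uint32_t type)
{
    uint32_t x = pg->type_info;

    if ( ((x & PGT_type_mask) != type) ||
         !(x & (PGT_validated | PGT_partial)) )
        return;

    pg->type_info = x & ~(PGT_validated | PGT_partial);
    if ( fake_free_page_type(pg) != 0 )
        return;
    if ( x & PGT_partial )
        pg->type_info--;              /* drop the reference PGT_partial held */
}

int main(void)
{
    /* A partially validated page of type 0x1 (in the type field) with a
     * type count of 2 in the low bits. */
    struct fake_page pg = { .type_info = 0x10000000u | PGT_partial | 2u };

    invalidate(&pg, 0x10000000u);
    printf("type_info after invalidation: %#x\n", (unsigned)pg.type_info);
    return 0;
}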
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <asm/regs.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/system.h>
38 #include <asm/io.h>
39 #include <asm/processor.h>
40 #include <asm/desc.h>
41 #include <asm/i387.h>
42 #include <asm/mpspec.h>
43 #include <asm/ldt.h>
44 #include <asm/hypercall.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/support.h>
47 #include <asm/debugreg.h>
48 #include <asm/msr.h>
49 #include <asm/nmi.h>
50 #include <xen/numa.h>
51 #include <xen/iommu.h>
52 #ifdef CONFIG_COMPAT
53 #include <compat/vcpu.h>
54 #endif
56 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
57 DEFINE_PER_CPU(u64, efer);
58 DEFINE_PER_CPU(unsigned long, cr4);
60 static void default_idle(void);
61 void (*pm_idle) (void) = default_idle;
63 static void paravirt_ctxt_switch_from(struct vcpu *v);
64 static void paravirt_ctxt_switch_to(struct vcpu *v);
66 static void vcpu_destroy_pagetables(struct vcpu *v);
68 static void continue_idle_domain(struct vcpu *v)
69 {
70 reset_stack_and_jump(idle_loop);
71 }
73 static void continue_nonidle_domain(struct vcpu *v)
74 {
75 reset_stack_and_jump(ret_from_intr);
76 }
78 static void default_idle(void)
79 {
80 local_irq_disable();
81 if ( !softirq_pending(smp_processor_id()) )
82 safe_halt();
83 else
84 local_irq_enable();
85 }
87 static void play_dead(void)
88 {
89 /*
90 * Flush pending softirqs if any. They can be queued up before this CPU
91 * was taken out of cpu_online_map in __cpu_disable().
92 */
93 do_softirq();
95 /* This must be done before dead CPU ack */
96 cpu_exit_clear();
97 hvm_cpu_down();
98 wbinvd();
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
103 /* With physical CPU hotplug, we should halt the cpu. */
104 local_irq_disable();
105 for ( ; ; )
106 halt();
107 }
109 void idle_loop(void)
110 {
111 for ( ; ; )
112 {
113 if ( cpu_is_offline(smp_processor_id()) )
114 play_dead();
115 page_scrub_schedule_work();
116 (*pm_idle)();
117 do_softirq();
118 }
119 }
121 void startup_cpu_idle_loop(void)
122 {
123 struct vcpu *v = current;
125 ASSERT(is_idle_vcpu(v));
126 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
127 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
129 reset_stack_and_jump(idle_loop);
130 }
132 void dump_pageframe_info(struct domain *d)
133 {
134 struct page_info *page;
136 printk("Memory pages belonging to domain %u:\n", d->domain_id);
138 if ( d->tot_pages >= 10 )
139 {
140 printk(" DomPage list too long to display\n");
141 }
142 else
143 {
144 list_for_each_entry ( page, &d->page_list, list )
145 {
146 printk(" DomPage %p: caf=%08x, taf=%" PRtype_info "\n",
147 _p(page_to_mfn(page)),
148 page->count_info, page->u.inuse.type_info);
149 }
150 }
152 list_for_each_entry ( page, &d->xenpage_list, list )
153 {
154 printk(" XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
155 _p(page_to_mfn(page)),
156 page->count_info, page->u.inuse.type_info);
157 }
158 }
160 struct vcpu *alloc_vcpu_struct(void)
161 {
162 struct vcpu *v;
163 if ( (v = xmalloc(struct vcpu)) != NULL )
164 memset(v, 0, sizeof(*v));
165 return v;
166 }
168 void free_vcpu_struct(struct vcpu *v)
169 {
170 xfree(v);
171 }
173 #ifdef CONFIG_COMPAT
175 static int setup_compat_l4(struct vcpu *v)
176 {
177 struct page_info *pg;
178 l4_pgentry_t *l4tab;
180 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
181 if ( pg == NULL )
182 return -ENOMEM;
184 /* This page needs to look like a pagetable so that it can be shadowed */
185 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
187 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
188 l4tab[0] = l4e_empty();
189 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
190 l4e_from_page(pg, __PAGE_HYPERVISOR);
191 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
192 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
193 __PAGE_HYPERVISOR);
195 v->arch.guest_table = pagetable_from_page(pg);
196 v->arch.guest_table_user = v->arch.guest_table;
198 return 0;
199 }
201 static void release_compat_l4(struct vcpu *v)
202 {
203 free_domheap_page(pagetable_get_page(v->arch.guest_table));
204 v->arch.guest_table = pagetable_null();
205 v->arch.guest_table_user = pagetable_null();
206 }
208 static inline int may_switch_mode(struct domain *d)
209 {
210 return (!is_hvm_domain(d) && (d->tot_pages == 0));
211 }
213 int switch_native(struct domain *d)
214 {
215 unsigned int vcpuid;
217 if ( d == NULL )
218 return -EINVAL;
219 if ( !may_switch_mode(d) )
220 return -EACCES;
221 if ( !is_pv_32on64_domain(d) )
222 return 0;
224 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
226 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
227 {
228 if (d->vcpu[vcpuid])
229 release_compat_l4(d->vcpu[vcpuid]);
230 }
232 return 0;
233 }
235 int switch_compat(struct domain *d)
236 {
237 unsigned int vcpuid;
239 if ( d == NULL )
240 return -EINVAL;
241 if ( !may_switch_mode(d) )
242 return -EACCES;
243 if ( is_pv_32on64_domain(d) )
244 return 0;
246 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
248 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
249 {
250 if ( (d->vcpu[vcpuid] != NULL) &&
251 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
252 goto undo_and_fail;
253 }
255 domain_set_alloc_bitsize(d);
257 return 0;
259 undo_and_fail:
260 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
261 while ( vcpuid-- != 0 )
262 {
263 if ( d->vcpu[vcpuid] != NULL )
264 release_compat_l4(d->vcpu[vcpuid]);
265 }
266 return -ENOMEM;
267 }
269 #else
270 #define setup_compat_l4(v) 0
271 #define release_compat_l4(v) ((void)0)
272 #endif
274 int vcpu_initialise(struct vcpu *v)
275 {
276 struct domain *d = v->domain;
277 int rc;
279 v->arch.vcpu_info_mfn = INVALID_MFN;
281 v->arch.flags = TF_kernel_mode;
283 #if defined(__i386__)
284 mapcache_vcpu_init(v);
285 #endif
287 pae_l3_cache_init(&v->arch.pae_l3_cache);
289 paging_vcpu_init(v);
291 if ( is_hvm_domain(d) )
292 {
293 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
294 return rc;
295 }
296 else
297 {
298 /* PV guests by default have a 100Hz ticker. */
299 if ( !is_idle_domain(d) )
300 v->periodic_period = MILLISECS(10);
302 /* PV guests get an emulated PIT too for video BIOSes to use. */
303 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
304 pit_init(v, cpu_khz);
306 v->arch.schedule_tail = continue_nonidle_domain;
307 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
308 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
310 if ( is_idle_domain(d) )
311 {
312 v->arch.schedule_tail = continue_idle_domain;
313 if ( v->vcpu_id )
314 v->arch.cr3 = d->vcpu[0]->arch.cr3;
315 else if ( !*idle_vcpu )
316 v->arch.cr3 = __pa(idle_pg_table);
317 else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
318 return -ENOMEM;
319 }
321 v->arch.guest_context.ctrlreg[4] =
322 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
323 }
325 v->arch.perdomain_ptes =
326 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
328 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
329 }
331 void vcpu_destroy(struct vcpu *v)
332 {
333 if ( is_pv_32on64_vcpu(v) )
334 release_compat_l4(v);
336 if ( is_hvm_vcpu(v) )
337 hvm_vcpu_destroy(v);
338 }
340 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
341 {
342 #ifdef __x86_64__
343 struct page_info *pg;
344 #endif
345 int i, pdpt_order, paging_initialised = 0;
346 int rc = -ENOMEM;
348 d->arch.hvm_domain.hap_enabled =
349 is_hvm_domain(d) &&
350 hvm_funcs.hap_supported &&
351 (domcr_flags & DOMCRF_hap);
353 INIT_LIST_HEAD(&d->arch.pdev_list);
355 d->arch.relmem = RELMEM_not_started;
356 INIT_LIST_HEAD(&d->arch.relmem_list);
358 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
359 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
360 if ( d->arch.mm_perdomain_pt == NULL )
361 goto fail;
362 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
364 #if defined(__i386__)
366 mapcache_domain_init(d);
368 #else /* __x86_64__ */
370 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
371 if ( pg == NULL )
372 goto fail;
373 d->arch.mm_perdomain_l2 = page_to_virt(pg);
374 clear_page(d->arch.mm_perdomain_l2);
375 for ( i = 0; i < (1 << pdpt_order); i++ )
376 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
377 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
378 __PAGE_HYPERVISOR);
380 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
381 if ( pg == NULL )
382 goto fail;
383 d->arch.mm_perdomain_l3 = page_to_virt(pg);
384 clear_page(d->arch.mm_perdomain_l3);
385 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
386 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
387 __PAGE_HYPERVISOR);
389 #endif /* __x86_64__ */
391 #ifdef CONFIG_COMPAT
392 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
393 #endif
395 if ( (rc = paging_domain_init(d)) != 0 )
396 goto fail;
397 paging_initialised = 1;
399 if ( !is_idle_domain(d) )
400 {
401 d->arch.ioport_caps =
402 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
403 rc = -ENOMEM;
404 if ( d->arch.ioport_caps == NULL )
405 goto fail;
407 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
408 goto fail;
410 clear_page(d->shared_info);
411 share_xen_page_with_guest(
412 virt_to_page(d->shared_info), d, XENSHARE_writable);
414 if ( (rc = iommu_domain_init(d)) != 0 )
415 goto fail;
416 }
418 if ( is_hvm_domain(d) )
419 {
420 if ( (rc = hvm_domain_initialise(d)) != 0 )
421 {
422 iommu_domain_destroy(d);
423 goto fail;
424 }
425 }
426 else
427 {
428 /* 32-bit PV guest by default only if Xen is not 64-bit. */
429 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
430 (CONFIG_PAGING_LEVELS != 4);
431 }
433 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
434 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
435 {
436 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
437 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
438 }
440 return 0;
442 fail:
443 d->is_dying = DOMDYING_dead;
444 free_xenheap_page(d->shared_info);
445 if ( paging_initialised )
446 paging_final_teardown(d);
447 #ifdef __x86_64__
448 if ( d->arch.mm_perdomain_l2 )
449 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
450 if ( d->arch.mm_perdomain_l3 )
451 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
452 #endif
453 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
454 return rc;
455 }
457 void arch_domain_destroy(struct domain *d)
458 {
459 if ( is_hvm_domain(d) )
460 hvm_domain_destroy(d);
462 pci_release_devices(d);
463 free_domain_pirqs(d);
464 if ( !is_idle_domain(d) )
465 iommu_domain_destroy(d);
467 paging_final_teardown(d);
469 free_xenheap_pages(
470 d->arch.mm_perdomain_pt,
471 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
473 #ifdef __x86_64__
474 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
475 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
476 #endif
478 free_xenheap_page(d->shared_info);
479 }
481 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
482 {
483 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
485 hv_cr4_mask = ~X86_CR4_TSD;
486 if ( cpu_has_de )
487 hv_cr4_mask &= ~X86_CR4_DE;
489 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
490 gdprintk(XENLOG_WARNING,
491 "Attempt to change CR4 flags %08lx -> %08lx\n",
492 hv_cr4, guest_cr4);
494 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
495 }
497 /* This is called by arch_final_setup_guest and do_boot_vcpu */
498 int arch_set_info_guest(
499 struct vcpu *v, vcpu_guest_context_u c)
500 {
501 struct domain *d = v->domain;
502 unsigned long cr3_pfn = INVALID_MFN;
503 unsigned long flags, cr4;
504 int i, rc = 0, compat;
506 /* The context is a compat-mode one if the target domain is compat-mode;
507 * we expect the tools to DTRT even in compat-mode callers. */
508 compat = is_pv_32on64_domain(d);
510 #ifdef CONFIG_COMPAT
511 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
512 #else
513 #define c(fld) (c.nat->fld)
514 #endif
515 flags = c(flags);
517 if ( !is_hvm_vcpu(v) )
518 {
519 if ( !compat )
520 {
521 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
522 fixup_guest_stack_selector(d, c.nat->kernel_ss);
523 fixup_guest_code_selector(d, c.nat->user_regs.cs);
524 #ifdef __i386__
525 fixup_guest_code_selector(d, c.nat->event_callback_cs);
526 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
527 #endif
529 for ( i = 0; i < 256; i++ )
530 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
532 /* LDT safety checks. */
533 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
534 (c.nat->ldt_ents > 8192) ||
535 !array_access_ok(c.nat->ldt_base,
536 c.nat->ldt_ents,
537 LDT_ENTRY_SIZE) )
538 return -EINVAL;
539 }
540 #ifdef CONFIG_COMPAT
541 else
542 {
543 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
544 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
545 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
546 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
547 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
549 for ( i = 0; i < 256; i++ )
550 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
552 /* LDT safety checks. */
553 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
554 (c.cmp->ldt_ents > 8192) ||
555 !compat_array_access_ok(c.cmp->ldt_base,
556 c.cmp->ldt_ents,
557 LDT_ENTRY_SIZE) )
558 return -EINVAL;
559 }
560 #endif
561 }
563 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
565 v->arch.flags &= ~TF_kernel_mode;
566 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
567 v->arch.flags |= TF_kernel_mode;
569 if ( !compat )
570 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
571 #ifdef CONFIG_COMPAT
572 else
573 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
574 #endif
576 v->arch.guest_context.user_regs.eflags |= 2;
578 if ( is_hvm_vcpu(v) )
579 {
580 hvm_set_info_guest(v);
581 goto out;
582 }
584 /* Only CR0.TS is modifiable by guest or admin. */
585 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
586 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
588 init_int80_direct_trap(v);
590 /* IOPL privileges are virtualised. */
591 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
592 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
594 /* Ensure real hardware interrupts are enabled. */
595 v->arch.guest_context.user_regs.eflags |= EF_IE;
597 cr4 = v->arch.guest_context.ctrlreg[4];
598 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
599 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
601 memset(v->arch.guest_context.debugreg, 0,
602 sizeof(v->arch.guest_context.debugreg));
603 for ( i = 0; i < 8; i++ )
604 (void)set_debugreg(v, i, c(debugreg[i]));
606 if ( v->is_initialised )
607 goto out;
609 if ( v->vcpu_id == 0 )
610 d->vm_assist = c(vm_assist);
612 if ( !compat )
613 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
614 #ifdef CONFIG_COMPAT
615 else
616 {
617 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
618 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
620 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
621 return -EINVAL;
622 for ( i = 0; i < n; ++i )
623 gdt_frames[i] = c.cmp->gdt_frames[i];
624 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
625 }
626 #endif
627 if ( rc != 0 )
628 return rc;
630 if ( !compat )
631 {
632 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
634 if ( !mfn_valid(cr3_pfn) ||
635 (paging_mode_refcounts(d)
636 ? !get_page(mfn_to_page(cr3_pfn), d)
637 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
638 PGT_base_page_table)) )
639 {
640 destroy_gdt(v);
641 return -EINVAL;
642 }
644 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
646 #ifdef __x86_64__
647 if ( c.nat->ctrlreg[1] )
648 {
649 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
651 if ( !mfn_valid(cr3_pfn) ||
652 (paging_mode_refcounts(d)
653 ? !get_page(mfn_to_page(cr3_pfn), d)
654 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
655 PGT_base_page_table)) )
656 {
657 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
658 v->arch.guest_table = pagetable_null();
659 if ( paging_mode_refcounts(d) )
660 put_page(mfn_to_page(cr3_pfn));
661 else
662 put_page_and_type(mfn_to_page(cr3_pfn));
663 destroy_gdt(v);
664 return -EINVAL;
665 }
667 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
668 }
669 #endif
670 }
671 #ifdef CONFIG_COMPAT
672 else
673 {
674 l4_pgentry_t *l4tab;
676 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
678 if ( !mfn_valid(cr3_pfn) ||
679 (paging_mode_refcounts(d)
680 ? !get_page(mfn_to_page(cr3_pfn), d)
681 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
682 PGT_l3_page_table)) )
683 {
684 destroy_gdt(v);
685 return -EINVAL;
686 }
688 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
689 *l4tab = l4e_from_pfn(
690 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
691 }
692 #endif
694 if ( v->vcpu_id == 0 )
695 update_domain_wallclock_time(d);
697 /* Don't redo final setup */
698 v->is_initialised = 1;
700 if ( paging_mode_enabled(d) )
701 paging_update_paging_modes(v);
703 update_cr3(v);
705 out:
706 if ( flags & VGCF_online )
707 clear_bit(_VPF_down, &v->pause_flags);
708 else
709 set_bit(_VPF_down, &v->pause_flags);
710 return 0;
711 #undef c
712 }
714 void arch_vcpu_reset(struct vcpu *v)
715 {
716 if ( !is_hvm_vcpu(v) )
717 {
718 destroy_gdt(v);
719 vcpu_destroy_pagetables(v);
720 }
721 else
722 {
723 vcpu_end_shutdown_deferral(v);
724 }
725 }
727 /*
728 * Unmap the vcpu info page if the guest decided to place it somewhere
729 * else. This is only used from arch_domain_destroy, so there's no
730 * need to do anything clever.
731 */
732 static void
733 unmap_vcpu_info(struct vcpu *v)
734 {
735 struct domain *d = v->domain;
736 unsigned long mfn;
738 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
739 return;
741 mfn = v->arch.vcpu_info_mfn;
742 unmap_domain_page_global(v->vcpu_info);
744 v->vcpu_info = (void *)&shared_info(d, vcpu_info[v->vcpu_id]);
745 v->arch.vcpu_info_mfn = INVALID_MFN;
747 put_page_and_type(mfn_to_page(mfn));
748 }
750 /*
751 * Map a guest page in and point the vcpu_info pointer at it. This
752 * makes sure that the vcpu_info is always pointing at a valid piece
753 * of memory, and it sets a pending event to make sure that a pending
754 * event doesn't get missed.
755 */
756 static int
757 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
758 {
759 struct domain *d = v->domain;
760 void *mapping;
761 vcpu_info_t *new_info;
762 int i;
764 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
765 return -EINVAL;
767 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
768 return -EINVAL;
770 /* Run this command on yourself or on other offline VCPUS. */
771 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
772 return -EINVAL;
774 mfn = gmfn_to_mfn(d, mfn);
775 if ( !mfn_valid(mfn) ||
776 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
777 return -EINVAL;
779 mapping = map_domain_page_global(mfn);
780 if ( mapping == NULL )
781 {
782 put_page_and_type(mfn_to_page(mfn));
783 return -ENOMEM;
784 }
786 new_info = (vcpu_info_t *)(mapping + offset);
788 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
790 v->vcpu_info = new_info;
791 v->arch.vcpu_info_mfn = mfn;
793 /* Set new vcpu_info pointer /before/ setting pending flags. */
794 wmb();
796 /*
797 * Mark everything as being pending just to make sure nothing gets
798 * lost. The domain will get a spurious event, but it can cope.
799 */
800 vcpu_info(v, evtchn_upcall_pending) = 1;
801 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
802 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
804 return 0;
805 }
807 long
808 arch_do_vcpu_op(
809 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
810 {
811 long rc = 0;
813 switch ( cmd )
814 {
815 case VCPUOP_register_runstate_memory_area:
816 {
817 struct vcpu_register_runstate_memory_area area;
818 struct vcpu_runstate_info runstate;
820 rc = -EFAULT;
821 if ( copy_from_guest(&area, arg, 1) )
822 break;
824 if ( !guest_handle_okay(area.addr.h, 1) )
825 break;
827 rc = 0;
828 runstate_guest(v) = area.addr.h;
830 if ( v == current )
831 {
832 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
833 }
834 else
835 {
836 vcpu_runstate_get(v, &runstate);
837 __copy_to_guest(runstate_guest(v), &runstate, 1);
838 }
840 break;
841 }
843 case VCPUOP_register_vcpu_info:
844 {
845 struct domain *d = v->domain;
846 struct vcpu_register_vcpu_info info;
848 rc = -EFAULT;
849 if ( copy_from_guest(&info, arg, 1) )
850 break;
852 domain_lock(d);
853 rc = map_vcpu_info(v, info.mfn, info.offset);
854 domain_unlock(d);
856 break;
857 }
859 case VCPUOP_get_physid:
860 {
861 struct vcpu_get_physid cpu_id;
863 rc = -EINVAL;
864 if ( !v->domain->is_pinned )
865 break;
867 cpu_id.phys_id =
868 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
869 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
871 rc = -EFAULT;
872 if ( copy_to_guest(arg, &cpu_id, 1) )
873 break;
875 rc = 0;
876 break;
877 }
879 default:
880 rc = -ENOSYS;
881 break;
882 }
884 return rc;
885 }
887 #ifdef __x86_64__
889 #define loadsegment(seg,value) ({ \
890 int __r = 1; \
891 asm volatile ( \
892 "1: movl %k1,%%" #seg "\n2:\n" \
893 ".section .fixup,\"ax\"\n" \
894 "3: xorl %k0,%k0\n" \
895 " movl %k0,%%" #seg "\n" \
896 " jmp 2b\n" \
897 ".previous\n" \
898 ".section __ex_table,\"a\"\n" \
899 " .align 8\n" \
900 " .quad 1b,3b\n" \
901 ".previous" \
902 : "=r" (__r) : "r" (value), "0" (__r) );\
903 __r; })
905 /*
906 * save_segments() writes a mask of segments which are dirty (non-zero),
907 * allowing load_segments() to avoid some expensive segment loads and
908 * MSR writes.
909 */
910 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
911 #define DIRTY_DS 0x01
912 #define DIRTY_ES 0x02
913 #define DIRTY_FS 0x04
914 #define DIRTY_GS 0x08
915 #define DIRTY_FS_BASE 0x10
916 #define DIRTY_GS_BASE_USER 0x20
918 static void load_segments(struct vcpu *n)
919 {
920 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
921 int all_segs_okay = 1;
922 unsigned int dirty_segment_mask, cpu = smp_processor_id();
924 /* Load and clear the dirty segment mask. */
925 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
926 per_cpu(dirty_segment_mask, cpu) = 0;
928 /* Either selector != 0 ==> reload. */
929 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
930 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
932 /* Either selector != 0 ==> reload. */
933 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
934 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
936 /*
937 * Either selector != 0 ==> reload.
938 * Also reload to reset FS_BASE if it was non-zero.
939 */
940 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
941 nctxt->user_regs.fs) )
942 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
944 /*
945 * Either selector != 0 ==> reload.
946 * Also reload to reset GS_BASE if it was non-zero.
947 */
948 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
949 nctxt->user_regs.gs) )
950 {
951 /* Reset GS_BASE with user %gs? */
952 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
953 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
954 }
956 if ( !is_pv_32on64_domain(n->domain) )
957 {
958 /* This can only be non-zero if selector is NULL. */
959 if ( nctxt->fs_base )
960 wrmsr(MSR_FS_BASE,
961 nctxt->fs_base,
962 nctxt->fs_base>>32);
964 /* Most kernels have non-zero GS base, so don't bother testing. */
965 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
966 wrmsr(MSR_SHADOW_GS_BASE,
967 nctxt->gs_base_kernel,
968 nctxt->gs_base_kernel>>32);
970 /* This can only be non-zero if selector is NULL. */
971 if ( nctxt->gs_base_user )
972 wrmsr(MSR_GS_BASE,
973 nctxt->gs_base_user,
974 nctxt->gs_base_user>>32);
976 /* If in kernel mode then switch the GS bases around. */
977 if ( (n->arch.flags & TF_kernel_mode) )
978 asm volatile ( "swapgs" );
979 }
981 if ( unlikely(!all_segs_okay) )
982 {
983 struct cpu_user_regs *regs = guest_cpu_user_regs();
984 unsigned long *rsp =
985 (n->arch.flags & TF_kernel_mode) ?
986 (unsigned long *)regs->rsp :
987 (unsigned long *)nctxt->kernel_sp;
988 unsigned long cs_and_mask, rflags;
990 if ( is_pv_32on64_domain(n->domain) )
991 {
992 unsigned int *esp = ring_1(regs) ?
993 (unsigned int *)regs->rsp :
994 (unsigned int *)nctxt->kernel_sp;
995 unsigned int cs_and_mask, eflags;
996 int ret = 0;
998 /* CS longword also contains full evtchn_upcall_mask. */
999 cs_and_mask = (unsigned short)regs->cs |
1000 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1001 /* Fold upcall mask into RFLAGS.IF. */
1002 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1003 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1005 if ( !ring_1(regs) )
1006 {
1007 ret = put_user(regs->ss, esp-1);
1008 ret |= put_user(regs->_esp, esp-2);
1009 esp -= 2;
1010 }
1012 if ( ret |
1013 put_user(eflags, esp-1) |
1014 put_user(cs_and_mask, esp-2) |
1015 put_user(regs->_eip, esp-3) |
1016 put_user(nctxt->user_regs.gs, esp-4) |
1017 put_user(nctxt->user_regs.fs, esp-5) |
1018 put_user(nctxt->user_regs.es, esp-6) |
1019 put_user(nctxt->user_regs.ds, esp-7) )
1020 {
1021 gdprintk(XENLOG_ERR, "Error while creating compat "
1022 "failsafe callback frame.\n");
1023 domain_crash(n->domain);
1024 }
1026 if ( test_bit(_VGCF_failsafe_disables_events,
1027 &n->arch.guest_context.flags) )
1028 vcpu_info(n, evtchn_upcall_mask) = 1;
1030 regs->entry_vector = TRAP_syscall;
1031 regs->_eflags &= 0xFFFCBEFFUL;
1032 regs->ss = FLAT_COMPAT_KERNEL_SS;
1033 regs->_esp = (unsigned long)(esp-7);
1034 regs->cs = FLAT_COMPAT_KERNEL_CS;
1035 regs->_eip = nctxt->failsafe_callback_eip;
1036 return;
1037 }
1039 if ( !(n->arch.flags & TF_kernel_mode) )
1040 toggle_guest_mode(n);
1041 else
1042 regs->cs &= ~3;
1044 /* CS longword also contains full evtchn_upcall_mask. */
1045 cs_and_mask = (unsigned long)regs->cs |
1046 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1048 /* Fold upcall mask into RFLAGS.IF. */
1049 rflags = regs->rflags & ~X86_EFLAGS_IF;
1050 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1052 if ( put_user(regs->ss, rsp- 1) |
1053 put_user(regs->rsp, rsp- 2) |
1054 put_user(rflags, rsp- 3) |
1055 put_user(cs_and_mask, rsp- 4) |
1056 put_user(regs->rip, rsp- 5) |
1057 put_user(nctxt->user_regs.gs, rsp- 6) |
1058 put_user(nctxt->user_regs.fs, rsp- 7) |
1059 put_user(nctxt->user_regs.es, rsp- 8) |
1060 put_user(nctxt->user_regs.ds, rsp- 9) |
1061 put_user(regs->r11, rsp-10) |
1062 put_user(regs->rcx, rsp-11) )
1063 {
1064 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1065 "callback frame.\n");
1066 domain_crash(n->domain);
1067 }
1069 if ( test_bit(_VGCF_failsafe_disables_events,
1070 &n->arch.guest_context.flags) )
1071 vcpu_info(n, evtchn_upcall_mask) = 1;
1073 regs->entry_vector = TRAP_syscall;
1074 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1075 X86_EFLAGS_NT|X86_EFLAGS_TF);
1076 regs->ss = FLAT_KERNEL_SS;
1077 regs->rsp = (unsigned long)(rsp-11);
1078 regs->cs = FLAT_KERNEL_CS;
1079 regs->rip = nctxt->failsafe_callback_eip;
1080 }
1081 }
1083 static void save_segments(struct vcpu *v)
1084 {
1085 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1086 struct cpu_user_regs *regs = &ctxt->user_regs;
1087 unsigned int dirty_segment_mask = 0;
1089 regs->ds = read_segment_register(ds);
1090 regs->es = read_segment_register(es);
1091 regs->fs = read_segment_register(fs);
1092 regs->gs = read_segment_register(gs);
1094 if ( regs->ds )
1095 dirty_segment_mask |= DIRTY_DS;
1097 if ( regs->es )
1098 dirty_segment_mask |= DIRTY_ES;
1100 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1101 {
1102 dirty_segment_mask |= DIRTY_FS;
1103 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1104 }
1105 else if ( ctxt->fs_base )
1106 {
1107 dirty_segment_mask |= DIRTY_FS_BASE;
1108 }
1110 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1111 {
1112 dirty_segment_mask |= DIRTY_GS;
1113 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1114 }
1115 else if ( ctxt->gs_base_user )
1116 {
1117 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1118 }
1120 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1121 }
1123 #define switch_kernel_stack(v) ((void)0)
1125 #elif defined(__i386__)
1127 #define load_segments(n) ((void)0)
1128 #define save_segments(p) ((void)0)
1130 static inline void switch_kernel_stack(struct vcpu *v)
1131 {
1132 struct tss_struct *tss = &init_tss[smp_processor_id()];
1133 tss->esp1 = v->arch.guest_context.kernel_sp;
1134 tss->ss1 = v->arch.guest_context.kernel_ss;
1135 }
1137 #endif /* __i386__ */
1139 static void paravirt_ctxt_switch_from(struct vcpu *v)
1140 {
1141 save_segments(v);
1143 /*
1144 * Disable debug breakpoints. We do this aggressively because if we switch
1145 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1146 * inside Xen, before we get a chance to reload DR7, and this cannot always
1147 * safely be handled.
1148 */
1149 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1150 write_debugreg(7, 0);
1151 }
1153 static void paravirt_ctxt_switch_to(struct vcpu *v)
1154 {
1155 unsigned long cr4;
1157 set_int80_direct_trap(v);
1158 switch_kernel_stack(v);
1160 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1161 if ( unlikely(cr4 != read_cr4()) )
1162 write_cr4(cr4);
1164 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1165 {
1166 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1167 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1168 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1169 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1170 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1171 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1172 }
1173 }
1175 static void __context_switch(void)
1176 {
1177 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1178 unsigned int i, cpu = smp_processor_id();
1179 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1180 struct vcpu *n = current;
1181 struct desc_struct *gdt;
1182 struct page_info *page;
1183 struct desc_ptr gdt_desc;
1185 ASSERT(p != n);
1186 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1188 if ( !is_idle_vcpu(p) )
1189 {
1190 memcpy(&p->arch.guest_context.user_regs,
1191 stack_regs,
1192 CTXT_SWITCH_STACK_BYTES);
1193 unlazy_fpu(p);
1194 p->arch.ctxt_switch_from(p);
1195 }
1197 if ( !is_idle_vcpu(n) )
1198 {
1199 memcpy(stack_regs,
1200 &n->arch.guest_context.user_regs,
1201 CTXT_SWITCH_STACK_BYTES);
1202 n->arch.ctxt_switch_to(n);
1203 }
1205 if ( p->domain != n->domain )
1206 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1207 cpu_set(cpu, n->vcpu_dirty_cpumask);
1209 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1210 per_cpu(compat_gdt_table, cpu);
1211 page = virt_to_page(gdt);
1212 for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
1213 {
1214 l1e_write(n->domain->arch.mm_perdomain_pt +
1215 (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
1216 FIRST_RESERVED_GDT_PAGE + i,
1217 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1218 }
1220 if ( p->vcpu_id != n->vcpu_id )
1221 {
1222 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1223 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1224 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1225 }
1227 write_ptbase(n);
1229 if ( p->vcpu_id != n->vcpu_id )
1230 {
1231 gdt_desc.base = GDT_VIRT_START(n);
1232 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1233 }
1235 if ( p->domain != n->domain )
1236 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1237 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1239 per_cpu(curr_vcpu, cpu) = n;
1240 }
1243 void context_switch(struct vcpu *prev, struct vcpu *next)
1244 {
1245 unsigned int cpu = smp_processor_id();
1246 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1248 ASSERT(local_irq_is_enabled());
1250 /* Allow at most one CPU at a time to be dirty. */
1251 ASSERT(cpus_weight(dirty_mask) <= 1);
1252 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1253 {
1254 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1255 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1256 flush_tlb_mask(next->vcpu_dirty_cpumask);
1257 }
1259 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1260 pt_save_timer(prev);
1262 local_irq_disable();
1264 set_current(next);
1266 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1267 {
1268 local_irq_enable();
1269 }
1270 else
1271 {
1272 __context_switch();
1274 #ifdef CONFIG_COMPAT
1275 if ( !is_hvm_vcpu(next) &&
1276 (is_idle_vcpu(prev) ||
1277 is_hvm_vcpu(prev) ||
1278 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1279 {
1280 uint64_t efer = read_efer();
1281 if ( !(efer & EFER_SCE) )
1282 write_efer(efer | EFER_SCE);
1283 }
1284 #endif
1286 /* Re-enable interrupts before restoring state which may fault. */
1287 local_irq_enable();
1289 if ( !is_hvm_vcpu(next) )
1290 {
1291 load_LDT(next);
1292 load_segments(next);
1293 }
1294 }
1296 context_saved(prev);
1298 /* Update per-VCPU guest runstate shared memory area (if registered). */
1299 if ( !guest_handle_is_null(runstate_guest(next)) )
1300 {
1301 if ( !is_pv_32on64_domain(next->domain) )
1302 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1303 #ifdef CONFIG_COMPAT
1304 else
1305 {
1306 struct compat_vcpu_runstate_info info;
1308 XLAT_vcpu_runstate_info(&info, &next->runstate);
1309 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1310 }
1311 #endif
1312 }
1314 schedule_tail(next);
1315 BUG();
1316 }
1318 void continue_running(struct vcpu *same)
1319 {
1320 schedule_tail(same);
1321 BUG();
1322 }
1324 int __sync_lazy_execstate(void)
1325 {
1326 unsigned long flags;
1327 int switch_required;
1329 local_irq_save(flags);
1331 switch_required = (this_cpu(curr_vcpu) != current);
1333 if ( switch_required )
1334 {
1335 ASSERT(current == idle_vcpu[smp_processor_id()]);
1336 __context_switch();
1337 }
1339 local_irq_restore(flags);
1341 return switch_required;
1342 }
1344 void sync_vcpu_execstate(struct vcpu *v)
1345 {
1346 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1347 (void)__sync_lazy_execstate();
1349 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1350 flush_tlb_mask(v->vcpu_dirty_cpumask);
1351 }
1353 struct migrate_info {
1354 long (*func)(void *data);
1355 void *data;
1356 void (*saved_schedule_tail)(struct vcpu *);
1357 cpumask_t saved_affinity;
1358 unsigned int nest;
1359 };
1361 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1362 {
1363 struct cpu_user_regs *regs = guest_cpu_user_regs();
1364 struct migrate_info *info = v->arch.continue_info;
1365 cpumask_t mask = info->saved_affinity;
1366 void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
1368 regs->eax = info->func(info->data);
1370 if ( info->nest-- == 0 )
1371 {
1372 xfree(info);
1373 v->arch.schedule_tail = saved_schedule_tail;
1374 v->arch.continue_info = NULL;
1375 vcpu_unlock_affinity(v, &mask);
1376 }
1378 (*saved_schedule_tail)(v);
1379 }
1381 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1382 {
1383 struct vcpu *v = current;
1384 struct migrate_info *info;
1385 cpumask_t mask = cpumask_of_cpu(cpu);
1386 int rc;
1388 if ( cpu == smp_processor_id() )
1389 return func(data);
1391 info = v->arch.continue_info;
1392 if ( info == NULL )
1393 {
1394 info = xmalloc(struct migrate_info);
1395 if ( info == NULL )
1396 return -ENOMEM;
1398 rc = vcpu_lock_affinity(v, &mask);
1399 if ( rc )
1400 {
1401 xfree(info);
1402 return rc;
1403 }
1405 info->saved_schedule_tail = v->arch.schedule_tail;
1406 info->saved_affinity = mask;
1407 info->nest = 0;
1409 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1410 v->arch.continue_info = info;
1411 }
1412 else
1413 {
1414 BUG_ON(info->nest != 0);
1415 rc = vcpu_locked_change_affinity(v, &mask);
1416 if ( rc )
1417 return rc;
1418 info->nest++;
1419 }
1421 info->func = func;
1422 info->data = data;
1424 /* Dummy return value will be overwritten by new schedule_tail. */
1425 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1426 return 0;
1427 }
1429 #define next_arg(fmt, args) ({ \
1430 unsigned long __arg; \
1431 switch ( *(fmt)++ ) \
1432 { \
1433 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1434 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1435 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1436 default: __arg = 0; BUG(); \
1437 } \
1438 __arg; \
1439 })
1441 DEFINE_PER_CPU(char, hc_preempted);
1443 unsigned long hypercall_create_continuation(
1444 unsigned int op, const char *format, ...)
1445 {
1446 struct mc_state *mcs = &this_cpu(mc_state);
1447 struct cpu_user_regs *regs;
1448 const char *p = format;
1449 unsigned long arg;
1450 unsigned int i;
1451 va_list args;
1453 va_start(args, format);
1455 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1456 {
1457 __set_bit(_MCSF_call_preempted, &mcs->flags);
1459 for ( i = 0; *p != '\0'; i++ )
1460 mcs->call.args[i] = next_arg(p, args);
1461 if ( is_pv_32on64_domain(current->domain) )
1462 {
1463 for ( ; i < 6; i++ )
1464 mcs->call.args[i] = 0;
1465 }
1466 }
1467 else
1468 {
1469 regs = guest_cpu_user_regs();
1470 regs->eax = op;
1471 /*
1472 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1473 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1474 */
1475 if ( !is_hvm_vcpu(current) )
1476 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1478 #ifdef __x86_64__
1479 if ( !is_hvm_vcpu(current) ?
1480 !is_pv_32on64_vcpu(current) :
1481 (hvm_guest_x86_mode(current) == 8) )
1482 {
1483 for ( i = 0; *p != '\0'; i++ )
1484 {
1485 arg = next_arg(p, args);
1486 switch ( i )
1487 {
1488 case 0: regs->rdi = arg; break;
1489 case 1: regs->rsi = arg; break;
1490 case 2: regs->rdx = arg; break;
1491 case 3: regs->r10 = arg; break;
1492 case 4: regs->r8 = arg; break;
1493 case 5: regs->r9 = arg; break;
1494 }
1495 }
1496 }
1497 else
1498 #endif
1499 {
1500 if ( supervisor_mode_kernel )
1501 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1503 for ( i = 0; *p != '\0'; i++ )
1504 {
1505 arg = next_arg(p, args);
1506 switch ( i )
1507 {
1508 case 0: regs->ebx = arg; break;
1509 case 1: regs->ecx = arg; break;
1510 case 2: regs->edx = arg; break;
1511 case 3: regs->esi = arg; break;
1512 case 4: regs->edi = arg; break;
1513 case 5: regs->ebp = arg; break;
1514 }
1515 }
1516 }
1518 this_cpu(hc_preempted) = 1;
1519 }
1521 va_end(args);
1523 return op;
1524 }
1526 #ifdef CONFIG_COMPAT
1527 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1528 {
1529 int rc = 0;
1530 struct mc_state *mcs = &this_cpu(mc_state);
1531 struct cpu_user_regs *regs;
1532 unsigned int i, cval = 0;
1533 unsigned long nval = 0;
1534 va_list args;
1536 BUG_ON(*id > 5);
1537 BUG_ON(mask & (1U << *id));
1539 va_start(args, mask);
1541 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1542 {
1543 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1544 return 0;
1545 for ( i = 0; i < 6; ++i, mask >>= 1 )
1546 {
1547 if ( mask & 1 )
1548 {
1549 nval = va_arg(args, unsigned long);
1550 cval = va_arg(args, unsigned int);
1551 if ( cval == nval )
1552 mask &= ~1U;
1553 else
1554 BUG_ON(nval == (unsigned int)nval);
1555 }
1556 else if ( id && *id == i )
1557 {
1558 *id = mcs->call.args[i];
1559 id = NULL;
1560 }
1561 if ( (mask & 1) && mcs->call.args[i] == nval )
1562 {
1563 mcs->call.args[i] = cval;
1564 ++rc;
1565 }
1566 else
1567 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1568 }
1569 }
1570 else
1571 {
1572 regs = guest_cpu_user_regs();
1573 for ( i = 0; i < 6; ++i, mask >>= 1 )
1574 {
1575 unsigned long *reg;
1577 switch ( i )
1578 {
1579 case 0: reg = &regs->ebx; break;
1580 case 1: reg = &regs->ecx; break;
1581 case 2: reg = &regs->edx; break;
1582 case 3: reg = &regs->esi; break;
1583 case 4: reg = &regs->edi; break;
1584 case 5: reg = &regs->ebp; break;
1585 default: BUG(); reg = NULL; break;
1586 }
1587 if ( (mask & 1) )
1588 {
1589 nval = va_arg(args, unsigned long);
1590 cval = va_arg(args, unsigned int);
1591 if ( cval == nval )
1592 mask &= ~1U;
1593 else
1594 BUG_ON(nval == (unsigned int)nval);
1595 }
1596 else if ( id && *id == i )
1597 {
1598 *id = *reg;
1599 id = NULL;
1600 }
1601 if ( (mask & 1) && *reg == nval )
1602 {
1603 *reg = cval;
1604 ++rc;
1605 }
1606 else
1607 BUG_ON(*reg != (unsigned int)*reg);
1608 }
1609 }
1611 va_end(args);
1613 return rc;
1614 }
1615 #endif
1617 static int relinquish_memory(
1618 struct domain *d, struct list_head *list, unsigned long type)
1619 {
1620 struct list_head *ent;
1621 struct page_info *page;
1622 unsigned long x, y;
1623 int ret = 0;
1625 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1626 spin_lock_recursive(&d->page_alloc_lock);
1628 ent = list->next;
1629 while ( ent != list )
1630 {
1631 page = list_entry(ent, struct page_info, list);
1633 /* Grab a reference to the page so it won't disappear from under us. */
1634 if ( unlikely(!get_page(page, d)) )
1635 {
1636 /* Couldn't get a reference -- someone is freeing this page. */
1637 ent = ent->next;
1638 list_move_tail(&page->list, &d->arch.relmem_list);
1639 continue;
1640 }
1642 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1643 put_page_and_type(page);
1645 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1646 put_page(page);
1648 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1649 /*
1650 * Forcibly drop reference counts of page tables above top most (which
1651 * were skipped to prevent long latencies due to deep recursion - see
1652 * the special treatment in free_lX_table()).
1653 */
1654 y = page->u.inuse.type_info;
1655 if ( (type < PGT_root_page_table) &&
1656 unlikely(((y + PGT_type_mask) &
1657 (PGT_type_mask|PGT_validated)) == type) )
1658 {
1659 BUG_ON((y & PGT_count_mask) >=
1660 (page->count_info & PGC_count_mask));
1661 while ( y & PGT_count_mask )
1662 {
1663 put_page_and_type(page);
1664 y = page->u.inuse.type_info;
1665 }
1666 }
1667 #endif
1669 /*
1670 * Forcibly invalidate top-most, still valid page tables at this point
1671 * to break circular 'linear page table' references as well as clean up
1672 * partially validated pages. This is okay because MMU structures are
1673 * not shared across domains and this domain is now dead. Thus top-most
1674 * valid tables are not in use so a non-zero count means circular
1675 * reference or partially validated.
1676 */
1677 y = page->u.inuse.type_info;
1678 for ( ; ; )
1679 {
1680 x = y;
1681 if ( likely((x & PGT_type_mask) != type) ||
1682 likely(!(x & (PGT_validated|PGT_partial))) )
1683 break;
1685 y = cmpxchg(&page->u.inuse.type_info, x,
1686 x & ~(PGT_validated|PGT_partial));
1687 if ( likely(y == x) )
1688 {
1689 if ( free_page_type(page, x, 0) != 0 )
1690 BUG();
1691 if ( x & PGT_partial )
1692 page->u.inuse.type_info--;
1693 break;
1694 }
1695 }
1697 /* Follow the list chain and /then/ potentially free the page. */
1698 ent = ent->next;
1699 list_move_tail(&page->list, &d->arch.relmem_list);
1700 put_page(page);
1702 if ( hypercall_preempt_check() )
1703 {
1704 ret = -EAGAIN;
1705 goto out;
1706 }
1707 }
1709 list_splice_init(&d->arch.relmem_list, list);
1711 out:
1712 spin_unlock_recursive(&d->page_alloc_lock);
1713 return ret;
1714 }
1716 static void vcpu_destroy_pagetables(struct vcpu *v)
1717 {
1718 struct domain *d = v->domain;
1719 unsigned long pfn;
1721 #ifdef __x86_64__
1722 if ( is_pv_32on64_vcpu(v) )
1723 {
1724 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1725 __va(pagetable_get_paddr(v->arch.guest_table)));
1727 if ( pfn != 0 )
1728 {
1729 if ( paging_mode_refcounts(d) )
1730 put_page(mfn_to_page(pfn));
1731 else
1732 put_page_and_type(mfn_to_page(pfn));
1733 }
1735 l4e_write(
1736 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1737 l4e_empty());
1739 v->arch.cr3 = 0;
1740 return;
1741 }
1742 #endif
1744 pfn = pagetable_get_pfn(v->arch.guest_table);
1745 if ( pfn != 0 )
1746 {
1747 if ( paging_mode_refcounts(d) )
1748 put_page(mfn_to_page(pfn));
1749 else
1750 put_page_and_type(mfn_to_page(pfn));
1751 v->arch.guest_table = pagetable_null();
1752 }
1754 #ifdef __x86_64__
1755 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1756 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1757 if ( pfn != 0 )
1758 {
1759 if ( !is_pv_32bit_vcpu(v) )
1760 {
1761 if ( paging_mode_refcounts(d) )
1762 put_page(mfn_to_page(pfn));
1763 else
1764 put_page_and_type(mfn_to_page(pfn));
1765 }
1766 v->arch.guest_table_user = pagetable_null();
1767 }
1768 #endif
1770 v->arch.cr3 = 0;
1771 }
1773 int domain_relinquish_resources(struct domain *d)
1774 {
1775 int ret;
1776 struct vcpu *v;
1778 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1780 switch ( d->arch.relmem )
1781 {
1782 case RELMEM_not_started:
1783 /* Tear down paging-assistance stuff. */
1784 paging_teardown(d);
1786 for_each_vcpu ( d, v )
1787 {
1788 /* Drop the in-use references to page-table bases. */
1789 vcpu_destroy_pagetables(v);
1791 /*
1792 * Relinquish GDT mappings. No need for explicit unmapping of the
1793 * LDT as it automatically gets squashed with the guest mappings.
1794 */
1795 destroy_gdt(v);
1797 unmap_vcpu_info(v);
1798 }
1800 d->arch.relmem = RELMEM_xen;
1801 /* fallthrough */
1803 /* Relinquish every page of memory. */
1804 case RELMEM_xen:
1805 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1806 if ( ret )
1807 return ret;
1808 #if CONFIG_PAGING_LEVELS >= 4
1809 d->arch.relmem = RELMEM_l4;
1810 /* fallthrough */
1812 case RELMEM_l4:
1813 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1814 if ( ret )
1815 return ret;
1816 #endif
1817 #if CONFIG_PAGING_LEVELS >= 3
1818 d->arch.relmem = RELMEM_l3;
1819 /* fallthrough */
1821 case RELMEM_l3:
1822 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1823 if ( ret )
1824 return ret;
1825 #endif
1826 d->arch.relmem = RELMEM_l2;
1827 /* fallthrough */
1829 case RELMEM_l2:
1830 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1831 if ( ret )
1832 return ret;
1833 d->arch.relmem = RELMEM_done;
1834 /* fallthrough */
1836 case RELMEM_done:
1837 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
1838 ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
1839 if ( ret )
1840 return ret;
1841 #endif
1842 break;
1844 default:
1845 BUG();
1846 }
1848 /* Free page used by xen oprofile buffer. */
1849 free_xenoprof_pages(d);
1851 if ( is_hvm_domain(d) )
1852 hvm_domain_relinquish_resources(d);
1854 return 0;
1855 }
1857 void arch_dump_domain_info(struct domain *d)
1858 {
1859 paging_dump_domain_info(d);
1860 }
1862 void arch_dump_vcpu_info(struct vcpu *v)
1863 {
1864 paging_dump_vcpu_info(v);
1865 }
1867 void domain_cpuid(
1868 struct domain *d,
1869 unsigned int input,
1870 unsigned int sub_input,
1871 unsigned int *eax,
1872 unsigned int *ebx,
1873 unsigned int *ecx,
1874 unsigned int *edx)
1875 {
1876 cpuid_input_t *cpuid;
1877 int i;
1879 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
1880 {
1881 cpuid = &d->arch.cpuids[i];
1883 if ( (cpuid->input[0] == input) &&
1884 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
1885 (cpuid->input[1] == sub_input)) )
1886 {
1887 *eax = cpuid->eax;
1888 *ebx = cpuid->ebx;
1889 *ecx = cpuid->ecx;
1890 *edx = cpuid->edx;
1891 return;
1892 }
1893 }
1895 *eax = *ebx = *ecx = *edx = 0;
1896 }
1898 void vcpu_kick(struct vcpu *v)
1899 {
1900 /*
1901 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
1902 * pending flag. These values may fluctuate (after all, we hold no
1903 * locks) but the key insight is that each change will cause
1904 * evtchn_upcall_pending to be polled.
1906 * NB2. We save the running flag across the unblock to avoid a needless
1907 * IPI for domains that we IPI'd to unblock.
1908 */
1909 bool_t running = v->is_running;
1910 vcpu_unblock(v);
1911 if ( running && (in_irq() || (v != current)) )
1912 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
1913 }
1915 void vcpu_mark_events_pending(struct vcpu *v)
1916 {
1917 int already_pending = test_and_set_bit(
1918 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
1920 if ( already_pending )
1921 return;
1923 if ( is_hvm_vcpu(v) )
1924 hvm_assert_evtchn_irq(v);
1925 else
1926 vcpu_kick(v);
1927 }
1929 static void vcpu_kick_softirq(void)
1930 {
1931 /*
1932 * Nothing to do here: we merely prevent notifiers from racing with checks
1933 * executed on return to guest context with interrupts enabled. See, for
1934 * example, xxx_intr_assist() executed on return to HVM guest context.
1935 */
1936 }
1938 static int __init init_vcpu_kick_softirq(void)
1939 {
1940 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
1941 return 0;
1942 }
1943 __initcall(init_vcpu_kick_softirq);
1946 /*
1947 * Local variables:
1948 * mode: C
1949 * c-set-style: "BSD"
1950 * c-basic-offset: 4
1951 * tab-width: 4
1952 * indent-tabs-mode: nil
1953 * End:
1954 */