xen/arch/x86/domain.c @ 19835:edfdeb150f27

line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <asm/regs.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/system.h>
38 #include <asm/io.h>
39 #include <asm/processor.h>
40 #include <asm/desc.h>
41 #include <asm/i387.h>
42 #include <asm/mpspec.h>
43 #include <asm/ldt.h>
44 #include <asm/hypercall.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/support.h>
47 #include <asm/debugreg.h>
48 #include <asm/msr.h>
49 #include <asm/traps.h>
50 #include <asm/nmi.h>
51 #include <xen/numa.h>
52 #include <xen/iommu.h>
53 #ifdef CONFIG_COMPAT
54 #include <compat/vcpu.h>
55 #endif
57 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
58 DEFINE_PER_CPU(u64, efer);
59 DEFINE_PER_CPU(unsigned long, cr4);
61 static void default_idle(void);
62 static void default_dead_idle(void);
63 void (*pm_idle) (void) = default_idle;
64 void (*dead_idle) (void) = default_dead_idle;
66 static void paravirt_ctxt_switch_from(struct vcpu *v);
67 static void paravirt_ctxt_switch_to(struct vcpu *v);
69 static void vcpu_destroy_pagetables(struct vcpu *v);
71 static void continue_idle_domain(struct vcpu *v)
72 {
73 reset_stack_and_jump(idle_loop);
74 }
76 static void continue_nonidle_domain(struct vcpu *v)
77 {
78 reset_stack_and_jump(ret_from_intr);
79 }
81 static void default_idle(void)
82 {
83 local_irq_disable();
84 if ( !softirq_pending(smp_processor_id()) )
85 safe_halt();
86 else
87 local_irq_enable();
88 }
90 static void default_dead_idle(void)
91 {
92 for ( ; ; )
93 halt();
94 }
96 static void play_dead(void)
97 {
98 /*
99 * Flush pending softirqs if any. They can be queued up before this CPU
100 * was taken out of cpu_online_map in __cpu_disable().
101 */
102 do_softirq();
104 /* This must be done before dead CPU ack */
105 cpu_exit_clear();
106 hvm_cpu_down();
107 wbinvd();
108 mb();
109 /* Ack it */
110 __get_cpu_var(cpu_state) = CPU_DEAD;
112 /* With physical CPU hotplug, we should halt the cpu. */
113 local_irq_disable();
114 (*dead_idle)();
115 }
117 void idle_loop(void)
118 {
119 for ( ; ; )
120 {
121 if ( cpu_is_offline(smp_processor_id()) )
122 play_dead();
123 page_scrub_schedule_work();
124 (*pm_idle)();
125 do_softirq();
126 }
127 }
129 void startup_cpu_idle_loop(void)
130 {
131 struct vcpu *v = current;
133 ASSERT(is_idle_vcpu(v));
134 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
135 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
137 reset_stack_and_jump(idle_loop);
138 }
140 void dump_pageframe_info(struct domain *d)
141 {
142 struct page_info *page;
144 printk("Memory pages belonging to domain %u:\n", d->domain_id);
146 if ( d->tot_pages >= 10 )
147 {
148 printk(" DomPage list too long to display\n");
149 }
150 else
151 {
152 page_list_for_each ( page, &d->page_list )
153 {
154 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
155 _p(page_to_mfn(page)),
156 page->count_info, page->u.inuse.type_info);
157 }
158 }
160 if ( is_hvm_domain(d) )
161 {
162 p2m_pod_dump_data(d);
163 }
165 page_list_for_each ( page, &d->xenpage_list )
166 {
167 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
168 _p(page_to_mfn(page)),
169 page->count_info, page->u.inuse.type_info);
170 }
171 }
173 struct domain *alloc_domain_struct(void)
174 {
175 struct domain *d;
176 /*
177 * We pack the MFN of the domain structure into a 32-bit field within
178 * the page_info structure. Hence the MEMF_bits() restriction.
179 */
180 d = alloc_xenheap_pages(
181 get_order_from_bytes(sizeof(*d)), MEMF_bits(32 + PAGE_SHIFT));
182 if ( d != NULL )
183 memset(d, 0, sizeof(*d));
184 return d;
185 }
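/*
 * Annotation (illustrative, not part of the original source): the arithmetic
 * behind the MEMF_bits() restriction above, assuming the usual PAGE_SHIFT of
 * 12 (4kB pages):
 *
 *   MEMF_bits(32 + PAGE_SHIFT)  =>  physical address < 2^(32+12)
 *   MFN = physical address >> PAGE_SHIFT  =>  MFN < 2^32
 *
 * i.e. the domain structure is placed low enough in memory that its machine
 * frame number fits the 32-bit field in struct page_info.
 */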
187 void free_domain_struct(struct domain *d)
188 {
189 free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
190 }
192 struct vcpu *alloc_vcpu_struct(void)
193 {
194 struct vcpu *v;
195 /*
196 * This structure contains embedded PAE PDPTEs, used when an HVM guest
197 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
198 * may require that the shadow CR3 points below 4GB, and hence the whole
199 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
200 */
201 v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
202 if ( v != NULL )
203 memset(v, 0, sizeof(*v));
204 return v;
205 }
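/*
 * Annotation (illustrative, not part of the original source): note the
 * difference from alloc_domain_struct() above. MEMF_bits(32) bounds the byte
 * address itself to below 4GB, not just the frame number, because a shadow
 * CR3 may point at the PDPTEs embedded in this structure and CR3 holds a
 * physical address that, per the comment above, the CPU may require to be
 * below 4GB.
 */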
207 void free_vcpu_struct(struct vcpu *v)
208 {
209 free_xenheap_pages(v, get_order_from_bytes(sizeof(*v)));
210 }
212 #ifdef CONFIG_COMPAT
214 static int setup_compat_l4(struct vcpu *v)
215 {
216 struct page_info *pg;
217 l4_pgentry_t *l4tab;
219 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
220 if ( pg == NULL )
221 return -ENOMEM;
223 /* This page needs to look like a pagetable so that it can be shadowed */
224 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
226 l4tab = page_to_virt(pg);
227 copy_page(l4tab, idle_pg_table);
228 l4tab[0] = l4e_empty();
229 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
230 l4e_from_page(pg, __PAGE_HYPERVISOR);
231 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
232 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
233 __PAGE_HYPERVISOR);
235 v->arch.guest_table = pagetable_from_page(pg);
236 v->arch.guest_table_user = v->arch.guest_table;
238 return 0;
239 }
241 static void release_compat_l4(struct vcpu *v)
242 {
243 free_domheap_page(pagetable_get_page(v->arch.guest_table));
244 v->arch.guest_table = pagetable_null();
245 v->arch.guest_table_user = pagetable_null();
246 }
248 static inline int may_switch_mode(struct domain *d)
249 {
250 return (!is_hvm_domain(d) && (d->tot_pages == 0));
251 }
253 int switch_native(struct domain *d)
254 {
255 unsigned int vcpuid;
257 if ( d == NULL )
258 return -EINVAL;
259 if ( !may_switch_mode(d) )
260 return -EACCES;
261 if ( !is_pv_32on64_domain(d) )
262 return 0;
264 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
266 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
267 {
268 if (d->vcpu[vcpuid])
269 release_compat_l4(d->vcpu[vcpuid]);
270 }
272 return 0;
273 }
275 int switch_compat(struct domain *d)
276 {
277 unsigned int vcpuid;
279 if ( d == NULL )
280 return -EINVAL;
281 if ( !may_switch_mode(d) )
282 return -EACCES;
283 if ( is_pv_32on64_domain(d) )
284 return 0;
286 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
288 for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
289 {
290 if ( (d->vcpu[vcpuid] != NULL) &&
291 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
292 goto undo_and_fail;
293 }
295 domain_set_alloc_bitsize(d);
297 return 0;
299 undo_and_fail:
300 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
301 while ( vcpuid-- != 0 )
302 {
303 if ( d->vcpu[vcpuid] != NULL )
304 release_compat_l4(d->vcpu[vcpuid]);
305 }
306 return -ENOMEM;
307 }
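/*
 * Annotation (illustrative, not part of the original source): switch_native()
 * and switch_compat() flip a PV domain between 64-bit and 32-on-64 mode,
 * releasing or allocating the per-VCPU compat L4 tables above. Note that
 * may_switch_mode() only allows this while d->tot_pages == 0, i.e. before the
 * domain builder has handed the guest any memory, presumably so that no
 * existing pagetables or mappings can depend on the old guest layout.
 */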
309 #else
310 #define setup_compat_l4(v) 0
311 #define release_compat_l4(v) ((void)0)
312 #endif
314 int vcpu_initialise(struct vcpu *v)
315 {
316 struct domain *d = v->domain;
317 int rc;
319 v->arch.vcpu_info_mfn = INVALID_MFN;
321 v->arch.flags = TF_kernel_mode;
323 #if defined(__i386__)
324 mapcache_vcpu_init(v);
325 #else
326 {
327 unsigned int idx = perdomain_pt_pgidx(v);
328 struct page_info *pg;
330 if ( !perdomain_pt_page(d, idx) )
331 {
332 pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
333 if ( !pg )
334 return -ENOMEM;
335 clear_page(page_to_virt(pg));
336 perdomain_pt_page(d, idx) = pg;
337 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+idx]
338 = l2e_from_page(pg, __PAGE_HYPERVISOR);
339 }
340 }
341 #endif
343 pae_l3_cache_init(&v->arch.pae_l3_cache);
345 paging_vcpu_init(v);
347 if ( is_hvm_domain(d) )
348 {
349 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
350 return rc;
351 }
352 else
353 {
354 /* PV guests by default have a 100Hz ticker. */
355 if ( !is_idle_domain(d) )
356 v->periodic_period = MILLISECS(10);
358 /* PV guests get an emulated PIT too for video BIOSes to use. */
359 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
360 pit_init(v, cpu_khz);
362 v->arch.schedule_tail = continue_nonidle_domain;
363 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
364 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
366 if ( is_idle_domain(d) )
367 {
368 v->arch.schedule_tail = continue_idle_domain;
369 v->arch.cr3 = __pa(idle_pg_table);
370 }
372 v->arch.guest_context.ctrlreg[4] =
373 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
374 }
376 v->arch.perdomain_ptes = perdomain_ptes(d, v);
378 spin_lock_init(&v->arch.shadow_ldt_lock);
380 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
381 }
383 void vcpu_destroy(struct vcpu *v)
384 {
385 if ( is_pv_32on64_vcpu(v) )
386 release_compat_l4(v);
388 if ( is_hvm_vcpu(v) )
389 hvm_vcpu_destroy(v);
390 }
392 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
393 {
394 #ifdef __x86_64__
395 struct page_info *pg;
396 #else
397 int pdpt_order;
398 #endif
399 int i, paging_initialised = 0;
400 int rc = -ENOMEM;
402 d->arch.hvm_domain.hap_enabled =
403 is_hvm_domain(d) &&
404 hvm_funcs.hap_supported &&
405 (domcr_flags & DOMCRF_hap);
407 d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
409 INIT_LIST_HEAD(&d->arch.pdev_list);
411 d->arch.relmem = RELMEM_not_started;
412 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
414 #if defined(__i386__)
416 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
417 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order, 0);
418 if ( d->arch.mm_perdomain_pt == NULL )
419 goto fail;
420 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
422 mapcache_domain_init(d);
424 #else /* __x86_64__ */
426 BUILD_BUG_ON(PDPT_L2_ENTRIES * sizeof(*d->arch.mm_perdomain_pt_pages)
427 != PAGE_SIZE);
428 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
429 if ( !pg )
430 goto fail;
431 d->arch.mm_perdomain_pt_pages = page_to_virt(pg);
432 clear_page(d->arch.mm_perdomain_pt_pages);
434 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
435 if ( pg == NULL )
436 goto fail;
437 d->arch.mm_perdomain_l2 = page_to_virt(pg);
438 clear_page(d->arch.mm_perdomain_l2);
440 pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
441 if ( pg == NULL )
442 goto fail;
443 d->arch.mm_perdomain_l3 = page_to_virt(pg);
444 clear_page(d->arch.mm_perdomain_l3);
445 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
446 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
447 __PAGE_HYPERVISOR);
449 #endif /* __x86_64__ */
451 #ifdef CONFIG_COMPAT
452 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
453 #endif
455 if ( (rc = paging_domain_init(d)) != 0 )
456 goto fail;
457 paging_initialised = 1;
459 if ( !is_idle_domain(d) )
460 {
461 d->arch.ioport_caps =
462 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
463 rc = -ENOMEM;
464 if ( d->arch.ioport_caps == NULL )
465 goto fail;
467 /*
468 * The shared_info machine address must fit in a 32-bit field within a
469 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
470 */
471 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
472 goto fail;
474 clear_page(d->shared_info);
475 share_xen_page_with_guest(
476 virt_to_page(d->shared_info), d, XENSHARE_writable);
478 d->arch.pirq_vector = xmalloc_array(s16, d->nr_pirqs);
479 if ( !d->arch.pirq_vector )
480 goto fail;
481 memset(d->arch.pirq_vector, 0,
482 d->nr_pirqs * sizeof(*d->arch.pirq_vector));
484 if ( (rc = iommu_domain_init(d)) != 0 )
485 goto fail;
487 /* For Guest vMCE MSRs virtualization */
488 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
489 intel_mce_init_msr(d);
490 }
492 if ( is_hvm_domain(d) )
493 {
494 if ( (rc = hvm_domain_initialise(d)) != 0 )
495 {
496 iommu_domain_destroy(d);
497 goto fail;
498 }
499 }
500 else
501 {
502 /* 32-bit PV guest by default only if Xen is not 64-bit. */
503 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
504 (CONFIG_PAGING_LEVELS != 4);
505 }
507 memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
508 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
509 {
510 d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
511 d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
512 }
514 return 0;
516 fail:
517 d->is_dying = DOMDYING_dead;
518 xfree(d->arch.pirq_vector);
519 free_xenheap_page(d->shared_info);
520 if ( paging_initialised )
521 paging_final_teardown(d);
522 #ifdef __x86_64__
523 if ( d->arch.mm_perdomain_l2 )
524 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
525 if ( d->arch.mm_perdomain_l3 )
526 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
527 if ( d->arch.mm_perdomain_pt_pages )
528 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
529 #else
530 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
531 #endif
532 return rc;
533 }
535 void arch_domain_destroy(struct domain *d)
536 {
537 #ifdef __x86_64__
538 unsigned int i;
539 #endif
541 if ( is_hvm_domain(d) )
542 hvm_domain_destroy(d);
544 pci_release_devices(d);
545 free_domain_pirqs(d);
546 if ( !is_idle_domain(d) )
547 iommu_domain_destroy(d);
549 paging_final_teardown(d);
551 #ifdef __i386__
552 free_xenheap_pages(
553 d->arch.mm_perdomain_pt,
554 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
555 #else
556 for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
557 {
558 if ( perdomain_pt_page(d, i) )
559 free_domheap_page(perdomain_pt_page(d, i));
560 }
561 free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
562 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
563 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
564 #endif
566 free_xenheap_page(d->shared_info);
567 xfree(d->arch.pirq_vector);
568 }
570 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
571 {
572 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
574 hv_cr4_mask = ~X86_CR4_TSD;
575 if ( cpu_has_de )
576 hv_cr4_mask &= ~X86_CR4_DE;
578 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
579 gdprintk(XENLOG_WARNING,
580 "Attempt to change CR4 flags %08lx -> %08lx\n",
581 hv_cr4, guest_cr4);
583 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
584 }
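/*
 * Annotation (illustrative, not part of the original source): worked example
 * of the fixup above. Only CR4.TSD (and CR4.DE when the CPU has it) are taken
 * from the guest; every other bit is forced to the hypervisor's value:
 *
 *   result = (hv_cr4 & hv_cr4_mask)        // Xen-controlled bits
 *          | (guest_cr4 & ~hv_cr4_mask);   // guest-controlled TSD/DE
 *
 * A PV guest that, say, tries to clear CR4.OSFXSR triggers the warning above
 * but keeps running with Xen's setting for that bit.
 */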
586 /* This is called by arch_final_setup_guest and do_boot_vcpu */
587 int arch_set_info_guest(
588 struct vcpu *v, vcpu_guest_context_u c)
589 {
590 struct domain *d = v->domain;
591 unsigned long cr3_pfn = INVALID_MFN;
592 unsigned long flags, cr4;
593 int i, rc = 0, compat;
595 /* The context is a compat-mode one if the target domain is compat-mode;
596 * we expect the tools to DTRT even in compat-mode callers. */
597 compat = is_pv_32on64_domain(d);
599 #ifdef CONFIG_COMPAT
600 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
601 #else
602 #define c(fld) (c.nat->fld)
603 #endif
604 flags = c(flags);
606 if ( !is_hvm_vcpu(v) )
607 {
608 if ( !compat )
609 {
610 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
611 fixup_guest_stack_selector(d, c.nat->kernel_ss);
612 fixup_guest_code_selector(d, c.nat->user_regs.cs);
613 #ifdef __i386__
614 fixup_guest_code_selector(d, c.nat->event_callback_cs);
615 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
616 #endif
618 for ( i = 0; i < 256; i++ )
619 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
621 /* LDT safety checks. */
622 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
623 (c.nat->ldt_ents > 8192) ||
624 !array_access_ok(c.nat->ldt_base,
625 c.nat->ldt_ents,
626 LDT_ENTRY_SIZE) )
627 return -EINVAL;
628 }
629 #ifdef CONFIG_COMPAT
630 else
631 {
632 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
633 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
634 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
635 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
636 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
638 for ( i = 0; i < 256; i++ )
639 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
641 /* LDT safety checks. */
642 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
643 (c.cmp->ldt_ents > 8192) ||
644 !compat_array_access_ok(c.cmp->ldt_base,
645 c.cmp->ldt_ents,
646 LDT_ENTRY_SIZE) )
647 return -EINVAL;
648 }
649 #endif
650 }
652 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
654 v->arch.flags &= ~TF_kernel_mode;
655 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
656 v->arch.flags |= TF_kernel_mode;
658 if ( !compat )
659 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
660 #ifdef CONFIG_COMPAT
661 else
662 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
663 #endif
665 v->arch.guest_context.user_regs.eflags |= 2;
667 if ( is_hvm_vcpu(v) )
668 {
669 hvm_set_info_guest(v);
670 goto out;
671 }
673 /* Only CR0.TS is modifiable by guest or admin. */
674 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
675 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
677 init_int80_direct_trap(v);
679 /* IOPL privileges are virtualised. */
680 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
681 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
683 /* Ensure real hardware interrupts are enabled. */
684 v->arch.guest_context.user_regs.eflags |= EF_IE;
686 cr4 = v->arch.guest_context.ctrlreg[4];
687 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
688 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
690 memset(v->arch.guest_context.debugreg, 0,
691 sizeof(v->arch.guest_context.debugreg));
692 for ( i = 0; i < 8; i++ )
693 (void)set_debugreg(v, i, c(debugreg[i]));
695 if ( v->is_initialised )
696 goto out;
698 if ( v->vcpu_id == 0 )
699 d->vm_assist = c(vm_assist);
701 if ( !compat )
702 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
703 #ifdef CONFIG_COMPAT
704 else
705 {
706 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
707 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
709 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
710 return -EINVAL;
711 for ( i = 0; i < n; ++i )
712 gdt_frames[i] = c.cmp->gdt_frames[i];
713 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
714 }
715 #endif
716 if ( rc != 0 )
717 return rc;
719 if ( !compat )
720 {
721 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
723 if ( !mfn_valid(cr3_pfn) ||
724 (paging_mode_refcounts(d)
725 ? !get_page(mfn_to_page(cr3_pfn), d)
726 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
727 PGT_base_page_table)) )
728 {
729 destroy_gdt(v);
730 return -EINVAL;
731 }
733 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
735 #ifdef __x86_64__
736 if ( c.nat->ctrlreg[1] )
737 {
738 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
740 if ( !mfn_valid(cr3_pfn) ||
741 (paging_mode_refcounts(d)
742 ? !get_page(mfn_to_page(cr3_pfn), d)
743 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
744 PGT_base_page_table)) )
745 {
746 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
747 v->arch.guest_table = pagetable_null();
748 if ( paging_mode_refcounts(d) )
749 put_page(mfn_to_page(cr3_pfn));
750 else
751 put_page_and_type(mfn_to_page(cr3_pfn));
752 destroy_gdt(v);
753 return -EINVAL;
754 }
756 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
757 }
758 #endif
759 }
760 #ifdef CONFIG_COMPAT
761 else
762 {
763 l4_pgentry_t *l4tab;
765 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
767 if ( !mfn_valid(cr3_pfn) ||
768 (paging_mode_refcounts(d)
769 ? !get_page(mfn_to_page(cr3_pfn), d)
770 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
771 PGT_l3_page_table)) )
772 {
773 destroy_gdt(v);
774 return -EINVAL;
775 }
777 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
778 *l4tab = l4e_from_pfn(
779 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
780 }
781 #endif
783 if ( v->vcpu_id == 0 )
784 update_domain_wallclock_time(d);
786 /* Don't redo final setup */
787 v->is_initialised = 1;
789 if ( paging_mode_enabled(d) )
790 paging_update_paging_modes(v);
792 update_cr3(v);
794 out:
795 if ( flags & VGCF_online )
796 clear_bit(_VPF_down, &v->pause_flags);
797 else
798 set_bit(_VPF_down, &v->pause_flags);
799 return 0;
800 #undef c
801 }
803 void arch_vcpu_reset(struct vcpu *v)
804 {
805 if ( !is_hvm_vcpu(v) )
806 {
807 destroy_gdt(v);
808 vcpu_destroy_pagetables(v);
809 }
810 else
811 {
812 vcpu_end_shutdown_deferral(v);
813 }
814 }
816 /*
817 * Unmap the vcpu info page if the guest decided to place it somewhere
818 * else. This is only used from arch_domain_destroy, so there's no
819 * need to do anything clever.
820 */
821 static void
822 unmap_vcpu_info(struct vcpu *v)
823 {
824 struct domain *d = v->domain;
825 unsigned long mfn;
827 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
828 return;
830 mfn = v->arch.vcpu_info_mfn;
831 unmap_domain_page_global(v->vcpu_info);
833 v->vcpu_info = (void *)&shared_info(d, vcpu_info[v->vcpu_id]);
834 v->arch.vcpu_info_mfn = INVALID_MFN;
836 put_page_and_type(mfn_to_page(mfn));
837 }
839 /*
840 * Map a guest page in and point the vcpu_info pointer at it. This
841 * makes sure that the vcpu_info is always pointing at a valid piece
842 * of memory, and it sets a pending event to make sure that a pending
843 * event doesn't get missed.
844 */
845 static int
846 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
847 {
848 struct domain *d = v->domain;
849 void *mapping;
850 vcpu_info_t *new_info;
851 int i;
853 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
854 return -EINVAL;
856 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
857 return -EINVAL;
859 /* Run this command on yourself or on other offline VCPUS. */
860 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
861 return -EINVAL;
863 mfn = gmfn_to_mfn(d, mfn);
864 if ( !mfn_valid(mfn) ||
865 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
866 return -EINVAL;
868 mapping = map_domain_page_global(mfn);
869 if ( mapping == NULL )
870 {
871 put_page_and_type(mfn_to_page(mfn));
872 return -ENOMEM;
873 }
875 new_info = (vcpu_info_t *)(mapping + offset);
877 if ( v->vcpu_info )
878 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
879 else
880 {
881 memset(new_info, 0, sizeof(*new_info));
882 __vcpu_info(v, new_info, evtchn_upcall_mask) = 1;
883 }
885 v->vcpu_info = new_info;
886 v->arch.vcpu_info_mfn = mfn;
888 /* Set new vcpu_info pointer /before/ setting pending flags. */
889 wmb();
891 /*
892 * Mark everything as being pending just to make sure nothing gets
893 * lost. The domain will get a spurious event, but it can cope.
894 */
895 vcpu_info(v, evtchn_upcall_pending) = 1;
896 for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
897 set_bit(i, &vcpu_info(v, evtchn_pending_sel));
899 return 0;
900 }
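/*
 * Annotation (illustrative, not part of the original source): a sketch of the
 * guest side of this interface. HYPERVISOR_vcpu_op(), virt_to_mfn() and
 * register_my_vcpu_info() are assumed guest-kernel helpers, not defined here;
 * the field names follow the public vcpu.h ABI.
 */
#if 0 /* illustrative guest code only */
static int register_my_vcpu_info(unsigned int cpu, void *page, unsigned int off)
{
    struct vcpu_register_vcpu_info info = {
        .mfn    = virt_to_mfn(page), /* guest page that will hold the area */
        .offset = off,               /* must leave room for a vcpu_info_t */
    };

    /* On success Xen redirects this VCPU's vcpu_info to the new location. */
    return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
}
#endif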
902 long
903 arch_do_vcpu_op(
904 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
905 {
906 long rc = 0;
908 switch ( cmd )
909 {
910 case VCPUOP_register_runstate_memory_area:
911 {
912 struct vcpu_register_runstate_memory_area area;
913 struct vcpu_runstate_info runstate;
915 rc = -EFAULT;
916 if ( copy_from_guest(&area, arg, 1) )
917 break;
919 if ( !guest_handle_okay(area.addr.h, 1) )
920 break;
922 rc = 0;
923 runstate_guest(v) = area.addr.h;
925 if ( v == current )
926 {
927 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
928 }
929 else
930 {
931 vcpu_runstate_get(v, &runstate);
932 __copy_to_guest(runstate_guest(v), &runstate, 1);
933 }
935 break;
936 }
938 case VCPUOP_register_vcpu_info:
939 {
940 struct domain *d = v->domain;
941 struct vcpu_register_vcpu_info info;
943 rc = -EFAULT;
944 if ( copy_from_guest(&info, arg, 1) )
945 break;
947 domain_lock(d);
948 rc = map_vcpu_info(v, info.mfn, info.offset);
949 domain_unlock(d);
951 break;
952 }
954 case VCPUOP_get_physid:
955 {
956 struct vcpu_get_physid cpu_id;
958 rc = -EINVAL;
959 if ( !v->domain->is_pinned )
960 break;
962 cpu_id.phys_id =
963 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
964 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
966 rc = -EFAULT;
967 if ( copy_to_guest(arg, &cpu_id, 1) )
968 break;
970 rc = 0;
971 break;
972 }
974 default:
975 rc = -ENOSYS;
976 break;
977 }
979 return rc;
980 }
982 #ifdef __x86_64__
984 #define loadsegment(seg,value) ({ \
985 int __r = 1; \
986 asm volatile ( \
987 "1: movl %k1,%%" #seg "\n2:\n" \
988 ".section .fixup,\"ax\"\n" \
989 "3: xorl %k0,%k0\n" \
990 " movl %k0,%%" #seg "\n" \
991 " jmp 2b\n" \
992 ".previous\n" \
993 ".section __ex_table,\"a\"\n" \
994 " .align 8\n" \
995 " .quad 1b,3b\n" \
996 ".previous" \
997 : "=r" (__r) : "r" (value), "0" (__r) );\
998 __r; })
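/*
 * Annotation (illustrative, not part of the original source): how the macro
 * above reports failure. The "1:" mov is covered by an __ex_table entry, so
 * if the guest supplied a bogus selector the resulting fault is fixed up at
 * "3:": the segment register is loaded with the NULL selector instead and
 * %0 (__r) is zeroed before resuming at "2:". load_segments() below therefore
 * sees 1 on success and 0 on failure, accumulated into all_segs_okay.
 */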
1000 /*
1001 * save_segments() writes a mask of segments which are dirty (non-zero),
1002 * allowing load_segments() to avoid some expensive segment loads and
1003 * MSR writes.
1004 */
1005 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1006 #define DIRTY_DS 0x01
1007 #define DIRTY_ES 0x02
1008 #define DIRTY_FS 0x04
1009 #define DIRTY_GS 0x08
1010 #define DIRTY_FS_BASE 0x10
1011 #define DIRTY_GS_BASE_USER 0x20
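/*
 * Annotation (illustrative, not part of the original source): the mask
 * records which pieces of the outgoing VCPU's segment state are live on the
 * CPU, so load_segments() knows what must be reloaded or cleared even when
 * the incoming VCPU's own values are zero. Example: an outgoing 64-bit PV
 * VCPU with NULL ds/es/fs/gs and only a non-zero user GS base yields just
 * DIRTY_GS_BASE_USER; if the incoming VCPU's selectors and user GS base are
 * zero as well, load_segments() skips the ds/es/fs work and only reloads %gs
 * to clear the stale base.
 */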
1013 static void load_segments(struct vcpu *n)
1015 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
1016 int all_segs_okay = 1;
1017 unsigned int dirty_segment_mask, cpu = smp_processor_id();
1019 /* Load and clear the dirty segment mask. */
1020 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1021 per_cpu(dirty_segment_mask, cpu) = 0;
1023 /* Either selector != 0 ==> reload. */
1024 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
1025 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
1027 /* Either selector != 0 ==> reload. */
1028 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
1029 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
1031 /*
1032 * Either selector != 0 ==> reload.
1033 * Also reload to reset FS_BASE if it was non-zero.
1034 */
1035 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
1036 nctxt->user_regs.fs) )
1037 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
1039 /*
1040 * Either selector != 0 ==> reload.
1041 * Also reload to reset GS_BASE if it was non-zero.
1042 */
1043 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
1044 nctxt->user_regs.gs) )
1046 /* Reset GS_BASE with user %gs? */
1047 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
1048 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
1051 if ( !is_pv_32on64_domain(n->domain) )
1053 /* This can only be non-zero if selector is NULL. */
1054 if ( nctxt->fs_base )
1055 wrmsr(MSR_FS_BASE,
1056 nctxt->fs_base,
1057 nctxt->fs_base>>32);
1059 /* Most kernels have non-zero GS base, so don't bother testing. */
1060 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1061 wrmsr(MSR_SHADOW_GS_BASE,
1062 nctxt->gs_base_kernel,
1063 nctxt->gs_base_kernel>>32);
1065 /* This can only be non-zero if selector is NULL. */
1066 if ( nctxt->gs_base_user )
1067 wrmsr(MSR_GS_BASE,
1068 nctxt->gs_base_user,
1069 nctxt->gs_base_user>>32);
1071 /* If in kernel mode then switch the GS bases around. */
1072 if ( (n->arch.flags & TF_kernel_mode) )
1073 asm volatile ( "swapgs" );
1076 if ( unlikely(!all_segs_okay) )
1078 struct cpu_user_regs *regs = guest_cpu_user_regs();
1079 unsigned long *rsp =
1080 (n->arch.flags & TF_kernel_mode) ?
1081 (unsigned long *)regs->rsp :
1082 (unsigned long *)nctxt->kernel_sp;
1083 unsigned long cs_and_mask, rflags;
1085 if ( is_pv_32on64_domain(n->domain) )
1087 unsigned int *esp = ring_1(regs) ?
1088 (unsigned int *)regs->rsp :
1089 (unsigned int *)nctxt->kernel_sp;
1090 unsigned int cs_and_mask, eflags;
1091 int ret = 0;
1093 /* CS longword also contains full evtchn_upcall_mask. */
1094 cs_and_mask = (unsigned short)regs->cs |
1095 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1096 /* Fold upcall mask into RFLAGS.IF. */
1097 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1098 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1100 if ( !ring_1(regs) )
1102 ret = put_user(regs->ss, esp-1);
1103 ret |= put_user(regs->_esp, esp-2);
1104 esp -= 2;
1107 if ( ret |
1108 put_user(eflags, esp-1) |
1109 put_user(cs_and_mask, esp-2) |
1110 put_user(regs->_eip, esp-3) |
1111 put_user(nctxt->user_regs.gs, esp-4) |
1112 put_user(nctxt->user_regs.fs, esp-5) |
1113 put_user(nctxt->user_regs.es, esp-6) |
1114 put_user(nctxt->user_regs.ds, esp-7) )
1116 gdprintk(XENLOG_ERR, "Error while creating compat "
1117 "failsafe callback frame.\n");
1118 domain_crash(n->domain);
1121 if ( test_bit(_VGCF_failsafe_disables_events,
1122 &n->arch.guest_context.flags) )
1123 vcpu_info(n, evtchn_upcall_mask) = 1;
1125 regs->entry_vector = TRAP_syscall;
1126 regs->_eflags &= 0xFFFCBEFFUL;
1127 regs->ss = FLAT_COMPAT_KERNEL_SS;
1128 regs->_esp = (unsigned long)(esp-7);
1129 regs->cs = FLAT_COMPAT_KERNEL_CS;
1130 regs->_eip = nctxt->failsafe_callback_eip;
1131 return;
1134 if ( !(n->arch.flags & TF_kernel_mode) )
1135 toggle_guest_mode(n);
1136 else
1137 regs->cs &= ~3;
1139 /* CS longword also contains full evtchn_upcall_mask. */
1140 cs_and_mask = (unsigned long)regs->cs |
1141 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1143 /* Fold upcall mask into RFLAGS.IF. */
1144 rflags = regs->rflags & ~X86_EFLAGS_IF;
1145 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1147 if ( put_user(regs->ss, rsp- 1) |
1148 put_user(regs->rsp, rsp- 2) |
1149 put_user(rflags, rsp- 3) |
1150 put_user(cs_and_mask, rsp- 4) |
1151 put_user(regs->rip, rsp- 5) |
1152 put_user(nctxt->user_regs.gs, rsp- 6) |
1153 put_user(nctxt->user_regs.fs, rsp- 7) |
1154 put_user(nctxt->user_regs.es, rsp- 8) |
1155 put_user(nctxt->user_regs.ds, rsp- 9) |
1156 put_user(regs->r11, rsp-10) |
1157 put_user(regs->rcx, rsp-11) )
1159 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1160 "callback frame.\n");
1161 domain_crash(n->domain);
1164 if ( test_bit(_VGCF_failsafe_disables_events,
1165 &n->arch.guest_context.flags) )
1166 vcpu_info(n, evtchn_upcall_mask) = 1;
1168 regs->entry_vector = TRAP_syscall;
1169 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1170 X86_EFLAGS_NT|X86_EFLAGS_TF);
1171 regs->ss = FLAT_KERNEL_SS;
1172 regs->rsp = (unsigned long)(rsp-11);
1173 regs->cs = FLAT_KERNEL_CS;
1174 regs->rip = nctxt->failsafe_callback_eip;
1178 static void save_segments(struct vcpu *v)
1179 {
1180 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1181 struct cpu_user_regs *regs = &ctxt->user_regs;
1182 unsigned int dirty_segment_mask = 0;
1184 regs->ds = read_segment_register(ds);
1185 regs->es = read_segment_register(es);
1186 regs->fs = read_segment_register(fs);
1187 regs->gs = read_segment_register(gs);
1189 if ( regs->ds )
1190 dirty_segment_mask |= DIRTY_DS;
1192 if ( regs->es )
1193 dirty_segment_mask |= DIRTY_ES;
1195 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1196 {
1197 dirty_segment_mask |= DIRTY_FS;
1198 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1199 }
1200 else if ( ctxt->fs_base )
1201 {
1202 dirty_segment_mask |= DIRTY_FS_BASE;
1203 }
1205 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1206 {
1207 dirty_segment_mask |= DIRTY_GS;
1208 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1209 }
1210 else if ( ctxt->gs_base_user )
1211 {
1212 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1213 }
1215 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1216 }
1218 #define switch_kernel_stack(v) ((void)0)
1220 #elif defined(__i386__)
1222 #define load_segments(n) ((void)0)
1223 #define save_segments(p) ((void)0)
1225 static inline void switch_kernel_stack(struct vcpu *v)
1226 {
1227 struct tss_struct *tss = &init_tss[smp_processor_id()];
1228 tss->esp1 = v->arch.guest_context.kernel_sp;
1229 tss->ss1 = v->arch.guest_context.kernel_ss;
1230 }
1232 #endif /* __i386__ */
1234 static void paravirt_ctxt_switch_from(struct vcpu *v)
1235 {
1236 save_segments(v);
1238 /*
1239 * Disable debug breakpoints. We do this aggressively because if we switch
1240 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1241 * inside Xen, before we get a chance to reload DR7, and this cannot always
1242 * safely be handled.
1243 */
1244 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1245 write_debugreg(7, 0);
1246 }
1248 static void paravirt_ctxt_switch_to(struct vcpu *v)
1249 {
1250 unsigned long cr4;
1252 set_int80_direct_trap(v);
1253 switch_kernel_stack(v);
1255 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1256 if ( unlikely(cr4 != read_cr4()) )
1257 write_cr4(cr4);
1259 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
1260 {
1261 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1262 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1263 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1264 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1265 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1266 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1267 }
1268 }
1270 static inline int need_full_gdt(struct vcpu *v)
1271 {
1272 return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
1273 }
1275 static void __context_switch(void)
1276 {
1277 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1278 unsigned int cpu = smp_processor_id();
1279 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1280 struct vcpu *n = current;
1281 struct desc_struct *gdt;
1282 struct desc_ptr gdt_desc;
1284 ASSERT(p != n);
1285 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1287 if ( !is_idle_vcpu(p) )
1288 {
1289 memcpy(&p->arch.guest_context.user_regs,
1290 stack_regs,
1291 CTXT_SWITCH_STACK_BYTES);
1292 unlazy_fpu(p);
1293 p->arch.ctxt_switch_from(p);
1294 }
1296 if ( !is_idle_vcpu(n) )
1297 {
1298 memcpy(stack_regs,
1299 &n->arch.guest_context.user_regs,
1300 CTXT_SWITCH_STACK_BYTES);
1301 n->arch.ctxt_switch_to(n);
1302 }
1304 if ( p->domain != n->domain )
1305 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1306 cpu_set(cpu, n->vcpu_dirty_cpumask);
1308 gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
1309 per_cpu(compat_gdt_table, cpu);
1310 if ( need_full_gdt(n) )
1311 {
1312 struct page_info *page = virt_to_page(gdt);
1313 unsigned int i;
1314 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1315 l1e_write(n->arch.perdomain_ptes +
1316 FIRST_RESERVED_GDT_PAGE + i,
1317 l1e_from_page(page + i, __PAGE_HYPERVISOR));
1318 }
1320 if ( need_full_gdt(p) &&
1321 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
1322 {
1323 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1324 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1325 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1326 }
1328 write_ptbase(n);
1330 if ( need_full_gdt(n) &&
1331 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
1332 {
1333 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1334 gdt_desc.base = GDT_VIRT_START(n);
1335 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1336 }
1338 if ( p->domain != n->domain )
1339 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1340 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1342 per_cpu(curr_vcpu, cpu) = n;
1343 }
1346 void context_switch(struct vcpu *prev, struct vcpu *next)
1347 {
1348 unsigned int cpu = smp_processor_id();
1349 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1351 ASSERT(local_irq_is_enabled());
1353 /* Allow at most one CPU at a time to be dirty. */
1354 ASSERT(cpus_weight(dirty_mask) <= 1);
1355 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1356 {
1357 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1358 flush_tlb_mask(&dirty_mask);
1359 }
1361 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1362 pt_save_timer(prev);
1364 local_irq_disable();
1366 set_current(next);
1368 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1369 {
1370 local_irq_enable();
1371 }
1372 else
1373 {
1374 __context_switch();
1376 #ifdef CONFIG_COMPAT
1377 if ( !is_hvm_vcpu(next) &&
1378 (is_idle_vcpu(prev) ||
1379 is_hvm_vcpu(prev) ||
1380 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1381 {
1382 uint64_t efer = read_efer();
1383 if ( !(efer & EFER_SCE) )
1384 write_efer(efer | EFER_SCE);
1385 }
1386 #endif
1388 /* Re-enable interrupts before restoring state which may fault. */
1389 local_irq_enable();
1391 if ( !is_hvm_vcpu(next) )
1392 {
1393 load_LDT(next);
1394 load_segments(next);
1395 }
1396 }
1398 context_saved(prev);
1400 /* Update per-VCPU guest runstate shared memory area (if registered). */
1401 if ( !guest_handle_is_null(runstate_guest(next)) )
1402 {
1403 if ( !is_pv_32on64_domain(next->domain) )
1404 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1405 #ifdef CONFIG_COMPAT
1406 else
1407 {
1408 struct compat_vcpu_runstate_info info;
1410 XLAT_vcpu_runstate_info(&info, &next->runstate);
1411 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1412 }
1413 #endif
1414 }
1416 schedule_tail(next);
1417 BUG();
1418 }
1420 void continue_running(struct vcpu *same)
1421 {
1422 schedule_tail(same);
1423 BUG();
1424 }
1426 int __sync_lazy_execstate(void)
1427 {
1428 unsigned long flags;
1429 int switch_required;
1431 local_irq_save(flags);
1433 switch_required = (this_cpu(curr_vcpu) != current);
1435 if ( switch_required )
1436 {
1437 ASSERT(current == idle_vcpu[smp_processor_id()]);
1438 __context_switch();
1439 }
1441 local_irq_restore(flags);
1443 return switch_required;
1444 }
1446 void sync_vcpu_execstate(struct vcpu *v)
1447 {
1448 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1449 (void)__sync_lazy_execstate();
1451 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1452 flush_tlb_mask(&v->vcpu_dirty_cpumask);
1453 }
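/*
 * Annotation (illustrative, not part of the original source): the "lazy"
 * state handling used here. context_switch() above skips __context_switch()
 * when switching to the idle VCPU, so per_cpu(curr_vcpu) can lag behind
 * current and the previous guest's register state simply stays on the CPU.
 * __sync_lazy_execstate() (called directly, or via the flush IPI mentioned in
 * the comments) performs the deferred __context_switch() when that state must
 * finally be saved away.
 */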
1455 struct migrate_info {
1456 long (*func)(void *data);
1457 void *data;
1458 void (*saved_schedule_tail)(struct vcpu *);
1459 cpumask_t saved_affinity;
1460 unsigned int nest;
1461 };
1463 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1464 {
1465 struct cpu_user_regs *regs = guest_cpu_user_regs();
1466 struct migrate_info *info = v->arch.continue_info;
1467 cpumask_t mask = info->saved_affinity;
1468 void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
1470 regs->eax = info->func(info->data);
1472 if ( info->nest-- == 0 )
1473 {
1474 xfree(info);
1475 v->arch.schedule_tail = saved_schedule_tail;
1476 v->arch.continue_info = NULL;
1477 vcpu_unlock_affinity(v, &mask);
1478 }
1480 (*saved_schedule_tail)(v);
1481 }
1483 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1484 {
1485 struct vcpu *v = current;
1486 struct migrate_info *info;
1487 cpumask_t mask = cpumask_of_cpu(cpu);
1488 int rc;
1490 if ( cpu == smp_processor_id() )
1491 return func(data);
1493 info = v->arch.continue_info;
1494 if ( info == NULL )
1495 {
1496 info = xmalloc(struct migrate_info);
1497 if ( info == NULL )
1498 return -ENOMEM;
1500 rc = vcpu_lock_affinity(v, &mask);
1501 if ( rc )
1502 {
1503 xfree(info);
1504 return rc;
1505 }
1507 info->saved_schedule_tail = v->arch.schedule_tail;
1508 info->saved_affinity = mask;
1509 info->nest = 0;
1511 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1512 v->arch.continue_info = info;
1513 }
1514 else
1515 {
1516 BUG_ON(info->nest != 0);
1517 rc = vcpu_locked_change_affinity(v, &mask);
1518 if ( rc )
1519 return rc;
1520 info->nest++;
1521 }
1523 info->func = func;
1524 info->data = data;
1526 /* Dummy return value will be overwritten by new schedule_tail. */
1527 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1528 return 0;
1529 }
1531 #define next_arg(fmt, args) ({ \
1532 unsigned long __arg; \
1533 switch ( *(fmt)++ ) \
1534 { \
1535 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1536 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1537 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1538 default: __arg = 0; BUG(); \
1539 } \
1540 __arg; \
1541 })
1543 DEFINE_PER_CPU(char, hc_preempted);
1545 unsigned long hypercall_create_continuation(
1546 unsigned int op, const char *format, ...)
1547 {
1548 struct mc_state *mcs = &this_cpu(mc_state);
1549 struct cpu_user_regs *regs;
1550 const char *p = format;
1551 unsigned long arg;
1552 unsigned int i;
1553 va_list args;
1555 va_start(args, format);
1557 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1558 {
1559 __set_bit(_MCSF_call_preempted, &mcs->flags);
1561 for ( i = 0; *p != '\0'; i++ )
1562 mcs->call.args[i] = next_arg(p, args);
1563 if ( is_pv_32on64_domain(current->domain) )
1564 {
1565 for ( ; i < 6; i++ )
1566 mcs->call.args[i] = 0;
1567 }
1568 }
1569 else
1570 {
1571 regs = guest_cpu_user_regs();
1572 regs->eax = op;
1573 /*
1574 * For PV guest, we update EIP to re-execute 'syscall' / 'int 0x82';
1575 * HVM does not need this since 'vmcall' / 'vmmcall' is fault-like.
1576 */
1577 if ( !is_hvm_vcpu(current) )
1578 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1580 #ifdef __x86_64__
1581 if ( !is_hvm_vcpu(current) ?
1582 !is_pv_32on64_vcpu(current) :
1583 (hvm_guest_x86_mode(current) == 8) )
1584 {
1585 for ( i = 0; *p != '\0'; i++ )
1586 {
1587 arg = next_arg(p, args);
1588 switch ( i )
1589 {
1590 case 0: regs->rdi = arg; break;
1591 case 1: regs->rsi = arg; break;
1592 case 2: regs->rdx = arg; break;
1593 case 3: regs->r10 = arg; break;
1594 case 4: regs->r8 = arg; break;
1595 case 5: regs->r9 = arg; break;
1596 }
1597 }
1598 }
1599 else
1600 #endif
1601 {
1602 if ( supervisor_mode_kernel )
1603 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1605 for ( i = 0; *p != '\0'; i++ )
1606 {
1607 arg = next_arg(p, args);
1608 switch ( i )
1609 {
1610 case 0: regs->ebx = arg; break;
1611 case 1: regs->ecx = arg; break;
1612 case 2: regs->edx = arg; break;
1613 case 3: regs->esi = arg; break;
1614 case 4: regs->edi = arg; break;
1615 case 5: regs->ebp = arg; break;
1616 }
1617 }
1618 }
1620 this_cpu(hc_preempted) = 1;
1621 }
1623 va_end(args);
1625 return op;
1626 }
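/*
 * Annotation (illustrative, not part of the original source): typical caller
 * pattern, shown with made-up names (__HYPERVISOR_example_op, progress,
 * guest_arg_handle). A long-running hypercall bails out when preemption is
 * pending and re-queues itself with updated arguments; the format string is
 * interpreted by next_arg() above ('i' = int, 'l' = long, 'h' = guest handle).
 */
#if 0 /* illustrative only */
    if ( hypercall_preempt_check() )
        /* Guest re-enters the hypercall with 'progress' as argument 0. */
        return hypercall_create_continuation(
            __HYPERVISOR_example_op, "lh", progress, guest_arg_handle);
#endif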
1628 #ifdef CONFIG_COMPAT
1629 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1631 int rc = 0;
1632 struct mc_state *mcs = &this_cpu(mc_state);
1633 struct cpu_user_regs *regs;
1634 unsigned int i, cval = 0;
1635 unsigned long nval = 0;
1636 va_list args;
1638 BUG_ON(*id > 5);
1639 BUG_ON(mask & (1U << *id));
1641 va_start(args, mask);
1643 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1645 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1646 return 0;
1647 for ( i = 0; i < 6; ++i, mask >>= 1 )
1649 if ( mask & 1 )
1651 nval = va_arg(args, unsigned long);
1652 cval = va_arg(args, unsigned int);
1653 if ( cval == nval )
1654 mask &= ~1U;
1655 else
1656 BUG_ON(nval == (unsigned int)nval);
1658 else if ( id && *id == i )
1660 *id = mcs->call.args[i];
1661 id = NULL;
1663 if ( (mask & 1) && mcs->call.args[i] == nval )
1665 mcs->call.args[i] = cval;
1666 ++rc;
1668 else
1669 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1672 else
1674 regs = guest_cpu_user_regs();
1675 for ( i = 0; i < 6; ++i, mask >>= 1 )
1677 unsigned long *reg;
1679 switch ( i )
1681 case 0: reg = &regs->ebx; break;
1682 case 1: reg = &regs->ecx; break;
1683 case 2: reg = &regs->edx; break;
1684 case 3: reg = &regs->esi; break;
1685 case 4: reg = &regs->edi; break;
1686 case 5: reg = &regs->ebp; break;
1687 default: BUG(); reg = NULL; break;
1689 if ( (mask & 1) )
1691 nval = va_arg(args, unsigned long);
1692 cval = va_arg(args, unsigned int);
1693 if ( cval == nval )
1694 mask &= ~1U;
1695 else
1696 BUG_ON(nval == (unsigned int)nval);
1698 else if ( id && *id == i )
1700 *id = *reg;
1701 id = NULL;
1703 if ( (mask & 1) && *reg == nval )
1705 *reg = cval;
1706 ++rc;
1708 else
1709 BUG_ON(*reg != (unsigned int)*reg);
1713 va_end(args);
1715 return rc;
1717 #endif
1719 static int relinquish_memory(
1720 struct domain *d, struct page_list_head *list, unsigned long type)
1722 struct page_info *page;
1723 unsigned long x, y;
1724 int ret = 0;
1726 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1727 spin_lock_recursive(&d->page_alloc_lock);
1729 while ( (page = page_list_remove_head(list)) )
1731 /* Grab a reference to the page so it won't disappear from under us. */
1732 if ( unlikely(!get_page(page, d)) )
1734 /* Couldn't get a reference -- someone is freeing this page. */
1735 page_list_add_tail(page, &d->arch.relmem_list);
1736 continue;
1739 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1740 ret = put_page_and_type_preemptible(page, 1);
1741 switch ( ret )
1743 case 0:
1744 break;
1745 case -EAGAIN:
1746 case -EINTR:
1747 page_list_add(page, list);
1748 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1749 put_page(page);
1750 goto out;
1751 default:
1752 BUG();
1755 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1756 put_page(page);
1758 /*
1759 * Forcibly invalidate top-most, still valid page tables at this point
1760 * to break circular 'linear page table' references as well as clean up
1761 * partially validated pages. This is okay because MMU structures are
1762 * not shared across domains and this domain is now dead. Thus top-most
1763 * valid tables are not in use so a non-zero count means circular
1764 * reference or partially validated.
1765 */
1766 y = page->u.inuse.type_info;
1767 for ( ; ; )
1769 x = y;
1770 if ( likely((x & PGT_type_mask) != type) ||
1771 likely(!(x & (PGT_validated|PGT_partial))) )
1772 break;
1774 y = cmpxchg(&page->u.inuse.type_info, x,
1775 x & ~(PGT_validated|PGT_partial));
1776 if ( likely(y == x) )
1778 /* No need for atomic update of type_info here: noone else updates it. */
1779 switch ( ret = free_page_type(page, x, 1) )
1781 case 0:
1782 break;
1783 case -EINTR:
1784 page_list_add(page, list);
1785 page->u.inuse.type_info |= PGT_validated;
1786 if ( x & PGT_partial )
1787 put_page(page);
1788 put_page(page);
1789 ret = -EAGAIN;
1790 goto out;
1791 case -EAGAIN:
1792 page_list_add(page, list);
1793 page->u.inuse.type_info |= PGT_partial;
1794 if ( x & PGT_partial )
1795 put_page(page);
1796 goto out;
1797 default:
1798 BUG();
1800 if ( x & PGT_partial )
1802 page->u.inuse.type_info--;
1803 put_page(page);
1805 break;
1809 /* Put the page on the list and /then/ potentially free it. */
1810 page_list_add_tail(page, &d->arch.relmem_list);
1811 put_page(page);
1813 if ( hypercall_preempt_check() )
1815 ret = -EAGAIN;
1816 goto out;
1820 /* list is empty at this point. */
1821 page_list_move(list, &d->arch.relmem_list);
1823 out:
1824 spin_unlock_recursive(&d->page_alloc_lock);
1825 return ret;
1828 static void vcpu_destroy_pagetables(struct vcpu *v)
1830 struct domain *d = v->domain;
1831 unsigned long pfn;
1833 #ifdef __x86_64__
1834 if ( is_pv_32on64_vcpu(v) )
1836 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1837 __va(pagetable_get_paddr(v->arch.guest_table)));
1839 if ( pfn != 0 )
1841 if ( paging_mode_refcounts(d) )
1842 put_page(mfn_to_page(pfn));
1843 else
1844 put_page_and_type(mfn_to_page(pfn));
1847 l4e_write(
1848 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1849 l4e_empty());
1851 v->arch.cr3 = 0;
1852 return;
1854 #endif
1856 pfn = pagetable_get_pfn(v->arch.guest_table);
1857 if ( pfn != 0 )
1859 if ( paging_mode_refcounts(d) )
1860 put_page(mfn_to_page(pfn));
1861 else
1862 put_page_and_type(mfn_to_page(pfn));
1863 v->arch.guest_table = pagetable_null();
1866 #ifdef __x86_64__
1867 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1868 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1869 if ( pfn != 0 )
1871 if ( !is_pv_32bit_vcpu(v) )
1873 if ( paging_mode_refcounts(d) )
1874 put_page(mfn_to_page(pfn));
1875 else
1876 put_page_and_type(mfn_to_page(pfn));
1878 v->arch.guest_table_user = pagetable_null();
1880 #endif
1882 v->arch.cr3 = 0;
1885 int domain_relinquish_resources(struct domain *d)
1887 int ret;
1888 struct vcpu *v;
1890 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1892 switch ( d->arch.relmem )
1894 case RELMEM_not_started:
1895 /* Tear down paging-assistance stuff. */
1896 paging_teardown(d);
1898 for_each_vcpu ( d, v )
1900 /* Drop the in-use references to page-table bases. */
1901 vcpu_destroy_pagetables(v);
1903 /*
1904 * Relinquish GDT mappings. No need for explicit unmapping of the
1905 * LDT as it automatically gets squashed with the guest mappings.
1906 */
1907 destroy_gdt(v);
1909 unmap_vcpu_info(v);
1912 if ( d->arch.pirq_eoi_map != NULL )
1914 unmap_domain_page_global(d->arch.pirq_eoi_map);
1915 put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1916 d->arch.pirq_eoi_map = NULL;
1919 d->arch.relmem = RELMEM_xen;
1920 /* fallthrough */
1922 /* Relinquish every page of memory. */
1923 case RELMEM_xen:
1924 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1925 if ( ret )
1926 return ret;
1927 #if CONFIG_PAGING_LEVELS >= 4
1928 d->arch.relmem = RELMEM_l4;
1929 /* fallthrough */
1931 case RELMEM_l4:
1932 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1933 if ( ret )
1934 return ret;
1935 #endif
1936 #if CONFIG_PAGING_LEVELS >= 3
1937 d->arch.relmem = RELMEM_l3;
1938 /* fallthrough */
1940 case RELMEM_l3:
1941 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1942 if ( ret )
1943 return ret;
1944 #endif
1945 d->arch.relmem = RELMEM_l2;
1946 /* fallthrough */
1948 case RELMEM_l2:
1949 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1950 if ( ret )
1951 return ret;
1952 d->arch.relmem = RELMEM_done;
1953 /* fallthrough */
1955 case RELMEM_done:
1956 break;
1958 default:
1959 BUG();
1962 if ( is_hvm_domain(d) )
1963 hvm_domain_relinquish_resources(d);
1965 return 0;
1968 void arch_dump_domain_info(struct domain *d)
1969 {
1970 paging_dump_domain_info(d);
1971 }
1973 void arch_dump_vcpu_info(struct vcpu *v)
1974 {
1975 paging_dump_vcpu_info(v);
1976 }
1978 void domain_cpuid(
1979 struct domain *d,
1980 unsigned int input,
1981 unsigned int sub_input,
1982 unsigned int *eax,
1983 unsigned int *ebx,
1984 unsigned int *ecx,
1985 unsigned int *edx)
1986 {
1987 cpuid_input_t *cpuid;
1988 int i;
1990 for ( i = 0; i < MAX_CPUID_INPUT; i++ )
1991 {
1992 cpuid = &d->arch.cpuids[i];
1994 if ( (cpuid->input[0] == input) &&
1995 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
1996 (cpuid->input[1] == sub_input)) )
1997 {
1998 *eax = cpuid->eax;
1999 *ebx = cpuid->ebx;
2000 *ecx = cpuid->ecx;
2001 *edx = cpuid->edx;
2002 return;
2003 }
2004 }
2006 *eax = *ebx = *ecx = *edx = 0;
2007 }
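/*
 * Annotation (illustrative, not part of the original source): matching rule
 * used above. A policy entry applies when its leaf (input[0]) matches and its
 * sub-leaf (input[1]) either matches or is XEN_CPUID_INPUT_UNUSED, which acts
 * as a wildcard. Leaves with no policy entry read as all zeroes for the guest.
 */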
2009 void vcpu_kick(struct vcpu *v)
2010 {
2011 /*
2012 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2013 * pending flag. These values may fluctuate (after all, we hold no
2014 * locks) but the key insight is that each change will cause
2015 * evtchn_upcall_pending to be polled.
2016 *
2017 * NB2. We save the running flag across the unblock to avoid a needless
2018 * IPI for domains that we IPI'd to unblock.
2019 */
2020 bool_t running = v->is_running;
2021 vcpu_unblock(v);
2022 if ( running && (in_irq() || (v != current)) )
2023 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2024 }
2026 void vcpu_mark_events_pending(struct vcpu *v)
2027 {
2028 int already_pending = test_and_set_bit(
2029 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2031 if ( already_pending )
2032 return;
2034 if ( is_hvm_vcpu(v) )
2035 hvm_assert_evtchn_irq(v);
2036 else
2037 vcpu_kick(v);
2038 }
2040 static void vcpu_kick_softirq(void)
2041 {
2042 /*
2043 * Nothing to do here: we merely prevent notifiers from racing with checks
2044 * executed on return to guest context with interrupts enabled. See, for
2045 * example, xxx_intr_assist() executed on return to HVM guest context.
2046 */
2047 }
2049 static int __init init_vcpu_kick_softirq(void)
2050 {
2051 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2052 return 0;
2053 }
2054 __initcall(init_vcpu_kick_softirq);
2057 /*
2058 * Local variables:
2059 * mode: C
2060 * c-set-style: "BSD"
2061 * c-basic-offset: 4
2062 * tab-width: 4
2063 * indent-tabs-mode: nil
2064 * End:
2065 */