ia64/xen-unstable

view xen/arch/x86/domain.c @ 16406:f62e6c697eeb

x86, 32-on-64: Improve checking in vcpu_destroy_pagetables(). It *is*
possible for a 64-bit guest to have matching guest_table and
guest_table_user.
Original patch by John Levon <levon@movementarian.org>
Signed-off-by: Keir Fraser <keir.fraser@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Nov 20 15:34:25 2007 +0000 (2007-11-20)
parents 00db9ec39831
children 69b56d3289f5
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <asm/regs.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/system.h>
35 #include <asm/io.h>
36 #include <asm/processor.h>
37 #include <asm/desc.h>
38 #include <asm/i387.h>
39 #include <asm/mpspec.h>
40 #include <asm/ldt.h>
41 #include <asm/paging.h>
42 #include <asm/hypercall.h>
43 #include <asm/hvm/hvm.h>
44 #include <asm/hvm/support.h>
45 #include <asm/msr.h>
46 #include <asm/nmi.h>
47 #include <asm/iommu.h>
48 #ifdef CONFIG_COMPAT
49 #include <compat/vcpu.h>
50 #endif
52 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
53 DEFINE_PER_CPU(u64, efer);
54 DEFINE_PER_CPU(unsigned long, cr4);
56 static void unmap_vcpu_info(struct vcpu *v);
58 static void paravirt_ctxt_switch_from(struct vcpu *v);
59 static void paravirt_ctxt_switch_to(struct vcpu *v);
61 static void vcpu_destroy_pagetables(struct vcpu *v);
63 static void continue_idle_domain(struct vcpu *v)
64 {
65 reset_stack_and_jump(idle_loop);
66 }
68 static void continue_nonidle_domain(struct vcpu *v)
69 {
70 reset_stack_and_jump(ret_from_intr);
71 }
73 static void default_idle(void)
74 {
75 local_irq_disable();
76 if ( !softirq_pending(smp_processor_id()) )
77 safe_halt();
78 else
79 local_irq_enable();
80 }
82 static void play_dead(void)
83 {
84 __cpu_disable();
85 /* This must be done before dead CPU ack */
86 cpu_exit_clear();
87 hvm_cpu_down();
88 wbinvd();
89 mb();
90 /* Ack it */
91 __get_cpu_var(cpu_state) = CPU_DEAD;
93 /* With physical CPU hotplug, we should halt the cpu. */
94 local_irq_disable();
95 for ( ; ; )
96 halt();
97 }
99 void idle_loop(void)
100 {
101 for ( ; ; )
102 {
103 if (cpu_is_offline(smp_processor_id()))
104 play_dead();
105 page_scrub_schedule_work();
106 default_idle();
107 do_softirq();
108 }
109 }
111 void startup_cpu_idle_loop(void)
112 {
113 struct vcpu *v = current;
115 ASSERT(is_idle_vcpu(v));
116 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
117 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
119 reset_stack_and_jump(idle_loop);
120 }
122 void dump_pageframe_info(struct domain *d)
123 {
124 struct page_info *page;
126 printk("Memory pages belonging to domain %u:\n", d->domain_id);
128 if ( d->tot_pages >= 10 )
129 {
130 printk(" DomPage list too long to display\n");
131 }
132 else
133 {
134 list_for_each_entry ( page, &d->page_list, list )
135 {
136 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
137 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
138 page->count_info, page->u.inuse.type_info);
139 }
140 }
142 list_for_each_entry ( page, &d->xenpage_list, list )
143 {
144 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
145 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
146 page->count_info, page->u.inuse.type_info);
147 }
148 }
150 struct vcpu *alloc_vcpu_struct(void)
151 {
152 struct vcpu *v;
153 if ( (v = xmalloc(struct vcpu)) != NULL )
154 memset(v, 0, sizeof(*v));
155 return v;
156 }
158 void free_vcpu_struct(struct vcpu *v)
159 {
160 xfree(v);
161 }
163 #ifdef CONFIG_COMPAT
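/*
 * Note: setup_arg_xlat_area() populates, on demand, the page tables backing
 * the per-vcpu COMPAT_ARG_XLAT area, which Xen uses to translate hypercall
 * arguments of 32-on-64 ("compat") guests. One L3 page is shared by the
 * whole domain; lower levels and the data pages are allocated lazily below.
 */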
165 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
166 {
167 struct domain *d = v->domain;
168 unsigned i;
169 struct page_info *pg;
171 if ( !d->arch.mm_arg_xlat_l3 )
172 {
173 pg = alloc_domheap_page(NULL);
174 if ( !pg )
175 return -ENOMEM;
176 d->arch.mm_arg_xlat_l3 = page_to_virt(pg);
177 clear_page(d->arch.mm_arg_xlat_l3);
178 }
180 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
181 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
183 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
184 {
185 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
186 l2_pgentry_t *l2tab;
187 l1_pgentry_t *l1tab;
189 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
190 {
191 pg = alloc_domheap_page(NULL);
192 if ( !pg )
193 return -ENOMEM;
194 clear_page(page_to_virt(pg));
195 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
196 }
197 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
198 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
199 {
200 pg = alloc_domheap_page(NULL);
201 if ( !pg )
202 return -ENOMEM;
203 clear_page(page_to_virt(pg));
204 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
205 }
206 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
207 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
208 pg = alloc_domheap_page(NULL);
209 if ( !pg )
210 return -ENOMEM;
211 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
212 }
214 return 0;
215 }
217 static void release_arg_xlat_area(struct domain *d)
218 {
219 if ( d->arch.mm_arg_xlat_l3 )
220 {
221 unsigned l3;
223 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
224 {
225 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
226 {
227 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
228 unsigned l2;
230 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
231 {
232 if ( l2e_get_intpte(l2tab[l2]) )
233 {
234 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
235 unsigned l1;
237 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
238 {
239 if ( l1e_get_intpte(l1tab[l1]) )
240 free_domheap_page(l1e_get_page(l1tab[l1]));
241 }
242 free_domheap_page(l2e_get_page(l2tab[l2]));
243 }
244 }
245 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
246 }
247 }
248 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
249 }
250 }
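/*
 * Note: a 32-on-64 guest does not supply its own L4. setup_compat_l4() gives
 * each vcpu a private, Xen-owned L4 cloned from idle_pg_table; slot 0 is
 * later pointed at the guest's L3 (see the compat branch of
 * arch_set_info_guest()), and guest_table and guest_table_user both refer
 * to this same page.
 */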
252 static int setup_compat_l4(struct vcpu *v)
253 {
254 struct page_info *pg = alloc_domheap_page(NULL);
255 l4_pgentry_t *l4tab;
256 int rc;
258 if ( pg == NULL )
259 return -ENOMEM;
261 /* This page needs to look like a pagetable so that it can be shadowed */
262 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
264 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
265 l4tab[0] = l4e_empty();
266 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
267 l4e_from_page(pg, __PAGE_HYPERVISOR);
268 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
269 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
270 __PAGE_HYPERVISOR);
272 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
273 {
274 free_domheap_page(pg);
275 return rc;
276 }
278 v->arch.guest_table = pagetable_from_page(pg);
279 v->arch.guest_table_user = v->arch.guest_table;
281 return 0;
282 }
284 static void release_compat_l4(struct vcpu *v)
285 {
286 free_domheap_page(pagetable_get_page(v->arch.guest_table));
287 v->arch.guest_table = pagetable_null();
288 v->arch.guest_table_user = pagetable_null();
289 }
291 static inline int may_switch_mode(struct domain *d)
292 {
293 return (!is_hvm_domain(d) && (d->tot_pages == 0));
294 }
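/*
 * Note: switch_native()/switch_compat() flip a PV domain between 64-bit and
 * 32-on-64 mode. Per may_switch_mode() above, this is only permitted while
 * the domain owns no pages at all, i.e. typically during domain build.
 */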
296 int switch_native(struct domain *d)
297 {
298 l1_pgentry_t gdt_l1e;
299 unsigned int vcpuid;
301 if ( d == NULL )
302 return -EINVAL;
303 if ( !may_switch_mode(d) )
304 return -EACCES;
305 if ( !is_pv_32on64_domain(d) )
306 return 0;
308 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
309 release_arg_xlat_area(d);
311 /* switch gdt */
312 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
313 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
314 {
315 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
316 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
317 if (d->vcpu[vcpuid])
318 release_compat_l4(d->vcpu[vcpuid]);
319 }
321 d->arch.physaddr_bitsize = 64;
323 return 0;
324 }
326 int switch_compat(struct domain *d)
327 {
328 l1_pgentry_t gdt_l1e;
329 unsigned int vcpuid;
331 if ( d == NULL )
332 return -EINVAL;
333 if ( !may_switch_mode(d) )
334 return -EACCES;
335 if ( is_pv_32on64_domain(d) )
336 return 0;
338 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
340 /* switch gdt */
341 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
342 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
343 {
344 if ( (d->vcpu[vcpuid] != NULL) &&
345 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
346 goto undo_and_fail;
347 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
348 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
349 }
351 d->arch.physaddr_bitsize =
352 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
353 + (PAGE_SIZE - 2);
355 return 0;
357 undo_and_fail:
358 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
359 release_arg_xlat_area(d);
360 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
361 while ( vcpuid-- != 0 )
362 {
363 if ( d->vcpu[vcpuid] != NULL )
364 release_compat_l4(d->vcpu[vcpuid]);
365 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
366 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
367 }
368 return -ENOMEM;
369 }
371 #else
372 #define release_arg_xlat_area(d) ((void)0)
373 #define setup_compat_l4(v) 0
374 #define release_compat_l4(v) ((void)0)
375 #endif
377 int vcpu_initialise(struct vcpu *v)
378 {
379 struct domain *d = v->domain;
380 int rc;
382 v->arch.vcpu_info_mfn = INVALID_MFN;
384 v->arch.flags = TF_kernel_mode;
386 #if defined(__i386__)
387 mapcache_vcpu_init(v);
388 #endif
390 pae_l3_cache_init(&v->arch.pae_l3_cache);
392 paging_vcpu_init(v);
394 if ( is_hvm_domain(d) )
395 {
396 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
397 return rc;
398 }
399 else
400 {
401 /* PV guests by default have a 100Hz ticker. */
402 v->periodic_period = MILLISECS(10);
404 /* PV guests get an emulated PIT too for video BIOSes to use. */
405 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
406 pit_init(v, cpu_khz);
408 v->arch.schedule_tail = continue_nonidle_domain;
409 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
410 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
412 if ( is_idle_domain(d) )
413 {
414 v->arch.schedule_tail = continue_idle_domain;
415 v->arch.cr3 = __pa(idle_pg_table);
416 }
418 v->arch.guest_context.ctrlreg[4] =
419 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
420 }
422 v->arch.perdomain_ptes =
423 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
425 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
426 }
428 void vcpu_destroy(struct vcpu *v)
429 {
430 if ( is_pv_32on64_vcpu(v) )
431 release_compat_l4(v);
433 unmap_vcpu_info(v);
435 if ( is_hvm_vcpu(v) )
436 hvm_vcpu_destroy(v);
437 }
439 int arch_domain_create(struct domain *d)
440 {
441 #ifdef __x86_64__
442 struct page_info *pg;
443 int i;
444 #endif
445 l1_pgentry_t gdt_l1e;
446 int vcpuid, pdpt_order, paging_initialised = 0;
447 int rc = -ENOMEM;
449 d->arch.relmem = RELMEM_not_started;
450 INIT_LIST_HEAD(&d->arch.relmem_list);
452 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
453 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
454 if ( d->arch.mm_perdomain_pt == NULL )
455 goto fail;
456 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
458 /*
459 * Map Xen segments into every VCPU's GDT, irrespective of whether every
460 * VCPU will actually be used. This avoids an NMI race during context
461 * switch: if we take an interrupt after switching CR3 but before switching
462 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
463 * try to load CS from an invalid table.
464 */
465 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
466 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
467 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
468 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
470 #if defined(__i386__)
472 mapcache_domain_init(d);
474 #else /* __x86_64__ */
476 if ( (pg = alloc_domheap_page(NULL)) == NULL )
477 goto fail;
478 d->arch.mm_perdomain_l2 = page_to_virt(pg);
479 clear_page(d->arch.mm_perdomain_l2);
480 for ( i = 0; i < (1 << pdpt_order); i++ )
481 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
482 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
483 __PAGE_HYPERVISOR);
485 if ( (pg = alloc_domheap_page(NULL)) == NULL )
486 goto fail;
487 d->arch.mm_perdomain_l3 = page_to_virt(pg);
488 clear_page(d->arch.mm_perdomain_l3);
489 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
490 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
491 __PAGE_HYPERVISOR);
493 #endif /* __x86_64__ */
495 #ifdef CONFIG_COMPAT
496 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
497 #endif
499 paging_domain_init(d);
500 paging_initialised = 1;
502 if ( !is_idle_domain(d) )
503 {
504 d->arch.ioport_caps =
505 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
506 if ( d->arch.ioport_caps == NULL )
507 goto fail;
509 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
510 goto fail;
512 clear_page(d->shared_info);
513 share_xen_page_with_guest(
514 virt_to_page(d->shared_info), d, XENSHARE_writable);
515 }
517 if ( (rc = iommu_domain_init(d)) != 0 )
518 goto fail;
520 if ( is_hvm_domain(d) )
521 {
522 if ( (rc = hvm_domain_initialise(d)) != 0 )
523 {
524 iommu_domain_destroy(d);
525 goto fail;
526 }
527 }
528 else
529 {
530 /* 32-bit PV guest by default only if Xen is not 64-bit. */
531 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
532 (CONFIG_PAGING_LEVELS != 4);
533 }
535 return 0;
537 fail:
538 free_xenheap_page(d->shared_info);
539 if ( paging_initialised )
540 paging_final_teardown(d);
541 #ifdef __x86_64__
542 if ( d->arch.mm_perdomain_l2 )
543 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
544 if ( d->arch.mm_perdomain_l3 )
545 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
546 #endif
547 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
548 return rc;
549 }
551 void arch_domain_destroy(struct domain *d)
552 {
553 if ( is_hvm_domain(d) )
554 hvm_domain_destroy(d);
556 iommu_domain_destroy(d);
558 paging_final_teardown(d);
560 free_xenheap_pages(
561 d->arch.mm_perdomain_pt,
562 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
564 #ifdef __x86_64__
565 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
566 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
567 #endif
569 if ( is_pv_32on64_domain(d) )
570 release_arg_xlat_area(d);
572 free_xenheap_page(d->shared_info);
573 }
575 unsigned long pv_guest_cr4_fixup(unsigned long guest_cr4)
576 {
577 unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
579 hv_cr4_mask = ~X86_CR4_TSD;
580 if ( cpu_has_de )
581 hv_cr4_mask &= ~X86_CR4_DE;
583 if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
584 gdprintk(XENLOG_WARNING,
585 "Attempt to change CR4 flags %08lx -> %08lx\n",
586 hv_cr4 & ~(X86_CR4_PGE|X86_CR4_PSE), guest_cr4);
588 return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
589 }
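/*
 * Note: a PV guest may only control CR4.TSD (and CR4.DE when the CPU
 * supports it); all other CR4 bits are forced to Xen's own values, and a
 * mismatch in the requested flags is merely logged above.
 */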
591 /* This is called by arch_final_setup_guest and do_boot_vcpu */
592 int arch_set_info_guest(
593 struct vcpu *v, vcpu_guest_context_u c)
594 {
595 struct domain *d = v->domain;
596 unsigned long cr3_pfn = INVALID_MFN;
597 unsigned long flags, cr4;
598 int i, rc = 0, compat;
600 /* The context is a compat-mode one if the target domain is compat-mode;
601 * we expect the tools to DTRT even in compat-mode callers. */
602 compat = is_pv_32on64_domain(d);
604 #ifdef CONFIG_COMPAT
605 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
606 #else
607 #define c(fld) (c.nat->fld)
608 #endif
609 flags = c(flags);
611 if ( !is_hvm_vcpu(v) )
612 {
613 if ( !compat )
614 {
615 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
616 fixup_guest_stack_selector(d, c.nat->kernel_ss);
617 fixup_guest_code_selector(d, c.nat->user_regs.cs);
618 #ifdef __i386__
619 fixup_guest_code_selector(d, c.nat->event_callback_cs);
620 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
621 #endif
623 for ( i = 0; i < 256; i++ )
624 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
626 /* LDT safety checks. */
627 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
628 (c.nat->ldt_ents > 8192) ||
629 !array_access_ok(c.nat->ldt_base,
630 c.nat->ldt_ents,
631 LDT_ENTRY_SIZE) )
632 return -EINVAL;
633 }
634 #ifdef CONFIG_COMPAT
635 else
636 {
637 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
638 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
639 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
640 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
641 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
643 for ( i = 0; i < 256; i++ )
644 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
646 /* LDT safety checks. */
647 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
648 (c.cmp->ldt_ents > 8192) ||
649 !compat_array_access_ok(c.cmp->ldt_base,
650 c.cmp->ldt_ents,
651 LDT_ENTRY_SIZE) )
652 return -EINVAL;
653 }
654 #endif
655 }
657 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
659 v->arch.flags &= ~TF_kernel_mode;
660 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
661 v->arch.flags |= TF_kernel_mode;
663 if ( !compat )
664 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
665 #ifdef CONFIG_COMPAT
666 else
667 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
668 #endif
670 v->arch.guest_context.user_regs.eflags |= 2;
672 if ( is_hvm_vcpu(v) )
673 goto out;
675 /* Only CR0.TS is modifiable by guest or admin. */
676 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
677 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
679 init_int80_direct_trap(v);
681 /* IOPL privileges are virtualised. */
682 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
683 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
685 /* Ensure real hardware interrupts are enabled. */
686 v->arch.guest_context.user_regs.eflags |= EF_IE;
688 cr4 = v->arch.guest_context.ctrlreg[4];
689 v->arch.guest_context.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(cr4) :
690 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
692 memset(v->arch.guest_context.debugreg, 0,
693 sizeof(v->arch.guest_context.debugreg));
694 for ( i = 0; i < 8; i++ )
695 (void)set_debugreg(v, i, c(debugreg[i]));
697 if ( v->is_initialised )
698 goto out;
700 if ( v->vcpu_id == 0 )
701 d->vm_assist = c(vm_assist);
703 if ( !compat )
704 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
705 #ifdef CONFIG_COMPAT
706 else
707 {
708 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
709 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
711 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
712 return -EINVAL;
713 for ( i = 0; i < n; ++i )
714 gdt_frames[i] = c.cmp->gdt_frames[i];
715 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
716 }
717 #endif
718 if ( rc != 0 )
719 return rc;
721 if ( !compat )
722 {
723 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
725 if ( !mfn_valid(cr3_pfn) ||
726 (paging_mode_refcounts(d)
727 ? !get_page(mfn_to_page(cr3_pfn), d)
728 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
729 PGT_base_page_table)) )
730 {
731 destroy_gdt(v);
732 return -EINVAL;
733 }
735 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
737 #ifdef __x86_64__
738 if ( c.nat->ctrlreg[1] )
739 {
740 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
742 if ( !mfn_valid(cr3_pfn) ||
743 (paging_mode_refcounts(d)
744 ? !get_page(mfn_to_page(cr3_pfn), d)
745 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
746 PGT_base_page_table)) )
747 {
748 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
749 v->arch.guest_table = pagetable_null();
750 if ( paging_mode_refcounts(d) )
751 put_page(mfn_to_page(cr3_pfn));
752 else
753 put_page_and_type(mfn_to_page(cr3_pfn));
754 destroy_gdt(v);
755 return -EINVAL;
756 }
758 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
759 }
760 #endif
761 }
762 #ifdef CONFIG_COMPAT
763 else
764 {
765 l4_pgentry_t *l4tab;
767 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
769 if ( !mfn_valid(cr3_pfn) ||
770 (paging_mode_refcounts(d)
771 ? !get_page(mfn_to_page(cr3_pfn), d)
772 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
773 PGT_l3_page_table)) )
774 {
775 destroy_gdt(v);
776 return -EINVAL;
777 }
779 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
780 *l4tab = l4e_from_pfn(
781 cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
782 }
783 #endif
785 if ( v->vcpu_id == 0 )
786 update_domain_wallclock_time(d);
788 /* Don't redo final setup */
789 v->is_initialised = 1;
791 if ( paging_mode_enabled(d) )
792 paging_update_paging_modes(v);
794 update_cr3(v);
796 out:
797 if ( flags & VGCF_online )
798 clear_bit(_VPF_down, &v->pause_flags);
799 else
800 set_bit(_VPF_down, &v->pause_flags);
801 return 0;
802 #undef c
803 }
805 int arch_vcpu_reset(struct vcpu *v)
806 {
807 destroy_gdt(v);
808 vcpu_destroy_pagetables(v);
809 return 0;
810 }
812 /*
813 * Unmap the vcpu info page if the guest decided to place it somewhere
814 * else. This is only used from arch_domain_destroy, so there's no
815 * need to do anything clever.
816 */
817 static void
818 unmap_vcpu_info(struct vcpu *v)
819 {
820 struct domain *d = v->domain;
821 unsigned long mfn;
823 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
824 return;
826 mfn = v->arch.vcpu_info_mfn;
827 unmap_domain_page_global(v->vcpu_info);
829 v->vcpu_info = shared_info_addr(d, vcpu_info[v->vcpu_id]);
830 v->arch.vcpu_info_mfn = INVALID_MFN;
832 put_page_and_type(mfn_to_page(mfn));
833 }
835 /*
836 * Map a guest page in and point the vcpu_info pointer at it. This
837 * makes sure that the vcpu_info is always pointing at a valid piece
838 * of memory, and it sets a pending event to make sure that a pending
839 * event doesn't get missed.
840 */
841 static int
842 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
843 {
844 struct domain *d = v->domain;
845 void *mapping;
846 vcpu_info_t *new_info;
847 int i;
849 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
850 return -EINVAL;
852 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
853 return -EINVAL;
855 /* Run this command on yourself or on other offline VCPUS. */
856 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
857 return -EINVAL;
859 mfn = gmfn_to_mfn(d, mfn);
860 if ( !mfn_valid(mfn) ||
861 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
862 return -EINVAL;
864 mapping = map_domain_page_global(mfn);
865 if ( mapping == NULL )
866 {
867 put_page_and_type(mfn_to_page(mfn));
868 return -ENOMEM;
869 }
871 new_info = (vcpu_info_t *)(mapping + offset);
873 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
875 v->vcpu_info = new_info;
876 v->arch.vcpu_info_mfn = mfn;
878 /* Set new vcpu_info pointer /before/ setting pending flags. */
879 wmb();
881 /*
882 * Mark everything as being pending just to make sure nothing gets
883 * lost. The domain will get a spurious event, but it can cope.
884 */
885 vcpu_info(v, evtchn_upcall_pending) = 1;
886 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
887 set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
889 /*
890 * Only bother to update time for the current vcpu. If we're
891 * operating on another vcpu, then it had better not be running at
892 * the time.
893 */
894 if ( v == current )
895 update_vcpu_system_time(v);
897 return 0;
898 }
900 long
901 arch_do_vcpu_op(
902 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
903 {
904 long rc = 0;
906 switch ( cmd )
907 {
908 case VCPUOP_register_runstate_memory_area:
909 {
910 struct vcpu_register_runstate_memory_area area;
911 struct vcpu_runstate_info runstate;
913 rc = -EFAULT;
914 if ( copy_from_guest(&area, arg, 1) )
915 break;
917 if ( !guest_handle_okay(area.addr.h, 1) )
918 break;
920 rc = 0;
921 runstate_guest(v) = area.addr.h;
923 if ( v == current )
924 {
925 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
926 }
927 else
928 {
929 vcpu_runstate_get(v, &runstate);
930 __copy_to_guest(runstate_guest(v), &runstate, 1);
931 }
933 break;
934 }
936 case VCPUOP_register_vcpu_info:
937 {
938 struct domain *d = v->domain;
939 struct vcpu_register_vcpu_info info;
941 rc = -EFAULT;
942 if ( copy_from_guest(&info, arg, 1) )
943 break;
945 LOCK_BIGLOCK(d);
946 rc = map_vcpu_info(v, info.mfn, info.offset);
947 UNLOCK_BIGLOCK(d);
949 break;
950 }
952 default:
953 rc = -ENOSYS;
954 break;
955 }
957 return rc;
958 }
960 #ifdef __x86_64__
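/*
 * Note: loadsegment() evaluates to 1 if the selector was loaded, or 0 if the
 * load faulted; the .fixup/__ex_table pair below substitutes a NULL selector
 * on a fault instead of crashing Xen. Typical use (as in load_segments()):
 *     all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
 */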
962 #define loadsegment(seg,value) ({ \
963 int __r = 1; \
964 asm volatile ( \
965 "1: movl %k1,%%" #seg "\n2:\n" \
966 ".section .fixup,\"ax\"\n" \
967 "3: xorl %k0,%k0\n" \
968 " movl %k0,%%" #seg "\n" \
969 " jmp 2b\n" \
970 ".previous\n" \
971 ".section __ex_table,\"a\"\n" \
972 " .align 8\n" \
973 " .quad 1b,3b\n" \
974 ".previous" \
975 : "=r" (__r) : "r" (value), "0" (__r) );\
976 __r; })
978 /*
979 * save_segments() writes a mask of segments which are dirty (non-zero),
980 * allowing load_segments() to avoid some expensive segment loads and
981 * MSR writes.
982 */
983 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
984 #define DIRTY_DS 0x01
985 #define DIRTY_ES 0x02
986 #define DIRTY_FS 0x04
987 #define DIRTY_GS 0x08
988 #define DIRTY_FS_BASE 0x10
989 #define DIRTY_GS_BASE_USER 0x20
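/*
 * Note: if any guest selector cannot be (re)loaded, load_segments() falls
 * back to building a failsafe-callback frame on the guest kernel stack and
 * entering the guest's registered failsafe handler, so the guest itself can
 * recover from stale selectors.
 */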
991 static void load_segments(struct vcpu *n)
992 {
993 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
994 int all_segs_okay = 1;
995 unsigned int dirty_segment_mask, cpu = smp_processor_id();
997 /* Load and clear the dirty segment mask. */
998 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
999 per_cpu(dirty_segment_mask, cpu) = 0;
1001 /* Either selector != 0 ==> reload. */
1002 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
1003 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
1005 /* Either selector != 0 ==> reload. */
1006 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
1007 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
1009 /*
1010 * Either selector != 0 ==> reload.
1011 * Also reload to reset FS_BASE if it was non-zero.
1012 */
1013 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
1014 nctxt->user_regs.fs) )
1015 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
1017 /*
1018 * Either selector != 0 ==> reload.
1019 * Also reload to reset GS_BASE if it was non-zero.
1020 */
1021 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
1022 nctxt->user_regs.gs) )
1023 {
1024 /* Reset GS_BASE with user %gs? */
1025 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
1026 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
1027 }
1029 if ( !is_pv_32on64_domain(n->domain) )
1030 {
1031 /* This can only be non-zero if selector is NULL. */
1032 if ( nctxt->fs_base )
1033 wrmsr(MSR_FS_BASE,
1034 nctxt->fs_base,
1035 nctxt->fs_base>>32);
1037 /* Most kernels have non-zero GS base, so don't bother testing. */
1038 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1039 wrmsr(MSR_SHADOW_GS_BASE,
1040 nctxt->gs_base_kernel,
1041 nctxt->gs_base_kernel>>32);
1043 /* This can only be non-zero if selector is NULL. */
1044 if ( nctxt->gs_base_user )
1045 wrmsr(MSR_GS_BASE,
1046 nctxt->gs_base_user,
1047 nctxt->gs_base_user>>32);
1049 /* If in kernel mode then switch the GS bases around. */
1050 if ( (n->arch.flags & TF_kernel_mode) )
1051 asm volatile ( "swapgs" );
1052 }
1054 if ( unlikely(!all_segs_okay) )
1055 {
1056 struct cpu_user_regs *regs = guest_cpu_user_regs();
1057 unsigned long *rsp =
1058 (n->arch.flags & TF_kernel_mode) ?
1059 (unsigned long *)regs->rsp :
1060 (unsigned long *)nctxt->kernel_sp;
1061 unsigned long cs_and_mask, rflags;
1063 if ( is_pv_32on64_domain(n->domain) )
1064 {
1065 unsigned int *esp = ring_1(regs) ?
1066 (unsigned int *)regs->rsp :
1067 (unsigned int *)nctxt->kernel_sp;
1068 unsigned int cs_and_mask, eflags;
1069 int ret = 0;
1071 /* CS longword also contains full evtchn_upcall_mask. */
1072 cs_and_mask = (unsigned short)regs->cs |
1073 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1074 /* Fold upcall mask into RFLAGS.IF. */
1075 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1076 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1078 if ( !ring_1(regs) )
1079 {
1080 ret = put_user(regs->ss, esp-1);
1081 ret |= put_user(regs->_esp, esp-2);
1082 esp -= 2;
1083 }
1085 if ( ret |
1086 put_user(eflags, esp-1) |
1087 put_user(cs_and_mask, esp-2) |
1088 put_user(regs->_eip, esp-3) |
1089 put_user(nctxt->user_regs.gs, esp-4) |
1090 put_user(nctxt->user_regs.fs, esp-5) |
1091 put_user(nctxt->user_regs.es, esp-6) |
1092 put_user(nctxt->user_regs.ds, esp-7) )
1093 {
1094 gdprintk(XENLOG_ERR, "Error while creating compat "
1095 "failsafe callback frame.\n");
1096 domain_crash(n->domain);
1097 }
1099 if ( test_bit(_VGCF_failsafe_disables_events,
1100 &n->arch.guest_context.flags) )
1101 vcpu_info(n, evtchn_upcall_mask) = 1;
1103 regs->entry_vector = TRAP_syscall;
1104 regs->_eflags &= 0xFFFCBEFFUL;
1105 regs->ss = FLAT_COMPAT_KERNEL_SS;
1106 regs->_esp = (unsigned long)(esp-7);
1107 regs->cs = FLAT_COMPAT_KERNEL_CS;
1108 regs->_eip = nctxt->failsafe_callback_eip;
1109 return;
1110 }
1112 if ( !(n->arch.flags & TF_kernel_mode) )
1113 toggle_guest_mode(n);
1114 else
1115 regs->cs &= ~3;
1117 /* CS longword also contains full evtchn_upcall_mask. */
1118 cs_and_mask = (unsigned long)regs->cs |
1119 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1121 /* Fold upcall mask into RFLAGS.IF. */
1122 rflags = regs->rflags & ~X86_EFLAGS_IF;
1123 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1125 if ( put_user(regs->ss, rsp- 1) |
1126 put_user(regs->rsp, rsp- 2) |
1127 put_user(rflags, rsp- 3) |
1128 put_user(cs_and_mask, rsp- 4) |
1129 put_user(regs->rip, rsp- 5) |
1130 put_user(nctxt->user_regs.gs, rsp- 6) |
1131 put_user(nctxt->user_regs.fs, rsp- 7) |
1132 put_user(nctxt->user_regs.es, rsp- 8) |
1133 put_user(nctxt->user_regs.ds, rsp- 9) |
1134 put_user(regs->r11, rsp-10) |
1135 put_user(regs->rcx, rsp-11) )
1136 {
1137 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1138 "callback frame.\n");
1139 domain_crash(n->domain);
1140 }
1142 if ( test_bit(_VGCF_failsafe_disables_events,
1143 &n->arch.guest_context.flags) )
1144 vcpu_info(n, evtchn_upcall_mask) = 1;
1146 regs->entry_vector = TRAP_syscall;
1147 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1148 X86_EFLAGS_NT|X86_EFLAGS_TF);
1149 regs->ss = FLAT_KERNEL_SS;
1150 regs->rsp = (unsigned long)(rsp-11);
1151 regs->cs = FLAT_KERNEL_CS;
1152 regs->rip = nctxt->failsafe_callback_eip;
1153 }
1154 }
1156 static void save_segments(struct vcpu *v)
1157 {
1158 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1159 struct cpu_user_regs *regs = &ctxt->user_regs;
1160 unsigned int dirty_segment_mask = 0;
1162 regs->ds = read_segment_register(ds);
1163 regs->es = read_segment_register(es);
1164 regs->fs = read_segment_register(fs);
1165 regs->gs = read_segment_register(gs);
1167 if ( regs->ds )
1168 dirty_segment_mask |= DIRTY_DS;
1170 if ( regs->es )
1171 dirty_segment_mask |= DIRTY_ES;
1173 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1174 {
1175 dirty_segment_mask |= DIRTY_FS;
1176 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1177 }
1178 else if ( ctxt->fs_base )
1179 {
1180 dirty_segment_mask |= DIRTY_FS_BASE;
1181 }
1183 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1184 {
1185 dirty_segment_mask |= DIRTY_GS;
1186 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1187 }
1188 else if ( ctxt->gs_base_user )
1189 {
1190 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1191 }
1193 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1194 }
1196 #define switch_kernel_stack(v) ((void)0)
1198 #elif defined(__i386__)
1200 #define load_segments(n) ((void)0)
1201 #define save_segments(p) ((void)0)
1203 static inline void switch_kernel_stack(struct vcpu *v)
1204 {
1205 struct tss_struct *tss = &init_tss[smp_processor_id()];
1206 tss->esp1 = v->arch.guest_context.kernel_sp;
1207 tss->ss1 = v->arch.guest_context.kernel_ss;
1208 }
1210 #endif /* __i386__ */
1212 static void paravirt_ctxt_switch_from(struct vcpu *v)
1213 {
1214 save_segments(v);
1216 /*
1217 * Disable debug breakpoints. We do this aggressively because if we switch
1218 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1219 * inside Xen, before we get a chance to reload DR7, and this cannot always
1220 * safely be handled.
1221 */
1222 if ( unlikely(v->arch.guest_context.debugreg[7]) )
1223 write_debugreg(7, 0);
1224 }
1226 static void paravirt_ctxt_switch_to(struct vcpu *v)
1227 {
1228 unsigned long cr4;
1230 set_int80_direct_trap(v);
1231 switch_kernel_stack(v);
1233 cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
1234 if ( unlikely(cr4 != read_cr4()) )
1235 write_cr4(cr4);
1237 if ( unlikely(v->arch.guest_context.debugreg[7]) )
1238 {
1239 write_debugreg(0, v->arch.guest_context.debugreg[0]);
1240 write_debugreg(1, v->arch.guest_context.debugreg[1]);
1241 write_debugreg(2, v->arch.guest_context.debugreg[2]);
1242 write_debugreg(3, v->arch.guest_context.debugreg[3]);
1243 write_debugreg(6, v->arch.guest_context.debugreg[6]);
1244 write_debugreg(7, v->arch.guest_context.debugreg[7]);
1245 }
1246 }
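/*
 * Note: context switching is lazy. per_cpu(curr_vcpu) names the vcpu whose
 * register state is actually loaded on this CPU; __context_switch() does the
 * real save/restore, while context_switch() below may defer it (e.g. when
 * switching to the idle vcpu) until __sync_lazy_execstate() forces it.
 */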
1248 static void __context_switch(void)
1249 {
1250 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1251 unsigned int cpu = smp_processor_id();
1252 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1253 struct vcpu *n = current;
1255 ASSERT(p != n);
1256 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1258 if ( !is_idle_vcpu(p) )
1259 {
1260 memcpy(&p->arch.guest_context.user_regs,
1261 stack_regs,
1262 CTXT_SWITCH_STACK_BYTES);
1263 unlazy_fpu(p);
1264 p->arch.ctxt_switch_from(p);
1265 }
1267 if ( !is_idle_vcpu(n) )
1268 {
1269 memcpy(stack_regs,
1270 &n->arch.guest_context.user_regs,
1271 CTXT_SWITCH_STACK_BYTES);
1272 n->arch.ctxt_switch_to(n);
1273 }
1275 if ( p->domain != n->domain )
1276 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1277 cpu_set(cpu, n->vcpu_dirty_cpumask);
1279 write_ptbase(n);
1281 if ( p->vcpu_id != n->vcpu_id )
1282 {
1283 char gdt_load[10];
1284 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1285 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1286 asm volatile ( "lgdt %0" : "=m" (gdt_load) );
1287 }
1289 if ( p->domain != n->domain )
1290 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1291 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1293 per_cpu(curr_vcpu, cpu) = n;
1294 }
1297 void context_switch(struct vcpu *prev, struct vcpu *next)
1298 {
1299 unsigned int cpu = smp_processor_id();
1300 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1302 ASSERT(local_irq_is_enabled());
1304 /* Allow at most one CPU at a time to be dirty. */
1305 ASSERT(cpus_weight(dirty_mask) <= 1);
1306 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1307 {
1308 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1309 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1310 flush_tlb_mask(next->vcpu_dirty_cpumask);
1311 }
1313 local_irq_disable();
1315 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1316 pt_save_timer(prev);
1318 set_current(next);
1320 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1321 {
1322 local_irq_enable();
1323 }
1324 else
1325 {
1326 __context_switch();
1328 #ifdef CONFIG_COMPAT
1329 if ( !is_hvm_vcpu(next) &&
1330 (is_idle_vcpu(prev) ||
1331 is_hvm_vcpu(prev) ||
1332 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1333 {
1334 uint64_t efer = read_efer();
1335 if ( !(efer & EFER_SCE) )
1336 write_efer(efer | EFER_SCE);
1337 flush_tlb_one_local(GDT_VIRT_START(next) +
1338 FIRST_RESERVED_GDT_BYTE);
1339 }
1340 #endif
1342 /* Re-enable interrupts before restoring state which may fault. */
1343 local_irq_enable();
1345 if ( !is_hvm_vcpu(next) )
1346 {
1347 load_LDT(next);
1348 load_segments(next);
1349 }
1350 }
1352 context_saved(prev);
1354 /* Update per-VCPU guest runstate shared memory area (if registered). */
1355 if ( !guest_handle_is_null(runstate_guest(next)) )
1356 {
1357 if ( !is_pv_32on64_domain(next->domain) )
1358 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1359 #ifdef CONFIG_COMPAT
1360 else
1361 {
1362 struct compat_vcpu_runstate_info info;
1364 XLAT_vcpu_runstate_info(&info, &next->runstate);
1365 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1366 }
1367 #endif
1368 }
1370 schedule_tail(next);
1371 BUG();
1372 }
1374 void continue_running(struct vcpu *same)
1375 {
1376 schedule_tail(same);
1377 BUG();
1378 }
1380 int __sync_lazy_execstate(void)
1381 {
1382 unsigned long flags;
1383 int switch_required;
1385 local_irq_save(flags);
1387 switch_required = (this_cpu(curr_vcpu) != current);
1389 if ( switch_required )
1390 {
1391 ASSERT(current == idle_vcpu[smp_processor_id()]);
1392 __context_switch();
1393 }
1395 local_irq_restore(flags);
1397 return switch_required;
1398 }
1400 void sync_vcpu_execstate(struct vcpu *v)
1401 {
1402 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1403 (void)__sync_lazy_execstate();
1405 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1406 flush_tlb_mask(v->vcpu_dirty_cpumask);
1407 }
1409 struct migrate_info {
1410 long (*func)(void *data);
1411 void *data;
1412 void (*saved_schedule_tail)(struct vcpu *);
1413 cpumask_t saved_affinity;
1414 };
1416 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1417 {
1418 struct cpu_user_regs *regs = guest_cpu_user_regs();
1419 struct migrate_info *info = v->arch.continue_info;
1421 regs->eax = info->func(info->data);
1423 v->arch.schedule_tail = info->saved_schedule_tail;
1424 v->arch.continue_info = NULL;
1426 xfree(info);
1428 vcpu_set_affinity(v, &v->cpu_affinity);
1429 schedule_tail(v);
1430 }
1432 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1433 {
1434 struct vcpu *v = current;
1435 struct migrate_info *info;
1436 cpumask_t mask = cpumask_of_cpu(cpu);
1437 int rc;
1439 if ( cpu == smp_processor_id() )
1440 return func(data);
1442 info = xmalloc(struct migrate_info);
1443 if ( info == NULL )
1444 return -ENOMEM;
1446 info->func = func;
1447 info->data = data;
1448 info->saved_schedule_tail = v->arch.schedule_tail;
1449 info->saved_affinity = v->cpu_affinity;
1451 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1452 v->arch.continue_info = info;
1454 rc = vcpu_set_affinity(v, &mask);
1455 if ( rc )
1456 {
1457 v->arch.schedule_tail = info->saved_schedule_tail;
1458 v->arch.continue_info = NULL;
1459 xfree(info);
1460 return rc;
1461 }
1463 /* Dummy return value will be overwritten by new schedule_tail. */
1464 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1465 return 0;
1466 }
1468 #define next_arg(fmt, args) ({ \
1469 unsigned long __arg; \
1470 switch ( *(fmt)++ ) \
1471 { \
1472 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1473 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1474 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1475 default: __arg = 0; BUG(); \
1476 } \
1477 __arg; \
1478 })
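/*
 * Note: next_arg() consumes one vararg according to the format character:
 * 'i' = unsigned int, 'l' = unsigned long, 'h' = guest handle (void *).
 */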
1480 DEFINE_PER_CPU(char, hc_preempted);
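/*
 * Note: hypercall_create_continuation() arranges for a preempted hypercall
 * to be re-issued: outside a multicall it rewinds the guest IP over the
 * hypercall instruction and rewrites the argument registers; inside a
 * multicall it stashes the arguments in mc_state instead. Illustrative
 * caller pattern (simplified, not taken verbatim from this file):
 *     if ( hypercall_preempt_check() )
 *         return hypercall_create_continuation(
 *             __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone, foreigndom);
 */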
1482 unsigned long hypercall_create_continuation(
1483 unsigned int op, const char *format, ...)
1484 {
1485 struct mc_state *mcs = &this_cpu(mc_state);
1486 struct cpu_user_regs *regs;
1487 const char *p = format;
1488 unsigned long arg;
1489 unsigned int i;
1490 va_list args;
1492 va_start(args, format);
1494 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1495 {
1496 __set_bit(_MCSF_call_preempted, &mcs->flags);
1498 for ( i = 0; *p != '\0'; i++ )
1499 mcs->call.args[i] = next_arg(p, args);
1500 if ( is_pv_32on64_domain(current->domain) )
1501 {
1502 for ( ; i < 6; i++ )
1503 mcs->call.args[i] = 0;
1504 }
1505 }
1506 else
1507 {
1508 regs = guest_cpu_user_regs();
1509 regs->eax = op;
1510 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1512 #ifdef __x86_64__
1513 if ( !is_hvm_vcpu(current) ?
1514 !is_pv_32on64_vcpu(current) :
1515 (hvm_guest_x86_mode(current) == 8) )
1516 {
1517 for ( i = 0; *p != '\0'; i++ )
1518 {
1519 arg = next_arg(p, args);
1520 switch ( i )
1521 {
1522 case 0: regs->rdi = arg; break;
1523 case 1: regs->rsi = arg; break;
1524 case 2: regs->rdx = arg; break;
1525 case 3: regs->r10 = arg; break;
1526 case 4: regs->r8 = arg; break;
1527 case 5: regs->r9 = arg; break;
1528 }
1529 }
1530 }
1531 else
1532 #endif
1533 {
1534 if ( supervisor_mode_kernel )
1535 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1537 for ( i = 0; *p != '\0'; i++ )
1538 {
1539 arg = next_arg(p, args);
1540 switch ( i )
1541 {
1542 case 0: regs->ebx = arg; break;
1543 case 1: regs->ecx = arg; break;
1544 case 2: regs->edx = arg; break;
1545 case 3: regs->esi = arg; break;
1546 case 4: regs->edi = arg; break;
1547 case 5: regs->ebp = arg; break;
1548 }
1549 }
1550 }
1552 this_cpu(hc_preempted) = 1;
1553 }
1555 va_end(args);
1557 return op;
1558 }
1560 #ifdef CONFIG_COMPAT
1561 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1562 {
1563 int rc = 0;
1564 struct mc_state *mcs = &this_cpu(mc_state);
1565 struct cpu_user_regs *regs;
1566 unsigned int i, cval = 0;
1567 unsigned long nval = 0;
1568 va_list args;
1570 BUG_ON(*id > 5);
1571 BUG_ON(mask & (1U << *id));
1573 va_start(args, mask);
1575 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1576 {
1577 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1578 return 0;
1579 for ( i = 0; i < 6; ++i, mask >>= 1 )
1580 {
1581 if ( mask & 1 )
1582 {
1583 nval = va_arg(args, unsigned long);
1584 cval = va_arg(args, unsigned int);
1585 if ( cval == nval )
1586 mask &= ~1U;
1587 else
1588 BUG_ON(nval == (unsigned int)nval);
1589 }
1590 else if ( id && *id == i )
1591 {
1592 *id = mcs->call.args[i];
1593 id = NULL;
1594 }
1595 if ( (mask & 1) && mcs->call.args[i] == nval )
1596 {
1597 mcs->call.args[i] = cval;
1598 ++rc;
1599 }
1600 else
1601 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1602 }
1603 }
1604 else
1605 {
1606 regs = guest_cpu_user_regs();
1607 for ( i = 0; i < 6; ++i, mask >>= 1 )
1608 {
1609 unsigned long *reg;
1611 switch ( i )
1612 {
1613 case 0: reg = &regs->ebx; break;
1614 case 1: reg = &regs->ecx; break;
1615 case 2: reg = &regs->edx; break;
1616 case 3: reg = &regs->esi; break;
1617 case 4: reg = &regs->edi; break;
1618 case 5: reg = &regs->ebp; break;
1619 default: BUG(); reg = NULL; break;
1620 }
1621 if ( (mask & 1) )
1622 {
1623 nval = va_arg(args, unsigned long);
1624 cval = va_arg(args, unsigned int);
1625 if ( cval == nval )
1626 mask &= ~1U;
1627 else
1628 BUG_ON(nval == (unsigned int)nval);
1629 }
1630 else if ( id && *id == i )
1631 {
1632 *id = *reg;
1633 id = NULL;
1634 }
1635 if ( (mask & 1) && *reg == nval )
1636 {
1637 *reg = cval;
1638 ++rc;
1639 }
1640 else
1641 BUG_ON(*reg != (unsigned int)*reg);
1642 }
1643 }
1645 va_end(args);
1647 return rc;
1648 }
1649 #endif
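/*
 * Note: relinquish_memory() drops the general and pinned references on each
 * page of the given list and force-invalidates still-validated top-level
 * page tables to break circular references. It is preemptible: when
 * hypercall_preempt_check() fires it returns -EAGAIN, using relmem_list to
 * remember how far it got.
 */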
1651 static int relinquish_memory(
1652 struct domain *d, struct list_head *list, unsigned long type)
1653 {
1654 struct list_head *ent;
1655 struct page_info *page;
1656 unsigned long x, y;
1657 int ret = 0;
1659 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1660 spin_lock_recursive(&d->page_alloc_lock);
1662 ent = list->next;
1663 while ( ent != list )
1664 {
1665 page = list_entry(ent, struct page_info, list);
1667 /* Grab a reference to the page so it won't disappear from under us. */
1668 if ( unlikely(!get_page(page, d)) )
1669 {
1670 /* Couldn't get a reference -- someone is freeing this page. */
1671 ent = ent->next;
1672 list_move_tail(&page->list, &d->arch.relmem_list);
1673 continue;
1674 }
1676 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1677 put_page_and_type(page);
1679 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1680 put_page(page);
1682 /*
1683 * Forcibly invalidate top-most, still valid page tables at this point
1684 * to break circular 'linear page table' references. This is okay
1685 * because MMU structures are not shared across domains and this domain
1686 * is now dead. Thus top-most valid tables are not in use so a non-zero
1687 * count means circular reference.
1688 */
1689 y = page->u.inuse.type_info;
1690 for ( ; ; )
1691 {
1692 x = y;
1693 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1694 (type|PGT_validated)) )
1695 break;
1697 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1698 if ( likely(y == x) )
1699 {
1700 free_page_type(page, type);
1701 break;
1702 }
1703 }
1705 /* Follow the list chain and /then/ potentially free the page. */
1706 ent = ent->next;
1707 list_move_tail(&page->list, &d->arch.relmem_list);
1708 put_page(page);
1710 if ( hypercall_preempt_check() )
1711 {
1712 ret = -EAGAIN;
1713 goto out;
1714 }
1715 }
1717 list_splice_init(&d->arch.relmem_list, list);
1719 out:
1720 spin_unlock_recursive(&d->page_alloc_lock);
1721 return ret;
1722 }
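/*
 * Note: per the changeset description above, a 64-bit PV guest may
 * legitimately have guest_table and guest_table_user referring to the same
 * page; each load took its own reference, so both references are dropped
 * below.
 */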
1724 static void vcpu_destroy_pagetables(struct vcpu *v)
1725 {
1726 struct domain *d = v->domain;
1727 unsigned long pfn;
1729 #ifdef __x86_64__
1730 if ( is_pv_32on64_vcpu(v) )
1731 {
1732 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1733 __va(pagetable_get_paddr(v->arch.guest_table)));
1735 if ( pfn != 0 )
1736 {
1737 if ( paging_mode_refcounts(d) )
1738 put_page(mfn_to_page(pfn));
1739 else
1740 put_page_and_type(mfn_to_page(pfn));
1741 }
1743 l4e_write(
1744 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1745 l4e_empty());
1747 v->arch.cr3 = 0;
1748 return;
1749 }
1750 #endif
1752 pfn = pagetable_get_pfn(v->arch.guest_table);
1753 if ( pfn != 0 )
1754 {
1755 if ( paging_mode_refcounts(d) )
1756 put_page(mfn_to_page(pfn));
1757 else
1758 put_page_and_type(mfn_to_page(pfn));
1759 v->arch.guest_table = pagetable_null();
1760 }
1762 #ifdef __x86_64__
1763 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1764 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1765 if ( pfn != 0 )
1766 {
1767 if ( !is_pv_32bit_vcpu(v) )
1768 {
1769 if ( paging_mode_refcounts(d) )
1770 put_page(mfn_to_page(pfn));
1771 else
1772 put_page_and_type(mfn_to_page(pfn));
1773 }
1774 v->arch.guest_table_user = pagetable_null();
1775 }
1776 #endif
1778 v->arch.cr3 = 0;
1779 }
1781 int domain_relinquish_resources(struct domain *d)
1782 {
1783 int ret;
1784 struct vcpu *v;
1786 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1788 switch ( d->arch.relmem )
1789 {
1790 case RELMEM_not_started:
1791 /* Tear down paging-assistance stuff. */
1792 paging_teardown(d);
1794 /* Drop the in-use references to page-table bases. */
1795 for_each_vcpu ( d, v )
1796 vcpu_destroy_pagetables(v);
1798 /*
1799 * Relinquish GDT mappings. No need for explicit unmapping of the LDT
1800 * as it automatically gets squashed when the guest's mappings go away.
1801 */
1802 for_each_vcpu(d, v)
1803 destroy_gdt(v);
1805 d->arch.relmem = RELMEM_xen_l4;
1806 /* fallthrough */
1808 /* Relinquish every page of memory. */
1809 case RELMEM_xen_l4:
1810 #if CONFIG_PAGING_LEVELS >= 4
1811 ret = relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1812 if ( ret )
1813 return ret;
1814 d->arch.relmem = RELMEM_dom_l4;
1815 /* fallthrough */
1816 case RELMEM_dom_l4:
1817 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1818 if ( ret )
1819 return ret;
1820 d->arch.relmem = RELMEM_xen_l3;
1821 /* fallthrough */
1822 #endif
1824 case RELMEM_xen_l3:
1825 #if CONFIG_PAGING_LEVELS >= 3
1826 ret = relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1827 if ( ret )
1828 return ret;
1829 d->arch.relmem = RELMEM_dom_l3;
1830 /* fallthrough */
1831 case RELMEM_dom_l3:
1832 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1833 if ( ret )
1834 return ret;
1835 d->arch.relmem = RELMEM_xen_l2;
1836 /* fallthrough */
1837 #endif
1839 case RELMEM_xen_l2:
1840 ret = relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1841 if ( ret )
1842 return ret;
1843 d->arch.relmem = RELMEM_dom_l2;
1844 /* fallthrough */
1845 case RELMEM_dom_l2:
1846 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1847 if ( ret )
1848 return ret;
1849 d->arch.relmem = RELMEM_done;
1850 /* fallthrough */
1852 case RELMEM_done:
1853 break;
1855 default:
1856 BUG();
1857 }
1859 /* Free page used by xen oprofile buffer. */
1860 free_xenoprof_pages(d);
1862 if ( is_hvm_domain(d) )
1863 hvm_domain_relinquish_resources(d);
1865 return 0;
1866 }
1868 void arch_dump_domain_info(struct domain *d)
1869 {
1870 paging_dump_domain_info(d);
1871 }
1873 void arch_dump_vcpu_info(struct vcpu *v)
1874 {
1875 paging_dump_vcpu_info(v);
1876 }
1878 /*
1879 * Local variables:
1880 * mode: C
1881 * c-set-style: "BSD"
1882 * c-basic-offset: 4
1883 * tab-width: 4
1884 * indent-tabs-mode: nil
1885 * End:
1886 */