ia64/xen-unstable

view xen/arch/x86/domain.c @ 15812:86a154e1ef5d

[HVM] Shadow: don't shadow the p2m table.
For HVM vcpus with paging disabled, we used to shadow the p2m table,
and skip the p2m lookup when going from gfn to mfn. Instead, we now
provide a simple pagetable that gives a one-to-one mapping of 4GB and
shadow that, doing the gfn-to-mfn translations through the p2m.
This removes the paging-disabled special-case code from the shadow
fault handler, and allows us to expand the p2m interface, since all HVM
translations now go through the same p2m lookups.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Fri Aug 31 11:06:22 2007 +0100 (2007-08-31)
parents bb5c23bbc7b7
children bd59dd48e208
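
The sketch below illustrates the idea behind this change; it is not part of this file or of the changeset. The function name build_identity_l2(), its flag choice, and the 2MB-superpage layout are illustrative assumptions only: the point is simply that the guest runs on a table whose entry i maps linear frame i to gfn i, and the shadow code then resolves gfn to mfn through the p2m.

    /* Hypothetical sketch (not from domain.c): fill one L2 page so that
     * linear frame i maps to gfn i using 2MB superpages.  A table like
     * this, once shadowed, gives a paging-disabled HVM vcpu a one-to-one
     * view of its first 4GB while every gfn->mfn step still goes through
     * the p2m. */
    static void build_identity_l2(l2_pgentry_t *l2tab, unsigned long base_gfn)
    {
        unsigned int i;

        /* Each L2 entry covers 2MB; one L2 page covers 1GB, so four calls
         * (base_gfn = 0, 512*512, ...) would cover the whole 4GB. */
        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
            l2tab[i] = l2e_from_pfn(base_gfn + (i << PAGETABLE_ORDER),
                                    _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|
                                    _PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_PSE);
    }
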
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <asm/regs.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/system.h>
35 #include <asm/io.h>
36 #include <asm/processor.h>
37 #include <asm/desc.h>
38 #include <asm/i387.h>
39 #include <asm/mpspec.h>
40 #include <asm/ldt.h>
41 #include <asm/paging.h>
42 #include <asm/hypercall.h>
43 #include <asm/hvm/hvm.h>
44 #include <asm/hvm/support.h>
45 #include <asm/msr.h>
46 #include <asm/nmi.h>
47 #ifdef CONFIG_COMPAT
48 #include <compat/vcpu.h>
49 #endif
51 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
52 DEFINE_PER_CPU(__u64, efer);
54 static void unmap_vcpu_info(struct vcpu *v);
56 static void paravirt_ctxt_switch_from(struct vcpu *v);
57 static void paravirt_ctxt_switch_to(struct vcpu *v);
59 static void vcpu_destroy_pagetables(struct vcpu *v);
61 static void continue_idle_domain(struct vcpu *v)
62 {
63 reset_stack_and_jump(idle_loop);
64 }
66 static void continue_nonidle_domain(struct vcpu *v)
67 {
68 reset_stack_and_jump(ret_from_intr);
69 }
71 static void default_idle(void)
72 {
73 local_irq_disable();
74 if ( !softirq_pending(smp_processor_id()) )
75 safe_halt();
76 else
77 local_irq_enable();
78 }
80 static void play_dead(void)
81 {
82 __cpu_disable();
83 /* This must be done before dead CPU ack */
84 cpu_exit_clear();
85 hvm_cpu_down();
86 wbinvd();
87 mb();
88 /* Ack it */
89 __get_cpu_var(cpu_state) = CPU_DEAD;
91 /* With physical CPU hotplug, we should halt the cpu. */
92 local_irq_disable();
93 for ( ; ; )
94 halt();
95 }
97 void idle_loop(void)
98 {
99 for ( ; ; )
100 {
101 if (cpu_is_offline(smp_processor_id()))
102 play_dead();
103 page_scrub_schedule_work();
104 default_idle();
105 do_softirq();
106 }
107 }
109 void startup_cpu_idle_loop(void)
110 {
111 struct vcpu *v = current;
113 ASSERT(is_idle_vcpu(v));
114 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
115 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
117 reset_stack_and_jump(idle_loop);
118 }
120 void dump_pageframe_info(struct domain *d)
121 {
122 struct page_info *page;
124 printk("Memory pages belonging to domain %u:\n", d->domain_id);
126 if ( d->tot_pages >= 10 )
127 {
128 printk(" DomPage list too long to display\n");
129 }
130 else
131 {
132 list_for_each_entry ( page, &d->page_list, list )
133 {
134 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
135 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
136 page->count_info, page->u.inuse.type_info);
137 }
138 }
140 list_for_each_entry ( page, &d->xenpage_list, list )
141 {
142 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
143 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
144 page->count_info, page->u.inuse.type_info);
145 }
146 }
148 struct vcpu *alloc_vcpu_struct(void)
149 {
150 struct vcpu *v;
151 if ( (v = xmalloc(struct vcpu)) != NULL )
152 memset(v, 0, sizeof(*v));
153 return v;
154 }
156 void free_vcpu_struct(struct vcpu *v)
157 {
158 xfree(v);
159 }
161 #ifdef CONFIG_COMPAT
163 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
164 {
165 struct domain *d = v->domain;
166 unsigned i;
167 struct page_info *pg;
169 if ( !d->arch.mm_arg_xlat_l3 )
170 {
171 pg = alloc_domheap_page(NULL);
172 if ( !pg )
173 return -ENOMEM;
174 d->arch.mm_arg_xlat_l3 = page_to_virt(pg);
175 clear_page(d->arch.mm_arg_xlat_l3);
176 }
178 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
179 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
181 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
182 {
183 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
184 l2_pgentry_t *l2tab;
185 l1_pgentry_t *l1tab;
187 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
188 {
189 pg = alloc_domheap_page(NULL);
190 if ( !pg )
191 return -ENOMEM;
192 clear_page(page_to_virt(pg));
193 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
194 }
195 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
196 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
197 {
198 pg = alloc_domheap_page(NULL);
199 if ( !pg )
200 return -ENOMEM;
201 clear_page(page_to_virt(pg));
202 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
203 }
204 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
205 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
206 pg = alloc_domheap_page(NULL);
207 if ( !pg )
208 return -ENOMEM;
209 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
210 }
212 return 0;
213 }
215 static void release_arg_xlat_area(struct domain *d)
216 {
217 if ( d->arch.mm_arg_xlat_l3 )
218 {
219 unsigned l3;
221 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
222 {
223 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
224 {
225 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
226 unsigned l2;
228 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
229 {
230 if ( l2e_get_intpte(l2tab[l2]) )
231 {
232 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
233 unsigned l1;
235 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
236 {
237 if ( l1e_get_intpte(l1tab[l1]) )
238 free_domheap_page(l1e_get_page(l1tab[l1]));
239 }
240 free_domheap_page(l2e_get_page(l2tab[l2]));
241 }
242 }
243 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
244 }
245 }
246 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
247 }
248 }
250 static int setup_compat_l4(struct vcpu *v)
251 {
252 struct page_info *pg = alloc_domheap_page(NULL);
253 l4_pgentry_t *l4tab;
254 int rc;
256 if ( pg == NULL )
257 return -ENOMEM;
259 /* This page needs to look like a pagetable so that it can be shadowed */
260 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
262 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
263 l4tab[0] = l4e_empty();
264 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
265 l4e_from_page(pg, __PAGE_HYPERVISOR);
266 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
267 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
268 __PAGE_HYPERVISOR);
270 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
271 {
272 free_domheap_page(pg);
273 return rc;
274 }
276 v->arch.guest_table = pagetable_from_page(pg);
277 v->arch.guest_table_user = v->arch.guest_table;
279 return 0;
280 }
282 static void release_compat_l4(struct vcpu *v)
283 {
284 free_domheap_page(pagetable_get_page(v->arch.guest_table));
285 v->arch.guest_table = pagetable_null();
286 v->arch.guest_table_user = pagetable_null();
287 }
289 static inline int may_switch_mode(struct domain *d)
290 {
291 return (!is_hvm_domain(d) && (d->tot_pages == 0));
292 }
294 int switch_native(struct domain *d)
295 {
296 l1_pgentry_t gdt_l1e;
297 unsigned int vcpuid;
299 if ( d == NULL )
300 return -EINVAL;
301 if ( !may_switch_mode(d) )
302 return -EACCES;
303 if ( !is_pv_32on64_domain(d) )
304 return 0;
306 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
307 release_arg_xlat_area(d);
309 /* switch gdt */
310 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
311 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
312 {
313 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
314 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
315 if (d->vcpu[vcpuid])
316 release_compat_l4(d->vcpu[vcpuid]);
317 }
319 d->arch.physaddr_bitsize = 64;
321 return 0;
322 }
324 int switch_compat(struct domain *d)
325 {
326 l1_pgentry_t gdt_l1e;
327 unsigned int vcpuid;
329 if ( d == NULL )
330 return -EINVAL;
331 if ( !may_switch_mode(d) )
332 return -EACCES;
333 if ( is_pv_32on64_domain(d) )
334 return 0;
336 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
338 /* switch gdt */
339 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
340 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
341 {
342 if ( (d->vcpu[vcpuid] != NULL) &&
343 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
344 goto undo_and_fail;
345 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
346 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
347 }
349 d->arch.physaddr_bitsize =
350 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
351 + (PAGE_SIZE - 2);
353 return 0;
355 undo_and_fail:
356 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
357 release_arg_xlat_area(d);
358 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
359 while ( vcpuid-- != 0 )
360 {
361 if ( d->vcpu[vcpuid] != NULL )
362 release_compat_l4(d->vcpu[vcpuid]);
363 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
364 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
365 }
366 return -ENOMEM;
367 }
369 #else
370 #define release_arg_xlat_area(d) ((void)0)
371 #define setup_compat_l4(v) 0
372 #define release_compat_l4(v) ((void)0)
373 #endif
375 int vcpu_initialise(struct vcpu *v)
376 {
377 struct domain *d = v->domain;
378 int rc;
380 v->arch.vcpu_info_mfn = INVALID_MFN;
382 v->arch.flags = TF_kernel_mode;
384 pae_l3_cache_init(&v->arch.pae_l3_cache);
386 paging_vcpu_init(v);
388 if ( is_hvm_domain(d) )
389 {
390 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
391 return rc;
392 }
393 else
394 {
395 /* PV guests by default have a 100Hz ticker. */
396 v->periodic_period = MILLISECS(10);
398 /* PV guests get an emulated PIT too for video BIOSes to use. */
399 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
400 pit_init(v, cpu_khz);
402 v->arch.schedule_tail = continue_nonidle_domain;
403 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
404 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
406 if ( is_idle_domain(d) )
407 {
408 v->arch.schedule_tail = continue_idle_domain;
409 v->arch.cr3 = __pa(idle_pg_table);
410 }
411 }
413 v->arch.perdomain_ptes =
414 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
416 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
417 }
419 void vcpu_destroy(struct vcpu *v)
420 {
421 if ( is_pv_32on64_vcpu(v) )
422 release_compat_l4(v);
424 unmap_vcpu_info(v);
426 if ( is_hvm_vcpu(v) )
427 hvm_vcpu_destroy(v);
428 }
430 int arch_domain_create(struct domain *d)
431 {
432 #ifdef __x86_64__
433 struct page_info *pg;
434 int i;
435 #endif
436 l1_pgentry_t gdt_l1e;
437 int vcpuid, pdpt_order, paging_initialised = 0;
438 int rc = -ENOMEM;
440 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
441 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
442 if ( d->arch.mm_perdomain_pt == NULL )
443 goto fail;
444 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
446 /*
447 * Map Xen segments into every VCPU's GDT, irrespective of whether every
448 * VCPU will actually be used. This avoids an NMI race during context
449 * switch: if we take an interrupt after switching CR3 but before switching
450 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
451 * try to load CS from an invalid table.
452 */
453 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
454 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
455 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
456 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
458 #if defined(__i386__)
460 mapcache_init(d);
462 #else /* __x86_64__ */
464 if ( (pg = alloc_domheap_page(NULL)) == NULL )
465 goto fail;
466 d->arch.mm_perdomain_l2 = page_to_virt(pg);
467 clear_page(d->arch.mm_perdomain_l2);
468 for ( i = 0; i < (1 << pdpt_order); i++ )
469 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
470 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
471 __PAGE_HYPERVISOR);
473 if ( (pg = alloc_domheap_page(NULL)) == NULL )
474 goto fail;
475 d->arch.mm_perdomain_l3 = page_to_virt(pg);
476 clear_page(d->arch.mm_perdomain_l3);
477 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
478 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
479 __PAGE_HYPERVISOR);
481 #endif /* __x86_64__ */
483 #ifdef CONFIG_COMPAT
484 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
485 #endif
487 paging_domain_init(d);
488 paging_initialised = 1;
490 if ( !is_idle_domain(d) )
491 {
492 d->arch.ioport_caps =
493 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
494 if ( d->arch.ioport_caps == NULL )
495 goto fail;
497 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
498 goto fail;
500 clear_page(d->shared_info);
501 share_xen_page_with_guest(
502 virt_to_page(d->shared_info), d, XENSHARE_writable);
503 }
505 if ( is_hvm_domain(d) )
506 {
507 if ( (rc = hvm_domain_initialise(d)) != 0 )
508 goto fail;
509 }
510 else
511 {
512 /* 32-bit PV guest by default only if Xen is not 64-bit. */
513 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
514 (CONFIG_PAGING_LEVELS != 4);
515 }
517 return 0;
519 fail:
520 free_xenheap_page(d->shared_info);
521 if ( paging_initialised )
522 paging_final_teardown(d);
523 #ifdef __x86_64__
524 if ( d->arch.mm_perdomain_l2 )
525 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
526 if ( d->arch.mm_perdomain_l3 )
527 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
528 #endif
529 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
530 return rc;
531 }
533 void arch_domain_destroy(struct domain *d)
534 {
535 if ( is_hvm_domain(d) )
536 hvm_domain_destroy(d);
538 paging_final_teardown(d);
540 free_xenheap_pages(
541 d->arch.mm_perdomain_pt,
542 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
544 #ifdef __x86_64__
545 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
546 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
547 #endif
549 if ( is_pv_32on64_domain(d) )
550 release_arg_xlat_area(d);
552 free_xenheap_page(d->shared_info);
553 }
555 /* This is called by arch_final_setup_guest and do_boot_vcpu */
556 int arch_set_info_guest(
557 struct vcpu *v, vcpu_guest_context_u c)
558 {
559 struct domain *d = v->domain;
560 unsigned long cr3_pfn = INVALID_MFN;
561 unsigned long flags;
562 int i, rc = 0, compat;
564 /* The context is a compat-mode one if the target domain is compat-mode;
565 * we expect the tools to DTRT even in compat-mode callers. */
566 compat = is_pv_32on64_domain(d);
568 #ifdef CONFIG_COMPAT
569 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
570 #else
571 #define c(fld) (c.nat->fld)
572 #endif
573 flags = c(flags);
575 if ( !is_hvm_vcpu(v) )
576 {
577 if ( !compat )
578 {
579 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
580 fixup_guest_stack_selector(d, c.nat->kernel_ss);
581 fixup_guest_code_selector(d, c.nat->user_regs.cs);
582 #ifdef __i386__
583 fixup_guest_code_selector(d, c.nat->event_callback_cs);
584 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
585 #endif
587 for ( i = 0; i < 256; i++ )
588 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
590 /* LDT safety checks. */
591 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
592 (c.nat->ldt_ents > 8192) ||
593 !array_access_ok(c.nat->ldt_base,
594 c.nat->ldt_ents,
595 LDT_ENTRY_SIZE) )
596 return -EINVAL;
597 }
598 #ifdef CONFIG_COMPAT
599 else
600 {
601 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
602 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
603 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
604 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
605 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
607 for ( i = 0; i < 256; i++ )
608 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
610 /* LDT safety checks. */
611 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
612 (c.cmp->ldt_ents > 8192) ||
613 !compat_array_access_ok(c.cmp->ldt_base,
614 c.cmp->ldt_ents,
615 LDT_ENTRY_SIZE) )
616 return -EINVAL;
617 }
618 #endif
619 }
621 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
623 v->arch.flags &= ~TF_kernel_mode;
624 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
625 v->arch.flags |= TF_kernel_mode;
627 if ( !compat )
628 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
629 #ifdef CONFIG_COMPAT
630 else
631 {
632 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
633 }
634 #endif
636 /* Only CR0.TS is modifiable by guest or admin. */
637 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
638 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
640 init_int80_direct_trap(v);
642 if ( !is_hvm_vcpu(v) )
643 {
644 /* IOPL privileges are virtualised. */
645 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
646 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
648 /* Ensure real hardware interrupts are enabled. */
649 v->arch.guest_context.user_regs.eflags |= EF_IE;
650 }
651 else
652 {
653 hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
654 }
656 if ( v->is_initialised )
657 goto out;
659 memset(v->arch.guest_context.debugreg, 0,
660 sizeof(v->arch.guest_context.debugreg));
661 for ( i = 0; i < 8; i++ )
662 (void)set_debugreg(v, i, c(debugreg[i]));
664 if ( v->vcpu_id == 0 )
665 d->vm_assist = c(vm_assist);
667 if ( !is_hvm_vcpu(v) )
668 {
669 if ( !compat )
670 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
671 #ifdef CONFIG_COMPAT
672 else
673 {
674 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
675 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
677 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
678 return -EINVAL;
679 for ( i = 0; i < n; ++i )
680 gdt_frames[i] = c.cmp->gdt_frames[i];
681 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
682 }
683 #endif
684 if ( rc != 0 )
685 return rc;
687 if ( !compat )
688 {
689 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
691 if ( !mfn_valid(cr3_pfn) ||
692 (paging_mode_refcounts(d)
693 ? !get_page(mfn_to_page(cr3_pfn), d)
694 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
695 PGT_base_page_table)) )
696 {
697 destroy_gdt(v);
698 return -EINVAL;
699 }
701 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
703 #ifdef __x86_64__
704 if ( c.nat->ctrlreg[1] )
705 {
706 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
708 if ( !mfn_valid(cr3_pfn) ||
709 (paging_mode_refcounts(d)
710 ? !get_page(mfn_to_page(cr3_pfn), d)
711 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
712 PGT_base_page_table)) )
713 {
714 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
715 v->arch.guest_table = pagetable_null();
716 if ( paging_mode_refcounts(d) )
717 put_page(mfn_to_page(cr3_pfn));
718 else
719 put_page_and_type(mfn_to_page(cr3_pfn));
720 destroy_gdt(v);
721 return -EINVAL;
722 }
724 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
725 }
726 #endif
727 }
728 #ifdef CONFIG_COMPAT
729 else
730 {
731 l4_pgentry_t *l4tab;
733 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
735 if ( !mfn_valid(cr3_pfn) ||
736 (paging_mode_refcounts(d)
737 ? !get_page(mfn_to_page(cr3_pfn), d)
738 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
739 PGT_l3_page_table)) )
740 {
741 destroy_gdt(v);
742 return -EINVAL;
743 }
745 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
746 *l4tab = l4e_from_pfn(cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
747 }
748 #endif
749 }
751 if ( v->vcpu_id == 0 )
752 update_domain_wallclock_time(d);
754 /* Don't redo final setup */
755 v->is_initialised = 1;
757 if ( paging_mode_enabled(d) )
758 paging_update_paging_modes(v);
760 update_cr3(v);
762 out:
763 if ( flags & VGCF_online )
764 clear_bit(_VPF_down, &v->pause_flags);
765 else
766 set_bit(_VPF_down, &v->pause_flags);
767 return 0;
768 #undef c
769 }
771 int arch_vcpu_reset(struct vcpu *v)
772 {
773 destroy_gdt(v);
774 vcpu_destroy_pagetables(v);
775 return 0;
776 }
778 /*
779 * Unmap the vcpu info page if the guest decided to place it somewhere
780 * else. This is only used from arch_domain_destroy, so there's no
781 * need to do anything clever.
782 */
783 static void
784 unmap_vcpu_info(struct vcpu *v)
785 {
786 struct domain *d = v->domain;
787 unsigned long mfn;
789 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
790 return;
792 mfn = v->arch.vcpu_info_mfn;
793 unmap_domain_page_global(v->vcpu_info);
795 v->vcpu_info = shared_info_addr(d, vcpu_info[v->vcpu_id]);
796 v->arch.vcpu_info_mfn = INVALID_MFN;
798 put_page_and_type(mfn_to_page(mfn));
799 }
801 /*
802 * Map a guest page in and point the vcpu_info pointer at it. This
803 * makes sure that the vcpu_info is always pointing at a valid piece
804 * of memory, and it sets a pending event to make sure that a pending
805 * event doesn't get missed.
806 */
807 static int
808 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
809 {
810 struct domain *d = v->domain;
811 void *mapping;
812 vcpu_info_t *new_info;
813 int i;
815 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
816 return -EINVAL;
818 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
819 return -EINVAL;
821 /* Run this command on yourself or on other offline VCPUS. */
822 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
823 return -EINVAL;
825 mfn = gmfn_to_mfn(d, mfn);
826 if ( !mfn_valid(mfn) ||
827 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
828 return -EINVAL;
830 mapping = map_domain_page_global(mfn);
831 if ( mapping == NULL )
832 {
833 put_page_and_type(mfn_to_page(mfn));
834 return -ENOMEM;
835 }
837 new_info = (vcpu_info_t *)(mapping + offset);
839 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
841 v->vcpu_info = new_info;
842 v->arch.vcpu_info_mfn = mfn;
844 /* Set new vcpu_info pointer /before/ setting pending flags. */
845 wmb();
847 /*
848 * Mark everything as being pending just to make sure nothing gets
849 * lost. The domain will get a spurious event, but it can cope.
850 */
851 vcpu_info(v, evtchn_upcall_pending) = 1;
852 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
853 set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
855 /*
856 * Only bother to update time for the current vcpu. If we're
857 * operating on another vcpu, then it had better not be running at
858 * the time.
859 */
860 if ( v == current )
861 update_vcpu_system_time(v);
863 return 0;
864 }
866 long
867 arch_do_vcpu_op(
868 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
869 {
870 long rc = 0;
872 switch ( cmd )
873 {
874 case VCPUOP_register_runstate_memory_area:
875 {
876 struct vcpu_register_runstate_memory_area area;
877 struct vcpu_runstate_info runstate;
879 rc = -EFAULT;
880 if ( copy_from_guest(&area, arg, 1) )
881 break;
883 if ( !guest_handle_okay(area.addr.h, 1) )
884 break;
886 rc = 0;
887 runstate_guest(v) = area.addr.h;
889 if ( v == current )
890 {
891 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
892 }
893 else
894 {
895 vcpu_runstate_get(v, &runstate);
896 __copy_to_guest(runstate_guest(v), &runstate, 1);
897 }
899 break;
900 }
902 case VCPUOP_register_vcpu_info:
903 {
904 struct domain *d = v->domain;
905 struct vcpu_register_vcpu_info info;
907 rc = -EFAULT;
908 if ( copy_from_guest(&info, arg, 1) )
909 break;
911 LOCK_BIGLOCK(d);
912 rc = map_vcpu_info(v, info.mfn, info.offset);
913 UNLOCK_BIGLOCK(d);
915 break;
916 }
918 default:
919 rc = -ENOSYS;
920 break;
921 }
923 return rc;
924 }
926 #ifdef __x86_64__
928 #define loadsegment(seg,value) ({ \
929 int __r = 1; \
930 __asm__ __volatile__ ( \
931 "1: movl %k1,%%" #seg "\n2:\n" \
932 ".section .fixup,\"ax\"\n" \
933 "3: xorl %k0,%k0\n" \
934 " movl %k0,%%" #seg "\n" \
935 " jmp 2b\n" \
936 ".previous\n" \
937 ".section __ex_table,\"a\"\n" \
938 " .align 8\n" \
939 " .quad 1b,3b\n" \
940 ".previous" \
941 : "=r" (__r) : "r" (value), "0" (__r) );\
942 __r; })
944 /*
945 * save_segments() writes a mask of segments which are dirty (non-zero),
946 * allowing load_segments() to avoid some expensive segment loads and
947 * MSR writes.
948 */
949 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
950 #define DIRTY_DS 0x01
951 #define DIRTY_ES 0x02
952 #define DIRTY_FS 0x04
953 #define DIRTY_GS 0x08
954 #define DIRTY_FS_BASE 0x10
955 #define DIRTY_GS_BASE_USER 0x20
957 static void load_segments(struct vcpu *n)
958 {
959 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
960 int all_segs_okay = 1;
961 unsigned int dirty_segment_mask, cpu = smp_processor_id();
963 /* Load and clear the dirty segment mask. */
964 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
965 per_cpu(dirty_segment_mask, cpu) = 0;
967 /* Either selector != 0 ==> reload. */
968 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
969 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
971 /* Either selector != 0 ==> reload. */
972 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
973 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
975 /*
976 * Either selector != 0 ==> reload.
977 * Also reload to reset FS_BASE if it was non-zero.
978 */
979 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
980 nctxt->user_regs.fs) )
981 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
983 /*
984 * Either selector != 0 ==> reload.
985 * Also reload to reset GS_BASE if it was non-zero.
986 */
987 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
988 nctxt->user_regs.gs) )
989 {
990 /* Reset GS_BASE with user %gs? */
991 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
992 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
993 }
995 if ( !is_pv_32on64_domain(n->domain) )
996 {
997 /* This can only be non-zero if selector is NULL. */
998 if ( nctxt->fs_base )
999 wrmsr(MSR_FS_BASE,
1000 nctxt->fs_base,
1001 nctxt->fs_base>>32);
1003 /* Most kernels have non-zero GS base, so don't bother testing. */
1004 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1005 wrmsr(MSR_SHADOW_GS_BASE,
1006 nctxt->gs_base_kernel,
1007 nctxt->gs_base_kernel>>32);
1009 /* This can only be non-zero if selector is NULL. */
1010 if ( nctxt->gs_base_user )
1011 wrmsr(MSR_GS_BASE,
1012 nctxt->gs_base_user,
1013 nctxt->gs_base_user>>32);
1015 /* If in kernel mode then switch the GS bases around. */
1016 if ( (n->arch.flags & TF_kernel_mode) )
1017 __asm__ __volatile__ ( "swapgs" );
1018 }
1020 if ( unlikely(!all_segs_okay) )
1021 {
1022 struct cpu_user_regs *regs = guest_cpu_user_regs();
1023 unsigned long *rsp =
1024 (n->arch.flags & TF_kernel_mode) ?
1025 (unsigned long *)regs->rsp :
1026 (unsigned long *)nctxt->kernel_sp;
1027 unsigned long cs_and_mask, rflags;
1029 if ( is_pv_32on64_domain(n->domain) )
1030 {
1031 unsigned int *esp = ring_1(regs) ?
1032 (unsigned int *)regs->rsp :
1033 (unsigned int *)nctxt->kernel_sp;
1034 unsigned int cs_and_mask, eflags;
1035 int ret = 0;
1037 /* CS longword also contains full evtchn_upcall_mask. */
1038 cs_and_mask = (unsigned short)regs->cs |
1039 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1040 /* Fold upcall mask into RFLAGS.IF. */
1041 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1042 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1044 if ( !ring_1(regs) )
1045 {
1046 ret = put_user(regs->ss, esp-1);
1047 ret |= put_user(regs->_esp, esp-2);
1048 esp -= 2;
1049 }
1051 if ( ret |
1052 put_user(eflags, esp-1) |
1053 put_user(cs_and_mask, esp-2) |
1054 put_user(regs->_eip, esp-3) |
1055 put_user(nctxt->user_regs.gs, esp-4) |
1056 put_user(nctxt->user_regs.fs, esp-5) |
1057 put_user(nctxt->user_regs.es, esp-6) |
1058 put_user(nctxt->user_regs.ds, esp-7) )
1059 {
1060 gdprintk(XENLOG_ERR, "Error while creating compat "
1061 "failsafe callback frame.\n");
1062 domain_crash(n->domain);
1063 }
1065 if ( test_bit(_VGCF_failsafe_disables_events,
1066 &n->arch.guest_context.flags) )
1067 vcpu_info(n, evtchn_upcall_mask) = 1;
1069 regs->entry_vector = TRAP_syscall;
1070 regs->_eflags &= 0xFFFCBEFFUL;
1071 regs->ss = FLAT_COMPAT_KERNEL_SS;
1072 regs->_esp = (unsigned long)(esp-7);
1073 regs->cs = FLAT_COMPAT_KERNEL_CS;
1074 regs->_eip = nctxt->failsafe_callback_eip;
1075 return;
1076 }
1078 if ( !(n->arch.flags & TF_kernel_mode) )
1079 toggle_guest_mode(n);
1080 else
1081 regs->cs &= ~3;
1083 /* CS longword also contains full evtchn_upcall_mask. */
1084 cs_and_mask = (unsigned long)regs->cs |
1085 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1087 /* Fold upcall mask into RFLAGS.IF. */
1088 rflags = regs->rflags & ~X86_EFLAGS_IF;
1089 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1091 if ( put_user(regs->ss, rsp- 1) |
1092 put_user(regs->rsp, rsp- 2) |
1093 put_user(rflags, rsp- 3) |
1094 put_user(cs_and_mask, rsp- 4) |
1095 put_user(regs->rip, rsp- 5) |
1096 put_user(nctxt->user_regs.gs, rsp- 6) |
1097 put_user(nctxt->user_regs.fs, rsp- 7) |
1098 put_user(nctxt->user_regs.es, rsp- 8) |
1099 put_user(nctxt->user_regs.ds, rsp- 9) |
1100 put_user(regs->r11, rsp-10) |
1101 put_user(regs->rcx, rsp-11) )
1102 {
1103 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1104 "callback frame.\n");
1105 domain_crash(n->domain);
1106 }
1108 if ( test_bit(_VGCF_failsafe_disables_events,
1109 &n->arch.guest_context.flags) )
1110 vcpu_info(n, evtchn_upcall_mask) = 1;
1112 regs->entry_vector = TRAP_syscall;
1113 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1114 X86_EFLAGS_NT|X86_EFLAGS_TF);
1115 regs->ss = FLAT_KERNEL_SS;
1116 regs->rsp = (unsigned long)(rsp-11);
1117 regs->cs = FLAT_KERNEL_CS;
1118 regs->rip = nctxt->failsafe_callback_eip;
1119 }
1120 }
1122 static void save_segments(struct vcpu *v)
1123 {
1124 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1125 struct cpu_user_regs *regs = &ctxt->user_regs;
1126 unsigned int dirty_segment_mask = 0;
1128 regs->ds = read_segment_register(ds);
1129 regs->es = read_segment_register(es);
1130 regs->fs = read_segment_register(fs);
1131 regs->gs = read_segment_register(gs);
1133 if ( regs->ds )
1134 dirty_segment_mask |= DIRTY_DS;
1136 if ( regs->es )
1137 dirty_segment_mask |= DIRTY_ES;
1139 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1140 {
1141 dirty_segment_mask |= DIRTY_FS;
1142 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1143 }
1144 else if ( ctxt->fs_base )
1145 {
1146 dirty_segment_mask |= DIRTY_FS_BASE;
1147 }
1149 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1150 {
1151 dirty_segment_mask |= DIRTY_GS;
1152 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1153 }
1154 else if ( ctxt->gs_base_user )
1155 {
1156 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1157 }
1159 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1160 }
1162 #define switch_kernel_stack(v) ((void)0)
1164 #elif defined(__i386__)
1166 #define load_segments(n) ((void)0)
1167 #define save_segments(p) ((void)0)
1169 static inline void switch_kernel_stack(struct vcpu *v)
1170 {
1171 struct tss_struct *tss = &init_tss[smp_processor_id()];
1172 tss->esp1 = v->arch.guest_context.kernel_sp;
1173 tss->ss1 = v->arch.guest_context.kernel_ss;
1174 }
1176 #endif /* __i386__ */
1178 static void paravirt_ctxt_switch_from(struct vcpu *v)
1179 {
1180 save_segments(v);
1181 }
1183 static void paravirt_ctxt_switch_to(struct vcpu *v)
1184 {
1185 set_int80_direct_trap(v);
1186 switch_kernel_stack(v);
1187 }
1189 #define loaddebug(_v,_reg) \
1190 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
1192 static void __context_switch(void)
1193 {
1194 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1195 unsigned int cpu = smp_processor_id();
1196 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1197 struct vcpu *n = current;
1199 ASSERT(p != n);
1200 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1202 if ( !is_idle_vcpu(p) )
1203 {
1204 memcpy(&p->arch.guest_context.user_regs,
1205 stack_regs,
1206 CTXT_SWITCH_STACK_BYTES);
1207 unlazy_fpu(p);
1208 p->arch.ctxt_switch_from(p);
1209 }
1211 if ( !is_idle_vcpu(n) )
1212 {
1213 memcpy(stack_regs,
1214 &n->arch.guest_context.user_regs,
1215 CTXT_SWITCH_STACK_BYTES);
1217 /* Maybe switch the debug registers. */
1218 if ( unlikely(n->arch.guest_context.debugreg[7]) )
1219 {
1220 loaddebug(&n->arch.guest_context, 0);
1221 loaddebug(&n->arch.guest_context, 1);
1222 loaddebug(&n->arch.guest_context, 2);
1223 loaddebug(&n->arch.guest_context, 3);
1224 /* no 4 and 5 */
1225 loaddebug(&n->arch.guest_context, 6);
1226 loaddebug(&n->arch.guest_context, 7);
1227 }
1228 n->arch.ctxt_switch_to(n);
1229 }
1231 if ( p->domain != n->domain )
1232 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1233 cpu_set(cpu, n->vcpu_dirty_cpumask);
1235 write_ptbase(n);
1237 if ( p->vcpu_id != n->vcpu_id )
1238 {
1239 char gdt_load[10];
1240 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1241 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1242 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
1243 }
1245 if ( p->domain != n->domain )
1246 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1247 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1249 per_cpu(curr_vcpu, cpu) = n;
1250 }
1253 void context_switch(struct vcpu *prev, struct vcpu *next)
1254 {
1255 unsigned int cpu = smp_processor_id();
1256 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1258 ASSERT(local_irq_is_enabled());
1260 /* Allow at most one CPU at a time to be dirty. */
1261 ASSERT(cpus_weight(dirty_mask) <= 1);
1262 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1263 {
1264 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1265 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1266 flush_tlb_mask(next->vcpu_dirty_cpumask);
1267 }
1269 local_irq_disable();
1271 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1272 pt_freeze_time(prev);
1274 set_current(next);
1276 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1277 {
1278 local_irq_enable();
1279 }
1280 else
1281 {
1282 __context_switch();
1284 #ifdef CONFIG_COMPAT
1285 if ( !is_hvm_vcpu(next) &&
1286 (is_idle_vcpu(prev) ||
1287 is_hvm_vcpu(prev) ||
1288 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1289 {
1290 uint64_t efer = read_efer();
1292 local_flush_tlb_one(GDT_VIRT_START(next) +
1293 FIRST_RESERVED_GDT_BYTE);
1295 if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) )
1296 write_efer(efer ^ EFER_SCE);
1297 }
1298 #endif
1300 /* Re-enable interrupts before restoring state which may fault. */
1301 local_irq_enable();
1303 if ( !is_hvm_vcpu(next) )
1304 {
1305 load_LDT(next);
1306 load_segments(next);
1307 }
1308 }
1310 context_saved(prev);
1312 /* Update per-VCPU guest runstate shared memory area (if registered). */
1313 if ( !guest_handle_is_null(runstate_guest(next)) )
1314 {
1315 if ( !is_pv_32on64_domain(next->domain) )
1316 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1317 #ifdef CONFIG_COMPAT
1318 else
1319 {
1320 struct compat_vcpu_runstate_info info;
1322 XLAT_vcpu_runstate_info(&info, &next->runstate);
1323 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1324 }
1325 #endif
1326 }
1328 schedule_tail(next);
1329 BUG();
1330 }
1332 void continue_running(struct vcpu *same)
1333 {
1334 schedule_tail(same);
1335 BUG();
1336 }
1338 int __sync_lazy_execstate(void)
1339 {
1340 unsigned long flags;
1341 int switch_required;
1343 local_irq_save(flags);
1345 switch_required = (this_cpu(curr_vcpu) != current);
1347 if ( switch_required )
1348 {
1349 ASSERT(current == idle_vcpu[smp_processor_id()]);
1350 __context_switch();
1351 }
1353 local_irq_restore(flags);
1355 return switch_required;
1356 }
1358 void sync_vcpu_execstate(struct vcpu *v)
1359 {
1360 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1361 (void)__sync_lazy_execstate();
1363 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1364 flush_tlb_mask(v->vcpu_dirty_cpumask);
1365 }
1367 struct migrate_info {
1368 long (*func)(void *data);
1369 void *data;
1370 void (*saved_schedule_tail)(struct vcpu *);
1371 cpumask_t saved_affinity;
1372 };
1374 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1375 {
1376 struct cpu_user_regs *regs = guest_cpu_user_regs();
1377 struct migrate_info *info = v->arch.continue_info;
1379 regs->eax = info->func(info->data);
1381 v->arch.schedule_tail = info->saved_schedule_tail;
1382 v->cpu_affinity = info->saved_affinity;
1384 xfree(info);
1385 v->arch.continue_info = NULL;
1387 vcpu_set_affinity(v, &v->cpu_affinity);
1388 schedule_tail(v);
1389 }
1391 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1392 {
1393 struct vcpu *v = current;
1394 struct migrate_info *info;
1395 cpumask_t mask = cpumask_of_cpu(cpu);
1397 if ( cpu == smp_processor_id() )
1398 return func(data);
1400 info = xmalloc(struct migrate_info);
1401 if ( info == NULL )
1402 return -ENOMEM;
1404 info->func = func;
1405 info->data = data;
1406 info->saved_schedule_tail = v->arch.schedule_tail;
1407 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1409 info->saved_affinity = v->cpu_affinity;
1410 v->arch.continue_info = info;
1412 vcpu_set_affinity(v, &mask);
1414 /* Dummy return value will be overwritten by new schedule_tail. */
1415 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1416 return 0;
1417 }
1419 #define next_arg(fmt, args) ({ \
1420 unsigned long __arg; \
1421 switch ( *(fmt)++ ) \
1422 { \
1423 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1424 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1425 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1426 default: __arg = 0; BUG(); \
1427 } \
1428 __arg; \
1429 })
1431 DEFINE_PER_CPU(char, hc_preempted);
1433 unsigned long hypercall_create_continuation(
1434 unsigned int op, const char *format, ...)
1435 {
1436 struct mc_state *mcs = &this_cpu(mc_state);
1437 struct cpu_user_regs *regs;
1438 const char *p = format;
1439 unsigned long arg;
1440 unsigned int i;
1441 va_list args;
1443 va_start(args, format);
1445 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1446 {
1447 __set_bit(_MCSF_call_preempted, &mcs->flags);
1449 for ( i = 0; *p != '\0'; i++ )
1450 mcs->call.args[i] = next_arg(p, args);
1451 if ( is_pv_32on64_domain(current->domain) )
1452 {
1453 for ( ; i < 6; i++ )
1454 mcs->call.args[i] = 0;
1455 }
1456 }
1457 else
1458 {
1459 regs = guest_cpu_user_regs();
1460 regs->eax = op;
1461 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1463 #ifdef __x86_64__
1464 if ( !is_hvm_vcpu(current) ?
1465 !is_pv_32on64_vcpu(current) :
1466 (hvm_guest_x86_mode(current) == 8) )
1467 {
1468 for ( i = 0; *p != '\0'; i++ )
1469 {
1470 arg = next_arg(p, args);
1471 switch ( i )
1472 {
1473 case 0: regs->rdi = arg; break;
1474 case 1: regs->rsi = arg; break;
1475 case 2: regs->rdx = arg; break;
1476 case 3: regs->r10 = arg; break;
1477 case 4: regs->r8 = arg; break;
1478 case 5: regs->r9 = arg; break;
1479 }
1480 }
1481 }
1482 else
1483 #endif
1484 {
1485 if ( supervisor_mode_kernel )
1486 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1488 for ( i = 0; *p != '\0'; i++ )
1489 {
1490 arg = next_arg(p, args);
1491 switch ( i )
1492 {
1493 case 0: regs->ebx = arg; break;
1494 case 1: regs->ecx = arg; break;
1495 case 2: regs->edx = arg; break;
1496 case 3: regs->esi = arg; break;
1497 case 4: regs->edi = arg; break;
1498 case 5: regs->ebp = arg; break;
1499 }
1500 }
1501 }
1503 this_cpu(hc_preempted) = 1;
1504 }
1506 va_end(args);
1508 return op;
1509 }
1511 #ifdef CONFIG_COMPAT
1512 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1513 {
1514 int rc = 0;
1515 struct mc_state *mcs = &this_cpu(mc_state);
1516 struct cpu_user_regs *regs;
1517 unsigned int i, cval = 0;
1518 unsigned long nval = 0;
1519 va_list args;
1521 BUG_ON(*id > 5);
1522 BUG_ON(mask & (1U << *id));
1524 va_start(args, mask);
1526 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1527 {
1528 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1529 return 0;
1530 for ( i = 0; i < 6; ++i, mask >>= 1 )
1531 {
1532 if ( mask & 1 )
1533 {
1534 nval = va_arg(args, unsigned long);
1535 cval = va_arg(args, unsigned int);
1536 if ( cval == nval )
1537 mask &= ~1U;
1538 else
1539 BUG_ON(nval == (unsigned int)nval);
1540 }
1541 else if ( id && *id == i )
1542 {
1543 *id = mcs->call.args[i];
1544 id = NULL;
1545 }
1546 if ( (mask & 1) && mcs->call.args[i] == nval )
1547 {
1548 mcs->call.args[i] = cval;
1549 ++rc;
1550 }
1551 else
1552 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1553 }
1554 }
1555 else
1556 {
1557 regs = guest_cpu_user_regs();
1558 for ( i = 0; i < 6; ++i, mask >>= 1 )
1559 {
1560 unsigned long *reg;
1562 switch ( i )
1563 {
1564 case 0: reg = &regs->ebx; break;
1565 case 1: reg = &regs->ecx; break;
1566 case 2: reg = &regs->edx; break;
1567 case 3: reg = &regs->esi; break;
1568 case 4: reg = &regs->edi; break;
1569 case 5: reg = &regs->ebp; break;
1570 default: BUG(); reg = NULL; break;
1571 }
1572 if ( (mask & 1) )
1573 {
1574 nval = va_arg(args, unsigned long);
1575 cval = va_arg(args, unsigned int);
1576 if ( cval == nval )
1577 mask &= ~1U;
1578 else
1579 BUG_ON(nval == (unsigned int)nval);
1580 }
1581 else if ( id && *id == i )
1582 {
1583 *id = *reg;
1584 id = NULL;
1585 }
1586 if ( (mask & 1) && *reg == nval )
1587 {
1588 *reg = cval;
1589 ++rc;
1590 }
1591 else
1592 BUG_ON(*reg != (unsigned int)*reg);
1593 }
1594 }
1596 va_end(args);
1598 return rc;
1599 }
1600 #endif
1602 static void relinquish_memory(struct domain *d, struct list_head *list,
1603 unsigned long type)
1604 {
1605 struct list_head *ent;
1606 struct page_info *page;
1607 unsigned long x, y;
1609 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1610 spin_lock_recursive(&d->page_alloc_lock);
1612 ent = list->next;
1613 while ( ent != list )
1614 {
1615 page = list_entry(ent, struct page_info, list);
1617 /* Grab a reference to the page so it won't disappear from under us. */
1618 if ( unlikely(!get_page(page, d)) )
1619 {
1620 /* Couldn't get a reference -- someone is freeing this page. */
1621 ent = ent->next;
1622 continue;
1623 }
1625 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1626 put_page_and_type(page);
1628 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1629 put_page(page);
1631 /*
1632 * Forcibly invalidate top-most, still valid page tables at this point
1633 * to break circular 'linear page table' references. This is okay
1634 * because MMU structures are not shared across domains and this domain
1635 * is now dead. Thus top-most valid tables are not in use so a non-zero
1636 * count means circular reference.
1637 */
1638 y = page->u.inuse.type_info;
1639 for ( ; ; )
1640 {
1641 x = y;
1642 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1643 (type|PGT_validated)) )
1644 break;
1646 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1647 if ( likely(y == x) )
1648 {
1649 free_page_type(page, type);
1650 break;
1651 }
1652 }
1654 /* Follow the list chain and /then/ potentially free the page. */
1655 ent = ent->next;
1656 put_page(page);
1657 }
1659 spin_unlock_recursive(&d->page_alloc_lock);
1660 }
1662 static void vcpu_destroy_pagetables(struct vcpu *v)
1663 {
1664 struct domain *d = v->domain;
1665 unsigned long pfn;
1667 #ifdef __x86_64__
1668 if ( is_pv_32on64_vcpu(v) )
1669 {
1670 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1671 __va(pagetable_get_paddr(v->arch.guest_table)));
1673 if ( pfn != 0 )
1674 {
1675 if ( paging_mode_refcounts(d) )
1676 put_page(mfn_to_page(pfn));
1677 else
1678 put_page_and_type(mfn_to_page(pfn));
1679 }
1681 l4e_write(
1682 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1683 l4e_empty());
1685 v->arch.cr3 = 0;
1686 return;
1687 }
1688 #endif
1690 pfn = pagetable_get_pfn(v->arch.guest_table);
1691 if ( pfn != 0 )
1692 {
1693 if ( paging_mode_refcounts(d) )
1694 put_page(mfn_to_page(pfn));
1695 else
1696 put_page_and_type(mfn_to_page(pfn));
1697 #ifdef __x86_64__
1698 if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) )
1699 v->arch.guest_table_user = pagetable_null();
1700 #endif
1701 v->arch.guest_table = pagetable_null();
1702 }
1704 #ifdef __x86_64__
1705 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1706 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1707 if ( pfn != 0 )
1708 {
1709 if ( paging_mode_refcounts(d) )
1710 put_page(mfn_to_page(pfn));
1711 else
1712 put_page_and_type(mfn_to_page(pfn));
1713 v->arch.guest_table_user = pagetable_null();
1714 }
1715 #endif
1717 v->arch.cr3 = 0;
1718 }
1720 void domain_relinquish_resources(struct domain *d)
1721 {
1722 struct vcpu *v;
1724 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1726 /* Tear down paging-assistance stuff. */
1727 paging_teardown(d);
1729 /* Drop the in-use references to page-table bases. */
1730 for_each_vcpu ( d, v )
1731 vcpu_destroy_pagetables(v);
1733 /*
1734 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1735 * it automatically gets squashed when the guest's mappings go away.
1736 */
1737 for_each_vcpu(d, v)
1738 destroy_gdt(v);
1740 /* Relinquish every page of memory. */
1741 #if CONFIG_PAGING_LEVELS >= 4
1742 relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1743 relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1744 #endif
1745 #if CONFIG_PAGING_LEVELS >= 3
1746 relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1747 relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1748 #endif
1749 relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1750 relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1752 /* Free page used by xen oprofile buffer. */
1753 free_xenoprof_pages(d);
1755 if ( is_hvm_domain(d) )
1756 hvm_domain_relinquish_resources(d);
1757 }
1759 void arch_dump_domain_info(struct domain *d)
1760 {
1761 paging_dump_domain_info(d);
1762 }
1764 void arch_dump_vcpu_info(struct vcpu *v)
1765 {
1766 paging_dump_vcpu_info(v);
1767 }
1769 /*
1770 * Local variables:
1771 * mode: C
1772 * c-set-style: "BSD"
1773 * c-basic-offset: 4
1774 * tab-width: 4
1775 * indent-tabs-mode: nil
1776 * End:
1777 */