ia64/xen-unstable: xen/arch/x86/domain.c @ 15828:3b50a7e52ff2

Implement x86 continuable domain destroy.
This patch addresses the following bug report: http://bugzilla.xensource.com/bugzilla/show_bug.cgi?id=1037
Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author kfraser@localhost.localdomain
date Fri Aug 31 17:00:11 2007 +0100 (2007-08-31)
parents bd59dd48e208
children f8e7f06b351c
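
Editor's note: the "continuable domain destroy" introduced by this changeset is, at its core, a teardown state machine. domain_relinquish_resources() (in the listing below) records how far teardown has progressed in d->arch.relmem, and relinquish_memory() returns -EAGAIN whenever hypercall_preempt_check() fires, so the caller can re-issue the operation as a continuation instead of tearing down a large domain in one uninterruptible pass. The following minimal, self-contained C sketch illustrates only that pattern; it is not Xen code, and every name in it (toy_domain, preempt_check, relinquish_resources, ...) is purely illustrative.

/* Hypothetical sketch of a preemptible, resumable teardown (not Xen code). */
#include <stdio.h>

#define EAGAIN 11

enum relmem_phase { PHASE_NOT_STARTED, PHASE_PAGES, PHASE_DONE };

struct toy_domain {
    enum relmem_phase phase;   /* where teardown left off last time */
    int pages_left;            /* work remaining in the current phase */
};

/* Pretend "preemption check": ask to be rescheduled every few pages. */
static int preempt_check(int done) { return (done % 4) == 0 && done != 0; }

static int relinquish_resources(struct toy_domain *d)
{
    int done = 0;

    switch ( d->phase )
    {
    case PHASE_NOT_STARTED:
        d->phase = PHASE_PAGES;
        /* fall through, as in the real state machine */
    case PHASE_PAGES:
        while ( d->pages_left > 0 )
        {
            d->pages_left--;
            if ( preempt_check(++done) )
                return -EAGAIN;    /* caller re-invokes us later */
        }
        d->phase = PHASE_DONE;
        /* fall through */
    case PHASE_DONE:
        break;
    }
    return 0;
}

int main(void)
{
    struct toy_domain d = { PHASE_NOT_STARTED, 10 };
    int rc, calls = 1;

    /* The "hypercall" is simply retried until teardown completes. */
    while ( (rc = relinquish_resources(&d)) == -EAGAIN )
        calls++;
    printf("teardown finished after %d calls, rc=%d\n", calls, rc);
    return 0;
}

Compile it with any C compiler and run it to see the teardown complete across several continuations.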
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <asm/regs.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/system.h>
35 #include <asm/io.h>
36 #include <asm/processor.h>
37 #include <asm/desc.h>
38 #include <asm/i387.h>
39 #include <asm/mpspec.h>
40 #include <asm/ldt.h>
41 #include <asm/paging.h>
42 #include <asm/hypercall.h>
43 #include <asm/hvm/hvm.h>
44 #include <asm/hvm/support.h>
45 #include <asm/msr.h>
46 #include <asm/nmi.h>
47 #ifdef CONFIG_COMPAT
48 #include <compat/vcpu.h>
49 #endif
51 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
52 DEFINE_PER_CPU(__u64, efer);
54 static void unmap_vcpu_info(struct vcpu *v);
56 static void paravirt_ctxt_switch_from(struct vcpu *v);
57 static void paravirt_ctxt_switch_to(struct vcpu *v);
59 static void vcpu_destroy_pagetables(struct vcpu *v);
61 static void continue_idle_domain(struct vcpu *v)
62 {
63 reset_stack_and_jump(idle_loop);
64 }
66 static void continue_nonidle_domain(struct vcpu *v)
67 {
68 reset_stack_and_jump(ret_from_intr);
69 }
71 static void default_idle(void)
72 {
73 local_irq_disable();
74 if ( !softirq_pending(smp_processor_id()) )
75 safe_halt();
76 else
77 local_irq_enable();
78 }
80 static void play_dead(void)
81 {
82 __cpu_disable();
83 /* This must be done before dead CPU ack */
84 cpu_exit_clear();
85 hvm_cpu_down();
86 wbinvd();
87 mb();
88 /* Ack it */
89 __get_cpu_var(cpu_state) = CPU_DEAD;
91 /* With physical CPU hotplug, we should halt the cpu. */
92 local_irq_disable();
93 for ( ; ; )
94 halt();
95 }
97 void idle_loop(void)
98 {
99 for ( ; ; )
100 {
101 if (cpu_is_offline(smp_processor_id()))
102 play_dead();
103 page_scrub_schedule_work();
104 default_idle();
105 do_softirq();
106 }
107 }
109 void startup_cpu_idle_loop(void)
110 {
111 struct vcpu *v = current;
113 ASSERT(is_idle_vcpu(v));
114 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
115 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
117 reset_stack_and_jump(idle_loop);
118 }
120 void dump_pageframe_info(struct domain *d)
121 {
122 struct page_info *page;
124 printk("Memory pages belonging to domain %u:\n", d->domain_id);
126 if ( d->tot_pages >= 10 )
127 {
128 printk(" DomPage list too long to display\n");
129 }
130 else
131 {
132 list_for_each_entry ( page, &d->page_list, list )
133 {
134 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
135 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
136 page->count_info, page->u.inuse.type_info);
137 }
138 }
140 list_for_each_entry ( page, &d->xenpage_list, list )
141 {
142 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
143 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
144 page->count_info, page->u.inuse.type_info);
145 }
146 }
148 struct vcpu *alloc_vcpu_struct(void)
149 {
150 struct vcpu *v;
151 if ( (v = xmalloc(struct vcpu)) != NULL )
152 memset(v, 0, sizeof(*v));
153 return v;
154 }
156 void free_vcpu_struct(struct vcpu *v)
157 {
158 xfree(v);
159 }
161 #ifdef CONFIG_COMPAT
163 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
164 {
165 struct domain *d = v->domain;
166 unsigned i;
167 struct page_info *pg;
169 if ( !d->arch.mm_arg_xlat_l3 )
170 {
171 pg = alloc_domheap_page(NULL);
172 if ( !pg )
173 return -ENOMEM;
174 d->arch.mm_arg_xlat_l3 = page_to_virt(pg);
175 clear_page(d->arch.mm_arg_xlat_l3);
176 }
178 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
179 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
181 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
182 {
183 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
184 l2_pgentry_t *l2tab;
185 l1_pgentry_t *l1tab;
187 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
188 {
189 pg = alloc_domheap_page(NULL);
190 if ( !pg )
191 return -ENOMEM;
192 clear_page(page_to_virt(pg));
193 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
194 }
195 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
196 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
197 {
198 pg = alloc_domheap_page(NULL);
199 if ( !pg )
200 return -ENOMEM;
201 clear_page(page_to_virt(pg));
202 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
203 }
204 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
205 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
206 pg = alloc_domheap_page(NULL);
207 if ( !pg )
208 return -ENOMEM;
209 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
210 }
212 return 0;
213 }
215 static void release_arg_xlat_area(struct domain *d)
216 {
217 if ( d->arch.mm_arg_xlat_l3 )
218 {
219 unsigned l3;
221 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
222 {
223 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
224 {
225 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
226 unsigned l2;
228 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
229 {
230 if ( l2e_get_intpte(l2tab[l2]) )
231 {
232 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
233 unsigned l1;
235 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
236 {
237 if ( l1e_get_intpte(l1tab[l1]) )
238 free_domheap_page(l1e_get_page(l1tab[l1]));
239 }
240 free_domheap_page(l2e_get_page(l2tab[l2]));
241 }
242 }
243 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
244 }
245 }
246 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
247 }
248 }
250 static int setup_compat_l4(struct vcpu *v)
251 {
252 struct page_info *pg = alloc_domheap_page(NULL);
253 l4_pgentry_t *l4tab;
254 int rc;
256 if ( pg == NULL )
257 return -ENOMEM;
259 /* This page needs to look like a pagetable so that it can be shadowed */
260 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
262 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
263 l4tab[0] = l4e_empty();
264 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
265 l4e_from_page(pg, __PAGE_HYPERVISOR);
266 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
267 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
268 __PAGE_HYPERVISOR);
270 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
271 {
272 free_domheap_page(pg);
273 return rc;
274 }
276 v->arch.guest_table = pagetable_from_page(pg);
277 v->arch.guest_table_user = v->arch.guest_table;
279 return 0;
280 }
282 static void release_compat_l4(struct vcpu *v)
283 {
284 free_domheap_page(pagetable_get_page(v->arch.guest_table));
285 v->arch.guest_table = pagetable_null();
286 v->arch.guest_table_user = pagetable_null();
287 }
289 static inline int may_switch_mode(struct domain *d)
290 {
291 return (!is_hvm_domain(d) && (d->tot_pages == 0));
292 }
294 int switch_native(struct domain *d)
295 {
296 l1_pgentry_t gdt_l1e;
297 unsigned int vcpuid;
299 if ( d == NULL )
300 return -EINVAL;
301 if ( !may_switch_mode(d) )
302 return -EACCES;
303 if ( !is_pv_32on64_domain(d) )
304 return 0;
306 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
307 release_arg_xlat_area(d);
309 /* switch gdt */
310 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
311 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
312 {
313 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
314 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
315 if (d->vcpu[vcpuid])
316 release_compat_l4(d->vcpu[vcpuid]);
317 }
319 d->arch.physaddr_bitsize = 64;
321 return 0;
322 }
324 int switch_compat(struct domain *d)
325 {
326 l1_pgentry_t gdt_l1e;
327 unsigned int vcpuid;
329 if ( d == NULL )
330 return -EINVAL;
331 if ( !may_switch_mode(d) )
332 return -EACCES;
333 if ( is_pv_32on64_domain(d) )
334 return 0;
336 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
338 /* switch gdt */
339 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
340 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
341 {
342 if ( (d->vcpu[vcpuid] != NULL) &&
343 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
344 goto undo_and_fail;
345 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
346 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
347 }
349 d->arch.physaddr_bitsize =
350 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
351 + (PAGE_SIZE - 2);
353 return 0;
355 undo_and_fail:
356 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
357 release_arg_xlat_area(d);
358 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
359 while ( vcpuid-- != 0 )
360 {
361 if ( d->vcpu[vcpuid] != NULL )
362 release_compat_l4(d->vcpu[vcpuid]);
363 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
364 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
365 }
366 return -ENOMEM;
367 }
369 #else
370 #define release_arg_xlat_area(d) ((void)0)
371 #define setup_compat_l4(v) 0
372 #define release_compat_l4(v) ((void)0)
373 #endif
375 int vcpu_initialise(struct vcpu *v)
376 {
377 struct domain *d = v->domain;
378 int rc;
380 v->arch.vcpu_info_mfn = INVALID_MFN;
382 v->arch.flags = TF_kernel_mode;
384 pae_l3_cache_init(&v->arch.pae_l3_cache);
386 paging_vcpu_init(v);
388 if ( is_hvm_domain(d) )
389 {
390 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
391 return rc;
392 }
393 else
394 {
395 /* PV guests by default have a 100Hz ticker. */
396 v->periodic_period = MILLISECS(10);
398 /* PV guests get an emulated PIT too for video BIOSes to use. */
399 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
400 pit_init(v, cpu_khz);
402 v->arch.schedule_tail = continue_nonidle_domain;
403 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
404 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
406 if ( is_idle_domain(d) )
407 {
408 v->arch.schedule_tail = continue_idle_domain;
409 v->arch.cr3 = __pa(idle_pg_table);
410 }
411 }
413 v->arch.perdomain_ptes =
414 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
416 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
417 }
419 void vcpu_destroy(struct vcpu *v)
420 {
421 if ( is_pv_32on64_vcpu(v) )
422 release_compat_l4(v);
424 unmap_vcpu_info(v);
426 if ( is_hvm_vcpu(v) )
427 hvm_vcpu_destroy(v);
428 }
430 int arch_domain_create(struct domain *d)
431 {
432 #ifdef __x86_64__
433 struct page_info *pg;
434 int i;
435 #endif
436 l1_pgentry_t gdt_l1e;
437 int vcpuid, pdpt_order, paging_initialised = 0;
438 int rc = -ENOMEM;
440 d->arch.relmem = RELMEM_not_started;
441 INIT_LIST_HEAD(&d->arch.relmem_list);
443 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
444 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
445 if ( d->arch.mm_perdomain_pt == NULL )
446 goto fail;
447 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
449 /*
450 * Map Xen segments into every VCPU's GDT, irrespective of whether every
451 * VCPU will actually be used. This avoids an NMI race during context
452 * switch: if we take an interrupt after switching CR3 but before switching
453 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
454 * try to load CS from an invalid table.
455 */
456 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
457 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
458 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
459 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
461 #if defined(__i386__)
463 mapcache_init(d);
465 #else /* __x86_64__ */
467 if ( (pg = alloc_domheap_page(NULL)) == NULL )
468 goto fail;
469 d->arch.mm_perdomain_l2 = page_to_virt(pg);
470 clear_page(d->arch.mm_perdomain_l2);
471 for ( i = 0; i < (1 << pdpt_order); i++ )
472 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
473 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
474 __PAGE_HYPERVISOR);
476 if ( (pg = alloc_domheap_page(NULL)) == NULL )
477 goto fail;
478 d->arch.mm_perdomain_l3 = page_to_virt(pg);
479 clear_page(d->arch.mm_perdomain_l3);
480 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
481 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
482 __PAGE_HYPERVISOR);
484 #endif /* __x86_64__ */
486 #ifdef CONFIG_COMPAT
487 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
488 #endif
490 paging_domain_init(d);
491 paging_initialised = 1;
493 if ( !is_idle_domain(d) )
494 {
495 d->arch.ioport_caps =
496 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
497 if ( d->arch.ioport_caps == NULL )
498 goto fail;
500 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
501 goto fail;
503 clear_page(d->shared_info);
504 share_xen_page_with_guest(
505 virt_to_page(d->shared_info), d, XENSHARE_writable);
506 }
508 if ( is_hvm_domain(d) )
509 {
510 if ( (rc = hvm_domain_initialise(d)) != 0 )
511 goto fail;
512 }
513 else
514 {
515 /* 32-bit PV guest by default only if Xen is not 64-bit. */
516 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
517 (CONFIG_PAGING_LEVELS != 4);
518 }
520 return 0;
522 fail:
523 free_xenheap_page(d->shared_info);
524 if ( paging_initialised )
525 paging_final_teardown(d);
526 #ifdef __x86_64__
527 if ( d->arch.mm_perdomain_l2 )
528 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
529 if ( d->arch.mm_perdomain_l3 )
530 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
531 #endif
532 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
533 return rc;
534 }
536 void arch_domain_destroy(struct domain *d)
537 {
538 if ( is_hvm_domain(d) )
539 hvm_domain_destroy(d);
541 paging_final_teardown(d);
543 free_xenheap_pages(
544 d->arch.mm_perdomain_pt,
545 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
547 #ifdef __x86_64__
548 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
549 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
550 #endif
552 if ( is_pv_32on64_domain(d) )
553 release_arg_xlat_area(d);
555 free_xenheap_page(d->shared_info);
556 }
558 /* This is called by arch_final_setup_guest and do_boot_vcpu */
559 int arch_set_info_guest(
560 struct vcpu *v, vcpu_guest_context_u c)
561 {
562 struct domain *d = v->domain;
563 unsigned long cr3_pfn = INVALID_MFN;
564 unsigned long flags;
565 int i, rc = 0, compat;
567 /* The context is a compat-mode one if the target domain is compat-mode;
568 * we expect the tools to DTRT even in compat-mode callers. */
569 compat = is_pv_32on64_domain(d);
571 #ifdef CONFIG_COMPAT
572 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
573 #else
574 #define c(fld) (c.nat->fld)
575 #endif
576 flags = c(flags);
578 if ( !is_hvm_vcpu(v) )
579 {
580 if ( !compat )
581 {
582 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
583 fixup_guest_stack_selector(d, c.nat->kernel_ss);
584 fixup_guest_code_selector(d, c.nat->user_regs.cs);
585 #ifdef __i386__
586 fixup_guest_code_selector(d, c.nat->event_callback_cs);
587 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
588 #endif
590 for ( i = 0; i < 256; i++ )
591 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
593 /* LDT safety checks. */
594 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
595 (c.nat->ldt_ents > 8192) ||
596 !array_access_ok(c.nat->ldt_base,
597 c.nat->ldt_ents,
598 LDT_ENTRY_SIZE) )
599 return -EINVAL;
600 }
601 #ifdef CONFIG_COMPAT
602 else
603 {
604 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
605 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
606 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
607 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
608 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
610 for ( i = 0; i < 256; i++ )
611 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
613 /* LDT safety checks. */
614 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
615 (c.cmp->ldt_ents > 8192) ||
616 !compat_array_access_ok(c.cmp->ldt_base,
617 c.cmp->ldt_ents,
618 LDT_ENTRY_SIZE) )
619 return -EINVAL;
620 }
621 #endif
622 }
624 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
626 v->arch.flags &= ~TF_kernel_mode;
627 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
628 v->arch.flags |= TF_kernel_mode;
630 if ( !compat )
631 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
632 #ifdef CONFIG_COMPAT
633 else
634 {
635 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
636 }
637 #endif
639 /* Only CR0.TS is modifiable by guest or admin. */
640 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
641 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
643 init_int80_direct_trap(v);
645 if ( !is_hvm_vcpu(v) )
646 {
647 /* IOPL privileges are virtualised. */
648 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
649 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
651 /* Ensure real hardware interrupts are enabled. */
652 v->arch.guest_context.user_regs.eflags |= EF_IE;
653 }
654 else
655 {
656 hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
657 }
659 if ( v->is_initialised )
660 goto out;
662 memset(v->arch.guest_context.debugreg, 0,
663 sizeof(v->arch.guest_context.debugreg));
664 for ( i = 0; i < 8; i++ )
665 (void)set_debugreg(v, i, c(debugreg[i]));
667 if ( v->vcpu_id == 0 )
668 d->vm_assist = c(vm_assist);
670 if ( !is_hvm_vcpu(v) )
671 {
672 if ( !compat )
673 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
674 #ifdef CONFIG_COMPAT
675 else
676 {
677 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
678 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
680 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
681 return -EINVAL;
682 for ( i = 0; i < n; ++i )
683 gdt_frames[i] = c.cmp->gdt_frames[i];
684 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
685 }
686 #endif
687 if ( rc != 0 )
688 return rc;
690 if ( !compat )
691 {
692 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
694 if ( !mfn_valid(cr3_pfn) ||
695 (paging_mode_refcounts(d)
696 ? !get_page(mfn_to_page(cr3_pfn), d)
697 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
698 PGT_base_page_table)) )
699 {
700 destroy_gdt(v);
701 return -EINVAL;
702 }
704 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
706 #ifdef __x86_64__
707 if ( c.nat->ctrlreg[1] )
708 {
709 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
711 if ( !mfn_valid(cr3_pfn) ||
712 (paging_mode_refcounts(d)
713 ? !get_page(mfn_to_page(cr3_pfn), d)
714 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
715 PGT_base_page_table)) )
716 {
717 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
718 v->arch.guest_table = pagetable_null();
719 if ( paging_mode_refcounts(d) )
720 put_page(mfn_to_page(cr3_pfn));
721 else
722 put_page_and_type(mfn_to_page(cr3_pfn));
723 destroy_gdt(v);
724 return -EINVAL;
725 }
727 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
728 }
729 #endif
730 }
731 #ifdef CONFIG_COMPAT
732 else
733 {
734 l4_pgentry_t *l4tab;
736 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
738 if ( !mfn_valid(cr3_pfn) ||
739 (paging_mode_refcounts(d)
740 ? !get_page(mfn_to_page(cr3_pfn), d)
741 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
742 PGT_l3_page_table)) )
743 {
744 destroy_gdt(v);
745 return -EINVAL;
746 }
748 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
749 *l4tab = l4e_from_pfn(cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
750 }
751 #endif
752 }
754 if ( v->vcpu_id == 0 )
755 update_domain_wallclock_time(d);
757 /* Don't redo final setup */
758 v->is_initialised = 1;
760 if ( paging_mode_enabled(d) )
761 paging_update_paging_modes(v);
763 update_cr3(v);
765 out:
766 if ( flags & VGCF_online )
767 clear_bit(_VPF_down, &v->pause_flags);
768 else
769 set_bit(_VPF_down, &v->pause_flags);
770 return 0;
771 #undef c
772 }
774 int arch_vcpu_reset(struct vcpu *v)
775 {
776 destroy_gdt(v);
777 vcpu_destroy_pagetables(v);
778 return 0;
779 }
781 /*
782 * Unmap the vcpu info page if the guest decided to place it somewhere
783 * else. This is only used from arch_domain_destroy, so there's no
784 * need to do anything clever.
785 */
786 static void
787 unmap_vcpu_info(struct vcpu *v)
788 {
789 struct domain *d = v->domain;
790 unsigned long mfn;
792 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
793 return;
795 mfn = v->arch.vcpu_info_mfn;
796 unmap_domain_page_global(v->vcpu_info);
798 v->vcpu_info = shared_info_addr(d, vcpu_info[v->vcpu_id]);
799 v->arch.vcpu_info_mfn = INVALID_MFN;
801 put_page_and_type(mfn_to_page(mfn));
802 }
804 /*
805 * Map a guest page in and point the vcpu_info pointer at it. This
806 * makes sure that the vcpu_info is always pointing at a valid piece
807 * of memory, and it sets a pending event to make sure that a pending
808 * event doesn't get missed.
809 */
810 static int
811 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
812 {
813 struct domain *d = v->domain;
814 void *mapping;
815 vcpu_info_t *new_info;
816 int i;
818 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
819 return -EINVAL;
821 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
822 return -EINVAL;
824 /* Run this command on yourself or on other offline VCPUS. */
825 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
826 return -EINVAL;
828 mfn = gmfn_to_mfn(d, mfn);
829 if ( !mfn_valid(mfn) ||
830 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
831 return -EINVAL;
833 mapping = map_domain_page_global(mfn);
834 if ( mapping == NULL )
835 {
836 put_page_and_type(mfn_to_page(mfn));
837 return -ENOMEM;
838 }
840 new_info = (vcpu_info_t *)(mapping + offset);
842 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
844 v->vcpu_info = new_info;
845 v->arch.vcpu_info_mfn = mfn;
847 /* Set new vcpu_info pointer /before/ setting pending flags. */
848 wmb();
850 /*
851 * Mark everything as being pending just to make sure nothing gets
852 * lost. The domain will get a spurious event, but it can cope.
853 */
854 vcpu_info(v, evtchn_upcall_pending) = 1;
855 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
856 set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
858 /*
859 * Only bother to update time for the current vcpu. If we're
860 * operating on another vcpu, then it had better not be running at
861 * the time.
862 */
863 if ( v == current )
864 update_vcpu_system_time(v);
866 return 0;
867 }
869 long
870 arch_do_vcpu_op(
871 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
872 {
873 long rc = 0;
875 switch ( cmd )
876 {
877 case VCPUOP_register_runstate_memory_area:
878 {
879 struct vcpu_register_runstate_memory_area area;
880 struct vcpu_runstate_info runstate;
882 rc = -EFAULT;
883 if ( copy_from_guest(&area, arg, 1) )
884 break;
886 if ( !guest_handle_okay(area.addr.h, 1) )
887 break;
889 rc = 0;
890 runstate_guest(v) = area.addr.h;
892 if ( v == current )
893 {
894 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
895 }
896 else
897 {
898 vcpu_runstate_get(v, &runstate);
899 __copy_to_guest(runstate_guest(v), &runstate, 1);
900 }
902 break;
903 }
905 case VCPUOP_register_vcpu_info:
906 {
907 struct domain *d = v->domain;
908 struct vcpu_register_vcpu_info info;
910 rc = -EFAULT;
911 if ( copy_from_guest(&info, arg, 1) )
912 break;
914 LOCK_BIGLOCK(d);
915 rc = map_vcpu_info(v, info.mfn, info.offset);
916 UNLOCK_BIGLOCK(d);
918 break;
919 }
921 default:
922 rc = -ENOSYS;
923 break;
924 }
926 return rc;
927 }
929 #ifdef __x86_64__
931 #define loadsegment(seg,value) ({ \
932 int __r = 1; \
933 __asm__ __volatile__ ( \
934 "1: movl %k1,%%" #seg "\n2:\n" \
935 ".section .fixup,\"ax\"\n" \
936 "3: xorl %k0,%k0\n" \
937 " movl %k0,%%" #seg "\n" \
938 " jmp 2b\n" \
939 ".previous\n" \
940 ".section __ex_table,\"a\"\n" \
941 " .align 8\n" \
942 " .quad 1b,3b\n" \
943 ".previous" \
944 : "=r" (__r) : "r" (value), "0" (__r) );\
945 __r; })
947 /*
948 * save_segments() writes a mask of segments which are dirty (non-zero),
949 * allowing load_segments() to avoid some expensive segment loads and
950 * MSR writes.
951 */
952 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
953 #define DIRTY_DS 0x01
954 #define DIRTY_ES 0x02
955 #define DIRTY_FS 0x04
956 #define DIRTY_GS 0x08
957 #define DIRTY_FS_BASE 0x10
958 #define DIRTY_GS_BASE_USER 0x20
960 static void load_segments(struct vcpu *n)
961 {
962 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
963 int all_segs_okay = 1;
964 unsigned int dirty_segment_mask, cpu = smp_processor_id();
966 /* Load and clear the dirty segment mask. */
967 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
968 per_cpu(dirty_segment_mask, cpu) = 0;
970 /* Either selector != 0 ==> reload. */
971 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
972 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
974 /* Either selector != 0 ==> reload. */
975 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
976 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
978 /*
979 * Either selector != 0 ==> reload.
980 * Also reload to reset FS_BASE if it was non-zero.
981 */
982 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
983 nctxt->user_regs.fs) )
984 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
986 /*
987 * Either selector != 0 ==> reload.
988 * Also reload to reset GS_BASE if it was non-zero.
989 */
990 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
991 nctxt->user_regs.gs) )
992 {
993 /* Reset GS_BASE with user %gs? */
994 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
995 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
996 }
998 if ( !is_pv_32on64_domain(n->domain) )
999 {
1000 /* This can only be non-zero if selector is NULL. */
1001 if ( nctxt->fs_base )
1002 wrmsr(MSR_FS_BASE,
1003 nctxt->fs_base,
1004 nctxt->fs_base>>32);
1006 /* Most kernels have non-zero GS base, so don't bother testing. */
1007 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1008 wrmsr(MSR_SHADOW_GS_BASE,
1009 nctxt->gs_base_kernel,
1010 nctxt->gs_base_kernel>>32);
1012 /* This can only be non-zero if selector is NULL. */
1013 if ( nctxt->gs_base_user )
1014 wrmsr(MSR_GS_BASE,
1015 nctxt->gs_base_user,
1016 nctxt->gs_base_user>>32);
1018 /* If in kernel mode then switch the GS bases around. */
1019 if ( (n->arch.flags & TF_kernel_mode) )
1020 __asm__ __volatile__ ( "swapgs" );
1021 }
1023 if ( unlikely(!all_segs_okay) )
1024 {
1025 struct cpu_user_regs *regs = guest_cpu_user_regs();
1026 unsigned long *rsp =
1027 (n->arch.flags & TF_kernel_mode) ?
1028 (unsigned long *)regs->rsp :
1029 (unsigned long *)nctxt->kernel_sp;
1030 unsigned long cs_and_mask, rflags;
1032 if ( is_pv_32on64_domain(n->domain) )
1033 {
1034 unsigned int *esp = ring_1(regs) ?
1035 (unsigned int *)regs->rsp :
1036 (unsigned int *)nctxt->kernel_sp;
1037 unsigned int cs_and_mask, eflags;
1038 int ret = 0;
1040 /* CS longword also contains full evtchn_upcall_mask. */
1041 cs_and_mask = (unsigned short)regs->cs |
1042 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1043 /* Fold upcall mask into RFLAGS.IF. */
1044 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1045 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1047 if ( !ring_1(regs) )
1048 {
1049 ret = put_user(regs->ss, esp-1);
1050 ret |= put_user(regs->_esp, esp-2);
1051 esp -= 2;
1052 }
1054 if ( ret |
1055 put_user(eflags, esp-1) |
1056 put_user(cs_and_mask, esp-2) |
1057 put_user(regs->_eip, esp-3) |
1058 put_user(nctxt->user_regs.gs, esp-4) |
1059 put_user(nctxt->user_regs.fs, esp-5) |
1060 put_user(nctxt->user_regs.es, esp-6) |
1061 put_user(nctxt->user_regs.ds, esp-7) )
1062 {
1063 gdprintk(XENLOG_ERR, "Error while creating compat "
1064 "failsafe callback frame.\n");
1065 domain_crash(n->domain);
1066 }
1068 if ( test_bit(_VGCF_failsafe_disables_events,
1069 &n->arch.guest_context.flags) )
1070 vcpu_info(n, evtchn_upcall_mask) = 1;
1072 regs->entry_vector = TRAP_syscall;
1073 regs->_eflags &= 0xFFFCBEFFUL;
1074 regs->ss = FLAT_COMPAT_KERNEL_SS;
1075 regs->_esp = (unsigned long)(esp-7);
1076 regs->cs = FLAT_COMPAT_KERNEL_CS;
1077 regs->_eip = nctxt->failsafe_callback_eip;
1078 return;
1079 }
1081 if ( !(n->arch.flags & TF_kernel_mode) )
1082 toggle_guest_mode(n);
1083 else
1084 regs->cs &= ~3;
1086 /* CS longword also contains full evtchn_upcall_mask. */
1087 cs_and_mask = (unsigned long)regs->cs |
1088 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1090 /* Fold upcall mask into RFLAGS.IF. */
1091 rflags = regs->rflags & ~X86_EFLAGS_IF;
1092 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1094 if ( put_user(regs->ss, rsp- 1) |
1095 put_user(regs->rsp, rsp- 2) |
1096 put_user(rflags, rsp- 3) |
1097 put_user(cs_and_mask, rsp- 4) |
1098 put_user(regs->rip, rsp- 5) |
1099 put_user(nctxt->user_regs.gs, rsp- 6) |
1100 put_user(nctxt->user_regs.fs, rsp- 7) |
1101 put_user(nctxt->user_regs.es, rsp- 8) |
1102 put_user(nctxt->user_regs.ds, rsp- 9) |
1103 put_user(regs->r11, rsp-10) |
1104 put_user(regs->rcx, rsp-11) )
1105 {
1106 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1107 "callback frame.\n");
1108 domain_crash(n->domain);
1109 }
1111 if ( test_bit(_VGCF_failsafe_disables_events,
1112 &n->arch.guest_context.flags) )
1113 vcpu_info(n, evtchn_upcall_mask) = 1;
1115 regs->entry_vector = TRAP_syscall;
1116 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1117 X86_EFLAGS_NT|X86_EFLAGS_TF);
1118 regs->ss = FLAT_KERNEL_SS;
1119 regs->rsp = (unsigned long)(rsp-11);
1120 regs->cs = FLAT_KERNEL_CS;
1121 regs->rip = nctxt->failsafe_callback_eip;
1122 }
1123 }
1125 static void save_segments(struct vcpu *v)
1126 {
1127 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1128 struct cpu_user_regs *regs = &ctxt->user_regs;
1129 unsigned int dirty_segment_mask = 0;
1131 regs->ds = read_segment_register(ds);
1132 regs->es = read_segment_register(es);
1133 regs->fs = read_segment_register(fs);
1134 regs->gs = read_segment_register(gs);
1136 if ( regs->ds )
1137 dirty_segment_mask |= DIRTY_DS;
1139 if ( regs->es )
1140 dirty_segment_mask |= DIRTY_ES;
1142 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1143 {
1144 dirty_segment_mask |= DIRTY_FS;
1145 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1146 }
1147 else if ( ctxt->fs_base )
1148 {
1149 dirty_segment_mask |= DIRTY_FS_BASE;
1150 }
1152 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1153 {
1154 dirty_segment_mask |= DIRTY_GS;
1155 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1156 }
1157 else if ( ctxt->gs_base_user )
1158 {
1159 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1160 }
1162 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1163 }
1165 #define switch_kernel_stack(v) ((void)0)
1167 #elif defined(__i386__)
1169 #define load_segments(n) ((void)0)
1170 #define save_segments(p) ((void)0)
1172 static inline void switch_kernel_stack(struct vcpu *v)
1173 {
1174 struct tss_struct *tss = &init_tss[smp_processor_id()];
1175 tss->esp1 = v->arch.guest_context.kernel_sp;
1176 tss->ss1 = v->arch.guest_context.kernel_ss;
1177 }
1179 #endif /* __i386__ */
1181 static void paravirt_ctxt_switch_from(struct vcpu *v)
1182 {
1183 save_segments(v);
1184 }
1186 static void paravirt_ctxt_switch_to(struct vcpu *v)
1187 {
1188 set_int80_direct_trap(v);
1189 switch_kernel_stack(v);
1190 }
1192 #define loaddebug(_v,_reg) \
1193 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
1195 static void __context_switch(void)
1196 {
1197 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1198 unsigned int cpu = smp_processor_id();
1199 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1200 struct vcpu *n = current;
1202 ASSERT(p != n);
1203 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1205 if ( !is_idle_vcpu(p) )
1206 {
1207 memcpy(&p->arch.guest_context.user_regs,
1208 stack_regs,
1209 CTXT_SWITCH_STACK_BYTES);
1210 unlazy_fpu(p);
1211 p->arch.ctxt_switch_from(p);
1212 }
1214 if ( !is_idle_vcpu(n) )
1215 {
1216 memcpy(stack_regs,
1217 &n->arch.guest_context.user_regs,
1218 CTXT_SWITCH_STACK_BYTES);
1220 /* Maybe switch the debug registers. */
1221 if ( unlikely(n->arch.guest_context.debugreg[7]) )
1222 {
1223 loaddebug(&n->arch.guest_context, 0);
1224 loaddebug(&n->arch.guest_context, 1);
1225 loaddebug(&n->arch.guest_context, 2);
1226 loaddebug(&n->arch.guest_context, 3);
1227 /* no 4 and 5 */
1228 loaddebug(&n->arch.guest_context, 6);
1229 loaddebug(&n->arch.guest_context, 7);
1230 }
1231 n->arch.ctxt_switch_to(n);
1232 }
1234 if ( p->domain != n->domain )
1235 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1236 cpu_set(cpu, n->vcpu_dirty_cpumask);
1238 write_ptbase(n);
1240 if ( p->vcpu_id != n->vcpu_id )
1241 {
1242 char gdt_load[10];
1243 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1244 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1245 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
1248 if ( p->domain != n->domain )
1249 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1250 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1252 per_cpu(curr_vcpu, cpu) = n;
1253 }
1256 void context_switch(struct vcpu *prev, struct vcpu *next)
1257 {
1258 unsigned int cpu = smp_processor_id();
1259 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1261 ASSERT(local_irq_is_enabled());
1263 /* Allow at most one CPU at a time to be dirty. */
1264 ASSERT(cpus_weight(dirty_mask) <= 1);
1265 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1266 {
1267 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1268 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1269 flush_tlb_mask(next->vcpu_dirty_cpumask);
1270 }
1272 local_irq_disable();
1274 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1275 pt_freeze_time(prev);
1277 set_current(next);
1279 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1280 {
1281 local_irq_enable();
1282 }
1283 else
1284 {
1285 __context_switch();
1287 #ifdef CONFIG_COMPAT
1288 if ( !is_hvm_vcpu(next) &&
1289 (is_idle_vcpu(prev) ||
1290 is_hvm_vcpu(prev) ||
1291 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1292 {
1293 uint64_t efer = read_efer();
1295 local_flush_tlb_one(GDT_VIRT_START(next) +
1296 FIRST_RESERVED_GDT_BYTE);
1298 if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) )
1299 write_efer(efer ^ EFER_SCE);
1300 }
1301 #endif
1303 /* Re-enable interrupts before restoring state which may fault. */
1304 local_irq_enable();
1306 if ( !is_hvm_vcpu(next) )
1307 {
1308 load_LDT(next);
1309 load_segments(next);
1310 }
1311 }
1313 context_saved(prev);
1315 /* Update per-VCPU guest runstate shared memory area (if registered). */
1316 if ( !guest_handle_is_null(runstate_guest(next)) )
1317 {
1318 if ( !is_pv_32on64_domain(next->domain) )
1319 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1320 #ifdef CONFIG_COMPAT
1321 else
1322 {
1323 struct compat_vcpu_runstate_info info;
1325 XLAT_vcpu_runstate_info(&info, &next->runstate);
1326 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1327 }
1328 #endif
1329 }
1331 schedule_tail(next);
1332 BUG();
1333 }
1335 void continue_running(struct vcpu *same)
1336 {
1337 schedule_tail(same);
1338 BUG();
1339 }
1341 int __sync_lazy_execstate(void)
1342 {
1343 unsigned long flags;
1344 int switch_required;
1346 local_irq_save(flags);
1348 switch_required = (this_cpu(curr_vcpu) != current);
1350 if ( switch_required )
1351 {
1352 ASSERT(current == idle_vcpu[smp_processor_id()]);
1353 __context_switch();
1354 }
1356 local_irq_restore(flags);
1358 return switch_required;
1359 }
1361 void sync_vcpu_execstate(struct vcpu *v)
1362 {
1363 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1364 (void)__sync_lazy_execstate();
1366 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1367 flush_tlb_mask(v->vcpu_dirty_cpumask);
1368 }
1370 struct migrate_info {
1371 long (*func)(void *data);
1372 void *data;
1373 void (*saved_schedule_tail)(struct vcpu *);
1374 cpumask_t saved_affinity;
1375 };
1377 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
1378 {
1379 struct cpu_user_regs *regs = guest_cpu_user_regs();
1380 struct migrate_info *info = v->arch.continue_info;
1382 regs->eax = info->func(info->data);
1384 v->arch.schedule_tail = info->saved_schedule_tail;
1385 v->cpu_affinity = info->saved_affinity;
1387 xfree(info);
1388 v->arch.continue_info = NULL;
1390 vcpu_set_affinity(v, &v->cpu_affinity);
1391 schedule_tail(v);
1392 }
1394 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
1395 {
1396 struct vcpu *v = current;
1397 struct migrate_info *info;
1398 cpumask_t mask = cpumask_of_cpu(cpu);
1400 if ( cpu == smp_processor_id() )
1401 return func(data);
1403 info = xmalloc(struct migrate_info);
1404 if ( info == NULL )
1405 return -ENOMEM;
1407 info->func = func;
1408 info->data = data;
1409 info->saved_schedule_tail = v->arch.schedule_tail;
1410 v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
1412 info->saved_affinity = v->cpu_affinity;
1413 v->arch.continue_info = info;
1415 vcpu_set_affinity(v, &mask);
1417 /* Dummy return value will be overwritten by new schedule_tail. */
1418 BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
1419 return 0;
1420 }
1422 #define next_arg(fmt, args) ({ \
1423 unsigned long __arg; \
1424 switch ( *(fmt)++ ) \
1425 { \
1426 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1427 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1428 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1429 default: __arg = 0; BUG(); \
1430 } \
1431 __arg; \
1432 })
1434 DEFINE_PER_CPU(char, hc_preempted);
1436 unsigned long hypercall_create_continuation(
1437 unsigned int op, const char *format, ...)
1438 {
1439 struct mc_state *mcs = &this_cpu(mc_state);
1440 struct cpu_user_regs *regs;
1441 const char *p = format;
1442 unsigned long arg;
1443 unsigned int i;
1444 va_list args;
1446 va_start(args, format);
1448 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1449 {
1450 __set_bit(_MCSF_call_preempted, &mcs->flags);
1452 for ( i = 0; *p != '\0'; i++ )
1453 mcs->call.args[i] = next_arg(p, args);
1454 if ( is_pv_32on64_domain(current->domain) )
1455 {
1456 for ( ; i < 6; i++ )
1457 mcs->call.args[i] = 0;
1458 }
1459 }
1460 else
1461 {
1462 regs = guest_cpu_user_regs();
1463 regs->eax = op;
1464 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1466 #ifdef __x86_64__
1467 if ( !is_hvm_vcpu(current) ?
1468 !is_pv_32on64_vcpu(current) :
1469 (hvm_guest_x86_mode(current) == 8) )
1470 {
1471 for ( i = 0; *p != '\0'; i++ )
1472 {
1473 arg = next_arg(p, args);
1474 switch ( i )
1475 {
1476 case 0: regs->rdi = arg; break;
1477 case 1: regs->rsi = arg; break;
1478 case 2: regs->rdx = arg; break;
1479 case 3: regs->r10 = arg; break;
1480 case 4: regs->r8 = arg; break;
1481 case 5: regs->r9 = arg; break;
1482 }
1483 }
1484 }
1485 else
1486 #endif
1487 {
1488 if ( supervisor_mode_kernel )
1489 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1491 for ( i = 0; *p != '\0'; i++ )
1492 {
1493 arg = next_arg(p, args);
1494 switch ( i )
1495 {
1496 case 0: regs->ebx = arg; break;
1497 case 1: regs->ecx = arg; break;
1498 case 2: regs->edx = arg; break;
1499 case 3: regs->esi = arg; break;
1500 case 4: regs->edi = arg; break;
1501 case 5: regs->ebp = arg; break;
1502 }
1503 }
1504 }
1506 this_cpu(hc_preempted) = 1;
1507 }
1509 va_end(args);
1511 return op;
1512 }
1514 #ifdef CONFIG_COMPAT
1515 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1516 {
1517 int rc = 0;
1518 struct mc_state *mcs = &this_cpu(mc_state);
1519 struct cpu_user_regs *regs;
1520 unsigned int i, cval = 0;
1521 unsigned long nval = 0;
1522 va_list args;
1524 BUG_ON(*id > 5);
1525 BUG_ON(mask & (1U << *id));
1527 va_start(args, mask);
1529 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1530 {
1531 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1532 return 0;
1533 for ( i = 0; i < 6; ++i, mask >>= 1 )
1534 {
1535 if ( mask & 1 )
1536 {
1537 nval = va_arg(args, unsigned long);
1538 cval = va_arg(args, unsigned int);
1539 if ( cval == nval )
1540 mask &= ~1U;
1541 else
1542 BUG_ON(nval == (unsigned int)nval);
1543 }
1544 else if ( id && *id == i )
1545 {
1546 *id = mcs->call.args[i];
1547 id = NULL;
1548 }
1549 if ( (mask & 1) && mcs->call.args[i] == nval )
1550 {
1551 mcs->call.args[i] = cval;
1552 ++rc;
1553 }
1554 else
1555 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1556 }
1557 }
1558 else
1559 {
1560 regs = guest_cpu_user_regs();
1561 for ( i = 0; i < 6; ++i, mask >>= 1 )
1562 {
1563 unsigned long *reg;
1565 switch ( i )
1566 {
1567 case 0: reg = &regs->ebx; break;
1568 case 1: reg = &regs->ecx; break;
1569 case 2: reg = &regs->edx; break;
1570 case 3: reg = &regs->esi; break;
1571 case 4: reg = &regs->edi; break;
1572 case 5: reg = &regs->ebp; break;
1573 default: BUG(); reg = NULL; break;
1574 }
1575 if ( (mask & 1) )
1576 {
1577 nval = va_arg(args, unsigned long);
1578 cval = va_arg(args, unsigned int);
1579 if ( cval == nval )
1580 mask &= ~1U;
1581 else
1582 BUG_ON(nval == (unsigned int)nval);
1583 }
1584 else if ( id && *id == i )
1585 {
1586 *id = *reg;
1587 id = NULL;
1588 }
1589 if ( (mask & 1) && *reg == nval )
1590 {
1591 *reg = cval;
1592 ++rc;
1593 }
1594 else
1595 BUG_ON(*reg != (unsigned int)*reg);
1596 }
1597 }
1599 va_end(args);
1601 return rc;
1602 }
1603 #endif
1605 static int relinquish_memory(
1606 struct domain *d, struct list_head *list, unsigned long type)
1607 {
1608 struct list_head *ent;
1609 struct page_info *page;
1610 unsigned long x, y;
1611 int ret = 0;
1613 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1614 spin_lock_recursive(&d->page_alloc_lock);
1616 ent = list->next;
1617 while ( ent != list )
1618 {
1619 page = list_entry(ent, struct page_info, list);
1621 /* Grab a reference to the page so it won't disappear from under us. */
1622 if ( unlikely(!get_page(page, d)) )
1623 {
1624 /* Couldn't get a reference -- someone is freeing this page. */
1625 ent = ent->next;
1626 list_move_tail(&page->list, &d->arch.relmem_list);
1627 continue;
1628 }
1630 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1631 put_page_and_type(page);
1633 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1634 put_page(page);
1636 /*
1637 * Forcibly invalidate top-most, still valid page tables at this point
1638 * to break circular 'linear page table' references. This is okay
1639 * because MMU structures are not shared across domains and this domain
1640 * is now dead. Thus top-most valid tables are not in use so a non-zero
1641 * count means circular reference.
1642 */
1643 y = page->u.inuse.type_info;
1644 for ( ; ; )
1645 {
1646 x = y;
1647 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1648 (type|PGT_validated)) )
1649 break;
1651 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1652 if ( likely(y == x) )
1653 {
1654 free_page_type(page, type);
1655 break;
1656 }
1657 }
1659 /* Follow the list chain and /then/ potentially free the page. */
1660 ent = ent->next;
1661 list_move_tail(&page->list, &d->arch.relmem_list);
1662 put_page(page);
1664 if ( hypercall_preempt_check() )
1665 {
1666 ret = -EAGAIN;
1667 goto out;
1668 }
1669 }
1671 list_splice_init(&d->arch.relmem_list, list);
1673 out:
1674 spin_unlock_recursive(&d->page_alloc_lock);
1675 return ret;
1676 }
1678 static void vcpu_destroy_pagetables(struct vcpu *v)
1679 {
1680 struct domain *d = v->domain;
1681 unsigned long pfn;
1683 #ifdef __x86_64__
1684 if ( is_pv_32on64_vcpu(v) )
1685 {
1686 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1687 __va(pagetable_get_paddr(v->arch.guest_table)));
1689 if ( pfn != 0 )
1690 {
1691 if ( paging_mode_refcounts(d) )
1692 put_page(mfn_to_page(pfn));
1693 else
1694 put_page_and_type(mfn_to_page(pfn));
1695 }
1697 l4e_write(
1698 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1699 l4e_empty());
1701 v->arch.cr3 = 0;
1702 return;
1703 }
1704 #endif
1706 pfn = pagetable_get_pfn(v->arch.guest_table);
1707 if ( pfn != 0 )
1708 {
1709 if ( paging_mode_refcounts(d) )
1710 put_page(mfn_to_page(pfn));
1711 else
1712 put_page_and_type(mfn_to_page(pfn));
1713 #ifdef __x86_64__
1714 if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) )
1715 v->arch.guest_table_user = pagetable_null();
1716 #endif
1717 v->arch.guest_table = pagetable_null();
1718 }
1720 #ifdef __x86_64__
1721 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1722 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1723 if ( pfn != 0 )
1724 {
1725 if ( paging_mode_refcounts(d) )
1726 put_page(mfn_to_page(pfn));
1727 else
1728 put_page_and_type(mfn_to_page(pfn));
1729 v->arch.guest_table_user = pagetable_null();
1730 }
1731 #endif
1733 v->arch.cr3 = 0;
1734 }
1736 int domain_relinquish_resources(struct domain *d)
1737 {
1738 int ret;
1739 struct vcpu *v;
1741 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1743 switch ( d->arch.relmem )
1744 {
1745 case RELMEM_not_started:
1746 /* Tear down paging-assistance stuff. */
1747 paging_teardown(d);
1749 /* Drop the in-use references to page-table bases. */
1750 for_each_vcpu ( d, v )
1751 vcpu_destroy_pagetables(v);
1753 /*
1754 * Relinquish GDT mappings. No need for explicit unmapping of the LDT
1755 * as it automatically gets squashed when the guest's mappings go away.
1756 */
1757 for_each_vcpu(d, v)
1758 destroy_gdt(v);
1760 d->arch.relmem = RELMEM_xen_l4;
1761 /* fallthrough */
1763 /* Relinquish every page of memory. */
1764 #if CONFIG_PAGING_LEVELS >= 4
1765 case RELMEM_xen_l4:
1766 ret = relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1767 if ( ret )
1768 return ret;
1769 d->arch.relmem = RELMEM_dom_l4;
1770 /* fallthrough */
1771 case RELMEM_dom_l4:
1772 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1773 if ( ret )
1774 return ret;
1775 d->arch.relmem = RELMEM_xen_l3;
1776 /* fallthrough */
1777 #endif
1779 #if CONFIG_PAGING_LEVELS >= 3
1780 case RELMEM_xen_l3:
1781 ret = relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1782 if ( ret )
1783 return ret;
1784 d->arch.relmem = RELMEM_dom_l3;
1785 /* fallthrough */
1786 case RELMEM_dom_l3:
1787 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1788 if ( ret )
1789 return ret;
1790 d->arch.relmem = RELMEM_xen_l2;
1791 /* fallthrough */
1792 #endif
1794 case RELMEM_xen_l2:
1795 ret = relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1796 if ( ret )
1797 return ret;
1798 d->arch.relmem = RELMEM_dom_l2;
1799 /* fallthrough */
1800 case RELMEM_dom_l2:
1801 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1802 if ( ret )
1803 return ret;
1804 d->arch.relmem = RELMEM_done;
1805 /* fallthrough */
1807 case RELMEM_done:
1808 break;
1810 default:
1811 BUG();
1812 }
1814 /* Free page used by xen oprofile buffer. */
1815 free_xenoprof_pages(d);
1817 if ( is_hvm_domain(d) )
1818 hvm_domain_relinquish_resources(d);
1820 return 0;
1821 }
1823 void arch_dump_domain_info(struct domain *d)
1824 {
1825 paging_dump_domain_info(d);
1826 }
1828 void arch_dump_vcpu_info(struct vcpu *v)
1829 {
1830 paging_dump_vcpu_info(v);
1831 }
1833 /*
1834 * Local variables:
1835 * mode: C
1836 * c-set-style: "BSD"
1837 * c-basic-offset: 4
1838 * tab-width: 4
1839 * indent-tabs-mode: nil
1840 * End:
1841 */