ia64/xen-unstable

view xen/arch/x86/domain.c @ 15425:79b180596baf

x86: introduce specialized clear_page()

More than doubles the performance of page clearing on all but the
oldest processors (those with SSE2 support).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Wed Jun 20 16:18:03 2007 +0100 (2007-06-20)
parents 005dd6b1cf8e
children ecb89c6ce615
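
The description above attributes the speed-up to SSE2 non-temporal stores. The changeset itself implements the new clear_page() in assembly; the C rendering below is only an illustrative sketch of the idea for 64-bit builds (the function name, constants and GCC inline-asm form are assumptions, not the patched code).

/* Illustrative sketch: zero one 4KB page with SSE2 non-temporal stores
 * (movnti) so the written lines bypass the data cache, then order the
 * stores with sfence. Assumes x86-64 and GCC-style inline assembly. */
static void clear_page_sse2_sketch(void *page)
{
    unsigned int i;

    for ( i = 0; i < 4096; i += 32 )
        __asm__ __volatile__ (
            "movnti %1, (%0)\n\t"
            "movnti %1, 8(%0)\n\t"
            "movnti %1, 16(%0)\n\t"
            "movnti %1, 24(%0)\n\t"
            : : "r" ((char *)page + i), "r" (0UL) : "memory" );

    __asm__ __volatile__ ( "sfence" : : : "memory" );
}

In practice such a path would only be selected at runtime when CPUID reports SSE2, with a fallback to an ordinary zeroing loop on older processors.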
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <asm/regs.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/system.h>
35 #include <asm/io.h>
36 #include <asm/processor.h>
37 #include <asm/desc.h>
38 #include <asm/i387.h>
39 #include <asm/mpspec.h>
40 #include <asm/ldt.h>
41 #include <asm/paging.h>
42 #include <asm/hypercall.h>
43 #include <asm/hvm/hvm.h>
44 #include <asm/hvm/support.h>
45 #include <asm/msr.h>
46 #ifdef CONFIG_COMPAT
47 #include <compat/vcpu.h>
48 #endif
50 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
51 DEFINE_PER_CPU(__u64, efer);
53 static void unmap_vcpu_info(struct vcpu *v);
55 static void paravirt_ctxt_switch_from(struct vcpu *v);
56 static void paravirt_ctxt_switch_to(struct vcpu *v);
58 static void vcpu_destroy_pagetables(struct vcpu *v);
60 static void continue_idle_domain(struct vcpu *v)
61 {
62 reset_stack_and_jump(idle_loop);
63 }
65 static void continue_nonidle_domain(struct vcpu *v)
66 {
67 reset_stack_and_jump(ret_from_intr);
68 }
70 static void default_idle(void)
71 {
72 local_irq_disable();
73 if ( !softirq_pending(smp_processor_id()) )
74 safe_halt();
75 else
76 local_irq_enable();
77 }
79 void idle_loop(void)
80 {
81 for ( ; ; )
82 {
83 page_scrub_schedule_work();
84 default_idle();
85 do_softirq();
86 }
87 }
89 void startup_cpu_idle_loop(void)
90 {
91 struct vcpu *v = current;
93 ASSERT(is_idle_vcpu(v));
94 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
95 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
97 reset_stack_and_jump(idle_loop);
98 }
100 void dump_pageframe_info(struct domain *d)
101 {
102 struct page_info *page;
104 printk("Memory pages belonging to domain %u:\n", d->domain_id);
106 if ( d->tot_pages >= 10 )
107 {
108 printk(" DomPage list too long to display\n");
109 }
110 else
111 {
112 list_for_each_entry ( page, &d->page_list, list )
113 {
114 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
115 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
116 page->count_info, page->u.inuse.type_info);
117 }
118 }
120 list_for_each_entry ( page, &d->xenpage_list, list )
121 {
122 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
123 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
124 page->count_info, page->u.inuse.type_info);
125 }
126 }
128 struct vcpu *alloc_vcpu_struct(void)
129 {
130 struct vcpu *v;
131 if ( (v = xmalloc(struct vcpu)) != NULL )
132 memset(v, 0, sizeof(*v));
133 return v;
134 }
136 void free_vcpu_struct(struct vcpu *v)
137 {
138 xfree(v);
139 }
141 #ifdef CONFIG_COMPAT
143 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
144 {
145 struct domain *d = v->domain;
146 unsigned i;
147 struct page_info *pg;
149 if ( !d->arch.mm_arg_xlat_l3 )
150 {
151 pg = alloc_domheap_page(NULL);
152 if ( !pg )
153 return -ENOMEM;
154 d->arch.mm_arg_xlat_l3 = page_to_virt(pg);
155 clear_page(d->arch.mm_arg_xlat_l3);
156 }
158 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
159 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
161 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
162 {
163 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
164 l2_pgentry_t *l2tab;
165 l1_pgentry_t *l1tab;
167 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
168 {
169 pg = alloc_domheap_page(NULL);
170 if ( !pg )
171 return -ENOMEM;
172 clear_page(page_to_virt(pg));
173 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
174 }
175 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
176 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
177 {
178 pg = alloc_domheap_page(NULL);
179 if ( !pg )
180 return -ENOMEM;
181 clear_page(page_to_virt(pg));
182 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
183 }
184 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
185 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
186 pg = alloc_domheap_page(NULL);
187 if ( !pg )
188 return -ENOMEM;
189 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
190 }
192 return 0;
193 }
195 static void release_arg_xlat_area(struct domain *d)
196 {
197 if ( d->arch.mm_arg_xlat_l3 )
198 {
199 unsigned l3;
201 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
202 {
203 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
204 {
205 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
206 unsigned l2;
208 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
209 {
210 if ( l2e_get_intpte(l2tab[l2]) )
211 {
212 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
213 unsigned l1;
215 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
216 {
217 if ( l1e_get_intpte(l1tab[l1]) )
218 free_domheap_page(l1e_get_page(l1tab[l1]));
219 }
220 free_domheap_page(l2e_get_page(l2tab[l2]));
221 }
222 }
223 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
224 }
225 }
226 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
227 }
228 }
230 static int setup_compat_l4(struct vcpu *v)
231 {
232 struct page_info *pg = alloc_domheap_page(NULL);
233 l4_pgentry_t *l4tab;
234 int rc;
236 if ( pg == NULL )
237 return -ENOMEM;
239 /* This page needs to look like a pagetable so that it can be shadowed */
240 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
242 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
243 l4tab[0] = l4e_empty();
244 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
245 l4e_from_page(pg, __PAGE_HYPERVISOR);
246 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
247 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
248 __PAGE_HYPERVISOR);
250 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
251 {
252 free_domheap_page(pg);
253 return rc;
254 }
256 v->arch.guest_table = pagetable_from_page(pg);
257 v->arch.guest_table_user = v->arch.guest_table;
259 return 0;
260 }
262 static void release_compat_l4(struct vcpu *v)
263 {
264 free_domheap_page(pagetable_get_page(v->arch.guest_table));
265 v->arch.guest_table = pagetable_null();
266 v->arch.guest_table_user = pagetable_null();
267 }
269 static inline int may_switch_mode(struct domain *d)
270 {
271 return (!is_hvm_domain(d) && (d->tot_pages == 0));
272 }
274 int switch_native(struct domain *d)
275 {
276 l1_pgentry_t gdt_l1e;
277 unsigned int vcpuid;
279 if ( d == NULL )
280 return -EINVAL;
281 if ( !may_switch_mode(d) )
282 return -EACCES;
283 if ( !is_pv_32on64_domain(d) )
284 return 0;
286 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
287 release_arg_xlat_area(d);
289 /* switch gdt */
290 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
291 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
292 {
293 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
294 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
295 if (d->vcpu[vcpuid])
296 release_compat_l4(d->vcpu[vcpuid]);
297 }
299 d->arch.physaddr_bitsize = 64;
301 return 0;
302 }
304 int switch_compat(struct domain *d)
305 {
306 l1_pgentry_t gdt_l1e;
307 unsigned int vcpuid;
309 if ( d == NULL )
310 return -EINVAL;
311 if ( compat_disabled )
312 return -ENOSYS;
313 if ( !may_switch_mode(d) )
314 return -EACCES;
315 if ( is_pv_32on64_domain(d) )
316 return 0;
318 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
320 /* switch gdt */
321 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
322 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
323 {
324 if ( (d->vcpu[vcpuid] != NULL) &&
325 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
326 goto undo_and_fail;
327 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
328 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
329 }
331 d->arch.physaddr_bitsize =
332 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
333 + (PAGE_SIZE - 2);
335 return 0;
337 undo_and_fail:
338 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
339 release_arg_xlat_area(d);
340 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
341 while ( vcpuid-- != 0 )
342 {
343 if ( d->vcpu[vcpuid] != NULL )
344 release_compat_l4(d->vcpu[vcpuid]);
345 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
346 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
347 }
348 return -ENOMEM;
349 }
351 #else
352 #define release_arg_xlat_area(d) ((void)0)
353 #define setup_compat_l4(v) 0
354 #define release_compat_l4(v) ((void)0)
355 #endif
357 int vcpu_initialise(struct vcpu *v)
358 {
359 struct domain *d = v->domain;
360 int rc;
362 v->arch.vcpu_info_mfn = INVALID_MFN;
364 v->arch.flags = TF_kernel_mode;
366 pae_l3_cache_init(&v->arch.pae_l3_cache);
368 paging_vcpu_init(v);
370 if ( is_hvm_domain(d) )
371 {
372 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
373 return rc;
374 }
375 else
376 {
377 /* PV guests by default have a 100Hz ticker. */
378 v->periodic_period = MILLISECS(10);
380 /* PV guests get an emulated PIT too for video BIOSes to use. */
381 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
382 pit_init(v, cpu_khz);
384 v->arch.schedule_tail = continue_nonidle_domain;
385 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
386 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
388 if ( is_idle_domain(d) )
389 {
390 v->arch.schedule_tail = continue_idle_domain;
391 v->arch.cr3 = __pa(idle_pg_table);
392 }
393 }
395 v->arch.perdomain_ptes =
396 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
398 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
399 }
401 void vcpu_destroy(struct vcpu *v)
402 {
403 if ( is_pv_32on64_vcpu(v) )
404 release_compat_l4(v);
406 unmap_vcpu_info(v);
408 if ( is_hvm_vcpu(v) )
409 hvm_vcpu_destroy(v);
410 }
412 int arch_domain_create(struct domain *d)
413 {
414 #ifdef __x86_64__
415 struct page_info *pg;
416 int i;
417 #endif
418 l1_pgentry_t gdt_l1e;
419 int vcpuid, pdpt_order, paging_initialised = 0;
420 int rc = -ENOMEM;
422 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
423 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
424 if ( d->arch.mm_perdomain_pt == NULL )
425 goto fail;
426 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
428 /*
429 * Map Xen segments into every VCPU's GDT, irrespective of whether every
430 * VCPU will actually be used. This avoids an NMI race during context
431 * switch: if we take an interrupt after switching CR3 but before switching
432 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
433 * try to load CS from an invalid table.
434 */
435 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
436 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
437 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
438 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
440 #if defined(__i386__)
442 mapcache_init(d);
444 #else /* __x86_64__ */
446 if ( (pg = alloc_domheap_page(NULL)) == NULL )
447 goto fail;
448 d->arch.mm_perdomain_l2 = page_to_virt(pg);
449 clear_page(d->arch.mm_perdomain_l2);
450 for ( i = 0; i < (1 << pdpt_order); i++ )
451 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
452 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
453 __PAGE_HYPERVISOR);
455 if ( (pg = alloc_domheap_page(NULL)) == NULL )
456 goto fail;
457 d->arch.mm_perdomain_l3 = page_to_virt(pg);
458 clear_page(d->arch.mm_perdomain_l3);
459 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
460 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
461 __PAGE_HYPERVISOR);
463 #endif /* __x86_64__ */
465 #ifdef CONFIG_COMPAT
466 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
467 #endif
469 paging_domain_init(d);
470 paging_initialised = 1;
472 if ( !is_idle_domain(d) )
473 {
474 d->arch.ioport_caps =
475 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
476 if ( d->arch.ioport_caps == NULL )
477 goto fail;
479 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
480 goto fail;
482 clear_page(d->shared_info);
483 share_xen_page_with_guest(
484 virt_to_page(d->shared_info), d, XENSHARE_writable);
485 }
487 if ( is_hvm_domain(d) )
488 {
489 if ( (rc = hvm_domain_initialise(d)) != 0 )
490 goto fail;
491 }
492 else
493 {
494 /* 32-bit PV guest by default only if Xen is not 64-bit. */
495 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
496 (CONFIG_PAGING_LEVELS != 4);
497 }
499 return 0;
501 fail:
502 free_xenheap_page(d->shared_info);
503 if ( paging_initialised )
504 paging_final_teardown(d);
505 #ifdef __x86_64__
506 if ( d->arch.mm_perdomain_l2 )
507 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
508 if ( d->arch.mm_perdomain_l3 )
509 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
510 #endif
511 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
512 return rc;
513 }
515 void arch_domain_destroy(struct domain *d)
516 {
517 if ( is_hvm_domain(d) )
518 hvm_domain_destroy(d);
520 paging_final_teardown(d);
522 free_xenheap_pages(
523 d->arch.mm_perdomain_pt,
524 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
526 #ifdef __x86_64__
527 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
528 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
529 #endif
531 if ( is_pv_32on64_domain(d) )
532 release_arg_xlat_area(d);
534 free_xenheap_page(d->shared_info);
535 }
537 /* This is called by arch_final_setup_guest and do_boot_vcpu */
538 int arch_set_info_guest(
539 struct vcpu *v, vcpu_guest_context_u c)
540 {
541 struct domain *d = v->domain;
542 unsigned long cr3_pfn = INVALID_MFN;
543 unsigned long flags;
544 int i, rc = 0, compat;
546 /* The context is a compat-mode one if the target domain is compat-mode;
547 * we expect the tools to DTRT even in compat-mode callers. */
548 compat = is_pv_32on64_domain(d);
550 #ifdef CONFIG_COMPAT
551 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
552 #else
553 #define c(fld) (c.nat->fld)
554 #endif
555 flags = c(flags);
557 if ( !is_hvm_vcpu(v) )
558 {
559 if ( !compat )
560 {
561 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
562 fixup_guest_stack_selector(d, c.nat->kernel_ss);
563 fixup_guest_code_selector(d, c.nat->user_regs.cs);
564 #ifdef __i386__
565 fixup_guest_code_selector(d, c.nat->event_callback_cs);
566 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
567 #endif
569 for ( i = 0; i < 256; i++ )
570 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
572 /* LDT safety checks. */
573 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
574 (c.nat->ldt_ents > 8192) ||
575 !array_access_ok(c.nat->ldt_base,
576 c.nat->ldt_ents,
577 LDT_ENTRY_SIZE) )
578 return -EINVAL;
579 }
580 #ifdef CONFIG_COMPAT
581 else
582 {
583 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
584 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
585 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
586 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
587 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
589 for ( i = 0; i < 256; i++ )
590 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
592 /* LDT safety checks. */
593 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
594 (c.cmp->ldt_ents > 8192) ||
595 !compat_array_access_ok(c.cmp->ldt_base,
596 c.cmp->ldt_ents,
597 LDT_ENTRY_SIZE) )
598 return -EINVAL;
599 }
600 #endif
601 }
603 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
605 v->arch.flags &= ~TF_kernel_mode;
606 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
607 v->arch.flags |= TF_kernel_mode;
609 if ( !compat )
610 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
611 #ifdef CONFIG_COMPAT
612 else
613 {
614 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
615 }
616 #endif
618 /* Only CR0.TS is modifiable by guest or admin. */
619 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
620 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
622 init_int80_direct_trap(v);
624 if ( !is_hvm_vcpu(v) )
625 {
626 /* IOPL privileges are virtualised. */
627 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
628 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
630 /* Ensure real hardware interrupts are enabled. */
631 v->arch.guest_context.user_regs.eflags |= EF_IE;
632 }
633 else
634 {
635 hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
636 }
638 if ( v->is_initialised )
639 goto out;
641 memset(v->arch.guest_context.debugreg, 0,
642 sizeof(v->arch.guest_context.debugreg));
643 for ( i = 0; i < 8; i++ )
644 (void)set_debugreg(v, i, c(debugreg[i]));
646 if ( v->vcpu_id == 0 )
647 d->vm_assist = c(vm_assist);
649 if ( !is_hvm_vcpu(v) )
650 {
651 if ( !compat )
652 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
653 #ifdef CONFIG_COMPAT
654 else
655 {
656 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
657 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
659 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
660 return -EINVAL;
661 for ( i = 0; i < n; ++i )
662 gdt_frames[i] = c.cmp->gdt_frames[i];
663 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
664 }
665 #endif
666 if ( rc != 0 )
667 return rc;
669 if ( !compat )
670 {
671 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
673 if ( !mfn_valid(cr3_pfn) ||
674 (paging_mode_refcounts(d)
675 ? !get_page(mfn_to_page(cr3_pfn), d)
676 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
677 PGT_base_page_table)) )
678 {
679 destroy_gdt(v);
680 return -EINVAL;
681 }
683 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
685 #ifdef __x86_64__
686 if ( c.nat->ctrlreg[1] )
687 {
688 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
690 if ( !mfn_valid(cr3_pfn) ||
691 (paging_mode_refcounts(d)
692 ? !get_page(mfn_to_page(cr3_pfn), d)
693 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
694 PGT_base_page_table)) )
695 {
696 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
697 v->arch.guest_table = pagetable_null();
698 if ( paging_mode_refcounts(d) )
699 put_page(mfn_to_page(cr3_pfn));
700 else
701 put_page_and_type(mfn_to_page(cr3_pfn));
702 destroy_gdt(v);
703 return -EINVAL;
704 }
706 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
707 }
708 #endif
709 }
710 #ifdef CONFIG_COMPAT
711 else
712 {
713 l4_pgentry_t *l4tab;
715 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
717 if ( !mfn_valid(cr3_pfn) ||
718 (paging_mode_refcounts(d)
719 ? !get_page(mfn_to_page(cr3_pfn), d)
720 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
721 PGT_l3_page_table)) )
722 {
723 destroy_gdt(v);
724 return -EINVAL;
725 }
727 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
728 *l4tab = l4e_from_pfn(cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
729 }
730 #endif
731 }
733 if ( v->vcpu_id == 0 )
734 update_domain_wallclock_time(d);
736 /* Don't redo final setup */
737 v->is_initialised = 1;
739 if ( paging_mode_enabled(d) )
740 paging_update_paging_modes(v);
742 update_cr3(v);
744 out:
745 if ( flags & VGCF_online )
746 clear_bit(_VPF_down, &v->pause_flags);
747 else
748 set_bit(_VPF_down, &v->pause_flags);
749 return 0;
750 #undef c
751 }
753 int arch_vcpu_reset(struct vcpu *v)
754 {
755 destroy_gdt(v);
756 vcpu_destroy_pagetables(v);
757 return 0;
758 }
760 /*
761 * Unmap the vcpu info page if the guest decided to place it somewhere
762 * else. This is only used from arch_domain_destroy, so there's no
763 * need to do anything clever.
764 */
765 static void
766 unmap_vcpu_info(struct vcpu *v)
767 {
768 struct domain *d = v->domain;
769 unsigned long mfn;
771 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
772 return;
774 mfn = v->arch.vcpu_info_mfn;
775 unmap_domain_page_global(v->vcpu_info);
777 v->vcpu_info = shared_info_addr(d, vcpu_info[v->vcpu_id]);
778 v->arch.vcpu_info_mfn = INVALID_MFN;
780 put_page_and_type(mfn_to_page(mfn));
781 }
783 /*
784 * Map a guest page in and point the vcpu_info pointer at it. This
785 * makes sure that the vcpu_info is always pointing at a valid piece
786 * of memory, and it sets a pending event to make sure that a pending
787 * event doesn't get missed.
788 */
789 static int
790 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
791 {
792 struct domain *d = v->domain;
793 void *mapping;
794 vcpu_info_t *new_info;
795 int i;
797 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
798 return -EINVAL;
800 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
801 return -EINVAL;
803 /* Run this command on yourself or on other offline VCPUS. */
804 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
805 return -EINVAL;
807 mfn = gmfn_to_mfn(d, mfn);
808 if ( !mfn_valid(mfn) ||
809 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
810 return -EINVAL;
812 mapping = map_domain_page_global(mfn);
813 if ( mapping == NULL )
814 {
815 put_page_and_type(mfn_to_page(mfn));
816 return -ENOMEM;
817 }
819 new_info = (vcpu_info_t *)(mapping + offset);
821 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
823 v->vcpu_info = new_info;
824 v->arch.vcpu_info_mfn = mfn;
826 /* Set new vcpu_info pointer /before/ setting pending flags. */
827 wmb();
829 /*
830 * Mark everything as being pending just to make sure nothing gets
831 * lost. The domain will get a spurious event, but it can cope.
832 */
833 vcpu_info(v, evtchn_upcall_pending) = 1;
834 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
835 set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
837 /*
838 * Only bother to update time for the current vcpu. If we're
839 * operating on another vcpu, then it had better not be running at
840 * the time.
841 */
842 if ( v == current )
843 update_vcpu_system_time(v);
845 return 0;
846 }
848 long
849 arch_do_vcpu_op(
850 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
851 {
852 long rc = 0;
854 switch ( cmd )
855 {
856 case VCPUOP_register_runstate_memory_area:
857 {
858 struct vcpu_register_runstate_memory_area area;
859 struct vcpu_runstate_info runstate;
861 rc = -EFAULT;
862 if ( copy_from_guest(&area, arg, 1) )
863 break;
865 if ( !guest_handle_okay(area.addr.h, 1) )
866 break;
868 rc = 0;
869 runstate_guest(v) = area.addr.h;
871 if ( v == current )
872 {
873 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
874 }
875 else
876 {
877 vcpu_runstate_get(v, &runstate);
878 __copy_to_guest(runstate_guest(v), &runstate, 1);
879 }
881 break;
882 }
884 case VCPUOP_register_vcpu_info:
885 {
886 struct domain *d = v->domain;
887 struct vcpu_register_vcpu_info info;
889 rc = -EFAULT;
890 if ( copy_from_guest(&info, arg, 1) )
891 break;
893 LOCK_BIGLOCK(d);
894 rc = map_vcpu_info(v, info.mfn, info.offset);
895 UNLOCK_BIGLOCK(d);
897 break;
898 }
900 default:
901 rc = -ENOSYS;
902 break;
903 }
905 return rc;
906 }
908 #ifdef __x86_64__
910 #define loadsegment(seg,value) ({ \
911 int __r = 1; \
912 __asm__ __volatile__ ( \
913 "1: movl %k1,%%" #seg "\n2:\n" \
914 ".section .fixup,\"ax\"\n" \
915 "3: xorl %k0,%k0\n" \
916 " movl %k0,%%" #seg "\n" \
917 " jmp 2b\n" \
918 ".previous\n" \
919 ".section __ex_table,\"a\"\n" \
920 " .align 8\n" \
921 " .quad 1b,3b\n" \
922 ".previous" \
923 : "=r" (__r) : "r" (value), "0" (__r) );\
924 __r; })
926 /*
927 * save_segments() writes a mask of segments which are dirty (non-zero),
928 * allowing load_segments() to avoid some expensive segment loads and
929 * MSR writes.
930 */
931 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
932 #define DIRTY_DS 0x01
933 #define DIRTY_ES 0x02
934 #define DIRTY_FS 0x04
935 #define DIRTY_GS 0x08
936 #define DIRTY_FS_BASE 0x10
937 #define DIRTY_GS_BASE_USER 0x20
939 static void load_segments(struct vcpu *n)
940 {
941 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
942 int all_segs_okay = 1;
943 unsigned int dirty_segment_mask, cpu = smp_processor_id();
945 /* Load and clear the dirty segment mask. */
946 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
947 per_cpu(dirty_segment_mask, cpu) = 0;
949 /* Either selector != 0 ==> reload. */
950 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
951 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
953 /* Either selector != 0 ==> reload. */
954 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
955 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
957 /*
958 * Either selector != 0 ==> reload.
959 * Also reload to reset FS_BASE if it was non-zero.
960 */
961 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
962 nctxt->user_regs.fs) )
963 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
965 /*
966 * Either selector != 0 ==> reload.
967 * Also reload to reset GS_BASE if it was non-zero.
968 */
969 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
970 nctxt->user_regs.gs) )
971 {
972 /* Reset GS_BASE with user %gs? */
973 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
974 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
975 }
977 if ( !is_pv_32on64_domain(n->domain) )
978 {
979 /* This can only be non-zero if selector is NULL. */
980 if ( nctxt->fs_base )
981 wrmsr(MSR_FS_BASE,
982 nctxt->fs_base,
983 nctxt->fs_base>>32);
985 /* Most kernels have non-zero GS base, so don't bother testing. */
986 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
987 wrmsr(MSR_SHADOW_GS_BASE,
988 nctxt->gs_base_kernel,
989 nctxt->gs_base_kernel>>32);
991 /* This can only be non-zero if selector is NULL. */
992 if ( nctxt->gs_base_user )
993 wrmsr(MSR_GS_BASE,
994 nctxt->gs_base_user,
995 nctxt->gs_base_user>>32);
997 /* If in kernel mode then switch the GS bases around. */
998 if ( (n->arch.flags & TF_kernel_mode) )
999 __asm__ __volatile__ ( "swapgs" );
1000 }
1002 if ( unlikely(!all_segs_okay) )
1003 {
1004 struct cpu_user_regs *regs = guest_cpu_user_regs();
1005 unsigned long *rsp =
1006 (n->arch.flags & TF_kernel_mode) ?
1007 (unsigned long *)regs->rsp :
1008 (unsigned long *)nctxt->kernel_sp;
1009 unsigned long cs_and_mask, rflags;
1011 if ( is_pv_32on64_domain(n->domain) )
1012 {
1013 unsigned int *esp = ring_1(regs) ?
1014 (unsigned int *)regs->rsp :
1015 (unsigned int *)nctxt->kernel_sp;
1016 unsigned int cs_and_mask, eflags;
1017 int ret = 0;
1019 /* CS longword also contains full evtchn_upcall_mask. */
1020 cs_and_mask = (unsigned short)regs->cs |
1021 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1022 /* Fold upcall mask into RFLAGS.IF. */
1023 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1024 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1026 if ( !ring_1(regs) )
1027 {
1028 ret = put_user(regs->ss, esp-1);
1029 ret |= put_user(regs->_esp, esp-2);
1030 esp -= 2;
1031 }
1033 if ( ret |
1034 put_user(eflags, esp-1) |
1035 put_user(cs_and_mask, esp-2) |
1036 put_user(regs->_eip, esp-3) |
1037 put_user(nctxt->user_regs.gs, esp-4) |
1038 put_user(nctxt->user_regs.fs, esp-5) |
1039 put_user(nctxt->user_regs.es, esp-6) |
1040 put_user(nctxt->user_regs.ds, esp-7) )
1041 {
1042 gdprintk(XENLOG_ERR, "Error while creating compat "
1043 "failsafe callback frame.\n");
1044 domain_crash(n->domain);
1045 }
1047 if ( test_bit(_VGCF_failsafe_disables_events,
1048 &n->arch.guest_context.flags) )
1049 vcpu_info(n, evtchn_upcall_mask) = 1;
1051 regs->entry_vector = TRAP_syscall;
1052 regs->_eflags &= 0xFFFCBEFFUL;
1053 regs->ss = FLAT_COMPAT_KERNEL_SS;
1054 regs->_esp = (unsigned long)(esp-7);
1055 regs->cs = FLAT_COMPAT_KERNEL_CS;
1056 regs->_eip = nctxt->failsafe_callback_eip;
1057 return;
1058 }
1060 if ( !(n->arch.flags & TF_kernel_mode) )
1061 toggle_guest_mode(n);
1062 else
1063 regs->cs &= ~3;
1065 /* CS longword also contains full evtchn_upcall_mask. */
1066 cs_and_mask = (unsigned long)regs->cs |
1067 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1069 /* Fold upcall mask into RFLAGS.IF. */
1070 rflags = regs->rflags & ~X86_EFLAGS_IF;
1071 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1073 if ( put_user(regs->ss, rsp- 1) |
1074 put_user(regs->rsp, rsp- 2) |
1075 put_user(rflags, rsp- 3) |
1076 put_user(cs_and_mask, rsp- 4) |
1077 put_user(regs->rip, rsp- 5) |
1078 put_user(nctxt->user_regs.gs, rsp- 6) |
1079 put_user(nctxt->user_regs.fs, rsp- 7) |
1080 put_user(nctxt->user_regs.es, rsp- 8) |
1081 put_user(nctxt->user_regs.ds, rsp- 9) |
1082 put_user(regs->r11, rsp-10) |
1083 put_user(regs->rcx, rsp-11) )
1084 {
1085 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1086 "callback frame.\n");
1087 domain_crash(n->domain);
1088 }
1090 if ( test_bit(_VGCF_failsafe_disables_events,
1091 &n->arch.guest_context.flags) )
1092 vcpu_info(n, evtchn_upcall_mask) = 1;
1094 regs->entry_vector = TRAP_syscall;
1095 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1096 X86_EFLAGS_NT|X86_EFLAGS_TF);
1097 regs->ss = FLAT_KERNEL_SS;
1098 regs->rsp = (unsigned long)(rsp-11);
1099 regs->cs = FLAT_KERNEL_CS;
1100 regs->rip = nctxt->failsafe_callback_eip;
1101 }
1102 }
1104 static void save_segments(struct vcpu *v)
1105 {
1106 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1107 struct cpu_user_regs *regs = &ctxt->user_regs;
1108 unsigned int dirty_segment_mask = 0;
1110 regs->ds = read_segment_register(ds);
1111 regs->es = read_segment_register(es);
1112 regs->fs = read_segment_register(fs);
1113 regs->gs = read_segment_register(gs);
1115 if ( regs->ds )
1116 dirty_segment_mask |= DIRTY_DS;
1118 if ( regs->es )
1119 dirty_segment_mask |= DIRTY_ES;
1121 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1122 {
1123 dirty_segment_mask |= DIRTY_FS;
1124 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1125 }
1126 else if ( ctxt->fs_base )
1127 {
1128 dirty_segment_mask |= DIRTY_FS_BASE;
1129 }
1131 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1132 {
1133 dirty_segment_mask |= DIRTY_GS;
1134 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1135 }
1136 else if ( ctxt->gs_base_user )
1137 {
1138 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1139 }
1141 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1142 }
1144 #define switch_kernel_stack(v) ((void)0)
1146 #elif defined(__i386__)
1148 #define load_segments(n) ((void)0)
1149 #define save_segments(p) ((void)0)
1151 static inline void switch_kernel_stack(struct vcpu *v)
1152 {
1153 struct tss_struct *tss = &init_tss[smp_processor_id()];
1154 tss->esp1 = v->arch.guest_context.kernel_sp;
1155 tss->ss1 = v->arch.guest_context.kernel_ss;
1156 }
1158 #endif /* __i386__ */
1160 static void paravirt_ctxt_switch_from(struct vcpu *v)
1161 {
1162 save_segments(v);
1163 }
1165 static void paravirt_ctxt_switch_to(struct vcpu *v)
1166 {
1167 set_int80_direct_trap(v);
1168 switch_kernel_stack(v);
1169 }
1171 #define loaddebug(_v,_reg) \
1172 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
1174 static void __context_switch(void)
1175 {
1176 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1177 unsigned int cpu = smp_processor_id();
1178 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1179 struct vcpu *n = current;
1181 ASSERT(p != n);
1182 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1184 if ( !is_idle_vcpu(p) )
1185 {
1186 memcpy(&p->arch.guest_context.user_regs,
1187 stack_regs,
1188 CTXT_SWITCH_STACK_BYTES);
1189 unlazy_fpu(p);
1190 p->arch.ctxt_switch_from(p);
1191 }
1193 if ( !is_idle_vcpu(n) )
1194 {
1195 memcpy(stack_regs,
1196 &n->arch.guest_context.user_regs,
1197 CTXT_SWITCH_STACK_BYTES);
1199 /* Maybe switch the debug registers. */
1200 if ( unlikely(n->arch.guest_context.debugreg[7]) )
1201 {
1202 loaddebug(&n->arch.guest_context, 0);
1203 loaddebug(&n->arch.guest_context, 1);
1204 loaddebug(&n->arch.guest_context, 2);
1205 loaddebug(&n->arch.guest_context, 3);
1206 /* no 4 and 5 */
1207 loaddebug(&n->arch.guest_context, 6);
1208 loaddebug(&n->arch.guest_context, 7);
1209 }
1210 n->arch.ctxt_switch_to(n);
1211 }
1213 if ( p->domain != n->domain )
1214 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1215 cpu_set(cpu, n->vcpu_dirty_cpumask);
1217 write_ptbase(n);
1219 if ( p->vcpu_id != n->vcpu_id )
1220 {
1221 char gdt_load[10];
1222 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1223 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1224 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
1225 }
1227 if ( p->domain != n->domain )
1228 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1229 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1231 per_cpu(curr_vcpu, cpu) = n;
1232 }
1235 void context_switch(struct vcpu *prev, struct vcpu *next)
1236 {
1237 unsigned int cpu = smp_processor_id();
1238 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1240 ASSERT(local_irq_is_enabled());
1242 /* Allow at most one CPU at a time to be dirty. */
1243 ASSERT(cpus_weight(dirty_mask) <= 1);
1244 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1245 {
1246 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1247 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1248 flush_tlb_mask(next->vcpu_dirty_cpumask);
1249 }
1251 local_irq_disable();
1253 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1254 pt_freeze_time(prev);
1256 set_current(next);
1258 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1259 {
1260 local_irq_enable();
1261 }
1262 else
1263 {
1264 __context_switch();
1266 #ifdef CONFIG_COMPAT
1267 if ( !is_hvm_vcpu(next) &&
1268 (is_idle_vcpu(prev) ||
1269 is_hvm_vcpu(prev) ||
1270 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1271 {
1272 uint64_t efer = read_efer();
1274 local_flush_tlb_one(GDT_VIRT_START(next) +
1275 FIRST_RESERVED_GDT_BYTE);
1277 if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) )
1278 write_efer(efer ^ EFER_SCE);
1279 }
1280 #endif
1282 /* Re-enable interrupts before restoring state which may fault. */
1283 local_irq_enable();
1285 if ( !is_hvm_vcpu(next) )
1286 {
1287 load_LDT(next);
1288 load_segments(next);
1289 }
1290 }
1292 context_saved(prev);
1294 /* Update per-VCPU guest runstate shared memory area (if registered). */
1295 if ( !guest_handle_is_null(runstate_guest(next)) )
1296 {
1297 if ( !is_pv_32on64_domain(next->domain) )
1298 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1299 #ifdef CONFIG_COMPAT
1300 else
1301 {
1302 struct compat_vcpu_runstate_info info;
1304 XLAT_vcpu_runstate_info(&info, &next->runstate);
1305 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1306 }
1307 #endif
1308 }
1310 schedule_tail(next);
1311 BUG();
1312 }
1314 void continue_running(struct vcpu *same)
1315 {
1316 schedule_tail(same);
1317 BUG();
1318 }
1320 int __sync_lazy_execstate(void)
1321 {
1322 unsigned long flags;
1323 int switch_required;
1325 local_irq_save(flags);
1327 switch_required = (this_cpu(curr_vcpu) != current);
1329 if ( switch_required )
1330 {
1331 ASSERT(current == idle_vcpu[smp_processor_id()]);
1332 __context_switch();
1333 }
1335 local_irq_restore(flags);
1337 return switch_required;
1338 }
1340 void sync_vcpu_execstate(struct vcpu *v)
1341 {
1342 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1343 (void)__sync_lazy_execstate();
1345 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1346 flush_tlb_mask(v->vcpu_dirty_cpumask);
1347 }
1349 #define next_arg(fmt, args) ({ \
1350 unsigned long __arg; \
1351 switch ( *(fmt)++ ) \
1352 { \
1353 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1354 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1355 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1356 default: __arg = 0; BUG(); \
1357 } \
1358 __arg; \
1359 })
1361 DEFINE_PER_CPU(char, hc_preempted);
1363 unsigned long hypercall_create_continuation(
1364 unsigned int op, const char *format, ...)
1365 {
1366 struct mc_state *mcs = &this_cpu(mc_state);
1367 struct cpu_user_regs *regs;
1368 const char *p = format;
1369 unsigned long arg;
1370 unsigned int i;
1371 va_list args;
1373 va_start(args, format);
1375 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1376 {
1377 __set_bit(_MCSF_call_preempted, &mcs->flags);
1379 for ( i = 0; *p != '\0'; i++ )
1380 mcs->call.args[i] = next_arg(p, args);
1381 if ( is_pv_32on64_domain(current->domain) )
1382 {
1383 for ( ; i < 6; i++ )
1384 mcs->call.args[i] = 0;
1385 }
1386 }
1387 else
1388 {
1389 regs = guest_cpu_user_regs();
1390 regs->eax = op;
1391 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1393 #ifdef __x86_64__
1394 if ( !is_hvm_vcpu(current) ?
1395 !is_pv_32on64_vcpu(current) :
1396 (hvm_guest_x86_mode(current) == 8) )
1397 {
1398 for ( i = 0; *p != '\0'; i++ )
1399 {
1400 arg = next_arg(p, args);
1401 switch ( i )
1402 {
1403 case 0: regs->rdi = arg; break;
1404 case 1: regs->rsi = arg; break;
1405 case 2: regs->rdx = arg; break;
1406 case 3: regs->r10 = arg; break;
1407 case 4: regs->r8 = arg; break;
1408 case 5: regs->r9 = arg; break;
1409 }
1410 }
1411 }
1412 else
1413 #endif
1414 {
1415 if ( supervisor_mode_kernel )
1416 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1418 for ( i = 0; *p != '\0'; i++ )
1419 {
1420 arg = next_arg(p, args);
1421 switch ( i )
1422 {
1423 case 0: regs->ebx = arg; break;
1424 case 1: regs->ecx = arg; break;
1425 case 2: regs->edx = arg; break;
1426 case 3: regs->esi = arg; break;
1427 case 4: regs->edi = arg; break;
1428 case 5: regs->ebp = arg; break;
1429 }
1430 }
1431 }
1433 this_cpu(hc_preempted) = 1;
1434 }
1436 va_end(args);
1438 return op;
1439 }
1441 #ifdef CONFIG_COMPAT
1442 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1443 {
1444 int rc = 0;
1445 struct mc_state *mcs = &this_cpu(mc_state);
1446 struct cpu_user_regs *regs;
1447 unsigned int i, cval = 0;
1448 unsigned long nval = 0;
1449 va_list args;
1451 BUG_ON(*id > 5);
1452 BUG_ON(mask & (1U << *id));
1454 va_start(args, mask);
1456 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1457 {
1458 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1459 return 0;
1460 for ( i = 0; i < 6; ++i, mask >>= 1 )
1461 {
1462 if ( mask & 1 )
1463 {
1464 nval = va_arg(args, unsigned long);
1465 cval = va_arg(args, unsigned int);
1466 if ( cval == nval )
1467 mask &= ~1U;
1468 else
1469 BUG_ON(nval == (unsigned int)nval);
1470 }
1471 else if ( id && *id == i )
1472 {
1473 *id = mcs->call.args[i];
1474 id = NULL;
1475 }
1476 if ( (mask & 1) && mcs->call.args[i] == nval )
1477 {
1478 mcs->call.args[i] = cval;
1479 ++rc;
1480 }
1481 else
1482 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1483 }
1484 }
1485 else
1486 {
1487 regs = guest_cpu_user_regs();
1488 for ( i = 0; i < 6; ++i, mask >>= 1 )
1489 {
1490 unsigned long *reg;
1492 switch ( i )
1493 {
1494 case 0: reg = &regs->ebx; break;
1495 case 1: reg = &regs->ecx; break;
1496 case 2: reg = &regs->edx; break;
1497 case 3: reg = &regs->esi; break;
1498 case 4: reg = &regs->edi; break;
1499 case 5: reg = &regs->ebp; break;
1500 default: BUG(); reg = NULL; break;
1501 }
1502 if ( (mask & 1) )
1503 {
1504 nval = va_arg(args, unsigned long);
1505 cval = va_arg(args, unsigned int);
1506 if ( cval == nval )
1507 mask &= ~1U;
1508 else
1509 BUG_ON(nval == (unsigned int)nval);
1510 }
1511 else if ( id && *id == i )
1512 {
1513 *id = *reg;
1514 id = NULL;
1515 }
1516 if ( (mask & 1) && *reg == nval )
1517 {
1518 *reg = cval;
1519 ++rc;
1520 }
1521 else
1522 BUG_ON(*reg != (unsigned int)*reg);
1523 }
1524 }
1526 va_end(args);
1528 return rc;
1529 }
1530 #endif
1532 static void relinquish_memory(struct domain *d, struct list_head *list,
1533 unsigned long type)
1534 {
1535 struct list_head *ent;
1536 struct page_info *page;
1537 unsigned long x, y;
1539 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1540 spin_lock_recursive(&d->page_alloc_lock);
1542 ent = list->next;
1543 while ( ent != list )
1544 {
1545 page = list_entry(ent, struct page_info, list);
1547 /* Grab a reference to the page so it won't disappear from under us. */
1548 if ( unlikely(!get_page(page, d)) )
1549 {
1550 /* Couldn't get a reference -- someone is freeing this page. */
1551 ent = ent->next;
1552 continue;
1553 }
1555 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1556 put_page_and_type(page);
1558 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1559 put_page(page);
1561 /*
1562 * Forcibly invalidate top-most, still valid page tables at this point
1563 * to break circular 'linear page table' references. This is okay
1564 * because MMU structures are not shared across domains and this domain
1565 * is now dead. Thus top-most valid tables are not in use so a non-zero
1566 * count means circular reference.
1567 */
1568 y = page->u.inuse.type_info;
1569 for ( ; ; )
1570 {
1571 x = y;
1572 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1573 (type|PGT_validated)) )
1574 break;
1576 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1577 if ( likely(y == x) )
1578 {
1579 free_page_type(page, type);
1580 break;
1581 }
1582 }
1584 /* Follow the list chain and /then/ potentially free the page. */
1585 ent = ent->next;
1586 put_page(page);
1587 }
1589 spin_unlock_recursive(&d->page_alloc_lock);
1590 }
1592 static void vcpu_destroy_pagetables(struct vcpu *v)
1593 {
1594 struct domain *d = v->domain;
1595 unsigned long pfn;
1597 #ifdef __x86_64__
1598 if ( is_pv_32on64_vcpu(v) )
1599 {
1600 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1601 __va(pagetable_get_paddr(v->arch.guest_table)));
1603 if ( pfn != 0 )
1604 {
1605 if ( paging_mode_refcounts(d) )
1606 put_page(mfn_to_page(pfn));
1607 else
1608 put_page_and_type(mfn_to_page(pfn));
1609 }
1611 l4e_write(
1612 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1613 l4e_empty());
1615 v->arch.cr3 = 0;
1616 return;
1617 }
1618 #endif
1620 pfn = pagetable_get_pfn(v->arch.guest_table);
1621 if ( pfn != 0 )
1622 {
1623 if ( paging_mode_refcounts(d) )
1624 put_page(mfn_to_page(pfn));
1625 else
1626 put_page_and_type(mfn_to_page(pfn));
1627 #ifdef __x86_64__
1628 if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) )
1629 v->arch.guest_table_user = pagetable_null();
1630 #endif
1631 v->arch.guest_table = pagetable_null();
1632 }
1634 #ifdef __x86_64__
1635 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1636 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1637 if ( pfn != 0 )
1638 {
1639 if ( paging_mode_refcounts(d) )
1640 put_page(mfn_to_page(pfn));
1641 else
1642 put_page_and_type(mfn_to_page(pfn));
1643 v->arch.guest_table_user = pagetable_null();
1644 }
1645 #endif
1647 v->arch.cr3 = 0;
1648 }
1650 void domain_relinquish_resources(struct domain *d)
1651 {
1652 struct vcpu *v;
1654 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1656 /* Drop the in-use references to page-table bases. */
1657 for_each_vcpu ( d, v )
1658 vcpu_destroy_pagetables(v);
1660 /* Tear down paging-assistance stuff. */
1661 paging_teardown(d);
1663 /*
1664 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1665 * it automatically gets squashed when the guest's mappings go away.
1666 */
1667 for_each_vcpu(d, v)
1668 destroy_gdt(v);
1670 /* Relinquish every page of memory. */
1671 #if CONFIG_PAGING_LEVELS >= 4
1672 relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1673 relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1674 #endif
1675 #if CONFIG_PAGING_LEVELS >= 3
1676 relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1677 relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1678 #endif
1679 relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1680 relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1682 /* Free page used by xen oprofile buffer. */
1683 free_xenoprof_pages(d);
1685 if ( is_hvm_domain(d) )
1686 hvm_domain_relinquish_resources(d);
1687 }
1689 void arch_dump_domain_info(struct domain *d)
1690 {
1691 paging_dump_domain_info(d);
1692 }
1694 void arch_dump_vcpu_info(struct vcpu *v)
1695 {
1696 paging_dump_vcpu_info(v);
1697 }
1699 /*
1700 * Local variables:
1701 * mode: C
1702 * c-set-style: "BSD"
1703 * c-basic-offset: 4
1704 * tab-width: 4
1705 * indent-tabs-mode: nil
1706 * End:
1707 */