xen/arch/x86/domain.c @ 15391:fe3df33e2761 (direct-io.hg)

32-on-64: First slot of hidden L4 page directory must start life as zero.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kfraser@localhost.localdomain
date     Wed Jun 20 13:39:54 2007 +0100 (2007-06-20)
parents  69658f935cc7
children 005dd6b1cf8e
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <asm/regs.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/system.h>
35 #include <asm/io.h>
36 #include <asm/processor.h>
37 #include <asm/desc.h>
38 #include <asm/i387.h>
39 #include <asm/mpspec.h>
40 #include <asm/ldt.h>
41 #include <asm/paging.h>
42 #include <asm/hypercall.h>
43 #include <asm/hvm/hvm.h>
44 #include <asm/hvm/support.h>
45 #include <asm/msr.h>
46 #ifdef CONFIG_COMPAT
47 #include <compat/vcpu.h>
48 #endif
50 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
51 DEFINE_PER_CPU(__u64, efer);
53 static void unmap_vcpu_info(struct vcpu *v);
55 static void paravirt_ctxt_switch_from(struct vcpu *v);
56 static void paravirt_ctxt_switch_to(struct vcpu *v);
58 static void vcpu_destroy_pagetables(struct vcpu *v);
60 static void continue_idle_domain(struct vcpu *v)
61 {
62 reset_stack_and_jump(idle_loop);
63 }
65 static void continue_nonidle_domain(struct vcpu *v)
66 {
67 reset_stack_and_jump(ret_from_intr);
68 }
70 static void default_idle(void)
71 {
72 local_irq_disable();
73 if ( !softirq_pending(smp_processor_id()) )
74 safe_halt();
75 else
76 local_irq_enable();
77 }
79 void idle_loop(void)
80 {
81 for ( ; ; )
82 {
83 page_scrub_schedule_work();
84 default_idle();
85 do_softirq();
86 }
87 }
89 void startup_cpu_idle_loop(void)
90 {
91 struct vcpu *v = current;
93 ASSERT(is_idle_vcpu(v));
94 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
95 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
97 reset_stack_and_jump(idle_loop);
98 }
100 void dump_pageframe_info(struct domain *d)
101 {
102 struct page_info *page;
104 printk("Memory pages belonging to domain %u:\n", d->domain_id);
106 if ( d->tot_pages >= 10 )
107 {
108 printk(" DomPage list too long to display\n");
109 }
110 else
111 {
112 list_for_each_entry ( page, &d->page_list, list )
113 {
114 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
115 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
116 page->count_info, page->u.inuse.type_info);
117 }
118 }
120 list_for_each_entry ( page, &d->xenpage_list, list )
121 {
122 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
123 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
124 page->count_info, page->u.inuse.type_info);
125 }
126 }
128 struct vcpu *alloc_vcpu_struct(void)
129 {
130 struct vcpu *v;
131 if ( (v = xmalloc(struct vcpu)) != NULL )
132 memset(v, 0, sizeof(*v));
133 return v;
134 }
136 void free_vcpu_struct(struct vcpu *v)
137 {
138 xfree(v);
139 }
141 #ifdef CONFIG_COMPAT
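/* Allocate and map the per-vcpu compat hypercall argument translation area,
 * hooking the domain's shared mm_arg_xlat_l3 table into the supplied L4. */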
143 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
144 {
145 struct domain *d = v->domain;
146 unsigned i;
147 struct page_info *pg;
149 if ( !d->arch.mm_arg_xlat_l3 )
150 {
151 pg = alloc_domheap_page(NULL);
152 if ( !pg )
153 return -ENOMEM;
154 d->arch.mm_arg_xlat_l3 = clear_page(page_to_virt(pg));
155 }
157 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
158 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
160 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
161 {
162 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
163 l2_pgentry_t *l2tab;
164 l1_pgentry_t *l1tab;
166 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
167 {
168 pg = alloc_domheap_page(NULL);
169 if ( !pg )
170 return -ENOMEM;
171 clear_page(page_to_virt(pg));
172 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
173 }
174 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
175 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
176 {
177 pg = alloc_domheap_page(NULL);
178 if ( !pg )
179 return -ENOMEM;
180 clear_page(page_to_virt(pg));
181 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
182 }
183 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
184 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
185 pg = alloc_domheap_page(NULL);
186 if ( !pg )
187 return -ENOMEM;
188 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
189 }
191 return 0;
192 }
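/* Free every page and page table backing the compat argument translation area. */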
194 static void release_arg_xlat_area(struct domain *d)
195 {
196 if ( d->arch.mm_arg_xlat_l3 )
197 {
198 unsigned l3;
200 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
201 {
202 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
203 {
204 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
205 unsigned l2;
207 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
208 {
209 if ( l2e_get_intpte(l2tab[l2]) )
210 {
211 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
212 unsigned l1;
214 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
215 {
216 if ( l1e_get_intpte(l1tab[l1]) )
217 free_domheap_page(l1e_get_page(l1tab[l1]));
218 }
219 free_domheap_page(l2e_get_page(l2tab[l2]));
220 }
221 }
222 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
223 }
224 }
225 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
226 }
227 }
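/* Build the 'hidden' L4 for a 32-on-64 vcpu: a copy of the idle L4 whose
 * first slot starts out empty (the guest's L3 is hooked in there later),
 * with linear, per-domain and argument-translation mappings installed. */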
229 static int setup_compat_l4(struct vcpu *v)
230 {
231 struct page_info *pg = alloc_domheap_page(NULL);
232 l4_pgentry_t *l4tab;
233 int rc;
235 if ( pg == NULL )
236 return -ENOMEM;
238 /* This page needs to look like a pagetable so that it can be shadowed */
239 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
241 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
242 l4tab[0] = l4e_empty();
243 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
244 l4e_from_page(pg, __PAGE_HYPERVISOR);
245 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
246 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3),
247 __PAGE_HYPERVISOR);
249 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
250 {
251 free_domheap_page(pg);
252 return rc;
253 }
255 v->arch.guest_table = pagetable_from_page(pg);
256 v->arch.guest_table_user = v->arch.guest_table;
258 return 0;
259 }
261 static void release_compat_l4(struct vcpu *v)
262 {
263 free_domheap_page(pagetable_get_page(v->arch.guest_table));
264 v->arch.guest_table = pagetable_null();
265 v->arch.guest_table_user = pagetable_null();
266 }
268 static inline int may_switch_mode(struct domain *d)
269 {
270 return (!is_hvm_domain(d) && (d->tot_pages == 0));
271 }
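/* Return an empty, non-HVM domain to native 64-bit PV mode: restore the
 * native GDT mapping and drop each existing vcpu's compat L4. */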
273 int switch_native(struct domain *d)
274 {
275 l1_pgentry_t gdt_l1e;
276 unsigned int vcpuid;
278 if ( d == NULL )
279 return -EINVAL;
280 if ( !may_switch_mode(d) )
281 return -EACCES;
282 if ( !is_pv_32on64_domain(d) )
283 return 0;
285 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
286 release_arg_xlat_area(d);
288 /* switch gdt */
289 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
290 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
291 {
292 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
293 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
294 if (d->vcpu[vcpuid])
295 release_compat_l4(d->vcpu[vcpuid]);
296 }
298 d->arch.physaddr_bitsize = 64;
300 return 0;
301 }
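/* Put an empty domain into 32-on-64 (compat) mode: install the compat GDT
 * mapping and a compat L4 for every existing vcpu, rolling back on failure. */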
303 int switch_compat(struct domain *d)
304 {
305 l1_pgentry_t gdt_l1e;
306 unsigned int vcpuid;
308 if ( d == NULL )
309 return -EINVAL;
310 if ( compat_disabled )
311 return -ENOSYS;
312 if ( !may_switch_mode(d) )
313 return -EACCES;
314 if ( is_pv_32on64_domain(d) )
315 return 0;
317 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
319 /* switch gdt */
320 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
321 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
322 {
323 if ( (d->vcpu[vcpuid] != NULL) &&
324 (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
325 goto undo_and_fail;
326 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
327 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
328 }
330 d->arch.physaddr_bitsize =
331 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
332 + (PAGE_SIZE - 2);
334 return 0;
336 undo_and_fail:
337 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
338 release_arg_xlat_area(d);
339 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
340 while ( vcpuid-- != 0 )
341 {
342 if ( d->vcpu[vcpuid] != NULL )
343 release_compat_l4(d->vcpu[vcpuid]);
344 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
345 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
346 }
347 return -ENOMEM;
348 }
350 #else
351 #define release_arg_xlat_area(d) ((void)0)
352 #define setup_compat_l4(v) 0
353 #define release_compat_l4(v) ((void)0)
354 #endif
356 int vcpu_initialise(struct vcpu *v)
357 {
358 struct domain *d = v->domain;
359 int rc;
361 v->arch.vcpu_info_mfn = INVALID_MFN;
363 v->arch.flags = TF_kernel_mode;
365 pae_l3_cache_init(&v->arch.pae_l3_cache);
367 paging_vcpu_init(v);
369 if ( is_hvm_domain(d) )
370 {
371 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
372 return rc;
373 }
374 else
375 {
376 /* PV guests by default have a 100Hz ticker. */
377 v->periodic_period = MILLISECS(10);
379 /* PV guests get an emulated PIT too for video BIOSes to use. */
380 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
381 pit_init(v, cpu_khz);
383 v->arch.schedule_tail = continue_nonidle_domain;
384 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
385 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
387 if ( is_idle_domain(d) )
388 {
389 v->arch.schedule_tail = continue_idle_domain;
390 v->arch.cr3 = __pa(idle_pg_table);
391 }
392 }
394 v->arch.perdomain_ptes =
395 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
397 return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
398 }
400 void vcpu_destroy(struct vcpu *v)
401 {
402 if ( is_pv_32on64_vcpu(v) )
403 release_compat_l4(v);
405 unmap_vcpu_info(v);
407 if ( is_hvm_vcpu(v) )
408 hvm_vcpu_destroy(v);
409 }
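/* Set up the per-domain GDT/LDT mapping tables, paging state, I/O port
 * capabilities and shared-info page for a newly created domain. */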
411 int arch_domain_create(struct domain *d)
412 {
413 #ifdef __x86_64__
414 struct page_info *pg;
415 int i;
416 #endif
417 l1_pgentry_t gdt_l1e;
418 int vcpuid, pdpt_order, paging_initialised = 0;
419 int rc = -ENOMEM;
421 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
422 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
423 if ( d->arch.mm_perdomain_pt == NULL )
424 goto fail;
425 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
427 /*
428 * Map Xen segments into every VCPU's GDT, irrespective of whether every
429 * VCPU will actually be used. This avoids an NMI race during context
430 * switch: if we take an interrupt after switching CR3 but before switching
431 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
432 * try to load CS from an invalid table.
433 */
434 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
435 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
436 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
437 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
439 #if defined(__i386__)
441 mapcache_init(d);
443 #else /* __x86_64__ */
445 if ( (pg = alloc_domheap_page(NULL)) == NULL )
446 goto fail;
447 d->arch.mm_perdomain_l2 = clear_page(page_to_virt(pg));
448 for ( i = 0; i < (1 << pdpt_order); i++ )
449 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
450 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
451 __PAGE_HYPERVISOR);
453 if ( (pg = alloc_domheap_page(NULL)) == NULL )
454 goto fail;
455 d->arch.mm_perdomain_l3 = clear_page(page_to_virt(pg));
456 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
457 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
458 __PAGE_HYPERVISOR);
460 #endif /* __x86_64__ */
462 #ifdef CONFIG_COMPAT
463 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
464 #endif
466 paging_domain_init(d);
467 paging_initialised = 1;
469 if ( !is_idle_domain(d) )
470 {
471 d->arch.ioport_caps =
472 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
473 if ( d->arch.ioport_caps == NULL )
474 goto fail;
476 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
477 goto fail;
479 memset(d->shared_info, 0, PAGE_SIZE);
480 share_xen_page_with_guest(
481 virt_to_page(d->shared_info), d, XENSHARE_writable);
482 }
484 if ( is_hvm_domain(d) )
485 {
486 if ( (rc = hvm_domain_initialise(d)) != 0 )
487 goto fail;
488 }
489 else
490 {
491 /* 32-bit PV guest by default only if Xen is not 64-bit. */
492 d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
493 (CONFIG_PAGING_LEVELS != 4);
494 }
496 return 0;
498 fail:
499 free_xenheap_page(d->shared_info);
500 if ( paging_initialised )
501 paging_final_teardown(d);
502 #ifdef __x86_64__
503 if ( d->arch.mm_perdomain_l2 )
504 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
505 if ( d->arch.mm_perdomain_l3 )
506 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
507 #endif
508 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
509 return rc;
510 }
512 void arch_domain_destroy(struct domain *d)
513 {
514 if ( is_hvm_domain(d) )
515 hvm_domain_destroy(d);
517 paging_final_teardown(d);
519 free_xenheap_pages(
520 d->arch.mm_perdomain_pt,
521 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
523 #ifdef __x86_64__
524 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
525 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
526 #endif
528 if ( is_pv_32on64_domain(d) )
529 release_arg_xlat_area(d);
531 free_xenheap_page(d->shared_info);
532 }
534 /* This is called by arch_final_setup_guest and do_boot_vcpu */
535 int arch_set_info_guest(
536 struct vcpu *v, vcpu_guest_context_u c)
537 {
538 struct domain *d = v->domain;
539 unsigned long cr3_pfn = INVALID_MFN;
540 unsigned long flags;
541 int i, rc = 0, compat;
543 /* The context is a compat-mode one if the target domain is compat-mode;
544 * we expect the tools to DTRT even in compat-mode callers. */
545 compat = is_pv_32on64_domain(d);
547 #ifdef CONFIG_COMPAT
548 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
549 #else
550 #define c(fld) (c.nat->fld)
551 #endif
552 flags = c(flags);
554 if ( !is_hvm_vcpu(v) )
555 {
556 if ( !compat )
557 {
558 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
559 fixup_guest_stack_selector(d, c.nat->kernel_ss);
560 fixup_guest_code_selector(d, c.nat->user_regs.cs);
561 #ifdef __i386__
562 fixup_guest_code_selector(d, c.nat->event_callback_cs);
563 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
564 #endif
566 for ( i = 0; i < 256; i++ )
567 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
569 /* LDT safety checks. */
570 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
571 (c.nat->ldt_ents > 8192) ||
572 !array_access_ok(c.nat->ldt_base,
573 c.nat->ldt_ents,
574 LDT_ENTRY_SIZE) )
575 return -EINVAL;
576 }
577 #ifdef CONFIG_COMPAT
578 else
579 {
580 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
581 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
582 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
583 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
584 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
586 for ( i = 0; i < 256; i++ )
587 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
589 /* LDT safety checks. */
590 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
591 (c.cmp->ldt_ents > 8192) ||
592 !compat_array_access_ok(c.cmp->ldt_base,
593 c.cmp->ldt_ents,
594 LDT_ENTRY_SIZE) )
595 return -EINVAL;
596 }
597 #endif
598 }
600 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
602 v->arch.flags &= ~TF_kernel_mode;
603 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
604 v->arch.flags |= TF_kernel_mode;
606 if ( !compat )
607 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
608 #ifdef CONFIG_COMPAT
609 else
610 {
611 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
612 }
613 #endif
615 /* Only CR0.TS is modifiable by guest or admin. */
616 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
617 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
619 init_int80_direct_trap(v);
621 if ( !is_hvm_vcpu(v) )
622 {
623 /* IOPL privileges are virtualised. */
624 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
625 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
627 /* Ensure real hardware interrupts are enabled. */
628 v->arch.guest_context.user_regs.eflags |= EF_IE;
629 }
630 else
631 {
632 hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
633 }
635 if ( v->is_initialised )
636 goto out;
638 memset(v->arch.guest_context.debugreg, 0,
639 sizeof(v->arch.guest_context.debugreg));
640 for ( i = 0; i < 8; i++ )
641 (void)set_debugreg(v, i, c(debugreg[i]));
643 if ( v->vcpu_id == 0 )
644 d->vm_assist = c(vm_assist);
646 if ( !is_hvm_vcpu(v) )
647 {
648 if ( !compat )
649 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
650 #ifdef CONFIG_COMPAT
651 else
652 {
653 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
654 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
656 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
657 return -EINVAL;
658 for ( i = 0; i < n; ++i )
659 gdt_frames[i] = c.cmp->gdt_frames[i];
660 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
661 }
662 #endif
663 if ( rc != 0 )
664 return rc;
666 if ( !compat )
667 {
668 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
670 if ( !mfn_valid(cr3_pfn) ||
671 (paging_mode_refcounts(d)
672 ? !get_page(mfn_to_page(cr3_pfn), d)
673 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
674 PGT_base_page_table)) )
675 {
676 destroy_gdt(v);
677 return -EINVAL;
678 }
680 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
682 #ifdef __x86_64__
683 if ( c.nat->ctrlreg[1] )
684 {
685 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
687 if ( !mfn_valid(cr3_pfn) ||
688 (paging_mode_refcounts(d)
689 ? !get_page(mfn_to_page(cr3_pfn), d)
690 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
691 PGT_base_page_table)) )
692 {
693 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
694 v->arch.guest_table = pagetable_null();
695 if ( paging_mode_refcounts(d) )
696 put_page(mfn_to_page(cr3_pfn));
697 else
698 put_page_and_type(mfn_to_page(cr3_pfn));
699 destroy_gdt(v);
700 return -EINVAL;
701 }
703 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
704 }
705 #endif
706 }
707 #ifdef CONFIG_COMPAT
708 else
709 {
710 l4_pgentry_t *l4tab;
712 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
714 if ( !mfn_valid(cr3_pfn) ||
715 (paging_mode_refcounts(d)
716 ? !get_page(mfn_to_page(cr3_pfn), d)
717 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
718 PGT_l3_page_table)) )
719 {
720 destroy_gdt(v);
721 return -EINVAL;
722 }
724 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
725 *l4tab = l4e_from_pfn(cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
726 }
727 #endif
728 }
730 if ( v->vcpu_id == 0 )
731 update_domain_wallclock_time(d);
733 /* Don't redo final setup */
734 v->is_initialised = 1;
736 if ( paging_mode_enabled(d) )
737 paging_update_paging_modes(v);
739 update_cr3(v);
741 out:
742 if ( flags & VGCF_online )
743 clear_bit(_VPF_down, &v->pause_flags);
744 else
745 set_bit(_VPF_down, &v->pause_flags);
746 return 0;
747 #undef c
748 }
750 int arch_vcpu_reset(struct vcpu *v)
751 {
752 destroy_gdt(v);
753 vcpu_destroy_pagetables(v);
754 return 0;
755 }
757 /*
758 * Unmap the vcpu info page if the guest decided to place it somewhere
759 * else. This is only used from arch_domain_destroy, so there's no
760 * need to do anything clever.
761 */
762 static void
763 unmap_vcpu_info(struct vcpu *v)
764 {
765 struct domain *d = v->domain;
766 unsigned long mfn;
768 if ( v->arch.vcpu_info_mfn == INVALID_MFN )
769 return;
771 mfn = v->arch.vcpu_info_mfn;
772 unmap_domain_page_global(v->vcpu_info);
774 v->vcpu_info = shared_info_addr(d, vcpu_info[v->vcpu_id]);
775 v->arch.vcpu_info_mfn = INVALID_MFN;
777 put_page_and_type(mfn_to_page(mfn));
778 }
780 /*
781 * Map a guest page in and point the vcpu_info pointer at it. This
782 * makes sure that the vcpu_info is always pointing at a valid piece
783 * of memory, and it sets a pending event to make sure that a pending
784 * event doesn't get missed.
785 */
786 static int
787 map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
788 {
789 struct domain *d = v->domain;
790 void *mapping;
791 vcpu_info_t *new_info;
792 int i;
794 if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
795 return -EINVAL;
797 if ( v->arch.vcpu_info_mfn != INVALID_MFN )
798 return -EINVAL;
800 /* Run this command on yourself or on other offline VCPUS. */
801 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )
802 return -EINVAL;
804 mfn = gmfn_to_mfn(d, mfn);
805 if ( !mfn_valid(mfn) ||
806 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
807 return -EINVAL;
809 mapping = map_domain_page_global(mfn);
810 if ( mapping == NULL )
811 {
812 put_page_and_type(mfn_to_page(mfn));
813 return -ENOMEM;
814 }
816 new_info = (vcpu_info_t *)(mapping + offset);
818 memcpy(new_info, v->vcpu_info, sizeof(*new_info));
820 v->vcpu_info = new_info;
821 v->arch.vcpu_info_mfn = mfn;
823 /* Set new vcpu_info pointer /before/ setting pending flags. */
824 wmb();
826 /*
827 * Mark everything as being pending just to make sure nothing gets
828 * lost. The domain will get a spurious event, but it can cope.
829 */
830 vcpu_info(v, evtchn_upcall_pending) = 1;
831 for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
832 set_bit(i, vcpu_info_addr(v, evtchn_pending_sel));
834 /*
835 * Only bother to update time for the current vcpu. If we're
836 * operating on another vcpu, then it had better not be running at
837 * the time.
838 */
839 if ( v == current )
840 update_vcpu_system_time(v);
842 return 0;
843 }
845 long
846 arch_do_vcpu_op(
847 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
848 {
849 long rc = 0;
851 switch ( cmd )
852 {
853 case VCPUOP_register_runstate_memory_area:
854 {
855 struct vcpu_register_runstate_memory_area area;
856 struct vcpu_runstate_info runstate;
858 rc = -EFAULT;
859 if ( copy_from_guest(&area, arg, 1) )
860 break;
862 if ( !guest_handle_okay(area.addr.h, 1) )
863 break;
865 rc = 0;
866 runstate_guest(v) = area.addr.h;
868 if ( v == current )
869 {
870 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
871 }
872 else
873 {
874 vcpu_runstate_get(v, &runstate);
875 __copy_to_guest(runstate_guest(v), &runstate, 1);
876 }
878 break;
879 }
881 case VCPUOP_register_vcpu_info:
882 {
883 struct domain *d = v->domain;
884 struct vcpu_register_vcpu_info info;
886 rc = -EFAULT;
887 if ( copy_from_guest(&info, arg, 1) )
888 break;
890 LOCK_BIGLOCK(d);
891 rc = map_vcpu_info(v, info.mfn, info.offset);
892 UNLOCK_BIGLOCK(d);
894 break;
895 }
897 default:
898 rc = -ENOSYS;
899 break;
900 }
902 return rc;
903 }
905 #ifdef __x86_64__
907 #define loadsegment(seg,value) ({ \
908 int __r = 1; \
909 __asm__ __volatile__ ( \
910 "1: movl %k1,%%" #seg "\n2:\n" \
911 ".section .fixup,\"ax\"\n" \
912 "3: xorl %k0,%k0\n" \
913 " movl %k0,%%" #seg "\n" \
914 " jmp 2b\n" \
915 ".previous\n" \
916 ".section __ex_table,\"a\"\n" \
917 " .align 8\n" \
918 " .quad 1b,3b\n" \
919 ".previous" \
920 : "=r" (__r) : "r" (value), "0" (__r) );\
921 __r; })
923 /*
924 * save_segments() writes a mask of segments which are dirty (non-zero),
925 * allowing load_segments() to avoid some expensive segment loads and
926 * MSR writes.
927 */
928 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
929 #define DIRTY_DS 0x01
930 #define DIRTY_ES 0x02
931 #define DIRTY_FS 0x04
932 #define DIRTY_GS 0x08
933 #define DIRTY_FS_BASE 0x10
934 #define DIRTY_GS_BASE_USER 0x20
936 static void load_segments(struct vcpu *n)
937 {
938 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
939 int all_segs_okay = 1;
940 unsigned int dirty_segment_mask, cpu = smp_processor_id();
942 /* Load and clear the dirty segment mask. */
943 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
944 per_cpu(dirty_segment_mask, cpu) = 0;
946 /* Either selector != 0 ==> reload. */
947 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
948 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
950 /* Either selector != 0 ==> reload. */
951 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
952 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
954 /*
955 * Either selector != 0 ==> reload.
956 * Also reload to reset FS_BASE if it was non-zero.
957 */
958 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
959 nctxt->user_regs.fs) )
960 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
962 /*
963 * Either selector != 0 ==> reload.
964 * Also reload to reset GS_BASE if it was non-zero.
965 */
966 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
967 nctxt->user_regs.gs) )
968 {
969 /* Reset GS_BASE with user %gs? */
970 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
971 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
972 }
974 if ( !is_pv_32on64_domain(n->domain) )
975 {
976 /* This can only be non-zero if selector is NULL. */
977 if ( nctxt->fs_base )
978 wrmsr(MSR_FS_BASE,
979 nctxt->fs_base,
980 nctxt->fs_base>>32);
982 /* Most kernels have non-zero GS base, so don't bother testing. */
983 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
984 wrmsr(MSR_SHADOW_GS_BASE,
985 nctxt->gs_base_kernel,
986 nctxt->gs_base_kernel>>32);
988 /* This can only be non-zero if selector is NULL. */
989 if ( nctxt->gs_base_user )
990 wrmsr(MSR_GS_BASE,
991 nctxt->gs_base_user,
992 nctxt->gs_base_user>>32);
994 /* If in kernel mode then switch the GS bases around. */
995 if ( (n->arch.flags & TF_kernel_mode) )
996 __asm__ __volatile__ ( "swapgs" );
997 }
999 if ( unlikely(!all_segs_okay) )
1000 {
1001 struct cpu_user_regs *regs = guest_cpu_user_regs();
1002 unsigned long *rsp =
1003 (n->arch.flags & TF_kernel_mode) ?
1004 (unsigned long *)regs->rsp :
1005 (unsigned long *)nctxt->kernel_sp;
1006 unsigned long cs_and_mask, rflags;
1008 if ( is_pv_32on64_domain(n->domain) )
1009 {
1010 unsigned int *esp = ring_1(regs) ?
1011 (unsigned int *)regs->rsp :
1012 (unsigned int *)nctxt->kernel_sp;
1013 unsigned int cs_and_mask, eflags;
1014 int ret = 0;
1016 /* CS longword also contains full evtchn_upcall_mask. */
1017 cs_and_mask = (unsigned short)regs->cs |
1018 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1019 /* Fold upcall mask into RFLAGS.IF. */
1020 eflags = regs->_eflags & ~X86_EFLAGS_IF;
1021 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1023 if ( !ring_1(regs) )
1024 {
1025 ret = put_user(regs->ss, esp-1);
1026 ret |= put_user(regs->_esp, esp-2);
1027 esp -= 2;
1028 }
1030 if ( ret |
1031 put_user(eflags, esp-1) |
1032 put_user(cs_and_mask, esp-2) |
1033 put_user(regs->_eip, esp-3) |
1034 put_user(nctxt->user_regs.gs, esp-4) |
1035 put_user(nctxt->user_regs.fs, esp-5) |
1036 put_user(nctxt->user_regs.es, esp-6) |
1037 put_user(nctxt->user_regs.ds, esp-7) )
1038 {
1039 gdprintk(XENLOG_ERR, "Error while creating compat "
1040 "failsafe callback frame.\n");
1041 domain_crash(n->domain);
1042 }
1044 if ( test_bit(_VGCF_failsafe_disables_events,
1045 &n->arch.guest_context.flags) )
1046 vcpu_info(n, evtchn_upcall_mask) = 1;
1048 regs->entry_vector = TRAP_syscall;
1049 regs->_eflags &= 0xFFFCBEFFUL;
1050 regs->ss = FLAT_COMPAT_KERNEL_SS;
1051 regs->_esp = (unsigned long)(esp-7);
1052 regs->cs = FLAT_COMPAT_KERNEL_CS;
1053 regs->_eip = nctxt->failsafe_callback_eip;
1054 return;
1055 }
1057 if ( !(n->arch.flags & TF_kernel_mode) )
1058 toggle_guest_mode(n);
1059 else
1060 regs->cs &= ~3;
1062 /* CS longword also contains full evtchn_upcall_mask. */
1063 cs_and_mask = (unsigned long)regs->cs |
1064 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1066 /* Fold upcall mask into RFLAGS.IF. */
1067 rflags = regs->rflags & ~X86_EFLAGS_IF;
1068 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1070 if ( put_user(regs->ss, rsp- 1) |
1071 put_user(regs->rsp, rsp- 2) |
1072 put_user(rflags, rsp- 3) |
1073 put_user(cs_and_mask, rsp- 4) |
1074 put_user(regs->rip, rsp- 5) |
1075 put_user(nctxt->user_regs.gs, rsp- 6) |
1076 put_user(nctxt->user_regs.fs, rsp- 7) |
1077 put_user(nctxt->user_regs.es, rsp- 8) |
1078 put_user(nctxt->user_regs.ds, rsp- 9) |
1079 put_user(regs->r11, rsp-10) |
1080 put_user(regs->rcx, rsp-11) )
1081 {
1082 gdprintk(XENLOG_ERR, "Error while creating failsafe "
1083 "callback frame.\n");
1084 domain_crash(n->domain);
1085 }
1087 if ( test_bit(_VGCF_failsafe_disables_events,
1088 &n->arch.guest_context.flags) )
1089 vcpu_info(n, evtchn_upcall_mask) = 1;
1091 regs->entry_vector = TRAP_syscall;
1092 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1093 X86_EFLAGS_NT|X86_EFLAGS_TF);
1094 regs->ss = FLAT_KERNEL_SS;
1095 regs->rsp = (unsigned long)(rsp-11);
1096 regs->cs = FLAT_KERNEL_CS;
1097 regs->rip = nctxt->failsafe_callback_eip;
1098 }
1099 }
1101 static void save_segments(struct vcpu *v)
1102 {
1103 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
1104 struct cpu_user_regs *regs = &ctxt->user_regs;
1105 unsigned int dirty_segment_mask = 0;
1107 regs->ds = read_segment_register(ds);
1108 regs->es = read_segment_register(es);
1109 regs->fs = read_segment_register(fs);
1110 regs->gs = read_segment_register(gs);
1112 if ( regs->ds )
1113 dirty_segment_mask |= DIRTY_DS;
1115 if ( regs->es )
1116 dirty_segment_mask |= DIRTY_ES;
1118 if ( regs->fs || is_pv_32on64_domain(v->domain) )
1119 {
1120 dirty_segment_mask |= DIRTY_FS;
1121 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
1122 }
1123 else if ( ctxt->fs_base )
1124 {
1125 dirty_segment_mask |= DIRTY_FS_BASE;
1126 }
1128 if ( regs->gs || is_pv_32on64_domain(v->domain) )
1129 {
1130 dirty_segment_mask |= DIRTY_GS;
1131 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
1132 }
1133 else if ( ctxt->gs_base_user )
1134 {
1135 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1136 }
1138 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1139 }
1141 #define switch_kernel_stack(v) ((void)0)
1143 #elif defined(__i386__)
1145 #define load_segments(n) ((void)0)
1146 #define save_segments(p) ((void)0)
1148 static inline void switch_kernel_stack(struct vcpu *v)
1149 {
1150 struct tss_struct *tss = &init_tss[smp_processor_id()];
1151 tss->esp1 = v->arch.guest_context.kernel_sp;
1152 tss->ss1 = v->arch.guest_context.kernel_ss;
1153 }
1155 #endif /* __i386__ */
1157 static void paravirt_ctxt_switch_from(struct vcpu *v)
1158 {
1159 save_segments(v);
1160 }
1162 static void paravirt_ctxt_switch_to(struct vcpu *v)
1163 {
1164 set_int80_direct_trap(v);
1165 switch_kernel_stack(v);
1166 }
1168 #define loaddebug(_v,_reg) \
1169 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
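/* Perform the real state switch: save the outgoing vcpu's registers and
 * segments, load the incoming vcpu's registers, debug registers, page base
 * and (if the vcpu slot changed) GDT, and update the dirty cpumasks. */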
1171 static void __context_switch(void)
1172 {
1173 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1174 unsigned int cpu = smp_processor_id();
1175 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1176 struct vcpu *n = current;
1178 ASSERT(p != n);
1179 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1181 if ( !is_idle_vcpu(p) )
1182 {
1183 memcpy(&p->arch.guest_context.user_regs,
1184 stack_regs,
1185 CTXT_SWITCH_STACK_BYTES);
1186 unlazy_fpu(p);
1187 p->arch.ctxt_switch_from(p);
1188 }
1190 if ( !is_idle_vcpu(n) )
1191 {
1192 memcpy(stack_regs,
1193 &n->arch.guest_context.user_regs,
1194 CTXT_SWITCH_STACK_BYTES);
1196 /* Maybe switch the debug registers. */
1197 if ( unlikely(n->arch.guest_context.debugreg[7]) )
1198 {
1199 loaddebug(&n->arch.guest_context, 0);
1200 loaddebug(&n->arch.guest_context, 1);
1201 loaddebug(&n->arch.guest_context, 2);
1202 loaddebug(&n->arch.guest_context, 3);
1203 /* no 4 and 5 */
1204 loaddebug(&n->arch.guest_context, 6);
1205 loaddebug(&n->arch.guest_context, 7);
1206 }
1207 n->arch.ctxt_switch_to(n);
1208 }
1210 if ( p->domain != n->domain )
1211 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1212 cpu_set(cpu, n->vcpu_dirty_cpumask);
1214 write_ptbase(n);
1216 if ( p->vcpu_id != n->vcpu_id )
1217 {
1218 char gdt_load[10];
1219 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1220 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1221 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
1222 }
1224 if ( p->domain != n->domain )
1225 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1226 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1228 per_cpu(curr_vcpu, cpu) = n;
1229 }
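/* Switch from 'prev' to 'next', skipping the heavy lifting when 'next' is
 * the idle vcpu or its state is already loaded on this CPU. */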
1232 void context_switch(struct vcpu *prev, struct vcpu *next)
1233 {
1234 unsigned int cpu = smp_processor_id();
1235 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1237 ASSERT(local_irq_is_enabled());
1239 /* Allow at most one CPU at a time to be dirty. */
1240 ASSERT(cpus_weight(dirty_mask) <= 1);
1241 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1242 {
1243 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1244 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1245 flush_tlb_mask(next->vcpu_dirty_cpumask);
1246 }
1248 local_irq_disable();
1250 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1251 pt_freeze_time(prev);
1253 set_current(next);
1255 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1256 {
1257 local_irq_enable();
1258 }
1259 else
1260 {
1261 __context_switch();
1263 #ifdef CONFIG_COMPAT
1264 if ( !is_hvm_vcpu(next) &&
1265 (is_idle_vcpu(prev) ||
1266 is_hvm_vcpu(prev) ||
1267 is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
1268 {
1269 uint64_t efer = read_efer();
1271 local_flush_tlb_one(GDT_VIRT_START(next) +
1272 FIRST_RESERVED_GDT_BYTE);
1274 if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) )
1275 write_efer(efer ^ EFER_SCE);
1276 }
1277 #endif
1279 /* Re-enable interrupts before restoring state which may fault. */
1280 local_irq_enable();
1282 if ( !is_hvm_vcpu(next) )
1283 {
1284 load_LDT(next);
1285 load_segments(next);
1286 }
1287 }
1289 context_saved(prev);
1291 /* Update per-VCPU guest runstate shared memory area (if registered). */
1292 if ( !guest_handle_is_null(runstate_guest(next)) )
1293 {
1294 if ( !is_pv_32on64_domain(next->domain) )
1295 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1296 #ifdef CONFIG_COMPAT
1297 else
1298 {
1299 struct compat_vcpu_runstate_info info;
1301 XLAT_vcpu_runstate_info(&info, &next->runstate);
1302 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1303 }
1304 #endif
1305 }
1307 schedule_tail(next);
1308 BUG();
1309 }
1311 void continue_running(struct vcpu *same)
1312 {
1313 schedule_tail(same);
1314 BUG();
1315 }
1317 int __sync_lazy_execstate(void)
1318 {
1319 unsigned long flags;
1320 int switch_required;
1322 local_irq_save(flags);
1324 switch_required = (this_cpu(curr_vcpu) != current);
1326 if ( switch_required )
1327 {
1328 ASSERT(current == idle_vcpu[smp_processor_id()]);
1329 __context_switch();
1330 }
1332 local_irq_restore(flags);
1334 return switch_required;
1335 }
1337 void sync_vcpu_execstate(struct vcpu *v)
1338 {
1339 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1340 (void)__sync_lazy_execstate();
1342 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1343 flush_tlb_mask(v->vcpu_dirty_cpumask);
1344 }
1346 #define next_arg(fmt, args) ({ \
1347 unsigned long __arg; \
1348 switch ( *(fmt)++ ) \
1349 { \
1350 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1351 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1352 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1353 default: __arg = 0; BUG(); \
1354 } \
1355 __arg; \
1356 })
1358 DEFINE_PER_CPU(char, hc_preempted);
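/* Arrange for the current hypercall to be restarted: either record the
 * arguments in the multicall state, or rewind the guest's program counter
 * and reload its argument registers. */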
1360 unsigned long hypercall_create_continuation(
1361 unsigned int op, const char *format, ...)
1362 {
1363 struct mc_state *mcs = &this_cpu(mc_state);
1364 struct cpu_user_regs *regs;
1365 const char *p = format;
1366 unsigned long arg;
1367 unsigned int i;
1368 va_list args;
1370 va_start(args, format);
1372 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1373 {
1374 __set_bit(_MCSF_call_preempted, &mcs->flags);
1376 for ( i = 0; *p != '\0'; i++ )
1377 mcs->call.args[i] = next_arg(p, args);
1378 if ( is_pv_32on64_domain(current->domain) )
1379 {
1380 for ( ; i < 6; i++ )
1381 mcs->call.args[i] = 0;
1382 }
1383 }
1384 else
1385 {
1386 regs = guest_cpu_user_regs();
1387 regs->eax = op;
1388 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1390 #ifdef __x86_64__
1391 if ( !is_hvm_vcpu(current) ?
1392 !is_pv_32on64_vcpu(current) :
1393 (hvm_guest_x86_mode(current) == 8) )
1394 {
1395 for ( i = 0; *p != '\0'; i++ )
1396 {
1397 arg = next_arg(p, args);
1398 switch ( i )
1399 {
1400 case 0: regs->rdi = arg; break;
1401 case 1: regs->rsi = arg; break;
1402 case 2: regs->rdx = arg; break;
1403 case 3: regs->r10 = arg; break;
1404 case 4: regs->r8 = arg; break;
1405 case 5: regs->r9 = arg; break;
1406 }
1407 }
1408 }
1409 else
1410 #endif
1411 {
1412 if ( supervisor_mode_kernel )
1413 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1415 for ( i = 0; *p != '\0'; i++ )
1416 {
1417 arg = next_arg(p, args);
1418 switch ( i )
1419 {
1420 case 0: regs->ebx = arg; break;
1421 case 1: regs->ecx = arg; break;
1422 case 2: regs->edx = arg; break;
1423 case 3: regs->esi = arg; break;
1424 case 4: regs->edi = arg; break;
1425 case 5: regs->ebp = arg; break;
1426 }
1427 }
1428 }
1430 this_cpu(hc_preempted) = 1;
1431 }
1433 va_end(args);
1435 return op;
1436 }
1438 #ifdef CONFIG_COMPAT
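/* Fix up a continuation made on behalf of a compat guest: for each argument
 * selected by 'mask', substitute the 32-bit value for the 64-bit one, and
 * report the current value of argument *id; returns the number of substitutions. */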
1439 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1440 {
1441 int rc = 0;
1442 struct mc_state *mcs = &this_cpu(mc_state);
1443 struct cpu_user_regs *regs;
1444 unsigned int i, cval = 0;
1445 unsigned long nval = 0;
1446 va_list args;
1448 BUG_ON(*id > 5);
1449 BUG_ON(mask & (1U << *id));
1451 va_start(args, mask);
1453 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1454 {
1455 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1456 return 0;
1457 for ( i = 0; i < 6; ++i, mask >>= 1 )
1458 {
1459 if ( mask & 1 )
1460 {
1461 nval = va_arg(args, unsigned long);
1462 cval = va_arg(args, unsigned int);
1463 if ( cval == nval )
1464 mask &= ~1U;
1465 else
1466 BUG_ON(nval == (unsigned int)nval);
1467 }
1468 else if ( id && *id == i )
1469 {
1470 *id = mcs->call.args[i];
1471 id = NULL;
1472 }
1473 if ( (mask & 1) && mcs->call.args[i] == nval )
1474 {
1475 mcs->call.args[i] = cval;
1476 ++rc;
1477 }
1478 else
1479 BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]);
1480 }
1481 }
1482 else
1483 {
1484 regs = guest_cpu_user_regs();
1485 for ( i = 0; i < 6; ++i, mask >>= 1 )
1486 {
1487 unsigned long *reg;
1489 switch ( i )
1490 {
1491 case 0: reg = &regs->ebx; break;
1492 case 1: reg = &regs->ecx; break;
1493 case 2: reg = &regs->edx; break;
1494 case 3: reg = &regs->esi; break;
1495 case 4: reg = &regs->edi; break;
1496 case 5: reg = &regs->ebp; break;
1497 default: BUG(); reg = NULL; break;
1498 }
1499 if ( (mask & 1) )
1500 {
1501 nval = va_arg(args, unsigned long);
1502 cval = va_arg(args, unsigned int);
1503 if ( cval == nval )
1504 mask &= ~1U;
1505 else
1506 BUG_ON(nval == (unsigned int)nval);
1507 }
1508 else if ( id && *id == i )
1509 {
1510 *id = *reg;
1511 id = NULL;
1512 }
1513 if ( (mask & 1) && *reg == nval )
1514 {
1515 *reg = cval;
1516 ++rc;
1517 }
1518 else
1519 BUG_ON(*reg != (unsigned int)*reg);
1520 }
1521 }
1523 va_end(args);
1525 return rc;
1526 }
1527 #endif
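/* Drop the domain's references to every page on 'list', forcibly
 * invalidating top-level page tables of the given type to break circular
 * 'linear page table' references. */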
1529 static void relinquish_memory(struct domain *d, struct list_head *list,
1530 unsigned long type)
1531 {
1532 struct list_head *ent;
1533 struct page_info *page;
1534 unsigned long x, y;
1536 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1537 spin_lock_recursive(&d->page_alloc_lock);
1539 ent = list->next;
1540 while ( ent != list )
1541 {
1542 page = list_entry(ent, struct page_info, list);
1544 /* Grab a reference to the page so it won't disappear from under us. */
1545 if ( unlikely(!get_page(page, d)) )
1546 {
1547 /* Couldn't get a reference -- someone is freeing this page. */
1548 ent = ent->next;
1549 continue;
1550 }
1552 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1553 put_page_and_type(page);
1555 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1556 put_page(page);
1558 /*
1559 * Forcibly invalidate top-most, still valid page tables at this point
1560 * to break circular 'linear page table' references. This is okay
1561 * because MMU structures are not shared across domains and this domain
1562 * is now dead. Thus top-most valid tables are not in use so a non-zero
1563 * count means circular reference.
1564 */
1565 y = page->u.inuse.type_info;
1566 for ( ; ; )
1567 {
1568 x = y;
1569 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1570 (type|PGT_validated)) )
1571 break;
1573 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1574 if ( likely(y == x) )
1575 {
1576 free_page_type(page, type);
1577 break;
1578 }
1579 }
1581 /* Follow the list chain and /then/ potentially free the page. */
1582 ent = ent->next;
1583 put_page(page);
1584 }
1586 spin_unlock_recursive(&d->page_alloc_lock);
1587 }
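/* Release the references held on the vcpu's top-level page tables (kernel
 * and, on x86-64, user), handling the 32-on-64 case where the real root
 * lives in slot 0 of the hidden L4. */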
1589 static void vcpu_destroy_pagetables(struct vcpu *v)
1590 {
1591 struct domain *d = v->domain;
1592 unsigned long pfn;
1594 #ifdef __x86_64__
1595 if ( is_pv_32on64_vcpu(v) )
1596 {
1597 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1598 __va(pagetable_get_paddr(v->arch.guest_table)));
1600 if ( pfn != 0 )
1601 {
1602 if ( paging_mode_refcounts(d) )
1603 put_page(mfn_to_page(pfn));
1604 else
1605 put_page_and_type(mfn_to_page(pfn));
1606 }
1608 l4e_write(
1609 (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
1610 l4e_empty());
1612 v->arch.cr3 = 0;
1613 return;
1614 }
1615 #endif
1617 pfn = pagetable_get_pfn(v->arch.guest_table);
1618 if ( pfn != 0 )
1619 {
1620 if ( paging_mode_refcounts(d) )
1621 put_page(mfn_to_page(pfn));
1622 else
1623 put_page_and_type(mfn_to_page(pfn));
1624 #ifdef __x86_64__
1625 if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) )
1626 v->arch.guest_table_user = pagetable_null();
1627 #endif
1628 v->arch.guest_table = pagetable_null();
1629 }
1631 #ifdef __x86_64__
1632 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1633 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1634 if ( pfn != 0 )
1635 {
1636 if ( paging_mode_refcounts(d) )
1637 put_page(mfn_to_page(pfn));
1638 else
1639 put_page_and_type(mfn_to_page(pfn));
1640 v->arch.guest_table_user = pagetable_null();
1641 }
1642 #endif
1644 v->arch.cr3 = 0;
1645 }
1647 void domain_relinquish_resources(struct domain *d)
1648 {
1649 struct vcpu *v;
1651 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1653 /* Drop the in-use references to page-table bases. */
1654 for_each_vcpu ( d, v )
1655 vcpu_destroy_pagetables(v);
1657 /* Tear down paging-assistance stuff. */
1658 paging_teardown(d);
1660 /*
1661 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1662 * it automatically gets squashed when the guest's mappings go away.
1663 */
1664 for_each_vcpu(d, v)
1665 destroy_gdt(v);
1667 /* Relinquish every page of memory. */
1668 #if CONFIG_PAGING_LEVELS >= 4
1669 relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1670 relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1671 #endif
1672 #if CONFIG_PAGING_LEVELS >= 3
1673 relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1674 relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1675 #endif
1676 relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1677 relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1679 /* Free page used by xen oprofile buffer. */
1680 free_xenoprof_pages(d);
1682 if ( is_hvm_domain(d) )
1683 hvm_domain_relinquish_resources(d);
1684 }
1686 void arch_dump_domain_info(struct domain *d)
1687 {
1688 paging_dump_domain_info(d);
1689 }
1691 void arch_dump_vcpu_info(struct vcpu *v)
1692 {
1693 paging_dump_vcpu_info(v);
1694 }
1696 /*
1697 * Local variables:
1698 * mode: C
1699 * c-set-style: "BSD"
1700 * c-basic-offset: 4
1701 * tab-width: 4
1702 * indent-tabs-mode: nil
1703 * End:
1704 */