direct-io.hg

view xen/arch/x86/domain.c @ 14353:b01d4f415f5f

x86: PAE linear page tables.

While full linear page table support makes little sense (and would be
more complicated to implement), partial linear page table support is
almost identical to that in non-PAE, and is used (at least) by NetWare.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Mon Mar 12 14:47:00 2007 +0000 (2007-03-12)
parents f3f5f2756d75
children 4b13fc910acf
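As background for the change described above: a "linear" page-table mapping is a page-directory (L2) entry that points at a page table of the same level, so the L1 page tables beneath that directory become readable through an ordinary virtual-address window. The sketch below is only a rough illustration of how a PAE PV guest might request such a mapping through the generic mmu_update hypercall; the slot number, the address/MFN parameters and the helper name are hypothetical and are not taken from this changeset.

/*
 * Hypothetical sketch (not part of this patch): ask Xen to point one L2
 * slot of a PAE guest's page directory at another L2 page table, making
 * that table's L1 page-table pages readable as ordinary data.
 * struct mmu_update, MMU_NORMAL_PT_UPDATE, DOMID_SELF and
 * HYPERVISOR_mmu_update come from the Xen public/guest headers.
 */
#define LINEAR_L2_SLOT 8 /* hypothetical spare slot in the page directory */

static int map_linear_l2(uint64_t l2_dir_maddr, unsigned long target_l2_mfn)
{
    struct mmu_update req;

    /* Machine address of the L2 entry to rewrite; low bits select the command. */
    req.ptr = (l2_dir_maddr + LINEAR_L2_SLOT * sizeof(uint64_t)) |
              MMU_NORMAL_PT_UPDATE;

    /* New PAE L2 entry referencing the target L2 table: present, accessed,
     * and (conservatively) not writable. */
    req.val = ((uint64_t)target_l2_mfn << 12) | _PAGE_PRESENT | _PAGE_ACCESSED;

    return HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}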
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <asm/regs.h>
32 #include <asm/mc146818rtc.h>
33 #include <asm/system.h>
34 #include <asm/io.h>
35 #include <asm/processor.h>
36 #include <asm/desc.h>
37 #include <asm/i387.h>
38 #include <asm/mpspec.h>
39 #include <asm/ldt.h>
40 #include <asm/paging.h>
41 #include <asm/hvm/hvm.h>
42 #include <asm/hvm/support.h>
43 #include <asm/msr.h>
44 #ifdef CONFIG_COMPAT
45 #include <compat/vcpu.h>
46 #endif
48 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
50 static void paravirt_ctxt_switch_from(struct vcpu *v);
51 static void paravirt_ctxt_switch_to(struct vcpu *v);
53 static void vcpu_destroy_pagetables(struct vcpu *v);
55 static void continue_idle_domain(struct vcpu *v)
56 {
57 reset_stack_and_jump(idle_loop);
58 }
60 static void continue_nonidle_domain(struct vcpu *v)
61 {
62 reset_stack_and_jump(ret_from_intr);
63 }
65 static void default_idle(void)
66 {
67 local_irq_disable();
68 if ( !softirq_pending(smp_processor_id()) )
69 safe_halt();
70 else
71 local_irq_enable();
72 }
74 void idle_loop(void)
75 {
76 for ( ; ; )
77 {
78 page_scrub_schedule_work();
79 default_idle();
80 do_softirq();
81 }
82 }
84 void startup_cpu_idle_loop(void)
85 {
86 struct vcpu *v = current;
88 ASSERT(is_idle_vcpu(v));
89 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
90 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
92 reset_stack_and_jump(idle_loop);
93 }
95 void dump_pageframe_info(struct domain *d)
96 {
97 struct page_info *page;
99 printk("Memory pages belonging to domain %u:\n", d->domain_id);
101 if ( d->tot_pages >= 10 )
102 {
103 printk(" DomPage list too long to display\n");
104 }
105 else
106 {
107 list_for_each_entry ( page, &d->page_list, list )
108 {
109 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
110 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
111 page->count_info, page->u.inuse.type_info);
112 }
113 }
115 list_for_each_entry ( page, &d->xenpage_list, list )
116 {
117 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
118 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
119 page->count_info, page->u.inuse.type_info);
120 }
121 }
123 struct vcpu *alloc_vcpu_struct(void)
124 {
125 struct vcpu *v;
126 if ( (v = xmalloc(struct vcpu)) != NULL )
127 memset(v, 0, sizeof(*v));
128 return v;
129 }
131 void free_vcpu_struct(struct vcpu *v)
132 {
133 xfree(v);
134 }
136 #ifdef CONFIG_COMPAT
138 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
139 {
140 struct domain *d = v->domain;
141 unsigned i;
142 struct page_info *pg;
144 if ( !d->arch.mm_arg_xlat_l3 )
145 {
146 pg = alloc_domheap_page(NULL);
147 if ( !pg )
148 return -ENOMEM;
149 d->arch.mm_arg_xlat_l3 = clear_page(page_to_virt(pg));
150 }
152 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
153 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
155 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
156 {
157 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
158 l2_pgentry_t *l2tab;
159 l1_pgentry_t *l1tab;
161 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
162 {
163 pg = alloc_domheap_page(NULL);
164 if ( !pg )
165 return -ENOMEM;
166 clear_page(page_to_virt(pg));
167 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
168 }
169 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
170 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
171 {
172 pg = alloc_domheap_page(NULL);
173 if ( !pg )
174 return -ENOMEM;
175 clear_page(page_to_virt(pg));
176 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
177 }
178 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
179 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
180 pg = alloc_domheap_page(NULL);
181 if ( !pg )
182 return -ENOMEM;
183 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
184 }
186 return 0;
187 }
189 static void release_arg_xlat_area(struct domain *d)
190 {
191 if ( d->arch.mm_arg_xlat_l3 )
192 {
193 unsigned l3;
195 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
196 {
197 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
198 {
199 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
200 unsigned l2;
202 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
203 {
204 if ( l2e_get_intpte(l2tab[l2]) )
205 {
206 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
207 unsigned l1;
209 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
210 {
211 if ( l1e_get_intpte(l1tab[l1]) )
212 free_domheap_page(l1e_get_page(l1tab[l1]));
213 }
214 free_domheap_page(l2e_get_page(l2tab[l2]));
215 }
216 }
217 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
218 }
219 }
220 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
221 }
222 }
224 static int setup_compat_l4(struct vcpu *v)
225 {
226 struct page_info *pg = alloc_domheap_page(NULL);
227 l4_pgentry_t *l4tab;
228 int rc;
230 if ( !pg )
231 return -ENOMEM;
233 /* This page needs to look like a pagetable so that it can be shadowed */
234 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
236 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
237 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
238 l4e_from_page(pg, __PAGE_HYPERVISOR);
239 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
240 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
241 v->arch.guest_table = pagetable_from_page(pg);
242 v->arch.guest_table_user = v->arch.guest_table;
244 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
245 {
246 free_domheap_page(pg);
247 return rc;
248 }
250 return 0;
251 }
253 static void release_compat_l4(struct vcpu *v)
254 {
255 free_domheap_page(pagetable_get_page(v->arch.guest_table));
256 v->arch.guest_table = pagetable_null();
257 v->arch.guest_table_user = pagetable_null();
258 }
260 static inline int may_switch_mode(struct domain *d)
261 {
262 return (d->tot_pages == 0);
263 }
265 int switch_native(struct domain *d)
266 {
267 l1_pgentry_t gdt_l1e;
268 unsigned int vcpuid;
270 if ( d == NULL )
271 return -EINVAL;
272 if ( !may_switch_mode(d) )
273 return -EACCES;
274 if ( !IS_COMPAT(d) )
275 return 0;
277 clear_bit(_DOMF_compat, &d->domain_flags);
278 release_arg_xlat_area(d);
280 /* switch gdt */
281 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
282 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
283 {
284 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
285 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
286 if (d->vcpu[vcpuid])
287 release_compat_l4(d->vcpu[vcpuid]);
288 }
290 d->arch.physaddr_bitsize = 64;
292 return 0;
293 }
295 int switch_compat(struct domain *d)
296 {
297 l1_pgentry_t gdt_l1e;
298 unsigned int vcpuid;
300 if ( d == NULL )
301 return -EINVAL;
302 if ( compat_disabled )
303 return -ENOSYS;
304 if ( !may_switch_mode(d) )
305 return -EACCES;
306 if ( IS_COMPAT(d) )
307 return 0;
309 set_bit(_DOMF_compat, &d->domain_flags);
311 /* switch gdt */
312 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
313 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
314 {
315 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
316 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
317 if (d->vcpu[vcpuid]
318 && setup_compat_l4(d->vcpu[vcpuid]) != 0)
319 return -ENOMEM;
320 }
322 d->arch.physaddr_bitsize =
323 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
324 + (PAGE_SIZE - 2);
326 return 0;
327 }
329 #else
330 #define release_arg_xlat_area(d) ((void)0)
331 #define setup_compat_l4(v) 0
332 #define release_compat_l4(v) ((void)0)
333 #endif
335 int vcpu_initialise(struct vcpu *v)
336 {
337 struct domain *d = v->domain;
338 int rc;
340 v->arch.flags = TF_kernel_mode;
342 pae_l3_cache_init(&v->arch.pae_l3_cache);
344 paging_vcpu_init(v);
346 if ( is_hvm_domain(d) )
347 {
348 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
349 return rc;
350 }
351 else
352 {
353 /* PV guests by default have a 100Hz ticker. */
354 v->periodic_period = MILLISECS(10);
356 /* PV guests get an emulated PIT too for video BIOSes to use. */
357 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
358 pit_init(v, cpu_khz);
360 v->arch.schedule_tail = continue_nonidle_domain;
361 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
362 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
364 if ( is_idle_domain(d) )
365 {
366 v->arch.schedule_tail = continue_idle_domain;
367 v->arch.cr3 = __pa(idle_pg_table);
368 }
369 }
371 v->arch.perdomain_ptes =
372 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
374 if ( IS_COMPAT(d) && (rc = setup_compat_l4(v)) != 0 )
375 return rc;
377 return 0;
378 }
380 void vcpu_destroy(struct vcpu *v)
381 {
382 if ( IS_COMPAT(v->domain) )
383 release_compat_l4(v);
384 }
386 int arch_domain_create(struct domain *d)
387 {
388 #ifdef __x86_64__
389 struct page_info *pg;
390 int i;
391 #endif
392 l1_pgentry_t gdt_l1e;
393 int vcpuid, pdpt_order;
394 int rc = -ENOMEM;
396 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
397 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
398 if ( d->arch.mm_perdomain_pt == NULL )
399 goto fail;
400 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
402 /*
403 * Map Xen segments into every VCPU's GDT, irrespective of whether every
404 * VCPU will actually be used. This avoids an NMI race during context
405 * switch: if we take an interrupt after switching CR3 but before switching
406 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
407 * try to load CS from an invalid table.
408 */
409 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
410 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
411 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
412 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
414 #if defined(__i386__)
416 mapcache_init(d);
418 #else /* __x86_64__ */
420 if ( (pg = alloc_domheap_page(NULL)) == NULL )
421 goto fail;
422 d->arch.mm_perdomain_l2 = clear_page(page_to_virt(pg));
423 for ( i = 0; i < (1 << pdpt_order); i++ )
424 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
425 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
426 __PAGE_HYPERVISOR);
428 if ( (pg = alloc_domheap_page(NULL)) == NULL )
429 goto fail;
430 d->arch.mm_perdomain_l3 = clear_page(page_to_virt(pg));
431 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
432 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
433 __PAGE_HYPERVISOR);
435 #endif /* __x86_64__ */
437 #ifdef CONFIG_COMPAT
438 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
439 #endif
441 paging_domain_init(d);
443 if ( !is_idle_domain(d) )
444 {
445 d->arch.ioport_caps =
446 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
447 if ( d->arch.ioport_caps == NULL )
448 goto fail;
450 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
451 goto fail;
453 memset(d->shared_info, 0, PAGE_SIZE);
454 share_xen_page_with_guest(
455 virt_to_page(d->shared_info), d, XENSHARE_writable);
456 }
458 return is_hvm_domain(d) ? hvm_domain_initialise(d) : 0;
460 fail:
461 free_xenheap_page(d->shared_info);
462 #ifdef __x86_64__
463 if ( d->arch.mm_perdomain_l2 )
464 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
465 if ( d->arch.mm_perdomain_l3 )
466 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
467 #endif
468 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
469 return rc;
470 }
472 void arch_domain_destroy(struct domain *d)
473 {
474 struct vcpu *v;
476 if ( is_hvm_domain(d) )
477 {
478 for_each_vcpu ( d, v )
479 hvm_vcpu_destroy(v);
480 hvm_domain_destroy(d);
481 }
483 paging_final_teardown(d);
485 free_xenheap_pages(
486 d->arch.mm_perdomain_pt,
487 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
489 #ifdef __x86_64__
490 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
491 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
492 #endif
494 if ( IS_COMPAT(d) )
495 release_arg_xlat_area(d);
497 free_xenheap_page(d->shared_info);
498 }
500 /* This is called by arch_final_setup_guest and do_boot_vcpu */
501 int arch_set_info_guest(
502 struct vcpu *v, vcpu_guest_context_u c)
503 {
504 struct domain *d = v->domain;
505 unsigned long cr3_pfn = INVALID_MFN;
506 unsigned long flags;
507 int i, rc = 0, compat;
509 /* The context is a compat-mode one if the target domain is compat-mode;
510 * we expect the tools to DTRT even in compat-mode callers. */
511 compat = IS_COMPAT(d);
513 #ifdef CONFIG_COMPAT
514 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
515 #else
516 #define c(fld) (c.nat->fld)
517 #endif
518 flags = c(flags);
520 if ( !is_hvm_vcpu(v) )
521 {
522 if ( !compat )
523 {
524 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
525 fixup_guest_stack_selector(d, c.nat->kernel_ss);
526 fixup_guest_code_selector(d, c.nat->user_regs.cs);
527 #ifdef __i386__
528 fixup_guest_code_selector(d, c.nat->event_callback_cs);
529 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
530 #endif
532 for ( i = 0; i < 256; i++ )
533 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
535 /* LDT safety checks. */
536 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
537 (c.nat->ldt_ents > 8192) ||
538 !array_access_ok(c.nat->ldt_base,
539 c.nat->ldt_ents,
540 LDT_ENTRY_SIZE) )
541 return -EINVAL;
542 }
543 #ifdef CONFIG_COMPAT
544 else
545 {
546 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
547 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
548 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
549 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
550 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
552 for ( i = 0; i < 256; i++ )
553 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
555 /* LDT safety checks. */
556 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
557 (c.cmp->ldt_ents > 8192) ||
558 !compat_array_access_ok(c.cmp->ldt_base,
559 c.cmp->ldt_ents,
560 LDT_ENTRY_SIZE) )
561 return -EINVAL;
562 }
563 #endif
564 }
566 clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
567 if ( flags & VGCF_I387_VALID )
568 set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
570 v->arch.flags &= ~TF_kernel_mode;
571 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
572 v->arch.flags |= TF_kernel_mode;
574 if ( !compat )
575 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
576 #ifdef CONFIG_COMPAT
577 else
578 {
579 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
580 }
581 #endif
583 /* Only CR0.TS is modifiable by guest or admin. */
584 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
585 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
587 init_int80_direct_trap(v);
589 if ( !is_hvm_vcpu(v) )
590 {
591 /* IOPL privileges are virtualised. */
592 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
593 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
595 /* Ensure real hardware interrupts are enabled. */
596 v->arch.guest_context.user_regs.eflags |= EF_IE;
597 }
598 else
599 {
600 hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
601 }
603 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
604 goto out;
606 memset(v->arch.guest_context.debugreg, 0,
607 sizeof(v->arch.guest_context.debugreg));
608 for ( i = 0; i < 8; i++ )
609 (void)set_debugreg(v, i, c(debugreg[i]));
611 if ( v->vcpu_id == 0 )
612 d->vm_assist = c(vm_assist);
614 if ( !is_hvm_vcpu(v) )
615 {
616 if ( !compat )
617 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
618 #ifdef CONFIG_COMPAT
619 else
620 {
621 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
622 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
624 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
625 return -EINVAL;
626 for ( i = 0; i < n; ++i )
627 gdt_frames[i] = c.cmp->gdt_frames[i];
628 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
629 }
630 #endif
631 if ( rc != 0 )
632 return rc;
634 if ( !compat )
635 {
636 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
638 if ( !mfn_valid(cr3_pfn) ||
639 (paging_mode_refcounts(d)
640 ? !get_page(mfn_to_page(cr3_pfn), d)
641 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
642 PGT_base_page_table)) )
643 {
644 destroy_gdt(v);
645 return -EINVAL;
646 }
648 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
650 #ifdef __x86_64__
651 if ( c.nat->ctrlreg[1] )
652 {
653 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
655 if ( !mfn_valid(cr3_pfn) ||
656 (paging_mode_refcounts(d)
657 ? !get_page(mfn_to_page(cr3_pfn), d)
658 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
659 PGT_base_page_table)) )
660 {
661 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
662 v->arch.guest_table = pagetable_null();
663 if ( paging_mode_refcounts(d) )
664 put_page(mfn_to_page(cr3_pfn));
665 else
666 put_page_and_type(mfn_to_page(cr3_pfn));
667 destroy_gdt(v);
668 return -EINVAL;
669 }
671 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
672 }
673 #endif
674 }
675 #ifdef CONFIG_COMPAT
676 else
677 {
678 l4_pgentry_t *l4tab;
680 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
682 if ( !mfn_valid(cr3_pfn) ||
683 (paging_mode_refcounts(d)
684 ? !get_page(mfn_to_page(cr3_pfn), d)
685 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
686 PGT_l3_page_table)) )
687 {
688 destroy_gdt(v);
689 return -EINVAL;
690 }
692 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
693 *l4tab = l4e_from_pfn(cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
694 }
695 #endif
696 }
698 if ( v->vcpu_id == 0 )
699 update_domain_wallclock_time(d);
701 /* Don't redo final setup */
702 set_bit(_VCPUF_initialised, &v->vcpu_flags);
704 if ( paging_mode_enabled(d) )
705 paging_update_paging_modes(v);
707 update_cr3(v);
709 out:
710 if ( flags & VGCF_online )
711 clear_bit(_VCPUF_down, &v->vcpu_flags);
712 else
713 set_bit(_VCPUF_down, &v->vcpu_flags);
714 return 0;
715 #undef c
716 }
718 int arch_vcpu_reset(struct vcpu *v)
719 {
720 destroy_gdt(v);
721 vcpu_destroy_pagetables(v);
722 return 0;
723 }
725 long
726 arch_do_vcpu_op(
727 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
728 {
729 long rc = 0;
731 switch ( cmd )
732 {
733 case VCPUOP_register_runstate_memory_area:
734 {
735 struct vcpu_register_runstate_memory_area area;
736 struct vcpu_runstate_info runstate;
738 rc = -EFAULT;
739 if ( copy_from_guest(&area, arg, 1) )
740 break;
742 if ( !guest_handle_okay(area.addr.h, 1) )
743 break;
745 rc = 0;
746 runstate_guest(v) = area.addr.h;
748 if ( v == current )
749 {
750 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
751 }
752 else
753 {
754 vcpu_runstate_get(v, &runstate);
755 __copy_to_guest(runstate_guest(v), &runstate, 1);
756 }
758 break;
759 }
761 default:
762 rc = -ENOSYS;
763 break;
764 }
766 return rc;
767 }
769 #ifdef __x86_64__
771 #define loadsegment(seg,value) ({ \
772 int __r = 1; \
773 __asm__ __volatile__ ( \
774 "1: movl %k1,%%" #seg "\n2:\n" \
775 ".section .fixup,\"ax\"\n" \
776 "3: xorl %k0,%k0\n" \
777 " movl %k0,%%" #seg "\n" \
778 " jmp 2b\n" \
779 ".previous\n" \
780 ".section __ex_table,\"a\"\n" \
781 " .align 8\n" \
782 " .quad 1b,3b\n" \
783 ".previous" \
784 : "=r" (__r) : "r" (value), "0" (__r) );\
785 __r; })
787 /*
788 * save_segments() writes a mask of segments which are dirty (non-zero),
789 * allowing load_segments() to avoid some expensive segment loads and
790 * MSR writes.
791 */
792 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
793 #define DIRTY_DS 0x01
794 #define DIRTY_ES 0x02
795 #define DIRTY_FS 0x04
796 #define DIRTY_GS 0x08
797 #define DIRTY_FS_BASE 0x10
798 #define DIRTY_GS_BASE_USER 0x20
800 static void load_segments(struct vcpu *n)
801 {
802 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
803 int all_segs_okay = 1;
804 unsigned int dirty_segment_mask, cpu = smp_processor_id();
806 /* Load and clear the dirty segment mask. */
807 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
808 per_cpu(dirty_segment_mask, cpu) = 0;
810 /* Either selector != 0 ==> reload. */
811 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
812 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
814 /* Either selector != 0 ==> reload. */
815 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
816 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
818 /*
819 * Either selector != 0 ==> reload.
820 * Also reload to reset FS_BASE if it was non-zero.
821 */
822 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
823 nctxt->user_regs.fs) )
824 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
826 /*
827 * Either selector != 0 ==> reload.
828 * Also reload to reset GS_BASE if it was non-zero.
829 */
830 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
831 nctxt->user_regs.gs) )
832 {
833 /* Reset GS_BASE with user %gs? */
834 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
835 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
836 }
838 if ( !IS_COMPAT(n->domain) )
839 {
840 /* This can only be non-zero if selector is NULL. */
841 if ( nctxt->fs_base )
842 wrmsr(MSR_FS_BASE,
843 nctxt->fs_base,
844 nctxt->fs_base>>32);
846 /* Most kernels have non-zero GS base, so don't bother testing. */
847 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
848 wrmsr(MSR_SHADOW_GS_BASE,
849 nctxt->gs_base_kernel,
850 nctxt->gs_base_kernel>>32);
852 /* This can only be non-zero if selector is NULL. */
853 if ( nctxt->gs_base_user )
854 wrmsr(MSR_GS_BASE,
855 nctxt->gs_base_user,
856 nctxt->gs_base_user>>32);
858 /* If in kernel mode then switch the GS bases around. */
859 if ( (n->arch.flags & TF_kernel_mode) )
860 __asm__ __volatile__ ( "swapgs" );
861 }
863 if ( unlikely(!all_segs_okay) )
864 {
865 struct cpu_user_regs *regs = guest_cpu_user_regs();
866 unsigned long *rsp =
867 (n->arch.flags & TF_kernel_mode) ?
868 (unsigned long *)regs->rsp :
869 (unsigned long *)nctxt->kernel_sp;
870 unsigned long cs_and_mask, rflags;
872 if ( IS_COMPAT(n->domain) )
873 {
874 unsigned int *esp = ring_1(regs) ?
875 (unsigned int *)regs->rsp :
876 (unsigned int *)nctxt->kernel_sp;
877 unsigned int cs_and_mask, eflags;
878 int ret = 0;
880 /* CS longword also contains full evtchn_upcall_mask. */
881 cs_and_mask = (unsigned short)regs->cs |
882 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
883 /* Fold upcall mask into RFLAGS.IF. */
884 eflags = regs->_eflags & ~X86_EFLAGS_IF;
885 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
887 if ( !ring_1(regs) )
888 {
889 ret = put_user(regs->ss, esp-1);
890 ret |= put_user(regs->_esp, esp-2);
891 esp -= 2;
892 }
894 if ( ret |
895 put_user(eflags, esp-1) |
896 put_user(cs_and_mask, esp-2) |
897 put_user(regs->_eip, esp-3) |
898 put_user(nctxt->user_regs.gs, esp-4) |
899 put_user(nctxt->user_regs.fs, esp-5) |
900 put_user(nctxt->user_regs.es, esp-6) |
901 put_user(nctxt->user_regs.ds, esp-7) )
902 {
903 gdprintk(XENLOG_ERR, "Error while creating compat "
904 "failsafe callback frame.\n");
905 domain_crash(n->domain);
906 }
908 if ( test_bit(_VGCF_failsafe_disables_events,
909 &n->arch.guest_context.flags) )
910 vcpu_info(n, evtchn_upcall_mask) = 1;
912 regs->entry_vector = TRAP_syscall;
913 regs->_eflags &= 0xFFFCBEFFUL;
914 regs->ss = FLAT_COMPAT_KERNEL_SS;
915 regs->_esp = (unsigned long)(esp-7);
916 regs->cs = FLAT_COMPAT_KERNEL_CS;
917 regs->_eip = nctxt->failsafe_callback_eip;
918 return;
919 }
921 if ( !(n->arch.flags & TF_kernel_mode) )
922 toggle_guest_mode(n);
923 else
924 regs->cs &= ~3;
926 /* CS longword also contains full evtchn_upcall_mask. */
927 cs_and_mask = (unsigned long)regs->cs |
928 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
930 /* Fold upcall mask into RFLAGS.IF. */
931 rflags = regs->rflags & ~X86_EFLAGS_IF;
932 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
934 if ( put_user(regs->ss, rsp- 1) |
935 put_user(regs->rsp, rsp- 2) |
936 put_user(rflags, rsp- 3) |
937 put_user(cs_and_mask, rsp- 4) |
938 put_user(regs->rip, rsp- 5) |
939 put_user(nctxt->user_regs.gs, rsp- 6) |
940 put_user(nctxt->user_regs.fs, rsp- 7) |
941 put_user(nctxt->user_regs.es, rsp- 8) |
942 put_user(nctxt->user_regs.ds, rsp- 9) |
943 put_user(regs->r11, rsp-10) |
944 put_user(regs->rcx, rsp-11) )
945 {
946 gdprintk(XENLOG_ERR, "Error while creating failsafe "
947 "callback frame.\n");
948 domain_crash(n->domain);
949 }
951 if ( test_bit(_VGCF_failsafe_disables_events,
952 &n->arch.guest_context.flags) )
953 vcpu_info(n, evtchn_upcall_mask) = 1;
955 regs->entry_vector = TRAP_syscall;
956 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
957 X86_EFLAGS_NT|X86_EFLAGS_TF);
958 regs->ss = FLAT_KERNEL_SS;
959 regs->rsp = (unsigned long)(rsp-11);
960 regs->cs = FLAT_KERNEL_CS;
961 regs->rip = nctxt->failsafe_callback_eip;
962 }
963 }
965 static void save_segments(struct vcpu *v)
966 {
967 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
968 struct cpu_user_regs *regs = &ctxt->user_regs;
969 unsigned int dirty_segment_mask = 0;
971 regs->ds = read_segment_register(ds);
972 regs->es = read_segment_register(es);
973 regs->fs = read_segment_register(fs);
974 regs->gs = read_segment_register(gs);
976 if ( regs->ds )
977 dirty_segment_mask |= DIRTY_DS;
979 if ( regs->es )
980 dirty_segment_mask |= DIRTY_ES;
982 if ( regs->fs || IS_COMPAT(v->domain) )
983 {
984 dirty_segment_mask |= DIRTY_FS;
985 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
986 }
987 else if ( ctxt->fs_base )
988 {
989 dirty_segment_mask |= DIRTY_FS_BASE;
990 }
992 if ( regs->gs || IS_COMPAT(v->domain) )
993 {
994 dirty_segment_mask |= DIRTY_GS;
995 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
996 }
997 else if ( ctxt->gs_base_user )
998 {
999 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1000 }
1002 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1003 }
1005 #define switch_kernel_stack(v) ((void)0)
1007 #elif defined(__i386__)
1009 #define load_segments(n) ((void)0)
1010 #define save_segments(p) ((void)0)
1012 static inline void switch_kernel_stack(struct vcpu *v)
1013 {
1014 struct tss_struct *tss = &init_tss[smp_processor_id()];
1015 tss->esp1 = v->arch.guest_context.kernel_sp;
1016 tss->ss1 = v->arch.guest_context.kernel_ss;
1017 }
1019 #endif /* __i386__ */
1021 static void paravirt_ctxt_switch_from(struct vcpu *v)
1022 {
1023 save_segments(v);
1024 }
1026 static void paravirt_ctxt_switch_to(struct vcpu *v)
1027 {
1028 set_int80_direct_trap(v);
1029 switch_kernel_stack(v);
1030 }
1032 #define loaddebug(_v,_reg) \
1033 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
1035 static void __context_switch(void)
1036 {
1037 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1038 unsigned int cpu = smp_processor_id();
1039 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1040 struct vcpu *n = current;
1042 ASSERT(p != n);
1043 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1045 if ( !is_idle_vcpu(p) )
1046 {
1047 memcpy(&p->arch.guest_context.user_regs,
1048 stack_regs,
1049 CTXT_SWITCH_STACK_BYTES);
1050 unlazy_fpu(p);
1051 p->arch.ctxt_switch_from(p);
1052 }
1054 if ( !is_idle_vcpu(n) )
1055 {
1056 memcpy(stack_regs,
1057 &n->arch.guest_context.user_regs,
1058 CTXT_SWITCH_STACK_BYTES);
1060 /* Maybe switch the debug registers. */
1061 if ( unlikely(n->arch.guest_context.debugreg[7]) )
1062 {
1063 loaddebug(&n->arch.guest_context, 0);
1064 loaddebug(&n->arch.guest_context, 1);
1065 loaddebug(&n->arch.guest_context, 2);
1066 loaddebug(&n->arch.guest_context, 3);
1067 /* no 4 and 5 */
1068 loaddebug(&n->arch.guest_context, 6);
1069 loaddebug(&n->arch.guest_context, 7);
1070 }
1071 n->arch.ctxt_switch_to(n);
1072 }
1074 if ( p->domain != n->domain )
1075 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1076 cpu_set(cpu, n->vcpu_dirty_cpumask);
1078 write_ptbase(n);
1080 if ( p->vcpu_id != n->vcpu_id )
1081 {
1082 char gdt_load[10];
1083 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1084 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1085 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
1086 }
1088 if ( p->domain != n->domain )
1089 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1090 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1092 per_cpu(curr_vcpu, cpu) = n;
1093 }
1096 void context_switch(struct vcpu *prev, struct vcpu *next)
1097 {
1098 unsigned int cpu = smp_processor_id();
1099 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1101 ASSERT(local_irq_is_enabled());
1103 /* Allow at most one CPU at a time to be dirty. */
1104 ASSERT(cpus_weight(dirty_mask) <= 1);
1105 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1106 {
1107 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1108 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1109 flush_tlb_mask(next->vcpu_dirty_cpumask);
1110 }
1112 local_irq_disable();
1114 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1115 pt_freeze_time(prev);
1117 set_current(next);
1119 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1120 {
1121 local_irq_enable();
1122 }
1123 else
1124 {
1125 __context_switch();
1127 #ifdef CONFIG_COMPAT
1128 if ( is_idle_vcpu(prev)
1129 || IS_COMPAT(prev->domain) != IS_COMPAT(next->domain) )
1130 {
1131 uint32_t efer_lo, efer_hi;
1133 local_flush_tlb_one(GDT_VIRT_START(next) + FIRST_RESERVED_GDT_BYTE);
1135 rdmsr(MSR_EFER, efer_lo, efer_hi);
1136 if ( !IS_COMPAT(next->domain) == !(efer_lo & EFER_SCE) )
1137 {
1138 efer_lo ^= EFER_SCE;
1139 wrmsr(MSR_EFER, efer_lo, efer_hi);
1140 }
1141 }
1142 #endif
1144 /* Re-enable interrupts before restoring state which may fault. */
1145 local_irq_enable();
1147 if ( !is_hvm_vcpu(next) )
1148 {
1149 load_LDT(next);
1150 load_segments(next);
1151 }
1152 }
1154 context_saved(prev);
1156 /* Update per-VCPU guest runstate shared memory area (if registered). */
1157 if ( !guest_handle_is_null(runstate_guest(next)) )
1158 {
1159 if ( !IS_COMPAT(next->domain) )
1160 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1161 #ifdef CONFIG_COMPAT
1162 else
1163 {
1164 struct compat_vcpu_runstate_info info;
1166 XLAT_vcpu_runstate_info(&info, &next->runstate);
1167 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1168 }
1169 #endif
1170 }
1172 schedule_tail(next);
1173 BUG();
1174 }
1176 void continue_running(struct vcpu *same)
1177 {
1178 schedule_tail(same);
1179 BUG();
1180 }
1182 int __sync_lazy_execstate(void)
1183 {
1184 unsigned long flags;
1185 int switch_required;
1187 local_irq_save(flags);
1189 switch_required = (this_cpu(curr_vcpu) != current);
1191 if ( switch_required )
1192 {
1193 ASSERT(current == idle_vcpu[smp_processor_id()]);
1194 __context_switch();
1195 }
1197 local_irq_restore(flags);
1199 return switch_required;
1200 }
1202 void sync_vcpu_execstate(struct vcpu *v)
1203 {
1204 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1205 (void)__sync_lazy_execstate();
1207 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1208 flush_tlb_mask(v->vcpu_dirty_cpumask);
1209 }
1211 #define next_arg(fmt, args) ({ \
1212 unsigned long __arg; \
1213 switch ( *(fmt)++ ) \
1214 { \
1215 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1216 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1217 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1218 default: __arg = 0; BUG(); \
1219 } \
1220 __arg; \
1221 })
1223 unsigned long hypercall_create_continuation(
1224 unsigned int op, const char *format, ...)
1225 {
1226 struct mc_state *mcs = &this_cpu(mc_state);
1227 struct cpu_user_regs *regs;
1228 const char *p = format;
1229 unsigned long arg;
1230 unsigned int i;
1231 va_list args;
1233 va_start(args, format);
1235 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1236 {
1237 __set_bit(_MCSF_call_preempted, &mcs->flags);
1239 for ( i = 0; *p != '\0'; i++ )
1240 mcs->call.args[i] = next_arg(p, args);
1241 if ( IS_COMPAT(current->domain) )
1242 {
1243 for ( ; i < 6; i++ )
1244 mcs->call.args[i] = 0;
1245 }
1246 }
1247 else
1248 {
1249 regs = guest_cpu_user_regs();
1250 regs->eax = op;
1251 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1253 #ifdef __x86_64__
1254 if ( !IS_COMPAT(current->domain) )
1255 {
1256 for ( i = 0; *p != '\0'; i++ )
1257 {
1258 arg = next_arg(p, args);
1259 switch ( i )
1260 {
1261 case 0: regs->rdi = arg; break;
1262 case 1: regs->rsi = arg; break;
1263 case 2: regs->rdx = arg; break;
1264 case 3: regs->r10 = arg; break;
1265 case 4: regs->r8 = arg; break;
1266 case 5: regs->r9 = arg; break;
1267 }
1268 }
1269 }
1270 else
1271 #endif
1272 {
1273 if ( supervisor_mode_kernel || is_hvm_vcpu(current) )
1274 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1276 for ( i = 0; *p != '\0'; i++ )
1277 {
1278 arg = next_arg(p, args);
1279 switch ( i )
1280 {
1281 case 0: regs->ebx = arg; break;
1282 case 1: regs->ecx = arg; break;
1283 case 2: regs->edx = arg; break;
1284 case 3: regs->esi = arg; break;
1285 case 4: regs->edi = arg; break;
1286 case 5: regs->ebp = arg; break;
1287 }
1288 }
1289 }
1290 }
1292 va_end(args);
1294 return op;
1295 }
1297 #ifdef CONFIG_COMPAT
1298 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1299 {
1300 int rc = 0;
1301 struct mc_state *mcs = &this_cpu(mc_state);
1302 struct cpu_user_regs *regs;
1303 unsigned int i, cval = 0;
1304 unsigned long nval = 0;
1305 va_list args;
1307 BUG_ON(*id > 5);
1308 BUG_ON(mask & (1U << *id));
1310 va_start(args, mask);
1312 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1313 {
1314 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1315 return 0;
1316 for ( i = 0; i < 6; ++i, mask >>= 1 )
1317 {
1318 if ( mask & 1 )
1319 {
1320 nval = va_arg(args, unsigned long);
1321 cval = va_arg(args, unsigned int);
1322 if ( cval == nval )
1323 mask &= ~1U;
1324 else
1325 BUG_ON(nval == (unsigned int)nval);
1326 }
1327 else if ( id && *id == i )
1328 {
1329 *id = mcs->call.args[i];
1330 id = NULL;
1331 }
1332 if ( (mask & 1) && mcs->call.args[i] == nval )
1333 ++rc;
1334 else
1335 {
1336 cval = mcs->call.args[i];
1337 BUG_ON(mcs->call.args[i] != cval);
1338 }
1339 mcs->compat_call.args[i] = cval;
1340 }
1341 }
1342 else
1343 {
1344 regs = guest_cpu_user_regs();
1345 for ( i = 0; i < 6; ++i, mask >>= 1 )
1346 {
1347 unsigned long *reg;
1349 switch ( i )
1350 {
1351 case 0: reg = &regs->ebx; break;
1352 case 1: reg = &regs->ecx; break;
1353 case 2: reg = &regs->edx; break;
1354 case 3: reg = &regs->esi; break;
1355 case 4: reg = &regs->edi; break;
1356 case 5: reg = &regs->ebp; break;
1357 default: BUG(); reg = NULL; break;
1358 }
1359 if ( (mask & 1) )
1360 {
1361 nval = va_arg(args, unsigned long);
1362 cval = va_arg(args, unsigned int);
1363 if ( cval == nval )
1364 mask &= ~1U;
1365 else
1366 BUG_ON(nval == (unsigned int)nval);
1367 }
1368 else if ( id && *id == i )
1369 {
1370 *id = *reg;
1371 id = NULL;
1372 }
1373 if ( (mask & 1) && *reg == nval )
1374 {
1375 *reg = cval;
1376 ++rc;
1377 }
1378 else
1379 BUG_ON(*reg != (unsigned int)*reg);
1380 }
1381 }
1383 va_end(args);
1385 return rc;
1386 }
1387 #endif
1389 static void relinquish_memory(struct domain *d, struct list_head *list,
1390 unsigned long type)
1391 {
1392 struct list_head *ent;
1393 struct page_info *page;
1394 unsigned long x, y;
1396 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1397 spin_lock_recursive(&d->page_alloc_lock);
1399 ent = list->next;
1400 while ( ent != list )
1401 {
1402 page = list_entry(ent, struct page_info, list);
1404 /* Grab a reference to the page so it won't disappear from under us. */
1405 if ( unlikely(!get_page(page, d)) )
1406 {
1407 /* Couldn't get a reference -- someone is freeing this page. */
1408 ent = ent->next;
1409 continue;
1410 }
1412 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1413 put_page_and_type(page);
1415 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1416 put_page(page);
1418 /*
1419 * Forcibly invalidate top-most, still valid page tables at this point
1420 * to break circular 'linear page table' references. This is okay
1421 * because MMU structures are not shared across domains and this domain
1422 * is now dead. Thus top-most valid tables are not in use so a non-zero
1423 * count means circular reference.
1424 */
1425 y = page->u.inuse.type_info;
1426 for ( ; ; )
1427 {
1428 x = y;
1429 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1430 (type|PGT_validated)) )
1431 break;
1433 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1434 if ( likely(y == x) )
1435 {
1436 free_page_type(page, type);
1437 break;
1438 }
1439 }
1441 /* Follow the list chain and /then/ potentially free the page. */
1442 ent = ent->next;
1443 put_page(page);
1444 }
1446 spin_unlock_recursive(&d->page_alloc_lock);
1447 }
1449 static void vcpu_destroy_pagetables(struct vcpu *v)
1450 {
1451 struct domain *d = v->domain;
1452 unsigned long pfn;
1454 #ifdef CONFIG_COMPAT
1455 if ( IS_COMPAT(d) )
1456 {
1457 if ( is_hvm_vcpu(v) )
1458 pfn = pagetable_get_pfn(v->arch.guest_table);
1459 else
1460 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1461 __va(pagetable_get_paddr(v->arch.guest_table)));
1463 if ( pfn != 0 )
1464 {
1465 if ( paging_mode_refcounts(d) )
1466 put_page(mfn_to_page(pfn));
1467 else
1468 put_page_and_type(mfn_to_page(pfn));
1469 }
1471 if ( is_hvm_vcpu(v) )
1472 v->arch.guest_table = pagetable_null();
1473 else
1474 l4e_write(
1475 (l4_pgentry_t *) __va(pagetable_get_paddr(v->arch.guest_table)),
1476 l4e_empty());
1478 v->arch.cr3 = 0;
1479 return;
1480 }
1481 #endif
1483 pfn = pagetable_get_pfn(v->arch.guest_table);
1484 if ( pfn != 0 )
1485 {
1486 if ( paging_mode_refcounts(d) )
1487 put_page(mfn_to_page(pfn));
1488 else
1489 put_page_and_type(mfn_to_page(pfn));
1490 #ifdef __x86_64__
1491 if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) )
1492 v->arch.guest_table_user = pagetable_null();
1493 #endif
1494 v->arch.guest_table = pagetable_null();
1495 }
1497 #ifdef __x86_64__
1498 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1499 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1500 if ( pfn != 0 )
1501 {
1502 if ( paging_mode_refcounts(d) )
1503 put_page(mfn_to_page(pfn));
1504 else
1505 put_page_and_type(mfn_to_page(pfn));
1506 v->arch.guest_table_user = pagetable_null();
1507 }
1508 #endif
1510 v->arch.cr3 = 0;
1511 }
1513 void domain_relinquish_resources(struct domain *d)
1514 {
1515 struct vcpu *v;
1517 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1519 /* Drop the in-use references to page-table bases. */
1520 for_each_vcpu ( d, v )
1521 vcpu_destroy_pagetables(v);
1523 /* Tear down paging-assistance stuff. */
1524 paging_teardown(d);
1526 /*
1527 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1528 * it automatically gets squashed when the guest's mappings go away.
1529 */
1530 for_each_vcpu(d, v)
1531 destroy_gdt(v);
1533 /* Relinquish every page of memory. */
1534 #if CONFIG_PAGING_LEVELS >= 4
1535 relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1536 relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1537 #endif
1538 #if CONFIG_PAGING_LEVELS >= 3
1539 relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1540 relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1541 #endif
1542 relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1543 relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1545 /* Free page used by xen oprofile buffer */
1546 free_xenoprof_pages(d);
1547 }
1549 void arch_dump_domain_info(struct domain *d)
1550 {
1551 paging_dump_domain_info(d);
1552 }
1554 void arch_dump_vcpu_info(struct vcpu *v)
1555 {
1556 paging_dump_vcpu_info(v);
1557 }
1559 /*
1560 * Local variables:
1561 * mode: C
1562 * c-set-style: "BSD"
1563 * c-basic-offset: 4
1564 * tab-width: 4
1565 * indent-tabs-mode: nil
1566 * End:
1567 */