ia64/xen-unstable

view xen/arch/x86/domain.c @ 14861:126f8bb9aa5d

xen x86: Fix PV guest destruction.
Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
author kfraser@localhost.localdomain
date Mon Apr 16 11:36:40 2007 +0100 (2007-04-16)
parents ba8d4bc2435a
children f18e1ca69380
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/domain.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <xen/grant_table.h>
24 #include <xen/iocap.h>
25 #include <xen/kernel.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <asm/regs.h>
32 #include <asm/mc146818rtc.h>
33 #include <asm/system.h>
34 #include <asm/io.h>
35 #include <asm/processor.h>
36 #include <asm/desc.h>
37 #include <asm/i387.h>
38 #include <asm/mpspec.h>
39 #include <asm/ldt.h>
40 #include <asm/paging.h>
41 #include <asm/hvm/hvm.h>
42 #include <asm/hvm/support.h>
43 #include <asm/msr.h>
44 #ifdef CONFIG_COMPAT
45 #include <compat/vcpu.h>
46 #endif
48 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
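/*
 * Note: per-CPU curr_vcpu records whose register state is actually loaded
 * on this CPU. It can lag behind 'current' because switches to the idle
 * vcpu are done lazily: __context_switch() is deferred until a non-idle
 * vcpu next runs here (see context_switch() and __sync_lazy_execstate()).
 */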
50 static void paravirt_ctxt_switch_from(struct vcpu *v);
51 static void paravirt_ctxt_switch_to(struct vcpu *v);
53 static void vcpu_destroy_pagetables(struct vcpu *v);
55 static void continue_idle_domain(struct vcpu *v)
56 {
57 reset_stack_and_jump(idle_loop);
58 }
60 static void continue_nonidle_domain(struct vcpu *v)
61 {
62 reset_stack_and_jump(ret_from_intr);
63 }
65 static void default_idle(void)
66 {
67 local_irq_disable();
68 if ( !softirq_pending(smp_processor_id()) )
69 safe_halt();
70 else
71 local_irq_enable();
72 }
74 void idle_loop(void)
75 {
76 for ( ; ; )
77 {
78 page_scrub_schedule_work();
79 default_idle();
80 do_softirq();
81 }
82 }
84 void startup_cpu_idle_loop(void)
85 {
86 struct vcpu *v = current;
88 ASSERT(is_idle_vcpu(v));
89 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
90 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
92 reset_stack_and_jump(idle_loop);
93 }
95 void dump_pageframe_info(struct domain *d)
96 {
97 struct page_info *page;
99 printk("Memory pages belonging to domain %u:\n", d->domain_id);
101 if ( d->tot_pages >= 10 )
102 {
103 printk(" DomPage list too long to display\n");
104 }
105 else
106 {
107 list_for_each_entry ( page, &d->page_list, list )
108 {
109 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
110 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
111 page->count_info, page->u.inuse.type_info);
112 }
113 }
115 list_for_each_entry ( page, &d->xenpage_list, list )
116 {
117 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
118 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
119 page->count_info, page->u.inuse.type_info);
120 }
121 }
123 struct vcpu *alloc_vcpu_struct(void)
124 {
125 struct vcpu *v;
126 if ( (v = xmalloc(struct vcpu)) != NULL )
127 memset(v, 0, sizeof(*v));
128 return v;
129 }
131 void free_vcpu_struct(struct vcpu *v)
132 {
133 xfree(v);
134 }
136 #ifdef CONFIG_COMPAT
138 int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab)
139 {
140 struct domain *d = v->domain;
141 unsigned i;
142 struct page_info *pg;
144 if ( !d->arch.mm_arg_xlat_l3 )
145 {
146 pg = alloc_domheap_page(NULL);
147 if ( !pg )
148 return -ENOMEM;
149 d->arch.mm_arg_xlat_l3 = clear_page(page_to_virt(pg));
150 }
152 l4tab[l4_table_offset(COMPAT_ARG_XLAT_VIRT_BASE)] =
153 l4e_from_paddr(__pa(d->arch.mm_arg_xlat_l3), __PAGE_HYPERVISOR);
155 for ( i = 0; i < COMPAT_ARG_XLAT_PAGES; ++i )
156 {
157 unsigned long va = COMPAT_ARG_XLAT_VIRT_START(v->vcpu_id) + i * PAGE_SIZE;
158 l2_pgentry_t *l2tab;
159 l1_pgentry_t *l1tab;
161 if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) )
162 {
163 pg = alloc_domheap_page(NULL);
164 if ( !pg )
165 return -ENOMEM;
166 clear_page(page_to_virt(pg));
167 d->arch.mm_arg_xlat_l3[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
168 }
169 l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]);
170 if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) )
171 {
172 pg = alloc_domheap_page(NULL);
173 if ( !pg )
174 return -ENOMEM;
175 clear_page(page_to_virt(pg));
176 l2tab[l2_table_offset(va)] = l2e_from_page(pg, __PAGE_HYPERVISOR);
177 }
178 l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]);
179 BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)]));
180 pg = alloc_domheap_page(NULL);
181 if ( !pg )
182 return -ENOMEM;
183 l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR);
184 }
186 return 0;
187 }
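/*
 * The area built above gives each vcpu of a compat (32-on-64) guest a small
 * per-vcpu window of hypervisor-owned pages (COMPAT_ARG_XLAT_*) in which
 * hypercall arguments can be copied and translated between the 32-bit and
 * native layouts.
 */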
189 static void release_arg_xlat_area(struct domain *d)
190 {
191 if ( d->arch.mm_arg_xlat_l3 )
192 {
193 unsigned l3;
195 for ( l3 = 0; l3 < L3_PAGETABLE_ENTRIES; ++l3 )
196 {
197 if ( l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3]) )
198 {
199 l2_pgentry_t *l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3]);
200 unsigned l2;
202 for ( l2 = 0; l2 < L2_PAGETABLE_ENTRIES; ++l2 )
203 {
204 if ( l2e_get_intpte(l2tab[l2]) )
205 {
206 l1_pgentry_t *l1tab = l2e_to_l1e(l2tab[l2]);
207 unsigned l1;
209 for ( l1 = 0; l1 < L1_PAGETABLE_ENTRIES; ++l1 )
210 {
211 if ( l1e_get_intpte(l1tab[l1]) )
212 free_domheap_page(l1e_get_page(l1tab[l1]));
213 }
214 free_domheap_page(l2e_get_page(l2tab[l2]));
215 }
216 }
217 free_domheap_page(l3e_get_page(d->arch.mm_arg_xlat_l3[l3]));
218 }
219 }
220 free_domheap_page(virt_to_page(d->arch.mm_arg_xlat_l3));
221 }
222 }
224 static int setup_compat_l4(struct vcpu *v)
225 {
226 struct page_info *pg = alloc_domheap_page(NULL);
227 l4_pgentry_t *l4tab;
228 int rc;
230 if ( !pg )
231 return -ENOMEM;
233 /* This page needs to look like a pagetable so that it can be shadowed */
234 pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated;
236 l4tab = copy_page(page_to_virt(pg), idle_pg_table);
237 l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
238 l4e_from_page(pg, __PAGE_HYPERVISOR);
239 l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
240 l4e_from_paddr(__pa(v->domain->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
241 v->arch.guest_table = pagetable_from_page(pg);
242 v->arch.guest_table_user = v->arch.guest_table;
244 if ( (rc = setup_arg_xlat_area(v, l4tab)) < 0 )
245 {
246 free_domheap_page(pg);
247 return rc;
248 }
250 return 0;
251 }
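/*
 * A compat PV guest manages only a 3-level page table, so Xen supplies the
 * L4 on its behalf: a clone of idle_pg_table patched with the linear-map
 * and per-domain slots above, shared by both the kernel and user base
 * pointers.
 */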
253 static void release_compat_l4(struct vcpu *v)
254 {
255 free_domheap_page(pagetable_get_page(v->arch.guest_table));
256 v->arch.guest_table = pagetable_null();
257 v->arch.guest_table_user = pagetable_null();
258 }
260 static inline int may_switch_mode(struct domain *d)
261 {
262 return (d->tot_pages == 0);
263 }
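/* A mode switch is only permitted while the domain owns no memory pages,
 * i.e. before the domain builder has populated it. */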
265 int switch_native(struct domain *d)
266 {
267 l1_pgentry_t gdt_l1e;
268 unsigned int vcpuid;
270 if ( d == NULL )
271 return -EINVAL;
272 if ( !may_switch_mode(d) )
273 return -EACCES;
274 if ( !IS_COMPAT(d) )
275 return 0;
277 d->is_compat = 0;
278 release_arg_xlat_area(d);
280 /* switch gdt */
281 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
282 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
283 {
284 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
285 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
286 if (d->vcpu[vcpuid])
287 release_compat_l4(d->vcpu[vcpuid]);
288 }
290 d->arch.physaddr_bitsize = 64;
292 return 0;
293 }
295 int switch_compat(struct domain *d)
296 {
297 l1_pgentry_t gdt_l1e;
298 unsigned int vcpuid;
300 if ( d == NULL )
301 return -EINVAL;
302 if ( compat_disabled )
303 return -ENOSYS;
304 if ( !may_switch_mode(d) )
305 return -EACCES;
306 if ( IS_COMPAT(d) )
307 return 0;
309 d->is_compat = 1;
311 /* switch gdt */
312 gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
313 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
314 {
315 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
316 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
317 if (d->vcpu[vcpuid]
318 && setup_compat_l4(d->vcpu[vcpuid]) != 0)
319 return -ENOMEM;
320 }
322 d->arch.physaddr_bitsize =
323 fls((1UL << 32) - HYPERVISOR_COMPAT_VIRT_START(d)) - 1
324 + (PAGE_SIZE - 2);
326 return 0;
327 }
329 #else
330 #define release_arg_xlat_area(d) ((void)0)
331 #define setup_compat_l4(v) 0
332 #define release_compat_l4(v) ((void)0)
333 #endif
335 int vcpu_initialise(struct vcpu *v)
336 {
337 struct domain *d = v->domain;
338 int rc;
340 v->arch.flags = TF_kernel_mode;
342 pae_l3_cache_init(&v->arch.pae_l3_cache);
344 paging_vcpu_init(v);
346 if ( is_hvm_domain(d) )
347 {
348 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
349 return rc;
350 }
351 else
352 {
353 /* PV guests by default have a 100Hz ticker. */
354 v->periodic_period = MILLISECS(10);
356 /* PV guests get an emulated PIT too for video BIOSes to use. */
357 if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
358 pit_init(v, cpu_khz);
360 v->arch.schedule_tail = continue_nonidle_domain;
361 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
362 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
364 if ( is_idle_domain(d) )
365 {
366 v->arch.schedule_tail = continue_idle_domain;
367 v->arch.cr3 = __pa(idle_pg_table);
368 }
369 }
371 v->arch.perdomain_ptes =
372 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
374 if ( IS_COMPAT(d) && (rc = setup_compat_l4(v)) != 0 )
375 return rc;
377 return 0;
378 }
380 void vcpu_destroy(struct vcpu *v)
381 {
382 if ( IS_COMPAT(v->domain) )
383 release_compat_l4(v);
384 }
386 int arch_domain_create(struct domain *d)
387 {
388 #ifdef __x86_64__
389 struct page_info *pg;
390 int i;
391 #endif
392 l1_pgentry_t gdt_l1e;
393 int vcpuid, pdpt_order;
394 int rc = -ENOMEM;
396 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
397 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
398 if ( d->arch.mm_perdomain_pt == NULL )
399 goto fail;
400 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
402 /*
403 * Map Xen segments into every VCPU's GDT, irrespective of whether every
404 * VCPU will actually be used. This avoids an NMI race during context
405 * switch: if we take an interrupt after switching CR3 but before switching
406 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
407 * try to load CS from an invalid table.
408 */
409 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
410 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
411 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
412 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
414 #if defined(__i386__)
416 mapcache_init(d);
418 #else /* __x86_64__ */
420 if ( (pg = alloc_domheap_page(NULL)) == NULL )
421 goto fail;
422 d->arch.mm_perdomain_l2 = clear_page(page_to_virt(pg));
423 for ( i = 0; i < (1 << pdpt_order); i++ )
424 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
425 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
426 __PAGE_HYPERVISOR);
428 if ( (pg = alloc_domheap_page(NULL)) == NULL )
429 goto fail;
430 d->arch.mm_perdomain_l3 = clear_page(page_to_virt(pg));
431 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
432 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
433 __PAGE_HYPERVISOR);
435 #endif /* __x86_64__ */
437 #ifdef CONFIG_COMPAT
438 HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
439 #endif
441 paging_domain_init(d);
443 if ( !is_idle_domain(d) )
444 {
445 d->arch.ioport_caps =
446 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
447 if ( d->arch.ioport_caps == NULL )
448 goto fail;
450 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
451 goto fail;
453 memset(d->shared_info, 0, PAGE_SIZE);
454 share_xen_page_with_guest(
455 virt_to_page(d->shared_info), d, XENSHARE_writable);
456 }
458 return is_hvm_domain(d) ? hvm_domain_initialise(d) : 0;
460 fail:
461 free_xenheap_page(d->shared_info);
462 #ifdef __x86_64__
463 if ( d->arch.mm_perdomain_l2 )
464 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
465 if ( d->arch.mm_perdomain_l3 )
466 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
467 #endif
468 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
469 return rc;
470 }
472 void arch_domain_destroy(struct domain *d)
473 {
474 struct vcpu *v;
476 if ( is_hvm_domain(d) )
477 {
478 for_each_vcpu ( d, v )
479 hvm_vcpu_destroy(v);
480 hvm_domain_destroy(d);
481 }
483 paging_final_teardown(d);
485 free_xenheap_pages(
486 d->arch.mm_perdomain_pt,
487 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
489 #ifdef __x86_64__
490 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
491 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
492 #endif
494 if ( IS_COMPAT(d) )
495 release_arg_xlat_area(d);
497 free_xenheap_page(d->shared_info);
498 }
500 /* This is called by arch_final_setup_guest and do_boot_vcpu */
501 int arch_set_info_guest(
502 struct vcpu *v, vcpu_guest_context_u c)
503 {
504 struct domain *d = v->domain;
505 unsigned long cr3_pfn = INVALID_MFN;
506 unsigned long flags;
507 int i, rc = 0, compat;
509 /* The context is a compat-mode one if the target domain is compat-mode;
510 * we expect the tools to DTRT even in compat-mode callers. */
511 compat = IS_COMPAT(d);
513 #ifdef CONFIG_COMPAT
514 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
515 #else
516 #define c(fld) (c.nat->fld)
517 #endif
518 flags = c(flags);
520 if ( !is_hvm_vcpu(v) )
521 {
522 if ( !compat )
523 {
524 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
525 fixup_guest_stack_selector(d, c.nat->kernel_ss);
526 fixup_guest_code_selector(d, c.nat->user_regs.cs);
527 #ifdef __i386__
528 fixup_guest_code_selector(d, c.nat->event_callback_cs);
529 fixup_guest_code_selector(d, c.nat->failsafe_callback_cs);
530 #endif
532 for ( i = 0; i < 256; i++ )
533 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
535 /* LDT safety checks. */
536 if ( ((c.nat->ldt_base & (PAGE_SIZE-1)) != 0) ||
537 (c.nat->ldt_ents > 8192) ||
538 !array_access_ok(c.nat->ldt_base,
539 c.nat->ldt_ents,
540 LDT_ENTRY_SIZE) )
541 return -EINVAL;
542 }
543 #ifdef CONFIG_COMPAT
544 else
545 {
546 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
547 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
548 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
549 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
550 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
552 for ( i = 0; i < 256; i++ )
553 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
555 /* LDT safety checks. */
556 if ( ((c.cmp->ldt_base & (PAGE_SIZE-1)) != 0) ||
557 (c.cmp->ldt_ents > 8192) ||
558 !compat_array_access_ok(c.cmp->ldt_base,
559 c.cmp->ldt_ents,
560 LDT_ENTRY_SIZE) )
561 return -EINVAL;
562 }
563 #endif
564 }
566 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
568 v->arch.flags &= ~TF_kernel_mode;
569 if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
570 v->arch.flags |= TF_kernel_mode;
572 if ( !compat )
573 memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
574 #ifdef CONFIG_COMPAT
575 else
576 {
577 XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp);
578 }
579 #endif
581 /* Only CR0.TS is modifiable by guest or admin. */
582 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
583 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
585 init_int80_direct_trap(v);
587 if ( !is_hvm_vcpu(v) )
588 {
589 /* IOPL privileges are virtualised. */
590 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
591 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
593 /* Ensure real hardware interrupts are enabled. */
594 v->arch.guest_context.user_regs.eflags |= EF_IE;
595 }
596 else
597 {
598 hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
599 }
601 if ( v->is_initialised )
602 goto out;
604 memset(v->arch.guest_context.debugreg, 0,
605 sizeof(v->arch.guest_context.debugreg));
606 for ( i = 0; i < 8; i++ )
607 (void)set_debugreg(v, i, c(debugreg[i]));
609 if ( v->vcpu_id == 0 )
610 d->vm_assist = c(vm_assist);
612 if ( !is_hvm_vcpu(v) )
613 {
614 if ( !compat )
615 rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
616 #ifdef CONFIG_COMPAT
617 else
618 {
619 unsigned long gdt_frames[ARRAY_SIZE(c.cmp->gdt_frames)];
620 unsigned int i, n = (c.cmp->gdt_ents + 511) / 512;
622 if ( n > ARRAY_SIZE(c.cmp->gdt_frames) )
623 return -EINVAL;
624 for ( i = 0; i < n; ++i )
625 gdt_frames[i] = c.cmp->gdt_frames[i];
626 rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
627 }
628 #endif
629 if ( rc != 0 )
630 return rc;
632 if ( !compat )
633 {
634 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3]));
636 if ( !mfn_valid(cr3_pfn) ||
637 (paging_mode_refcounts(d)
638 ? !get_page(mfn_to_page(cr3_pfn), d)
639 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
640 PGT_base_page_table)) )
641 {
642 destroy_gdt(v);
643 return -EINVAL;
644 }
646 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
648 #ifdef __x86_64__
649 if ( c.nat->ctrlreg[1] )
650 {
651 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1]));
653 if ( !mfn_valid(cr3_pfn) ||
654 (paging_mode_refcounts(d)
655 ? !get_page(mfn_to_page(cr3_pfn), d)
656 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
657 PGT_base_page_table)) )
658 {
659 cr3_pfn = pagetable_get_pfn(v->arch.guest_table);
660 v->arch.guest_table = pagetable_null();
661 if ( paging_mode_refcounts(d) )
662 put_page(mfn_to_page(cr3_pfn));
663 else
664 put_page_and_type(mfn_to_page(cr3_pfn));
665 destroy_gdt(v);
666 return -EINVAL;
667 }
669 v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn);
670 }
671 #endif
672 }
673 #ifdef CONFIG_COMPAT
674 else
675 {
676 l4_pgentry_t *l4tab;
678 cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
680 if ( !mfn_valid(cr3_pfn) ||
681 (paging_mode_refcounts(d)
682 ? !get_page(mfn_to_page(cr3_pfn), d)
683 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
684 PGT_l3_page_table)) )
685 {
686 destroy_gdt(v);
687 return -EINVAL;
688 }
690 l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
691 *l4tab = l4e_from_pfn(cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
692 }
693 #endif
694 }
696 if ( v->vcpu_id == 0 )
697 update_domain_wallclock_time(d);
699 /* Don't redo final setup */
700 v->is_initialised = 1;
702 if ( paging_mode_enabled(d) )
703 paging_update_paging_modes(v);
705 update_cr3(v);
707 out:
708 if ( flags & VGCF_online )
709 clear_bit(_VPF_down, &v->pause_flags);
710 else
711 set_bit(_VPF_down, &v->pause_flags);
712 return 0;
713 #undef c
714 }
716 int arch_vcpu_reset(struct vcpu *v)
717 {
718 destroy_gdt(v);
719 vcpu_destroy_pagetables(v);
720 return 0;
721 }
723 long
724 arch_do_vcpu_op(
725 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
726 {
727 long rc = 0;
729 switch ( cmd )
730 {
731 case VCPUOP_register_runstate_memory_area:
732 {
733 struct vcpu_register_runstate_memory_area area;
734 struct vcpu_runstate_info runstate;
736 rc = -EFAULT;
737 if ( copy_from_guest(&area, arg, 1) )
738 break;
740 if ( !guest_handle_okay(area.addr.h, 1) )
741 break;
743 rc = 0;
744 runstate_guest(v) = area.addr.h;
746 if ( v == current )
747 {
748 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
749 }
750 else
751 {
752 vcpu_runstate_get(v, &runstate);
753 __copy_to_guest(runstate_guest(v), &runstate, 1);
754 }
756 break;
757 }
759 default:
760 rc = -ENOSYS;
761 break;
762 }
764 return rc;
765 }
767 #ifdef __x86_64__
769 #define loadsegment(seg,value) ({ \
770 int __r = 1; \
771 __asm__ __volatile__ ( \
772 "1: movl %k1,%%" #seg "\n2:\n" \
773 ".section .fixup,\"ax\"\n" \
774 "3: xorl %k0,%k0\n" \
775 " movl %k0,%%" #seg "\n" \
776 " jmp 2b\n" \
777 ".previous\n" \
778 ".section __ex_table,\"a\"\n" \
779 " .align 8\n" \
780 " .quad 1b,3b\n" \
781 ".previous" \
782 : "=r" (__r) : "r" (value), "0" (__r) );\
783 __r; })
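/*
 * loadsegment() evaluates to 1 on success. If loading the selector faults,
 * the .fixup/__ex_table pair above loads the NULL selector instead and the
 * expression evaluates to 0; load_segments() uses that to decide whether to
 * bounce to the guest's failsafe callback.
 */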
785 /*
786 * save_segments() writes a mask of segments which are dirty (non-zero),
787 * allowing load_segments() to avoid some expensive segment loads and
788 * MSR writes.
789 */
790 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
791 #define DIRTY_DS 0x01
792 #define DIRTY_ES 0x02
793 #define DIRTY_FS 0x04
794 #define DIRTY_GS 0x08
795 #define DIRTY_FS_BASE 0x10
796 #define DIRTY_GS_BASE_USER 0x20
798 static void load_segments(struct vcpu *n)
799 {
800 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
801 int all_segs_okay = 1;
802 unsigned int dirty_segment_mask, cpu = smp_processor_id();
804 /* Load and clear the dirty segment mask. */
805 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
806 per_cpu(dirty_segment_mask, cpu) = 0;
808 /* Either selector != 0 ==> reload. */
809 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
810 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
812 /* Either selector != 0 ==> reload. */
813 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
814 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
816 /*
817 * Either selector != 0 ==> reload.
818 * Also reload to reset FS_BASE if it was non-zero.
819 */
820 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
821 nctxt->user_regs.fs) )
822 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
824 /*
825 * Either selector != 0 ==> reload.
826 * Also reload to reset GS_BASE if it was non-zero.
827 */
828 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
829 nctxt->user_regs.gs) )
830 {
831 /* Reset GS_BASE with user %gs? */
832 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
833 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
834 }
836 if ( !IS_COMPAT(n->domain) )
837 {
838 /* This can only be non-zero if selector is NULL. */
839 if ( nctxt->fs_base )
840 wrmsr(MSR_FS_BASE,
841 nctxt->fs_base,
842 nctxt->fs_base>>32);
844 /* Most kernels have non-zero GS base, so don't bother testing. */
845 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
846 wrmsr(MSR_SHADOW_GS_BASE,
847 nctxt->gs_base_kernel,
848 nctxt->gs_base_kernel>>32);
850 /* This can only be non-zero if selector is NULL. */
851 if ( nctxt->gs_base_user )
852 wrmsr(MSR_GS_BASE,
853 nctxt->gs_base_user,
854 nctxt->gs_base_user>>32);
856 /* If in kernel mode then switch the GS bases around. */
857 if ( (n->arch.flags & TF_kernel_mode) )
858 __asm__ __volatile__ ( "swapgs" );
859 }
861 if ( unlikely(!all_segs_okay) )
862 {
863 struct cpu_user_regs *regs = guest_cpu_user_regs();
864 unsigned long *rsp =
865 (n->arch.flags & TF_kernel_mode) ?
866 (unsigned long *)regs->rsp :
867 (unsigned long *)nctxt->kernel_sp;
868 unsigned long cs_and_mask, rflags;
870 if ( IS_COMPAT(n->domain) )
871 {
872 unsigned int *esp = ring_1(regs) ?
873 (unsigned int *)regs->rsp :
874 (unsigned int *)nctxt->kernel_sp;
875 unsigned int cs_and_mask, eflags;
876 int ret = 0;
878 /* CS longword also contains full evtchn_upcall_mask. */
879 cs_and_mask = (unsigned short)regs->cs |
880 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
881 /* Fold upcall mask into RFLAGS.IF. */
882 eflags = regs->_eflags & ~X86_EFLAGS_IF;
883 eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
885 if ( !ring_1(regs) )
886 {
887 ret = put_user(regs->ss, esp-1);
888 ret |= put_user(regs->_esp, esp-2);
889 esp -= 2;
890 }
892 if ( ret |
893 put_user(eflags, esp-1) |
894 put_user(cs_and_mask, esp-2) |
895 put_user(regs->_eip, esp-3) |
896 put_user(nctxt->user_regs.gs, esp-4) |
897 put_user(nctxt->user_regs.fs, esp-5) |
898 put_user(nctxt->user_regs.es, esp-6) |
899 put_user(nctxt->user_regs.ds, esp-7) )
900 {
901 gdprintk(XENLOG_ERR, "Error while creating compat "
902 "failsafe callback frame.\n");
903 domain_crash(n->domain);
904 }
906 if ( test_bit(_VGCF_failsafe_disables_events,
907 &n->arch.guest_context.flags) )
908 vcpu_info(n, evtchn_upcall_mask) = 1;
910 regs->entry_vector = TRAP_syscall;
911 regs->_eflags &= 0xFFFCBEFFUL;
912 regs->ss = FLAT_COMPAT_KERNEL_SS;
913 regs->_esp = (unsigned long)(esp-7);
914 regs->cs = FLAT_COMPAT_KERNEL_CS;
915 regs->_eip = nctxt->failsafe_callback_eip;
916 return;
917 }
919 if ( !(n->arch.flags & TF_kernel_mode) )
920 toggle_guest_mode(n);
921 else
922 regs->cs &= ~3;
924 /* CS longword also contains full evtchn_upcall_mask. */
925 cs_and_mask = (unsigned long)regs->cs |
926 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
928 /* Fold upcall mask into RFLAGS.IF. */
929 rflags = regs->rflags & ~X86_EFLAGS_IF;
930 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
932 if ( put_user(regs->ss, rsp- 1) |
933 put_user(regs->rsp, rsp- 2) |
934 put_user(rflags, rsp- 3) |
935 put_user(cs_and_mask, rsp- 4) |
936 put_user(regs->rip, rsp- 5) |
937 put_user(nctxt->user_regs.gs, rsp- 6) |
938 put_user(nctxt->user_regs.fs, rsp- 7) |
939 put_user(nctxt->user_regs.es, rsp- 8) |
940 put_user(nctxt->user_regs.ds, rsp- 9) |
941 put_user(regs->r11, rsp-10) |
942 put_user(regs->rcx, rsp-11) )
943 {
944 gdprintk(XENLOG_ERR, "Error while creating failsafe "
945 "callback frame.\n");
946 domain_crash(n->domain);
947 }
949 if ( test_bit(_VGCF_failsafe_disables_events,
950 &n->arch.guest_context.flags) )
951 vcpu_info(n, evtchn_upcall_mask) = 1;
953 regs->entry_vector = TRAP_syscall;
954 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
955 X86_EFLAGS_NT|X86_EFLAGS_TF);
956 regs->ss = FLAT_KERNEL_SS;
957 regs->rsp = (unsigned long)(rsp-11);
958 regs->cs = FLAT_KERNEL_CS;
959 regs->rip = nctxt->failsafe_callback_eip;
960 }
961 }
963 static void save_segments(struct vcpu *v)
964 {
965 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
966 struct cpu_user_regs *regs = &ctxt->user_regs;
967 unsigned int dirty_segment_mask = 0;
969 regs->ds = read_segment_register(ds);
970 regs->es = read_segment_register(es);
971 regs->fs = read_segment_register(fs);
972 regs->gs = read_segment_register(gs);
974 if ( regs->ds )
975 dirty_segment_mask |= DIRTY_DS;
977 if ( regs->es )
978 dirty_segment_mask |= DIRTY_ES;
980 if ( regs->fs || IS_COMPAT(v->domain) )
981 {
982 dirty_segment_mask |= DIRTY_FS;
983 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
984 }
985 else if ( ctxt->fs_base )
986 {
987 dirty_segment_mask |= DIRTY_FS_BASE;
988 }
990 if ( regs->gs || IS_COMPAT(v->domain) )
991 {
992 dirty_segment_mask |= DIRTY_GS;
993 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
994 }
995 else if ( ctxt->gs_base_user )
996 {
997 dirty_segment_mask |= DIRTY_GS_BASE_USER;
998 }
1000 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1001 }
1003 #define switch_kernel_stack(v) ((void)0)
1005 #elif defined(__i386__)
1007 #define load_segments(n) ((void)0)
1008 #define save_segments(p) ((void)0)
1010 static inline void switch_kernel_stack(struct vcpu *v)
1011 {
1012 struct tss_struct *tss = &init_tss[smp_processor_id()];
1013 tss->esp1 = v->arch.guest_context.kernel_sp;
1014 tss->ss1 = v->arch.guest_context.kernel_ss;
1015 }
1017 #endif /* __i386__ */
1019 static void paravirt_ctxt_switch_from(struct vcpu *v)
1020 {
1021 save_segments(v);
1022 }
1024 static void paravirt_ctxt_switch_to(struct vcpu *v)
1025 {
1026 set_int80_direct_trap(v);
1027 switch_kernel_stack(v);
1028 }
1030 #define loaddebug(_v,_reg) \
1031 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
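/*
 * __context_switch() is the heavyweight path: it saves the outgoing vcpu's
 * frame and segments, loads the incoming vcpu's frame, debug registers,
 * page tables and GDT, and updates curr_vcpu. context_switch() below skips
 * it when the incoming vcpu's state is still loaded or when switching to
 * the idle vcpu (lazy context switching).
 */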
1033 static void __context_switch(void)
1034 {
1035 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1036 unsigned int cpu = smp_processor_id();
1037 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1038 struct vcpu *n = current;
1040 ASSERT(p != n);
1041 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
1043 if ( !is_idle_vcpu(p) )
1044 {
1045 memcpy(&p->arch.guest_context.user_regs,
1046 stack_regs,
1047 CTXT_SWITCH_STACK_BYTES);
1048 unlazy_fpu(p);
1049 p->arch.ctxt_switch_from(p);
1050 }
1052 if ( !is_idle_vcpu(n) )
1053 {
1054 memcpy(stack_regs,
1055 &n->arch.guest_context.user_regs,
1056 CTXT_SWITCH_STACK_BYTES);
1058 /* Maybe switch the debug registers. */
1059 if ( unlikely(n->arch.guest_context.debugreg[7]) )
1060 {
1061 loaddebug(&n->arch.guest_context, 0);
1062 loaddebug(&n->arch.guest_context, 1);
1063 loaddebug(&n->arch.guest_context, 2);
1064 loaddebug(&n->arch.guest_context, 3);
1065 /* no 4 and 5 */
1066 loaddebug(&n->arch.guest_context, 6);
1067 loaddebug(&n->arch.guest_context, 7);
1068 }
1069 n->arch.ctxt_switch_to(n);
1070 }
1072 if ( p->domain != n->domain )
1073 cpu_set(cpu, n->domain->domain_dirty_cpumask);
1074 cpu_set(cpu, n->vcpu_dirty_cpumask);
1076 write_ptbase(n);
1078 if ( p->vcpu_id != n->vcpu_id )
1079 {
1080 char gdt_load[10];
1081 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
1082 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
1083 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
1084 }
1086 if ( p->domain != n->domain )
1087 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
1088 cpu_clear(cpu, p->vcpu_dirty_cpumask);
1090 per_cpu(curr_vcpu, cpu) = n;
1091 }
1094 void context_switch(struct vcpu *prev, struct vcpu *next)
1095 {
1096 unsigned int cpu = smp_processor_id();
1097 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
1099 ASSERT(local_irq_is_enabled());
1101 /* Allow at most one CPU at a time to be dirty. */
1102 ASSERT(cpus_weight(dirty_mask) <= 1);
1103 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
1104 {
1105 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1106 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
1107 flush_tlb_mask(next->vcpu_dirty_cpumask);
1108 }
1110 local_irq_disable();
1112 if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1113 pt_freeze_time(prev);
1115 set_current(next);
1117 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
1118 {
1119 local_irq_enable();
1120 }
1121 else
1122 {
1123 __context_switch();
1125 #ifdef CONFIG_COMPAT
1126 if ( is_idle_vcpu(prev)
1127 || IS_COMPAT(prev->domain) != IS_COMPAT(next->domain) )
1128 {
1129 uint32_t efer_lo, efer_hi;
1131 local_flush_tlb_one(GDT_VIRT_START(next) + FIRST_RESERVED_GDT_BYTE);
1133 rdmsr(MSR_EFER, efer_lo, efer_hi);
1134 if ( !IS_COMPAT(next->domain) == !(efer_lo & EFER_SCE) )
1135 {
1136 efer_lo ^= EFER_SCE;
1137 wrmsr(MSR_EFER, efer_lo, efer_hi);
1138 }
1139 }
1140 #endif
1142 /* Re-enable interrupts before restoring state which may fault. */
1143 local_irq_enable();
1145 if ( !is_hvm_vcpu(next) )
1146 {
1147 load_LDT(next);
1148 load_segments(next);
1149 }
1150 }
1152 context_saved(prev);
1154 /* Update per-VCPU guest runstate shared memory area (if registered). */
1155 if ( !guest_handle_is_null(runstate_guest(next)) )
1156 {
1157 if ( !IS_COMPAT(next->domain) )
1158 __copy_to_guest(runstate_guest(next), &next->runstate, 1);
1159 #ifdef CONFIG_COMPAT
1160 else
1161 {
1162 struct compat_vcpu_runstate_info info;
1164 XLAT_vcpu_runstate_info(&info, &next->runstate);
1165 __copy_to_guest(next->runstate_guest.compat, &info, 1);
1166 }
1167 #endif
1168 }
1170 schedule_tail(next);
1171 BUG();
1172 }
1174 void continue_running(struct vcpu *same)
1175 {
1176 schedule_tail(same);
1177 BUG();
1178 }
1180 int __sync_lazy_execstate(void)
1181 {
1182 unsigned long flags;
1183 int switch_required;
1185 local_irq_save(flags);
1187 switch_required = (this_cpu(curr_vcpu) != current);
1189 if ( switch_required )
1190 {
1191 ASSERT(current == idle_vcpu[smp_processor_id()]);
1192 __context_switch();
1193 }
1195 local_irq_restore(flags);
1197 return switch_required;
1198 }
1200 void sync_vcpu_execstate(struct vcpu *v)
1201 {
1202 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
1203 (void)__sync_lazy_execstate();
1205 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
1206 flush_tlb_mask(v->vcpu_dirty_cpumask);
1207 }
1209 #define next_arg(fmt, args) ({ \
1210 unsigned long __arg; \
1211 switch ( *(fmt)++ ) \
1212 { \
1213 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
1214 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
1215 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
1216 default: __arg = 0; BUG(); \
1217 } \
1218 __arg; \
1219 })
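/*
 * next_arg() consumes one vararg according to the format string used by
 * hypercall_create_continuation(): 'i' for int, 'l' for long, 'h' for a
 * guest handle (passed as a pointer). The continuation itself is created
 * either by stashing the arguments in the pending multicall entry or by
 * writing them back into the guest register frame and rewinding the
 * instruction pointer so the hypercall is re-issued.
 */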
1221 unsigned long hypercall_create_continuation(
1222 unsigned int op, const char *format, ...)
1223 {
1224 struct mc_state *mcs = &this_cpu(mc_state);
1225 struct cpu_user_regs *regs;
1226 const char *p = format;
1227 unsigned long arg;
1228 unsigned int i;
1229 va_list args;
1231 va_start(args, format);
1233 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1234 {
1235 __set_bit(_MCSF_call_preempted, &mcs->flags);
1237 for ( i = 0; *p != '\0'; i++ )
1238 mcs->call.args[i] = next_arg(p, args);
1239 if ( IS_COMPAT(current->domain) )
1240 {
1241 for ( ; i < 6; i++ )
1242 mcs->call.args[i] = 0;
1243 }
1244 }
1245 else
1246 {
1247 regs = guest_cpu_user_regs();
1248 regs->eax = op;
1249 regs->eip -= 2; /* re-execute 'syscall' / 'int 0x82' */
1251 #ifdef __x86_64__
1252 if ( !IS_COMPAT(current->domain) )
1253 {
1254 for ( i = 0; *p != '\0'; i++ )
1255 {
1256 arg = next_arg(p, args);
1257 switch ( i )
1258 {
1259 case 0: regs->rdi = arg; break;
1260 case 1: regs->rsi = arg; break;
1261 case 2: regs->rdx = arg; break;
1262 case 3: regs->r10 = arg; break;
1263 case 4: regs->r8 = arg; break;
1264 case 5: regs->r9 = arg; break;
1265 }
1266 }
1267 }
1268 else
1269 #endif
1270 {
1271 if ( supervisor_mode_kernel || is_hvm_vcpu(current) )
1272 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
1274 for ( i = 0; *p != '\0'; i++ )
1275 {
1276 arg = next_arg(p, args);
1277 switch ( i )
1278 {
1279 case 0: regs->ebx = arg; break;
1280 case 1: regs->ecx = arg; break;
1281 case 2: regs->edx = arg; break;
1282 case 3: regs->esi = arg; break;
1283 case 4: regs->edi = arg; break;
1284 case 5: regs->ebp = arg; break;
1285 }
1286 }
1287 }
1288 }
1290 va_end(args);
1292 return op;
1293 }
1295 #ifdef CONFIG_COMPAT
1296 int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
1297 {
1298 int rc = 0;
1299 struct mc_state *mcs = &this_cpu(mc_state);
1300 struct cpu_user_regs *regs;
1301 unsigned int i, cval = 0;
1302 unsigned long nval = 0;
1303 va_list args;
1305 BUG_ON(*id > 5);
1306 BUG_ON(mask & (1U << *id));
1308 va_start(args, mask);
1310 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
1311 {
1312 if ( !test_bit(_MCSF_call_preempted, &mcs->flags) )
1313 return 0;
1314 for ( i = 0; i < 6; ++i, mask >>= 1 )
1315 {
1316 if ( mask & 1 )
1317 {
1318 nval = va_arg(args, unsigned long);
1319 cval = va_arg(args, unsigned int);
1320 if ( cval == nval )
1321 mask &= ~1U;
1322 else
1323 BUG_ON(nval == (unsigned int)nval);
1324 }
1325 else if ( id && *id == i )
1326 {
1327 *id = mcs->call.args[i];
1328 id = NULL;
1329 }
1330 if ( (mask & 1) && mcs->call.args[i] == nval )
1331 ++rc;
1332 else
1333 {
1334 cval = mcs->call.args[i];
1335 BUG_ON(mcs->call.args[i] != cval);
1336 }
1337 mcs->compat_call.args[i] = cval;
1338 }
1339 }
1340 else
1341 {
1342 regs = guest_cpu_user_regs();
1343 for ( i = 0; i < 6; ++i, mask >>= 1 )
1344 {
1345 unsigned long *reg;
1347 switch ( i )
1348 {
1349 case 0: reg = &regs->ebx; break;
1350 case 1: reg = &regs->ecx; break;
1351 case 2: reg = &regs->edx; break;
1352 case 3: reg = &regs->esi; break;
1353 case 4: reg = &regs->edi; break;
1354 case 5: reg = &regs->ebp; break;
1355 default: BUG(); reg = NULL; break;
1356 }
1357 if ( (mask & 1) )
1358 {
1359 nval = va_arg(args, unsigned long);
1360 cval = va_arg(args, unsigned int);
1361 if ( cval == nval )
1362 mask &= ~1U;
1363 else
1364 BUG_ON(nval == (unsigned int)nval);
1365 }
1366 else if ( id && *id == i )
1367 {
1368 *id = *reg;
1369 id = NULL;
1370 }
1371 if ( (mask & 1) && *reg == nval )
1372 {
1373 *reg = cval;
1374 ++rc;
1375 }
1376 else
1377 BUG_ON(*reg != (unsigned int)*reg);
1378 }
1379 }
1381 va_end(args);
1383 return rc;
1384 }
1385 #endif
1387 static void relinquish_memory(struct domain *d, struct list_head *list,
1388 unsigned long type)
1389 {
1390 struct list_head *ent;
1391 struct page_info *page;
1392 unsigned long x, y;
1394 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1395 spin_lock_recursive(&d->page_alloc_lock);
1397 ent = list->next;
1398 while ( ent != list )
1399 {
1400 page = list_entry(ent, struct page_info, list);
1402 /* Grab a reference to the page so it won't disappear from under us. */
1403 if ( unlikely(!get_page(page, d)) )
1404 {
1405 /* Couldn't get a reference -- someone is freeing this page. */
1406 ent = ent->next;
1407 continue;
1408 }
1410 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1411 put_page_and_type(page);
1413 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1414 put_page(page);
1416 /*
1417 * Forcibly invalidate top-most, still valid page tables at this point
1418 * to break circular 'linear page table' references. This is okay
1419 * because MMU structures are not shared across domains and this domain
1420 * is now dead. Thus top-most valid tables are not in use so a non-zero
1421 * count means circular reference.
1422 */
1423 y = page->u.inuse.type_info;
1424 for ( ; ; )
1425 {
1426 x = y;
1427 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1428 (type|PGT_validated)) )
1429 break;
1431 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1432 if ( likely(y == x) )
1433 {
1434 free_page_type(page, type);
1435 break;
1436 }
1437 }
1439 /* Follow the list chain and /then/ potentially free the page. */
1440 ent = ent->next;
1441 put_page(page);
1442 }
1444 spin_unlock_recursive(&d->page_alloc_lock);
1445 }
1447 static void vcpu_destroy_pagetables(struct vcpu *v)
1448 {
1449 struct domain *d = v->domain;
1450 unsigned long pfn;
1452 #ifdef CONFIG_COMPAT
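/*
 * For a compat guest, guest_table points at the Xen-provided L4 (see
 * setup_compat_l4()); slot 0 of that L4 references the guest's own top
 * level (an L3). Only that reference is dropped and the slot cleared here;
 * the L4 page itself is freed later by release_compat_l4().
 */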
1453 if ( IS_COMPAT(d) )
1454 {
1455 if ( is_hvm_vcpu(v) )
1456 pfn = pagetable_get_pfn(v->arch.guest_table);
1457 else
1458 pfn = l4e_get_pfn(*(l4_pgentry_t *)
1459 __va(pagetable_get_paddr(v->arch.guest_table)));
1461 if ( pfn != 0 )
1462 {
1463 if ( paging_mode_refcounts(d) )
1464 put_page(mfn_to_page(pfn));
1465 else
1466 put_page_and_type(mfn_to_page(pfn));
1467 }
1469 if ( is_hvm_vcpu(v) )
1470 v->arch.guest_table = pagetable_null();
1471 else
1472 l4e_write(
1473 (l4_pgentry_t *) __va(pagetable_get_paddr(v->arch.guest_table)),
1474 l4e_empty());
1476 v->arch.cr3 = 0;
1477 return;
1478 }
1479 #endif
1481 pfn = pagetable_get_pfn(v->arch.guest_table);
1482 if ( pfn != 0 )
1483 {
1484 if ( paging_mode_refcounts(d) )
1485 put_page(mfn_to_page(pfn));
1486 else
1487 put_page_and_type(mfn_to_page(pfn));
1488 #ifdef __x86_64__
1489 if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) )
1490 v->arch.guest_table_user = pagetable_null();
1491 #endif
1492 v->arch.guest_table = pagetable_null();
1493 }
1495 #ifdef __x86_64__
1496 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
1497 pfn = pagetable_get_pfn(v->arch.guest_table_user);
1498 if ( pfn != 0 )
1499 {
1500 if ( paging_mode_refcounts(d) )
1501 put_page(mfn_to_page(pfn));
1502 else
1503 put_page_and_type(mfn_to_page(pfn));
1504 v->arch.guest_table_user = pagetable_null();
1505 }
1506 #endif
1508 v->arch.cr3 = 0;
1509 }
1511 void domain_relinquish_resources(struct domain *d)
1512 {
1513 struct vcpu *v;
1515 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1517 /* Drop the in-use references to page-table bases. */
1518 for_each_vcpu ( d, v )
1519 vcpu_destroy_pagetables(v);
1521 /* Tear down paging-assistance stuff. */
1522 paging_teardown(d);
1524 /*
1525 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1526 * it automatically gets squashed when the guest's mappings go away.
1527 */
1528 for_each_vcpu(d, v)
1529 destroy_gdt(v);
1531 /* Relinquish every page of memory. */
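/*
 * Passes are made at decreasing page-table levels: invalidating the
 * top-level tables drops the references they hold on lower-level tables,
 * so the later passes (and the final put_page() calls) can free everything
 * despite circular 'linear page table' references.
 */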
1532 #if CONFIG_PAGING_LEVELS >= 4
1533 relinquish_memory(d, &d->xenpage_list, PGT_l4_page_table);
1534 relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1535 #endif
1536 #if CONFIG_PAGING_LEVELS >= 3
1537 relinquish_memory(d, &d->xenpage_list, PGT_l3_page_table);
1538 relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1539 #endif
1540 relinquish_memory(d, &d->xenpage_list, PGT_l2_page_table);
1541 relinquish_memory(d, &d->page_list, PGT_l2_page_table);
1543 /* Free page used by xen oprofile buffer. */
1544 free_xenoprof_pages(d);
1546 if ( is_hvm_domain(d) )
1547 hvm_domain_relinquish_resources(d);
1548 }
1550 void arch_dump_domain_info(struct domain *d)
1551 {
1552 paging_dump_domain_info(d);
1553 }
1555 void arch_dump_vcpu_info(struct vcpu *v)
1556 {
1557 paging_dump_vcpu_info(v);
1558 }
1560 /*
1561 * Local variables:
1562 * mode: C
1563 * c-set-style: "BSD"
1564 * c-basic-offset: 4
1565 * tab-width: 4
1566 * indent-tabs-mode: nil
1567 * End:
1568 */