ia64/xen-unstable

view xen/arch/x86/domain.c @ 12564:2fd223c64fc6

[XEN] Pin l3 shadows of older x86_64 linux guests.
Older x86_64 linux kernels use one l4 table per cpu and context switch by
changing an l4 entry pointing to an l3 table. If we're shadowing them
we need to pin l3 shadows to stop them being torn down on every
context switch. (But don't do this for normal 64bit guests).
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Thu Nov 23 17:46:52 2006 +0000 (2006-11-23)
parents 3fa6635d04b9
children a89599a12d4b
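The mechanism, sketched very roughly below for illustration only. This is not the patch itself: struct sh_l3_shadow, sh_get_ref() and the switches_via_l4e flag are hypothetical names standing in for the real shadow-code interfaces.

/* Illustrative sketch only -- hypothetical types and helpers, not Xen's. */
struct sh_l3_shadow {
    int pinned;      /* pinned shadows survive guest context switches */
    int refcount;    /* shadow is freed only when this drops to zero */
};

static void sh_get_ref(struct sh_l3_shadow *sp) { sp->refcount++; }

/* Older 64-bit Linux guests keep one l4 table per cpu and switch address
 * spaces by rewriting a single l4 entry; pin their l3 shadows so they are
 * not torn down and rebuilt on every such switch.  Ordinary 64-bit guests
 * switch by reloading CR3, so their l3 shadows are left unpinned. */
static void maybe_pin_l3_shadow(struct sh_l3_shadow *sp, int switches_via_l4e)
{
    if ( switches_via_l4e && !sp->pinned )
    {
        sp->pinned = 1;
        sh_get_ref(sp);   /* extra reference keeps the shadow alive */
    }
}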
line source
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <xen/kernel.h>
#include <xen/multicall.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/percpu.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <asm/shadow.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/msr.h>

DEFINE_PER_CPU(struct vcpu *, curr_vcpu);

static void paravirt_ctxt_switch_from(struct vcpu *v);
static void paravirt_ctxt_switch_to(struct vcpu *v);

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    for ( ; ; )
    {
        page_scrub_schedule_work();
        default_idle();
        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);

    reset_stack_and_jump(idle_loop);
}

void dump_pageframe_info(struct domain *d)
{
    struct page_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk("    DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("    DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_maddr(page)), _p(page_to_mfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("    XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_maddr(page)), _p(page_to_mfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}

struct vcpu *alloc_vcpu_struct(void)
{
    struct vcpu *v;
    if ( (v = xmalloc(struct vcpu)) != NULL )
        memset(v, 0, sizeof(*v));
    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    xfree(v);
}

int vcpu_initialise(struct vcpu *v)
{
    struct domain *d = v->domain;
    int rc;

    v->arch.flags = TF_kernel_mode;

    if ( is_hvm_domain(d) )
    {
        if ( (rc = hvm_vcpu_initialise(v)) != 0 )
            return rc;
    }
    else
    {
        v->arch.schedule_tail = continue_nonidle_domain;
        v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
        v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;

        if ( is_idle_domain(d) )
        {
            v->arch.schedule_tail = continue_idle_domain;
            v->arch.cr3 = __pa(idle_pg_table);
        }
    }

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);

    pae_l3_cache_init(&v->arch.pae_l3_cache);

    return 0;
}

void vcpu_destroy(struct vcpu *v)
{
}

int arch_domain_create(struct domain *d)
{
#ifdef __x86_64__
    struct page_info *pg;
#endif
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order;
    int i, rc = -ENOMEM;

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    if ( d->arch.mm_perdomain_pt == NULL )
        goto fail;
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;

#if defined(__i386__)

    mapcache_init(d);

#else /* __x86_64__ */

    if ( (pg = alloc_domheap_page(NULL)) == NULL )
        goto fail;
    d->arch.mm_perdomain_l2 = clear_page(page_to_virt(pg));
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    if ( (pg = alloc_domheap_page(NULL)) == NULL )
        goto fail;
    d->arch.mm_perdomain_l3 = clear_page(page_to_virt(pg));
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);

#endif /* __x86_64__ */

    shadow_lock_init(d);
    for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
        INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
    INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
    INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
    INIT_LIST_HEAD(&d->arch.shadow.pinned_shadows);

    if ( !is_idle_domain(d) )
    {
        d->arch.ioport_caps =
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
        if ( d->arch.ioport_caps == NULL )
            goto fail;

        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
            goto fail;

        memset(d->shared_info, 0, PAGE_SIZE);
        share_xen_page_with_guest(
            virt_to_page(d->shared_info), d, XENSHARE_writable);
    }

    return is_hvm_domain(d) ? hvm_domain_initialise(d) : 0;

 fail:
    free_xenheap_page(d->shared_info);
#ifdef __x86_64__
    free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
    free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
#endif
    free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
    return rc;
}

void arch_domain_destroy(struct domain *d)
{
    struct vcpu *v;

    if ( is_hvm_domain(d) )
    {
        for_each_vcpu ( d, v )
            hvm_vcpu_destroy(v);
        hvm_domain_destroy(d);
    }

    shadow_final_teardown(d);

    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
    free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
#endif

    free_xenheap_page(d->shared_info);
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long cr3_pfn = INVALID_MFN;
    int i, rc;

    if ( !is_hvm_vcpu(v) )
    {
        fixup_guest_stack_selector(c->user_regs.ss);
        fixup_guest_stack_selector(c->kernel_ss);
        fixup_guest_code_selector(c->user_regs.cs);

#ifdef __i386__
        fixup_guest_code_selector(c->event_callback_cs);
        fixup_guest_code_selector(c->failsafe_callback_cs);
#endif

        for ( i = 0; i < 256; i++ )
            fixup_guest_code_selector(c->trap_ctxt[i].cs);

        /* LDT safety checks. */
        if ( ((c->ldt_base & (PAGE_SIZE-1)) != 0) ||
             (c->ldt_ents > 8192) ||
             !array_access_ok(c->ldt_base, c->ldt_ents, LDT_ENTRY_SIZE) )
            return -EINVAL;
    }

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_i387_valid )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    /* Only CR0.TS is modifiable by guest or admin. */
    v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
    v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;

    init_int80_direct_trap(v);

    if ( !is_hvm_vcpu(v) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else
    {
        hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    if ( !is_hvm_vcpu(v) )
    {
        if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
            return rc;

        cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c->ctrlreg[3]));

        if ( shadow_mode_refcounts(d)
             ? !get_page(mfn_to_page(cr3_pfn), d)
             : !get_page_and_type(mfn_to_page(cr3_pfn), d,
                                  PGT_base_page_table) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }

        v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
    }

    /* Shadow: make sure the domain has enough shadow memory to
     * boot another vcpu */
    if ( shadow_mode_enabled(d)
         && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) )
    {
        destroy_gdt(v);
        return -ENOMEM;
    }

    if ( v->vcpu_id == 0 )
        update_domain_wallclock_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    if ( shadow_mode_enabled(d) )
        shadow_update_paging_modes(v);

    update_cr3(v);

    return 0;
}

long
arch_do_vcpu_op(
    int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
{
    long rc = 0;

    switch ( cmd )
    {
    case VCPUOP_register_runstate_memory_area:
    {
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info runstate;

        rc = -EFAULT;
        if ( copy_from_guest(&area, arg, 1) )
            break;

        if ( !guest_handle_okay(area.addr.h, 1) )
            break;

        rc = 0;
        v->runstate_guest = area.addr.h;

        if ( v == current )
        {
            __copy_to_guest(v->runstate_guest, &v->runstate, 1);
        }
        else
        {
            vcpu_runstate_get(v, &runstate);
            __copy_to_guest(v->runstate_guest, &runstate, 1);
        }

        break;
    }

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}

#ifdef __x86_64__

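/*
 * Attempt to load a guest selector.  Evaluates to 1 on success; if the load
 * faults, the .fixup/__ex_table pair below loads a null selector instead,
 * zeroes the result, and the expression evaluates to 0 so the caller can
 * fall back to the guest's failsafe callback.
 */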
#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
    per_cpu(dirty_segment_mask, cpu) = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

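    /*
     * If any selector failed to load, bounce to the guest's failsafe
     * callback: build a frame on the guest kernel stack describing the
     * saved state and enter the guest at failsafe_callback_eip.
     */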
    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;
        unsigned long cs_and_mask, rflags;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        /* CS longword also contains full evtchn_upcall_mask. */
        cs_and_mask = (unsigned long)regs->cs |
            ((unsigned long)n->vcpu_info->evtchn_upcall_mask << 32);

        /* Fold upcall mask into RFLAGS.IF. */
        rflags = regs->rflags & ~X86_EFLAGS_IF;
        rflags |= !n->vcpu_info->evtchn_upcall_mask << 9;

        if ( put_user(regs->ss, rsp- 1) |
             put_user(regs->rsp, rsp- 2) |
             put_user(rflags, rsp- 3) |
             put_user(cs_and_mask, rsp- 4) |
             put_user(regs->rip, rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11, rsp-10) |
             put_user(regs->rcx, rsp-11) )
        {
            gdprintk(XENLOG_ERR, "Error while creating failsafe "
                     "callback frame.\n");
            domain_crash(n->domain);
        }

        if ( test_bit(_VGCF_failsafe_disables_events,
                      &n->arch.guest_context.flags) )
            n->vcpu_info->evtchn_upcall_mask = 1;

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
                          X86_EFLAGS_NT|X86_EFLAGS_TF);
        regs->ss = FLAT_KERNEL_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = FLAT_KERNEL_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    regs->ds = read_segment_register(ds);
    regs->es = read_segment_register(es);
    regs->fs = read_segment_register(fs);
    regs->gs = read_segment_register(gs);

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    this_cpu(dirty_segment_mask) = dirty_segment_mask;
}

#define switch_kernel_stack(v) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

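/*
 * Guest kernels run in ring 1 on i386: keep the TSS's ring-1 stack fields
 * pointing at this vcpu's registered kernel stack so that traps delivered
 * directly into the ring-1 guest kernel land on the right stack.
 */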
static inline void switch_kernel_stack(struct vcpu *v)
{
    struct tss_struct *tss = &init_tss[smp_processor_id()];
    tss->esp1 = v->arch.guest_context.kernel_sp;
    tss->ss1 = v->arch.guest_context.kernel_ss;
}

#endif /* __i386__ */

static void paravirt_ctxt_switch_from(struct vcpu *v)
{
    save_segments(v);
}

static void paravirt_ctxt_switch_to(struct vcpu *v)
{
    set_int80_direct_trap(v);
    switch_kernel_stack(v);
}

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

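/*
 * Complete a (possibly lazy) context switch: save the state of the vcpu
 * whose register state is currently held on this CPU (per_cpu curr_vcpu)
 * and load that of 'current'.  Called with interrupts disabled.
 */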
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = per_cpu(curr_vcpu, cpu);
    struct vcpu *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        p->arch.ctxt_switch_from(p);
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }
        n->arch.ctxt_switch_to(n);
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    per_cpu(curr_vcpu, cpu) = n;
}

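/*
 * Switch to 'next'.  The expensive register and page-table switch is
 * deferred when 'next' is the idle vcpu or its state is already loaded
 * on this CPU; any deferred state is flushed later by
 * __sync_lazy_execstate() if needed.
 */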
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        if ( !cpus_empty(next->vcpu_dirty_cpumask) )
            flush_tlb_mask(next->vcpu_dirty_cpumask);
    }

    local_irq_disable();

    set_current(next);

    if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( !is_hvm_vcpu(next) )
        {
            load_LDT(next);
            load_segments(next);
        }
    }

    context_saved(prev);

    /* Update per-VCPU guest runstate shared memory area (if registered). */
    if ( !guest_handle_is_null(next->runstate_guest) )
        __copy_to_guest(next->runstate_guest, &next->runstate, 1);

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

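/*
 * If this CPU is still holding another vcpu's register state (after a lazy
 * switch to the idle vcpu), complete the switch now.  Returns nonzero if a
 * real switch was performed.
 */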
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (this_cpu(curr_vcpu) != current);

    if ( switch_required )
    {
        ASSERT(current == idle_vcpu[smp_processor_id()]);
        __context_switch();
    }

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}

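/*
 * Pull the next continuation argument off the va_list according to the
 * format character: 'i' = unsigned int, 'l' = unsigned long,
 * 'h' = guest handle (passed as a pointer).
 */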
#define next_arg(fmt, args) ({                                              \
    unsigned long __arg;                                                    \
    switch ( *(fmt)++ )                                                     \
    {                                                                       \
    case 'i': __arg = (unsigned long)va_arg(args, unsigned int);  break;    \
    case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break;    \
    case 'h': __arg = (unsigned long)va_arg(args, void *);        break;    \
    default:  __arg = 0; BUG();                                             \
    }                                                                       \
    __arg;                                                                  \
})

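/*
 * Arrange for a preempted hypercall to be re-executed when the guest next
 * runs: either mark the current multicall entry as preempted, or rewind
 * the guest program counter over the hypercall instruction and stash the
 * (possibly updated) arguments back in the calling registers.
 */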
unsigned long hypercall_create_continuation(
    unsigned int op, const char *format, ...)
{
    struct mc_state *mcs = &this_cpu(mc_state);
    struct cpu_user_regs *regs;
    const char *p = format;
    unsigned long arg;
    unsigned int i;
    va_list args;

    va_start(args, format);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; *p != '\0'; i++ )
            mcs->call.args[i] = next_arg(p, args);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;

        if ( supervisor_mode_kernel || is_hvm_vcpu(current) )
            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
        else
            regs->eip -= 2;   /* re-execute 'int 0x82' */

        for ( i = 0; *p != '\0'; i++ )
        {
            arg = next_arg(p, args);
            switch ( i )
            {
            case 0: regs->ebx = arg; break;
            case 1: regs->ecx = arg; break;
            case 2: regs->edx = arg; break;
            case 3: regs->esi = arg; break;
            case 4: regs->edi = arg; break;
            case 5: regs->ebp = arg; break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; *p != '\0'; i++ )
        {
            arg = next_arg(p, args);
            switch ( i )
            {
            case 0: regs->rdi = arg; break;
            case 1: regs->rsi = arg; break;
            case 2: regs->rdx = arg; break;
            case 3: regs->r10 = arg; break;
            case 4: regs->r8 = arg; break;
            case 5: regs->r9 = arg; break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
         * or sh_update_paging_modes()) */
        pfn = pagetable_get_pfn(v->arch.guest_table);
        if ( pfn != 0 )
        {
            if ( shadow_mode_refcounts(d) )
                put_page(mfn_to_page(pfn));
            else
                put_page_and_type(mfn_to_page(pfn));
            v->arch.guest_table = pagetable_null();
        }

#ifdef __x86_64__
        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
        pfn = pagetable_get_pfn(v->arch.guest_table_user);
        if ( pfn != 0 )
        {
            if ( shadow_mode_refcounts(d) )
                put_page(mfn_to_page(pfn));
            else
                put_page_and_type(mfn_to_page(pfn));
            v->arch.guest_table_user = pagetable_null();
        }
#endif
    }

    /* Tear down shadow mode stuff. */
    shadow_teardown(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);

    /* Free page used by xen oprofile buffer */
    free_xenoprof_pages(d);
}

void arch_dump_domain_info(struct domain *d)
{
    if ( shadow_mode_enabled(d) )
    {
        printk("    shadow mode: ");
        if ( d->arch.shadow.mode & SHM2_enable )
            printk("enabled ");
        if ( shadow_mode_refcounts(d) )
            printk("refcounts ");
        if ( shadow_mode_log_dirty(d) )
            printk("log_dirty ");
        if ( shadow_mode_translate(d) )
            printk("translate ");
        if ( shadow_mode_external(d) )
            printk("external ");
        printk("\n");
    }
}

void arch_dump_vcpu_info(struct vcpu *v)
{
    if ( shadow_mode_enabled(v->domain) )
    {
        if ( v->arch.shadow.mode )
            printk("    shadowed %u-on-%u, %stranslated\n",
                   v->arch.shadow.mode->guest_levels,
                   v->arch.shadow.mode->shadow_levels,
                   shadow_vcpu_mode_translate(v) ? "" : "not ");
        else
            printk("    not shadowed\n");
    }
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */