
xen/arch/x86/domain.c @ 12226:45e34f00a78f

[HVM] Clean up VCPU initialisation in Xen. No longer
parse HVM e820 tables in Xen (add some extra HVM parameters as a
cleaner alternative). Lots of code removal.
Signed-off-by: Keir Fraser <keir@xensource.com>
author:   kfraser@localhost.localdomain
date:     Thu Nov 02 15:55:51 2006 +0000 (2006-11-02)
parents:  cf3d69ba5633
children: dd62270df2ad

/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <xen/kernel.h>
#include <xen/multicall.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/percpu.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <asm/shadow.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/msr.h>
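
/*
 * 'curr_vcpu' tracks which VCPU's register, FPU and segment state is
 * currently loaded on each physical CPU.  Because switches to the idle VCPU
 * are performed lazily (see context_switch() below), this can lag behind
 * 'current' until __context_switch() or __sync_lazy_execstate() runs.
 */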
DEFINE_PER_CPU(struct vcpu *, curr_vcpu);

static void paravirt_ctxt_switch_from(struct vcpu *v);
static void paravirt_ctxt_switch_to(struct vcpu *v);

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    for ( ; ; )
    {
        page_scrub_schedule_work();
        default_idle();
        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);

    reset_stack_and_jump(idle_loop);
}

void dump_pageframe_info(struct domain *d)
{
    struct page_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk(" DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_maddr(page)), _p(page_to_mfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_maddr(page)), _p(page_to_mfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    v->vcpu_id = vcpu_id;
    v->domain = d;

    v->arch.flags = TF_kernel_mode;

    if ( is_hvm_domain(d) )
    {
        if ( hvm_vcpu_initialise(v) != 0 )
        {
            xfree(v);
            return NULL;
        }
    }
    else
    {
        v->arch.schedule_tail = continue_nonidle_domain;
        v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
        v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;

        if ( is_idle_domain(d) )
        {
            v->arch.schedule_tail = continue_idle_domain;
            v->arch.cr3 = __pa(idle_pg_table);
        }
    }

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);

    pae_l3_cache_init(&v->arch.pae_l3_cache);

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    xfree(v);
}
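
/*
 * Allocate and initialise the x86-specific state of a new domain: the
 * per-domain mapping area (GDT/LDT page-table entries for every possible
 * VCPU), the shadow-mode lists, the I/O port capability rangeset and the
 * shared_info page.  HVM domains are additionally switched into shadow
 * refcount/translate/external mode.
 */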
int arch_domain_create(struct domain *d)
{
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order;
    int i, rc = -ENOMEM;

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    if ( d->arch.mm_perdomain_pt == NULL )
        goto fail;
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;

#if defined(__i386__)

    mapcache_init(d);

#else /* __x86_64__ */

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    if ( (d->arch.mm_perdomain_l2 == NULL) ||
         (d->arch.mm_perdomain_l3 == NULL) )
        goto fail;

    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);

#endif /* __x86_64__ */

    shadow_lock_init(d);
    for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
        INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
    INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
    INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
    INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows);

    if ( !is_idle_domain(d) )
    {
        d->arch.ioport_caps =
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
        if ( d->arch.ioport_caps == NULL )
            goto fail;

        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
            goto fail;

        memset(d->shared_info, 0, PAGE_SIZE);
        share_xen_page_with_guest(
            virt_to_page(d->shared_info), d, XENSHARE_writable);
    }

    if ( is_hvm_domain(d) )
    {
        if ( !hvm_enabled )
        {
            gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
                     "on a non-VT/AMDV platform.\n");
            rc = -EINVAL;
            goto fail;
        }

        rc = shadow_enable(d, SHM2_refcounts|SHM2_translate|SHM2_external);
        if ( rc != 0 )
            goto fail;
    }

    return 0;

 fail:
    free_xenheap_page(d->shared_info);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
    free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
    return rc;
}

void arch_domain_destroy(struct domain *d)
{
    shadow_final_teardown(d);

    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif

    free_xenheap_page(d->shared_info);
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long cr3_pfn = INVALID_MFN;
    int i, rc;

    if ( !!(c->flags & VGCF_hvm_guest) != is_hvm_vcpu(v) )
        return -EINVAL;
    c->flags &= ~VGCF_hvm_guest;

    if ( !is_hvm_vcpu(v) )
    {
        fixup_guest_stack_selector(c->user_regs.ss);
        fixup_guest_stack_selector(c->kernel_ss);
        fixup_guest_code_selector(c->user_regs.cs);

#ifdef __i386__
        fixup_guest_code_selector(c->event_callback_cs);
        fixup_guest_code_selector(c->failsafe_callback_cs);
#endif

        for ( i = 0; i < 256; i++ )
            fixup_guest_code_selector(c->trap_ctxt[i].cs);
    }

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_i387_valid )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    /* Only CR0.TS is modifiable by guest or admin. */
    v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
    v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;

    init_int80_direct_trap(v);

    if ( !is_hvm_vcpu(v) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    if ( !is_hvm_vcpu(v) )
    {
        if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
            return rc;

        cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c->ctrlreg[3]));

        if ( shadow_mode_refcounts(d)
             ? !get_page(mfn_to_page(cr3_pfn), d)
             : !get_page_and_type(mfn_to_page(cr3_pfn), d,
                                  PGT_base_page_table) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }

        v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
    }

    /* Shadow: make sure the domain has enough shadow memory to
     * boot another vcpu */
    if ( shadow_mode_enabled(d)
         && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) )
    {
        destroy_gdt(v);
        return -ENOMEM;
    }

    if ( v->vcpu_id == 0 )
        update_domain_wallclock_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    if ( shadow_mode_enabled(d) )
        shadow_update_paging_modes(v);

    update_cr3(v);

    return 0;
}

long
arch_do_vcpu_op(
    int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
{
    long rc = 0;

    switch ( cmd )
    {
    case VCPUOP_register_runstate_memory_area:
    {
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info runstate;

        rc = -EFAULT;
        if ( copy_from_guest(&area, arg, 1) )
            break;

        if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
            break;

        rc = 0;
        v->runstate_guest = area.addr.v;

        if ( v == current )
        {
            __copy_to_user(v->runstate_guest, &v->runstate,
                           sizeof(v->runstate));
        }
        else
        {
            vcpu_runstate_get(v, &runstate);
            __copy_to_user(v->runstate_guest, &runstate, sizeof(runstate));
        }

        break;
    }

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}
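
/*
 * Build the start-of-day register state for a new PV guest thread: flat
 * kernel segment selectors, entry point in EIP, initial stack in ESP and the
 * start_info pointer in ESI, with interrupts enabled.
 */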
void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}


#ifdef __x86_64__

#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20
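
/*
 * Reload the incoming guest's data segment selectors and FS/GS bases,
 * skipping any that are provably clean according to 'dirty_segment_mask'.
 * If a selector cannot be loaded, a failsafe callback frame is written to
 * the guest kernel stack and the guest is entered via its registered
 * failsafe callback instead.
 */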
static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
    per_cpu(dirty_segment_mask, cpu) = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;
        unsigned long cs_and_mask, rflags;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        /* CS longword also contains full evtchn_upcall_mask. */
        cs_and_mask = (unsigned long)regs->cs |
            ((unsigned long)n->vcpu_info->evtchn_upcall_mask << 32);

        /* Fold upcall mask into RFLAGS.IF. */
        rflags = regs->rflags & ~X86_EFLAGS_IF;
        rflags |= !n->vcpu_info->evtchn_upcall_mask << 9;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(rflags,              rsp- 3) |
             put_user(cs_and_mask,         rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            gdprintk(XENLOG_ERR, "Error while creating failsafe "
                     "callback frame.\n");
            domain_crash(n->domain);
        }

        if ( test_bit(_VGCF_failsafe_disables_events,
                      &n->arch.guest_context.flags) )
            n->vcpu_info->evtchn_upcall_mask = 1;

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
                          X86_EFLAGS_NT|X86_EFLAGS_TF);
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    regs->ds = read_segment_register(ds);
    regs->es = read_segment_register(es);
    regs->fs = read_segment_register(fs);
    regs->gs = read_segment_register(gs);

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    this_cpu(dirty_segment_mask) = dirty_segment_mask;
}

#define switch_kernel_stack(v) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *v)
{
    struct tss_struct *tss = &init_tss[smp_processor_id()];
    tss->esp1 = v->arch.guest_context.kernel_sp;
    tss->ss1  = v->arch.guest_context.kernel_ss;
}

#endif /* __i386__ */

static void paravirt_ctxt_switch_from(struct vcpu *v)
{
    save_segments(v);
}

static void paravirt_ctxt_switch_to(struct vcpu *v)
{
    set_int80_direct_trap(v);
    switch_kernel_stack(v);
}

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
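
/*
 * Perform the real state switch from per-CPU 'curr_vcpu' to 'current': save
 * the outgoing VCPU's user registers, FPU and segment state; load the
 * incoming VCPU's registers and (if set) debug registers; update the dirty
 * cpumasks; switch page tables; and reload the GDT if the VCPU id changed.
 */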
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = per_cpu(curr_vcpu, cpu);
    struct vcpu *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        p->arch.ctxt_switch_from(p);
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }
        n->arch.ctxt_switch_to(n);
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    per_cpu(curr_vcpu, cpu) = n;
}
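
/*
 * Scheduler-facing entry point.  Switching to the idle VCPU, or back to the
 * VCPU whose state is still resident on this CPU, skips the expensive
 * __context_switch(); the deferred switch is completed later, either here or
 * via __sync_lazy_execstate().
 */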
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        if ( !cpus_empty(next->vcpu_dirty_cpumask) )
            flush_tlb_mask(next->vcpu_dirty_cpumask);
    }

    local_irq_disable();

    set_current(next);

    if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( !is_hvm_vcpu(next) )
        {
            load_LDT(next);
            load_segments(next);
        }
    }

    context_saved(prev);

    /* Update per-VCPU guest runstate shared memory area (if registered). */
    if ( next->runstate_guest != NULL )
        __copy_to_user(next->runstate_guest, &next->runstate,
                       sizeof(next->runstate));

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}
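
/*
 * Complete a lazily-deferred context switch if this CPU is still holding
 * another VCPU's state (i.e. the idle VCPU is running on top of it).
 * Returns non-zero if a switch was actually performed.
 */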
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (this_cpu(curr_vcpu) != current);

    if ( switch_required )
    {
        ASSERT(current == idle_vcpu[smp_processor_id()]);
        __context_switch();
    }

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}
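
/*
 * Fetch the next hypercall argument from the va_list according to the format
 * character: 'i' = unsigned int, 'l' = unsigned long, 'h' = guest handle
 * (passed through as a pointer value).
 */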
#define next_arg(fmt, args) ({                                              \
    unsigned long __arg;                                                    \
    switch ( *(fmt)++ )                                                     \
    {                                                                       \
    case 'i': __arg = (unsigned long)va_arg(args, unsigned int);  break;    \
    case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break;    \
    case 'h': __arg = (unsigned long)va_arg(args, void *);        break;    \
    default:  __arg = 0; BUG();                                             \
    }                                                                       \
    __arg;                                                                  \
})
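
/*
 * Arrange for the current hypercall to be restarted with (possibly updated)
 * arguments: inside a multicall, mark the call as preempted; otherwise
 * rewind the guest instruction pointer over the hypercall instruction and
 * rewrite the argument registers so the hypercall is re-issued when the
 * guest next runs.
 */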
unsigned long hypercall_create_continuation(
    unsigned int op, const char *format, ...)
{
    struct mc_state *mcs = &this_cpu(mc_state);
    struct cpu_user_regs *regs;
    const char *p = format;
    unsigned long arg;
    unsigned int i;
    va_list args;

    va_start(args, format);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; *p != '\0'; i++ )
            mcs->call.args[i] = next_arg(p, args);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;

        if ( supervisor_mode_kernel || is_hvm_vcpu(current) )
            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
        else
            regs->eip -= 2;   /* re-execute 'int 0x82' */

        for ( i = 0; *p != '\0'; i++ )
        {
            arg = next_arg(p, args);
            switch ( i )
            {
            case 0: regs->ebx = arg; break;
            case 1: regs->ecx = arg; break;
            case 2: regs->edx = arg; break;
            case 3: regs->esi = arg; break;
            case 4: regs->edi = arg; break;
            case 5: regs->ebp = arg; break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; *p != '\0'; i++ )
        {
            arg = next_arg(p, args);
            switch ( i )
            {
            case 0: regs->rdi = arg; break;
            case 1: regs->rsi = arg; break;
            case 2: regs->rdx = arg; break;
            case 3: regs->r10 = arg; break;
            case 4: regs->r8  = arg; break;
            case 5: regs->r9  = arg; break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}
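
/*
 * Walk one of the domain's page lists, dropping the 'pinned' and 'allocated'
 * references on every page and forcibly invalidating top-level page tables
 * so that circular 'linear page table' references cannot keep pages alive
 * once the domain is dead.
 */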
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
         * or sh_update_paging_modes()) */
        pfn = pagetable_get_pfn(v->arch.guest_table);
        if ( pfn != 0 )
        {
            if ( shadow_mode_refcounts(d) )
                put_page(mfn_to_page(pfn));
            else
                put_page_and_type(mfn_to_page(pfn));
            v->arch.guest_table = pagetable_null();
        }

#ifdef __x86_64__
        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
        pfn = pagetable_get_pfn(v->arch.guest_table_user);
        if ( pfn != 0 )
        {
            if ( shadow_mode_refcounts(d) )
                put_page(mfn_to_page(pfn));
            else
                put_page_and_type(mfn_to_page(pfn));
            v->arch.guest_table_user = pagetable_null();
        }
#endif
    }

    if ( is_hvm_domain(d) )
        hvm_relinquish_guest_resources(d);

    /* Tear down shadow mode stuff. */
    shadow_teardown(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);

    /* Free page used by xen oprofile buffer */
    free_xenoprof_pages(d);
}

void arch_dump_domain_info(struct domain *d)
{
    if ( shadow_mode_enabled(d) )
    {
        printk(" shadow mode: ");
        if ( d->arch.shadow.mode & SHM2_enable )
            printk("enabled ");
        if ( shadow_mode_refcounts(d) )
            printk("refcounts ");
        if ( shadow_mode_log_dirty(d) )
            printk("log_dirty ");
        if ( shadow_mode_translate(d) )
            printk("translate ");
        if ( shadow_mode_external(d) )
            printk("external ");
        printk("\n");
    }
}

void arch_dump_vcpu_info(struct vcpu *v)
{
    if ( shadow_mode_enabled(v->domain) )
    {
        if ( v->arch.shadow.mode )
            printk(" shadowed %u-on-%u, %stranslated\n",
                   v->arch.shadow.mode->guest_levels,
                   v->arch.shadow.mode->shadow_levels,
                   shadow_vcpu_mode_translate(v) ? "" : "not ");
        else
            printk(" not shadowed\n");
    }
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */