ia64/xen-unstable

view xen/arch/x86/domain.c @ 10892:0d2ba35c0cf2

[XEN] Add hypercall support for HVM guests. This is
fairly useless at the moment, since all of the hypercalls
fail: copy_from_user doesn't yet work correctly in HVM
domains.

Signed-off-by: Steven Smith <ssmith@xensource.com>

Add a CPUID hypervisor platform interface at leaf
0x40000000. Allow the hypercall transfer page to be filled
in via MSR 0x40000000.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Aug 01 17:18:05 2006 +0100 (2006-08-01)
parents 462d6e4cb29a
children 5fc926b58609
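
To illustrate the guest-visible interface the two commit messages describe, a guest kernel might probe the hypervisor CPUID leaf and ask Xen to fill in the hypercall transfer page roughly as follows. This is a minimal, hypothetical guest-side sketch, not code from this changeset: the cpuid/wrmsr_u64 helpers and setup_hypercall_page() are assumptions, and a real guest would also validate the signature bytes returned by leaf 0x40000000. The only details taken from the changeset are the CPUID leaf number 0x40000000 and the MSR index 0x40000000.

/* Hypothetical guest-side use of the interface added by this changeset. */
#include <stdint.h>

static inline void cpuid(uint32_t leaf, uint32_t *eax, uint32_t *ebx,
                         uint32_t *ecx, uint32_t *edx)
{
    __asm__ __volatile__ ( "cpuid"
                           : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                           : "0" (leaf) );
}

static inline void wrmsr_u64(uint32_t msr, uint64_t val)
{
    __asm__ __volatile__ ( "wrmsr" : : "c" (msr), "a" ((uint32_t)val),
                           "d" ((uint32_t)(val >> 32)) );
}

/*
 * page_gpa: guest-physical address of a page the guest has set aside.
 * Returns 0 on success, -1 if no hypervisor CPUID leaf appears present.
 */
int setup_hypercall_page(uint64_t page_gpa)
{
    uint32_t eax, ebx, ecx, edx;

    /* Leaf 0x40000000 advertises the hypervisor platform interface; this is
     * only a crude presence check (a real guest would verify the signature
     * bytes in ebx/ecx/edx, which this changeset header does not spell out). */
    cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
    if ( eax < 0x40000000 )
        return -1;

    /* Writing the page's address to MSR 0x40000000 asks Xen to fill it with
     * hypercall entry stubs; hypercalls are then made by calling into it. */
    wrmsr_u64(0x40000000, page_gpa);
    return 0;
}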
line source

/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <xen/kernel.h>
#include <xen/multicall.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/percpu.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <asm/shadow.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/msr.h>

/*
 * Per-CPU pointer to the vcpu whose register and segment state is currently
 * loaded on this CPU. It can lag behind 'current', because switches to the
 * idle vcpu are performed lazily (see __context_switch()).
 */
DEFINE_PER_CPU(struct vcpu *, curr_vcpu);

static void paravirt_ctxt_switch_from(struct vcpu *v);
static void paravirt_ctxt_switch_to(struct vcpu *v);

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    for ( ; ; )
    {
        page_scrub_schedule_work();
        default_idle();
        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);

    reset_stack_and_jump(idle_loop);
}

void dump_pageframe_info(struct domain *d)
{
    struct page_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk(" DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_maddr(page)), _p(page_to_mfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_maddr(page)), _p(page_to_mfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    v->arch.flags = TF_kernel_mode;

    v->arch.schedule_tail = is_idle_domain(d) ?
        continue_idle_domain : continue_nonidle_domain;

    v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
    v->arch.ctxt_switch_to   = paravirt_ctxt_switch_to;

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);

    v->arch.guest_vtable  = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;
#if defined(__x86_64__)
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;
#endif

    pae_l3_cache_init(&v->arch.pae_l3_cache);

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    xfree(v);
}

int arch_domain_create(struct domain *d)
{
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order, rc;
#ifdef __x86_64__
    int i;
#endif

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    if ( d->arch.mm_perdomain_pt == NULL )
        goto fail_nomem;
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;

#if defined(__i386__)

    mapcache_init(d);

#else /* __x86_64__ */

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    if ( (d->arch.mm_perdomain_l2 == NULL) ||
         (d->arch.mm_perdomain_l3 == NULL) )
        goto fail_nomem;

    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);

#endif /* __x86_64__ */

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);

    if ( !is_idle_domain(d) )
    {
        d->arch.ioport_caps =
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
        if ( d->arch.ioport_caps == NULL )
            goto fail_nomem;

        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
            goto fail_nomem;

        if ( (rc = ptwr_init(d)) != 0 )
            goto fail_nomem;

        memset(d->shared_info, 0, PAGE_SIZE);
        share_xen_page_with_guest(
            virt_to_page(d->shared_info), d, XENSHARE_writable);
    }

    return 0;

 fail_nomem:
    free_xenheap_page(d->shared_info);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
    free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
    return -ENOMEM;
}

void arch_domain_destroy(struct domain *d)
{
    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif

    free_xenheap_page(d->shared_info);
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long cr3_pfn = INVALID_MFN;
    int i, rc;

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        fixup_guest_stack_selector(c->user_regs.ss);
        fixup_guest_stack_selector(c->kernel_ss);
        fixup_guest_code_selector(c->user_regs.cs);

#ifdef __i386__
        fixup_guest_code_selector(c->event_callback_cs);
        fixup_guest_code_selector(c->failsafe_callback_cs);
#endif

        for ( i = 0; i < 256; i++ )
            fixup_guest_code_selector(c->trap_ctxt[i].cs);
    }
    else if ( !hvm_enabled )
        return -EINVAL;

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_HVM_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    /* Only CR0.TS is modifiable by guest or admin. */
    v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
    v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;

    init_int80_direct_trap(v);

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c->ctrlreg[3]));
        v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
        return rc;

    if ( c->flags & VGCF_HVM_GUEST )
    {
        v->arch.guest_table = pagetable_null();

        if ( !hvm_initialize_guest_resources(v) )
            return -EINVAL;
    }
    else if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(mfn_to_page(cr3_pfn), d) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }
    }
    else
    {
        if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
                                PGT_base_page_table) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        update_domain_wallclock_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}

long
arch_do_vcpu_op(
    int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
{
    long rc = 0;

    switch ( cmd )
    {
    case VCPUOP_register_runstate_memory_area:
    {
        struct vcpu_register_runstate_memory_area area;

        rc = -EFAULT;
        if ( copy_from_guest(&area, arg, 1) )
            break;

        if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
            break;

        rc = 0;
        v->runstate_guest = area.addr.v;

        if ( v == current )
            __copy_to_user(v->runstate_guest, &v->runstate,
                           sizeof(v->runstate));

        break;
    }

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}

void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}


#ifdef __x86_64__

#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
    per_cpu(dirty_segment_mask, cpu) = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;
        unsigned long cs_and_mask, rflags;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        /* CS longword also contains full evtchn_upcall_mask. */
        cs_and_mask = (unsigned long)regs->cs |
            ((unsigned long)n->vcpu_info->evtchn_upcall_mask << 32);

        /* Fold upcall mask into RFLAGS.IF. */
        rflags  = regs->rflags & ~X86_EFLAGS_IF;
        rflags |= !n->vcpu_info->evtchn_upcall_mask << 9;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(rflags,              rsp- 3) |
             put_user(cs_and_mask,         rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash(n->domain);
        }

        if ( test_bit(_VGCF_failsafe_disables_events,
                      &n->arch.guest_context.flags) )
            n->vcpu_info->evtchn_upcall_mask = 1;

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    regs->ds = read_segment_register(ds);
    regs->es = read_segment_register(es);
    regs->fs = read_segment_register(fs);
    regs->gs = read_segment_register(gs);

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    this_cpu(dirty_segment_mask) = dirty_segment_mask;
}

#define switch_kernel_stack(v) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *v)
{
    struct tss_struct *tss = &init_tss[smp_processor_id()];
    tss->esp1 = v->arch.guest_context.kernel_sp;
    tss->ss1  = v->arch.guest_context.kernel_ss;
}

#endif /* __i386__ */

static void paravirt_ctxt_switch_from(struct vcpu *v)
{
    save_segments(v);
}

static void paravirt_ctxt_switch_to(struct vcpu *v)
{
    set_int80_direct_trap(v);
    switch_kernel_stack(v);
}

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

/*
 * Heavyweight state switch: save the outgoing vcpu's registers, FPU and
 * segment state, load the incoming vcpu's, and switch page tables and the
 * GDT. Switches to the idle vcpu defer this work (see context_switch() and
 * __sync_lazy_execstate()).
 */
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = per_cpu(curr_vcpu, cpu);
    struct vcpu *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        p->arch.ctxt_switch_from(p);
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        n->arch.ctxt_switch_to(n);
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    per_cpu(curr_vcpu, cpu) = n;
}


void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        if ( !cpus_empty(next->vcpu_dirty_cpumask) )
            flush_tlb_mask(next->vcpu_dirty_cpumask);
    }

    local_irq_disable();

    set_current(next);

    if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( !hvm_guest(next) )
        {
            load_LDT(next);
            load_segments(next);
        }
    }

    context_saved(prev);

    /* Update per-VCPU guest runstate shared memory area (if registered). */
    if ( next->runstate_guest != NULL )
        __copy_to_user(next->runstate_guest, &next->runstate,
                       sizeof(next->runstate));

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

/*
 * Complete any context switch that was lazily deferred: if the idle vcpu is
 * running but another vcpu's state is still loaded on this CPU, switch it
 * out now. Returns nonzero if a switch was actually performed.
 */
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (this_cpu(curr_vcpu) != current);

    if ( switch_required )
    {
        ASSERT(current == idle_vcpu[smp_processor_id()]);
        __context_switch();
    }

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}

#define next_arg(fmt, args) ({                                              \
    unsigned long __arg;                                                    \
    switch ( *(fmt)++ )                                                     \
    {                                                                       \
    case 'i': __arg = (unsigned long)va_arg(args, unsigned int);  break;    \
    case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break;    \
    case 'h': __arg = (unsigned long)va_arg(args, void *);        break;    \
    default:  __arg = 0; BUG();                                             \
    }                                                                       \
    __arg;                                                                  \
})

/*
 * Arrange for the current hypercall to be restarted when the guest next
 * runs: either flag the active multicall entry as preempted, or rewind the
 * guest instruction pointer and reload the argument registers.
 */
unsigned long hypercall_create_continuation(
    unsigned int op, const char *format, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    const char *p = format;
    unsigned long arg;
    unsigned int i;
    va_list args;

    va_start(args, format);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; *p != '\0'; i++ )
            mcs->call.args[i] = next_arg(p, args);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;

        if ( supervisor_mode_kernel || hvm_guest(current) )
            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
        else
            regs->eip -= 2;   /* re-execute 'int 0x82' */

        for ( i = 0; *p != '\0'; i++ )
        {
            arg = next_arg(p, args);
            switch ( i )
            {
            case 0: regs->ebx = arg; break;
            case 1: regs->ecx = arg; break;
            case 2: regs->edx = arg; break;
            case 3: regs->esi = arg; break;
            case 4: regs->edi = arg; break;
            case 5: regs->ebp = arg; break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; *p != '\0'; i++ )
        {
            arg = next_arg(p, args);
            switch ( i )
            {
            case 0: regs->rdi = arg; break;
            case 1: regs->rsi = arg; break;
            case 2: regs->rdx = arg; break;
            case 3: regs->r10 = arg; break;
            case 4: regs->r8  = arg; break;
            case 5: regs->r9  = arg; break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

/*
 * Drop the domain's references to each page on the given list, breaking any
 * circular 'linear page table' references along the way.
 */
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    ptwr_destroy(d);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(mfn_to_page(pfn));
            put_page(mfn_to_page(pfn));

            v->arch.guest_table = pagetable_null();
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(mfn_to_page(pfn));
            put_page(mfn_to_page(pfn));

            v->arch.guest_table_user = pagetable_null();
        }
    }

    if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) )
        hvm_relinquish_guest_resources(d);

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);

    /* Free page used by xen oprofile buffer */
    free_xenoprof_pages(d);

}

void arch_dump_domain_info(struct domain *d)
{
    if ( shadow_mode_enabled(d) )
    {
        printk(" shadow mode: ");
        if ( shadow_mode_refcounts(d) )
            printk("refcounts ");
        if ( shadow_mode_write_all(d) )
            printk("write_all ");
        if ( shadow_mode_log_dirty(d) )
            printk("log_dirty ");
        if ( shadow_mode_translate(d) )
            printk("translate ");
        if ( shadow_mode_external(d) )
            printk("external ");
        if ( shadow_mode_wr_pt_pte(d) )
            printk("wr_pt_pte ");
        printk("\n");
    }
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */