direct-io.hg

xen/arch/x86/domain.c @ 8624:1b7d6c2ae2f7

Sync against scheduler tail on other CPUs when context
switching to a new VCPU. Otherwise we cannot pull the
VCPU's state off the other CPU.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Jan 17 13:25:30 2006 +0100 (2006-01-17)
parents 1ccc28e075ba
children af7e4ce46653
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/msr.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);
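
/*
 * Per-CPU context-switch bookkeeping. 'curr_vcpu' is the VCPU whose register
 * state is currently loaded on this CPU (it may differ from 'current' while
 * the idle VCPU runs on borrowed state -- see __context_switch()).
 * 'dirty_segment_mask' records which guest segment selectors/bases were live
 * when that state was saved, so load_segments() can skip redundant reloads.
 */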
struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);

    reset_stack_and_jump(idle_loop);
}

static long no_idt[2];
static int reboot_mode;
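
/*
 * Poll the keyboard controller status port (0x64) until its input buffer is
 * empty (bit 1 clear) or we give up, so the reset pulse below is accepted.
 */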
static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

    stop_vmx();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe,0x64);         /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}


void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk("    DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("    DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), _p(page_to_pfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("    XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), _p(page_to_pfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}
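
/*
 * Allocate and zero a vcpu structure and set up its architecture-specific
 * fields: schedule tail, per-domain mapping slot and linear pagetable
 * pointers. Returns NULL on allocation failure.
 */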
struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    v->arch.flags = TF_kernel_mode;

    if ( is_idle_domain(d) )
    {
        percpu_ctxt[vcpu_id].curr_vcpu = v;
        v->arch.schedule_tail = continue_idle_domain;
    }
    else
    {
        v->arch.schedule_tail = continue_nonidle_domain;
    }

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);

    v->arch.guest_vtable = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;
#if defined(__x86_64__)
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;
#endif

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    xfree(v);
}
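
/*
 * Per-domain architecture setup: allocate the per-domain mapping table and
 * map the reserved Xen GDT entries for every possible VCPU, build the
 * x86-64 per-domain L2/L3 (or the i386 mapcache), and, for non-idle
 * domains, create the I/O-port rangeset, shared_info page and ptwr state.
 */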
int arch_domain_create(struct domain *d)
{
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order, rc;
#ifdef __x86_64__
    int i;
#endif

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    if ( d->arch.mm_perdomain_pt == NULL )
        goto fail_nomem;
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;

#if defined(__i386__)

    mapcache_init(d);

#else /* __x86_64__ */

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    if ( (d->arch.mm_perdomain_l2 == NULL) ||
         (d->arch.mm_perdomain_l3 == NULL) )
        goto fail_nomem;

    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);

#endif /* __x86_64__ */

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);

    if ( !is_idle_domain(d) )
    {
        d->arch.ioport_caps =
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
        if ( d->arch.ioport_caps == NULL )
            goto fail_nomem;

        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
            goto fail_nomem;

        if ( (rc = ptwr_init(d)) != 0 )
            goto fail_nomem;

        memset(d->shared_info, 0, PAGE_SIZE);
        SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    }

    return 0;

 fail_nomem:
    free_xenheap_page(d->shared_info);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
    free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
    return -ENOMEM;
}

void arch_domain_destroy(struct domain *d)
{
    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif

    free_xenheap_page(d->shared_info);
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }
    else if ( !hvm_enabled )
        return -EINVAL;

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(pfn_to_page(phys_basetab>>PAGE_SHIFT), d) )
            return -EINVAL;
    }
    else if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( !get_page_and_type(pfn_to_page(phys_basetab>>PAGE_SHIFT), d,
                                PGT_base_page_table) )
            return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(pfn_to_page(phys_basetab>>PAGE_SHIFT));
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;
        v->arch.guest_table = mk_pagetable(0);

        vmx_final_setup_guest(v);
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}


void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}


#ifdef __x86_64__
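
/*
 * loadsegment(seg, value): load a selector into the named segment register.
 * A faulting load is caught via the exception-fixup table, which loads a
 * NULL selector instead and clears the result. Evaluates to 1 on success,
 * 0 if the selector could not be loaded.
 */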
#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20
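
/*
 * Reload the guest's data segment selectors and FS/GS bases for VCPU 'n',
 * skipping loads that the dirty mask shows are unnecessary. If any selector
 * cannot be loaded, a failsafe-callback frame is pushed onto the guest
 * kernel stack and the VCPU is redirected to its failsafe callback.
 */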
static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash(n->domain);
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}
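
/*
 * Snapshot VCPU v's segment selectors into its guest context and compute the
 * dirty mask consumed by load_segments() on the next switch onto this CPU.
 * A non-zero selector implicitly zeroes the corresponding user base.
 */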
static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "mov %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "mov %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "mov %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "mov %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1 = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
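
/*
 * Perform the real state switch from the VCPU whose state currently owns
 * this CPU to 'current': save the outgoing register, FPU and segment state,
 * load the incoming frame, debug registers, kernel stack and page tables,
 * and update the dirty cpumasks. Called with interrupts disabled.
 */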
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}
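
/*
 * Scheduler entry point for switching from 'prev' to 'next' on this CPU.
 * If 'next' still has state live on another CPU we spin until that CPU has
 * left the scheduling tail and then flush it, as described in the changeset
 * comment above; switching onto the idle VCPU is done lazily, leaving the
 * previous VCPU's state loaded until another guest needs the CPU.
 */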
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Make sure the next VCPU is not in a scheduling tail. */
        while ( test_bit(_VCPUF_running, &next->vcpu_flags) )
            cpu_relax();
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        if ( !cpus_empty(next->vcpu_dirty_cpumask) )
            flush_tlb_mask(next->vcpu_dirty_cpumask);
    }

    local_irq_disable();

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            vmx_load_msrs(next);
        }
    }

    context_saved(prev);

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}
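
/*
 * If this CPU is lazily holding a descheduled VCPU's state (e.g. while
 * running the idle VCPU), complete the deferred context switch now.
 * Returns non-zero if a switch was actually performed.
 */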
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}
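
/*
 * Arrange for the current hypercall to be re-executed: either re-marshal the
 * arguments into the pending multicall entry, or rewind the guest program
 * counter over the hypercall instruction and reload its argument registers.
 */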
unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;
        regs->eip -= 2;  /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}
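
/*
 * Drop the domain's references to every page on 'list', forcibly
 * invalidating base page tables so that circular 'linear pagetable'
 * references cannot keep pages alive after the domain is dead.
 */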
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}
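
/*
 * Tear down a dying domain's memory references: destroy ptwr state, drop the
 * in-use references to each VCPU's pagetable bases, disable shadow mode,
 * destroy the GDTs and finally release every remaining page.
 */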
void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    ptwr_destroy(d);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}


/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */