direct-io.hg

view xen/arch/x86/domain.c @ 5669:ff5d7ccd8d69

No changes from me.
author cl349@firebug.cl.cam.ac.uk
date Tue Jul 05 08:47:55 2005 +0000 (2005-07-05)
parents 8bd2e8933277
children f261f14b9781 a83ac0806d6b
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <asm/regs.h>
24 #include <asm/mc146818rtc.h>
25 #include <asm/system.h>
26 #include <asm/io.h>
27 #include <asm/processor.h>
28 #include <asm/desc.h>
29 #include <asm/i387.h>
30 #include <asm/mpspec.h>
31 #include <asm/ldt.h>
32 #include <xen/irq.h>
33 #include <xen/event.h>
34 #include <asm/shadow.h>
35 #include <xen/console.h>
36 #include <xen/elf.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/msr.h>
40 #include <asm/physdev.h>
41 #include <xen/kernel.h>
42 #include <public/io/ioreq.h>
43 #include <xen/multicall.h>
45 /* opt_noreboot: If true, machine will need manual reset on error. */
46 static int opt_noreboot = 0;
47 boolean_param("noreboot", opt_noreboot);
49 struct percpu_ctxt {
50 struct vcpu *curr_vcpu;
51 } __cacheline_aligned;
52 static struct percpu_ctxt percpu_ctxt[NR_CPUS];
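/*
 * percpu_ctxt[cpu].curr_vcpu records which vcpu's register state is actually
 * resident on that CPU.  It can lag behind 'current': when switching to the
 * idle vcpu, context_switch() below leaves the outgoing guest's state loaded
 * ("lazy" context switching), and the deferred save happens later, either
 * when a different guest is switched in or when __sync_lazy_execstate() is
 * forced (e.g. by the TLB-flush IPI).  __cacheline_aligned keeps each entry
 * on its own cache line so CPUs do not false-share the array.
 */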
54 static void continue_idle_task(struct vcpu *v)
55 {
56 reset_stack_and_jump(idle_loop);
57 }
59 static void continue_nonidle_task(struct vcpu *v)
60 {
61 reset_stack_and_jump(ret_from_intr);
62 }
64 static void default_idle(void)
65 {
66 local_irq_disable();
67 if ( !softirq_pending(smp_processor_id()) )
68 safe_halt();
69 else
70 local_irq_enable();
71 }
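/*
 * The disable-interrupts / check-softirqs / safe_halt() sequence avoids the
 * classic lost-wakeup race: safe_halt() issues "sti; hlt" back to back, and
 * because STI only takes effect after the following instruction starts, an
 * interrupt that arrives after the softirq check still terminates the HLT
 * rather than being missed while the CPU sleeps.
 */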
73 void idle_loop(void)
74 {
75 int cpu = smp_processor_id();
77 for ( ; ; )
78 {
79 irq_stat[cpu].idle_timestamp = jiffies;
81 while ( !softirq_pending(cpu) )
82 {
83 page_scrub_schedule_work();
84 default_idle();
85 }
87 do_softirq();
88 }
89 }
91 void startup_cpu_idle_loop(void)
92 {
93 struct vcpu *v = current;
95 ASSERT(is_idle_task(v->domain));
96 percpu_ctxt[smp_processor_id()].curr_vcpu = v;
97 cpu_set(smp_processor_id(), v->domain->cpumask);
98 v->arch.schedule_tail = continue_idle_task;
100 idle_loop();
101 }
103 static long no_idt[2];
104 static int reboot_mode;
106 static inline void kb_wait(void)
107 {
108 int i;
110 for ( i = 0; i < 0x10000; i++ )
111 if ( (inb_p(0x64) & 0x02) == 0 )
112 break;
113 }
115 void machine_restart(char * __unused)
116 {
117 int i;
119 if ( opt_noreboot )
120 {
121 printk("Reboot disabled on cmdline: require manual reset\n");
122 for ( ; ; )
123 safe_halt();
124 }
126 watchdog_disable();
127 console_start_sync();
129 local_irq_enable();
131 /* Ensure we are the boot CPU. */
132 if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
133 {
134 smp_call_function((void *)machine_restart, NULL, 1, 0);
135 for ( ; ; )
136 safe_halt();
137 }
139 /*
140 * Stop all CPUs and turn off local APICs and the IO-APIC, so
141 * other OSs see a clean IRQ state.
142 */
143 smp_send_stop();
144 disable_IO_APIC();
146 #ifdef CONFIG_VMX
147 stop_vmx();
148 #endif
150 /* Rebooting needs to touch the page at absolute address 0. */
151 *((unsigned short *)__va(0x472)) = reboot_mode;
153 for ( ; ; )
154 {
155 /* Pulse the keyboard reset line. */
156 for ( i = 0; i < 100; i++ )
157 {
158 kb_wait();
159 udelay(50);
160 outb(0xfe,0x64); /* pulse reset low */
161 udelay(50);
162 }
164 /* That didn't work - force a triple fault.. */
165 __asm__ __volatile__("lidt %0": "=m" (no_idt));
166 __asm__ __volatile__("int3");
167 }
168 }
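/*
 * Reset ladder used above: the word at physical address 0x472 tells the BIOS
 * whether to perform a warm or cold boot; kb_wait() polls the i8042 status
 * port (0x64) until the controller can accept a command, and writing 0xFE
 * there pulses the CPU reset line.  If that has no effect, loading a
 * zero-limit IDT and executing int3 raises an exception that cannot be
 * delivered, escalating to a triple fault and hence a processor reset.
 */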
171 void __attribute__((noreturn)) __machine_halt(void *unused)
172 {
173 for ( ; ; )
174 safe_halt();
175 }
177 void machine_halt(void)
178 {
179 watchdog_disable();
180 console_start_sync();
181 smp_call_function(__machine_halt, NULL, 1, 0);
182 __machine_halt(NULL);
183 }
185 void dump_pageframe_info(struct domain *d)
186 {
187 struct pfn_info *page;
189 if ( d->tot_pages < 10 )
190 {
191 list_for_each_entry ( page, &d->page_list, list )
192 {
193 printk("Page %p: caf=%08x, taf=%08x\n",
194 _p(page_to_phys(page)), page->count_info,
195 page->u.inuse.type_info);
196 }
197 }
199 list_for_each_entry ( page, &d->xenpage_list, list )
200 {
201 printk("XenPage %p: caf=%08x, taf=%08x\n",
202 _p(page_to_phys(page)), page->count_info,
203 page->u.inuse.type_info);
204 }
207 page = virt_to_page(d->shared_info);
208 printk("Shared_info@%p: caf=%08x, taf=%08x\n",
209 _p(page_to_phys(page)), page->count_info,
210 page->u.inuse.type_info);
211 }
213 struct vcpu *arch_alloc_vcpu_struct(void)
214 {
215 return xmalloc(struct vcpu);
216 }
218 void arch_free_vcpu_struct(struct vcpu *v)
219 {
220 xfree(v);
221 }
223 void free_perdomain_pt(struct domain *d)
224 {
225 free_xenheap_page(d->arch.mm_perdomain_pt);
226 #ifdef __x86_64__
227 free_xenheap_page(d->arch.mm_perdomain_l2);
228 free_xenheap_page(d->arch.mm_perdomain_l3);
229 #endif
230 }
232 void arch_do_createdomain(struct vcpu *v)
233 {
234 struct domain *d = v->domain;
236 v->arch.flags = TF_kernel_mode;
238 if ( is_idle_task(d) )
239 return;
241 v->arch.schedule_tail = continue_nonidle_task;
243 d->shared_info = alloc_xenheap_page();
244 memset(d->shared_info, 0, PAGE_SIZE);
245 v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
246 v->cpumap = CPUMAP_RUNANYWHERE;
247 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
248 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
249 PAGE_SHIFT] = INVALID_M2P_ENTRY;
251 d->arch.mm_perdomain_pt = alloc_xenheap_page();
252 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
253 machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >>
254 PAGE_SHIFT] = INVALID_M2P_ENTRY;
255 v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
256 v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
257 l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
259 v->arch.guest_vtable = __linear_l2_table;
260 v->arch.shadow_vtable = __shadow_linear_l2_table;
262 #ifdef __x86_64__
263 v->arch.guest_vl3table = __linear_l3_table;
264 v->arch.guest_vl4table = __linear_l4_table;
266 d->arch.mm_perdomain_l2 = alloc_xenheap_page();
267 memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
268 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
269 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt),
270 __PAGE_HYPERVISOR);
271 d->arch.mm_perdomain_l3 = alloc_xenheap_page();
272 memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
273 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
274 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
275 __PAGE_HYPERVISOR);
276 #endif
278 (void)ptwr_init(d);
280 shadow_lock_init(d);
281 INIT_LIST_HEAD(&d->arch.free_shadow_frames);
282 }
284 void arch_do_boot_vcpu(struct vcpu *v)
285 {
286 struct domain *d = v->domain;
288 v->arch.flags = TF_kernel_mode;
290 v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
292 v->arch.perdomain_ptes =
293 d->arch.mm_perdomain_pt + (v->vcpu_id << PDPT_VCPU_SHIFT);
294 v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
295 l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
296 }
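/*
 * Secondary vcpus share the domain's perdomain page-table page: each vcpu
 * owns a slot of 1 << PDPT_VCPU_SHIFT PTEs within it, and the only entry
 * populated at boot-vcpu time is the mapping of the Xen GDT at
 * FIRST_RESERVED_GDT_PAGE, mirroring what arch_do_createdomain() set up for
 * vcpu 0.
 */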
298 #ifdef CONFIG_VMX
299 static int vmx_switch_on;
301 void arch_vmx_do_resume(struct vcpu *v)
302 {
303 u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
305 load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr);
306 vmx_do_resume(v);
307 reset_stack_and_jump(vmx_asm_do_resume);
308 }
310 void arch_vmx_do_launch(struct vcpu *v)
311 {
312 u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
314 load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr);
315 vmx_do_launch(v);
316 reset_stack_and_jump(vmx_asm_do_launch);
317 }
319 static int vmx_final_setup_guest(
320 struct vcpu *v, struct vcpu_guest_context *ctxt)
321 {
322 int error;
323 struct cpu_user_regs *regs;
324 struct vmcs_struct *vmcs;
326 regs = &ctxt->user_regs;
328 /*
329 * Create a new VMCS
330 */
331 if (!(vmcs = alloc_vmcs())) {
332 printk("Failed to create a new VMCS\n");
333 return -ENOMEM;
334 }
336 memset(&v->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));
338 v->arch.arch_vmx.vmcs = vmcs;
339 error = construct_vmcs(
340 &v->arch.arch_vmx, regs, ctxt, VMCS_USE_HOST_ENV);
341 if ( error < 0 )
342 {
343 printk("Failed to construct a new VMCS\n");
344 goto out;
345 }
347 v->arch.schedule_tail = arch_vmx_do_launch;
349 #if defined (__i386)
350 v->domain->arch.vmx_platform.real_mode_data =
351 (unsigned long *) regs->esi;
352 #endif
354 if (v == v->domain->vcpu[0]) {
355 /*
356 * Required to do this once per domain
357 * XXX todo: add a separate function to do these.
358 */
359 memset(&v->domain->shared_info->evtchn_mask[0], 0xff,
360 sizeof(v->domain->shared_info->evtchn_mask));
362 /* Put the domain in shadow mode even though we're going to be using
363 * the shared 1:1 page table initially. It shouldn't hurt */
364 shadow_mode_enable(v->domain,
365 SHM_enable|SHM_refcounts|
366 SHM_translate|SHM_external);
367 }
369 if (!vmx_switch_on)
370 vmx_switch_on = 1;
372 return 0;
374 out:
375 free_vmcs(vmcs);
376 v->arch.arch_vmx.vmcs = 0;
377 return error;
378 }
379 #endif
382 /* This is called by arch_final_setup_guest and do_boot_vcpu */
383 int arch_set_info_guest(
384 struct vcpu *v, struct vcpu_guest_context *c)
385 {
386 struct domain *d = v->domain;
387 unsigned long phys_basetab;
388 int i, rc;
390 /*
391 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
392 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
393 * If SS RPL or DPL differs from CS RPL then we'll #GP.
394 */
395 if ( !(c->flags & VGCF_VMX_GUEST) )
396 {
397 if ( ((c->user_regs.cs & 3) == 0) ||
398 ((c->user_regs.ss & 3) == 0) )
399 return -EINVAL;
400 }
402 clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
403 if ( c->flags & VGCF_I387_VALID )
404 set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
406 v->arch.flags &= ~TF_kernel_mode;
407 if ( c->flags & VGCF_IN_KERNEL )
408 v->arch.flags |= TF_kernel_mode;
410 memcpy(&v->arch.guest_context, c, sizeof(*c));
412 if ( !(c->flags & VGCF_VMX_GUEST) )
413 {
414 /* IOPL privileges are virtualised. */
415 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
416 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
418 /* Ensure real hardware interrupts are enabled. */
419 v->arch.guest_context.user_regs.eflags |= EF_IE;
420 } else {
421 __vmwrite(GUEST_RFLAGS, v->arch.guest_context.user_regs.eflags);
422 if (v->arch.guest_context.user_regs.eflags & EF_TF)
423 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
424 else
425 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
426 }
428 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
429 return 0;
431 memset(v->arch.guest_context.debugreg, 0,
432 sizeof(v->arch.guest_context.debugreg));
433 for ( i = 0; i < 8; i++ )
434 (void)set_debugreg(v, i, c->debugreg[i]);
436 if ( v->vcpu_id == 0 )
437 d->vm_assist = c->vm_assist;
439 phys_basetab = c->ctrlreg[3];
440 v->arch.guest_table = mk_pagetable(phys_basetab);
442 if ( shadow_mode_refcounts(d) )
443 {
444 if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
445 return -EINVAL;
446 }
447 else
448 {
449 #ifdef __x86_64__
450 if ( !(c->flags & VGCF_VMX_GUEST) )
451 #endif
452 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
453 PGT_base_page_table) )
454 return -EINVAL;
455 }
457 if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
458 {
459 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
460 return rc;
461 }
463 if ( c->flags & VGCF_VMX_GUEST )
464 {
465 /* VMX uses the initially provided page tables as the P2M map. */
466 if ( !pagetable_get_paddr(d->arch.phys_table) )
467 d->arch.phys_table = v->arch.guest_table;
469 if ( (rc = vmx_final_setup_guest(v, c)) != 0 )
470 return rc;
471 }
473 update_pagetables(v);
475 /* Don't redo final setup */
476 set_bit(_VCPUF_initialised, &v->vcpu_flags);
478 return 0;
479 }
482 void new_thread(struct vcpu *d,
483 unsigned long start_pc,
484 unsigned long start_stack,
485 unsigned long start_info)
486 {
487 struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;
489 /*
490 * Initial register values:
491 * DS,ES,FS,GS = FLAT_KERNEL_DS
492 * CS:EIP = FLAT_KERNEL_CS:start_pc
493 * SS:ESP = FLAT_KERNEL_SS:start_stack
494 * ESI = start_info
495 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
496 */
497 regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
498 regs->ss = FLAT_KERNEL_SS;
499 regs->cs = FLAT_KERNEL_CS;
500 regs->eip = start_pc;
501 regs->esp = start_stack;
502 regs->esi = start_info;
504 __save_flags(regs->eflags);
505 regs->eflags |= X86_EFLAGS_IF;
506 }
509 #ifdef __x86_64__
511 void toggle_guest_mode(struct vcpu *v)
512 {
513 v->arch.flags ^= TF_kernel_mode;
514 __asm__ __volatile__ ( "swapgs" );
515 update_pagetables(v);
516 write_ptbase(v);
517 }
519 #define loadsegment(seg,value) ({ \
520 int __r = 1; \
521 __asm__ __volatile__ ( \
522 "1: movl %k1,%%" #seg "\n2:\n" \
523 ".section .fixup,\"ax\"\n" \
524 "3: xorl %k0,%k0\n" \
525 " movl %k0,%%" #seg "\n" \
526 " jmp 2b\n" \
527 ".previous\n" \
528 ".section __ex_table,\"a\"\n" \
529 " .align 8\n" \
530 " .quad 1b,3b\n" \
531 ".previous" \
532 : "=r" (__r) : "r" (value), "0" (__r) );\
533 __r; })
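/*
 * loadsegment() relies on the exception-fixup machinery: if the guest
 * supplied a bogus selector, the "movl %k1,%%seg" at label 1 faults, the
 * fault handler looks the faulting address up in __ex_table and resumes at
 * label 3, which loads a null selector instead and leaves __r == 0.
 * load_segments() below uses that zero result to decide whether to build a
 * failsafe callback frame for the guest.
 */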
535 #if CONFIG_VMX
536 #define load_msrs(_p, _n) if (vmx_switch_on) vmx_load_msrs((_p), (_n))
537 #else
538 #define load_msrs(_p, _n) ((void)0)
539 #endif
541 static void load_segments(struct vcpu *p, struct vcpu *n)
542 {
543 struct vcpu_guest_context *pctxt = &p->arch.guest_context;
544 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
545 int all_segs_okay = 1;
547 /* Either selector != 0 ==> reload. */
548 if ( unlikely(pctxt->user_regs.ds | nctxt->user_regs.ds) )
549 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
551 /* Either selector != 0 ==> reload. */
552 if ( unlikely(pctxt->user_regs.es | nctxt->user_regs.es) )
553 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
555 /*
556 * Either selector != 0 ==> reload.
557 * Also reload to reset FS_BASE if it was non-zero.
558 */
559 if ( unlikely(pctxt->user_regs.fs |
560 pctxt->fs_base |
561 nctxt->user_regs.fs) )
562 {
563 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
564 if ( pctxt->user_regs.fs ) /* != 0 selector kills fs_base */
565 pctxt->fs_base = 0;
566 }
568 /*
569 * Either selector != 0 ==> reload.
570 * Also reload to reset GS_BASE if it was non-zero.
571 */
572 if ( unlikely(pctxt->user_regs.gs |
573 pctxt->gs_base_user |
574 nctxt->user_regs.gs) )
575 {
576 /* Reset GS_BASE with user %gs? */
577 if ( pctxt->user_regs.gs || !nctxt->gs_base_user )
578 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
579 if ( pctxt->user_regs.gs ) /* != 0 selector kills gs_base_user */
580 pctxt->gs_base_user = 0;
581 }
583 /* This can only be non-zero if selector is NULL. */
584 if ( nctxt->fs_base )
585 wrmsr(MSR_FS_BASE,
586 nctxt->fs_base,
587 nctxt->fs_base>>32);
589 /* Most kernels have non-zero GS base, so don't bother testing. */
590 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
591 wrmsr(MSR_SHADOW_GS_BASE,
592 nctxt->gs_base_kernel,
593 nctxt->gs_base_kernel>>32);
595 /* This can only be non-zero if selector is NULL. */
596 if ( nctxt->gs_base_user )
597 wrmsr(MSR_GS_BASE,
598 nctxt->gs_base_user,
599 nctxt->gs_base_user>>32);
601 /* If in kernel mode then switch the GS bases around. */
602 if ( n->arch.flags & TF_kernel_mode )
603 __asm__ __volatile__ ( "swapgs" );
605 if ( unlikely(!all_segs_okay) )
606 {
607 struct cpu_user_regs *regs = guest_cpu_user_regs();
608 unsigned long *rsp =
609 (n->arch.flags & TF_kernel_mode) ?
610 (unsigned long *)regs->rsp :
611 (unsigned long *)nctxt->kernel_sp;
613 if ( !(n->arch.flags & TF_kernel_mode) )
614 toggle_guest_mode(n);
615 else
616 regs->cs &= ~3;
618 if ( put_user(regs->ss, rsp- 1) |
619 put_user(regs->rsp, rsp- 2) |
620 put_user(regs->rflags, rsp- 3) |
621 put_user(regs->cs, rsp- 4) |
622 put_user(regs->rip, rsp- 5) |
623 put_user(nctxt->user_regs.gs, rsp- 6) |
624 put_user(nctxt->user_regs.fs, rsp- 7) |
625 put_user(nctxt->user_regs.es, rsp- 8) |
626 put_user(nctxt->user_regs.ds, rsp- 9) |
627 put_user(regs->r11, rsp-10) |
628 put_user(regs->rcx, rsp-11) )
629 {
630 DPRINTK("Error while creating failsafe callback frame.\n");
631 domain_crash();
632 }
634 regs->entry_vector = TRAP_syscall;
635 regs->rflags &= 0xFFFCBEFFUL;
636 regs->ss = __GUEST_SS;
637 regs->rsp = (unsigned long)(rsp-11);
638 regs->cs = __GUEST_CS;
639 regs->rip = nctxt->failsafe_callback_eip;
640 }
641 }
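/*
 * The frame built above on the guest stack, from rsp-1 down to rsp-11, is
 * { ss, rsp, rflags, cs, rip, gs, fs, es, ds, r11, rcx }.  Marking
 * entry_vector as TRAP_syscall and pointing rip at the guest's registered
 * failsafe callback hands the problem to the guest kernel, which is expected
 * to reload sane selectors from this frame and resume.
 */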
643 static void save_segments(struct vcpu *v)
644 {
645 struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
646 __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (regs->ds) );
647 __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
648 __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
649 __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );
650 }
652 static void clear_segments(void)
653 {
654 __asm__ __volatile__ (
655 " movl %0,%%ds; "
656 " movl %0,%%es; "
657 " movl %0,%%fs; "
658 " movl %0,%%gs; "
659 ""safe_swapgs" "
660 " movl %0,%%gs"
661 : : "r" (0) );
662 }
664 long do_switch_to_user(void)
665 {
666 struct cpu_user_regs *regs = guest_cpu_user_regs();
667 struct switch_to_user stu;
668 struct vcpu *v = current;
670 if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
671 unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
672 return -EFAULT;
674 toggle_guest_mode(v);
676 regs->rip = stu.rip;
677 regs->cs = stu.cs | 3; /* force guest privilege */
678 regs->rflags = stu.rflags;
679 regs->rsp = stu.rsp;
680 regs->ss = stu.ss | 3; /* force guest privilege */
682 if ( !(stu.flags & VGCF_IN_SYSCALL) )
683 {
684 regs->entry_vector = 0;
685 regs->r11 = stu.r11;
686 regs->rcx = stu.rcx;
687 }
689 /* Saved %rax gets written back to regs->rax in entry.S. */
690 return stu.rax;
691 }
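/*
 * do_switch_to_user() is the 64-bit PV guest's return path from its kernel
 * to user space: the switch_to_user frame is read from the guest kernel
 * stack, toggle_guest_mode() flips TF_kernel_mode, executes swapgs and
 * switches to the page tables recorded in guest_table_user, and the RPL bits
 * of cs/ss are forced to 3 so the guest cannot install a privileged selector
 * on the return frame.
 */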
693 #define switch_kernel_stack(_n,_c) ((void)0)
695 #elif defined(__i386__)
697 #define load_segments(_p, _n) ((void)0)
698 #define load_msrs(_p, _n) ((void)0)
699 #define save_segments(_p) ((void)0)
700 #define clear_segments() ((void)0)
702 static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
703 {
704 struct tss_struct *tss = &init_tss[cpu];
705 tss->esp1 = n->arch.guest_context.kernel_sp;
706 tss->ss1 = n->arch.guest_context.kernel_ss;
707 }
709 #endif
711 #define loaddebug(_v,_reg) \
712 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
714 static void __context_switch(void)
715 {
716 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
717 unsigned int cpu = smp_processor_id();
718 struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
719 struct vcpu *n = current;
721 if ( !is_idle_task(p->domain) )
722 {
723 memcpy(&p->arch.guest_context.user_regs,
724 stack_regs,
725 CTXT_SWITCH_STACK_BYTES);
726 unlazy_fpu(p);
727 save_segments(p);
728 }
730 if ( !is_idle_task(n->domain) )
731 {
732 memcpy(stack_regs,
733 &n->arch.guest_context.user_regs,
734 CTXT_SWITCH_STACK_BYTES);
736 /* Maybe switch the debug registers. */
737 if ( unlikely(n->arch.guest_context.debugreg[7]) )
738 {
739 loaddebug(&n->arch.guest_context, 0);
740 loaddebug(&n->arch.guest_context, 1);
741 loaddebug(&n->arch.guest_context, 2);
742 loaddebug(&n->arch.guest_context, 3);
743 /* no 4 and 5 */
744 loaddebug(&n->arch.guest_context, 6);
745 loaddebug(&n->arch.guest_context, 7);
746 }
748 if ( !VMX_DOMAIN(n) )
749 {
750 set_int80_direct_trap(n);
751 switch_kernel_stack(n, cpu);
752 }
753 }
755 if ( p->domain != n->domain )
756 cpu_set(cpu, n->domain->cpumask);
758 write_ptbase(n);
760 if ( p->vcpu_id != n->vcpu_id )
761 {
762 char gdt_load[10];
763 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
764 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
765 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
766 }
768 if ( p->domain != n->domain )
769 cpu_clear(cpu, p->domain->cpumask);
771 percpu_ctxt[cpu].curr_vcpu = n;
772 }
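/*
 * __context_switch() runs with interrupts disabled and only handles state
 * that must change atomically with the stack contents: GPRs, FPU, debug
 * registers, the int80 trap/kernel stack, page tables and the per-vcpu GDT.
 * The fault-prone parts -- data segment selectors, MSR bases, the LDT and
 * VMX MSR state -- are deferred to context_switch() below, which reloads
 * them after interrupts have been re-enabled.
 */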
775 void context_switch(struct vcpu *prev, struct vcpu *next)
776 {
777 struct vcpu *realprev;
779 local_irq_disable();
781 set_current(next);
783 if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_vcpu) == next) ||
784 is_idle_task(next->domain) )
785 {
786 local_irq_enable();
787 }
788 else
789 {
790 __context_switch();
792 local_irq_enable();
794 if ( !VMX_DOMAIN(next) )
795 {
796 load_LDT(next);
797 load_segments(realprev, next);
798 load_msrs(realprev, next);
799 }
800 }
802 /*
803 * We do this late on because it doesn't need to be protected by the
804 * schedule_lock, and because we want this to be the very last use of
805 * 'prev' (after this point, a dying domain's info structure may be freed
806 * without warning).
807 */
808 clear_bit(_VCPUF_running, &prev->vcpu_flags);
810 schedule_tail(next);
811 BUG();
812 }
814 void continue_running(struct vcpu *same)
815 {
816 schedule_tail(same);
817 BUG();
818 }
820 int __sync_lazy_execstate(void)
821 {
822 if ( percpu_ctxt[smp_processor_id()].curr_vcpu == current )
823 return 0;
824 __context_switch();
825 load_LDT(current);
826 clear_segments();
827 return 1;
828 }
830 void sync_lazy_execstate_cpu(unsigned int cpu)
831 {
832 if ( cpu == smp_processor_id() )
833 (void)__sync_lazy_execstate();
834 else
835 flush_tlb_mask(cpumask_of_cpu(cpu));
836 }
838 void sync_lazy_execstate_mask(cpumask_t mask)
839 {
840 if ( cpu_isset(smp_processor_id(), mask) )
841 (void)__sync_lazy_execstate();
842 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
843 flush_tlb_mask(mask);
844 }
846 void sync_lazy_execstate_all(void)
847 {
848 __sync_lazy_execstate();
849 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
850 flush_tlb_mask(cpu_online_map);
851 }
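/*
 * The sync_lazy_execstate_*() helpers force any CPU that is still lazily
 * holding a descheduled vcpu's state to complete the deferred
 * __context_switch().  Remote CPUs are reached with a TLB-flush IPI, whose
 * handler performs the same __sync_lazy_execstate() call, so once
 * flush_tlb_mask() returns no CPU references the old vcpu's page tables or
 * segment state.
 */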
853 unsigned long __hypercall_create_continuation(
854 unsigned int op, unsigned int nr_args, ...)
855 {
856 struct mc_state *mcs = &mc_state[smp_processor_id()];
857 struct cpu_user_regs *regs;
858 unsigned int i;
859 va_list args;
861 va_start(args, nr_args);
863 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
864 {
865 __set_bit(_MCSF_call_preempted, &mcs->flags);
867 for ( i = 0; i < nr_args; i++ )
868 mcs->call.args[i] = va_arg(args, unsigned long);
869 }
870 else
871 {
872 regs = guest_cpu_user_regs();
873 #if defined(__i386__)
874 regs->eax = op;
875 regs->eip -= 2; /* re-execute 'int 0x82' */
877 for ( i = 0; i < nr_args; i++ )
878 {
879 switch ( i )
880 {
881 case 0: regs->ebx = va_arg(args, unsigned long); break;
882 case 1: regs->ecx = va_arg(args, unsigned long); break;
883 case 2: regs->edx = va_arg(args, unsigned long); break;
884 case 3: regs->esi = va_arg(args, unsigned long); break;
885 case 4: regs->edi = va_arg(args, unsigned long); break;
886 case 5: regs->ebp = va_arg(args, unsigned long); break;
887 }
888 }
889 #elif defined(__x86_64__)
890 regs->rax = op;
891 regs->rip -= 2; /* re-execute 'syscall' */
893 for ( i = 0; i < nr_args; i++ )
894 {
895 switch ( i )
896 {
897 case 0: regs->rdi = va_arg(args, unsigned long); break;
898 case 1: regs->rsi = va_arg(args, unsigned long); break;
899 case 2: regs->rdx = va_arg(args, unsigned long); break;
900 case 3: regs->r10 = va_arg(args, unsigned long); break;
901 case 4: regs->r8 = va_arg(args, unsigned long); break;
902 case 5: regs->r9 = va_arg(args, unsigned long); break;
903 }
904 }
905 #endif
906 }
908 va_end(args);
910 return op;
911 }
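/*
 * A minimal sketch of how a long-running hypercall handler can use the
 * continuation mechanism above.  do_example_op(), process_one_item() and
 * __HYPERVISOR_example_op are hypothetical stand-ins; only
 * __hypercall_create_continuation() comes from this file, and
 * hypercall_preempt_check() is assumed to be the usual pending
 * softirq/event test provided by the Xen headers.
 */
static void process_one_item(unsigned long item);  /* hypothetical work */

long do_example_op(unsigned long start, unsigned long count)
{
    while ( count != 0 )
    {
        process_one_item(start);
        start++;
        count--;

        /*
         * If other work is pending, rewind the guest so it re-issues the
         * hypercall with the updated (start, count), resuming where it left
         * off instead of monopolising the CPU.
         */
        if ( (count != 0) && hypercall_preempt_check() )
            return __hypercall_create_continuation(
                __HYPERVISOR_example_op, 2, start, count);
    }

    return 0;
}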
913 #ifdef CONFIG_VMX
914 static void vmx_relinquish_resources(struct vcpu *v)
915 {
916 if ( !VMX_DOMAIN(v) )
917 return;
919 BUG_ON(v->arch.arch_vmx.vmcs == NULL);
920 free_vmcs(v->arch.arch_vmx.vmcs);
921 v->arch.arch_vmx.vmcs = 0;
923 free_monitor_pagetable(v);
924 rem_ac_timer(&v->domain->arch.vmx_platform.vmx_pit.pit_timer);
925 }
926 #else
927 #define vmx_relinquish_resources(_v) ((void)0)
928 #endif
930 static void relinquish_memory(struct domain *d, struct list_head *list)
931 {
932 struct list_head *ent;
933 struct pfn_info *page;
934 unsigned long x, y;
936 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
937 spin_lock_recursive(&d->page_alloc_lock);
939 ent = list->next;
940 while ( ent != list )
941 {
942 page = list_entry(ent, struct pfn_info, list);
944 /* Grab a reference to the page so it won't disappear from under us. */
945 if ( unlikely(!get_page(page, d)) )
946 {
947 /* Couldn't get a reference -- someone is freeing this page. */
948 ent = ent->next;
949 continue;
950 }
952 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
953 put_page_and_type(page);
955 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
956 put_page(page);
958 /*
959 * Forcibly invalidate base page tables at this point to break circular
960 * 'linear page table' references. This is okay because MMU structures
961 * are not shared across domains and this domain is now dead. Thus base
962 * tables are not in use so a non-zero count means circular reference.
963 */
964 y = page->u.inuse.type_info;
965 for ( ; ; )
966 {
967 x = y;
968 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
969 (PGT_base_page_table|PGT_validated)) )
970 break;
972 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
973 if ( likely(y == x) )
974 {
975 free_page_type(page, PGT_base_page_table);
976 break;
977 }
978 }
980 /* Follow the list chain and /then/ potentially free the page. */
981 ent = ent->next;
982 put_page(page);
983 }
985 spin_unlock_recursive(&d->page_alloc_lock);
986 }
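/*
 * The cmpxchg loop above clears PGT_validated on any page still typed as a
 * base page table, then calls free_page_type() so the references that the
 * page's entries hold on other pages are dropped.  That is what breaks the
 * circular "linear page table" references described in the comment, allowing
 * every page's reference count to reach zero and the pages to be freed.
 */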
988 void domain_relinquish_resources(struct domain *d)
989 {
990 struct vcpu *v;
991 unsigned long pfn;
993 BUG_ON(!cpus_empty(d->cpumask));
995 physdev_destroy_state(d);
997 ptwr_destroy(d);
999 /* Release device mappings of other domains */
1000 gnttab_release_dev_mappings(d->grant_table);
1002 /* Drop the in-use references to page-table bases. */
1003 for_each_vcpu ( d, v )
1004 {
1005 if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
1006 {
1007 if ( !shadow_mode_refcounts(d) )
1008 put_page_type(pfn_to_page(pfn));
1009 put_page(pfn_to_page(pfn));
1011 v->arch.guest_table = mk_pagetable(0);
1012 }
1014 if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
1015 {
1016 if ( !shadow_mode_refcounts(d) )
1017 put_page_type(pfn_to_page(pfn));
1018 put_page(pfn_to_page(pfn));
1020 v->arch.guest_table_user = mk_pagetable(0);
1021 }
1023 vmx_relinquish_resources(v);
1024 }
1026 shadow_mode_disable(d);
1028 /*
1029 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1030 * it automatically gets squashed when the guest's mappings go away.
1031 */
1032 for_each_vcpu(d, v)
1033 destroy_gdt(v);
1035 /* Relinquish every page of memory. */
1036 relinquish_memory(d, &d->xenpage_list);
1037 relinquish_memory(d, &d->page_list);
1038 }
1041 /*
1042 * Local variables:
1043 * mode: C
1044 * c-set-style: "BSD"
1045 * c-basic-offset: 4
1046 * tab-width: 4
1047 * indent-tabs-mode: nil
1048 * End:
1049 */