ia64/xen-unstable: xen/arch/x86/domain.c @ 6215:c7689e1e0768

Add a couple of well chosen assertions to new context switch code.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kaf24@firebug.cl.cam.ac.uk
date     Tue Aug 16 18:12:19 2005 +0000 (2005-08-16)
parents  027812e4a63c
children 3b0ce44f7b7a

/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/vmx_vmcs.h>
#include <asm/msr.h>
#include <asm/physdev.h>
#include <xen/kernel.h>
#include <public/io/ioreq.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);

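/*
 * Per-CPU context-switch state. curr_vcpu tracks whose register state is
 * actually loaded on this CPU (which may lag behind 'current', since switches
 * to the idle vcpu are done lazily); the other fields record whether a switch
 * still needs finalising and which segment registers were left dirty.
 */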
struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int context_not_finalised;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_task(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_task(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_task(v->domain));
    percpu_ctxt[smp_processor_id()].curr_vcpu = v;
    cpu_set(smp_processor_id(), v->domain->cpumask);
    v->arch.schedule_tail = continue_idle_task;

    idle_loop();
}

static long no_idt[2];
static int reboot_mode;

static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

#ifdef CONFIG_VMX
    stop_vmx();
#endif

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe,0x64);         /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}

void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    if ( d->tot_pages < 10 )
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("Page %p: caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), page->count_info,
                   page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), page->count_info,
               page->u.inuse.type_info);
    }

    page = virt_to_page(d->shared_info);
    printk("Shared_info@%p: caf=%08x, taf=%" PRtype_info "\n",
           _p(page_to_phys(page)), page->count_info,
           page->u.inuse.type_info);
}

struct vcpu *arch_alloc_vcpu_struct(void)
{
    return xmalloc(struct vcpu);
}

void arch_free_vcpu_struct(struct vcpu *v)
{
    xfree(v);
}

void free_perdomain_pt(struct domain *d)
{
    free_xenheap_page(d->arch.mm_perdomain_pt);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
}

void arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;

    v->arch.flags = TF_kernel_mode;

    if ( is_idle_task(d) )
        return;

    v->arch.schedule_tail = continue_nonidle_task;

    d->shared_info = alloc_xenheap_page();
    memset(d->shared_info, 0, PAGE_SIZE);
    v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
    v->cpumap = CPUMAP_RUNANYWHERE;
    SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
                            PAGE_SHIFT] = INVALID_M2P_ENTRY;

    d->arch.mm_perdomain_pt = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
    machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >>
                            PAGE_SHIFT] = INVALID_M2P_ENTRY;
    v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
    v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
        l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);

    v->arch.guest_vtable  = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;

#ifdef __x86_64__
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
        l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt),
                      __PAGE_HYPERVISOR);
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);
#endif

    (void)ptwr_init(d);

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
}

void arch_do_boot_vcpu(struct vcpu *v)
{
    struct domain *d = v->domain;

    v->arch.flags = TF_kernel_mode;

    v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (v->vcpu_id << PDPT_VCPU_SHIFT);
    v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
        l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
}

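/*
 * Move a VCPU to a new physical CPU. A VMX vcpu's VMCS must be cleared on
 * the old CPU (__vmpclear) so that its next scheduling relaunches it on the
 * new CPU via arch_vmx_do_relaunch.
 */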
void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
{
    if ( v->processor == newcpu )
        return;

    set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    v->processor = newcpu;

    if ( VMX_DOMAIN(v) )
    {
        __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
        v->arch.schedule_tail = arch_vmx_do_relaunch;
    }
}

#ifdef CONFIG_VMX
static int vmx_switch_on;

static int vmx_final_setup_guest(
    struct vcpu *v, struct vcpu_guest_context *ctxt)
{
    int error;
    struct cpu_user_regs *regs;
    struct vmcs_struct *vmcs;

    regs = &ctxt->user_regs;

    /*
     * Create a new VMCS
     */
    if (!(vmcs = alloc_vmcs())) {
        printk("Failed to create a new VMCS\n");
        return -ENOMEM;
    }

    memset(&v->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));

    v->arch.arch_vmx.vmcs = vmcs;
    error = construct_vmcs(
        &v->arch.arch_vmx, regs, ctxt, VMCS_USE_HOST_ENV);
    if ( error < 0 )
    {
        printk("Failed to construct a new VMCS\n");
        goto out;
    }

    v->arch.schedule_tail = arch_vmx_do_launch;

#if defined (__i386__)
    v->domain->arch.vmx_platform.real_mode_data =
        (unsigned long *) regs->esi;
#endif

    if (v == v->domain->vcpu[0]) {
        /*
         * Required to do this once per domain
         * XXX todo: add a separate function to do these.
         */
        memset(&v->domain->shared_info->evtchn_mask[0], 0xff,
               sizeof(v->domain->shared_info->evtchn_mask));

        /* Put the domain in shadow mode even though we're going to be using
         * the shared 1:1 page table initially. It shouldn't hurt */
        shadow_mode_enable(v->domain,
                           SHM_enable|SHM_refcounts|
                           SHM_translate|SHM_external);
    }

    if (!vmx_switch_on)
        vmx_switch_on = 1;

    return 0;

out:
    free_vmcs(vmcs);
    if(v->arch.arch_vmx.io_bitmap_a != 0) {
        free_xenheap_pages(v->arch.arch_vmx.io_bitmap_a, get_order(0x1000));
        v->arch.arch_vmx.io_bitmap_a = 0;
    }
    if(v->arch.arch_vmx.io_bitmap_b != 0) {
        free_xenheap_pages(v->arch.arch_vmx.io_bitmap_b, get_order(0x1000));
        v->arch.arch_vmx.io_bitmap_b = 0;
    }
    v->arch.arch_vmx.vmcs = 0;
    return error;
}
#endif

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
            return -EINVAL;
    }
    else
    {
#ifdef __x86_64__
        if ( !(c->flags & VGCF_VMX_GUEST) )
#endif
            if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
                                    PGT_base_page_table) )
                return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;

        if ( (rc = vmx_final_setup_guest(v, c)) != 0 )
            return rc;
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}

void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}

#ifdef __x86_64__

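/*
 * 64-bit guests run kernel and user contexts with separate page tables and
 * GS bases; toggle_guest_mode flips between the two, hence the swapgs and
 * the write_ptbase once the page-table pointers have been updated.
 */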
void toggle_guest_mode(struct vcpu *v)
{
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_pagetables(v);
    write_ptbase(v);
}

#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

#if CONFIG_VMX
#define load_msrs(n) if (vmx_switch_on) vmx_load_msrs(n)
#else
#define load_msrs(n) ((void)0)
#endif

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash();
        }

        regs->entry_vector  = TRAP_syscall;
        regs->rflags       &= 0xFFFCBEFFUL;
        regs->ss            = __GUEST_SS;
        regs->rsp           = (unsigned long)(rsp-11);
        regs->cs            = __GUEST_CS;
        regs->rip           = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs      *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}

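/*
 * Hypercall used by 64-bit guest kernels to return to user context: pops a
 * struct switch_to_user frame from the guest kernel stack, switches to the
 * user page tables and GS base via toggle_guest_mode(), and reloads the
 * return frame (RIP/CS/RFLAGS/RSP/SS) with guest privilege forced on the
 * selectors.
 */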
long do_switch_to_user(void)
{
    struct cpu_user_regs  *regs = guest_cpu_user_regs();
    struct switch_to_user  stu;
    struct vcpu           *v = current;

    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
         unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
        return -EFAULT;

    toggle_guest_mode(v);

    regs->rip    = stu.rip;
    regs->cs     = stu.cs | 3; /* force guest privilege */
    regs->rflags = stu.rflags;
    regs->rsp    = stu.rsp;
    regs->ss     = stu.ss | 3; /* force guest privilege */

    if ( !(stu.flags & VGCF_IN_SYSCALL) )
    {
        regs->entry_vector = 0;
        regs->r11 = stu.r11;
        regs->rcx = stu.rcx;
    }

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return stu.rax;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define load_msrs(n)     ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1  = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

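/*
 * Perform the real register-state switch: save the outgoing vcpu's stack
 * frame, FPU and segment state, restore the incoming vcpu's frame, debug
 * registers (if in use), kernel stack and GDT as needed, and switch the
 * page-table base. The outgoing vcpu is percpu_ctxt[cpu].curr_vcpu, which
 * may differ from 'current' because switches to the idle vcpu are deferred.
 */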
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int          cpu = smp_processor_id();
    struct vcpu          *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu          *n = current;

    if ( !is_idle_task(p->domain) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_task(n->domain) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(!local_irq_is_enabled());

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
    {
        __context_switch();
        percpu_ctxt[cpu].context_not_finalised = 1;
    }
}

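/*
 * Second half of a context switch, run with interrupts re-enabled: reload
 * the LDT, user segment registers and MSRs (or the VMX MSR state) that were
 * deferred from context_switch(), then jump to the new vcpu's schedule_tail.
 * Never returns.
 */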
void context_switch_finalise(struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(local_irq_is_enabled());

    if ( percpu_ctxt[cpu].context_not_finalised )
    {
        percpu_ctxt[cpu].context_not_finalised = 0;

        BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            load_msrs(next);
        }
    }

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

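/*
 * Flush any lazily-deferred context switch on this CPU: if the state still
 * loaded belongs to a vcpu other than 'current', complete the switch now.
 * Returns non-zero if a switch was actually required.
 */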
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_lazy_execstate_cpu(unsigned int cpu)
{
    if ( cpu == smp_processor_id() )
        (void)__sync_lazy_execstate();
    else
        flush_tlb_mask(cpumask_of_cpu(cpu));
}

void sync_lazy_execstate_mask(cpumask_t mask)
{
    if ( cpu_isset(smp_processor_id(), mask) )
        (void)__sync_lazy_execstate();
    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(mask);
}

void sync_lazy_execstate_all(void)
{
    __sync_lazy_execstate();
    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(cpu_online_map);
}

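/*
 * Arrange for a preempted hypercall to be restarted. Inside a multicall the
 * pending call and its arguments are stashed in the multicall state;
 * otherwise the guest's instruction pointer is wound back over the hypercall
 * instruction ('int 0x82' on i386, 'syscall' on x86-64) and the argument
 * registers are refilled so the call re-executes when the guest next runs.
 */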
unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax  = op;
        regs->eip -= 2;  /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax  = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

#ifdef CONFIG_VMX
static void vmx_relinquish_resources(struct vcpu *v)
{
    if ( !VMX_DOMAIN(v) )
        return;

    BUG_ON(v->arch.arch_vmx.vmcs == NULL);
    free_vmcs(v->arch.arch_vmx.vmcs);
    if(v->arch.arch_vmx.io_bitmap_a != 0) {
        free_xenheap_pages(v->arch.arch_vmx.io_bitmap_a, get_order(0x1000));
        v->arch.arch_vmx.io_bitmap_a = 0;
    }
    if(v->arch.arch_vmx.io_bitmap_b != 0) {
        free_xenheap_pages(v->arch.arch_vmx.io_bitmap_b, get_order(0x1000));
        v->arch.arch_vmx.io_bitmap_b = 0;
    }
    v->arch.arch_vmx.vmcs = 0;

    free_monitor_pagetable(v);
    rem_ac_timer(&v->domain->arch.vmx_platform.vmx_pit.pit_timer);
}
#else
#define vmx_relinquish_resources(_v) ((void)0)
#endif

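/*
 * Walk one of the domain's page lists, dropping the pin/type and allocation
 * references held on each page and forcibly invalidating base page tables to
 * break circular 'linear page table' references, so the pages can be freed.
 */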
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info  *page;
    unsigned long     x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->cpumask));

    physdev_destroy_state(d);

    ptwr_destroy(d);

    /* Release device mappings of other domains */
    gnttab_release_dev_mappings(d->grant_table);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */