/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/vmx_vmcs.h>
#include <asm/msr.h>
#include <asm/physdev.h>
#include <xen/kernel.h>
#include <public/io/ioreq.h>
#include <xen/multicall.h>
/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);
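/*
 * Per-CPU bookkeeping for lazy context switching:
 *  - curr_vcpu: the VCPU whose register/segment state is actually loaded on
 *    this CPU (may lag behind 'current' while we switch lazily).
 *  - context_not_finalised: set by context_switch() and cleared once
 *    context_switch_finalise() has reloaded segments/MSRs with IRQs enabled.
 *  - dirty_segment_mask: written by save_segments() so that load_segments()
 *    can skip selector loads and MSR writes that are known to be clean.
 */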
struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int context_not_finalised;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];
static void continue_idle_task(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_task(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_task(v->domain));
    percpu_ctxt[smp_processor_id()].curr_vcpu = v;
    cpu_set(smp_processor_id(), v->domain->cpumask);
    v->arch.schedule_tail = continue_idle_task;

    idle_loop();
}
static long no_idt[2];
static int reboot_mode;

static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}
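/*
 * machine_restart() escalates through increasingly blunt reset methods:
 * hand off to the boot CPU if necessary, quiesce the other CPUs and the
 * (IO-)APICs, then pulse the keyboard controller's reset line (writing 0xfe
 * to port 0x64), and finally force a triple fault by loading an empty IDT
 * and raising int3.
 */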
void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

#ifdef CONFIG_VMX
    stop_vmx();
#endif

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe, 0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}
void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}
void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    if ( d->tot_pages < 10 )
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("Page %p: caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), page->count_info,
                   page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), page->count_info,
               page->u.inuse.type_info);
    }

    page = virt_to_page(d->shared_info);
    printk("Shared_info@%p: caf=%08x, taf=%" PRtype_info "\n",
           _p(page_to_phys(page)), page->count_info,
           page->u.inuse.type_info);
}
struct vcpu *arch_alloc_vcpu_struct(void)
{
    return xmalloc(struct vcpu);
}

/*
 * We assume that vcpu 0 is always the last one to be freed in a domain,
 * i.e. if v->vcpu_id == 0, the domain should be single-processor.
 */
void arch_free_vcpu_struct(struct vcpu *v)
{
    struct vcpu *p;

    for_each_vcpu ( v->domain, p )
    {
        if ( p->next_in_list == v )
            p->next_in_list = v->next_in_list;
    }
    xfree(v);
}
void free_perdomain_pt(struct domain *d)
{
    free_xenheap_page(d->arch.mm_perdomain_pt);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
}
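/*
 * Architecture-specific domain creation: allocate and share the shared_info
 * page, set up the per-domain page-table mappings (including the reserved
 * GDT slot), wire up the linear page-table views, and initialise page-table
 * write-emulation and shadow-mode state.
 */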
void arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;

    v->arch.flags = TF_kernel_mode;

    if ( is_idle_task(d) )
        return;

    v->arch.schedule_tail = continue_nonidle_task;

    d->shared_info = alloc_xenheap_page();
    memset(d->shared_info, 0, PAGE_SIZE);
    v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
    v->cpumap = CPUMAP_RUNANYWHERE;
    SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    set_pfn_from_mfn(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
                     INVALID_M2P_ENTRY);

    d->arch.mm_perdomain_pt = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
    set_pfn_from_mfn(virt_to_phys(d->arch.mm_perdomain_pt) >> PAGE_SHIFT,
                     INVALID_M2P_ENTRY);
    v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
    v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
        l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);

    v->arch.guest_vtable  = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;

#ifdef __x86_64__
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
        l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt),
                      __PAGE_HYPERVISOR);
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);
#endif

    (void)ptwr_init(d);

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
}
void arch_do_boot_vcpu(struct vcpu *v)
{
    struct domain *d = v->domain;

    v->arch.flags = TF_kernel_mode;

    v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (v->vcpu_id << PDPT_VCPU_SHIFT);
    v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
        l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
}
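/*
 * Moving a VMX VCPU to another CPU requires its VMCS to be cleared on the
 * CPU it last ran on (__vmpclear) before it can be loaded elsewhere; the
 * schedule tail is switched to arch_vmx_do_relaunch so the VMCS is
 * re-loaded on the destination CPU.
 */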
void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
{
    if ( v->processor == newcpu )
        return;

    set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    v->processor = newcpu;

    if ( VMX_DOMAIN(v) )
    {
        __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
        v->arch.schedule_tail = arch_vmx_do_relaunch;
    }
}
#ifdef CONFIG_VMX
static int vmx_switch_on;

static int vmx_final_setup_guest(
    struct vcpu *v, struct vcpu_guest_context *ctxt)
{
    int error;
    struct cpu_user_regs *regs;
    struct vmcs_struct *vmcs;

    regs = &ctxt->user_regs;

    /*
     * Create a new VMCS.
     */
    if ( !(vmcs = alloc_vmcs()) )
    {
        printk("Failed to create a new VMCS\n");
        return -ENOMEM;
    }

    memset(&v->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));

    v->arch.arch_vmx.vmcs = vmcs;
    error = construct_vmcs(
        &v->arch.arch_vmx, regs, ctxt, VMCS_USE_HOST_ENV);
    if ( error < 0 )
    {
        printk("Failed to construct a new VMCS\n");
        goto out;
    }

    v->arch.schedule_tail = arch_vmx_do_launch;

#if defined (__i386__)
    v->domain->arch.vmx_platform.real_mode_data =
        (unsigned long *) regs->esi;
#endif

    if ( v == v->domain->vcpu[0] )
    {
        /*
         * Required to do this once per domain.
         * XXX todo: add a separate function to do these.
         */
        memset(&v->domain->shared_info->evtchn_mask[0], 0xff,
               sizeof(v->domain->shared_info->evtchn_mask));

        /* Put the domain in shadow mode even though we're going to be using
         * the shared 1:1 page table initially. It shouldn't hurt. */
        shadow_mode_enable(v->domain,
                           SHM_enable|SHM_refcounts|
                           SHM_translate|SHM_external);
    }

    if ( !vmx_switch_on )
        vmx_switch_on = 1;

    return 0;

 out:
    free_vmcs(vmcs);
    if ( v->arch.arch_vmx.io_bitmap_a != 0 )
    {
        free_xenheap_pages(
            v->arch.arch_vmx.io_bitmap_a, get_order_from_bytes(0x1000));
        v->arch.arch_vmx.io_bitmap_a = 0;
    }
    if ( v->arch.arch_vmx.io_bitmap_b != 0 )
    {
        free_xenheap_pages(
            v->arch.arch_vmx.io_bitmap_b, get_order_from_bytes(0x1000));
        v->arch.arch_vmx.io_bitmap_b = 0;
    }
    v->arch.arch_vmx.vmcs = 0;
    return error;
}
#endif
/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
            return -EINVAL;
    }
    else
    {
#ifdef __x86_64__
        if ( !(c->flags & VGCF_VMX_GUEST) )
#endif
            if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
                                    PGT_base_page_table) )
                return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;

        if ( (rc = vmx_final_setup_guest(v, c)) != 0 )
            return rc;
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup. */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}
void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}
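/*
 * On x86-64 guest kernel and guest user mode both execute in ring 3, on
 * separate page tables. toggle_guest_mode() flips TF_kernel_mode, swaps the
 * kernel/user GS bases with 'swapgs', and switches to the page-table base
 * that corresponds to the new mode.
 */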
#ifdef __x86_64__

void toggle_guest_mode(struct vcpu *v)
{
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_pagetables(v);
    write_ptbase(v);
}
#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })
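/*
 * loadsegment() evaluates to 1 on success. If loading the selector faults
 * (e.g. a stale guest selector), the exception-table fixup at label 3 zeroes
 * both the destination segment register and the result, so callers such as
 * load_segments() can detect the failure and bounce the guest to its
 * failsafe callback.
 */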
#if CONFIG_VMX
#define load_msrs(n)     if (vmx_switch_on) vmx_load_msrs(n)
#else
#define load_msrs(n)     ((void)0)
#endif
/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20
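/*
 * If any guest selector fails to load, load_segments() below constructs an
 * 11-word failsafe callback frame on the guest kernel stack (ss, rsp,
 * rflags, cs, rip, gs, fs, es, ds, r11, rcx) and redirects execution to the
 * guest's registered failsafe_callback_eip.
 */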
static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash();
        }

        regs->entry_vector  = TRAP_syscall;
        regs->rflags       &= 0xFFFCBEFFUL;
        regs->ss            = __GUEST_SS;
        regs->rsp           = (unsigned long)(rsp-11);
        regs->cs            = __GUEST_CS;
        regs->rip           = nctxt->failsafe_callback_eip;
    }
}
static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs      *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}
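/*
 * do_switch_to_user() implements the guest kernel's return to user mode: it
 * reads a 'struct switch_to_user' frame from the guest kernel stack,
 * switches to the user page tables via toggle_guest_mode(), and rewrites
 * the saved register frame so the return to the guest resumes at the saved
 * user context, with guest privilege (RPL 3) forced into CS and SS.
 */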
long do_switch_to_user(void)
{
    struct cpu_user_regs  *regs = guest_cpu_user_regs();
    struct switch_to_user  stu;
    struct vcpu           *v = current;

    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
         unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
        return -EFAULT;

    toggle_guest_mode(v);

    regs->rip    = stu.rip;
    regs->cs     = stu.cs | 3; /* force guest privilege */
    regs->rflags = stu.rflags;
    regs->rsp    = stu.rsp;
    regs->ss     = stu.ss | 3; /* force guest privilege */

    if ( !(stu.flags & VGCF_IN_SYSCALL) )
    {
        regs->entry_vector = 0;
        regs->r11 = stu.r11;
        regs->rcx = stu.rcx;
    }

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return stu.rax;
}
#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define load_msrs(n)     ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1  = n->arch.guest_context.kernel_ss;
}

#endif
#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
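/*
 * __context_switch() performs the real state switch between the VCPU whose
 * state currently occupies this CPU (percpu_ctxt[cpu].curr_vcpu) and
 * 'current'. It is deliberately decoupled from context_switch() so that a
 * switch to the idle VCPU can leave the previous guest's state loaded
 * lazily, to be flushed later by __sync_lazy_execstate() if needed.
 */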
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int          cpu = smp_processor_id();
    struct vcpu          *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu          *n = current;

    if ( !is_idle_task(p->domain) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_task(n->domain) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}
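/*
 * The switch is split into two phases: context_switch() runs with interrupts
 * disabled and swaps the low-level state, while context_switch_finalise()
 * runs later with interrupts enabled to reload the guest's LDT, segments and
 * MSRs (or VMX MSR state), which may fault or take noticeable time.
 */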
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(!local_irq_is_enabled());

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
    {
        __context_switch();
        percpu_ctxt[cpu].context_not_finalised = 1;
    }
}
void context_switch_finalise(struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(local_irq_is_enabled());

    if ( percpu_ctxt[cpu].context_not_finalised )
    {
        percpu_ctxt[cpu].context_not_finalised = 0;

        BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            load_msrs(next);
        }
    }

    schedule_tail(next);
    BUG();
}
void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}
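/*
 * Flush any lazily-retained guest state off this CPU. Returns non-zero if a
 * real __context_switch() was required. Remote CPUs are brought into sync
 * indirectly: the TLB-flush IPI handler calls __sync_lazy_execstate() (see
 * sync_vcpu_execstate() below).
 */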
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}
void sync_vcpu_execstate(struct vcpu *v)
{
    unsigned int cpu = v->processor;

    if ( !cpu_isset(cpu, v->domain->cpumask) )
        return;

    if ( cpu == smp_processor_id() )
    {
        (void)__sync_lazy_execstate();
    }
    else
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        flush_tlb_mask(cpumask_of_cpu(cpu));
    }
}
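/*
 * A preempted hypercall is continued by arranging for it to be re-issued
 * with updated arguments: inside a multicall the arguments are rewritten in
 * the per-CPU multicall state and the call is flagged as preempted;
 * otherwise the guest's saved registers are reloaded with the new arguments
 * and the instruction pointer is wound back two bytes, so the guest
 * re-executes its hypercall instruction ('int 0x82' on i386, 'syscall' on
 * x86_64) when it next runs.
 */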
unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax  = op;
        regs->eip -= 2;  /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax  = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}
#ifdef CONFIG_VMX
static void vmx_relinquish_resources(struct vcpu *v)
{
    if ( !VMX_DOMAIN(v) )
        return;

    BUG_ON(v->arch.arch_vmx.vmcs == NULL);
    free_vmcs(v->arch.arch_vmx.vmcs);
    if ( v->arch.arch_vmx.io_bitmap_a != 0 )
    {
        free_xenheap_pages(
            v->arch.arch_vmx.io_bitmap_a, get_order_from_bytes(0x1000));
        v->arch.arch_vmx.io_bitmap_a = 0;
    }
    if ( v->arch.arch_vmx.io_bitmap_b != 0 )
    {
        free_xenheap_pages(
            v->arch.arch_vmx.io_bitmap_b, get_order_from_bytes(0x1000));
        v->arch.arch_vmx.io_bitmap_b = 0;
    }
    v->arch.arch_vmx.vmcs = 0;

    free_monitor_pagetable(v);
    rem_ac_timer(&v->domain->arch.vmx_platform.vmx_pit.pit_timer);
}
#else
#define vmx_relinquish_resources(_v) ((void)0)
#endif
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info  *page;
    unsigned long     x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}
void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->cpumask));

    physdev_destroy_state(d);

    ptwr_destroy(d);

    /* Release device mappings of other domains. */
    gnttab_release_dev_mappings(d->grant_table);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */