ia64/xen-unstable

view xen/arch/x86/domain.c @ 6766:219d96d545fc

merge?
author cl349@firebug.cl.cam.ac.uk
date Mon Sep 12 20:00:41 2005 +0000 (2005-09-12)
parents cdfa7dd00c44 888094e5ac07
children b5d91089e42c
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <asm/regs.h>
24 #include <asm/mc146818rtc.h>
25 #include <asm/system.h>
26 #include <asm/io.h>
27 #include <asm/processor.h>
28 #include <asm/desc.h>
29 #include <asm/i387.h>
30 #include <asm/mpspec.h>
31 #include <asm/ldt.h>
32 #include <xen/irq.h>
33 #include <xen/event.h>
34 #include <asm/shadow.h>
35 #include <xen/console.h>
36 #include <xen/elf.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/msr.h>
40 #include <asm/physdev.h>
41 #include <xen/kernel.h>
42 #include <public/io/ioreq.h>
43 #include <xen/multicall.h>
45 /* opt_noreboot: If true, machine will need manual reset on error. */
46 static int opt_noreboot = 0;
47 boolean_param("noreboot", opt_noreboot);
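/*
 * Per-CPU context-switch bookkeeping:
 *  - curr_vcpu: the VCPU whose register/segment state is actually loaded on
 *    this physical CPU; it may lag behind 'current' because switches to the
 *    idle VCPU are done lazily (see __sync_lazy_execstate()).
 *  - context_not_finalised: set by context_switch() and cleared by
 *    context_switch_finalise(), which reloads segments, LDT and MSRs once
 *    interrupts have been re-enabled.
 *  - dirty_segment_mask: written by save_segments() and consumed by
 *    load_segments() to skip redundant selector reloads and MSR writes.
 */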
49 struct percpu_ctxt {
50 struct vcpu *curr_vcpu;
51 unsigned int context_not_finalised;
52 unsigned int dirty_segment_mask;
53 } __cacheline_aligned;
54 static struct percpu_ctxt percpu_ctxt[NR_CPUS];
56 static void continue_idle_task(struct vcpu *v)
57 {
58 reset_stack_and_jump(idle_loop);
59 }
61 static void continue_nonidle_task(struct vcpu *v)
62 {
63 reset_stack_and_jump(ret_from_intr);
64 }
66 static void default_idle(void)
67 {
68 local_irq_disable();
69 if ( !softirq_pending(smp_processor_id()) )
70 safe_halt();
71 else
72 local_irq_enable();
73 }
75 void idle_loop(void)
76 {
77 int cpu = smp_processor_id();
79 for ( ; ; )
80 {
81 irq_stat[cpu].idle_timestamp = jiffies;
83 while ( !softirq_pending(cpu) )
84 {
85 page_scrub_schedule_work();
86 default_idle();
87 }
89 do_softirq();
90 }
91 }
93 void startup_cpu_idle_loop(void)
94 {
95 struct vcpu *v = current;
97 ASSERT(is_idle_task(v->domain));
98 percpu_ctxt[smp_processor_id()].curr_vcpu = v;
99 cpu_set(smp_processor_id(), v->domain->cpumask);
100 v->arch.schedule_tail = continue_idle_task;
102 idle_loop();
103 }
105 static long no_idt[2];
106 static int reboot_mode;
108 static inline void kb_wait(void)
109 {
110 int i;
112 for ( i = 0; i < 0x10000; i++ )
113 if ( (inb_p(0x64) & 0x02) == 0 )
114 break;
115 }
117 void machine_restart(char * __unused)
118 {
119 int i;
121 if ( opt_noreboot )
122 {
123 printk("Reboot disabled on cmdline: require manual reset\n");
124 for ( ; ; )
125 safe_halt();
126 }
128 watchdog_disable();
129 console_start_sync();
131 local_irq_enable();
133 /* Ensure we are the boot CPU. */
134 if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
135 {
136 smp_call_function((void *)machine_restart, NULL, 1, 0);
137 for ( ; ; )
138 safe_halt();
139 }
141 /*
142 * Stop all CPUs and turn off local APICs and the IO-APIC, so
143 * other OSs see a clean IRQ state.
144 */
145 smp_send_stop();
146 disable_IO_APIC();
148 #ifdef CONFIG_VMX
149 stop_vmx();
150 #endif
152 /* Rebooting needs to touch the page at absolute address 0. */
153 *((unsigned short *)__va(0x472)) = reboot_mode;
155 for ( ; ; )
156 {
157 /* Pulse the keyboard reset line. */
158 for ( i = 0; i < 100; i++ )
159 {
160 kb_wait();
161 udelay(50);
162 outb(0xfe,0x64); /* pulse reset low */
163 udelay(50);
164 }
166 /* That didn't work - force a triple fault.. */
167 __asm__ __volatile__("lidt %0": "=m" (no_idt));
168 __asm__ __volatile__("int3");
169 }
170 }
173 void __attribute__((noreturn)) __machine_halt(void *unused)
174 {
175 for ( ; ; )
176 safe_halt();
177 }
179 void machine_halt(void)
180 {
181 watchdog_disable();
182 console_start_sync();
183 smp_call_function(__machine_halt, NULL, 1, 0);
184 __machine_halt(NULL);
185 }
187 void dump_pageframe_info(struct domain *d)
188 {
189 struct pfn_info *page;
191 if ( d->tot_pages < 10 )
192 {
193 list_for_each_entry ( page, &d->page_list, list )
194 {
195 printk("Page %p: caf=%08x, taf=%" PRtype_info "\n",
196 _p(page_to_phys(page)), page->count_info,
197 page->u.inuse.type_info);
198 }
199 }
201 list_for_each_entry ( page, &d->xenpage_list, list )
202 {
203 printk("XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
204 _p(page_to_phys(page)), page->count_info,
205 page->u.inuse.type_info);
206 }
209 page = virt_to_page(d->shared_info);
210 printk("Shared_info@%p: caf=%08x, taf=%" PRtype_info "\n",
211 _p(page_to_phys(page)), page->count_info,
212 page->u.inuse.type_info);
213 }
215 struct vcpu *arch_alloc_vcpu_struct(void)
216 {
217 return xmalloc(struct vcpu);
218 }
220 /* We assume that vcpu 0 is always the last one to be freed in a
221 domain i.e. if v->vcpu_id == 0, the domain should be
222 single-processor. */
223 void arch_free_vcpu_struct(struct vcpu *v)
224 {
225 struct vcpu *p;
226 for_each_vcpu(v->domain, p) {
227 if (p->next_in_list == v)
228 p->next_in_list = v->next_in_list;
229 }
230 xfree(v);
231 }
233 void free_perdomain_pt(struct domain *d)
234 {
235 free_xenheap_page(d->arch.mm_perdomain_pt);
236 #ifdef __x86_64__
237 free_xenheap_page(d->arch.mm_perdomain_l2);
238 free_xenheap_page(d->arch.mm_perdomain_l3);
239 #endif
240 }
242 void arch_do_createdomain(struct vcpu *v)
243 {
244 struct domain *d = v->domain;
246 v->arch.flags = TF_kernel_mode;
248 if ( is_idle_task(d) )
249 return;
251 v->arch.schedule_tail = continue_nonidle_task;
253 d->shared_info = alloc_xenheap_page();
254 memset(d->shared_info, 0, PAGE_SIZE);
255 v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
256 v->cpumap = CPUMAP_RUNANYWHERE;
257 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
258 set_pfn_from_mfn(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
259 INVALID_M2P_ENTRY);
261 d->arch.mm_perdomain_pt = alloc_xenheap_page();
262 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
263 set_pfn_from_mfn(virt_to_phys(d->arch.mm_perdomain_pt) >> PAGE_SHIFT,
264 INVALID_M2P_ENTRY);
265 v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
266 v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
267 l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
269 v->arch.guest_vtable = __linear_l2_table;
270 v->arch.shadow_vtable = __shadow_linear_l2_table;
272 #ifdef __x86_64__
273 v->arch.guest_vl3table = __linear_l3_table;
274 v->arch.guest_vl4table = __linear_l4_table;
276 d->arch.mm_perdomain_l2 = alloc_xenheap_page();
277 memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
278 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
279 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt),
280 __PAGE_HYPERVISOR);
281 d->arch.mm_perdomain_l3 = alloc_xenheap_page();
282 memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
283 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
284 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
285 __PAGE_HYPERVISOR);
286 #endif
288 (void)ptwr_init(d);
290 shadow_lock_init(d);
291 INIT_LIST_HEAD(&d->arch.free_shadow_frames);
292 }
294 void arch_do_boot_vcpu(struct vcpu *v)
295 {
296 struct domain *d = v->domain;
298 v->arch.flags = TF_kernel_mode;
300 v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
302 v->arch.perdomain_ptes =
303 d->arch.mm_perdomain_pt + (v->vcpu_id << PDPT_VCPU_SHIFT);
304 v->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
305 l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
306 }
308 void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
309 {
310 if ( v->processor == newcpu )
311 return;
313 set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
314 v->processor = newcpu;
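/*
 * For a VMX guest the VMCS caches state on the CPU where it was last
 * loaded: VMCLEAR it so the state is flushed to memory and the VMCS can be
 * re-established on the destination CPU by arch_vmx_do_relaunch().
 */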
316 if ( VMX_DOMAIN(v) )
317 {
318 __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
319 v->arch.schedule_tail = arch_vmx_do_relaunch;
320 }
321 }
323 #ifdef CONFIG_VMX
324 static int vmx_switch_on;
326 static int vmx_final_setup_guest(
327 struct vcpu *v, struct vcpu_guest_context *ctxt)
328 {
329 int error;
330 struct cpu_user_regs *regs;
331 struct vmcs_struct *vmcs;
333 regs = &ctxt->user_regs;
335 /*
336 * Create a new VMCS
337 */
338 if (!(vmcs = alloc_vmcs())) {
339 printk("Failed to create a new VMCS\n");
340 return -ENOMEM;
341 }
343 memset(&v->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));
345 v->arch.arch_vmx.vmcs = vmcs;
346 error = construct_vmcs(
347 &v->arch.arch_vmx, regs, ctxt, VMCS_USE_HOST_ENV);
348 if ( error < 0 )
349 {
350 printk("Failed to construct a new VMCS\n");
351 goto out;
352 }
354 v->arch.schedule_tail = arch_vmx_do_launch;
356 #if defined (__i386__)
357 v->domain->arch.vmx_platform.real_mode_data =
358 (unsigned long *) regs->esi;
359 #endif
361 if (v == v->domain->vcpu[0]) {
362 /*
363 * Required to do this once per domain
364 * XXX todo: add a separate function to do these.
365 */
366 memset(&v->domain->shared_info->evtchn_mask[0], 0xff,
367 sizeof(v->domain->shared_info->evtchn_mask));
369 /* Put the domain in shadow mode even though we're going to be using
370 * the shared 1:1 page table initially. It shouldn't hurt */
371 shadow_mode_enable(v->domain,
372 SHM_enable|SHM_refcounts|
373 SHM_translate|SHM_external);
374 }
376 if (!vmx_switch_on)
377 vmx_switch_on = 1;
379 return 0;
381 out:
382 free_vmcs(vmcs);
383 if(v->arch.arch_vmx.io_bitmap_a != 0) {
384 free_xenheap_pages(
385 v->arch.arch_vmx.io_bitmap_a, get_order_from_bytes(0x1000));
386 v->arch.arch_vmx.io_bitmap_a = 0;
387 }
388 if(v->arch.arch_vmx.io_bitmap_b != 0) {
389 free_xenheap_pages(
390 v->arch.arch_vmx.io_bitmap_b, get_order_from_bytes(0x1000));
391 v->arch.arch_vmx.io_bitmap_b = 0;
392 }
393 v->arch.arch_vmx.vmcs = 0;
394 return error;
395 }
396 #endif
399 /* This is called by arch_final_setup_guest and do_boot_vcpu */
400 int arch_set_info_guest(
401 struct vcpu *v, struct vcpu_guest_context *c)
402 {
403 struct domain *d = v->domain;
404 unsigned long phys_basetab;
405 int i, rc;
407 /*
408 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
409 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
410 * If SS RPL or DPL differs from CS RPL then we'll #GP.
411 */
412 if ( !(c->flags & VGCF_VMX_GUEST) )
413 {
414 if ( ((c->user_regs.cs & 3) == 0) ||
415 ((c->user_regs.ss & 3) == 0) )
416 return -EINVAL;
417 }
419 clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
420 if ( c->flags & VGCF_I387_VALID )
421 set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
423 v->arch.flags &= ~TF_kernel_mode;
424 if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
425 v->arch.flags |= TF_kernel_mode;
427 memcpy(&v->arch.guest_context, c, sizeof(*c));
429 if ( !(c->flags & VGCF_VMX_GUEST) )
430 {
431 /* IOPL privileges are virtualised. */
432 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
433 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
435 /* Ensure real hardware interrupts are enabled. */
436 v->arch.guest_context.user_regs.eflags |= EF_IE;
437 }
438 else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
439 {
440 return modify_vmcs(
441 &v->arch.arch_vmx,
442 &v->arch.guest_context.user_regs);
443 }
445 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
446 return 0;
448 memset(v->arch.guest_context.debugreg, 0,
449 sizeof(v->arch.guest_context.debugreg));
450 for ( i = 0; i < 8; i++ )
451 (void)set_debugreg(v, i, c->debugreg[i]);
453 if ( v->vcpu_id == 0 )
454 d->vm_assist = c->vm_assist;
456 phys_basetab = c->ctrlreg[3];
457 v->arch.guest_table = mk_pagetable(phys_basetab);
459 if ( shadow_mode_refcounts(d) )
460 {
461 if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
462 return -EINVAL;
463 }
464 else if ( !(c->flags & VGCF_VMX_GUEST) )
465 {
466 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
467 PGT_base_page_table) )
468 return -EINVAL;
469 }
471 if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
472 {
473 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
474 return rc;
475 }
477 if ( c->flags & VGCF_VMX_GUEST )
478 {
479 /* VMX uses the initially provided page tables as the P2M map. */
480 if ( !pagetable_get_paddr(d->arch.phys_table) )
481 d->arch.phys_table = v->arch.guest_table;
483 if ( (rc = vmx_final_setup_guest(v, c)) != 0 )
484 return rc;
485 }
487 update_pagetables(v);
489 if ( v->vcpu_id == 0 )
490 init_domain_time(d);
492 /* Don't redo final setup */
493 set_bit(_VCPUF_initialised, &v->vcpu_flags);
495 return 0;
496 }
499 void new_thread(struct vcpu *d,
500 unsigned long start_pc,
501 unsigned long start_stack,
502 unsigned long start_info)
503 {
504 struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;
506 /*
507 * Initial register values:
508 * DS,ES,FS,GS = FLAT_KERNEL_DS
509 * CS:EIP = FLAT_KERNEL_CS:start_pc
510 * SS:ESP = FLAT_KERNEL_SS:start_stack
511 * ESI = start_info
512 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
513 */
514 regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
515 regs->ss = FLAT_KERNEL_SS;
516 regs->cs = FLAT_KERNEL_CS;
517 regs->eip = start_pc;
518 regs->esp = start_stack;
519 regs->esi = start_info;
521 __save_flags(regs->eflags);
522 regs->eflags |= X86_EFLAGS_IF;
523 }
526 #ifdef __x86_64__
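/*
 * On x86-64 both guest kernel and guest user mode run in ring 3, with
 * separate page tables. toggle_guest_mode() flips TF_kernel_mode, swaps the
 * GS bases and switches to the page table appropriate to the new mode.
 */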
528 void toggle_guest_mode(struct vcpu *v)
529 {
530 v->arch.flags ^= TF_kernel_mode;
531 __asm__ __volatile__ ( "swapgs" );
532 update_pagetables(v);
533 write_ptbase(v);
534 }
536 #define loadsegment(seg,value) ({ \
537 int __r = 1; \
538 __asm__ __volatile__ ( \
539 "1: movl %k1,%%" #seg "\n2:\n" \
540 ".section .fixup,\"ax\"\n" \
541 "3: xorl %k0,%k0\n" \
542 " movl %k0,%%" #seg "\n" \
543 " jmp 2b\n" \
544 ".previous\n" \
545 ".section __ex_table,\"a\"\n" \
546 " .align 8\n" \
547 " .quad 1b,3b\n" \
548 ".previous" \
549 : "=r" (__r) : "r" (value), "0" (__r) );\
550 __r; })
552 #if CONFIG_VMX
553 #define load_msrs(n) if (vmx_switch_on) vmx_load_msrs(n)
554 #else
555 #define load_msrs(n) ((void)0)
556 #endif
558 /*
559 * save_segments() writes a mask of segments which are dirty (non-zero),
560 * allowing load_segments() to avoid some expensive segment loads and
561 * MSR writes.
562 */
563 #define DIRTY_DS 0x01
564 #define DIRTY_ES 0x02
565 #define DIRTY_FS 0x04
566 #define DIRTY_GS 0x08
567 #define DIRTY_FS_BASE 0x10
568 #define DIRTY_GS_BASE_USER 0x20
570 static void load_segments(struct vcpu *n)
571 {
572 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
573 int all_segs_okay = 1;
574 unsigned int dirty_segment_mask, cpu = smp_processor_id();
576 /* Load and clear the dirty segment mask. */
577 dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
578 percpu_ctxt[cpu].dirty_segment_mask = 0;
580 /* Either selector != 0 ==> reload. */
581 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
582 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
584 /* Either selector != 0 ==> reload. */
585 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
586 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
588 /*
589 * Either selector != 0 ==> reload.
590 * Also reload to reset FS_BASE if it was non-zero.
591 */
592 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
593 nctxt->user_regs.fs) )
594 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
596 /*
597 * Either selector != 0 ==> reload.
598 * Also reload to reset GS_BASE if it was non-zero.
599 */
600 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
601 nctxt->user_regs.gs) )
602 {
603 /* Reset GS_BASE with user %gs? */
604 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
605 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
606 }
608 /* This can only be non-zero if selector is NULL. */
609 if ( nctxt->fs_base )
610 wrmsr(MSR_FS_BASE,
611 nctxt->fs_base,
612 nctxt->fs_base>>32);
614 /* Most kernels have non-zero GS base, so don't bother testing. */
615 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
616 wrmsr(MSR_SHADOW_GS_BASE,
617 nctxt->gs_base_kernel,
618 nctxt->gs_base_kernel>>32);
620 /* This can only be non-zero if selector is NULL. */
621 if ( nctxt->gs_base_user )
622 wrmsr(MSR_GS_BASE,
623 nctxt->gs_base_user,
624 nctxt->gs_base_user>>32);
626 /* If in kernel mode then switch the GS bases around. */
627 if ( n->arch.flags & TF_kernel_mode )
628 __asm__ __volatile__ ( "swapgs" );
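/*
 * If any selector failed to load, build a frame on the guest kernel stack
 * and enter the guest at its registered failsafe callback so it can recover
 * the segment state itself.
 */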
630 if ( unlikely(!all_segs_okay) )
631 {
632 struct cpu_user_regs *regs = guest_cpu_user_regs();
633 unsigned long *rsp =
634 (n->arch.flags & TF_kernel_mode) ?
635 (unsigned long *)regs->rsp :
636 (unsigned long *)nctxt->kernel_sp;
638 if ( !(n->arch.flags & TF_kernel_mode) )
639 toggle_guest_mode(n);
640 else
641 regs->cs &= ~3;
643 if ( put_user(regs->ss, rsp- 1) |
644 put_user(regs->rsp, rsp- 2) |
645 put_user(regs->rflags, rsp- 3) |
646 put_user(regs->cs, rsp- 4) |
647 put_user(regs->rip, rsp- 5) |
648 put_user(nctxt->user_regs.gs, rsp- 6) |
649 put_user(nctxt->user_regs.fs, rsp- 7) |
650 put_user(nctxt->user_regs.es, rsp- 8) |
651 put_user(nctxt->user_regs.ds, rsp- 9) |
652 put_user(regs->r11, rsp-10) |
653 put_user(regs->rcx, rsp-11) )
654 {
655 DPRINTK("Error while creating failsafe callback frame.\n");
656 domain_crash();
657 }
659 regs->entry_vector = TRAP_syscall;
660 regs->rflags &= 0xFFFCBEFFUL;
661 regs->ss = __GUEST_SS;
662 regs->rsp = (unsigned long)(rsp-11);
663 regs->cs = __GUEST_CS;
664 regs->rip = nctxt->failsafe_callback_eip;
665 }
666 }
668 static void save_segments(struct vcpu *v)
669 {
670 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
671 struct cpu_user_regs *regs = &ctxt->user_regs;
672 unsigned int dirty_segment_mask = 0;
674 if ( VMX_DOMAIN(v) )
675 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);
677 __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (regs->ds) );
678 __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
679 __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
680 __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );
682 if ( regs->ds )
683 dirty_segment_mask |= DIRTY_DS;
685 if ( regs->es )
686 dirty_segment_mask |= DIRTY_ES;
688 if ( regs->fs )
689 {
690 dirty_segment_mask |= DIRTY_FS;
691 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
692 }
693 else if ( ctxt->fs_base )
694 {
695 dirty_segment_mask |= DIRTY_FS_BASE;
696 }
698 if ( regs->gs )
699 {
700 dirty_segment_mask |= DIRTY_GS;
701 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
702 }
703 else if ( ctxt->gs_base_user )
704 {
705 dirty_segment_mask |= DIRTY_GS_BASE_USER;
706 }
708 percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
709 }
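/*
 * do_switch_to_user(): hypercall path by which a 64-bit guest kernel returns
 * to guest user space. The switch_to_user frame is read from the guest
 * kernel stack, toggle_guest_mode() switches to the user page tables, and
 * the saved user-mode registers are reinstated.
 */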
711 long do_switch_to_user(void)
712 {
713 struct cpu_user_regs *regs = guest_cpu_user_regs();
714 struct switch_to_user stu;
715 struct vcpu *v = current;
717 if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
718 unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
719 return -EFAULT;
721 toggle_guest_mode(v);
723 regs->rip = stu.rip;
724 regs->cs = stu.cs | 3; /* force guest privilege */
725 regs->rflags = stu.rflags;
726 regs->rsp = stu.rsp;
727 regs->ss = stu.ss | 3; /* force guest privilege */
729 if ( !(stu.flags & VGCF_IN_SYSCALL) )
730 {
731 regs->entry_vector = 0;
732 regs->r11 = stu.r11;
733 regs->rcx = stu.rcx;
734 }
736 /* Saved %rax gets written back to regs->rax in entry.S. */
737 return stu.rax;
738 }
740 #define switch_kernel_stack(_n,_c) ((void)0)
742 #elif defined(__i386__)
744 #define load_segments(n) ((void)0)
745 #define load_msrs(n) ((void)0)
746 #define save_segments(p) ((void)0)
748 static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
749 {
750 struct tss_struct *tss = &init_tss[cpu];
751 tss->esp1 = n->arch.guest_context.kernel_sp;
752 tss->ss1 = n->arch.guest_context.kernel_ss;
753 }
755 #endif
757 #define loaddebug(_v,_reg) \
758 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
760 static void __context_switch(void)
761 {
762 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
763 unsigned int cpu = smp_processor_id();
764 struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
765 struct vcpu *n = current;
767 if ( !is_idle_task(p->domain) )
768 {
769 memcpy(&p->arch.guest_context.user_regs,
770 stack_regs,
771 CTXT_SWITCH_STACK_BYTES);
772 unlazy_fpu(p);
773 save_segments(p);
774 }
776 if ( !is_idle_task(n->domain) )
777 {
778 memcpy(stack_regs,
779 &n->arch.guest_context.user_regs,
780 CTXT_SWITCH_STACK_BYTES);
782 /* Maybe switch the debug registers. */
783 if ( unlikely(n->arch.guest_context.debugreg[7]) )
784 {
785 loaddebug(&n->arch.guest_context, 0);
786 loaddebug(&n->arch.guest_context, 1);
787 loaddebug(&n->arch.guest_context, 2);
788 loaddebug(&n->arch.guest_context, 3);
789 /* no 4 and 5 */
790 loaddebug(&n->arch.guest_context, 6);
791 loaddebug(&n->arch.guest_context, 7);
792 }
794 if ( !VMX_DOMAIN(n) )
795 {
796 set_int80_direct_trap(n);
797 switch_kernel_stack(n, cpu);
798 }
799 }
801 if ( p->domain != n->domain )
802 cpu_set(cpu, n->domain->cpumask);
804 write_ptbase(n);
806 if ( p->vcpu_id != n->vcpu_id )
807 {
808 char gdt_load[10];
809 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
810 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
811 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
812 }
814 if ( p->domain != n->domain )
815 cpu_clear(cpu, p->domain->cpumask);
817 percpu_ctxt[cpu].curr_vcpu = n;
818 }
821 void context_switch(struct vcpu *prev, struct vcpu *next)
822 {
823 unsigned int cpu = smp_processor_id();
825 ASSERT(!local_irq_is_enabled());
827 set_current(next);
829 if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
830 {
831 __context_switch();
832 percpu_ctxt[cpu].context_not_finalised = 1;
833 }
834 }
836 void context_switch_finalise(struct vcpu *next)
837 {
838 unsigned int cpu = smp_processor_id();
840 ASSERT(local_irq_is_enabled());
842 if ( percpu_ctxt[cpu].context_not_finalised )
843 {
844 percpu_ctxt[cpu].context_not_finalised = 0;
846 BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);
848 if ( VMX_DOMAIN(next) )
849 {
850 vmx_restore_msrs(next);
851 }
852 else
853 {
854 load_LDT(next);
855 load_segments(next);
856 load_msrs(next);
857 }
858 }
860 schedule_tail(next);
861 BUG();
862 }
864 void continue_running(struct vcpu *same)
865 {
866 schedule_tail(same);
867 BUG();
868 }
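/*
 * If this CPU still holds the register state of a previously-running VCPU
 * (i.e. we lazily switched to the idle VCPU without saving), complete the
 * real context switch now. Returns nonzero if a switch was performed.
 */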
870 int __sync_lazy_execstate(void)
871 {
872 unsigned long flags;
873 int switch_required;
875 local_irq_save(flags);
877 switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);
879 if ( switch_required )
880 __context_switch();
882 local_irq_restore(flags);
884 return switch_required;
885 }
887 void sync_vcpu_execstate(struct vcpu *v)
888 {
889 unsigned int cpu = v->processor;
891 if ( !cpu_isset(cpu, v->domain->cpumask) )
892 return;
894 if ( cpu == smp_processor_id() )
895 {
896 (void)__sync_lazy_execstate();
897 }
898 else
899 {
900 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
901 flush_tlb_mask(cpumask_of_cpu(cpu));
902 }
903 }
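/*
 * Arrange for a preempted hypercall to be restarted: if we are inside a
 * multicall, update the pending multicall entry; otherwise rewind the guest
 * instruction pointer over the hypercall instruction and rewrite the
 * argument registers.
 */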
905 unsigned long __hypercall_create_continuation(
906 unsigned int op, unsigned int nr_args, ...)
907 {
908 struct mc_state *mcs = &mc_state[smp_processor_id()];
909 struct cpu_user_regs *regs;
910 unsigned int i;
911 va_list args;
913 va_start(args, nr_args);
915 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
916 {
917 __set_bit(_MCSF_call_preempted, &mcs->flags);
919 for ( i = 0; i < nr_args; i++ )
920 mcs->call.args[i] = va_arg(args, unsigned long);
921 }
922 else
923 {
924 regs = guest_cpu_user_regs();
925 #if defined(__i386__)
926 regs->eax = op;
927 regs->eip -= 2; /* re-execute 'int 0x82' */
929 for ( i = 0; i < nr_args; i++ )
930 {
931 switch ( i )
932 {
933 case 0: regs->ebx = va_arg(args, unsigned long); break;
934 case 1: regs->ecx = va_arg(args, unsigned long); break;
935 case 2: regs->edx = va_arg(args, unsigned long); break;
936 case 3: regs->esi = va_arg(args, unsigned long); break;
937 case 4: regs->edi = va_arg(args, unsigned long); break;
938 case 5: regs->ebp = va_arg(args, unsigned long); break;
939 }
940 }
941 #elif defined(__x86_64__)
942 regs->rax = op;
943 regs->rip -= 2; /* re-execute 'syscall' */
945 for ( i = 0; i < nr_args; i++ )
946 {
947 switch ( i )
948 {
949 case 0: regs->rdi = va_arg(args, unsigned long); break;
950 case 1: regs->rsi = va_arg(args, unsigned long); break;
951 case 2: regs->rdx = va_arg(args, unsigned long); break;
952 case 3: regs->r10 = va_arg(args, unsigned long); break;
953 case 4: regs->r8 = va_arg(args, unsigned long); break;
954 case 5: regs->r9 = va_arg(args, unsigned long); break;
955 }
956 }
957 #endif
958 }
960 va_end(args);
962 return op;
963 }
965 #ifdef CONFIG_VMX
966 static void vmx_relinquish_resources(struct vcpu *v)
967 {
968 if ( !VMX_DOMAIN(v) )
969 return;
971 BUG_ON(v->arch.arch_vmx.vmcs == NULL);
972 free_vmcs(v->arch.arch_vmx.vmcs);
973 if(v->arch.arch_vmx.io_bitmap_a != 0) {
974 free_xenheap_pages(
975 v->arch.arch_vmx.io_bitmap_a, get_order_from_bytes(0x1000));
976 v->arch.arch_vmx.io_bitmap_a = 0;
977 }
978 if(v->arch.arch_vmx.io_bitmap_b != 0) {
979 free_xenheap_pages(
980 v->arch.arch_vmx.io_bitmap_b, get_order_from_bytes(0x1000));
981 v->arch.arch_vmx.io_bitmap_b = 0;
982 }
983 v->arch.arch_vmx.vmcs = 0;
985 free_monitor_pagetable(v);
986 rem_ac_timer(&v->domain->arch.vmx_platform.vmx_pit.pit_timer);
987 }
988 #else
989 #define vmx_relinquish_resources(_v) ((void)0)
990 #endif
992 static void relinquish_memory(struct domain *d, struct list_head *list)
993 {
994 struct list_head *ent;
995 struct pfn_info *page;
996 unsigned long x, y;
998 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
999 spin_lock_recursive(&d->page_alloc_lock);
1001 ent = list->next;
1002 while ( ent != list )
1003 {
1004 page = list_entry(ent, struct pfn_info, list);
1006 /* Grab a reference to the page so it won't disappear from under us. */
1007 if ( unlikely(!get_page(page, d)) )
1008 {
1009 /* Couldn't get a reference -- someone is freeing this page. */
1010 ent = ent->next;
1011 continue;
1012 }
1014 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1015 put_page_and_type(page);
1017 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1018 put_page(page);
1020 /*
1021 * Forcibly invalidate base page tables at this point to break circular
1022 * 'linear page table' references. This is okay because MMU structures
1023 * are not shared across domains and this domain is now dead. Thus base
1024 * tables are not in use so a non-zero count means circular reference.
1025 */
1026 y = page->u.inuse.type_info;
1027 for ( ; ; )
1028 {
1029 x = y;
1030 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1031 (PGT_base_page_table|PGT_validated)) )
1032 break;
1034 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1035 if ( likely(y == x) )
1036 {
1037 free_page_type(page, PGT_base_page_table);
1038 break;
1039 }
1040 }
1042 /* Follow the list chain and /then/ potentially free the page. */
1043 ent = ent->next;
1044 put_page(page);
1045 }
1047 spin_unlock_recursive(&d->page_alloc_lock);
1048 }
1050 void domain_relinquish_resources(struct domain *d)
1051 {
1052 struct vcpu *v;
1053 unsigned long pfn;
1055 BUG_ON(!cpus_empty(d->cpumask));
1057 physdev_destroy_state(d);
1059 ptwr_destroy(d);
1061 /* Release device mappings of other domains */
1062 gnttab_release_dev_mappings(d->grant_table);
1064 /* Drop the in-use references to page-table bases. */
1065 for_each_vcpu ( d, v )
1066 {
1067 if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
1068 {
1069 if ( !shadow_mode_refcounts(d) )
1070 put_page_type(pfn_to_page(pfn));
1071 put_page(pfn_to_page(pfn));
1073 v->arch.guest_table = mk_pagetable(0);
1074 }
1076 if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
1077 {
1078 if ( !shadow_mode_refcounts(d) )
1079 put_page_type(pfn_to_page(pfn));
1080 put_page(pfn_to_page(pfn));
1082 v->arch.guest_table_user = mk_pagetable(0);
1083 }
1085 vmx_relinquish_resources(v);
1086 }
1088 shadow_mode_disable(d);
1090 /*
1091 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1092 * it automatically gets squashed when the guest's mappings go away.
1093 */
1094 for_each_vcpu(d, v)
1095 destroy_gdt(v);
1097 /* Relinquish every page of memory. */
1098 relinquish_memory(d, &d->xenpage_list);
1099 relinquish_memory(d, &d->page_list);
1100 }
1103 /*
1104 * Local variables:
1105 * mode: C
1106 * c-set-style: "BSD"
1107 * c-basic-offset: 4
1108 * tab-width: 4
1109 * indent-tabs-mode: nil
1110 * End:
1111 */