ia64/xen-unstable

view xen/arch/x86/domain.c @ 5261:44063d9f39e4

bitkeeper revision 1.1628.1.1 (429dd92aYeqV9tl4b0g_F_deORFVAQ)

Fix sync_lazy_execstate functions to correctly sync the local cpu.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Jun 01 15:50:02 2005 +0000 (2005-06-01)
parents 0e97dc32ddf6
children d65ff8dafa15
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <asm/regs.h>
24 #include <asm/mc146818rtc.h>
25 #include <asm/system.h>
26 #include <asm/io.h>
27 #include <asm/processor.h>
28 #include <asm/desc.h>
29 #include <asm/i387.h>
30 #include <asm/mpspec.h>
31 #include <asm/ldt.h>
32 #include <xen/irq.h>
33 #include <xen/event.h>
34 #include <asm/shadow.h>
35 #include <xen/console.h>
36 #include <xen/elf.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/msr.h>
40 #include <asm/physdev.h>
41 #include <xen/kernel.h>
42 #include <public/io/ioreq.h>
43 #include <xen/multicall.h>
45 /* opt_noreboot: If true, machine will need manual reset on error. */
46 static int opt_noreboot = 0;
47 boolean_param("noreboot", opt_noreboot);
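/*
 * boolean_param() registers "noreboot" as a Xen boot-time option: booting the
 * hypervisor with "noreboot" on its command line sets opt_noreboot, so
 * machine_restart() below prints a message and spins on safe_halt() awaiting
 * a manual reset instead of rebooting.
 */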
49 struct percpu_ctxt {
50 struct exec_domain *curr_ed;
51 } __cacheline_aligned;
52 static struct percpu_ctxt percpu_ctxt[NR_CPUS];
54 static void continue_idle_task(struct exec_domain *ed)
55 {
56 reset_stack_and_jump(idle_loop);
57 }
59 static void continue_nonidle_task(struct exec_domain *ed)
60 {
61 reset_stack_and_jump(ret_from_intr);
62 }
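/*
 * These helpers are installed as ed->arch.schedule_tail, the per-vcpu hook
 * that schedule_tail() (invoked at the end of context_switch() below, on the
 * new vcpu's stack) hands control to: the idle vcpu re-enters idle_loop(),
 * while an ordinary vcpu resumes guest execution via ret_from_intr.
 */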
64 static void default_idle(void)
65 {
66 local_irq_disable();
67 if ( !softirq_pending(smp_processor_id()) )
68 safe_halt();
69 else
70 local_irq_enable();
71 }
73 void idle_loop(void)
74 {
75 int cpu = smp_processor_id();
77 for ( ; ; )
78 {
79 irq_stat[cpu].idle_timestamp = jiffies;
81 while ( !softirq_pending(cpu) )
82 {
83 page_scrub_schedule_work();
84 default_idle();
85 }
87 do_softirq();
88 }
89 }
91 void startup_cpu_idle_loop(void)
92 {
93 struct exec_domain *ed = current;
95 ASSERT(is_idle_task(ed->domain));
96 percpu_ctxt[smp_processor_id()].curr_ed = ed;
97 set_bit(smp_processor_id(), &ed->domain->cpuset);
98 ed->arch.schedule_tail = continue_idle_task;
100 idle_loop();
101 }
103 static long no_idt[2];
104 static int reboot_mode;
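/*
 * no_idt is a zero-limit IDT descriptor: machine_restart() loads it with
 * lidt and then executes int3, so the resulting fault cannot be delivered
 * and escalates to a triple fault, resetting the CPU.
 */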
106 static inline void kb_wait(void)
107 {
108 int i;
110 for ( i = 0; i < 0x10000; i++ )
111 if ( (inb_p(0x64) & 0x02) == 0 )
112 break;
113 }
115 void machine_restart(char * __unused)
116 {
117 int i;
119 if ( opt_noreboot )
120 {
121 printk("Reboot disabled on cmdline: require manual reset\n");
122 for ( ; ; )
123 safe_halt();
124 }
126 local_irq_enable();
128 /* Ensure we are the boot CPU. */
129 if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
130 {
131 smp_call_function((void *)machine_restart, NULL, 1, 0);
132 for ( ; ; )
133 safe_halt();
134 }
136 /*
137 * Stop all CPUs and turn off local APICs and the IO-APIC, so
138 * other OSs see a clean IRQ state.
139 */
140 smp_send_stop();
141 disable_IO_APIC();
143 #ifdef CONFIG_VMX
144 stop_vmx();
145 #endif
147 /* Rebooting needs to touch the page at absolute address 0. */
148 *((unsigned short *)__va(0x472)) = reboot_mode;
150 for ( ; ; )
151 {
152 /* Pulse the keyboard reset line. */
153 for ( i = 0; i < 100; i++ )
154 {
155 kb_wait();
156 udelay(50);
157 outb(0xfe,0x64); /* pulse reset low */
158 udelay(50);
159 }
161 /* That didn't work - force a triple fault.. */
162 __asm__ __volatile__("lidt %0": "=m" (no_idt));
163 __asm__ __volatile__("int3");
164 }
165 }
168 void __attribute__((noreturn)) __machine_halt(void *unused)
169 {
170 for ( ; ; )
171 safe_halt();
172 }
174 void machine_halt(void)
175 {
176 watchdog_disable();
177 smp_call_function(__machine_halt, NULL, 1, 0);
178 __machine_halt(NULL);
179 }
181 void dump_pageframe_info(struct domain *d)
182 {
183 struct pfn_info *page;
185 if ( d->tot_pages < 10 )
186 {
187 list_for_each_entry ( page, &d->page_list, list )
188 {
189 printk("Page %p: caf=%08x, taf=%08x\n",
190 _p(page_to_phys(page)), page->count_info,
191 page->u.inuse.type_info);
192 }
193 }
195 list_for_each_entry ( page, &d->xenpage_list, list )
196 {
197 printk("XenPage %p: caf=%08x, taf=%08x\n",
198 _p(page_to_phys(page)), page->count_info,
199 page->u.inuse.type_info);
200 }
203 page = virt_to_page(d->shared_info);
204 printk("Shared_info@%p: caf=%08x, taf=%08x\n",
205 _p(page_to_phys(page)), page->count_info,
206 page->u.inuse.type_info);
207 }
209 struct exec_domain *arch_alloc_exec_domain_struct(void)
210 {
211 return xmalloc(struct exec_domain);
212 }
214 void arch_free_exec_domain_struct(struct exec_domain *ed)
215 {
216 xfree(ed);
217 }
219 void free_perdomain_pt(struct domain *d)
220 {
221 free_xenheap_page((unsigned long)d->arch.mm_perdomain_pt);
222 #ifdef __x86_64__
223 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l2);
224 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l3);
225 #endif
226 }
228 void arch_do_createdomain(struct exec_domain *ed)
229 {
230 struct domain *d = ed->domain;
232 ed->arch.flags = TF_kernel_mode;
234 if ( is_idle_task(d) )
235 return;
237 ed->arch.schedule_tail = continue_nonidle_task;
239 d->shared_info = (void *)alloc_xenheap_page();
240 memset(d->shared_info, 0, PAGE_SIZE);
241 ed->vcpu_info = &d->shared_info->vcpu_data[ed->vcpu_id];
242 ed->cpumap = CPUMAP_RUNANYWHERE;
243 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
244 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
245 PAGE_SHIFT] = INVALID_M2P_ENTRY;
247 d->arch.mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
248 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
249 machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >>
250 PAGE_SHIFT] = INVALID_M2P_ENTRY;
251 ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
252 ed->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
253 l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
255 ed->arch.guest_vtable = __linear_l2_table;
256 ed->arch.shadow_vtable = __shadow_linear_l2_table;
258 #ifdef __x86_64__
259 ed->arch.guest_vl3table = __linear_l3_table;
260 ed->arch.guest_vl4table = __linear_l4_table;
262 d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page();
263 memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
264 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
265 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt),
266 __PAGE_HYPERVISOR);
267 d->arch.mm_perdomain_l3 = (l3_pgentry_t *)alloc_xenheap_page();
268 memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
269 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
270 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
271 __PAGE_HYPERVISOR);
272 #endif
274 (void)ptwr_init(d);
276 shadow_lock_init(d);
277 INIT_LIST_HEAD(&d->arch.free_shadow_frames);
278 }
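/*
 * Each vcpu owns a slice of the domain's per-domain page table, selected
 * below by vcpu_id << PDPT_VCPU_SHIFT; the FIRST_RESERVED_GDT_PAGE entry in
 * that slice maps Xen's gdt_table, presumably so the reserved hypervisor
 * descriptors are present in every vcpu's GDT mapping.
 */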
280 void arch_do_boot_vcpu(struct exec_domain *ed)
281 {
282 struct domain *d = ed->domain;
284 ed->arch.flags = TF_kernel_mode;
286 ed->arch.schedule_tail = d->exec_domain[0]->arch.schedule_tail;
288 ed->arch.perdomain_ptes =
289 d->arch.mm_perdomain_pt + (ed->vcpu_id << PDPT_VCPU_SHIFT);
290 ed->arch.perdomain_ptes[FIRST_RESERVED_GDT_PAGE] =
291 l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
292 }
294 #ifdef CONFIG_VMX
295 void arch_vmx_do_resume(struct exec_domain *ed)
296 {
297 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
299 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
300 vmx_do_resume(ed);
301 reset_stack_and_jump(vmx_asm_do_resume);
302 }
304 void arch_vmx_do_launch(struct exec_domain *ed)
305 {
306 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
308 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
309 vmx_do_launch(ed);
310 reset_stack_and_jump(vmx_asm_do_launch);
311 }
313 static int vmx_final_setup_guest(
314 struct exec_domain *ed, struct vcpu_guest_context *ctxt)
315 {
316 int error;
317 struct cpu_user_regs *regs;
318 struct vmcs_struct *vmcs;
320 regs = &ctxt->user_regs;
322 /*
323 * Create a new VMCS
324 */
325 if (!(vmcs = alloc_vmcs())) {
326 printk("Failed to create a new VMCS\n");
327 return -ENOMEM;
328 }
330 memset(&ed->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));
332 ed->arch.arch_vmx.vmcs = vmcs;
333 error = construct_vmcs(
334 &ed->arch.arch_vmx, regs, ctxt, VMCS_USE_HOST_ENV);
335 if ( error < 0 )
336 {
337 printk("Failed to construct a new VMCS\n");
338 goto out;
339 }
341 ed->arch.schedule_tail = arch_vmx_do_launch;
343 #if defined (__i386)
344 ed->arch.arch_vmx.vmx_platform.real_mode_data =
345 (unsigned long *) regs->esi;
346 #endif
348 if (ed == ed->domain->exec_domain[0]) {
349 /*
350 * This need only be done once per domain.
351 * XXX todo: add a separate function to do these.
352 */
353 memset(&ed->domain->shared_info->evtchn_mask[0], 0xff,
354 sizeof(ed->domain->shared_info->evtchn_mask));
355 clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]);
357 /* Put the domain in shadow mode even though we're going to be using
358 * the shared 1:1 page table initially. It shouldn't hurt. */
359 shadow_mode_enable(ed->domain,
360 SHM_enable|SHM_refcounts|
361 SHM_translate|SHM_external);
362 }
364 return 0;
366 out:
367 free_vmcs(vmcs);
368 ed->arch.arch_vmx.vmcs = 0;
369 return error;
370 }
371 #endif
374 /* This is called by arch_final_setup_guest and do_boot_vcpu */
375 int arch_set_info_guest(
376 struct exec_domain *ed, struct vcpu_guest_context *c)
377 {
378 struct domain *d = ed->domain;
379 unsigned long phys_basetab;
380 int i, rc;
382 /*
383 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
384 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
385 * If SS RPL or DPL differs from CS RPL then we'll #GP.
386 */
387 if ( !(c->flags & VGCF_VMX_GUEST) )
388 {
389 if ( ((c->user_regs.cs & 3) == 0) ||
390 ((c->user_regs.ss & 3) == 0) )
391 return -EINVAL;
392 }
394 clear_bit(_VCPUF_fpu_initialised, &ed->vcpu_flags);
395 if ( c->flags & VGCF_I387_VALID )
396 set_bit(_VCPUF_fpu_initialised, &ed->vcpu_flags);
398 ed->arch.flags &= ~TF_kernel_mode;
399 if ( c->flags & VGCF_IN_KERNEL )
400 ed->arch.flags |= TF_kernel_mode;
402 memcpy(&ed->arch.guest_context, c, sizeof(*c));
404 if ( !(c->flags & VGCF_VMX_GUEST) )
405 {
406 /* IOPL privileges are virtualised. */
407 ed->arch.iopl = (ed->arch.guest_context.user_regs.eflags >> 12) & 3;
408 ed->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
410 /* Ensure real hardware interrupts are enabled. */
411 ed->arch.guest_context.user_regs.eflags |= EF_IE;
412 } else {
413 __vmwrite(GUEST_EFLAGS, ed->arch.guest_context.user_regs.eflags);
414 if (ed->arch.guest_context.user_regs.eflags & EF_TF)
415 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
416 else
417 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
418 }
420 if ( test_bit(_VCPUF_initialised, &ed->vcpu_flags) )
421 return 0;
423 memset(ed->arch.guest_context.debugreg, 0,
424 sizeof(ed->arch.guest_context.debugreg));
425 for ( i = 0; i < 8; i++ )
426 (void)set_debugreg(ed, i, c->debugreg[i]);
428 if ( ed->vcpu_id == 0 )
429 d->vm_assist = c->vm_assist;
431 phys_basetab = c->pt_base;
432 ed->arch.guest_table = mk_pagetable(phys_basetab);
434 if ( shadow_mode_refcounts(d) )
435 {
436 if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
437 return -EINVAL;
438 }
439 else
440 {
441 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
442 PGT_base_page_table) )
443 return -EINVAL;
444 }
446 if ( (rc = (int)set_gdt(ed, c->gdt_frames, c->gdt_ents)) != 0 )
447 {
448 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
449 return rc;
450 }
452 #ifdef CONFIG_VMX
453 if ( c->flags & VGCF_VMX_GUEST )
454 {
455 int error;
457 // VMX uses the initially provided page tables as the P2M map.
458 //
459 // XXX: This creates a security issue -- Xen can't necessarily
460 // trust the VMX domain builder. Xen should validate this
461 // page table, and/or build the table itself, or ???
462 //
463 if ( !pagetable_get_paddr(d->arch.phys_table) )
464 d->arch.phys_table = ed->arch.guest_table;
466 if ( (error = vmx_final_setup_guest(ed, c)) )
467 return error;
468 }
469 #endif
471 update_pagetables(ed);
473 /* Don't redo final setup */
474 set_bit(_VCPUF_initialised, &ed->vcpu_flags);
476 return 0;
477 }
480 void new_thread(struct exec_domain *d,
481 unsigned long start_pc,
482 unsigned long start_stack,
483 unsigned long start_info)
484 {
485 struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;
487 /*
488 * Initial register values:
489 * DS,ES,FS,GS = FLAT_KERNEL_DS
490 * CS:EIP = FLAT_KERNEL_CS:start_pc
491 * SS:ESP = FLAT_KERNEL_SS:start_stack
492 * ESI = start_info
493 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
494 */
495 regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
496 regs->ss = FLAT_KERNEL_SS;
497 regs->cs = FLAT_KERNEL_CS;
498 regs->eip = start_pc;
499 regs->esp = start_stack;
500 regs->esi = start_info;
502 __save_flags(regs->eflags);
503 regs->eflags |= X86_EFLAGS_IF;
504 }
507 #ifdef __x86_64__
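/*
 * On x86-64 a paravirtualised guest's kernel and user contexts use separate
 * page tables (guest_table vs. guest_table_user) and separate GS bases.
 * toggle_guest_mode() flips TF_kernel_mode, swaps GS via swapgs, and reloads
 * the page-table base for the new mode via update_pagetables() and
 * write_ptbase().
 */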
509 void toggle_guest_mode(struct exec_domain *ed)
510 {
511 ed->arch.flags ^= TF_kernel_mode;
512 __asm__ __volatile__ ( "swapgs" );
513 update_pagetables(ed);
514 write_ptbase(ed);
515 }
517 #define loadsegment(seg,value) ({ \
518 int __r = 1; \
519 __asm__ __volatile__ ( \
520 "1: movl %k1,%%" #seg "\n2:\n" \
521 ".section .fixup,\"ax\"\n" \
522 "3: xorl %k0,%k0\n" \
523 " movl %k0,%%" #seg "\n" \
524 " jmp 2b\n" \
525 ".previous\n" \
526 ".section __ex_table,\"a\"\n" \
527 " .align 8\n" \
528 " .quad 1b,3b\n" \
529 ".previous" \
530 : "=r" (__r) : "r" (value), "0" (__r) );\
531 __r; })
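/*
 * loadsegment() evaluates to 1 on success. If the guest-supplied selector is
 * invalid the mov faults; the .fixup/__ex_table entry redirects the fault so
 * that a null selector is loaded instead and the expression yields 0, which
 * load_segments() below uses to decide whether to raise the guest's failsafe
 * callback.
 */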
533 static void load_segments(struct exec_domain *p, struct exec_domain *n)
534 {
535 struct vcpu_guest_context *pctxt = &p->arch.guest_context;
536 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
537 int all_segs_okay = 1;
539 /* Either selector != 0 ==> reload. */
540 if ( unlikely(pctxt->user_regs.ds | nctxt->user_regs.ds) )
541 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
543 /* Either selector != 0 ==> reload. */
544 if ( unlikely(pctxt->user_regs.es | nctxt->user_regs.es) )
545 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
547 /*
548 * Either selector != 0 ==> reload.
549 * Also reload to reset FS_BASE if it was non-zero.
550 */
551 if ( unlikely(pctxt->user_regs.fs |
552 pctxt->fs_base |
553 nctxt->user_regs.fs) )
554 {
555 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
556 if ( pctxt->user_regs.fs ) /* != 0 selector kills fs_base */
557 pctxt->fs_base = 0;
558 }
560 /*
561 * Either selector != 0 ==> reload.
562 * Also reload to reset GS_BASE if it was non-zero.
563 */
564 if ( unlikely(pctxt->user_regs.gs |
565 pctxt->gs_base_user |
566 nctxt->user_regs.gs) )
567 {
568 /* Reset GS_BASE with user %gs? */
569 if ( pctxt->user_regs.gs || !nctxt->gs_base_user )
570 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
571 if ( pctxt->user_regs.gs ) /* != 0 selector kills gs_base_user */
572 pctxt->gs_base_user = 0;
573 }
575 /* This can only be non-zero if selector is NULL. */
576 if ( nctxt->fs_base )
577 wrmsr(MSR_FS_BASE,
578 nctxt->fs_base,
579 nctxt->fs_base>>32);
581 /* Most kernels have non-zero GS base, so don't bother testing. */
582 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
583 wrmsr(MSR_SHADOW_GS_BASE,
584 nctxt->gs_base_kernel,
585 nctxt->gs_base_kernel>>32);
587 /* This can only be non-zero if selector is NULL. */
588 if ( nctxt->gs_base_user )
589 wrmsr(MSR_GS_BASE,
590 nctxt->gs_base_user,
591 nctxt->gs_base_user>>32);
593 /* If in kernel mode then switch the GS bases around. */
594 if ( n->arch.flags & TF_kernel_mode )
595 __asm__ __volatile__ ( "swapgs" );
597 if ( unlikely(!all_segs_okay) )
598 {
599 struct cpu_user_regs *regs = guest_cpu_user_regs();
600 unsigned long *rsp =
601 (n->arch.flags & TF_kernel_mode) ?
602 (unsigned long *)regs->rsp :
603 (unsigned long *)nctxt->kernel_sp;
605 if ( !(n->arch.flags & TF_kernel_mode) )
606 toggle_guest_mode(n);
607 else
608 regs->cs &= ~3;
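/*
 * Build the failsafe callback frame on the guest kernel stack. From the
 * put_user() calls below, the words at rsp-1 .. rsp-11 are:
 *   ss, rsp, rflags, cs, rip, gs, fs, es, ds, r11, rcx
 * i.e. an iret-style frame plus the four data selectors and the registers
 * clobbered by the syscall entry path, to be unwound by the guest handler at
 * failsafe_callback_eip.
 */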
610 if ( put_user(regs->ss, rsp- 1) |
611 put_user(regs->rsp, rsp- 2) |
612 put_user(regs->rflags, rsp- 3) |
613 put_user(regs->cs, rsp- 4) |
614 put_user(regs->rip, rsp- 5) |
615 put_user(nctxt->user_regs.gs, rsp- 6) |
616 put_user(nctxt->user_regs.fs, rsp- 7) |
617 put_user(nctxt->user_regs.es, rsp- 8) |
618 put_user(nctxt->user_regs.ds, rsp- 9) |
619 put_user(regs->r11, rsp-10) |
620 put_user(regs->rcx, rsp-11) )
621 {
622 DPRINTK("Error while creating failsafe callback frame.\n");
623 domain_crash();
624 }
626 regs->entry_vector = TRAP_syscall;
627 regs->rflags &= 0xFFFCBEFFUL;
628 regs->ss = __GUEST_SS;
629 regs->rsp = (unsigned long)(rsp-11);
630 regs->cs = __GUEST_CS;
631 regs->rip = nctxt->failsafe_callback_eip;
632 }
633 }
635 static void save_segments(struct exec_domain *ed)
636 {
637 struct cpu_user_regs *regs = &ed->arch.guest_context.user_regs;
638 __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (regs->ds) );
639 __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
640 __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
641 __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );
642 }
644 static void clear_segments(void)
645 {
646 __asm__ __volatile__ (
647 " movl %0,%%ds; "
648 " movl %0,%%es; "
649 " movl %0,%%fs; "
650 " movl %0,%%gs; "
651 ""safe_swapgs" "
652 " movl %0,%%gs"
653 : : "r" (0) );
654 }
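/*
 * do_switch_to_user() implements the switch_to_user hypercall: the guest
 * kernel leaves a struct switch_to_user on its stack (the fields accessed
 * below are rip, cs, rflags, rsp, ss, flags, r11, rcx and rax) and Xen
 * switches the vcpu back to its user-mode page table and register state,
 * forcing CS and SS to guest privilege.
 */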
656 long do_switch_to_user(void)
657 {
658 struct cpu_user_regs *regs = guest_cpu_user_regs();
659 struct switch_to_user stu;
660 struct exec_domain *ed = current;
662 if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
663 unlikely(pagetable_get_paddr(ed->arch.guest_table_user) == 0) )
664 return -EFAULT;
666 toggle_guest_mode(ed);
668 regs->rip = stu.rip;
669 regs->cs = stu.cs | 3; /* force guest privilege */
670 regs->rflags = stu.rflags;
671 regs->rsp = stu.rsp;
672 regs->ss = stu.ss | 3; /* force guest privilege */
674 if ( !(stu.flags & VGCF_IN_SYSCALL) )
675 {
676 regs->entry_vector = 0;
677 regs->r11 = stu.r11;
678 regs->rcx = stu.rcx;
679 }
681 /* Saved %rax gets written back to regs->rax in entry.S. */
682 return stu.rax;
683 }
685 #define switch_kernel_stack(_n,_c) ((void)0)
687 #elif defined(__i386__)
689 #define load_segments(_p, _n) ((void)0)
690 #define save_segments(_p) ((void)0)
691 #define clear_segments() ((void)0)
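/*
 * On x86-32 the guest kernel runs in ring 1, so entries from guest user
 * space arrive on the stack named by the TSS ss1/esp1 fields; point them at
 * the incoming vcpu's registered kernel stack.
 */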
693 static inline void switch_kernel_stack(struct exec_domain *n, unsigned int cpu)
694 {
695 struct tss_struct *tss = &init_tss[cpu];
696 tss->esp1 = n->arch.guest_context.kernel_sp;
697 tss->ss1 = n->arch.guest_context.kernel_ss;
698 }
700 #endif
702 #define loaddebug(_ed,_reg) \
703 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_ed)->debugreg[_reg]))
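/*
 * __context_switch() performs the real state switch on this CPU: it saves
 * the outgoing vcpu's registers, FPU state and segments, loads the incoming
 * vcpu's registers, debug registers, kernel stack and page tables, and
 * updates percpu_ctxt[].curr_ed. Switches are lazy: context_switch() below
 * may leave the old state in place when switching to the idle vcpu,
 * deferring this work until __sync_lazy_execstate() forces it.
 */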
705 static void __context_switch(void)
706 {
707 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
708 unsigned int cpu = smp_processor_id();
709 struct exec_domain *p = percpu_ctxt[cpu].curr_ed;
710 struct exec_domain *n = current;
712 if ( !is_idle_task(p->domain) )
713 {
714 memcpy(&p->arch.guest_context.user_regs,
715 stack_regs,
716 CTXT_SWITCH_STACK_BYTES);
717 unlazy_fpu(p);
718 save_segments(p);
719 }
721 if ( !is_idle_task(n->domain) )
722 {
723 memcpy(stack_regs,
724 &n->arch.guest_context.user_regs,
725 CTXT_SWITCH_STACK_BYTES);
727 /* Maybe switch the debug registers. */
728 if ( unlikely(n->arch.guest_context.debugreg[7]) )
729 {
730 loaddebug(&n->arch.guest_context, 0);
731 loaddebug(&n->arch.guest_context, 1);
732 loaddebug(&n->arch.guest_context, 2);
733 loaddebug(&n->arch.guest_context, 3);
734 /* no 4 and 5 */
735 loaddebug(&n->arch.guest_context, 6);
736 loaddebug(&n->arch.guest_context, 7);
737 }
739 if ( !VMX_DOMAIN(n) )
740 {
741 set_int80_direct_trap(n);
742 switch_kernel_stack(n, cpu);
743 }
744 }
746 if ( p->domain != n->domain )
747 set_bit(cpu, &n->domain->cpuset);
749 write_ptbase(n);
751 if ( p->vcpu_id != n->vcpu_id )
752 {
753 char gdt_load[10];
754 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
755 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
756 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
757 }
759 if ( p->domain != n->domain )
760 clear_bit(cpu, &p->domain->cpuset);
762 percpu_ctxt[cpu].curr_ed = n;
763 }
766 void context_switch(struct exec_domain *prev, struct exec_domain *next)
767 {
768 struct exec_domain *realprev;
770 local_irq_disable();
772 set_current(next);
774 if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_ed) == next) ||
775 is_idle_task(next->domain) )
776 {
777 local_irq_enable();
778 }
779 else
780 {
781 __context_switch();
783 local_irq_enable();
785 if ( !VMX_DOMAIN(next) )
786 {
787 load_LDT(next);
788 load_segments(realprev, next);
789 }
790 }
792 /*
793 * We do this late on because it doesn't need to be protected by the
794 * schedule_lock, and because we want this to be the very last use of
795 * 'prev' (after this point, a dying domain's info structure may be freed
796 * without warning).
797 */
798 clear_bit(_VCPUF_running, &prev->vcpu_flags);
800 schedule_tail(next);
801 BUG();
802 }
804 void continue_running(struct exec_domain *same)
805 {
806 schedule_tail(same);
807 BUG();
808 }
810 int __sync_lazy_execstate(void)
811 {
812 if ( percpu_ctxt[smp_processor_id()].curr_ed == current )
813 return 0;
814 __context_switch();
815 load_LDT(current);
816 clear_segments();
817 return 1;
818 }
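/*
 * The two helpers below sync the local CPU by calling
 * __sync_lazy_execstate() directly rather than relying on an IPI to self;
 * remote CPUs are synced as a side effect of the flush IPI sent by
 * flush_tlb_mask().
 */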
820 void sync_lazy_execstate_cpuset(unsigned long cpuset)
821 {
822 if ( cpuset & (1 << smp_processor_id()) )
823 (void)__sync_lazy_execstate();
824 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
825 flush_tlb_mask(cpuset & ~(1 << smp_processor_id()));
826 }
828 void sync_lazy_execstate_all(void)
829 {
830 __sync_lazy_execstate();
831 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
832 flush_tlb_mask(((1<<num_online_cpus())-1) & ~(1 << smp_processor_id()));
833 }
835 unsigned long __hypercall_create_continuation(
836 unsigned int op, unsigned int nr_args, ...)
837 {
838 struct mc_state *mcs = &mc_state[smp_processor_id()];
839 struct cpu_user_regs *regs;
840 unsigned int i;
841 va_list args;
843 va_start(args, nr_args);
845 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
846 {
847 __set_bit(_MCSF_call_preempted, &mcs->flags);
849 for ( i = 0; i < nr_args; i++ )
850 mcs->call.args[i] = va_arg(args, unsigned long);
851 }
852 else
853 {
854 regs = guest_cpu_user_regs();
855 #if defined(__i386__)
856 regs->eax = op;
857 regs->eip -= 2; /* re-execute 'int 0x82' */
859 for ( i = 0; i < nr_args; i++ )
860 {
861 switch ( i )
862 {
863 case 0: regs->ebx = va_arg(args, unsigned long); break;
864 case 1: regs->ecx = va_arg(args, unsigned long); break;
865 case 2: regs->edx = va_arg(args, unsigned long); break;
866 case 3: regs->esi = va_arg(args, unsigned long); break;
867 case 4: regs->edi = va_arg(args, unsigned long); break;
868 case 5: regs->ebp = va_arg(args, unsigned long); break;
869 }
870 }
871 #elif defined(__x86_64__)
872 regs->rax = op;
873 regs->rip -= 2; /* re-execute 'syscall' */
875 for ( i = 0; i < nr_args; i++ )
876 {
877 switch ( i )
878 {
879 case 0: regs->rdi = va_arg(args, unsigned long); break;
880 case 1: regs->rsi = va_arg(args, unsigned long); break;
881 case 2: regs->rdx = va_arg(args, unsigned long); break;
882 case 3: regs->r10 = va_arg(args, unsigned long); break;
883 case 4: regs->r8 = va_arg(args, unsigned long); break;
884 case 5: regs->r9 = va_arg(args, unsigned long); break;
885 }
886 }
887 #endif
888 }
890 va_end(args);
892 return op;
893 }
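/*
 * Illustrative use by a preemptible hypercall handler (the hypercall number,
 * argument names and hypercall_preempt_check() test here are a sketch, not a
 * quotation from this tree):
 *
 *     if ( hypercall_preempt_check() )
 *         return __hypercall_create_continuation(
 *             __HYPERVISOR_mmu_update, 3, ureqs, count, pdone);
 *
 * Because the guest program counter has been wound back over the hypercall
 * instruction above, the vcpu re-issues the same hypercall with the updated
 * arguments the next time it runs.
 */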
895 #ifdef CONFIG_VMX
896 static void vmx_relinquish_resources(struct exec_domain *ed)
897 {
898 if ( !VMX_DOMAIN(ed) )
899 return;
901 BUG_ON(ed->arch.arch_vmx.vmcs == NULL);
902 free_vmcs(ed->arch.arch_vmx.vmcs);
903 ed->arch.arch_vmx.vmcs = 0;
905 free_monitor_pagetable(ed);
906 rem_ac_timer(&ed->arch.arch_vmx.vmx_platform.vmx_pit.pit_timer);
907 }
908 #else
909 #define vmx_relinquish_resources(_ed) ((void)0)
910 #endif
912 static void relinquish_memory(struct domain *d, struct list_head *list)
913 {
914 struct list_head *ent;
915 struct pfn_info *page;
916 unsigned long x, y;
918 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
919 spin_lock_recursive(&d->page_alloc_lock);
921 ent = list->next;
922 while ( ent != list )
923 {
924 page = list_entry(ent, struct pfn_info, list);
926 /* Grab a reference to the page so it won't disappear from under us. */
927 if ( unlikely(!get_page(page, d)) )
928 {
929 /* Couldn't get a reference -- someone is freeing this page. */
930 ent = ent->next;
931 continue;
932 }
934 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
935 put_page_and_type(page);
937 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
938 put_page(page);
940 /*
941 * Forcibly invalidate base page tables at this point to break circular
942 * 'linear page table' references. This is okay because MMU structures
943 * are not shared across domains and this domain is now dead. Thus base
944 * tables are not in use so a non-zero count means circular reference.
945 */
946 y = page->u.inuse.type_info;
947 for ( ; ; )
948 {
949 x = y;
950 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
951 (PGT_base_page_table|PGT_validated)) )
952 break;
954 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
955 if ( likely(y == x) )
956 {
957 free_page_type(page, PGT_base_page_table);
958 break;
959 }
960 }
962 /* Follow the list chain and /then/ potentially free the page. */
963 ent = ent->next;
964 put_page(page);
965 }
967 spin_unlock_recursive(&d->page_alloc_lock);
968 }
970 void domain_relinquish_resources(struct domain *d)
971 {
972 struct exec_domain *ed;
974 BUG_ON(d->cpuset != 0);
976 physdev_destroy_state(d);
978 ptwr_destroy(d);
980 /* Release device mappings of other domains */
981 gnttab_release_dev_mappings(d->grant_table);
983 /* Drop the in-use references to page-table bases. */
984 for_each_exec_domain ( d, ed )
985 {
986 if ( pagetable_get_paddr(ed->arch.guest_table) != 0 )
987 {
988 if ( shadow_mode_refcounts(d) )
989 put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table)]);
990 else
991 put_page_and_type(&frame_table[pagetable_get_pfn(ed->arch.guest_table)]);
993 ed->arch.guest_table = mk_pagetable(0);
994 }
996 if ( pagetable_get_paddr(ed->arch.guest_table_user) != 0 )
997 {
998 if ( shadow_mode_refcounts(d) )
999 put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table_user)]);
1000 else
1001 put_page_and_type(&frame_table[pagetable_get_pfn(ed->arch.guest_table_user)]);
1003 ed->arch.guest_table_user = mk_pagetable(0);
1004 }
1006 vmx_relinquish_resources(ed);
1007 }
1009 shadow_mode_disable(d);
1011 /*
1012 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1013 * it automatically gets squashed when the guest's mappings go away.
1014 */
1015 for_each_exec_domain(d, ed)
1016 destroy_gdt(ed);
1018 /* Relinquish every page of memory. */
1019 relinquish_memory(d, &d->xenpage_list);
1020 relinquish_memory(d, &d->page_list);
1021 }
1024 /*
1025 * Local variables:
1026 * mode: C
1027 * c-set-style: "BSD"
1028 * c-basic-offset: 4
1029 * tab-width: 4
1030 * indent-tabs-mode: nil
1031 * End:
1032 */