ia64/xen-unstable

view xen/arch/x86/domain.c @ 3858:5b63436f25fe

bitkeeper revision 1.1205.1.2 (421527deX3t0INFwjrOweq0E7Le7pw)

Rename fields in arch_exec_domain to be more uniform.
Promote vmx_shadow_invlpg() to shadow_invlpg().
author maf46@burn.cl.cam.ac.uk
date Thu Feb 17 23:25:18 2005 +0000 (2005-02-17)
parents 917ea52007ea
children d6d5fbf0eee1
line source
1 /* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
2 /******************************************************************************
3 * arch/x86/domain.c
4 *
5 * x86-specific domain handling (e.g., register setup and context switching).
6 */
8 /*
9 * Copyright (C) 1995 Linus Torvalds
10 *
11 * Pentium III FXSR, SSE support
12 * Gareth Hughes <gareth@valinux.com>, May 2000
13 */
15 #include <xen/config.h>
16 #include <xen/init.h>
17 #include <xen/lib.h>
18 #include <xen/errno.h>
19 #include <xen/sched.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/softirq.h>
23 #include <asm/regs.h>
24 #include <asm/mc146818rtc.h>
25 #include <asm/system.h>
26 #include <asm/io.h>
27 #include <asm/processor.h>
28 #include <asm/desc.h>
29 #include <asm/i387.h>
30 #include <asm/mpspec.h>
31 #include <asm/ldt.h>
32 #include <xen/irq.h>
33 #include <xen/event.h>
34 #include <asm/shadow.h>
35 #include <xen/console.h>
36 #include <xen/elf.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/msr.h>
40 #include <xen/kernel.h>
41 #include <public/io/ioreq.h>
42 #include <xen/multicall.h>
44 /* opt_noreboot: If true, machine will need manual reset on error. */
45 static int opt_noreboot = 0;
46 boolean_param("noreboot", opt_noreboot);
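/*
 * Idle path: interrupts are disabled before checking for pending softirqs
 * so that safe_halt() (STI;HLT) re-enables them atomically with halting,
 * closing the window in which a wakeup interrupt could otherwise be missed.
 */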
48 static void default_idle(void)
49 {
50 __cli();
51 if ( !softirq_pending(smp_processor_id()) )
52 safe_halt();
53 else
54 __sti();
55 }
57 static __attribute_used__ void idle_loop(void)
58 {
59 int cpu = smp_processor_id();
60 for ( ; ; )
61 {
62 irq_stat[cpu].idle_timestamp = jiffies;
63 while ( !softirq_pending(cpu) )
64 default_idle();
65 do_softirq();
66 }
67 }
69 void startup_cpu_idle_loop(void)
70 {
71 /* Just some sanity to ensure that the scheduler is set up okay. */
72 ASSERT(current->domain->id == IDLE_DOMAIN_ID);
73 domain_unpause_by_systemcontroller(current->domain);
74 __enter_scheduler();
76 /*
77 * Declare CPU setup complete to the boot processor; the memory barrier
78 * ensures our state is visible before it proceeds.
79 */
80 smp_mb();
81 init_idle();
83 idle_loop();
84 }
86 static long no_idt[2];
87 static int reboot_mode;
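/*
 * Wait for the keyboard controller's input buffer (status port 0x64, bit 1)
 * to drain before poking it, bounded by a simple retry count.
 */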
89 static inline void kb_wait(void)
90 {
91 int i;
93 for ( i = 0; i < 0x10000; i++ )
94 if ( (inb_p(0x64) & 0x02) == 0 )
95 break;
96 }
98 void machine_restart(char * __unused)
99 {
100 int i;
102 if ( opt_noreboot )
103 {
104 printk("Reboot disabled on cmdline: require manual reset\n");
105 for ( ; ; )
106 safe_halt();
107 }
109 __sti();
111 /* Ensure we are the boot CPU. */
112 if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
113 {
114 smp_call_function((void *)machine_restart, NULL, 1, 0);
115 for ( ; ; )
116 safe_halt();
117 }
119 /*
120 * Stop all CPUs and turn off local APICs and the IO-APIC, so
121 * other OSs see a clean IRQ state.
122 */
123 smp_send_stop();
124 disable_IO_APIC();
126 #ifdef CONFIG_VMX
127 stop_vmx();
128 #endif
130 /* Write the BIOS reboot-mode flag at physical address 0x472 (in the page at absolute address 0). */
131 *((unsigned short *)__va(0x472)) = reboot_mode;
133 for ( ; ; )
134 {
135 /* Pulse the keyboard reset line. */
136 for ( i = 0; i < 100; i++ )
137 {
138 kb_wait();
139 udelay(50);
140 outb(0xfe,0x64); /* pulse reset low */
141 udelay(50);
142 }
144 /* That didn't work - force a triple fault by loading an empty IDT and raising int3. */
145 __asm__ __volatile__("lidt %0": : "m" (no_idt));
146 __asm__ __volatile__("int3");
147 }
148 }
151 void __attribute__((noreturn)) __machine_halt(void *unused)
152 {
153 for ( ; ; )
154 safe_halt();
155 }
157 void machine_halt(void)
158 {
159 watchdog_on = 0;
160 smp_call_function(__machine_halt, NULL, 1, 0);
161 __machine_halt(NULL);
162 }
164 void dump_pageframe_info(struct domain *d)
165 {
166 struct pfn_info *page;
168 if ( d->tot_pages < 10 )
169 {
170 list_for_each_entry ( page, &d->page_list, list )
171 {
172 printk("Page %08x: caf=%08x, taf=%08x\n",
173 page_to_phys(page), page->count_info,
174 page->u.inuse.type_info);
175 }
176 }
178 page = virt_to_page(d->shared_info);
179 printk("Shared_info@%08x: caf=%08x, taf=%08x\n",
180 page_to_phys(page), page->count_info,
181 page->u.inuse.type_info);
182 }
184 struct domain *arch_alloc_domain_struct(void)
185 {
186 return xmalloc(struct domain);
187 }
189 void arch_free_domain_struct(struct domain *d)
190 {
191 xfree(d);
192 }
194 struct exec_domain *arch_alloc_exec_domain_struct(void)
195 {
196 return xmalloc(struct exec_domain);
197 }
199 void arch_free_exec_domain_struct(struct exec_domain *ed)
200 {
201 xfree(ed);
202 }
204 void free_perdomain_pt(struct domain *d)
205 {
206 free_xenheap_page((unsigned long)d->arch.mm_perdomain_pt);
207 #ifdef __x86_64__
208 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l2);
209 free_xenheap_page((unsigned long)d->arch.mm_perdomain_l3);
210 #endif
211 }
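/*
 * schedule_tail hooks, invoked at the tail of the scheduler's context-switch
 * path: each discards the current stack frame and jumps straight to the
 * appropriate entry point (the idle loop, or the return-from-interrupt path
 * for ordinary guests).
 */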
213 static void continue_idle_task(struct exec_domain *ed)
214 {
215 reset_stack_and_jump(idle_loop);
216 }
218 static void continue_nonidle_task(struct exec_domain *ed)
219 {
220 reset_stack_and_jump(ret_from_intr);
221 }
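/*
 * Architecture-specific domain creation. For non-idle domains this allocates
 * and zeroes the shared_info page and the per-domain page table (plus the
 * L2/L3 wrappers on x86_64), shares shared_info with the guest, and marks
 * the Xen-heap frames as having no machine-to-physical mapping.
 */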
223 void arch_do_createdomain(struct exec_domain *ed)
224 {
225 struct domain *d = ed->domain;
227 SET_DEFAULT_FAST_TRAP(&ed->arch);
229 ed->arch.flags = TF_kernel_mode;
231 if ( d->id == IDLE_DOMAIN_ID )
232 {
233 ed->arch.schedule_tail = continue_idle_task;
234 }
235 else
236 {
237 ed->arch.schedule_tail = continue_nonidle_task;
239 d->shared_info = (void *)alloc_xenheap_page();
240 memset(d->shared_info, 0, PAGE_SIZE);
241 ed->vcpu_info = &d->shared_info->vcpu_data[ed->eid];
242 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
243 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
244 PAGE_SHIFT] = INVALID_M2P_ENTRY;
246 d->arch.mm_perdomain_pt = (l1_pgentry_t *)alloc_xenheap_page();
247 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
248 machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >>
249 PAGE_SHIFT] = INVALID_M2P_ENTRY;
250 ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
252 #ifdef __x86_64__
253 d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page();
254 memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
255 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
256 mk_l2_pgentry(__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR);
257 d->arch.mm_perdomain_l3 = (l3_pgentry_t *)alloc_xenheap_page();
258 memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
259 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
260 mk_l3_pgentry(__pa(d->arch.mm_perdomain_l2) | __PAGE_HYPERVISOR);
261 #endif
263 shadow_lock_init(d);
264 }
265 }
267 void arch_do_boot_vcpu(struct exec_domain *ed)
268 {
269 struct domain *d = ed->domain;
270 ed->arch.schedule_tail = d->exec_domain[0]->arch.schedule_tail;
271 ed->arch.perdomain_ptes =
272 d->arch.mm_perdomain_pt + (ed->eid << PDPT_VCPU_SHIFT);
273 ed->arch.flags = TF_kernel_mode;
274 }
276 #ifdef CONFIG_VMX
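/*
 * schedule_tail hooks for VMX vcpus: reload the vcpu's VMCS and enter guest
 * context through the assembly resume/launch path.
 */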
277 void arch_vmx_do_resume(struct exec_domain *ed)
278 {
279 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
281 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
282 vmx_do_resume(ed);
283 reset_stack_and_jump(vmx_asm_do_resume);
284 }
286 void arch_vmx_do_launch(struct exec_domain *ed)
287 {
288 u64 vmcs_phys_ptr = (u64) virt_to_phys(ed->arch.arch_vmx.vmcs);
290 load_vmcs(&ed->arch.arch_vmx, vmcs_phys_ptr);
291 vmx_do_launch(ed);
292 reset_stack_and_jump(vmx_asm_do_launch);
293 }
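/*
 * Build the monitor L2 page table under which a VMX vcpu runs: copy the
 * hypervisor entries from idle_pg_table, install the per-domain mapping,
 * and copy the current phys_table entries into the per-domain page table.
 */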
295 static void alloc_monitor_pagetable(struct exec_domain *ed)
296 {
297 unsigned long mpfn;
298 l2_pgentry_t *mpl2e, *phys_table;
299 struct pfn_info *mpfn_info;
300 struct domain *d = ed->domain;
302 ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
304 mpfn_info = alloc_domheap_page(NULL);
305 ASSERT( mpfn_info );
307 mpfn = (unsigned long) (mpfn_info - frame_table);
308 mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << PAGE_SHIFT);
309 memset(mpl2e, 0, PAGE_SIZE);
311 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
312 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
313 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
315 ed->arch.monitor_table = mk_pagetable(mpfn << PAGE_SHIFT);
317 mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
318 mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK)
319 | __PAGE_HYPERVISOR);
321 phys_table = (l2_pgentry_t *)
322 map_domain_mem(pagetable_val(ed->arch.phys_table));
323 memcpy(d->arch.mm_perdomain_pt, phys_table,
324 L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
326 unmap_domain_mem(phys_table);
327 unmap_domain_mem(mpl2e);
328 }
330 /*
331 * Free the pages for monitor_table and hl2_table
332 */
333 static void free_monitor_pagetable(struct exec_domain *ed)
334 {
335 l2_pgentry_t *mpl2e;
336 unsigned long mpfn;
338 ASSERT( pagetable_val(ed->arch.monitor_table) );
340 mpl2e = (l2_pgentry_t *)
341 map_domain_mem(pagetable_val(ed->arch.monitor_table));
342 /*
343 * First get the pfn for hl2_table by looking at monitor_table
344 */
345 mpfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])
346 >> PAGE_SHIFT;
348 free_domheap_page(&frame_table[mpfn]);
349 unmap_domain_mem(mpl2e);
351 /*
352 * Then free monitor_table.
353 */
354 mpfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
355 free_domheap_page(&frame_table[mpfn]);
357 ed->arch.monitor_table = mk_pagetable(0);
358 }
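/*
 * Complete VMX-specific guest setup: allocate and construct the VMCS, point
 * schedule_tail at the VMX launch path and, for the first vcpu only, mask
 * all event channels except the I/O-request port and put the domain into
 * external shadow mode before allocating the monitor page table.
 */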
360 static int vmx_final_setup_guest(struct exec_domain *ed,
361 full_execution_context_t *full_context)
362 {
363 int error;
364 execution_context_t *context;
365 struct vmcs_struct *vmcs;
367 context = &full_context->cpu_ctxt;
369 /*
370 * Create a new VMCS
371 */
372 if (!(vmcs = alloc_vmcs())) {
373 printk("Failed to create a new VMCS\n");
374 return -ENOMEM;
375 }
377 memset(&ed->arch.arch_vmx, 0, sizeof (struct arch_vmx_struct));
379 ed->arch.arch_vmx.vmcs = vmcs;
380 error = construct_vmcs(
381 &ed->arch.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
382 if ( error < 0 )
383 {
384 printk("Failed to construct a new VMCS\n");
385 goto out;
386 }
388 ed->arch.schedule_tail = arch_vmx_do_launch;
389 clear_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state);
391 #if defined (__i386__)
392 ed->arch.arch_vmx.vmx_platform.real_mode_data =
393 (unsigned long *) context->esi;
394 #endif
396 if (ed == ed->domain->exec_domain[0]) {
397 /*
398 * This need only be done once per domain.
399 * XXX todo: add a separate function to do these.
400 */
401 memset(&ed->domain->shared_info->evtchn_mask[0], 0xff,
402 sizeof(ed->domain->shared_info->evtchn_mask));
403 clear_bit(IOPACKET_PORT, &ed->domain->shared_info->evtchn_mask[0]);
405 /* Put the domain in shadow mode even though we're going to be using
406 * the shared 1:1 page table initially. It shouldn't hurt. */
407 shadow_mode_enable(ed->domain, SHM_enable|SHM_translate|SHM_external);
408 }
410 /* We don't call update_pagetables() as we actively want fields such as
411 * the linear_pg_table to be NULL, so that we bail out early from
412 * shadow_fault() if the VMX guest attempts illegal accesses with
413 * paging turned off.
414 */
415 //update_pagetables(ed); /* this assigns shadow_pagetable */
416 alloc_monitor_pagetable(ed); /* this assigns monitor_pagetable */
418 return 0;
420 out:
421 free_vmcs(vmcs);
422 ed->arch.arch_vmx.vmcs = 0;
423 return error;
424 }
425 #endif
428 /* Install a guest execution context; called when building a domain and from do_boot_vcpu(). */
429 int arch_final_setup_guest(
430 struct exec_domain *d, full_execution_context_t *c)
431 {
432 unsigned long phys_basetab;
433 int i, rc;
435 clear_bit(EDF_DONEFPUINIT, &d->ed_flags);
436 if ( c->flags & ECF_I387_VALID )
437 set_bit(EDF_DONEFPUINIT, &d->ed_flags);
439 d->arch.flags &= ~TF_kernel_mode;
440 if ( c->flags & ECF_IN_KERNEL )
441 d->arch.flags |= TF_kernel_mode;
443 memcpy(&d->arch.user_ctxt,
444 &c->cpu_ctxt,
445 sizeof(d->arch.user_ctxt));
447 /* Clear IOPL for unprivileged domains. */
448 if (!IS_PRIV(d->domain))
449 d->arch.user_ctxt.eflags &= 0xffffcfff;
451 /*
452 * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
453 * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
454 * If SS RPL or DPL differs from CS RPL then we'll #GP.
455 */
456 if (!(c->flags & ECF_VMX_GUEST))
457 if ( ((d->arch.user_ctxt.cs & 3) == 0) ||
458 ((d->arch.user_ctxt.ss & 3) == 0) )
459 return -EINVAL;
461 memcpy(&d->arch.i387,
462 &c->fpu_ctxt,
463 sizeof(d->arch.i387));
465 memcpy(d->arch.traps,
466 &c->trap_ctxt,
467 sizeof(d->arch.traps));
469 if ( (rc = (int)set_fast_trap(d, c->fast_trap_idx)) != 0 )
470 return rc;
472 d->arch.ldt_base = c->ldt_base;
473 d->arch.ldt_ents = c->ldt_ents;
475 d->arch.kernel_ss = c->kernel_ss;
476 d->arch.kernel_sp = c->kernel_esp;
478 for ( i = 0; i < 8; i++ )
479 (void)set_debugreg(d, i, c->debugreg[i]);
481 d->arch.event_selector = c->event_callback_cs;
482 d->arch.event_address = c->event_callback_eip;
483 d->arch.failsafe_selector = c->failsafe_callback_cs;
484 d->arch.failsafe_address = c->failsafe_callback_eip;
486 phys_basetab = c->pt_base;
487 d->arch.guest_table = d->arch.phys_table = mk_pagetable(phys_basetab);
489 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain,
490 PGT_base_page_table) )
491 return -EINVAL;
493 /* Failure to set GDT is harmless. */
494 SET_GDT_ENTRIES(d, DEFAULT_GDT_ENTRIES);
495 SET_GDT_ADDRESS(d, DEFAULT_GDT_ADDRESS);
496 if ( c->gdt_ents != 0 )
497 {
498 if ( (rc = (int)set_gdt(d, c->gdt_frames, c->gdt_ents)) != 0 )
499 {
500 put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
501 return rc;
502 }
503 }
505 #ifdef CONFIG_VMX
506 if (c->flags & ECF_VMX_GUEST)
507 return vmx_final_setup_guest(d, c);
508 #endif
510 update_pagetables(d);
512 return 0;
513 }
515 void new_thread(struct exec_domain *d,
516 unsigned long start_pc,
517 unsigned long start_stack,
518 unsigned long start_info)
519 {
520 execution_context_t *ec = &d->arch.user_ctxt;
522 /*
523 * Initial register values:
524 * DS,ES,FS,GS = FLAT_KERNEL_DS
525 * CS:EIP = FLAT_KERNEL_CS:start_pc
526 * SS:ESP = FLAT_KERNEL_SS:start_stack
527 * ESI = start_info
528 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
529 */
530 ec->ds = ec->es = ec->fs = ec->gs = FLAT_KERNEL_DS;
531 ec->ss = FLAT_KERNEL_SS;
532 ec->cs = FLAT_KERNEL_CS;
533 ec->eip = start_pc;
534 ec->esp = start_stack;
535 ec->esi = start_info;
537 __save_flags(ec->eflags);
538 ec->eflags |= X86_EFLAGS_IF;
539 }
542 #ifdef __x86_64__
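/*
 * Flip a 64-bit guest between kernel and user mode: toggle TF_kernel_mode,
 * swap the kernel/user GS bases with SWAPGS, and switch to the page tables
 * appropriate to the new mode.
 */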
544 void toggle_guest_mode(struct exec_domain *ed)
545 {
546 ed->arch.flags ^= TF_kernel_mode;
547 __asm__ __volatile__ ( "swapgs" );
548 update_pagetables(ed);
549 write_ptbase(ed);
550 }
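/*
 * Try to load a selector into the named segment register. Evaluates to 1 on
 * success; if the load faults, the fixup handler loads the NULL selector
 * instead and the expression evaluates to 0.
 */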
552 #define loadsegment(seg,value) ({ \
553 int __r = 1; \
554 __asm__ __volatile__ ( \
555 "1: movl %k1,%%" #seg "\n2:\n" \
556 ".section .fixup,\"ax\"\n" \
557 "3: xorl %k0,%k0\n" \
558 " movl %k0,%%" #seg "\n" \
559 " jmp 2b\n" \
560 ".previous\n" \
561 ".section __ex_table,\"a\"\n" \
562 " .align 8\n" \
563 " .quad 1b,3b\n" \
564 ".previous" \
565 : "=r" (__r) : "r" (value), "0" (__r) );\
566 __r; })
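/*
 * Save the outgoing vcpu's data-segment selectors and load the incoming
 * vcpu's selectors and FS/GS base MSRs. If any selector cannot be loaded,
 * build a failsafe-callback frame on the new guest's stack (ss, rsp, rflags,
 * cs, rip, gs, fs, es, ds, r11, rcx, from the top down) and redirect the
 * guest to its registered failsafe handler.
 */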
568 static void switch_segments(
569 struct xen_regs *regs, struct exec_domain *p, struct exec_domain *n)
570 {
571 int all_segs_okay = 1;
573 if ( !is_idle_task(p->domain) )
574 {
575 __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
576 __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
577 __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
578 __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
579 }
581 /* Either selector != 0 ==> reload. */
582 if ( unlikely(p->arch.user_ctxt.ds |
583 n->arch.user_ctxt.ds) )
584 all_segs_okay &= loadsegment(ds, n->arch.user_ctxt.ds);
586 /* Either selector != 0 ==> reload. */
587 if ( unlikely(p->arch.user_ctxt.es |
588 n->arch.user_ctxt.es) )
589 all_segs_okay &= loadsegment(es, n->arch.user_ctxt.es);
591 /*
592 * Either selector != 0 ==> reload.
593 * Also reload to reset FS_BASE if it was non-zero.
594 */
595 if ( unlikely(p->arch.user_ctxt.fs |
596 p->arch.user_ctxt.fs_base |
597 n->arch.user_ctxt.fs) )
598 {
599 all_segs_okay &= loadsegment(fs, n->arch.user_ctxt.fs);
600 if ( p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */
601 p->arch.user_ctxt.fs_base = 0;
602 }
604 /*
605 * Either selector != 0 ==> reload.
606 * Also reload to reset GS_BASE if it was non-zero.
607 */
608 if ( unlikely(p->arch.user_ctxt.gs |
609 p->arch.user_ctxt.gs_base_user |
610 n->arch.user_ctxt.gs) )
611 {
612 /* Reset GS_BASE with user %gs? */
613 if ( p->arch.user_ctxt.gs || !n->arch.user_ctxt.gs_base_user )
614 all_segs_okay &= loadsegment(gs, n->arch.user_ctxt.gs);
615 if ( p->arch.user_ctxt.gs ) /* != 0 selector kills gs_base_user */
616 p->arch.user_ctxt.gs_base_user = 0;
617 }
619 /* This can only be non-zero if selector is NULL. */
620 if ( n->arch.user_ctxt.fs_base )
621 wrmsr(MSR_FS_BASE,
622 n->arch.user_ctxt.fs_base,
623 n->arch.user_ctxt.fs_base>>32);
625 /* This can only be non-zero if selector is NULL. */
626 if ( n->arch.user_ctxt.gs_base_user )
627 wrmsr(MSR_GS_BASE,
628 n->arch.user_ctxt.gs_base_user,
629 n->arch.user_ctxt.gs_base_user>>32);
631 /* This can only be non-zero if selector is NULL. */
632 if ( p->arch.user_ctxt.gs_base_kernel |
633 n->arch.user_ctxt.gs_base_kernel )
634 wrmsr(MSR_SHADOW_GS_BASE,
635 n->arch.user_ctxt.gs_base_kernel,
636 n->arch.user_ctxt.gs_base_kernel>>32);
638 /* If in kernel mode then switch the GS bases around. */
639 if ( n->arch.flags & TF_kernel_mode )
640 __asm__ __volatile__ ( "swapgs" );
642 if ( unlikely(!all_segs_okay) )
643 {
644 unsigned long *rsp =
645 (n->arch.flags & TF_kernel_mode) ?
646 (unsigned long *)regs->rsp :
647 (unsigned long *)n->arch.kernel_sp;
649 if ( put_user(regs->ss, rsp- 1) |
650 put_user(regs->rsp, rsp- 2) |
651 put_user(regs->rflags, rsp- 3) |
652 put_user(regs->cs, rsp- 4) |
653 put_user(regs->rip, rsp- 5) |
654 put_user(regs->gs, rsp- 6) |
655 put_user(regs->fs, rsp- 7) |
656 put_user(regs->es, rsp- 8) |
657 put_user(regs->ds, rsp- 9) |
658 put_user(regs->r11, rsp-10) |
659 put_user(regs->rcx, rsp-11) )
660 {
661 DPRINTK("Error while creating failsafe callback frame.\n");
662 domain_crash();
663 }
665 if ( !(n->arch.flags & TF_kernel_mode) )
666 toggle_guest_mode(n);
668 regs->entry_vector = TRAP_syscall;
669 regs->rflags &= 0xFFFCBEFFUL;
670 regs->ss = __GUEST_SS;
671 regs->rsp = (unsigned long)(rsp-11);
672 regs->cs = __GUEST_CS;
673 regs->rip = n->arch.failsafe_address;
674 }
675 }
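/*
 * Return a 64-bit guest to user mode: read a 'struct switch_to_user' frame
 * from the guest kernel stack, toggle to the user-mode page tables and GS
 * base, and load the saved user-mode execution state. Fails if the guest
 * has no user page table installed.
 */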
677 long do_switch_to_user(void)
678 {
679 struct xen_regs *regs = get_execution_context();
680 struct switch_to_user stu;
681 struct exec_domain *ed = current;
683 if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
684 unlikely(pagetable_val(ed->arch.guest_table_user) == 0) )
685 return -EFAULT;
687 toggle_guest_mode(ed);
689 regs->rip = stu.rip;
690 regs->cs = stu.cs;
691 regs->rflags = stu.rflags;
692 regs->rsp = stu.rsp;
693 regs->ss = stu.ss;
695 if ( !(stu.flags & ECF_IN_SYSCALL) )
696 {
697 regs->entry_vector = 0;
698 regs->r11 = stu.r11;
699 regs->rcx = stu.rcx;
700 }
702 return regs->rax;
703 }
705 #elif defined(__i386__)
707 #define switch_segments(_r, _p, _n) ((void)0)
709 #endif
711 /*
712 * Load a debug register from the vcpu's saved debugreg[] array.
713 */
714 #define loaddebug(_ed,_reg) \
715 __asm__("mov %0,%%db" #_reg \
716 : /* no output */ \
717 :"r" ((_ed)->debugreg[_reg]))
719 void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
720 {
721 struct tss_struct *tss = init_tss + smp_processor_id();
722 execution_context_t *stack_ec = get_execution_context();
723 int i;
724 #ifdef CONFIG_VMX
725 unsigned long vmx_domain = next_p->arch.arch_vmx.flags;
726 #endif
728 __cli();
730 /* Switch guest general-register state. */
731 if ( !is_idle_task(prev_p->domain) )
732 {
733 memcpy(&prev_p->arch.user_ctxt,
734 stack_ec,
735 sizeof(*stack_ec));
736 unlazy_fpu(prev_p);
737 CLEAR_FAST_TRAP(&prev_p->arch);
738 }
740 if ( !is_idle_task(next_p->domain) )
741 {
742 memcpy(stack_ec,
743 &next_p->arch.user_ctxt,
744 sizeof(*stack_ec));
746 /* Maybe switch the debug registers. */
747 if ( unlikely(next_p->arch.debugreg[7]) )
748 {
749 loaddebug(&next_p->arch, 0);
750 loaddebug(&next_p->arch, 1);
751 loaddebug(&next_p->arch, 2);
752 loaddebug(&next_p->arch, 3);
753 /* DR4 and DR5 are reserved; skip them. */
754 loaddebug(&next_p->arch, 6);
755 loaddebug(&next_p->arch, 7);
756 }
758 #ifdef CONFIG_VMX
759 if ( vmx_domain )
760 {
761 /* Switch page tables. */
762 write_ptbase(next_p);
764 set_current(next_p);
765 /* Switch GDT and LDT. */
766 __asm__ __volatile__ ("lgdt %0" : : "m" (*next_p->arch.gdt));
768 __sti();
769 return;
770 }
771 #endif
773 SET_FAST_TRAP(&next_p->arch);
775 #ifdef __i386__
776 /* Switch the kernel ring-1 stack. */
777 tss->esp1 = next_p->arch.kernel_sp;
778 tss->ss1 = next_p->arch.kernel_ss;
779 #endif
781 /* Switch page tables. */
782 write_ptbase(next_p);
783 }
785 if ( unlikely(prev_p->arch.io_bitmap != NULL) )
786 {
787 for ( i = 0; i < sizeof(prev_p->arch.io_bitmap_sel) * 8; i++ )
788 if ( !test_bit(i, &prev_p->arch.io_bitmap_sel) )
789 memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
790 ~0U, IOBMP_BYTES_PER_SELBIT);
791 tss->bitmap = IOBMP_INVALID_OFFSET;
792 }
794 if ( unlikely(next_p->arch.io_bitmap != NULL) )
795 {
796 for ( i = 0; i < sizeof(next_p->arch.io_bitmap_sel) * 8; i++ )
797 if ( !test_bit(i, &next_p->arch.io_bitmap_sel) )
798 memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
799 &next_p->arch.io_bitmap[i * IOBMP_BYTES_PER_SELBIT],
800 IOBMP_BYTES_PER_SELBIT);
801 tss->bitmap = IOBMP_OFFSET;
802 }
804 set_current(next_p);
806 /* Switch GDT and LDT. */
807 __asm__ __volatile__ ("lgdt %0" : : "m" (*next_p->arch.gdt));
808 load_LDT(next_p);
810 __sti();
812 switch_segments(stack_ec, prev_p, next_p);
813 }
816 /* XXX Currently the 'domain' field is ignored! XXX */
817 long do_iopl(domid_t domain, unsigned int new_io_pl)
818 {
819 execution_context_t *ec = get_execution_context();
820 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
821 return 0;
822 }
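/*
 * Arrange for a preempted hypercall to be restarted. Within a multicall the
 * current entry is flagged as preempted and its arguments are stashed;
 * otherwise the guest program counter is wound back over the hypercall
 * instruction and the argument registers are refilled, so the same
 * hypercall re-executes when the vcpu next runs.
 */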
824 unsigned long __hypercall_create_continuation(
825 unsigned int op, unsigned int nr_args, ...)
826 {
827 struct mc_state *mcs = &mc_state[smp_processor_id()];
828 execution_context_t *ec;
829 unsigned int i;
830 va_list args;
832 va_start(args, nr_args);
834 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
835 {
836 __set_bit(_MCSF_call_preempted, &mcs->flags);
838 for ( i = 0; i < nr_args; i++ )
839 mcs->call.args[i] = va_arg(args, unsigned long);
840 }
841 else
842 {
843 ec = get_execution_context();
844 #if defined(__i386__)
845 ec->eax = op;
846 ec->eip -= 2; /* re-execute 'int 0x82' */
848 for ( i = 0; i < nr_args; i++ )
849 {
850 switch ( i )
851 {
852 case 0: ec->ebx = va_arg(args, unsigned long); break;
853 case 1: ec->ecx = va_arg(args, unsigned long); break;
854 case 2: ec->edx = va_arg(args, unsigned long); break;
855 case 3: ec->esi = va_arg(args, unsigned long); break;
856 case 4: ec->edi = va_arg(args, unsigned long); break;
857 case 5: ec->ebp = va_arg(args, unsigned long); break;
858 }
859 }
860 #elif defined(__x86_64__)
861 ec->rax = op;
862 ec->rip -= 2; /* re-execute 'syscall' */
864 for ( i = 0; i < nr_args; i++ )
865 {
866 switch ( i )
867 {
868 case 0: ec->rdi = va_arg(args, unsigned long); break;
869 case 1: ec->rsi = va_arg(args, unsigned long); break;
870 case 2: ec->rdx = va_arg(args, unsigned long); break;
871 case 3: ec->r10 = va_arg(args, unsigned long); break;
872 case 4: ec->r8 = va_arg(args, unsigned long); break;
873 case 5: ec->r9 = va_arg(args, unsigned long); break;
874 }
875 }
876 #endif
877 }
879 va_end(args);
881 return op;
882 }
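/*
 * Drop the references held on every page in the given list as part of
 * domain teardown; see the inline comments below for the handling of pinned
 * pages and circular 'linear page table' references.
 */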
884 static void relinquish_list(struct domain *d, struct list_head *list)
885 {
886 struct list_head *ent;
887 struct pfn_info *page;
888 unsigned long x, y;
890 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
891 spin_lock_recursive(&d->page_alloc_lock);
893 ent = list->next;
894 while ( ent != list )
895 {
896 page = list_entry(ent, struct pfn_info, list);
898 /* Grab a reference to the page so it won't disappear from under us. */
899 if ( unlikely(!get_page(page, d)) )
900 {
901 /* Couldn't get a reference -- someone is freeing this page. */
902 ent = ent->next;
903 continue;
904 }
906 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
907 put_page_and_type(page);
909 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
910 put_page(page);
912 /*
913 * Forcibly invalidate base page tables at this point to break circular
914 * 'linear page table' references. This is okay because MMU structures
915 * are not shared across domains and this domain is now dead. Thus base
916 * tables are not in use so a non-zero count means circular reference.
917 */
918 y = page->u.inuse.type_info;
919 for ( ; ; )
920 {
921 x = y;
922 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
923 (PGT_base_page_table|PGT_validated)) )
924 break;
926 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
927 if ( likely(y == x) )
928 {
929 free_page_type(page, PGT_base_page_table);
930 break;
931 }
932 }
934 /* Follow the list chain and /then/ potentially free the page. */
935 ent = ent->next;
936 put_page(page);
937 }
939 spin_unlock_recursive(&d->page_alloc_lock);
940 }
942 #ifdef CONFIG_VMX
943 static void vmx_domain_relinquish_memory(struct exec_domain *ed)
944 {
945 struct vmx_virpit_t *vpit = &(ed->arch.arch_vmx.vmx_platform.vmx_pit);
946 /*
947 * Free VMCS
948 */
949 ASSERT(ed->arch.arch_vmx.vmcs);
950 free_vmcs(ed->arch.arch_vmx.vmcs);
951 ed->arch.arch_vmx.vmcs = 0;
953 free_monitor_pagetable(ed);
954 rem_ac_timer(&(vpit->pit_timer));
955 }
956 #endif
958 void domain_relinquish_memory(struct domain *d)
959 {
960 struct exec_domain *ed;
962 /* Ensure that no CPU is still running on the dead domain's page tables. */
963 synchronise_pagetables(~0UL);
965 /* Exit shadow mode before deconstructing final guest page table. */
966 shadow_mode_disable(d);
968 /* Drop the in-use references to page-table bases. */
969 for_each_exec_domain ( d, ed )
970 {
971 if ( pagetable_val(ed->arch.guest_table) != 0 )
972 {
973 put_page_and_type(
974 &frame_table[pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT]);
975 ed->arch.guest_table = mk_pagetable(0);
976 }
978 if ( pagetable_val(ed->arch.guest_table_user) != 0 )
979 {
980 put_page_and_type(
981 &frame_table[pagetable_val(ed->arch.guest_table_user) >>
982 PAGE_SHIFT]);
983 ed->arch.guest_table_user = mk_pagetable(0);
984 }
985 }
987 #ifdef CONFIG_VMX
988 if ( VMX_DOMAIN(d->exec_domain[0]) )
989 for_each_exec_domain ( d, ed )
990 vmx_domain_relinquish_memory(ed);
991 #endif
993 /*
994 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
995 * it automatically gets squashed when the guest's mappings go away.
996 */
997 for_each_exec_domain(d, ed)
998 destroy_gdt(ed);
1000 /* Relinquish every page of memory. */
1001 relinquish_list(d, &d->xenpage_list);
1002 relinquish_list(d, &d->page_list);