ia64/xen-unstable: xen/arch/x86/domain.c @ 7556:6304291cf08b

Print out MFN in audit code, for debugging purposes.

Signed-off-by: Ewan Mellor <ewan@xensource.com>

author    emellor@leeni.uk.xensource.com
date      Sun Oct 30 13:24:45 2005 +0100
parents   e398a9797c4c
children  8ffba597c385

/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/vmx_vmcs.h>
#include <asm/msr.h>
#include <asm/physdev.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);
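
/*
 * Per-CPU context-switch state:
 *  - curr_vcpu: VCPU whose register state is currently loaded on this CPU
 *    (may lag behind 'current' while switching lazily).
 *  - context_not_finalised: set by context_switch(), cleared once
 *    context_switch_finalise() has reloaded segments, LDT and MSRs.
 *  - dirty_segment_mask: segments saved with non-null selectors, telling
 *    load_segments() which registers really need reloading.
 */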
struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int context_not_finalised;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];
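
/*
 * schedule_tail handlers: throw away the current hypervisor stack frame and
 * resume either in Xen's idle loop or on the return-from-interrupt path back
 * into the guest.
 */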
static void continue_idle_task(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_task(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_task(v->domain));
    percpu_ctxt[smp_processor_id()].curr_vcpu = v;
    cpu_set(smp_processor_id(), v->domain->cpumask);
    v->arch.schedule_tail = continue_idle_task;

    idle_loop();
}
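
/*
 * An all-zero IDT descriptor: once it is loaded, the next exception cannot
 * be delivered and escalates to a triple fault, which the platform turns
 * into a reset.
 */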
static long no_idt[2];
static int reboot_mode;
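
/*
 * Wait for the i8042 keyboard controller to drain its input buffer
 * (status port 0x64, bit 1) so the reset pulse below will be accepted.
 */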
static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

    stop_vmx();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe, 0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}

void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    if ( d->tot_pages < 10 )
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("Page %p: mfn=%x, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), page - frame_table,
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("XenPage %p: mfn=%x, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), page - frame_table,
               page->count_info, page->u.inuse.type_info);
    }

    page = virt_to_page(d->shared_info);
    printk("Shared_info@%p: mfn=%x, caf=%08x, taf=%" PRtype_info "\n",
           _p(page_to_phys(page)), page - frame_table,
           page->count_info, page->u.inuse.type_info);
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
    v->arch.flags = TF_kernel_mode;

    if ( (v->vcpu_id = vcpu_id) != 0 )
    {
        v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
        v->arch.perdomain_ptes =
            d->arch.mm_perdomain_pt + (vcpu_id << PDPT_VCPU_SHIFT);
    }

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    BUG_ON(v->next_in_list != NULL);
    if ( v->vcpu_id != 0 )
        v->domain->vcpu[v->vcpu_id - 1]->next_in_list = NULL;
    xfree(v);
}

void free_perdomain_pt(struct domain *d)
{
    free_xenheap_page(d->arch.mm_perdomain_pt);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
}

void arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;
    l1_pgentry_t gdt_l1e;
    int vcpuid;

    if ( is_idle_task(d) )
        return;

    v->arch.schedule_tail = continue_nonidle_task;

    d->shared_info = alloc_xenheap_page();
    memset(d->shared_info, 0, PAGE_SIZE);
    v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
    v->cpumap = CPUMAP_RUNANYWHERE;
    SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    set_pfn_from_mfn(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
                     INVALID_M2P_ENTRY);

    d->arch.mm_perdomain_pt = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
    set_pfn_from_mfn(virt_to_phys(d->arch.mm_perdomain_pt) >> PAGE_SHIFT,
                     INVALID_M2P_ENTRY);
    v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[
            (vcpuid << PDPT_VCPU_SHIFT) + FIRST_RESERVED_GDT_PAGE] = gdt_l1e;

    v->arch.guest_vtable = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;

#ifdef __x86_64__
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
        l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt),
                      __PAGE_HYPERVISOR);
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);
#endif

    (void)ptwr_init(d);

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
}

void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
{
    if ( v->processor == newcpu )
        return;

    set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    v->processor = newcpu;

    if ( VMX_DOMAIN(v) )
    {
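        /*
         * The VMCS is bound to the physical CPU on which it was last loaded;
         * VMCLEAR it so it can be reloaded elsewhere, and relaunch the VCPU
         * on its new CPU via arch_vmx_do_relaunch.
         */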
        __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
        v->arch.schedule_tail = arch_vmx_do_relaunch;
    }
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
            return -EINVAL;
    }
    else if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
                                PGT_base_page_table) )
            return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;

        vmx_final_setup_guest(v);
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}

void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *  CS:EIP      = FLAT_KERNEL_CS:start_pc
     *  SS:ESP      = FLAT_KERNEL_SS:start_stack
     *  ESI         = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}


#ifdef __x86_64__

void toggle_guest_mode(struct vcpu *v)
{
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_pagetables(v);
    write_ptbase(v);
}
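
/*
 * Load a selector into a segment register. If the load faults (e.g. the
 * selector is no longer valid in the new GDT/LDT), the exception fixup
 * zeroes the register instead and the macro evaluates to 0; on success it
 * evaluates to 1.
 */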
#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;
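
        /*
         * Build the failsafe callback frame on the guest kernel stack: an
         * IRET-style frame (SS, RSP, RFLAGS, CS, RIP), the four saved data
         * selectors, and %r11/%rcx, which the SYSRET return path clobbers.
         */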
        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash();
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "mov %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "mov %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "mov %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "mov %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}
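
/*
 * Return from guest kernel mode to guest user mode: pop a switch_to_user
 * frame from the guest kernel stack, switch to the user page tables via
 * toggle_guest_mode(), and restore the user-mode register state.
 */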
long do_switch_to_user(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct switch_to_user stu;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
         unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
        return -EFAULT;

    toggle_guest_mode(v);

    regs->rip    = stu.rip;
    regs->cs     = stu.cs | 3; /* force guest privilege */
    regs->rflags = stu.rflags;
    regs->rsp    = stu.rsp;
    regs->ss     = stu.ss | 3; /* force guest privilege */

    if ( !(stu.flags & VGCF_IN_SYSCALL) )
    {
        regs->entry_vector = 0;
        regs->r11 = stu.r11;
        regs->rcx = stu.rcx;
    }

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return stu.rax;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)
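
/*
 * Guest kernels run in ring 1 on i386: record the incoming VCPU's kernel
 * stack in the per-CPU TSS ring-1 fields.
 */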
static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1  = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu *n = current;

    if ( !is_idle_task(p->domain) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_task(n->domain) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
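        /*
         * lgdt takes a pseudo-descriptor: a 16-bit limit followed by the
         * linear base address. Point it at the new VCPU's virtual mapping
         * of the GDT.
         */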
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(!local_irq_is_enabled());

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
    {
        __context_switch();
        percpu_ctxt[cpu].context_not_finalised = 1;
    }
}

void context_switch_finalise(struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(local_irq_is_enabled());

    if ( percpu_ctxt[cpu].context_not_finalised )
    {
        percpu_ctxt[cpu].context_not_finalised = 0;

        BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            vmx_load_msrs(next);
        }
    }

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    unsigned int cpu = v->processor;

    if ( !cpu_isset(cpu, v->domain->cpumask) )
        return;

    if ( cpu == smp_processor_id() )
    {
        (void)__sync_lazy_execstate();
    }
    else
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        flush_tlb_mask(cpumask_of_cpu(cpu));
    }
}
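
/*
 * Arrange for the current hypercall to be re-issued: rewind the guest's
 * program counter over the hypercall instruction and re-stash the (possibly
 * updated) arguments in the registers the hypercall ABI expects. Inside a
 * multicall the arguments are instead saved back into the multicall entry.
 */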
unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;
        regs->eip -= 2;  /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->cpumask));

    physdev_destroy_state(d);

    ptwr_destroy(d);

    /* Release device mappings of other domains */
    gnttab_release_dev_mappings(d->grant_table);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */