ia64/xen-unstable

view xen/arch/x86/domain.c @ 7609:ae2e13795c63

>I'm hitting an ASSERT in alloc_monitor_pagetable() with debug=y and I
>can reproduce the condition when debug isn't on. The monitor_table is
>not zeroed when entering alloc_monitor_pagetable() - this happens when I
>create a vmx guest. I'm not sure why this is happening at first glance.
>File: xen/arch/x86/shadow32.c
>Line: 689
>System: VT/VMX FC3 x86_32 (I can reproduce on FC4 x86_64 too)
>ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
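For context, the assertion sits at the top of alloc_monitor_pagetable(); a minimal
sketch of that entry point follows. Only the ASSERT line is quoted from shadow32.c;
the surrounding declarations and comments are assumptions for illustration:

    /* xen/arch/x86/shadow32.c (sketch; only the ASSERT is verbatim) */
    static void alloc_monitor_pagetable(struct vcpu *v)
    {
        /* A vcpu handed to us here must not already own a monitor table;
         * a stale, non-zero monitor_table trips this check. */
        ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);

        /* ... allocate and install the monitor page table ... */
    }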

Solution:
The monitor page table is not zeroed out during the initial setup
of a VMX guest. As a result, when we later enable shadow mode for
that guest, we trip the ASSERT above. The fix is to initialize the
monitor page table to 0 before calling vmx_final_setup_guest()
and enabling shadow mode.
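The change lands in arch_set_info_guest(), shown in full in the listing below;
this is the relevant hunk, reproduced here with the comment expanded (the expanded
wording is editorial, the code itself is as in the file):

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;

        /* Initialize the monitor page table to zero, so the later ASSERT
           in alloc_monitor_pagetable() sees a clean vcpu. */
        v->arch.monitor_table = mk_pagetable(0);

        vmx_final_setup_guest(v);
    }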
author kaf24@firebug.cl.cam.ac.uk
date Tue Nov 01 19:13:06 2005 +0100 (2005-11-01)
parents 4e6c0734c409
children de41f5e24cf1
line source
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/vmx_vmcs.h>
#include <asm/msr.h>
#include <asm/physdev.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);

struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int context_not_finalised;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_task(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_task(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_task(v->domain));
    percpu_ctxt[smp_processor_id()].curr_vcpu = v;
    cpu_set(smp_processor_id(), v->domain->cpumask);
    v->arch.schedule_tail = continue_idle_task;

    idle_loop();
}

static long no_idt[2];
static int reboot_mode;

static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

    stop_vmx();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe,0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}

void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    if ( d->tot_pages < 10 )
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("Page %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), _p(page - frame_table),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), _p(page - frame_table),
               page->count_info, page->u.inuse.type_info);
    }

    page = virt_to_page(d->shared_info);
    printk("Shared_info@%p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
           _p(page_to_phys(page)), _p(page - frame_table), page->count_info,
           page->u.inuse.type_info);
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
    v->arch.flags = TF_kernel_mode;

    if ( (v->vcpu_id = vcpu_id) != 0 )
    {
        v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
        v->arch.perdomain_ptes =
            d->arch.mm_perdomain_pt + (vcpu_id << PDPT_VCPU_SHIFT);
    }

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    BUG_ON(v->next_in_list != NULL);
    if ( v->vcpu_id != 0 )
        v->domain->vcpu[v->vcpu_id - 1]->next_in_list = NULL;
    xfree(v);
}

void free_perdomain_pt(struct domain *d)
{
    free_xenheap_page(d->arch.mm_perdomain_pt);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
}

void arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;
    l1_pgentry_t gdt_l1e;
    int vcpuid;

    if ( is_idle_task(d) )
        return;

    v->arch.schedule_tail = continue_nonidle_task;

    d->shared_info = alloc_xenheap_page();
    memset(d->shared_info, 0, PAGE_SIZE);
    v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
    v->cpumap = CPUMAP_RUNANYWHERE;
    SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    set_pfn_from_mfn(virt_to_phys(d->shared_info) >> PAGE_SHIFT,
                     INVALID_M2P_ENTRY);

    d->arch.mm_perdomain_pt = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE);
    set_pfn_from_mfn(virt_to_phys(d->arch.mm_perdomain_pt) >> PAGE_SHIFT,
                     INVALID_M2P_ENTRY);
    v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[
            (vcpuid << PDPT_VCPU_SHIFT) + FIRST_RESERVED_GDT_PAGE] = gdt_l1e;

    v->arch.guest_vtable = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;

#ifdef __x86_64__
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)] =
        l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt),
                      __PAGE_HYPERVISOR);
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);
#endif

    (void)ptwr_init(d);

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
}

void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
{
    if ( v->processor == newcpu )
        return;

    set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    v->processor = newcpu;

    if ( VMX_DOMAIN(v) )
    {
        __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
        v->arch.schedule_tail = arch_vmx_do_relaunch;
    }
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
            return -EINVAL;
    }
    else if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
                                PGT_base_page_table) )
            return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;

        /* Initialize monitor page table */
        v->arch.monitor_table = mk_pagetable(0);

        vmx_final_setup_guest(v);
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}

void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}

#ifdef __x86_64__

void toggle_guest_mode(struct vcpu *v)
{
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_pagetables(v);
    write_ptbase(v);
}

#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        " movl %k0,%%" #seg "\n"                \
        " jmp 2b\n"                             \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        " .align 8\n"                           \
        " .quad 1b,3b\n"                        \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash();
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "mov %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "mov %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "mov %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "mov %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}

long do_switch_to_user(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct switch_to_user stu;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
         unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
        return -EFAULT;

    toggle_guest_mode(v);

    regs->rip = stu.rip;
    regs->cs = stu.cs | 3; /* force guest privilege */
    regs->rflags = stu.rflags;
    regs->rsp = stu.rsp;
    regs->ss = stu.ss | 3; /* force guest privilege */

    if ( !(stu.flags & VGCF_IN_SYSCALL) )
    {
        regs->entry_vector = 0;
        regs->r11 = stu.r11;
        regs->rcx = stu.rcx;
    }

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return stu.rax;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1 = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu *n = current;

    if ( !is_idle_task(p->domain) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_task(n->domain) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(!local_irq_is_enabled());

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
    {
        __context_switch();
        percpu_ctxt[cpu].context_not_finalised = 1;
    }
}

void context_switch_finalise(struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(local_irq_is_enabled());

    if ( percpu_ctxt[cpu].context_not_finalised )
    {
        percpu_ctxt[cpu].context_not_finalised = 0;

        BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            vmx_load_msrs(next);
        }
    }

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    unsigned int cpu = v->processor;

    if ( !cpu_isset(cpu, v->domain->cpumask) )
        return;

    if ( cpu == smp_processor_id() )
    {
        (void)__sync_lazy_execstate();
    }
    else
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        flush_tlb_mask(cpumask_of_cpu(cpu));
    }
}

unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;
        regs->eip -= 2; /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2; /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8 = va_arg(args, unsigned long); break;
            case 5: regs->r9 = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->cpumask));

    physdev_destroy_state(d);

    ptwr_destroy(d);

    /* Release device mappings of other domains */
    gnttab_release_dev_mappings(d->grant_table);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */