
xen/arch/x86/domain.c @ 8609:85d693e6f61a

Arch-specific per-vcpu info should be initialised to zero
when allocating a new vcpu structure, not copied from
CPU0's idle VCPU, especially now that the idle VCPU itself
is dynamically allocated.

This should fix assertions people have been seeing in
getdomain_info_ctxt() relating to IOPL in eflags.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sat Jan 14 21:26:40 2006 +0100 (2006-01-14)
parents b7e88c83b2a0
children 1ccc28e075ba
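
As a quick illustration of the fix described in the commit message, here is a minimal, self-contained sketch contrasting copy-from-idle-VCPU initialisation with zero-initialisation of a freshly allocated vcpu structure. The demo_* names are hypothetical and not the Xen API; the real change lives in alloc_vcpu_struct() in the listing below.

    #include <stdlib.h>
    #include <string.h>

    /* Illustrative stand-ins only -- not the real Xen structures. */
    struct demo_arch_vcpu { unsigned int iopl; unsigned long flags; };
    struct demo_vcpu { int vcpu_id; struct demo_arch_vcpu arch; };

    struct demo_vcpu *alloc_vcpu_demo(const struct demo_vcpu *idle_vcpu0,
                                      int vcpu_id)
    {
        struct demo_vcpu *v = malloc(sizeof(*v));
        if ( v == NULL )
            return NULL;

        /* Old behaviour (the bug): inherit CPU0's idle VCPU state wholesale,
         * including arch-specific fields such as the virtualised IOPL:
         *     *v = *idle_vcpu0;
         * New behaviour: start from all-zero state and set only the fields a
         * fresh VCPU actually needs. */
        (void)idle_vcpu0;
        memset(v, 0, sizeof(*v));
        v->vcpu_id = vcpu_id;
        return v;
    }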
line source
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/msr.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);

struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);

    reset_stack_and_jump(idle_loop);
}

static long no_idt[2];
static int reboot_mode;

static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

    stop_vmx();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe,0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}

void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk(" DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), _p(page_to_pfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), _p(page_to_pfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    v->arch.flags = TF_kernel_mode;

    if ( is_idle_domain(d) )
        percpu_ctxt[vcpu_id].curr_vcpu = v;

    if ( (v->vcpu_id = vcpu_id) != 0 )
    {
        v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
        v->arch.perdomain_ptes =
            d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
    }

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    BUG_ON(v->next_in_list != NULL);
    if ( v->vcpu_id != 0 )
        v->domain->vcpu[v->vcpu_id - 1]->next_in_list = NULL;
    xfree(v);
}

void free_perdomain_pt(struct domain *d)
{
    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
}

int arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order, rc;
#ifdef __x86_64__
    int i;
#endif

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    if ( d->arch.mm_perdomain_pt == NULL )
        goto fail_nomem;

    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
    v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;

    v->arch.guest_vtable = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;

#if defined(__i386__)

    mapcache_init(d);

#else /* __x86_64__ */

    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    if ( (d->arch.mm_perdomain_l2 == NULL) ||
         (d->arch.mm_perdomain_l3 == NULL) )
        goto fail_nomem;

    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);

#endif /* __x86_64__ */

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);

    if ( !is_idle_domain(d) )
    {
        d->arch.ioport_caps =
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
        if ( d->arch.ioport_caps == NULL )
            goto fail_nomem;

        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
            goto fail_nomem;

        if ( (rc = ptwr_init(d)) != 0 )
            goto fail_nomem;

        memset(d->shared_info, 0, PAGE_SIZE);
        v->vcpu_info = &d->shared_info->vcpu_info[v->vcpu_id];
        SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    }

    v->arch.schedule_tail = is_idle_domain(d) ?
        continue_idle_domain : continue_nonidle_domain;

    return 0;

 fail_nomem:
    free_xenheap_page(d->shared_info);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
    free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
    return -ENOMEM;
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }
    else if ( !hvm_enabled )
        return -EINVAL;

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(pfn_to_page(phys_basetab>>PAGE_SHIFT), d) )
            return -EINVAL;
    }
    else if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( !get_page_and_type(pfn_to_page(phys_basetab>>PAGE_SHIFT), d,
                                PGT_base_page_table) )
            return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(pfn_to_page(phys_basetab>>PAGE_SHIFT));
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;
        v->arch.guest_table = mk_pagetable(0);

        vmx_final_setup_guest(v);
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}

void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     * DS,ES,FS,GS = FLAT_KERNEL_DS
     *      CS:EIP = FLAT_KERNEL_CS:start_pc
     *      SS:ESP = FLAT_KERNEL_SS:start_stack
     *         ESI = start_info
     * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}

#ifdef __x86_64__

#define loadsegment(seg,value) ({ \
    int __r = 1; \
    __asm__ __volatile__ ( \
        "1: movl %k1,%%" #seg "\n2:\n" \
        ".section .fixup,\"ax\"\n" \
        "3: xorl %k0,%k0\n" \
        " movl %k0,%%" #seg "\n" \
        " jmp 2b\n" \
        ".previous\n" \
        ".section __ex_table,\"a\"\n" \
        " .align 8\n" \
        " .quad 1b,3b\n" \
        ".previous" \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss, rsp- 1) |
             put_user(regs->rsp, rsp- 2) |
             put_user(regs->rflags, rsp- 3) |
             put_user(regs->cs, rsp- 4) |
             put_user(regs->rip, rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11, rsp-10) |
             put_user(regs->rcx, rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash(n->domain);
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "mov %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "mov %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "mov %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "mov %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1 = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        flush_tlb_mask(dirty_mask);
    }

    local_irq_disable();

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            vmx_load_msrs(next);
        }
    }

    context_saved(prev);

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}

unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;
        regs->eip -= 2; /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2; /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8 = va_arg(args, unsigned long); break;
            case 5: regs->r9 = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    ptwr_destroy(d);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */