ia64/xen-unstable

view xen/arch/x86/domain.c @ 8964:8946b6dcd49e

Fix x86_64 Xen build.

event_callback_cs and failsafe_callback_cs are x86_32 only.

Signed-off-by: Ian Campbell <Ian.Campbell@XenSource.com>
author Ian.Campbell@xensource.com
date Wed Feb 22 17:26:39 2006 +0000 (2006-02-22)
parents 2d3124df8a0e
children 049929854f7f
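
The two fields are validated only in the x86_32 build; the corresponding guard,
as it appears in arch_set_info_guest below:

    #ifdef __i386__
        if ( !VALID_CODESEL(c->event_callback_cs) ||
             !VALID_CODESEL(c->failsafe_callback_cs) )
            return -EINVAL;
    #endif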
line source
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/msr.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);
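
/*
 * Per-CPU context-switch bookkeeping. curr_vcpu tracks which VCPU's state is
 * currently loaded on this physical CPU (it may lag behind 'current' because
 * switches to the idle VCPU are done lazily), and dirty_segment_mask records
 * which segment selectors and bases the outgoing VCPU left non-default, so
 * load_segments() can skip redundant selector loads and MSR writes.
 */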
struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);

    reset_stack_and_jump(idle_loop);
}

static long no_idt[2];
static int reboot_mode;
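
/*
 * Wait (with a bounded spin) for the keyboard controller's input buffer to
 * drain (status port 0x64, bit 1) before poking it with the reset command.
 */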
static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        machine_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();
    hvm_disable();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe,0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}


void dump_pageframe_info(struct domain *d)
{
    struct page_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk("    DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("    DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_maddr(page)), _p(page_to_mfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("    XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_maddr(page)), _p(page_to_mfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    v->arch.flags = TF_kernel_mode;

    if ( is_idle_domain(d) )
    {
        percpu_ctxt[vcpu_id].curr_vcpu = v;
        v->arch.schedule_tail = continue_idle_domain;
    }
    else
    {
        v->arch.schedule_tail = continue_nonidle_domain;
    }

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);

    v->arch.guest_vtable  = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;
#if defined(__x86_64__)
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;
#endif

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    xfree(v);
}

int arch_domain_create(struct domain *d)
{
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order, rc;
#ifdef __x86_64__
    int i;
#endif

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    if ( d->arch.mm_perdomain_pt == NULL )
        goto fail_nomem;
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;

#if defined(__i386__)

    mapcache_init(d);

#else /* __x86_64__ */
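
    /*
     * Build the x86_64 per-domain mapping hierarchy: an L2 page whose entries
     * cover the per-domain page-table pages allocated above, and an L3 page
     * with a single entry pointing at that L2, hooked in at
     * PERDOMAIN_VIRT_START.
     */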
    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    if ( (d->arch.mm_perdomain_l2 == NULL) ||
         (d->arch.mm_perdomain_l3 == NULL) )
        goto fail_nomem;

    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);

#endif /* __x86_64__ */

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);

    if ( !is_idle_domain(d) )
    {
        d->arch.ioport_caps =
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
        if ( d->arch.ioport_caps == NULL )
            goto fail_nomem;

        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
            goto fail_nomem;

        if ( (rc = ptwr_init(d)) != 0 )
            goto fail_nomem;

        memset(d->shared_info, 0, PAGE_SIZE);
        SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    }

    return 0;

 fail_nomem:
    free_xenheap_page(d->shared_info);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
    free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
    return -ENOMEM;
}

void arch_domain_destroy(struct domain *d)
{
    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif

    free_xenheap_page(d->shared_info);
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        if ( ((c->user_regs.ss & 3) == 0) ||
             !VALID_CODESEL(c->user_regs.cs) )
            return -EINVAL;
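
        /*
         * event_callback_cs and failsafe_callback_cs exist only in the
         * x86_32 guest context (see the changeset description above), so
         * validate them under __i386__ only.
         */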
#ifdef __i386__
        if ( !VALID_CODESEL(c->event_callback_cs) ||
             !VALID_CODESEL(c->failsafe_callback_cs) )
            return -EINVAL;
#endif

        for ( i = 0; i < 256; i++ )
            if ( !VALID_CODESEL(c->trap_ctxt[i].cs) )
                return -EINVAL;
    }
    else if ( !hvm_enabled )
        return -EINVAL;

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_HVM_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        hvm_modify_guest_state(v);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        phys_basetab = c->ctrlreg[3];
        phys_basetab =
            (gmfn_to_mfn(d, phys_basetab >> PAGE_SHIFT) << PAGE_SHIFT) |
            (phys_basetab & ~PAGE_MASK);

        v->arch.guest_table = mk_pagetable(phys_basetab);
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
        return rc;

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(mfn_to_page(phys_basetab>>PAGE_SHIFT), d) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }
    }
    else if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        if ( !get_page_and_type(mfn_to_page(phys_basetab>>PAGE_SHIFT), d,
                                PGT_base_page_table) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }
    }

    if ( c->flags & VGCF_HVM_GUEST )
    {
        v->arch.guest_table = mk_pagetable(0);

        if ( !hvm_initialize_guest_resources(v) )
            return -EINVAL;
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}


void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}


#ifdef __x86_64__
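
/*
 * Load a selector into a segment register, catching any fault via the
 * exception-fixup table. Evaluates to 1 on success; if the load faults, the
 * fixup path installs a null selector instead and the expression evaluates
 * to 0.
 */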
#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );
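
    /*
     * If any selector failed to load, bounce to the guest's failsafe
     * callback: force kernel mode, push a frame describing the interrupted
     * context (SS, RSP, RFLAGS, CS, RIP, the four data selectors, then R11
     * and RCX) onto the guest kernel stack, and redirect RIP to
     * failsafe_callback_eip.
     */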
    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss, rsp- 1) |
             put_user(regs->rsp, rsp- 2) |
             put_user(regs->rflags, rsp- 3) |
             put_user(regs->cs, rsp- 4) |
             put_user(regs->rip, rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11, rsp-10) |
             put_user(regs->rcx, rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash(n->domain);
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    regs->ds = read_segment_register(ds);
    regs->es = read_segment_register(es);
    regs->fs = read_segment_register(fs);
    regs->gs = read_segment_register(gs);

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1  = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
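
/*
 * The real context switch: save the outgoing VCPU's register frame, FPU and
 * segment/MSR state, load the incoming VCPU's state (debug registers and
 * kernel stack, or HVM MSRs), switch the page-table base and, if the VCPU id
 * changed, the per-VCPU GDT, updating the dirty cpumasks as we go. Called
 * with interrupts disabled.
 */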
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        if ( !HVM_DOMAIN(p) )
        {
            save_segments(p);
        }
        else
        {
            hvm_save_segments(p);
            hvm_load_msrs();
        }
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !HVM_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
        else
        {
            hvm_restore_msrs(n);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}
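

/*
 * Scheduler entry point. The switch is lazy: if the incoming VCPU is the
 * idle VCPU, or its state is already loaded on this CPU, __context_switch()
 * is skipped and only 'current' changes; the real state switch is deferred
 * until a non-idle VCPU is scheduled or __sync_lazy_execstate() forces it.
 */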
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        if ( !cpus_empty(next->vcpu_dirty_cpumask) )
            flush_tlb_mask(next->vcpu_dirty_cpumask);
    }

    local_irq_disable();

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( !HVM_DOMAIN(next) )
        {
            load_LDT(next);
            load_segments(next);
        }
    }

    context_saved(prev);

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}
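
/*
 * Flush any lazily-deferred context switch on this CPU: if the state still
 * loaded belongs to a VCPU other than 'current', perform the real switch
 * now. Returns non-zero if a switch was required.
 */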
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}
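
/*
 * Arrange for the current hypercall to be re-executed when the guest is next
 * scheduled: inside a multicall the entry is simply marked preempted with
 * its arguments refreshed; otherwise the guest's argument registers are
 * reloaded and the instruction pointer is wound back over the hypercall
 * instruction ('int 0x82' on x86_32, 'syscall' on x86_64).
 */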
unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;
        regs->eip -= 2;  /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    ptwr_destroy(d);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(mfn_to_page(pfn));
            put_page(mfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(mfn_to_page(pfn));
            put_page(mfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        if ( HVM_DOMAIN(v) )
            hvm_relinquish_guest_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}


/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */