ia64/xen-unstable

xen/arch/x86/domain.c @ 9024:d0b7281556f2

New VCPUOP_register_runstate_memory_area hypercall. Avoids
need for a hypercall in the guest timer interrupt handler.

Cleaned up stolen/blocked tick handling in Linux.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sat Feb 25 21:28:27 2006 +0100 (2006-02-25)
parents 407a50974a46
children ee8041b0ab86
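
As a rough guest-side illustration of the new hypercall (not part of the file below), a paravirtualised guest might register its per-VCPU runstate area once at boot and then read stolen/blocked time directly in its timer interrupt, with no hypercall. The wrapper name HYPERVISOR_vcpu_op and the exact layout of struct vcpu_register_runstate_memory_area should be taken from the public vcpu.h of this era; the sketch assumes the simple pointer form of the registration structure.

/*
 * Illustrative guest-side sketch only. Assumes struct vcpu_runstate_info
 * carries a time[4] array indexed by RUNSTATE_running, RUNSTATE_runnable,
 * RUNSTATE_blocked and RUNSTATE_offline, and that the registration
 * structure holds a plain guest-virtual pointer in 'addr'.
 */
static struct vcpu_runstate_info runstate;   /* per-VCPU in real code */

static void register_runstate_area(int cpu)
{
    struct vcpu_register_runstate_memory_area area;

    area.addr = &runstate;  /* Xen keeps this area up to date from now on */
    if ( HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                            cpu, &area) != 0 )
        printk("VCPU%d: runstate area registration failed\n", cpu);
}

/*
 * The guest timer interrupt can now account stolen time locally, e.g.
 *     stolen = runstate.time[RUNSTATE_runnable]
 *            + runstate.time[RUNSTATE_offline];
 * because context_switch() in the hypervisor file below copies the
 * current runstate into the registered area on every switch.
 */
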
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/msr.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);

struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);

    reset_stack_and_jump(idle_loop);
}

static long no_idt[2];
static int reboot_mode;

static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        machine_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();
    hvm_disable();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe,0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}

void dump_pageframe_info(struct domain *d)
{
    struct page_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk("    DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("    DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_maddr(page)), _p(page_to_mfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("    XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_maddr(page)), _p(page_to_mfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    v->arch.flags = TF_kernel_mode;

    if ( is_idle_domain(d) )
    {
        percpu_ctxt[vcpu_id].curr_vcpu = v;
        v->arch.schedule_tail = continue_idle_domain;
    }
    else
    {
        v->arch.schedule_tail = continue_nonidle_domain;
    }

    v->arch.perdomain_ptes =
        d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);

    v->arch.guest_vtable  = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;
#if defined(__x86_64__)
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;
#endif

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    xfree(v);
}

int arch_domain_create(struct domain *d)
{
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order, rc;
#ifdef __x86_64__
    int i;
#endif

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    if ( d->arch.mm_perdomain_pt == NULL )
        goto fail_nomem;
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;

#if defined(__i386__)

    mapcache_init(d);

#else /* __x86_64__ */

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    if ( (d->arch.mm_perdomain_l2 == NULL) ||
         (d->arch.mm_perdomain_l3 == NULL) )
        goto fail_nomem;

    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);

#endif /* __x86_64__ */

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);

    if ( !is_idle_domain(d) )
    {
        d->arch.ioport_caps =
            rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
        if ( d->arch.ioport_caps == NULL )
            goto fail_nomem;

        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
            goto fail_nomem;

        if ( (rc = ptwr_init(d)) != 0 )
            goto fail_nomem;

        memset(d->shared_info, 0, PAGE_SIZE);
        SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
    }

    return 0;

 fail_nomem:
    free_xenheap_page(d->shared_info);
#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
    free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
    return -ENOMEM;
}

void arch_domain_destroy(struct domain *d)
{
    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif

    free_xenheap_page(d->shared_info);
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab = INVALID_MFN;
    int i, rc;

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        fixup_guest_selector(c->user_regs.ss);
        fixup_guest_selector(c->kernel_ss);
        fixup_guest_selector(c->user_regs.cs);

#ifdef __i386__
        fixup_guest_selector(c->event_callback_cs);
        fixup_guest_selector(c->failsafe_callback_cs);
#endif

        for ( i = 0; i < 256; i++ )
            fixup_guest_selector(c->trap_ctxt[i].cs);
    }
    else if ( !hvm_enabled )
        return -EINVAL;

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_HVM_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));
    init_int80_direct_trap(v);

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        hvm_modify_guest_state(v);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    if ( !(c->flags & VGCF_HVM_GUEST) )
    {
        phys_basetab = c->ctrlreg[3];
        phys_basetab =
            (gmfn_to_mfn(d, phys_basetab >> PAGE_SHIFT) << PAGE_SHIFT) |
            (phys_basetab & ~PAGE_MASK);

        v->arch.guest_table = mk_pagetable(phys_basetab);
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
        return rc;

    if ( c->flags & VGCF_HVM_GUEST )
    {
        v->arch.guest_table = mk_pagetable(0);

        if ( !hvm_initialize_guest_resources(v) )
            return -EINVAL;
    }
    else if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(mfn_to_page(phys_basetab>>PAGE_SHIFT), d) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }
    }
    else
    {
        if ( !get_page_and_type(mfn_to_page(phys_basetab>>PAGE_SHIFT), d,
                                PGT_base_page_table) )
        {
            destroy_gdt(v);
            return -EINVAL;
        }
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}

void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *       CS:EIP = FLAT_KERNEL_CS:start_pc
     *       SS:ESP = FLAT_KERNEL_SS:start_stack
     *          ESI = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}

#ifdef __x86_64__

#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash(n->domain);
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs      *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    regs->ds = read_segment_register(ds);
    regs->es = read_segment_register(es);
    regs->fs = read_segment_register(fs);
    regs->gs = read_segment_register(gs);

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1  = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int          cpu = smp_processor_id();
    struct vcpu          *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu          *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        if ( !HVM_DOMAIN(p) )
        {
            save_segments(p);
        }
        else
        {
            hvm_save_segments(p);
            hvm_load_msrs();
        }
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !HVM_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
        else
        {
            hvm_restore_msrs(n);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        if ( !cpus_empty(next->vcpu_dirty_cpumask) )
            flush_tlb_mask(next->vcpu_dirty_cpumask);
    }

    local_irq_disable();

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( !HVM_DOMAIN(next) )
        {
            load_LDT(next);
            load_segments(next);
        }
    }

    context_saved(prev);

    /* Update per-VCPU guest runstate shared memory area (if registered). */
    if ( next->runstate_guest != NULL )
        __copy_to_user(next->runstate_guest, &next->runstate,
                       sizeof(next->runstate));

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}

unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax  = op;
        regs->eip -= 2;  /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax  = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct page_info *page;
    unsigned long     x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct page_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    ptwr_destroy(d);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(mfn_to_page(pfn));
            put_page(mfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(mfn_to_page(pfn));
            put_page(mfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        if ( HVM_DOMAIN(v) )
            hvm_relinquish_guest_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}

void arch_dump_domain_info(struct domain *d)
{
    if ( shadow_mode_enabled(d) )
    {
        printk("    shadow mode: ");
        if ( shadow_mode_refcounts(d) )
            printk("refcounts ");
        if ( shadow_mode_write_all(d) )
            printk("write_all ");
        if ( shadow_mode_log_dirty(d) )
            printk("log_dirty ");
        if ( shadow_mode_translate(d) )
            printk("translate ");
        if ( shadow_mode_external(d) )
            printk("external ");
        if ( shadow_mode_wr_pt_pte(d) )
            printk("wr_pt_pte ");
        printk("\n");
    }
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */