ia64/xen-unstable: view of xen/arch/x86/domain.c @ 8518:83eeb056f7c2

Handle migration of x86 VCPUs between physical CPUs.
If required, context_switch() must pull state off the
old CPU.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Sat Jan 07 17:52:43 2006 +0100 (2006-01-07)
parents 0aff653824db
children 0ba3b9d60da6
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/msr.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);
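
/*
 * Per-CPU context-switch state. 'curr_vcpu' is the VCPU whose register state
 * is actually loaded on this CPU; it may lag behind 'current' because
 * switches to the idle VCPU are performed lazily. 'dirty_segment_mask'
 * records which segment selectors/bases the previous VCPU left non-zero, so
 * load_segments() can skip unnecessary segment loads and MSR writes.
 */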
struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_domain(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_domain(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_domain(v->domain));
    percpu_ctxt[smp_processor_id()].curr_vcpu = v;
    cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
    cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
    v->arch.schedule_tail = continue_idle_domain;

    reset_stack_and_jump(idle_loop);
}
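
/*
 * A zero-limit IDT pseudo-descriptor: loading it and then raising an
 * exception leaves the CPU with no valid handler, forcing a triple fault
 * (and hence a reset) as the last-resort reboot path below.
 */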
static long no_idt[2];
static int reboot_mode;
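
/*
 * Spin until the keyboard controller's input buffer drains
 * (status port 0x64, bit 1), or give up after 0x10000 polls.
 */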
static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

    stop_vmx();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe, 0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}

void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( d->tot_pages >= 10 )
    {
        printk(" DomPage list too long to display\n");
    }
    else
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), _p(page_to_pfn(page)),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), _p(page_to_pfn(page)),
               page->count_info, page->u.inuse.type_info);
    }
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
    v->arch.flags = TF_kernel_mode;

    if ( (v->vcpu_id = vcpu_id) != 0 )
    {
        v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
        v->arch.perdomain_ptes =
            d->arch.mm_perdomain_pt + (vcpu_id << PDPT_VCPU_SHIFT);
    }

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    BUG_ON(v->next_in_list != NULL);
    if ( v->vcpu_id != 0 )
        v->domain->vcpu[v->vcpu_id - 1]->next_in_list = NULL;
    xfree(v);
}

void free_perdomain_pt(struct domain *d)
{
    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
}
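
/*
 * Architecture-specific domain creation: allocate the shared_info page and
 * the per-domain mapping table, and map the Xen GDT frames for every
 * possible VCPU (see the comment below on the NMI race). Idle domains need
 * none of this.
 */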
int arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order, rc;
#ifdef __x86_64__
    int i;
#endif

    if ( is_idle_domain(d) )
        return 0;

    d->arch.ioport_caps =
        rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
    if ( d->arch.ioport_caps == NULL )
        return -ENOMEM;

    if ( (d->shared_info = alloc_xenheap_page()) == NULL )
        return -ENOMEM;

    if ( (rc = ptwr_init(d)) != 0 )
    {
        free_xenheap_page(d->shared_info);
        return rc;
    }

    v->arch.schedule_tail = continue_nonidle_domain;

    memset(d->shared_info, 0, PAGE_SIZE);
    v->vcpu_info = &d->shared_info->vcpu_info[v->vcpu_id];
    SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
    v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[
            (vcpuid << PDPT_VCPU_SHIFT) + FIRST_RESERVED_GDT_PAGE] = gdt_l1e;

    v->arch.guest_vtable = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;

#ifdef __x86_64__
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);
#endif

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);

    return 0;
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }
    else if ( !hvm_enabled )
        return -EINVAL;

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(pfn_to_page(phys_basetab>>PAGE_SHIFT), d) )
            return -EINVAL;
    }
    else if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( !get_page_and_type(pfn_to_page(phys_basetab>>PAGE_SHIFT), d,
                                PGT_base_page_table) )
            return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(pfn_to_page(phys_basetab>>PAGE_SHIFT));
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;
        v->arch.guest_table = mk_pagetable(0);

        vmx_final_setup_guest(v);
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}

void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *  CS:EIP      = FLAT_KERNEL_CS:start_pc
     *  SS:ESP      = FLAT_KERNEL_SS:start_stack
     *  ESI         = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}

#ifdef __x86_64__
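
/*
 * Flip a 64-bit PV guest between kernel and user mode: toggle
 * TF_kernel_mode, swap the kernel/user GS bases, and switch to the
 * corresponding page tables.
 */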
void toggle_guest_mode(struct vcpu *v)
{
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_pagetables(v);
    write_ptbase(v);
}
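
/*
 * Load 'value' into segment register 'seg'. Evaluates to 1 on success, or
 * 0 if the load faulted (in which case the fixup zeroes the register).
 */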
#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );
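
    /*
     * If any selector failed to load, build an exception-style frame on the
     * guest kernel stack and re-enter the guest at its failsafe callback so
     * it can recover.
     */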
    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash(n->domain);
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "mov %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "mov %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "mov %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "mov %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}
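
/*
 * The switch_to_user hypercall (64-bit PV guests): restore the
 * switch_to_user frame that the guest kernel pushed on its stack and return
 * to guest-user context via toggle_guest_mode().
 */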
long do_switch_to_user(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct switch_to_user stu;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
         unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
        return -EFAULT;

    toggle_guest_mode(v);

    regs->rip = stu.rip;
    regs->cs = stu.cs | 3; /* force guest privilege */
    regs->rflags = (stu.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
    regs->rsp = stu.rsp;
    regs->ss = stu.ss | 3; /* force guest privilege */

    if ( !(stu.flags & VGCF_IN_SYSCALL) )
    {
        regs->entry_vector = 0;
        regs->r11 = stu.r11;
        regs->rcx = stu.rcx;
    }

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return stu.rax;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1  = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
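
/*
 * Perform the real context switch from the per-CPU 'curr_vcpu' to 'current':
 * save and restore register state, FPU, segments, debug registers, page
 * tables and, if necessary, the GDT. Must be called with interrupts disabled.
 */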
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu *n = current;

    if ( !is_idle_domain(p->domain) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_domain(n->domain) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}
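
/*
 * context_switch() defers the heavyweight state switch when switching to the
 * idle VCPU (lazy switching). If 'next' last ran on a different physical CPU
 * its state may still be dirty there; sync_vcpu_execstate() pulls it back
 * (via the flush IPI) before we load it here.
 */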
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(local_irq_is_enabled());

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu != next) &&
         !is_idle_domain(next->domain) )
    {
        /* This may happen if next has been migrated by the scheduler. */
        if ( unlikely(!cpus_empty(next->vcpu_dirty_cpumask)) )
        {
            ASSERT(!cpu_isset(cpu, next->vcpu_dirty_cpumask));
            sync_vcpu_execstate(next);
            ASSERT(cpus_empty(next->vcpu_dirty_cpumask));
        }

        local_irq_disable();
        __context_switch();
        local_irq_enable();

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            vmx_load_msrs(next);
        }
    }

    context_saved(prev);

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}
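
/*
 * Complete any context switch that was deferred lazily on this CPU.
 * Returns non-zero if a real switch was required.
 */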
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}
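
/*
 * Ensure v's register state has been saved back to memory, wherever it is
 * dirty: locally via __sync_lazy_execstate(), or remotely via the flush IPI.
 */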
void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}
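
/*
 * Arrange for the current hypercall to be restarted: either re-queue the
 * preempted multicall entry, or rewind the guest program counter over the
 * hypercall instruction and reload the argument registers.
 */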
unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;
        regs->eip -= 2;  /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2;  /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info  *page;
    unsigned long     x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->domain_dirty_cpumask));

    ptwr_destroy(d);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */