ia64/xen-unstable

view xen/arch/x86/domain.c @ 7886:51f91ef6c3b5

Fix x86-64 build.

Signed-off-by: James Bulpin <james@xensource.com>
author jrb44@plym.cl.cam.ac.uk
date Thu Nov 17 13:32:05 2005 +0100 (2005-11-17)
parents c640c0c7f821
children dede6fb4c90e
/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
#include <asm/vmx.h>
#include <asm/vmx_vmcs.h>
#include <asm/msr.h>
#include <asm/physdev.h>
#include <xen/kernel.h>
#include <xen/multicall.h>

/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
boolean_param("noreboot", opt_noreboot);

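/*
 * Per-physical-CPU bookkeeping for lazy context switching. curr_vcpu is the
 * VCPU whose register state is currently held by this CPU (it may differ
 * from 'current' while a lazy switch is outstanding); context_not_finalised
 * marks a switch whose segment/LDT/MSR loading has been deferred to
 * context_switch_finalise(); dirty_segment_mask records which selectors and
 * FS/GS bases the outgoing guest left non-zero, letting load_segments()
 * skip reloads that are not needed.
 */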
struct percpu_ctxt {
    struct vcpu *curr_vcpu;
    unsigned int context_not_finalised;
    unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];

static void continue_idle_task(struct vcpu *v)
{
    reset_stack_and_jump(idle_loop);
}

static void continue_nonidle_task(struct vcpu *v)
{
    reset_stack_and_jump(ret_from_intr);
}

static void default_idle(void)
{
    local_irq_disable();
    if ( !softirq_pending(smp_processor_id()) )
        safe_halt();
    else
        local_irq_enable();
}

void idle_loop(void)
{
    int cpu = smp_processor_id();

    for ( ; ; )
    {
        irq_stat[cpu].idle_timestamp = jiffies;

        while ( !softirq_pending(cpu) )
        {
            page_scrub_schedule_work();
            default_idle();
        }

        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_task(v->domain));
    percpu_ctxt[smp_processor_id()].curr_vcpu = v;
    cpu_set(smp_processor_id(), v->domain->cpumask);
    v->arch.schedule_tail = continue_idle_task;

    idle_loop();
}

static long no_idt[2];
static int reboot_mode;

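/*
 * Poll the i8042 keyboard controller until its input buffer drains (status
 * port 0x64, bit 1 clear) or we give up, so that the reset pulse issued by
 * machine_restart() below is actually accepted.
 */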
static inline void kb_wait(void)
{
    int i;

    for ( i = 0; i < 0x10000; i++ )
        if ( (inb_p(0x64) & 0x02) == 0 )
            break;
}

void machine_restart(char * __unused)
{
    int i;

    if ( opt_noreboot )
    {
        printk("Reboot disabled on cmdline: require manual reset\n");
        for ( ; ; )
            safe_halt();
    }

    watchdog_disable();
    console_start_sync();

    local_irq_enable();

    /* Ensure we are the boot CPU. */
    if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    {
        smp_call_function((void *)machine_restart, NULL, 1, 0);
        for ( ; ; )
            safe_halt();
    }

    /*
     * Stop all CPUs and turn off local APICs and the IO-APIC, so
     * other OSs see a clean IRQ state.
     */
    smp_send_stop();
    disable_IO_APIC();

    stop_vmx();

    /* Rebooting needs to touch the page at absolute address 0. */
    *((unsigned short *)__va(0x472)) = reboot_mode;

    for ( ; ; )
    {
        /* Pulse the keyboard reset line. */
        for ( i = 0; i < 100; i++ )
        {
            kb_wait();
            udelay(50);
            outb(0xfe,0x64); /* pulse reset low */
            udelay(50);
        }

        /* That didn't work - force a triple fault.. */
        __asm__ __volatile__("lidt %0": "=m" (no_idt));
        __asm__ __volatile__("int3");
    }
}


void __attribute__((noreturn)) __machine_halt(void *unused)
{
    for ( ; ; )
        safe_halt();
}

void machine_halt(void)
{
    watchdog_disable();
    console_start_sync();
    smp_call_function(__machine_halt, NULL, 1, 0);
    __machine_halt(NULL);
}

void dump_pageframe_info(struct domain *d)
{
    struct pfn_info *page;

    if ( d->tot_pages < 10 )
    {
        list_for_each_entry ( page, &d->page_list, list )
        {
            printk("Page %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
                   _p(page_to_phys(page)), _p(page - frame_table),
                   page->count_info, page->u.inuse.type_info);
        }
    }

    list_for_each_entry ( page, &d->xenpage_list, list )
    {
        printk("XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
               _p(page_to_phys(page)), _p(page - frame_table),
               page->count_info, page->u.inuse.type_info);
    }

    page = virt_to_page(d->shared_info);
    printk("Shared_info@%p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
           _p(page_to_phys(page)), _p(page - frame_table), page->count_info,
           page->u.inuse.type_info);
}

struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
    struct vcpu *v;

    if ( (v = xmalloc(struct vcpu)) == NULL )
        return NULL;

    memset(v, 0, sizeof(*v));

    memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
    v->arch.flags = TF_kernel_mode;

    if ( (v->vcpu_id = vcpu_id) != 0 )
    {
        v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
        v->arch.perdomain_ptes =
            d->arch.mm_perdomain_pt + (vcpu_id << PDPT_VCPU_SHIFT);
    }

    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    BUG_ON(v->next_in_list != NULL);
    if ( v->vcpu_id != 0 )
        v->domain->vcpu[v->vcpu_id - 1]->next_in_list = NULL;
    xfree(v);
}

void free_perdomain_pt(struct domain *d)
{
    free_xenheap_pages(
        d->arch.mm_perdomain_pt,
        get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));

#ifdef __x86_64__
    free_xenheap_page(d->arch.mm_perdomain_l2);
    free_xenheap_page(d->arch.mm_perdomain_l3);
#endif
}

void arch_do_createdomain(struct vcpu *v)
{
    struct domain *d = v->domain;
    l1_pgentry_t gdt_l1e;
    int vcpuid, pdpt_order;
#ifdef __x86_64__
    int i;
#endif

    if ( is_idle_task(d) )
        return;

    v->arch.schedule_tail = continue_nonidle_task;

    d->shared_info = alloc_xenheap_page();
    memset(d->shared_info, 0, PAGE_SIZE);
    v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id];
    v->cpumap = CPUMAP_RUNANYWHERE;
    SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);

    pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
    memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
    v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;

    /*
     * Map Xen segments into every VCPU's GDT, irrespective of whether every
     * VCPU will actually be used. This avoids an NMI race during context
     * switch: if we take an interrupt after switching CR3 but before switching
     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
     * try to load CS from an invalid table.
     */
    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        d->arch.mm_perdomain_pt[
            (vcpuid << PDPT_VCPU_SHIFT) + FIRST_RESERVED_GDT_PAGE] = gdt_l1e;

    v->arch.guest_vtable = __linear_l2_table;
    v->arch.shadow_vtable = __shadow_linear_l2_table;

#ifdef __x86_64__
    v->arch.guest_vl3table = __linear_l3_table;
    v->arch.guest_vl4table = __linear_l4_table;

    d->arch.mm_perdomain_l2 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
    for ( i = 0; i < (1 << pdpt_order); i++ )
        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
                          __PAGE_HYPERVISOR);

    d->arch.mm_perdomain_l3 = alloc_xenheap_page();
    memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
    d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
                      __PAGE_HYPERVISOR);
#endif

    (void)ptwr_init(d);

    shadow_lock_init(d);
    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
}

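/*
 * Move a VCPU to a new physical CPU. A VMX guest's VMCS stays bound to the
 * CPU it was last loaded on, so it is cleared here and the schedule tail is
 * pointed at arch_vmx_do_relaunch so that the VMCS is reloaded when the
 * VCPU next runs.
 */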
void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
{
    if ( v->processor == newcpu )
        return;

    set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
    v->processor = newcpu;

    if ( VMX_DOMAIN(v) )
    {
        __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
        v->arch.schedule_tail = arch_vmx_do_relaunch;
    }
}

/* This is called by arch_final_setup_guest and do_boot_vcpu */
int arch_set_info_guest(
    struct vcpu *v, struct vcpu_guest_context *c)
{
    struct domain *d = v->domain;
    unsigned long phys_basetab;
    int i, rc;

    /*
     * This is sufficient! If the descriptor DPL differs from CS RPL then we'll
     * #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared automatically.
     * If SS RPL or DPL differs from CS RPL then we'll #GP.
     */
    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( ((c->user_regs.cs & 3) == 0) ||
             ((c->user_regs.ss & 3) == 0) )
            return -EINVAL;
    }

    clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
    if ( c->flags & VGCF_I387_VALID )
        set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);

    v->arch.flags &= ~TF_kernel_mode;
    if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_VMX_GUEST) )
        v->arch.flags |= TF_kernel_mode;

    memcpy(&v->arch.guest_context, c, sizeof(*c));

    if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        /* IOPL privileges are virtualised. */
        v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
        v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;

        /* Ensure real hardware interrupts are enabled. */
        v->arch.guest_context.user_regs.eflags |= EF_IE;
    }
    else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
    {
        return modify_vmcs(
            &v->arch.arch_vmx,
            &v->arch.guest_context.user_regs);
    }

    if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
        return 0;

    memset(v->arch.guest_context.debugreg, 0,
           sizeof(v->arch.guest_context.debugreg));
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(v, i, c->debugreg[i]);

    if ( v->vcpu_id == 0 )
        d->vm_assist = c->vm_assist;

    phys_basetab = c->ctrlreg[3];
    v->arch.guest_table = mk_pagetable(phys_basetab);

    if ( shadow_mode_refcounts(d) )
    {
        if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
            return -EINVAL;
    }
    else if ( !(c->flags & VGCF_VMX_GUEST) )
    {
        if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d,
                                PGT_base_page_table) )
            return -EINVAL;
    }

    if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
    {
        put_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT]);
        return rc;
    }

    if ( c->flags & VGCF_VMX_GUEST )
    {
        /* VMX uses the initially provided page tables as the P2M map. */
        if ( !pagetable_get_paddr(d->arch.phys_table) )
            d->arch.phys_table = v->arch.guest_table;
        v->arch.guest_table = mk_pagetable(0);

        /* Initialize monitor page table */
        v->arch.monitor_table = mk_pagetable(0);

        vmx_final_setup_guest(v);
    }

    update_pagetables(v);

    if ( v->vcpu_id == 0 )
        init_domain_time(d);

    /* Don't redo final setup */
    set_bit(_VCPUF_initialised, &v->vcpu_flags);

    return 0;
}


void new_thread(struct vcpu *d,
                unsigned long start_pc,
                unsigned long start_stack,
                unsigned long start_info)
{
    struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;

    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *  CS:EIP      = FLAT_KERNEL_CS:start_pc
     *  SS:ESP      = FLAT_KERNEL_SS:start_stack
     *  ESI         = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     */
    regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
    regs->ss = FLAT_KERNEL_SS;
    regs->cs = FLAT_KERNEL_CS;
    regs->eip = start_pc;
    regs->esp = start_stack;
    regs->esi = start_info;

    __save_flags(regs->eflags);
    regs->eflags |= X86_EFLAGS_IF;
}


#ifdef __x86_64__

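/*
 * Flip a 64-bit PV guest between kernel and user mode: toggle TF_kernel_mode,
 * exchange the kernel/user GS bases with 'swapgs', and switch to the page
 * table base appropriate to the new mode.
 */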
void toggle_guest_mode(struct vcpu *v)
{
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_pagetables(v);
    write_ptbase(v);
}

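/*
 * Load a selector into a segment register with exception fixup. Evaluates to
 * 1 on success; if the load faults, the fixup code loads the NULL selector
 * instead and the expression evaluates to 0.
 */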
#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    __asm__ __volatile__ (                      \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        " movl %k0,%%" #seg "\n"                \
        " jmp 2b\n"                             \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        " .align 8\n"                           \
        " .quad 1b,3b\n"                        \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    percpu_ctxt[cpu].dirty_segment_mask = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->fs_base )
        wrmsr(MSR_FS_BASE,
              nctxt->fs_base,
              nctxt->fs_base>>32);

    /* Most kernels have non-zero GS base, so don't bother testing. */
    /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
    wrmsr(MSR_SHADOW_GS_BASE,
          nctxt->gs_base_kernel,
          nctxt->gs_base_kernel>>32);

    /* This can only be non-zero if selector is NULL. */
    if ( nctxt->gs_base_user )
        wrmsr(MSR_GS_BASE,
              nctxt->gs_base_user,
              nctxt->gs_base_user>>32);

    /* If in kernel mode then switch the GS bases around. */
    if ( n->arch.flags & TF_kernel_mode )
        __asm__ __volatile__ ( "swapgs" );

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(regs->rflags,        rsp- 3) |
             put_user(regs->cs,            rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            DPRINTK("Error while creating failsafe callback frame.\n");
            domain_crash(n->domain);
        }

        regs->entry_vector = TRAP_syscall;
        regs->rflags &= 0xFFFCBEFFUL;
        regs->ss = __GUEST_SS;
        regs->rsp = (unsigned long)(rsp-11);
        regs->cs = __GUEST_CS;
        regs->rip = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    if ( VMX_DOMAIN(v) )
        rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);

    __asm__ __volatile__ ( "mov %%ds,%0" : "=m" (regs->ds) );
    __asm__ __volatile__ ( "mov %%es,%0" : "=m" (regs->es) );
    __asm__ __volatile__ ( "mov %%fs,%0" : "=m" (regs->fs) );
    __asm__ __volatile__ ( "mov %%gs,%0" : "=m" (regs->gs) );

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}

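/*
 * Hypercall used by a 64-bit PV guest kernel to return to guest user space:
 * read a 'struct switch_to_user' frame from the guest kernel stack, switch
 * to the user page tables via toggle_guest_mode(), and rewrite the saved
 * exit frame so we return to the user-mode CS:RIP/SS:RSP at guest privilege,
 * restoring %rcx/%r11 only if the frame was not built by the SYSCALL path.
 */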
long do_switch_to_user(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct switch_to_user stu;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
         unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
        return -EFAULT;

    toggle_guest_mode(v);

    regs->rip    = stu.rip;
    regs->cs     = stu.cs | 3; /* force guest privilege */
    regs->rflags = (stu.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
    regs->rsp    = stu.rsp;
    regs->ss     = stu.ss | 3; /* force guest privilege */

    if ( !(stu.flags & VGCF_IN_SYSCALL) )
    {
        regs->entry_vector = 0;
        regs->r11 = stu.r11;
        regs->rcx = stu.rcx;
    }

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return stu.rax;
}

#define switch_kernel_stack(_n,_c) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
{
    struct tss_struct *tss = &init_tss[cpu];
    tss->esp1 = n->arch.guest_context.kernel_sp;
    tss->ss1  = n->arch.guest_context.kernel_ss;
}

#endif

#define loaddebug(_v,_reg) \
    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))

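/*
 * The heavyweight half of a context switch: save the outgoing VCPU's
 * general registers, FPU and segment state, load the incoming VCPU's
 * registers, debug registers and guest kernel stack, switch page tables,
 * and reload GDTR (the GDT lives at a per-VCPU virtual address) when the
 * VCPU index changes. Called with interrupts disabled, either from
 * context_switch() or lazily from __sync_lazy_execstate().
 */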
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int cpu = smp_processor_id();
    struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
    struct vcpu *n = current;

    if ( !is_idle_task(p->domain) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        save_segments(p);
    }

    if ( !is_idle_task(n->domain) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);

        /* Maybe switch the debug registers. */
        if ( unlikely(n->arch.guest_context.debugreg[7]) )
        {
            loaddebug(&n->arch.guest_context, 0);
            loaddebug(&n->arch.guest_context, 1);
            loaddebug(&n->arch.guest_context, 2);
            loaddebug(&n->arch.guest_context, 3);
            /* no 4 and 5 */
            loaddebug(&n->arch.guest_context, 6);
            loaddebug(&n->arch.guest_context, 7);
        }

        if ( !VMX_DOMAIN(n) )
        {
            set_int80_direct_trap(n);
            switch_kernel_stack(n, cpu);
        }
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
        __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->cpumask);

    percpu_ctxt[cpu].curr_vcpu = n;
}


void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(!local_irq_is_enabled());

    set_current(next);

    if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
    {
        __context_switch();
        percpu_ctxt[cpu].context_not_finalised = 1;
    }
}

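/*
 * Second half of context_switch(), run with interrupts enabled: complete the
 * deferred LDT, segment and MSR loading for the new VCPU, then jump to its
 * schedule tail. Does not return.
 */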
void context_switch_finalise(struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();

    ASSERT(local_irq_is_enabled());

    if ( percpu_ctxt[cpu].context_not_finalised )
    {
        percpu_ctxt[cpu].context_not_finalised = 0;

        BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);

        if ( VMX_DOMAIN(next) )
        {
            vmx_restore_msrs(next);
        }
        else
        {
            load_LDT(next);
            load_segments(next);
            vmx_load_msrs(next);
        }
    }

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

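/*
 * If this CPU is still (lazily) holding the register state of a VCPU other
 * than 'current', force the deferred __context_switch() now. Returns
 * non-zero if a switch was actually performed.
 */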
int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

    if ( switch_required )
        __context_switch();

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    unsigned int cpu = v->processor;

    if ( !cpu_isset(cpu, v->domain->cpumask) )
        return;

    if ( cpu == smp_processor_id() )
    {
        (void)__sync_lazy_execstate();
    }
    else
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        flush_tlb_mask(cpumask_of_cpu(cpu));
    }
}

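/*
 * Arrange for the current hypercall to be restarted with the given
 * (possibly updated) arguments. Inside a multicall the arguments are
 * stashed in the multicall state and the call is marked preempted;
 * otherwise the guest's saved registers are rewritten and EIP/RIP is wound
 * back two bytes so the hypercall instruction ('int 0x82' or 'syscall') is
 * re-executed when the guest next runs. A preemptible handler typically
 * just returns the result, e.g. (illustrative arguments only):
 *     return __hypercall_create_continuation(op, 2, arg0, arg1);
 */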
unsigned long __hypercall_create_continuation(
    unsigned int op, unsigned int nr_args, ...)
{
    struct mc_state *mcs = &mc_state[smp_processor_id()];
    struct cpu_user_regs *regs;
    unsigned int i;
    va_list args;

    va_start(args, nr_args);

    if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
    {
        __set_bit(_MCSF_call_preempted, &mcs->flags);

        for ( i = 0; i < nr_args; i++ )
            mcs->call.args[i] = va_arg(args, unsigned long);
    }
    else
    {
        regs = guest_cpu_user_regs();
#if defined(__i386__)
        regs->eax = op;
        regs->eip -= 2; /* re-execute 'int 0x82' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->ebx = va_arg(args, unsigned long); break;
            case 1: regs->ecx = va_arg(args, unsigned long); break;
            case 2: regs->edx = va_arg(args, unsigned long); break;
            case 3: regs->esi = va_arg(args, unsigned long); break;
            case 4: regs->edi = va_arg(args, unsigned long); break;
            case 5: regs->ebp = va_arg(args, unsigned long); break;
            }
        }
#elif defined(__x86_64__)
        regs->rax = op;
        regs->rip -= 2; /* re-execute 'syscall' */

        for ( i = 0; i < nr_args; i++ )
        {
            switch ( i )
            {
            case 0: regs->rdi = va_arg(args, unsigned long); break;
            case 1: regs->rsi = va_arg(args, unsigned long); break;
            case 2: regs->rdx = va_arg(args, unsigned long); break;
            case 3: regs->r10 = va_arg(args, unsigned long); break;
            case 4: regs->r8  = va_arg(args, unsigned long); break;
            case 5: regs->r9  = va_arg(args, unsigned long); break;
            }
        }
#endif
    }

    va_end(args);

    return op;
}

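/*
 * Drop this domain's references to every page on 'list', unpinning page
 * tables and breaking circular 'linear page table' references along the
 * way, so that the pages can finally be freed back to the allocator.
 */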
static void relinquish_memory(struct domain *d, struct list_head *list)
{
    struct list_head *ent;
    struct pfn_info *page;
    unsigned long x, y;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    ent = list->next;
    while ( ent != list )
    {
        page = list_entry(ent, struct pfn_info, list);

        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            ent = ent->next;
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate base page tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus base
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
                        (PGT_base_page_table|PGT_validated)) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
            if ( likely(y == x) )
            {
                free_page_type(page, PGT_base_page_table);
                break;
            }
        }

        /* Follow the list chain and /then/ potentially free the page. */
        ent = ent->next;
        put_page(page);
    }

    spin_unlock_recursive(&d->page_alloc_lock);
}

void domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;
    unsigned long pfn;

    BUG_ON(!cpus_empty(d->cpumask));

    physdev_destroy_state(d);

    ptwr_destroy(d);

    /* Release device mappings of other domains */
    gnttab_release_dev_mappings(d->grant_table);

    /* Drop the in-use references to page-table bases. */
    for_each_vcpu ( d, v )
    {
        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table = mk_pagetable(0);
        }

        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
        {
            if ( !shadow_mode_refcounts(d) )
                put_page_type(pfn_to_page(pfn));
            put_page(pfn_to_page(pfn));

            v->arch.guest_table_user = mk_pagetable(0);
        }

        vmx_relinquish_resources(v);
    }

    shadow_mode_disable(d);

    /*
     * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
     * it automatically gets squashed when the guest's mappings go away.
     */
    for_each_vcpu(d, v)
        destroy_gdt(v);

    /* Relinquish every page of memory. */
    relinquish_memory(d, &d->xenpage_list);
    relinquish_memory(d, &d->page_list);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */