ia64/xen-unstable

view xen/arch/x86/domain.c @ 9518:5715cf117178

Ensure curr_vcpu in domain.c is set correctly, even when the number of
physical CPUs is greater than the maximum number of virtual CPUs per domain.

Also, do not initialise a secondary CPU's smp_processor_id() from smpboot.c's
cpucount: it will be wrong if some CPUs fail to boot.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Mar 29 15:39:22 2006 +0100 (2006-03-29)
parents 760f9149dbaa
children 05d8c51c7550
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <xen/iocap.h>
24 #include <asm/regs.h>
25 #include <asm/mc146818rtc.h>
26 #include <asm/system.h>
27 #include <asm/io.h>
28 #include <asm/processor.h>
29 #include <asm/desc.h>
30 #include <asm/i387.h>
31 #include <asm/mpspec.h>
32 #include <asm/ldt.h>
33 #include <xen/irq.h>
34 #include <xen/event.h>
35 #include <asm/shadow.h>
36 #include <xen/console.h>
37 #include <xen/elf.h>
38 #include <asm/hvm/hvm.h>
39 #include <asm/hvm/support.h>
40 #include <asm/msr.h>
41 #include <xen/kernel.h>
42 #include <xen/multicall.h>
44 /* opt_noreboot: If true, machine will need manual reset on error. */
45 static int opt_noreboot = 0;
46 boolean_param("noreboot", opt_noreboot);
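/*
 * Per-CPU context-tracking state. 'curr_vcpu' records which vcpu's register
 * state is currently resident on the physical CPU; because switches to the
 * idle vcpu are performed lazily, it may differ from 'current' until
 * __context_switch() eventually runs (see __sync_lazy_execstate()).
 */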
48 struct percpu_ctxt {
49 struct vcpu *curr_vcpu;
50 unsigned int dirty_segment_mask;
51 } __cacheline_aligned;
52 static struct percpu_ctxt percpu_ctxt[NR_CPUS];
54 static void paravirt_ctxt_switch_from(struct vcpu *v);
55 static void paravirt_ctxt_switch_to(struct vcpu *v);
57 static void continue_idle_domain(struct vcpu *v)
58 {
59 reset_stack_and_jump(idle_loop);
60 }
62 static void continue_nonidle_domain(struct vcpu *v)
63 {
64 reset_stack_and_jump(ret_from_intr);
65 }
67 static void default_idle(void)
68 {
69 local_irq_disable();
70 if ( !softirq_pending(smp_processor_id()) )
71 safe_halt();
72 else
73 local_irq_enable();
74 }
76 void idle_loop(void)
77 {
78 int cpu = smp_processor_id();
80 for ( ; ; )
81 {
82 irq_stat[cpu].idle_timestamp = jiffies;
84 while ( !softirq_pending(cpu) )
85 {
86 page_scrub_schedule_work();
87 default_idle();
88 }
90 do_softirq();
91 }
92 }
94 void startup_cpu_idle_loop(void)
95 {
96 struct vcpu *v = current;
98 ASSERT(is_idle_vcpu(v));
99 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
100 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
102 reset_stack_and_jump(idle_loop);
103 }
105 static long no_idt[2];
106 static int reboot_mode;
108 static inline void kb_wait(void)
109 {
110 int i;
112 for ( i = 0; i < 0x10000; i++ )
113 if ( (inb_p(0x64) & 0x02) == 0 )
114 break;
115 }
117 void __attribute__((noreturn)) __machine_halt(void *unused)
118 {
119 for ( ; ; )
120 safe_halt();
121 }
123 void machine_halt(void)
124 {
125 watchdog_disable();
126 console_start_sync();
127 smp_call_function(__machine_halt, NULL, 1, 0);
128 __machine_halt(NULL);
129 }
131 void machine_restart(char * __unused)
132 {
133 int i;
135 if ( opt_noreboot )
136 {
137 printk("Reboot disabled on cmdline: require manual reset\n");
138 machine_halt();
139 }
141 watchdog_disable();
142 console_start_sync();
144 local_irq_enable();
146 /* Ensure we are the boot CPU. */
147 if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
148 {
149 smp_call_function((void *)machine_restart, NULL, 1, 0);
150 for ( ; ; )
151 safe_halt();
152 }
154 /*
155 * Stop all CPUs and turn off local APICs and the IO-APIC, so
156 * other OSs see a clean IRQ state.
157 */
158 smp_send_stop();
159 disable_IO_APIC();
160 hvm_disable();
162 /* Rebooting needs to touch the page at absolute address 0. */
163 *((unsigned short *)__va(0x472)) = reboot_mode;
165 for ( ; ; )
166 {
167 /* Pulse the keyboard reset line. */
168 for ( i = 0; i < 100; i++ )
169 {
170 kb_wait();
171 udelay(50);
172 outb(0xfe,0x64); /* pulse reset low */
173 udelay(50);
174 }
176 /* That didn't work - force a triple fault.. */
177 __asm__ __volatile__("lidt %0": "=m" (no_idt));
178 __asm__ __volatile__("int3");
179 }
180 }
183 void dump_pageframe_info(struct domain *d)
184 {
185 struct page_info *page;
187 printk("Memory pages belonging to domain %u:\n", d->domain_id);
189 if ( d->tot_pages >= 10 )
190 {
191 printk(" DomPage list too long to display\n");
192 }
193 else
194 {
195 list_for_each_entry ( page, &d->page_list, list )
196 {
197 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
198 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
199 page->count_info, page->u.inuse.type_info);
200 }
201 }
203 list_for_each_entry ( page, &d->xenpage_list, list )
204 {
205 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
206 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
207 page->count_info, page->u.inuse.type_info);
208 }
209 }
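/* Record 'v' as the vcpu whose execution state is resident on this CPU. */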
211 void set_current_execstate(struct vcpu *v)
212 {
213 percpu_ctxt[smp_processor_id()].curr_vcpu = v;
214 }
216 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
217 {
218 struct vcpu *v;
220 if ( (v = xmalloc(struct vcpu)) == NULL )
221 return NULL;
223 memset(v, 0, sizeof(*v));
225 v->arch.flags = TF_kernel_mode;
227 v->arch.schedule_tail = is_idle_domain(d) ?
228 continue_idle_domain : continue_nonidle_domain;
230 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
231 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
233 v->arch.perdomain_ptes =
234 d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
236 v->arch.guest_vtable = __linear_l2_table;
237 v->arch.shadow_vtable = __shadow_linear_l2_table;
238 #if defined(__x86_64__)
239 v->arch.guest_vl3table = __linear_l3_table;
240 v->arch.guest_vl4table = __linear_l4_table;
241 #endif
243 return v;
244 }
246 void free_vcpu_struct(struct vcpu *v)
247 {
248 xfree(v);
249 }
251 int arch_domain_create(struct domain *d)
252 {
253 l1_pgentry_t gdt_l1e;
254 int vcpuid, pdpt_order, rc;
255 #ifdef __x86_64__
256 int i;
257 #endif
259 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
260 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
261 if ( d->arch.mm_perdomain_pt == NULL )
262 goto fail_nomem;
263 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
265 /*
266 * Map Xen segments into every VCPU's GDT, irrespective of whether every
267 * VCPU will actually be used. This avoids an NMI race during context
268 * switch: if we take an interrupt after switching CR3 but before switching
269 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
270 * try to load CS from an invalid table.
271 */
272 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
273 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
274 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
275 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
277 #if defined(__i386__)
279 mapcache_init(d);
281 #else /* __x86_64__ */
283 d->arch.mm_perdomain_l2 = alloc_xenheap_page();
284 d->arch.mm_perdomain_l3 = alloc_xenheap_page();
285 if ( (d->arch.mm_perdomain_l2 == NULL) ||
286 (d->arch.mm_perdomain_l3 == NULL) )
287 goto fail_nomem;
289 memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
290 for ( i = 0; i < (1 << pdpt_order); i++ )
291 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
292 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
293 __PAGE_HYPERVISOR);
295 memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
296 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
297 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
298 __PAGE_HYPERVISOR);
300 #endif /* __x86_64__ */
302 shadow_lock_init(d);
303 INIT_LIST_HEAD(&d->arch.free_shadow_frames);
305 if ( !is_idle_domain(d) )
306 {
307 d->arch.ioport_caps =
308 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
309 if ( d->arch.ioport_caps == NULL )
310 goto fail_nomem;
312 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
313 goto fail_nomem;
315 if ( (rc = ptwr_init(d)) != 0 )
316 goto fail_nomem;
318 memset(d->shared_info, 0, PAGE_SIZE);
319 share_xen_page_with_guest(
320 virt_to_page(d->shared_info), d, XENSHARE_writable);
321 }
323 return 0;
325 fail_nomem:
326 free_xenheap_page(d->shared_info);
327 #ifdef __x86_64__
328 free_xenheap_page(d->arch.mm_perdomain_l2);
329 free_xenheap_page(d->arch.mm_perdomain_l3);
330 #endif
331 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
332 return -ENOMEM;
333 }
335 void arch_domain_destroy(struct domain *d)
336 {
337 free_xenheap_pages(
338 d->arch.mm_perdomain_pt,
339 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
341 #ifdef __x86_64__
342 free_xenheap_page(d->arch.mm_perdomain_l2);
343 free_xenheap_page(d->arch.mm_perdomain_l3);
344 #endif
346 free_xenheap_page(d->shared_info);
347 }
349 /* This is called by arch_final_setup_guest and do_boot_vcpu */
350 int arch_set_info_guest(
351 struct vcpu *v, struct vcpu_guest_context *c)
352 {
353 struct domain *d = v->domain;
354 unsigned long phys_basetab = INVALID_MFN;
355 int i, rc;
357 if ( !(c->flags & VGCF_HVM_GUEST) )
358 {
359 fixup_guest_stack_selector(c->user_regs.ss);
360 fixup_guest_stack_selector(c->kernel_ss);
361 fixup_guest_code_selector(c->user_regs.cs);
363 #ifdef __i386__
364 fixup_guest_code_selector(c->event_callback_cs);
365 fixup_guest_code_selector(c->failsafe_callback_cs);
366 #endif
368 for ( i = 0; i < 256; i++ )
369 fixup_guest_code_selector(c->trap_ctxt[i].cs);
370 }
371 else if ( !hvm_enabled )
372 return -EINVAL;
374 clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
375 if ( c->flags & VGCF_I387_VALID )
376 set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
378 v->arch.flags &= ~TF_kernel_mode;
379 if ( (c->flags & VGCF_IN_KERNEL) || (c->flags & VGCF_HVM_GUEST) )
380 v->arch.flags |= TF_kernel_mode;
382 memcpy(&v->arch.guest_context, c, sizeof(*c));
383 init_int80_direct_trap(v);
385 if ( !(c->flags & VGCF_HVM_GUEST) )
386 {
387 /* IOPL privileges are virtualised. */
388 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
389 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
391 /* Ensure real hardware interrupts are enabled. */
392 v->arch.guest_context.user_regs.eflags |= EF_IE;
393 }
394 else if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
395 {
396 hvm_modify_guest_state(v);
397 }
399 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
400 return 0;
402 memset(v->arch.guest_context.debugreg, 0,
403 sizeof(v->arch.guest_context.debugreg));
404 for ( i = 0; i < 8; i++ )
405 (void)set_debugreg(v, i, c->debugreg[i]);
407 if ( v->vcpu_id == 0 )
408 d->vm_assist = c->vm_assist;
410 if ( !(c->flags & VGCF_HVM_GUEST) )
411 {
412 phys_basetab = c->ctrlreg[3];
413 phys_basetab =
414 (gmfn_to_mfn(d, phys_basetab >> PAGE_SHIFT) << PAGE_SHIFT) |
415 (phys_basetab & ~PAGE_MASK);
417 v->arch.guest_table = mk_pagetable(phys_basetab);
418 }
420 if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
421 return rc;
423 if ( c->flags & VGCF_HVM_GUEST )
424 {
425 v->arch.guest_table = mk_pagetable(0);
427 if ( !hvm_initialize_guest_resources(v) )
428 return -EINVAL;
429 }
430 else if ( shadow_mode_refcounts(d) )
431 {
432 if ( !get_page(mfn_to_page(phys_basetab>>PAGE_SHIFT), d) )
433 {
434 destroy_gdt(v);
435 return -EINVAL;
436 }
437 }
438 else
439 {
440 if ( !get_page_and_type(mfn_to_page(phys_basetab>>PAGE_SHIFT), d,
441 PGT_base_page_table) )
442 {
443 destroy_gdt(v);
444 return -EINVAL;
445 }
446 }
448 update_pagetables(v);
450 if ( v->vcpu_id == 0 )
451 init_domain_time(d);
453 /* Don't redo final setup */
454 set_bit(_VCPUF_initialised, &v->vcpu_flags);
456 return 0;
457 }
459 long
460 arch_do_vcpu_op(
461 int cmd, struct vcpu *v, GUEST_HANDLE(void) arg)
462 {
463 long rc = 0;
465 switch ( cmd )
466 {
467 case VCPUOP_register_runstate_memory_area:
468 {
469 struct vcpu_register_runstate_memory_area area;
471 rc = -EINVAL;
472 if ( v != current )
473 break;
475 rc = -EFAULT;
476 if ( copy_from_guest(&area, arg, 1) )
477 break;
479 if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
480 break;
482 rc = 0;
483 v->runstate_guest = area.addr.v;
484 __copy_to_user(v->runstate_guest, &v->runstate, sizeof(v->runstate));
486 break;
487 }
489 default:
490 rc = -ENOSYS;
491 break;
492 }
494 return rc;
495 }
497 void new_thread(struct vcpu *d,
498 unsigned long start_pc,
499 unsigned long start_stack,
500 unsigned long start_info)
501 {
502 struct cpu_user_regs *regs = &d->arch.guest_context.user_regs;
504 /*
505 * Initial register values:
506 * DS,ES,FS,GS = FLAT_KERNEL_DS
507 * CS:EIP = FLAT_KERNEL_CS:start_pc
508 * SS:ESP = FLAT_KERNEL_SS:start_stack
509 * ESI = start_info
510 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
511 */
512 regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS;
513 regs->ss = FLAT_KERNEL_SS;
514 regs->cs = FLAT_KERNEL_CS;
515 regs->eip = start_pc;
516 regs->esp = start_stack;
517 regs->esi = start_info;
519 __save_flags(regs->eflags);
520 regs->eflags |= X86_EFLAGS_IF;
521 }
524 #ifdef __x86_64__
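/*
 * Load 'value' into segment register 'seg'. Evaluates to 1 on success, or 0
 * if the load faulted, in which case the register is loaded with the NULL
 * selector instead.
 */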
526 #define loadsegment(seg,value) ({ \
527 int __r = 1; \
528 __asm__ __volatile__ ( \
529 "1: movl %k1,%%" #seg "\n2:\n" \
530 ".section .fixup,\"ax\"\n" \
531 "3: xorl %k0,%k0\n" \
532 " movl %k0,%%" #seg "\n" \
533 " jmp 2b\n" \
534 ".previous\n" \
535 ".section __ex_table,\"a\"\n" \
536 " .align 8\n" \
537 " .quad 1b,3b\n" \
538 ".previous" \
539 : "=r" (__r) : "r" (value), "0" (__r) );\
540 __r; })
542 /*
543 * save_segments() writes a mask of segments which are dirty (non-zero),
544 * allowing load_segments() to avoid some expensive segment loads and
545 * MSR writes.
546 */
547 #define DIRTY_DS 0x01
548 #define DIRTY_ES 0x02
549 #define DIRTY_FS 0x04
550 #define DIRTY_GS 0x08
551 #define DIRTY_FS_BASE 0x10
552 #define DIRTY_GS_BASE_USER 0x20
554 static void load_segments(struct vcpu *n)
555 {
556 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
557 int all_segs_okay = 1;
558 unsigned int dirty_segment_mask, cpu = smp_processor_id();
560 /* Load and clear the dirty segment mask. */
561 dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
562 percpu_ctxt[cpu].dirty_segment_mask = 0;
564 /* Either selector != 0 ==> reload. */
565 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
566 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
568 /* Either selector != 0 ==> reload. */
569 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
570 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
572 /*
573 * Either selector != 0 ==> reload.
574 * Also reload to reset FS_BASE if it was non-zero.
575 */
576 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
577 nctxt->user_regs.fs) )
578 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
580 /*
581 * Either selector != 0 ==> reload.
582 * Also reload to reset GS_BASE if it was non-zero.
583 */
584 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
585 nctxt->user_regs.gs) )
586 {
587 /* Reset GS_BASE with user %gs? */
588 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
589 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
590 }
592 /* This can only be non-zero if selector is NULL. */
593 if ( nctxt->fs_base )
594 wrmsr(MSR_FS_BASE,
595 nctxt->fs_base,
596 nctxt->fs_base>>32);
598 /* Most kernels have non-zero GS base, so don't bother testing. */
599 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
600 wrmsr(MSR_SHADOW_GS_BASE,
601 nctxt->gs_base_kernel,
602 nctxt->gs_base_kernel>>32);
604 /* This can only be non-zero if selector is NULL. */
605 if ( nctxt->gs_base_user )
606 wrmsr(MSR_GS_BASE,
607 nctxt->gs_base_user,
608 nctxt->gs_base_user>>32);
610 /* If in kernel mode then switch the GS bases around. */
611 if ( n->arch.flags & TF_kernel_mode )
612 __asm__ __volatile__ ( "swapgs" );
614 if ( unlikely(!all_segs_okay) )
615 {
616 struct cpu_user_regs *regs = guest_cpu_user_regs();
617 unsigned long *rsp =
618 (n->arch.flags & TF_kernel_mode) ?
619 (unsigned long *)regs->rsp :
620 (unsigned long *)nctxt->kernel_sp;
622 if ( !(n->arch.flags & TF_kernel_mode) )
623 toggle_guest_mode(n);
624 else
625 regs->cs &= ~3;
627 if ( put_user(regs->ss, rsp- 1) |
628 put_user(regs->rsp, rsp- 2) |
629 put_user(regs->rflags, rsp- 3) |
630 put_user(regs->cs, rsp- 4) |
631 put_user(regs->rip, rsp- 5) |
632 put_user(nctxt->user_regs.gs, rsp- 6) |
633 put_user(nctxt->user_regs.fs, rsp- 7) |
634 put_user(nctxt->user_regs.es, rsp- 8) |
635 put_user(nctxt->user_regs.ds, rsp- 9) |
636 put_user(regs->r11, rsp-10) |
637 put_user(regs->rcx, rsp-11) )
638 {
639 DPRINTK("Error while creating failsafe callback frame.\n");
640 domain_crash(n->domain);
641 }
643 regs->entry_vector = TRAP_syscall;
644 regs->rflags &= 0xFFFCBEFFUL;
645 regs->ss = __GUEST_SS;
646 regs->rsp = (unsigned long)(rsp-11);
647 regs->cs = __GUEST_CS;
648 regs->rip = nctxt->failsafe_callback_eip;
649 }
650 }
652 static void save_segments(struct vcpu *v)
653 {
654 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
655 struct cpu_user_regs *regs = &ctxt->user_regs;
656 unsigned int dirty_segment_mask = 0;
658 regs->ds = read_segment_register(ds);
659 regs->es = read_segment_register(es);
660 regs->fs = read_segment_register(fs);
661 regs->gs = read_segment_register(gs);
663 if ( regs->ds )
664 dirty_segment_mask |= DIRTY_DS;
666 if ( regs->es )
667 dirty_segment_mask |= DIRTY_ES;
669 if ( regs->fs )
670 {
671 dirty_segment_mask |= DIRTY_FS;
672 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
673 }
674 else if ( ctxt->fs_base )
675 {
676 dirty_segment_mask |= DIRTY_FS_BASE;
677 }
679 if ( regs->gs )
680 {
681 dirty_segment_mask |= DIRTY_GS;
682 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
683 }
684 else if ( ctxt->gs_base_user )
685 {
686 dirty_segment_mask |= DIRTY_GS_BASE_USER;
687 }
689 percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
690 }
692 #define switch_kernel_stack(v) ((void)0)
694 #elif defined(__i386__)
696 #define load_segments(n) ((void)0)
697 #define save_segments(p) ((void)0)
699 static inline void switch_kernel_stack(struct vcpu *v)
700 {
701 struct tss_struct *tss = &init_tss[smp_processor_id()];
702 tss->esp1 = v->arch.guest_context.kernel_sp;
703 tss->ss1 = v->arch.guest_context.kernel_ss;
704 }
706 #endif /* __i386__ */
708 static void paravirt_ctxt_switch_from(struct vcpu *v)
709 {
710 save_segments(v);
711 }
713 static void paravirt_ctxt_switch_to(struct vcpu *v)
714 {
715 set_int80_direct_trap(v);
716 switch_kernel_stack(v);
717 }
719 #define loaddebug(_v,_reg) \
720 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
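/*
 * Perform the real state switch from the resident vcpu (percpu curr_vcpu)
 * to 'current': save the outgoing vcpu's registers and FPU state, load the
 * incoming vcpu's registers, debug registers, page tables and GDT, and
 * update the dirty cpumasks and curr_vcpu accordingly.
 */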
722 static void __context_switch(void)
723 {
724 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
725 unsigned int cpu = smp_processor_id();
726 struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
727 struct vcpu *n = current;
729 ASSERT(p != n);
730 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
732 if ( !is_idle_vcpu(p) )
733 {
734 memcpy(&p->arch.guest_context.user_regs,
735 stack_regs,
736 CTXT_SWITCH_STACK_BYTES);
737 unlazy_fpu(p);
738 p->arch.ctxt_switch_from(p);
739 }
741 if ( !is_idle_vcpu(n) )
742 {
743 memcpy(stack_regs,
744 &n->arch.guest_context.user_regs,
745 CTXT_SWITCH_STACK_BYTES);
747 /* Maybe switch the debug registers. */
748 if ( unlikely(n->arch.guest_context.debugreg[7]) )
749 {
750 loaddebug(&n->arch.guest_context, 0);
751 loaddebug(&n->arch.guest_context, 1);
752 loaddebug(&n->arch.guest_context, 2);
753 loaddebug(&n->arch.guest_context, 3);
754 /* no 4 and 5 */
755 loaddebug(&n->arch.guest_context, 6);
756 loaddebug(&n->arch.guest_context, 7);
757 }
759 n->arch.ctxt_switch_to(n);
760 }
762 if ( p->domain != n->domain )
763 cpu_set(cpu, n->domain->domain_dirty_cpumask);
764 cpu_set(cpu, n->vcpu_dirty_cpumask);
766 write_ptbase(n);
768 if ( p->vcpu_id != n->vcpu_id )
769 {
770 char gdt_load[10];
771 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
772 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
773 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
774 }
776 if ( p->domain != n->domain )
777 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
778 cpu_clear(cpu, p->vcpu_dirty_cpumask);
780 percpu_ctxt[cpu].curr_vcpu = n;
781 }
784 void context_switch(struct vcpu *prev, struct vcpu *next)
785 {
786 unsigned int cpu = smp_processor_id();
787 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
789 ASSERT(local_irq_is_enabled());
791 /* Allow at most one CPU at a time to be dirty. */
792 ASSERT(cpus_weight(dirty_mask) <= 1);
793 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
794 {
795 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
796 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
797 flush_tlb_mask(next->vcpu_dirty_cpumask);
798 }
800 local_irq_disable();
802 set_current(next);
804 if ( (percpu_ctxt[cpu].curr_vcpu == next) || is_idle_vcpu(next) )
805 {
806 local_irq_enable();
807 }
808 else
809 {
810 __context_switch();
812 /* Re-enable interrupts before restoring state which may fault. */
813 local_irq_enable();
815 if ( !hvm_guest(next) )
816 {
817 load_LDT(next);
818 load_segments(next);
819 }
820 }
822 context_saved(prev);
824 /* Update per-VCPU guest runstate shared memory area (if registered). */
825 if ( next->runstate_guest != NULL )
826 __copy_to_user(next->runstate_guest, &next->runstate,
827 sizeof(next->runstate));
829 schedule_tail(next);
830 BUG();
831 }
833 void continue_running(struct vcpu *same)
834 {
835 schedule_tail(same);
836 BUG();
837 }
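/*
 * If this CPU is still holding another vcpu's state due to lazy context
 * switching, complete the deferred __context_switch(). Returns nonzero if
 * a switch was actually required.
 */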
839 int __sync_lazy_execstate(void)
840 {
841 unsigned long flags;
842 int switch_required;
844 local_irq_save(flags);
846 switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);
848 if ( switch_required )
849 __context_switch();
851 local_irq_restore(flags);
853 return switch_required;
854 }
856 void sync_vcpu_execstate(struct vcpu *v)
857 {
858 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
859 (void)__sync_lazy_execstate();
861 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
862 flush_tlb_mask(v->vcpu_dirty_cpumask);
863 }
865 #define next_arg(fmt, args) ({ \
866 unsigned long __arg; \
867 switch ( *(fmt)++ ) \
868 { \
869 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
870 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
871 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
872 default: __arg = 0; BUG(); \
873 } \
874 __arg; \
875 })
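/*
 * Arrange for the current hypercall to be re-executed with the given
 * arguments when the guest is next scheduled: either mark the in-progress
 * multicall entry as preempted, or write the arguments back into the guest
 * register frame and rewind the guest instruction pointer over the
 * hypercall entry instruction.
 */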
877 unsigned long hypercall_create_continuation(
878 unsigned int op, const char *format, ...)
879 {
880 struct mc_state *mcs = &mc_state[smp_processor_id()];
881 struct cpu_user_regs *regs;
882 const char *p = format;
883 unsigned long arg;
884 unsigned int i;
885 va_list args;
887 va_start(args, format);
889 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
890 {
891 __set_bit(_MCSF_call_preempted, &mcs->flags);
893 for ( i = 0; *p != '\0'; i++ )
894 mcs->call.args[i] = next_arg(p, args);
895 }
896 else
897 {
898 regs = guest_cpu_user_regs();
899 #if defined(__i386__)
900 regs->eax = op;
902 if ( supervisor_mode_kernel )
903 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
904 else
905 regs->eip -= 2; /* re-execute 'int 0x82' */
907 for ( i = 0; *p != '\0'; i++ )
908 {
909 arg = next_arg(p, args);
910 switch ( i )
911 {
912 case 0: regs->ebx = arg; break;
913 case 1: regs->ecx = arg; break;
914 case 2: regs->edx = arg; break;
915 case 3: regs->esi = arg; break;
916 case 4: regs->edi = arg; break;
917 case 5: regs->ebp = arg; break;
918 }
919 }
920 #elif defined(__x86_64__)
921 regs->rax = op;
922 regs->rip -= 2; /* re-execute 'syscall' */
924 for ( i = 0; *p != '\0'; i++ )
925 {
926 arg = next_arg(p, args);
927 switch ( i )
928 {
929 case 0: regs->rdi = arg; break;
930 case 1: regs->rsi = arg; break;
931 case 2: regs->rdx = arg; break;
932 case 3: regs->r10 = arg; break;
933 case 4: regs->r8 = arg; break;
934 case 5: regs->r9 = arg; break;
935 }
936 }
937 #endif
938 }
940 va_end(args);
942 return op;
943 }
945 static void relinquish_memory(struct domain *d, struct list_head *list)
946 {
947 struct list_head *ent;
948 struct page_info *page;
949 unsigned long x, y;
951 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
952 spin_lock_recursive(&d->page_alloc_lock);
954 ent = list->next;
955 while ( ent != list )
956 {
957 page = list_entry(ent, struct page_info, list);
959 /* Grab a reference to the page so it won't disappear from under us. */
960 if ( unlikely(!get_page(page, d)) )
961 {
962 /* Couldn't get a reference -- someone is freeing this page. */
963 ent = ent->next;
964 continue;
965 }
967 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
968 put_page_and_type(page);
970 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
971 put_page(page);
973 /*
974 * Forcibly invalidate base page tables at this point to break circular
975 * 'linear page table' references. This is okay because MMU structures
976 * are not shared across domains and this domain is now dead. Thus base
977 * tables are not in use so a non-zero count means circular reference.
978 */
979 y = page->u.inuse.type_info;
980 for ( ; ; )
981 {
982 x = y;
983 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
984 (PGT_base_page_table|PGT_validated)) )
985 break;
987 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
988 if ( likely(y == x) )
989 {
990 free_page_type(page, PGT_base_page_table);
991 break;
992 }
993 }
995 /* Follow the list chain and /then/ potentially free the page. */
996 ent = ent->next;
997 put_page(page);
998 }
1000 spin_unlock_recursive(&d->page_alloc_lock);
1001 }
1003 void domain_relinquish_resources(struct domain *d)
1004 {
1005 struct vcpu *v;
1006 unsigned long pfn;
1008 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
1010 ptwr_destroy(d);
1012 /* Drop the in-use references to page-table bases. */
1013 for_each_vcpu ( d, v )
1014 {
1015 if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
1016 {
1017 if ( !shadow_mode_refcounts(d) )
1018 put_page_type(mfn_to_page(pfn));
1019 put_page(mfn_to_page(pfn));
1021 v->arch.guest_table = mk_pagetable(0);
1022 }
1024 if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
1025 {
1026 if ( !shadow_mode_refcounts(d) )
1027 put_page_type(mfn_to_page(pfn));
1028 put_page(mfn_to_page(pfn));
1030 v->arch.guest_table_user = mk_pagetable(0);
1031 }
1032 }
1034 if ( hvm_guest(d->vcpu[0]) )
1035 hvm_relinquish_guest_resources(d);
1037 shadow_mode_disable(d);
1039 /*
1040 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
1041 * it automatically gets squashed when the guest's mappings go away.
1042 */
1043 for_each_vcpu(d, v)
1044 destroy_gdt(v);
1046 /* Relinquish every page of memory. */
1047 relinquish_memory(d, &d->xenpage_list);
1048 relinquish_memory(d, &d->page_list);
1049 }
1051 void arch_dump_domain_info(struct domain *d)
1052 {
1053 if ( shadow_mode_enabled(d) )
1054 {
1055 printk(" shadow mode: ");
1056 if ( shadow_mode_refcounts(d) )
1057 printk("refcounts ");
1058 if ( shadow_mode_write_all(d) )
1059 printk("write_all ");
1060 if ( shadow_mode_log_dirty(d) )
1061 printk("log_dirty ");
1062 if ( shadow_mode_translate(d) )
1063 printk("translate ");
1064 if ( shadow_mode_external(d) )
1065 printk("external ");
1066 if ( shadow_mode_wr_pt_pte(d) )
1067 printk("wr_pt_pte ");
1068 printk("\n");
1069 }
1070 }
1072 /*
1073 * Local variables:
1074 * mode: C
1075 * c-set-style: "BSD"
1076 * c-basic-offset: 4
1077 * tab-width: 4
1078 * indent-tabs-mode: nil
1079 * End:
1080 */