ia64/xen-unstable: xen/arch/x86/domain.c @ 12850:a89599a12d4b

[XEN] Init pae_l3_cache lock earlier
In particular, before hvm_vcpu_initialise causes a CR3 update.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

author   Tim Deegan <Tim.Deegan@xensource.com>
date     Thu Dec 07 13:14:44 2006 +0000
parents  2fd223c64fc6
children cd89771ba550
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/init.h>
16 #include <xen/lib.h>
17 #include <xen/errno.h>
18 #include <xen/sched.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <xen/iocap.h>
24 #include <xen/kernel.h>
25 #include <xen/multicall.h>
26 #include <xen/irq.h>
27 #include <xen/event.h>
28 #include <xen/console.h>
29 #include <xen/percpu.h>
30 #include <asm/regs.h>
31 #include <asm/mc146818rtc.h>
32 #include <asm/system.h>
33 #include <asm/io.h>
34 #include <asm/processor.h>
35 #include <asm/desc.h>
36 #include <asm/i387.h>
37 #include <asm/mpspec.h>
38 #include <asm/ldt.h>
39 #include <asm/shadow.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/msr.h>
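/*
 * Per-CPU record of the vcpu whose register state is currently loaded on
 * this physical CPU.  It may lag behind 'current', because switches to the
 * idle vcpu are performed lazily (see __context_switch() and
 * __sync_lazy_execstate() below).
 */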
44 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
46 static void paravirt_ctxt_switch_from(struct vcpu *v);
47 static void paravirt_ctxt_switch_to(struct vcpu *v);
49 static void continue_idle_domain(struct vcpu *v)
50 {
51 reset_stack_and_jump(idle_loop);
52 }
54 static void continue_nonidle_domain(struct vcpu *v)
55 {
56 reset_stack_and_jump(ret_from_intr);
57 }
59 static void default_idle(void)
60 {
61 local_irq_disable();
62 if ( !softirq_pending(smp_processor_id()) )
63 safe_halt();
64 else
65 local_irq_enable();
66 }
68 void idle_loop(void)
69 {
70 for ( ; ; )
71 {
72 page_scrub_schedule_work();
73 default_idle();
74 do_softirq();
75 }
76 }
78 void startup_cpu_idle_loop(void)
79 {
80 struct vcpu *v = current;
82 ASSERT(is_idle_vcpu(v));
83 cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
84 cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
86 reset_stack_and_jump(idle_loop);
87 }
89 void dump_pageframe_info(struct domain *d)
90 {
91 struct page_info *page;
93 printk("Memory pages belonging to domain %u:\n", d->domain_id);
95 if ( d->tot_pages >= 10 )
96 {
97 printk(" DomPage list too long to display\n");
98 }
99 else
100 {
101 list_for_each_entry ( page, &d->page_list, list )
102 {
103 printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
104 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
105 page->count_info, page->u.inuse.type_info);
106 }
107 }
109 list_for_each_entry ( page, &d->xenpage_list, list )
110 {
111 printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n",
112 _p(page_to_maddr(page)), _p(page_to_mfn(page)),
113 page->count_info, page->u.inuse.type_info);
114 }
115 }
117 struct vcpu *alloc_vcpu_struct(void)
118 {
119 struct vcpu *v;
120 if ( (v = xmalloc(struct vcpu)) != NULL )
121 memset(v, 0, sizeof(*v));
122 return v;
123 }
125 void free_vcpu_struct(struct vcpu *v)
126 {
127 xfree(v);
128 }
130 int vcpu_initialise(struct vcpu *v)
131 {
132 struct domain *d = v->domain;
133 int rc;
135 v->arch.flags = TF_kernel_mode;
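/*
 * Initialise the PAE L3 cache (and its lock) before anything that can
 * trigger a CR3 update -- in particular hvm_vcpu_initialise() below
 * (see the changeset description above).
 */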
137 pae_l3_cache_init(&v->arch.pae_l3_cache);
139 if ( is_hvm_domain(d) )
140 {
141 if ( (rc = hvm_vcpu_initialise(v)) != 0 )
142 return rc;
143 }
144 else
145 {
146 v->arch.schedule_tail = continue_nonidle_domain;
147 v->arch.ctxt_switch_from = paravirt_ctxt_switch_from;
148 v->arch.ctxt_switch_to = paravirt_ctxt_switch_to;
150 if ( is_idle_domain(d) )
151 {
152 v->arch.schedule_tail = continue_idle_domain;
153 v->arch.cr3 = __pa(idle_pg_table);
154 }
155 }
157 v->arch.perdomain_ptes =
158 d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
160 return 0;
161 }
163 void vcpu_destroy(struct vcpu *v)
164 {
165 }
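/*
 * Set up the per-domain architectural state: the per-domain page-table
 * mappings (including a GDT slot for every possible vcpu), shadow-mode
 * bookkeeping, and -- for non-idle domains -- the I/O port capability
 * rangeset and the shared_info page.
 */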
167 int arch_domain_create(struct domain *d)
168 {
169 #ifdef __x86_64__
170 struct page_info *pg;
171 #endif
172 l1_pgentry_t gdt_l1e;
173 int vcpuid, pdpt_order;
174 int i, rc = -ENOMEM;
176 pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
177 d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
178 if ( d->arch.mm_perdomain_pt == NULL )
179 goto fail;
180 memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
182 /*
183 * Map Xen segments into every VCPU's GDT, irrespective of whether every
184 * VCPU will actually be used. This avoids an NMI race during context
185 * switch: if we take an interrupt after switching CR3 but before switching
186 * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
187 * try to load CS from an invalid table.
188 */
189 gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
190 for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
191 d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
192 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
194 #if defined(__i386__)
196 mapcache_init(d);
198 #else /* __x86_64__ */
200 if ( (pg = alloc_domheap_page(NULL)) == NULL )
201 goto fail;
202 d->arch.mm_perdomain_l2 = clear_page(page_to_virt(pg));
203 for ( i = 0; i < (1 << pdpt_order); i++ )
204 d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
205 l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
206 __PAGE_HYPERVISOR);
208 if ( (pg = alloc_domheap_page(NULL)) == NULL )
209 goto fail;
210 d->arch.mm_perdomain_l3 = clear_page(page_to_virt(pg));
211 d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
212 l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
213 __PAGE_HYPERVISOR);
215 #endif /* __x86_64__ */
217 shadow_lock_init(d);
218 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
219 INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
220 INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
221 INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
222 INIT_LIST_HEAD(&d->arch.shadow.pinned_shadows);
224 if ( !is_idle_domain(d) )
225 {
226 d->arch.ioport_caps =
227 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
228 if ( d->arch.ioport_caps == NULL )
229 goto fail;
231 if ( (d->shared_info = alloc_xenheap_page()) == NULL )
232 goto fail;
234 memset(d->shared_info, 0, PAGE_SIZE);
235 share_xen_page_with_guest(
236 virt_to_page(d->shared_info), d, XENSHARE_writable);
237 }
239 return is_hvm_domain(d) ? hvm_domain_initialise(d) : 0;
241 fail:
242 free_xenheap_page(d->shared_info);
243 #ifdef __x86_64__
244 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
245 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
246 #endif
247 free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
248 return rc;
249 }
251 void arch_domain_destroy(struct domain *d)
252 {
253 struct vcpu *v;
255 if ( is_hvm_domain(d) )
256 {
257 for_each_vcpu ( d, v )
258 hvm_vcpu_destroy(v);
259 hvm_domain_destroy(d);
260 }
262 shadow_final_teardown(d);
264 free_xenheap_pages(
265 d->arch.mm_perdomain_pt,
266 get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
268 #ifdef __x86_64__
269 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
270 free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
271 #endif
273 free_xenheap_page(d->shared_info);
274 }
276 /* This is called by arch_final_setup_guest and do_boot_vcpu */
277 int arch_set_info_guest(
278 struct vcpu *v, struct vcpu_guest_context *c)
279 {
280 struct domain *d = v->domain;
281 unsigned long cr3_pfn = INVALID_MFN;
282 int i, rc;
284 if ( !is_hvm_vcpu(v) )
285 {
286 fixup_guest_stack_selector(c->user_regs.ss);
287 fixup_guest_stack_selector(c->kernel_ss);
288 fixup_guest_code_selector(c->user_regs.cs);
290 #ifdef __i386__
291 fixup_guest_code_selector(c->event_callback_cs);
292 fixup_guest_code_selector(c->failsafe_callback_cs);
293 #endif
295 for ( i = 0; i < 256; i++ )
296 fixup_guest_code_selector(c->trap_ctxt[i].cs);
298 /* LDT safety checks. */
299 if ( ((c->ldt_base & (PAGE_SIZE-1)) != 0) ||
300 (c->ldt_ents > 8192) ||
301 !array_access_ok(c->ldt_base, c->ldt_ents, LDT_ENTRY_SIZE) )
302 return -EINVAL;
303 }
305 clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
306 if ( c->flags & VGCF_i387_valid )
307 set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags);
309 v->arch.flags &= ~TF_kernel_mode;
310 if ( (c->flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
311 v->arch.flags |= TF_kernel_mode;
313 memcpy(&v->arch.guest_context, c, sizeof(*c));
315 /* Only CR0.TS is modifiable by guest or admin. */
316 v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS;
317 v->arch.guest_context.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
319 init_int80_direct_trap(v);
321 if ( !is_hvm_vcpu(v) )
322 {
323 /* IOPL privileges are virtualised. */
324 v->arch.iopl = (v->arch.guest_context.user_regs.eflags >> 12) & 3;
325 v->arch.guest_context.user_regs.eflags &= ~EF_IOPL;
327 /* Ensure real hardware interrupts are enabled. */
328 v->arch.guest_context.user_regs.eflags |= EF_IE;
329 }
330 else
331 {
332 hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
333 }
335 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
336 return 0;
338 memset(v->arch.guest_context.debugreg, 0,
339 sizeof(v->arch.guest_context.debugreg));
340 for ( i = 0; i < 8; i++ )
341 (void)set_debugreg(v, i, c->debugreg[i]);
343 if ( v->vcpu_id == 0 )
344 d->vm_assist = c->vm_assist;
346 if ( !is_hvm_vcpu(v) )
347 {
348 if ( (rc = (int)set_gdt(v, c->gdt_frames, c->gdt_ents)) != 0 )
349 return rc;
351 cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c->ctrlreg[3]));
353 if ( shadow_mode_refcounts(d)
354 ? !get_page(mfn_to_page(cr3_pfn), d)
355 : !get_page_and_type(mfn_to_page(cr3_pfn), d,
356 PGT_base_page_table) )
357 {
358 destroy_gdt(v);
359 return -EINVAL;
360 }
362 v->arch.guest_table = pagetable_from_pfn(cr3_pfn);
363 }
365 /* Shadow: make sure the domain has enough shadow memory to
366 * boot another vcpu */
367 if ( shadow_mode_enabled(d)
368 && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) )
369 {
370 destroy_gdt(v);
371 return -ENOMEM;
372 }
374 if ( v->vcpu_id == 0 )
375 update_domain_wallclock_time(d);
377 /* Don't redo final setup */
378 set_bit(_VCPUF_initialised, &v->vcpu_flags);
380 if ( shadow_mode_enabled(d) )
381 shadow_update_paging_modes(v);
383 update_cr3(v);
385 return 0;
386 }
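/*
 * Arch-specific VCPU operations.  The only one handled here is registering
 * a guest memory area into which this vcpu's runstate information is
 * copied on every context switch.
 */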
388 long
389 arch_do_vcpu_op(
390 int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
391 {
392 long rc = 0;
394 switch ( cmd )
395 {
396 case VCPUOP_register_runstate_memory_area:
397 {
398 struct vcpu_register_runstate_memory_area area;
399 struct vcpu_runstate_info runstate;
401 rc = -EFAULT;
402 if ( copy_from_guest(&area, arg, 1) )
403 break;
405 if ( !guest_handle_okay(area.addr.h, 1) )
406 break;
408 rc = 0;
409 v->runstate_guest = area.addr.h;
411 if ( v == current )
412 {
413 __copy_to_guest(v->runstate_guest, &v->runstate, 1);
414 }
415 else
416 {
417 vcpu_runstate_get(v, &runstate);
418 __copy_to_guest(v->runstate_guest, &runstate, 1);
419 }
421 break;
422 }
424 default:
425 rc = -ENOSYS;
426 break;
427 }
429 return rc;
430 }
432 #ifdef __x86_64__
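/*
 * Load a guest selector into a data segment register.  Evaluates to 1 on
 * success; if the load faults, the .fixup handler loads the null selector
 * instead and the expression evaluates to 0.
 */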
434 #define loadsegment(seg,value) ({ \
435 int __r = 1; \
436 __asm__ __volatile__ ( \
437 "1: movl %k1,%%" #seg "\n2:\n" \
438 ".section .fixup,\"ax\"\n" \
439 "3: xorl %k0,%k0\n" \
440 " movl %k0,%%" #seg "\n" \
441 " jmp 2b\n" \
442 ".previous\n" \
443 ".section __ex_table,\"a\"\n" \
444 " .align 8\n" \
445 " .quad 1b,3b\n" \
446 ".previous" \
447 : "=r" (__r) : "r" (value), "0" (__r) );\
448 __r; })
450 /*
451 * save_segments() writes a mask of segments which are dirty (non-zero),
452 * allowing load_segments() to avoid some expensive segment loads and
453 * MSR writes.
454 */
455 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
456 #define DIRTY_DS 0x01
457 #define DIRTY_ES 0x02
458 #define DIRTY_FS 0x04
459 #define DIRTY_GS 0x08
460 #define DIRTY_FS_BASE 0x10
461 #define DIRTY_GS_BASE_USER 0x20
463 static void load_segments(struct vcpu *n)
464 {
465 struct vcpu_guest_context *nctxt = &n->arch.guest_context;
466 int all_segs_okay = 1;
467 unsigned int dirty_segment_mask, cpu = smp_processor_id();
469 /* Load and clear the dirty segment mask. */
470 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
471 per_cpu(dirty_segment_mask, cpu) = 0;
473 /* Either selector != 0 ==> reload. */
474 if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
475 all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
477 /* Either selector != 0 ==> reload. */
478 if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
479 all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
481 /*
482 * Either selector != 0 ==> reload.
483 * Also reload to reset FS_BASE if it was non-zero.
484 */
485 if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
486 nctxt->user_regs.fs) )
487 all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
489 /*
490 * Either selector != 0 ==> reload.
491 * Also reload to reset GS_BASE if it was non-zero.
492 */
493 if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
494 nctxt->user_regs.gs) )
495 {
496 /* Reset GS_BASE with user %gs? */
497 if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
498 all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
499 }
501 /* This can only be non-zero if selector is NULL. */
502 if ( nctxt->fs_base )
503 wrmsr(MSR_FS_BASE,
504 nctxt->fs_base,
505 nctxt->fs_base>>32);
507 /* Most kernels have non-zero GS base, so don't bother testing. */
508 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
509 wrmsr(MSR_SHADOW_GS_BASE,
510 nctxt->gs_base_kernel,
511 nctxt->gs_base_kernel>>32);
513 /* This can only be non-zero if selector is NULL. */
514 if ( nctxt->gs_base_user )
515 wrmsr(MSR_GS_BASE,
516 nctxt->gs_base_user,
517 nctxt->gs_base_user>>32);
519 /* If in kernel mode then switch the GS bases around. */
520 if ( n->arch.flags & TF_kernel_mode )
521 __asm__ __volatile__ ( "swapgs" );
523 if ( unlikely(!all_segs_okay) )
524 {
525 struct cpu_user_regs *regs = guest_cpu_user_regs();
526 unsigned long *rsp =
527 (n->arch.flags & TF_kernel_mode) ?
528 (unsigned long *)regs->rsp :
529 (unsigned long *)nctxt->kernel_sp;
530 unsigned long cs_and_mask, rflags;
532 if ( !(n->arch.flags & TF_kernel_mode) )
533 toggle_guest_mode(n);
534 else
535 regs->cs &= ~3;
537 /* CS longword also contains full evtchn_upcall_mask. */
538 cs_and_mask = (unsigned long)regs->cs |
539 ((unsigned long)n->vcpu_info->evtchn_upcall_mask << 32);
541 /* Fold upcall mask into RFLAGS.IF. */
542 rflags = regs->rflags & ~X86_EFLAGS_IF;
543 rflags |= !n->vcpu_info->evtchn_upcall_mask << 9;
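/*
 * Build the failsafe callback frame on the guest kernel stack:
 * ss, rsp, rflags, cs(+upcall mask), rip, gs, fs, es, ds, r11 and rcx
 * are stored at rsp-1 ... rsp-11 respectively.
 */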
545 if ( put_user(regs->ss, rsp- 1) |
546 put_user(regs->rsp, rsp- 2) |
547 put_user(rflags, rsp- 3) |
548 put_user(cs_and_mask, rsp- 4) |
549 put_user(regs->rip, rsp- 5) |
550 put_user(nctxt->user_regs.gs, rsp- 6) |
551 put_user(nctxt->user_regs.fs, rsp- 7) |
552 put_user(nctxt->user_regs.es, rsp- 8) |
553 put_user(nctxt->user_regs.ds, rsp- 9) |
554 put_user(regs->r11, rsp-10) |
555 put_user(regs->rcx, rsp-11) )
556 {
557 gdprintk(XENLOG_ERR, "Error while creating failsafe "
558 "callback frame.\n");
559 domain_crash(n->domain);
560 }
562 if ( test_bit(_VGCF_failsafe_disables_events,
563 &n->arch.guest_context.flags) )
564 n->vcpu_info->evtchn_upcall_mask = 1;
566 regs->entry_vector = TRAP_syscall;
567 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
568 X86_EFLAGS_NT|X86_EFLAGS_TF);
569 regs->ss = FLAT_KERNEL_SS;
570 regs->rsp = (unsigned long)(rsp-11);
571 regs->cs = FLAT_KERNEL_CS;
572 regs->rip = nctxt->failsafe_callback_eip;
573 }
574 }
576 static void save_segments(struct vcpu *v)
577 {
578 struct vcpu_guest_context *ctxt = &v->arch.guest_context;
579 struct cpu_user_regs *regs = &ctxt->user_regs;
580 unsigned int dirty_segment_mask = 0;
582 regs->ds = read_segment_register(ds);
583 regs->es = read_segment_register(es);
584 regs->fs = read_segment_register(fs);
585 regs->gs = read_segment_register(gs);
587 if ( regs->ds )
588 dirty_segment_mask |= DIRTY_DS;
590 if ( regs->es )
591 dirty_segment_mask |= DIRTY_ES;
593 if ( regs->fs )
594 {
595 dirty_segment_mask |= DIRTY_FS;
596 ctxt->fs_base = 0; /* != 0 selector kills fs_base */
597 }
598 else if ( ctxt->fs_base )
599 {
600 dirty_segment_mask |= DIRTY_FS_BASE;
601 }
603 if ( regs->gs )
604 {
605 dirty_segment_mask |= DIRTY_GS;
606 ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
607 }
608 else if ( ctxt->gs_base_user )
609 {
610 dirty_segment_mask |= DIRTY_GS_BASE_USER;
611 }
613 this_cpu(dirty_segment_mask) = dirty_segment_mask;
614 }
616 #define switch_kernel_stack(v) ((void)0)
618 #elif defined(__i386__)
620 #define load_segments(n) ((void)0)
621 #define save_segments(p) ((void)0)
623 static inline void switch_kernel_stack(struct vcpu *v)
624 {
625 struct tss_struct *tss = &init_tss[smp_processor_id()];
626 tss->esp1 = v->arch.guest_context.kernel_sp;
627 tss->ss1 = v->arch.guest_context.kernel_ss;
628 }
630 #endif /* __i386__ */
632 static void paravirt_ctxt_switch_from(struct vcpu *v)
633 {
634 save_segments(v);
635 }
637 static void paravirt_ctxt_switch_to(struct vcpu *v)
638 {
639 set_int80_direct_trap(v);
640 switch_kernel_stack(v);
641 }
643 #define loaddebug(_v,_reg) \
644 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
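/*
 * Perform the real state switch: save the outgoing vcpu's register, FPU
 * and segment state, load the incoming vcpu's state (including debug
 * registers and, if the vcpu id changes, its GDT mapping), and switch
 * page tables.  Called with interrupts disabled; updates this CPU's
 * curr_vcpu.
 */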
646 static void __context_switch(void)
647 {
648 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
649 unsigned int cpu = smp_processor_id();
650 struct vcpu *p = per_cpu(curr_vcpu, cpu);
651 struct vcpu *n = current;
653 ASSERT(p != n);
654 ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
656 if ( !is_idle_vcpu(p) )
657 {
658 memcpy(&p->arch.guest_context.user_regs,
659 stack_regs,
660 CTXT_SWITCH_STACK_BYTES);
661 unlazy_fpu(p);
662 p->arch.ctxt_switch_from(p);
663 }
665 if ( !is_idle_vcpu(n) )
666 {
667 memcpy(stack_regs,
668 &n->arch.guest_context.user_regs,
669 CTXT_SWITCH_STACK_BYTES);
671 /* Maybe switch the debug registers. */
672 if ( unlikely(n->arch.guest_context.debugreg[7]) )
673 {
674 loaddebug(&n->arch.guest_context, 0);
675 loaddebug(&n->arch.guest_context, 1);
676 loaddebug(&n->arch.guest_context, 2);
677 loaddebug(&n->arch.guest_context, 3);
678 /* no 4 and 5 */
679 loaddebug(&n->arch.guest_context, 6);
680 loaddebug(&n->arch.guest_context, 7);
681 }
682 n->arch.ctxt_switch_to(n);
683 }
685 if ( p->domain != n->domain )
686 cpu_set(cpu, n->domain->domain_dirty_cpumask);
687 cpu_set(cpu, n->vcpu_dirty_cpumask);
689 write_ptbase(n);
691 if ( p->vcpu_id != n->vcpu_id )
692 {
693 char gdt_load[10];
694 *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
695 *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
696 __asm__ __volatile__ ( "lgdt %0" : "=m" (gdt_load) );
697 }
699 if ( p->domain != n->domain )
700 cpu_clear(cpu, p->domain->domain_dirty_cpumask);
701 cpu_clear(cpu, p->vcpu_dirty_cpumask);
703 per_cpu(curr_vcpu, cpu) = n;
704 }
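/*
 * Scheduler entry point for switching from 'prev' to 'next'.  The heavy
 * __context_switch() is skipped when switching to the idle vcpu or when
 * this CPU already holds 'next's state, so the real switch may be
 * performed lazily later.
 */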
707 void context_switch(struct vcpu *prev, struct vcpu *next)
708 {
709 unsigned int cpu = smp_processor_id();
710 cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
712 ASSERT(local_irq_is_enabled());
714 /* Allow at most one CPU at a time to be dirty. */
715 ASSERT(cpus_weight(dirty_mask) <= 1);
716 if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
717 {
718 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
719 if ( !cpus_empty(next->vcpu_dirty_cpumask) )
720 flush_tlb_mask(next->vcpu_dirty_cpumask);
721 }
723 local_irq_disable();
725 set_current(next);
727 if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
728 {
729 local_irq_enable();
730 }
731 else
732 {
733 __context_switch();
735 /* Re-enable interrupts before restoring state which may fault. */
736 local_irq_enable();
738 if ( !is_hvm_vcpu(next) )
739 {
740 load_LDT(next);
741 load_segments(next);
742 }
743 }
745 context_saved(prev);
747 /* Update per-VCPU guest runstate shared memory area (if registered). */
748 if ( !guest_handle_is_null(next->runstate_guest) )
749 __copy_to_guest(next->runstate_guest, &next->runstate, 1);
751 schedule_tail(next);
752 BUG();
753 }
755 void continue_running(struct vcpu *same)
756 {
757 schedule_tail(same);
758 BUG();
759 }
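/*
 * Complete any lazily-deferred context switch on this CPU: if it is
 * running the idle vcpu but still holds another vcpu's state, perform the
 * real __context_switch() now.  Returns non-zero if a switch was needed.
 */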
761 int __sync_lazy_execstate(void)
762 {
763 unsigned long flags;
764 int switch_required;
766 local_irq_save(flags);
768 switch_required = (this_cpu(curr_vcpu) != current);
770 if ( switch_required )
771 {
772 ASSERT(current == idle_vcpu[smp_processor_id()]);
773 __context_switch();
774 }
776 local_irq_restore(flags);
778 return switch_required;
779 }
781 void sync_vcpu_execstate(struct vcpu *v)
782 {
783 if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
784 (void)__sync_lazy_execstate();
786 /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
787 flush_tlb_mask(v->vcpu_dirty_cpumask);
788 }
790 #define next_arg(fmt, args) ({ \
791 unsigned long __arg; \
792 switch ( *(fmt)++ ) \
793 { \
794 case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
795 case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
796 case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
797 default: __arg = 0; BUG(); \
798 } \
799 __arg; \
800 })
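/*
 * Arrange for the current hypercall to be restarted: either mark the
 * enclosing multicall entry as preempted, or rewind the guest instruction
 * pointer so the hypercall is re-executed, reloading its argument
 * registers from the values supplied here.
 */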
802 unsigned long hypercall_create_continuation(
803 unsigned int op, const char *format, ...)
804 {
805 struct mc_state *mcs = &this_cpu(mc_state);
806 struct cpu_user_regs *regs;
807 const char *p = format;
808 unsigned long arg;
809 unsigned int i;
810 va_list args;
812 va_start(args, format);
814 if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
815 {
816 __set_bit(_MCSF_call_preempted, &mcs->flags);
818 for ( i = 0; *p != '\0'; i++ )
819 mcs->call.args[i] = next_arg(p, args);
820 }
821 else
822 {
823 regs = guest_cpu_user_regs();
824 #if defined(__i386__)
825 regs->eax = op;
827 if ( supervisor_mode_kernel || is_hvm_vcpu(current) )
828 regs->eip &= ~31; /* re-execute entire hypercall entry stub */
829 else
830 regs->eip -= 2; /* re-execute 'int 0x82' */
832 for ( i = 0; *p != '\0'; i++ )
833 {
834 arg = next_arg(p, args);
835 switch ( i )
836 {
837 case 0: regs->ebx = arg; break;
838 case 1: regs->ecx = arg; break;
839 case 2: regs->edx = arg; break;
840 case 3: regs->esi = arg; break;
841 case 4: regs->edi = arg; break;
842 case 5: regs->ebp = arg; break;
843 }
844 }
845 #elif defined(__x86_64__)
846 regs->rax = op;
847 regs->rip -= 2; /* re-execute 'syscall' */
849 for ( i = 0; *p != '\0'; i++ )
850 {
851 arg = next_arg(p, args);
852 switch ( i )
853 {
854 case 0: regs->rdi = arg; break;
855 case 1: regs->rsi = arg; break;
856 case 2: regs->rdx = arg; break;
857 case 3: regs->r10 = arg; break;
858 case 4: regs->r8 = arg; break;
859 case 5: regs->r9 = arg; break;
860 }
861 }
862 #endif
863 }
865 va_end(args);
867 return op;
868 }
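/*
 * Walk a page list belonging to a dying domain, dropping the 'pinned' and
 * 'allocated' references on each page and forcibly invalidating top-level
 * page tables to break circular 'linear page table' references.
 */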
870 static void relinquish_memory(struct domain *d, struct list_head *list)
871 {
872 struct list_head *ent;
873 struct page_info *page;
874 unsigned long x, y;
876 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
877 spin_lock_recursive(&d->page_alloc_lock);
879 ent = list->next;
880 while ( ent != list )
881 {
882 page = list_entry(ent, struct page_info, list);
884 /* Grab a reference to the page so it won't disappear from under us. */
885 if ( unlikely(!get_page(page, d)) )
886 {
887 /* Couldn't get a reference -- someone is freeing this page. */
888 ent = ent->next;
889 continue;
890 }
892 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
893 put_page_and_type(page);
895 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
896 put_page(page);
898 /*
899 * Forcibly invalidate base page tables at this point to break circular
900 * 'linear page table' references. This is okay because MMU structures
901 * are not shared across domains and this domain is now dead. Thus base
902 * tables are not in use so a non-zero count means circular reference.
903 */
904 y = page->u.inuse.type_info;
905 for ( ; ; )
906 {
907 x = y;
908 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
909 (PGT_base_page_table|PGT_validated)) )
910 break;
912 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
913 if ( likely(y == x) )
914 {
915 free_page_type(page, PGT_base_page_table);
916 break;
917 }
918 }
920 /* Follow the list chain and /then/ potentially free the page. */
921 ent = ent->next;
922 put_page(page);
923 }
925 spin_unlock_recursive(&d->page_alloc_lock);
926 }
928 void domain_relinquish_resources(struct domain *d)
929 {
930 struct vcpu *v;
931 unsigned long pfn;
933 BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
935 /* Drop the in-use references to page-table bases. */
936 for_each_vcpu ( d, v )
937 {
938 /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
939 * or sh_update_paging_modes()) */
940 pfn = pagetable_get_pfn(v->arch.guest_table);
941 if ( pfn != 0 )
942 {
943 if ( shadow_mode_refcounts(d) )
944 put_page(mfn_to_page(pfn));
945 else
946 put_page_and_type(mfn_to_page(pfn));
947 v->arch.guest_table = pagetable_null();
948 }
950 #ifdef __x86_64__
951 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
952 pfn = pagetable_get_pfn(v->arch.guest_table_user);
953 if ( pfn != 0 )
954 {
955 if ( shadow_mode_refcounts(d) )
956 put_page(mfn_to_page(pfn));
957 else
958 put_page_and_type(mfn_to_page(pfn));
959 v->arch.guest_table_user = pagetable_null();
960 }
961 #endif
962 }
964 /* Tear down shadow mode stuff. */
965 shadow_teardown(d);
967 /*
968 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
969 * it automatically gets squashed when the guest's mappings go away.
970 */
971 for_each_vcpu(d, v)
972 destroy_gdt(v);
974 /* Relinquish every page of memory. */
975 relinquish_memory(d, &d->xenpage_list);
976 relinquish_memory(d, &d->page_list);
978 /* Free page used by xen oprofile buffer */
979 free_xenoprof_pages(d);
980 }
982 void arch_dump_domain_info(struct domain *d)
983 {
984 if ( shadow_mode_enabled(d) )
985 {
986 printk(" shadow mode: ");
987 if ( d->arch.shadow.mode & SHM2_enable )
988 printk("enabled ");
989 if ( shadow_mode_refcounts(d) )
990 printk("refcounts ");
991 if ( shadow_mode_log_dirty(d) )
992 printk("log_dirty ");
993 if ( shadow_mode_translate(d) )
994 printk("translate ");
995 if ( shadow_mode_external(d) )
996 printk("external ");
997 printk("\n");
998 }
999 }
1001 void arch_dump_vcpu_info(struct vcpu *v)
1002 {
1003 if ( shadow_mode_enabled(v->domain) )
1004 {
1005 if ( v->arch.shadow.mode )
1006 printk(" shadowed %u-on-%u, %stranslated\n",
1007 v->arch.shadow.mode->guest_levels,
1008 v->arch.shadow.mode->shadow_levels,
1009 shadow_vcpu_mode_translate(v) ? "" : "not ");
1010 else
1011 printk(" not shadowed\n");
1012 }
1013 }
1015 /*
1016 * Local variables:
1017 * mode: C
1018 * c-set-style: "BSD"
1019 * c-basic-offset: 4
1020 * tab-width: 4
1021 * indent-tabs-mode: nil
1022 * End:
1023 */