ia64/xen-unstable: xen/arch/x86/x86_32/traps.c @ 17965:14fd83fe71c3

Add facility to get notification of domain suspend by event channel.
This event channel will be notified when the domain transitions to the
suspended state, which can be much faster than raising VIRQ_DOM_EXC
and waiting for the notification to be propagated via xenstore.

No attempt is made here to prevent multiple subscribers (last one
wins), or to detect that the subscriber has gone away. Userspace tools
should take care.

Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jul 04 12:00:24 2008 +0100 (2008-07-04)
parents 6b1795ee1b19
children d711529e3de1
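
For orientation, a minimal sketch of how a dom0 tool might consume this notification. It is not part of this changeset: the wrapper name xc_domain_subscribe_for_suspend() and the port direction (subscribing the guest's end of an interdomain channel whose other end dom0 has bound) are assumptions to be checked against the tree's tools/libxc headers; the xc_evtchn_* calls follow the libxc interface of this era.

/* Sketch only -- names marked "assumed" are not taken from this changeset. */
#include <xenctrl.h>

static int wait_for_suspend(int xc_handle, uint32_t domid,
                            evtchn_port_t remote_port)
{
    int xce = xc_evtchn_open();
    evtchn_port_t local_port, fired;

    if ( xce < 0 )
        return -1;

    /* Bind our end of an interdomain channel to the guest's port. */
    local_port = xc_evtchn_bind_interdomain(xce, domid, remote_port);
    if ( (int)local_port < 0 )
        goto fail;

    /* Ask Xen to fire this channel when the domain suspends
     * (assumed wrapper for the new hypercall; last subscriber wins). */
    if ( xc_domain_subscribe_for_suspend(xc_handle, domid, remote_port) < 0 )
        goto fail;

    /* Block until our channel fires, ignoring any other pending ports. */
    do {
        fired = xc_evtchn_pending(xce);
        xc_evtchn_unmask(xce, fired);
    } while ( fired != local_port );

    xc_evtchn_close(xce);
    return 0;

 fail:
    xc_evtchn_close(xce);
    return -1;
}
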
line source
#include <xen/config.h>
#include <xen/version.h>
#include <xen/domain_page.h>
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/console.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/symbols.h>
#include <xen/shutdown.h>
#include <xen/nmi.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>

#include <public/callback.h>

static void print_xen_info(void)
{
    char taint_str[TAINT_STRING_MAX_LEN];
    char debug = 'n', *arch = "x86_32p";

#ifndef NDEBUG
    debug = 'y';
#endif

    printk("----[ Xen-%d.%d%s %s debug=%c %s ]----\n",
           xen_major_version(), xen_minor_version(), xen_extra_version(),
           arch, debug, print_tainted(taint_str));
}

static void _show_registers(const struct cpu_user_regs *regs,
                            unsigned long crs[8], int guest_mode,
                            const char *context)
{
    printk("EIP: %04x:[<%08x>]", regs->cs, regs->eip);
    if ( !guest_mode )
        print_symbol(" %s", regs->eip);
    printk("\nEFLAGS: %08x CONTEXT: %s\n", regs->eflags, context);
    printk("eax: %08x ebx: %08x ecx: %08x edx: %08x\n",
           regs->eax, regs->ebx, regs->ecx, regs->edx);
    printk("esi: %08x edi: %08x ebp: %08x esp: %08x\n",
           regs->esi, regs->edi, regs->ebp, regs->esp);
    printk("cr0: %08lx cr4: %08lx cr3: %08lx cr2: %08lx\n",
           crs[0], crs[4], crs[3], crs[2]);
    printk("ds: %04x es: %04x fs: %04x gs: %04x "
           "ss: %04x cs: %04x\n",
           regs->ds, regs->es, regs->fs,
           regs->gs, regs->ss, regs->cs);
}

void show_registers(struct cpu_user_regs *regs)
{
    struct cpu_user_regs fault_regs = *regs;
    unsigned long fault_crs[8];
    const char *context;
    struct vcpu *v = current;

    if ( is_hvm_vcpu(v) && guest_mode(regs) )
    {
        struct segment_register sreg;
        context = "hvm";
        fault_crs[0] = v->arch.hvm_vcpu.guest_cr[0];
        fault_crs[2] = v->arch.hvm_vcpu.guest_cr[2];
        fault_crs[3] = v->arch.hvm_vcpu.guest_cr[3];
        fault_crs[4] = v->arch.hvm_vcpu.guest_cr[4];
        hvm_get_segment_register(v, x86_seg_cs, &sreg);
        fault_regs.cs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_ds, &sreg);
        fault_regs.ds = sreg.sel;
        hvm_get_segment_register(v, x86_seg_es, &sreg);
        fault_regs.es = sreg.sel;
        hvm_get_segment_register(v, x86_seg_fs, &sreg);
        fault_regs.fs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_gs, &sreg);
        fault_regs.gs = sreg.sel;
        hvm_get_segment_register(v, x86_seg_ss, &sreg);
        fault_regs.ss = sreg.sel;
    }
    else
    {
        if ( !guest_mode(regs) )
        {
            context = "hypervisor";
            fault_regs.esp = (unsigned long)&regs->esp;
            fault_regs.ss = read_segment_register(ss);
            fault_regs.ds = read_segment_register(ds);
            fault_regs.es = read_segment_register(es);
            fault_regs.fs = read_segment_register(fs);
            fault_regs.gs = read_segment_register(gs);
            fault_crs[2] = read_cr2();
        }
        else
        {
            context = "guest";
            fault_crs[2] = v->vcpu_info->arch.cr2;
        }

        fault_crs[0] = read_cr0();
        fault_crs[3] = read_cr3();
        fault_crs[4] = read_cr4();
    }

    print_xen_info();
    printk("CPU: %d\n", smp_processor_id());
    _show_registers(&fault_regs, fault_crs, guest_mode(regs), context);

    if ( this_cpu(ler_msr) && !guest_mode(regs) )
    {
        u32 from, to, hi;
        rdmsr(this_cpu(ler_msr), from, hi);
        rdmsr(this_cpu(ler_msr) + 1, to, hi);
        printk("ler: %08x -> %08x\n", from, to);
    }
}

void vcpu_show_registers(const struct vcpu *v)
{
    unsigned long crs[8];

    /* No need to handle HVM for now. */
    if ( is_hvm_vcpu(v) )
        return;

    crs[0] = v->arch.guest_context.ctrlreg[0];
    crs[2] = v->vcpu_info->arch.cr2;
    crs[3] = pagetable_get_paddr(v->arch.guest_table);
    crs[4] = v->arch.guest_context.ctrlreg[4];

    _show_registers(&v->arch.guest_context.user_regs, crs, 1, "guest");
}

void show_page_walk(unsigned long addr)
{
    unsigned long pfn, mfn, cr3 = read_cr3();
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;

    printk("Pagetable walk from %08lx:\n", addr);

    mfn = cr3 >> PAGE_SHIFT;

    l3t = map_domain_page(mfn);
    l3t += (cr3 & 0xFE0UL) >> 3;
    l3e = l3t[l3_table_offset(addr)];
    mfn = l3e_get_pfn(l3e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L3[0x%03lx] = %"PRIpte" %08lx\n",
           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
    unmap_domain_page(l3t);
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
        return;

    l2t = map_domain_page(mfn);
    l2e = l2t[l2_table_offset(addr)];
    mfn = l2e_get_pfn(l2e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L2[0x%03lx] = %"PRIpte" %08lx %s\n",
           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
           (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
    unmap_domain_page(l2t);
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
         (l2e_get_flags(l2e) & _PAGE_PSE) )
        return;

    l1t = map_domain_page(mfn);
    l1e = l1t[l1_table_offset(addr)];
    mfn = l1e_get_pfn(l1e);
    pfn = mfn_valid(mfn) ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY;
    printk(" L1[0x%03lx] = %"PRIpte" %08lx\n",
           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
    unmap_domain_page(l1t);
}

#define DOUBLEFAULT_STACK_SIZE 2048
static struct tss_struct doublefault_tss;
static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];

asmlinkage void do_double_fault(void)
{
    struct tss_struct *tss = &doublefault_tss;
    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;

    watchdog_disable();

    console_force_unlock();

    /* Find information saved during fault and dump it to the console. */
    tss = &init_tss[cpu];
    printk("*** DOUBLE FAULT ***\n");
    print_xen_info();
    printk("CPU: %d\nEIP: %04x:[<%08x>]",
           cpu, tss->cs, tss->eip);
    print_symbol(" %s\n", tss->eip);
    printk("EFLAGS: %08x\n", tss->eflags);
    printk("CR3: %08x\n", tss->__cr3);
    printk("eax: %08x ebx: %08x ecx: %08x edx: %08x\n",
           tss->eax, tss->ebx, tss->ecx, tss->edx);
    printk("esi: %08x edi: %08x ebp: %08x esp: %08x\n",
           tss->esi, tss->edi, tss->ebp, tss->esp);
    printk("ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
           tss->ds, tss->es, tss->fs, tss->gs, tss->ss);
    show_stack_overflow(cpu, tss->esp);

    panic("DOUBLE FAULT -- system shutdown\n");
}

unsigned long do_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct vcpu *v = current;
    u32 eflags;

    /* Check worst-case stack frame for overlap with Xen protected area. */
    if ( unlikely(!access_ok(regs->esp, 40)) )
        goto exit_and_crash;

    /* Pop and restore EAX (clobbered by hypercall). */
    if ( unlikely(__copy_from_user(&regs->eax, (void *)regs->esp, 4)) )
        goto exit_and_crash;
    regs->esp += 4;

    /* Pop and restore CS and EIP. */
    if ( unlikely(__copy_from_user(&regs->eip, (void *)regs->esp, 8)) )
        goto exit_and_crash;
    regs->esp += 8;

    /*
     * Pop, fix up and restore EFLAGS. We fix up in a local staging area
     * to avoid firing the BUG_ON(IOPL) check in arch_get_info_guest.
     */
    if ( unlikely(__copy_from_user(&eflags, (void *)regs->esp, 4)) )
        goto exit_and_crash;
    regs->esp += 4;
    regs->eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF;

    if ( vm86_mode(regs) )
    {
        /* Return to VM86 mode: pop and restore ESP,SS,ES,DS,FS and GS. */
        if ( __copy_from_user(&regs->esp, (void *)regs->esp, 24) )
            goto exit_and_crash;
    }
    else if ( unlikely(ring_0(regs)) )
    {
        goto exit_and_crash;
    }
    else if ( !ring_1(regs) )
    {
        /* Return to ring 2/3: pop and restore ESP and SS. */
        if ( __copy_from_user(&regs->esp, (void *)regs->esp, 8) )
            goto exit_and_crash;
    }

    /* Restore affinity. */
    if ( !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity) )
        vcpu_set_affinity(v, &v->cpu_affinity_tmp);

    /* No longer in NMI context. */
    v->nmi_masked = 0;

    /* Restore upcall mask from supplied EFLAGS.IF. */
    vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);

    /*
     * The hypercall exit path will overwrite EAX with this return
     * value.
     */
    return regs->eax;

 exit_and_crash:
    gdprintk(XENLOG_ERR, "Fatal error\n");
    domain_crash(v->domain);
    return 0;
}

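/*
 * Added for clarity (not in the original file): the frame do_iret()
 * consumes, lowest address first, as left on the guest stack by e.g.
 * the HYPERVISOR_iret transfer stub below (which pushes EAX on top of
 * the return frame the guest built):
 *
 *     eax               -- popped first (clobbered by the hypercall)
 *     eip, cs           -- return point
 *     eflags            -- IOPL stripped, IF copied into the upcall mask
 *     esp, ss           -- only consumed for ring 2/3 and VM86 returns
 *     es, ds, fs, gs    -- only consumed for VM86 returns
 *
 * Worst case 10 words = 40 bytes, which is what the access_ok() check
 * above guards against.
 */
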
static void set_task_gate(unsigned int n, unsigned int sel)
{
    idt_table[n].b = 0;
    wmb(); /* disable gate /then/ rewrite */
    idt_table[n].a = sel << 16;
    wmb(); /* rewrite /then/ enable gate */
    idt_table[n].b = 0x8500;
}

void __devinit subarch_percpu_traps_init(void)
{
    struct tss_struct *tss = &doublefault_tss;
    asmlinkage int hypercall(void);

    if ( smp_processor_id() != 0 )
        return;

    /* The hypercall entry vector is only accessible from ring 1. */
    _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall);

    /*
     * Make a separate task for double faults. This will get us debug output if
     * we blow the kernel stack.
     */
    memset(tss, 0, sizeof(*tss));
    tss->ds = __HYPERVISOR_DS;
    tss->es = __HYPERVISOR_DS;
    tss->ss = __HYPERVISOR_DS;
    tss->esp = (unsigned long)&doublefault_stack[DOUBLEFAULT_STACK_SIZE];
    tss->__cr3 = __pa(idle_pg_table);
    tss->cs = __HYPERVISOR_CS;
    tss->eip = (unsigned long)do_double_fault;
    tss->eflags = 2;
    tss->bitmap = IOBMP_INVALID_OFFSET;
    _set_tssldt_desc(
        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
        (unsigned long)tss, 235, 9);

    set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
}

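/*
 * Added for clarity (not in the original file): a task gate (type 0x5,
 * the 0x8500 written by set_task_gate() above) is used for the double
 * fault rather than an interrupt gate so that #DF forces a hardware task
 * switch onto doublefault_stack. If the fault was caused by running off
 * the Xen stack, the current stack cannot be trusted, and the separate
 * TSS gives do_double_fault() a known-good ESP and CR3 to dump state from.
 */
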
void init_int80_direct_trap(struct vcpu *v)
{
    struct trap_info *ti = &v->arch.guest_context.trap_ctxt[0x80];

    /*
     * We can't virtualise interrupt gates, as there's no way to get
     * the CPU to automatically clear the events_mask variable. Also we
     * must ensure that the CS is safe to poke into an interrupt gate.
     *
     * When running with supervisor_mode_kernel enabled a direct trap
     * to the guest OS cannot be used because the INT instruction will
     * switch to the Xen stack and we need to swap back to the guest
     * kernel stack before passing control to the system call entry point.
     */
    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(v->domain, ti->cs) ||
         supervisor_mode_kernel )
    {
        v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
        return;
    }

    v->arch.int80_desc.a = (ti->cs << 16) | (ti->address & 0xffff);
    v->arch.int80_desc.b =
        (ti->address & 0xffff0000) | 0x8f00 | ((TI_GET_DPL(ti) & 3) << 13);

    if ( v == current )
        set_int80_direct_trap(v);
}

#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
static void do_update_sysenter(void *info)
{
    xen_callback_t *address = info;

    wrmsr(MSR_IA32_SYSENTER_CS, address->cs, 0);
    wrmsr(MSR_IA32_SYSENTER_EIP, address->eip, 0);
}
#endif

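/*
 * Added for clarity (not in the original file): the two words built above
 * encode a 32-bit trap gate -- 0x8f00 sets Present and gate type 0xF, the
 * guest-requested DPL lands in bits 13-14, and the handler's CS:EIP is
 * split across the low and high words exactly as the CPU expects. The copy
 * into the live per-CPU IDT is done by set_int80_direct_trap(), defined
 * elsewhere in the x86 code.
 */
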
static long register_guest_callback(struct callback_register *reg)
{
    long ret = 0;
    struct vcpu *v = current;

    fixup_guest_code_selector(v->domain, reg->address.cs);

    switch ( reg->type )
    {
    case CALLBACKTYPE_event:
        v->arch.guest_context.event_callback_cs = reg->address.cs;
        v->arch.guest_context.event_callback_eip = reg->address.eip;
        break;

    case CALLBACKTYPE_failsafe:
        v->arch.guest_context.failsafe_callback_cs = reg->address.cs;
        v->arch.guest_context.failsafe_callback_eip = reg->address.eip;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_failsafe_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_failsafe_disables_events,
                      &v->arch.guest_context.flags);
        break;

#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    case CALLBACKTYPE_sysenter_deprecated:
        if ( !cpu_has_sep )
            ret = -EINVAL;
        else if ( on_each_cpu(do_update_sysenter, &reg->address, 1, 1) != 0 )
            ret = -EIO;
        break;

    case CALLBACKTYPE_sysenter:
        if ( !cpu_has_sep )
            ret = -EINVAL;
        else
            do_update_sysenter(&reg->address);
        break;
#endif

    case CALLBACKTYPE_nmi:
        ret = register_guest_nmi_callback(reg->address.eip);
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

static long unregister_guest_callback(struct callback_unregister *unreg)
{
    long ret;

    switch ( unreg->type )
    {
    case CALLBACKTYPE_event:
    case CALLBACKTYPE_failsafe:
#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    case CALLBACKTYPE_sysenter_deprecated:
    case CALLBACKTYPE_sysenter:
#endif
        ret = -EINVAL;
        break;

    case CALLBACKTYPE_nmi:
        ret = unregister_guest_nmi_callback();
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_callback_op(int cmd, XEN_GUEST_HANDLE(const_void) arg)
{
    long ret;

    switch ( cmd )
    {
    case CALLBACKOP_register:
    {
        struct callback_register reg;

        ret = -EFAULT;
        if ( copy_from_guest(&reg, arg, 1) )
            break;

        ret = register_guest_callback(&reg);
    }
    break;

    case CALLBACKOP_unregister:
    {
        struct callback_unregister unreg;

        ret = -EFAULT;
        if ( copy_from_guest(&unreg, arg, 1) )
            break;

        ret = unregister_guest_callback(&unreg);
    }
    break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_set_callbacks(unsigned long event_selector,
                      unsigned long event_address,
                      unsigned long failsafe_selector,
                      unsigned long failsafe_address)
{
    struct callback_register event = {
        .type = CALLBACKTYPE_event,
        .address = { event_selector, event_address },
    };
    struct callback_register failsafe = {
        .type = CALLBACKTYPE_failsafe,
        .address = { failsafe_selector, failsafe_address },
    };

    register_guest_callback(&event);
    register_guest_callback(&failsafe);

    return 0;
}

static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
{
    extern asmlinkage int hypercall(void);
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */

    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));

        *(u8  *)(p+ 0) = 0x9c;      /* pushf */
        *(u8  *)(p+ 1) = 0xfa;      /* cli */
        *(u8  *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
        *(u32 *)(p+ 3) = i;
        *(u8  *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
        *(u32 *)(p+ 8) = (u32)&hypercall;
        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
        *(u8  *)(p+14) = 0xc3;      /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
    *(u8  *)(p+ 1) = 0x9c;      /* pushf */
    *(u8  *)(p+ 2) = 0xfa;      /* cli */
    *(u8  *)(p+ 3) = 0xb8;      /* mov $<i>,%eax */
    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
    *(u8  *)(p+ 8) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
    *(u32 *)(p+ 9) = (u32)&hypercall;
    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
}

static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
{
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */

    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));

        *(u8  *)(p+ 0) = 0xb8;      /* mov $<i>,%eax */
        *(u32 *)(p+ 1) = i;
        *(u16 *)(p+ 5) = 0x82cd;    /* int $0x82 */
        *(u8  *)(p+ 7) = 0xc3;      /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
    *(u8  *)(p+ 1) = 0xb8;      /* mov $__HYPERVISOR_iret,%eax */
    *(u32 *)(p+ 2) = __HYPERVISOR_iret;
    *(u16 *)(p+ 6) = 0x82cd;    /* int $0x82 */
}

void hypercall_page_initialise(struct domain *d, void *hypercall_page)
{
    memset(hypercall_page, 0xCC, PAGE_SIZE);
    if ( is_hvm_domain(d) )
        hvm_hypercall_page_initialise(d, hypercall_page);
    else if ( supervisor_mode_kernel )
        hypercall_page_initialise_ring0_kernel(hypercall_page);
    else
        hypercall_page_initialise_ring1_kernel(hypercall_page);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */
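
The hypercall pages built above lay out one 32-byte transfer stub per hypercall number, so a guest reaches hypercall n by calling offset n*32 into wherever it mapped the page. As a rough illustration of the guest's side, here is a two-argument wrapper modeled on the classic XenoLinux-style macros; it is a sketch rather than code from this tree, and it assumes the guest links the page at a symbol named hypercall_page and has the public __HYPERVISOR_* numbers in scope.

/* Sketch of a guest-side wrapper; header path and symbol are assumptions. */
#include <xen/interface/xen.h>   /* or wherever __HYPERVISOR_* is defined */

#define STR_(x) #x
#define STR(x)  STR_(x)

/* Call the 32-byte stub for hypercall "name"; args go in %ebx, %ecx. */
#define _hypercall2(type, name, a1, a2)                                 \
({                                                                      \
    long __res, __ign1, __ign2;                                         \
    asm volatile (                                                      \
        "call hypercall_page + (" STR(__HYPERVISOR_##name) " * 32)"     \
        : "=a" (__res), "=b" (__ign1), "=c" (__ign2)                    \
        : "1" ((long)(a1)), "2" ((long)(a2))                            \
        : "memory" );                                                   \
    (type)__res;                                                        \
})

/* Example use: HYPERVISOR_sched_op(SCHEDOP_yield, NULL) yields the CPU. */
#define HYPERVISOR_sched_op(cmd, arg) _hypercall2(int, sched_op, cmd, arg)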