direct-io.hg

view xen/arch/x86/x86_32/traps.c @ 15412:acb7aa72fac7

i386: remove NMI deferral by instead making sure selector registers
are always stored/restored correctly despite the potential for an NMI
(and also MCE, with a subsequent patch) to kick in.

The idea is to always check values read from %ds and %es against
__HYPERVISOR_DS, and only store into the current frame (all normal
handlers) or the outermost one (NMI and MCE) if the value read is
different. That way, any NMI or MCE occurring during frame setup will
store selectors not saved so far on behalf of the interrupted handler,
with that interrupted handler either having managed to read the guest
selector (in which case it can store it regardless of whether NMI/MCE
kicked in between the read and the store) or finding __HYPERVISOR_DS
already in the register, in which case it'll know not to store (as the
nested handler would have done the store).
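
As a rough illustration of that store-side rule, here is a minimal C sketch; the real
logic is hand-written assembly in entry.S, and the frame layout and helper names below
are invented purely for illustration:

struct sel_frame {
    unsigned short ds, es;            /* saved guest selectors (illustrative) */
};

static inline unsigned short read_ds(void)
{
    unsigned short sel;
    asm volatile ( "movw %%ds, %0" : "=r" (sel) );
    return sel;
}

static void save_guest_ds(struct sel_frame *frame)
{
    unsigned short sel = read_ds();

    /*
     * A nested NMI/MCE that fired after the read above has already stored
     * the guest %ds into the outermost frame and loaded __HYPERVISOR_DS.
     * Finding __HYPERVISOR_DS here therefore means "do not store again";
     * any other value is the guest selector and can be stored regardless
     * of whether an NMI/MCE kicked in between the read and the store.
     */
    if ( sel != __HYPERVISOR_DS )
    {
        frame->ds = sel;
        asm volatile ( "movw %0, %%ds" :: "r" ((unsigned short)__HYPERVISOR_DS) );
    }
}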

For the restore portion this makes use of the fact that there's
exactly one such code sequence, and by moving the selector restore
part past all other restores (including all stack pointer adjustments)
the NMI/MCE handlers can safely detect whether any selector would have
been restored already (by range checking EIP) and move EIP back to the
beginning of the selector restore sequence without having to play with
the stack pointer itself or any other GPR.
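
The restore-side detection can be pictured with a similar hedged sketch;
restore_sel_start/restore_sel_end stand in for labels bracketing the single
selector-restore sequence in entry.S and are not the real symbol names:

extern char restore_sel_start[], restore_sel_end[];

static void fixup_selector_restore(struct cpu_user_regs *regs)
{
    unsigned long eip = regs->eip;

    /*
     * If the NMI/MCE interrupted the selector-restore sequence, some (but
     * possibly not all) selectors already hold guest values.  Rewinding
     * EIP to the start of the sequence makes the interrupted context
     * reload every selector again on return, without the handler touching
     * the stack pointer or any other general-purpose register.
     */
    if ( (eip >= (unsigned long)restore_sel_start) &&
         (eip < (unsigned long)restore_sel_end) )
        regs->eip = (unsigned long)restore_sel_start;
}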

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Thu Jun 21 12:13:06 2007 +0100 (2007-06-21)
parents 005dd6b1cf8e
children 56da8753ba8d
line source
#include <xen/config.h>
#include <xen/version.h>
#include <xen/domain_page.h>
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/console.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/symbols.h>
#include <xen/shutdown.h>
#include <xen/nmi.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>

#include <public/callback.h>

static void print_xen_info(void)
{
    char taint_str[TAINT_STRING_MAX_LEN];
    char debug = 'n', *arch = "x86_32";

#ifndef NDEBUG
    debug = 'y';
#endif

#ifdef CONFIG_X86_PAE
    arch = "x86_32p";
#endif

    printk("----[ Xen-%d.%d%s %s debug=%c %s ]----\n",
           xen_major_version(), xen_minor_version(), xen_extra_version(),
           arch, debug, print_tainted(taint_str));
}

void show_registers(struct cpu_user_regs *regs)
{
    struct cpu_user_regs fault_regs = *regs;
    unsigned long fault_crs[8];
    const char *context;

    if ( is_hvm_vcpu(current) && guest_mode(regs) )
    {
        context = "hvm";
        hvm_store_cpu_guest_regs(current, &fault_regs, fault_crs);
    }
    else
    {
        if ( !guest_mode(regs) )
        {
            context = "hypervisor";
            fault_regs.esp = (unsigned long)&regs->esp;
            fault_regs.ss = read_segment_register(ss);
            fault_regs.ds = read_segment_register(ds);
            fault_regs.es = read_segment_register(es);
            fault_regs.fs = read_segment_register(fs);
            fault_regs.gs = read_segment_register(gs);
            fault_crs[2] = read_cr2();
        }
        else
        {
            context = "guest";
            fault_crs[2] = current->vcpu_info->arch.cr2;
        }

        fault_crs[0] = read_cr0();
        fault_crs[3] = read_cr3();
        fault_crs[4] = read_cr4();
    }

    print_xen_info();
    printk("CPU: %d\nEIP: %04x:[<%08x>]",
           smp_processor_id(), fault_regs.cs, fault_regs.eip);
    if ( !guest_mode(regs) )
        print_symbol(" %s", fault_regs.eip);
    printk("\nEFLAGS: %08x CONTEXT: %s\n", fault_regs.eflags, context);
    printk("eax: %08x ebx: %08x ecx: %08x edx: %08x\n",
           fault_regs.eax, fault_regs.ebx, fault_regs.ecx, fault_regs.edx);
    printk("esi: %08x edi: %08x ebp: %08x esp: %08x\n",
           fault_regs.esi, fault_regs.edi, fault_regs.ebp, fault_regs.esp);
    printk("cr0: %08lx cr4: %08lx cr3: %08lx cr2: %08lx\n",
           fault_crs[0], fault_crs[4], fault_crs[3], fault_crs[2]);
    printk("ds: %04x es: %04x fs: %04x gs: %04x "
           "ss: %04x cs: %04x\n",
           fault_regs.ds, fault_regs.es, fault_regs.fs,
           fault_regs.gs, fault_regs.ss, fault_regs.cs);
}

void show_page_walk(unsigned long addr)
{
    unsigned long pfn, mfn, cr3 = read_cr3();
#ifdef CONFIG_X86_PAE
    l3_pgentry_t l3e, *l3t;
#endif
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;

    printk("Pagetable walk from %08lx:\n", addr);

    mfn = cr3 >> PAGE_SHIFT;

#ifdef CONFIG_X86_PAE
    l3t = map_domain_page(mfn);
    l3t += (cr3 & 0xFE0UL) >> 3;
    l3e = l3t[l3_table_offset(addr)];
    mfn = l3e_get_pfn(l3e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L3[0x%03lx] = %"PRIpte" %08lx\n",
           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
    unmap_domain_page(l3t);
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
        return;
#endif

    l2t = map_domain_page(mfn);
    l2e = l2t[l2_table_offset(addr)];
    mfn = l2e_get_pfn(l2e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L2[0x%03lx] = %"PRIpte" %08lx %s\n",
           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
           (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
    unmap_domain_page(l2t);
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
         (l2e_get_flags(l2e) & _PAGE_PSE) )
        return;

    l1t = map_domain_page(mfn);
    l1e = l1t[l1_table_offset(addr)];
    mfn = l1e_get_pfn(l1e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L1[0x%03lx] = %"PRIpte" %08lx\n",
           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
    unmap_domain_page(l1t);
}

#define DOUBLEFAULT_STACK_SIZE 2048
static struct tss_struct doublefault_tss;
static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];

asmlinkage void do_double_fault(void)
{
    struct tss_struct *tss = &doublefault_tss;
    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;

    watchdog_disable();

    console_force_unlock();

    /* Find information saved during fault and dump it to the console. */
    tss = &init_tss[cpu];
    printk("*** DOUBLE FAULT ***\n");
    print_xen_info();
    printk("CPU: %d\nEIP: %04x:[<%08x>]",
           cpu, tss->cs, tss->eip);
    print_symbol(" %s\n", tss->eip);
    printk("EFLAGS: %08x\n", tss->eflags);
    printk("CR3: %08x\n", tss->__cr3);
    printk("eax: %08x ebx: %08x ecx: %08x edx: %08x\n",
           tss->eax, tss->ebx, tss->ecx, tss->edx);
    printk("esi: %08x edi: %08x ebp: %08x esp: %08x\n",
           tss->esi, tss->edi, tss->ebp, tss->esp);
    printk("ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
           tss->ds, tss->es, tss->fs, tss->gs, tss->ss);
    show_stack_overflow(cpu, tss->esp);

    panic("DOUBLE FAULT -- system shutdown\n");
}

unsigned long do_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    u32 eflags;

    /* Check worst-case stack frame for overlap with Xen protected area. */
    if ( unlikely(!access_ok(regs->esp, 40)) )
        goto exit_and_crash;

    /* Pop and restore EAX (clobbered by hypercall). */
    if ( unlikely(__copy_from_user(&regs->eax, (void __user *)regs->esp, 4)) )
        goto exit_and_crash;
    regs->esp += 4;

    /* Pop and restore CS and EIP. */
    if ( unlikely(__copy_from_user(&regs->eip, (void __user *)regs->esp, 8)) )
        goto exit_and_crash;
    regs->esp += 8;

    /*
     * Pop, fix up and restore EFLAGS. We fix up in a local staging area
     * to avoid firing the BUG_ON(IOPL) check in arch_get_info_guest.
     */
    if ( unlikely(__copy_from_user(&eflags, (void __user *)regs->esp, 4)) )
        goto exit_and_crash;
    regs->esp += 4;
    regs->eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF;

    if ( vm86_mode(regs) )
    {
        /* Return to VM86 mode: pop and restore ESP,SS,ES,DS,FS and GS. */
        if ( __copy_from_user(&regs->esp, (void __user *)regs->esp, 24) )
            goto exit_and_crash;
    }
    else if ( unlikely(ring_0(regs)) )
    {
        goto exit_and_crash;
    }
    else if ( !ring_1(regs) )
    {
        /* Return to ring 2/3: pop and restore ESP and SS. */
        if ( __copy_from_user(&regs->esp, (void __user *)regs->esp, 8) )
            goto exit_and_crash;
    }

    /* No longer in NMI context. */
    current->nmi_masked = 0;

    /* Restore upcall mask from supplied EFLAGS.IF. */
    current->vcpu_info->evtchn_upcall_mask = !(eflags & X86_EFLAGS_IF);

    /*
     * The hypercall exit path will overwrite EAX with this return
     * value.
     */
    return regs->eax;

 exit_and_crash:
    gdprintk(XENLOG_ERR, "Fatal error\n");
    domain_crash(current->domain);
    return 0;
}

void __init percpu_traps_init(void)
{
    struct tss_struct *tss = &doublefault_tss;
    asmlinkage int hypercall(void);

    if ( smp_processor_id() != 0 )
        return;

    /* The hypercall entry vector is only accessible from ring 1. */
    _set_gate(idt_table+HYPERCALL_VECTOR, 14, 1, &hypercall);

    /*
     * Make a separate task for double faults. This will get us debug output if
     * we blow the kernel stack.
     */
    memset(tss, 0, sizeof(*tss));
    tss->ds = __HYPERVISOR_DS;
    tss->es = __HYPERVISOR_DS;
    tss->ss = __HYPERVISOR_DS;
    tss->esp = (unsigned long)&doublefault_stack[DOUBLEFAULT_STACK_SIZE];
    tss->__cr3 = __pa(idle_pg_table);
    tss->cs = __HYPERVISOR_CS;
    tss->eip = (unsigned long)do_double_fault;
    tss->eflags = 2;
    tss->bitmap = IOBMP_INVALID_OFFSET;
    _set_tssldt_desc(
        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
        (unsigned long)tss, 235, 9);

    set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
}

void init_int80_direct_trap(struct vcpu *v)
{
    struct trap_info *ti = &v->arch.guest_context.trap_ctxt[0x80];

    /*
     * We can't virtualise interrupt gates, as there's no way to get
     * the CPU to automatically clear the events_mask variable. Also we
     * must ensure that the CS is safe to poke into an interrupt gate.
     *
     * When running with supervisor_mode_kernel enabled a direct trap
     * to the guest OS cannot be used because the INT instruction will
     * switch to the Xen stack and we need to swap back to the guest
     * kernel stack before passing control to the system call entry point.
     */
    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(v->domain, ti->cs) ||
         supervisor_mode_kernel )
    {
        v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
        return;
    }

    v->arch.int80_desc.a = (ti->cs << 16) | (ti->address & 0xffff);
    v->arch.int80_desc.b =
        (ti->address & 0xffff0000) | 0x8f00 | ((TI_GET_DPL(ti) & 3) << 13);

    if ( v == current )
        set_int80_direct_trap(v);
}

#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
static void do_update_sysenter(void *info)
{
    xen_callback_t *address = info;

    wrmsr(MSR_IA32_SYSENTER_CS, address->cs, 0);
    wrmsr(MSR_IA32_SYSENTER_EIP, address->eip, 0);
}
#endif

static long register_guest_callback(struct callback_register *reg)
{
    long ret = 0;
    struct vcpu *v = current;

    fixup_guest_code_selector(v->domain, reg->address.cs);

    switch ( reg->type )
    {
    case CALLBACKTYPE_event:
        v->arch.guest_context.event_callback_cs = reg->address.cs;
        v->arch.guest_context.event_callback_eip = reg->address.eip;
        break;

    case CALLBACKTYPE_failsafe:
        v->arch.guest_context.failsafe_callback_cs = reg->address.cs;
        v->arch.guest_context.failsafe_callback_eip = reg->address.eip;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_failsafe_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_failsafe_disables_events,
                      &v->arch.guest_context.flags);
        break;

#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    case CALLBACKTYPE_sysenter:
        if ( !cpu_has_sep )
            ret = -EINVAL;
        else if ( on_each_cpu(do_update_sysenter, &reg->address, 1, 1) != 0 )
            ret = -EIO;
        break;
#endif

    case CALLBACKTYPE_nmi:
        ret = register_guest_nmi_callback(reg->address.eip);
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

static long unregister_guest_callback(struct callback_unregister *unreg)
{
    long ret;

    switch ( unreg->type )
    {
    case CALLBACKTYPE_event:
    case CALLBACKTYPE_failsafe:
#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
    case CALLBACKTYPE_sysenter:
#endif
        ret = -EINVAL;
        break;

    case CALLBACKTYPE_nmi:
        ret = unregister_guest_nmi_callback();
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_callback_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    long ret;

    switch ( cmd )
    {
    case CALLBACKOP_register:
    {
        struct callback_register reg;

        ret = -EFAULT;
        if ( copy_from_guest(&reg, arg, 1) )
            break;

        ret = register_guest_callback(&reg);
    }
    break;

    case CALLBACKOP_unregister:
    {
        struct callback_unregister unreg;

        ret = -EFAULT;
        if ( copy_from_guest(&unreg, arg, 1) )
            break;

        ret = unregister_guest_callback(&unreg);
    }
    break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_set_callbacks(unsigned long event_selector,
                      unsigned long event_address,
                      unsigned long failsafe_selector,
                      unsigned long failsafe_address)
{
    struct callback_register event = {
        .type = CALLBACKTYPE_event,
        .address = { event_selector, event_address },
    };
    struct callback_register failsafe = {
        .type = CALLBACKTYPE_failsafe,
        .address = { failsafe_selector, failsafe_address },
    };

    register_guest_callback(&event);
    register_guest_callback(&failsafe);

    return 0;
}

static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
{
    extern asmlinkage int hypercall(void);
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */

    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));

        *(u8  *)(p+ 0) = 0x9c;      /* pushf */
        *(u8  *)(p+ 1) = 0xfa;      /* cli */
        *(u8  *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
        *(u32 *)(p+ 3) = i;
        *(u8  *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
        *(u32 *)(p+ 8) = (u32)&hypercall;
        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
        *(u8  *)(p+14) = 0xc3;      /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
    *(u8  *)(p+ 1) = 0x9c;      /* pushf */
    *(u8  *)(p+ 2) = 0xfa;      /* cli */
    *(u8  *)(p+ 3) = 0xb8;      /* mov $<i>,%eax */
    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
    *(u8  *)(p+ 8) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
    *(u32 *)(p+ 9) = (u32)&hypercall;
    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
}

static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
{
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */

    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));
        *(u8  *)(p+ 0) = 0xb8;      /* mov $<i>,%eax */
        *(u32 *)(p+ 1) = i;
        *(u16 *)(p+ 5) = 0x82cd;    /* int $0x82 */
        *(u8  *)(p+ 7) = 0xc3;      /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
    *(u8  *)(p+ 1) = 0xb8;      /* mov $__HYPERVISOR_iret,%eax */
    *(u32 *)(p+ 2) = __HYPERVISOR_iret;
    *(u16 *)(p+ 6) = 0x82cd;    /* int $0x82 */
}

void hypercall_page_initialise(struct domain *d, void *hypercall_page)
{
    memset(hypercall_page, 0xCC, PAGE_SIZE);
    if ( is_hvm_domain(d) )
        hvm_hypercall_page_initialise(d, hypercall_page);
    else if ( supervisor_mode_kernel )
        hypercall_page_initialise_ring0_kernel(hypercall_page);
    else
        hypercall_page_initialise_ring1_kernel(hypercall_page);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */