direct-io.hg: xen/arch/x86/x86_64/traps.c @ 15454:83cbda5c1e1b

x86-64: bump STACK_SIZE to 32 so that trampoline and IST stacks fit
without undue squeezing.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author:  kfraser@localhost.localdomain
date:    Tue Jul 03 11:41:25 2007 +0100
parents: 3cf5052ba5e5
children:

#include <xen/config.h>
#include <xen/version.h>
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/symbols.h>
#include <xen/console.h>
#include <xen/shutdown.h>
#include <xen/nmi.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/shared.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <public/callback.h>

asmlinkage void syscall_enter(void);
asmlinkage void compat_hypercall(void);
asmlinkage void int80_direct_trap(void);

static void print_xen_info(void)
{
    char taint_str[TAINT_STRING_MAX_LEN];
    char debug = 'n';

#ifndef NDEBUG
    debug = 'y';
#endif

    printk("----[ Xen-%d.%d%s x86_64 debug=%c %s ]----\n",
           xen_major_version(), xen_minor_version(), xen_extra_version(),
           debug, print_tainted(taint_str));
}

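/*
 * For an HVM vcpu the guest register state lives in the VMCS/VMCB rather
 * than on the hypervisor stack, so it is fetched via
 * hvm_store_cpu_guest_regs(); for PV and hypervisor contexts the control
 * and segment registers are read directly below.
 */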
void show_registers(struct cpu_user_regs *regs)
{
    struct cpu_user_regs fault_regs = *regs;
    unsigned long fault_crs[8];
    const char *context;

    if ( is_hvm_vcpu(current) && guest_mode(regs) )
    {
        context = "hvm";
        hvm_store_cpu_guest_regs(current, &fault_regs, fault_crs);
    }
    else
    {
        if ( guest_mode(regs) )
        {
            context = "guest";
            fault_crs[2] = arch_get_cr2(current);
        }
        else
        {
            context = "hypervisor";
            fault_crs[2] = read_cr2();
        }

        fault_crs[0] = read_cr0();
        fault_crs[3] = read_cr3();
        fault_crs[4] = read_cr4();
        fault_regs.ds = read_segment_register(ds);
        fault_regs.es = read_segment_register(es);
        fault_regs.fs = read_segment_register(fs);
        fault_regs.gs = read_segment_register(gs);
    }

    print_xen_info();
    printk("CPU: %d\nRIP: %04x:[<%016lx>]",
           smp_processor_id(), fault_regs.cs, fault_regs.rip);
    if ( !guest_mode(regs) )
        print_symbol(" %s", fault_regs.rip);
    printk("\nRFLAGS: %016lx CONTEXT: %s\n", fault_regs.rflags, context);
    printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
           fault_regs.rax, fault_regs.rbx, fault_regs.rcx);
    printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
           fault_regs.rdx, fault_regs.rsi, fault_regs.rdi);
    printk("rbp: %016lx rsp: %016lx r8: %016lx\n",
           fault_regs.rbp, fault_regs.rsp, fault_regs.r8);
    printk("r9: %016lx r10: %016lx r11: %016lx\n",
           fault_regs.r9, fault_regs.r10, fault_regs.r11);
    printk("r12: %016lx r13: %016lx r14: %016lx\n",
           fault_regs.r12, fault_regs.r13, fault_regs.r14);
    printk("r15: %016lx cr0: %016lx cr4: %016lx\n",
           fault_regs.r15, fault_crs[0], fault_crs[4]);
    printk("cr3: %016lx cr2: %016lx\n", fault_crs[3], fault_crs[2]);
    printk("ds: %04x es: %04x fs: %04x gs: %04x "
           "ss: %04x cs: %04x\n",
           fault_regs.ds, fault_regs.es, fault_regs.fs,
           fault_regs.gs, fault_regs.ss, fault_regs.cs);
}

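/*
 * Each paging level resolves 9 bits of the virtual address on x86-64:
 * L4 bits 47-39, L3 bits 38-30, L2 bits 29-21, L1 bits 20-12.  The walk
 * below follows machine frames (mfns) from %cr3 and also prints the guest
 * pfn backing each entry via get_gpfn_from_mfn().
 */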
void show_page_walk(unsigned long addr)
{
    unsigned long pfn, mfn = read_cr3() >> PAGE_SHIFT;
    l4_pgentry_t l4e, *l4t;
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;

    printk("Pagetable walk from %016lx:\n", addr);

    l4t = mfn_to_virt(mfn);
    l4e = l4t[l4_table_offset(addr)];
    mfn = l4e_get_pfn(l4e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L4[0x%03lx] = %"PRIpte" %016lx\n",
           l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
        return;

    l3t = mfn_to_virt(mfn);
    l3e = l3t[l3_table_offset(addr)];
    mfn = l3e_get_pfn(l3e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L3[0x%03lx] = %"PRIpte" %016lx\n",
           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
        return;

    l2t = mfn_to_virt(mfn);
    l2e = l2t[l2_table_offset(addr)];
    mfn = l2e_get_pfn(l2e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L2[0x%03lx] = %"PRIpte" %016lx %s\n",
           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
           (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
         (l2e_get_flags(l2e) & _PAGE_PSE) )
        return;

    l1t = mfn_to_virt(mfn);
    l1e = l1t[l1_table_offset(addr)];
    mfn = l1e_get_pfn(l1e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L1[0x%03lx] = %"PRIpte" %016lx\n",
           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
}

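/*
 * The double-fault handler runs on its own IST stack (IST_DF, installed in
 * percpu_traps_init() below), so it still has a usable stack when the fault
 * was caused by overflowing the primary stack.  It recovers its CPU number
 * from the task register: in this tree's GDT layout each CPU's TSS
 * descriptor (16 bytes on x86-64) sits four slots apart starting at
 * __FIRST_TSS_ENTRY, hence "((tr >> 3) - __FIRST_TSS_ENTRY) >> 2".
 */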
asmlinkage void double_fault(void);
asmlinkage void do_double_fault(struct cpu_user_regs *regs)
{
    unsigned int cpu, tr;

    asm ( "str %0" : "=r" (tr) );
    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;

    watchdog_disable();

    console_force_unlock();

    /* Find information saved during fault and dump it to the console. */
    printk("*** DOUBLE FAULT ***\n");
    print_xen_info();
    printk("CPU: %d\nRIP: %04x:[<%016lx>]",
           cpu, regs->cs, regs->rip);
    print_symbol(" %s", regs->rip);
    printk("\nRFLAGS: %016lx\n", regs->rflags);
    printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
           regs->rax, regs->rbx, regs->rcx);
    printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
           regs->rdx, regs->rsi, regs->rdi);
    printk("rbp: %016lx rsp: %016lx r8: %016lx\n",
           regs->rbp, regs->rsp, regs->r8);
    printk("r9: %016lx r10: %016lx r11: %016lx\n",
           regs->r9, regs->r10, regs->r11);
    printk("r12: %016lx r13: %016lx r14: %016lx\n",
           regs->r12, regs->r13, regs->r14);
    printk("r15: %016lx cs: %016lx ss: %016lx\n",
           regs->r15, (long)regs->cs, (long)regs->ss);
    show_stack_overflow(cpu, regs->rsp);

    panic("DOUBLE FAULT -- system shutdown\n");
}

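/*
 * A 64-bit PV guest runs both its kernel and its userspace in ring 3, so
 * Xen distinguishes the two halves with TF_kernel_mode and keeps separate
 * kernel and user page tables per vcpu.  Switching halves flips the flag,
 * swaps the GS bases (swapgs), and loads the other half's %cr3.
 */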
void toggle_guest_mode(struct vcpu *v)
{
    if ( is_pv_32bit_vcpu(v) )
        return;
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_cr3(v);
#ifdef USER_MAPPINGS_ARE_GLOBAL
    /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
    __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
#else
    write_ptbase(v);
#endif
}

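/*
 * HYPERVISOR_iret for 64-bit PV guests: the guest pushes a struct
 * iret_context (its saved rax/r11/rcx, a flags word, and the usual
 * rip/cs/rflags/rsp/ss frame) onto its stack and enters via the special
 * iret stub in the hypercall page (see
 * hypercall_page_initialise_ring3_kernel() below).  The frame is rebuilt
 * here with cs/ss forced to guest privilege, so a guest can never iret
 * its way onto a hypervisor selector.
 */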
unsigned long do_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct iret_context iret_saved;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
                                 sizeof(iret_saved))) )
    {
        gdprintk(XENLOG_ERR, "Fault while reading IRET context from "
                 "guest stack\n");
        goto exit_and_crash;
    }

    /* Returning to user mode? */
    if ( (iret_saved.cs & 3) == 3 )
    {
        if ( unlikely(pagetable_is_null(v->arch.guest_table_user)) )
        {
            gdprintk(XENLOG_ERR, "Guest switching to user mode with no "
                     "user page tables\n");
            goto exit_and_crash;
        }
        toggle_guest_mode(v);
    }

    regs->rip = iret_saved.rip;
    regs->cs = iret_saved.cs | 3; /* force guest privilege */
    regs->rflags = (iret_saved.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
    regs->rsp = iret_saved.rsp;
    regs->ss = iret_saved.ss | 3; /* force guest privilege */

    if ( !(iret_saved.flags & VGCF_in_syscall) )
    {
        regs->entry_vector = 0;
        regs->r11 = iret_saved.r11;
        regs->rcx = iret_saved.rcx;
    }

    /* No longer in NMI context. */
    current->nmi_masked = 0;

    /* Restore upcall mask from supplied EFLAGS.IF. */
    vcpu_info(current, evtchn_upcall_mask) = !(iret_saved.rflags & EF_IE);

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return iret_saved.rax;

 exit_and_crash:
    gdprintk(XENLOG_ERR, "Fatal error\n");
    domain_crash(v->domain);
    return 0;
}

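/*
 * Note on the displacement arithmetic below: a RIP-relative disp32 is
 * measured from the end of the instruction that encodes it, so for an
 * instruction ending at &stack[7] a displacement of
 * (stack_bottom - &stack[7]) - 16 addresses the 8-byte save slot kept
 * 16 bytes below the bottom of the primary stack.  Both the movq and the
 * leaq target that same slot.
 */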
static int write_stack_trampoline(
    char *stack, char *stack_bottom, uint16_t cs_seg)
{
    /* movq %rsp, saversp(%rip) */
    stack[0] = 0x48;
    stack[1] = 0x89;
    stack[2] = 0x25;
    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;

    /* leaq saversp(%rip), %rsp */
    stack[7] = 0x48;
    stack[8] = 0x8d;
    stack[9] = 0x25;
    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;

    /* pushq %r11 */
    stack[14] = 0x41;
    stack[15] = 0x53;

    /* pushq $<cs_seg> */
    stack[16] = 0x68;
    *(u32 *)&stack[17] = cs_seg;

    /* movq $syscall_enter,%r11 */
    stack[21] = 0x49;
    stack[22] = 0xbb;
    *(void **)&stack[23] = (void *)syscall_enter;

    /* jmpq *%r11 */
    stack[31] = 0x41;
    stack[32] = 0xff;
    stack[33] = 0xe3;

    return 34;
}

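/*
 * Per-CPU stack layout assumed by the code below (low to high), as implied
 * by the BUILD_BUG_ON and the ist[] assignments:
 *   pages 0 .. IST_MAX-1 : dedicated 4kB IST stacks (MCE, #DF, NMI);
 *   page IST_MAX         : SYSCALL trampolines (MSR_LSTAR/MSR_CSTAR targets);
 *   then a guard page, with the primary stack growing down from
 *   get_stack_bottom() at the top.
 */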
void __init percpu_traps_init(void)
{
    char *stack_bottom, *stack;
    int cpu = smp_processor_id();

    if ( cpu == 0 )
    {
        /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */
        set_intr_gate(TRAP_double_fault, &double_fault);
        idt_table[TRAP_double_fault].a |= IST_DF << 32;
        idt_table[TRAP_nmi].a |= IST_NMI << 32;
        idt_table[TRAP_machine_check].a |= IST_MCE << 32;

        /*
         * The 32-on-64 hypercall entry vector is only accessible from ring 1.
         * Also note that this is a trap gate, not an interrupt gate.
         */
        _set_gate(idt_table+HYPERCALL_VECTOR, 15, 1, &compat_hypercall);

        /* Fast trap for int80 (faster than taking the #GP-fixup path). */
        _set_gate(idt_table+0x80, 15, 3, &int80_direct_trap);
    }

    stack_bottom = (char *)get_stack_bottom();
    stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));

    /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
    BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);

    /* Machine Check handler has its own per-CPU 4kB stack. */
    init_tss[cpu].ist[IST_MCE] = (unsigned long)&stack[IST_MCE * PAGE_SIZE];

    /* Double-fault handler has its own per-CPU 4kB stack. */
    init_tss[cpu].ist[IST_DF] = (unsigned long)&stack[IST_DF * PAGE_SIZE];

    /* NMI handler has its own per-CPU 4kB stack. */
    init_tss[cpu].ist[IST_NMI] = (unsigned long)&stack[IST_NMI * PAGE_SIZE];

    /* Trampoline for SYSCALL entry from long mode. */
    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
    wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32));
    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);

    /* Trampoline for SYSCALL entry from compatibility mode. */
    wrmsr(MSR_CSTAR, (unsigned long)stack, ((unsigned long)stack>>32));
    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS32);

    /* Common SYSCALL parameters. */
    /* STAR[47:32] = CS loaded on SYSCALL; STAR[63:48] = CS base for SYSRET. */
    wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
    /* SFMASK: RFLAGS bits cleared on SYSCALL entry (notably IF and TF). */
    wrmsr(MSR_SYSCALL_MASK, EF_VM|EF_RF|EF_NT|EF_DF|EF_IE|EF_TF, 0U);
}

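/*
 * The int80 direct trap: rather than letting a guest's int $0x80 fault
 * with a #GP and bouncing through the fixup path, Xen pre-computes a trap
 * bounce from the handler the guest registered for vector 0x80 and lets
 * int80_direct_trap (installed above) use it directly.
 */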
void init_int80_direct_trap(struct vcpu *v)
{
    struct trap_info *ti = &v->arch.guest_context.trap_ctxt[0x80];
    struct trap_bounce *tb = &v->arch.int80_bounce;

    tb->flags = TBF_EXCEPTION;
    tb->cs = ti->cs;
    tb->eip = ti->address;

    if ( null_trap_bounce(v, tb) )
        tb->flags = 0;
}

static long register_guest_callback(struct callback_register *reg)
{
    long ret = 0;
    struct vcpu *v = current;

    switch ( reg->type )
    {
    case CALLBACKTYPE_event:
        v->arch.guest_context.event_callback_eip = reg->address;
        break;

    case CALLBACKTYPE_failsafe:
        v->arch.guest_context.failsafe_callback_eip = reg->address;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_failsafe_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_failsafe_disables_events,
                      &v->arch.guest_context.flags);
        break;

    case CALLBACKTYPE_syscall:
        v->arch.guest_context.syscall_callback_eip = reg->address;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_syscall_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_syscall_disables_events,
                      &v->arch.guest_context.flags);
        break;

    case CALLBACKTYPE_nmi:
        ret = register_guest_nmi_callback(reg->address);
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

static long unregister_guest_callback(struct callback_unregister *unreg)
{
    long ret;

    switch ( unreg->type )
    {
    case CALLBACKTYPE_event:
    case CALLBACKTYPE_failsafe:
    case CALLBACKTYPE_syscall:
        ret = -EINVAL;
        break;

    case CALLBACKTYPE_nmi:
        ret = unregister_guest_nmi_callback();
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_callback_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    long ret;

    switch ( cmd )
    {
    case CALLBACKOP_register:
    {
        struct callback_register reg;

        ret = -EFAULT;
        if ( copy_from_guest(&reg, arg, 1) )
            break;

        ret = register_guest_callback(&reg);
    }
    break;

    case CALLBACKOP_unregister:
    {
        struct callback_unregister unreg;

        ret = -EFAULT;
        if ( copy_from_guest(&unreg, arg, 1) )
            break;

        ret = unregister_guest_callback(&unreg);
    }
    break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

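/*
 * Guest-side usage (a sketch; hypervisor_event_entry is a hypothetical
 * guest symbol, the types and constants are from public/callback.h):
 *
 *     struct callback_register cb = {
 *         .type    = CALLBACKTYPE_event,
 *         .address = (unsigned long)hypervisor_event_entry,
 *     };
 *     HYPERVISOR_callback_op(CALLBACKOP_register, &cb);
 */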
long do_set_callbacks(unsigned long event_address,
                      unsigned long failsafe_address,
                      unsigned long syscall_address)
{
    struct callback_register event = {
        .type = CALLBACKTYPE_event,
        .address = event_address,
    };
    struct callback_register failsafe = {
        .type = CALLBACKTYPE_failsafe,
        .address = failsafe_address,
    };
    struct callback_register syscall = {
        .type = CALLBACKTYPE_syscall,
        .address = syscall_address,
    };

    register_guest_callback(&event);
    register_guest_callback(&failsafe);
    register_guest_callback(&syscall);

    return 0;
}

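/*
 * Each hypercall stub below occupies 32 bytes; a 64-bit PV guest invokes
 * hypercall <n> by calling hypercall_page + n*32.  %rcx and %r11 are saved
 * around the syscall instruction because SYSCALL/SYSRET clobber them (they
 * carry the return %rip and %rflags).
 */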
static void hypercall_page_initialise_ring3_kernel(void *hypercall_page)
{
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */
    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));
        *(u8  *)(p+ 0) = 0x51;    /* push %rcx */
        *(u16 *)(p+ 1) = 0x5341;  /* push %r11 */
        *(u8  *)(p+ 3) = 0xb8;    /* mov  $<i>,%eax */
        *(u32 *)(p+ 4) = i;
        *(u16 *)(p+ 8) = 0x050f;  /* syscall */
        *(u16 *)(p+10) = 0x5b41;  /* pop  %r11 */
        *(u8  *)(p+12) = 0x59;    /* pop  %rcx */
        *(u8  *)(p+13) = 0xc3;    /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8  *)(p+ 0) = 0x51;    /* push %rcx */
    *(u16 *)(p+ 1) = 0x5341;  /* push %r11 */
    *(u8  *)(p+ 3) = 0x50;    /* push %rax */
    *(u8  *)(p+ 4) = 0xb8;    /* mov  $__HYPERVISOR_iret,%eax */
    *(u32 *)(p+ 5) = __HYPERVISOR_iret;
    *(u16 *)(p+ 9) = 0x050f;  /* syscall */
}

#include "compat/traps.c"

void hypercall_page_initialise(struct domain *d, void *hypercall_page)
{
    memset(hypercall_page, 0xCC, PAGE_SIZE);
    if ( is_hvm_domain(d) )
        hvm_hypercall_page_initialise(d, hypercall_page);
    else if ( !is_pv_32bit_domain(d) )
        hypercall_page_initialise_ring3_kernel(hypercall_page);
    else
        hypercall_page_initialise_ring1_kernel(hypercall_page);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */