direct-io.hg: xen/arch/x86/x86_64/traps.c @ 14445:522a1cd17b6d

[XEN] Implement faster int 0x80 handling for compat mode guests.

Using the GPF handler to spot the software interrupt and pass it back
to the guest increases the base syscall time by a factor of 2.7
compared with 32-on-32 using a direct trap to ring 1 (0.3270 -> 0.8680
microseconds, measured with lmbench lat_syscall).

Since gates in the 64-bit IDT can only point at 64-bit code segments,
we cannot trap directly to compat-mode ring 1. However, implementing a
dedicated 64-bit ring-0 trap handler allows us to avoid much of the
GPF handler overhead, reducing the slowdown to a factor of 1.7
(0.3270 -> 0.5497 microseconds).

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author Ian Campbell <ian.campbell@xensource.com>
date Tue Mar 20 14:33:15 2007 +0000 (2007-03-20)
parents 6daa91dc9247
children f830c5719e74
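
For context: lat_syscall times a trivial system call in a tight loop and
reports the per-call mean. A minimal guest-side sketch of such a measurement
(hypothetical code, not part of this changeset; assumes a 32-bit Linux guest,
where __NR_getpid is 20):

    /* time_int80.c: crude int 0x80 latency measurement, in the spirit of
     * lmbench lat_syscall. Build with: gcc -O2 time_int80.c -o time_int80 */
    #include <stdio.h>
    #include <time.h>

    #define ITERS 1000000

    int main(void)
    {
        struct timespec t0, t1;
        long ret, nr = 20;                /* __NR_getpid on 32-bit Linux */
        int i;

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for ( i = 0; i < ITERS; i++ )     /* each iteration: one int 0x80 */
            asm volatile ( "int $0x80" : "=a" (ret) : "0" (nr) : "memory" );
        clock_gettime(CLOCK_MONOTONIC, &t1);

        printf("%.4f us/syscall\n",
               ((t1.tv_sec - t0.tv_sec) * 1e9 +
                (t1.tv_nsec - t0.tv_nsec)) / ITERS / 1e3);
        return 0;
    }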
#include <xen/config.h>
#include <xen/version.h>
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/symbols.h>
#include <xen/console.h>
#include <xen/sched.h>
#include <xen/shutdown.h>
#include <xen/nmi.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/shared.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>

#include <public/callback.h>

static void print_xen_info(void)
{
    char taint_str[TAINT_STRING_MAX_LEN];
    char debug = 'n';

#ifndef NDEBUG
    debug = 'y';
#endif

    printk("----[ Xen-%d.%d%s  x86_64  debug=%c  %s ]----\n",
           xen_major_version(), xen_minor_version(), xen_extra_version(),
           debug, print_tainted(taint_str));
}
void show_registers(struct cpu_user_regs *regs)
{
    struct cpu_user_regs fault_regs = *regs;
    unsigned long fault_crs[8];
    const char *context;

    if ( is_hvm_vcpu(current) && guest_mode(regs) )
    {
        context = "hvm";
        hvm_store_cpu_guest_regs(current, &fault_regs, fault_crs);
    }
    else
    {
        if ( guest_mode(regs) )
        {
            context = "guest";
            fault_crs[2] = arch_get_cr2(current);
        }
        else
        {
            context = "hypervisor";
            fault_crs[2] = read_cr2();
        }

        fault_crs[0] = read_cr0();
        fault_crs[3] = read_cr3();
        fault_crs[4] = read_cr4();
        fault_regs.ds = read_segment_register(ds);
        fault_regs.es = read_segment_register(es);
        fault_regs.fs = read_segment_register(fs);
        fault_regs.gs = read_segment_register(gs);
    }

    print_xen_info();
    printk("CPU:    %d\nRIP:    %04x:[<%016lx>]",
           smp_processor_id(), fault_regs.cs, fault_regs.rip);
    if ( !guest_mode(regs) )
        print_symbol(" %s", fault_regs.rip);
    printk("\nRFLAGS: %016lx   CONTEXT: %s\n", fault_regs.rflags, context);
    printk("rax: %016lx   rbx: %016lx   rcx: %016lx\n",
           fault_regs.rax, fault_regs.rbx, fault_regs.rcx);
    printk("rdx: %016lx   rsi: %016lx   rdi: %016lx\n",
           fault_regs.rdx, fault_regs.rsi, fault_regs.rdi);
    printk("rbp: %016lx   rsp: %016lx   r8:  %016lx\n",
           fault_regs.rbp, fault_regs.rsp, fault_regs.r8);
    printk("r9:  %016lx   r10: %016lx   r11: %016lx\n",
           fault_regs.r9, fault_regs.r10, fault_regs.r11);
    printk("r12: %016lx   r13: %016lx   r14: %016lx\n",
           fault_regs.r12, fault_regs.r13, fault_regs.r14);
    printk("r15: %016lx   cr0: %016lx   cr4: %016lx\n",
           fault_regs.r15, fault_crs[0], fault_crs[4]);
    printk("cr3: %016lx   cr2: %016lx\n", fault_crs[3], fault_crs[2]);
    printk("ds: %04x   es: %04x   fs: %04x   gs: %04x   "
           "ss: %04x   cs: %04x\n",
           fault_regs.ds, fault_regs.es, fault_regs.fs,
           fault_regs.gs, fault_regs.ss, fault_regs.cs);
}
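
/*
 * A 48-bit virtual address decomposes into four 9-bit table indices plus
 * a 12-bit page offset, one index per paging level:
 *
 *   L4 = bits 47-39, L3 = bits 38-30, L2 = bits 29-21, L1 = bits 20-12.
 *
 * E.g. (illustrative only) 0xffff8300bfefd000 walks L4[0x106] ->
 * L3[0x002] -> L2[0x1ff] -> L1[0x0fd]. The walk below stops at the first
 * non-present entry, or at a 2MB superpage (_PAGE_PSE) in L2.
 */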
void show_page_walk(unsigned long addr)
{
    unsigned long pfn, mfn = read_cr3() >> PAGE_SHIFT;
    l4_pgentry_t l4e, *l4t;
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;

    printk("Pagetable walk from %016lx:\n", addr);

    l4t = mfn_to_virt(mfn);
    l4e = l4t[l4_table_offset(addr)];
    mfn = l4e_get_pfn(l4e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L4[0x%03lx] = %"PRIpte" %016lx\n",
           l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
        return;

    l3t = mfn_to_virt(mfn);
    l3e = l3t[l3_table_offset(addr)];
    mfn = l3e_get_pfn(l3e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L3[0x%03lx] = %"PRIpte" %016lx\n",
           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
        return;

    l2t = mfn_to_virt(mfn);
    l2e = l2t[l2_table_offset(addr)];
    mfn = l2e_get_pfn(l2e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L2[0x%03lx] = %"PRIpte" %016lx %s\n",
           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
           (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
         (l2e_get_flags(l2e) & _PAGE_PSE) )
        return;

    l1t = mfn_to_virt(mfn);
    l1e = l1t[l1_table_offset(addr)];
    mfn = l1e_get_pfn(l1e);
    pfn = get_gpfn_from_mfn(mfn);
    printk(" L1[0x%03lx] = %"PRIpte" %016lx\n",
           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
}
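
/*
 * Double faults arrive on a dedicated IST stack (set up in
 * percpu_traps_init() below), so the CPU number cannot be derived from
 * %rsp as usual. Instead "str" reads the task register, and the TSS
 * selector's descriptor index (tr >> 3) is divided by the four GDT slots
 * each per-CPU TSS/LDT pair occupies (descriptors are 16 bytes in long
 * mode).
 */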
asmlinkage void double_fault(void);
asmlinkage void do_double_fault(struct cpu_user_regs *regs)
{
    unsigned int cpu, tr;

    asm ( "str %0" : "=r" (tr) );
    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;

    watchdog_disable();

    console_force_unlock();

    /* Find information saved during fault and dump it to the console. */
    printk("*** DOUBLE FAULT ***\n");
    print_xen_info();
    printk("CPU:    %d\nRIP:    %04x:[<%016lx>]",
           cpu, regs->cs, regs->rip);
    print_symbol(" %s", regs->rip);
    printk("\nRFLAGS: %016lx\n", regs->rflags);
    printk("rax: %016lx   rbx: %016lx   rcx: %016lx\n",
           regs->rax, regs->rbx, regs->rcx);
    printk("rdx: %016lx   rsi: %016lx   rdi: %016lx\n",
           regs->rdx, regs->rsi, regs->rdi);
    printk("rbp: %016lx   rsp: %016lx   r8:  %016lx\n",
           regs->rbp, regs->rsp, regs->r8);
    printk("r9:  %016lx   r10: %016lx   r11: %016lx\n",
           regs->r9, regs->r10, regs->r11);
    printk("r12: %016lx   r13: %016lx   r14: %016lx\n",
           regs->r12, regs->r13, regs->r14);
    printk("r15: %016lx\n", regs->r15);
    show_stack_overflow(regs->rsp);

    panic("DOUBLE FAULT -- system shutdown\n");
}
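
/*
 * A 64-bit PV guest runs its kernel and userspace both in ring 3, with
 * separate page tables for each; switching between them means flipping
 * TF_kernel_mode, swapping the GS base via SWAPGS, and loading the other
 * context's %cr3. Compat-mode guests keep the 32-bit ring-1 model and
 * never toggle.
 */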
void toggle_guest_mode(struct vcpu *v)
{
    if ( IS_COMPAT(v->domain) )
        return;
    v->arch.flags ^= TF_kernel_mode;
    __asm__ __volatile__ ( "swapgs" );
    update_cr3(v);
#ifdef USER_MAPPINGS_ARE_GLOBAL
    /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
    __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
#else
    write_ptbase(v);
#endif
}
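
/*
 * HYPERVISOR_iret: a PV guest cannot execute a real IRET to return from
 * an interrupt or system call, so it pushes a struct iret_context on its
 * stack and invokes this hypercall instead. Xen validates the frame,
 * forces guest privilege in %cs/%ss, and folds EFLAGS.IF into the
 * event-channel upcall mask.
 */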
unsigned long do_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct iret_context iret_saved;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
                                 sizeof(iret_saved))) )
    {
        gdprintk(XENLOG_ERR, "Fault while reading IRET context from "
                 "guest stack\n");
        goto exit_and_crash;
    }

    /* Returning to user mode? */
    if ( (iret_saved.cs & 3) == 3 )
    {
        if ( unlikely(pagetable_is_null(v->arch.guest_table_user)) )
        {
            gdprintk(XENLOG_ERR, "Guest switching to user mode with no "
                     "user page tables\n");
            goto exit_and_crash;
        }
        toggle_guest_mode(v);
    }

    regs->rip    = iret_saved.rip;
    regs->cs     = iret_saved.cs | 3; /* force guest privilege */
    regs->rflags = (iret_saved.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
    regs->rsp    = iret_saved.rsp;
    regs->ss     = iret_saved.ss | 3; /* force guest privilege */

    if ( !(iret_saved.flags & VGCF_in_syscall) )
    {
        regs->entry_vector = 0;
        regs->r11 = iret_saved.r11;
        regs->rcx = iret_saved.rcx;
    }

    /* No longer in NMI context. */
    clear_bit(_VCPUF_nmi_masked, &current->vcpu_flags);

    /* Restore upcall mask from supplied EFLAGS.IF. */
    vcpu_info(current, evtchn_upcall_mask) = !(iret_saved.rflags & EF_IE);

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return iret_saved.rax;

 exit_and_crash:
    gdprintk(XENLOG_ERR, "Fatal error\n");
    domain_crash(v->domain);
    return 0;
}
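
/*
 * SYSCALL does not switch stacks, so each CPU gets a small trampoline,
 * written just above its IST stacks below, that saves the guest %rsp,
 * switches to the per-CPU hypervisor stack (located via RIP-relative
 * addressing, which is why the code must be per-CPU), pushes a selector
 * identifying the entry mode (64-bit vs compat), and jumps to the common
 * syscall_enter path. MSR_LSTAR/MSR_CSTAR point at these trampolines,
 * and the int 0x80 gate installed below is the direct ring-0 trap
 * described in the changeset description above.
 */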
asmlinkage void syscall_enter(void);
asmlinkage void compat_hypercall(void);
asmlinkage void int80_direct_trap(void);
void __init percpu_traps_init(void)
{
    char *stack_bottom, *stack;
    int cpu = smp_processor_id();

    if ( cpu == 0 )
    {
        /* Specify dedicated interrupt stacks for NMIs and double faults. */
        set_intr_gate(TRAP_double_fault, &double_fault);
        idt_table[TRAP_double_fault].a |= 1UL << 32; /* IST1 */
        idt_table[TRAP_nmi].a          |= 2UL << 32; /* IST2 */

#ifdef CONFIG_COMPAT
        /* The hypercall entry vector is only accessible from ring 1. */
        _set_gate(idt_table+HYPERCALL_VECTOR, 15, 1, &compat_hypercall);
        _set_gate(idt_table+0x80, 15, 3, &int80_direct_trap);
#endif
    }

    stack_bottom = (char *)get_stack_bottom();
    stack        = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));

    /* Double-fault handler has its own per-CPU 1kB stack. */
    init_tss[cpu].ist[0] = (unsigned long)&stack[1024];

    /* NMI handler has its own per-CPU 1kB stack. */
    init_tss[cpu].ist[1] = (unsigned long)&stack[2048];

    /*
     * Trampoline for SYSCALL entry from long mode.
     */

    /* Skip the NMI and DF stacks. */
    stack = &stack[2048];
    wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32));

    /* movq %rsp, saversp(%rip) */
    stack[0] = 0x48;
    stack[1] = 0x89;
    stack[2] = 0x25;
    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;

    /* leaq saversp(%rip), %rsp */
    stack[7] = 0x48;
    stack[8] = 0x8d;
    stack[9] = 0x25;
    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;

    /* pushq %r11 */
    stack[14] = 0x41;
    stack[15] = 0x53;

    /* pushq $FLAT_KERNEL_CS64 */
    stack[16] = 0x68;
    *(u32 *)&stack[17] = FLAT_KERNEL_CS64;

    /* jmp syscall_enter */
    stack[21] = 0xe9;
    *(u32 *)&stack[22] = (char *)syscall_enter - &stack[26];

    /*
     * Trampoline for SYSCALL entry from compatibility mode.
     */

    /* Skip the long-mode entry trampoline. */
    stack = &stack[26];
    wrmsr(MSR_CSTAR, (unsigned long)stack, ((unsigned long)stack>>32));

    /* movq %rsp, saversp(%rip) */
    stack[0] = 0x48;
    stack[1] = 0x89;
    stack[2] = 0x25;
    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;

    /* leaq saversp(%rip), %rsp */
    stack[7] = 0x48;
    stack[8] = 0x8d;
    stack[9] = 0x25;
    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;

    /* pushq %r11 */
    stack[14] = 0x41;
    stack[15] = 0x53;

    /* pushq $FLAT_KERNEL_CS32 */
    stack[16] = 0x68;
    *(u32 *)&stack[17] = FLAT_KERNEL_CS32;

    /* jmp syscall_enter */
    stack[21] = 0xe9;
    *(u32 *)&stack[22] = (char *)syscall_enter - &stack[26];

    /*
     * Common SYSCALL parameters.
     */

    wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
    wrmsr(MSR_SYSCALL_MASK, EF_VM|EF_RF|EF_NT|EF_DF|EF_IE|EF_TF, 0U);
}
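
/*
 * Pre-compute a trap bounce for the guest's registered int 0x80 handler
 * so the int80_direct_trap stub can inject it without consulting
 * trap_ctxt on every system call: this is the fast path this changeset
 * introduces.
 */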
void init_int80_direct_trap(struct vcpu *v)
{
    struct trap_info *ti = &v->arch.guest_context.trap_ctxt[0x80];
    struct trap_bounce *tb = &v->arch.int80_bounce;

    if ( !guest_gate_selector_okay(v->domain, ti->cs) )
        return;

    tb->flags = TBF_EXCEPTION;
    tb->cs    = ti->cs;
    tb->eip   = ti->address;

    if ( null_trap_bounce(v, tb) )
        tb->flags = 0;
}
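
/*
 * Callback registration: the guest tells Xen where to deliver event
 * upcalls, failsafe (segment-restore) faults, NMIs, and SYSCALL entries.
 * Guest-side usage is a single hypercall, e.g. (illustrative sketch,
 * using the names from public/callback.h; xen_event_callback is a
 * hypothetical guest function):
 *
 *     struct callback_register cb = {
 *         .type    = CALLBACKTYPE_event,
 *         .flags   = 0,
 *         .address = (unsigned long)xen_event_callback,
 *     };
 *     HYPERVISOR_callback_op(CALLBACKOP_register, &cb);
 */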
static long register_guest_callback(struct callback_register *reg)
{
    long ret = 0;
    struct vcpu *v = current;

    switch ( reg->type )
    {
    case CALLBACKTYPE_event:
        v->arch.guest_context.event_callback_eip    = reg->address;
        break;

    case CALLBACKTYPE_failsafe:
        v->arch.guest_context.failsafe_callback_eip = reg->address;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_failsafe_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_failsafe_disables_events,
                      &v->arch.guest_context.flags);
        break;

    case CALLBACKTYPE_syscall:
        v->arch.guest_context.syscall_callback_eip  = reg->address;
        if ( reg->flags & CALLBACKF_mask_events )
            set_bit(_VGCF_syscall_disables_events,
                    &v->arch.guest_context.flags);
        else
            clear_bit(_VGCF_syscall_disables_events,
                      &v->arch.guest_context.flags);
        break;

    case CALLBACKTYPE_nmi:
        ret = register_guest_nmi_callback(reg->address);
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

static long unregister_guest_callback(struct callback_unregister *unreg)
{
    long ret;

    switch ( unreg->type )
    {
    case CALLBACKTYPE_event:
    case CALLBACKTYPE_failsafe:
    case CALLBACKTYPE_syscall:
        ret = -EINVAL;
        break;

    case CALLBACKTYPE_nmi:
        ret = unregister_guest_nmi_callback();
        break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_callback_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    long ret;

    switch ( cmd )
    {
    case CALLBACKOP_register:
    {
        struct callback_register reg;

        ret = -EFAULT;
        if ( copy_from_guest(&reg, arg, 1) )
            break;

        ret = register_guest_callback(&reg);
    }
    break;

    case CALLBACKOP_unregister:
    {
        struct callback_unregister unreg;

        ret = -EFAULT;
        if ( copy_from_guest(&unreg, arg, 1) )
            break;

        ret = unregister_guest_callback(&unreg);
    }
    break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

long do_set_callbacks(unsigned long event_address,
                      unsigned long failsafe_address,
                      unsigned long syscall_address)
{
    struct callback_register event = {
        .type = CALLBACKTYPE_event,
        .address = event_address,
    };
    struct callback_register failsafe = {
        .type = CALLBACKTYPE_failsafe,
        .address = failsafe_address,
    };
    struct callback_register syscall = {
        .type = CALLBACKTYPE_syscall,
        .address = syscall_address,
    };

    register_guest_callback(&event);
    register_guest_callback(&failsafe);
    register_guest_callback(&syscall);

    return 0;
}
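
/*
 * The hypercall page gives guests a stable calling convention: the stub
 * for hypercall <i> lives at offset i*32, so a guest can invoke it
 * without knowing the underlying transfer instruction. From the guest
 * this reduces to (illustrative):
 *
 *     call hypercall_page + __HYPERVISOR_sched_op * 32
 *
 * %rcx and %r11 are saved around SYSCALL because the CPU clobbers them
 * with the return %rip and %rflags.
 */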
static void hypercall_page_initialise_ring3_kernel(void *hypercall_page)
{
    char *p;
    int i;

    /* Fill in all the transfer points with template machine code. */
    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        p = (char *)(hypercall_page + (i * 32));
        *(u8  *)(p+ 0) = 0x51;    /* push %rcx */
        *(u16 *)(p+ 1) = 0x5341;  /* push %r11 */
        *(u8  *)(p+ 3) = 0xb8;    /* mov  $<i>,%eax */
        *(u32 *)(p+ 4) = i;
        *(u16 *)(p+ 8) = 0x050f;  /* syscall */
        *(u16 *)(p+10) = 0x5b41;  /* pop  %r11 */
        *(u8  *)(p+12) = 0x59;    /* pop  %rcx */
        *(u8  *)(p+13) = 0xc3;    /* ret */
    }

    /*
     * HYPERVISOR_iret is special because it doesn't return and expects a
     * special stack frame. Guests jump at this transfer point instead of
     * calling it.
     */
    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
    *(u8  *)(p+ 0) = 0x51;    /* push %rcx */
    *(u16 *)(p+ 1) = 0x5341;  /* push %r11 */
    *(u8  *)(p+ 3) = 0x50;    /* push %rax */
    *(u8  *)(p+ 4) = 0xb8;    /* mov  $__HYPERVISOR_iret,%eax */
    *(u32 *)(p+ 5) = __HYPERVISOR_iret;
    *(u16 *)(p+ 9) = 0x050f;  /* syscall */
}
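
/* Pull in the compat-mode (ring 1) implementations, including
 * hypercall_page_initialise_ring1_kernel() used below. */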
#include "compat/traps.c"

void hypercall_page_initialise(struct domain *d, void *hypercall_page)
{
    if ( is_hvm_domain(d) )
        hvm_hypercall_page_initialise(d, hypercall_page);
    else if ( !IS_COMPAT(d) )
        hypercall_page_initialise_ring3_kernel(hypercall_page);
    else
        hypercall_page_initialise_ring1_kernel(hypercall_page);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */