ia64/xen-unstable

xen/arch/x86/traps.c @ 19848:5839491bbf20

[IA64] replace MAX_VCPUS with d->max_vcpus where necessary.

Don't use the MAX_VCPUS constant; use the per-domain d->max_vcpus field instead.
Changeset 2f9e1348aa98 introduced max_vcpus to allow more vcpus per guest.
This patch is the ia64 counterpart.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Isaku Yamahata <yamahata@valinux.co.jp>
date Mon Jun 29 11:26:05 2009 +0900 (2009-06-29)
parents 2f9e1348aa98
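
The changeset description above refers to an ia64-side change, while the file shown below is the x86 traps.c at this revision. As a rough, hypothetical sketch of the kind of substitution the description refers to (assumed names, not taken from the changeset's actual diff), a vcpu loop bounded by the global MAX_VCPUS constant becomes one bounded by the per-domain limit:

    /* Hypothetical illustration only -- not code from this changeset. */
    unsigned int i;
    for ( i = 0; i < d->max_vcpus; i++ )   /* previously: i < MAX_VCPUS */
    {
        struct vcpu *v = d->vcpu[i];       /* assumes d is a struct domain * */
        if ( v == NULL )
            continue;
        /* ... per-vcpu work ... */
    }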
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/traps.h>
65 #include <asm/hvm/vpt.h>
66 #include <public/arch-x86/cpuid.h>
68 /*
69 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
70 * fatal: Xen prints diagnostic message and then hangs.
71 * dom0: The NMI is virtualised to DOM0.
72 * ignore: The NMI error is cleared and ignored.
73 */
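/* Selected at boot time via the Xen command line, e.g. "nmi=dom0". */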
74 #ifdef NDEBUG
75 char opt_nmi[10] = "dom0";
76 #else
77 char opt_nmi[10] = "fatal";
78 #endif
79 string_param("nmi", opt_nmi);
81 DEFINE_PER_CPU(u32, ler_msr);
83 /* Master table, used by CPU0. */
84 idt_entry_t idt_table[IDT_ENTRIES];
86 /* Pointer to the IDT of every CPU. */
87 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
89 #define DECLARE_TRAP_HANDLER(_name) \
90 asmlinkage void _name(void); \
91 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(nmi);
96 DECLARE_TRAP_HANDLER(int3);
97 DECLARE_TRAP_HANDLER(overflow);
98 DECLARE_TRAP_HANDLER(bounds);
99 DECLARE_TRAP_HANDLER(invalid_op);
100 DECLARE_TRAP_HANDLER(device_not_available);
101 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
102 DECLARE_TRAP_HANDLER(invalid_TSS);
103 DECLARE_TRAP_HANDLER(segment_not_present);
104 DECLARE_TRAP_HANDLER(stack_segment);
105 DECLARE_TRAP_HANDLER(general_protection);
106 DECLARE_TRAP_HANDLER(page_fault);
107 DECLARE_TRAP_HANDLER(coprocessor_error);
108 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
109 DECLARE_TRAP_HANDLER(machine_check);
110 DECLARE_TRAP_HANDLER(alignment_check);
111 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
113 long do_set_debugreg(int reg, unsigned long value);
114 unsigned long do_get_debugreg(int reg);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
132 static void show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs)
133 {
134 int i;
135 unsigned long *stack, addr;
136 unsigned long mask = STACK_SIZE;
138 if ( is_hvm_vcpu(v) )
139 return;
141 if ( is_pv_32on64_vcpu(v) )
142 {
143 compat_show_guest_stack(v, regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 if ( !access_ok(stack, sizeof(*stack)) )
160 {
161 printk("Guest-inaccessible memory.\n");
162 return;
163 }
165 if ( v != current )
166 {
167 struct vcpu *vcpu;
169 ASSERT(guest_kernel_mode(v, regs));
170 #ifndef __x86_64__
171 addr = read_cr3();
172 for_each_vcpu( v->domain, vcpu )
173 if ( vcpu->arch.cr3 == addr )
174 break;
175 #else
176 vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
177 #endif
178 if ( !vcpu )
179 {
180 stack = do_page_walk(v, (unsigned long)stack);
181 if ( (unsigned long)stack < PAGE_SIZE )
182 {
183 printk("Inaccessible guest memory.\n");
184 return;
185 }
186 mask = PAGE_SIZE;
187 }
188 }
190 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
191 {
192 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
193 break;
194 if ( __get_user(addr, stack) )
195 {
196 if ( i != 0 )
197 printk("\n ");
198 printk("Fault while accessing guest memory.");
199 i = 1;
200 break;
201 }
202 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
203 printk("\n ");
204 printk(" %p", _p(addr));
205 stack++;
206 }
207 if ( i == 0 )
208 printk("Stack empty.");
209 printk("\n");
210 }
212 #if !defined(CONFIG_FRAME_POINTER)
214 static void show_trace(struct cpu_user_regs *regs)
215 {
216 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
218 printk("Xen call trace:\n ");
220 printk("[<%p>]", _p(regs->eip));
221 print_symbol(" %s\n ", regs->eip);
223 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
224 {
225 addr = *stack++;
226 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
227 {
228 printk("[<%p>]", _p(addr));
229 print_symbol(" %s\n ", addr);
230 }
231 }
233 printk("\n");
234 }
236 #else
238 static void show_trace(struct cpu_user_regs *regs)
239 {
240 unsigned long *frame, next, addr, low, high;
242 printk("Xen call trace:\n ");
244 printk("[<%p>]", _p(regs->eip));
245 print_symbol(" %s\n ", regs->eip);
247 /* Bounds for range of valid frame pointer. */
248 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
249 high = (low & ~(STACK_SIZE - 1)) +
250 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
252 /* The initial frame pointer. */
253 next = regs->ebp;
255 for ( ; ; )
256 {
257 /* Valid frame pointer? */
258 if ( (next < low) || (next >= high) )
259 {
260 /*
261 * Exception stack frames have a different layout, denoted by an
262 * inverted frame pointer.
263 */
264 next = ~next;
265 if ( (next < low) || (next >= high) )
266 break;
267 frame = (unsigned long *)next;
268 next = frame[0];
269 addr = frame[(offsetof(struct cpu_user_regs, eip) -
270 offsetof(struct cpu_user_regs, ebp))
271 / BYTES_PER_LONG];
272 }
273 else
274 {
275 /* Ordinary stack frame. */
276 frame = (unsigned long *)next;
277 next = frame[0];
278 addr = frame[1];
279 }
281 printk("[<%p>]", _p(addr));
282 print_symbol(" %s\n ", addr);
284 low = (unsigned long)&frame[2];
285 }
287 printk("\n");
288 }
290 #endif
292 void show_stack(struct cpu_user_regs *regs)
293 {
294 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
295 int i;
297 if ( guest_mode(regs) )
298 return show_guest_stack(current, regs);
300 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
302 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
303 {
304 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
305 break;
306 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
307 printk("\n ");
308 addr = *stack++;
309 printk(" %p", _p(addr));
310 }
311 if ( i == 0 )
312 printk("Stack empty.");
313 printk("\n");
315 show_trace(regs);
316 }
318 void show_stack_overflow(unsigned int cpu, unsigned long esp)
319 {
320 #ifdef MEMORY_GUARD
321 unsigned long esp_top, esp_bottom;
322 unsigned long *stack, addr;
324 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
325 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
327 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
328 (void *)esp_top, (void *)esp_bottom, (void *)esp,
329 (void *)init_tss[cpu].esp0);
331 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
332 if ( ((unsigned long)(esp - esp_top) > 512) &&
333 ((unsigned long)(esp_top - esp) > 512) )
334 {
335 printk("No stack overflow detected. Skipping stack trace.\n");
336 return;
337 }
339 if ( esp < esp_top )
340 esp = esp_top;
342 printk("Xen stack overflow (dumping trace %p-%p):\n ",
343 (void *)esp, (void *)esp_bottom);
345 stack = (unsigned long *)esp;
346 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
347 {
348 addr = *stack++;
349 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
350 {
351 printk("%p: [<%p>]", stack, _p(addr));
352 print_symbol(" %s\n ", addr);
353 }
354 }
356 printk("\n");
357 #endif
358 }
360 void show_execution_state(struct cpu_user_regs *regs)
361 {
362 show_registers(regs);
363 show_stack(regs);
364 }
366 void vcpu_show_execution_state(struct vcpu *v)
367 {
368 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
369 v->domain->domain_id, v->vcpu_id);
371 if ( v == current )
372 {
373 show_execution_state(guest_cpu_user_regs());
374 return;
375 }
377 vcpu_pause(v); /* acceptably dangerous */
379 vcpu_show_registers(v);
380 if ( guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
381 show_guest_stack(v, &v->arch.guest_context.user_regs);
383 vcpu_unpause(v);
384 }
386 char *trapstr(int trapnr)
387 {
388 static char *strings[] = {
389 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
390 "invalid opcode", "device not available", "double fault",
391 "coprocessor segment", "invalid tss", "segment not found",
392 "stack error", "general protection fault", "page fault",
393 "spurious interrupt", "coprocessor error", "alignment check",
394 "machine check", "simd error"
395 };
397 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
398 return "???";
400 return strings[trapnr];
401 }
403 /*
404 * This is called for faults at very unexpected times (e.g., when interrupts
405 * are disabled). In such situations we can't do much that is safe. We try to
406 * print out some tracing and then we just spin.
407 */
408 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
409 {
410 static DEFINE_PER_CPU(char, depth);
412 /*
413 * In some cases, we can end up in a vicious cycle of fatal_trap()s
414 * within fatal_trap()s. We give the problem a couple of iterations to
415 * bottom out, and then we just panic.
416 */
417 if ( ++this_cpu(depth) < 3 )
418 {
419 watchdog_disable();
420 console_start_sync();
422 show_execution_state(regs);
424 if ( trapnr == TRAP_page_fault )
425 {
426 unsigned long cr2 = read_cr2();
427 printk("Faulting linear address: %p\n", _p(cr2));
428 show_page_walk(cr2);
429 }
430 }
432 panic("FATAL TRAP: vector = %d (%s)\n"
433 "[error_code=%04x] %s\n",
434 trapnr, trapstr(trapnr), regs->error_code,
435 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
436 }
438 static void do_guest_trap(
439 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
440 {
441 struct vcpu *v = current;
442 struct trap_bounce *tb;
443 const struct trap_info *ti;
445 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
447 tb = &v->arch.trap_bounce;
448 ti = &v->arch.guest_context.trap_ctxt[trapnr];
450 tb->flags = TBF_EXCEPTION;
451 tb->cs = ti->cs;
452 tb->eip = ti->address;
454 if ( use_error_code )
455 {
456 tb->flags |= TBF_EXCEPTION_ERRCODE;
457 tb->error_code = regs->error_code;
458 }
460 if ( TI_GET_IF(ti) )
461 tb->flags |= TBF_INTERRUPT;
463 if ( unlikely(null_trap_bounce(v, tb)) )
464 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
465 "on VCPU %d [ec=%04x]\n",
466 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
467 }
469 static void instruction_done(
470 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
471 {
472 regs->eip = eip;
473 regs->eflags &= ~X86_EFLAGS_RF;
474 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
475 {
476 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
477 if ( regs->eflags & X86_EFLAGS_TF )
478 current->arch.guest_context.debugreg[6] |= 0x4000;
479 do_guest_trap(TRAP_debug, regs, 0);
480 }
481 }
483 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
484 unsigned int port, unsigned int len)
485 {
486 unsigned int width, i, match = 0;
487 unsigned long start;
489 if ( !(v->arch.guest_context.debugreg[5]) ||
490 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
491 return 0;
493 for ( i = 0; i < 4; i++ )
494 {
495 if ( !(v->arch.guest_context.debugreg[5] &
496 (3 << (i * DR_ENABLE_SIZE))) )
497 continue;
499 start = v->arch.guest_context.debugreg[i];
500 width = 0;
502 switch ( (v->arch.guest_context.debugreg[7] >>
503 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
504 {
505 case DR_LEN_1: width = 1; break;
506 case DR_LEN_2: width = 2; break;
507 case DR_LEN_4: width = 4; break;
508 case DR_LEN_8: width = 8; break;
509 }
511 if ( (start < (port + len)) && ((start + width) > port) )
512 match |= 1 << i;
513 }
515 return match;
516 }
518 /*
519 * Called from asm to set up the MCE trapbounce info.
520 * Returns 0 if no callback is set up, else 1.
521 */
522 asmlinkage int set_guest_machinecheck_trapbounce(void)
523 {
524 struct vcpu *v = current;
525 struct trap_bounce *tb = &v->arch.trap_bounce;
527 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
528 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
529 return !null_trap_bounce(v, tb);
530 }
532 /*
533 * Called from asm to set up the NMI trapbounce info.
534 * Returns 0 if no callback is set up, else 1.
535 */
536 asmlinkage int set_guest_nmi_trapbounce(void)
537 {
538 struct vcpu *v = current;
539 struct trap_bounce *tb = &v->arch.trap_bounce;
540 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
541 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
542 return !null_trap_bounce(v, tb);
543 }
545 static inline void do_trap(
546 int trapnr, struct cpu_user_regs *regs, int use_error_code)
547 {
548 struct vcpu *curr = current;
549 unsigned long fixup;
551 DEBUGGER_trap_entry(trapnr, regs);
553 if ( guest_mode(regs) )
554 {
555 do_guest_trap(trapnr, regs, use_error_code);
556 return;
557 }
559 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
560 {
561 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
562 trapnr, _p(regs->eip), _p(fixup));
563 regs->eip = fixup;
564 return;
565 }
567 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
568 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
569 {
570 curr->arch.hvm_vcpu.fpu_exception_callback(
571 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
572 return;
573 }
575 DEBUGGER_trap_fatal(trapnr, regs);
577 show_execution_state(regs);
578 panic("FATAL TRAP: vector = %d (%s)\n"
579 "[error_code=%04x]\n",
580 trapnr, trapstr(trapnr), regs->error_code);
581 }
583 #define DO_ERROR_NOCODE(trapnr, name) \
584 asmlinkage void do_##name(struct cpu_user_regs *regs) \
585 { \
586 do_trap(trapnr, regs, 0); \
587 }
589 #define DO_ERROR(trapnr, name) \
590 asmlinkage void do_##name(struct cpu_user_regs *regs) \
591 { \
592 do_trap(trapnr, regs, 1); \
593 }
595 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
596 DO_ERROR_NOCODE(TRAP_overflow, overflow)
597 DO_ERROR_NOCODE(TRAP_bounds, bounds)
598 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
599 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
600 DO_ERROR( TRAP_no_segment, segment_not_present)
601 DO_ERROR( TRAP_stack_error, stack_segment)
602 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
603 DO_ERROR( TRAP_alignment_check, alignment_check)
604 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
606 int rdmsr_hypervisor_regs(
607 uint32_t idx, uint32_t *eax, uint32_t *edx)
608 {
609 struct domain *d = current->domain;
610 /* Optionally shift out of the way of Viridian architectural MSRs. */
611 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
613 idx -= base;
614 if ( idx > 0 )
615 return 0;
617 switch ( idx )
618 {
619 case 0:
620 {
621 *eax = *edx = 0;
622 break;
623 }
624 default:
625 BUG();
626 }
628 return 1;
629 }
631 int wrmsr_hypervisor_regs(
632 uint32_t idx, uint32_t eax, uint32_t edx)
633 {
634 struct domain *d = current->domain;
635 /* Optionally shift out of the way of Viridian architectural MSRs. */
636 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
638 idx -= base;
639 if ( idx > 0 )
640 return 0;
642 switch ( idx )
643 {
644 case 0:
645 {
646 void *hypercall_page;
647 unsigned long mfn;
648 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
649 unsigned int idx = eax & 0xfff;
651 if ( idx > 0 )
652 {
653 gdprintk(XENLOG_WARNING,
654 "Out of range index %u to MSR %08x\n",
655 idx, 0x40000000);
656 return 0;
657 }
659 mfn = gmfn_to_mfn(d, gmfn);
661 if ( !mfn_valid(mfn) ||
662 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
663 {
664 gdprintk(XENLOG_WARNING,
665 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
666 gmfn, mfn, base + idx);
667 return 0;
668 }
670 hypercall_page = map_domain_page(mfn);
671 hypercall_page_initialise(d, hypercall_page);
672 unmap_domain_page(hypercall_page);
674 put_page_and_type(mfn_to_page(mfn));
675 break;
676 }
678 default:
679 BUG();
680 }
682 return 1;
683 }
685 int cpuid_hypervisor_leaves(
686 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
687 {
688 struct domain *d = current->domain;
689 /* Optionally shift out of the way of Viridian architectural leaves. */
690 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
692 idx -= base;
693 if ( idx > 2 )
694 return 0;
696 switch ( idx )
697 {
698 case 0:
699 *eax = base + 2; /* Largest leaf */
700 *ebx = XEN_CPUID_SIGNATURE_EBX;
701 *ecx = XEN_CPUID_SIGNATURE_ECX;
702 *edx = XEN_CPUID_SIGNATURE_EDX;
703 break;
705 case 1:
706 *eax = (xen_major_version() << 16) | xen_minor_version();
707 *ebx = 0; /* Reserved */
708 *ecx = 0; /* Reserved */
709 *edx = 0; /* Reserved */
710 break;
712 case 2:
713 *eax = 1; /* Number of hypercall-transfer pages */
714 *ebx = 0x40000000; /* MSR base address */
715 if ( is_viridian_domain(d) )
716 *ebx = 0x40000200;
717 *ecx = 0; /* Features 1 */
718 *edx = 0; /* Features 2 */
719 if ( !is_hvm_vcpu(current) )
720 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
721 break;
723 default:
724 BUG();
725 }
727 return 1;
728 }
730 static void pv_cpuid(struct cpu_user_regs *regs)
731 {
732 uint32_t a, b, c, d;
734 a = regs->eax;
735 b = regs->ebx;
736 c = regs->ecx;
737 d = regs->edx;
739 if ( current->domain->domain_id != 0 )
740 {
741 if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
742 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
743 goto out;
744 }
746 asm (
747 "cpuid"
748 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
749 : "0" (a), "1" (b), "2" (c), "3" (d) );
751 if ( (regs->eax & 0x7fffffff) == 1 )
752 {
753 /* Modify Feature Information. */
754 __clear_bit(X86_FEATURE_VME, &d);
755 if ( !cpu_has_apic )
756 __clear_bit(X86_FEATURE_APIC, &d);
757 if ( !opt_allow_hugepage )
758 __clear_bit(X86_FEATURE_PSE, &d);
759 __clear_bit(X86_FEATURE_PGE, &d);
760 __clear_bit(X86_FEATURE_PSE36, &d);
761 }
762 switch ( (uint32_t)regs->eax )
763 {
764 case 1:
765 /* Modify Feature Information. */
766 if ( !cpu_has_sep )
767 __clear_bit(X86_FEATURE_SEP, &d);
768 #ifdef __i386__
769 if ( !supervisor_mode_kernel )
770 __clear_bit(X86_FEATURE_SEP, &d);
771 #endif
772 __clear_bit(X86_FEATURE_DS, &d);
773 __clear_bit(X86_FEATURE_ACC, &d);
774 __clear_bit(X86_FEATURE_PBE, &d);
776 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
777 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
778 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
779 __clear_bit(X86_FEATURE_VMXE % 32, &c);
780 __clear_bit(X86_FEATURE_SMXE % 32, &c);
781 __clear_bit(X86_FEATURE_TM2 % 32, &c);
782 if ( is_pv_32bit_vcpu(current) )
783 __clear_bit(X86_FEATURE_CX16 % 32, &c);
784 __clear_bit(X86_FEATURE_XTPR % 32, &c);
785 __clear_bit(X86_FEATURE_PDCM % 32, &c);
786 __clear_bit(X86_FEATURE_DCA % 32, &c);
787 __clear_bit(X86_FEATURE_XSAVE % 32, &c);
788 if ( !cpu_has_apic )
789 __clear_bit(X86_FEATURE_X2APIC % 32, &c);
790 __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
791 break;
792 case 0x80000001:
793 /* Modify Feature Information. */
794 if ( is_pv_32bit_vcpu(current) )
795 {
796 __clear_bit(X86_FEATURE_LM % 32, &d);
797 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
798 }
799 #ifndef __i386__
800 if ( is_pv_32on64_vcpu(current) &&
801 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
802 #endif
803 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
804 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
805 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
807 __clear_bit(X86_FEATURE_SVME % 32, &c);
808 if ( !cpu_has_apic )
809 __clear_bit(X86_FEATURE_EXTAPICSPACE % 32, &c);
810 __clear_bit(X86_FEATURE_OSVW % 32, &c);
811 __clear_bit(X86_FEATURE_IBS % 32, &c);
812 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
813 __clear_bit(X86_FEATURE_WDT % 32, &c);
814 break;
815 case 5: /* MONITOR/MWAIT */
816 case 0xa: /* Architectural Performance Monitor Features */
817 case 0x8000000a: /* SVM revision and features */
818 case 0x8000001b: /* Instruction Based Sampling */
819 a = b = c = d = 0;
820 break;
821 default:
822 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
823 break;
824 }
826 out:
827 regs->eax = a;
828 regs->ebx = b;
829 regs->ecx = c;
830 regs->edx = d;
831 }
833 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
834 {
835 char sig[5], instr[2];
836 unsigned long eip, rc;
838 eip = regs->eip;
840 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
841 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
842 {
843 propagate_page_fault(eip + sizeof(sig) - rc, 0);
844 return EXCRET_fault_fixed;
845 }
846 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
847 return 0;
848 eip += sizeof(sig);
850 /* We only emulate CPUID. */
851 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
852 {
853 propagate_page_fault(eip + sizeof(instr) - rc, 0);
854 return EXCRET_fault_fixed;
855 }
856 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
857 return 0;
858 eip += sizeof(instr);
860 pv_cpuid(regs);
862 instruction_done(regs, eip, 0);
864 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
866 return EXCRET_fault_fixed;
867 }
869 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
870 {
871 struct bug_frame bug;
872 struct bug_frame_str bug_str;
873 const char *filename, *predicate, *eip = (char *)regs->eip;
874 unsigned long fixup;
875 int id, lineno;
877 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
879 if ( likely(guest_mode(regs)) )
880 {
881 if ( !emulate_forced_invalid_op(regs) )
882 do_guest_trap(TRAP_invalid_op, regs, 0);
883 return;
884 }
886 if ( !is_kernel(eip) ||
887 __copy_from_user(&bug, eip, sizeof(bug)) ||
888 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
889 (bug.ret != 0xc2) )
890 goto die;
891 eip += sizeof(bug);
893 id = bug.id & 3;
895 if ( id == BUGFRAME_dump )
896 {
897 show_execution_state(regs);
898 regs->eip = (unsigned long)eip;
899 return;
900 }
902 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
903 if ( !is_kernel(eip) ||
904 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
905 (bug_str.mov != 0xbc) )
906 goto die;
907 filename = bug_str(bug_str, eip);
908 eip += sizeof(bug_str);
910 if ( !is_kernel(filename) )
911 filename = "<unknown>";
912 lineno = bug.id >> 2;
914 if ( id == BUGFRAME_warn )
915 {
916 printk("Xen WARN at %.50s:%d\n", filename, lineno);
917 show_execution_state(regs);
918 regs->eip = (unsigned long)eip;
919 return;
920 }
922 if ( id == BUGFRAME_bug )
923 {
924 printk("Xen BUG at %.50s:%d\n", filename, lineno);
925 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
926 show_execution_state(regs);
927 panic("Xen BUG at %.50s:%d\n", filename, lineno);
928 }
930 /* ASSERT: decode the predicate string pointer. */
931 ASSERT(id == BUGFRAME_assert);
932 if ( !is_kernel(eip) ||
933 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
934 (bug_str.mov != 0xbc) )
935 goto die;
936 predicate = bug_str(bug_str, eip);
937 eip += sizeof(bug_str);
939 if ( !is_kernel(predicate) )
940 predicate = "<unknown>";
941 printk("Assertion '%s' failed at %.50s:%d\n",
942 predicate, filename, lineno);
943 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
944 show_execution_state(regs);
945 panic("Assertion '%s' failed at %.50s:%d\n",
946 predicate, filename, lineno);
948 die:
949 if ( (fixup = search_exception_table(regs->eip)) != 0 )
950 {
951 regs->eip = fixup;
952 return;
953 }
954 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
955 show_execution_state(regs);
956 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
957 }
959 asmlinkage void do_int3(struct cpu_user_regs *regs)
960 {
961 DEBUGGER_trap_entry(TRAP_int3, regs);
963 if ( !guest_mode(regs) )
964 {
965 debugger_trap_fatal(TRAP_int3, regs);
966 return;
967 }
969 do_guest_trap(TRAP_int3, regs, 0);
970 }
972 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
973 {
974 machine_check_vector(regs, regs->error_code);
975 }
977 static void reserved_bit_page_fault(
978 unsigned long addr, struct cpu_user_regs *regs)
979 {
980 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
981 current->domain->domain_id, current->vcpu_id, regs->error_code);
982 show_page_walk(addr);
983 show_execution_state(regs);
984 }
986 void propagate_page_fault(unsigned long addr, u16 error_code)
987 {
988 struct trap_info *ti;
989 struct vcpu *v = current;
990 struct trap_bounce *tb = &v->arch.trap_bounce;
992 v->arch.guest_context.ctrlreg[2] = addr;
993 arch_set_cr2(v, addr);
995 /* Re-set error_code.user flag appropriately for the guest. */
996 error_code &= ~PFEC_user_mode;
997 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
998 error_code |= PFEC_user_mode;
1000 trace_pv_page_fault(addr, error_code);
1002 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
1003 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1004 tb->error_code = error_code;
1005 tb->cs = ti->cs;
1006 tb->eip = ti->address;
1007 if ( TI_GET_IF(ti) )
1008 tb->flags |= TBF_INTERRUPT;
1009 if ( unlikely(null_trap_bounce(v, tb)) )
1010 {
1011 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
1012 v->domain->domain_id, v->vcpu_id, error_code);
1013 show_page_walk(addr);
1014 }
1016 if ( unlikely(error_code & PFEC_reserved_bit) )
1017 reserved_bit_page_fault(addr, guest_cpu_user_regs());
1018 }
1020 static int handle_gdt_ldt_mapping_fault(
1021 unsigned long offset, struct cpu_user_regs *regs)
1022 {
1023 struct vcpu *curr = current;
1024 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1025 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1026 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1028 /* Should never fault in another vcpu's area. */
1029 BUG_ON(vcpu_area != curr->vcpu_id);
1031 /* Byte offset within the gdt/ldt sub-area. */
1032 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1034 if ( likely(is_ldt_area) )
1035 {
1036 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1037 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
1038 {
1039 if ( guest_mode(regs) )
1040 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1041 regs->eip, offset);
1042 }
1043 else
1044 {
1045 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1046 if ( !guest_mode(regs) )
1047 return 0;
1048 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1049 propagate_page_fault(
1050 curr->arch.guest_context.ldt_base + offset,
1051 regs->error_code);
1052 }
1053 }
1054 else
1055 {
1056 /* GDT fault: handle the fault as #GP(selector). */
1057 regs->error_code = (u16)offset & ~7;
1058 (void)do_general_protection(regs);
1059 }
1061 return EXCRET_fault_fixed;
1062 }
1064 #ifdef HYPERVISOR_VIRT_END
1065 #define IN_HYPERVISOR_RANGE(va) \
1066 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1067 #else
1068 #define IN_HYPERVISOR_RANGE(va) \
1069 (((va) >= HYPERVISOR_VIRT_START))
1070 #endif
1072 static int __spurious_page_fault(
1073 unsigned long addr, unsigned int error_code)
1075 unsigned long mfn, cr3 = read_cr3();
1076 #if CONFIG_PAGING_LEVELS >= 4
1077 l4_pgentry_t l4e, *l4t;
1078 #endif
1079 #if CONFIG_PAGING_LEVELS >= 3
1080 l3_pgentry_t l3e, *l3t;
1081 #endif
1082 l2_pgentry_t l2e, *l2t;
1083 l1_pgentry_t l1e, *l1t;
1084 unsigned int required_flags, disallowed_flags;
1086 /*
1087 * We do not take spurious page faults in IRQ handlers as we do not
1088 * modify page tables in IRQ context. We therefore bail here because
1089 * map_domain_page() is not IRQ-safe.
1090 */
1091 if ( in_irq() )
1092 return 0;
1094 /* Reserved bit violations are never spurious faults. */
1095 if ( error_code & PFEC_reserved_bit )
1096 return 0;
1098 required_flags = _PAGE_PRESENT;
1099 if ( error_code & PFEC_write_access )
1100 required_flags |= _PAGE_RW;
1101 if ( error_code & PFEC_user_mode )
1102 required_flags |= _PAGE_USER;
1104 disallowed_flags = 0;
1105 if ( error_code & PFEC_insn_fetch )
1106 disallowed_flags |= _PAGE_NX;
1108 mfn = cr3 >> PAGE_SHIFT;
1110 #if CONFIG_PAGING_LEVELS >= 4
1111 l4t = map_domain_page(mfn);
1112 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1113 mfn = l4e_get_pfn(l4e);
1114 unmap_domain_page(l4t);
1115 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1116 (l4e_get_flags(l4e) & disallowed_flags) )
1117 return 0;
1118 #endif
1120 #if CONFIG_PAGING_LEVELS >= 3
1121 l3t = map_domain_page(mfn);
1122 #if CONFIG_PAGING_LEVELS == 3
1123 l3t += (cr3 & 0xFE0UL) >> 3;
1124 #endif
1125 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1126 mfn = l3e_get_pfn(l3e);
1127 unmap_domain_page(l3t);
1128 #if CONFIG_PAGING_LEVELS == 3
1129 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1130 return 0;
1131 #else
1132 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1133 (l3e_get_flags(l3e) & disallowed_flags) )
1134 return 0;
1135 #endif
1136 #endif
1138 l2t = map_domain_page(mfn);
1139 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1140 mfn = l2e_get_pfn(l2e);
1141 unmap_domain_page(l2t);
1142 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1143 (l2e_get_flags(l2e) & disallowed_flags) )
1144 return 0;
1145 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1147 l1e = l1e_empty(); /* define before use in debug tracing */
1148 goto spurious;
1151 l1t = map_domain_page(mfn);
1152 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1153 mfn = l1e_get_pfn(l1e);
1154 unmap_domain_page(l1t);
1155 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1156 (l1e_get_flags(l1e) & disallowed_flags) )
1157 return 0;
1159 spurious:
1160 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1161 "at addr %lx, e/c %04x\n",
1162 current->domain->domain_id, current->vcpu_id,
1163 addr, error_code);
1164 #if CONFIG_PAGING_LEVELS >= 4
1165 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1166 #endif
1167 #if CONFIG_PAGING_LEVELS >= 3
1168 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1169 #endif
1170 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1171 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1172 return 1;
1175 static int spurious_page_fault(
1176 unsigned long addr, unsigned int error_code)
1178 unsigned long flags;
1179 int is_spurious;
1181 /*
1182 * Disabling interrupts prevents TLB flushing, and hence prevents
1183 * page tables from becoming invalid under our feet during the walk.
1184 */
1185 local_irq_save(flags);
1186 is_spurious = __spurious_page_fault(addr, error_code);
1187 local_irq_restore(flags);
1189 return is_spurious;
1192 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1194 struct vcpu *v = current;
1195 struct domain *d = v->domain;
1197 /* No fixups in interrupt context or when interrupts are disabled. */
1198 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1199 return 0;
1201 /* Faults from external-mode guests are handled by shadow/hap */
1202 if ( paging_mode_external(d) && guest_mode(regs) )
1204 int ret = paging_fault(addr, regs);
1205 if ( ret == EXCRET_fault_fixed )
1206 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1207 return ret;
1210 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1212 if ( !(regs->error_code & PFEC_reserved_bit) &&
1213 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1214 return handle_gdt_ldt_mapping_fault(
1215 addr - GDT_LDT_VIRT_START, regs);
1216 return 0;
1219 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1220 guest_kernel_mode(v, regs) &&
1221 /* Do not check if access-protection fault since the page may
1222 legitimately be not present in shadow page tables */
1223 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1224 PFEC_write_access) &&
1225 ptwr_do_page_fault(v, addr, regs) )
1226 return EXCRET_fault_fixed;
1228 /* For non-external shadowed guests, we fix up both their own
1229 * pagefaults and Xen's, since they share the pagetables. */
1230 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1232 int ret = paging_fault(addr, regs);
1233 if ( ret == EXCRET_fault_fixed )
1234 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1235 return ret;
1238 return 0;
1241 /*
1242 * #PF error code:
1243 * Bit 0: Protection violation (=1) ; Page not present (=0)
1244 * Bit 1: Write access
1245 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1246 * Bit 3: Reserved bit violation
1247 * Bit 4: Instruction fetch
1248 */
1249 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1251 unsigned long addr, fixup;
1252 unsigned int error_code;
1254 addr = read_cr2();
1256 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1257 error_code = regs->error_code;
1259 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1261 perfc_incr(page_faults);
1263 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1264 return;
1266 if ( unlikely(!guest_mode(regs)) )
1268 if ( spurious_page_fault(addr, error_code) )
1269 return;
1271 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1273 perfc_incr(copy_user_faults);
1274 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1275 reserved_bit_page_fault(addr, regs);
1276 regs->eip = fixup;
1277 return;
1280 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1282 show_execution_state(regs);
1283 show_page_walk(addr);
1284 panic("FATAL PAGE FAULT\n"
1285 "[error_code=%04x]\n"
1286 "Faulting linear address: %p\n",
1287 error_code, _p(addr));
1290 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1291 && spurious_page_fault(addr, error_code)) )
1292 return;
1294 propagate_page_fault(addr, regs->error_code);
1297 /*
1298 * Early #PF handler to print CR2, error code, and stack.
1300 * We also deal with spurious faults here, even though they should never happen
1301 * during early boot (an issue was seen once, but was most likely a hardware
1302 * problem).
1303 */
1304 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1305 {
1306 static int stuck;
1307 static unsigned long prev_eip, prev_cr2;
1308 unsigned long cr2 = read_cr2();
1310 BUG_ON(smp_processor_id() != 0);
1312 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1313 {
1314 prev_eip = regs->eip;
1315 prev_cr2 = cr2;
1316 stuck = 0;
1317 return;
1318 }
1320 if ( stuck++ == 1000 )
1321 {
1322 unsigned long *stk = (unsigned long *)regs;
1323 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1324 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1325 printk("Stack dump: ");
1326 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1327 printk("%p ", _p(*stk++));
1328 for ( ; ; ) ;
1329 }
1330 }
1332 long do_fpu_taskswitch(int set)
1333 {
1334 struct vcpu *v = current;
1336 if ( set )
1337 {
1338 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1339 stts();
1340 }
1341 else
1342 {
1343 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1344 if ( v->fpu_dirtied )
1345 clts();
1346 }
1348 return 0;
1349 }
1351 static int read_descriptor(unsigned int sel,
1352 const struct vcpu *v,
1353 const struct cpu_user_regs * regs,
1354 unsigned long *base,
1355 unsigned long *limit,
1356 unsigned int *ar,
1357 unsigned int vm86attr)
1359 struct desc_struct desc;
1361 if ( !vm86_mode(regs) )
1363 if ( sel < 4)
1364 desc.b = desc.a = 0;
1365 else if ( __get_user(desc,
1366 (const struct desc_struct *)(!(sel & 4)
1367 ? GDT_VIRT_START(v)
1368 : LDT_VIRT_START(v))
1369 + (sel >> 3)) )
1370 return 0;
1371 if ( !(vm86attr & _SEGMENT_CODE) )
1372 desc.b &= ~_SEGMENT_L;
1374 else
1376 desc.a = (sel << 20) | 0xffff;
1377 desc.b = vm86attr | (sel >> 12);
1380 *ar = desc.b & 0x00f0ff00;
1381 if ( !(desc.b & _SEGMENT_L) )
1383 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1384 (desc.b & 0xff000000));
1385 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1386 if ( desc.b & _SEGMENT_G )
1387 *limit = ((*limit + 1) << 12) - 1;
1388 #ifndef NDEBUG
1389 if ( !vm86_mode(regs) && (sel > 3) )
1391 unsigned int a, l;
1392 unsigned char valid;
1394 asm volatile (
1395 "larl %2,%0 ; setz %1"
1396 : "=r" (a), "=qm" (valid) : "rm" (sel));
1397 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1398 asm volatile (
1399 "lsll %2,%0 ; setz %1"
1400 : "=r" (l), "=qm" (valid) : "rm" (sel));
1401 BUG_ON(valid && (l != *limit));
1403 #endif
1405 else
1407 *base = 0UL;
1408 *limit = ~0UL;
1411 return 1;
1414 #ifdef __x86_64__
1415 static int read_gate_descriptor(unsigned int gate_sel,
1416 const struct vcpu *v,
1417 unsigned int *sel,
1418 unsigned long *off,
1419 unsigned int *ar)
1421 struct desc_struct desc;
1422 const struct desc_struct *pdesc;
1425 pdesc = (const struct desc_struct *)
1426 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1427 + (gate_sel >> 3);
1428 if ( (gate_sel < 4) ||
1429 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1430 __get_user(desc, pdesc) )
1431 return 0;
1433 *sel = (desc.a >> 16) & 0x0000fffc;
1434 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1435 *ar = desc.b & 0x0000ffff;
1437 /*
1438 * check_descriptor() clears the DPL field and stores the
1439 * guest requested DPL in the selector's RPL field.
1440 */
1441 if ( *ar & _SEGMENT_DPL )
1442 return 0;
1443 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1445 if ( !is_pv_32bit_vcpu(v) )
1447 if ( (*ar & 0x1f00) != 0x0c00 ||
1448 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1449 __get_user(desc, pdesc + 1) ||
1450 (desc.b & 0x1f00) )
1451 return 0;
1453 *off |= (unsigned long)desc.a << 32;
1454 return 1;
1457 switch ( *ar & 0x1f00 )
1459 case 0x0400:
1460 *off &= 0xffff;
1461 break;
1462 case 0x0c00:
1463 break;
1464 default:
1465 return 0;
1468 return 1;
1470 #endif
1472 /* Has the guest requested sufficient permission for this I/O access? */
1473 static int guest_io_okay(
1474 unsigned int port, unsigned int bytes,
1475 struct vcpu *v, struct cpu_user_regs *regs)
1477 #if defined(__x86_64__)
1478 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1479 int user_mode = !(v->arch.flags & TF_kernel_mode);
1480 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1481 #elif defined(__i386__)
1482 #define TOGGLE_MODE() ((void)0)
1483 #endif
1485 if ( !vm86_mode(regs) &&
1486 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1487 return 1;
1489 if ( v->arch.iobmp_limit > (port + bytes) )
1491 union { uint8_t bytes[2]; uint16_t mask; } x;
1493 /*
1494 * Grab permission bytes from guest space. Inaccessible bytes are
1495 * read as 0xff (no access allowed).
1496 */
1497 TOGGLE_MODE();
1498 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1499 port>>3, 2) )
1501 default: x.bytes[0] = ~0;
1502 case 1: x.bytes[1] = ~0;
1503 case 0: break;
1505 TOGGLE_MODE();
1507 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1508 return 1;
1511 return 0;
1514 /* Has the administrator granted sufficient permission for this I/O access? */
1515 static int admin_io_okay(
1516 unsigned int port, unsigned int bytes,
1517 struct vcpu *v, struct cpu_user_regs *regs)
1519 /*
1520 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1521 * We never permit direct access to that register.
1522 */
1523 if ( (port == 0xcf8) && (bytes == 4) )
1524 return 0;
1526 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1529 static uint32_t guest_io_read(
1530 unsigned int port, unsigned int bytes,
1531 struct vcpu *v, struct cpu_user_regs *regs)
1533 extern uint32_t pci_conf_read(
1534 uint32_t cf8, uint8_t offset, uint8_t bytes);
1536 uint32_t data = 0;
1537 unsigned int shift = 0;
1539 if ( admin_io_okay(port, bytes, v, regs) )
1541 switch ( bytes )
1543 case 1: return inb(port);
1544 case 2: return inw(port);
1545 case 4: return inl(port);
1549 while ( bytes != 0 )
1551 unsigned int size = 1;
1552 uint32_t sub_data = 0xff;
1554 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1556 sub_data = pv_pit_handler(port, 0, 0);
1558 else if ( (port == 0xcf8) && (bytes == 4) )
1560 size = 4;
1561 sub_data = v->domain->arch.pci_cf8;
1563 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1565 size = min(bytes, 4 - (port & 3));
1566 if ( size == 3 )
1567 size = 2;
1568 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1571 if ( size == 4 )
1572 return sub_data;
1574 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1575 shift += size * 8;
1576 port += size;
1577 bytes -= size;
1580 return data;
1583 extern void (*pv_rtc_handler)(unsigned int port, uint8_t value);
1585 static void guest_io_write(
1586 unsigned int port, unsigned int bytes, uint32_t data,
1587 struct vcpu *v, struct cpu_user_regs *regs)
1589 extern void pci_conf_write(
1590 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1592 if ( admin_io_okay(port, bytes, v, regs) )
1594 switch ( bytes ) {
1595 case 1:
1596 if ( ((port == 0x70) || (port == 0x71)) && pv_rtc_handler )
1597 pv_rtc_handler(port, (uint8_t)data);
1598 outb((uint8_t)data, port);
1599 if ( pv_post_outb_hook )
1600 pv_post_outb_hook(port, (uint8_t)data);
1601 break;
1602 case 2:
1603 outw((uint16_t)data, port);
1604 break;
1605 case 4:
1606 outl(data, port);
1607 break;
1609 return;
1612 while ( bytes != 0 )
1614 unsigned int size = 1;
1616 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1618 pv_pit_handler(port, (uint8_t)data, 1);
1620 else if ( (port == 0xcf8) && (bytes == 4) )
1622 size = 4;
1623 v->domain->arch.pci_cf8 = data;
1625 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1627 size = min(bytes, 4 - (port & 3));
1628 if ( size == 3 )
1629 size = 2;
1630 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1633 if ( size == 4 )
1634 return;
1636 port += size;
1637 bytes -= size;
1638 data >>= size * 8;
1642 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1643 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1644 __attribute__((__regparm__(1)));
1645 unsigned long guest_to_host_gpr_switch(unsigned long)
1646 __attribute__((__regparm__(1)));
1648 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1650 /* Instruction fetch with error handling. */
1651 #define insn_fetch(type, base, eip, limit) \
1652 ({ unsigned long _rc, _ptr = (base) + (eip); \
1653 type _x; \
1654 if ( ad_default < 8 ) \
1655 _ptr = (unsigned int)_ptr; \
1656 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1657 goto fail; \
1658 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1659 { \
1660 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1661 goto skip; \
1662 } \
1663 (eip) += sizeof(_x); _x; })
1665 #if defined(CONFIG_X86_32)
1666 # define read_sreg(regs, sr) ((regs)->sr)
1667 #elif defined(CONFIG_X86_64)
1668 # define read_sreg(regs, sr) read_segment_register(sr)
1669 #endif
1671 static int is_cpufreq_controller(struct domain *d)
1672 {
1673 return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
1674 (d->domain_id == 0));
1675 }
1677 static int emulate_privileged_op(struct cpu_user_regs *regs)
1679 struct vcpu *v = current;
1680 unsigned long *reg, eip = regs->eip, res;
1681 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1682 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1683 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1684 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1685 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1686 ? regs->reg \
1687 : ad_bytes == 4 \
1688 ? (u32)regs->reg \
1689 : (u16)regs->reg)
1690 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1691 ? regs->reg = (val) \
1692 : ad_bytes == 4 \
1693 ? (*(u32 *)&regs->reg = (val)) \
1694 : (*(u16 *)&regs->reg = (val)))
1695 unsigned long code_base, code_limit;
1696 char io_emul_stub[32];
1697 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1698 u32 l, h, eax, edx;
1700 if ( !read_descriptor(regs->cs, v, regs,
1701 &code_base, &code_limit, &ar,
1702 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1703 goto fail;
1704 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1705 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1706 if ( !(ar & _SEGMENT_S) ||
1707 !(ar & _SEGMENT_P) ||
1708 !(ar & _SEGMENT_CODE) )
1709 goto fail;
1711 /* emulating only opcodes not allowing SS to be default */
1712 data_sel = read_sreg(regs, ds);
1714 /* Legacy prefixes. */
1715 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1717 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1719 case 0x66: /* operand-size override */
1720 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1721 continue;
1722 case 0x67: /* address-size override */
1723 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1724 continue;
1725 case 0x2e: /* CS override */
1726 data_sel = regs->cs;
1727 continue;
1728 case 0x3e: /* DS override */
1729 data_sel = read_sreg(regs, ds);
1730 continue;
1731 case 0x26: /* ES override */
1732 data_sel = read_sreg(regs, es);
1733 continue;
1734 case 0x64: /* FS override */
1735 data_sel = read_sreg(regs, fs);
1736 lm_ovr = lm_seg_fs;
1737 continue;
1738 case 0x65: /* GS override */
1739 data_sel = read_sreg(regs, gs);
1740 lm_ovr = lm_seg_gs;
1741 continue;
1742 case 0x36: /* SS override */
1743 data_sel = regs->ss;
1744 continue;
1745 case 0xf0: /* LOCK */
1746 lock = 1;
1747 continue;
1748 case 0xf2: /* REPNE/REPNZ */
1749 case 0xf3: /* REP/REPE/REPZ */
1750 rep_prefix = 1;
1751 continue;
1752 default:
1753 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1755 rex = opcode;
1756 continue;
1758 break;
1760 break;
1763 /* REX prefix. */
1764 if ( rex & 8 ) /* REX.W */
1765 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1766 modrm_reg = (rex & 4) << 1; /* REX.R */
1767 /* REX.X does not need to be decoded. */
1768 modrm_rm = (rex & 1) << 3; /* REX.B */
1770 if ( opcode == 0x0f )
1771 goto twobyte_opcode;
1773 if ( lock )
1774 goto fail;
1776 /* Input/Output String instructions. */
1777 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1779 unsigned long data_base, data_limit;
1781 if ( rep_prefix && (rd_ad(ecx) == 0) )
1782 goto done;
1784 if ( !(opcode & 2) )
1786 data_sel = read_sreg(regs, es);
1787 lm_ovr = lm_seg_none;
1790 if ( !(ar & _SEGMENT_L) )
1792 if ( !read_descriptor(data_sel, v, regs,
1793 &data_base, &data_limit, &ar,
1794 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1795 _SEGMENT_P) )
1796 goto fail;
1797 if ( !(ar & _SEGMENT_S) ||
1798 !(ar & _SEGMENT_P) ||
1799 (opcode & 2 ?
1800 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1801 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1802 goto fail;
1804 #ifdef CONFIG_X86_64
1805 else
1807 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1809 switch ( lm_ovr )
1811 case lm_seg_none:
1812 data_base = 0UL;
1813 break;
1814 case lm_seg_fs:
1815 data_base = v->arch.guest_context.fs_base;
1816 break;
1817 case lm_seg_gs:
1818 if ( guest_kernel_mode(v, regs) )
1819 data_base = v->arch.guest_context.gs_base_kernel;
1820 else
1821 data_base = v->arch.guest_context.gs_base_user;
1822 break;
1825 else
1826 read_descriptor(data_sel, v, regs,
1827 &data_base, &data_limit, &ar,
1828 0);
1829 data_limit = ~0UL;
1830 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1832 #endif
1834 port = (u16)regs->edx;
1836 continue_io_string:
1837 switch ( opcode )
1839 case 0x6c: /* INSB */
1840 op_bytes = 1;
1841 case 0x6d: /* INSW/INSL */
1842 if ( (data_limit < (op_bytes - 1)) ||
1843 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1844 !guest_io_okay(port, op_bytes, v, regs) )
1845 goto fail;
1846 data = guest_io_read(port, op_bytes, v, regs);
1847 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1848 &data, op_bytes)) != 0 )
1850 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1851 PFEC_write_access);
1852 return EXCRET_fault_fixed;
1854 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1855 ? -op_bytes : op_bytes));
1856 break;
1858 case 0x6e: /* OUTSB */
1859 op_bytes = 1;
1860 case 0x6f: /* OUTSW/OUTSL */
1861 if ( (data_limit < (op_bytes - 1)) ||
1862 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1863 !guest_io_okay(port, op_bytes, v, regs) )
1864 goto fail;
1865 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1866 op_bytes)) != 0 )
1868 propagate_page_fault(data_base + rd_ad(esi)
1869 + op_bytes - rc, 0);
1870 return EXCRET_fault_fixed;
1872 guest_io_write(port, op_bytes, data, v, regs);
1873 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1874 ? -op_bytes : op_bytes));
1875 break;
1878 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1880 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1882 if ( !bpmatch && !hypercall_preempt_check() )
1883 goto continue_io_string;
1884 eip = regs->eip;
1887 goto done;
1890 /*
1891 * Very likely to be an I/O instruction (IN/OUT).
1892 * Build an on-stack stub to execute the instruction with full guest
1893 * GPR context. This is needed for some systems which (ab)use IN/OUT
1894 * to communicate with BIOS code in system-management mode.
1895 */
1896 #ifdef __x86_64__
1897 /* movq $host_to_guest_gpr_switch,%rcx */
1898 io_emul_stub[0] = 0x48;
1899 io_emul_stub[1] = 0xb9;
1900 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1901 /* callq *%rcx */
1902 io_emul_stub[10] = 0xff;
1903 io_emul_stub[11] = 0xd1;
1904 #else
1905 /* call host_to_guest_gpr_switch */
1906 io_emul_stub[0] = 0xe8;
1907 *(s32 *)&io_emul_stub[1] =
1908 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1909 /* 7 x nop */
1910 memset(&io_emul_stub[5], 0x90, 7);
1911 #endif
1912 /* data16 or nop */
1913 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1914 /* <io-access opcode> */
1915 io_emul_stub[13] = opcode;
1916 /* imm8 or nop */
1917 io_emul_stub[14] = 0x90;
1918 /* ret (jumps to guest_to_host_gpr_switch) */
1919 io_emul_stub[15] = 0xc3;
1921 /* Handy function-typed pointer to the stub. */
1922 io_emul = (void *)io_emul_stub;
1924 if ( ioemul_handle_quirk )
1925 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
1927 /* I/O Port and Interrupt Flag instructions. */
1928 switch ( opcode )
1930 case 0xe4: /* IN imm8,%al */
1931 op_bytes = 1;
1932 case 0xe5: /* IN imm8,%eax */
1933 port = insn_fetch(u8, code_base, eip, code_limit);
1934 io_emul_stub[14] = port; /* imm8 */
1935 exec_in:
1936 if ( !guest_io_okay(port, op_bytes, v, regs) )
1937 goto fail;
1938 if ( admin_io_okay(port, op_bytes, v, regs) )
1940 io_emul(regs);
1942 else
1944 if ( op_bytes == 4 )
1945 regs->eax = 0;
1946 else
1947 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1948 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1950 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1951 goto done;
1953 case 0xec: /* IN %dx,%al */
1954 op_bytes = 1;
1955 case 0xed: /* IN %dx,%eax */
1956 port = (u16)regs->edx;
1957 goto exec_in;
1959 case 0xe6: /* OUT %al,imm8 */
1960 op_bytes = 1;
1961 case 0xe7: /* OUT %eax,imm8 */
1962 port = insn_fetch(u8, code_base, eip, code_limit);
1963 io_emul_stub[14] = port; /* imm8 */
1964 exec_out:
1965 if ( !guest_io_okay(port, op_bytes, v, regs) )
1966 goto fail;
1967 if ( admin_io_okay(port, op_bytes, v, regs) )
1969 if ( (op_bytes == 1) &&
1970 ((port == 0x71) || (port == 0x70)) &&
1971 pv_rtc_handler )
1972 pv_rtc_handler(port, regs->eax);
1973 io_emul(regs);
1974 if ( (op_bytes == 1) && pv_post_outb_hook )
1975 pv_post_outb_hook(port, regs->eax);
1977 else
1979 guest_io_write(port, op_bytes, regs->eax, v, regs);
1981 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1982 goto done;
1984 case 0xee: /* OUT %al,%dx */
1985 op_bytes = 1;
1986 case 0xef: /* OUT %eax,%dx */
1987 port = (u16)regs->edx;
1988 goto exec_out;
1990 case 0xfa: /* CLI */
1991 case 0xfb: /* STI */
1992 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1993 goto fail;
1994 /*
1995 * This is just too dangerous to allow, in my opinion. Consider if the
1996 * caller then tries to reenable interrupts using POPF: we can't trap
1997 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1998 * do for us. :-)
1999 */
2000 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
2001 goto done;
2004 /* No decode of this single-byte opcode. */
2005 goto fail;
2007 twobyte_opcode:
2008 /* Two-byte opcodes only emulated from guest kernel. */
2009 if ( !guest_kernel_mode(v, regs) )
2010 goto fail;
2012 /* Privileged (ring 0) instructions. */
2013 opcode = insn_fetch(u8, code_base, eip, code_limit);
2014 if ( lock && (opcode & ~3) != 0x20 )
2015 goto fail;
2016 switch ( opcode )
2018 case 0x06: /* CLTS */
2019 (void)do_fpu_taskswitch(0);
2020 break;
2022 case 0x09: /* WBINVD */
2023 /* Ignore the instruction if unprivileged. */
2024 if ( !cache_flush_permitted(v->domain) )
2025 /* Non-physdev domain attempted WBINVD; ignore for now since
2026 newer linux uses this in some start-of-day timing loops */
2028 else
2029 wbinvd();
2030 break;
2032 case 0x20: /* MOV CR?,<reg> */
2033 opcode = insn_fetch(u8, code_base, eip, code_limit);
2034 if ( opcode < 0xc0 )
2035 goto fail;
2036 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2037 modrm_rm |= (opcode >> 0) & 7;
2038 reg = decode_register(modrm_rm, regs, 0);
2039 switch ( modrm_reg )
2041 case 0: /* Read CR0 */
2042 *reg = (read_cr0() & ~X86_CR0_TS) |
2043 v->arch.guest_context.ctrlreg[0];
2044 break;
2046 case 2: /* Read CR2 */
2047 *reg = v->arch.guest_context.ctrlreg[2];
2048 break;
2050 case 3: /* Read CR3 */
2051 if ( !is_pv_32on64_vcpu(v) )
2052 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
2053 v->domain, pagetable_get_pfn(v->arch.guest_table)));
2054 #ifdef CONFIG_COMPAT
2055 else
2056 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
2057 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
2058 #endif
2059 break;
2061 case 4: /* Read CR4 */
2062 /*
2063 * Guests can read CR4 to see what features Xen has enabled. We
2064 * therefore lie about PGE as it is unavailable to guests.
2065 * Also disallow PSE if hugepages are not enabled.
2066 */
2067 *reg = read_cr4() & ~X86_CR4_PGE;
2068 if ( !opt_allow_hugepage )
2069 *reg &= ~X86_CR4_PSE;
2070 break;
2072 default:
2073 goto fail;
2075 break;
2077 case 0x21: /* MOV DR?,<reg> */
2078 opcode = insn_fetch(u8, code_base, eip, code_limit);
2079 if ( opcode < 0xc0 )
2080 goto fail;
2081 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2082 modrm_rm |= (opcode >> 0) & 7;
2083 reg = decode_register(modrm_rm, regs, 0);
2084 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2085 goto fail;
2086 *reg = res;
2087 break;
2089 case 0x22: /* MOV <reg>,CR? */
2090 opcode = insn_fetch(u8, code_base, eip, code_limit);
2091 if ( opcode < 0xc0 )
2092 goto fail;
2093 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2094 modrm_rm |= (opcode >> 0) & 7;
2095 reg = decode_register(modrm_rm, regs, 0);
2096 switch ( modrm_reg )
2098 case 0: /* Write CR0 */
2099 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2101 gdprintk(XENLOG_WARNING,
2102 "Attempt to change unmodifiable CR0 flags.\n");
2103 goto fail;
2105 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2106 break;
2108 case 2: /* Write CR2 */
2109 v->arch.guest_context.ctrlreg[2] = *reg;
2110 arch_set_cr2(v, *reg);
2111 break;
2113 case 3: /* Write CR3 */
2114 domain_lock(v->domain);
2115 if ( !is_pv_32on64_vcpu(v) )
2116 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2117 #ifdef CONFIG_COMPAT
2118 else
2119 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2120 #endif
2121 domain_unlock(v->domain);
2122 if ( rc == 0 ) /* not okay */
2123 goto fail;
2124 break;
2126 case 4: /* Write CR4 */
2127 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2128 write_cr4(pv_guest_cr4_to_real_cr4(
2129 v->arch.guest_context.ctrlreg[4]));
2130 break;
2132 default:
2133 goto fail;
2135 break;
2137 case 0x23: /* MOV <reg>,DR? */
2138 opcode = insn_fetch(u8, code_base, eip, code_limit);
2139 if ( opcode < 0xc0 )
2140 goto fail;
2141 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2142 modrm_rm |= (opcode >> 0) & 7;
2143 reg = decode_register(modrm_rm, regs, 0);
2144 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2145 goto fail;
2146 break;
2148 case 0x30: /* WRMSR */
2149 eax = regs->eax;
2150 edx = regs->edx;
2151 res = ((u64)edx << 32) | eax;
2152 switch ( (u32)regs->ecx )
2154 #ifdef CONFIG_X86_64
2155 case MSR_FS_BASE:
2156 if ( is_pv_32on64_vcpu(v) )
2157 goto fail;
2158 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2159 goto fail;
2160 v->arch.guest_context.fs_base = res;
2161 break;
2162 case MSR_GS_BASE:
2163 if ( is_pv_32on64_vcpu(v) )
2164 goto fail;
2165 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2166 goto fail;
2167 v->arch.guest_context.gs_base_kernel = res;
2168 break;
2169 case MSR_SHADOW_GS_BASE:
2170 if ( is_pv_32on64_vcpu(v) )
2171 goto fail;
2172 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2173 goto fail;
2174 v->arch.guest_context.gs_base_user = res;
2175 break;
2176 #endif
2177 case MSR_K7_FID_VID_STATUS:
2178 case MSR_K7_FID_VID_CTL:
2179 case MSR_K8_PSTATE_LIMIT:
2180 case MSR_K8_PSTATE_CTRL:
2181 case MSR_K8_PSTATE_STATUS:
2182 case MSR_K8_PSTATE0:
2183 case MSR_K8_PSTATE1:
2184 case MSR_K8_PSTATE2:
2185 case MSR_K8_PSTATE3:
2186 case MSR_K8_PSTATE4:
2187 case MSR_K8_PSTATE5:
2188 case MSR_K8_PSTATE6:
2189 case MSR_K8_PSTATE7:
2190 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2191 goto fail;
2192 if ( !is_cpufreq_controller(v->domain) )
2193 break;
2194 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2195 goto fail;
2196 break;
2197 case MSR_AMD64_NB_CFG:
2198 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2199 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2200 goto fail;
2201 if ( !IS_PRIV(v->domain) )
2202 break;
2203 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2204 (eax != l) ||
2205 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2206 goto invalid;
2207 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2208 goto fail;
2209 break;
2210 case MSR_FAM10H_MMIO_CONF_BASE:
2211 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2212 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2213 goto fail;
2214 if ( !IS_PRIV(v->domain) )
2215 break;
2216 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2217 (((((u64)h << 32) | l) ^ res) &
2218 ~( FAM10H_MMIO_CONF_ENABLE |
2219 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2220 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2221 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2222 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2223 goto invalid;
2224 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2225 goto fail;
2226 break;
2227 case MSR_IA32_MPERF:
2228 case MSR_IA32_APERF:
2229 case MSR_IA32_PERF_CTL:
2230 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2231 goto fail;
2232 if ( !is_cpufreq_controller(v->domain) )
2233 break;
2234 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2235 goto fail;
2236 break;
2237 case MSR_IA32_THERM_CONTROL:
2238 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2239 goto fail;
2240 if ( (v->domain->domain_id != 0) || !v->domain->is_pinned )
2241 break;
2242 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2243 goto fail;
2244 break;
2245 default:
2246 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2247 break;
2248 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
2250 int rc = intel_mce_wrmsr(regs->ecx, res);
2251 if ( rc < 0 )
2252 goto fail;
2253 if ( rc )
2254 break;
2257 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2258 (eax != l) || (edx != h) )
2259 invalid:
2260 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2261 "%08x:%08x to %08x:%08x.\n",
2262 _p(regs->ecx), h, l, edx, eax);
2263 break;
2265 break;
2267 case 0x31: /* RDTSC */
2268 rdtsc(regs->eax, regs->edx);
2269 break;
2271 case 0x32: /* RDMSR */
2272 switch ( (u32)regs->ecx )
2274 #ifdef CONFIG_X86_64
2275 case MSR_FS_BASE:
2276 if ( is_pv_32on64_vcpu(v) )
2277 goto fail;
2278 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2279 regs->edx = v->arch.guest_context.fs_base >> 32;
2280 break;
2281 case MSR_GS_BASE:
2282 if ( is_pv_32on64_vcpu(v) )
2283 goto fail;
2284 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2285 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2286 break;
2287 case MSR_SHADOW_GS_BASE:
2288 if ( is_pv_32on64_vcpu(v) )
2289 goto fail;
2290 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2291 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2292 break;
2293 #endif
2294 case MSR_K7_FID_VID_CTL:
2295 case MSR_K7_FID_VID_STATUS:
2296 case MSR_K8_PSTATE_LIMIT:
2297 case MSR_K8_PSTATE_CTRL:
2298 case MSR_K8_PSTATE_STATUS:
2299 case MSR_K8_PSTATE0:
2300 case MSR_K8_PSTATE1:
2301 case MSR_K8_PSTATE2:
2302 case MSR_K8_PSTATE3:
2303 case MSR_K8_PSTATE4:
2304 case MSR_K8_PSTATE5:
2305 case MSR_K8_PSTATE6:
2306 case MSR_K8_PSTATE7:
2307 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2308 goto fail;
2309 if ( !is_cpufreq_controller(v->domain) )
2311 regs->eax = regs->edx = 0;
2312 break;
2314 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2315 goto fail;
2316 break;
2317 case MSR_IA32_MISC_ENABLE:
2318 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2319 goto fail;
2320 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2321 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2322 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2323 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2324 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2325 break;
2326 case MSR_EFER:
2327 case MSR_AMD_PATCHLEVEL:
2328 default:
2329 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2331 rdmsr_writeback:
2332 regs->eax = l;
2333 regs->edx = h;
2334 break;
2337 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
2339 int rc = intel_mce_rdmsr(regs->ecx, &l, &h);
2341 if ( rc < 0 )
2342 goto fail;
2343 if ( rc )
2344 goto rdmsr_writeback;
2347 /* Everyone can read the MSR space. */
2348 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2349 _p(regs->ecx));*/
2350 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2351 goto fail;
2352 break;
2354 break;
2356 default:
2357 goto fail;
2360 #undef wr_ad
2361 #undef rd_ad
2363 done:
2364 instruction_done(regs, eip, bpmatch);
2365 skip:
2366 return EXCRET_fault_fixed;
2368 fail:
2369 return 0;
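/*
 * check_stack_limit(): helper used by the call-gate emulation below. It
 * verifies that pushing 'decr' bytes at 'esp' neither wraps below zero nor
 * crosses the stack segment limit, taking expand-down segments
 * (_SEGMENT_EC) into account.
 */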
2372 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2373 unsigned int esp, unsigned int decr)
2375 return (((esp - decr) < (esp - 1)) &&
2376 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
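/*
 * emulate_gate_op(): reached from the #GP handler for 32-on-64 PV guests
 * (x86-64 builds only, see the __x86_64__ guard below). It decodes the far
 * CALL/JMP that faulted on a call-gate descriptor, validates the gate, the
 * target code segment and, for calls, the new stack, copies any gate
 * parameters across, and finally transfers control by rewriting CS:EIP
 * (and SS:ESP for inter-privilege calls).
 */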
2379 static void emulate_gate_op(struct cpu_user_regs *regs)
2381 #ifdef __x86_64__
2382 struct vcpu *v = current;
2383 unsigned int sel, ar, dpl, nparm, opnd_sel;
2384 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2385 unsigned long off, eip, opnd_off, base, limit;
2386 int jump;
2388 /* Check whether this fault is due to the use of a call gate. */
2389 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2390 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2391 ((ar & _SEGMENT_TYPE) != 0xc00) )
2393 do_guest_trap(TRAP_gp_fault, regs, 1);
2394 return;
2396 if ( !(ar & _SEGMENT_P) )
2398 do_guest_trap(TRAP_no_segment, regs, 1);
2399 return;
2401 dpl = (ar >> 13) & 3;
2402 nparm = ar & 0x1f;
2404 /*
2405 * Decode instruction (and perhaps operand) to determine RPL,
2406 * whether this is a jump or a call, and the call return offset.
2407 */
2408 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2409 !(ar & _SEGMENT_S) ||
2410 !(ar & _SEGMENT_P) ||
2411 !(ar & _SEGMENT_CODE) )
2413 do_guest_trap(TRAP_gp_fault, regs, 1);
2414 return;
2417 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2418 ad_default = ad_bytes = op_default;
2419 opnd_sel = opnd_off = 0;
2420 jump = -1;
2421 for ( eip = regs->eip; eip - regs->_eip < 10; )
2423 switch ( insn_fetch(u8, base, eip, limit) )
2425 case 0x66: /* operand-size override */
2426 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2427 continue;
2428 case 0x67: /* address-size override */
2429 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2430 continue;
2431 case 0x2e: /* CS override */
2432 opnd_sel = regs->cs;
2433 ASSERT(opnd_sel);
2434 continue;
2435 case 0x3e: /* DS override */
2436 opnd_sel = read_sreg(regs, ds);
2437 if ( !opnd_sel )
2438 opnd_sel = dpl;
2439 continue;
2440 case 0x26: /* ES override */
2441 opnd_sel = read_sreg(regs, es);
2442 if ( !opnd_sel )
2443 opnd_sel = dpl;
2444 continue;
2445 case 0x64: /* FS override */
2446 opnd_sel = read_sreg(regs, fs);
2447 if ( !opnd_sel )
2448 opnd_sel = dpl;
2449 continue;
2450 case 0x65: /* GS override */
2451 opnd_sel = read_sreg(regs, gs);
2452 if ( !opnd_sel )
2453 opnd_sel = dpl;
2454 continue;
2455 case 0x36: /* SS override */
2456 opnd_sel = regs->ss;
2457 if ( !opnd_sel )
2458 opnd_sel = dpl;
2459 continue;
2460 case 0xea:
2461 ++jump;
2462 /* FALLTHROUGH */
2463 case 0x9a:
2464 ++jump;
2465 opnd_sel = regs->cs;
2466 opnd_off = eip;
2467 ad_bytes = ad_default;
2468 eip += op_bytes + 2;
2469 break;
2470 case 0xff:
2472 unsigned int modrm;
2474 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2476 case 0x28: case 0x68: case 0xa8:
2477 ++jump;
2478 /* FALLTHROUGH */
2479 case 0x18: case 0x58: case 0x98:
2480 ++jump;
2481 if ( ad_bytes != 2 )
2483 if ( (modrm & 7) == 4 )
2485 unsigned int sib;
2486 sib = insn_fetch(u8, base, eip, limit);
2488 modrm = (modrm & ~7) | (sib & 7);
2489 if ( (sib >>= 3) != 4 )
2490 opnd_off = *(unsigned long *)
2491 decode_register(sib & 7, regs, 0);
2492 opnd_off <<= sib >> 3;
2494 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2495 opnd_off += *(unsigned long *)
2496 decode_register(modrm & 7, regs, 0);
2497 else
2498 modrm |= 0x87;
2499 if ( !opnd_sel )
2501 switch ( modrm & 7 )
2503 default:
2504 opnd_sel = read_sreg(regs, ds);
2505 break;
2506 case 4: case 5:
2507 opnd_sel = regs->ss;
2508 break;
2512 else
2514 switch ( modrm & 7 )
2516 case 0: case 1: case 7:
2517 opnd_off = regs->ebx;
2518 break;
2519 case 6:
2520 if ( !(modrm & 0xc0) )
2521 modrm |= 0x80;
2522 else
2523 case 2: case 3:
2525 opnd_off = regs->ebp;
2526 if ( !opnd_sel )
2527 opnd_sel = regs->ss;
2529 break;
2531 if ( !opnd_sel )
2532 opnd_sel = read_sreg(regs, ds);
2533 switch ( modrm & 7 )
2535 case 0: case 2: case 4:
2536 opnd_off += regs->esi;
2537 break;
2538 case 1: case 3: case 5:
2539 opnd_off += regs->edi;
2540 break;
2543 switch ( modrm & 0xc0 )
2545 case 0x40:
2546 opnd_off += insn_fetch(s8, base, eip, limit);
2547 break;
2548 case 0x80:
2549 opnd_off += insn_fetch(s32, base, eip, limit);
2550 break;
2552 if ( ad_bytes == 4 )
2553 opnd_off = (unsigned int)opnd_off;
2554 else if ( ad_bytes == 2 )
2555 opnd_off = (unsigned short)opnd_off;
2556 break;
2559 break;
2561 break;
2564 if ( jump < 0 )
2566 fail:
2567 do_guest_trap(TRAP_gp_fault, regs, 1);
2568 skip:
2569 return;
2572 if ( (opnd_sel != regs->cs &&
2573 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2574 !(ar & _SEGMENT_S) ||
2575 !(ar & _SEGMENT_P) ||
2576 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2578 do_guest_trap(TRAP_gp_fault, regs, 1);
2579 return;
2582 opnd_off += op_bytes;
2583 #define ad_default ad_bytes
2584 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2585 #undef ad_default
2586 ASSERT((opnd_sel & ~3) == regs->error_code);
2587 if ( dpl < (opnd_sel & 3) )
2589 do_guest_trap(TRAP_gp_fault, regs, 1);
2590 return;
2593 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2594 !(ar & _SEGMENT_S) ||
2595 !(ar & _SEGMENT_CODE) ||
2596 (!jump || (ar & _SEGMENT_EC) ?
2597 ((ar >> 13) & 3) > (regs->cs & 3) :
2598 ((ar >> 13) & 3) != (regs->cs & 3)) )
2600 regs->error_code = sel;
2601 do_guest_trap(TRAP_gp_fault, regs, 1);
2602 return;
2604 if ( !(ar & _SEGMENT_P) )
2606 regs->error_code = sel;
2607 do_guest_trap(TRAP_no_segment, regs, 1);
2608 return;
2610 if ( off > limit )
2612 regs->error_code = 0;
2613 do_guest_trap(TRAP_gp_fault, regs, 1);
2614 return;
2617 if ( !jump )
2619 unsigned int ss, esp, *stkp;
2620 int rc;
2621 #define push(item) do \
2622 { \
2623 --stkp; \
2624 esp -= 4; \
2625 rc = __put_user(item, stkp); \
2626 if ( rc ) \
2627 { \
2628 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2629 PFEC_write_access); \
2630 return; \
2631 } \
2632 } while ( 0 )
2634 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2636 sel |= (ar >> 13) & 3;
2637 /* Inner stack known only for kernel ring. */
2638 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2640 do_guest_trap(TRAP_gp_fault, regs, 1);
2641 return;
2643 esp = v->arch.guest_context.kernel_sp;
2644 ss = v->arch.guest_context.kernel_ss;
2645 if ( (ss & 3) != (sel & 3) ||
2646 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2647 ((ar >> 13) & 3) != (sel & 3) ||
2648 !(ar & _SEGMENT_S) ||
2649 (ar & _SEGMENT_CODE) ||
2650 !(ar & _SEGMENT_WR) )
2652 regs->error_code = ss & ~3;
2653 do_guest_trap(TRAP_invalid_tss, regs, 1);
2654 return;
2656 if ( !(ar & _SEGMENT_P) ||
2657 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2659 regs->error_code = ss & ~3;
2660 do_guest_trap(TRAP_stack_error, regs, 1);
2661 return;
2663 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2664 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2666 do_guest_trap(TRAP_gp_fault, regs, 1);
2667 return;
2669 push(regs->ss);
2670 push(regs->esp);
2671 if ( nparm )
2673 const unsigned int *ustkp;
2675 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2676 ((ar >> 13) & 3) != (regs->cs & 3) ||
2677 !(ar & _SEGMENT_S) ||
2678 (ar & _SEGMENT_CODE) ||
2679 !(ar & _SEGMENT_WR) ||
2680 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2681 return do_guest_trap(TRAP_gp_fault, regs, 1);
2682 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2683 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2685 do_guest_trap(TRAP_gp_fault, regs, 1);
2686 return;
2688 do
2690 unsigned int parm;
2692 --ustkp;
2693 rc = __get_user(parm, ustkp);
2694 if ( rc )
2696 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2697 return;
2699 push(parm);
2700 } while ( --nparm );
2703 else
2705 sel |= (regs->cs & 3);
2706 esp = regs->esp;
2707 ss = regs->ss;
2708 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2709 ((ar >> 13) & 3) != (sel & 3) )
2711 do_guest_trap(TRAP_gp_fault, regs, 1);
2712 return;
2714 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2716 regs->error_code = 0;
2717 do_guest_trap(TRAP_stack_error, regs, 1);
2718 return;
2720 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2721 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2723 do_guest_trap(TRAP_gp_fault, regs, 1);
2724 return;
2727 push(regs->cs);
2728 push(eip);
2729 #undef push
2730 regs->esp = esp;
2731 regs->ss = ss;
2733 else
2734 sel |= (regs->cs & 3);
2736 regs->cs = sel;
2737 instruction_done(regs, off, 0);
2738 #endif
2741 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2743 struct vcpu *v = current;
2744 unsigned long fixup;
2746 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2748 if ( regs->error_code & 1 )
2749 goto hardware_gp;
2751 if ( !guest_mode(regs) )
2752 goto gp_in_kernel;
2754 /*
2755 * Cunning trick to allow arbitrary "INT n" handling.
2757 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2758 * instruction from trapping to the appropriate vector, when that might not
2759 * be expected by Xen or the guest OS. For example, that entry might be for
2760 * a fault handler (unlike traps, faults don't increment EIP), or might
2761 * expect an error code on the stack (which a software trap never
2762 * provides), or might be a hardware interrupt handler that doesn't like
2763 * being called spuriously.
2765 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2766 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2767 * clear to indicate that it's a software fault, not hardware.
2769 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2770 * okay because they can only be triggered by an explicit DPL-checked
2771 * instruction. The DPL specified by the guest OS for these vectors is NOT
2772 * CHECKED!!
2773 */
2774 if ( (regs->error_code & 3) == 2 )
2776 /* This fault must be due to <INT n> instruction. */
2777 const struct trap_info *ti;
2778 unsigned char vector = regs->error_code >> 3;
2779 ti = &v->arch.guest_context.trap_ctxt[vector];
2780 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2782 regs->eip += 2;
2783 do_guest_trap(vector, regs, 0);
2784 return;
2787 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2789 emulate_gate_op(regs);
2790 return;
2793 /* Emulate some simple privileged and I/O instructions. */
2794 if ( (regs->error_code == 0) &&
2795 emulate_privileged_op(regs) )
2797 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2798 return;
2801 #if defined(__i386__)
2802 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2803 (regs->error_code == 0) &&
2804 gpf_emulate_4gb(regs) )
2806 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2807 return;
2809 #endif
2811 /* Pass on GPF as is. */
2812 do_guest_trap(TRAP_gp_fault, regs, 1);
2813 return;
2815 gp_in_kernel:
2817 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2819 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2820 regs->error_code, _p(regs->eip), _p(fixup));
2821 regs->eip = fixup;
2822 return;
2825 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2827 hardware_gp:
2828 show_execution_state(regs);
2829 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
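/*
 * nmi_mce_softirq(): runs in softirq context on behalf of send_guest_trap()
 * below, because waking a vcpu directly from NMI/MCE context is unsafe. If
 * the target vcpu lives on a different physical CPU, its affinity is
 * temporarily narrowed to that CPU (and restored later in the iret
 * hypercall) before the vcpu is kicked.
 */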
2832 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2834 static void nmi_mce_softirq(void)
2836 int cpu = smp_processor_id();
2837 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2838 cpumask_t affinity;
2840 BUG_ON(st == NULL);
2841 BUG_ON(st->vcpu == NULL);
2843 /* Set the tmp value unconditionally, so that
2844 * the check in the iret hypercall works. */
2845 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2847 if ((cpu != st->processor)
2848 || (st->processor != st->vcpu->processor))
2850 /* We are on a different physical CPU.
2851 * Make sure to wake up the vcpu on the
2852 * specified processor.
2853 */
2854 cpus_clear(affinity);
2855 cpu_set(st->processor, affinity);
2856 vcpu_set_affinity(st->vcpu, &affinity);
2858 /* Affinity is restored in the iret hypercall. */
2861 /* Only used to defer wakeup of domain/vcpu to
2862 * a safe (non-NMI/MCE) context.
2863 */
2864 vcpu_kick(st->vcpu);
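/*
 * nmi_dom0_report(): records the NMI reason bit in dom0's shared NMI-reason
 * field and sends a virtual NMI to dom0's vcpu 0 (if dom0 is up).
 */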
2867 static void nmi_dom0_report(unsigned int reason_idx)
2869 struct domain *d = dom0;
2871 if ( (d == NULL) || (d->vcpu == NULL) || (d->vcpu[0] == NULL) )
2872 return;
2874 set_bit(reason_idx, nmi_reason(d));
2876 send_guest_trap(d, 0, TRAP_nmi);
2879 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2881 switch ( opt_nmi[0] )
2883 case 'd': /* 'dom0' */
2884 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2885 case 'i': /* 'ignore' */
2886 break;
2887 default: /* 'fatal' */
2888 console_force_unlock();
2889 printk("\n\nNMI - MEMORY ERROR\n");
2890 fatal_trap(TRAP_nmi, regs);
2893 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2894 mdelay(1);
2895 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2898 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2900 switch ( opt_nmi[0] )
2902 case 'd': /* 'dom0' */
2903 nmi_dom0_report(_XEN_NMIREASON_io_error);
2904 case 'i': /* 'ignore' */
2905 break;
2906 default: /* 'fatal' */
2907 console_force_unlock();
2908 printk("\n\nNMI - I/O ERROR\n");
2909 fatal_trap(TRAP_nmi, regs);
2912 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2913 mdelay(1);
2914 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2917 static void unknown_nmi_error(unsigned char reason)
2919 switch ( opt_nmi[0] )
2921 case 'd': /* 'dom0' */
2922 nmi_dom0_report(_XEN_NMIREASON_unknown);
2923 case 'i': /* 'ignore' */
2924 break;
2925 default: /* 'fatal' */
2926 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2927 printk("Dazed and confused, but trying to continue\n");
2928 printk("Do you have a strange power saving mode enabled?\n");
2929 kexec_crash();
2933 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2935 return 0;
2938 static nmi_callback_t nmi_callback = dummy_nmi_callback;
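/*
 * do_nmi(): top-level NMI dispatcher. The registered nmi_callback gets first
 * refusal, then the watchdog is fed. External NMIs are only decoded on the
 * BSP (CPU 0), using system control port 0x61 to distinguish memory parity
 * errors from I/O check errors; anything else is reported as an unknown NMI
 * unless the watchdog is active.
 */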
2940 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2942 unsigned int cpu = smp_processor_id();
2943 unsigned char reason;
2945 ++nmi_count(cpu);
2947 if ( nmi_callback(regs, cpu) )
2948 return;
2950 if ( nmi_watchdog )
2951 nmi_watchdog_tick(regs);
2953 /* Only the BSP gets external NMIs from the system. */
2954 if ( cpu == 0 )
2956 reason = inb(0x61);
2957 if ( reason & 0x80 )
2958 mem_parity_error(regs);
2959 else if ( reason & 0x40 )
2960 io_check_error(regs);
2961 else if ( !nmi_watchdog )
2962 unknown_nmi_error((unsigned char)(reason&0xff));
2966 void set_nmi_callback(nmi_callback_t callback)
2968 nmi_callback = callback;
2971 void unset_nmi_callback(void)
2973 nmi_callback = dummy_nmi_callback;
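/*
 * do_device_not_available(): #NM handler for PV guests. Xen eagerly restores
 * the FPU state with setup_fpu(); if the guest's virtual CR0.TS is set, the
 * fault is additionally reflected to the guest (and the virtual TS bit
 * cleared) so the guest can run its own lazy-FPU handling.
 */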
2976 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2978 struct vcpu *curr = current;
2980 BUG_ON(!guest_mode(regs));
2982 setup_fpu(curr);
2984 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2986 do_guest_trap(TRAP_no_device, regs, 0);
2987 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2989 else
2990 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2992 return;
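/*
 * do_debug(): #DB handler. Debug traps taken in Xen context are either the
 * expected single-step over the SYSENTER entry path (TF cannot be zapped
 * until EFLAGS has been saved) or stray watchpoint hits on guest buffers,
 * and are swallowed. Guest traps have %dr6 copied into the vcpu context and
 * are reflected to the guest.
 */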
2995 asmlinkage void do_debug(struct cpu_user_regs *regs)
2997 struct vcpu *v = current;
2999 DEBUGGER_trap_entry(TRAP_debug, regs);
3001 if ( !guest_mode(regs) )
3003 if ( regs->eflags & EF_TF )
3005 #ifdef __x86_64__
3006 void sysenter_entry(void);
3007 void sysenter_eflags_saved(void);
3008 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
3009 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
3010 (regs->rip < (unsigned long)sysenter_eflags_saved) )
3011 goto out;
3012 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
3013 #else
3014 WARN_ON(1);
3015 #endif
3016 regs->eflags &= ~EF_TF;
3018 else
3020 /*
3021 * We ignore watchpoints when they trigger within Xen. This may
3022 * happen when a buffer is passed to us which previously had a
3023 * watchpoint set on it. No need to bump EIP; the only faulting
3024 * trap is an instruction breakpoint, which can't happen to us.
3025 */
3026 WARN_ON(!search_exception_table(regs->eip));
3028 goto out;
3031 /* Save the debug status register where the guest OS can peek at it. */
3032 v->arch.guest_context.debugreg[6] = read_debugreg(6);
3034 ler_enable();
3035 do_guest_trap(TRAP_debug, regs, 0);
3036 return;
3038 out:
3039 ler_enable();
3040 return;
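/*
 * do_spurious_interrupt_bug() is deliberately a no-op. The helpers below
 * install interrupt gates: __set_intr_gate() writes vector n into the master
 * IDT and into every secondary per-CPU IDT that has already been allocated;
 * set_swint_gate() uses DPL 3 so the vector can be raised by guest INT
 * instructions, while set_intr_gate() uses DPL 0.
 */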
3043 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
3047 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
3049 int i;
3050 /* Keep secondary tables in sync with IRQ updates. */
3051 for ( i = 1; i < NR_CPUS; i++ )
3052 if ( idt_tables[i] != NULL )
3053 _set_gate(&idt_tables[i][n], 14, dpl, addr);
3054 _set_gate(&idt_table[n], 14, dpl, addr);
3057 static void set_swint_gate(unsigned int n, void *addr)
3059 __set_intr_gate(n, 3, addr);
3062 void set_intr_gate(unsigned int n, void *addr)
3064 __set_intr_gate(n, 0, addr);
3067 void load_TR(void)
3069 struct tss_struct *tss = &init_tss[smp_processor_id()];
3070 struct desc_ptr old_gdt, tss_gdt = {
3071 .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
3072 .limit = LAST_RESERVED_GDT_BYTE
3073 };
3075 _set_tssldt_desc(
3076 this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3077 (unsigned long)tss,
3078 offsetof(struct tss_struct, __cacheline_filler) - 1,
3079 9);
3080 #ifdef CONFIG_COMPAT
3081 _set_tssldt_desc(
3082 this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3083 (unsigned long)tss,
3084 offsetof(struct tss_struct, __cacheline_filler) - 1,
3085 11);
3086 #endif
3088 /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
3089 asm volatile (
3090 "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
3091 : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
3094 void __devinit percpu_traps_init(void)
3096 subarch_percpu_traps_init();
3098 if ( !opt_ler )
3099 return;
3101 switch ( boot_cpu_data.x86_vendor )
3103 case X86_VENDOR_INTEL:
3104 switch ( boot_cpu_data.x86 )
3106 case 6:
3107 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3108 break;
3109 case 15:
3110 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3111 break;
3113 break;
3114 case X86_VENDOR_AMD:
3115 switch ( boot_cpu_data.x86 )
3117 case 6:
3118 case 15:
3119 case 16:
3120 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3121 break;
3123 break;
3126 ler_enable();
3129 void __init trap_init(void)
3131 /*
3132 * Note that interrupt gates are always used, rather than trap gates. We
3133 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3134 * first activation must have the "bad" value(s) for these registers and
3135 * we may lose them if another activation is installed before they are
3136 * saved. The page-fault handler also needs interrupts disabled until %cr2
3137 * has been read and saved on the stack.
3138 */
3139 set_intr_gate(TRAP_divide_error,&divide_error);
3140 set_intr_gate(TRAP_debug,&debug);
3141 set_intr_gate(TRAP_nmi,&nmi);
3142 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3143 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3144 set_intr_gate(TRAP_bounds,&bounds);
3145 set_intr_gate(TRAP_invalid_op,&invalid_op);
3146 set_intr_gate(TRAP_no_device,&device_not_available);
3147 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3148 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3149 set_intr_gate(TRAP_no_segment,&segment_not_present);
3150 set_intr_gate(TRAP_stack_error,&stack_segment);
3151 set_intr_gate(TRAP_gp_fault,&general_protection);
3152 set_intr_gate(TRAP_page_fault,&page_fault);
3153 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3154 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3155 set_intr_gate(TRAP_alignment_check,&alignment_check);
3156 set_intr_gate(TRAP_machine_check,&machine_check);
3157 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3159 /* CPU0 uses the master IDT. */
3160 idt_tables[0] = idt_table;
3162 percpu_traps_init();
3164 cpu_init();
3166 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
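/*
 * register_guest_nmi_callback(): installs the guest-supplied NMI handler in
 * the virtual IDT slot for TRAP_nmi, using the flat kernel code selector
 * appropriate to the guest's bitness and setting the entry's IF flag
 * (TI_SET_IF) so event delivery is blocked while the handler runs.
 */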
3169 long register_guest_nmi_callback(unsigned long address)
3171 struct vcpu *v = current;
3172 struct domain *d = v->domain;
3173 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3175 t->vector = TRAP_nmi;
3176 t->flags = 0;
3177 t->cs = (is_pv_32on64_domain(d) ?
3178 FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
3179 t->address = address;
3180 TI_SET_IF(t, 1);
3182 /*
3183 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3184 * now.
3185 */
3186 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3187 v->nmi_pending = 1;
3189 return 0;
3192 long unregister_guest_nmi_callback(void)
3194 struct vcpu *v = current;
3195 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3197 memset(t, 0, sizeof(*t));
3199 return 0;
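/*
 * guest_has_trap_callback(): true if the given vcpu has registered a handler
 * (non-zero address) for trap_nr in its virtual IDT.
 */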
3202 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3204 struct vcpu *v;
3205 struct trap_info *t;
3207 BUG_ON(d == NULL);
3208 BUG_ON(vcpuid >= d->max_vcpus);
3210 /* Sanity check - XXX: should be more fine-grained. */
3211 BUG_ON(trap_nr > TRAP_syscall);
3213 v = d->vcpu[vcpuid];
3214 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3216 return (t->address != 0);
3220 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3222 struct vcpu *v;
3223 struct softirq_trap *st;
3225 BUG_ON(d == NULL);
3226 BUG_ON(vcpuid >= d->max_vcpus);
3227 v = d->vcpu[vcpuid];
3229 switch (trap_nr) {
3230 case TRAP_nmi:
3231 if ( !test_and_set_bool(v->nmi_pending) ) {
3232 st = &per_cpu(softirq_trap, smp_processor_id());
3233 st->domain = dom0;
3234 st->vcpu = dom0->vcpu[0];
3235 st->processor = st->vcpu->processor;
3237 /* not safe to wake up a vcpu here */
3238 raise_softirq(NMI_MCE_SOFTIRQ);
3239 return 0;
3241 break;
3243 case TRAP_machine_check:
3245 /* We are called by the machine check (exception or polling) handlers
3246 * on the physical CPU that reported a machine check error. */
3248 if ( !test_and_set_bool(v->mce_pending) ) {
3249 st = &per_cpu(softirq_trap, smp_processor_id());
3250 st->domain = d;
3251 st->vcpu = v;
3252 st->processor = v->processor;
3254 /* not safe to wake up a vcpu here */
3255 raise_softirq(NMI_MCE_SOFTIRQ);
3256 return 0;
3258 break;
3261 /* delivery failed */
3262 return -EIO;
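/*
 * do_set_trap_table(): hypercall handler that copies guest-supplied
 * trap_info entries into the vcpu's virtual IDT. A NULL handle clears the
 * whole table; otherwise entries are consumed until one with a zero address
 * is seen, with code selectors fixed up and a continuation created if the
 * hypercall is preempted. Illustrative guest-side sketch (hedged; the
 * wrapper name is the usual guest convention, and KERNEL_CS/pf_handler are
 * placeholders):
 *
 *     struct trap_info table[] = {
 *         { .vector = 14, .flags = 0, .cs = KERNEL_CS,
 *           .address = (unsigned long)pf_handler },
 *         { 0 }   (a zero address terminates the table)
 *     };
 *     HYPERVISOR_set_trap_table(table);
 */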
3266 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3268 struct trap_info cur;
3269 struct vcpu *curr = current;
3270 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3271 long rc = 0;
3273 /* If no table is presented then clear the entire virtual IDT. */
3274 if ( guest_handle_is_null(traps) )
3276 memset(dst, 0, 256 * sizeof(*dst));
3277 init_int80_direct_trap(curr);
3278 return 0;
3281 for ( ; ; )
3283 if ( hypercall_preempt_check() )
3285 rc = hypercall_create_continuation(
3286 __HYPERVISOR_set_trap_table, "h", traps);
3287 break;
3290 if ( copy_from_guest(&cur, traps, 1) )
3292 rc = -EFAULT;
3293 break;
3296 if ( cur.address == 0 )
3297 break;
3299 fixup_guest_code_selector(curr->domain, cur.cs);
3301 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3303 if ( cur.vector == 0x80 )
3304 init_int80_direct_trap(curr);
3306 guest_handle_add_offset(traps, 1);
3309 return rc;
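/*
 * set_debugreg(): validates and installs guest debug register values.
 * DR0-DR3 must pass access_ok(); DR6/DR7 have their architecturally
 * reserved bits forced to fixed values; GD is refused; and I/O breakpoints
 * require CR4.DE, with their enable bits stashed in the otherwise unused
 * debugreg[5] slot (see the comment below) rather than loaded into hardware.
 */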
3312 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3314 int i;
3315 struct vcpu *curr = current;
3317 switch ( reg )
3319 case 0:
3320 if ( !access_ok(value, sizeof(long)) )
3321 return -EPERM;
3322 if ( v == curr )
3323 write_debugreg(0, value);
3324 break;
3325 case 1:
3326 if ( !access_ok(value, sizeof(long)) )
3327 return -EPERM;
3328 if ( v == curr )
3329 write_debugreg(1, value);
3330 break;
3331 case 2:
3332 if ( !access_ok(value, sizeof(long)) )
3333 return -EPERM;
3334 if ( v == curr )
3335 write_debugreg(2, value);
3336 break;
3337 case 3:
3338 if ( !access_ok(value, sizeof(long)) )
3339 return -EPERM;
3340 if ( v == curr )
3341 write_debugreg(3, value);
3342 break;
3343 case 6:
3344 /*
3345 * DR6: Bits 4-11,16-31 reserved (set to 1).
3346 * Bit 12 reserved (set to 0).
3347 */
3348 value &= 0xffffefff; /* reserved bits => 0 */
3349 value |= 0xffff0ff0; /* reserved bits => 1 */
3350 if ( v == curr )
3351 write_debugreg(6, value);
3352 break;
3353 case 7:
3354 /*
3355 * DR7: Bit 10 reserved (set to 1).
3356 * Bits 11-12,14-15 reserved (set to 0).
3357 */
3358 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3359 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3360 /*
3361 * Privileged bits:
3362 * GD (bit 13): must be 0.
3363 */
3364 if ( value & DR_GENERAL_DETECT )
3365 return -EPERM;
3366 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3367 if ( value & DR7_ACTIVE_MASK )
3369 unsigned int io_enable = 0;
3371 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3373 if ( ((value >> i) & 3) == DR_IO )
3375 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3376 return -EPERM;
3377 io_enable |= value & (3 << ((i - 16) >> 1));
3379 #ifdef __i386__
3380 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3381 !boot_cpu_has(X86_FEATURE_LM)) &&
3382 (((value >> i) & 0xc) == DR_LEN_8) )
3383 return -EPERM;
3384 #endif
3387 /* Guest DR5 is a handy stash for I/O intercept information. */
3388 v->arch.guest_context.debugreg[5] = io_enable;
3389 value &= ~io_enable;
3391 /*
3392 * If DR7 was previously clear then we need to load all other
3393 * debug registers at this point as they were not restored during
3394 * context switch.
3395 */
3396 if ( (v == curr) &&
3397 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3399 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3400 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3401 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3402 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3403 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3406 if ( v == curr )
3407 write_debugreg(7, value);
3408 break;
3409 default:
3410 return -EINVAL;
3413 v->arch.guest_context.debugreg[reg] = value;
3414 return 0;
3417 long do_set_debugreg(int reg, unsigned long value)
3419 return set_debugreg(current, reg, value);
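/*
 * do_get_debugreg(): reads back the guest's view. DR7 reads fold the stashed
 * I/O-intercept bits (debugreg[5]) back in; DR4/DR5 read as DR6/DR7
 * (debugreg[reg + 2]) when CR4.DE is set, and as zero otherwise.
 */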
3422 unsigned long do_get_debugreg(int reg)
3424 struct vcpu *curr = current;
3426 switch ( reg )
3428 case 0 ... 3:
3429 case 6:
3430 return curr->arch.guest_context.debugreg[reg];
3431 case 7:
3432 return (curr->arch.guest_context.debugreg[7] |
3433 curr->arch.guest_context.debugreg[5]);
3434 case 4 ... 5:
3435 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3436 curr->arch.guest_context.debugreg[reg + 2] : 0);
3439 return -EINVAL;
3442 /*
3443 * Local variables:
3444 * mode: C
3445 * c-set-style: "BSD"
3446 * c-basic-offset: 4
3447 * tab-width: 4
3448 * indent-tabs-mode: nil
3449 * End:
3450 */